# Skip to content
# Snippets Groups Projects
# 1-samples_selection.py 6.94 KiB
# Newer Older
from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *

# Guarantee that the "interface" key exists in the session state (it is stored
# as None when it was never set), then hide the Predictions page whenever the
# simple interface is active.
interface_mode = st.session_state.get('interface')
st.session_state["interface"] = interface_mode
if interface_mode == 'simple':
    hide_pages("Predictions")

################################### Data Loading and Visualization ########################################
# Two page columns created with weights 3 and 1. Note the unpacking order:
# the first (weight 3) column is bound to col2 and is used for the spectra
# plot, the second (weight 1) column is bound to col1 and hosts the loader.
col2, col1 = st.columns([3, 1])
# NOTE(review): the next two lines stood here as bare text, which is not valid
# Python; they are kept as comments so the script parses.
# DIANE's avatar
# DIANE committed
col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue')
## Preallocation of data structure
# Real (empty) DataFrame instances are used as placeholders, not the DataFrame
# class itself, so that every later `.empty` test is a genuine boolean.
data_import = pd.DataFrame()
meta_data = pd.DataFrame()
selected_samples = pd.DataFrame()
# loader for the csv / dx file containing NIRS spectra
sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
if sselectx_csv:
    # Take the LAST extension of the uploaded name, case-insensitively, so that
    # names containing several dots (e.g. "run.2024.CSV") are still recognised.
    file_ext = os.path.splitext(sselectx_csv.name)[1].lower()
    if file_ext == '.csv':
        with col1:
            # NOTE(review): the detection helpers are given the path
            # 'data/<uploaded name>', not the uploaded buffer - confirm that a
            # copy of the file is expected to exist at that location.
            data_path = 'data/' + sselectx_csv.name

            # Select list for CSV delimiter; the detected value is preselected
            # when it is one of the options, otherwise the first option is.
            sep_options = [";", ","]
            detected_sep = str(find_delimiter(data_path))
            sep_index = sep_options.index(detected_sep) if detected_sep in sep_options else 0
            psep = st.selectbox("Select csv separator - _detected_: " + detected_sep, options=sep_options, index=sep_index, key=9)

            # Select list for CSV index column yes / no, same preselection rule.
            hdr_options = ["no", "yes"]
            detected_hdr = str(find_col_index(data_path))
            hdr_index = hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0
            phdr = st.selectbox("indexes column in csv? - _detected_: " + detected_hdr, options=hdr_options, index=hdr_index, key=31)

            # First column as index when requested; index_col=False otherwise.
            col = 0 if phdr == 'yes' else False
            imp = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
            # col_cat is called once; item 0 is used as the spectral data and
            # item 1 as the metadata.
            split_frames = col_cat(imp)
            data_import = split_frames[0]
            meta_data = split_frames[1]
            # NOTE(review): the original passed icon="" here. An empty string
            # is not a valid emoji for st.success (presumably an emoji was lost
            # in transit), so the argument is omitted; restore one if desired.
            st.success("The data have been loaded successfully")

    elif file_ext == '.dx':
        # read_dx is given a file path, so the upload is first written to a
        # temporary file.
        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
            tmp.write(sselectx_csv.read())
            tmp_path = tmp.name
        # The temporary file is closed (hence flushed) before it is read back,
        # and it is removed even if reading fails.
        try:
            with col1:
                _, data_import, meta_data = read_dx(file =  tmp_path)
                st.success("The data have been loaded successfully")
        finally:
            os.unlink(tmp_path)
if not data_import.empty:
    ## Visualize spectra
    with col2:
        fig = plot_spectra(data_import)
        st.pyplot(fig)
############################## Exploratory data analysis ###############################
# Choices offered by the two select boxes further down. The first, empty entry
# selects no method: the code below only acts on the named entries.
dim_red_methods = ['', 'PCA', 'UMAP', 'NMF']
cluster_methods = ['', 'Kmeans', 'HDBSCAN', 'AP']

# No dimensionality-reduction or clustering model exists yet.
dr_model = cl_model = None

# Section header inside a bordered container.
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')

# Two rows of layout columns used by the plots and controls below.
# NOTE(review): both rows are created on `st`, not on `container2`.
scores, loadings, pc = st.columns([2, 3, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])

# Dimensionality reduction
# `t` holds the three score columns chosen for plotting. It starts as an empty
# DataFrame instance (not the DataFrame class) so `t.empty` is a real boolean.
t = pd.DataFrame()
if not data_import.empty:
    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
    # NOTE(review): only 'PCA' and 'UMAP' are handled; selecting 'NMF' (or the
    # empty entry) leaves dr_model untouched.
    if dim_red_method == dim_red_methods[1]:
        dr_model = LinearPCA(data_import, Ncomp=5)
    elif dim_red_method == dim_red_methods[2]:
        dr_model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)

    # Explicit None test: whether a model was built must not depend on how the
    # model object happens to evaluate in a boolean context.
    if dr_model is not None:
        # The axes default to the first three score columns.
        axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
        axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
        axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
        # Single column selection, in x / y / z order.
        t = dr_model.scores_.loc[:, [axis1, axis2, axis3]]


# clustering
# Cluster labels of the projected samples. Starts as an empty DataFrame
# instance so that the emptiness test in the plotting section below is valid
# when no clustering method has been selected.
labels = pd.DataFrame()
if not t.empty:
    # NOTE(review): only 'Kmeans' and 'HDBSCAN' are handled; 'AP' is offered in
    # cluster_methods but has no branch here.
    if clus_method == cluster_methods[1]:
        # K-means: show the inertia values, then fit with the number of
        # clusters chosen by the user.
        ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
        cl_model = Sk_Kmeans(t, max_clusters = 30)
        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
        scores.plotly_chart(fig2)
        data, labels = cl_model.fit_optimal(nclusters = ncluster)

    elif clus_method == cluster_methods[2]:
        # HDBSCAN. This branch used to repeat the Kmeans test (index 1) and
        # therefore could never run.
        # NOTE(review): confirm that `hdbscan` resolves to a module that really
        # provides HDBSCAN_function - this import was never exercised before.
        from hdbscan import HDBSCAN_function
        labels, hdbscan_score = HDBSCAN_function(t, min_cluster_size=10)

##### Plots
if not t.empty:
    with scores:
        st.write('Scores plot')
        if not pd.DataFrame(labels).empty:
            # scores plot coloured by cluster label
            fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = labels)
        else:
            # Without clustering, let the user colour the points by a metadata
            # column (every column but the first one is offered).
            color_by = None
            if not meta_data.empty:
                meta_cols = meta_data.columns[1:]
                # The selectbox yields None when there is no column to offer.
                color_by = st.selectbox('filter', options= meta_cols)

            if color_by is None:
                # scores plot with neither metadata nor clustering
                fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
            else:
                # scores plot with metadata; values are converted to lower-case
                # strings so that non-text columns do not raise.
                point_colors = [str(value).lower() for value in meta_data[color_by]]
                fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = point_colors)
        # Every branch above assigns `fig`, so it is always this section's figure.
        fig.update_traces(marker=dict(size=4))
        st.plotly_chart(fig)




if not data_import.empty:
    # The diagnostic panels below are produced for the 'PCA' choice only.
    if dim_red_method == dim_red_methods[1]:
        with loadings:
            st.write('Loadings plot')
            p = dr_model.loadings_
            # Add a positional counter 0..n-1 (one value per loadings row) as
            # column 'wl', then reshape to long format: one line per column.
            wl = pd.Series(np.arange(p.shape[0]), index=p.index, name='wl')
            pp = pd.concat([p, wl], axis=1)
            df1 = pp.melt(id_vars="wl")
            fig = px.line(df1, x='wl', y='value', color='variable')
            legend_style = {
                'x': 1,
                'y': 0,
                'font': {'family': "Courier", 'size': 12, 'color': "black"},
                'bordercolor': "Black",
                'borderwidth': 2,
            }
            fig.update_layout(legend=legend_style)
            st.plotly_chart(fig, use_container_width=True)

        with influence:
            st.write('Influence plot')
            # The fourth score column is preselected.
            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
            leverage = dr_model.leverage_
            residuals = dr_model.residuals_
            # Leverage against residuals for the chosen component, coloured by
            # their product.
            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1])
            fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
            st.plotly_chart(fig)
        
        # T²-Hotelling vs Q-residuals panel, rendered inside the `hotelling`
        # layout column.
        with hotelling:
                st.write('T²-Hotelling vs Q residuals plot')
                # NOTE(review): from here on the name `hotelling` no longer
                # refers to the layout column but to the model's `hotelling_`
                # attribute. The `with` block was entered before the rebinding.
                hotelling = dr_model.hotelling_
                # The fifth score column is preselected.
                ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
                # NOTE(review): repeats the assignment made two statements above.
                hotelling = dr_model.hotelling_
                # `residuals` is the value bound in the preceding influence
                # section; the x-axis title is set to an empty string.
                # NOTE(review): within the lines visible here the figure is
                # built but never passed to st.plotly_chart - confirm that it
                # is displayed.
                fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")