Skip to content
Snippets Groups Projects
1-samples_selection.py 13.6 KiB
Newer Older
  • Learn to ignore specific revisions
  • from Packages import *
    st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
    from Modules import *
    from Class_Mod.DATA_HANDLING import *
    
    
    
    
    # HTML for the "CEFE - CNRS" banner displayed on the page
    bandeau_html = """
    <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
      <h1 style="text-align: center; color: white;">CEFE - CNRS</h1>
    </div>
    """
    # Inject the banner's HTML code (raw HTML has to be allowed explicitly)
    st.markdown(bandeau_html, unsafe_allow_html=True)
    
    
    # Guarantee that the "interface" key exists in the session state (None when it
    # was never set), then hide the Predictions page for the 'simple' interface.
    interface_mode = st.session_state.get('interface')
    st.session_state["interface"] = interface_mode
    if interface_mode == 'simple':
        hide_pages("Predictions")
    
    
    ################################### Data Loading and Visualization ########################################
    col2, col1 = st.columns([3, 1])
    
    DIANE's avatar
    DIANE committed
    col1.header("Data Loading", divider='blue')
    col2.header("Spectral Data Visualization", divider='blue')
    
    ## Preallocation of data structure
    
    DIANE's avatar
    DIANE committed
    # Start from real, empty DataFrame instances. Assigning the bare class
    # (pd.DataFrame) made `.empty` a property object instead of a boolean, so the
    # `if not spectra.empty` guards further down only worked by accident.
    spectra = pd.DataFrame()

    meta_data = pd.DataFrame()
    selected_samples = pd.DataFrame()
    
    DIANE's avatar
    DIANE committed
    # loader for datafile
    data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
    
    DIANE's avatar
    DIANE committed
    if data_file:
        # Retrieve the extension of the file: the text from the LAST dot onwards,
        # lower-cased, so names such as "run.v2.csv" or "DATA.CSV" still match
        # the '.csv' / '.dx' comparisons below.
        test = os.path.splitext(data_file.name)[1].lower()
    
    DIANE's avatar
    DIANE committed
        ## Load .csv file
    
        if test== '.csv':
            with col1:
                # Select list for CSV delimiter
    
    DIANE's avatar
    DIANE committed
                # Run the delimiter detection once and reuse the result.
                # NOTE(review): detection reads 'data/<name>' from disk rather than the uploaded buffer — confirm this is intended.
                detected_sep = str(find_delimiter('data/'+data_file.name))
                sep_options = [";", ","]
                # Fall back to the first option when the detected separator is not one
                # of the offered choices, instead of letting list.index() raise ValueError.
                sep_default = sep_options.index(detected_sep) if detected_sep in sep_options else 0
                psep = st.radio("Select csv separator - _detected_: " + detected_sep, options=sep_options, index=sep_default, key=9)
    
    DIANE's avatar
    DIANE committed
                    # Select list for CSV header True / False
    
    DIANE's avatar
    DIANE committed
                # Run the index-column detection once and reuse the result.
                detected_hdr = str(find_col_index('data/'+data_file.name))
                hdr_options = ["no", "yes"]
                # Fall back to "no" when the detection returns something unexpected,
                # instead of letting list.index() raise ValueError.
                hdr_default = hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0
                phdr = st.radio("indexes column in csv? - _detected_: " + detected_hdr, options=hdr_options, index=hdr_default, key=31)

                # Value handed to pandas.read_csv(index_col=...): 0 uses the first
                # column as row index, False uses no column as index.
                col = 0 if phdr == 'yes' else False
    
    DIANE's avatar
    DIANE committed
                # Parse the uploaded CSV with the separator and index-column choice made above
                imp = pd.read_csv(data_file, sep=psep, index_col=col)

                # col_cat returns two objects, unpacked as (spectra, meta_data);
                # presumably the numerical part and the categorical part — confirm in col_cat.
                # spectra = col_cat(imp)[0]
                # meta_data = col_cat(imp)[1]
                spectra, meta_data = col_cat(imp)
    
                st.success("The data have been loaded successfully", icon="")
    
    
    DIANE's avatar
    DIANE committed
        ## Load .dx file
    
        elif test == '.dx':
            # Create a temporary file to save the uploaded file
            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
    
    DIANE's avatar
    DIANE committed
                tmp.write(data_file.read())
    
                tmp_path = tmp.name
                with col1:
    
    DIANE's avatar
    DIANE committed
                    st.success("The data have been loaded successfully", icon="")
    
            os.unlink(tmp_path)
    
    DIANE's avatar
    DIANE committed
    ## Visualize spectra
    if not spectra.empty:
    
    DIANE's avatar
    DIANE committed
            # Draw the spectra and show the figure in the page
            fig = plot_spectra(spectra)

            st.pyplot(fig)

            # Keep a PNG copy of the figure.
            # NOTE(review): this file uses both "./Report/figures/" and "./Report/Figures/";
            # on a case-sensitive filesystem these are two different directories — confirm.
            fig.savefig("./Report/figures/Spectra_Plot.png")
    
    
    ############################## Exploratory data analysis ###############################
    
    container2 = st.container(border=True)
    container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
    scores, loadings, pc = st.columns([2, 3, 0.5])
    influence, hotelling, qexp = st.columns([2, 2, 1])
    
    DIANE's avatar
    DIANE committed
    st.header('Selected samples for chemical analysis')
    selected_s, selected_samples_metd = st.columns([3, 3])
    selected_s.write('Samples scores')
    
    DIANE's avatar
    DIANE committed
    # Options offered in the two select boxes; the code below compares the chosen
    # value against entries of these lists by position (index 1, 2, ...).
    # NOTE(review): no branch handling 'NMF' or 'AP' is visible in this file — confirm.
    dim_red_methods=['', 'PCA','UMAP', 'NMF']  # List of dimensionality reduction algos
    cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos

    dr_model = None # dimensionality reduction model
    cl_model = None # clustering model
    
    
    # Dimensionality reduction
    
    DIANE's avatar
    DIANE committed
    # Empty DataFrame instances (not the bare class), so `t.empty` / `p.empty`
    # are real booleans until a model fills them.
    t = pd.DataFrame() # scores
    p = pd.DataFrame() # loadings
    labels = []
    if not spectra.empty:
    
        dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
        clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
    
        xc = standardize(spectra)
    
    DIANE's avatar
    DIANE committed
    
    
        if dim_red_method == dim_red_methods[1]:
    
    DIANE's avatar
    DIANE committed
            dr_model = LinearPCA(xc, Ncomp=5)
    
        elif dim_red_method == dim_red_methods[2]:
    
            # When meta-data are available, let the user choose one of its columns
            # (all but the first) to pass to Umap as cat_data; otherwise pass None.
            if not meta_data.empty:
                # NOTE(review): `filter` shadows the Python builtin of the same name
                filter = meta_data.columns[1:]
                col = pc.selectbox('Supervised UMAP by:', options= filter, key=108)
                supervised = meta_data[col]
            else:
                supervised = None
            # NOTE(review): `imp` is only assigned in the CSV loading branch; it looks
            # undefined here when a .dx file was uploaded — confirm.
            dr_model = Umap(data_import = imp, numerical_data = MinMaxScale(spectra), cat_data = supervised)
    
        if dr_model:
            # Let the user choose which score columns are used as x, y and z axes
            score_columns = dr_model.scores_.columns
            axis1 = pc.selectbox("x-axis", options = score_columns, index=0)
            axis2 = pc.selectbox("y-axis", options = score_columns, index=1)
            axis3 = pc.selectbox("z-axis", options = score_columns, index=2)
            # Scores restricted to the three chosen columns, in x / y / z order
            t = pd.concat([dr_model.scores_.loc[:, axis] for axis in (axis1, axis2, axis3)], axis = 1)
    
    
    # clustering
    if not t.empty:
    
    DIANE's avatar
    DIANE committed
        tcr = standardize(t)
    
    DIANE's avatar
    DIANE committed
        if clus_method == cluster_methods[1]:
            # K-means: the user chooses the number of clusters (2 to 30, default 3)
            ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
            cl_model = Sk_Kmeans(tcr, max_clusters = 30)
            # Plot the model's inertia values in the scores column
            fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')

            scores.plotly_chart(fig2,use_container_width=True)

            # Render the inertia plot to PNG bytes and save them to disk
            img = pio.to_image(fig2, format="png")
            with open("./Report/figures/Elbow.png", "wb") as f:
                    f.write(img)    
    
    DIANE's avatar
    DIANE committed
            data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
    
    DIANE's avatar
    DIANE committed
    
        elif clus_method == cluster_methods[2]:

            # HDBSCAN is given the model's scores_raw_ attribute, not the standardized tcr
            optimized_hdbscan = Hdbscan(dr_model.scores_raw_)

            # HDBSCAN_scores_ unpacks into the cluster labels and a score.
            # NOTE(review): clu_centers is not assigned in this branch (only the K-means branch sets it).
            labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
    
    DIANE's avatar
    DIANE committed
    
    #####################################################################################################
    selec_strategy = ['center','random']
    samples_df_chem = pd.DataFrame()
    selected_samples = []
    # POSITIONAL (0-based) row numbers of the samples picked for chemical analysis.
    # Every consumer below indexes positionally (np.array(labels)[...], spectra.index[...],
    # .iloc[...]), so index labels must never be stored here.
    selected_samples_idx = []

    # len() instead of `if labels:`: truth-testing a multi-element numpy array raises ValueError
    if len(labels) > 0:
        selection = scores.radio('Select samples selection strategy:', options = selec_strategy)
        if selection == selec_strategy[0]:
            # clu_centers is only produced by the K-means branch; guard against a NameError
            if clus_method == cluster_methods[1]:
                # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
                closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
                selected_samples_idx = closest.tolist()
            else:
                scores.warning("The 'center' strategy needs cluster centers, which are only available with Kmeans.")
        elif selection == selec_strategy[1]:
            selection_number = scores.number_input('How many samples per cluster?', min_value = 1, step=1, value = 3)
            label_array = np.array(labels)  # converted once, outside the loop
            for i in np.unique(label_array):
                # positions (row numbers) of the samples belonging to cluster i
                C = np.where(label_array == i)[0]
                if C.shape[0] >= selection_number:
                    # split the cluster into `selection_number` sub-clusters and keep
                    # the sample closest to each sub-cluster center
                    km2 = KMeans(n_clusters = selection_number)
                    km2.fit(tcr.iloc[C,:])
                    clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
                    # map within-cluster positions back to positions in the full table
                    selected_samples_idx.extend(C[clos].tolist())
                else:
                    # cluster smaller than the requested number: keep all of its samples
                    selected_samples_idx.extend(C.tolist())
                # list indexes of selected samples for colored plot    
    
    # Show the selected samples (and their meta-data, when available).
    # len() instead of `if labels:`: truth-testing a multi-element numpy array raises ValueError
    if len(labels) > 0:
        if selected_samples_idx:
            selected_clusters = np.array(labels)[selected_samples_idx]
            selected_names = spectra.index[selected_samples_idx]
            sam = pd.DataFrame({'cluster': selected_clusters,
                                'index': selected_names})
            selected_s.write(sam)

            if not meta_data.empty:
                selected_samples_metd.write('Corresponding meta-data')
                # .copy(): the two column assignments below must not write into a
                # slice of meta_data (avoids pandas' SettingWithCopyWarning)
                meta = meta_data.iloc[selected_samples_idx,:].copy()
                meta['cluster'] = selected_clusters
                meta['index'] = selected_names
                selected_samples_metd.write(meta)
    
    
        ## Scores
    
    if not t.empty:
        with scores:
            st.write('Scores plot')
            # scores plot with clustering
    
            if list(labels) and meta_data.empty:
    
    DIANE's avatar
    DIANE committed
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
    
            # scores plot with metadata
    
            elif len(list(labels)) == 0 and not meta_data.empty:
                filter = meta_data.columns[1:]
    
                col = st.selectbox('Color by:', options= filter)
    
    DIANE's avatar
    DIANE committed
                    fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
    
    DIANE's avatar
    DIANE committed
                    fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
    
    
            # color with scores and metadata
            elif len(list(labels)) > 0  and not meta_data.empty:
                if clus_method in cluster_methods[1:]:
                    filter = ['None', clus_method]
                    filter.extend(meta_data.columns[1:])
    
                    filter = meta_data.columns[1:].insert(0,'None')
    
    
                col = st.selectbox('Color by:', options= filter)
    
                    fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
    
    DIANE's avatar
    DIANE committed
                    fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
    
    DIANE's avatar
    DIANE committed
                    fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])))
    
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
    
            fig.update_traces(marker=dict(size=4))
    
    DIANE's avatar
    DIANE committed
    
            if selected_samples_idx:
                # Overlay the selected samples on the 3-D scores plot as larger black markers
                tt = tcr.iloc[selected_samples_idx,:]
                highlight_marker = dict(size = 7, color = 'black')
                fig.add_scatter3d(
                    x = tt.loc[:,axis1],
                    y = tt.loc[:,axis2],
                    z = tt.loc[:,axis3],
                    mode ='markers',
                    marker = highlight_marker,
                    name = 'selected samples',
                )

            st.plotly_chart(fig, use_container_width=True)
    
            import plotly.express as px
    
    DIANE's avatar
    DIANE committed
            if labels:
                num_clusters = len(np.unique(labels))
    
    DIANE's avatar
    DIANE committed
                custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
                color_discrete_sequence=custom_color_palette
    
    DIANE's avatar
    DIANE committed
                # Build the Axis1-Axis2 scatter plot (titled, small markers) for PNG export
                point_colors = labels if list(labels) else None
                fig_axe1_axe2 = (
                    px.scatter(tcr, x=axis1, y=axis2, color=point_colors, color_discrete_sequence=custom_color_palette)
                    .update_layout(title='Axe1-Axe2')
                    .update_traces(marker=dict(size=4))
                )
    
    Mouhcine's avatar
    Mouhcine committed
                fig_axe1_axe2.write_image("./Report/Figures/plot_axe1_axe2.png")
    
    DIANE's avatar
    DIANE committed
                # Build the Axis1-Axis3 scatter plot (titled, small markers) for PNG export
                point_colors = labels if list(labels) else None
                fig_axe1_axe3 = (
                    px.scatter(tcr, x=axis1, y=axis3, color=point_colors, color_discrete_sequence=custom_color_palette)
                    .update_layout(title='Axe1-Axe3')
                    .update_traces(marker=dict(size=4))
                )
    
    Mouhcine's avatar
    Mouhcine committed
                fig_axe1_axe3.write_image("./Report/Figures/plot_axe1_axe3.png")
    
    DIANE's avatar
    DIANE committed
                # Build the Axis2-Axis3 scatter plot (titled, small markers) for PNG export
                point_colors = labels if list(labels) else None
                fig_axe2_axe3 = (
                    px.scatter(tcr, x=axis2, y=axis3, color=point_colors, color_discrete_sequence=custom_color_palette)
                    .update_layout(title='Axe2-Axe3')
                    .update_traces(marker=dict(size=4))
                )
    
    Mouhcine's avatar
    Mouhcine committed
                fig_axe2_axe3.write_image("./Report/Figures/plot_axe2_axe3.png")
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
    
        if dim_red_method == dim_red_methods[1]:
    
            with loadings:
                st.write('Loadings plot')
                p = dr_model.loadings_

                # Append a 0..n-1 counter column ('wl') that serves as the x-axis,
                # then reshape to long format: one row per (wl, variable) pair
                wl_column = pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])
                pp = pd.concat([p, wl_column], axis=1)
                df1 = pp.melt(id_vars="wl")

                # One line per loading vector, legend anchored at the bottom right
                legend_style = dict(
                    x=1,
                    y=0,
                    font=dict(family="Courier", size=12, color="black"),
                    bordercolor="black",
                    borderwidth=2,
                )
                fig = px.line(df1, x='wl', y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
                fig.update_layout(legend=legend_style)
                st.plotly_chart(fig, use_container_width=True)

                # Export the figure as PNG
                img = pio.to_image(fig, format="png")
                with open("./Report/figures/graphe_loadings.png", "wb") as f:
                    f.write(img)
    
    
            with influence:
                st.write('Influence plot')
                ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
                leverage = dr_model.leverage_
                residuals = dr_model.residuals_
    
    Mouhcine's avatar
    Mouhcine committed
                fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1], color_continuous_scale='Blues')
                fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
    
                st.plotly_chart(fig)
    
                img = pio.to_image(fig, format="png")
    
    Mouhcine's avatar
    Mouhcine committed
                # Write the PNG bytes rendered just above (img) to disk; the `with`
                # statement had no body, which is a syntax error and left the file unwritten.
                with open("./Report/figures/graphe_influence.png", "wb") as f:
                    f.write(img)
    
            with hotelling:

                st.write('T²-Hotelling vs Q residuals plot')
                ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)

                # Fetched once (the value was previously assigned twice in a row).
                # NOTE(review): this rebinds `hotelling`, which until here named the
                # layout column entered by the enclosing `with`; consider a distinct name.
                hotelling = dr_model.hotelling_
                # `residuals` is the value read from dr_model.residuals_ in the influence section
                fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
                st.plotly_chart(fig, use_container_width=True)
                fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
    
    Mouhcine's avatar
    Mouhcine committed
    
    
        if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN
            with loadings: # Display some clustering metrics
                st.write('Clustering metrics:')
                # Label -1 is what this code counts as "unclassified"; it is not a cluster.
                # discard() instead of remove(): no KeyError when every sample was classified.
                clusters_number = set(labels)
                clusters_number.discard(-1)
                # Compare through an array so the count is also right when labels is a
                # plain list (`labels == -1` on a list is a single bool, not a mask)
                n_unclassified = int((np.asarray(labels) == -1).sum())
                n_samples = len(t)
                st.write('Optimal number of clusters = ' + str(len(clusters_number)))
                st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
                st.write('Unclassified samples: ' + str(n_unclassified) + ' on ' + str(n_samples) + ' samples (' + str(round(n_unclassified/n_samples*100, 1)) + '%).')