# 1-samples_selection.py

# Star-import of the shared names used throughout this page; `st` must come from
# here because it is used on the next line, before Modules is imported.
from Packages import *
# NOTE(review): presumably configured before `from Modules import *` because
# Streamlit expects set_page_config to be the first Streamlit command of a page —
# confirm whether Modules issues Streamlit calls at import time.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *

# HTML for the "CEFE - CNRS" banner (legacy inline version kept for reference;
# presumably superseded by add_header() below — confirm before deleting)
# bandeau_html = """
# <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
#   <h1 style="text-align: center; color: white;">CEFE - CNRS</h1>
# </div>
# """
# # Inject the banner HTML
# st.markdown(bandeau_html, unsafe_allow_html=True)
add_header()


# Guarantee that the "interface" key exists in the session state (None when it
# was never set) and hide the "Predictions" page in the simple interface.
interface_mode = st.session_state.get('interface')
st.session_state["interface"] = interface_mode
if interface_mode == 'simple':
    hide_pages("Predictions")

################################### I - Data Loading and Visualization ########################################
# Two-column layout. st.columns returns the columns in order, so col2 is the
# first column (relative width 3, spectra visualisation) and col1 the second
# one (relative width 1, data loading widgets).
col2, col1 = st.columns([3, 1])
col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue')


## Preallocation of data structures
# Empty DataFrame *instances* (not the pd.DataFrame class itself): `.empty` on
# the class is a property object, which is always truthy and only made the
# `if not spectra.empty` guards work by accident.
spectra = pd.DataFrame()            # spectral matrix (filled once a file is loaded)
meta_data = pd.DataFrame()          # metadata table (only assigned by the .dx loader)
selected_samples = pd.DataFrame()
non_clustered = None                # set by the HDBSCAN clustering branch
colnames = []                       # column labels of `spectra`
rownames = []                       # row labels of `spectra`, as strings

# Upload widget for the spectral data file (csv or dx), shown in the
# data-loading column
data_file = col1.file_uploader(
    "Load NIRS Data",
    type=["csv","dx"],
    help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
    key=5,
)

if data_file:
    # Retrieve the extension of the file. splitext keeps only the text after the
    # LAST dot, so a name such as "run.2024.csv" is still recognised, and
    # lower-casing lets ".CSV" / ".DX" match the comparisons below.
    test = os.path.splitext(data_file.name)[1].lower()

    ## Load .csv file
    if test == '.csv':
        with col1:
            # NOTE(review): detection reads 'data/<uploaded name>' from disk, not
            # the uploaded buffer — confirm that this file is expected to exist.
            csv_path = 'data/' + data_file.name

            # Radio for the CSV delimiter, preselected on the detected one.
            # Fall back to the first option when the detected value is not offered
            # (list.index would otherwise raise ValueError).
            sep_options = [";", ","]
            detected_sep = str(find_delimiter(csv_path))
            sep_index = sep_options.index(detected_sep) if detected_sep in sep_options else 0
            psep = st.radio("Select csv separator - _detected_: " + detected_sep,
                            options=sep_options, index=sep_index, key=9)

            # Radio telling whether the first CSV column holds the row indexes
            hdr_options = ["no", "yes"]
            detected_hdr = str(find_col_index(csv_path))
            hdr_index = hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0
            phdr = st.radio("indexes column in csv? - _detected_: " + detected_hdr,
                            options=hdr_options, index=hdr_index, key=31)

            # index_col=0 -> first column becomes the index; False -> no index column
            col = 0 if phdr == 'yes' else False
            imp = pd.read_csv(data_file, sep=psep, index_col=col)
            # Only `spectra` and `md_df_st_` are produced here; `meta_data` is
            # left untouched for csv uploads.
            spectra, md_df_st_ = col_cat(imp)
            st.success("The data have been loaded successfully", icon="✅")

    ## Load .dx file
    elif test == '.dx':
        # read_dx is given a file path, so the upload is first written to a
        # temporary file on disk.
        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
            tmp.write(data_file.read())
            tmp_path = tmp.name
        # The handle is closed at this point, so every byte is flushed before
        # read_dx reopens the file by name.
        try:
            with col1:
                _, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path)
                st.success("The data have been loaded successfully", icon="✅")
        finally:
            # Always remove the temporary file, even when parsing fails
            os.unlink(tmp_path)


## Visualize spectra
if not spectra.empty:
    # Keep the column labels and force the row labels to strings
    colnames = list(spectra.columns)
    rownames = [str(row_label) for row_label in spectra.index]
    spectra.index = rownames

    with col2:
        if test == '.dx':
            # Axis titles come from the .dx metadata
            x_unit = meta_data.loc[:,'xunits'][0]
            lab = 'Wavenumber (1/cm)' if x_unit == '1/cm' else 'Wavelength (nm)'
            axis_titles = dict(xunits = lab, yunits = meta_data.loc[:,'yunits'][0])
        else:
            # No unit information is available for csv uploads: generic titles
            axis_titles = dict(xunits = 'Wavelength/Wavenumber', yunits = 'Signal intensity')
        fig = plot_spectra(spectra, **axis_titles)

        # Show the figure and export it for the report
        st.pyplot(fig)
        fig.savefig("./Report/figures/Spectra_Plot.png")


############################## Exploratory data analysis ###############################
# Layout placeholders are created up-front, in display order; the sections below
# fill them with widgets and plots.
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
# First row: scores plot / loadings plot / narrow column holding the method and axis selectors
scores, loadings, pc = st.columns([2, 3, 0.5])
# Second row: influence plot / T²-Hotelling plot / `qexp` (not used in the code visible here)
influence, hotelling, qexp = st.columns([2, 2, 1])
st.header('Selected samples for chemical analysis', divider='blue')
# `selected_s` displays the selected subset; `selected_samples_metd` is not used in the code visible here
selected_s, selected_samples_metd = st.columns([3, 3])

# The leading '' entry of each list is the "nothing selected" choice: the code
# below only compares against indexes 1..3.
dim_red_methods=['', 'PCA','UMAP', 'NMF']  # List of dimensionality reduction algorithms
cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algorithms

dr_model = None # dimensionality reduction model (stays None until a method is chosen)
cl_model = None # clustering model (set by the Kmeans and AP branches)

###### 1- Dimensionality reduction ######
# Empty DataFrame instances (not the class) so that `t.empty` is a real boolean
t = pd.DataFrame() # scores restricted to the three axes chosen by the user
p = pd.DataFrame() # loadings
labels = []        # cluster labels, filled by the clustering section
if not spectra.empty:
    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
    xc = standardize(spectra, center=True, scale=False)

    # PCA
    if dim_red_method == dim_red_methods[1]:
        dr_model = LinearPCA(xc, Ncomp=8)

    # UMAP, optionally supervised by one metadata column
    elif dim_red_method == dim_red_methods[2]:
        # NOTE(review): `meta_data` is only assigned by the .dx loader, so csv
        # uploads always take the unsupervised path here — confirm this is intended.
        if not meta_data.empty:
            md_columns = md_df_st_.columns
            col = pc.selectbox('Supervised UMAP by:', options= md_columns, key=108)
            supervised = md_df_st_[col]
        else:
            supervised = None
        dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)

    # NMF
    elif dim_red_method == dim_red_methods[3]:
        dr_model = Nmf(spectra, Ncomp= 3)

    # Explicit None check: do not depend on the truthiness of the model object
    if dr_model is not None:
        # The default indexes 0, 1, 2 require at least three score columns
        score_columns = dr_model.scores_.columns
        axis1 = pc.selectbox("x-axis", options = score_columns, index=0)
        axis2 = pc.selectbox("y-axis", options = score_columns, index=1)
        axis3 = pc.selectbox("z-axis", options = score_columns, index=2)
        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)


###### II - clustering #######
if not t.empty:
    # Output of standardize() on the selected score axes; reused by the
    # selection and plotting sections
    tcr = standardize(t)

    # 1- K-MEANS clustering
    if clus_method == cluster_methods[1]:
        cl_model = Sk_Kmeans(tcr, max_clusters = 25)
        ncluster = scores.number_input(label = 'Select the desired number of clusters',
                                       min_value=2, max_value=25,
                                       value=cl_model.suggested_n_clusters_)
        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
        scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}")
        scores.plotly_chart(fig2,use_container_width=True)
        # Export the inertia plot for the report
        img = pio.to_image(fig2, format="png")
        with open("./Report/figures/Elbow.png", "wb") as elbow_png:
            elbow_png.write(img)
        data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)

    # 2- HDBSCAN clustering
    elif clus_method == cluster_methods[2]:
        # NOTE(review): this branch is fed the raw scores `t`, whereas the other
        # branches use `tcr` — confirm against the Hdbscan implementation.
        optimized_hdbscan = Hdbscan(np.array(t))
        all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
        # -1 marks samples left out of every cluster; other ids are shown 1-based
        labels = []
        for cluster_id in all_labels:
            labels.append('Non clustered' if cluster_id == -1 else f'cluster#{cluster_id+1}')
        label_arr = np.array(labels)
        # `non_clustered` keeps the raw np.where tuple, `clustered` the position array
        non_clustered = np.where(label_arr == 'Non clustered')
        clustered = np.where(label_arr != 'Non clustered')[0]

    # 3- Affinity propagation
    elif clus_method == cluster_methods[3]:
        cl_model = AP(X = tcr)
        data, labels, clu_centers = cl_model.fit_optimal_


###### III - Samples selection using the reduced data presentation ######
selec_strategy = ['center','random']
samples_df_chem = pd.DataFrame()
selected_samples = []
# Row POSITIONS (relative to the full data set) of the selected samples. Every
# consumer below indexes positionally (spectra.index[...], labels[...], .iloc).
selected_samples_idx = []


# len() rather than truthiness: `labels` may be a list or a NumPy array, and
# `if array:` raises for arrays holding more than one element.
if len(labels) > 0:
    labels_arr = np.array(labels)
    if clus_method:
        selection = scores.radio('Select samples selection strategy:',
                                 options = selec_strategy)

    # Positions of the samples that may be selected. With HDBSCAN only the
    # clustered samples are candidates; `tcr` itself is left untouched so that
    # positions stay aligned with `labels`, `spectra` and `meta_data`.
    if clus_method == cluster_methods[2]:
        candidates = np.asarray(clustered)
    else:
        candidates = np.arange(tcr.shape[0])
    candidate_labels = labels_arr[candidates]

    if selection == selec_strategy[0]:
        # One sample per cluster: the candidate closest to each cluster centre.
        # Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
        closest, _ = pairwise_distances_argmin_min(clu_centers, tcr.iloc[candidates,:])
        # Map positions within the candidate subset back to positions in the full data
        selected_samples_idx = candidates[closest].tolist()

    elif selection == selec_strategy[1]:
        selection_number = scores.number_input('How many samples per cluster?',
                                                min_value = 1, step=1, value = 3)
        for i in np.unique(candidate_labels):
            # Full-data positions of the members of cluster i
            C = candidates[candidate_labels == i]
            if C.shape[0] >= selection_number:
                # Split the cluster into `selection_number` sub-clusters and keep
                # the member closest to each sub-centre
                km2 = KMeans(n_clusters = selection_number)
                km2.fit(tcr.iloc[C,:])
                clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
                selected_samples_idx.extend(C[clos].tolist())
            else:
                # Cluster smaller than the requested number: keep all its members
                selected_samples_idx.extend(C.tolist())

    if selected_samples_idx:
        if meta_data.empty:
            sam = pd.DataFrame({'name': spectra.index[selected_samples_idx],
                                'cluster': labels_arr[selected_samples_idx]},
                                index = selected_samples_idx)
        else:
            # .copy(): insert() must not operate on a slice of meta_data
            sam = meta_data.iloc[selected_samples_idx,:].copy()
            sam.insert(loc=0, column='index', value=selected_samples_idx)
            sam.insert(loc=1, column='cluster', value=labels_arr[selected_samples_idx])

        # 1-based numbering for display
        sam.index = np.arange(len(selected_samples_idx))+1

        selected_s.write(f' The selected subset consists of {sam.shape[0]} samples')
        selected_s.write(sam)
        # NOTE(review): the checkbox value is not read anywhere in this section
        selected_s.checkbox("Include non clustered samples (for HDBSCAN clustering)")
        
################################      Plots visualization          ############################################
## Scores
# NOTE(review): this section writes to "./Report/Figures/" while other sections
# of the file write to "./Report/figures/" — confirm the directory name on
# case-sensitive file systems.
if not t.empty:
    with scores:
        # 2x2 matplotlib grid exported for the report; `fig` below is the
        # interactive 3D plotly figure shown in the app.
        fig1, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2,2)
        st.write('Scores plot')
        # len() works for both lists and NumPy arrays
        has_labels = len(labels) > 0

        # scores plot with clustering only
        if has_labels and meta_data.empty:
            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
            sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1)

        # scores plot with metadata only
        elif not has_labels and not meta_data.empty:
            color_options = md_df_st_.columns
            col = st.selectbox('Color by:', options= color_options)
            md_hue = list(map(str.lower,md_df_st_[col]))
            # NOTE(review): `col` is a column label, so this branch only runs for
            # a column literally named 0 — confirm the intended condition.
            if col == 0:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
                sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1)
                sns.scatterplot(data = tcr, x = axis2, y =axis3 , ax = ax2)
                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = md_hue, ax = ax3)
            else:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = md_hue)
                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = md_hue, ax = ax1)
                sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = md_hue, ax = ax2)
                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = md_hue, ax = ax3)

        # color with clusters or metadata
        elif has_labels and not meta_data.empty:
            if clus_method in cluster_methods[1:]:
                color_options = ['None', clus_method]
                color_options.extend(md_df_st_.columns)
            else:
                color_options = md_df_st_.columns.insert(0,'None')

            col = st.selectbox('Color by:', options= color_options)
            if col == "None":
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
                sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1)
            elif col == clus_method:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
                # Same colouring as the 3D figure (the hue was missing here)
                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1)
            else:
                md_hue = list(map(str.lower,md_df_st_[col]))
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = md_hue)
                # One axis pair per panel, as in the metadata-only branch (all
                # three panels used to repeat the axis1/axis2 pair)
                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = md_hue, ax = ax1)
                sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = md_hue, ax = ax2)
                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = md_hue, ax = ax3)

        # neither clusters nor metadata
        else:
            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
            sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1)
        fig.update_traces(marker=dict(size=4))

        # Overlay the selected samples as black markers
        if selected_samples_idx:
            tt = tcr.iloc[selected_samples_idx,:]
            fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],
                              z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 5, color = 'black'),
                              name = 'selected samples')

        # Save the matplotlib grid, then close it: the script reruns on every
        # interaction and unclosed pyplot figures would accumulate in memory.
        fig1.savefig("./Report/Figures/test.png")
        plt.close(fig1)
        st.plotly_chart(fig, use_container_width=True)

        if has_labels:
            num_clusters = len(np.unique(labels))
            custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]

            # Create and export one 2D scatter plot per pair of axes as PNG
            pair_exports = (
                (axis1, axis2, 'Axe1-Axe2', "./Report/Figures/plot_axe1_axe2.png"),
                (axis1, axis3, 'Axe1-Axe3', "./Report/Figures/plot_axe1_axe3.png"),
                (axis2, axis3, 'Axe2-Axe3', "./Report/Figures/plot_axe2_axe3.png"),
            )
            for x_axis, y_axis, pair_title, png_path in pair_exports:
                pair_fig = px.scatter(tcr, x=x_axis, y=y_axis, color=labels,
                                      color_discrete_sequence=custom_color_palette)
                pair_fig.update_layout(title=pair_title)
                pair_fig.update_traces(marker=dict(size=4))
                pair_fig.write_image(png_path)


if not spectra.empty:
    # Loadings plot: only for PCA and NMF
    if dim_red_method in (dim_red_methods[1], dim_red_methods[3]):
        with loadings:
            st.write('Loadings plot')
            p = dr_model.loadings_
            # One-column frame holding the spectral axis values, aligned on the loadings index
            freq = pd.DataFrame(colnames, index=p.index)

            # Axis title, and axis direction: reversed when the .dx metadata
            # reports wavenumbers (1/cm)
            if test == '.dx' and meta_data.loc[:,'xunits'][0] == '1/cm':
                xlab = "Wavenumber (1/cm)"
                inv = 'reversed'
            elif test == '.dx':
                xlab = 'Wavelength (nm)'
                inv = None
            else:
                xlab = 'Wavelength/Wavenumber'
                inv = None
            # The frequency column carries the same text as the axis title
            freq.columns = [xlab]

            pp = pd.concat([p, freq], axis=1)
            #########################################
            # Long format: one row per (frequency, component) pair
            df1 = pp.melt(id_vars=freq.columns)
            fig = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
            fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
                                        bordercolor="black", borderwidth=2))
            fig.update_layout(xaxis_title = xlab,yaxis_title = "Intensity" ,xaxis = dict(autorange= inv))

            st.plotly_chart(fig, use_container_width=True)

            # Export the plot for the report
            img = pio.to_image(fig, format="png")
            with open("./Report/figures/graphe_loadings.png", "wb") as f:
                f.write(img)

    # Influence and T²-Hotelling plots: PCA only
    if dim_red_method == dim_red_methods[1]:
        with influence:
            st.write('Influence plot')
            # Default index=3 requires at least four score columns
            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
            leverage = dr_model.leverage_
            residuals = dr_model.residuals_
            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1], color_continuous_scale='Blues')
            fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
            st.plotly_chart(fig, use_container_width=True)
            img = pio.to_image(fig, format="png")
            with open("./Report/figures/graphe_influence.png", "wb") as f:
                f.write(img)


        with hotelling:
            st.write('T²-Hotelling vs Q residuals plot')
            # Default index=4 requires at least five score columns
            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)

            # Separate name so that the `hotelling` column container is not overwritten
            hotelling_t2 = dr_model.hotelling_
            fig = px.scatter(t, x=hotelling_t2[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
            st.plotly_chart(fig, use_container_width=True)
            fig.write_image("./Report/figures/graphe_hotelling.png", format="png")