from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *

# HTML for the "CEFE - CNRS" banner
# bandeau_html = """
# <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
#   <h1 style="text-align: center; color: white;">CEFE - CNRS</h1>
# </div>
# """
# # Inject the banner HTML
# st.markdown(bandeau_html, unsafe_allow_html=True)
add_header()

st.session_state["interface"] = st.session_state.get('interface')
if st.session_state["interface"] == 'simple':
    hide_pages("Predictions")

################################### I - Data Loading and Visualization ########################################
col2, col1 = st.columns([3, 1])
col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue')

## Preallocation of data structures
spectra = pd.DataFrame()
meta_data = pd.DataFrame()
selected_samples = pd.DataFrame()
colnames = []
rownames = []

# Loader for the data file
data_file = col1.file_uploader("Load NIRS Data", type=["csv", "dx"],
                               help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)

if data_file:
    # Retrieve the extension of the file
    test = data_file.name[data_file.name.find('.'):]

    ## Load .csv file
    if test == '.csv':
        with col1:
            # Select list for the CSV delimiter
            psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/' + data_file.name)),
                            options=[";", ","],
                            index=[";", ","].index(str(find_delimiter('data/' + data_file.name))), key=9)
            # Select list for the CSV index column: yes / no
            phdr = st.radio("Index column in the csv? - _detected_: " + str(find_col_index('data/' + data_file.name)),
                            options=["no", "yes"],
                            index=["no", "yes"].index(str(find_col_index('data/' + data_file.name))), key=31)
            if phdr == 'yes':
                col = 0
            else:
                col = False
            imp = pd.read_csv(data_file, sep=psep, index_col=col)
            # spectra = col_cat(imp)[0]
            # meta_data = col_cat(imp)[1]
            spectra, meta_data = col_cat(imp)
            st.success("The data have been loaded successfully", icon="✅")

    ## Load .dx file
    elif test == '.dx':
        # Create a temporary file to save the uploaded file
        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
            tmp.write(data_file.read())
            tmp_path = tmp.name
        with col1:
            _, spectra, meta_data = read_dx(file=tmp_path)
            st.success("The data have been loaded successfully", icon="✅")
        os.unlink(tmp_path)

## Visualize spectra
if not spectra.empty:
    # Retrieve the column names and row names of the spectra
    colnames = list(spectra.columns)
    rownames = [str(i) for i in list(spectra.index)]
    spectra.index = rownames

    with col2:
        if test == '.dx':
            if meta_data.loc[:, 'xunits'][0] == '1/cm':
                lab = 'Wavenumber (1/cm)'
            else:
                lab = 'Wavelength (nm)'
            fig = plot_spectra(spectra, xunits=lab, yunits=meta_data.loc[:, 'yunits'][0])
        else:
            fig = plot_spectra(spectra, xunits='Wavelength/Wavenumber', yunits='Signal intensity')
        st.pyplot(fig)
        fig.savefig("./Report/figures/Spectra_Plot.png")

############################## Exploratory data analysis ###############################
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis - Multivariate Data Analysis", divider='blue')
scores, loadings, pc = st.columns([2, 3, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])
st.header('Selected samples for chemical analysis', divider='blue')
selected_s, selected_samples_metd = st.columns([3, 3])

dim_red_methods = ['', 'PCA', 'UMAP', 'NMF']       # list of dimensionality reduction algos
cluster_methods = ['', 'Kmeans', 'HDBSCAN', 'AP']  # list of clustering algos
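# The exploratory analysis below proceeds in three steps (sections 1 to 3):
#   1- reduce the spectra to a low-dimensional space (PCA, UMAP or NMF),
#   2- cluster the resulting scores (Kmeans, HDBSCAN or AP),
#   3- select representative samples from each cluster for chemical analysis.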
dr_model = None  # dimensionality reduction model
cl_model = None  # clustering model

###### 1- Dimensionality reduction ######
t = pd.DataFrame()  # scores
p = pd.DataFrame()  # loadings
labels = []

if not spectra.empty:
    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options=dim_red_methods, key=37)
    clus_method = pc.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
    xc = standardize(spectra)

    if dim_red_method == dim_red_methods[1]:
        dr_model = LinearPCA(xc, Ncomp=8)
    elif dim_red_method == dim_red_methods[2]:
        if not meta_data.empty:
            filter = meta_data.columns[1:]
            col = pc.selectbox('Supervised UMAP by:', options=filter, key=108)
            supervised = meta_data[col]
        else:
            supervised = None
        dr_model = Umap(numerical_data=MinMaxScale(spectra), cat_data=supervised)
    elif dim_red_method == dim_red_methods[3]:
        dr_model = Nmf(spectra, Ncomp=3)

    if dr_model:
        axis1 = pc.selectbox("x-axis", options=dr_model.scores_.columns, index=0)
        axis2 = pc.selectbox("y-axis", options=dr_model.scores_.columns, index=1)
        axis3 = pc.selectbox("z-axis", options=dr_model.scores_.columns, index=2)
        t = pd.concat([dr_model.scores_.loc[:, axis1],
                       dr_model.scores_.loc[:, axis2],
                       dr_model.scores_.loc[:, axis3]], axis=1)

###### 2- Clustering ######
if not t.empty:
    tcr = standardize(t)

    # Clustering
    if clus_method == cluster_methods[1]:
        cl_model = Sk_Kmeans(tcr, max_clusters=25)
        ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_,
                                       label='Select the desired number of clusters')
        fig2 = px.scatter(cl_model.inertia_.T, y='inertia')
        scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}")
        scores.plotly_chart(fig2, use_container_width=True)
        img = pio.to_image(fig2, format="png")
        with open("./Report/figures/Elbow.png", "wb") as f:
            f.write(img)
        data, labels, clu_centers = cl_model.fit_optimal(nclusters=ncluster)
    elif clus_method == cluster_methods[2]:
        optimized_hdbscan = Hdbscan(np.array(t))
        labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
        non_clustered = np.where(labels == -1)
        labels[non_clustered] = 1000  # relabel non-clustered samples
        labels = labels.tolist()
    elif clus_method == cluster_methods[3]:
        cl_model = AP(X=tcr)
        data, labels, clu_centers = cl_model.fit_optimal_

###### 3- Sample selection using the reduced data representation ######
selec_strategy = ['center', 'random']
samples_df_chem = pd.DataFrame()
selected_samples = []
selected_samples_idx = []

if labels:
    selection = scores.radio('Select samples selection strategy:', options=selec_strategy)
    if selection == selec_strategy[0]:
        # List samples at cluster centers - use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
        closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
        selected_samples_idx = list(closest)
    elif selection == selec_strategy[1]:
        selection_number = scores.number_input('How many samples per cluster?', min_value=1, step=1, value=3)
        for i in np.unique(labels):
            C = np.where(np.array(labels) == i)[0]
            if C.shape[0] >= selection_number:
                # scores.write(list(tcr.index)[labels == i])
                km2 = KMeans(n_clusters=selection_number)
                km2.fit(tcr.iloc[C, :])
                clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C, :])
                # Keep positional indices so that the .iloc-based selections below stay valid
                selected_samples_idx.extend(C[clos].tolist())
            else:
                selected_samples_idx.extend(C.tolist())

# List indexes of selected samples for the colored plot
if selected_samples_idx:
    if meta_data.empty:
        sam = pd.DataFrame({'name': spectra.index[selected_samples_idx],
                            'cluster': np.array(labels)[selected_samples_idx]},
                           index=selected_samples_idx)
    else:
        sam = meta_data.iloc[selected_samples_idx, :]
        sam.insert(loc=0, column='index', value=selected_samples_idx)
        sam.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx])
    sam.index = np.arange(len(selected_samples_idx)) + 1
    selected_s.write(f'The selected subset consists of {sam.shape[0]} samples')
    selected_s.write(sam)

################################ Plots visualization ############################################

## Scores
if not t.empty:
    with scores:
        fig1, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
        st.write('Scores plot')

        # Scores plot colored by clustering
        if list(labels) and meta_data.empty:
            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3, color=labels)
            sns.scatterplot(data=tcr, x=axis1, y=axis2, hue=labels, ax=ax1)

        # Scores plot colored by metadata
        elif len(list(labels)) == 0 and not meta_data.empty:
            filter = meta_data.columns[1:]
            col = st.selectbox('Color by:', options=filter)
            if col == 0:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3)
                sns.scatterplot(data=tcr, x=axis1, y=axis2, ax=ax1)
                sns.scatterplot(data=tcr, x=axis2, y=axis3, ax=ax2)
                sns.scatterplot(data=tcr, x=axis1, y=axis3, ax=ax3)
            else:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3, color=list(map(str.lower, meta_data[col])))
                sns.scatterplot(data=tcr, x=axis1, y=axis2, hue=list(map(str.lower, meta_data[col])), ax=ax1)
                sns.scatterplot(data=tcr, x=axis2, y=axis3, hue=list(map(str.lower, meta_data[col])), ax=ax2)
                sns.scatterplot(data=tcr, x=axis1, y=axis3, hue=list(map(str.lower, meta_data[col])), ax=ax3)

        # Scores plot colored by clustering or metadata
        elif len(list(labels)) > 0 and not meta_data.empty:
            if clus_method in cluster_methods[1:]:
                filter = ['None', clus_method]
                filter.extend(meta_data.columns[1:])
            else:
                filter = meta_data.columns[1:].insert(0, 'None')

            col = st.selectbox('Color by:', options=filter)
            if col == "None":
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3)
                sns.scatterplot(data=tcr, x=axis1, y=axis2, ax=ax1)
            elif col == clus_method:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3, color=labels)
                sns.scatterplot(data=tcr, x=axis1, y=axis2, ax=ax1)
            else:
                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3, color=list(map(str.lower, meta_data[col])))
                sns.scatterplot(data=tcr, x=axis1, y=axis2, hue=list(map(str.lower, meta_data[col])), ax=ax1)
                sns.scatterplot(data=tcr, x=axis2, y=axis3, hue=list(map(str.lower, meta_data[col])), ax=ax2)
                sns.scatterplot(data=tcr, x=axis1, y=axis3, hue=list(map(str.lower, meta_data[col])), ax=ax3)
        else:
            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z=axis3)
            sns.scatterplot(data=tcr, x=axis1, y=axis2, ax=ax1)

        fig.update_traces(marker=dict(size=4))

        # Highlight the selected samples on the 3D scores plot
        if selected_samples_idx:
            tt = tcr.iloc[selected_samples_idx, :]
            fig.add_scatter3d(x=tt.loc[:, axis1], y=tt.loc[:, axis2], z=tt.loc[:, axis3],
                              mode='markers', marker=dict(size=7, color='black'),
                              name='selected samples')

        plt.savefig("./Report/Figures/test.png")
        st.plotly_chart(fig, use_container_width=True)
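# Export 2-D projections of the scores (Axis1 vs Axis2, Axis1 vs Axis3, Axis2 vs Axis3)
# as static PNG files so they can be embedded in the generated report.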
if labels:
    num_clusters = len(np.unique(labels))
    custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]

    # Create and export the Axis1-Axis2 plot as a PNG
    fig_axe1_axe2 = px.scatter(tcr, x=axis1, y=axis2,
                               color=labels if list(labels) else None,
                               color_discrete_sequence=custom_color_palette)
    fig_axe1_axe2.update_layout(title='Axis1-Axis2')
    fig_axe1_axe2.update_traces(marker=dict(size=4))
    fig_axe1_axe2.write_image("./Report/Figures/plot_axe1_axe2.png")

    # Create and export the Axis1-Axis3 plot as a PNG
    fig_axe1_axe3 = px.scatter(tcr, x=axis1, y=axis3,
                               color=labels if list(labels) else None,
                               color_discrete_sequence=custom_color_palette)
    fig_axe1_axe3.update_layout(title='Axis1-Axis3')
    fig_axe1_axe3.update_traces(marker=dict(size=4))
    fig_axe1_axe3.write_image("./Report/Figures/plot_axe1_axe3.png")

    # Create and export the Axis2-Axis3 plot as a PNG
    fig_axe2_axe3 = px.scatter(tcr, x=axis2, y=axis3,
                               color=labels if list(labels) else None,
                               color_discrete_sequence=custom_color_palette)
    fig_axe2_axe3.update_layout(title='Axis2-Axis3')
    fig_axe2_axe3.update_traces(marker=dict(size=4))
    fig_axe2_axe3.write_image("./Report/Figures/plot_axe2_axe3.png")

## Loadings, influence and Hotelling plots
if not spectra.empty:
    if dim_red_method == dim_red_methods[1] or dim_red_method == dim_red_methods[3]:
        with loadings:
            st.write('Loadings plot')
            p = dr_model.loadings_
            freq = pd.DataFrame(colnames, index=p.index)
            if test == '.dx':
                if meta_data.loc[:, 'xunits'][0] == '1/cm':
                    freq.columns = ['Wavenumber (1/cm)']
                else:
                    freq.columns = ['Wavelength (nm)']
            else:
                freq.columns = ['Wavelength/Wavenumber']

            pp = pd.concat([p, freq], axis=1)
            #########################################
            df1 = pp.melt(id_vars=freq.columns)
            fig = px.line(df1, x=freq.columns[0], y='value', color='variable',
                          color_discrete_sequence=px.colors.qualitative.Plotly)
            fig.update_layout(legend=dict(x=1, y=0,
                                          font=dict(family="Courier", size=12, color="black"),
                                          bordercolor="black", borderwidth=2))
            st.plotly_chart(fig, use_container_width=True)

            # Export the loadings plot
            img = pio.to_image(fig, format="png")
            with open("./Report/figures/graphe_loadings.png", "wb") as f:
                f.write(img)

    if dim_red_method == dim_red_methods[1]:
        with influence:
            st.write('Influence plot')
            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
            leverage = dr_model.leverage_
            residuals = dr_model.residuals_
            fig = px.scatter(x=leverage[ax1], y=residuals[ax1],
                             color=leverage[ax1] * residuals[ax1],
                             color_continuous_scale='Blues')
            fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
            st.plotly_chart(fig, use_container_width=True)
            img = pio.to_image(fig, format="png")
            with open("./Report/figures/graphe_influence.png", "wb") as f:
                f.write(img)

        with hotelling:
            st.write('T²-Hotelling vs Q residuals plot')
            hotelling = dr_model.hotelling_
            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²", yaxis_title="Residuals")
            st.plotly_chart(fig, use_container_width=True)
            fig.write_image("./Report/figures/graphe_hotelling.png", format="png")

    if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]:
        # UMAP clustered by HDBSCAN
        with loadings:
            # Display some clustering metrics
            st.write('Clustering metrics:')
            clusters_number = set(labels)
            clusters_number.discard(1000)  # non-clustered samples were relabelled 1000 above
            st.write('Optimal number of clusters = ' + str(len(clusters_number)))
            st.write('DBCV score (-1 to 1, higher is better) = ' + str(round(hdbscan_score, 3)))
            non_clustered_count = labels.count(1000)
            st.write('Unclassified samples: ' + str(non_clustered_count) + ' out of ' + str(len(t)) +
                     ' samples (' + str(round(non_clustered_count / len(t) * 100, 1)) + '%).')