diff --git a/src/Class_Mod/DxReader.py b/src/Class_Mod/DxReader.py index 08bf1d175764bc08d93ca0bd241fc4af7aad5843..973372738142a6d6a1233146fef512d6c5f86461 100644 --- a/src/Class_Mod/DxReader.py +++ b/src/Class_Mod/DxReader.py @@ -13,42 +13,38 @@ class DxRead: self.__nb = self.__dxfile['blocks'] # Get the total number of blocks = The total number of scanned samples self.__list_of_blocks = self.__dxfile['children'] # Store all blocks within a a list self.__wl = self.__list_of_blocks[0]["x"] # Wavelengths/frequencies/range - - - + # Start retreiving the data specs = np.zeros((self.__nb, len(self.__list_of_blocks[0]["y"])), dtype=float) # preallocate a np matrix for sotoring spectra self.idx = np.arange(self.__nb) # This list is designed to store samples name self.__met = {} - - for i in range(self.__nb): # Loop over the blocks specs[i] = self.__list_of_blocks[i]['y'] - block = self.__list_of_blocks[i] block_met = { 'name': block['title'], 'origin': block['origin'], 'date': block['date'], - # 'time': block['time'], - # 'spectrometer/data system': block['spectrometer/data system'], - # 'instrumental parameters': block['instrumental parameters'], + #'time': block['time'], + 'spectrometer': block['spectrometer/data system'].split('\n$$')[0], + 'n_scans':block['spectrometer/data system'].split('\n$$')[6].split('=')[1], + 'resolution': block['spectrometer/data system'].split('\n$$')[8].split('=')[1], + #'instrumental parameters': block['instrumental parameters'], 'xunits': block['xunits'], 'yunits': block['yunits'], - # 'xfactor': block['xfactor'], - # 'yfactor': block['yfactor'], - # 'firstx': block['firstx'], - # 'lastx': block['lastx'], - # 'firsty':block['firsty'], - # 'miny': block['miny'], - # 'maxy': block['maxy'], - # 'npoints': block['npoints'], + #'xfactor': block['xfactor'], + #'yfactor': block['yfactor'], + 'firstx': block['firstx'], + 'lastx': block['lastx'], + #'firsty':block['firsty'], + #'miny': block['miny'], + #'maxy': block['maxy'], + 'npoints': block['npoints'], 'concentrations':block['concentrations'], - # 'deltax':block['deltax'] + #'deltax':block['deltax'] } + self.__met[f'{i}'] = block_met self.metadata_ = pd.DataFrame(self.__met).T - - self.spectra = pd.DataFrame(np.fliplr(specs), columns= self.__wl[::-1], index = self.metadata_['name']) # Storing spectra in a pd.dataframe @@ -91,7 +87,12 @@ class DxRead: me = self.metadata_.drop("concentrations", axis = 1) me = me.drop(me.columns[(me == '').all()], axis = 1) return me - + @property + def md_df_st_(self): + rt = ['origin','date'] + cl = self.metadata_.loc[:,rt] + return cl + @property def chem_data_(self): return self.chem_data @@ -99,4 +100,4 @@ class DxRead: @st.cache_data def read_dx(file): M = DxRead(file) - return M.chem_data, M.specs_df_, M.md_df_ + return M.chem_data, M.specs_df_, M.md_df_, M.md_df_st_ \ No newline at end of file diff --git a/src/Class_Mod/UMAP_.py b/src/Class_Mod/UMAP_.py index 28d0436e6efe90fb251e60627234376cc10467d0..1b95e14cf0148fb13df6341aaf84ec8e9d31b23b 100644 --- a/src/Class_Mod/UMAP_.py +++ b/src/Class_Mod/UMAP_.py @@ -20,7 +20,7 @@ class Umap: self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,) self.model.fit(self.numerical_data, y = self.categorical_data_encoded) self.scores_raw = self.model.transform(self.numerical_data) - self.scores = pd.DataFrame(self.scores_raw, index = self.numerical_data.index) + self.scores = pd.DataFrame(self.scores_raw) @property def scores_(self): diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index de03b98e62d2866f40650c73da0120eba4377694..f99b7bd2b08bbbc5b918619921bad471ba83f5e2 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -2,8 +2,6 @@ from Packages import * st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * - - # HTML pour le bandeau "CEFE - CNRS" # bandeau_html = """ # <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> @@ -63,7 +61,7 @@ if data_file: tmp.write(data_file.read()) tmp_path = tmp.name with col1: - _, spectra, meta_data = read_dx(file = tmp_path) + _, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path) st.success("The data have been loaded successfully", icon="✅") os.unlink(tmp_path) @@ -119,9 +117,9 @@ if not spectra.empty: elif dim_red_method == dim_red_methods[2]: if not meta_data.empty: - filter = meta_data.columns[1:] + filter = md_df_st_.columns col = pc.selectbox('Supervised UMAP by:', options= filter, key=108) - supervised = meta_data[col] + supervised = md_df_st_[col] else: supervised = None dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised) @@ -136,10 +134,11 @@ if not spectra.empty: t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1) -###### 2- clustering ####### +###### II - clustering ####### if not t.empty: tcr = standardize(t) # Clustering + # 1- K-MEANS Clustering if clus_method == cluster_methods[1]: cl_model = Sk_Kmeans(tcr, max_clusters = 25) ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters') @@ -151,27 +150,35 @@ if not t.empty: f.write(img) data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster) + # 2- HDBSCAN clustering elif clus_method == cluster_methods[2]: optimized_hdbscan = Hdbscan(np.array(t)) labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_ non_clustered = np.where(labels == -1) - labels[non_clustered] = 1000 - labels = labels.tolist() - + labels[non_clustered] = np.max(labels)+2 + labels = [f'cluster#{i+1}' for i in labels.tolist()] + + # 3- Affinity propagation elif clus_method == cluster_methods[3]: cl_model = AP(X=tcr) data, labels, clu_centers = cl_model.fit_optimal_ -###### 3- Samples selection using the reduced data preentation ###### +###### III - Samples selection using the reduced data preentation ###### selec_strategy = ['center','random'] samples_df_chem = pd.DataFrame selected_samples = [] selected_samples_idx = [] + if labels: - selection = scores.radio('Select samples selection strategy:', - options = selec_strategy) + if clus_method: + if clus_method == cluster_methods[2]: + selection = scores.radio('Select samples selection strategy:', + options = ['random']) + else: + selection = scores.radio('Select samples selection strategy:', + options = selec_strategy) if selection == selec_strategy[0]: # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster closest, _ = pairwise_distances_argmin_min(clu_centers, tcr) @@ -183,7 +190,7 @@ if labels: for i in np.unique(labels): C = np.where(np.array(labels) == i)[0] if C.shape[0] >= selection_number: - #scores.write(list(tcr.index)[labels== i]) + # scores.write(list(tcr.index)[labels== i]) km2 = KMeans(n_clusters = selection_number) km2.fit(tcr.iloc[C,:]) clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:]) @@ -223,28 +230,28 @@ if not t.empty: # scores plot with metadata elif len(list(labels)) == 0 and not meta_data.empty: - filter = meta_data.columns[1:] + filter = md_df_st_.columns col = st.selectbox('Color by:', options= filter) if col == 0: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) sns.scatterplot(data = tcr, x = axis2, y =axis3 , ax = ax2) - sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,meta_data[col])), ax = ax3) + sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) ) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1) - sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = list(map(str.lower,meta_data[col])), ax = ax2) - sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,meta_data[col])), ax = ax3) + fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,md_df_st_[col])) ) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax1) + sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax2) + sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) # color with scores and metadata elif len(list(labels)) > 0 and not meta_data.empty: if clus_method in cluster_methods[1:]: filter = ['None', clus_method] - filter.extend(meta_data.columns[1:]) + filter.extend(md_df_st_.columns) else: - filter = meta_data.columns[1:].insert(0,'None') + filter = md_df_st_.columns.insert(0,'None') col = st.selectbox('Color by:', options= filter) if col == "None": @@ -254,10 +261,10 @@ if not t.empty: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col]))) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax2) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax3) + fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,md_df_st_[col]))) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax1) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax2) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) else: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) @@ -267,7 +274,7 @@ if not t.empty: if selected_samples_idx: tt = tcr.iloc[selected_samples_idx,:] fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], - z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'), + z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples') plt.savefig("./Report/Figures/test.png") @@ -354,20 +361,4 @@ if not spectra.empty: hotelling = dr_model.hotelling_ fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") st.plotly_chart(fig, use_container_width=True) - fig.write_image("./Report/figures/graphe_hotelling.png", format="png") - - if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN - with loadings: # Display some clustering metrics - st.write('Clustering metrics:') - clusters_number = set(labels) - clusters_number.remove(-1) - st.write('Optimal number of clusters = ' + str(len(clusters_number))) - st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3))) - st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).') - - - - - - - \ No newline at end of file + fig.write_image("./Report/figures/graphe_hotelling.png", format="png") \ No newline at end of file diff --git a/src/pages/4-inputs.py b/src/pages/4-inputs.py index 4540d4d7f4b3618eb4d1c9e366d28275f36060f8..671182332587000596b195cbeb46c1a573dd0895 100644 --- a/src/pages/4-inputs.py +++ b/src/pages/4-inputs.py @@ -70,7 +70,7 @@ with st.container(): # Save the form data here st.session_state['form_submitted'] = True - st.success('Form sent successfully!') + st.success('Form sent successfully!', icon="✅") # Création du dictionnaire avec les données du formulaire form_data = {