From 1cc1b8f3a719c744f5c33fa9fd6137b65cef56f7 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Fri, 3 May 2024 10:26:08 +0200
Subject: [PATCH] Meta-dat HDBSCAN CORRECTION

---
 src/Class_Mod/DxReader.py        | 45 ++++++++++---------
 src/Class_Mod/UMAP_.py           |  2 +-
 src/pages/1-samples_selection.py | 77 ++++++++++++++------------------
 src/pages/4-inputs.py            |  2 +-
 4 files changed, 59 insertions(+), 67 deletions(-)

diff --git a/src/Class_Mod/DxReader.py b/src/Class_Mod/DxReader.py
index 08bf1d1..9733727 100644
--- a/src/Class_Mod/DxReader.py
+++ b/src/Class_Mod/DxReader.py
@@ -13,42 +13,38 @@ class DxRead:
         self.__nb = self.__dxfile['blocks'] # Get the total number of blocks = The total number of scanned samples
         self.__list_of_blocks = self.__dxfile['children']  # Store all blocks within a a list
         self.__wl = self.__list_of_blocks[0]["x"] # Wavelengths/frequencies/range 
-        
-
-
+    
         # Start retreiving the data
         specs = np.zeros((self.__nb, len(self.__list_of_blocks[0]["y"])), dtype=float) # preallocate a np matrix for sotoring spectra
         self.idx = np.arange(self.__nb) # This list is designed to store samples name
         self.__met = {}
-
-
         for i in range(self.__nb): # Loop over the blocks
             specs[i] = self.__list_of_blocks[i]['y']
-            
             block = self.__list_of_blocks[i]
             block_met = {   'name': block['title'],
                             'origin': block['origin'],
                             'date': block['date'],
-            #                'time': block['time'],
-            #                'spectrometer/data system': block['spectrometer/data system'],
-            #                'instrumental parameters': block['instrumental parameters'],
+                            #'time': block['time'],
+                            'spectrometer': block['spectrometer/data system'].split('\n$$')[0],
+                            'n_scans':block['spectrometer/data system'].split('\n$$')[6].split('=')[1],
+                            'resolution': block['spectrometer/data system'].split('\n$$')[8].split('=')[1],
+                            #'instrumental parameters': block['instrumental parameters'],
                             'xunits': block['xunits'],
                             'yunits': block['yunits'],
-            #                'xfactor': block['xfactor'],
-            #                'yfactor': block['yfactor'],
-            #                'firstx': block['firstx'],
-            #                'lastx': block['lastx'],
-            #                'firsty':block['firsty'],
-            #                'miny': block['miny'],
-            #                'maxy': block['maxy'],
-            #                'npoints': block['npoints'],
+                            #'xfactor': block['xfactor'],
+                            #'yfactor': block['yfactor'],
+                            'firstx': block['firstx'],
+                            'lastx': block['lastx'],
+                            #'firsty':block['firsty'],
+                            #'miny': block['miny'],
+                            #'maxy': block['maxy'],
+                            'npoints': block['npoints'],
                             'concentrations':block['concentrations'],
-            #                'deltax':block['deltax']
+                            #'deltax':block['deltax']
                             }
+            
             self.__met[f'{i}'] = block_met
         self.metadata_ = pd.DataFrame(self.__met).T
-            
-
         self.spectra = pd.DataFrame(np.fliplr(specs), columns= self.__wl[::-1], index = self.metadata_['name']) # Storing spectra in a pd.dataframe
 
 
@@ -91,7 +87,12 @@ class DxRead:
         me = self.metadata_.drop("concentrations", axis = 1)
         me = me.drop(me.columns[(me == '').all()], axis = 1)
         return me
-    
+    @property
+    def md_df_st_(self):
+        """Return only the 'origin' and 'date' columns of the metadata table."""
+        kept_columns = ['origin', 'date']
+        return self.metadata_.loc[:, kept_columns]
+             
     @property
     def chem_data_(self):
          return self.chem_data
@@ -99,4 +100,4 @@ class DxRead:
 @st.cache_data
 def read_dx(file):
      M = DxRead(file)
-     return M.chem_data, M.specs_df_, M.md_df_
+     return M.chem_data, M.specs_df_, M.md_df_, M.md_df_st_
\ No newline at end of file
diff --git a/src/Class_Mod/UMAP_.py b/src/Class_Mod/UMAP_.py
index 28d0436..1b95e14 100644
--- a/src/Class_Mod/UMAP_.py
+++ b/src/Class_Mod/UMAP_.py
@@ -20,7 +20,7 @@ class Umap:
         self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,)
         self.model.fit(self.numerical_data, y = self.categorical_data_encoded)
         self.scores_raw = self.model.transform(self.numerical_data)
-        self.scores = pd.DataFrame(self.scores_raw, index = self.numerical_data.index)
+        self.scores = pd.DataFrame(self.scores_raw)
 
     @property
     def scores_(self):
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index de03b98..f99b7bd 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -2,8 +2,6 @@ from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 
-
-
 # HTML pour le bandeau "CEFE - CNRS"
 # bandeau_html = """
 # <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
@@ -63,7 +61,7 @@ if data_file:
             tmp.write(data_file.read())
             tmp_path = tmp.name
             with col1:
-                _, spectra, meta_data = read_dx(file = tmp_path)
+                _, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path)
                 st.success("The data have been loaded successfully", icon="✅")
         os.unlink(tmp_path)
 
@@ -119,9 +117,9 @@ if not spectra.empty:
 
     elif dim_red_method == dim_red_methods[2]:
         if not meta_data.empty:
-            filter = meta_data.columns[1:]
+            filter = md_df_st_.columns
             col = pc.selectbox('Supervised UMAP by:', options= filter, key=108)
-            supervised = meta_data[col]
+            supervised = md_df_st_[col]
         else:
             supervised = None
         dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
@@ -136,10 +134,11 @@ if not spectra.empty:
         t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
 
 
-###### 2- clustering #######
+###### II - clustering #######
 if not t.empty:
     tcr = standardize(t)
         # Clustering
+    # 1- K-MEANS Clustering
     if clus_method == cluster_methods[1]:
         cl_model = Sk_Kmeans(tcr, max_clusters = 25)
         ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
@@ -151,27 +150,35 @@ if not t.empty:
                 f.write(img)    
         data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
 
+    # 2- HDBSCAN clustering
     elif clus_method == cluster_methods[2]:
         optimized_hdbscan = Hdbscan(np.array(t))
         labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
         non_clustered = np.where(labels == -1)
-        labels[non_clustered] = 1000
-        labels = labels.tolist()
-    
+        labels[non_clustered] = np.max(labels)+2
+        labels = [f'cluster#{i+1}' for i in labels.tolist()]
+
+    # 3- Affinity propagation
     elif clus_method == cluster_methods[3]:
         cl_model = AP(X=tcr)
         data, labels, clu_centers = cl_model.fit_optimal_
 
 
-###### 3- Samples selection using the reduced data preentation ######
+###### III - Samples selection using the reduced data presentation ######
 selec_strategy = ['center','random']
 samples_df_chem = pd.DataFrame
 selected_samples = []
 selected_samples_idx = []
 
+
 if labels:
-    selection = scores.radio('Select samples selection strategy:',
-                            options = selec_strategy)
+    # The HDBSCAN branch above assigns no clu_centers, so 'center' is only offered for the other methods
+    if clus_method == cluster_methods[2]:
+        strategies = ['random']
+    else:
+        strategies = selec_strategy
+    selection = scores.radio('Select samples selection strategy:',
+                            options = strategies)
     if selection == selec_strategy[0]:
         # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
         closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
@@ -183,7 +190,7 @@ if labels:
         for i in np.unique(labels):
             C = np.where(np.array(labels) == i)[0]
             if C.shape[0] >= selection_number:
-                #scores.write(list(tcr.index)[labels== i])
+                # scores.write(list(tcr.index)[labels== i])
                 km2 = KMeans(n_clusters = selection_number)
                 km2.fit(tcr.iloc[C,:])
                 clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
@@ -223,28 +230,28 @@ if not t.empty:
     
         # scores plot with metadata
         elif len(list(labels)) == 0 and not meta_data.empty:
-            filter = meta_data.columns[1:]
+            filter = md_df_st_.columns
             col = st.selectbox('Color by:', options= filter)
             if col == 0:
                 fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
                 sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1)
                 sns.scatterplot(data = tcr, x = axis2, y =axis3 , ax = ax2)
-                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,meta_data[col])), ax = ax3)
+                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3)
 
 
             else:
-                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
-                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1)
-                sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = list(map(str.lower,meta_data[col])), ax = ax2)
-                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,meta_data[col])), ax = ax3)
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,md_df_st_[col])) )
+                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax1)
+                sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax2)
+                sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3)
 
         # color with scores and metadata
         elif len(list(labels)) > 0  and not meta_data.empty:
             if clus_method in cluster_methods[1:]:
                 filter = ['None', clus_method]
-                filter.extend(meta_data.columns[1:])
+                filter.extend(md_df_st_.columns)
             else:
-                filter = meta_data.columns[1:].insert(0,'None')
+                filter = md_df_st_.columns.insert(0,'None')
 
             col = st.selectbox('Color by:', options= filter)
             if col == "None":
@@ -254,10 +261,10 @@ if not t.empty:
                 fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
                 sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1)
             else:
-                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])))
-                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1)
-                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax2)
-                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax3)
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,md_df_st_[col])))
+                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax1)
+                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax2)
+                sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3)
 
         else:
             fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
@@ -267,7 +274,7 @@ if not t.empty:
         if selected_samples_idx:
             tt = tcr.iloc[selected_samples_idx,:]
             fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],
-                              z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'),
+                              z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 5, color = 'black'),
                               name = 'selected samples')
         
         plt.savefig("./Report/Figures/test.png")
@@ -354,20 +361,4 @@ if not spectra.empty:
             hotelling = dr_model.hotelling_
             fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
             st.plotly_chart(fig, use_container_width=True)
-            fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
-
-    if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN
-        with loadings: # Display some clustering metrics
-            st.write('Clustering metrics:')
-            clusters_number = set(labels)
-            clusters_number.remove(-1)
-            st.write('Optimal number of clusters = ' + str(len(clusters_number)))
-            st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
-            st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
-    
-
-
-
-       
-
-    
\ No newline at end of file
+            fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
\ No newline at end of file
diff --git a/src/pages/4-inputs.py b/src/pages/4-inputs.py
index 4540d4d..6711823 100644
--- a/src/pages/4-inputs.py
+++ b/src/pages/4-inputs.py
@@ -70,7 +70,7 @@ with st.container():
 
             # Save the form data here
             st.session_state['form_submitted'] = True
-            st.success('Form sent successfully!')
+            st.success('Form sent successfully!', icon="✅")
 
             # Création du dictionnaire avec les données du formulaire
             form_data = {
-- 
GitLab