From 023dfa7d416099d0e9867fcbf323ce9d3cc3d6ab Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Fri, 12 Apr 2024 16:07:06 +0200
Subject: [PATCH] Improve code readability, reduce complexity, and incorporate
 modifications

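col_cat() no longer scales the numerical data and now returns only the
numerical and categorical frames; Umap receives already-prepared data.
DxRead metadata is trimmed to the fields actually used, a cached read_dx()
wrapper and a cached plot_spectra() helper were added, and the
sample-selection page was rewritten as a flat, state-driven flow instead of
nested containers.

Minimal usage sketch of the new helpers (the .dx path below is hypothetical;
return order follows read_dx in Class_Mod/DxReader.py):

    chem_data, spectra, metadata = read_dx(file="data/sample.dx")
    fig = plot_spectra(spectra)
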
---
 Class_Mod/DATA_HANDLING.py   |  12 +-
 Class_Mod/DxReader.py        |  37 +++--
 Class_Mod/Miscellaneous.py   |  15 ++
 Class_Mod/UMAP_.py           |   5 +-
 Class_Mod/__init__.py        |   2 +-
 Modules.py                   |   4 +-
 Packages.py                  |   2 +
 pages/1-samples_selection.py | 300 ++++++++++++++++-------------------
 pages/3-prediction.py        |   1 -
 predictions/.gitkeep         |   0
 10 files changed, 183 insertions(+), 195 deletions(-)
 delete mode 100644 predictions/.gitkeep

diff --git a/Class_Mod/DATA_HANDLING.py b/Class_Mod/DATA_HANDLING.py
index 02e5694..10fb7ab 100644
--- a/Class_Mod/DATA_HANDLING.py
+++ b/Class_Mod/DATA_HANDLING.py
@@ -31,17 +31,13 @@ def col_cat(data_import):
     if len(categorical_columns_list) > 0:
         categorical_data = pd.concat(categorical_columns_list, axis=1)
     if len(categorical_columns_list) == 0:
-        empty = ["" for x in range(len(data_import))]
-        categorical_columns_list.append(empty)
-        categorical_data = pd.DataFrame(categorical_columns_list).T
-        categorical_data.columns = ['no categories']
+        categorical_data = pd.DataFrame()
     # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
     numerical_data = pd.concat(numerical_columns_list, axis=1)
     numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
-    # Scale the numerical data
-    scaler = StandardScaler()
-    scaled_values = scaler.fit_transform(numerical_data)
-    return numerical_data, categorical_data, scaled_values
+
+    return numerical_data, categorical_data
+
 
 
 def list_files(mypath, import_type):
diff --git a/Class_Mod/DxReader.py b/Class_Mod/DxReader.py
index d877ff2..f024894 100644
--- a/Class_Mod/DxReader.py
+++ b/Class_Mod/DxReader.py
@@ -29,21 +29,21 @@ class DxRead:
             block_met = {   'name': block['title'],
                             'origin': block['origin'],
                             'date': block['date'],
-                            'time': block['time'],
-                            'spectrometer/data system': block['spectrometer/data system'],
-                            'instrumental parameters': block['instrumental parameters'],
-                            'xunits': block['xunits'],
-                            'yunits': block['yunits'],
-                            'xfactor': block['xfactor'],
-                            'yfactor': block['yfactor'],
-                            'firstx': block['firstx'],
-                            'lastx': block['lastx'],
-                            'firsty':block['firsty'],
-                            'miny': block['miny'],
-                            'maxy': block['maxy'],
-                            'npoints': block['npoints'],
+            #                'time': block['time'],
+            #                'spectrometer/data system': block['spectrometer/data system'],
+            #                'instrumental parameters': block['instrumental parameters'],
+            #                'xunits': block['xunits'],
+            #                'yunits': block['yunits'],
+            #                'xfactor': block['xfactor'],
+            #                'yfactor': block['yfactor'],
+            #                'firstx': block['firstx'],
+            #                'lastx': block['lastx'],
+            #                'firsty':block['firsty'],
+            #                'miny': block['miny'],
+            #                'maxy': block['maxy'],
+            #                'npoints': block['npoints'],
                             'concentrations':block['concentrations'],
-                            'deltax':block['deltax']
+            #                'deltax':block['deltax']
                             }
             self.__met[f'{i}'] = block_met
         self.metadata_ = pd.DataFrame(self.__met).T
@@ -87,8 +87,13 @@ class DxRead:
         return self.spectra
     @property
     def md_df_(self):
-        return self.metadata_
+        return self.metadata_.drop("concentrations", axis = 1)
     
     @property
     def chem_data_(self):
-         return self.chem_data
\ No newline at end of file
+         return self.chem_data
+    
+@st.cache_data
+def read_dx(file):
+     M = DxRead(file)
+     return M.chem_data, M.specs_df_, M.md_df_
\ No newline at end of file
diff --git a/Class_Mod/Miscellaneous.py b/Class_Mod/Miscellaneous.py
index 1627b39..79d1708 100644
--- a/Class_Mod/Miscellaneous.py
+++ b/Class_Mod/Miscellaneous.py
@@ -47,3 +47,18 @@ def resid_plot( meas, pred):
 def download_results(data, export_name):
     with open(data) as f:
         st.download_button('Download Results', f, export_name)
+
+@st.cache_resource
+def plot_spectra(df):
+    if isinstance(df.columns[0], str):
+        m = 0
+    else: 
+        m = np.min(df.columns)
+
+    fig, ax = plt.subplots(figsize = (30,7))
+    df.T.plot(legend=False, ax = ax, color = 'blue')
+    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
+    ax.set_ylabel('Signal intensity', fontsize=18)
+    plt.margins(x = 0)
+    plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy =(m, np.max(df)), size=20, color = 'black', backgroundcolor='red')
+    return fig
diff --git a/Class_Mod/UMAP_.py b/Class_Mod/UMAP_.py
index e9ae0dc..21d2f82 100644
--- a/Class_Mod/UMAP_.py
+++ b/Class_Mod/UMAP_.py
@@ -5,10 +5,7 @@ from Class_Mod.DATA_HANDLING import *
 
 class Umap:
     def __init__(self, x, n_components, n_neighbors, min_dist):
-        self.numerical_data, categorical_data, scaled_values = col_cat(x)
-        self.catdata = list(categorical_data.columns)
-
-        self.x = scaled_values
+        self.x = x
         
         self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,)
         self.model.fit(self.x)
diff --git a/Class_Mod/__init__.py b/Class_Mod/__init__.py
index eb2dbb5..63b5b5f 100644
--- a/Class_Mod/__init__.py
+++ b/Class_Mod/__init__.py
@@ -7,4 +7,4 @@ from .LWPLSR_ import model_LWPLSR
 from .Regression_metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
-from .DxReader import DxRead
+from .DxReader import DxRead, read_dx
diff --git a/Modules.py b/Modules.py
index 5439917..f447cdd 100644
--- a/Modules.py
+++ b/Modules.py
@@ -1,4 +1,4 @@
-from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead
+from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, read_dx
 # find_col_index
 
-from Class_Mod.Miscellaneous import prediction, download_results
+from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra
diff --git a/Packages.py b/Packages.py
index 554f28d..924b788 100644
--- a/Packages.py
+++ b/Packages.py
@@ -38,6 +38,8 @@ from PIL import Image
 import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
+import matplotlib
+
 ### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
 
diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py
index 6ca45f1..07139bf 100644
--- a/pages/1-samples_selection.py
+++ b/pages/1-samples_selection.py
@@ -3,193 +3,167 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
 
+
+
+
+
 ################################### Data Loading and Visualization ########################################
-container1 = st.container(border=True)
+# container1 = st.header("Data loading",border=True)
 col2, col1 = st.columns([3, 1])
 col1.header("Data Loading", divider='blue')
 col2.header("Spectral Data Visualization", divider='blue')
 
 
-container2 = st.container(border=True)
-container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
-scores, loadings, pc = st.columns([2, 3, 0.5])
-influence, hotelling, qexp = st.columns([2, 2, 1])
+## Preallocation of data structure
+data_import = pd.DataFrame()
+meta_data = pd.DataFrame()
+selected_samples = pd.DataFrame()
 
 
-with container1:
-    # loader for csv file containing NIRS spectra
-    sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
-    if sselectx_csv is not None:
-        test = sselectx_csv.name[sselectx_csv.name.find('.'):]
-        if test== '.csv':
-            with col1:
-                # Select list for CSV delimiter
-                psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
-                # Select list for CSV header True / False
-                phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
-                if phdr == 'yes':
-                    col = 0
-                else:
-                    col = False
-                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
-                st.success("The data have been loaded successfully", icon="✅")
-                ## Visualize spectra
-
-            with col2:
-                fig, ax = plt.subplots(figsize = (30,7))
-                data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                ax.set_ylabel('Signal', fontsize=18)
-                plt.margins(x = 0)
-                st.pyplot(fig)
-
-                st.write("Summary")
-                info = pd.DataFrame({'N':[data_import.shape[0]],
-                                    'Min': [np.min(data_import)],
-                                    'Max':[np.max(data_import)],}, index = ['Values']).T
-                info.rename_axis('information')
-                st.table(data=info)
-
-        elif test == '.dx':
-            # Create a temporary file to save the uploaded file
-            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
-                tmp.write(sselectx_csv.read())
-                tmp_path = tmp.name
-                with col1:
-                        data = DxRead(path = tmp_path)
-                        data_import = data.specs_df_
-                        st.success("The data have been loaded successfully", icon="✅")
-
-                    ## Visualize spectra
-
-                with col2:
-                    fig, ax = plt.subplots(figsize = (30,7))
-                    data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                    ax.set_ylabel('Signal', fontsize=18)
-                    plt.margins(x = 0)
-                    st.pyplot(fig)
-
-                    st.write("Summary")
-                    info = pd.DataFrame({'N':[data_import.shape[0]],
-                                        'Min': [np.min(data_import)],
-                                        'Max':[np.max(data_import)],}, index = ['Values']).T
-                    info.rename_axis('information')
-                    st.table(data=info)
-            os.unlink(tmp_path)
-
-
-    
-
-        
-######################################################################################
+# loader for csv file containing NIRS spectra
+sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
 
-############################## Exploratory data analysis ###############################
-plot_type=['', 'PCA','UMAP', 'NMF']
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-with container2:
-    if sselectx_csv is not None:
-        plot_type=['', 'PCA','UMAP', 'NMF']
-        cluster_methods = ['', 'Kmeans','UMAP', 'AP']
 
-        with pc:
-            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
-            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
-            # compute UMAP - umap_maker in application_functions.py
-            if type_plot == 'PCA':
-                model = LinearPCA(data_import, Ncomp=5)
-            elif type_plot =='UMAP':
-                model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
+#with container1:
+if sselectx_csv:
+    test = sselectx_csv.name[sselectx_csv.name.find('.'):]
 
+    if test == '.csv':
+        with col1:
+            # Select list for CSV delimiter
+            psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
+            # Select list for CSV header True / False
+            phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
+            if phdr == 'yes':
+                col = 0
+            else:
+                col = False
+            imp = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
+            data_import = col_cat(imp)[0]
+            meta_data = col_cat(imp)[1]
+            st.success("The data have been loaded successfully", icon="✅")
+
+    elif test == '.dx':
+        # Create a temporary file to save the uploaded file
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(sselectx_csv.read())
+            tmp_path = tmp.name
+            with col1:
+                _, data_import, meta_data = read_dx(file =  tmp_path)
+                st.success("The data have been loaded successfully", icon="✅")
+        os.unlink(tmp_path)
 
 
-        if type_plot in ['PCA', 'UMAP']:
-            # add 2 select lists to choose which component to plot
-            axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
-            axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
-            axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
+if not data_import.empty:
+    ## Visualize spectra
+    with col2:
+        fig = plot_spectra(data_import)
 
-            if type_cluster == 'Kmeans':
-                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
-                cl = Sk_Kmeans(scsc, max_clusters = 30)
+        #plt.annotate(text = info.T, xy =(m, info.loc[:,"Max"]), size=20, color = 'black', backgroundcolor='red')
+        st.pyplot(fig)
 
-            elif type_cluster == 'HDBSCAN':
-                from hdbscan import HDBSCAN_function
-                labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10)
 
-            with scores:
-                t = model.scores_
-                if type_cluster in ['AP', 'Kmeans']:
-                    st.write('Scree plot')
-                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
-                    st.plotly_chart(fig2)
+############################## Exploratory data analysis ###############################
+container2 = st.container(border=True)
+container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
+scores, loadings, pc = st.columns([2, 3, 0.5])
+influence, hotelling, qexp = st.columns([2, 2, 1])
 
-                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
-                    data, colors = cl.fit_optimal(nclusters=ncluster)
-                    #fig = px.scatter(data, x=axis1, y=axis2, color= colors)
-                    st.write('Scores plot')
-                    fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
-                    fig.update_traces(marker=dict(size=4))
+dim_red_methods=['', 'PCA','UMAP', 'NMF']
+cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
+dr_model = None
+cl_model = None
+
+# Dimensionality reduction
+t = pd.DataFrame()
+if not data_import.empty:
+    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
+    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
+    if dim_red_method == dim_red_methods[1]:
+        dr_model = LinearPCA(data_import, Ncomp=5)
+    elif dim_red_method == dim_red_methods[2]:
+        dr_model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
+        
+    if dr_model:
+        axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
+        axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
+        axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
+
+# clustering
+labels = pd.DataFrame()
+if not t.empty:
+    # Clustering
+    if clus_method == cluster_methods[1]:
+        ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
+        cl_model = Sk_Kmeans(t, max_clusters = 30)
+        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
+        scores.plotly_chart(fig2)
+        data, labels = cl_model.fit_optimal(nclusters = ncluster)
+
+    elif clus_method == cluster_methods[2]:
+        from hdbscan import HDBSCAN_function
+        labels, hdbscan_score = HDBSCAN_function(t, min_cluster_size=10)
+
 
+##### Plots 
 
-                elif type_cluster in ['HDBSCAN']:
-                    st.write('plot HDBSCAN clustering')
-                    fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
-                    fig_hdbscan.update_traces(marker=dict(size=4))
-                    st.plotly_chart(fig_hdbscan)
-                    st.write('DBCV score = ' + str(hdbscan_score))
-                    # st.dataframe(min_score.stack().agg(['min']))
 
+## Scores
 
+if not t.empty:
+    with scores:
+        st.write('Scores plot')
+        # scores plot with clustering
+        if not pd.DataFrame(labels).empty:
+            fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = labels)
+        else:
+        # scores plot with metadata
+            if not meta_data.empty: 
+                filter = meta_data.columns[1:]
+                col = st.selectbox('filter', options= filter)
+                if col == 0:
+                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)  
                 else:
-                    if test == '.dx':
-                        filter = ['origin', 'date', 'time', 'spectrometer/data system']
-                        col = st.selectbox('filter', options= filter)
-
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
-                        fig.update_traces(marker=dict(size=4))
-                    else:
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
-                        fig.update_traces(marker=dict(size=4))
+                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
+            else:
+        # scores plot with neither metadata nor clustering
+                fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)        
+        fig.update_traces(marker=dict(size=4))
+        st.plotly_chart(fig)
+
+
+
+
+if not data_import.empty:
+    if dim_red_method == dim_red_methods[1]:
+        with loadings:
+            st.write('Loadings plot')
+            p = dr_model.loadings_
+            pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
+            df1 = pp.melt(id_vars="wl")
+            fig = px.line(df1, x = 'wl', y = 'value', color='variable')
+            fig.update_layout(legend=dict(x=1, y=0,font=dict(family="Courier", size=12, color="black"),
+                                        bordercolor="Black", borderwidth=2))
+            st.plotly_chart(fig, use_container_width = True)
+
+        with influence:
+            st.write('Influence plot')
+            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
+            leverage = dr_model.leverage_
+            residuals = dr_model.residuals_
+            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
+            st.plotly_chart(fig)
+        
+        with hotelling:
+                st.write('T²-Hotelling vs Q residuals plot')
+                hotelling = dr_model.hotelling_
+                ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
 
+                hotelling = dr_model.hotelling_
+                fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
                 st.plotly_chart(fig)
 
 
-            if type_plot =='PCA':
-                with loadings:
-                    st.write('Loadings plot')
-                    p = model.loadings_
-                    pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
-                    df1 = pp.melt(id_vars="wl")
-
-                    fig = px.line(df1, x = 'wl', y = 'value', color='variable')
-                    fig.update_layout(
-                        legend=dict(x=1, y=0,
-                                    font=dict(
-                                        family="Courier", size=12, color="black"),
-                                    bordercolor="Black", borderwidth=2)
-                    )
-                    st.plotly_chart(fig, use_container_width = True)
-
-                
-                with influence:
-                    st.write('Influence plot')
-                    ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
-                    leverage = model.leverage_
-                    residuals = model.residuals_
-                    fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-
-                with hotelling:
-                    st.write('T²-Hotelling vs Q residuals plot')
-                    hotelling = model.hotelling_
-                    ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
-
-                    hotelling = model.hotelling_
-                    fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-
-
-            else:
-                st.markdown('Select a dimensionality reduction technique from the dropdown list')
 
diff --git a/pages/3-prediction.py b/pages/3-prediction.py
index 6fba851..d215aa7 100644
--- a/pages/3-prediction.py
+++ b/pages/3-prediction.py
@@ -46,7 +46,6 @@ if NIRS_csv:
 
 if st.button("Predict"):
         if s:
-             
              result = model_loaded.predict(pred_data.iloc[:,idx])
         else:
         # use prediction function from application_functions.py to predict chemical values
diff --git a/predictions/.gitkeep b/predictions/.gitkeep
deleted file mode 100644
index e69de29..0000000
-- 
GitLab