From 9c51385277a32857e62fd82f97a306f93fdd5b3e Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Mon, 15 Apr 2024 16:10:02 +0200
Subject: [PATCH] model creation from dx

---
 pages/2-model_creation.py | 171 ++++++++++++++++++++++++--------------
 1 file changed, 109 insertions(+), 62 deletions(-)

diff --git a/pages/2-model_creation.py b/pages/2-model_creation.py
index 3fadcb4..3f506ea 100644
--- a/pages/2-model_creation.py
+++ b/pages/2-model_creation.py
@@ -3,9 +3,12 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
 
+
 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")
+
+
 def nn(x):
     return x is not None
 ########################################################################################
@@ -26,91 +29,135 @@ M9, M10 = st.columns([2,2])
 M9.write("-- Save the model --")
 
+files_format = ['.csv', '.dx']
+file = M3.radio('select data file format:', options = files_format)
 
-# CSV files loader
-xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+### Data
+spectra = pd.DataFrame
+y = pd.DataFrame
 
-
-if xcal_csv is not None and ycal_csv is not None:
+# load .csv file
+if file == files_format[0]:
+    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+
+    if xcal_csv and ycal_csv:
+        # Select list for CSV delimiter
-    sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        sep = M3.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
+                       options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
         # Select list for CSV header True / False
-    hdr = M3.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        hdr = M3.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
+                       options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        ###############
         if hdr == 'yes':
             col = 0
         else:
             col = False
-    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
-    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
-    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
-    # Assign data to training and test sets
-    X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    y_train = y_train.iloc[:,0]
-    y_test = y_test.iloc[:,0]
-
-
-
-    ############################# Regression modelling ##########################################
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
-    if regression_algo == reg_algo[1]:
-        # Train model with model function from application_functions.py
-        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
-        reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-
-    elif regression_algo == reg_algo[2]:
-        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-
-    elif regression_algo == reg_algo[3]:
-        s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
-        it = M2.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
-        progress_text = "The model is being created. Please wait."
-
-        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
-        pro = M1.progress(0, text="The model is being created. Please wait!")
-        rega = Reg.BandSelect(n_iter=it)
-        pro.empty()
-        M1.progress(100, text = "The model has successfully been created!")
+        ###############
+        spectra, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
+        spectra = pd.DataFrame(spectra)
+        y = pd.DataFrame(y)
+
+
+
+## Load .dx file
+elif file == files_format[1]:
+    data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
+    if data_file:
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+        chem_data, spectra, meta_data = read_dx(file = tmp_path)
+        M3.success("The data have been loaded successfully", icon="✅")
+        yname = M3.selectbox('Select target', options=chem_data.columns)
+        spectra = spectra
+        y = chem_data.loc[:,yname]
+
+        os.unlink(tmp_path)
+
+### split the data
+if not spectra.empty and not y.empty:
+    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
+    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+    train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
+    # Assign data to training and test sets
+    X_train, y_train, X_test, y_test = pd.DataFrame(spectra.iloc[train_index,:]), pd.DataFrame(y.iloc[train_index]), pd.DataFrame(spectra.iloc[test_index,:]), pd.DataFrame(y.iloc[test_index])
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]
+
+
+#######################################
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    if regression_algo == reg_algo[1]:
+        # Train model with model function from application_functions.py
+        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
+        reg_model = Reg.model_
+        #M2.dataframe(Pin.pred_data_)
+    elif regression_algo == reg_algo[2]:
+        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
+
+    elif regression_algo == reg_algo[3]:
+        s = M1.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
+        it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
+        progress_text = "The model is being created. Please wait."
-        time.sleep(1)
-        reg_model = Reg.model_
-        M2.table(rega[0])
+        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
+        pro = M1.progress(0, text="The model is being created. Please wait!")
+        rega = Reg.BandSelect(n_iter=it)
+        pro.empty()
+        M1.progress(100, text = "The model has successfully been created!")
+        time.sleep(1)
+        reg_model = Reg.model_
+        M2.write('-- Table of selected wavelengths --')
+        M2.table(rega[0])
 
 ################# Model analysis ############
-    if regression_algo in reg_algo[1:]:
-        yc = Reg.pred_data_[0]
-        ycv = Reg.pred_data_[1]
-        yt = Reg.pred_data_[2]
+    if regression_algo in reg_algo[1:]:
+        yc = Reg.pred_data_[0]
+        ycv = Reg.pred_data_[1]
+        yt = Reg.pred_data_[2]
 
-        M1.write("-- Performance metrics --")
-        M1.dataframe(Reg.metrics_)
+        M2.write("-- Performance metrics --")
+        M2.dataframe(Reg.metrics_)
 
-        M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
-        M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
+        M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
+        M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
 
         #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
-        model_name = M9.text_input('Give it a name')
-        if M9.button('Export Model'):
+        model_name = M9.text_input('Give it a name')
+        if M9.button('Export Model'):
+            path = 'data/models/model_'
+            if file == files_format[0]:
             #export_package = __import__(model_export)
-            with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
+                with open(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
                 joblib.dump(reg_model, f)
-
-            if regression_algo == reg_algo[3]:
-                rega[1].sort()
-                pd.DataFrame(rega[1]).to_csv('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_''Wavelengths_index.csv', sep = ';')
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_'+'Wavelengths_index.csv', sep = ';')
+
+            elif file == files_format[1]:
+                #export_package = __import__(model_export)
+                with open(path + model_name + '_on_' + '_data_' + '.pkl','wb') as f:
+                    joblib.dump(reg_model, f)
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
+            st.write('Model Exported')
+
+
        if regression_algo == reg_algo[3]:
            st.write('Model Exported')
-
+
 # create a report with information on the model
 ## see https://stackoverflow.com/a/59578663
-#M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))
-if st.session_state['interface'] == 'simple':
-    st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
+    if st.session_state['interface'] == 'simple':
+        st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
+
+
+## Load .dx file
-- 
GitLab