Commit 1b115462 authored by Nicolas Barthes

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	Class_Mod/UMAP_.py
#	Class_Mod/__init__.py
#	Modules.py
#	Packages.py
#	pages/1-samples_selection.py
Parents: f98b67f8 9c513852
@@ -31,17 +31,13 @@ def col_cat(data_import):
     if len(categorical_columns_list) > 0:
         categorical_data = pd.concat(categorical_columns_list, axis=1)
     if len(categorical_columns_list) == 0:
-        empty = ["" for x in range(len(data_import))]
-        categorical_columns_list.append(empty)
-        categorical_data = pd.DataFrame(categorical_columns_list).T
-        categorical_data.columns = ['no categories']
+        categorical_data = pd.DataFrame
     # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
     numerical_data = pd.concat(numerical_columns_list, axis=1)
     numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
-    # Scale the numerical data
-    scaler = StandardScaler()
-    scaled_values = scaler.fit_transform(numerical_data)
-    return numerical_data, categorical_data, scaled_values
+    return numerical_data, categorical_data

 def list_files(mypath, import_type):
...
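Note that in the new branch `categorical_data = pd.DataFrame` binds the class itself rather than an instance, so later `.empty` checks on it do not behave like checks on an empty frame; `pd.DataFrame()` was likely intended. A minimal, hypothetical re-implementation of the new contract (the dtype-based split is an assumption about col_cat's internals, which this hunk does not show):

import pandas as pd

# Hypothetical sketch of the refactored col_cat contract, for illustration only.
def col_cat_sketch(data_import: pd.DataFrame):
    # Split columns by dtype: object/category columns are treated as metadata.
    categorical = data_import.select_dtypes(include=['object', 'category'])
    numerical = data_import.select_dtypes(include='number')
    # Fill missing values with the column mean, as in the committed version.
    numerical = numerical.apply(lambda col: col.fillna(col.mean()))
    # An instantiated empty frame keeps downstream .empty checks working.
    if categorical.shape[1] == 0:
        categorical = pd.DataFrame()
    return numerical, categorical

# Usage sketch:
# spectra, meta_data = col_cat_sketch(pd.read_csv('data/file.csv', sep=';'))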
@@ -29,21 +29,21 @@ class DxRead:
             block_met = {   'name': block['title'],
                             'origin': block['origin'],
                             'date': block['date'],
-                            'time': block['time'],
-                            'spectrometer/data system': block['spectrometer/data system'],
-                            'instrumental parameters': block['instrumental parameters'],
-                            'xunits': block['xunits'],
-                            'yunits': block['yunits'],
-                            'xfactor': block['xfactor'],
-                            'yfactor': block['yfactor'],
-                            'firstx': block['firstx'],
-                            'lastx': block['lastx'],
-                            'firsty':block['firsty'],
-                            'miny': block['miny'],
-                            'maxy': block['maxy'],
-                            'npoints': block['npoints'],
+                            # 'time': block['time'],
+                            # 'spectrometer/data system': block['spectrometer/data system'],
+                            # 'instrumental parameters': block['instrumental parameters'],
+                            # 'xunits': block['xunits'],
+                            # 'yunits': block['yunits'],
+                            # 'xfactor': block['xfactor'],
+                            # 'yfactor': block['yfactor'],
+                            # 'firstx': block['firstx'],
+                            # 'lastx': block['lastx'],
+                            # 'firsty':block['firsty'],
+                            # 'miny': block['miny'],
+                            # 'maxy': block['maxy'],
+                            # 'npoints': block['npoints'],
                             'concentrations':block['concentrations'],
-                            'deltax':block['deltax']
+                            # 'deltax':block['deltax']
                             }
             self.__met[f'{i}'] = block_met
         self.metadata_ = pd.DataFrame(self.__met).T
@@ -87,8 +87,13 @@ class DxRead:
         return self.spectra

     @property
     def md_df_(self):
-        return self.metadata_
+        return self.metadata_.drop("concentrations", axis = 1)

     @property
     def chem_data_(self):
-        return self.chem_data
\ No newline at end of file
+        return self.chem_data
+
+@st.cache_data
+def read_dx(file):
+    M = DxRead(file)
+    return M.chem_data, M.specs_df_, M.md_df_
\ No newline at end of file
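The new `read_dx` helper wraps `DxRead` behind `st.cache_data`, so Streamlit reruns reuse the parsed result instead of re-parsing the .dx file, and `md_df_` now excludes the `concentrations` column. A usage sketch (the file path is a placeholder):

from Class_Mod import read_dx

# Placeholder path; any JCAMP-DX file with the expected blocks would do.
chem_data, spectra, md = read_dx(file='data/sample.dx')
print(spectra.shape)                    # rows = spectra, columns = wavelengths
print('concentrations' in md.columns)   # False: dropped by md_df_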
@@ -19,13 +19,6 @@ class Sk_Kmeans:
     def fit_optimal(self, nclusters):
         model = KMeans(n_clusters = nclusters, init = 'k-means++', random_state = 42)
         model.fit(self.x)
-        yp = model.predict(self.x)
-        num_colors = nclusters
-        colors = ['#' + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(num_colors)]
-        col = np.array(['#' + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(self.x.shape[0])])
-        for i in range(nclusters):
-            ss = np.where(yp==i)
-            col[ss] = colors[i]
-        return self.x, col
\ No newline at end of file
+        yp = model.predict(self.x)+1
+        clu = [f'cluster#{i}' for i in yp]
+        return self.x, clu
\ No newline at end of file
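Returning human-readable string labels instead of random hex colors lets Plotly treat the cluster variable as categorical and assign a discrete palette itself. A self-contained sketch of the new contract (synthetic data, plain scikit-learn):

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

x = pd.DataFrame(np.random.default_rng(0).normal(size=(20, 3)))
model = KMeans(n_clusters=3, init='k-means++', random_state=42, n_init=10)
model.fit(x)
labels = [f'cluster#{i}' for i in model.predict(x) + 1]  # 1-based cluster names
# Plotly colors string labels with a discrete palette automatically, e.g.:
# px.scatter_3d(x, x=0, y=1, z=2, color=labels)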
@@ -47,3 +47,18 @@ def resid_plot( meas, pred):
 def download_results(data, export_name):
     with open(data) as f:
         st.download_button('Download Results', f, export_name)
+
+@st.cache_resource
+def plot_spectra(df):
+    if isinstance(df.columns[0], str):
+        m = 0
+    else:
+        m = np.min(df.columns)
+    fig, ax = plt.subplots(figsize = (30,7))
+    df.T.plot(legend=False, ax = ax, color = 'blue')
+    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
+    ax.set_ylabel('Signal intensity', fontsize=18)
+    plt.margins(x = 0)
+    plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy =(m, np.max(df)), size=20, color = 'black', backgroundcolor='red')
+    return fig
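`plot_spectra` anchors its annotation at x = 0 when the columns are string wavelengths (csv import) and at the smallest wavelength when they are numeric (.dx import). One caveat: `np.max(df)` on a DataFrame returns a per-column Series on current pandas, so a scalar such as `df.max().max()` is the safer y-coordinate. A standalone variant with synthetic data illustrating that fix:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Synthetic stand-in for an imported spectra matrix (rows = samples, columns = wavelengths).
wl = np.arange(1000, 2500, 2)
df = pd.DataFrame(np.random.rand(5, wl.size), columns=wl)

m = 0 if isinstance(df.columns[0], str) else df.columns.min()  # x anchor for the annotation
fig, ax = plt.subplots(figsize=(30, 7))
df.T.plot(legend=False, ax=ax, color='blue')
ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
ax.set_ylabel('Signal intensity', fontsize=18)
ax.margins(x=0)
# df.max().max() gives a scalar peak value; np.max(df) would return a Series here.
ax.annotate(text=f'The total number of spectra is {df.shape[0]}',
            xy=(m, df.max().max()), size=20, color='black', backgroundcolor='red')
fig.savefig('spectra_preview.png')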
@@ -7,6 +7,6 @@ from .LWPLSR_ import model_LWPLSR
 from .Regression_metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
-from .DxReader import DxRead
+from .DxReader import DxRead, read_dx
 from .HDBSCAN_Clustering import Hdbscan

-from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan
+from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx
 # find_col_index
-from Class_Mod.Miscellaneous import prediction, download_results
+from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra
@@ -41,6 +41,7 @@ from PIL import Image
 import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
+import matplotlib

 ### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
...
@@ -8,195 +8,175 @@ if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")

 ################################### Data Loading and Visualization ########################################
-container1 = st.container(border=True)
 col2, col1 = st.columns([3, 1])
 col1.header("Data Loading", divider='blue')
 col2.header("Spectral Data Visualization", divider='blue')
-container2 = st.container(border=True)
-container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
-scores, loadings, pc = st.columns([2, 3, 0.5])
-influence, hotelling, qexp = st.columns([2, 2, 1])
-with container1:
-    # loader for csv file containing NIRS spectra
-    sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
-    if sselectx_csv is not None:
-        test = sselectx_csv.name[sselectx_csv.name.find('.'):]
-        if test== '.csv':
-            with col1:
-                # Select list for CSV delimiter
-                psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
+## Preallocation of data structure
+spectra = pd.DataFrame
+meta_data = pd.DataFrame
+selected_samples = pd.DataFrame
+
+# loader for datafile
+data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+
+if data_file:
+    # Retrieve the extension of the file
+    test = data_file.name[data_file.name.find('.'):]
+
+    ## Load .csv file
+    if test== '.csv':
+        with col1:
+            # Select list for CSV delimiter
+            psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
             # Select list for CSV header True / False
-                phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
+            phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
             if phdr == 'yes':
                 col = 0
             else:
                 col = False
-                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
-                data_import, categorical_data, scaled_values = col_cat(data_import)
-                st.success("The data have been loaded successfully", icon="")
-                ## Visualize spectra
-                with col2:
-                    fig, ax = plt.subplots(figsize = (30,7))
-                    data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                    ax.set_ylabel('Signal', fontsize=18)
-                    plt.margins(x = 0)
-                    st.pyplot(fig)
-                    st.write("Summary")
-                    info = pd.DataFrame({'N':[data_import.shape[0]],
-                                         'Min': [np.min(data_import)],
-                                         'Max':[np.max(data_import)],}, index = ['Values']).T
-                    info.rename_axis('information')
-                    st.table(data=info)
-        elif test == '.dx':
-            # Create a temporary file to save the uploaded file
-            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
-                tmp.write(sselectx_csv.read())
-                tmp_path = tmp.name
-                with col1:
-                    data = DxRead(path = tmp_path)
-                    data_import = data.specs_df_
-                    st.success("The data have been loaded successfully", icon="")
-                ## Visualize spectra
-                with col2:
-                    fig, ax = plt.subplots(figsize = (30,7))
-                    data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                    ax.set_ylabel('Signal', fontsize=18)
-                    plt.margins(x = 0)
-                    st.pyplot(fig)
-                    st.write("Summary")
-                    info = pd.DataFrame({'N':[data_import.shape[0]],
-                                         'Min': [np.min(data_import)],
-                                         'Max':[np.max(data_import)],}, index = ['Values']).T
-                    info.rename_axis('information')
-                    st.table(data=info)
-                os.unlink(tmp_path)
+            imp = pd.read_csv(data_file, sep=psep, index_col=col)
+            spectra = col_cat(imp)[0]
+            meta_data = col_cat(imp)[1]
+            st.success("The data have been loaded successfully", icon="")
+
+    ## Load .dx file
+    elif test == '.dx':
+        # Create a temporary file to save the uploaded file
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+            with col1:
+                _, spectra, meta_data = read_dx(file = tmp_path)
+                st.success("The data have been loaded successfully", icon="")
+        os.unlink(tmp_path)
+
+## Visualize spectra
+if not spectra.empty:
+    with col2:
+        fig = plot_spectra(spectra)
+        st.pyplot(fig)

+######################################################################################
 ############################## Exploratory data analysis ###############################
-plot_type=['', 'PCA','UMAP', 'NMF']
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-with container2:
-    if sselectx_csv is not None:
-        plot_type=['', 'PCA','UMAP', 'NMF']
-        cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-        with pc:
-            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
-            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
-        # compute UMAP - umap_maker in application_functions.py
-        if type_plot == 'PCA':
-            model = LinearPCA(data_import, Ncomp=5)
-        elif type_plot =='UMAP':
-            model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
-        if type_plot in ['PCA', 'UMAP']:
-            if type_plot in ['PCA']:
-                # add 2 select lists to choose which component to plot
-                axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
-                axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
-                axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
-            elif type_plot in ['UMAP']:
-                axis1 = 0
-                axis2 = 1
-                axis3 = 2
-            if type_cluster == 'Kmeans':
-                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
-                cl = Sk_Kmeans(scsc, max_clusters = 30)
-            elif type_cluster == 'HDBSCAN':
-                optimized_hdbscan = Hdbscan(model.scores_raw_)
-                labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
-            with scores:
-                t = model.scores_
-                if type_cluster in ['AP', 'Kmeans']:
-                    st.write('Scree plot')
-                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
-                    st.plotly_chart(fig2)
-                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
-                    data, colors = cl.fit_optimal(nclusters=ncluster)
-                    #fig = px.scatter(data, x=axis1, y=axis2, color= colors)
-                    st.write('Scores plot')
-                    fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
-                    fig.update_traces(marker=dict(size=4))
-                elif type_cluster in ['HDBSCAN']:
-                    st.write('plot HDBSCAN clustering')
-                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
-                    fig.update_traces(marker=dict(size=4))
-                    # st.plotly_chart(fig_hdbscan)
-                    st.write('Optimal number of clusters = ' + str(len(set(labels))))
-                    st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
-                    st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
-                else:
-                    if test == '.dx':
-                        filter = ['origin', 'date', 'time', 'spectrometer/data system']
-                        col = st.selectbox('filter', options= filter)
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
-                        fig.update_traces(marker=dict(size=4))
-                    else:
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
-                        fig.update_traces(marker=dict(size=4))
-                st.plotly_chart(fig)
-            if type_plot =='PCA':
-                with loadings:
-                    st.write('Loadings plot')
-                    p = model.loadings_
-                    pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
-                    df1 = pp.melt(id_vars="wl")
-                    fig = px.line(df1, x = 'wl', y = 'value', color='variable')
-                    fig.update_layout(
-                        legend=dict(x=1, y=0,
-                                    font=dict(
-                                        family="Courier", size=12, color="black"),
-                                    bordercolor="Black", borderwidth=2)
-                    )
-                    st.plotly_chart(fig, use_container_width = True)
-                with influence:
-                    st.write('Influence plot')
-                    ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
-                    leverage = model.leverage_
-                    residuals = model.residuals_
-                    fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-                with hotelling:
-                    st.write('T²-Hotelling vs Q residuals plot')
-                    hotelling = model.hotelling_
-                    ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
-                    hotelling = model.hotelling_
-                    fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-        else:
-            st.markdown('Select a dimensionality reduction technique from the dropdown list')
+container2 = st.container(border=True)
+container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
+scores, loadings, pc = st.columns([2, 3, 0.5])
+influence, hotelling, qexp = st.columns([2, 2, 1])
+
+dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
+cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos
+
+dr_model = None # dimensionality reduction model
+cl_model = None # clustering model
+
+# Dimensionality reduction
+t = pd.DataFrame # scores
+p = pd.DataFrame # loadings
+labels = []
+if not spectra.empty:
+    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
+    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
+    xc = standardize(spectra)
+
+    if dim_red_method == dim_red_methods[1]:
+        dr_model = LinearPCA(xc, Ncomp=5)
+    elif dim_red_method == dim_red_methods[2]:
+        dr_model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
+
+    if dr_model:
+        axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
+        axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
+        axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
+# clustering
+if not t.empty:
+    tcr = standardize(t)
+    # Clustering
+    if clus_method == cluster_methods[1]:
+        ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
+        cl_model = Sk_Kmeans(tcr, max_clusters = 30)
+        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
+        scores.plotly_chart(fig2)
+        data, labels = cl_model.fit_optimal(nclusters = ncluster)
+    elif clus_method == cluster_methods[2]:
+        optimized_hdbscan = Hdbscan(model.scores_raw_)
+        labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
+
+##### Plots
+## Scores
+if not t.empty:
+    with scores:
+        st.write('Scores plot')
+        # scores plot with clustering
+        if list(labels) and meta_data.empty:
+            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+        # scores plot with metadata
+        elif len(list(labels)) == 0 and not meta_data.empty:
+            filter = meta_data.columns[1:]
+            col = st.selectbox('Group by:', options= filter)
+            if col == 0:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+            else:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
+        # color with scores and metadata
+        elif len(list(labels)) > 0 and not meta_data.empty:
+            if clus_method in cluster_methods[1:]:
+                filter = ['None', clus_method]
+                filter.extend(meta_data.columns[1:])
+            else:
+                filter = meta_data.columns[1:].insert(0,'None')
+            col = st.selectbox('Group by:', options= filter)
+            if col == "None":
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+            elif col == clus_method:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+            else:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])))
+        else:
+            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+        fig.update_traces(marker=dict(size=4))
+        st.plotly_chart(fig)
+
+if not spectra.empty:
+    if dim_red_method == dim_red_methods[1]:
+        with loadings:
+            st.write('Loadings plot')
+            p = dr_model.loadings_
+            pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
+            df1 = pp.melt(id_vars="wl")
+            fig = px.line(df1, x = 'wl', y = 'value', color='variable')
+            fig.update_layout(legend=dict(x=1, y=0,font=dict(family="Courier", size=12, color="black"),
+                                          bordercolor="Black", borderwidth=2))
+            st.plotly_chart(fig, use_container_width = True)
+        with influence:
+            st.write('Influence plot')
+            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
+            leverage = dr_model.leverage_
+            residuals = dr_model.residuals_
+            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
+            st.plotly_chart(fig)
+        with hotelling:
+            st.write('T²-Hotelling vs Q residuals plot')
+            hotelling = dr_model.hotelling_
+            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
+            hotelling = dr_model.hotelling_
+            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
+            st.plotly_chart(fig)
\ No newline at end of file
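Two merge leftovers are visible in the new page: the UMAP branch still passes the old `data_import`/`scaled_values`/`categorical_data` names, and the HDBSCAN branch references `model` instead of `dr_model`, so only the PCA + Kmeans path is self-consistent here. For orientation, a condensed, self-contained sketch of that working path (scikit-learn/plotly stand-ins for the app's `standardize`, `LinearPCA` and `Sk_Kmeans` wrappers; data is synthetic):

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
spectra = pd.DataFrame(rng.normal(size=(60, 200)))        # rows = samples

xc = StandardScaler().fit_transform(spectra)              # stand-in for standardize()
scores = pd.DataFrame(PCA(n_components=5).fit_transform(xc),
                      columns=[f'PC{i+1}' for i in range(5)])
t = scores[['PC1', 'PC2', 'PC3']]                         # three user-chosen axes
tcr = pd.DataFrame(StandardScaler().fit_transform(t), columns=t.columns)

km = KMeans(n_clusters=3, init='k-means++', random_state=42, n_init=10).fit(tcr)
labels = [f'cluster#{i}' for i in km.predict(tcr) + 1]    # same naming as Sk_Kmeans

fig = px.scatter_3d(tcr, x='PC1', y='PC2', z='PC3', color=labels)
fig.update_traces(marker=dict(size=4))
fig.write_html('scores_plot.html')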
@@ -3,9 +3,12 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *

 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")

 def nn(x):
     return x is not None
 ########################################################################################
@@ -26,91 +29,135 @@ M9, M10 = st.columns([2,2])
 M9.write("-- Save the model --")

+files_format = ['.csv', '.dx']
+file = M3.radio('select data file format:', options = files_format)
+
-# CSV files loader
-xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
-
-if xcal_csv is not None and ycal_csv is not None:
+### Data
+spectra = pd.DataFrame
+y = pd.DataFrame
+
+# load .csv file
+if file == files_format[0]:
+    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+    if xcal_csv and ycal_csv:
         # Select list for CSV delimiter
-    sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        sep = M3.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
+                       options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
         # Select list for CSV header True / False
-    hdr = M3.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        hdr = M3.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
+                       options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        ###############
         if hdr == 'yes':
             col = 0
         else:
             col = False
-    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
-    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
-    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
-    # Assign data to training and test sets
-    X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    y_train = y_train.iloc[:,0]
-    y_test = y_test.iloc[:,0]
+        ###############
+        spectra, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
+        spectra = pd.DataFrame(spectra)
+        y = pd.DataFrame(y)
+
+## Load .dx file
+elif file == files_format[1]:
+    data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
+    if data_file:
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+            chem_data, spectra, meta_data = read_dx(file = tmp_path)
+            M3.success("The data have been loaded successfully", icon="")
+            yname = M3.selectbox('Select target', options=chem_data.columns)
+            spectra = spectra
+            y = chem_data.loc[:,yname]
+        os.unlink(tmp_path)
+
+### split the data
+if not spectra.empty and not y.empty:
+    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
+    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+    train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
+    # Assign data to training and test sets
+    X_train, y_train, X_test, y_test = pd.DataFrame(spectra.iloc[train_index,:]), pd.DataFrame(y.iloc[train_index]), pd.DataFrame(spectra.iloc[test_index,:]), pd.DataFrame(y.iloc[test_index])
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]

-    ############################# Regression modelling ##########################################
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
-    if regression_algo == reg_algo[1]:
-        # Train model with model function from application_functions.py
-        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
-        reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-    elif regression_algo == reg_algo[2]:
-        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-    elif regression_algo == reg_algo[3]:
-        s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
-        it = M2.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
-        progress_text = "The model is being created. Please wait."
-        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
-        pro = M1.progress(0, text="The model is being created. Please wait!")
-        rega = Reg.BandSelect(n_iter=it)
-        pro.empty()
-        M1.progress(100, text = "The model has successfully been created!")
-        time.sleep(1)
-        reg_model = Reg.model_
-        M2.table(rega[0])
+    #######################################
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    if regression_algo == reg_algo[1]:
+        # Train model with model function from application_functions.py
+        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
+        reg_model = Reg.model_
+        #M2.dataframe(Pin.pred_data_)
+    elif regression_algo == reg_algo[2]:
+        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
+    elif regression_algo == reg_algo[3]:
+        s = M1.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
+        it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
+        progress_text = "The model is being created. Please wait."
+        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
+        pro = M1.progress(0, text="The model is being created. Please wait!")
+        rega = Reg.BandSelect(n_iter=it)
+        pro.empty()
+        M1.progress(100, text = "The model has successfully been created!")
+        time.sleep(1)
+        reg_model = Reg.model_
+        M2.write('-- Table of selected wavelengths --')
+        M2.table(rega[0])

     ################# Model analysis ############
     if regression_algo in reg_algo[1:]:
         yc = Reg.pred_data_[0]
         ycv = Reg.pred_data_[1]
         yt = Reg.pred_data_[2]
-        M1.write("-- Performance metrics --")
-        M1.dataframe(Reg.metrics_)
+        M2.write("-- Performance metrics --")
+        M2.dataframe(Reg.metrics_)
         M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
         M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
         #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
         model_name = M9.text_input('Give it a name')
         if M9.button('Export Model'):
+            path = 'data/models/model_'
+            if file == files_format[0]:
                 #export_package = __import__(model_export)
-            with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
+                with open(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
                     joblib.dump(reg_model, f)
-            if regression_algo == reg_algo[3]:
-                rega[1].sort()
-                pd.DataFrame(rega[1]).to_csv('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_''Wavelengths_index.csv', sep = ';')
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_'+'Wavelengths_index.csv', sep = ';')
+
+            elif file == files_format[1]:
+                #export_package = __import__(model_export)
+                with open(path + model_name + '_on_' + '_data_' + '.pkl','wb') as f:
+                    joblib.dump(reg_model, f)
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
-            st.write('Model Exported')
+            st.write('Model Exported')
+        if regression_algo == reg_algo[3]:
+            st.write('Model Exported')

         # create a report with information on the model
         ## see https://stackoverflow.com/a/59578663
-        #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))

 if st.session_state['interface'] == 'simple':
     st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
-## Load .dx file
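The split now runs on DataFrames for both file formats, which is why the indexing switched from positional `x[train_index]` to `.iloc`. A standalone sketch of the Kennard-Stone split step (synthetic data; the exact import path for pinard's `train_test_split_idx` is an assumption, the call signature is the one used above):

import numpy as np
import pandas as pd
from pinard.model_selection import train_test_split_idx  # import path is an assumption

rng = np.random.default_rng(0)
spectra = pd.DataFrame(rng.normal(size=(40, 100)))
y = pd.DataFrame(rng.normal(size=(40, 1)))

train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone",
                                               metric="correlation", test_size=0.25, random_state=42)
X_train = spectra.iloc[train_index, :]   # .iloc works for both the csv and dx frames,
X_test = spectra.iloc[test_index, :]     # unlike the old positional x[train_index]
y_train = y.iloc[train_index].iloc[:, 0]
y_test = y.iloc[test_index].iloc[:, 0]
print(len(X_train), len(X_test))         # 30 10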
@@ -47,7 +47,6 @@ if NIRS_csv:
     if st.button("Predict"):
         if s:
             result = model_loaded.predict(pred_data.iloc[:,idx])
         else:
             # use prediction function from application_functions.py to predict chemical values
...