diff --git a/Class_Mod/DATA_HANDLING.py b/Class_Mod/DATA_HANDLING.py
index 02e5694ac1468525247520df88f0b8c96361ee89..10fb7ab898055afa3fd3f44365bdaf46ce151ce8 100644
--- a/Class_Mod/DATA_HANDLING.py
+++ b/Class_Mod/DATA_HANDLING.py
@@ -31,17 +31,13 @@ def col_cat(data_import):
     if len(categorical_columns_list) > 0:
         categorical_data = pd.concat(categorical_columns_list, axis=1)
     if len(categorical_columns_list) == 0:
-        empty = ["" for x in range(len(data_import))]
-        categorical_columns_list.append(empty)
-        categorical_data = pd.DataFrame(categorical_columns_list).T
-        categorical_data.columns = ['no categories']
+        categorical_data = pd.DataFrame()
 
     # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
     numerical_data = pd.concat(numerical_columns_list, axis=1)
     numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
-    # Scale the numerical data
-    scaler = StandardScaler()
-    scaled_values = scaler.fit_transform(numerical_data)
-    return numerical_data, categorical_data, scaled_values
+
+    return numerical_data, categorical_data
+
 
 def list_files(mypath, import_type):
diff --git a/Class_Mod/DxReader.py b/Class_Mod/DxReader.py
index d877ff2f2d43a995a51f660a6de5076343315352..f0248949d0697aca948f5053ab56c48bf04e1b6a 100644
--- a/Class_Mod/DxReader.py
+++ b/Class_Mod/DxReader.py
@@ -29,21 +29,21 @@ class DxRead:
             block_met = {   'name': block['title'],
                             'origin': block['origin'],
                             'date': block['date'],
-                            'time': block['time'],
-                            'spectrometer/data system': block['spectrometer/data system'],
-                            'instrumental parameters': block['instrumental parameters'],
-                            'xunits': block['xunits'],
-                            'yunits': block['yunits'],
-                            'xfactor': block['xfactor'],
-                            'yfactor': block['yfactor'],
-                            'firstx': block['firstx'],
-                            'lastx': block['lastx'],
-                            'firsty':block['firsty'],
-                            'miny': block['miny'],
-                            'maxy': block['maxy'],
-                            'npoints': block['npoints'],
+                            # 'time': block['time'],
+                            # 'spectrometer/data system': block['spectrometer/data system'],
+                            # 'instrumental parameters': block['instrumental parameters'],
+                            # 'xunits': block['xunits'],
+                            # 'yunits': block['yunits'],
+                            # 'xfactor': block['xfactor'],
+                            # 'yfactor': block['yfactor'],
+                            # 'firstx': block['firstx'],
+                            # 'lastx': block['lastx'],
+                            # 'firsty':block['firsty'],
+                            # 'miny': block['miny'],
+                            # 'maxy': block['maxy'],
+                            # 'npoints': block['npoints'],
                             'concentrations':block['concentrations'],
-                            'deltax':block['deltax']
+                            # 'deltax':block['deltax']
                             }
             self.__met[f'{i}'] = block_met
         self.metadata_ = pd.DataFrame(self.__met).T
@@ -87,8 +87,13 @@ class DxRead:
         return self.spectra
     @property
     def md_df_(self):
-        return self.metadata_
+        return self.metadata_.drop("concentrations", axis = 1)
     @property
     def chem_data_(self):
-        return self.chem_data
\ No newline at end of file
+        return self.chem_data
+
+@st.cache_data
+def read_dx(file):
+    M = DxRead(file)
+    return M.chem_data, M.specs_df_, M.md_df_
\ No newline at end of file
diff --git a/Class_Mod/KMEANS_.py b/Class_Mod/KMEANS_.py
index ab9e22bcfe471916405de63e722e8853ceb2504e..526a43597155183de2241e0fd0b850f8b4af13ad 100644
--- a/Class_Mod/KMEANS_.py
+++ b/Class_Mod/KMEANS_.py
@@ -19,13 +19,6 @@ class Sk_Kmeans:
     def fit_optimal(self, nclusters):
         model = KMeans(n_clusters = nclusters, init = 'k-means++', random_state = 42)
         model.fit(self.x)
-        yp = model.predict(self.x)
-        num_colors = nclusters
-        colors = ['#' + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(num_colors)]
-        col = np.array(['#' + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(self.x.shape[0])])
-        for i in range(nclusters):
-            ss = np.where(yp==i)
-            col[ss] = colors[i]
-
-
-        return self.x, col
\ No newline at end of file
+        yp = model.predict(self.x)+1
+        clu = [f'cluster#{i}' for i in yp]
+        return self.x, clu
\ No newline at end of file
diff --git a/Class_Mod/Miscellaneous.py b/Class_Mod/Miscellaneous.py
index 1627b39960d520bd909555380c2fb86bf2badf08..79d1708cba65860d3e2cdf0d1ac50fd148a24937 100644
--- a/Class_Mod/Miscellaneous.py
+++ b/Class_Mod/Miscellaneous.py
@@ -47,3 +47,18 @@ def resid_plot( meas, pred):
 def download_results(data, export_name):
     with open(data) as f:
         st.download_button('Download Results', f, export_name)
+
+@st.cache_resource
+def plot_spectra(df):
+    if isinstance(df.columns[0], str):
+        m = 0
+    else:
+        m = np.min(df.columns)
+
+    fig, ax = plt.subplots(figsize = (30,7))
+    df.T.plot(legend=False, ax = ax, color = 'blue')
+    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
+    ax.set_ylabel('Signal intensity', fontsize=18)
+    plt.margins(x = 0)
+    plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy = (m, df.max().max()), size=20, color = 'black', backgroundcolor='red')
+    return fig
diff --git a/Class_Mod/__init__.py b/Class_Mod/__init__.py
index c684862836ba8af35807b889e3b822f091dad3d6..b5e1c5b63d602dd1291703ea4250e4ddf319254b 100644
--- a/Class_Mod/__init__.py
+++ b/Class_Mod/__init__.py
@@ -7,6 +7,6 @@ from .LWPLSR_ import model_LWPLSR
 from .Regression_metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
-from .DxReader import DxRead
+from .DxReader import DxRead, read_dx
 from .HDBSCAN_Clustering import Hdbscan
 
diff --git a/Modules.py b/Modules.py
index 0076fb22adc7da0d1aec6530ee3f6ab0a754d370..09d297f18c22505322b5557d93afd8c60bd76db8 100644
--- a/Modules.py
+++ b/Modules.py
@@ -1,4 +1,4 @@
-from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan
+from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx
 # find_col_index
 
-from Class_Mod.Miscellaneous import prediction, download_results
+from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra
diff --git a/Packages.py b/Packages.py
index b0d939baa8021ba8dfa14088d1b33d972500954d..ec7d83f23abc877b99e5eb07c3abc95a2280edba 100644
--- a/Packages.py
+++ b/Packages.py
@@ -41,6 +41,7 @@ from PIL import Image
 import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
+import matplotlib
 
 ### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
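Note on the empty-DataFrame sentinels introduced throughout this patch: the `.empty` guards in the pages below only behave as intended when the sentinel is an instance, not the class object itself (hence `pd.DataFrame()` above). A minimal illustration of the difference, in plain pandas:

    import pandas as pd

    df = pd.DataFrame          # binds the class itself, not an instance
    print(type(df.empty))      # <class 'property'> -- truthy, so `not df.empty` is always False
    df = pd.DataFrame()        # an actual empty frame, as the sentinel checks require
    print(df.empty)            # True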
Data Analysis", divider='blue') -scores, loadings, pc = st.columns([2, 3, 0.5]) -influence, hotelling, qexp = st.columns([2, 2, 1]) +## Preallocation of data structure +spectra = pd.DataFrame +meta_data = pd.DataFrame +selected_samples = pd.DataFrame -with container1: - # loader for csv file containing NIRS spectra - sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) - if sselectx_csv is not None: - test = sselectx_csv.name[sselectx_csv.name.find('.'):] - if test== '.csv': - with col1: - # Select list for CSV delimiter - psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9) +# loader for datafile +data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) + + +if data_file: + # Retrieve the extension of the file + test = data_file.name[data_file.name.find('.'):] + + ## Load .csv file + if test== '.csv': + with col1: + # Select list for CSV delimiter + psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9) # Select list for CSV header True / False - phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31) - if phdr == 'yes': - col = 0 - else: - col = False - data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) - data_import, categorical_data, scaled_values = col_cat(data_import) + phdr = st.selectbox("indexes column in csv? 
- _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31) + if phdr == 'yes': + col = 0 + else: + col = False + imp = pd.read_csv(data_file, sep=psep, index_col=col) + spectra = col_cat(imp)[0] + meta_data = col_cat(imp)[1] + st.success("The data have been loaded successfully", icon="✅") + + ## Load .dx file + elif test == '.dx': + # Create a temporary file to save the uploaded file + with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: + tmp.write(data_file.read()) + tmp_path = tmp.name + with col1: + _, spectra, meta_data = read_dx(file = tmp_path) st.success("The data have been loaded successfully", icon="✅") - ## Visualize spectra - - with col2: - fig, ax = plt.subplots(figsize = (30,7)) - data_import.T.plot(legend=False, ax = ax, color = 'blue') - ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) - ax.set_ylabel('Signal', fontsize=18) - plt.margins(x = 0) - st.pyplot(fig) - - st.write("Summary") - info = pd.DataFrame({'N':[data_import.shape[0]], - 'Min': [np.min(data_import)], - 'Max':[np.max(data_import)],}, index = ['Values']).T - info.rename_axis('information') - st.table(data=info) - - elif test == '.dx': - # Create a temporary file to save the uploaded file - with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: - tmp.write(sselectx_csv.read()) - tmp_path = tmp.name - with col1: - data = DxRead(path = tmp_path) - data_import = data.specs_df_ - st.success("The data have been loaded successfully", icon="✅") - - ## Visualize spectra - - with col2: - fig, ax = plt.subplots(figsize = (30,7)) - data_import.T.plot(legend=False, ax = ax, color = 'blue') - ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) - ax.set_ylabel('Signal', fontsize=18) - plt.margins(x = 0) - st.pyplot(fig) - - st.write("Summary") - info = pd.DataFrame({'N':[data_import.shape[0]], - 'Min': [np.min(data_import)], - 'Max':[np.max(data_import)],}, index = ['Values']).T - info.rename_axis('information') - st.table(data=info) - os.unlink(tmp_path) - - - - - -###################################################################################### + os.unlink(tmp_path) + + +## Visualize spectra +if not spectra.empty: + with col2: + fig = plot_spectra(spectra) + st.pyplot(fig) + ############################## Exploratory data analysis ############################### -plot_type=['', 'PCA','UMAP', 'NMF'] -cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] -with container2: - if sselectx_csv is not None: - plot_type=['', 'PCA','UMAP', 'NMF'] - cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] - - with pc: - type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37) - type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38) - # compute UMAP - umap_maker in application_functions.py - if type_plot == 'PCA': - model = LinearPCA(data_import, Ncomp=5) - elif type_plot =='UMAP': - model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data) - - - if type_plot in ['PCA', 'UMAP']: - if type_plot in ['PCA']: - # add 2 select lists to choose which component to plot - axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0) - axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1) - axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2) - elif type_plot in ['UMAP']: - axis1 = 0 - axis2 = 1 - axis3 = 2 - - if type_cluster == 'Kmeans': - scsc = 
 ############################## Exploratory data analysis ###############################
-plot_type=['', 'PCA','UMAP', 'NMF']
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-with container2:
-    if sselectx_csv is not None:
-        plot_type=['', 'PCA','UMAP', 'NMF']
-        cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-
-        with pc:
-            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
-            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
-            # compute UMAP - umap_maker in application_functions.py
-            if type_plot == 'PCA':
-                model = LinearPCA(data_import, Ncomp=5)
-            elif type_plot =='UMAP':
-                model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
-
-        if type_plot in ['PCA', 'UMAP']:
-            if type_plot in ['PCA']:
-                # add 2 select lists to choose which component to plot
-                axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
-                axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
-                axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
-            elif type_plot in ['UMAP']:
-                axis1 = 0
-                axis2 = 1
-                axis3 = 2
-
-            if type_cluster == 'Kmeans':
-                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
-                cl = Sk_Kmeans(scsc, max_clusters = 30)
-
-            elif type_cluster == 'HDBSCAN':
-                optimized_hdbscan = Hdbscan(model.scores_raw_)
-                labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
-            with scores:
-                t = model.scores_
-                if type_cluster in ['AP', 'Kmeans']:
-                    st.write('Scree plot')
-                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
-                    st.plotly_chart(fig2)
-
-                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
-                    data, colors = cl.fit_optimal(nclusters=ncluster)
-                    #fig = px.scatter(data, x=axis1, y=axis2, color= colors)
-                    st.write('Scores plot')
-                    fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
-                    fig.update_traces(marker=dict(size=4))
-
-                elif type_cluster in ['HDBSCAN']:
-                    st.write('plot HDBSCAN clustering')
-                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
-                    fig.update_traces(marker=dict(size=4))
-                    # st.plotly_chart(fig_hdbscan)
-                    st.write('Optimal number of clusters = ' + str(len(set(labels))))
-                    st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
-                    st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
-
-                else:
-                    if test == '.dx':
-                        filter = ['origin', 'date', 'time', 'spectrometer/data system']
-                        col = st.selectbox('filter', options= filter)
-
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
-                        fig.update_traces(marker=dict(size=4))
-                    else:
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
-                        fig.update_traces(marker=dict(size=4))
-
-                    st.plotly_chart(fig)
-
-            if type_plot =='PCA':
-                with loadings:
-                    st.write('Loadings plot')
-                    p = model.loadings_
-                    pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
-                    df1 = pp.melt(id_vars="wl")
-
-                    fig = px.line(df1, x = 'wl', y = 'value', color='variable')
-                    fig.update_layout(
-                        legend=dict(x=1, y=0,
-                                    font=dict(
-                                        family="Courier", size=12, color="black"),
-                                    bordercolor="Black", borderwidth=2)
-                        )
-                    st.plotly_chart(fig, use_container_width = True)
-
-                with influence:
-                    st.write('Influence plot')
-                    ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
-                    leverage = model.leverage_
-                    residuals = model.residuals_
-                    fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-
-                with hotelling:
-                    st.write('T²-Hotelling vs Q residuals plot')
-                    hotelling = model.hotelling_
-                    ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
-
-                    hotelling = model.hotelling_
-                    fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-
-    else:
-        st.markdown('Select a dimensionality reduction technique from the dropdown list')
+container2 = st.container(border=True)
+container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
+scores, loadings, pc = st.columns([2, 3, 0.5])
+influence, hotelling, qexp = st.columns([2, 2, 1])
+
+dim_red_methods = ['', 'PCA','UMAP', 'NMF']  # List of dimensionality reduction algos
+cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']  # List of clustering algos
+
+dr_model = None  # dimensionality reduction model
+cl_model = None  # clustering model
+
+# Dimensionality reduction
+t = pd.DataFrame()  # scores
+p = pd.DataFrame()  # loadings
+labels = []
+if not spectra.empty:
+    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
+    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
+    xc = standardize(spectra)
+
+    if dim_red_method == dim_red_methods[1]:
+        dr_model = LinearPCA(xc, Ncomp=5)
+    elif dim_red_method == dim_red_methods[2]:
+        # spectra/xc/meta_data stand in for the removed data_import/scaled_values/categorical_data
+        dr_model = Umap(data_import = spectra, numerical_data = xc, cat_data = meta_data)
+
+    if dr_model:
+        axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
+        axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
+        axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
+
+# clustering
+if not t.empty:
+    tcr = standardize(t)
+    # Clustering
+    if clus_method == cluster_methods[1]:
+        ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
+        cl_model = Sk_Kmeans(tcr, max_clusters = 30)
+        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
+        scores.plotly_chart(fig2)
+        data, labels = cl_model.fit_optimal(nclusters = ncluster)
+
+    elif clus_method == cluster_methods[2]:
+        optimized_hdbscan = Hdbscan(dr_model.scores_raw_)
+        labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
+
+##### Plots
+
+## Scores
+if not t.empty:
+    with scores:
+        st.write('Scores plot')
+        # scores plot with clustering
+        if list(labels) and meta_data.empty:
+            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+
+        # scores plot with metadata
+        elif len(list(labels)) == 0 and not meta_data.empty:
+            filter = meta_data.columns[1:]
+            col = st.selectbox('Group by:', options= filter)
+            if col == 0:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+            else:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower, meta_data[col])))
+
+        # color with scores and metadata
+        elif len(list(labels)) > 0 and not meta_data.empty:
+            if clus_method in cluster_methods[1:]:
+                filter = ['None', clus_method]
+                filter.extend(meta_data.columns[1:])
+            else:
+                filter = meta_data.columns[1:].insert(0, 'None')
+
+            col = st.selectbox('Group by:', options= filter)
+            if col == "None":
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+            elif col == clus_method:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+            else:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower, meta_data[col])))
+        else:
+            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+        fig.update_traces(marker=dict(size=4))
+        st.plotly_chart(fig)
+
+
+
+if not spectra.empty:
+    if dim_red_method == dim_red_methods[1]:
+        with loadings:
+            st.write('Loadings plot')
+            p = dr_model.loadings_
+            pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
+            df1 = pp.melt(id_vars="wl")
+            fig = px.line(df1, x = 'wl', y = 'value', color='variable')
+            fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
+                                          bordercolor="Black", borderwidth=2))
+            st.plotly_chart(fig, use_container_width = True)
+
+        with influence:
+            st.write('Influence plot')
+            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
+            leverage = dr_model.leverage_
+            residuals = dr_model.residuals_
+            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
+            st.plotly_chart(fig)
+
+        with hotelling:
+            st.write('T²-Hotelling vs Q residuals plot')
+            hotelling = dr_model.hotelling_
+            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
+            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²", yaxis_title="Residuals")
+            st.plotly_chart(fig)
\ No newline at end of file
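The reworked `fit_optimal` above returns human-readable cluster labels instead of random hex colours, which is what lets the scores plot colour by `labels` directly. A standalone sketch of the elbow-then-fit flow (toy data; the cluster count is arbitrary):

    import numpy as np
    import pandas as pd
    from Class_Mod import Sk_Kmeans

    tcr = pd.DataFrame(np.random.rand(50, 3))         # stand-in for the standardized scores
    cl_model = Sk_Kmeans(tcr, max_clusters=30)
    print(cl_model.inertia_.T)                        # inertia table behind the scree plot
    data, labels = cl_model.fit_optimal(nclusters=3)  # labels like ['cluster#1', 'cluster#3', ...]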
diff --git a/pages/2-model_creation.py b/pages/2-model_creation.py
index 3fadcb45393c71242b4876ccf32912c616a59ec4..3f506ea752eda5ebf0460f6b53ca8a24015225ec 100644
--- a/pages/2-model_creation.py
+++ b/pages/2-model_creation.py
@@ -3,9 +3,12 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
 
+
 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")
+
+
 def nn(x):
     return x is not None
 ########################################################################################
@@ -26,91 +29,135 @@ M9, M10 = st.columns([2,2])
 M9.write("-- Save the model --")
 
+files_format = ['.csv', '.dx']
+file = M3.radio('select data file format:', options = files_format)
 
-# CSV files loader
-xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+### Data
+spectra = pd.DataFrame()
+y = pd.DataFrame()
 
-
-if xcal_csv is not None and ycal_csv is not None:
+# load .csv file
+if file == files_format[0]:
+    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+
+    if xcal_csv and ycal_csv:
         # Select list for CSV delimiter
-    sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        sep = M3.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
+                       options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
         # Select list for CSV header True / False
-    hdr = M3.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        hdr = M3.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
+                       options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        ###############
         if hdr == 'yes':
             col = 0
         else:
             col = False
-    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
-    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
-    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
-    # Assign data to training and test sets
-    X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    y_train = y_train.iloc[:,0]
-    y_test = y_test.iloc[:,0]
-
-
-
-    ############################# Regression modelling ##########################################
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
-    if regression_algo == reg_algo[1]:
-        # Train model with model function from application_functions.py
-        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
-        reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-
-    elif regression_algo == reg_algo[2]:
-        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-
-    elif regression_algo == reg_algo[3]:
-        s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
-        it = M2.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
-        progress_text = "The model is being created. Please wait."
-
-        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
-        pro = M1.progress(0, text="The model is being created. Please wait!")
-        rega = Reg.BandSelect(n_iter=it)
-        pro.empty()
-        M1.progress(100, text = "The model has successfully been created!")
+        ###############
+        spectra, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
+        spectra = pd.DataFrame(spectra)
+        y = pd.DataFrame(y)
+
+
+
+## Load .dx file
+elif file == files_format[1]:
+    data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
+    if data_file:
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+            chem_data, spectra, meta_data = read_dx(file = tmp_path)
+            M3.success("The data have been loaded successfully", icon="✅")
+            yname = M3.selectbox('Select target', options=chem_data.columns)
+            y = chem_data.loc[:,yname]
+
+        os.unlink(tmp_path)
+
+### split the data
+if not spectra.empty and not y.empty:
+    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
+    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+    train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
+    # Assign data to training and test sets
+    X_train, y_train, X_test, y_test = pd.DataFrame(spectra.iloc[train_index,:]), pd.DataFrame(y.iloc[train_index]), pd.DataFrame(spectra.iloc[test_index,:]), pd.DataFrame(y.iloc[test_index])
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]
+
+
Please wait!") - rega = Reg.BandSelect(n_iter=it) - pro.empty() - M1.progress(100, text = "The model has successfully been created!") + ############### + spectra, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col) + spectra = pd.DataFrame(spectra) + y = pd.DataFrame(y) + + + +## Load .dx file +elif file == files_format[1]: + data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file") + if data_file: + with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: + tmp.write(data_file.read()) + tmp_path = tmp.name + chem_data, spectra, meta_data = read_dx(file = tmp_path) + M3.success("The data have been loaded successfully", icon="✅") + yname = M3.selectbox('Select target', options=chem_data.columns) + spectra = spectra + y = chem_data.loc[:,yname] + + os.unlink(tmp_path) + +### split the data +if not spectra.empty and not y.empty: + rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i") + # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing + train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed) + # Assign data to training and test sets + X_train, y_train, X_test, y_test = pd.DataFrame(spectra.iloc[train_index,:]), pd.DataFrame(y.iloc[train_index]), pd.DataFrame(spectra.iloc[test_index,:]), pd.DataFrame(y.iloc[test_index]) + y_train = y_train.iloc[:,0] + y_test = y_test.iloc[:,0] + + +####################################### + regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12) + if regression_algo == reg_algo[1]: + # Train model with model function from application_functions.py + Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test) + reg_model = Reg.model_ + #M2.dataframe(Pin.pred_data_) + elif regression_algo == reg_algo[2]: + reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr) + + elif regression_algo == reg_algo[3]: + s = M1.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3) + it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100) + progress_text = "The model is being created. Please wait." - time.sleep(1) - reg_model = Reg.model_ - M2.table(rega[0]) + Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s) + pro = M1.progress(0, text="The model is being created. 
Please wait!") + rega = Reg.BandSelect(n_iter=it) + pro.empty() + M1.progress(100, text = "The model has successfully been created!") + time.sleep(1) + reg_model = Reg.model_ + M2.write('-- Table of selected wavelengths --') + M2.table(rega[0]) ################# Model analysis ############ - - if regression_algo in reg_algo[1:]: - yc = Reg.pred_data_[0] - ycv = Reg.pred_data_[1] - yt = Reg.pred_data_[2] + if regression_algo in reg_algo[1:]: + yc = Reg.pred_data_[0] + ycv = Reg.pred_data_[1] + yt = Reg.pred_data_[2] - M1.write("-- Performance metrics --") - M1.dataframe(Reg.metrics_) + M2.write("-- Performance metrics --") + M2.dataframe(Reg.metrics_) - M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt])) - M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt])) + M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt])) + M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt])) #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20) - model_name = M9.text_input('Give it a name') - if M9.button('Export Model'): + model_name = M9.text_input('Give it a name') + if M9.button('Export Model'): + path = 'data/models/model_' + if file == files_format[0]: #export_package = __import__(model_export) - with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f: + with open(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f: joblib.dump(reg_model, f) - - if regression_algo == reg_algo[3]: - rega[1].sort() - pd.DataFrame(rega[1]).to_csv('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_''Wavelengths_index.csv', sep = ';') + if regression_algo == reg_algo[3]: + rega[1].sort() + pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_'+'Wavelengths_index.csv', sep = ';') + + elif file == files_format[1]: + #export_package = __import__(model_export) + with open(path + model_name + '_on_' + '_data_' + '.pkl','wb') as f: + joblib.dump(reg_model, f) + if regression_algo == reg_algo[3]: + rega[1].sort() + pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';') + st.write('Model Exported') + + if regression_algo == reg_algo[3]: st.write('Model Exported') - + # create a report with information on the model ## see https://stackoverflow.com/a/59578663 - #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv])) - if st.session_state['interface'] == 'simple': - st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !') + if st.session_state['interface'] == 'simple': + st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !') + + +## Load .dx file diff --git a/pages/3-prediction.py b/pages/3-prediction.py index 4ac4e5832e6d4bfce0d6c96ac0ffe748ffec7a97..65130fd1dfcdde9f491dc7f8eaee4e19817ddc55 100644 --- a/pages/3-prediction.py +++ b/pages/3-prediction.py @@ -47,7 +47,6 @@ if NIRS_csv: if st.button("Predict"): if s: - result = model_loaded.predict(pred_data.iloc[:,idx]) else: # use prediction function from application_functions.py to predict chemical values diff --git a/predictions/.gitkeep b/predictions/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000