diff --git a/src/Modules.py b/src/Modules.py
index 6d97307362ea3752e5110fa08a4a7f46c96ca3be..d4518bdbd6e89963494654dfba74be42bca20149 100644
--- a/src/Modules.py
+++ b/src/Modules.py
@@ -1,13 +1,13 @@
 from Packages import *
-from Class_Mod import Plsr, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
-from Class_Mod import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess
-from Class_Mod.DATA_HANDLING import *
-from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra, local_css, desc_stats, hash_data
-from Class_Mod.Hash import create_hash, check_hash
-from Report import report
+from utils import Plsr, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
+from utils import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess
+from utils.DATA_HANDLING import *
+from utils.Miscellaneous import prediction, download_results, plot_spectra, local_css, desc_stats, hash_data, data_split, pred_hist
+from utils.Hash import create_hash, check_hash
+from report import report
 css_file = Path("style/")
 pages_folder = Path("pages/")
 from style.header import add_header, add_sidebar
 from config.config import pdflatex_path
 local_css(css_file / "style.css")
-from Class_Mod import KS, RDM
+from utils import KS, RDM
diff --git a/src/Packages.py b/src/Packages.py
index f2f58a261b99ef60271b408c833ff09af1d8ab74..3923d3c6b5c5520c5e250b1864ecb5dd32c0466f 100644
--- a/src/Packages.py
+++ b/src/Packages.py
@@ -1,6 +1,5 @@
 ## Data loading, handling, and preprocessing
 import os
-import json
 import glob
 import sys
 from pathlib import Path
@@ -8,23 +7,23 @@ import csv
 import re
 import jcamp
 import random
-import datetime
+from datetime import datetime
 import numpy as np
-import shutil
-import pandas as pd
+from shutil import rmtree, move, make_archive
+from pandas import DataFrame, read_csv, concat, Series, json_normalize
 from itertools import combinations
-import zipfile
-import hashlib
+from hashlib import md5
 from matplotlib import colors
 from matplotlib.colors import Normalize
-from abc import ABC,abstractmethod
+from abc import ABC, abstractmethod
 from typing import Optional, List
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
-import time
 from scipy.stats import skew, kurtosis
 from scipy.signal import savgol_filter, find_peaks_cwt, detrend
 import scipy as sc
-import kennard_stone as ks
+from kennard_stone import train_test_split as ks_train_test_split
+from kennard_stone import KFold as ks_KFold
+
 ### Exploratory data analysis-Dimensionality reduction
 from umap.umap_ import UMAP
 from sklearn.decomposition import PCA, NMF
@@ -58,7 +57,7 @@ import plotly.graph_objects as go
 import plotly.io as pio
 import matplotlib.pyplot as plt, mpld3
 import seaborn as sns
-import matplotlib
+# import matplotlib

### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
@@ -66,7 +65,7 @@ from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score,
 ## Web app construction
 import streamlit as st
 from st_pages import Page, Section, show_pages, add_page_title, hide_pages
-from tempfile import NamedTemporaryFile
+from tempfile import NamedTemporaryFile, TemporaryDirectory
 # help on streamlit input https://docs.streamlit.io/library/api-reference/widgets

#Library for connecting to SQL DB
@@ -76,7 +75,7 @@ import pyodbc
 import json

 # save models
-import joblib
+from joblib import dump, load, hash
 # import pickle as pkl

 from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, anneal
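The Packages.py hunk above replaces whole-module imports with named ones, including two aliases for the deterministic Kennard-Stone splitters. A minimal sketch (not part of the patch) of how those aliases are typically used; the array shapes are illustrative, and `kennard_stone` mirrors the scikit-learn split/fold API but picks samples by coverage of the predictor space rather than at random, so there is no random_state:

    import numpy as np
    from kennard_stone import train_test_split as ks_train_test_split
    from kennard_stone import KFold as ks_KFold

    X = np.random.rand(50, 100)   # hypothetical spectra: 50 samples x 100 wavelengths
    y = np.random.rand(50)        # hypothetical target values

    # deterministic, coverage-based 75/25 split
    X_train, X_test, y_train, y_test = ks_train_test_split(X, y, test_size=0.25)

    # deterministic folds for cross-validation
    for train_idx, test_idx in ks_KFold(n_splits=3).split(X):
        print(len(train_idx), len(test_idx))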
diff --git a/src/app.py b/src/app.py
index 46b5ee09f60af190e5df5c4babe255202e9805a7..4eb6f9ce225901015a73542682a5c2898cf2b5a6 100644
--- a/src/app.py
+++ b/src/app.py
@@ -4,6 +4,8 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide",)
 from mod import *
 # from utils.DATA_HANDLING import *
 from PIL import Image
+import base64
+background_img(change=None)
 add_header()
 # add_sidebar(pages_folder)
diff --git a/src/images/img-sky.jpg b/src/images/img-sky.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dd6c8febdb245c403c17c1bba5ec72d596f05731
Binary files /dev/null and b/src/images/img-sky.jpg differ
diff --git a/src/messange.png b/src/messange.png
new file mode 100644
index 0000000000000000000000000000000000000000..fde8399fd157796d5f7d268a7f21d09a992fb8cf
Binary files /dev/null and b/src/messange.png differ
diff --git a/src/mod.py b/src/mod.py
index 2b7f62291cf192471afadb4ec0b35a87ee750d2e..ca6a0fc2d5ea89f1e2f62f29bf75957072c775c0 100644
--- a/src/mod.py
+++ b/src/mod.py
@@ -12,7 +12,7 @@ from Packages import *
 # from utils import read_dx, DxRead, Plsr, LinearPCA, Umap, find_col_index, PinardPlsr, Nmf, AP
 # from utils import LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx, PlsProcess, PinardPlsr, Plsr
 from utils.DATA_HANDLING import *
-from utils.Miscellaneous import prediction, download_results, plot_spectra, local_css, desc_stats, hash_data
+from utils.Miscellaneous import prediction, download_results, plot_spectra, local_css, desc_stats, hash_data, hist, data_split, pred_hist, background_img
 from utils.Hash import create_hash, check_hash
 from report import report
 css_file = Path("style/")
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index e4baa2a19775b8ed381f22990d481b7a4cff074e..ae1392387470d5b8c2b689208282dcf7602820c7 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -1,18 +1,13 @@
 from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
-from utils import read_dx, DxRead, LinearPCA, Umap, find_col_index, Nmf, Sk_Kmeans, AP, KS, RDM
+from utils import read_dx, DxRead, LinearPCA, Umap, find_col_index, Nmf, Sk_Kmeans, AP, Hdbscan, KS, RDM
 from mod import *
 # HTML for the "CEFE - CNRS" banner
 add_header()
 add_sidebar(pages_folder)
 local_css(css_file / "style_model.css")#load specific model page css
-
-
-
-
-
-
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 hash_ = ''
 def p_hash(add):
@@ -22,17 +17,19 @@ def p_hash(add):

 # #################################### Methods ##############################################
 # empty temp figures
+report_path = Path("report")
+report_path_rel = Path("./report")
+# st.write(os.listdir(report_path))
+
 def delete_files(keep):
     supp = []
     # Walk through the directory
-    for root, dirs, files in os.walk('report/', topdown=False):
+    for root, dirs, files in os.walk(report_path, topdown=False):
         for file in files:
             if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in keep):
                 os.remove(os.path.join(root, file))
-
-dirpath = Path('report/out/model')
-if dirpath.exists() and dirpath.is_dir():
-    shutil.rmtree(dirpath)
+if Path('report/out/model').exists() and Path('report/out/model').is_dir():
+    rmtree(Path('report/out/model'))

 # algorithms available on our app
 dim_red_methods=['PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
@@ -69,21 +66,21 @@ delete_files(keep = ['.py', '.pyc','.bib'])
 # ####################################### page preamble #######################################
 st.title("Calibration Subset Selection") # page title
 st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
-col2, col1 = st.columns([3, 1])
-col2.image("./images/sample selection.png", use_column_width=True) # graphical abstract
+c1, c2 = st.columns([3, 1])
+c1.image("./images/sample selection.png", use_column_width=True) # graphical abstract

 ################################### I - Data Loading and Visualization ########################################
 files_format = ['csv', 'dx'] # Supported files format
 # loader for datafile
-file = col1.file_uploader("Data file", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+file = c2.file_uploader("Data file", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)

 ## Preallocation of data structure
-spectra = pd.DataFrame()
-meta_data = pd.DataFrame()
-tcr=pd.DataFrame()
-sam=pd.DataFrame()
-sam1=pd.DataFrame()
-selected_samples = pd.DataFrame()
+spectra = DataFrame()
+meta_data = DataFrame()
+tcr=DataFrame()
+sam=DataFrame()
+sam1=DataFrame()
+selected_samples = DataFrame()
 non_clustered = None
 l1 = []
 labels = []
@@ -92,12 +89,12 @@ dr_model = None # dimensionality reduction model
 cl_model = None # clustering model
 selection = None
 selection_number = "None"
-samples_df_chem = pd.DataFrame
+samples_df_chem = DataFrame
 selected_samples = []
 selected_samples_idx = []

 if not file:
-    col1.info('Info: Please load data file !')
+    c2.info('Info: Please load data file !')
 else:
     extension = file.name.split(".")[-1]
@@ -106,14 +103,14 @@ else:
     match extension:
         ## Load .csv file
         case 'csv':
-            with col1:
+            with c2:
                 psep = st.radio("Select csv separator - _detected_: ", options = [";", ","],horizontal=True, key=9)
                 phdr = st.radio("indexes column in csv? - _detected_: " , options = ["no", "yes"],horizontal=True, key=31)
                 if phdr == 'yes':col = 0
                 else:col = False
-            # with col1:
+            # with c2:
             #     # Select list for CSV delimiter
             #     psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+file.name)), options = [";", ","], index = [";", ","].index(str(find_delimiter('data/'+file.name))),horizontal=True, key=9)
             #     # Select list for CSV header True / False
@@ -130,7 +127,7 @@ else:

             @st.cache_data
             def csv_loader(change):
-                imp = pd.read_csv(file, sep = psep, index_col=col)
+                imp = read_csv(file, sep = psep, index_col=col)
                 spectra, md_df_st_ = col_cat(imp)
                 meta_data = md_df_st_
                 return spectra, md_df_st_, meta_data, imp
@@ -148,7 +145,7 @@ else:

         ## Load .dx file
         case 'dx':
-            with col1:
+            with c2:
                 # Create a temporary file to save the uploaded file
                 with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
                     tmp.write(file.read())
@@ -187,47 +184,49 @@ if not spectra.empty:

     @st.cache_data
     def spectra_visualize(change):
-        fig, ax = plt.subplots(figsize = (30,7))
-        if extension =='dx':
-            lab = ['Wavenumber (1/cm)' if meta_data.loc[:,'xunits'][0] == '1/cm' else 'Wavelength (nm)']
-            if lab[0] =='Wavenumber (1/cm)':
-                spectra.T.plot(legend=False, ax = ax).invert_xaxis()
-            else :
-                spectra.T.plot(legend=False, ax = ax)
-            ax.set_xlabel(lab[0], fontsize=18)
-        else:
-            spectra.T.plot(legend=False, ax = ax)
-            ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-
-        ax.set_ylabel('Signal intensity', fontsize=18)
-        plt.margins(x = 0)
-        plt.tight_layout()
+        # fig, ax = plt.subplots(figsize = (30,7))
+        # if extension =='dx':
+        #     lab = ['Wavenumber (1/cm)' if meta_data.loc[:,'xunits'][0] == '1/cm' else 'Wavelength (nm)']
+        #     if lab[0] =='Wavenumber (1/cm)':
+        #         spectra.T.plot(legend=False, ax = ax).invert_xaxis()
+        #     else :
+        #         spectra.T.plot(legend=False, ax = ax)
+        #     ax.set_xlabel(lab[0], fontsize=18)
+        # else:
+        #     spectra.T.plot(legend=False, ax = ax)
+        #     ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
+
+        # ax.set_ylabel('Signal intensity', fontsize=18)
+        # plt.margins(x = 0)
+        # plt.tight_layout()
+        fig = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")

-        data_info = pd.DataFrame({'Name': [file.name],
+        data_info = DataFrame({'Name': [file.name],
                                 'Number of scanned samples': [n_samples]},
                                  index = ['Input file'])

-        # update lines size to export for report
-        for line in ax.get_lines():
-            line.set_linewidth(0.8) # Set the desired line width here
-
-        # Update the size of plot axis for exprotation to report
-        l, w = fig.get_size_inches()
-        fig.set_size_inches(8, 3)
-        for label in (ax.get_xticklabels()+ax.get_yticklabels()):
-            ax.xaxis.label.set_size(9.5)
-            ax.yaxis.label.set_size(9.5)
-        plt.tight_layout()
-        fig.set_size_inches(l, w)# reset the plot size to its original size
+        # # update lines size to export for report
+        # for line in ax.get_lines():
+        #     line.set_linewidth(0.8) # Set the desired line width here
+
+        # # Update the size of plot axis for exportation to report
+        # l, w = fig.get_size_inches()
+        # fig.set_size_inches(8, 3)
+        # for label in (ax.get_xticklabels()+ax.get_yticklabels()):
+        #     ax.xaxis.label.set_size(9.5)
+        #     ax.yaxis.label.set_size(9.5)
+        # plt.tight_layout()
+        # fig.set_size_inches(l, w)# reset the plot size to its original size
         return fig, data_info

     fig_spectra, data_info = spectra_visualize(change = hash_)

-    col1, col2 = st.columns([3, 1])
-    with col1:
+    c3, c4 = st.columns([3, 1])
+    with c3:
         st.pyplot(fig_spectra)

-    with col2:
+    with c4:
         st.info('Information on the loaded data file')
         st.write(data_info) ## table showing the number of samples in the data file
@@ -237,13 +236,13 @@ if not spectra.empty:
     st.header("II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')

###### 1- Dimensionality reduction ######
-t = pd.DataFrame # scores
-p = pd.DataFrame # loadings
+t = DataFrame # scores
+p = DataFrame # loadings

if not spectra.empty:
     xc = standardize(spectra, center=True, scale=False)
-    bb1, bb2, bb3, bb4, bb5, bb6, bb7 = st.columns([1,1,0.6,0.6,0.6,1.5,1.5])
-    with bb1:
+    c5, c6, c7, c8, c9, c10, c11 = st.columns([1, 1, 0.6, 0.6, 0.6, 1.5, 1.5])
+    with c5:
         dim_red_method = st.selectbox("Dimensionality reduction techniques: ", options = ['']+dim_red_methods, index = default_reduction_option, key = 37, format_func = lambda x: x if x else "<Select>")
         if dim_red_method == '':
             st.info('Info: Select a dimensionality reduction technique!')
@@ -284,9 +283,9 @@ if not spectra.empty:

if dr_model:
-    axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
-    axis2 = bb4.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
-    axis3 = bb5.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+    axis1 = c7.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
+    axis2 = c8.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
+    axis3 = c9.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
     axis = np.unique([axis1, axis2, axis3])
     p_hash(axis)
     t = dr_model.scores_.loc[:,np.unique(axis)]
@@ -298,38 +297,26 @@ if not t.empty:
     non_clustered = None

     if dim_red_method == 'UMAP':
-        scores = st.container()
+        c12 = st.container()
     else:
-        scores, loadings= st.columns([3,3])
+        c12, c13 = st.columns([3,3])

if not spectra.empty:
-    sel_ratio = bb2.number_input('Enter the number/fraction of samples to be selected:',min_value=0.01, max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20, format="%.2f", disabled= disablewidgets)
-    if sel_ratio:
-        p_hash(sel_ratio)
-        if sel_ratio > 1.00:
-            ratio = int(sel_ratio)
-        elif sel_ratio < 1.00:
-            ratio = int(sel_ratio*spectra.shape[0])
-if dr_model and not clus_method:
-    clus_method = bb2.radio('Select samples selection strategy:',
-                            options = ['RDM', 'KS'],)
-elif dr_model and clus_method:
-    # sel_ratio = bb2.number_input('Enter the ratio/percentage of samples to be selected:',min_value=0.01, max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20, format="%.2f")
-    # p_hash(sel_ratio)
-    # if sel_ratio > 1.00:
-    #     ratio = int(sel_ratio)
-    # elif sel_ratio < 1.00:
-    #     ratio = int(sel_ratio*spectra.shape[0])
-
-    if clus_method in cluster_methods:
-        selection = bb2.radio('Select samples selection strategy:',
-                              options = selec_strategy, index = default_sample_selection_option,key=102,disabled = False)
-    else:
-        selection = bb2.radio('Select samples selection strategy:',
-                              options = selec_strategy, horizontal=True, key=102,disabled = True)
-
+    with c6:
+        sel_ratio = st.number_input('Enter the number/fraction of samples to be selected:',min_value=0.01, max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20, format="%.2f", disabled= disablewidgets)
+        if sel_ratio:
+            if sel_ratio > 1.00:
+                ratio = int(sel_ratio)
+            elif sel_ratio < 1.00:
+                ratio = int(sel_ratio*spectra.shape[0])
+            p_hash(sel_ratio)
+        if dr_model and not clus_method:
+            clus_method = st.radio('Select samples selection strategy:', options = ['RDM', 'KS'])
+        elif dr_model and clus_method:
+            disabled1 = False if clus_method in cluster_methods else True
+            selection = st.radio('Select samples selection strategy:', options = selec_strategy, index = default_sample_selection_option, key=102, disabled = disabled1)
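In the hunks that follow, each cluster is summarized by fitting a small KMeans inside it and keeping the real sample closest to every fitted center. A minimal, self-contained sketch of that pattern (outside the patch; the data and sizes are illustrative):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances_argmin_min

    scores = np.random.rand(40, 3)   # hypothetical reduced scores of one cluster
    selection_number = 5             # representatives to keep for this cluster

    km = KMeans(n_clusters=selection_number, n_init=10).fit(scores)
    # for each fitted center, index of the nearest actual sample
    closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, scores)
    selected_idx = np.unique(closest)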
@@ -359,9 +346,6 @@ if dr_model and sel_ratio:
         case 'RDM':
             cl_model = RDM(x = tcr, rset = ratio)

-    # if clus_method in cluster_methods:
-    #     inf.empty()
-
     if clus_method in ['KS', 'RDM']:
         _, selected_samples_idx = cl_model.calset
         labels = ["ind"]*n_samples
@@ -373,8 +357,6 @@ if dr_model and sel_ratio:

# #################################################### III - Samples selection using the reduced data presentation ######
-
-
if not labels:
     custom_color_palette = px.colors.qualitative.Plotly[:1]
elif labels:
@@ -397,18 +379,14 @@ elif labels:
         for i in np.unique(s):
             C = np.where(np.array(labels) == i)[0]
             if C.shape[0] >= selection_number:
-                # scores.write(list(tcr.index)[labels== i])
                 km2 = KMeans(n_clusters = selection_number)
                 km2.fit(tcr.iloc[C,:])
                 clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
                 selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
             else:
                 selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
-                # list indexes of selected samples for colored plot
-
-# ################################ Plots visualization ############################################
-
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ results visualization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Scores
if not t.empty:
     if meta_data.empty and clus_method in cluster_methods:
@@ -420,7 +398,7 @@ if not t.empty:
     elif meta_data.empty and not clus_method in cluster_methods:
         filter = []

-    with scores:
+    with c12:
         st.write('Scores plot')
         tcr_plot = tcr.copy()
         colfilter = st.selectbox('Color by:', options= filter,format_func = lambda x: x if x else "<Select>")
@@ -436,59 +414,58 @@ if not t.empty:
             n_categories = len(np.unique(tcr_plot[col_var_name]))
             custom_color_palette = px.colors.qualitative.Plotly[:n_categories]

-        with scores:
-            if selected_samples_idx:# color selected samples
-                t_selected = tcr_plot.iloc[selected_samples_idx,:]
-            match t.shape[1]:
-                case 3:
-                    fig = px.scatter_3d(tcr_plot, x = axis[0], y = axis[1], z = axis[2], color = col_var_name ,color_discrete_sequence = custom_color_palette)
-                    fig.update_traces(marker=dict(size=4))
-                    if selected_samples_idx:# color selected samples
-                        fig.add_scatter3d(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]], z = t_selected.loc[:,axis[2]],
-                                          mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
-
-                case 2:
-                    fig = px.scatter(tcr_plot, x = axis[0], y = axis[1], color = col_var_name ,color_discrete_sequence = custom_color_palette)
-                    if selected_samples_idx:# color selected samples
-                        fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]],
-                                        mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
-
-                case 1:
-                    fig = px.scatter(tcr_plot, x = axis[0], y = [0]*tcr_plot.shape[0], color = col_var_name ,color_discrete_sequence = custom_color_palette)
-                    fig.add_scatter(x = t_selected.loc[:,axis[0]], y = [0]*tcr_plot.shape[0],
-                                    mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
-                    fig.update_yaxes(visible=False)
-
-            st.plotly_chart(fig, use_container_width = True)
-
-            if labels:
-                fig_export = {}
-                # export 2D scores plot
-                if len(axis)== 3:
-                    comb = [i for i in combinations(np.arange(len(axis)), 2)]
-                    subcap = ['a','b','c']
-                    for i in range(len(comb)):
-                        fig_= px.scatter(tcr_plot, x = axis[(comb[i][0])], y=axis[(comb[i][1])],color = labels if list(labels) else None,color_discrete_sequence = custom_color_palette)
-                        fig_.add_scatter(x = t_selected.loc[:,axis[(comb[i][0])]], y = t_selected.loc[:,axis[(comb[i][1])]], mode ='markers', marker = dict(size = 5, color = 'black'),
-                                         name = 'selected samples')
-                        fig_.update_layout(font=dict(size=23))
-                        fig_.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1,
-                                            font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
-                        fig_.update_traces(marker=dict(size= 10), showlegend= False)
-                        fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'] = fig_
-                        # fig_export.write_image(f'./report/out/figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png')
-                else:
-                    fig_export['fig'] = fig
+        if selected_samples_idx:# color selected samples
+            t_selected = tcr_plot.iloc[selected_samples_idx,:]
+        match t.shape[1]:
+            case 3:
+                fig = px.scatter_3d(tcr_plot, x = axis[0], y = axis[1], z = axis[2], color = col_var_name ,color_discrete_sequence = custom_color_palette)
+                fig.update_traces(marker=dict(size=4))
+                if selected_samples_idx:# color selected samples
+                    fig.add_scatter3d(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]], z = t_selected.loc[:,axis[2]],
+                                      mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
+
+            case 2:
+                fig = px.scatter(tcr_plot, x = axis[0], y = axis[1], color = col_var_name ,color_discrete_sequence = custom_color_palette)
+                if selected_samples_idx:# color selected samples
+                    fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]],
+                                    mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
+
+            case 1:
+                fig = px.scatter(tcr_plot, x = axis[0], y = [0]*tcr_plot.shape[0], color = col_var_name ,color_discrete_sequence = custom_color_palette)
+                fig.add_scatter(x = t_selected.loc[:,axis[0]], y = [0]*tcr_plot.shape[0],
+                                mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
+                fig.update_yaxes(visible=False)
+
+        st.plotly_chart(fig, use_container_width = True)
+
+        if labels:
+            fig_export = {}
+            # export 2D scores plot
+            if len(axis)== 3:
+                comb = [i for i in combinations(np.arange(len(axis)), 2)]
+                subcap = ['a','b','c']
+                for i in range(len(comb)):
+                    fig_= px.scatter(tcr_plot, x = axis[(comb[i][0])], y=axis[(comb[i][1])],color = labels if list(labels) else None,color_discrete_sequence = custom_color_palette)
+                    fig_.add_scatter(x = t_selected.loc[:,axis[(comb[i][0])]], y = t_selected.loc[:,axis[(comb[i][1])]], mode ='markers', marker = dict(size = 5, color = 'black'),
+                                     name = 'selected samples')
+                    fig_.update_layout(font=dict(size=23))
+                    fig_.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1,
+                                        font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
+                    fig_.update_traces(marker=dict(size= 10), showlegend= False)
+                    fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'] = fig_
+                    # fig_export.write_image(f'./report/out/figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png')
+            else:
+                fig_export['fig'] = fig

if not spectra.empty:
     if dim_red_method in ['PCA','NMF']:
-        with loadings:
+        with c13:
             st.write('Loadings plot')
             p = dr_model.loadings_
-            freq = pd.DataFrame(colnames, index=p.index)
+            freq = DataFrame(colnames, index=p.index)
             if extension =='dx':
                 if meta_data.loc[:,'xunits'][0] == '1/cm':
                     freq.columns = ['Wavenumber (1/cm)']
@@ -503,7 +480,7 @@ if not spectra.empty:
                 xlab = 'Wavelength/Wavenumber'
                 inv = None

-            pp = pd.concat([p, freq], axis=1)
+            pp = concat([p, freq], axis=1)
             #########################################
             df1 = pp.melt(id_vars=freq.columns)
             loadingsplot = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
@@ -516,8 +493,8 @@ if not spectra.empty:
#############################################################################################################
     if dim_red_method == 'PCA':
-        influence, hotelling = st.columns([3, 3])
-        with influence:
+        c14, c15 = st.columns([3, 3])
+        with c14:
             st.write('Influence plot')
             # Leverage
             Hat = t.to_numpy() @ np.linalg.inv(np.transpose(t.to_numpy()) @ t.to_numpy()) @ np.transpose(t.to_numpy())
@@ -588,7 +565,7 @@ if not spectra.empty:
             # influence_plot.write_image('./report/out/figures/influence_plot.png', engine = 'kaleido')

-        with hotelling:
+        with c15:
             st.write('T²-Hotelling vs Q-residuals plot')
             # Hotelling
             hotelling = t.var(axis = 1)
@@ -635,11 +612,11 @@ if not spectra.empty:
st.header('III - Selected Samples for Reference Analysis', divider='blue')
if labels:
-    sel, info = st.columns([3, 1])
-    sel.write("Tabular identifiers of selected samples for reference analysis:")
+    c16, c17 = st.columns([3, 1])
+    c16.write("Tabular identifiers of selected samples for reference analysis:")
     if selected_samples_idx:
         if meta_data.empty:
-            sam1 = pd.DataFrame({'name': spectra.index[clustered][selected_samples_idx],
+            sam1 = DataFrame({'name': spectra.index[clustered][selected_samples_idx],
                                 'cluster':np.array(labels)[clustered][selected_samples_idx]},
                                 index = selected_samples_idx)
         else:
@@ -647,18 +624,18 @@ if labels:
             sam1.insert(loc=0, column='index', value=selected_samples_idx)
             sam1.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx])
         sam1.index = np.arange(len(selected_samples_idx))+1
-        info.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.')
+        with c17:
+            st.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.')
         sam = sam1
-        # if clus_method == cluster_methods[2]:
-        #     unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)

-        if clus_method == cluster_methods[2]:
-            unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
+        if clus_method =='HDBSCAN':
+            with c16:
+                unclus = st.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)

             if selected_samples_idx:
                 if unclus:
                     if meta_data.empty:
-                        sam2 = pd.DataFrame({'name': spectra.index[non_clustered],
+                        sam2 = DataFrame({'name': spectra.index[non_clustered],
                                             'cluster':['Non clustered']*len(spectra.index[non_clustered])},
                                             index = spectra.index[non_clustered])
                     else :
@@ -666,15 +643,18 @@ if labels:
                         sam2.insert(loc=0, column='index', value= spectra.index[non_clustered])
                         sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered]))

-                    sam = pd.concat([sam1, sam2], axis = 0)
+                    sam = concat([sam1, sam2], axis = 0)
                     sam.index = np.arange(sam.shape[0])+1
-                    info.info(f'- The number of Non-clustered samples: {sam2.shape[0]}.\n - The proportion of Non-clustered samples: {round(sam2.shape[0]/n_samples*100)}%')
+                    with c17:
+                        st.info(f'- The number of Non-clustered samples: {sam2.shape[0]}.\n - The proportion of Non-clustered samples: {round(sam2.shape[0]/n_samples*100)}%')
         else:
             sam = sam1
-        sel.write(sam)
+        with c16:
+            st.write(sam)

if not sam.empty:
+    zip_data = ""
     Nb_ech = str(n_samples)
     nb_clu = str(sam1.shape[0])
     st.header('Download the analysis results')
@@ -688,9 +668,9 @@ if not sam.empty:
     latex_report = report.report('Representative subset selection', file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam)
-
     @st.cache_data
     def preparing_results_for_downloading(change):
+        # path_to_report = Path("report")############################### i am here
         match extension:
             # load csv file
             case 'csv':
@@ -699,32 +679,32 @@ if not sam.empty:
                 with open('report/out/dataset/'+file.name, 'w') as dd:
                     dd.write(dxdata)

-        fig_spectra.savefig("./report/out/figures/spectra_plot.png", dpi=400) ## Export report
+        fig_spectra.savefig(report_path_rel/"out/figures/spectra_plot.png", dpi=400) ## Export report

         if len(axis) == 3:
             for i in range(len(comb)):
-                fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'].write_image(f'./report/out/figures/scores_pc{str(comb[i][0]+1)}_pc{str(comb[i][1]+1)}.png')
+                fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'].write_image(report_path_rel/f'out/figures/scores_pc{str(comb[i][0]+1)}_pc{str(comb[i][1]+1)}.png')
         elif len(axis)==2 :
-            fig_export['fig'].write_image(f'./report/out/figures/scores_plot2D.png')
+            fig_export['fig'].write_image(report_path_rel/'out/figures/scores_plot2D.png')
         elif len(axis)==1 :
-            fig_export['fig'].write_image(f'./report/out/figures/scores_plot1D.png')
+            fig_export['fig'].write_image(report_path_rel/'out/figures/scores_plot1D.png')

         # Export the plot
         if dim_red_method in ['PCA','NMF']:
             img = pio.to_image(loadingsplot, format="png")
-            with open("./report/out/figures/loadings_plot.png", "wb") as f:
+            with open(report_path_rel/"out/figures/loadings_plot.png", "wb") as f:
                 f.write(img)
         if dim_red_method == 'PCA':
-            hotelling_plot.write_image("./report/out/figures/hotelling_plot.png", format="png")
-            influence_plot.write_image('./report/out/figures/influence_plot.png', engine = 'kaleido')
+            hotelling_plot.write_image(report_path_rel/"out/figures/hotelling_plot.png", format="png")
+            influence_plot.write_image(report_path_rel/'out/figures/influence_plot.png', engine = 'kaleido')

-        sam.to_csv('./report/out/Selected_subset_for_calib_development.csv', sep = ';')
+        sam.to_csv(report_path_rel/'out/Selected_subset_for_calib_development.csv', sep = ';')

         export_report(change = hash_)
-        if Path("./report/report.tex").exists():
+        if Path(report_path_rel/"report.tex").exists():
             report.generate_report(change = hash_)
-        if Path("./report/report.pdf").exists():
-            shutil.move("./report/report.pdf", "./report/out/report.pdf")
+        if Path(report_path_rel/"report.pdf").exists():
+            move(report_path_rel/"report.pdf", "./report/out/report.pdf")
         return change
@@ -733,25 +713,28 @@ if not sam.empty:

-    import tempfile
     @st.cache_data
     def tempdir(change):
-        with tempfile.TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
+        with TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
             tempdirname = os.path.split(temp_dir)[1]
-            if len(os.listdir('./report/out/figures/'))>=2:
-                shutil.make_archive(base_name="./report/Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
-                shutil.move("./report/Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir
-                with open(f"./report/{tempdirname}/Results.zip", "rb") as f:
+            if len(os.listdir(report_path_rel/'out/figures/'))>=2:
+                make_archive(base_name= report_path_rel/"Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
+                move(report_path_rel/"Results.zip", f"./report/{tempdirname}/Results.zip")# put it inside the temp dir
+                with open(report_path_rel/f"{tempdirname}/Results.zip", "rb") as f:
                     zip_data = f.read()
         return tempdirname, zip_data

-
-    date_time = datetime.datetime.now().strftime('%y%m%d%H%M')
+
     try :
         tempdirname, zip_data = tempdir(change = hash_)
-        st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
-                           args = None, kwargs = None,type = "primary",use_container_width = True)
+        # st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
+        #                    args = None, kwargs = None,type = "primary",use_container_width = True)
     except:
         pass
+    date_time = datetime.now().strftime('%y%m%d%H%M')
+    disabled_down = True if zip_data == '' else False
+    st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
+                       args = None, kwargs = None,type = "primary",use_container_width = True, disabled = disabled_down)
+

-    delete_files(keep = ['.py', '.pyc','.bib'])
\ No newline at end of file
+    delete_files(keep = ['.py', '.pyc','.bib'])
\ No newline at end of file
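Both download sections in this patch share the same archive-then-serve pattern: zip the report/out tree, stage the zip in a scratch directory, and gate the download button on the bytes actually existing. A compact sketch of that flow (outside the patch; paths are illustrative):

    from pathlib import Path
    from shutil import make_archive, move
    from tempfile import TemporaryDirectory

    report = Path("./report")
    zip_data = b""
    with TemporaryDirectory(prefix="results", dir=report) as tmp:
        # zips the report/out tree into report/Results.zip
        make_archive(base_name=str(report / "Results"), format="zip",
                     root_dir=str(report), base_dir="out")
        staged = Path(tmp) / "Results.zip"
        move(str(report / "Results.zip"), str(staged))
        zip_data = staged.read_bytes()

    disabled_down = zip_data == b""   # mirrors the pages' download-button guard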
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index 686d9155a5f04fcb8fa0a71e7a77c6e752b7f366..06b055489651890ed7c0361a113464e70f694c18 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -12,7 +12,6 @@ local_css(css_file / "style_model.css")#load specific model page css

-
hash_ = ''
def p_hash(add):
     global hash_
@@ -37,7 +36,7 @@ class lw:
     def __init__(self, Reg_json, pred):
         self.model_ = Reg_json['model']
         self.best_hyperparams_ = Reg_json['best_lwplsr_params']
-        self.pred_data_ = [pd.json_normalize(Reg_json[i]) for i in pred]
+        self.pred_data_ = [json_normalize(Reg_json[i]) for i in pred]

################ clean the results dir #############
@@ -49,58 +48,37 @@ if not dirpath.exists():
# ####################################### page preamble #######################################
st.title("Calibration Model Development") # page title
st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
-M0, M00 = st.columns([1, .4])
-M0.image("./images/model_creation.png", use_column_width = True) # graphical abstract
+c0, c1 = st.columns([1, .4])
+c0.image("./images/model_creation.png", use_column_width = True) # graphical abstract

################################################################# Begin : I- Data loading and preparation ######################################
files_format = ['csv', 'dx'] # Supported files format
-file = M00.radio('Select files format:', options = files_format,horizontal = True) # Select a file format
+file = c1.radio('Select files format:', options = files_format,horizontal = True) # Select a file format

-spectra = pd.DataFrame() # preallocate the spectral data block
-y = pd.DataFrame() # preallocate the target(s) data block
+spectra = DataFrame() # preallocate the spectral data block
+y = DataFrame() # preallocate the target(s) data block

match file:
     # load csv file
     case 'csv':
-        with M00:
+        with c1:
             # Load X-block data
             xcal_csv = st.file_uploader("Select NIRS Data", type = "csv", help = " :mushroom: select a csv matrix with samples as rows and lambdas as columns")
             if xcal_csv:
-                sepx = st.radio("Select separator (X file): " ,
-                                options = [";", ","], key = 0,horizontal = True)
-                hdrx = st.radio("samples name (X file): ",
-                                options = ["no", "yes"], key = 1,horizontal = True)
-
-                # sepx = st.radio("Select separator (X file) - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
-                #                 options = [";", ","], index = [";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key = 0,horizontal = True)
-                # hdrx = st.radio("samples name (X file)? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
-                #                 options = ["no", "yes"], index = ["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key = 1,horizontal = True)
-
-                match hdrx:
-                    case "yes":col = 0
-                    case "no":col = False
-
-
+                sepx = st.radio("Select separator (X file): " , options = [";", ","], key = 0,horizontal = True)
+                hdrx = st.checkbox("Samples name (X file): ")
+                colx = 0 if hdrx else False
             else:
                 st.info('Info: Insert your spectral data file above!')

             # Load Y-block data
             ycal_csv = st.file_uploader("Select corresponding Chemical Data", type = "csv", help = " :mushroom: select a csv matrix with samples as rows and chemical values as a column")
             if ycal_csv:
-                sepy = st.radio("Select separator (Y file): " ,
-                                options = [";", ","], key = 2, horizontal = True)
-                hdry = st.radio("samples name (Y file)?: ",
-                                options = ["no", "yes"], key = 3, horizontal = True)
-
-                # sepy = st.radio("Select separator (Y file) - _detected_: " + str(find_delimiter('data/'+ycal_csv.name)),
-                #                 options = [";", ","], index = [";", ","].index(str(find_delimiter('data/'+ycal_csv.name))), key = 2, horizontal = True)
-                # hdry = st.radio("samples name (Y file)? - _detected_: " + str(find_col_index('data/'+ycal_csv.name)),
-                #                 options = ["no", "yes"], index = ["no", "yes"].index(str(find_col_index('data/'+ycal_csv.name))), key = 3, horizontal = True)
-
-                match hdry:
-                    case "yes":
-                        col = 0
-                    case "no":
-                        col = False
+                sepy = st.radio("Select separator (Y file): ", options = [";", ","], key = 2, horizontal = True)
+                hdry = st.checkbox("Samples name (Y file): ")
+                coly = 0 if hdry else False
+
+
             else:
                 st.info('Info: Insert your target data file above!')
@@ -121,8 +99,8 @@ match file:
             def csv_loader(change):
                 delete_files(keep = ['.py', '.pyc','.bib'])
                 file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name)
-                xfile = pd.read_csv(xcal_csv, decimal = '.', sep = sepx, index_col = col, header = 0)
-                yfile = pd.read_csv(ycal_csv, decimal = '.', sep = sepy, index_col = col)
+                xfile = read_csv(xcal_csv, decimal = '.', sep = sepx, index_col = colx, header = 0)
+                yfile = read_csv(ycal_csv, decimal = '.', sep = sepy, index_col = coly)
                 return xfile, yfile, file_name

             xfile, yfile, file_name = csv_loader(change = hash_)
@@ -136,7 +114,7 @@ match file:
                     spectra, meta_data = col_cat(xfile)
                 except:
                     st.error('Error: The format of the X-file does not correspond to the expected dialect settings. To read the file correctly, please adjust the separator parameters.')
-                spectra = pd.DataFrame(spectra).astype(float)
+                spectra = DataFrame(spectra).astype(float)

                 # prepare y data
                 try:
@@ -146,27 +124,28 @@

                 if 'chem_data' in globals():
                     if chem_data.shape[1]>1:
-                        yname = M00.selectbox('Select a target', options = ['']+chem_data.columns.tolist(), format_func = lambda x: x if x else "<Select>")
+                        yname = c1.selectbox('Select a target', options = ['']+chem_data.columns.tolist(), format_func = lambda x: x if x else "<Select>")
                         if yname:
                             y = chem_data.loc[:, yname]
                         else:
-                            M00.info('Info: Select the target analyte from the drop down list!')
-                    else:
+                            c1.info('Info: Select the target analyte from the drop down list!')
+                    elif chem_data.shape[1] == 1:
                         y = chem_data.iloc[:, 0]
+                        yname = chem_data.iloc[:, [0]].columns[0]

                 ### warning
                 if not y.empty:
                     if spectra.shape[0] != y.shape[0]:
                         st.error('Error: X and Y have different sample size')
-                        y = pd.DataFrame
-                        spectra = pd.DataFrame
+                        y = DataFrame
+                        spectra = DataFrame

             else:
                 st.error('Error: The data has not been loaded successfully, please consider tuning the dialect settings!')

     # Load .dx file
     case 'dx':
-        with M00:
+        with c1:
             data_file = st.file_uploader("Select Data", type = ".dx", help = " :mushroom: select a dx file")
             if data_file:
                 file_name = str(data_file.name)
@@ -217,47 +196,31 @@ st.header("I - Data visualization", divider = 'blue')
if not spectra.empty and not y.empty:
     p_hash(y)
     p_hash(np.mean(spectra))
-    @st.cache_data(show_spinner =False)
-    def visualize(change):
-        if np.array(spectra.columns).dtype.kind in ['i', 'f']:
-            colnames = spectra.columns
-        else:
-            colnames = np.arange(spectra.shape[1])
-
-
-        # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-        train_index, test_index = train_test_split_idx(spectra, y = y, method = "kennard_stone", metric = "correlation", test_size = 0.25, random_state = 42)
-
-        # Assign data to training and test sets
-        X_train, y_train = pd.DataFrame(spectra.iloc[train_index,:]), y.iloc[train_index]
-        X_test, y_test = pd.DataFrame(spectra.iloc[test_index,:]), y.iloc[test_index]
-
+    if np.array(spectra.columns).dtype.kind in ['i', 'f']:
+        colnames = spectra.columns
+    else:
+        colnames = np.arange(spectra.shape[1])
+
+    X_train, X_test, y_train, y_test, train_index, test_index = data_split(x=spectra, y=y)
+
-        #### insight on loaded data
-        # M0, M000 = st.columns([1, .4])
-        fig1, ax1 = plt.subplots( figsize = (12, 3))
-        spectra.T.plot(legend = False, ax = ax1, linestyle = '-', linewidth = 0.6)
-        ax1.set_ylabel('Signal intensity')
-        ax1.margins(0)
-        plt.tight_layout()
-        fig2, ax2 = plt.subplots(figsize = (12,3))
-        sns.histplot(y, color = "deeppink", kde = True, label = "y", ax = ax2, fill = True)
-        sns.histplot(y_train, color = "blue", kde = True, label = "y (train)", ax = ax2, fill = True)
-        sns.histplot(y_test, color = "green", kde = True, label = "y (test)", ax = ax2, fill = True)
-        ax2.set_xlabel('y')
-        plt.legend()
-        plt.tight_layout()
-        stats = pd.DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2)
-        return X_train, X_test, y_train, y_test, colnames, train_index, test_index, stats, fig1, fig2
+    #### insight on loaded data
+    spectra_plot = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")
+    target_plot = hist(y = y, y_train = y_train, y_test = y_test, target_name=yname)
+    stats = DataFrame([desc_stats(y_train), desc_stats(y_test), desc_stats(y)], index =['train', 'test', 'total'] ).round(2)

-    X_train, X_test, y_train, y_test, colnames, train_index, test_index, stats, spectra_plot, target_plot = visualize(change = hash_)
-    M0, M000 = st.columns([1, .4])
-    with M0:
+    # fig1, ax1 = plt.subplots( figsize = (12, 3))
+    # spectra.T.plot(legend = False, ax = ax1, linestyle = '-', linewidth = 0.6)
+    # ax1.set_ylabel('Signal intensity')
+    # ax1.margins(0)
+    # plt.tight_layout()
+    c2, c3 = st.columns([1, .4])
+    with c2:
         st.pyplot(spectra_plot) ######## Loaded graph
         st.pyplot(target_plot)
-    with M000:
+    with c3:
         st.write('Loaded data summary')
         st.write(stats)
@@ -272,23 +235,46 @@ # ################################################### BEGIN : Create Model ####################################################
model_type = None # initialize the selected regression algorithm
Reg = None # initialize the regression model object
-# intervalls_with_cols = pd.DataFrame()
+# intervalls_with_cols = DataFrame()
st.header("II - Model creation", divider = 'blue')
if not spectra.empty and not y.empty:
-    M10, M20, M30, M40, M50 = st.columns([1, 1, 1, 1, 1])
-
-    # select type of supervised modelling problem
-    modes = ['regression', 'classification']
-    mode = M10.radio("Analysis Methods", options=modes)
-    p_hash(mode)
-    match mode:
-        case "regression":
-            reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"]
-            model_type = M20.selectbox("Choose the regression algorithm", options = reg_algo, key = "model_type", format_func = lambda x: x if x else "<Select>")
-        case 'classification':
-            reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS", 'LDA']
-            model_type = M20.selectbox("Choose the classification algorithm", options = reg_algo, key = 12, format_func = lambda x: x if x else "<Select>")
+    c4, c5, c6 = st.columns([1, 1, 3])
+    with c4:
+        # select type of supervised modelling problem
+        var_nature = ['Continuous', 'Categorical']
+        mode = c4.radio("The nature of the target variable :", options = var_nature)
+        p_hash(mode)
+        match mode:
+            case "Continuous":
+                reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS"]
+                st.markdown('Example 1: Quantifying the volume of nectar consumed by a pollinator during a foraging session.')
+                st.markdown("Example 2: Measuring the sugar content, amino acids, or other compounds in nectar from different flower species.")
+            case 'Categorical':
+                reg_algo = ["", "PLS", "LW-PLS", "TPE-iPLS", 'LDA']
+                st.markdown("Example 1: Classifying pollinators into categories such as bees, butterflies, moths, and beetles.")
+                st.markdown("Example 2: Classifying plants based on their health status, such as healthy, stressed, or diseased, using NIR spectral data.")
+    with c5:
+        model_type = c5.selectbox("Choose a modelling algorithm:", options = reg_algo, key = 12, format_func = lambda x: x if x else "<Select>")
+
+    with c6:
+        st.markdown("-------------")
+        match model_type:
+            case "PLS":
+                st.markdown("#### For further details on the PLS (Partial Least Squares) algorithm, check the following reference:")
+                st.markdown('##### https://www.tandfonline.com/doi/abs/10.1080/03610921003778225')
+
+            case "LW-PLS":
+                st.markdown("#### For further details on the LW-PLS (Locally Weighted - Partial Least Squares) algorithm, check the following reference:")
+                st.markdown('##### https://analyticalsciencejournals.onlinelibrary.wiley.com/doi/full/10.1002/cem.3117')
+
+            case "TPE-iPLS":
+                st.markdown("#### For further details on the TPE-iPLS (Tree-structured Parzen Estimator based interval-Partial Least Squares) algorithm, which is a wrapper method for interval selection, check the following references:")
+                st.markdown("##### https://papers.nips.cc/paper_files/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf")
+                st.markdown('##### https://www.tandfonline.com/doi/abs/10.1080/03610921003778225')
+                st.markdown('##### https://journals.sagepub.com/doi/abs/10.1366/0003702001949500')
+        st.markdown("-------------")
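The TPE-iPLS entry above pairs hyperopt's Tree-structured Parzen Estimator with interval PLS. The app's own TpeIpls class is not shown in this patch, so here is a hedged, self-contained sketch of the underlying idea only — TPE searching the bounds of a single wavelength interval for a PLS model; the data, bounds, and scoring are illustrative placeholders:

    import numpy as np
    from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import cross_val_score

    X = np.random.rand(60, 200)   # hypothetical spectra
    y = np.random.rand(60)        # hypothetical target

    def objective(params):
        lo, hi = sorted((int(params['lo']), int(params['hi'])))
        hi = max(hi, lo + 10)     # keep the interval non-degenerate
        score = cross_val_score(PLSRegression(n_components=5),
                                X[:, lo:hi], y, cv=3,
                                scoring='neg_root_mean_squared_error').mean()
        return {'loss': -score, 'status': STATUS_OK}   # minimize RMSE

    trials = Trials()
    best = fmin(fn=objective,
                space={'lo': hp.quniform('lo', 0, 180, 1),
                       'hi': hp.quniform('hi', 10, 199, 1)},
                algo=tpe.suggest, max_evals=50, trials=trials)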
 # if model_type != st.session_state.model_type:
 #     st.session_state.model_type = model_type
 #     increment()
@@ -300,7 +286,7 @@ if not spectra.empty and not y.empty:
     nb_folds = 3

     # Model creation-M20 columns
-    with M20:
+    with c5:
         @st.cache_data
         def RequestingModelCreation(change):
             # spectra_plot.savefig("./report/figures/spectra_plot.png")
@@ -310,7 +296,7 @@ if not spectra.empty and not y.empty:
             match model_type:
                 case 'PLS':
-                    Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 10, cv = nb_folds)
+                    Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 100, cv = nb_folds)
                     # reg_model = Reg.model_
                     rega = Reg.selected_features_
@@ -360,11 +346,11 @@ if not spectra.empty and not y.empty:
                     # global Reg
                     # Reg = type('obj', (object,), {'model_' : Reg_json['model'], 'best_hyperparams_' : Reg_json['best_lwplsr_params'],
-                    #                               'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
+                    #                               'pred_data_' : [json_normalize(Reg_json[i]) for i in pred]})
                     # global Reg
                     Reg = lw(Reg_json = Reg_json, pred = pred)
                     # reg_model = Reg.model_
-                    Reg.CV_results_ = pd.DataFrame()
+                    Reg.CV_results_ = DataFrame()
                     Reg.cv_data_ = {'YpredCV' : {}, 'idxCV' : {}}
                     # set indexes to Reg.pred_data (train, test, folds idx)
                     for i in range(len(pred)):
@@ -380,7 +366,7 @@ if not spectra.empty and not y.empty:
                         else: # CVi
                             Reg.pred_data_[i].index = folds[list(folds)[i-2]]
-                            # Reg.CV_results_ = pd.concat([Reg.CV_results_, Reg.pred_data_[i]])
+                            # Reg.CV_results_ = concat([Reg.CV_results_, Reg.pred_data_[i]])
                             Reg.cv_data_['YpredCV']['Fold' + str(i-1)] = np.array(Reg.pred_data_[i]).reshape(-1)
                             Reg.cv_data_['idxCV']['Fold' + str(i-1)] = np.array(folds[list(folds)[i-2]]).reshape(-1)
@@ -404,8 +390,8 @@ if not spectra.empty and not y.empty:
                     # reg_model = Reg.model_

                     global intervalls, intervalls_with_cols
-                    intervalls = Reg.selected_features_.T
-                    intervalls_with_cols = Reg.selected_features_.T
+                    intervalls = Reg.selected_features_.T.copy()
+                    intervalls_with_cols = Reg.selected_features_.T.copy().astype(str)
                     for i in range(intervalls.shape[0]):
                         for j in range(intervalls.shape[1]):
@@ -424,7 +410,7 @@ if not spectra.empty and not y.empty:
         info = st.info('Info: The model is being created. This may take a few minutes.')
         if model_type == 'TPE-iPLS':# if model type is ipls then ask for the number of iterations and intervals
             s = st.number_input(label = 'Enter the maximum number of intervals', min_value = 1, max_value = 6)
-            it = st.number_input(label = 'Enter the number of iterations', min_value = 2, max_value = 500, value = 2)
+            it = st.number_input(label = 'Enter the number of iterations', min_value = 2, max_value = 500, value = 250)
         else:
             s, it = None, None
         p_hash(str(s)+str(it))
@@ -433,7 +419,7 @@ if not spectra.empty and not y.empty:
         p_hash(st.session_state.counter)
         Reg = RequestingModelCreation(change = hash_)
         reg_model = Reg.model_
-        hash_ = joblib.hash(Reg)
+        hash_ = hash(Reg)
     else:
         st.info('Info: Choose a modelling algorithm from the dropdown list!')
@@ -450,8 +436,6 @@ if not spectra.empty and not y.empty:
         intervalls = st.session_state.intervalls
         intervalls_with_cols = st.session_state.intervalls_with_cols
-
-
if Reg:
     # remodel_button = st.button('re-model the data', key=4, help=None, type="primary", use_container_width=True)
     # if remodel_button:# remodel feature for re-tuning the model
@@ -463,8 +447,8 @@ if Reg:
     yt = Reg.pred_data_[1]

-    M1, M2 = st.columns([2 ,4])
-    with M1:
+    c7, c8 = st.columns([2 ,4])
+    with c7:
         # Show and export the preprocessing methods
         st.write('-- Spectral preprocessing info --')
         st.write(Reg.best_hyperparams_print)
@@ -477,9 +461,9 @@ if Reg:
         # Show the model performance table
         st.write("-- Model performance --")
         if model_type != reg_algo[2]:
-            model_per = pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method = 'regression').scores_)
+            model_per = DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method = 'regression').scores_)
         else:
-            model_per = pd.DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method = 'regression').scores_)
+            model_per = DataFrame(metrics(c = [y_train, yc], t = [y_test, yt], method = 'regression').scores_)
         st.dataframe(model_per)
@@ -518,7 +502,7 @@ if Reg:
             ax2.legend()
             return fig

-    with M2:## Visualize raw,preprocessed spectra, and selected intervalls(in case of ipls)
+    with c8:## Visualize raw, preprocessed spectra, and selected intervals (in case of iPLS)
         if model_type =='TPE-iPLS' :
             st.write('-- Important Spectral regions used for model creation --')
             st.table(intervalls_with_cols)
@@ -547,7 +531,7 @@ if Reg:
     cv1, cv2 = st.columns([2, 2])
     with cv2:
-        cv_results = pd.DataFrame(Reg.CV_results_).round(4)# CV table
+        cv_results = DataFrame(Reg.CV_results_).round(4)# CV table
         st.write('-- Cross-Validation Summary--')
         st.write(cv_results.astype(str).style.map(lambda _: "background-color: #cecece;", subset = (cv_results.index.drop(['sd', 'mean', 'cv']), slice(None))))
@@ -599,6 +583,7 @@ if Reg:
##########################################################################################################################################
##########################################################################################################################################
if Reg:
+    zip_data = ""
     st.header('Download the analysis results')
     st.write("**Note:** Please check the box only after you have finished processing your data and are satisfied with the results. Checking the box prematurely may slow down the app and could lead to crashes.")
     decis = st.checkbox("Yes, I want to download the results")
@@ -633,7 +618,7 @@ if Reg:
                     dd.write(dxdata)

             with open('./report/out/model/'+ model_type + '.pkl','wb') as f:# export model
-                joblib.dump(reg_model, f)
+                dump(reg_model, f)
             figpath ='./report/out/figures/'
             spectra_plot.savefig(figpath + "spectra_plot.png")
             target_plot.savefig(figpath + "histogram.png")
@@ -642,12 +627,12 @@ if Reg:
             fig0.write_image(figpath + "meas_vs_pred_cv_onebyone.png")
             measured_vs_predicted.savefig(figpath + 'measured_vs_predicted.png')
             residuals_plot.savefig(figpath + 'residuals_plot.png')
-            with open('report/out/Preprocessing.json', "w") as outfile:
-                json.dump(Reg.best_hyperparams_, outfile)
+            # with open('report/out/Preprocessing.json', "w") as outfile:
+            #     json.dump(Reg.best_hyperparams_, outfile)

             if model_type == 'TPE-iPLS': # export selected wavelengths
                 wlfilename = './report/out/model/'+ model_type+'-selected_wavelengths.xlsx'
-                all = pd.concat([intervalls_with_cols.T, Reg.selected_features_], axis = 0, ignore_index=True).T
+                all = concat([intervalls_with_cols.T, Reg.selected_features_], axis = 0, ignore_index=True).T
                 all.columns=['wl_from','wl_to','idx_from', 'idx_to']
                 all.to_excel(wlfilename)
@@ -655,30 +640,53 @@ if Reg:
             if Path("./report/report.tex").exists():
                 report.generate_report(change = hash_)
             if Path("./report/report.pdf").exists():
-                shutil.move("./report/report.pdf", "./report/out/report.pdf")
+                move("./report/report.pdf", "./report/out/report.pdf")
+
+            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
+            # pklfile = {'model_': Reg.model_,"model_type" : model_type, 'training_data':{'raw-spectra':spectra,'target':y, },
+            #            'spec-preprocessing':{"normalization": Reg.best_hyperparams_['normalization'], 'SavGol(polyorder,window_length,deriv)': [Reg.best_hyperparams_["polyorder"],
+            #                                  Reg.best_hyperparams_['window_length'],
+            #                                  Reg.best_hyperparams_['deriv']]}}
+            pklfile = {'model_': Reg.model_,"model_type" : model_type, 'data':{'raw-spectra':spectra,'target':y, 'training_data_idx':train_index,'testing_data_idx':test_index},
+                       'spec-preprocessing':{"normalization": Reg.best_hyperparams_['normalization'], 'SavGol(polyorder,window_length,deriv)': [Reg.best_hyperparams_["polyorder"],
+                                             Reg.best_hyperparams_['window_length'],
+                                             Reg.best_hyperparams_['deriv']]}}
+            if model_type == 'TPE-iPLS': # export selected wavelengths
+                pklfile['selected-wls'] = {'idx':Reg.selected_features_.T , "wls":intervalls_with_cols }
+            else:
+                pklfile['selected-wls'] = {'idx':None, "wls":None }
+
+            with open('./report/out/file_system.pkl', "wb") as pkl:
+                dump(pklfile, pkl)
+
+
+
+
             return change

         preparing_results_for_downloading(change = hash_)

-        import tempfile
         @st.cache_data(show_spinner =False)
         def tempdir(change):
-            with tempfile.TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
+            with TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
                 tempdirname = os.path.split(temp_dir)[1]
                 if len(os.listdir('./report/out/figures/'))>2:
-                    shutil.make_archive(base_name="./report/Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
-                    shutil.move("./report/Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir
+                    make_archive(base_name="./report/Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
+                    move("./report/Results.zip", f"./report/{tempdirname}/Results.zip")# put it inside the temp dir
                     with open(f"./report/{tempdirname}/Results.zip", "rb") as f:
                         zip_data = f.read()
             return tempdirname, zip_data

-        date_time = datetime.datetime.now().strftime('%y%m%d%H%M')
         try :
             tempdirname, zip_data = tempdir(change = hash_)
-            st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Reg_.zip', mime ="application/zip",
-                               args = None, kwargs = None,type = "primary",use_container_width = True)
         except:
             pass
+        date_time = datetime.now().strftime('%y%m%d%H%M')
+        disabled_down = True if zip_data=='' else False
+        st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Reg_.zip', mime ="application/zip",
+                           args = None, kwargs = None,type = "primary",use_container_width = True, disabled = disabled_down)
+

-    delete_files(keep = ['.py', '.pyc','.bib'])
+    delete_files(keep = ['.py', '.pyc','.bib'])
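The file_system.pkl bundle written above is what the prediction page consumes. A minimal sketch (outside the patch) of reading it back with the same joblib `load` used elsewhere in the app; the keys mirror the pklfile dict defined in the hunk:

    from joblib import load

    with open('./report/out/file_system.pkl', 'rb') as f:
        bundle = load(f)

    model = bundle['model_']                                    # fitted regression model
    norm = bundle['spec-preprocessing']['normalization']        # e.g. 'Snv'
    polyorder, window_length, deriv = bundle['spec-preprocessing']['SavGol(polyorder,window_length,deriv)']
    train_idx = bundle['data']['training_data_idx']             # indices into the raw spectra
    wls = bundle['selected-wls']['idx']                         # None unless TPE-iPLS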
f"./report/{tempdirname}/Results.zip")# put the inside the temp dir with open(f"./report/{tempdirname}/Results.zip", "rb") as f: zip_data = f.read() return tempdirname, zip_data - date_time = datetime.datetime.now().strftime('%y%m%d%H%M') try : tempdirname, zip_data = tempdir(change = hash_) - st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Reg_.zip', mime ="application/zip", - args = None, kwargs = None,type = "primary",use_container_width = True) except: pass + date_time = datetime.now().strftime('%y%m%d%H%M') + disabled_down = True if zip_data=='' else False + st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Reg_.zip', mime ="application/zip", + args = None, kwargs = None,type = "primary",use_container_width = True, disabled = disabled_down) + - delete_files(keep = ['.py', '.pyc','.bib']) + delete_files(keep = ['.py', '.pyc','.bib']) diff --git a/src/pages/3-prediction.py b/src/pages/3-prediction.py index eb3cf2002120a9b8ddc2e2c88e0c5d9ade817add..3dce23aa31c446be0a257f81863e60a16fb0fafd 100644 --- a/src/pages/3-prediction.py +++ b/src/pages/3-prediction.py @@ -1,7 +1,7 @@ from Packages import * st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") from Modules import * -from Class_Mod.DATA_HANDLING import * +from utils.DATA_HANDLING import * # HTML pour le bandeau "CEFE - CNRS" # bandeau_html = """ # <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> @@ -14,7 +14,6 @@ add_header() add_sidebar(pages_folder) local_css(css_file / "style_model.css") -import shutil hash_ = '' def p_hash(add): global hash_ @@ -23,8 +22,10 @@ def p_hash(add): dirpath = Path('Report/out/model') if dirpath.exists() and dirpath.is_dir(): - shutil.rmtree(dirpath) + rmtree(dirpath) +if 'Predict' not in st.session_state: + st.session_state['Predict'] = False # #################################### Methods ############################################## # empty temp figures def delete_files(keep): @@ -35,267 +36,333 @@ def delete_files(keep): if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in keep): os.remove(os.path.join(root, file)) ################################################################### + +st.title("Prediction making using a previously developed model") +c1, c2 = st.columns([2, 1]) +c1.image("./images/prediction making.png", use_column_width=True) +pred_data = DataFrame + + -st.title("Prediction making using a previously developed model") -M10, M20= st.columns([2, 1]) -M10.image("./images/prediction making.png", use_column_width=True) def preparespecdf(df): other = df.select_dtypes(exclude = 'float') - rownames = other.iloc[:,0] spec = df.select_dtypes(include='float') - spec.index = rownames + if other.shape[1] > 0: + rownames = other.iloc[:,0] + spec.index = rownames + else: + rownames = [str(i) for i in range(df.shape[0])] + if spec.shape[1]<60: + spec = DataFrame return spec, other, rownames def check_exist(var): out = var in globals() return out -files_format = ['.csv', '.dx'] -export_folder = './data/predictions/' -export_name = 'Predictions_of_' -reg_algo = ["Interval-PLS"] - +with c2: + zip = st.file_uploader("Load your zip file:", type = ['.zip'], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") + if not zip: + st.info('Info: Insert your zip file above!') + + disable1 = False if zip else True + new_data = st.file_uploader("Load NIRS Data for prediction making:", type = ['csv', 'dx'], 
help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", disabled=disable1) + if not disable1 : + info1 = st.info('Info: Insert your NIRS data file above!') -with M20: - file = st.file_uploader("Load NIRS Data for prediction making:", type = files_format, help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns") + if zip: + @st.cache_data + def tempdir(prefix, dir): + with TemporaryDirectory( prefix= prefix, dir= dir ) as temp_dir:# create a temp directory + tempdirname = os.path.split(temp_dir)[1] + return tempdirname + + temp_dir = tempdir(prefix = "pred_temp", dir = "./temp") + # Open and extract the zip file + from zipfile import ZipFile + with ZipFile(zip, 'r') as zip_ref: + zip_ref.extractall(temp_dir) - if not file: - st.info('Info: Insert your spectral data file above!') - else: - p_hash(file.name) - test = file.name[file.name.find('.'):] - export_name += file.name[:file.name.find('.')] - - if test == files_format[0]: - qsep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+file.name)), options=[";", ","],index=[";", ","].index(str(find_delimiter('data/'+file.name))), key=2, horizontal= True) - qhdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+file.name))), key=3, horizontal= True) - col = 0 if qhdr == 'yes' else None - p_hash([qsep,qhdr]) - - df = pd.read_csv(file, sep=qsep, header= col) - pred_data, cat, rownames = preparespecdf(df) - - elif test == files_format[1]: - with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: - tmp.write(file.read()) - tmp_path = tmp.name - with open(tmp.name, 'r') as dd: - dxdata = file.read() - p_hash(str(dxdata)+str(file.name)) - - ## load and parse the temp dx file - @st.cache_data - def dx_loader(change): - chem_data, spectra, meta_data, _ = read_dx(file = tmp_path) - return chem_data, spectra, meta_data, _ - chem_data, spectra, meta_data, _ = dx_loader(change = hash_) - st.success("The data have been loaded successfully", icon="✅") - if chem_data.to_numpy().shape[1]>0: - yname = st.selectbox('Select target', options=chem_data.columns) - measured = chem_data.loc[:,yname] == 0 - y = chem_data.loc[:,yname].loc[measured] - pred_data = spectra.loc[measured] - - else: - pred_data = spectra - os.unlink(tmp_path) + + def find_pkl_files(root_dir): + # List to store the paths of .pkl files + pkl_files = [] + + # Walk through the directory + for dirpath, dirnames, filenames in os.walk(root_dir): + for filename in filenames: + # Check if the file has a .pkl extension + if filename.endswith('.pkl'): + # Construct the full file path + file_path = os.path.join(dirpath, filename) + pkl_files.append(file_path) + return pkl_files + pkl = find_pkl_files(root_dir=temp_dir) + + system_file = [path for path in pkl if 'file_system' in path] + if len(system_file) ==1 : + with open(system_file[0], 'rb') as fi: + system_data = load(fi) + +if new_data: + info1.empty() + +with c2: + if new_data: + p_hash(new_data.name) + test = new_data.name.split('.')[-1] + export_name = 'Pred of' + export_name += new_data.name[:new_data.name.find('.')] + + match test: + case 'csv': + qsep = st.radio("Select csv separator : " , options = [';', ','], key = 2, horizontal = True) + qhdr = st.radio("indexes column in csv? 
: " , options = ['yes', 'no'], key = 3, horizontal = True) + col = 0 if qhdr == 'yes' else None + p_hash([qsep,qhdr]) + + df = read_csv(new_data, sep=qsep, header= col, decimal=".") + pred_data, cat, rownames = preparespecdf(df) + + case "dx": + with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: + tmp.write(new_data.read()) + tmp_path = tmp.name + with open(tmp.name, 'r') as dd: + dxdata = new_data.read() + p_hash(str(dxdata)+str(new_data.name)) + + ## load and parse the temp dx file + @st.cache_data + def dx_loader(change): + chem_data, spectra, meta_data, _ = read_dx(file = tmp_path) + return chem_data, spectra, meta_data, _ + chem_data, spectra, meta_data, _ = dx_loader(change = hash_) + st.success("The data have been loaded successfully", icon="✅") + if chem_data.to_numpy().shape[1]>0: + yname = st.selectbox('Select target', options=chem_data.columns) + measured = chem_data.loc[:,yname] == 0 + y = chem_data.loc[:,yname].loc[measured] + pred_data = spectra.loc[measured] + + else: + pred_data = spectra + os.unlink(tmp_path) # Load parameters st.header("I - Spectral data preprocessing & visualization", divider='blue') -try: - if check_exist("pred_data"):# Load the model with joblib - @st.cache_data - def specplot_raw(change): - fig2 = plot_spectra(pred_data, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]") - return fig2 - rawspectraplot = specplot_raw(change = hash_) - M1, M2= st.columns([2, 1]) - with M1: - st.write('Raw spectra') - st.pyplot(rawspectraplot) - - with M2: - params = st.file_uploader("Load preprocessings params", type = '.json', help=" .json file") - if params: - prep = json.load(params) - p_hash(prep) - - @st.cache_data - def preprocess_spectra(change): - # M4.write(ProcessLookupError) - - if prep['normalization'] == 'Snv': - x1 = Snv(pred_data) - norm = 'Standard Normal Variate' - else: - norm = 'No Normalization was applied' - x1 = pred_data - x2 = savgol_filter(x1, - window_length = int(prep["window_length"]), - polyorder = int(prep["polyorder"]), - deriv = int(prep["deriv"]), - delta=1.0, axis=-1, mode="interp", cval=0.0) - preprocessed = pd.DataFrame(x2, index = pred_data.index, columns = pred_data.columns) - return norm, prep, preprocessed - norm, prep, preprocessed = preprocess_spectra(change= hash_) - - ################################################################################################ - ## plot preprocessed spectra - if check_exist("preprocessed"): - p_hash(preprocessed) - M3, M4= st.columns([2, 1]) - with M3: +# try: +if not pred_data.empty:# Load the model with joblib + @st.cache_data + def preprocess_spectra(change): + # M4.write(ProcessLookupError) + + if system_data['spec-preprocessing']['normalization'] == 'Snv': + x1 = Snv(pred_data) + norm = 'Standard Normal Variate' + else: + norm = 'No Normalization was applied' + x1 = pred_data + x2 = savgol_filter(x1, + window_length = int(system_data['spec-preprocessing']['SavGol(polyorder,window_length,deriv)'][1]), + polyorder = int(system_data['spec-preprocessing']['SavGol(polyorder,window_length,deriv)'][0]), + deriv = int(system_data['spec-preprocessing']['SavGol(polyorder,window_length,deriv)'][2]), + delta=1.0, axis=-1, mode="interp", cval=0.0) + preprocessed = DataFrame(x2, index = pred_data.index, columns = pred_data.columns) + return norm, preprocessed + norm, preprocessed = preprocess_spectra(change= hash_) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + # @st.cache_data + # def specplot_raw(change): + # fig2 = 
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
+    # @st.cache_data
+    # def specplot_raw(change):
+    #     fig2 = plot_spectra(pred_data, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]")
+    #     return fig2
+    # rawspectraplot = specplot_raw(change = hash_)
+    rawspectraplot = plot_spectra(pred_data, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")
+
+    c3, c4 = st.columns([2, 1])
+    with c3:
+        st.write('Raw spectra')
+        st.pyplot(rawspectraplot)
+
+    ## plot preprocessed spectra
+    if check_exist("preprocessed"):
+        # def specplot_prep(change):
+        #     fig2 = plot_spectra(preprocessed, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]")
+        #     return fig2
+        # prepspectraplot = specplot_prep(change = hash_)
+        prepspectraplot = plot_spectra(preprocessed, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")
        st.write('Preprocessed spectra')
-        def specplot_prep(change):
-            fig2 = plot_spectra(preprocessed, xunits = 'lab', yunits = "meta_data.loc[:,'yunits'][0]")
-            return fig2
-        prepspectraplot = specplot_prep(change = hash_)
        st.pyplot(prepspectraplot)
-
-        with M4:
-            @st.cache_data
-            def prep_info(change):
-                SG = r'''- Savitzky-Golay derivative parameters \:(Window_length:{prep['window_length']}; polynomial order: {prep['polyorder']}; Derivative order : {prep['deriv']})'''
-                Norm = r'''- Spectral Normalization \: {norm}'''
-                return SG, Norm
-            SG, Norm = prep_info(change = hash_)
-            st.info('The spectra were preprocessed using:\n'+SG+"\n"+Norm)
+    with c4:
+        @st.cache_data
+        def prep_info(change):
+            sgp = system_data['spec-preprocessing']['SavGol(polyorder,window_length,deriv)']
+            SG = f'- Savitzky-Golay derivative parameters \n:(Window_length:{sgp[1]}; polynomial order: {sgp[0]}; Derivative order : {sgp[2]})'
+            Norm = f'- Spectral Normalization \n: {system_data["spec-preprocessing"]["normalization"]}'
+            return SG, Norm
+        SG, Norm = prep_info(change = hash_)
+        st.info('The spectra were preprocessed using:\n'+SG+"\n"+Norm)

################### Predictions making ##########################
st.header("II - Prediction making", divider='blue')
-    if check_exist("pred_data") and params:# Load the model with joblib
-        M5, M6 = st.columns([2, 1])
-        model_file = M6.file_uploader("Load your model", type = '.pkl', help=" .pkl file")
-        if model_file:
-            with M6:
-                try:
-                    model = joblib.load(model_file)
-                    st.success("The model has been loaded successfully", icon="✅")
-                    nvar = model.n_features_in_
-
-                except:
-                    st.error("Error: Something went wrong, the model was not loaded !", icon="❌")
-
-        with M6:
-            s = st.checkbox('Check this box if your model is of ipls type!', disabled = False if 'model' in globals() else True)
-            index = st.file_uploader("select wavelengths index file", type="csv", disabled = [False if s else True][0])
-        if check_exist('preprocessed'):
-            if s:
-                if index:
-                    intervalls = pd.read_csv(index, sep=';', index_col=0).to_numpy()
-                    idx = []
-                    for i in range(intervalls.shape[0]):
-                        idx.extend(np.arange(intervalls[i,2], intervalls[i,3]+1))
-                    if max(idx) <= preprocessed.shape[1]:
-                        preprocesseddf = preprocessed.iloc[:,idx] ### get predictors
-                    else:
-                        st.error("Error: The number of columns in your data does not match the number of columns used to train the model. Please ensure they are the same.")
-            else:
-                preprocesseddf = preprocessed
-
-        if check_exist("model") == False:
-            disable = True
-        elif check_exist("model") == True:
-            if s and not index :
-                disable = True
-            elif s and index:
-                disable = False
-            elif not s and not index:
-                disable = False
-            elif not s and index:
-                disable = True
-
-        pred_button = M6.button('Predict', type='primary', disabled= disable)
-
-        if check_exist("preprocesseddf"):
-            if pred_button and nvar == preprocesseddf.shape[1]:
-                try:
-                    result = pd.DataFrame(model.predict(preprocesseddf), index = rownames, columns = ['Results'])
-                except:
-                    st.error(f'''Error: Length mismatch: the number of samples indices is {len(rownames)}, while the model produced
-                             {len(model.predict(preprocesseddf))} values. correct the "indexes column in csv?" parameter''')
-                with M5:
-                    if preprocesseddf.shape[1]>1 and check_exist('result'):
-                        st.write('Predicted values distribution')
-                        # Creating histogram
-                        hist, axs = plt.subplots(1, 1, figsize =(15, 3),
-                                                 tight_layout = True)
-
-                        # Add x, y gridlines
-                        axs.grid( color ='grey', linestyle ='-.', linewidth = 0.5, alpha = 0.6)
-                        # Remove axes splines
-                        for s in ['top', 'bottom', 'left', 'right']:
-                            axs.spines[s].set_visible(False)
-                        # Remove x, y ticks
-                        axs.xaxis.set_ticks_position('none')
-                        axs.yaxis.set_ticks_position('none')
-                        # Add padding between axes and labels
-                        axs.xaxis.set_tick_params(pad = 5)
-                        axs.yaxis.set_tick_params(pad = 10)
-                        # Creating histogram
-                        N, bins, patches = axs.hist(result, bins = 12)
-                        # Setting color
-                        fracs = ((N**(1 / 5)) / N.max())
-                        norm = colors.Normalize(fracs.min(), fracs.max())
-
-                        for thisfrac, thispatch in zip(fracs, patches):
-                            color = plt.cm.viridis(norm(thisfrac))
-                            thispatch.set_facecolor(color)
-
-                        st.pyplot(hist)
-                        st.write('Predicted values table')
-                        st.dataframe(result.T)
-            #################################3
-            elif pred_button and nvar != preprocesseddf.shape[1]:
-                M6.error(f'Error: The model was trained on {nvar} wavelengths, but you provided {preprocessed.shape[1]} wavelengths for prediction. Please ensure they match!')
-
-        if check_exist('result'):
-            @st.cache_data(show_spinner =False)
-            def preparing_results_for_downloading(change):
-                match test:
-                    # load csv file
-                    case '.csv':
-                        df.to_csv('Report/out/dataset/'+ file.name, sep = ';', encoding = 'utf-8', mode = 'a')
-                    case '.dx':
-                        with open('Report/out/dataset/'+file.name, 'w') as dd:
-                            dd.write(dxdata)
-
-                prepspectraplot.savefig('./Report/out/figures/raw_spectra.png')
-                rawspectraplot.savefig('./Report/out/figures/preprocessed_spectra.png')
-                hist.savefig('./Report/out/figures/histogram.png')
-                result.round(4).to_csv('./Report/out/The analysis result.csv', sep = ";", index_col=0)
-
-                return change
-            preparing_results_for_downloading(change = hash_)
-
-            import tempfile
-            @st.cache_data(show_spinner =False)
-            def tempdir(change):
-                with tempfile.TemporaryDirectory( prefix="results", dir="./Report") as temp_dir:# create a temp directory
-                    tempdirname = os.path.split(temp_dir)[1]
-                    if len(os.listdir('./Report/out/figures/'))==3:
-                        shutil.make_archive(base_name="./Report/Results", format="zip", base_dir="out", root_dir = "./Report")# create a zip file
-                        shutil.move("./Report/Results.zip", f"./Report/{tempdirname}/Results.zip")# put the inside the temp dir
-                    with open(f"./Report/{tempdirname}/Results.zip", "rb") as f:
-                        zip_data = f.read()
-                return tempdirname, zip_data
-
-            date_time = datetime.datetime.now().strftime('%y%m%d%H%M')
-            try :
-                tempdirname, zip_data = tempdir(change = hash_)
-                st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Pred_.zip', mime ="application/zip",
-                                   args = None, kwargs = None,type = "primary",use_container_width = True)
-            except:
-                st.write('rtt')
-except:
-    M20.error('''Error: Data loading failed. Please check your file. Consider fine-tuning the dialect settings or ensure the file isn't corrupted.''')
-
+
+    disable2 = False if check_exist("pred_data") else True
+    pred_button = st.button('Predict', type='primary', disabled= disable2, use_container_width=False)
+    if pred_button: st.session_state['Predict'] = True
+
+    if st.session_state['Predict']:
+        if check_exist("pred_data"):# Load the model with joblib
+            c5, c6 = st.columns([2, 1])
+            with c6:
+                model = system_data['model_']
+                if system_data['model_type'] in ['PLS','TPE-iPLS']:
+                    nvar = system_data['model_'].n_features_in_
+                elif system_data['model_type'] =='LW-PLS':
+                    nvar = system_data['data']['raw-spectra'].shape[1]
+
+            if check_exist('preprocessed'):
+                if isinstance(system_data['selected-wls']['idx'], DataFrame):
+                    idx = np.concatenate([np.arange(system_data['selected-wls']['idx'].values.reshape((-1,))[2*i], system_data['selected-wls']['idx'].values.reshape((-1,))[2*i+1]+1) for i in range(system_data['selected-wls']['idx'].shape[0])])
+                else:
+                    idx = np.arange(nvar)
+
+                if np.max(idx) < preprocessed.shape[1]:  # column positions are 0-based, so the largest valid index is shape[1]-1
+                    preprocesseddf = preprocessed.iloc[:,idx] ### get predictors
+                else:
+                    st.error("Error: The number of columns in your data does not match the number of columns used to train the model. Please ensure they are the same.")
+
+            if check_exist("preprocesseddf"):
+                if st.session_state['Predict'] and nvar == preprocesseddf.shape[1]:
+                    # if nvar == preprocesseddf.shape[1]:
+                    match system_data['model_type']:
+                        case 'PLS'|'TPE-iPLS':
+                            try:
+                                result = DataFrame(system_data['model_'].predict(preprocesseddf), index = rownames, columns = ['Results'])
+                            except:
+                                st.error(f'''Error: Length mismatch: the number of samples indices is {len(rownames)}, while the model produced
+                                         {len(model.predict(preprocesseddf))} values. Correct the "indexes column in csv?" parameter.''')
+                        case 'LW-PLS':
+                            # export data to csv for Julia train/test
+                            train_idx, test_idx = system_data['data']['training_data_idx'], system_data['data']['testing_data_idx']
+                            spectra = system_data['data']['raw-spectra']
+                            y = system_data['data']['target']
+                            X_train, y_train, X_test, y_test = spectra.iloc[train_idx,:], y.iloc[train_idx], spectra.iloc[test_idx,:], y.iloc[test_idx]
+                            nb_folds = 3
+                            folds = KF_CV.CV(X_train, y_train, nb_folds)
+                            #['raw-spectra', 'target', 'training_data_idx', 'testing_data_idx']
+                            data_to_work_with = ['x_train_np', 'y_train_np', 'x_test_np', 'y_test_np', 'x_pred']
+                            x_train_np, y_train_np, x_test_np, y_test_np = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy()
+                            x_pred = pred_data.to_numpy()
+                            # Cross-Validation calculation
+                            d = {}
+                            for i in range(nb_folds):
+                                d["xtr_fold{0}".format(i+1)], d["ytr_fold{0}".format(i+1)], d["xte_fold{0}".format(i+1)], d["yte_fold{0}".format(i+1)] = np.delete(x_train_np, folds[list(folds)[i]], axis=0), np.delete(y_train_np, folds[list(folds)[i]], axis=0), x_train_np[folds[list(folds)[i]]], y_train_np[folds[list(folds)[i]]]
+                                data_to_work_with.append("xtr_fold{0}".format(i+1))
+                                data_to_work_with.append("ytr_fold{0}".format(i+1))
+                                data_to_work_with.append("xte_fold{0}".format(i+1))
+                                data_to_work_with.append("yte_fold{0}".format(i+1))
+                            # check best pre-treatment with a global PLSR model
+                            preReg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter=20)
+                            temp_path = Path('temp/')
+                            with open(temp_path / "lwplsr_preTreatments.json", "w+") as outfile:
+                                json.dump(preReg.best_hyperparams_, outfile)
+                            # export Xtrain, Xtest, Ytrain, Ytest and all CV folds to temp folder as csv files
+                            for i in data_to_work_with:
+                                if 'fold' in i:
+                                    j = d[i]
+                                else:
+                                    j = globals()[i]
+                                # st.write(j)
+                                np.savetxt(temp_path / str(i + ".csv"), j, delimiter=",")
+                            # run Julia Jchemo as subprocess
+                            import subprocess
+                            subprocess_path = Path("utils/")
+                            subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
+                            # retrieve json results from Julia JChemo
+                            try:
+                                with open(temp_path / "lwplsr_outputs.json", "r") as outfile:
+                                    Reg_json = json.load(outfile)
+                                # delete csv files
+                                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
+                                # delete json file after import
+                                os.unlink(temp_path / "lwplsr_outputs.json")
+                                os.unlink(temp_path / "lwplsr_preTreatments.json")
+                                # format result data into Reg object
+                                pred = ['pred_data_train', 'pred_data_test']### keys of the dict
+                                for i in range(nb_folds):
+                                    pred.append("CV" + str(i+1)) ### add cv folds keys to pred
+                            except FileNotFoundError as e:
+                                Reg = None
+                                Reg_json = None  # nothing came back from the Julia run
+                                for i in data_to_work_with: os.unlink(temp_path / str(i + ".csv"))
+
+                            st.write(Reg_json)
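To make the index arithmetic above concrete: each row of system_data['selected-wls']['idx'] carries an inclusive (from, to) pair of column positions, and the "+ 1" in np.arange makes the upper bound inclusive. A small worked example with hypothetical band positions:

    import numpy as np
    from pandas import DataFrame

    wls_idx = DataFrame([[10, 12], [40, 41]])          # two hypothetical selected bands
    flat = wls_idx.values.reshape((-1,))               # array([10, 12, 40, 41])
    idx = np.concatenate([np.arange(flat[2*i], flat[2*i + 1] + 1) for i in range(wls_idx.shape[0])])
    # idx -> array([10, 11, 12, 40, 41])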
+        ################################### results display ###################################
+        if check_exist("preprocesseddf"):
+            if preprocesseddf.shape[1]>1 and check_exist('result'):
+                hist = pred_hist(pred=result)
+                with c5:
+                    st.write('Predicted values distribution')
+                    st.pyplot(hist)
+                    st.write('Predicted values table')
+                    st.dataframe(result.T)
+                with c6:
+                    st.info('descriptive statistics for the model output')
+                    st.write(DataFrame(desc_stats(result)))
+
+            elif pred_button and nvar != preprocesseddf.shape[1]:
+                with c6:
+                    st.error(f'Error: The model was trained on {nvar} wavelengths, but you provided {preprocesseddf.shape[1]} wavelengths for prediction. Please ensure they match!')
+
+        ################################# Download results #################################
+        if check_exist('result'):
+            @st.cache_data(show_spinner =False)
+            def preparing_results_for_downloading(change):
+                match test:
+                    # load csv file
+                    case 'csv':
+                        df.to_csv('Report/out/dataset/'+ new_data.name, sep = ';', encoding = 'utf-8', mode = 'a')
+                    case 'dx':
+                        with open('Report/out/dataset/'+new_data.name, 'w') as dd:
+                            dd.write(dxdata)
+
+                rawspectraplot.savefig('./Report/out/figures/raw_spectra.png')
+                prepspectraplot.savefig('./Report/out/figures/preprocessed_spectra.png')
+                hist.savefig('./Report/out/figures/histogram.png')
+                result.round(4).to_csv('./Report/out/The analysis result.csv', sep = ";")
+
+                return change
+            preparing_results_for_downloading(change = hash_)
+
+            @st.cache_data(show_spinner =False)
+            def tempdir(change):
+                with TemporaryDirectory( prefix="results", dir="./Report") as temp_dir:# create a temp directory
+                    tempdirname = os.path.split(temp_dir)[1]
+                    if len(os.listdir('./Report/out/figures/'))==3:
+                        make_archive(base_name="./Report/Results", format="zip", base_dir="out", root_dir = "./Report")# create a zip file
+                        move("./Report/Results.zip", f"./Report/{tempdirname}/Results.zip")# put the zip inside the temp dir
+                    with open(f"./Report/{tempdirname}/Results.zip", "rb") as f:
+                        zip_data = f.read()
+                return tempdirname, zip_data
+
+            date_time = datetime.now().strftime('%y%m%d%H%M')
+            try :
+                tempdirname, zip_data = tempdir(change = hash_)
+                st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Pred_.zip', mime ="application/zip",
+                                   args = None, kwargs = None, type = "primary", use_container_width = True)
+            except:
+                st.error("Error: the results archive could not be prepared for download.")
+    # except:
+    #     c2.error('''Error: Data loading failed. Please check your file. Consider fine-tuning the dialect settings or ensure the file isn't corrupted.''')
+
+else:
+    with c2:
+        if new_data:
+            st.error("Error: The data you provided for making predictions doesn't appear to be multivariate!")
\ No newline at end of file
diff --git a/src/pages/4-inputs.py b/src/pages/4-inputs.py
index f7ab3391e7eaebd7783d08a04111393aff1a5ca7..dea19c2bac39f5b9bf81b305d8d9e7aae4050fd9 100644
--- a/src/pages/4-inputs.py
+++ b/src/pages/4-inputs.py
@@ -6,6 +6,7 @@
st.session_state["interface"] = st.session_state.get('interface')
# from Modules import *
from mod import *
from utils.DATA_HANDLING import *
+background_img(change=None)

#Import Header
diff --git a/src/report/report.py b/src/report/report.py
index b2dcccabb5ce98f88f1a88bf8f9343385b87c360..8fb1dfb7b2ee7758b71d97b45ddd45cbb36d1e46 100644
--- a/src/report/report.py
+++ b/src/report/report.py
@@ -1,7 +1,7 @@
import subprocess
from pathlib import Path
import os
-import pandas as pd
+from pandas import DataFrame
import os.path
import re
import streamlit as st
@@ -34,7 +34,7 @@ def report(*args):
            to_report.append(str(arg))
        elif isinstance(arg, list):
            to_report.extend(list(map(str, arg)))
-        elif isinstance(arg, pd.DataFrame):
+        elif isinstance(arg, DataFrame):
            df_name = 'df' + str(j)
            j+=1
            globals()[df_name] = arg.select_dtypes(include=['float64', 'int64'])
@@ -426,7 +426,6 @@

# create the Tex file - sections in args will be displayed: {'sample':'Sample Selection';'model':'Model Creation';'predict':'Predictions';'help':'LaTEX help for figs and tables';}
# latex_report = report('sample', 'predict',)
-import shutil
@st.cache_data
def generate_report(change):
    my = Path("./report/report.pdf")
diff --git a/src/style/header.py b/src/style/header.py
index fe93bca72a0bacbb7e66306844e7a920b1e5de78..577d9558b5675626b84ef4f37ea4626ab98062cc 100644
--- a/src/style/header.py
+++ b/src/style/header.py
@@ -2,14 +2,25 @@ from Packages import *
def add_header():
    st.markdown(
        """
-        <div style="width: 100%;height: 170px; background-color: rgb(122,176,199); padding: 10px; margin-bottom: 10px; ">
-          <h1 style="text-align: center; color: green;">PACE - MEEB / CEFE</h1>
-          <h2 style="text-align: center; color: green;">NIRS Utils</h2>
+        <div style="width: 100%;height: 170px; background-color: rgb(0,0,0,0);border: 4px solid rgb(122,176,199); padding: 0px; margin-bottom: 10px;border-radius: 20%; ">
+          <h1 style="font-family: 'Arial';text-align: center; color: #39bf55;">PACE - MEEB / CEFE</h1>
+          <h2 style="font-family: 'Arial';text-align: center; color: #2cb048;">NIRS Utils</h2>
        </div>
        """,
        unsafe_allow_html=True,
    )
+    st.markdown("""
+        <style>
+               .block-container {
+                    padding-top: 3rem;
+                    padding-bottom: 0rem;
+                    padding-left: 5rem;
+                    padding-right: 5rem;
+                }
+        </style>
+        """, unsafe_allow_html=True)
+

def add_sidebar(pages_folder):
    if 'interface' not in st.session_state:
@@ -28,7 +39,7 @@ def add_sidebar(pages_folder):
    )

    with st.sidebar:
-        interface = st.selectbox(label="Interface", options=['simple', 'advanced'], key='interface')
+        interface = st.radio(label="Interface", options=['simple', 'advanced'], key='interface')
        # st.page_link(str(pages_folder / '1-samples_selection.py'))
        if st.session_state['interface'] == 'simple':
            # st.page_link(str(pages_folder / '2-model_creation.py'))
diff --git a/src/utils/DATA_HANDLING.py b/src/utils/DATA_HANDLING.py
index 7f73676037f807782b933c9638d1ac7afb0a384d..fe7bc0007711b8cc4cd67cb16ee3f96e3da95f2f 100644
--- a/src/utils/DATA_HANDLING.py
+++ b/src/utils/DATA_HANDLING.py
@@ -13,7 +13,7 @@ def find_delimiter(filename):

def find_col_index(filename):
    with open(filename) as fp:
-        lines = pd.read_csv(fp, skiprows=3, nrows=3, index_col=False, sep=find_delimiter(filename))
+        lines = read_csv(fp, skiprows=3, nrows=3, index_col=False, sep=find_delimiter(filename))
        col_index = 'yes' if lines.iloc[:,0].dtypes != np.float64 else 'no'
    return col_index
@@ -22,7 +22,7 @@ def find_col_index(filename):
def col_cat(data_import):
    """detect numerical and categorical columns in the csv"""
    # set first column as sample names
-    name_col = pd.DataFrame(list(data_import.index), index = list(data_import.index))
+    name_col = DataFrame(list(data_import.index), index = list(data_import.index))
    # name_col=name_col.rename(columns = {0:'name'})
    numerical_columns_list = []
    categorical_columns_list = []
@@ -35,12 +35,12 @@ def col_cat(data_import):
            empty = [0 for x in range(len(data_import))]
            numerical_columns_list.append(empty)
    if len(categorical_columns_list) > 0:
-        categorical_data = pd.concat(categorical_columns_list, axis=1)
+        categorical_data = concat(categorical_columns_list, axis=1)
        categorical_data.insert(0, 'name', name_col)
    if len(categorical_columns_list) == 0:
-        categorical_data = pd.DataFrame
+        categorical_data = DataFrame()
    # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
-    numerical_data = pd.concat(numerical_columns_list, axis=1)
+    numerical_data = concat(numerical_columns_list, axis=1)
    numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
    return numerical_data, categorical_data
@@ -57,13 +57,13 @@ def list_files(mypath, import_type):

def standardize(X, center = True, scale = False):
    sk = StandardScaler(with_mean=center, with_std = scale)
-    sc = pd.DataFrame(sk.fit_transform(X), index = X.index, columns = X.columns)
+    sc = DataFrame(sk.fit_transform(X), index = X.index, columns = X.columns)
    return sc

def MinMaxScale(X):
    t = X
    sk = MinMaxScaler(feature_range=(0,1))
-    sc = pd.DataFrame(sk.fit_transform(X), index = t.index, columns = t.columns)
+    sc = DataFrame(sk.fit_transform(X), index = t.index, columns = t.columns)
    return sc

######################################## Spectral preprocessing
@@ -73,8 +73,8 @@ def Detrend(X):

def Snv(X):
    xt = np.array(X).T
-    c = (xt-xt.mean())/xt.std()
-    return pd.DataFrame(c.T, index=X.index, columns= X.columns)
+    c = (xt-xt.mean(axis = 0))/xt.std(axis = 0)
+    return DataFrame(c.T, index=X.index, columns= X.columns)

def No_transformation(X):
    return X
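Because Snv works on the transposed matrix, axis = 0 makes both the mean and the standard deviation per-spectrum statistics, which is the usual Standard Normal Variate definition: each spectrum is centred on its own mean and scaled by its own standard deviation. A small numeric check (hypothetical values):

    import numpy as np
    from pandas import DataFrame

    X = DataFrame([[1.0, 2.0, 3.0],      # one spectrum per row
                   [2.0, 4.0, 6.0]])
    xt = np.array(X).T                   # wavelengths x samples, as in Snv above
    c = (xt - xt.mean(axis = 0)) / xt.std(axis = 0)
    # each spectrum now has mean 0 and (population) std 1
    print(c.T.mean(axis = 1))            # [0. 0.]
    print(c.T.std(axis = 1))             # [1. 1.]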
@@ -88,7 +88,7 @@ class KF_CV:
    def CV(x, y, n_folds:int):
        test_folds = {}
        folds_name = [f'Fold{i+1}' for i in range(n_folds)]
-        kf = ks.KFold(n_splits=n_folds, device='cpu')
+        kf = ks_KFold(n_splits=n_folds, device='cpu')
        for i in range(n_folds):
            d = []
            for _, i_test in kf.split(x, y):
@@ -129,19 +129,19 @@ class KF_CV:
        coeff = {}
        y = np.array(y)
        for i, Fname in enumerate(folds.keys()):
-            r = pd.DataFrame()
+            r = DataFrame()
            r['Predicted'] = ypcv[Fname]
            r['Measured'] = y[folds[Fname]]
-            ols = LinearRegression().fit(pd.DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
+            ols = LinearRegression().fit(DataFrame(y[folds[Fname]]), ypcv[Fname].reshape(-1,1))
            r.index = folds[Fname]
            r['Folds'] = [f'{Fname} (Predicted = {np.round(ols.intercept_[0], 2)} + {np.round(ols.coef_[0][0],2)} x Measured'] * r.shape[0]
            cvcv[i] = r
            coeff[Fname] = [ols.coef_[0][0], ols.intercept_[0]]
-        data = pd.concat(cvcv, axis = 0)
+        data = concat(cvcv, axis = 0)
        data['index'] = [data.index[i][1] for i in range(data.shape[0])]
        data.index = data['index']
-        coeff = pd.DataFrame(coeff, index = ['Slope', 'Intercept'])
+        coeff = DataFrame(coeff, index = ['Slope', 'Intercept'])
        return data, coeff ## returns values predicted in cross validation and coefficients of regression

    @staticmethod
@@ -150,7 +150,7 @@ class KF_CV:
        e = {}
        for i in folds.keys():
            e[i] = metrics().reg_(y[folds[i]],ypcv[i])
-        r = pd.DataFrame(e)
+        r = DataFrame(e)
        r_print = r.copy()
        r_print['mean'] = r.mean(axis = 1)
        r_print['sd'] = r.std(axis = 1)
@@ -167,7 +167,7 @@ class KF_CV:
        e = {}
        for i in folds.keys():
            e[i] = metrics().reg_(y[folds[i]],ypcv[i])
-        r = pd.DataFrame(e)
+        r = DataFrame(e)
        r_print = r
        r_print['mean'] = r.mean(axis = 1)
        r_print['sd'] = r.std(axis = 1)
@@ -189,14 +189,14 @@

def sel_ratio(model, x ):
    from scipy.stats import f
-    x = pd.DataFrame(x)
+    x = DataFrame(x)
    wtp = model.coef_.T/ np.linalg.norm(model.coef_.T)
    ttp = np.array(x @ wtp)
    ptp = np.array(x.T) @ np.array(ttp)/(ttp.T @ ttp)
    qexpi = np.linalg.norm(ttp @ ptp.T, axis = 0)**2
    e = np.array(x-x.mean()) - ttp @ ptp.T
    qres = np.linalg.norm(e, axis = 0)**2
-    sr = pd.DataFrame(qexpi/qres, index = x.columns, columns = ['sr'])
+    sr = DataFrame(qexpi/qres, index = x.columns, columns = ['sr'])
    fcr = f.ppf(0.05, sr.shape[0]-2, sr.shape[0]-3)
    c = sr > fcr
diff --git a/src/utils/DxReader.py b/src/utils/DxReader.py
index 973372738142a6d6a1233146fef512d6c5f86461..8158228cfa6649525279bf8fcf2174e73e9ad023 100644
--- a/src/utils/DxReader.py
+++ b/src/utils/DxReader.py
@@ -44,8 +44,8 @@ class DxRead:
            }
            self.__met[f'{i}'] = block_met
-        self.metadata_ = pd.DataFrame(self.__met).T
-        self.spectra = pd.DataFrame(np.fliplr(specs), columns= self.__wl[::-1], index = self.metadata_['name']) # Storing spectra in a pd.dataframe
+        self.metadata_ = DataFrame(self.__met).T
+        self.spectra = DataFrame(np.fliplr(specs), columns= self.__wl[::-1], index = self.metadata_['name']) # Storing spectra in a dataframe
@@ -67,7 +67,7 @@ class DxRead:
            cc[df.index[i]] = self.conc(df[str(i)])
        ### dataframe containing chemical data
-        self.chem_data = pd.DataFrame(cc, index=elements_name).T.astype(float)
+        self.chem_data = DataFrame(cc, index=elements_name).T.astype(float)
        self.chem_data.index = self.metadata_['name']

    ### Method for retrieving the concentration of a single sample
diff --git a/src/utils/Evaluation_Metrics.py b/src/utils/Evaluation_Metrics.py
index ebe94c0fa613913c426ce373bb649f7e5315ae8b..ecbc6ab8421cf85d8eda8876bd2b114831a4ccb6 100644
--- a/src/utils/Evaluation_Metrics.py
+++ b/src/utils/Evaluation_Metrics.py
@@ -1,7 +1,7 @@
from Packages import *

class metrics:
-    def __init__(self, c:Optional[float] = None, cv:Optional[List] = None, t:Optional[List] = None, method = 'regression')-> pd.DataFrame:
+    def __init__(self, c:Optional[float] = None, cv:Optional[List] = None, t:Optional[List] = None, method = 'regression')-> DataFrame:
        phase = [c, cv, t]
        index = np.array(["train", "cv", "test"])
        notnone = [i for i in range(3) if phase[i] != None]
@@ -18,7 +18,7 @@ class metrics:
        if notnone == 1:
            self.ret = perf.T
        else:
-            self.ret = pd.DataFrame(perf).T
+            self.ret = DataFrame(perf).T

    @staticmethod
    def reg_(meas, pred):
diff --git a/src/utils/HDBSCAN_Clustering.py b/src/utils/HDBSCAN_Clustering.py
index a5d3bc04794b45231dbd802ece5ce19f5ba97ba8..b4dbfcaf56d42f96923e656efdd064ab96799b65 100644
--- a/src/utils/HDBSCAN_Clustering.py
+++ b/src/utils/HDBSCAN_Clustering.py
@@ -6,7 +6,7 @@ class Hdbscan:
    The HDBSCAN_scores_ @Property returns the cluster number of each sample (_labels) and the DBCV best score.

    Returns:
-        _labels (pd.DataFrame): DataFrame with the cluster belonging number for each sample
+        _labels (DataFrame): DataFrame with the cluster number assigned to each sample
        _hdbscan_score (float): a float with the best DBCV score after optimization

    Examples:
@@ -18,9 +18,9 @@ class Hdbscan:
        """Initiate the HDBSCAN calculation

        Args:
-            data (pd.DataFrame): the Dimensionality reduced space, raw result of the UMAP.fit()
+            data (DataFrame): the Dimensionality reduced space, raw result of the UMAP.fit()
            param_dist (dictionary): the HDBSCAN optimization parameters to test
-            _score (pd.DataFrame): is a dataframe with the DBCV value for each combination of param_dist. We search for the higher value to then compute an HDBSCAN with the best parameters.
+            _score (DataFrame): is a dataframe with the DBCV value for each combination of param_dist. We search for the highest value and then compute an HDBSCAN with the best parameters.
        """
        # Really fast
        self._param_dist = {'min_samples': [8],
@@ -50,7 +50,7 @@ class Hdbscan:
        #     return tunning

        # compute optimization. Test each combination of parameters and store DBCV score into _score.
-        # self._score = pd.DataFrame()
+        # self._score = DataFrame()
        # for i in self._param_dist.get('min_samples'):
        #     for j in self._param_dist.get('min_cluster_size'):
        #         self._ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(self._clusterable_embedding)
diff --git a/src/utils/Hash.py b/src/utils/Hash.py
index fb4138405efa4abbffcb4377cf1062df9758290f..9cde1413b5d8eb1f9f294a7d34d231517540eb42 100644
--- a/src/utils/Hash.py
+++ b/src/utils/Hash.py
@@ -2,7 +2,7 @@ from Packages import *

def create_hash(to_hash):
    #using the md5 hash function.
-    hash_func = hashlib.md5()
+    hash_func = md5()
    to_hash = str(to_hash)
    encoded_to_hash = to_hash.encode()
    hash_func.update(encoded_to_hash)
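The pages above use create_hash through small p_hash helpers that fold each new input (file names, widget choices, parsed data) into a running hash used as a cache key for st.cache_data. The p_hash bodies themselves fall outside these hunks; a purely hypothetical sketch of such an accumulator, for orientation only:

    # hypothetical accumulator pattern; the actual p_hash implementation is not shown in this diff
    hash_ = ''
    def p_hash(add):
        global hash_
        hash_ = create_hash(hash_ + create_hash(add))
        return hash_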
""" self.x = x self.max_clusters = max_clusters - self.inertia = pd.DataFrame() + self.inertia = DataFrame() for i in range(1, max_clusters+1): model = KMeans(n_clusters = i, init = 'k-means++', random_state = 42) model.fit(x) diff --git a/src/utils/KennardStone.py b/src/utils/KennardStone.py index 3ad6c9179dbe92882666876c29ef2a3cf4f8a17c..1fb85957e42c639fa6ff2e06631c1e2863ebfc16 100644 --- a/src/utils/KennardStone.py +++ b/src/utils/KennardStone.py @@ -2,10 +2,10 @@ from Packages import * from typing import Sequence, Dict, Optional, Union class KS: - def __init__(self, x:Optional[Union[np.ndarray|pd.DataFrame]], rset:Optional[Union[float|int]]): + def __init__(self, x:Optional[Union[np.ndarray|DataFrame]], rset:Optional[Union[float|int]]): self.x = x self.ratio = rset - self._train, self._test = ks.train_test_split(self.x, train_size = self.ratio) + self._train, self._test = ks_train_test_split(self.x, train_size = self.ratio) @property def calset(self): @@ -13,7 +13,7 @@ class KS: return self.x, clu class RDM: - def __init__(self, x:Optional[Union[np.ndarray|pd.DataFrame]], rset:Optional[Union[float|int]]): + def __init__(self, x:Optional[Union[np.ndarray|DataFrame]], rset:Optional[Union[float|int]]): self.x = x self.ratio = rset self._train, self._test = train_test_split(self.x, train_size = self.ratio) diff --git a/src/utils/LWPLSR_.py b/src/utils/LWPLSR_.py index 2e6c7a7f074b5a205c7648f6b880967c5570d3bb..db7902a84204929ab6474649a9484ab2251bff1a 100644 --- a/src/utils/LWPLSR_.py +++ b/src/utils/LWPLSR_.py @@ -1,6 +1,7 @@ from juliacall import Main as jl import numpy as np import pandas as pd +from pandas import DataFrame class LWPLSR: """The lwpls regression model from Jchemo (M. Lesnoff) @@ -193,13 +194,13 @@ class LWPLSR: res.pred """) # save predicted values for each KFold in the predicted_results dictionary - self.predicted_results["CV" + str(i+1)] = pd.DataFrame(pred_cv) + self.predicted_results["CV" + str(i+1)] = DataFrame(pred_cv) @property def pred_data_(self): # convert predicted data from x_test to Pandas DataFrame - self.predicted_results["pred_data_train"] = pd.DataFrame(self.pred_train) - self.predicted_results["pred_data_test"] = pd.DataFrame(self.pred_test) + self.predicted_results["pred_data_train"] = DataFrame(self.pred_train) + self.predicted_results["pred_data_test"] = DataFrame(self.pred_test) return self.predicted_results @property diff --git a/src/utils/Miscellaneous.py b/src/utils/Miscellaneous.py index 42caf59f410d75e7b8a77415c4b82066e67077b9..6f09fc10112f690e30530ece61a3a3379e311276 100644 --- a/src/utils/Miscellaneous.py +++ b/src/utils/Miscellaneous.py @@ -14,7 +14,7 @@ def prediction(NIRS_csv, qsep, qhdr, model): col = 0 else: col = False - X_test = pd.read_csv(NIRS_csv, sep=qsep, index_col=col) + X_test = read_csv(NIRS_csv, sep=qsep, index_col=col) Y_preds = model.predict(X_test) # Y_preds = X_test return Y_preds @@ -38,8 +38,8 @@ def reg_plot( meas, pred, train_idx, test_idx): et = np.subtract(np.array(meas[1]).reshape(-1), np.array(pred[1]).reshape(-1)) fig, ax = plt.subplots(figsize = (12,4)) - sns.regplot(x = meas[0] , y = pred[0], color='blue', label = f'Calib (Predicted = {a0[0]} + {a1[0]} x Measured)') - sns.regplot(x = meas[1], y = pred[1], color='green', label = f'Test (Predicted = {a0[1]} + {a1[1]} x Measured)') + sns.regplot(x = meas[0] , y = pred[0], color="#2C6B6F", label = f'Cal (Predicted = {a0[0]} + {a1[0]} x Measured)', scatter_kws={'edgecolor': 'black'}) + sns.regplot(x = meas[1], y = pred[1], color='#d0f7be', label = f'Val (Predicted = 
diff --git a/src/utils/Miscellaneous.py b/src/utils/Miscellaneous.py
index 42caf59f410d75e7b8a77415c4b82066e67077b9..6f09fc10112f690e30530ece61a3a3379e311276 100644
--- a/src/utils/Miscellaneous.py
+++ b/src/utils/Miscellaneous.py
@@ -14,7 +14,7 @@ def prediction(NIRS_csv, qsep, qhdr, model):
        col = 0
    else:
        col = False
-    X_test = pd.read_csv(NIRS_csv, sep=qsep, index_col=col)
+    X_test = read_csv(NIRS_csv, sep=qsep, index_col=col)
    Y_preds = model.predict(X_test)
    # Y_preds = X_test
    return Y_preds
@@ -38,8 +38,8 @@ def reg_plot( meas, pred, train_idx, test_idx):
    et = np.subtract(np.array(meas[1]).reshape(-1), np.array(pred[1]).reshape(-1))

    fig, ax = plt.subplots(figsize = (12,4))
-    sns.regplot(x = meas[0] , y = pred[0], color='blue', label = f'Calib (Predicted = {a0[0]} + {a1[0]} x Measured)')
-    sns.regplot(x = meas[1], y = pred[1], color='green', label = f'Test (Predicted = {a0[1]} + {a1[1]} x Measured)')
+    sns.regplot(x = meas[0] , y = pred[0], color="#2C6B6F", label = f'Cal (Predicted = {a0[0]} + {a1[0]} x Measured)', scatter_kws={'edgecolor': 'black'})
+    sns.regplot(x = meas[1], y = pred[1], color='#d0f7be', label = f'Val (Predicted = {a0[1]} + {a1[1]} x Measured)', scatter_kws={'edgecolor': 'black'})
    plt.plot([np.min(meas[0]) - 0.05, np.max([meas[0]]) + 0.05], [np.min(meas[0]) - 0.05, np.max([meas[0]]) + 0.05], color = 'black')

    for i, txt in enumerate(train_idx):
@@ -72,8 +72,11 @@ def resid_plot( meas, pred, train_idx, test_idx):

    fig, ax = plt.subplots(figsize = (12,4))
-    sns.scatterplot(x = pred[0], y = e[0], color='blue', label = f'Calib (Residual = {a0[0]} + {a1[0]} * Predicted)')
-    sns.scatterplot(x = pred[1], y = e[1], color='green', label = f'Test (Residual = {a0[1]} + {a1[1]} * Predicted)')
+    sns.scatterplot(x = pred[0], y = e[0], color="#2C6B6F", label = 'Cal', edgecolor="black")
+    sns.scatterplot(x = pred[1], y = e[1], color="#d0f7be", label = 'Val', edgecolor="black")
+
+    # sns.scatterplot(x = pred[0], y = e[0], color='blue', label = f'Cal (Residual = {a0[0]} + {a1[0]} * Predicted)')
+    # sns.scatterplot(x = pred[1], y = e[1], color='green', label = f'Val (Residual = {a0[1]} + {a1[1]} * Predicted)')
    plt.axhline(y= 0, c ='black', linestyle = ':')
    lim = np.max(abs(np.concatenate([e[0], e[1]], axis = 0)))*1.1
    plt.ylim(- lim, lim )
@@ -105,25 +108,72 @@ def download_results(data, export_name):
    with open(data) as f:
        st.download_button('Download', f, export_name, type='primary')

-@st.cache_resource
-def plot_spectra(df, xunits, yunits):
+@st.cache_data
+def plot_spectra(specdf, xunits, yunits):
    fig, ax = plt.subplots(figsize = (30,7))
-    if isinstance(df.columns[0], str):
-        df.T.plot(legend=False, ax = ax, color = 'blue')
+    if isinstance(specdf.columns[0], str):
+        specdf.T.plot(legend=False, ax = ax, color = '#2474b4')
        min = 0
    else:
-        min = np.max(df.columns)
-        df.T.plot(legend=False, ax = ax, color = 'blue').invert_xaxis()
+        min = np.max(specdf.columns)
+        specdf.T.plot(legend=False, ax = ax, color = '#2474b4').invert_xaxis()

-    ax.set_xlabel(xunits, fontsize=18)
-    ax.set_ylabel(yunits, fontsize=18)
+    ax.set_xlabel(xunits, fontsize=30)
+    ax.set_ylabel(yunits, fontsize=30)
    plt.margins(x = 0)
    plt.tight_layout()
    return fig

+@st.cache_data
+def hist(y, y_train, y_test, target_name = 'y'):
+    fig, ax = plt.subplots(figsize = (12,3))
+    sns.histplot(y, color = "#004e9e", kde = True, label = str(target_name), ax = ax, fill = True)
+    sns.histplot(y_train, color = "#2C6B6F", kde = True, label = str(target_name)+" (Cal)", ax = ax, fill = True)
+    sns.histplot(y_test, color = "#d0f7be", kde = True, label = str(target_name)+" (Val)", ax = ax, fill = True)
+    ax.set_xlabel(str(target_name))
+    plt.legend()
+    plt.tight_layout()
+    return fig

+@st.cache_data
+def pred_hist(pred):
+    # Creating histogram
+    hist, axs = plt.subplots(1, 1, figsize =(15, 3), tight_layout = True)
+
+    # Add x, y gridlines
+    axs.grid( color ='grey', linestyle ='-.', linewidth = 0.5, alpha = 0.6)
+    # Remove axes splines
+    for s in ['top', 'bottom', 'left', 'right']:
+        axs.spines[s].set_visible(False)
+    # Remove x, y ticks
+    axs.xaxis.set_ticks_position('none')
+    axs.yaxis.set_ticks_position('none')
+    # Add padding between axes and labels
+    axs.xaxis.set_tick_params(pad = 5)
+    axs.yaxis.set_tick_params(pad = 10)
+    # Creating histogram
+    N, bins, patches = axs.hist(pred, bins = 12)
+    return hist
+
+@st.cache_data
+def fig_export():
+    pass
+
+@st.cache_data(show_spinner =True)
+def data_split(x, y):
+    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+    train_index, test_index = train_test_split_idx(x, y = y, method = "kennard_stone", metric = "correlation", test_size = 0.25, random_state = 42)
+    # Assign data to training and test sets
+    X_train, y_train = DataFrame(x.iloc[train_index,:]), y.iloc[train_index]
+    X_test, y_test = DataFrame(x.iloc[test_index,:]), y.iloc[test_index]
+    return X_train, X_test, y_train, y_test, train_index, test_index

## descriptive stat
+@st.cache_data(show_spinner =True)
def desc_stats(x):
    a = {}
    a['N samples'] = x.shape[0]
@@ -143,9 +193,9 @@ def hash_data(data):
    """Hash various data types using xxHash."""

    # Convert to a string representation
-    if isinstance(data, pd.DataFrame):
+    if isinstance(data, DataFrame):
        data_str = data.to_string()
-    elif isinstance(data, pd.Series):
+    elif isinstance(data, Series):
        data_str = data.to_string()
    elif isinstance(data, np.ndarray):
        data_str = np.array2string(data, separator=',')
@@ -169,4 +219,35 @@ def hash_data(data):

    # Compute the hash digest (xxhash.xxh32, despite the variable name)
    md5_hash = xxhash.xxh32(data_bytes).hexdigest()
-    return str(md5_hash)
\ No newline at end of file
+    return str(md5_hash)
+
+
+#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ style test
+
+@st.cache_data
+def background_img(change):
+    import base64
+    image_path = './images/img-sky.jpg'
+    with open(image_path, "rb") as image_file:
+        base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+
+    # CSS code to set the base64-encoded image as the app background
+    background_image_style = f"""
+        <style>
+        .stApp {{
+            background-image: url("data:image/jpeg;base64,{base64_image}");
+            background-size: cover;
+            background-repeat: no-repeat;
+            background-attachment: fixed;
+        }}
+        </style>
+    """
+
+    # Inject the CSS style
+    st.markdown(background_image_style, unsafe_allow_html=True)
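A short usage sketch of data_split above; it produces the train/test indices that the model-creation page stores as 'training_data_idx' and 'testing_data_idx' in file_system.pkl (train_test_split_idx is assumed to be imported in Packages.py, which is outside this diff):

    # x: spectra DataFrame (samples x wavelengths), y: target Series
    # 25% of the samples go to the test set, selected by Kennard-Stone on the correlation metric
    X_train, X_test, y_train, y_test, train_index, test_index = data_split(x, y)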
diff --git a/src/utils/NMF_.py b/src/utils/NMF_.py
index fead5eb4f82b256d0591fc16b44fd5ca0acc4114..8defac8f14a5608ba522d67a2a93b163eb5a28bc 100644
--- a/src/utils/NMF_.py
+++ b/src/utils/NMF_.py
@@ -21,8 +21,8 @@ class Nmf:
        self._t = Mo.transform(self.__x)
    @property
    def scores_(self):
-        return pd.DataFrame(self._t)
+        return DataFrame(self._t)
    @property
    def loadings_(self):
-        return pd.DataFrame(self._p)
\ No newline at end of file
+        return DataFrame(self._p)
\ No newline at end of file
diff --git a/src/utils/PCA_.py b/src/utils/PCA_.py
index 0d2afdb2d00add778fdfdd2f1a56e34e57886e5f..c5023a0503c9a1a1c6a3083b83ec29b5e8f5cd22 100644
--- a/src/utils/PCA_.py
+++ b/src/utils/PCA_.py
@@ -14,7 +14,7 @@ class LinearPCA:
        ######## results ########
        # Results
        self.__pcnames = [f'PC{i+1}({100 * M.explained_variance_ratio_[i].round(2)}%)' for i in range(self.__ncp)]
-        self._Qexp_ratio = pd.DataFrame(100 * M.explained_variance_ratio_, columns = ["Qexp"], index= [f'PC{i+1}' for i in range(self.__ncp)])
+        self._Qexp_ratio = DataFrame(100 * M.explained_variance_ratio_, columns = ["Qexp"], index= [f'PC{i+1}' for i in range(self.__ncp)])
        self._p = M.components_.T
        self._t = M.transform(self.__x)
@@ -40,14 +40,14 @@ class LinearPCA:

    @property
    def scores_(self):
-        return pd.DataFrame(self._t, columns= self.__pcnames)
+        return DataFrame(self._t, columns= self.__pcnames)

    @property
    def loadings_(self):
-        return pd.DataFrame(self._p, columns=self.__pcnames)
+        return DataFrame(self._p, columns=self.__pcnames)

    @property
    def residuals_(self):
-        res = pd.DataFrame(self._qres)
+        res = DataFrame(self._qres)
        res.columns=self.__pcnames
        return res
\ No newline at end of file
diff --git a/src/utils/PLSR_.py b/src/utils/PLSR_.py
index 541d5c782592912127d31c98bea843518e77e9e5..6f90bf49b3cb693cdf17e7560d90b703f1f7d7a1 100644
--- a/src/utils/PLSR_.py
+++ b/src/utils/PLSR_.py
@@ -32,9 +32,9 @@ class PinardPlsr:
    # fit scores
    # Predictions on test set
-        self.yc = pd.DataFrame(self.trained.predict(self.x_train)) # make predictions on test data and assign to Y_preds variable
-        self.ycv = pd.DataFrame(cross_val_predict(self.trained, self.x_train, self.y_train, cv = 3)) # make predictions on test data and assign to Y_preds variable
-        self.yt = pd.DataFrame(self.trained.predict(self.x_test)) # make predictions on test data and assign to Y_preds variable
+        self.yc = DataFrame(self.trained.predict(self.x_train)) # predictions on the training data
+        self.ycv = DataFrame(cross_val_predict(self.trained, self.x_train, self.y_train, cv = 3)) # cross-validated predictions on the training data
+        self.yt = DataFrame(self.trained.predict(self.x_test)) # predictions on the test data

################################################################################################################
diff --git a/src/utils/PLSR_Preprocess.py b/src/utils/PLSR_Preprocess.py
index b70d2aae0291b468005dad970b15b0f8b80d7b26..f83260ca9292fd47aeff38f41439a7213af6e73b 100644
--- a/src/utils/PLSR_Preprocess.py
+++ b/src/utils/PLSR_Preprocess.py
@@ -4,7 +4,7 @@ from utils.DATA_HANDLING import *

class PlsProcess:
    SCORE = 100000000
-    index_export = pd.DataFrame()
+    index_export = DataFrame()

    def __init__(self, x_train, x_test, y_train, y_test, scale, Kfold):
        PlsProcess.SCORE = 10000
@@ -36,9 +36,9 @@ class PlsProcess:
            self.x_train = self.xtrain
            self.x_test = self.xtest
        else:
-            self.x_train = pd.DataFrame(eval(f'savgol_filter(self.xtrain, polyorder={params['polyorder']}, deriv={params['deriv']}, window_length = {params['window_length']})'),
                                        columns = self.xtrain.columns, index= self.xtrain.index)
-            self.x_test = pd.DataFrame(eval(f'savgol_filter(self.xtest, polyorder={params['polyorder']}, deriv={params['deriv']}, window_length = {params['window_length']})'), columns = self.xtest.columns , index= self.xtest.index)
+            self.x_train = DataFrame(eval(f'savgol_filter(self.xtrain, polyorder={params['polyorder']}, deriv={params['deriv']}, window_length = {params['window_length']})'),
                                     columns = self.xtrain.columns, index= self.xtrain.index)
+            self.x_test = DataFrame(eval(f'savgol_filter(self.xtest, polyorder={params['polyorder']}, deriv={params['deriv']}, window_length = {params['window_length']})'), columns = self.xtest.columns , index= self.xtest.index)

        try:
diff --git a/src/utils/RegModels.py b/src/utils/RegModels.py
index a913e71e2daa56299056e7bd8109189b472eb6f6..1b759f005947de3a4616987fca5aac40bb6e83d8 100644
--- a/src/utils/RegModels.py
+++ b/src/utils/RegModels.py
@@ -10,10 +10,10 @@ class Regmodel(object):
        self._nc, self._nt, self._p = train[0].shape[0], test[0].shape[0], train[0].shape[1]
        self._model, self._best = None, None
        self._yc, self._ycv, self._yt = None, None, None
-        self._cv_df = pd.DataFrame()
-        self._sel_ratio = pd.DataFrame()
+        self._cv_df = DataFrame()
+        self._sel_ratio = DataFrame()
        self._nfolds = nfolds
-        self._selected_bands = pd.DataFrame(index = ['from', 'to'])
+        self._selected_bands = DataFrame(index = ['from', 'to'])
        self.important_features = None
        self._hyper_params = {'polyorder': hp.choice('polyorder', [0, 1, 2]),
                              'deriv': hp.choice('deriv', [0, 1, 2]),
@@ -135,7 +135,7 @@ class Plsr(Regmodel):
            except (TypeError, ValueError):
                params[key] = value
        self._best = params
-        self.pretreated = pd.DataFrame(x2[0])
+        self.pretreated = DataFrame(x2[0])
        self._sel_ratio = sel_ratio(Model, x2[0])
        return score

@@ -210,7 +210,7 @@ class TpeIpls:
            except (TypeError, ValueError):
                params[key] = value
        self._best = params
-        self.pretreated = pd.DataFrame(x2[0])
+        self.pretreated = DataFrame(x2[0])
        self.segments = arrays

        for i in range(len(self.segments)):
diff --git a/src/utils/SK_PLSR_.py b/src/utils/SK_PLSR_.py
index bafc8cef9cc74d423f7fdc4a960c50ada5e60391..c6143113baaa570eb4e7877f69a51b6a1116d82c 100644
--- a/src/utils/SK_PLSR_.py
+++ b/src/utils/SK_PLSR_.py
@@ -58,9 +58,9 @@ class PlsR:
        self.trained = PLSRegression(n_components= self.best['n_components'], scale = False)
        self.trained.fit(x_train, self.y_train)

-        self.yc = pd.DataFrame(self.trained.predict(x_train)) # make predictions on test data and assign to Y_preds variable
-        self.ycv = pd.DataFrame(cross_val_predict(self.trained, x_train, self.y_train, cv = 3)) # make predictions on test data and assign to Y_preds variable
-        self.yt = pd.DataFrame(self.trained.predict(x_test)) # make predictions on test data and assign to Y_preds variable
+        self.yc = DataFrame(self.trained.predict(x_train)) # predictions on the training data
+        self.ycv = DataFrame(cross_val_predict(self.trained, x_train, self.y_train, cv = 3)) # cross-validated predictions on the training data
+        self.yt = DataFrame(self.trained.predict(x_test)) # predictions on the test data

    #######################################################################################################
    def objective(self, params):
diff --git a/src/utils/UMAP_.py b/src/utils/UMAP_.py
index 7b0e41e5344b3beef4157cbd8a85f409ff79dc90..b4110d5481979a3cb6285224a378e8bd3a78f221 100644
--- a/src/utils/UMAP_.py
+++ b/src/utils/UMAP_.py
@@ -20,7 +20,7 @@ class Umap:
        self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, )#random_state=42,)
        self.model.fit(self.numerical_data, y = self.categorical_data_encoded)
        self.scores_raw = self.model.transform(self.numerical_data)
-        self.scores = pd.DataFrame(self.scores_raw)
+        self.scores = DataFrame(self.scores_raw)
        self.scores.columns = [f'axis_{i+1}' for i in range(self.scores_raw.shape[1])]

    @property
diff --git a/src/utils/VarSel.py b/src/utils/VarSel.py
index b23e2fcdbe6efb83a841a2ca5458b8a0c743351f..001f56ecf4874684769c80e48b6275d857237c3f 100644
--- a/src/utils/VarSel.py
+++ b/src/utils/VarSel.py
@@ -15,7 +15,7 @@ class TpeIpls:
    '''Optimization algorithms can be used to find the subset of variables that optimize a certain criterion
    (e.g., maximize predictive performance, minimize overfitting)'''
    SCORE = 100000000
-    index_export = pd.DataFrame()
+    index_export = DataFrame()

    def __init__(self, x_train, x_test, y_train, y_test, scale, Kfold, n_intervall):
        TpeIpls.SCORE = 10000
@@ -58,10 +58,10 @@ class TpeIpls:

        pt = params['Preprocess']
-        self.x_train = pd.DataFrame(eval(f"savgol_filter(xtrain1, polyorder=pt['deriv_sg'], deriv=pt['deriv_sg'], window_length = pt['window_length_sg'], delta=1.0, axis=-1, mode='interp', cval=0.0)") ,
                                    columns = self.xtrain.columns, index= self.xtrain.index)
-        self.x_test = pd.DataFrame(eval(f"savgol_filter(xtest1, polyorder=pt['deriv_sg'], deriv=pt['deriv_sg'], window_length = pt['window_length_sg'], delta=1.0, axis=-1, mode='interp', cval=0.0)") ,
                                   columns = self.xtest.columns, index= self.xtest.index)
+        self.x_train = DataFrame(eval(f"savgol_filter(xtrain1, polyorder=pt['deriv_sg'], deriv=pt['deriv_sg'], window_length = pt['window_length_sg'], delta=1.0, axis=-1, mode='interp', cval=0.0)") ,
                                 columns = self.xtrain.columns, index= self.xtrain.index)
+        self.x_test = DataFrame(eval(f"savgol_filter(xtest1, polyorder=pt['deriv_sg'], deriv=pt['deriv_sg'], window_length = pt['window_length_sg'], delta=1.0, axis=-1, mode='interp', cval=0.0)") ,
                                columns = self.xtest.columns, index= self.xtest.index)
@@ -93,7 +93,7 @@ class TpeIpls:

        self.nlv = params['n_components']

-        TpeIpls.index_export = pd.DataFrame()
+        TpeIpls.index_export = DataFrame()
        TpeIpls.index_export["Vars"] = self.x_test.columns[id]
        TpeIpls.index_export.index = id
        self.best = params
@@ -122,7 +122,7 @@ class TpeIpls:
        for i in range(len(self.segments)):
            ban[f'band{i+1}'] = [self.segments[i][0], self.segments[i][self.segments[i].shape[0]-1]]
-        self.bands = pd.DataFrame(ban).T
+        self.bands = DataFrame(ban).T
        self.bands.columns = ['from', 'to']
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index c5434f5f66a91df7d2634dd82077d24df06676ce..1191fd2c6cb270ca669fd2313231e14f9cfcb365 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -8,7 +8,7 @@ from .PLSR_ import PinardPlsr
from .LWPLSR_ import LWPLSR
from .Evaluation_Metrics import metrics
#from .VarSel import TpeIpls
-from .Miscellaneous import resid_plot, reg_plot, desc_stats, hash_data
+from .Miscellaneous import resid_plot, reg_plot, desc_stats, hash_data, hist, pred_hist, background_img
from .DxReader import DxRead, read_dx
from .HDBSCAN_Clustering import Hdbscan
from .SK_PLSR_ import PlsR
@@ -16,4 +16,4 @@ from .PLSR_Preprocess import PlsProcess
from .NMF_ import Nmf
from .Ap import AP
from .RegModels import Plsr, TpeIpls
-from .KennardStone import KS, RDM
+from .KennardStone import KS, RDM
\ No newline at end of file