diff --git a/src/common.py b/src/common.py index 24e7991b1f99f245677ca80750f9a9bb7337f4aa..3d2b4897d3b2ae5c1841bf1a6b5dd58f211d6971 100644 --- a/src/common.py +++ b/src/common.py @@ -25,6 +25,7 @@ from tempfile import NamedTemporaryFile import numpy as np from datetime import datetime import json +from shutil import rmtree, move, make_archive from utils.data_parsing import JcampParser, CsvParser from style.layout import UiComponents diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 80891e047da9447f9141e11b0fb30722c61b53dc..483f159d9e5ee8412a123e0b1c892317a8a4830b 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -38,7 +38,7 @@ if Path('report/out/model').exists() and Path('report/out/model').is_dir(): match st.session_state["interface"]: case 'simple': dim_red_methods = ['PCA'] - cluster_methods = ['Kmeans'] # List of clustering algos + cluster_methods = ['KS'] # List of clustering algos selec_strategy = ['center'] case 'advanced': @@ -109,15 +109,16 @@ else: data_str = str(stringio.read()) @st.cache_data - def csv_loader(file = file, change = None): + def read_csv(file = file, change = None): from utils.data_parsing import CsvParser par = CsvParser(file= file) - float_data, non_float = par.parse(decimal = dec, separator = sep, index_col = names, header = hdr) - return float_data, non_float + par.parse(decimal = dec, separator = sep, index_col = names, header = hdr) + return par.float, par.meta_data, par.meta_data_st_, par.df + spectra, meta_data, md_df_st_, imp = read_csv(file= file, change = hash_) try : - spectra, meta_data = csv_loader(file= file, change = hash_) + spectra, meta_data, md_df_st_, imp = read_csv(file= file) st.success("The data have been loaded successfully", icon="✅") except: @@ -147,7 +148,12 @@ else: st.success("The data have been loaded successfully", icon="✅") ################################################### END : I- Data loading and preparation #################################################### - +if not spectra.empty: + with c2: + st.write('Data summary:') + st.write(f'- the number of spectra:{spectra.shape[0]}') + st.write(f'- the number of wavelengths:{spectra.shape[1]}') + st.write(f'- the number of categorical variables:{meta_data.shape[1]}') ################################################### BEGIN : visualize and split the data #################################################### st.subheader("I - Spectral Data Visualization", divider='blue') @@ -265,13 +271,16 @@ if not spectra.empty: elif sel_ratio < 1.00: ratio = int(sel_ratio*spectra.shape[0]) ObjectHash(sel_ratio) + if st.session_state["interface"] =='simple': + clus_method = 'KS' - if dr_model and not clus_method: - clus_method = st.radio('Select samples selection strategy:', options = ['RDM', 'KS']) + else: + if dr_model and not clus_method: + clus_method = st.radio('Select samples selection strategy:', options = ['RDM', 'KS']) - elif dr_model and clus_method: - disabled1 = False if clus_method in cluster_methods else True - selection = st.radio('Select samples selection strategy:', options = selec_strategy, disabled = disabled1) + elif dr_model and clus_method: + disabled1 = False if clus_method in cluster_methods else True + selection = st.radio('Select samples selection strategy:', options = selec_strategy, disabled = disabled1) @@ -338,8 +347,10 @@ elif labels: for i in np.unique(s): C = np.where(np.array(labels) == i)[0] if C.shape[0] >= selection_number: + from sklearn.cluster import KMeans km2 = KMeans(n_clusters = selection_number) km2.fit(tcr.iloc[C,:]) + from sklearn.metrics import pairwise_distances_argmin_min clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:]) selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index) else: @@ -356,6 +367,18 @@ if not t.empty: filter = [''] + md_df_st_.columns.tolist() elif meta_data.empty and not clus_method in cluster_methods: filter = [] + + if st.session_state["interface"] =='simple': + desactivatelist = True + if meta_data.empty: + desactivatelist = True + filter = [''] + elif not meta_data.empty: + filter = [''] + md_df_st_.columns.tolist() + desactivatelist = False + else: + desactivatelist = False + with c12: st.write('Scores plot') @@ -363,7 +386,7 @@ if not t.empty: if len(axis)== 1: tcr_plot['1d'] = np.random.uniform(-.5, .5, tcr_plot.shape[0]) - colfilter = st.selectbox('Color by:', options= filter,format_func = lambda x: x if x else "<Select>") + colfilter = st.selectbox('Color by:', options= filter,format_func = lambda x: x if x else "<Select>", disabled = desactivatelist) ObjectHash(colfilter) if colfilter in cluster_methods: tcr_plot[colfilter] = labels @@ -500,14 +523,14 @@ if not spectra.empty: out3 = leverage > tresh3 out4 = residuals > tresh4 - for i in range(n_samples): - if out3[i]: - if not meta_data.empty: - ann = meta_data.loc[:,'name'][i] - else: - ann = t.index[i] - influence_plot.add_annotation(dict(x = leverage[i], y = residuals[i], showarrow=True, text = str(ann),font= dict(color= "black", size= 15), - xanchor = 'auto', yanchor = 'auto')) + # for i in range(n_samples): + # if out3[i]: + # if not meta_data.empty: + # ann = meta_data.loc[:,'name'][i] + # else: + # ann = t.index[i] + # influence_plot.add_annotation(dict(x = leverage[i], y = residuals[i], showarrow=True, text = str(ann),font= dict(color= "black", size= 15), + # xanchor = 'auto', yanchor = 'auto')) influence_plot.update_traces(marker=dict(size= 6), showlegend= True) influence_plot.update_layout(font=dict(size=23), width=800, height=500) @@ -623,7 +646,7 @@ if not sam.empty: ################################################### # ## generate report @st.cache_data - def export_report(variable): + def export_report(change): latex_report = report.report('Representative subset selection', file.name, dim_red_method, clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam) @@ -638,7 +661,7 @@ if not sam.empty: with open('report/out/dataset/'+file.name, 'w') as dd: dd.write(dxdata) - fig_spectra.savefig(report_path_rel/"out/figures/spectra_plot.png", dpi=400) ## Export report + fig_spectra.savefig(report_path_rel/"out/figures/spectra_plot.png", dpi = 400) ## Export report if len(axis) == 3: for i in range(len(comb)): @@ -650,6 +673,7 @@ if not sam.empty: # Export du graphique if dim_red_method in ['PCA','NMF']: + import plotly.io as pio img = pio.to_image(loadingsplot, format="png") with open(report_path_rel/"out/figures/loadings_plot.png", "wb") as f: f.write(img) @@ -658,25 +682,27 @@ if not sam.empty: influence_plot.write_image(report_path_rel/'out/figures/influence_plot.png', engine = 'kaleido') sam.to_csv(report_path_rel/'out/Selected_subset_for_calib_development.csv', sep = ';') - export_report(variable) + export_report(change = hash_) if Path(report_path_rel/"report.tex").exists(): - report.generate_report(variable = 25) + report.generate_report(change = hash_) if Path(report_path_rel/"report.pdf").exists(): move(report_path_rel/"report.pdf", "./report/out/report.pdf") return change - preparing_results_for_downloading(variable = 25) - report.generate_report(variable = 25) + preparing_results_for_downloading(change = hash_) + report.generate_report(change = hash_) @st.cache_data def tempdir(change): + from tempfile import TemporaryDirectory with TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory tempdirname = os.path.split(temp_dir)[1] if len(os.listdir(report_path_rel/'out/figures/'))>=2: + make_archive(base_name= report_path_rel/"Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file move(report_path_rel/"Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir with open(report_path_rel/f"{tempdirname}/Results.zip", "rb") as f: diff --git a/src/utils/data_parsing.py b/src/utils/data_parsing.py index 35e7790090a2b80ead55abd8b0f4c9b7ff6bd2a3..d408fc3ff95aec345df995b9268adc9fd4e33040 100644 --- a/src/utils/data_parsing.py +++ b/src/utils/data_parsing.py @@ -86,6 +86,7 @@ class JcampParser: @property def specs_df_(self): return self.spectra + @property def meta_data_st_(self): me = self.metadata_.drop("concentrations", axis = 1) @@ -114,14 +115,22 @@ class CsvParser: def parse(self, decimal, separator, index_col, header): from pandas import read_csv - df = read_csv(self.file, decimal = decimal, sep = separator, index_col = index_col, header = header) - if len(set(df.index))<df.shape[0]: - df = read_csv(self.file, decimal = decimal, sep = separator, index_col = None, header = header) + self.df = read_csv(self.file, decimal = decimal, sep = separator, index_col = index_col, header = header) + if len(set(self.df.index))<self.df.shape[0]: + self.df = read_csv(self.file, decimal = decimal, sep = separator, index_col = None, header = header) - float, non_float = df.select_dtypes(include='float'), df.select_dtypes(exclude='float') - return float, non_float + self.float, self.non_float = self.df.select_dtypes(include='float'), self.df.select_dtypes(exclude='float') + @property + def meta_data_st_(self): + me = self.non_float.applymap(lambda x: x.upper() if isinstance(x, str) else x) + meta_data_st = me.loc[:,me.nunique(axis=0) > 1] + return meta_data_st + + @property + def meta_data(self): + return self.non_float # def parse(self): # import pandas as pd