diff --git a/src/Class_Mod/HDBSCAN_Clustering.py b/src/Class_Mod/HDBSCAN_Clustering.py index c996730925685329d83f8b930a8a1539bccb1064..a5d3bc04794b45231dbd802ece5ce19f5ba97ba8 100644 --- a/src/Class_Mod/HDBSCAN_Clustering.py +++ b/src/Class_Mod/HDBSCAN_Clustering.py @@ -321,6 +321,15 @@ class Hdbscan: # return members @property - def HDBSCAN_scores_(self): + def centers_(self): # return self._labels, self._hdbscan_bscore, self._centers - return self._labels, self._centers + return self._centers + @property + def labels_(self): + labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in self._labels] + return labels + @property + def non_clustered(self): + labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in self._labels] + non_clustered = np.where(np.array(labels) == 'Non clustered')[0] + return non_clustered diff --git a/src/Class_Mod/KMEANS_.py b/src/Class_Mod/KMEANS_.py index 8c67f1d8eeccc528d54afe61a88814705c12746a..78cb732f07aa961e38056cb4e6e070ad7588fb0c 100644 --- a/src/Class_Mod/KMEANS_.py +++ b/src/Class_Mod/KMEANS_.py @@ -39,11 +39,12 @@ class Sk_Kmeans: idxidx.append(f'{i+1}_clust') values.append((s[i] - s[i+1])*100 / s[i]) - id = np.max(np.where(np.array(values) > 10))+2 + id = np.max(np.where(np.array(values) > 5))+2 return id - def fit_optimal(self, nclusters): - model = KMeans(n_clusters = nclusters, init = 'k-means++', random_state = 42) + @property + def fit_optimal_(self): + model = KMeans(n_clusters = self.suggested_n_clusters_, init = 'k-means++', random_state = 42) model.fit(self.x) yp = model.predict(self.x)+1 clu = [f'cluster#{i}' for i in yp] diff --git a/src/Class_Mod/KennardStone.py b/src/Class_Mod/KennardStone.py index 0e5c5cbe32150e3fd7cf4cdfe860db2766888043..3ad6c9179dbe92882666876c29ef2a3cf4f8a17c 100644 --- a/src/Class_Mod/KennardStone.py +++ b/src/Class_Mod/KennardStone.py @@ -5,19 +5,18 @@ class KS: def __init__(self, x:Optional[Union[np.ndarray|pd.DataFrame]], rset:Optional[Union[float|int]]): self.x = x self.ratio = rset - self._train, self._test = ks.train_test_split(self.x, train_size = self.ratio/100) + self._train, self._test = ks.train_test_split(self.x, train_size = self.ratio) @property def calset(self): clu = self._train.index.tolist() - return self.x, clu class RDM: def __init__(self, x:Optional[Union[np.ndarray|pd.DataFrame]], rset:Optional[Union[float|int]]): self.x = x self.ratio = rset - self._train, self._test = train_test_split(self.x, train_size = self.ratio/100) + self._train, self._test = train_test_split(self.x, train_size = self.ratio) @property def calset(self): diff --git a/src/Report/report.py b/src/Report/report.py index e7e5e9ec2815bdfe41526d85d78ee259a3970f80..c6443a3efae04802f6a56f0a169df8a91541805b 100644 --- a/src/Report/report.py +++ b/src/Report/report.py @@ -63,7 +63,7 @@ def report(*args): \headrulecolor{red!100}% \renewcommand{\footrulewidth}{1pt} \footrulecolor{red!100}% - \graphicspath{{images/}, {Figures/}} + \graphicspath{{images/}, {out/figures/}} \fancyhead[R]{\includegraphics[width=0.1\textwidth]{logo_cefe.png}} \fancyhead[L]{PACE - NIRS Analysis Report} \fancyfoot[L]{Project Name to fill} @@ -148,7 +148,7 @@ def report(*args): latex_report += r""" clustering analysis was developed.""" latex_report += r""" Features extraction was performed by means of {"""+dim_red_methods[to_report[2]] + r"""} technique that helps - represent the high dimensional spectra in a reduced perceptible 3D subspace spanned by a few number of features (three features in our case), while """ + represent the high 
dimensional spectra in a reduced perceptible 3D subspace spanned by a small number of features, while """
 
     if 'KS' in to_report or 'RDM' in to_report:
@@ -178,9 +178,9 @@ def report(*args):
     latex_report += """ Results of applying this workflow are displayed in"""
     if 'PCA' in to_report:
-        latex_report += """ (\cref{pcaplots,hotelling_and_influence,loadings})."""
+        latex_report += """ (\cref{pcaplots, hotelling_and_influence, loadings})."""
     elif 'NMF' in to_report:
-        latex_report += """ (\cref{pcaplots,loadings})."""
+        latex_report += """ (\cref{pcaplots, loadings})."""
     else:
         latex_report += """ (\cref{pcaplots})."""
@@ -198,26 +198,46 @@ def report(*args):
     latex_report += r""" and extracted to be representative of the whole data set, i.e, to reflect the variation included in the whole data set. This subset of samples is suggested to be used for a robust NIR calibration developement, therefore should to be analyzed by adequate reference analytical procedures (generally requiring destructive sample preparation) to collect data for the target variable to be modelled.\par"""
-
-    latex_report += r"""
-    \begin{figure}[h]
-        \captionsetup{justification=centering}
-        \centering
-        \begin{minipage}[b]{0.33\textwidth}
-        \includegraphics[width=\linewidth]{scores_pc1_pc2.png}
-        \end{minipage}%
-        \begin{minipage}[b]{0.33\textwidth}
-        \includegraphics[width=\linewidth]{scores_pc1_pc3.png}
-        \end{minipage}%
-        \begin{minipage}[b]{0.33\textwidth}
-        \includegraphics[width=\linewidth]{scores_pc2_pc3.png}
-        \end{minipage}
-        \centering
-        \caption{Illustration of the pairwise projection of spectra onto the reduced 3 dimensional subspace, clustering, and sample selection
+
+    pathtofig = os.listdir("./Report/out/figures")
+    sc = [name for name in pathtofig if name.startswith("score")]
+    if sc[0] not in ["scores_plot1D.png","scores_plot2D.png"]:
+        axisn = 'three'
+    elif sc[0] == "scores_plot2D.png":
+        axisn = 'two'
+    elif sc[0] == "scores_plot1D.png":
+        axisn = "one"
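+    # three pairwise score plots are exported for a 3D subspace; otherwise the
+    # app pages write a single scores_plot2D.png/scores_plot1D.png, so the
+    # figure environment is built accordingly below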
+    if len(sc) == 3:
+        latex_report += r"""
+        \begin{figure}[h]
+            \captionsetup{justification=centering}
+            \centering
+            \begin{minipage}[b]{0.33\textwidth}
+            \includegraphics[width=\linewidth]{scores_pc1_pc2.png}
+            \end{minipage}%
+            \begin{minipage}[b]{0.33\textwidth}
+            \includegraphics[width=\linewidth]{scores_pc1_pc3.png}
+            \end{minipage}%
+            \begin{minipage}[b]{0.33\textwidth}
+            \includegraphics[width=\linewidth]{scores_pc2_pc3.png}
+            \end{minipage}
+            \centering
+            \caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn + r"""-dimensional subspace, clustering, and sample selection
+            results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
+            analyzed by a standard reference analytical procedure}
+            \label{pcaplots}
+        \end{figure}"""
+
+    elif len(sc) == 1:
+        latex_report += r"""
+        \begin{figure}[h!]
+            \centering
+            \includegraphics[width=.6\linewidth]{"""+sc[0] +r"""}
+            \caption{Illustration of the pairwise projection of spectra onto the reduced """ + axisn + r"""-dimensional subspace, clustering, and sample selection
        results: data points with the same color belong to the same cluster and data points colored in black correspond to the samples to be
        analyzed by a standard reference analytical procedure}
-        \label{pcaplots}
-    \end{figure} """
+            \label{pcaplots}
+        \end{figure}"""
 
     if 'PCA' in to_report or 'NMF' in to_report:
         latex_report += r"""
@@ -408,7 +428,7 @@ def report(*args):
 # latex_report = report('sample', 'predict',)
 import shutil
 @st.cache_data
-def compile_latex(change):
+def generate_report(change):
     my = Path("./Report/report.pdf")
     if my.is_file():
         os.remove("./Report/report.pdf")
@@ -434,4 +454,3 @@ def compile_latex(change):
     # proc = subprocess.Popen([str(filename[:-4]) + '.pdf'], cwd = "./results", shell=True)
     proc.communicate()
-# compile_latex()
\ No newline at end of file
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index b5e0bbede09e4cc9631d52a573244838386a8705..710da37a60dd13ed5ba5b051a2edfa2d9aaa837b 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -1,29 +1,41 @@
 from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
-
-# empty temp figures
-for i in ['Report/figures','Report/datasets']:
-    repertoire_a_vider = Path(i)
-    if os.path.exists(repertoire_a_vider):
-        for fichier in os.listdir(repertoire_a_vider):
-            chemin_fichier = os.path.join(repertoire_a_vider, fichier)
-            if os.path.isfile(chemin_fichier) or os.path.islink(chemin_fichier):
-                os.unlink(chemin_fichier)
-            elif os.path.isdir(chemin_fichier):
-                shutil.rmtree(chemin_fichier)
-
 # HTML pour le bandeau "CEFE - CNRS"
 add_header()
-#load specific model page css
-local_css(css_file / "style_model.css")
 add_sidebar(pages_folder)
+local_css(css_file / "style_model.css")#load specific model page css
+
+
+hash_ = ''
+def p_hash(add):
+    global hash_
+    hash_ = hash_data(hash_+str(add))
+    return hash_
 
-# algorithms available in our app
-dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP', 'KS', 'RDM'] # List of clustering algos
+# #################################### Methods ##############################################
+# empty temp figures
+def delete_files(keep):
+    supp = []
+    # Walk through the directory
+    for root, dirs, files in os.walk('Report/', topdown=False):
+        for file in files:
+            if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in keep):
+                os.remove(os.path.join(root, file))
+
+dirpath = Path('Report/out/model')
+if dirpath.exists() and dirpath.is_dir():
+    shutil.rmtree(dirpath)
+
+# algorithms available in our app
+dim_red_methods=['PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
+cluster_methods = ['Kmeans','HDBSCAN', 'AP'] # List of clustering algos
 selec_strategy = ['center','random']
 
 match st.session_state["interface"]:
@@ -48,11 +60,22 @@ match st.session_state["interface"]:
         default_clustering_option = 0
         default_sample_selection_option = 0
 
-################################### I - Data Loading and Visualization ########################################
-date_time = datetime.datetime.now().strftime('_%y_%m_%d_%H_%M_')
-st.title("Calibration Subset Selection")
+
+
+################ clean the results dir #############
+delete_files(keep = ['.py', '.pyc','.bib'])
+
+# ####################################### page preamble #######################################
+st.title("Calibration Subset Selection") # page title
+st.markdown("Select a representative subset of samples, to be analyzed by reference analytical methods and used for NIRS calibration development")
 col2, col1 = st.columns([3, 1])
-col2.image("./images/sample selection.png", use_column_width=True)
+col2.image("./images/sample selection.png", use_column_width=True) # graphical abstract
+
+################################### I - Data Loading and Visualization ########################################
+files_format = ['csv', 'dx'] # Supported file formats
+# loader for datafile
+file = col1.file_uploader("Data file", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+
 
 ## Preallocation of data structure
 spectra = pd.DataFrame()
 meta_data = pd.DataFrame()
@@ -68,68 +91,92 @@
 dr_model = None # dimensionality reduction model
 cl_model = None # clustering model
 selection = None
 selection_number = "None"
+samples_df_chem = pd.DataFrame
+selected_samples = []
+selected_samples_idx = []
 
-# loader for datafile
-data_file = col1.file_uploader("Data file", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+if not file:
+    col1.info('Info: Please load a data file!')
 
-if not data_file:
-    col1.warning('⚠️ Please load data file !')
 else:
-    # Retrieve the extension of the file
-    # test = data_file.name[data_file.name.find('.'):]
-
-
-    extension = data_file.name.split(".")[-1]
-    userfilename = data_file.name.replace(f".{extension}", '')
+    extension = file.name.split(".")[-1]
+    userfilename = file.name.replace(f".{extension}", '')
 
     match extension:
         ## Load .csv file
        case 'csv':
            with col1:
                # Select list for CSV delimiter
-                psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))),horizontal=True, key=9)
+                psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+file.name)), options = [";", ","], index = [";", ","].index(str(find_delimiter('data/'+file.name))),horizontal=True, key=9)
 
                # Select list for CSV header True / False
-                phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))),horizontal=True, key=31)
-                if phdr == 'yes':
-                    col = 0
-                else:
-                    col = False
-                imp = pd.read_csv(data_file, sep=psep, index_col=col)
-                imp.to_csv("./Report/datasets/"+data_file.name,sep = ';', encoding='utf-8', mode='a')
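+                # find_delimiter/find_col_index inspect the raw csv to guess its
+                # dialect; the radios surface the guess but let the user override it
+                phdr = st.radio("index column in csv? 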
- _detected_: " + str(find_col_index('data/'+file.name)), options = ["no", "yes"], index = ["no", "yes"].index(str(find_col_index('data/'+file.name))),horizontal=True, key=31) + if phdr == 'yes':col = 0 + else:col = False - # spectra = col_cat(imp)[0] - # meta_data = col_cat(imp)[1] - spectra, md_df_st_ = col_cat(imp) - meta_data = md_df_st_ - st.success("The data have been loaded successfully", icon="✅") + from io import StringIO + stringio = StringIO(file.getvalue().decode("utf-8")) + data_str = str(stringio.read()) + p_hash([data_str + str(file.name) , psep, phdr]) + + @st.cache_data + def csv_loader(change): + imp = pd.read_csv(file, sep = psep, index_col=col) + spectra, md_df_st_ = col_cat(imp) + meta_data = md_df_st_ + return spectra, md_df_st_, meta_data, imp + + try : + spectra, md_df_st_, meta_data, imp = csv_loader(change = hash_) + st.success("The data have been loaded successfully", icon="✅") + except: + st.error('''Error: The format of the file does not correspond to the expected dialect settings. + To read the file correctly, please adjust the separator parameters.''') + + + + + ## Load .dx file case 'dx': - # Create a temporary file to save the uploaded file - with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: - tmp.write(data_file.read()) - with open(tmp.name, 'r') as dd: - dxdata = dd.read() - with open('Report/datasets/'+data_file.name, 'w') as dd: - dd.write(dxdata) - tmp_path = tmp.name - with col1: - _, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path) + with col1: + # Create a temporary file to save the uploaded file + with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: + tmp.write(file.read()) + tmp_path = tmp.name + with open(tmp.name, 'r') as dd: + dxdata = dd.read() + p_hash(str(dxdata)+str(file.name)) + + ## load and parse the temp dx file + @st.cache_data + def dx_loader(change): + _, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path) + # os.unlink(tmp_path) + return _, spectra, meta_data, md_df_st_ + _, spectra, meta_data, md_df_st_ = dx_loader(change = hash_) + st.success("The data have been loaded successfully", icon="✅") - os.unlink(tmp_path) +################################################### END : I- Data loading and preparation #################################################### +# with open('Report/datasets/'+file.name, 'w') as dd: +# dd.write(dxdata) +# tmp_path = tmp.name +# imp.to_csv("./Report/datasets/"+file.name,sep = ';', encoding='utf-8', mode='a') +# fig.savefig("./Report/figures/spectra_plot.png", dpi=400) ## Export report - -## Visualize spectra +################################################### BEGIN : visualize and split the data #################################################### st.header("I - Spectral Data Visualization", divider='blue') if not spectra.empty: + p_hash(np.mean(spectra)) n_samples = spectra.shape[0] nwl = spectra.shape[1] - # retrieve columns name and rows name of spectra + # retrieve columns name and rows name of the dataframe colnames = list(spectra.columns) rownames = [str(i) for i in list(spectra.index)] spectra.index = rownames - col2, col1 = st.columns([3, 1]) - with col2: + + @st.cache_data + def spectra_visualize(change): fig, ax = plt.subplots(figsize = (30,7)) if extension =='dx': lab = ['Wavenumber (1/cm)' if meta_data.loc[:,'xunits'][0] == '1/cm' else 'Wavelength (nm)'] @@ -145,9 +192,13 @@ if not spectra.empty: ax.set_ylabel('Signal intensity', fontsize=18) plt.margins(x = 0) plt.tight_layout() - st.pyplot(fig) - # update lines size + data_info = pd.DataFrame({'Name': 
[file.name],
+                                   'Number of scanned samples': [n_samples]},
+                                  index = ['Input file'])
+
+
+        # update line widths for the report export
         for line in ax.get_lines():
             line.set_linewidth(0.8) # Set the desired line width here
 
@@ -158,15 +209,20 @@ if not spectra.empty:
         ax.xaxis.label.set_size(9.5)
         ax.yaxis.label.set_size(9.5)
         plt.tight_layout()
-        fig.savefig("./Report/figures/spectra_plot.png", dpi=400) ## Export report
         fig.set_size_inches(l, w)# reset the plot size to its original size
-        data_info = pd.DataFrame({'Name': [data_file.name],
-                                  'Number of scanned samples': [n_samples]},
-                                  index = ['Input file'])
+        return fig, data_info
+    fig_spectra, data_info = spectra_visualize(change = hash_)
 
+    col1, col2 = st.columns([3, 1])
     with col1:
+        st.pyplot(fig_spectra)
+
+    with col2:
         st.info('Information on the loaded data file')
         st.write(data_info) ## table showing the number of samples in the data file
+
+################################################### END : visualize and split the data ####################################################
+
 
 ############################## Exploratory data analysis ###############################
 st.header("II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
 
@@ -174,56 +230,57 @@ st.header("II - Exploratory Data Analysis-Multivariable Data Analysis", divider=
 t = pd.DataFrame # scores
 p = pd.DataFrame # loadings
 if not spectra.empty:
-    bb1, bb2, bb3, bb4, bb5, bb6, bb7 = st.columns([1,1,0.6,0.6,0.6,1.5,1.5])
-    dim_red_method = bb1.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, index = default_reduction_option, key = 37)
-    clus_method = bb2.selectbox("Clustering/sampling techniques: ", options = cluster_methods, index = default_clustering_option, key = 38)
     xc = standardize(spectra, center=True, scale=False)
-
-    match dim_red_method:
-        case "":
-            bb1.warning('⚠️ Please choose an algorithm !')
-
-        case "PCA":
-            @st.cache_data
-            def dr_model_(change):
-                dr_model = LinearPCA(xc, Ncomp=8)
-                return dr_model
-            dr_model = dr_model_(change = hash_data(xc))
-        case "UMAP":
+    bb1, bb2, bb3, bb4, bb5, bb6, bb7 = st.columns([1,1,0.6,0.6,0.6,1.5,1.5])
+    with bb1:
+        dim_red_method = st.selectbox("Dimensionality reduction techniques: ", options = ['']+dim_red_methods, index = default_reduction_option, key = 37, format_func = lambda x: x if x else "<Select>")
+        if dim_red_method == '':
+            st.info('Info: Select a dimensionality reduction technique!')
+        p_hash(dim_red_method)
+
+
+        if dim_red_method == "UMAP":
             if not meta_data.empty:
-                filter = md_df_st_.columns
-                filter = filter.insert(0, 'Nothing')
-                col = bb1.selectbox('Supervised UMAP by:', options= filter, key=108)
-                if col == 'Nothing':
-                    supervised = None
-                else:
-                    supervised = md_df_st_[col]
-            else:
-                supervised = None
-            @st.cache_data
-            def dr_model_(change):
-                dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
-                return dr_model
-            dr_model = dr_model_(change = hash_data(spectra))
-
+                filter = md_df_st_.columns.tolist()
+                supervised = st.selectbox('Supervised UMAP by (optional):', options = ['']+filter, format_func = lambda x: x if x else "<Select>", key=108)
+                umapsupervisor = [None if supervised == '' else md_df_st_[supervised]][0]
-        case 'NMF':
-            @st.cache_data
-            def dr_model_(change):
-                dr_model = Nmf(spectra, Ncomp= 3)
-                return dr_model
-            dr_model = dr_model_(change = hash_data(spectra))
-
+            else:
+                supervised = st.selectbox('Supervised UMAP by:', options = ["Meta-data is not available"], disabled=True, format_func = lambda x: x if x else "<Select>", key=108)
+                umapsupervisor = None
+        p_hash(supervised)
 
-    if dr_model:
-        axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
-        axis2 = bb4.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
-        axis3 = bb5.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+    disablewidgets = [False if dim_red_method else True][0]
+    clus_method = st.selectbox("Clustering techniques (optional): ", options = ['']+cluster_methods, index = default_clustering_option, key = 38, format_func = lambda x: x if x else "<Select>", disabled= disablewidgets)
 
-        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
+    # if disablewidgets == False and dim_red_method in dim_red_methods:
+    #     inf = st.info('Info: Select a clustering technique!')
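+    # the reduction model is cached: hash_ folds the data and every widget
+    # choice above into the cache key, so any upstream change triggers a re-fit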
"<Select>", key=108) + umapsupervisor = None + p_hash(supervised) - if dr_model: - axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0) - axis2 = bb4.selectbox("y-axis", options = dr_model.scores_.columns, index=1) - axis3 = bb5.selectbox("z-axis", options = dr_model.scores_.columns, index=2) + disablewidgets = [False if dim_red_method else True][0] + clus_method = st.selectbox("Clustering techniques(optional): ", options = ['']+cluster_methods, index = default_clustering_option, key = 38, format_func = lambda x: x if x else "<Select>", disabled= disablewidgets) - t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1) + + # if disablewidgets == False and dim_red_method in dim_red_methods: + # inf = st.info('Info: Select a clustering technique!') + if dim_red_method: + @st.cache_data + def dimensionality_reduction(change): + match dim_red_method: + case "PCA": + dr_model = LinearPCA(xc, Ncomp=8) + case "UMAP": + dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = umapsupervisor) + case 'NMF': + dr_model = Nmf(spectra, Ncomp= 3) + return dr_model + + dr_model = dimensionality_reduction(change = hash_) + + if dr_model: + axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0) + axis2 = bb4.selectbox("y-axis", options = dr_model.scores_.columns, index=1) + axis3 = bb5.selectbox("z-axis", options = dr_model.scores_.columns, index=2) + axis = np.unique([axis1, axis2, axis3]) + p_hash(axis) + t = dr_model.scores_.loc[:,np.unique(axis)] + tcr = standardize(t) ###### II - clustering ####### if not t.empty: @@ -235,24 +292,50 @@ if not t.empty: else: scores, loadings= st.columns([3,3]) - tcr = standardize(t) - +if not spectra.empty: + sel_ratio = bb2.number_input('Enter the number/fraction of samples to be selected:',min_value=0.01, max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20, format="%.2f", disabled= disablewidgets) + if sel_ratio: + p_hash(sel_ratio) + if sel_ratio > 1.00: + ratio = int(sel_ratio) + elif sel_ratio < 1.00: + ratio = int(sel_ratio*spectra.shape[0]) +if dr_model and not clus_method: + clus_method = bb2.radio('Select samples selection strategy:', + options = ['RDM', 'KS'],) +elif dr_model and clus_method: + # sel_ratio = bb2.number_input('Enter the ratio/precentage of samples to be selected:',min_value=0.01, max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20, format="%.2f") + # p_hash(sel_ratio) + # if sel_ratio > 1.00: + # ratio = int(sel_ratio) + # elif sel_ratio < 1.00: + # ratio = int(sel_ratio*spectra.shape[0]) + + if clus_method in cluster_methods: + selection = bb2.radio('Select samples selection strategy:', + options = selec_strategy, index = default_sample_selection_option,key=102,disabled = False) + else: + selection = bb2.radio('Select samples selection strategy:', + options = selec_strategy, horizontal=True, key=102,disabled = True) + + + + + + +if dr_model and sel_ratio: # Clustering match clus_method: - case '': - bb2.warning('âš ï¸ Please choose an algothithm !') case 'Kmeans': - cl_model = Sk_Kmeans(tcr, max_clusters = 25) - ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters') - data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster) + cl_model = Sk_Kmeans(tcr, max_clusters = ratio) + data, labels, clu_centers = cl_model.fit_optimal_ + ncluster = clu_centers.shape[0] # 2- HDBSCAN clustering case 'HDBSCAN': - 
    # Clustering
    match clus_method:
-        case '':
-            bb2.warning('⚠️ Please choose an algothithm !')
        case 'Kmeans':
-            cl_model = Sk_Kmeans(tcr, max_clusters = 25)
-            ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
-            data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
+            cl_model = Sk_Kmeans(tcr, max_clusters = ratio)
+            data, labels, clu_centers = cl_model.fit_optimal_
+            ncluster = clu_centers.shape[0]
        # 2- HDBSCAN clustering
        case 'HDBSCAN':
-            optimized_hdbscan = Hdbscan(np.array(tcr))
-            all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_
-            labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
+            cl_model = Hdbscan(np.array(tcr))
+            labels, clu_centers, non_clustered = cl_model.labels_,cl_model.centers_, cl_model.non_clustered
            ncluster = len(clu_centers)
-            non_clustered = np.where(np.array(labels) == 'Non clustered')[0]
 
        # 3- Affinity propagation
        case 'AP':
@@ -261,29 +344,26 @@
            ncluster = len(clu_centers)
 
        case 'KS':
-            rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
-            cl_model = KS(x = tcr, rset = rset)
-
+            cl_model = KS(x = tcr, rset = ratio)
        case 'RDM':
-            rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
-            cl_model = RDM(x = tcr, rset = rset)
-
+            cl_model = RDM(x = tcr, rset = ratio)
+
+    # if clus_method in cluster_methods:
+    #     inf.empty()
 
    if clus_method in ['KS', 'RDM']:
-        calset = cl_model.calset
-        labels = ["ind"]*n_samples
-        ncluster = "1"
-        selection_number = 'None'
-        selected_samples_idx = calset[1]
-        selection = 'None'
+        _, selected_samples_idx = cl_model.calset
+        labels = ["ind"]*n_samples
+        ncluster = "1"
+        selection_number = 'None'
+        selection = 'None'
 
    new_tcr = tcr.iloc[clustered,:]
 
-#################################################### III - Samples selection using the reduced data preentation ######
-samples_df_chem = pd.DataFrame
-selected_samples = []
-selected_samples_idx = []
+# #################################################### III - Samples selection using the reduced data presentation ######
+
 
if not labels:
    custom_color_palette = px.colors.qualitative.Plotly[:1]
elif labels:
    num_clusters = len(np.unique(labels))
    custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
    if clus_method:
-        # if clus_method in ['KS', 'RDM']:
-        #     calset = cl_model.calset
-        #     labels = ["ind"]*n_samples
-        #     ncluster = "1"
-        #     selection_number = 'None'
-        #     selected_samples_idx = calset[1]
-        #     selection = 'None'
-        # else:
-        selection = scores.radio('Select samples selection strategy:',
-                                 options = selec_strategy, index = default_sample_selection_option, key=102)
-
        match selection:
            # Strategy 0
            case 'center':
@@ -312,8 +381,8 @@ elif labels:
            #### Strategy 1
            case 'random':
-                selection_number = scores.number_input('How many samples per cluster?',
-                                                       min_value = 1, step=1, value = round(n_samples*0.1/ncluster))
+                selection_number = int(ratio/num_clusters)
+                p_hash(selection_number)
                s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
                for i in np.unique(s):
                    C = np.where(np.array(labels) == i)[0]
@@ -325,87 +394,82 @@ elif labels:
                        selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
                    else:
                        selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
-                # list indexes of selected samples for colored plot
+                # list indices of selected samples for colored plot
 
-################################ Plots visualization ############################################
+# ################################ Plots visualization ############################################
 
 ## Scores
 if not t.empty:
+    if meta_data.empty and clus_method in cluster_methods:
+        filter = ['', clus_method]
+    elif not meta_data.empty and clus_method in cluster_methods:
+        filter = ['',clus_method] + md_df_st_.columns.tolist()
+    elif not meta_data.empty and clus_method not in cluster_methods:
+        filter = [''] + md_df_st_.columns.tolist()
+    elif 
meta_data.empty and not clus_method in cluster_methods: + filter = [] + with scores: - fig1, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2,2) st.write('Scores plot') - # scores plot with clustering - if list(labels) and meta_data.empty: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color=labels ,color_discrete_sequence= custom_color_palette) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1) - # scores plot with metadata - elif len(list(labels)) == 0 and not meta_data.empty: - filter = md_df_st_.columns - col = st.selectbox('Color by:', options= filter) - if col == 0: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) - sns.scatterplot(data = tcr, x = axis2, y =axis3 , ax = ax2) - sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) - - - else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,md_df_st_[col])) ) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax1) - sns.scatterplot(data = tcr, x = axis2, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax2) - sns.scatterplot(data = tcr, x = axis1, y =axis3 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) - - # color with scores and metadata - elif len(list(labels)) > 0 and not meta_data.empty: - if clus_method in cluster_methods[1:]: - filter = ['None', clus_method] - filter.extend(md_df_st_.columns) - else: - filter = md_df_st_.columns.insert(0,'None') - - col = st.selectbox('Color by:', options= filter) - if col == "None": - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) - elif col == clus_method: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) - else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,md_df_st_[col]))) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax1) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax2) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,md_df_st_[col])), ax = ax3) - + tcr_plot = tcr.copy() + colfilter = st.selectbox('Color by:', options= filter,format_func = lambda x: x if x else "<Select>") + if colfilter in cluster_methods: + tcr_plot[colfilter] = labels + elif not meta_data.empty and colfilter in md_df_st_.columns.tolist(): + tcr_plot[f'{colfilter} :'] = list(map(str.lower,md_df_st_.loc[:,colfilter])) else: - fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color=labels if list(labels) else None,color_discrete_sequence= custom_color_palette) - sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) - fig.update_traces(marker=dict(size=4)) - - if selected_samples_idx: - tt = tcr.iloc[selected_samples_idx,:] - fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],z = tt.loc[:,axis3], - mode ='markers', marker = dict(size = 5, color = 'black'), - name = 'selected samples') - st.plotly_chart(fig, use_container_width = True) - - if labels: - # export 2D scores plot - comb = [i for i in combinations([1,2,3], 2)] - subcap = ['a','b','c'] - for i in range(len(comb)): - fig_export = px.scatter(tcr, x = eval(f'axis{str(comb[i][0])}'), y=eval(f'axis{str(comb[i][1])}'), - color = labels if list(labels) else None, - color_discrete_sequence = 
custom_color_palette) - fig_export.add_scatter(x = tt.loc[:,eval(f'axis{str(comb[i][0])}')], y = tt.loc[:,eval(f'axis{str(comb[i][1])}')], - mode ='markers', marker = dict(size = 5, color = 'black'), - name = 'selected samples') - fig_export.update_layout(font=dict(size=23)) - fig_export.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1, - font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) - fig_export.update_traces(marker=dict(size= 10), showlegend= False) - fig_export.write_image(f'./Report/Figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png') + tcr_plot[f'{colfilter} :'] = ['sample'] * tcr_plot.shape[0] + + col_var_name = tcr_plot.columns.tolist()[-1] + n_categories = len(np.unique(tcr_plot[col_var_name])) + custom_color_palette = px.colors.qualitative.Plotly[:n_categories] + + with scores: + if selected_samples_idx:# color selected samples + t_selected = tcr_plot.iloc[selected_samples_idx,:] + match t.shape[1]: + case 3: + fig = px.scatter_3d(tcr_plot, x = axis[0], y = axis[1], z = axis[2], color = col_var_name ,color_discrete_sequence = custom_color_palette) + fig.update_traces(marker=dict(size=4)) + if selected_samples_idx:# color selected samples + fig.add_scatter3d(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]], z = t_selected.loc[:,axis[2]], + mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples') + + case 2: + fig = px.scatter(tcr_plot, x = axis[0], y = axis[1], color = col_var_name ,color_discrete_sequence = custom_color_palette) + if selected_samples_idx:# color selected samples + fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]], + mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples') + + case 1: + fig = px.scatter(tcr_plot, x = axis[0], y = [0]*tcr_plot.shape[0], color = col_var_name ,color_discrete_sequence = custom_color_palette) + fig.add_scatter(x = t_selected.loc[:,axis[0]], y = [0]*tcr_plot.shape[0], + mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples') + fig.update_yaxes(visible=False) + + st.plotly_chart(fig, use_container_width = True) + + if labels: + fig_export = {} + # export 2D scores plot + if len(axis)== 3: + comb = [i for i in combinations(np.arange(len(axis)), 2)] + subcap = ['a','b','c'] + for i in range(len(comb)): + fig_= px.scatter(tcr_plot, x = axis[(comb[i][0])], y=axis[(comb[i][1])],color = labels if list(labels) else None,color_discrete_sequence = custom_color_palette) + fig_.add_scatter(x = t_selected.loc[:,axis[(comb[i][0])]], y = t_selected.loc[:,axis[(comb[i][1])]], mode ='markers', marker = dict(size = 5, color = 'black'), + name = 'selected samples') + fig_.update_layout(font=dict(size=23)) + fig_.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1, + font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) + fig_.update_traces(marker=dict(size= 10), showlegend= False) + fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'] = fig_ + # fig_export.write_image(f'./Report/out/figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png') + else: + fig_export['fig'] = fig + if not spectra.empty: @@ -431,19 +495,14 @@ if not spectra.empty: pp = pd.concat([p, freq], axis=1) ######################################### df1 = pp.melt(id_vars=freq.columns) - fig = px.line(df1, 
x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly) - fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"), + loadingsplot = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly) + loadingsplot.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"), bordercolor="black", borderwidth=2)) - fig.update_layout(xaxis_title = xlab,yaxis_title = "Intensity" ,xaxis = dict(autorange= inv)) + loadingsplot.update_layout(xaxis_title = xlab,yaxis_title = "Intensity" ,xaxis = dict(autorange= inv)) - st.plotly_chart(fig, use_container_width=True) - - - # Export du graphique - img = pio.to_image(fig, format="png") - with open("./Report/figures/loadings_plot.png", "wb") as f: - f.write(img) + st.plotly_chart(loadingsplot, use_container_width=True) + ############################################################################################################# if dim_red_method == 'PCA': influence, hotelling = st.columns([3, 3]) @@ -454,38 +513,39 @@ if not spectra.empty: leverage = np.diag(Hat) / np.trace(Hat) tresh3 = 2 * tcr.shape[1]/n_samples # Loadings - p = pd.concat([dr_model.loadings_.loc[:,axis1], dr_model.loadings_.loc[:,axis2], dr_model.loadings_.loc[:,axis3]], axis = 1) + p = dr_model.loadings_.loc[:,axis] # Matrix reconstruction xp = np.dot(t,p.T) # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T) - tresh4 = sc.stats.chi2.ppf(0.05, df = 3) + tresh4 = sc.stats.chi2.ppf(0.05, df = len(axis)) # color with metadata - if not meta_data.empty and clus_method: - if col == "None": + if colfilter: + if colfilter == "": l1 = ["Samples"]* n_samples - elif col == clus_method: + elif colfilter == clus_method: l1 = labels - + else: - l1 = list(map(str.lower,md_df_st_[col])) + l1 = tcr_plot[f'{colfilter} :'] - elif meta_data.empty and clus_method: - l1 = labels + # elif meta_data.empty and clus_method: + # l1 = labels - elif meta_data.empty and not clus_method: - l1 = ["Samples"]* n_samples + # elif meta_data.empty and not clus_method: + # l1 = ["Samples"]* n_samples - elif not meta_data.empty and not clus_method: - l1 = list(map(str.lower,md_df_st_[col])) - - fig = px.scatter(x = leverage, y = residuals, color=labels if list(labels) else None, + # elif not meta_data.empty and not clus_method: + # l1 = list(map(str.lower,md_df_st_[col])) + tcr_plot["leverage"] = leverage + tcr_plot["residuals"] = residuals + influence_plot = px.scatter(data_frame =tcr_plot, x = "leverage", y = "residuals", color=col_var_name, color_discrete_sequence= custom_color_palette) - fig.add_vline(x = tresh3, line_width = 1, line_dash = 'solid', line_color = 'red') - fig.add_hline(y=tresh4, line_width=1, line_dash='solid', line_color='red') - fig.update_layout(xaxis_title="Leverage", yaxis_title = "Q-residuals", font=dict(size=20), width=800, height=600) + influence_plot.add_vline(x = tresh3, line_width = 1, line_dash = 'solid', line_color = 'red') + influence_plot.add_hline(y=tresh4, line_width=1, line_dash='solid', line_color='red') + influence_plot.update_layout(xaxis_title="Leverage", yaxis_title = "Q-residuals", font=dict(size=20), width=800, height=600) out3 = leverage > tresh3 out4 = residuals > tresh4 @@ -496,22 +556,22 @@ if not spectra.empty: ann = meta_data.loc[:,'name'][i] else: ann = 
t.index[i] - fig.add_annotation(dict(x = leverage[i], y = residuals[i], showarrow=True, text = str(ann),font= dict(color= "black", size= 15), + influence_plot.add_annotation(dict(x = leverage[i], y = residuals[i], showarrow=True, text = str(ann),font= dict(color= "black", size= 15), xanchor = 'auto', yanchor = 'auto')) - fig.update_traces(marker=dict(size= 6), showlegend= True) - fig.update_layout(font=dict(size=23), width=800, height=500) - st.plotly_chart(fig, use_container_width=True) + influence_plot.update_traces(marker=dict(size= 6), showlegend= True) + influence_plot.update_layout(font=dict(size=23), width=800, height=500) + st.plotly_chart(influence_plot, use_container_width=True) - for annotation in fig.layout.annotations: + for annotation in influence_plot.layout.annotations: annotation.font.size = 35 - fig.update_layout(font=dict(size=23), width=800, height=600) - fig.update_traces(marker=dict(size= 10), showlegend= False) - fig.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, + influence_plot.update_layout(font=dict(size=23), width=800, height=600) + influence_plot.update_traces(marker=dict(size= 10), showlegend= False) + influence_plot.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) - fig.write_image('./Report/figures/influence_plot.png', engine = 'kaleido') + # influence_plot.write_image('./Report/out/figures/influence_plot.png', engine = 'kaleido') with hotelling: @@ -525,11 +585,11 @@ if not spectra.empty: tresh0 = (3 * (n_samples ** 2 - 1) * fcri) / (n_samples * (n_samples - 3)) tresh1 = sc.stats.chi2.ppf(0.05, df = 3) - fig = px.scatter(t, x = hotelling, y = residuals, color=labels if list(labels) else None, + hotelling_plot = px.scatter(t, x = hotelling, y = residuals, color=labels if list(labels) else None, color_discrete_sequence= custom_color_palette) - fig.update_layout(xaxis_title="Hotelling-T² distance",yaxis_title="Q-residuals") - fig.add_vline(x=tresh0, line_width=1, line_dash='solid', line_color='red') - fig.add_hline(y=tresh1, line_width=1, line_dash='solid', line_color='red') + hotelling_plot.update_layout(xaxis_title="Hotelling-T² distance",yaxis_title="Q-residuals") + hotelling_plot.add_vline(x=tresh0, line_width=1, line_dash='solid', line_color='red') + hotelling_plot.add_hline(y=tresh1, line_width=1, line_dash='solid', line_color='red') out0 = hotelling > tresh0 out1 = residuals > tresh1 @@ -541,21 +601,21 @@ if not spectra.empty: ann = meta_data.loc[:,'name'][i] else: ann = t.index[i] - fig.add_annotation(dict(x = hotelling[i], y = residuals[i], showarrow=True, text = str(ann), font= dict(color= "black", size= 15), + hotelling_plot.add_annotation(dict(x = hotelling[i], y = residuals[i], showarrow=True, text = str(ann), font= dict(color= "black", size= 15), xanchor = 'auto', yanchor = 'auto')) - fig.update_traces(marker=dict(size= 6), showlegend= True) - fig.update_layout(font=dict(size=23), width=800, height=500) - st.plotly_chart(fig, use_container_width=True) + hotelling_plot.update_traces(marker=dict(size= 6), showlegend= True) + hotelling_plot.update_layout(font=dict(size=23), width=800, height=500) + st.plotly_chart(hotelling_plot, use_container_width=True) - for annotation in fig.layout.annotations: + for annotation in hotelling_plot.layout.annotations: annotation.font.size = 35 - fig.update_layout(font=dict(size=23), width=800, 
height=600) - fig.update_traces(marker=dict(size= 10), showlegend= False) - fig.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, + hotelling_plot.update_layout(font=dict(size=23), width=800, height=600) + hotelling_plot.update_traces(marker=dict(size= 10), showlegend= False) + hotelling_plot.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1, font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3) - fig.write_image("./Report/figures/hotelling_plot.png", format="png") + # hotelling_plot.write_image("./Report/out/figures/hotelling_plot.png", format="png") st.header('III - Selected Samples for Reference Analysis', divider='blue') if labels: @@ -602,60 +662,76 @@ if not sam.empty: Nb_ech = str(n_samples) nb_clu = str(sam1.shape[0]) ################################################### - ## generate report - latex_report = report.report('Representative subset selection', data_file.name, dim_red_method, - clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam) + # ## generate report + @st.cache_data + def export_report(change): + latex_report = report.report('Representative subset selection', file.name, dim_red_method, + clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam) - + @st.cache_data - def download_res(file,sam): - zipname = f'results{date_time}subset_selection_{file.name.split('.')[0]}.zip' # name of the zipfile - with open('./temp/fname.json', 'w') as f: # dump filename and save it as a .json file - json.dump(zipname, f) - shutil.make_archive(base_name = zipname.split('.')[0],format = "zip",root_dir = "./Report", base_dir = "figures")# create zip containing figures and report - - file_path = Path("./temp/"+zipname) - sam.to_csv("./"+zipname,sep = ';', encoding='utf-8', mode='a', - compression = dict(method='zip',archive_name=f"selected subset for reference analysis_{userfilename}{date_time}.csv"))### export the table of selected subset - - with zipfile.ZipFile("./"+zipname, 'a') as newzip: - newzip.write("./Report/report.pdf", arcname="report.pdf") # add report to the zipfile - newzip.write("./Report/datasets/"+os.listdir("./Report/datasets")[0], arcname=os.listdir("./Report/datasets")[0]) # add the dataset to the zipfile + def preparing_results_for_downloading(change): + match extension: + # load csv file + case 'csv': + imp.to_csv('Report/out/dataset/'+ file.name, sep = ';', encoding = 'utf-8', mode = 'a') + case 'dx': + with open('Report/out/dataset/'+file.name, 'w') as dd: + dd.write(dxdata) + + fig_spectra.savefig("./Report/out/figures/spectra_plot.png", dpi=400) ## Export report + + if len(axis) == 3: + for i in range(len(comb)): + fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'].write_image(f'./Report/out/figures/scores_pc{str(comb[i][0]+1)}_pc{str(comb[i][1]+1)}.png') + elif len(axis)==2 : + fig_export['fig'].write_image(f'./Report/out/figures/scores_plot2D.png') + elif len(axis)==1 : + fig_export['fig'].write_image(f'./Report/out/figures/scores_plot1D.png') - shutil.move('./'+zipname,'./temp/'+ zipname) # move the .zip file to the temp directory - - # Hash the results - a ='' - for i in (data_file.name, dim_red_method,clus_method, Nb_ech, tcr.astype(str)): - a += str(i) - - myfilepdf = Path("./Report/report.pdf") - if 'htest' not in st.session_state: - st.session_state.htest = '0' - report.compile_latex(change =hash_data(a)) - if myfilepdf.is_file(): - download_res(file = 
data_file, sam = sam)
-
-        elif st.session_state['htest'] != hash_data(a):
-            st.session_state['htest'] = hash_data(a)
-            report.compile_latex(change =hash_data(a))
-            if myfilepdf.is_file():
-                download_res(file = data_file, sam = sam)
-        else:
+        # Export the loadings figure
+        if dim_red_method in ['PCA','NMF']:
+            img = pio.to_image(loadingsplot, format="png")
+            with open("./Report/out/figures/loadings_plot.png", "wb") as f:
+                f.write(img)
+        if dim_red_method == 'PCA':
+            hotelling_plot.write_image("./Report/out/figures/hotelling_plot.png", format="png")
+            influence_plot.write_image('./Report/out/figures/influence_plot.png', engine = 'kaleido')
+
+        sam.to_csv('./Report/out/Selected_subset_for_calib_development.csv', sep = ';')
+
+        export_report(change = hash_)
+        if Path("./Report/report.tex").exists():
+            report.generate_report(change = hash_)
+        if Path("./Report/report.pdf").exists():
+            shutil.move("./Report/report.pdf", "./Report/out/report.pdf")
+        return change
+
+
+    preparing_results_for_downloading(change = hash_)
+    report.generate_report(change = hash_)
+
+    st.header('Download the analysis results')
+
+    import tempfile
+    @st.cache_data
+    def tempdir(change):
+        with tempfile.TemporaryDirectory( prefix="results", dir="./Report") as temp_dir:# create a temp directory
+            tempdirname = os.path.split(temp_dir)[1]
+
+            if len(os.listdir('./Report/out/figures/'))>=2:
+                shutil.make_archive(base_name="./Report/Results", format="zip", base_dir="out", root_dir = "./Report")# create a zip file
+                shutil.move("./Report/Results.zip", f"./Report/{tempdirname}/Results.zip")# move the zip into the temp dir
+                with open(f"./Report/{tempdirname}/Results.zip", "rb") as f:
+                    zip_data = f.read()
+        return tempdirname, zip_data
+
+    date_time = datetime.datetime.now().strftime('%y%m%d%H%M')
+    try :
+        tempdirname, zip_data = tempdir(change = hash_)
+        st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
+                           args = None, kwargs = None,type = "primary",use_container_width = True)
+    except: pass
-
-    list_of_files = glob.glob(r"./temp/*.zip")
-    if len(list_of_files) >3:
-        oldest_file = min(list_of_files, key=os.path.getctime)
-        os.remove(oldest_file)
-        list_of_files = glob.glob(r"./temp/*.zip")
-    recent_file = max(list_of_files, key=os.path.getctime)
-
-    with open('./temp/fname.json', 'r') as f:
-        zipname = json.load(f)
-    if os.path.split(recent_file)[1] == os.path.split(zipname)[1]:
-        with open("./temp/"+zipname, "rb") as fp:
-            st.subheader('Download the Analysis Results')
-            st.download_button('Download', data = fp, file_name=zipname, mime="application/zip",
-                               args=None, kwargs=None,type="primary",use_container_width=True)
\ No newline at end of file
+    delete_files(keep = ['.py', '.pyc','.bib'])
\ No newline at end of file
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index b9e6d7a116d90e07765075ba7ba363c107d86433..a4a415fc62280316de2bb743da2f5f2b3ce61539 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -2,32 +2,33 @@ from Packages import *
 st.set_page_config(page_title = "NIRS Utils", page_icon = ":goat:", layout = "wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
+# HTML banner "CEFE - CNRS"
 add_header()
 add_sidebar(pages_folder)
-local_css(css_file / "style_model.css")
+local_css(css_file / "style_model.css")#load specific model page css
+
+
+
 
 hash_ = ''
 def p_hash(add):
     global hash_
     hash_ = hash_data(hash_+str(add))
     return hash_
 
-def increment():
-    
st.session_state.counter += 1 - - - - # Initialize the variable in session state if it doesn't exist for st.cache_data if 'counter' not in st.session_state: st.session_state.counter = 0 +def increment(): + st.session_state.counter += 1 # #################################### Methods ############################################## -def delete_files(supp_files): +def delete_files(keep): supp = [] # Walk through the directory for root, dirs, files in os.walk('Report/', topdown=False): for file in files: - if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in supp_files): + if file != 'logo_cefe.png' and not any(file.endswith(ext) for ext in keep): os.remove(os.path.join(root, file)) class lw: @@ -38,9 +39,10 @@ class lw: ################ clean the results dir ############# - -keptfiles = {'.py', '.pyc'} -delete_files(supp_files = keptfiles) +delete_files(keep = ['.py', '.pyc','.bib']) +dirpath = Path('Report/out/model') +if not dirpath.exists(): + os.mkdir(path = dirpath) # ####################################### page preamble ####################################### st.title("Calibration Model Development") # page title @@ -103,12 +105,10 @@ match file: @st.cache_data def csv_loader(change): - delete_files(supp_files=['.csv','.py', '.pyc']) + delete_files(keep = ['.py', '.pyc','.bib']) file_name = str(xcal_csv.name) +' and '+ str(ycal_csv.name) xfile = pd.read_csv(xcal_csv, decimal = '.', sep = sepx, index_col = col, header = 0) yfile = pd.read_csv(ycal_csv, decimal = '.', sep = sepy, index_col = col) - xfile.to_csv('Report/out/dataset/'+ xcal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a') - yfile.to_csv('Report/out/dataset/'+ ycal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a') return xfile, yfile, file_name xfile, yfile, file_name = csv_loader(change = hash_) @@ -158,14 +158,12 @@ match file: file_name = str(data_file.name) ## creating the temp file with NamedTemporaryFile(delete = False, suffix = ".dx") as tmp: - tmp.write(data_file.read()) - tmp_path = tmp.name - with open(tmp.name, 'r') as dd: - dxdata = dd.read() - p_hash(str(dxdata)+str(data_file.name)) - with open('Report/out/dataset/'+data_file.name, 'w') as dd: - dd.write(dxdata) - + tmp.write(data_file.read()) + tmp_path = tmp.name + with open(tmp.name, 'r') as dd: + dxdata = dd.read() + p_hash(str(dxdata)+str(data_file.name)) + ## load and parse the temp dx file @st.cache_data def dx_loader(change): @@ -205,7 +203,7 @@ st.header("I - Data visualization", divider = 'blue') if not spectra.empty and not y.empty: p_hash(y) p_hash(np.mean(spectra)) - @st.cache_data + @st.cache_data(show_spinner =False) def visualize(change): if np.array(spectra.columns).dtype.kind in ['i', 'f']: colnames = spectra.columns @@ -286,7 +284,6 @@ if not spectra.empty and not y.empty: # Training set preparation for cross-validation(CV) nb_folds = 3 - folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation # Model creation-M20 columns with M20: @@ -295,7 +292,8 @@ if not spectra.empty and not y.empty: # spectra_plot.savefig("./Report/figures/spectra_plot.png") # target_plot.savefig("./Report/figures/histogram.png") # st.session_state['hash_Reg'] = str(np.random.randint(2000000000)) - + folds = KF_CV.CV(X_train, y_train, nb_folds)# split train data into nb_folds for cross_validation + match model_type: case 'PLS': Reg = Plsr(train = [X_train, y_train], test = [X_test, y_test], n_iter = 10, cv = nb_folds) @@ -456,7 +454,7 @@ if Reg: # Show and export the preprocessing methods st.write('-- 
Spectral preprocessing info --') st.write(Reg.best_hyperparams_print) - @st.cache_data + @st.cache_data(show_spinner =False) def preprocessings(change): with open('Report/out/Preprocessing.json', "w") as outfile: json.dump(Reg.best_hyperparams_, outfile) @@ -473,7 +471,7 @@ if Reg: # M1.dataframe(model_per) # duplicate with line 371 - @st.cache_data + @st.cache_data(show_spinner =False) def prep_important(change, model_type, model_hash): fig, (ax1, ax2) = plt.subplots(2,1, figsize = (12, 4), sharex=True) ax1.plot(colnames, np.mean(X_train, axis = 0), color = 'black', label = 'Average spectrum (Raw)') @@ -519,7 +517,7 @@ if Reg: 6: "Six",7: "Seven",8: "Eight",9: "Nine",10: "Ten"} st.header(f" {numbers_dict[nb_folds]}-Fold Cross-Validation results") - @st.cache_data + @st.cache_data(show_spinner =False) def cv_display(change): fig1 = px.scatter(Reg.cv_data_[0], x = 'Measured', y = 'Predicted' , trendline = 'ols', color = 'Folds', symbol = 'Folds', color_discrete_sequence=px.colors.qualitative.G10) @@ -584,96 +582,88 @@ if Reg: ################################################### END : Model Diagnosis ####################################################### - -@st.cache_data -def save_figs(change): - figpath = Path('./Report/out/figures/') - spectra_plot.savefig(figpath + "spectra_plot.png") - target_plot.savefig(figpath + "histogram.png") - imp_fig.savefig(figpath + "variable_importance.png") - fig1.write_image(figpath + "meas_vs_pred_cv_all.png") - fig0.write_image(figpath + "meas_vs_pred_cv_onebyone.png") - measured_vs_predicted.savefig(figpath + 'measured_vs_predicted.png') - residuals_plot.savefig(figpath + 'residuals_plot.png') - with open('Report/out/Preprocessing.json', "w") as outfile: - json.dump(Reg.best_hyperparams_, outfile) - return change -if Reg: - save_figs(change = hash_) ################################################### BEGIN : Download results ####################################################### ########################################################################################################################################## ########################################################################################################################################## - - shutil.make_archive(base_name='./Report/out/', - format = 'zip', - root_dir='./Report/out/') -# 1-check for saved model -# 2-check for train test data -# 3-check for the report -# 4- -# 5- -# st.subheader('Download the Analysis Results') - - -# if Reg: -# @st.cache_data -# def export_model(change): -# date_time = datetime.datetime.now().strftime('-%Y_%m_%d_%Hh%Mmin-') -# # set files name -# path = 'PageRes/ModelCreation/model_' -# match file: -# case 'csv': -# filesname = xcal_csv.name[:xcal_csv.name.find(".")] +"_"+ ycal_csv.name[:ycal_csv.name.find(".")] -# case 'dx': -# filesname = data_file.name[:data_file.name.find(".")] -# modelname =path + model_type + date_time + filesname + '_data' - -# with open(modelname + '.pkl','wb') as f:# export model -# joblib.dump(reg_model, f) - -# if model_type == 'TPE-iPLS': # export selected wavelengths -# wlfilename = modelname+'-selected_wavelengths.xlsx' -# all = pd.concat([intervalls_with_cols.T, Reg.selected_features_], axis = 0, ignore_index=True).T -# all.columns=['wl_from','wl_to','idx_from', 'idx_to'] -# all.to_excel(wlfilename) - -# @st.cache_data -# def export_report(change): -# match model_type: -# case 'PLS': -# latex_report = report.report('Predictive model development', file_name, stats, list(Reg.best_hyperparams_.values()), model_type, model_per, 
cv_results)
+
+            case 'LW-PLS':
+                latex_report = report.report('Predictive model development', file_name, stats,
+                                             list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), model_type, model_per, cv_results)
+
+            case 'TPE-iPLS':
+                latex_report = report.report('Predictive model development', file_name, stats,
+                                             list({key: Reg.best_hyperparams_[key] for key in ['deriv', 'normalization', 'polyorder', 'window_length'] if key in Reg.best_hyperparams_}.values()), model_type, model_per, cv_results)
+
+            case _:
+                st.warning('Data processing has not been performed or finished yet!', icon = "⚠️")
+
+    @st.cache_data(show_spinner =False)
+    def preparing_results_for_downloading(change):
+        match file:
+            # load csv file
+            case 'csv':
+                xfile.to_csv('Report/out/dataset/'+ xcal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a')
+                yfile.to_csv('Report/out/dataset/'+ ycal_csv.name, sep = ';', encoding = 'utf-8', mode = 'a')
+            case 'dx':
+                with open('Report/out/dataset/'+data_file.name, 'w') as dd:
+                    dd.write(dxdata)
+
+        with open('./Report/out/model/'+ model_type + '.pkl','wb') as f:# export model
+            joblib.dump(reg_model, f)
+        figpath ='./Report/out/figures/'
+        spectra_plot.savefig(figpath + "spectra_plot.png")
+        target_plot.savefig(figpath + "histogram.png")
+        imp_fig.savefig(figpath + "variable_importance.png")
+        fig1.write_image(figpath + "meas_vs_pred_cv_all.png")
+        fig0.write_image(figpath + "meas_vs_pred_cv_onebyone.png")
+        measured_vs_predicted.savefig(figpath + 'measured_vs_predicted.png')
+        residuals_plot.savefig(figpath + 'residuals_plot.png')
+        with open('Report/out/Preprocessing.json', "w") as outfile:
+            json.dump(Reg.best_hyperparams_, outfile)
+
+        if model_type == 'TPE-iPLS': # export selected wavelengths
+            wlfilename = './Report/out/model/'+ model_type+'-selected_wavelengths.xlsx'
+            all = pd.concat([intervalls_with_cols.T, Reg.selected_features_], axis = 0, ignore_index=True).T
+            all.columns=['wl_from','wl_to','idx_from', 'idx_to']
+            all.to_excel(wlfilename)
+
+        export_report(change = hash_)
+        if Path("./Report/report.tex").exists():
+            report.generate_report(change = hash_)
+        if Path("./Report/report.pdf").exists():
+            shutil.move("./Report/report.pdf", "./Report/out/report.pdf")
+        return change
+    preparing_results_for_downloading(change = hash_)
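+
+    # everything below stages the exported artifacts in Report/out, zips them
+    # from a throwaway temp directory, and serves the bytes via st.download_button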
+
+    st.header('Download the analysis results')
+
+    import tempfile
+    @st.cache_data(show_spinner =False)
+    def tempdir(change):
+        with tempfile.TemporaryDirectory( prefix="results", dir="./Report") as temp_dir:# create a temp directory
+            tempdirname = os.path.split(temp_dir)[1]
+
+            if len(os.listdir('./Report/out/figures/'))>2:
+                shutil.make_archive(base_name="./Report/Results", format="zip", base_dir="out", root_dir = "./Report")# create a zip file
+                shutil.move("./Report/Results.zip", f"./Report/{tempdirname}/Results.zip")# move the zip into the temp dir
+                with open(f"./Report/{tempdirname}/Results.zip", "rb") as f:
+                    zip_data = f.read()
+        return tempdirname, zip_data
+
+    date_time = datetime.datetime.now().strftime('%y%m%d%H%M')
+    try :
+        tempdirname, zip_data = tempdir(change = hash_)
+        st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_Reg_.zip', mime ="application/zip",
+                           args = None, kwargs = None,type = "primary",use_container_width = True)
+    except:
+        pass
+
+    delete_files(keep = ['.py', '.pyc','.bib'])