diff --git a/src/Class_Mod/DATA_HANDLING.py b/src/Class_Mod/DATA_HANDLING.py
index def743e12a3b202f3c2d4a13b75b6d93ede169c7..56af54af8fa7f40d237e2febd6972590a1cb72d1 100644
--- a/src/Class_Mod/DATA_HANDLING.py
+++ b/src/Class_Mod/DATA_HANDLING.py
@@ -51,10 +51,9 @@ def list_files(mypath, import_type):
 
 
 
-def standardize(X):
-    t = X
-    sk = StandardScaler()
-    sc = pd.DataFrame(sk.fit_transform(X), index = t.index, columns = t.columns)
+def standardize(X, center = True, scale = False):
+    sk = StandardScaler(with_mean=center, with_std = scale)
+    sc = pd.DataFrame(sk.fit_transform(X), index = X.index, columns = X.columns)
     return sc
 
 def MinMaxScale(X):
diff --git a/src/Class_Mod/HDBSCAN_Clustering.py b/src/Class_Mod/HDBSCAN_Clustering.py
index d20aa7e8f791c3fd5b84c77b24f8766fd6f78e41..f087a2272f46178a75a9693f8e701837ffee6ac4 100644
--- a/src/Class_Mod/HDBSCAN_Clustering.py
+++ b/src/Class_Mod/HDBSCAN_Clustering.py
@@ -67,6 +67,7 @@ class Hdbscan:
 
         self._labels = self.best_hdbscan.labels_
         self._centers = self.best_hdbscan.medoids_
+
     def DBCV(self, X, labels, dist_function=euclidean):
         """
         Implimentation of Density-Based Clustering Validation "DBCV"
@@ -320,5 +321,4 @@ class Hdbscan:
 
     @property
     def HDBSCAN_scores_(self):
-        clu = [f'cluster#{i}' for i in self._labels]
-        return clu, self._hdbscan_bscore, self._centers
+        return self._labels, self._hdbscan_bscore, self._centers
diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py
index 8028dada947d5d0272b440c91ff49a7717f46d45..c4f310189a8962c698d9d99dfd180925fef203e6 100644
--- a/src/pages/1-samples_selection.py
+++ b/src/pages/1-samples_selection.py
@@ -27,6 +27,7 @@ col2.header("Spectral Data Visualization", divider='blue')
 spectra = pd.DataFrame
 meta_data = pd.DataFrame
 selected_samples = pd.DataFrame
+non_clustered = None
 colnames = []
 rownames = []
 
@@ -51,7 +52,7 @@ if data_file:
             imp = pd.read_csv(data_file, sep=psep, index_col=col)
             # spectra = col_cat(imp)[0]
             # meta_data = col_cat(imp)[1]
-            spectra, meta_data = col_cat(imp)
+            spectra, md_df_st_ = col_cat(imp)
             st.success("The data have been loaded successfully", icon="✅")
 
 ## Load .dx file
@@ -109,7 +110,7 @@ labels = []
 if not spectra.empty:
     dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
     clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
-    xc = standardize(spectra)
+    xc = standardize(spectra, center=True, scale=False)
 
 
     if dim_red_method == dim_red_methods[1]:
@@ -153,14 +154,15 @@ if not t.empty:
 
     # 2- HDBSCAN clustering
     elif clus_method == cluster_methods[2]:
         optimized_hdbscan = Hdbscan(np.array(t))
-        labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
-        non_clustered = np.where(labels == -1)
-        labels[non_clustered] = np.max(labels)+2
-        labels = [f'cluster#{i+1}' for i in labels.tolist()]
-
+        all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
+        labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
+        non_clustered = np.where(np.array(labels) == 'Non clustered')
+        clustered = np.where(np.array(labels) != 'Non clustered')[0]
+
+        #st.write(optimized_hdbscan.non_clustered)
     # 3- Affinity propagation
     elif clus_method == cluster_methods[3]:
-        cl_model = AP(X=tcr)
+        cl_model = AP(X = tcr)
         data, labels, clu_centers = cl_model.fit_optimal_
 
@@ -173,12 +175,11 @@ selected_samples_idx = []
 
 if labels:
     if clus_method:
-        if clus_method == cluster_methods[2]:
-            selection = scores.radio('Select samples selection strategy:',
-                                    options = ['random'])
-        else:
-            selection = scores.radio('Select samples selection strategy:',
-                                    options = selec_strategy)
+        selection = scores.radio('Select samples selection strategy:',
+                                    options = selec_strategy)
+        if clus_method == cluster_methods[2]:
+            tcr = tcr.iloc[clustered,:]
+
         if selection == selec_strategy[0]:
             # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
             closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
@@ -200,7 +201,6 @@ if labels:
 
         # list indexes of selected samples for colored plot
         if selected_samples_idx:
-
             if meta_data.empty:
                 sam = pd.DataFrame({'name': spectra.index[selected_samples_idx],
                                     'cluster':np.array(labels)[selected_samples_idx]},
@@ -214,7 +214,8 @@ if labels:
 
             selected_s.write(f' The selected subset consists of {sam.shape[0]} samples')
             selected_s.write(sam)
-
+            selected_s.checkbox("Include non clustered samples (for HDBSCAN clustering)")
+
 ################################ Plots visualization ############################################
 ## Scores
 if not t.empty:
@@ -319,10 +320,17 @@ if not spectra.empty:
         if test =='.dx':
             if meta_data.loc[:,'xunits'][0] == '1/cm':
                 freq.columns = ['Wavenumber (1/cm)']
+                xlab = "Wavenumber (1/cm)"
+                inv = 'reversed'
             else:
                 freq.columns = ['Wavelength (nm)']
+                xlab = 'Wavelength (nm)'
+                inv = None
+
         else:
             freq.columns = ['Wavelength/Wavenumber']
+            xlab = 'Wavelength/Wavenumber'
+            inv = None
 
         pp = pd.concat([p, freq], axis=1)
         #########################################
@@ -330,6 +338,8 @@ if not spectra.empty:
 
     fig = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
     fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
                                   bordercolor="black", borderwidth=2))
+    fig.update_layout(xaxis_title = xlab,yaxis_title = "Intensity" ,xaxis = dict(autorange= inv))
+
     st.plotly_chart(fig, use_container_width=True)
 
diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py
index efc2022f954b5b1d8c49326e08bcfd6431bd2c46..cde267b0c88d018f5ac8c0c6049ef380bdd83b99 100644
--- a/src/pages/2-model_creation.py
+++ b/src/pages/2-model_creation.py
@@ -3,6 +3,7 @@ from Packages import *
 st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *
+from pandas.api.types import is_float_dtype
 add_header()
 
 
@@ -56,21 +57,35 @@ if file == files_format[0]:
     else:
         col = False
     if xcal_csv and ycal_csv:
-        spectra, meta_data = col_cat(pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0))
-        y, _ = col_cat(pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col))
+        xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)
+        yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col)
+
+        if yfile.shape[1]>0 and xfile.shape[1]>0 :
+            spectra, meta_data = col_cat(xfile)
+            y, idx = col_cat(yfile)
+            if y.shape[1]>1:
+                yname = M3.selectbox('Select target', options=y.columns)
+                y = y.loc[:,yname]
+            else:
+                y = y.iloc[:,0]
+
-        y = pd.DataFrame(y).astype(float).iloc[:,0]
-        spectra = pd.DataFrame(spectra).astype(float)
-        if not meta_data.empty :
-            st.write(meta_data)
+            spectra = pd.DataFrame(spectra).astype(float)
+            if not meta_data.empty :
+                st.write(meta_data)
 
-        if spectra.shape[0] == y.shape[0]:
-            pass
+            if spectra.shape[0] != y.shape[0]:
+                M3.warning('X and Y have different sample size')
+                y = pd.DataFrame
+                spectra = pd.DataFrame
 
         else:
-            M3.warning('The number of samples is different in X and Y')
-            y = pd.DataFrame
-            spectra = pd.DataFrame
+            M1.warning('Tune decimal and separator parameters')
+
+
+
+
+
 
 ## Load .dx file
 elif file == files_format[1]:
 
@@ -175,11 +190,21 @@ if not spectra.empty and not y.empty:
         ycv = Reg.pred_data_[1]
         yt = Reg.pred_data_[2]
-
-        M2.write('-- Spectral preprocessing info --')
-        M2.write(Reg.best_hyperparams)
+        #if
+        #M2.write('-- Spectral preprocessing info --')
+        #M2.write(Reg.best_hyperparams)
         M2.write("-- Performance metrics --")
         M2.dataframe(Reg.metrics_)
+        #from st_circular_progress import CircularProgress
+        #my_circular_progress = CircularProgress(label = 'Performance',value = 50, key = 'my performance',
+        #                                        size = "medium", track_color = "black", color = "blue")
+
+        #my_circular_progress.st_circular_progress()
+        #my_circular_progress.update_value(progress=20)
+
+
+
+
         M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt], train_idx = train_index, test_idx = test_index))
         M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt], train_idx = train_index, test_idx = test_index))
 
 
@@ -220,5 +245,4 @@ if not spectra.empty and not y.empty:
 
 if st.session_state['interface'] == 'simple':
     st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
-
-## Load .dx file
+## Load .dx file
\ No newline at end of file