diff --git a/Sample_test.txt b/Sample_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/Class_Mod/DxReader.py b/src/Class_Mod/DxReader.py index 7b59b1033c3f4105f0479b8c00abd2cfcc0c6e4d..0e01c44595ba6493fbcca78c2bacf0e53434e31e 100644 --- a/src/Class_Mod/DxReader.py +++ b/src/Class_Mod/DxReader.py @@ -88,7 +88,9 @@ class DxRead: return self.spectra @property def md_df_(self): - return self.metadata_.drop("concentrations", axis = 1) + me = self.metadata_.drop("concentrations", axis = 1) + me = me.drop(me.columns[(me == '').all()], axis = 1) + return me @property def chem_data_(self): diff --git a/src/Class_Mod/Miscellaneous.py b/src/Class_Mod/Miscellaneous.py index 5ff007a69bebc8a0bc2629a7e07a9cee4013b72c..fed7bc2058882217d63fbd24e5571d0a8b8b54b7 100644 --- a/src/Class_Mod/Miscellaneous.py +++ b/src/Class_Mod/Miscellaneous.py @@ -92,15 +92,17 @@ def download_results(data, export_name): @st.cache_resource def plot_spectra(df): + fig, ax = plt.subplots(figsize = (30,7)) if isinstance(df.columns[0], str): - m = 0 + df.T.plot(legend=False, ax = ax, color = 'blue') + min = 0 else: - m = np.min(df.columns) + min = np.max(df.columns) + df.T.plot(legend=False, ax = ax, color = 'blue').invert_xaxis() - fig, ax = plt.subplots(figsize = (30,7)) - df.T.plot(legend=False, ax = ax, color = 'blue') + plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy =(min, np.max(df)), size=20, color = 'black', backgroundcolor='red') ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) ax.set_ylabel('Signal intensity', fontsize=18) plt.margins(x = 0) - plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy =(m, np.max(df)), size=20, color = 'black', backgroundcolor='red') + return fig diff --git a/src/Class_Mod/PLSR_Preprocess.py b/src/Class_Mod/PLSR_Preprocess.py index ff18620068f08342aab30348697974b9f28bddef..72aa9841093da02e3bafc84371f9cbd3ce4e12f5 100644 --- a/src/Class_Mod/PLSR_Preprocess.py +++ b/src/Class_Mod/PLSR_Preprocess.py @@ -86,7 +86,11 @@ class PlsProcess: @property def best_hyperparams(self): - return self.best + self.b = {'Scatter':self.best['scatter'], 'Saitzky-Golay derivative parameters':{'polyorder':self.best['polyorder'], + 'deriv':self.best['deriv'], + 'window_length':self.best['window_length']}} + return self.b + @property def model_(self): return self.model diff --git a/src/Class_Mod/UMAP_.py b/src/Class_Mod/UMAP_.py index 05ee5b3f11059363f73b3abe813abf12afb347fb..28d0436e6efe90fb251e60627234376cc10467d0 100644 --- a/src/Class_Mod/UMAP_.py +++ b/src/Class_Mod/UMAP_.py @@ -6,8 +6,7 @@ class Umap: """ The UMAP dimension reduction algorithm from scikit learn """ - def __init__(self, data_import, numerical_data, cat_data): - self.x = data_import + def __init__(self, numerical_data, cat_data): self.numerical_data = numerical_data if cat_data is None: self.categorical_data_encoded = cat_data @@ -21,7 +20,7 @@ class Umap: self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,) self.model.fit(self.numerical_data, y = self.categorical_data_encoded) self.scores_raw = self.model.transform(self.numerical_data) - self.scores = pd.DataFrame(self.scores_raw, index = self.x.index) + self.scores = pd.DataFrame(self.scores_raw, index = self.numerical_data.index) @property def scores_(self): diff --git a/src/Packages.py b/src/Packages.py index 6d750dc4f2c758854e8c2ace77b1249bb6f13090..618e41abc01fb7f71dc163774ec4e5b89b6e3742 100644 --- a/src/Packages.py +++ b/src/Packages.py @@ -26,7 +26,7 @@ from scipy.sparse.csgraph import minimum_spanning_tree from scipy.sparse import csgraph # Modelling -from juliacall import Main as jl +#from juliacall import Main as jl from pinard import utils from pinard import preprocessing as pp diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index c9085bd0d096c62d619ef6b0b26fc11220fbd6f5..d1de4c137e85a3012116d244f79a45a5bffc6b51 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -82,7 +82,6 @@ scores, loadings, pc = st.columns([2, 3, 0.5]) influence, hotelling, qexp = st.columns([2, 2, 1]) st.header('Selected samples for chemical analysis') selected_s, selected_samples_metd = st.columns([3, 3]) -selected_s.write('Samples scores') dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos @@ -101,7 +100,7 @@ if not spectra.empty: if dim_red_method == dim_red_methods[1]: - dr_model = LinearPCA(xc, Ncomp=5) + dr_model = LinearPCA(xc, Ncomp=8) elif dim_red_method == dim_red_methods[2]: if not meta_data.empty: filter = meta_data.columns[1:] @@ -109,7 +108,7 @@ if not spectra.empty: supervised = meta_data[col] else: supervised = None - dr_model = Umap(data_import = imp, numerical_data = MinMaxScale(spectra), cat_data = supervised) + dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised) if dr_model: axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0) @@ -152,6 +151,7 @@ if labels: # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster closest, _ = pairwise_distances_argmin_min(clu_centers, tcr) selected_samples_idx = list(closest) + elif selection == selec_strategy[1]: selection_number = scores.number_input('How many samples per cluster?', min_value = 1, step=1, value = 3) for i in np.unique(labels): @@ -161,9 +161,7 @@ if labels: km2 = KMeans(n_clusters = selection_number) km2.fit(tcr.iloc[C,:]) clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:]) - selected_samples_idx2 = list(clos) - selected_samples_idx.extend(tcr.iloc[C,:].index[selected_samples_idx2]) - # selected_samples_idx.extend(tcr.iloc[C,:].sample(n=selection_number).index.to_list()) + selected_samples_idx.extend(list(clos)) else: selected_samples_idx.extend(tcr.iloc[C,:].index.to_list()) # list indexes of selected samples for colored plot @@ -171,9 +169,11 @@ if labels: if labels: if selected_samples_idx: sam = pd.DataFrame({'cluster':np.array(labels)[selected_samples_idx], - 'index': spectra.index[selected_samples_idx]}) + 'index': spectra.index[selected_samples_idx]}, index = selected_samples_idx) + selected_s.write(sam) + if not meta_data.empty: selected_samples_metd.write('Corresponding meta-data') meta = meta_data.iloc[selected_samples_idx,:] @@ -181,14 +181,18 @@ if labels: meta['index'] = spectra.index[selected_samples_idx] selected_samples_metd.write(meta) - +############################################################################ ## Scores if not t.empty: with scores: + fig1, ((ax1, ax2),(ax3,ax4)) = plt.subplots(2,2) st.write('Scores plot') # scores plot with clustering if list(labels) and meta_data.empty: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1) + + # scores plot with metadata elif len(list(labels)) == 0 and not meta_data.empty: @@ -196,9 +200,12 @@ if not t.empty: col = st.selectbox('Color by:', options= filter) if col == 0: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) + else: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) ) - + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1) + # color with scores and metadata elif len(list(labels)) > 0 and not meta_data.empty: if clus_method in cluster_methods[1:]: @@ -210,13 +217,19 @@ if not t.empty: col = st.selectbox('Color by:', options= filter) if col == "None": fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) elif col == clus_method: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) else: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col]))) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax2) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax3) else: fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3) + sns.scatterplot(data = tcr, x = axis1, y =axis2 , ax = ax1) fig.update_traces(marker=dict(size=4)) if selected_samples_idx: @@ -224,6 +237,8 @@ if not t.empty: fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2], z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'), name = 'selected samples') + + plt.savefig("./Report/Figures/test.png") st.plotly_chart(fig, use_container_width=True) import plotly.express as px diff --git a/src/pages/2-model_creation.py b/src/pages/2-model_creation.py index e83931ea53a0309b63756ca9818117bd46ae64db..cfed211cd23bdeb57c997ec2eb27fa521bfdcd57 100644 --- a/src/pages/2-model_creation.py +++ b/src/pages/2-model_creation.py @@ -19,10 +19,8 @@ st.session_state["interface"] = st.session_state.get('interface') if st.session_state["interface"] == 'simple': hide_pages("Predictions") -######################################################################################## -reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR", "Full-PLSR-sklearn", "PrePLStester"] -# page Design + ####################################### page Design ####################################### st.header("Calibration Model Development", divider='blue') st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra") M1, M2, M3 = st.columns([2,3,2]) @@ -35,12 +33,15 @@ M7.write('Predicted vs Measured values') M8.write('Residuals plot') M9, M10 = st.columns([2,2]) M9.write("-- Save the model --") + ###################################################################### + +reg_algo = ["","Full-PLSR", "Locally Weighted PLSR", "Interval-PLSR", "Full-PLSR-sklearn", "PrePLStester"] + ####################################### ########################################### files_format = ['.csv', '.dx'] file = M3.radio('select data file format:', options = files_format) - ### Data spectra = pd.DataFrame y = pd.DataFrame @@ -57,14 +58,11 @@ if file == files_format[0]: else: col = False ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column") - - if ycal_csv: sepy = M3.radio("separator (Y file): ", options=[";", ","], key=2) hdry = M3.radio("samples name (Y file)?: ", options=["no", "yes"], key=3) if hdry == "yes": col = 0 else: col = False - if xcal_csv and ycal_csv: spectra, meta_data = col_cat(pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col, header=0)) @@ -77,13 +75,11 @@ if file == files_format[0]: if spectra.shape[0] == y.shape[0]: pass + else: M3.warning('The number of samples is different in X and Y') y = pd.DataFrame spectra = pd.DataFrame - - - ## Load .dx file elif file == files_format[1]: @@ -94,10 +90,13 @@ elif file == files_format[1]: tmp_path = tmp.name chem_data, spectra, meta_data = read_dx(file = tmp_path) M3.success("The data have been loaded successfully", icon="✅") - yname = M3.selectbox('Select target', options=chem_data.columns) - measured = chem_data.loc[:,yname] > 0 - y = chem_data.loc[:,yname].loc[measured] - spectra = spectra.loc[measured] + if chem_data.shape[1]>0: + yname = M3.selectbox('Select target', options=chem_data.columns) + measured = chem_data.loc[:,yname] > 0 + y = chem_data.loc[:,yname].loc[measured] + spectra = spectra.loc[measured] + else: + M3.warning('Warning: Chemical data are not included in your file !', icon="âš ï¸") os.unlink(tmp_path) ### split the data @@ -161,7 +160,7 @@ if not spectra.empty and not y.empty: elif regression_algo == reg_algo[5]: Reg = PlsProcess(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test, scale = False, Kfold=3) - Reg.tune(n_iter=100) + Reg.tune(n_iter=500) reg_model = Reg.model_ ################# Model analysis ############ @@ -170,7 +169,9 @@ if not spectra.empty and not y.empty: ycv = Reg.pred_data_[1] yt = Reg.pred_data_[2] - + + M2.write('-- Spectral preprocessing info --') + M2.write(Reg.best_hyperparams) M2.write("-- Performance metrics --") M2.dataframe(Reg.metrics_) diff --git a/src/pages/4-inputs.py b/src/pages/4-inputs.py index 439c3bf6d04763eda9454e440408be504898c9e5..59fbb88694103f31806b91c16ed2510a18215e03 100644 --- a/src/pages/4-inputs.py +++ b/src/pages/4-inputs.py @@ -1,6 +1,11 @@ import streamlit as st - +from Packages import * +st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide",) +if 'interface' not in st.session_state: + st.session_state['interface'] = 'simple' +from Modules import * +from Class_Mod.DATA_HANDLING import * # HTML for the banner "CEFE - CNRS" # bandeau_html = """ # <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;"> @@ -28,7 +33,7 @@ with st.container(): .stButton>button { display: block; margin: 0 auto; - width: 200px; + width: 200px; height: 50px; font-size: 16px; } diff --git a/src/pages/new 4.txt b/src/pages/new 4.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/plot_axe1_axe2.png b/src/plot_axe1_axe2.png new file mode 100644 index 0000000000000000000000000000000000000000..a52dfe7b11df2b892ef0bf0b54e86c47fd0714b9 Binary files /dev/null and b/src/plot_axe1_axe2.png differ