diff --git a/Class_Mod/DxReader.py b/Class_Mod/DxReader.py index 4d6d815d7cef7a6a77dacd50b7a241cd4d8b657f..d877ff2f2d43a995a51f660a6de5076343315352 100644 --- a/Class_Mod/DxReader.py +++ b/Class_Mod/DxReader.py @@ -1,10 +1,12 @@ from Packages import * +import jcamp as jc -class DxReader: +class DxRead: '''This module is designed to help retrieve spectral data as well as metadata of smaples from jcamp file''' def __init__(self, path): - self.__path = path.replace('\\','/') + #self.__path = path.replace('\\','/') + self.__path = path self.__dxfile = jc.jcamp_readfile(self.__path) # Access samples data @@ -23,25 +25,25 @@ class DxReader: for i in range(self.__nb): # Loop over the blocks specs[i] = self.__list_of_blocks[i]['y'] - - block_met = { 'name':self.__list_of_blocks[i]['title'], - 'origin':self.__list_of_blocks[i]['origin'], - 'date':self.__list_of_blocks[i]['date'], - 'time':self.__list_of_blocks[i]['time'], - 'spectrometer/data system':self.__list_of_blocks[i]['spectrometer/data system'], - 'instrumental parameters':self.__list_of_blocks[i]['instrumental parameters'], - 'xunits':self.__list_of_blocks[i]['xunits'], - 'yunits':self.__list_of_blocks[i]['yunits'], - 'xfactor':self.__list_of_blocks[i]['xfactor'], - 'yfactor':self.__list_of_blocks[i]['yfactor'], - 'firstx':self.__list_of_blocks[i]['firstx'], - 'lastx':self.__list_of_blocks[i]['lastx'], - 'firsty':self.__list_of_blocks[i]['firsty'], - 'miny': self.__list_of_blocks[i]['miny'], - 'maxy': self.__list_of_blocks[i]['maxy'], - 'npoints':self.__list_of_blocks[i]['npoints'], - 'concentrations':self.__list_of_blocks[i]['concentrations'], - 'deltax':self.__list_of_blocks[i]['deltax'], + block = self.__list_of_blocks[i] + block_met = { 'name': block['title'], + 'origin': block['origin'], + 'date': block['date'], + 'time': block['time'], + 'spectrometer/data system': block['spectrometer/data system'], + 'instrumental parameters': block['instrumental parameters'], + 'xunits': block['xunits'], + 'yunits': block['yunits'], + 'xfactor': block['xfactor'], + 'yfactor': block['yfactor'], + 'firstx': block['firstx'], + 'lastx': block['lastx'], + 'firsty':block['firsty'], + 'miny': block['miny'], + 'maxy': block['maxy'], + 'npoints': block['npoints'], + 'concentrations':block['concentrations'], + 'deltax':block['deltax'] } self.__met[f'{i}'] = block_met self.metadata_ = pd.DataFrame(self.__met).T @@ -69,7 +71,7 @@ class DxReader: cc[df.index[i]] = self.conc(df[str(i)]) ### dataframe conntaining chemical data - self.chem_data = pd.DataFrame(cc, index=elements_name).T + self.chem_data = pd.DataFrame(cc, index=elements_name).T.astype(float) ### Method for retrieving the concentration of a single sample def conc(self,sample): diff --git a/Class_Mod/Kmedoids.py b/Class_Mod/Kmedoids.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Class_Mod/PCA_.py b/Class_Mod/PCA_.py index 1c4dbc6413a9157dd36a29e014bc9380c3698231..d0578f3942195b1f63f6bae355b56517b3b0e36d 100644 --- a/Class_Mod/PCA_.py +++ b/Class_Mod/PCA_.py @@ -2,86 +2,74 @@ from Packages import * class LinearPCA: def __init__(self, X, Ncomp=10): - ## define color palette to use for plotting - #self.__palette = 'YlGn' - #numerical_data, categorical_data, scaled_values = col_cat(X) - #self.catdata = list(categorical_data.columns) - - ## input matrix - self.__x = X - self._varnames = X.columns - self._rownames = X.index + + if isinstance(X, pd.DataFrame): + self.__x = X.to_numpy() + self._rownames = X.index + else: + self.__x = X + ## set the number of components to compute and fit the model self.__ncp = Ncomp + + # Fit PCA model M = PCA(n_components = self.__ncp) M.fit(self.__x) - ######## results ######## - # Explained variability - + ######## results ######## + # Results self.__pcnames = [f'PC{i+1}({100 * M.explained_variance_ratio_[i].round(2)}%)' for i in range(self.__ncp)] - self._Qexp_ratio = pd.DataFrame(100 * M.explained_variance_ratio_, columns = ["Qexp"], index= [f'PC{i+1}' for i in range(self.__ncp)]) - # Loadings and scores - #scores - s = M.transform(self.__x) - self.__t = s - self._t = s - self._r = pd.DataFrame(2*(s-s.min(axis=0))/(s.max(axis=0)-s.min(axis=0)) -1, index= self._rownames) - self._r.columns = self.__pcnames - # Normalize each loading vector to have unit length - self._p = (M.components_ / np.linalg.norm(M.components_, axis=0)).T - + self._p = M.components_.T + self._t = M.transform(self.__x) + self.eigvals = M.singular_values_**2 + self.Lambda = np.diag(self.eigvals) + # Matrix reconstruction or prediction making - # - self.res = pd.DataFrame() - for i in range(self.__ncp): - self._xp = np.dot(self.__t[:,i].reshape((-1,1)), self._p[:,i].reshape((1,-1))) - # residuals - self._e = self.__x - self._xp - self.res[self.__pcnames[i]] = np.diag(self._e@self._e.T) - #self._res = pd.DataFrame( self._e, columns = self._varnames, index = self._rownames ) + self.T2 = {} + self._xp = {} + self._qres = {} + self.leverage = {} - self._xp = self.__t @ self._p.T + # + for i in range(self.__ncp): + # Matrix reconstruction- prediction + self._xp[i] = np.dot(self._t[:,:i+1], self._p.T[:i+1,:]) - # Compute the cosine similarity between the normalized loading vectors - self.lev = {} - ## Laverage: leverage values range between 0 and 1 - for i in range(self._t.shape[1]): - ti = self._t[:,i].reshape((-1,1)) - Hat = ti @ np.linalg.pinv(np.transpose(ti) @ ti) @ np.transpose(ti) - self.lev[self._r.columns[i]] = ti.ravel() - self.leverage = pd.DataFrame(self.lev) + # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model + self._qres[i] = np.diag(np.subtract(self.__x, self._xp[i])@ np.subtract(self.__x, self._xp[i]).T) + self.T2[i] = np.diag(self._t[:,:i+1] @ np.transpose(self._t[:,:i+1])) + + # Laverage + Hat = self._t[:,:i+1] @ np.linalg.inv(np.transpose(self._t[:,:i+1]) @ self._t[:,:i+1]) @ np.transpose(self._t[:,:i+1]) + self.leverage[i] = np.diag(Hat) / np.trace(Hat) - ## Hotelling t2 - - self.eigvals = M.singular_values_**2 - self.Lambda = np.diag(self.eigvals) - self.T2 = pd.DataFrame() - tt = self._r.to_numpy() - for i in range(self._t.shape[1]): - self.T2[self.__pcnames[i]] = np.diag(self.__t[:,i].reshape((-1,1)) @ np.linalg.inv(np.array(self.Lambda[i,i]).reshape((1,1))) @ np.transpose(self.__t[:,i].reshape((-1,1)))) - @property def scores_(self): - return pd.DataFrame(self._r) + return pd.DataFrame(self._t, columns= self.__pcnames) @property def loadings_(self): - return pd.DataFrame(self._p, columns=self.__pcnames, index=self._varnames) + return pd.DataFrame(self._p, columns=self.__pcnames) @property def leverage_(self): - return self.leverage + lev = pd.DataFrame(self.leverage) + lev.columns =self.__pcnames + return lev @property - def residuals(self): - return self.res + def residuals_(self): + res = pd.DataFrame(self._qres) + res.columns=self.__pcnames + return res + @property def hotelling_(self): - #return pd.DataFrame(self.T2) - return self.T2 \ No newline at end of file + hot = pd.DataFrame(self.T2) + hot.columns=self.__pcnames + return hot \ No newline at end of file diff --git a/Class_Mod/SK_PLSR_.py b/Class_Mod/SK_PLSR_.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0650112a0866cc38bef4ae71fc42e2118803b699 100644 --- a/Class_Mod/SK_PLSR_.py +++ b/Class_Mod/SK_PLSR_.py @@ -0,0 +1,19 @@ +from Packages import * + + +class PlsR: + def __init__(self, x_train, y_train, x_test, y_test): + self.x_train = x_train + self.x_test = x_test + self.y_train = y_train + self.y_test = y_test + + def fit_(self): + nlv = 20 + rmse = [] + for i in range(nlv): + m = PLSRegression(n_components= 20) + ycv = cross_val_predict(m, self.x_train, self.y_train, cv = 5) + rmse.append(mean_squared_error(self.y_train, ycv)) + print(rmse) + diff --git a/Class_Mod/__init__.py b/Class_Mod/__init__.py index 03126eb8049680eedcdff3f7d44257561a93a67a..eb2dbb5b6b3a030cfa727730bf21e84ba9ed0948 100644 --- a/Class_Mod/__init__.py +++ b/Class_Mod/__init__.py @@ -7,3 +7,4 @@ from .LWPLSR_ import model_LWPLSR from .Regression_metrics import metrics from .VarSel import TpeIpls from .Miscellaneous import resid_plot, reg_plot +from .DxReader import DxRead diff --git a/Modules.py b/Modules.py index d0a137e634c80b6caa18d8bfec8ffa02fc2bdbc5..54399173517fa1fbd82e19b0df1cca4a63e380a2 100644 --- a/Modules.py +++ b/Modules.py @@ -1,4 +1,4 @@ -from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans +from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead # find_col_index from Class_Mod.Miscellaneous import prediction, download_results diff --git a/Packages.py b/Packages.py index 18a2ede199cabb44cef6185660106ea8f912f6c3..68c0bf07020805e962eef9454f2fc2099f375727 100644 --- a/Packages.py +++ b/Packages.py @@ -44,7 +44,7 @@ from sklearn.metrics import pairwise_distances_argmin_min ## Web app construction import streamlit as st - +from tempfile import NamedTemporaryFile # help on streamlit input https://docs.streamlit.io/library/api-reference/widgets #Library for connecting to SQL DB diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py index 1b969c9b4a888b6e2f02d37f8d7da32b7676fd15..42ce38954bf94065543a8b1751f597b7bf04e7d1 100644 --- a/pages/1-samples_selection.py +++ b/pages/1-samples_selection.py @@ -15,40 +15,73 @@ influence, hotelling, qexp = st.columns([2, 2, 1]) with container1: - col1.header("NIRS Data Loading", divider='blue') + col1.header("Data Loading", divider='blue') col2.header("Spectral Data Visualization", divider='blue') - - with col1: - # loader for csv file containing NIRS spectra - sselectx_csv = st.file_uploader("Load NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) - if sselectx_csv is not None: - # Select list for CSV delimiter - psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9) - # Select list for CSV header True / False - phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31) - if phdr == 'yes': - col = 0 - else: - col = False - data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) - st.success("The data have been loaded successfully", icon="✅") - ## Visualize spectra - + # loader for csv file containing NIRS spectra + sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5) if sselectx_csv is not None: - with col2: - fig, ax = plt.subplots(figsize = (30,7)) - data_import.T.plot(legend=False, ax = ax, color = 'blue') - ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) - ax.set_ylabel('Signal', fontsize=18) - plt.margins(x = 0) - st.pyplot(fig) - - st.write("Summary") - info = pd.DataFrame({'N':[data_import.shape[0]], - 'Min': [np.min(data_import)], - 'Max':[np.max(data_import)],}, index = ['Values']).T - info.rename_axis('information') - st.table(data=info) + test = sselectx_csv.name[sselectx_csv.name.find('.'):] + if test== '.csv': + with col1: + # Select list for CSV delimiter + psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9) + # Select list for CSV header True / False + phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31) + if phdr == 'yes': + col = 0 + else: + col = False + data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) + st.success("The data have been loaded successfully", icon="✅") + ## Visualize spectra + + with col2: + fig, ax = plt.subplots(figsize = (30,7)) + data_import.T.plot(legend=False, ax = ax, color = 'blue') + ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) + ax.set_ylabel('Signal', fontsize=18) + plt.margins(x = 0) + st.pyplot(fig) + + st.write("Summary") + info = pd.DataFrame({'N':[data_import.shape[0]], + 'Min': [np.min(data_import)], + 'Max':[np.max(data_import)],}, index = ['Values']).T + info.rename_axis('information') + st.table(data=info) + + elif test == '.dx': + # Create a temporary file to save the uploaded file + with NamedTemporaryFile(delete=False, suffix=".dx") as tmp: + tmp.write(sselectx_csv.read()) + tmp_path = tmp.name + with col1: + data = DxRead(path = tmp_path) + data_import = data.specs_df_ + st.success("The data have been loaded successfully", icon="✅") + + ## Visualize spectra + + with col2: + fig, ax = plt.subplots(figsize = (30,7)) + data_import.T.plot(legend=False, ax = ax, color = 'blue') + ax.set_xlabel('Wavelength/Wavenumber', fontsize=18) + ax.set_ylabel('Signal', fontsize=18) + plt.margins(x = 0) + st.pyplot(fig) + + st.write("Summary") + info = pd.DataFrame({'N':[data_import.shape[0]], + 'Min': [np.min(data_import)], + 'Max':[np.max(data_import)],}, index = ['Values']).T + info.rename_axis('information') + st.table(data=info) + os.unlink(tmp_path) + + + + + ###################################################################################### ############################## Exploratory data analysis ############################### @@ -116,30 +149,24 @@ with container2: ) st.plotly_chart(fig, use_container_width = True) - - - + with influence: st.write('Influence plot') ax1 = st.selectbox("Component", options=model.scores_.columns, index=3) - leverage = model.leverage_ - residuals = model.residuals - fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]) + residuals = model.residuals_ + fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals") st.plotly_chart(fig) with hotelling: - st.write('T²-Hotelling vs Q residuals plot') + hotelling = model.hotelling_ ax2 = st.selectbox("Component", options=model.scores_.columns, index=4) - t = model.hotelling_ - fig = px.scatter(t, x=t[ax2], y=t[ax2]) + hotelling = model.hotelling_ + fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") st.plotly_chart(fig) - with qexp: - pass - else: st.markdown('Select a dimensionality reduction technique from the dropdown list')