Skip to content
Snippets Groups Projects
Commit 023dfa7d authored by DIANE
Browse files

code readability and complexity were enhanced

modifications were incorporated
parent 19757ac9
No related branches found
No related tags found
No related merge requests found
...@@ -31,17 +31,13 @@ def col_cat(data_import): ...@@ -31,17 +31,13 @@ def col_cat(data_import):
if len(categorical_columns_list) > 0: if len(categorical_columns_list) > 0:
categorical_data = pd.concat(categorical_columns_list, axis=1) categorical_data = pd.concat(categorical_columns_list, axis=1)
if len(categorical_columns_list) == 0: if len(categorical_columns_list) == 0:
empty = ["" for x in range(len(data_import))] categorical_data = pd.DataFrame
categorical_columns_list.append(empty)
categorical_data = pd.DataFrame(categorical_columns_list).T
categorical_data.columns = ['no categories']
# Create numerical data matrix from the numerical columns list and fill na with the mean of the column # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
numerical_data = pd.concat(numerical_columns_list, axis=1) numerical_data = pd.concat(numerical_columns_list, axis=1)
numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x))) numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
# Scale the numerical data
scaler = StandardScaler() return numerical_data, categorical_data
scaled_values = scaler.fit_transform(numerical_data)
return numerical_data, categorical_data, scaled_values
def list_files(mypath, import_type): def list_files(mypath, import_type):
......
...@@ -29,21 +29,21 @@ class DxRead: ...@@ -29,21 +29,21 @@ class DxRead:
block_met = { 'name': block['title'], block_met = { 'name': block['title'],
'origin': block['origin'], 'origin': block['origin'],
'date': block['date'], 'date': block['date'],
'time': block['time'], # 'time': block['time'],
'spectrometer/data system': block['spectrometer/data system'], # 'spectrometer/data system': block['spectrometer/data system'],
'instrumental parameters': block['instrumental parameters'], # 'instrumental parameters': block['instrumental parameters'],
'xunits': block['xunits'], # 'xunits': block['xunits'],
'yunits': block['yunits'], # 'yunits': block['yunits'],
'xfactor': block['xfactor'], # 'xfactor': block['xfactor'],
'yfactor': block['yfactor'], # 'yfactor': block['yfactor'],
'firstx': block['firstx'], # 'firstx': block['firstx'],
'lastx': block['lastx'], # 'lastx': block['lastx'],
'firsty':block['firsty'], # 'firsty':block['firsty'],
'miny': block['miny'], # 'miny': block['miny'],
'maxy': block['maxy'], # 'maxy': block['maxy'],
'npoints': block['npoints'], # 'npoints': block['npoints'],
'concentrations':block['concentrations'], 'concentrations':block['concentrations'],
'deltax':block['deltax'] # 'deltax':block['deltax']
} }
self.__met[f'{i}'] = block_met self.__met[f'{i}'] = block_met
self.metadata_ = pd.DataFrame(self.__met).T self.metadata_ = pd.DataFrame(self.__met).T
...@@ -87,8 +87,13 @@ class DxRead: ...@@ -87,8 +87,13 @@ class DxRead:
return self.spectra return self.spectra
@property @property
def md_df_(self): def md_df_(self):
return self.metadata_ return self.metadata_.drop("concentrations", axis = 1)
@property @property
def chem_data_(self): def chem_data_(self):
return self.chem_data return self.chem_data
\ No newline at end of file
@st.cache_data
def read_dx(file):
    """Load a ``.dx`` file through ``DxRead`` and return its three tables.

    Parameters
    ----------
    file
        Passed straight to ``DxRead`` as its first positional argument
        (callers in this project pass a filesystem path).

    Returns
    -------
    tuple
        ``(chem_data, specs_df_, md_df_)`` read from the ``DxRead`` instance,
        in that order.
    """
    # NOTE(review): the result is memoised by Streamlit on the ``file``
    # argument; if callers pass a fresh temporary path each time the cache
    # will presumably never hit -- confirm this is intended.
    reader = DxRead(file)
    chem = reader.chem_data
    spectra = reader.specs_df_
    metadata = reader.md_df_
    return chem, spectra, metadata
\ No newline at end of file
...@@ -47,3 +47,18 @@ def resid_plot( meas, pred): ...@@ -47,3 +47,18 @@ def resid_plot( meas, pred):
def download_results(data, export_name): def download_results(data, export_name):
with open(data) as f: with open(data) as f:
st.download_button('Download Results', f, export_name) st.download_button('Download Results', f, export_name)
@st.cache_resource
def plot_spectra(df):
    """Plot every row of ``df`` as one line on a single wide figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One spectrum per row; the column labels become the x axis
        (presumably wavelengths/wavenumbers -- confirm with the caller).
        Must have at least one column.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the spectra and a banner annotation with the
        number of spectra (``df.shape[0]``).
    """
    # x position of the banner: 0 when the column labels are strings,
    # otherwise the smallest column label.
    if isinstance(df.columns[0], str):
        x_anchor = 0
    else:
        x_anchor = df.columns.min()

    # y position of the banner: the overall maximum of the table.
    # Reduce over the raw array so the result is always a single scalar:
    # ``np.max(df)`` defers to ``DataFrame.max(axis=None)``, which yields a
    # per-column Series on pandas < 2.0 and would break the ``xy`` pair.
    y_anchor = np.nanmax(df.to_numpy())

    fig, ax = plt.subplots(figsize=(30, 7))
    df.T.plot(legend=False, ax=ax, color='blue')
    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
    ax.set_ylabel('Signal intensity', fontsize=18)
    # Work on the explicit axes instead of pyplot's implicit "current axes".
    ax.margins(x=0)
    ax.annotate(
        text=f'The total number of spectra is {df.shape[0]}',
        xy=(x_anchor, y_anchor),
        size=20,
        color='black',
        backgroundcolor='red',
    )
    return fig
...@@ -5,10 +5,7 @@ from Class_Mod.DATA_HANDLING import * ...@@ -5,10 +5,7 @@ from Class_Mod.DATA_HANDLING import *
class Umap: class Umap:
def __init__(self, x, n_components, n_neighbors, min_dist): def __init__(self, x, n_components, n_neighbors, min_dist):
self.numerical_data, categorical_data, scaled_values = col_cat(x) self.x = x
self.catdata = list(categorical_data.columns)
self.x = scaled_values
self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,) self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,)
self.model.fit(self.x) self.model.fit(self.x)
......
...@@ -7,4 +7,4 @@ from .LWPLSR_ import model_LWPLSR ...@@ -7,4 +7,4 @@ from .LWPLSR_ import model_LWPLSR
from .Regression_metrics import metrics from .Regression_metrics import metrics
from .VarSel import TpeIpls from .VarSel import TpeIpls
from .Miscellaneous import resid_plot, reg_plot from .Miscellaneous import resid_plot, reg_plot
from .DxReader import DxRead from .DxReader import DxRead, read_dx
from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, read_dx
# find_col_index # find_col_index
from Class_Mod.Miscellaneous import prediction, download_results from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra
...@@ -38,6 +38,8 @@ from PIL import Image ...@@ -38,6 +38,8 @@ from PIL import Image
import plotly.express as px import plotly.express as px
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import matplotlib
### Important Metrics ### Important Metrics
from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
......
...@@ -3,193 +3,167 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide") ...@@ -3,193 +3,167 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import * from Modules import *
from Class_Mod.DATA_HANDLING import * from Class_Mod.DATA_HANDLING import *
################################### Data Loading and Visualization ######################################## ################################### Data Loading and Visualization ########################################
container1 = st.container(border=True) # container1 = st.header("Data loading",border=True)
col2, col1 = st.columns([3, 1]) col2, col1 = st.columns([3, 1])
col1.header("Data Loading", divider='blue') col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue') col2.header("Spectral Data Visualization", divider='blue')
container2 = st.container(border=True) ## Preallocation of data structure
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue') data_import = pd.DataFrame
scores, loadings, pc = st.columns([2, 3, 0.5]) meta_data = pd.DataFrame
influence, hotelling, qexp = st.columns([2, 2, 1]) selected_samples = pd.DataFrame
with container1: # loader for csv file containing NIRS spectra
# loader for csv file containing NIRS spectra sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
if sselectx_csv is not None:
test = sselectx_csv.name[sselectx_csv.name.find('.'):]
if test== '.csv':
with col1:
# Select list for CSV delimiter
psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
# Select list for CSV header True / False
phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
if phdr == 'yes':
col = 0
else:
col = False
data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
st.success("The data have been loaded successfully", icon="")
## Visualize spectra
with col2:
fig, ax = plt.subplots(figsize = (30,7))
data_import.T.plot(legend=False, ax = ax, color = 'blue')
ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
ax.set_ylabel('Signal', fontsize=18)
plt.margins(x = 0)
st.pyplot(fig)
st.write("Summary")
info = pd.DataFrame({'N':[data_import.shape[0]],
'Min': [np.min(data_import)],
'Max':[np.max(data_import)],}, index = ['Values']).T
info.rename_axis('information')
st.table(data=info)
elif test == '.dx':
# Create a temporary file to save the uploaded file
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(sselectx_csv.read())
tmp_path = tmp.name
with col1:
data = DxRead(path = tmp_path)
data_import = data.specs_df_
st.success("The data have been loaded successfully", icon="")
## Visualize spectra
with col2:
fig, ax = plt.subplots(figsize = (30,7))
data_import.T.plot(legend=False, ax = ax, color = 'blue')
ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
ax.set_ylabel('Signal', fontsize=18)
plt.margins(x = 0)
st.pyplot(fig)
st.write("Summary")
info = pd.DataFrame({'N':[data_import.shape[0]],
'Min': [np.min(data_import)],
'Max':[np.max(data_import)],}, index = ['Values']).T
info.rename_axis('information')
st.table(data=info)
os.unlink(tmp_path)
######################################################################################
############################## Exploratory data analysis ###############################
plot_type=['', 'PCA','UMAP', 'NMF']
cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
with container2:
if sselectx_csv is not None:
plot_type=['', 'PCA','UMAP', 'NMF']
cluster_methods = ['', 'Kmeans','UMAP', 'AP']
with pc: #with container1:
type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37) if sselectx_csv:
type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38) test = sselectx_csv.name[sselectx_csv.name.find('.'):]
# compute UMAP - umap_maker in application_functions.py
if type_plot == 'PCA':
model = LinearPCA(data_import, Ncomp=5)
elif type_plot =='UMAP':
model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
if test== '.csv':
with col1:
# Select list for CSV delimiter
psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
# Select list for CSV header True / False
phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
if phdr == 'yes':
col = 0
else:
col = False
imp = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
data_import = col_cat(imp)[0]
meta_data = col_cat(imp)[1]
st.success("The data have been loaded successfully", icon="")
elif test == '.dx':
# Create a temporary file to save the uploaded file
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(sselectx_csv.read())
tmp_path = tmp.name
with col1:
_, data_import, meta_data = read_dx(file = tmp_path)
st.success("The data have been loaded successfully", icon="")
os.unlink(tmp_path)
if type_plot in ['PCA', 'UMAP']: if not data_import.empty:
# add 2 select lists to choose which component to plot ## Visualize spectra
axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0) with col2:
axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1) fig = plot_spectra(data_import)
axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
if type_cluster == 'Kmeans': #plt.annotate(text = info.T, xy =(m, info.loc[:,"Max"]), size=20, color = 'black', backgroundcolor='red')
scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1) st.pyplot(fig)
cl = Sk_Kmeans(scsc, max_clusters = 30)
elif type_cluster == 'HDBSCAN':
from hdbscan import HDBSCAN_function
labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10)
with scores: ############################## Exploratory data analysis ###############################
t = model.scores_ container2 = st.container(border=True)
if type_cluster in ['AP', 'Kmeans']: container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
st.write('Scree plot') scores, loadings, pc = st.columns([2, 3, 0.5])
fig2 = px.scatter(cl.inertia_.T, y = 'inertia') influence, hotelling, qexp = st.columns([2, 2, 1])
st.plotly_chart(fig2)
ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters') dim_red_methods=['', 'PCA','UMAP', 'NMF']
data, colors = cl.fit_optimal(nclusters=ncluster) cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
#fig = px.scatter(data, x=axis1, y=axis2, color= colors) dr_model = None
st.write('Scores plot') cl_model = None
fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
fig.update_traces(marker=dict(size=4)) # Dimensionality reduction
t = pd.DataFrame
if not data_import.empty:
dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
if dim_red_method == dim_red_methods[1]:
dr_model = LinearPCA(data_import, Ncomp=5)
elif dim_red_method == dim_red_methods[2]:
dr_model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
if dr_model:
axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
# clustering
labels = pd.DataFrame
if not t.empty:
# Clustering
if clus_method == cluster_methods[1]:
ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
cl_model = Sk_Kmeans(t, max_clusters = 30)
fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
scores.plotly_chart(fig2)
data, labels = cl_model.fit_optimal(nclusters = ncluster)
elif clus_method == cluster_methods[1]:
from hdbscan import HDBSCAN_function
labels, hdbscan_score = HDBSCAN_function(t, min_cluster_size=10)
##### Plots
elif type_cluster in ['HDBSCAN']:
st.write('plot HDBSCAN clustering')
fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
fig_hdbscan.update_traces(marker=dict(size=4))
st.plotly_chart(fig_hdbscan)
st.write('DBCV score = ' + str(hdbscan_score))
# st.dataframe(min_score.stack().agg(['min']))
## Scores
if not t.empty:
with scores:
st.write('Scores plot')
# scores plot with clustering
if not pd.DataFrame(labels).empty:
fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = labels)
else:
# scores plot with metadata
if not meta_data.empty:
filter = meta_data.columns[1:]
col = st.selectbox('filter', options= filter)
if col == 0:
fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
else: else:
if test == '.dx': fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
filter = ['origin', 'date', 'time', 'spectrometer/data system'] else:
col = st.selectbox('filter', options= filter) # scores plot with neither metadata nor clustering
fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3)
fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col]) fig.update_traces(marker=dict(size=4))
fig.update_traces(marker=dict(size=4)) st.plotly_chart(fig)
else:
fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
fig.update_traces(marker=dict(size=4))
if not data_import.empty:
if dim_red_method == dim_red_methods[1]:
with loadings:
st.write('Loadings plot')
p = dr_model.loadings_
pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
df1 = pp.melt(id_vars="wl")
fig = px.line(df1, x = 'wl', y = 'value', color='variable')
fig.update_layout(legend=dict(x=1, y=0,font=dict(family="Courier", size=12, color="black"),
bordercolor="Black", borderwidth=2))
st.plotly_chart(fig, use_container_width = True)
with influence:
st.write('Influence plot')
ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
leverage = dr_model.leverage_
residuals = dr_model.residuals_
fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
st.plotly_chart(fig)
with hotelling:
st.write('T²-Hotelling vs Q residuals plot')
hotelling = dr_model.hotelling_
ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
hotelling = dr_model.hotelling_
fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
st.plotly_chart(fig) st.plotly_chart(fig)
if type_plot =='PCA':
with loadings:
st.write('Loadings plot')
p = model.loadings_
pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
df1 = pp.melt(id_vars="wl")
fig = px.line(df1, x = 'wl', y = 'value', color='variable')
fig.update_layout(
legend=dict(x=1, y=0,
font=dict(
family="Courier", size=12, color="black"),
bordercolor="Black", borderwidth=2)
)
st.plotly_chart(fig, use_container_width = True)
with influence:
st.write('Influence plot')
ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
leverage = model.leverage_
residuals = model.residuals_
fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
st.plotly_chart(fig)
with hotelling:
st.write('T²-Hotelling vs Q residuals plot')
hotelling = model.hotelling_
ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
hotelling = model.hotelling_
fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
st.plotly_chart(fig)
else:
st.markdown('Select a dimensionality reduction technique from the dropdown list')
...@@ -46,7 +46,6 @@ if NIRS_csv: ...@@ -46,7 +46,6 @@ if NIRS_csv:
if st.button("Predict"): if st.button("Predict"):
if s: if s:
result = model_loaded.predict(pred_data.iloc[:,idx]) result = model_loaded.predict(pred_data.iloc[:,idx])
else: else:
# use prediction function from application_functions.py to predict chemical values # use prediction function from application_functions.py to predict chemical values
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment