Newer
Older
from Packages import *
# Page configuration (title, icon, wide layout); issued before the remaining imports.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
# HTML for the "CEFE - CNRS" banner
# bandeau_html = """
# <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
# <h1 style="text-align: center; color: white;">CEFE - CNRS</h1>
# </div>
# """
# # Inject the banner HTML code
# st.markdown(bandeau_html, unsafe_allow_html=True)
# add_header is not defined in this file's visible lines (presumably provided by the star imports).
add_header()
# Write the current 'interface' value back into session state; .get() yields None when the key is absent.
st.session_state["interface"] = st.session_state.get('interface')
# When the interface is 'simple', hide_pages is called with "Predictions".
# NOTE(review): indentation was lost in this copy; the next line is presumably the body of this `if`.
if st.session_state["interface"] == 'simple':
hide_pages("Predictions")
################################### Data Loading and Visualization ########################################
# Two columns with relative widths 3 and 1: col2 receives the visualization header, col1 the loader header.
col2, col1 = st.columns([3, 1])
col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue')
# NOTE(review): these bind the pd.DataFrame class itself, not an empty instance (pd.DataFrame());
# `.empty` on the class is a property object, not a bool - confirm this is intended.
meta_data = pd.DataFrame
selected_samples = pd.DataFrame
# loader for datafile
data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
if data_file:
# Retrieve the extension of the file
# The slice starts at the FIRST '.' in the name, so a name with several dots yields a multi-part suffix;
# with no dot at all, find() returns -1 and the slice is the last character.
test = data_file.name[data_file.name.find('.'):]
if test== '.csv':
with col1:
# Select list for CSV delimiter
# find_delimiter / find_col_index are called on the path 'data/<uploaded name>', not on the uploaded object.
# NOTE(review): presumably a file with that name must already exist under data/ - verify.
# list.index() raises ValueError if the detected value is not one of the listed options.
psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
# col is 0 when the user says the csv has an index column, otherwise False.
# NOTE(review): the statement that consumes psep/col is not in the visible lines.
if phdr == 'yes':
col = 0
else:
col = False

BARTHES Nicolas
committed
# spectra = col_cat(imp)[0]
# meta_data = col_cat(imp)[1]
# col_cat(imp) is unpacked into (spectra, meta_data).
# NOTE(review): `imp` is not assigned anywhere in the visible lines - the statement creating it is missing from this copy.
spectra, meta_data = col_cat(imp)
st.success("The data have been loaded successfully", icon="✅")
elif test == '.dx':
# Create a temporary file to save the uploaded file
# delete=False keeps the temporary file on disk after the `with` block closes it.
# NOTE(review): the body of this `with` (and any later cleanup of the file) is not in the visible lines.
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:

BARTHES Nicolas
committed
# read_dx returns three values; the first is discarded, the others become spectra and meta_data.
# NOTE(review): `tmp_path` is not assigned in the visible lines.
_, spectra, meta_data = read_dx(file = tmp_path)
# Choose the x-axis label from the first 'xunits' entry of the .dx metadata; y units come from 'yunits'.
# NOTE(review): `[0]` on the selected column is a label-based lookup - presumably the index contains 0; verify.
if test =='.dx':
if meta_data.loc[:,'xunits'][0] == '1/cm':
lab = 'Wavenumber (1/cm)'
else:
lab = 'Wavelength (nm)'
fig = plot_spectra(spectra, xunits = lab, yunits = meta_data.loc[:,'yunits'][0])
else:
# Non-.dx input: generic axis labels.
fig = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = 'Signal intensity')
# Save the spectra figure under ./Report/figures/.
# NOTE(review): assumes that directory already exists - confirm.
fig.savefig("./Report/figures/Spectra_Plot.png")
############################## Exploratory data analysis ###############################
# Bordered container carrying the section header.
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
# Two rows of columns: (scores, loadings, pc) and (influence, hotelling, qexp).
scores, loadings, pc = st.columns([2, 3, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])
st.header('Selected samples for chemical analysis', divider='blue')
# Index 0 of both lists is the empty string, i.e. the "nothing selected" choice.
dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos
dr_model = None # dimensionality reduction model
cl_model = None # clustering model
# NOTE(review): t and p are bound to the pd.DataFrame class, not to instances - confirm this is intended.
t = pd.DataFrame # scores
p = pd.DataFrame # loadings
labels = []
# NOTE(review): `spectra` is only assigned inside the file-loading branches visible above;
# whether it is defined when no file is uploaded depends on lines not shown here.
if not spectra.empty:
dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
# standardize() is applied to the spectra; xc is passed to LinearPCA further down.
xc = standardize(spectra)

BARTHES Nicolas
committed
# dim_red_methods[1] is 'PCA': fit LinearPCA on xc with Ncomp=8.
if dim_red_method == dim_red_methods[1]:
dr_model = LinearPCA(xc, Ncomp=8)
# dim_red_methods[2] is 'UMAP': optionally supervised by a metadata column chosen by the user.
elif dim_red_method == dim_red_methods[2]:
if not meta_data.empty:
# NOTE(review): `filter` shadows the Python builtin of the same name.
# The first metadata column is excluded from the choices.
filter = meta_data.columns[1:]
col = pc.selectbox('Supervised UMAP by:', options= filter, key=108)
supervised = meta_data[col]
else:
supervised = None
dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
# Let the user pick three score columns as plot axes and assemble them into t.
# NOTE(review): default indices 0..2 require at least three score columns.
if dr_model:
axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
# clustering
# NOTE(review): `tcr`, `fig2` and `optimized_hdbscan` are not assigned in the visible lines; the statements
# creating them (and, presumably, the checks on clus_method) are missing from this copy.
if not t.empty:
cl_model = Sk_Kmeans(tcr, max_clusters = 25)
# The number-of-clusters input defaults to the model's suggested value, bounded to 2..25.
ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}")
scores.plotly_chart(fig2,use_container_width=True)
# Render fig2 to PNG bytes and write them to the report folder.
img = pio.to_image(fig2, format="png")
with open("./Report/figures/Elbow.png", "wb") as f:
f.write(img)
data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
# NOTE(review): as written this rebinds `labels` right after the K-means fit; presumably it belongs to a
# separate HDBSCAN branch whose header line is missing - verify.
labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
# Samples labelled -1 are relabelled 1000 before the labels are converted to a list.
non_clustered = np.where(labels == -1)
labels[non_clustered] = 1000
labels = labels.tolist()
#####################################################################################################
# Sample-selection state: available strategies and the (initially empty) selection results.
selec_strategy = ['center','random']
# NOTE(review): bound to the pd.DataFrame class, not an instance.
samples_df_chem = pd.DataFrame
# selected_samples is rebound here to an empty list (it was bound to pd.DataFrame earlier in the script).
selected_samples = []
selected_samples_idx = []
if labels:
selection = scores.radio('Select samples selection strategy:',
options = selec_strategy)
#################### selection strategy to be corrected
# selec_strategy[0] is 'center'.
if selection == selec_strategy[0]:
# list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
# For each cluster centre, take the position of the closest row of tcr.
closest, _ = pairwise_distances_argmin_min(clu_centers, tcr)
selected_samples_idx = list(closest)
# NOTE(review): the lines below use `C`, which is not assigned in the visible lines; the enclosing
# branch/loop header (presumably the 'random' strategy iterating over clusters) is missing from this copy.
selection_number = scores.number_input('How many samples per cluster?',
min_value = 1, step=1, value = 3)
if C.shape[0] >= selection_number:
#scores.write(list(tcr.index)[labels== i])
# Enough rows in C: run K-means with selection_number clusters on those rows and keep,
# for each sub-centre, the index label of the closest row.
km2 = KMeans(n_clusters = selection_number)
km2.fit(tcr.iloc[C,:])
clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
else:
# Fewer rows than requested: keep all of them.
selected_samples_idx.extend(tcr.iloc[C,:].index.to_list())
# list indexes of selected samples for colored plot
if labels:
if selected_samples_idx:
# Table of the selected samples: their cluster label and their index in `spectra`.
# NOTE(review): selected_samples_idx is used positionally here (np array / Index indexing) while some entries
# above were collected as index labels - the two agree only for a default integer index; verify.
sam = pd.DataFrame({'cluster':np.array(labels)[selected_samples_idx],
'index': spectra.index[selected_samples_idx]},
index = selected_samples_idx)
if not meta_data.empty:
# NOTE(review): `selected_samples_metd` is not assigned in the visible lines.
selected_samples_metd.write('Corresponding meta-data')
meta = meta_data.iloc[selected_samples_idx,:]
# NOTE(review): columns are added to the result of an .iloc selection; pandas may emit
# SettingWithCopyWarning here - confirm.
meta['cluster'] = np.array(labels)[selected_samples_idx]
meta['index'] = spectra.index[selected_samples_idx]
selected_samples_metd.write(meta)
############################################################################
st.write('Scores plot')
# scores plot with clustering
# NOTE(review): `ax2`, `ax3` and `num_clusters` are not assigned in the visible lines, and `ax1` is only
# assigned further down (as a selectbox value); several `else:` / branch header lines appear to be missing
# from this copy, so the exact nesting below cannot be recovered from what is shown.
# Case 1: cluster labels exist and there is no metadata -> colour by labels.
if list(labels) and meta_data.empty:
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = labels, ax = ax1)
# Case 2: no labels but metadata available -> colour by a user-selected metadata column.
elif len(list(labels)) == 0 and not meta_data.empty:
filter = meta_data.columns[1:]
col = st.selectbox('Color by:', options= filter)
# NOTE(review): `col` is a column name chosen from meta_data.columns[1:]; comparing it with 0 looks suspicious - confirm.
if col == 0:
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1)
# color with scores and metadata
elif len(list(labels)) > 0 and not meta_data.empty:
if clus_method in cluster_methods[1:]:
filter = ['None', clus_method]
filter.extend(meta_data.columns[1:])
# NOTE(review): as written this rebinds `filter` unconditionally; presumably it sat under an `else:` - verify.
filter = meta_data.columns[1:].insert(0,'None')
col = st.selectbox('Color by:', options= filter)
if col == "None":
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
elif col == clus_method:
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
# Colour by the lower-cased values of the selected metadata column (str.lower requires string values).
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])))
sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax1)
sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax2)
sns.scatterplot(data = tcr, x = axis1, y =axis2 , hue = list(map(str.lower,meta_data[col])), ax = ax3)
# Uncoloured 3-D scatter (presumably the fallback branch when neither labels nor metadata exist - verify).
fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
fig.update_traces(marker=dict(size=4))
# Overlay the selected samples as larger black markers.
if selected_samples_idx:
tt = tcr.iloc[selected_samples_idx,:]
fig.add_scatter3d(x = tt.loc[:,axis1], y = tt.loc[:,axis2],
z = tt.loc[:,axis3], mode ='markers', marker = dict(size = 7, color = 'black'),
name = 'selected samples')
st.plotly_chart(fig, use_container_width=True)
# Take the first num_clusters colours of Plotly's qualitative palette for the 2-D figures.
custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
color_discrete_sequence=custom_color_palette
# Create the Axis1-Axis2 figure (intended for PNG export)
# NOTE(review): no export call for these three 2-D figures appears in the visible lines.
fig_axe1_axe2 = px.scatter(tcr, x=axis1, y=axis2, color=labels if list(labels) else None, color_discrete_sequence=custom_color_palette)
fig_axe1_axe2.update_layout(title='Axe1-Axe2')
fig_axe1_axe2.update_traces(marker=dict(size=4))
# Create the Axis1-Axis3 figure (intended for PNG export)
fig_axe1_axe3 = px.scatter(tcr, x=axis1, y=axis3, color=labels if list(labels) else None, color_discrete_sequence=custom_color_palette)
fig_axe1_axe3.update_layout(title='Axe1-Axe3')
fig_axe1_axe3.update_traces(marker=dict(size=4))
# Create the Axis2-Axis3 figure (intended for PNG export)
fig_axe2_axe3 = px.scatter(tcr, x=axis2, y=axis3, color=labels if list(labels) else None, color_discrete_sequence=custom_color_palette)
fig_axe2_axe3.update_layout(title='Axe2-Axe3')
fig_axe2_axe3.update_traces(marker=dict(size=4))
# PCA only (dim_red_methods[1]): loadings, influence and Hotelling diagnostic plots.
if dim_red_method == dim_red_methods[1]:
with loadings:
st.write('Loadings plot')
p = dr_model.loadings_
########################################
# Build the x-axis column for the loadings plot: a 0..n-1 range when the spectra column
# names are strings, otherwise the column names themselves.
if isinstance(spectra.columns[0], str):
freq = pd.DataFrame(np.arange(p.shape[0]), index=p.index)
else:
freq = pd.DataFrame(spectra.columns, index=p.index)
# Name that column according to the file type and, for .dx files, the first 'xunits' metadata entry.
if test =='.dx':
if meta_data.loc[:,'xunits'][0] == '1/cm':
freq.columns = ['Wavenumber (1/cm)']
else:
freq.columns = ['Wavelength (nm)']
else:
freq.columns = ['Wavelength/Wavenumber']
##############
pp = pd.concat([p, freq], axis=1)
#########################################
#pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis=1)
# Long format: one row per (x value, loading column) pair, for one line per component.
df1 = pp.melt(id_vars=freq.columns)
fig = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
bordercolor="black", borderwidth=2))
st.plotly_chart(fig, use_container_width=True)
# Export the figure
img = pio.to_image(fig, format="png")
with open("./Report/figures/graphe_loadings.png", "wb") as f:
f.write(img)
with influence:
st.write('Influence plot')
# NOTE(review): default index=3 requires at least four score columns.
ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
leverage = dr_model.leverage_
residuals = dr_model.residuals_
# Leverage vs residuals for the chosen component, coloured by their product.
fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color=leverage[ax1]*residuals[ax1], color_continuous_scale='Blues')
fig.update_layout(xaxis_title="Leverage", yaxis_title="Residuals")
# NOTE(review): the PNG bytes are produced here but no display/write of this figure appears in the visible lines.
img = pio.to_image(fig, format="png")
st.write('T²-Hotelling vs Q residuals plot')
# NOTE(review): this rebinds `hotelling`, which earlier named a layout column; the assignment is also
# repeated two lines below. A `with hotelling:` header is presumably missing from this copy - verify.
hotelling = dr_model.hotelling_
# NOTE(review): default index=4 requires at least five score columns; the label "Component" is reused
# without a distinct key - confirm Streamlit accepts the duplicate widget.
ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
hotelling = dr_model.hotelling_
fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
st.plotly_chart(fig, use_container_width=True)
fig.write_image("./Report/figures/graphe_hotelling.png", format="png")
if dim_red_method == dim_red_methods[2] and clus_method == cluster_methods[2]: # UMAP clustered by HDBSCAN
with loadings: # Display some clustering metrics
st.write('Clustering metrics:')
# Distinct labels, with the -1 label removed before counting.
# NOTE(review): set.remove raises KeyError when -1 is absent; in the visible lines -1 labels are
# rewritten to 1000 before `labels` becomes a list - verify which value is expected here.
clusters_number = set(labels)
clusters_number.remove(-1)
st.write('Optimal number of clusters = ' + str(len(clusters_number)))
st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
# NOTE(review): if `labels` is a plain list at this point, `labels==-1` evaluates to a single bool rather
# than an element-wise mask, so t[labels==-1] would not select the unclassified rows - confirm.
st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')