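# NOTE: "Newer" / "Older" were navigation labels left over from a web page export;
# they are not part of the script.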
# Shared dependencies. `st` and the other library aliases used below are not
# imported explicitly in this script, so they presumably come from this star
# import — TODO confirm against Packages.
from Packages import *
# Browser tab title, icon and wide layout for this page.
# NOTE(review): this call sits between the imports, presumably so that it runs
# before any Streamlit command the following modules might issue — keep the
# ordering unless that is confirmed unnecessary.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
# Read the interface mode once and write it back, so the "interface" key always
# exists in the session state (holding None when it was never set).
interface_mode = st.session_state.get('interface')
st.session_state["interface"] = interface_mode
# In 'simple' mode, hide_pages is asked to hide "Predictions".
if interface_mode == 'simple':
    hide_pages("Predictions")
################################### Data Loading and Visualization ########################################
# Page skeleton, created in top-to-bottom display order:
#   bordered container -> [visualization | loading] row -> bordered container
#   -> [scores | loadings | pc] row -> [influence | hotelling | qexp] row.
# NOTE(review): the column rows are created with st.columns rather than on
# container1/container2 — confirm they are meant to live outside the borders.
container1 = st.container(border=True)
col2, col1 = st.columns([3, 1])
container2 = st.container(border=True)
scores, loadings, pc = st.columns([2, 3, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])

# Section titles; each one is written into its own, already existing, slot.
for _slot, _title in (
    (col1, "Data Loading"),
    (col2, "Spectral Data Visualization"),
    (container2, "Exploratory Data Analysis-Multivariable Data Analysis"),
):
    _slot.header(_title, divider='blue')
def _show_spectra_overview(spectra):
    """Plot all spectra and render a small summary table.

    Shared by the CSV and the .dx loading paths, which used to carry two
    identical copies of this code.

    Args:
        spectra: DataFrame of spectra. It is transposed before plotting, so
            every row is drawn as one line.
    """
    fig, ax = plt.subplots(figsize=(30, 7))
    spectra.T.plot(legend=False, ax=ax, color='blue')
    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
    ax.set_ylabel('Signal', fontsize=18)
    plt.margins(x=0)
    st.pyplot(fig)

    st.write("Summary")
    # Reduce column-wise and then across columns: np.min/np.max on a DataFrame
    # yield a per-column Series on pandas < 2 and a scalar on pandas >= 2, so
    # the explicit double reduction gives one overall value on both.
    info = pd.DataFrame(
        {
            'N': [spectra.shape[0]],
            'Min': [spectra.min().min()],
            'Max': [spectra.max().max()],
        },
        index=['Values'],
    ).T
    # rename_axis returns a new frame; the result has to be kept.
    info = info.rename_axis('information')
    st.table(data=info)


with container1:
    # Uploader for the file containing the NIRS spectra (.csv or .dx).
    sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
    if sselectx_csv is not None:
        # Extension taken from the LAST dot and lower-cased, so names such as
        # "run.2024.csv" or "SCAN.DX" are still routed to the right branch.
        test = os.path.splitext(sselectx_csv.name)[1].lower()

        if test == '.csv':
            with col1:
                # NOTE(review): the detection helpers receive a path under
                # data/ built from the upload's name, not the uploaded buffer —
                # presumably a same-named copy exists on disk; verify.
                csv_path = 'data/' + sselectx_csv.name
                detected_sep = str(find_delimiter(csv_path))
                detected_idx = str(find_col_index(csv_path))

                # Preselect the detected value; fall back to the first option
                # when detection returns something that is not offered.
                sep_options = [";", ","]
                psep = st.selectbox(
                    "Select csv separator - _detected_: " + detected_sep,
                    options=sep_options,
                    index=sep_options.index(detected_sep) if detected_sep in sep_options else 0,
                    key=9,
                )
                idx_options = ["no", "yes"]
                phdr = st.selectbox(
                    "indexes column in csv? - _detected_: " + detected_idx,
                    options=idx_options,
                    index=idx_options.index(detected_idx) if detected_idx in idx_options else 0,
                    key=31,
                )
                # index_col=0 uses the first column as index; False tells
                # pandas not to use any column as index.
                col = 0 if phdr == 'yes' else False

                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
                data_import, categorical_data, scaled_values = col_cat(data_import)
                st.success("The data have been loaded successfully", icon="✅")

            ## Visualize spectra
            with col2:
                _show_spectra_overview(data_import)

        elif test == '.dx':
            # DxRead takes a path, so the upload is spooled to a temporary file.
            # The handle is closed before parsing so all bytes are on disk.
            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
                tmp.write(sselectx_csv.read())
                tmp_path = tmp.name
            try:
                with col1:
                    data = DxRead(path=tmp_path)
                    data_import = data.specs_df_
                    st.success("The data have been loaded successfully", icon="✅")
            finally:
                # Remove the temporary file even when parsing fails.
                os.unlink(tmp_path)

            ## Visualize spectra
            with col2:
                _show_spectra_overview(data_import)
######################################################################################
############################## Exploratory data analysis ###############################
# Exploratory analysis: pick a dimensionality reduction and an optional
# clustering, then show the scores plot and, for PCA, the loadings, influence
# and T²-vs-residuals plots.
cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
with container2:
    if sselectx_csv is not None:
        plot_type = ['', 'PCA','UMAP', 'NMF']
        with pc:
            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
            if type_plot == 'PCA':
                model = LinearPCA(data_import, Ncomp=5)
            elif type_plot == 'UMAP':
                # NOTE(review): scaled_values and categorical_data are only
                # assigned on the CSV loading path of this script; with a .dx
                # upload this call would raise NameError — confirm the intended
                # handling.
                model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)

        # 'NMF' is offered in the list but has no model here, so it falls
        # through to the hint at the bottom like the empty choice does.
        if type_plot in ['PCA', 'UMAP']:
            if type_plot == 'PCA':
                # Three select lists to choose which components to plot.
                axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
                axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
                axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
            else:
                axis1, axis2, axis3 = 0, 1, 2

            if type_cluster == 'Kmeans':
                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
                cl = Sk_Kmeans(scsc, max_clusters = 30)
            elif type_cluster == 'HDBSCAN':
                optimized_hdbscan = Hdbscan(model.scores_raw_)
                labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_

            with scores:
                t = model.scores_
                if type_cluster == 'Kmeans':
                    st.write('Scree plot')
                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
                    st.plotly_chart(fig2)
                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
                    # Kept under its own name: `data` holds the DxRead object
                    # whose metadata colours the unclustered .dx plot.
                    clustered, colors = cl.fit_optimal(nclusters=ncluster)
                    st.write('Scores plot')
                    fig = px.scatter_3d(clustered, x=axis1, y=axis2, z = axis3, color=colors)
                elif type_cluster == 'HDBSCAN':
                    st.write('plot HDBSCAN clustering')
                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
                    fig.update_traces(marker=dict(size=4))
                    # Label -1 marks unclassified samples (see the count
                    # below), so it is not counted as a cluster.
                    n_clusters = len({lbl for lbl in labels if lbl != -1})
                    n_unclassified = len(t[labels==-1])
                    st.write('Optimal number of clusters = ' + str(n_clusters))
                    st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
                    st.write('Unclassified samples: ' + str(n_unclassified) + ' on ' + str(len(t)) + ' samples (' + str(round(n_unclassified/len(t)*100, 1)) + '%).')
                else:
                    if type_cluster == 'AP':
                        # No AP model is built in this script; say so instead
                        # of failing on an undefined clustering object.
                        st.warning('AP clustering is not available here; showing the scores without clustering.')
                    if test == '.dx':
                        # Colour the samples by one of the .dx metadata fields.
                        metadata_fields = ['origin', 'date', 'time', 'spectrometer/data system']
                        col = st.selectbox('filter', options= metadata_fields)
                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
                    else:
                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
                    fig.update_traces(marker=dict(size=4))
                st.plotly_chart(fig)

            if type_plot == 'PCA':
                with loadings:
                    st.write('Loadings plot')
                    p = model.loadings_
                    # Add a running position column 'wl' to use as x axis, then
                    # reshape to long form: one line per loading column.
                    pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
                    df1 = pp.melt(id_vars="wl")
                    fig = px.line(df1, x = 'wl', y = 'value', color='variable')
                    fig.update_layout(
                        legend=dict(x=1, y=0,
                                    font=dict(
                                        family="Courier", size=12, color="black"),
                                    bordercolor="Black", borderwidth=2)
                    )
                    st.plotly_chart(fig, use_container_width = True)

                with influence:
                    st.write('Influence plot')
                    ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
                    leverage = model.leverage_
                    residuals = model.residuals_
                    fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
                    st.plotly_chart(fig)

                with hotelling:
                    st.write('T²-Hotelling vs Q residuals plot')
                    ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
                    # Separate name so the `hotelling` layout column is not
                    # replaced by the statistic.
                    hotelling_t2 = model.hotelling_
                    fig = px.scatter(t, x=hotelling_t2[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
                    st.plotly_chart(fig)
        else:
            st.markdown('Select a dimensionality reduction technique from the dropdown list')