# app.py

import streamlit as st
# help on streamlit input https://docs.streamlit.io/library/api-reference/widgets
from PIL import Image
# emoji shortcodes usable for page_icon: https://www.webfx.com/tools/emoji-cheat-sheet/
# Page configuration: browser tab title, goat emoji icon and wide layout.
# NOTE(review): this call sits between the imports, presumably so that it runs
# before any other Streamlit command - confirm before moving it below the imports.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans as km
from sklearn.metrics import pairwise_distances_argmin_min
from application_functions import pca_maker, model, predict, find_delimiter, umap_maker

# load images for web interface
# Forward slashes are used so the paths resolve on every platform (Windows
# accepts them too) and so the literals contain no invalid escape sequences
# such as "\s", "\g" or "\p".
img_sselect = Image.open("images/sselect.JPG")
img_general = Image.open("images/general.JPG")
img_predict = Image.open("images/predict.JPG")

# Table of contents rendered in the left sidebar: one markdown link per page section
for toc_entry in (
    "[Sample Selection](#sample-selection)",
    "[Model Creation](#create-a-model)",
    "[Prediction](#predict)",
):
    st.sidebar.markdown(toc_entry)
# Page header: institution subtitle, application title, one-line description and banner image
page_header = st.container()
page_header.subheader("Plateforme d'Analyses Chimiques pour l'Ecologie :goat:")
page_header.title("NIRS Utils")
page_header.write("Sample selections, Modelisations & Predictions using [Pinard](https://github.com/GBeurier/pinard) and PACE NIRS Database.")
page_header.image(img_general)
# horizontal rule separating the header from the first module
st.write("---")
# Sample Selection module
# Workflow: upload a spectra CSV -> project it with PCA or UMAP -> cluster the
# chosen 2-D projection -> pick representative samples per cluster -> plot and
# optionally export the selected sample names.
with st.container():
    st.header("Sample Selection")
    st.image(img_sselect)
    st.write("Sample selection using PCA and K-Means algorythms")
    # split 2 columns 4:1 ratio
    scatter_column, settings_column = st.columns((4, 1))
    scatter_column.write("**Multi-Dimensional Analysis**")
    settings_column.write("**Settings**")
    # loader for csv file containing NIRS spectra
    sselectx_csv = settings_column.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
    if sselectx_csv is None:
        # nothing uploaded yet: this is the only situation where the hint applies
        scatter_column.write("_Please Choose a file_")
    else:
        # Select list for CSV delimiter
        separators = [";", ","]
        # Detect the separator once and reuse the result for the label and the default.
        # NOTE(review): find_delimiter receives a path under data/, not the uploaded
        # buffer - confirm the uploaded file is expected to exist there.
        detected_sep = str(find_delimiter('data/' + sselectx_csv.name))
        # fall back to the first option when the detected separator is not offered
        default_sep = separators.index(detected_sep) if detected_sep in separators else 0
        psep = settings_column.selectbox("Select csv separator - _detected_: " + detected_sep, options=separators, index=default_sep, key=9)
        # Select list for CSV header True / False
        phdr = settings_column.selectbox("indexes column in csv?", options=["no", "yes"], key=31)
        # index_col=False asks pandas not to use the first column as the index
        col = 0 if phdr == 'yes' else False

        data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
        # Select type of plot
        plot_type = ['', 'pca', 'umap']
        type_plot = settings_column.selectbox("Dimensional reduction: ", options=plot_type, key=37)
        if type_plot == 'umap':
            # compute UMAP - umap_maker in application_functions.py
            pc_data, cat_cols, pc_cols = umap_maker(data_import)
        elif type_plot == 'pca':
            # compute PCA - pca_maker function in application_functions.py
            pc_data, cat_cols, pc_cols = pca_maker(data_import)
        if type_plot in ('umap', 'pca'):
            # add 2 select lists to choose which component to plot
            pc_1 = settings_column.selectbox("First Principle Component", options=pc_cols, index=0)
            pc_2 = settings_column.selectbox("Second Principle Component", options=pc_cols, index=1)
            # if categorical variables exist, add 2 select lists to choose the categorical variables to color the PCA
            if cat_cols[0] == "no categories":
                scatter_column.plotly_chart(px.scatter(
                    data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    hover_name=pc_data.index, title="PC plot of sample spectra"))
            else:
                categorical_variable = settings_column.selectbox("Variable Select", options=cat_cols)
                categorical_variable_2 = settings_column.selectbox("Second Variable Select (hover data)", options=cat_cols)
                scatter_column.plotly_chart(px.scatter(
                    data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    color=categorical_variable, hover_data=[categorical_variable_2],
                    hover_name=pc_data.index, title="PC plot of sample spectra"))
            # Clustering method
            cluster_type = ['', 'k-means']
            # cluster_type = ['k-means', 'umap'] # uncomment if more clustering algorithms available
            type_cluster = settings_column.selectbox("Clustering method: ", options=cluster_type, key=38)
            # clustering via K-Means
            if type_cluster == 'k-means':
                # coordinates of every sample on the two selected components
                projection = pc_data.loc[:, [pc_1, pc_2]]
                ## K-Means choose number of clusters: elbow curve of the within-cluster sum of squares
                cluster_max = settings_column.slider("Max clusters (K-Means)", min_value=2, max_value=100, value=50, format="%i")
                # include cluster_max itself so the curve covers every value selectable below
                clusters_sample = np.arange(2, cluster_max + 1)
                wcss_samples = []
                for i in clusters_sample:
                    kmeans_samples = km(n_clusters=i, init='k-means++', random_state=42)
                    kmeans_samples.fit(projection)
                    wcss_samples.append(kmeans_samples.inertia_)
                settings_column.plotly_chart(px.line(x=clusters_sample, y=wcss_samples, title="K-Means clusters nb sel", width=200))
                ## Draw clustering
                if cluster_max > 2:
                    # the default must not exceed the maximum chosen above
                    nb_select = settings_column.slider("Choose cluster number (K-Means)", min_value=2, max_value=cluster_max, value=min(5, cluster_max), format="%i")
                else:
                    # only one value is possible, so no slider is shown
                    nb_select = 2
                kmeans_samples = km(n_clusters=nb_select, random_state=42)
                kmeans_samples.fit(projection)
                cluster_labels = kmeans_samples.labels_
                # choose between cluster centered sample and random samples
                selection = settings_column.select_slider('Centered samples or random ones', options=['center', 'random'])
                # export: one pandas Index of sample names per group; selected_positions: row numbers of the picked samples
                export = []
                selected_positions = []
                scatter_column.write("Selected samples for chemical analysis:")
                if selection == 'center':
                    # one sample per cluster: the sample nearest to each cluster center
                    closest, _ = pairwise_distances_argmin_min(kmeans_samples.cluster_centers_, projection)
                    selected_index = pc_data.index[closest]
                    scatter_column.dataframe(selected_index, use_container_width=False)
                    export.append(selected_index)
                    selected_positions.extend(closest.tolist())
                elif selection == 'random':
                    # min_value=1 because the value is used as a number of clusters below
                    selection_number = settings_column.number_input('How many samples per cluster?', min_value=1, step=1, value=3)
                    for i in np.unique(cluster_labels):
                        # row numbers of the samples assigned to cluster i
                        member_positions = np.flatnonzero(cluster_labels == i)
                        if len(member_positions) >= selection_number:
                            # another k-means inside the cluster so that the selected samples are spread apart
                            members = projection.iloc[member_positions]
                            kmeans_selected_samples = km(n_clusters=selection_number, random_state=42)
                            kmeans_selected_samples.fit(members)
                            # search only among this cluster's members so no sample from another cluster is picked
                            closest_selected_samples, _ = pairwise_distances_argmin_min(kmeans_selected_samples.cluster_centers_, members)
                            picked_positions = member_positions[closest_selected_samples]
                        else:
                            # cluster smaller than the requested number: keep all of its samples
                            picked_positions = member_positions
                        export.append(pc_data.index[picked_positions])
                        selected_positions.extend(picked_positions.tolist())
                    # display a matrix of selected samples (one column per cluster)
                    scatter_column.write(pd.DataFrame(export).T)
                # convert cluster number to text for discrete coloring; astype returns a copy,
                # so the fitted estimator's labels_ are left untouched
                point_labels = cluster_labels.astype(str)
                point_labels[np.asarray(selected_positions, dtype=int)] = 'selected'
                # plot the projection with colored clusters and selected samples
                graph_selected = px.scatter(
                    data_frame=pc_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    color=point_labels, hover_name=pc_data.index,
                    title="PC projection with K-Means Clusters and selected samples")
                scatter_column.plotly_chart(graph_selected)
                # button to export the names of selected samples - by cluster if random - in a csv
                if scatter_column.button('Export'):
                    pd.DataFrame(export).T.to_csv('./data/Samples_for_Chemical_Analysis.csv')
            # clustering via UMAP / HDBSCAN
            # NOTE: not reachable while 'umap' is absent from cluster_type above
            elif type_cluster == 'umap':
                import hdbscan
                scatter_column.dataframe(pc_data)
                labels = hdbscan.HDBSCAN(min_samples=4, min_cluster_size=10).fit_predict(projection if False else pc_data.loc[:, [pc_1, pc_2]])
                # keep only the samples with a non-negative label, as the original mask did
                clustered = (labels >= 0)
                clustered_data = pc_data.loc[clustered]
                graph_clustered = px.scatter(
                    data_frame=clustered_data, x=pc_1, y=pc_2, template="simple_white", height=800,
                    color=labels[clustered].astype(str), hover_name=clustered_data.index,
                    title="PC projection with HDBSCAN clusters")
                scatter_column.plotly_chart(graph_clustered)
# graphical delimiter
st.write("---")
# Model creation module
# Uploads spectra (X) and chemical values (Y), asks for the CSV options and a
# seed, then hands everything to model() from application_functions.py.
with st.container():
    st.header("Create a model")
    st.image(img_predict)
    st.write("Create a model to then predict chemical values from NIRS spectra")
    # CSV files loader
    xcal_csv = st.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    ycal_csv = st.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if xcal_csv is not None and ycal_csv is not None:
        # Select list for CSV delimiter
        separators = [";", ","]
        # Detect the separator once and reuse the result for the label and the default.
        # NOTE(review): find_delimiter receives a path under data/, not the uploaded
        # buffer - confirm the uploaded file is expected to exist there.
        detected_sep = str(find_delimiter('data/' + xcal_csv.name))
        # fall back to the first option when the detected separator is not offered
        default_sep = separators.index(detected_sep) if detected_sep in separators else 0
        sep = st.selectbox("Select csv separator - CSV Detected separator: " + detected_sep, options=separators, index=default_sep, key=0)
        # Select list for column indexes True / False
        hdr = st.selectbox("column indexes in csv?", options=["yes", "no"], key=1)
        rd_seed = st.slider("Choose seed", min_value=1, max_value=1212, value=42, format="%i")
        # Train model with model function from application_functions.py
        trained_model = model(xcal_csv, ycal_csv, sep, hdr, rd_seed)

# horizontal rule separating the model module from the prediction module
st.write("---")
# Prediction module - TO BE DONE !!!!!
# The widgets below collect a spectra file and CSV options; the button only
# registers predict as its click callback and passes it no arguments.
predict_box = st.container()
predict_box.header("Predict")
predict_box.write("---")
predict_box.write("Predict chemical values from NIRS")
NIRS_csv = predict_box.file_uploader(
    "Select NIRS Data to predict",
    type="csv",
    help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
)
psep = predict_box.selectbox("Select csv separator", options=[";", ","], key=2)
phdr = predict_box.selectbox("indexes column in csv?", options=["yes", "no"], key=3)
predict_box.button("Predict", on_click=predict)