# 1-samples_selection.py

from Packages import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *

# Make sure the "interface" key exists in the session state (None when it was
# never set) and hide the "Predictions" page when the simple interface is on.
interface_mode = st.session_state.get('interface')
st.session_state["interface"] = interface_mode
if interface_mode == 'simple':
    hide_pages("Predictions")

################################### Data Loading and Visualization ########################################
# Top section: a bordered container followed by a 3:1 pair of columns.
# col2 is the first (wider) column and hosts the spectra plot; col1 is the
# second (narrower) column and hosts the file loader.
# NOTE(review): the columns are created with st.columns, not
# container1.columns, so they are presumably rendered outside the bordered
# container — confirm this is the intended layout.
container1 = st.container(border=True)
col2, col1 = st.columns([3, 1])
col1.header("Data Loading", divider='blue')
col2.header("Spectral Data Visualization", divider='blue')


# Bottom section: a bordered container holding the section header, followed by
# two rows of columns used by the exploratory analysis further down
# (scores / loadings / option selectors, then influence / hotelling / qexp).
# NOTE(review): same remark as above — these columns come from st.columns,
# not container2.columns; verify.
container2 = st.container(border=True)
container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
scores, loadings, pc = st.columns([2, 3, 0.5])
influence, hotelling, qexp = st.columns([2, 2, 1])


# Load the uploaded spectra (CSV matrix or .dx file) into `data_import`, then
# plot every spectrum and show a small summary table.
# Names set here and read later in the file: sselectx_csv, test, data_import,
# data (DxRead result, .dx only), categorical_data / scaled_values (CSV only).
with container1:
    # loader for the file containing NIRS spectra
    sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
    if sselectx_csv is not None:
        # Use the LAST extension, lower-cased, so that names containing
        # several dots (e.g. "run.2024.csv") or upper-case suffixes are still
        # routed to the right reader.
        test = os.path.splitext(sselectx_csv.name)[1].lower()
        if test == '.csv':
            with col1:
                # NOTE(review): the detection helpers receive 'data/<name>'
                # rather than the uploaded buffer — presumably the file is
                # expected to also exist under ./data; confirm.
                sep_options = [";", ","]
                detected_sep = str(find_delimiter('data/'+sselectx_csv.name))
                # Fall back to the first option when the detected value is not
                # one of the choices, instead of raising ValueError.
                sep_index = sep_options.index(detected_sep) if detected_sep in sep_options else 0
                # Select list for CSV delimiter
                psep = st.selectbox("Select csv separator - _detected_: " + detected_sep, options=sep_options, index=sep_index, key=9)

                hdr_options = ["no", "yes"]
                detected_hdr = str(find_col_index('data/'+sselectx_csv.name))
                hdr_index = hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0
                # Select list for CSV index column yes / no
                phdr = st.selectbox("indexes column in csv? - _detected_: " + detected_hdr, options=hdr_options, index=hdr_index, key=31)

                # index_col=0 uses the first column as index; False disables it.
                col = 0 if phdr == 'yes' else False
                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
                data_import, categorical_data, scaled_values = col_cat(data_import)
                st.success("The data have been loaded successfully", icon="✅")

        elif test == '.dx':
            # DxRead takes a path, so the upload is first written to a
            # temporary file. The file is closed BEFORE it is parsed so the
            # content is fully flushed to disk (and so it can be reopened on
            # platforms that forbid opening an already-open temp file).
            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
                tmp.write(sselectx_csv.read())
                tmp_path = tmp.name
            try:
                with col1:
                    data = DxRead(path = tmp_path)
                    data_import = data.specs_df_
                    # NOTE(review): unlike the CSV branch, this branch does not
                    # assign categorical_data / scaled_values, which the UMAP
                    # option further down in this file reads — confirm.
                    st.success("The data have been loaded successfully", icon="✅")
            finally:
                # Always remove the temporary file, even if parsing failed.
                os.unlink(tmp_path)

        if test in ('.csv', '.dx'):
            ## Visualize spectra (shared by both file types)
            with col2:
                fig, ax = plt.subplots(figsize = (30,7))
                data_import.T.plot(legend=False, ax = ax, color = 'blue')
                ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
                ax.set_ylabel('Signal', fontsize=18)
                plt.margins(x = 0)
                st.pyplot(fig)

                st.write("Summary")
                info = pd.DataFrame({'N':[data_import.shape[0]],
                                    'Min': [np.min(data_import)],
                                    'Max':[np.max(data_import)],}, index = ['Values']).T
                # rename_axis returns a new frame; keep the result so the
                # index label is actually applied.
                info = info.rename_axis('information')
                st.table(data=info)


######################################################################################

############################## Exploratory data analysis ###############################
# Exploratory analysis of the loaded spectra: the user picks a dimensionality
# reduction technique and, optionally, a clustering technique; the scores are
# drawn in 3D and, for PCA, loadings / influence / T²-vs-residuals plots are
# added.
# Options offered in the two select boxes ('' means "nothing selected").
# NOTE(review): 'NMF' is offered but no branch below builds a model for it, so
# choosing it ends in the "Select a dimensionality reduction technique" message.
plot_type = ['', 'PCA', 'UMAP', 'NMF']
cluster_methods = ['', 'Kmeans', 'HDBSCAN', 'AP']
with container2:
    if sselectx_csv is not None:
        with pc:
            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
            # Build the model for the selected technique
            if type_plot == 'PCA':
                model = LinearPCA(data_import, Ncomp=5)
            elif type_plot == 'UMAP':
                # NOTE(review): scaled_values / categorical_data are only
                # assigned by the CSV loading branch earlier in this file —
                # confirm behaviour for .dx uploads.
                model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)

        if type_plot in ['PCA', 'UMAP']:
            if type_plot == 'PCA':
                # three select lists to choose which components to plot
                axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
                axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
                axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
            else:
                # UMAP: the first three score columns are always used
                axis1, axis2, axis3 = 0, 1, 2

            if type_cluster == 'Kmeans':
                # cluster on the three displayed score columns only
                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
                cl = Sk_Kmeans(scsc, max_clusters = 30)

            elif type_cluster == 'HDBSCAN':
                optimized_hdbscan = Hdbscan(model.scores_raw_)
                labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_

            with scores:
                t = model.scores_
                if type_cluster == 'Kmeans':
                    st.write('Scree plot')
                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
                    st.plotly_chart(fig2)

                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
                    # Kept under its own name so the DxRead object stored in
                    # `data` by the loading section is not overwritten.
                    cluster_data, colors = cl.fit_optimal(nclusters=ncluster)
                    st.write('Scores plot')
                    fig = px.scatter_3d(cluster_data, x=axis1, y=axis2, z = axis3, color=colors)

                elif type_cluster == 'HDBSCAN':
                    st.write('plot HDBSCAN clustering')
                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
                    st.write('Optimal number of clusters = ' + str(len(set(labels))))
                    st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
                    st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')

                else:
                    if type_cluster == 'AP':
                        # No AP model is built anywhere in this section; the
                        # previous code reached `cl.inertia_` here and failed
                        # with a NameError. Tell the user and fall back to the
                        # plain scores plot.
                        st.warning('AP clustering is not available yet; showing the scores without clustering.')
                    if test == '.dx':
                        # colour the samples by one of the .dx metadata fields
                        filter_options = ['origin', 'date', 'time', 'spectrometer/data system']
                        col = st.selectbox('filter', options= filter_options)
                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
                    else:
                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )

                # same marker size for every variant of the scores plot
                fig.update_traces(marker=dict(size=4))
                st.plotly_chart(fig)


            if type_plot == 'PCA':
                with loadings:
                    st.write('Loadings plot')
                    p = model.loadings_
                    # add a positional 'wl' column (0..n-1) to use as x-axis,
                    # then reshape to long format: one line per component
                    pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
                    df1 = pp.melt(id_vars="wl")

                    fig = px.line(df1, x = 'wl', y = 'value', color='variable')
                    fig.update_layout(
                        legend=dict(x=1, y=0,
                                    font=dict(
                                        family="Courier", size=12, color="black"),
                                    bordercolor="Black", borderwidth=2)
                    )
                    st.plotly_chart(fig, use_container_width = True)


                with influence:
                    st.write('Influence plot')
                    ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
                    leverage = model.leverage_
                    residuals = model.residuals_
                    fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
                    st.plotly_chart(fig)

                with hotelling:
                    st.write('T²-Hotelling vs Q residuals plot')
                    ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)

                    # Stored under a separate name so the `hotelling` layout
                    # column is not rebound to the model data.
                    hotelling_t2 = model.hotelling_
                    fig = px.scatter(t, x=hotelling_t2[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
                    st.plotly_chart(fig)

        else:
            st.markdown('Select a dimensionality reduction technique from the dropdown list')