Skip to content
Snippets Groups Projects
1-samples_selection.py 9.2 KiB
Newer Older
  • Learn to ignore specific revisions
  • from Packages import *
    # Page-level configuration: browser tab title, icon and wide layout.
    # NOTE(review): this call sits between the import lines, presumably so it runs
    # before any Streamlit command the modules imported below might issue — confirm.
    st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
    from Modules import *
    from Class_Mod.DATA_HANDLING import *
    
    
    # Write the current 'interface' value back under the same key so the key is
    # present in session_state before it is read on the next line.
    st.session_state["interface"] = st.session_state.get('interface')
    # In the 'simple' interface, call hide_pages for the "Predictions" page.
    if st.session_state["interface"] == 'simple':
        hide_pages("Predictions")
    
    
    ################################### Data Loading and Visualization ########################################
    # Bordered container used later as the context of the data-loading section.
    container1 = st.container(border=True)
    # col2 receives the first column (relative width 3) and col1 the second
    # (relative width 1).
    # NOTE(review): the columns are created with st.columns rather than
    # container1.columns, so they are presumably not nested inside container1 — confirm
    # this is the intended layout.
    col2, col1 = st.columns([3, 1])
    
    DIANE's avatar
    DIANE committed
    # Section titles for the loader column (col1) and the spectra preview column (col2).
    col1.header("Data Loading", divider='blue')
    col2.header("Spectral Data Visualization", divider='blue')
    
    
    
    # Second bordered container, holding the header of the exploratory-analysis section.
    container2 = st.container(border=True)
    container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
    # Two rows of columns used by the exploratory-analysis section:
    #   row 1 -> scores plot, loadings plot, and the narrow `pc` column with the selectors
    #   row 2 -> influence plot, Hotelling plot, and `qexp`
    # NOTE(review): `qexp` is not referenced anywhere in the visible part of this script.
    # NOTE(review): both rows come from st.columns, not container2.columns — presumably
    # they are rendered outside container2; confirm.
    scores, loadings, pc = st.columns([2, 3, 0.5])
    influence, hotelling, qexp = st.columns([2, 2, 1])
    
    
    with container1:
        # Uploader for the file containing the NIRS spectra (.csv or .dx).
        sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)

        if sselectx_csv is not None:
            # Use the last extension, lower-cased. Slicing from the first '.' broke on
            # names such as "run.2024.csv" (neither branch matched, so `data_import`
            # was never defined).
            test = os.path.splitext(sselectx_csv.name)[1].lower()

            if test == '.csv':
                with col1:
                    # Run each detection helper once and reuse the result for both the
                    # label and the default selection.
                    # NOTE(review): the helpers are given a path under 'data/', so they
                    # presumably expect a copy of the uploaded file there — confirm.
                    csv_path = 'data/' + sselectx_csv.name
                    detected_sep = str(find_delimiter(csv_path))
                    detected_idx = str(find_col_index(csv_path))

                    # Fall back to the first option when detection returns something
                    # outside the list instead of raising ValueError from list.index.
                    sep_options = [";", ","]
                    sep_default = sep_options.index(detected_sep) if detected_sep in sep_options else 0
                    psep = st.selectbox("Select csv separator - _detected_: " + detected_sep, options=sep_options, index=sep_default, key=9)

                    idx_options = ["no", "yes"]
                    idx_default = idx_options.index(detected_idx) if detected_idx in idx_options else 0
                    phdr = st.selectbox("indexes column in csv? - _detected_: " + detected_idx, options=idx_options, index=idx_default, key=31)

                    # index_col=0 uses the first column as row labels; False disables it.
                    col = 0 if phdr == 'yes' else False
                    data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)

            elif test == '.dx':
                # DxRead takes a path, so the upload is spooled to a temporary file.
                # The file is closed (and therefore flushed) before it is parsed, and
                # it is removed even when parsing fails.
                with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
                    tmp.write(sselectx_csv.read())
                    tmp_path = tmp.name
                try:
                    data = DxRead(path = tmp_path)
                    data_import = data.specs_df_
                finally:
                    os.unlink(tmp_path)

            else:
                # Not reachable through the uploader's type filter; stop rather than
                # continue without `data_import`.
                col1.error("Unsupported file type: " + test)
                st.stop()

            # The icon was an empty string in this copy of the file (the emoji was
            # presumably lost); a check mark is used so the call receives a valid icon.
            col1.success("The data have been loaded successfully", icon="✅")

            ## Visualize spectra (shared by both file types)
            with col2:
                fig, ax = plt.subplots(figsize = (30,7))
                data_import.T.plot(legend=False, ax = ax, color = 'blue')
                ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
                ax.set_ylabel('Signal', fontsize=18)
                ax.margins(x = 0)
                st.pyplot(fig)

                st.write("Summary")
                # Column-wise reduction followed by a reduction over the columns gives a
                # single overall value regardless of the installed pandas version.
                info = pd.DataFrame({'N':[data_import.shape[0]],
                                    'Min': [data_import.min().min()],
                                    'Max':[data_import.max().max()],}, index = ['Values']).T
                # rename_axis returns a new frame; the result has to be kept.
                info = info.rename_axis('information')
                st.table(data=info)
    
    
        
    
            
    
    ######################################################################################
    
    ############################## Exploratory data analysis ###############################
    
    DIANE's avatar
    DIANE committed
    # Choices offered by the two selectors; '' means "nothing selected".
    plot_type=['', 'PCA','UMAP', 'NMF']
    cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']

    with container2:
        if sselectx_csv is not None:
            with pc:
                type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
                type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
                # Fit the selected model. No model is built for '' or 'NMF'.
                if type_plot == 'PCA':
                    model = LinearPCA(data_import, Ncomp=5)
                elif type_plot =='UMAP':
                    model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)

            if type_plot in ['PCA', 'UMAP']:
                # Three selectors to choose which components are plotted.
                components = model.scores_.columns
                axis1 = pc.selectbox("x-axis", options = components, index=0)
                axis2 = pc.selectbox("y-axis", options = components, index=1)
                axis3 = pc.selectbox("z-axis", options = components, index=2)

                if type_cluster == 'Kmeans':
                    # Cluster on the three displayed components only.
                    scsc = model.scores_.loc[:, [axis1, axis2, axis3]]
                    cl = Sk_Kmeans(scsc, max_clusters = 30)

                elif type_cluster == 'HDBSCAN':
                    from Class_Mod.HDBSCAN_Clustering import HDBSCAN_function

                    labels, hdbscan_score = HDBSCAN_function(data_import, min_cluster_size=10)

                with scores:
                    t = model.scores_
                    marker_style = dict(size=4)

                    # Only K-means builds `cl`. 'AP' used to enter this branch as well
                    # and failed with a NameError.
                    if type_cluster == 'Kmeans':
                        st.write('Scree plot')
                        fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
                        st.plotly_chart(fig2)

                        ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
                        # Named `clustered` so the DxRead object held in `data` is not
                        # overwritten.
                        clustered, colors = cl.fit_optimal(nclusters=ncluster)
                        st.write('Scores plot')
                        fig = px.scatter_3d(clustered, x=axis1, y=axis2, z = axis3, color=colors)

                    else:
                        if type_cluster == 'HDBSCAN':
                            st.write('plot HDBSCAN clustering')
                            fig_hdbscan = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
                            fig_hdbscan.update_traces(marker=marker_style)
                            st.plotly_chart(fig_hdbscan)
                            st.write('DBCV score = ' + str(hdbscan_score))
                        elif type_cluster == 'AP':
                            st.warning('AP clustering is not available yet; showing the scores without clusters')

                        # Scores plot without cluster colours. It is built for every
                        # non-K-means choice, so `fig` always belongs to this section;
                        # previously, with no clustering selected, the stale matplotlib
                        # figure of the spectra preview was passed to plotly_chart.
                        if test == '.dx':
                            # Metadata fields of the .dx file that can colour the points.
                            md_fields = ['origin', 'date', 'time', 'spectrometer/data system']
                            col = st.selectbox('filter', options= md_fields)
                            fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
                        else:
                            fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )

                    fig.update_traces(marker=marker_style)
                    st.plotly_chart(fig)

                # Loadings, influence and Hotelling plots are shown for PCA only.
                if type_plot =='PCA':
                    with loadings:
                        st.write('Loadings plot')
                        p = model.loadings_
                        # Add a positional 'wl' column to serve as the x axis, then
                        # reshape to long format: one line per component.
                        pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
                        df1 = pp.melt(id_vars="wl")

                        fig = px.line(df1, x = 'wl', y = 'value', color='variable')
                        fig.update_layout(
                            legend=dict(x=1, y=0,
                                        font=dict(
                                            family="Courier", size=12, color="black"),
                                        bordercolor="Black", borderwidth=2)
                        )

                        st.plotly_chart(fig, use_container_width = True)

                    leverage = model.leverage_
                    residuals = model.residuals_

                    with influence:
                        st.write('Influence plot')
                        ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
                        fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")

                        st.plotly_chart(fig)

                    with hotelling:
                        st.write('T²-Hotelling vs Q residuals plot')
                        ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)

                        # Kept under its own name so the `hotelling` layout column is
                        # not rebound to model data.
                        hotelling_t2 = model.hotelling_
                        fig = px.scatter(t, x=hotelling_t2[ax2], y=residuals[ax2]).update_layout(xaxis_title="Hotelling T²",yaxis_title="Residuals")

                        st.plotly_chart(fig)

            else:
                # Shown when no supported technique is selected ('' or 'NMF'). This
                # branch used to hang off the PCA test, so it appeared for UMAP and
                # never for an empty selection.
                st.markdown('Select a dimensionality reduction technique from the dropdown list')