Skip to content
Snippets Groups Projects
1-samples_selection.py 34.4 KiB
Newer Older
DIANE's avatar
DIANE committed
from utils.data_parsing import meta_st
DIANE's avatar
DIANE committed
from common import *
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
DIANE's avatar
DIANE committed


DIANE's avatar
UI  
DIANE committed
# layout
DIANE's avatar
DIANE committed
UiComponents(pagespath=pages_folder, csspath=css_file, imgpath=image_path,
             header=True, sidebar=True, bgimg=False, colborders=True)
st.header("Calibration Subset Selection")  # page title
st.markdown(
    "Select a representative subset of samples for NIR calibration development.")
DIANE's avatar
DIANE committed
c1, c2 = st.columns([3, 1])
DIANE's avatar
DIANE committed
c1.image("./images/sample selection.png",
         use_column_width=True)  # graphical abstract
DIANE's avatar
DIANE committed

DIANE's avatar
UI  
DIANE committed

DIANE's avatar
DIANE committed
# empty temp figures
DIANE's avatar
DIANE committed
report_path = Path("report")
report_path_rel = Path("./report")

DIANE's avatar
DIANE committed
# ~~~~~~~~~~~~~~~~ clean the analysis results dir ~~~~~~~~~~~~~~~~
DIANE's avatar
DIANE committed
HandleItems.delete_files(keep=['.py', '.pyc', '.bib', '.tex'])
HandleItems.delete_dir(delete=['report/results/model'])
DIANE's avatar
DIANE committed

################################### I - Data Loading and Visualization ########################################
# loader for datafile
DIANE's avatar
DIANE committed
file = c2.file_uploader("Data file", type=[
                        "csv", "dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
# Preallocation of data structure
DIANE's avatar
DIANE committed
spectra = DataFrame()
meta_data = DataFrame()
DIANE's avatar
DIANE committed
md_df_st_ = DataFrame()
tcr = DataFrame()
sam = DataFrame()
sam1 = DataFrame()
DIANE's avatar
DIANE committed
selected_samples = DataFrame()
DIANE's avatar
DIANE committed
selected = []
DIANE's avatar
DIANE committed
l1 = []
DIANE's avatar
DIANE committed
color_palette = None
DIANE's avatar
DIANE committed
dr_model = None  # dimensionality reduction model
cl_model = None  # clustering model
selection = None
selection_number = "None"
DIANE's avatar
DIANE committed
# Preallocate as an empty frame instance (the bare class has no data and its
# `.empty` is a property object, which is always truthy).
samples_df_chem = DataFrame()
DIANE's avatar
DIANE committed
selected_samples = []
selected_samples_idx = []
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
# Nothing can be parsed until the user has uploaded a file; the loader code
# that follows (indented one level) belongs to the `else` branch.
if not file:
    c2.info('Info: Please load data file !')
else:
    # Upload name with its extension removed. The extension is bound to a
    # local first: reusing double quotes inside a double-quoted f-string is a
    # syntax error before Python 3.12.
    extension = file.name.split(".")[-1]
    userfilename = file.name.replace(f".{extension}", '')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    match file.name.split(".")[-1]:
        # Load .csv file
        case 'csv':
DIANE's avatar
DIANE committed
            with c2:
DIANE's avatar
DIANE committed
                # ~~~~~~~~ select file dialect
DIANE's avatar
DIANE committed
                c2_1, c2_2 = st.columns([.5, .5])
                with c2_1:
DIANE's avatar
DIANE committed
                    dec = st.radio('decimal:', options=[
                                   ".", ","], horizontal=True)
                    sep = st.radio("separator:", options=[
                                   ";", ","], horizontal=True)
DIANE's avatar
DIANE committed
                with c2_2:
DIANE's avatar
DIANE committed
                    hdr = st.radio("header: ", options=[
                                   "yes", "no"], horizontal=True)
                    names = st.radio("samples name:", options=[
                                     "yes", "no"], horizontal=True)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                hdr = 0 if hdr == "yes" else None
                names = 0 if names == "yes" else None
                hash_ = ObjectHash(current=None, add=[
                                   file.getvalue(), hdr, names, dec, sep])
DIANE's avatar
DIANE committed

                # ~~~~~~~~ read the csv file
DIANE's avatar
DIANE committed
                # Parse the uploaded csv with the dialect chosen above and tell
                # the user whether the result looks like spectral data
                # (more than 20 columns).
                try:
                    spectra, meta_data = csv_parser(
                        path=file, decimal=dec, separator=sep, index_col=names, header=hdr, change=hash_)
                    if spectra.shape[1] > 20:
                        st.success(
                            "The data have been loaded successfully and spectral data was successfully detected, you might need to tune dialect.", icon="")
                    else:
                        st.warning(
                            "The data have been loaded successfully and but spectral data was not detected.")

                # `Exception` rather than a bare `except:` so that
                # BaseException subclasses (KeyboardInterrupt, SystemExit, ...)
                # are not reported as a dialect problem.
                except Exception:
                    st.error('''Error: The format of the file does not correspond to the expected dialect settings.
                              To read the file correctly, please adjust the dialect parameters.''')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        # Load .dx file
DIANE's avatar
DIANE committed
        case 'dx':
DIANE's avatar
DIANE committed
            with c2:
DIANE's avatar
DIANE committed
                with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
DIANE's avatar
DIANE committed
                    tmp.write(file.read())
                    dxfile = tmp.name
DIANE's avatar
DIANE committed
                    hash_ = ObjectHash(current=None, add=file.getvalue())
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                # Parse the temporary .dx copy written above; only the spectra
                # and metadata blocks are requested from the parser.
                try:
                    from utils.data_parsing import jcamp_parser
                    spectra, _, meta_data = jcamp_parser(
                        path=dxfile, include=['x_block', 'meta'], change=hash_)
                    st.success(
                        "The data have been loaded successfully", icon="")
                # `Exception` rather than a bare `except:` so that
                # BaseException subclasses (KeyboardInterrupt, SystemExit, ...)
                # are not reported as a parsing problem.
                except Exception:
                    st.error(
                        '''Error: an issue was encontered while parsing the uploaded file.''')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    if len(spectra.index) > len(set(spectra.index)):
        c2.warning(
            "Duplicate sample IDs found. Suffixes (#1, #2, ...) have been added to duplicate IDs.")
DIANE's avatar
DIANE committed
        meta_data['names'] = spectra.index
DIANE's avatar
DIANE committed
        # Keep all duplicates (True for replicated)
        mask = spectra.index.duplicated(keep=False)
DIANE's avatar
DIANE committed
        # For the duplicated sample_ids, append a '#<n>' suffix (#1, #2, etc.)
DIANE's avatar
DIANE committed
        spectra.index = spectra.index.where(~mask,
                                            spectra.groupby(spectra.index).cumcount().add(1).astype(str).radd(spectra.index.astype(str) + '#'))

DIANE's avatar
DIANE committed
if not spectra.empty:
    if not meta_data.empty:
        meta_data.index = [str(i) for i in spectra.index]
        md_df_st_ = meta_st(meta_data)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        if md_df_st_.shape[1] > 0:
DIANE's avatar
DIANE committed
            n_colors = 30
DIANE's avatar
DIANE committed
            # Evenly spaced hues
            hues = np.linspace(0, 1, n_colors, endpoint=False)
DIANE's avatar
DIANE committed
            import random
            random.seed(42)
            import matplotlib.colors as mcolors
            colorslist = [mcolors.rgb2hex(plt.cm.hsv(hue)) for hue in hues]
            random.shuffle(colorslist)

        else:
            colorslist = None

    if spectra.select_dtypes(include=['float']).shape[1] < 50:
DIANE's avatar
DIANE committed
        c2.warning(
            'Error: Your data is not multivariable, check the number of variables in your data or well tune the dialect.')
        # Discard the data by resetting to an empty frame *instance*, so the
        # later `spectra.empty` checks evaluate to a real boolean.
        spectra = DataFrame()
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    n_specs = spectra.shape[0]  # n_samples
    nwls = spectra.shape[1]  # nwl
    wls = list(spectra.columns)  # colnames
DIANE's avatar
DIANE committed
    spectra.index = [str(i) for i in list(spectra.index)]
DIANE's avatar
DIANE committed

    id = spectra.index  # rownames
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    with c2:
        st.write('Data summary:')
        st.write(f'- the number of spectra:{spectra.shape[0]}')
        st.write(f'- the number of wavelengths:{spectra.shape[1]}')
        st.write(f'- the number of categorical variables:{meta_data.shape[1]}')
DIANE's avatar
DIANE committed
################################################### END : I- Data loading and preparation ####################################################
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
################################################### BEGIN : visualize and split the data ####################################################
DIANE's avatar
DIANE committed
st.subheader("I - Spectral Data Visualization", divider='blue')
DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    c3, c4 = st.columns([3, 1])
DIANE's avatar
DIANE committed
    with c4:
DIANE's avatar
DIANE committed
        st.info('Color spectra based on a categorical variable (for easier visualization: only relevant variables with fewer than 60 categories are displated in the dropdown list.)')
DIANE's avatar
DIANE committed
        filter = ['']+md_df_st_.columns.to_list()
DIANE's avatar
DIANE committed
        specs_col = st.selectbox('Color by:', options=filter, format_func=lambda x: x if x else "<Select>",
                                 disabled=True if len(filter) == 1 else False)
DIANE's avatar
DIANE committed
        if len(filter) == 1:
            st.write("No categorical variable was provided!")
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    with c3:
DIANE's avatar
DIANE committed
        if specs_col != '':
DIANE's avatar
DIANE committed
            cmap = dict(
                zip(set(md_df_st_[specs_col]), colorslist[:len(set(md_df_st_[specs_col]))]))
            fig_spectra = plot_spectra(spectra, color=md_df_st_[
                                       specs_col], cmap=cmap, xunits='Wavelength/Wavenumber', yunits="Signal intensity")
DIANE's avatar
DIANE committed

        else:
DIANE's avatar
DIANE committed
            fig_spectra = plot_spectra(
                spectra, color=None, cmap=None, xunits='Wavelength/Wavenumber', yunits="Signal intensity")
DIANE's avatar
DIANE committed
            cmap = None
DIANE's avatar
DIANE committed
        st.pyplot(fig_spectra)

DIANE's avatar
DIANE committed
    with c4:
DIANE's avatar
DIANE committed
        if specs_col != '':
            st.write('The distribution of samples across categories')
DIANE's avatar
DIANE committed
            barh = barhplot(md_df_st_[[specs_col]], cmap=cmap)
DIANE's avatar
DIANE committed
            st.pyplot(barh)

DIANE's avatar
DIANE committed
        elif len(filter) > 1 and specs_col == '':
DIANE's avatar
DIANE committed
            st.write("No categorical variable was selected!")

    if st.session_state.interface == 'advanced':
        with c3:
DIANE's avatar
DIANE committed
            values = st.slider('Select a range of values',
                               min_value=0, max_value=nwls, value=(0, nwls))
            hash_ = ObjectHash(current=hash_, add=values)
DIANE's avatar
DIANE committed
            spectra = spectra.iloc[:, values[0]:values[1]]
            nwls = spectra.shape[1]
            wls = wls[values[0]:values[1]]

DIANE's avatar
DIANE committed
            st.pyplot(plot_spectra(
                spectra.mean(), xunits='Wavelength/Wavenumber', yunits="Signal intensity"))
DIANE's avatar
DIANE committed

        # st.selectbox('Variable', options= [''], disabled=True if len(colfilter)>1, else False)
        # st.write(data_info) ## table showing the number of samples in the data file
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
################################################### END : visualize and split the data ####################################################

DIANE's avatar
DIANE committed

############################## Exploratory data analysis ###############################
DIANE's avatar
DIANE committed
st.subheader(
    "II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
DIANE's avatar
DIANE committed
# ~~~~~~~~~~~~~~ algorithms available on our app ~~~~~~~~~~~~~~~~
match st.session_state["interface"]:
    case 'simple':
        dim_red_methods, cluster_methods, seltechs = ['PCA'], [''], ['random']

    case 'advanced':
DIANE's avatar
DIANE committed
        # List of dimensionality reduction algos
        dim_red_methods = ['PCA', 'UMAP', 'NMF']
        # List of clustering algos
        cluster_methods = ['KMEANS', 'HDBSCAN', 'AP']
DIANE's avatar
DIANE committed
        seltechs = ['random', 'kennard-stone', 'meta-medoids', 'meta-ks']
DIANE's avatar
DIANE committed
###### 1- Dimensionality reduction ######
DIANE's avatar
DIANE committed
# Empty placeholders (instances, not the class) so that `t.empty` / `p.empty`
# are real booleans until a model has been fitted.
t = DataFrame()  # scores
p = DataFrame()  # loadings
DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    xc = standardize(spectra, center=True, scale=False)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    c5, c6, c7, c8, c9, c10, c11 = st.columns([1, 1, 0.6, 0.6, 0.6, 1.5, 1.5])
    with c5:
DIANE's avatar
DIANE committed
        # select a dimensionality reduction algorithm
        dim_red_method = st.selectbox("Dimensionality reduction techniques: ",
DIANE's avatar
DIANE committed
                                      options=['']+dim_red_methods if len(dim_red_methods) > 2 else dim_red_methods, format_func=lambda x: x if x else "<Select>",
                                      disabled=False if len(dim_red_methods) > 2 else True)
        hash_ = ObjectHash(current=hash_, add=dim_red_method)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        match dim_red_method:
            case '':
                st.info('Info: Select a dimensionality reduction technique!')
DIANE's avatar
DIANE committed
            case 'UMAP':
DIANE's avatar
DIANE committed
                supervised = st.selectbox('Supervised UMAP by(optional):', options=filter,
                                          format_func=lambda x: x if x else "<Select>", disabled=False if len(filter) > 1 else True)
                umapsupervisor = None if supervised == '' else md_df_st_[
                    supervised]
                hash_ = ObjectHash(current=hash_, add=umapsupervisor)
DIANE's avatar
DIANE committed
        # select a clustering algorithm
DIANE's avatar
DIANE committed
        # Dependent widgets are enabled only once a reduction method has been
        # chosen and the advanced interface is active.
        disablewidgets = not (
            dim_red_method and st.session_state.interface == 'advanced')
        clus_method = st.selectbox("Clustering techniques(optional): ",
DIANE's avatar
DIANE committed
                                   options=[
                                       ''] + cluster_methods if len(cluster_methods) > 2 else cluster_methods,
                                   key=38, format_func=lambda x: x if x else "<Select>", disabled=disablewidgets)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        # if disablewidgets == False and dim_red_method in dim_red_methods:
        #     inf = st.info('Info: Select a clustering technique!')
DIANE's avatar
DIANE committed
        if dim_red_method:
            @st.cache_data
DIANE's avatar
DIANE committed
            def dimensionality_reduction(dim_red_method, change):
DIANE's avatar
DIANE committed
                match dim_red_method:
                    case "PCA":
DIANE's avatar
DIANE committed
                        from utils.dim_reduction import LinearPCA
                        dr_model = LinearPCA(xc, Ncomp=8)
DIANE's avatar
DIANE committed
                    case "UMAP":
DIANE's avatar
DIANE committed
                        from utils.dim_reduction import Umap
                        dr_model = Umap(numerical_data=spectra,
                                        cat_data=umapsupervisor)
DIANE's avatar
DIANE committed
                    case 'NMF':
DIANE's avatar
DIANE committed
                        from utils.dim_reduction import Nmf
                        dr_model = Nmf(spectra, Ncomp=3)
DIANE's avatar
DIANE committed
                return dr_model
DIANE's avatar
DIANE committed
            dr_model = dimensionality_reduction(dim_red_method, change=hash_)
DIANE's avatar
DIANE committed
        if dr_model:
DIANE's avatar
DIANE committed
            axis1 = c7.selectbox(
                "x-axis", options=dr_model.scores_.columns, index=0)
            axis2 = c8.selectbox(
                "y-axis", options=dr_model.scores_.columns, index=1)
            axis3 = c9.selectbox(
                "z-axis", options=dr_model.scores_.columns, index=2)
DIANE's avatar
DIANE committed
            axis = np.unique([axis1, axis2, axis3])
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
            t = dr_model.scores_.loc[:, axis]
DIANE's avatar
DIANE committed
            t.index = spectra.index
DIANE's avatar
DIANE committed
            tcr = standardize(t)
DIANE's avatar
DIANE committed

if not t.empty:
DIANE's avatar
DIANE committed
        c12 = st.container()
DIANE's avatar
DIANE committed
        c12, c13 = st.columns([3, 3])
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    with c6:
DIANE's avatar
DIANE committed
        sel_ratio = st.number_input('Enter the number/fraction of samples to be selected:', min_value=0.01,
                                    max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20,
                                    format="%.2f", disabled=disablewidgets)
DIANE's avatar
DIANE committed
        # Values of 1 or more are read as an absolute number of samples,
        # values below 1 as a fraction of the loaded spectra. The boundary
        # value 1.00 is handled explicitly: previously neither branch matched
        # it and `ratio` stayed undefined.
        if sel_ratio >= 1.00:
            ratio = int(sel_ratio)
        else:
            # Never let a small fraction truncate to zero samples.
            ratio = max(1, int(sel_ratio * spectra.shape[0]))
DIANE's avatar
DIANE committed
        ObjectHash(current=hash_, add=ratio)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        if dr_model and not clus_method:
DIANE's avatar
DIANE committed
            seltech = st.radio('Select samples selection strategy:', options=[
                               'random', 'kennard-stone'], disabled=True if st.session_state.interface == 'simple' else False)
DIANE's avatar
DIANE committed

        elif dr_model and clus_method:
            disabled1 = False if clus_method in cluster_methods else True
DIANE's avatar
DIANE committed
            seltech = st.radio('Select samples selection strategy:',
                               options=seltechs, disabled=disabled1)
DIANE's avatar
DIANE committed


if not t.empty:
DIANE's avatar
DIANE committed
    # ~~~~~~~~~~~~~~~~~~~~~~~ II- Clustering ~~~~~~~~~~~~~~~~~~~~~~~~~~
DIANE's avatar
DIANE committed
    if clus_method:
        from utils.clustering import clustering
DIANE's avatar
DIANE committed
        labels, n_clusters = clustering(X=tcr, method=clus_method)

DIANE's avatar
DIANE committed
    # ~~~~~~  III - Samples selection based on the reduced data presentation ~~~~~~~
    from utils.samsel import selection_method
DIANE's avatar
DIANE committed
    ObjectHash(current=hash_, add=seltech)
DIANE's avatar
DIANE committed
    if 'labels' not in globals():
DIANE's avatar
DIANE committed
        custom_color_palette = px.colors.qualitative.Plotly[:1]
DIANE's avatar
DIANE committed
        selected = selection_method(X=tcr, method=seltech, rset=ratio)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    else:
DIANE's avatar
DIANE committed
        custom_color_palette = px.colors.qualitative.Plotly[:n_clusters]
        selected = []
DIANE's avatar
DIANE committed
        # Run the selection separately for every cluster label found in
        # labels.index, skipping the 'Non clustered' label. labels.loc[i] is
        # flattened and used as row labels to slice tcr.
        for i in [i for i in set(labels.index) if i != 'Non clustered']:
            # NOTE(review): rset_meta is computed here (.5 for clusters with
            # more than one row, otherwise 1) but never used -- the call below
            # passes the literal .4 instead. Confirm which one is intended.
            rset_meta = .5 if tcr.loc[labels.loc[i].values.ravel(
            ), :].shape[0] > 1 else 1
            selected += selection_method(X=tcr.loc[labels.loc[i].values.ravel(), :], method=seltech,
                                         rset=ratio, rset_meta=.4)
DIANE's avatar
DIANE committed
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ results visualization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Scores plot
if not t.empty:
DIANE's avatar
DIANE committed
    # With clustering active, the first (blank) entry of the colour-by list is
    # replaced by the clustering method name.
    if clus_method:
        filter[0] = clus_method
    # The dropdown is disabled when there is at most one entry to choose from.
    desactivatelist = len(filter) <= 1
DIANE's avatar
DIANE committed
    with c12:
        st.write('Scores plot')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        if len(axis) == 1:
DIANE's avatar
DIANE committed
            tcr['1d'] = np.random.uniform(-.5, .5, tcr.shape[0])
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        colfilter = st.selectbox('Color by :', options=filter,
                                 format_func=lambda x: x if x else "<Select>", disabled=desactivatelist)
DIANE's avatar
DIANE committed
        ObjectHash(colfilter)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        if colfilter:
DIANE's avatar
DIANE committed
            if colfilter not in cluster_methods:  # case meta variable
                cmap = dict(
                    zip(set(md_df_st_[colfilter]), colorslist[:len(set(md_df_st_[colfilter]))]))
                tcr['color'] = md_df_st_.loc[:, colfilter]

            elif colfilter in cluster_methods:  # case clustering
DIANE's avatar
DIANE committed
                # One colour per cluster label. `colorslist` may be missing
                # (no metadata loaded), None (metadata without categorical
                # columns) or shorter than the number of labels; in all three
                # cases a palette of the right size is generated here.
                n_labels = len(set(labels.index))
                palette = globals().get('colorslist')
                if palette is None or len(palette) < n_labels:
                    import random
                    import matplotlib.colors as mcolors

                    # Evenly spaced hues, shuffled with a fixed seed so the
                    # label/colour pairing is reproducible between reruns.
                    hues = np.linspace(0, 1, n_labels, endpoint=False)
                    random.seed(42)
                    colorslist = [mcolors.rgb2hex(
                        plt.cm.hsv(hue)) for hue in hues]
                    random.shuffle(colorslist)

                cmap = dict(
                    zip(set(labels.index), colorslist[:n_labels]))
                tcr['color'] = labels.index
DIANE's avatar
DIANE committed
        else:
DIANE's avatar
DIANE committed
            cmap = {'Sample': "#7ab0c7"}
            tcr['color'] = ['Sample'] * tcr.shape[0]
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        # start visualization
DIANE's avatar
DIANE committed
        match t.shape[1]:
            case 3:
DIANE's avatar
DIANE committed
                hover1 = {'sample:': tcr.index, 'color': False,
                          axis[0]: False, axis[1]: False, axis[2]: False}
                fig = px.scatter_3d(tcr, x=axis[0], y=axis[1], z=axis[2], color='color',
                                    color_discrete_map=cmap, hover_data=hover1)
                fig.add_scatter3d(x=tcr.loc[selected, axis[0]], y=tcr.loc[selected, axis[1]], z=tcr.loc[selected, axis[2]],
                                  mode='markers', marker=dict(size=5, color='black'),
                                  name='selected samples', hovertext=tcr.loc[selected, :].index)

DIANE's avatar
DIANE committed
            case 2:
DIANE's avatar
DIANE committed
                hover1 = {'sample:': tcr.index, 'color': False,
                          axis[0]: False, axis[1]: False}
                fig = px.scatter(tcr, x=axis[0], y=axis[1], color='color',
                                 color_discrete_map=cmap, hover_data=hover1)
                fig.add_scatter(x=tcr.loc[selected, axis[0]], y=tcr.loc[selected, axis[1]],
                                mode='markers', marker=dict(size=5, color='black'),
                                name='selected samples', hovertext=tcr.loc[selected, :].index)

DIANE's avatar
DIANE committed
            case 1:
DIANE's avatar
DIANE committed
                hover1 = {'sample:': tcr.index, 'color': False,
                          '1d': False, axis[0]: False}
DIANE's avatar
DIANE committed
                yy = np.random.uniform(-.5, .5, tcr.shape[0])
DIANE's avatar
DIANE committed
                fig = px.scatter(tcr, x=axis[0], y='1d', color="color",
                                 color_discrete_map=cmap, hover_data=hover1)

                fig.add_scatter(x=tcr.loc[selected, axis[0]], y=tcr.loc[selected, '1d'],
                                mode='markers', marker=dict(size=5, color='black'),
                                name='selected samples',
                                hovertext=tcr.loc[selected, :].index)
                fig.update_layout(yaxis_range=[-1.6, 1.6])
DIANE's avatar
DIANE committed
                fig.update_yaxes(visible=False)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        st.plotly_chart(fig, use_container_width=True)
DIANE's avatar
DIANE committed
if not spectra.empty:
    if dim_red_method in ['PCA', 'NMF']:
        with c13:
            st.write('Loadings plot')
DIANE's avatar
DIANE committed
            if file.name.split(".")[-1] == 'dx':
                xlab = ["Wavenumbers (1/cm)" if meta_data.loc[:,
                                                              'xunits'].iloc[0] == '1/cm' else 'Wavelengths (nm)']
            elif file.name.split(".")[-1] == 'csv':
DIANE's avatar
DIANE committed
                xlab = ['Wavelength/Wavenumber']

            p = dr_model.loadings_.T
DIANE's avatar
DIANE committed
            freq = DataFrame(wls, columns=xlab, index=p.index)
            df1 = concat([p, freq], axis=1).melt(
                id_vars=freq.columns,  var_name='Loadings:', value_name='Value')

            loadingsplot = px.line(df1, x=xlab, y='Value', color='Loadings:',
                                   color_discrete_sequence=px.colors.qualitative.Plotly)
            loadingsplot.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
                                                   bordercolor="black", borderwidth=2))
            loadingsplot.update_layout(
                xaxis_title=xlab[0], yaxis_title='Value')

DIANE's avatar
DIANE committed
            st.plotly_chart(loadingsplot, use_container_width=True)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
# #############################################################################################################
DIANE's avatar
DIANE committed
    if dim_red_method == 'PCA':
        c14, c15 = st.columns([3, 3])
        with c14:
            st.write('Influence plot')
            # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model
DIANE's avatar
DIANE committed
            # Keep only the loadings of the displayed components and rebuild
            # the data from scores and loadings.
            p = p.loc[:, axis]
            xp = np.dot(t, p.T)
            # Q residual of a sample = sum of its squared reconstruction
            # errors. The row-wise sum equals diag(E @ E.T) without building
            # the full samples-by-samples matrix.
            reconstruction_error = np.subtract(xc.values, xp)
            tcr["residuals"] = np.square(reconstruction_error).sum(axis=1)
DIANE's avatar
DIANE committed

            # Leverage
            # Tr(T(T'T)^(-1)T'): #reference :Introduction to Multi- and Megavariate Data Analysis using Projection Methods (PCA and PLS),
DIANE's avatar
DIANE committed
            # L. Eriksson, E. Johansson, N. Kettaneh-Wold and S. Wold, Umetrics 1999, p. 466
            Hat = t.loc[:, axis].values @ np.linalg.inv(
                t.loc[:, axis].values.T @ t.loc[:, axis].values) @ t.loc[:, axis].values.T
            tcr["leverage"] = DataFrame(
                np.diag(Hat) / np.trace(Hat), index=spectra.index, columns=['Leverage'])
DIANE's avatar
DIANE committed

            # compute thresholds
            # Leverage cut-off 2*A/n, with A the number of displayed
            # components. tcr.shape[1] must not be used here: by this point
            # tcr also carries the 'color', 'residuals' and 'leverage' columns.
            tresh3 = 2 * len(axis) / n_specs
            from scipy.stats import chi2
            # NOTE(review): ppf(0.05) is the lower 5 % quantile; an upper-tail
            # limit would be chi2.isf(0.05, ...) -- confirm the intended tail.
            tresh4 = chi2.ppf(0.05, df=len(axis))
DIANE's avatar
DIANE committed

            # Retrieve the index names of these rows
DIANE's avatar
DIANE committed
            exceed_lev = tcr[(tcr['leverage'] > tresh3) & (
                tcr['residuals'] > tresh4)].index.tolist()
DIANE's avatar
DIANE committed

            # plot results
DIANE's avatar
DIANE committed
            influence_plot = px.scatter(tcr, x="leverage", y="residuals", color='color',
                                        color_discrete_map=cmap, hover_data=hover1)
            influence_plot.add_scatter(x=tcr.loc[selected, "leverage"], y=tcr.loc[selected, "residuals"],
                                       mode='markers', marker=dict(size=5, color='black'),
                                       name='selected samples', hovertext=tcr.loc[selected, :].index)
            influence_plot.add_vline(
                x=tresh3, line_width=1, line_dash='dash', line_color='red')
            influence_plot.add_hline(
                y=tresh4, line_width=1, line_dash='dash', line_color='red')
DIANE's avatar
DIANE committed

            # add labels for the outliers
            for i in exceed_lev:
DIANE's avatar
DIANE committed
                influence_plot.add_annotation(dict(x=tcr['leverage'].loc[i], y=tcr['residuals'].loc[i], showarrow=True,
                                                   text=i, font=dict(color="black", size=15), xanchor='auto', yanchor='auto'))
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
            influence_plot.update_traces(marker=dict(size=6), showlegend=True)
            influence_plot.update_layout(xaxis_title="Leverage", yaxis_title="Q-residuals",
                                         font=dict(size=20), width=800, height=600)
DIANE's avatar
DIANE committed
            st.plotly_chart(influence_plot, use_container_width=True)
DIANE's avatar
DIANE committed


#             influence_plot.update_traces(marker=dict(size= 6), showlegend= True)
#             influence_plot.update_layout(font=dict(size=23), width=800, height=500)
#             for annotation in influence_plot.layout.annotations:
#                 annotation.font.size = 35
#             influence_plot.update_traces(marker=dict(size= 10), showlegend= False)
#             influence_plot.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
#                                              font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
#             # influence_plot.write_image('./report/results/figures/influence_plot.png', engine = 'kaleido')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        with c15:
            st.write('T²-Hotelling vs Q-residuals plot')
            # Hotelling
DIANE's avatar
DIANE committed
            # Hotelling T²: squared scores scaled by the variance (not the
            # standard deviation) of each component, summed per sample.
            tcr['hotelling'] = (t**2 / t.var()).sum(axis=1)

            # compute thresholds
            from scipy.stats import f, chi2
            # Use the number of components actually displayed instead of a
            # hard-coded 3 (fewer remain when the same axis is picked twice).
            n_comp = t.shape[1]
            # T² limit: A(n² - 1) / (n(n - A)) * F(alpha; A, n - A)
            fcri = f.isf(0.05, n_comp, n_specs - n_comp)
            tresh0 = (n_comp * (n_specs ** 2 - 1) * fcri) / \
                (n_specs * (n_specs - n_comp))
            # NOTE(review): ppf(0.05) is the lower 5 % quantile; an upper-tail
            # limit would be chi2.isf(0.05, ...) -- confirm the intended tail.
            tresh1 = chi2.ppf(0.05, df=n_comp)
DIANE's avatar
DIANE committed

            # Retrieve the index names of these rows
DIANE's avatar
DIANE committed
            exceed_hot = tcr[(tcr['hotelling'] > tresh0) & (
                tcr['residuals'] > tresh1)].index.tolist()
DIANE's avatar
DIANE committed

            # plot results
DIANE's avatar
DIANE committed
            hotelling_plot = px.scatter(tcr, x='hotelling', y='residuals', color="color",
                                        color_discrete_map=cmap, hover_data=hover1)
            hotelling_plot.add_scatter(x=tcr.loc[selected, 'hotelling'], y=tcr.loc[selected, 'residuals'],
                                       mode='markers', marker=dict(size=5, color='black'),
                                       name='selected samples', hovertext=tcr.loc[selected, :].index)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
            hotelling_plot.update_layout(xaxis_title="Hotelling-T² distance", yaxis_title="Q-residuals",
DIANE's avatar
DIANE committed
                                         font=dict(size=20), width=800, height=600)
DIANE's avatar
DIANE committed
            hotelling_plot.add_vline(
                x=tresh0, line_width=1, line_dash='dash', line_color='red')
            hotelling_plot.add_hline(
                y=tresh1, line_width=1, line_dash='dash', line_color='red')
DIANE's avatar
DIANE committed

            # add labels for the outliers
            for i in exceed_hot:
DIANE's avatar
DIANE committed
                hotelling_plot.add_annotation(dict(x=tcr['hotelling'].loc[i], y=tcr['residuals'].loc[i], showarrow=True, text=i,
                                                   font=dict(color="black", size=15), xanchor='auto', yanchor='auto'))
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
            hotelling_plot.update_traces(marker=dict(size=6), showlegend=True)
            hotelling_plot.update_layout(
                font=dict(size=23), width=800, height=500)
            st.plotly_chart(hotelling_plot, use_container_width=True)
DIANE's avatar
DIANE committed


#             # for annotation in hotelling_plot.layout.annotations:
#             #     annotation.font.size = 35
#             # hotelling_plot.update_layout(font=dict(size=23), width=800, height=600)
#             # hotelling_plot.update_traces(marker=dict(size= 10), showlegend= False)
#             # hotelling_plot.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
#             #                                  font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
# #             # hotelling_plot.write_image("./report/results/figures/hotelling_plot.png", format="png")

DIANE's avatar
DIANE committed
# Section III: show the samples selected for reference analysis (table on the
# left) and summary information / category distribution (on the right).
st.subheader('III - Selected Samples for Reference Analysis', divider='blue')
if selected:
    c16, c17 = st.columns([3, 1])
    with c16:
        st.write("Tabular identifiers of selected samples for reference analysis:")

        # `labels` is only bound when an earlier step of the script created it
        if 'labels' in globals():
            labels['cluster'] = labels.index
            labels.index = labels['names']

            if not meta_data.empty:
                # show the metadata rows of the selected samples
                if 'name' in meta_data.columns:
                    subset = meta_data.drop('name', axis=1).loc[selected]
                else:
                    subset = meta_data.loc[selected]
            else:
                # No metadata available: fall back to the bare list of names.
                # (Previously this assignment had no `else:` and therefore
                # always overwrote the metadata subset, and `subset` was
                # unbound when meta_data was empty.)
                subset = DataFrame(selected, columns=['names'])
            st.write(subset)

    with c17:
        # the clustering method is not offered as a colouring variable
        if clus_method in filter:
            filter.remove(clus_method)
        st.info(f'Information !\n - The total number of samples: {n_specs}.\n- The number of samples selected for reference analysis: {len(selected)}.\n - The proportion of samples selected for reference analysis: {round(len(selected)/n_specs*100)}%.')
        selected_col = st.selectbox('Color by:  ', options=filter, format_func=lambda x: x if x else "<Select>",
                                    disabled=len(filter) == 1)
        if selected_col:
            # one colour per distinct category among the selected samples
            selected_values = md_df_st_.loc[selected][selected_col]
            categories = set(selected_values)
            cmap2 = dict(zip(categories, colorslist[:len(categories)]))
            st.write('The distribution of selected samples across categories')

            barhsel = barhplot(md_df_st_.loc[selected][[selected_col]], cmap=cmap2)
            st.pyplot(barhsel)

            
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed

#         if meta_data.empty:
#             # clustered: a list of ints
#             # sam1 = DataFrame({'name': selected_samples_idx,
#             #                     'cluster':np.array(labels)[selected_samples_idx]},
#             #                     index = selected_samples_idx)
#             st.write(selected_samples_idx)
#             st.write(clustered)
#         else:
#             sam1 = meta_data.iloc[clustered,:].loc[selected_samples_idx,:]
#             sam1.insert(loc=0, column='index', value=selected_samples_idx)
#             sam1.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx])
#         sam1.index = np.arange(len(selected_samples_idx))+1
#         sam = sam1

#         if clus_method =='HDBSCAN':
#             with c16:
#                 unclus = st.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)

#             if selected_samples_idx:
#                 if unclus:
#                     if meta_data.empty:
#                         sam2 = DataFrame({'name': spectra.index[non_clustered],
#                                             'cluster':['Non clustered']*len(spectra.index[non_clustered])},
#                                             index = spectra.index[non_clustered])
#                     else :
#                         sam2 = meta_data.iloc[non_clustered,:]
#                         sam2.insert(loc=0, column='index', value= spectra.index[non_clustered])
#                         sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered]))
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
#                     sam = concat([sam1, sam2], axis = 0)
#                     sam.index = np.arange(sam.shape[0])+1
#                     with c17:
#                         st.info(f'- The number of Non-clustered samples: {sam2.shape[0]}.\n - The proportion of Non-clustered samples: {round(sam2.shape[0]/n_specs*100)}%')
#         else:
#             sam = sam1
#         with c16:
#             st.write(sam)


# if not sam.empty:
#     zip_data = ""
#     Nb_ech = str(n_specs)
#     nb_clu = str(sam1.shape[0])
#     st.subheader('Download the analysis results')
#     st.write("**Note:** Please check the box only after you have finished processing your data and are satisfied with the results. Checking the box prematurely may slow down the app and could lead to crashes.")
#     decis = st.checkbox("Yes, I want to download the results")
#     if decis:
#         ###################################################
#         # ## generate report
#         @st.cache_data
#         def export_report(change):
#             latex_report = report.report('Representative subset selection', file.name, dim_red_method,
#                                         clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam)

#         @st.cache_data
#         def preparing_results_for_downloading(change):
#             # path_to_report = Path("report")############################### i am here
#             match file.name.split(".")[-1]:
#                 # load csv file
#                 case 'csv':
#                     imp.to_csv('report/results/dataset/'+ file.name, sep = ';', encoding = 'utf-8', mode = 'a')
#                 case 'dx':
#                     with open('report/results/dataset/'+file.name, 'w') as dd:
#                         dd.write(dxdata)

#             fig_spectra.savefig(report_path_rel/"results/figures/spectra_plot.png", dpi = 400) ## Export report

#             if len(axis) == 3:
#                 for i in range(len(comb)):
#                     fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'].write_image(report_path_rel/f'results/figures/scores_pc{str(comb[i][0]+1)}_pc{str(comb[i][1]+1)}.png')
#             elif len(axis)==2 :
#                 fig_export['fig'].write_image(report_path_rel/'results/figures/scores_plot2D.png')
#             elif len(axis)==1 :
#                 fig_export['fig'].write_image(report_path_rel/'results/figures/scores_plot1D.png')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
#             # Export du graphique
#             if dim_red_method in ['PCA','NMF']:
#                 import plotly.io as pio
#                 img = pio.to_image(loadingsplot, format="png")
#                 with open(report_path_rel/"results/figures/loadings_plot.png", "wb") as f:
#                     f.write(img)
DIANE's avatar
DIANE committed
#             if dim_red_method == 'PCA':
DIANE's avatar
DIANE committed
#                 hotelling_plot.write_image(report_path_rel/"results/figures/hotelling_plot.png", format="png")
#                 influence_plot.write_image(report_path_rel/'results/figures/influence_plot.png', engine = 'kaleido')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
#             sam.to_csv(report_path_rel/'results/Selected_subset_for_calib_development.csv', sep = ';')
#             export_report(change = hash_)
#             if Path(report_path_rel/"report.tex").exists():
#                 report.generate_report(change = hash_)
#             if Path(report_path_rel/"report.pdf").exists():
#                 move(report_path_rel/"report.pdf", "./report/results/report.pdf")
#             return change
DIANE's avatar
DIANE committed
#         preparing_results_for_downloading(change = hash_)
#         report.generate_report(change = hash_)
DIANE's avatar
DIANE committed
#         @st.cache_data
#         def tempdir(change):
#             from tempfile import TemporaryDirectory
#             with  TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
#                 tempdirname = os.path.split(temp_dir)[1]
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
#                 if len(os.listdir(report_path_rel/'results/figures/'))>=2:
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
#                     make_archive(base_name= report_path_rel/"Results", format="zip", base_dir="results", root_dir = "./report")# create a zip file
#                     move(report_path_rel/"Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir
#                     with open(report_path_rel/f"{tempdirname}/Results.zip", "rb") as f:
#                         zip_data = f.read()
#             return tempdirname, zip_data
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
#         try :
#             tempdirname, zip_data = tempdir(change = hash_)
#             # st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
#             #             args = None, kwargs = None,type = "primary",use_container_width = True)
#         except:
#             pass
#     date_time = datetime.now().strftime('%y%m%d%H%M')
#     disabled_down = True if zip_data == '' else False
#     st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
#                 args = None, kwargs = None,type = "primary",use_container_width = True, disabled = disabled_down)


DIANE's avatar
DIANE committed
#     HandleItems.delete_files(keep = ['.py', '.pyc','.bib'])