Skip to content
Snippets Groups Projects
1-samples_selection.py 34.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • DIANE's avatar
    DIANE committed
    from utils.data_parsing import meta_st
    
    DIANE's avatar
    DIANE committed
    from common import *
    
    # ---- page configuration and static layout ----
    st.set_page_config(
        page_title="NIRS Utils",
        page_icon=":goat:",
        layout="wide",
    )

    # Shared page chrome (header, sidebar, column borders) built by the project helper.
    UiComponents(
        pagespath=pages_folder,
        csspath=css_file,
        imgpath=image_path,
        header=True,
        sidebar=True,
        bgimg=False,
        colborders=True,
    )

    # Page title followed by a one-line description of the tool.
    st.header("Calibration Subset Selection")
    st.markdown("Select a representative subset of samples for NIR calibration development.")

    # Wide column for the graphical abstract; the narrow one hosts the upload widgets.
    c1, c2 = st.columns([3, 1])
    c1.image("./images/sample selection.png", use_column_width=True)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    UI  
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    # empty temp figures
    
    DIANE's avatar
    DIANE committed
    # Report directory, once as a bare name and once with an explicit "./" prefix.
    report_path, report_path_rel = Path("report"), Path("./report")

    # ~~~~~~~~~~~~~~~~ clean the analysis results dir ~~~~~~~~~~~~~~~~
    # Files with these extensions are passed as the ones to keep.
    _kept_extensions = ['.py', '.pyc', '.bib', '.tex']
    # Directories handed to the project helper for deletion.
    _stale_directories = ['report/results/model']
    HandleItems.delete_files(keep=_kept_extensions)
    HandleItems.delete_dir(delete=_stale_directories)
    
    DIANE's avatar
    DIANE committed
    
    ################################### I - Data Loading and Visualization ########################################
    # loader for datafile
    
    DIANE's avatar
    DIANE committed
    # Upload widget: accepts a csv matrix or a .dx file.
    file = c2.file_uploader(
        "Data file",
        type=["csv", "dx"],
        help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
    )

    # Preallocation of data structure.
    # Every name is bound up front so the `.empty` / truthiness guards further
    # down the script work even when no file has been loaded yet.
    spectra = DataFrame()
    meta_data = DataFrame()
    md_df_st_ = DataFrame()
    tcr = DataFrame()
    sam = DataFrame()
    sam1 = DataFrame()

    selected = []
    l1 = []

    color_palette = None
    dr_model = None  # dimensionality reduction model
    cl_model = None  # clustering model
    selection = None
    selection_number = "None"

    # An empty frame instance (the original bound the DataFrame class itself).
    samples_df_chem = DataFrame()

    # `selected_samples` was first bound to a DataFrame and then immediately
    # rebound to a list; only the final list binding is kept.
    selected_samples = []
    selected_samples_idx = []
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    if not file:
    
    DIANE's avatar
    DIANE committed
        c2.info('Info: Please load data file !')
    
    DIANE's avatar
    DIANE committed
        # extension = file.name.split(".")[-1]
        userfilename = file.name.replace(f".{file.name.split(".")[-1]}", '')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        match file.name.split(".")[-1]:
            # Load .csv file
            case 'csv':
    
    DIANE's avatar
    DIANE committed
                with c2:
    
    DIANE's avatar
    DIANE committed
                    # ~~~~~~~~ select file dialect
    
    DIANE's avatar
    DIANE committed
                    c2_1, c2_2 = st.columns([.5, .5])
                    with c2_1:
    
    DIANE's avatar
    DIANE committed
                        dec = st.radio('decimal:', options=[
                                       ".", ","], horizontal=True)
                        sep = st.radio("separator:", options=[
                                       ";", ","], horizontal=True)
    
    DIANE's avatar
    DIANE committed
                    with c2_2:
    
    DIANE's avatar
    DIANE committed
                        hdr = st.radio("header: ", options=[
                                       "yes", "no"], horizontal=True)
                        names = st.radio("samples name:", options=[
                                         "yes", "no"], horizontal=True)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    hdr = 0 if hdr == "yes" else None
                    names = 0 if names == "yes" else None
                    hash_ = ObjectHash(current=None, add=[
                                       file.getvalue(), hdr, names, dec, sep])
    
    DIANE's avatar
    DIANE committed
    
                    # ~~~~~~~~ read the csv file
    
    DIANE's avatar
    DIANE committed
                    try:
    
    DIANE's avatar
    DIANE committed
                        # spectra = read_csv(file, decimal=dec, sep=sep, index_col=names)
    
    DIANE's avatar
    DIANE committed
                        spectra, meta_data = csv_parser(
    
    DIANE's avatar
    DIANE committed
                            path=file, decimal=dec, separator=sep, index_col=names, header=hdr, change=hash_)
    
    DIANE's avatar
    DIANE committed
                        if spectra.shape[1] > 20:
                            st.success(
                                "The data have been loaded successfully and spectral data was successfully detected, you might need to tune dialect.", icon="")
    
    DIANE's avatar
    DIANE committed
                        else:
    
    DIANE's avatar
    DIANE committed
                            st.warning(
                                "The data have been loaded successfully and but spectral data was not detected.")
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    except:
                        st.error('''Error: The format of the file does not correspond to the expected dialect settings.
    
    DIANE's avatar
    DIANE committed
                                  To read the file correctly, please adjust the dialect parameters.''')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            # Load .dx file
    
    DIANE's avatar
    DIANE committed
            case 'dx':
    
    DIANE's avatar
    DIANE committed
                with c2:
    
    DIANE's avatar
    DIANE committed
                    with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
    
    DIANE's avatar
    DIANE committed
                        tmp.write(file.read())
                        dxfile = tmp.name
    
    DIANE's avatar
    DIANE committed
                        hash_ = ObjectHash(current=None, add=file.getvalue())
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    try:
    
    DIANE's avatar
    DIANE committed
                        from utils.data_parsing import jcamp_parser
    
    DIANE's avatar
    DIANE committed
                        spectra, _, meta_data = jcamp_parser(
                            path=dxfile, include=['x_block', 'meta'], change=hash_)
                        st.success(
                            "The data have been loaded successfully", icon="")
    
    DIANE's avatar
    DIANE committed
                    except:
    
    DIANE's avatar
    DIANE committed
                        st.error(
                            '''Error: an issue was encontered while parsing the uploaded file.''')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
        sample_ids = spectra.index

        # Fewer distinct ids than rows means at least one id is repeated.
        if len(set(sample_ids)) < len(sample_ids):
            c2.warning(
                "Duplicate sample IDs found. Suffixes (#1, #2, ...) have been added to duplicate IDs.")

            # Keep the untouched ids in the metadata table.
            meta_data['names'] = sample_ids

            # True for every row whose id occurs more than once.
            replicated = sample_ids.duplicated(keep=False)

            # 1-based occurrence counter per id, rendered as text ...
            occurrence = spectra.groupby(sample_ids).cumcount().add(1).astype(str)
            # ... and prefixed with "<id>#".
            suffixed = occurrence.radd(sample_ids.astype(str) + '#')

            # Only the replicated ids receive the suffixed form.
            spectra.index = sample_ids.where(~replicated, suffixed)
    
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
        if not meta_data.empty:
            # Give the metadata the same (stringified) row labels as the spectra.
            meta_data.index = [str(i) for i in spectra.index]
            md_df_st_ = meta_st(meta_data)

            if md_df_st_.shape[1] > 0:
                import random
                import matplotlib.colors as mcolors

                n_colors = 30

                # Evenly spaced hues
                hues = np.linspace(0, 1, n_colors, endpoint=False)

                # Fixed seed so the shuffled colour order is the same on every run.
                random.seed(42)
                colorslist = [mcolors.rgb2hex(plt.cm.hsv(hue)) for hue in hues]
                random.shuffle(colorslist)

            else:
                colorslist = None

        # Fewer than 50 float columns: the table is not treated as spectra.
        if spectra.select_dtypes(include=['float']).shape[1] < 50:
            c2.warning(
                'Error: Your data is not multivariable, check the number of variables in your data or well tune the dialect.')
            # Reset to an empty frame instance (the original assigned the
            # DataFrame class itself) so later `spectra.empty` guards skip.
            spectra = DataFrame()
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
        # Dimensions of the spectral matrix: rows, then columns.
        n_specs, nwls = spectra.shape
        wls = list(spectra.columns)  # column labels

        # Row labels are forced to text.
        spectra.index = [str(label) for label in spectra.index]
        id = spectra.index  # row labels

        # Short textual summary shown under the upload widgets.
        _summary_lines = (
            'Data summary:',
            f'- the number of spectra:{n_specs}',
            f'- the number of wavelengths:{nwls}',
            f'- the number of categorical variables:{meta_data.shape[1]}',
        )
        with c2:
            for _summary_line in _summary_lines:
                st.write(_summary_line)
    
    DIANE's avatar
    DIANE committed
    ################################################### END : I- Data loading and preparation ####################################################
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    ################################################### BEGIN : visualize and split the data ####################################################
    
    DIANE's avatar
    DIANE committed
    st.subheader("I - Spectral Data Visualization", divider='blue')

    if not spectra.empty:
        c3, c4 = st.columns([3, 1])

        # -- right column: pick the categorical variable used for colouring --
        with c4:
            st.info('Color spectra based on a categorical variable (for easier visualization: only relevant variables with fewer than 60 categories are displated in the dropdown list.)')

            # Leading '' entry stands for "no colouring".
            filter = [''] + md_df_st_.columns.to_list()
            nothing_to_color_by = len(filter) == 1

            specs_col = st.selectbox('Color by:', options=filter, format_func=fmt,
                                     disabled=nothing_to_color_by)
            if nothing_to_color_by:
                st.write("No categorical variable was provided!")

        # -- left column: the spectra themselves --
        with c3:
            if specs_col == '':
                cmap = None
                fig_spectra = plot_spectra(
                    spectra, color=None, cmap=None,
                    xunits='Wavelength/Wavenumber', yunits="Signal intensity")
            else:
                # One colour per distinct category of the chosen variable.
                color_levels = set(md_df_st_[specs_col])
                cmap = dict(zip(color_levels, colorslist[:len(color_levels)]))
                fig_spectra = plot_spectra(
                    spectra, color=md_df_st_[specs_col], cmap=cmap,
                    xunits='Wavelength/Wavenumber', yunits="Signal intensity")

            st.pyplot(fig_spectra)

        # -- right column again: category counts for the chosen variable --
        with c4:
            if specs_col != '':
                st.write('The distribution of samples across categories')
                barh = barhplot(md_df_st_[[specs_col]], cmap=cmap)
                st.pyplot(barh)
            elif len(filter) > 1:
                st.write("No categorical variable was selected!")

        # Advanced interface only: restrict the analysis to a column range.
        if st.session_state.interface == 'advanced':
            with c3:
                values = st.slider('Select a range of values',
                                   min_value=0, max_value=nwls, value=(0, nwls))
                hash_ = ObjectHash(current=hash_, add=values)

                first_col, last_col = values
                spectra = spectra.iloc[:, first_col:last_col]
                nwls = spectra.shape[1]
                wls = wls[first_col:last_col]

                # Column-wise mean of the retained range.
                mean_spectrum = spectra.mean()
                st.pyplot(plot_spectra(
                    mean_spectrum, xunits='Wavelength/Wavenumber', yunits="Signal intensity"))
    
    DIANE's avatar
    DIANE committed
    
            # st.selectbox('Variable', options= [''], disabled=True if len(colfilter)>1, else False)
            # st.write(data_info) ## table showing the number of samples in the data file
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    ################################################### END : visualize and split the data ####################################################
    
    
    DIANE's avatar
    DIANE committed
    
    
    ############################## Exploratory data analysis ###############################
    
    DIANE's avatar
    DIANE committed
    st.subheader(
        "II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')

    # ~~~~~~~~~~~~~~ algorithms available on our app ~~~~~~~~~~~~~~~~
    if st.session_state["interface"] == 'advanced':
        # List of dimensionality reduction algos
        dim_red_methods = ['PCA', 'UMAP', 'NMF']
        # List of clustering algos
        cluster_methods = ['KMEANS', 'HDBSCAN', 'AP']
        # List of sample selection strategies
        seltechs = ['random', 'kennard-stone', 'meta-medoids', 'meta-ks']
    else:
        # 'simple' interface. Any other value also falls back to this reduced
        # catalogue, so the three lists are always defined for the code below.
        dim_red_methods, cluster_methods, seltechs = ['PCA'], [''], ['random']

    ###### 1- Dimensionality reduction ######
    # Empty frame instances (the original bound the DataFrame class itself);
    # they are replaced once a model has been fitted.
    t = DataFrame()  # scores
    p = DataFrame()  # loadings
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
    
    DIANE's avatar
    DIANE committed
        xc = standardize(spectra, center=True, scale=False)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        c5, c6, c7, c8, c9, c10, c11 = st.columns([1, 1, 0.6, 0.6, 0.6, 1.5, 1.5])
        with c5:
    
    DIANE's avatar
    DIANE committed
            # select a dimensionality reduction algorithm
    
            dim_red_method = st.selectbox("Dimensionality reduction techniques: ",
    
    DIANE's avatar
    DIANE committed
                                          options=['']+dim_red_methods if len(dim_red_methods) > 2 else dim_red_methods, format_func=fmt,
    
    DIANE's avatar
    DIANE committed
                                          disabled=False if len(dim_red_methods) > 2 else True)
            hash_ = ObjectHash(current=hash_, add=dim_red_method)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            match dim_red_method:
                case '':
                    st.info('Info: Select a dimensionality reduction technique!')
    
    DIANE's avatar
    DIANE committed
                case 'UMAP':
    
    DIANE's avatar
    DIANE committed
                    supervised = st.selectbox('Supervised UMAP by(optional):', options=filter,
    
    DIANE's avatar
    DIANE committed
                                              format_func=fmt, disabled=False if len(filter) > 1 else True)
    
    DIANE's avatar
    DIANE committed
                    umapsupervisor = None if supervised == '' else md_df_st_[
                        supervised]
                    hash_ = ObjectHash(current=hash_, add=umapsupervisor)
    
    DIANE's avatar
    DIANE committed
            # select a clustering reduction algorithm
    
    DIANE's avatar
    DIANE committed
            disablewidgets = [False if (
                dim_red_method and st.session_state.interface == 'advanced') else True][0]
    
            clus_method = st.selectbox("Clustering techniques(optional): ",
    
    DIANE's avatar
    DIANE committed
                                       options=[
                                           ''] + cluster_methods if len(cluster_methods) > 2 else cluster_methods,
    
    DIANE's avatar
    DIANE committed
                                       key=38, format_func=fmt, disabled=disablewidgets)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            # if disablewidgets == False and dim_red_method in dim_red_methods:
            #     inf = st.info('Info: Select a clustering technique!')
    
    DIANE's avatar
    DIANE committed
            if dim_red_method:
                @st.cache_data
    
    DIANE's avatar
    DIANE committed
                def dimensionality_reduction(dim_red_method, change):
    
    DIANE's avatar
    DIANE committed
                    match dim_red_method:
                        case "PCA":
    
    DIANE's avatar
    DIANE committed
                            from utils.dim_reduction import LinearPCA
                            dr_model = LinearPCA(xc, Ncomp=8)
    
    DIANE's avatar
    DIANE committed
                        case "UMAP":
    
    DIANE's avatar
    DIANE committed
                            from utils.dim_reduction import Umap
                            dr_model = Umap(numerical_data=spectra,
                                            cat_data=umapsupervisor)
    
    DIANE's avatar
    DIANE committed
                        case 'NMF':
    
    DIANE's avatar
    DIANE committed
                            from utils.dim_reduction import Nmf
                            dr_model = Nmf(spectra, Ncomp=3)
    
    DIANE's avatar
    DIANE committed
                    return dr_model
    
    DIANE's avatar
    DIANE committed
                dr_model = dimensionality_reduction(dim_red_method, change=hash_)
    
    DIANE's avatar
    DIANE committed
            if dr_model:
    
    DIANE's avatar
    DIANE committed
                axis1 = c7.selectbox(
                    "x-axis", options=dr_model.scores_.columns, index=0)
                axis2 = c8.selectbox(
                    "y-axis", options=dr_model.scores_.columns, index=1)
                axis3 = c9.selectbox(
                    "z-axis", options=dr_model.scores_.columns, index=2)
    
    DIANE's avatar
    DIANE committed
                axis = np.unique([axis1, axis2, axis3])
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                t = dr_model.scores_.loc[:, axis]
    
    DIANE's avatar
    DIANE committed
                t.index = spectra.index
    
    DIANE's avatar
    DIANE committed
                tcr = standardize(t)
    
    DIANE's avatar
    DIANE committed
    
    
    if not t.empty:
        # Two equal columns for the result plots. (The original also created a
        # container bound to c12 that was overwritten on the very next line.)
        c12, c13 = st.columns([3, 3])

    if not spectra.empty:
        with c6:
            # One input for both conventions: values above 1 are read as a
            # number of samples, values up to 1 as a fraction of the dataset.
            sel_ratio = st.number_input('Enter the number/fraction of samples to be selected:', min_value=0.01,
                                        max_value=float(spectra.shape[0]), value=0.20,
                                        format="%.2f", disabled=disablewidgets)

            if sel_ratio > 1.00:
                ratio = int(sel_ratio)
            else:
                # Includes exactly 1.00, which previously matched neither
                # branch and left `ratio` undefined.
                ratio = int(sel_ratio * spectra.shape[0])

            # NOTE(review): the return value is discarded here, whereas other
            # call sites rebind `hash_` — confirm whether that is intended.
            ObjectHash(current=hash_, add=ratio)

            if dr_model:
                if clus_method:
                    # With clustering, every strategy in `seltechs` is offered.
                    disabled1 = clus_method not in cluster_methods
                    seltech = st.radio('Select samples selection strategy:',
                                       options=seltechs, disabled=disabled1)
                else:
                    # Without clustering only the two cluster-free strategies
                    # are offered; the simple interface locks the widget.
                    seltech = st.radio('Select samples selection strategy:',
                                       options=['random', 'kennard-stone'],
                                       disabled=st.session_state.interface == 'simple')
    
    DIANE's avatar
    DIANE committed
    
    
    
    if not t.empty:
        # ~~~~~~~~~~~~~~~~~~~~~~~ II- Clustering ~~~~~~~~~~~~~~~~~~~~~~~~~~
        if clus_method:
            from utils.clustering import clustering
            labels, n_clusters = clustering(X=tcr, method=clus_method)

        # ~~~~~~  III - Samples selection based on the reduced data presentation ~~~~~~~
        from utils.samsel import selection_method

        # NOTE(review): the return value is discarded — confirm whether `hash_`
        # should be rebound here as it is at other call sites.
        ObjectHash(current=hash_, add=seltech)

        # `labels` exists exactly when a clustering method was chosen, so branch
        # on that choice directly instead of probing globals().
        if not clus_method:
            custom_color_palette = px.colors.qualitative.Plotly[:1]
            selected = selection_method(X=tcr, method=seltech, rset=ratio)

        else:
            custom_color_palette = px.colors.qualitative.Plotly[:n_clusters]
            selected = []

            # Run the selection cluster by cluster, skipping the bucket of
            # samples that were not assigned to any cluster.
            for cluster in set(labels.index):
                if cluster == 'Non clustered':
                    continue
                members = tcr.loc[labels.loc[cluster].values.ravel(), :]
                # NOTE(review): a per-cluster rset_meta (.5, or 1 for a
                # single-sample cluster) used to be computed here but was never
                # passed; the hard-coded .4 is kept — confirm which is intended.
                selected += selection_method(X=members, method=seltech,
                                             rset=ratio, rset_meta=.4)
    
    DIANE's avatar
    DIANE committed
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ results visualization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Scores plot
    if not t.empty:
    
    DIANE's avatar
    DIANE committed
        if clus_method:
    
    DIANE's avatar
    DIANE committed
            filter[0] = clus_method
    
    DIANE's avatar
    DIANE committed
            desactivatelist = True if len(filter) <= 1 else False
        else:
            desactivatelist = True if len(filter) <= 1 else False
    
    DIANE's avatar
    DIANE committed
        with c12:
    
            st.write('Scores plot')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            if len(axis) == 1:
    
    DIANE's avatar
    DIANE committed
                tcr['1d'] = np.random.uniform(-.5, .5, tcr.shape[0])
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            colfilter = st.selectbox('Color by :', options=filter,
    
    DIANE's avatar
    DIANE committed
                                     format_func=fmt, disabled=desactivatelist)
    
    DIANE's avatar
    DIANE committed
            ObjectHash(colfilter)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            if colfilter:
    
    DIANE's avatar
    DIANE committed
                if colfilter not in cluster_methods:  # case meta variable
                    cmap = dict(
                        zip(set(md_df_st_[colfilter]), colorslist[:len(set(md_df_st_[colfilter]))]))
                    tcr['color'] = md_df_st_.loc[:, colfilter]
    
                elif colfilter in cluster_methods:  # case clustering
    
    DIANE's avatar
    DIANE committed
                    if 'colorslist' not in globals():
                        n_colors = len(set(labels.index))
    
    DIANE's avatar
    DIANE committed
                        # Evenly spaced hues
                        hues = np.linspace(0, 1, n_colors, endpoint=False)
    
    DIANE's avatar
    DIANE committed
                        st.write(555)
                        st.write(hues)
                        st.write(555)
                        import random
                        random.seed(42)
                        import matplotlib.colors as mcolors
    
    
    DIANE's avatar
    DIANE committed
                        colorslist = [mcolors.rgb2hex(
                            plt.cm.hsv(hue)) for hue in hues]
    
    DIANE's avatar
    DIANE committed
                        random.shuffle(colorslist)
    
    
    DIANE's avatar
    DIANE committed
                    cmap = dict(
                        zip(set(labels.index), colorslist[:len(set(labels.index))]))
    
    DIANE's avatar
    DIANE committed
                    tcr['color'] = labels.index
    
    DIANE's avatar
    DIANE committed
            else:
    
    DIANE's avatar
    DIANE committed
                cmap = {'Sample': "#7ab0c7"}
                tcr['color'] = ['Sample'] * tcr.shape[0]
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            # start visualization
    
    DIANE's avatar
    DIANE committed
            match t.shape[1]:
                case 3:
    
    DIANE's avatar
    DIANE committed
                    hover1 = {'sample:': tcr.index, 'color': False,
                              axis[0]: False, axis[1]: False, axis[2]: False}
                    fig = px.scatter_3d(tcr, x=axis[0], y=axis[1], z=axis[2], color='color',
                                        color_discrete_map=cmap, hover_data=hover1)
                    fig.add_scatter3d(x=tcr.loc[selected, axis[0]], y=tcr.loc[selected, axis[1]], z=tcr.loc[selected, axis[2]],
                                      mode='markers', marker=dict(size=5, color='black'),
                                      name='selected samples', hovertext=tcr.loc[selected, :].index)
    
    
    DIANE's avatar
    DIANE committed
                case 2:
    
    DIANE's avatar
    DIANE committed
                    hover1 = {'sample:': tcr.index, 'color': False,
                              axis[0]: False, axis[1]: False}
                    fig = px.scatter(tcr, x=axis[0], y=axis[1], color='color',
                                     color_discrete_map=cmap, hover_data=hover1)
                    fig.add_scatter(x=tcr.loc[selected, axis[0]], y=tcr.loc[selected, axis[1]],
                                    mode='markers', marker=dict(size=5, color='black'),
                                    name='selected samples', hovertext=tcr.loc[selected, :].index)
    
    
    DIANE's avatar
    DIANE committed
                case 1:
    
    DIANE's avatar
    DIANE committed
                    hover1 = {'sample:': tcr.index, 'color': False,
                              '1d': False, axis[0]: False}
    
    DIANE's avatar
    DIANE committed
                    yy = np.random.uniform(-.5, .5, tcr.shape[0])
    
    DIANE's avatar
    DIANE committed
                    fig = px.scatter(tcr, x=axis[0], y='1d', color="color",
                                     color_discrete_map=cmap, hover_data=hover1)
    
                    fig.add_scatter(x=tcr.loc[selected, axis[0]], y=tcr.loc[selected, '1d'],
                                    mode='markers', marker=dict(size=5, color='black'),
                                    name='selected samples',
                                    hovertext=tcr.loc[selected, :].index)
                    fig.update_layout(yaxis_range=[-1.6, 1.6])
    
    DIANE's avatar
    DIANE committed
                    fig.update_yaxes(visible=False)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            st.plotly_chart(fig, use_container_width=True)
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
        if dim_red_method in ['PCA', 'NMF']:
            with c13:
                st.write('Loadings plot')
    
    DIANE's avatar
    DIANE committed
                if file.name.split(".")[-1] == 'dx':
                    xlab = ["Wavenumbers (1/cm)" if meta_data.loc[:,
                                                                  'xunits'].iloc[0] == '1/cm' else 'Wavelengths (nm)']
                elif file.name.split(".")[-1] == 'csv':
    
    DIANE's avatar
    DIANE committed
                    xlab = ['Wavelength/Wavenumber']
    
                p = dr_model.loadings_.T
    
    DIANE's avatar
    DIANE committed
                freq = DataFrame(wls, columns=xlab, index=p.index)
                df1 = concat([p, freq], axis=1).melt(
                    id_vars=freq.columns,  var_name='Loadings:', value_name='Value')
    
                loadingsplot = px.line(df1, x=xlab, y='Value', color='Loadings:',
                                       color_discrete_sequence=px.colors.qualitative.Plotly)
                loadingsplot.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
                                                       bordercolor="black", borderwidth=2))
                loadingsplot.update_layout(
                    xaxis_title=xlab[0], yaxis_title='Value')
    
    
    DIANE's avatar
    DIANE committed
                st.plotly_chart(loadingsplot, use_container_width=True)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    # #############################################################################################################
    
    DIANE's avatar
    DIANE committed
        if dim_red_method == 'PCA':
            c14, c15 = st.columns([3, 3])
            with c14:
                st.write('Influence plot')
                # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model
    
    DIANE's avatar
    DIANE committed
                # Keep only the loadings of the components listed in `axis`
                p = p.loc[:, axis]

                # Data reconstructed from the scores and the retained loadings
                xp = np.dot(t, p.T)

                # Q residuals: per-sample sum of squared reconstruction errors.
                # Summing the squared rows yields the same values as the diagonal
                # of E @ E.T without building the samples-by-samples matrix.
                recon_error = xc.values - np.asarray(xp)
                tcr["residuals"] = np.square(recon_error).sum(axis=1)
    
    DIANE's avatar
    DIANE committed
    
                # Leverage
                # Diagonal of the hat matrix T(T'T)^(-1)T', divided by its trace.
                # Reference: Introduction to Multi- and Megavariate Data Analysis using Projection Methods (PCA and PLS),
                # L. Eriksson, E. Johansson, N. Kettaneh-Wold and S. Wold, Umetrics 1999, p. 466
                scores_sel = t.loc[:, axis].values
                gram_inv = np.linalg.inv(scores_sel.T @ scores_sel)
                Hat = scores_sel @ gram_inv @ scores_sel.T
                leverage_values = np.diag(Hat) / np.trace(Hat)
                tcr["leverage"] = DataFrame(
                    leverage_values, index=spectra.index, columns=['Leverage'])
    
    DIANE's avatar
    DIANE committed
    
                # compute thresholds
                # NOTE(review): tcr.shape[1] counts every column of tcr, including
                # the 'residuals' and 'leverage' columns assigned just before this
                # point - confirm this is the intended numerator for the cut-off.
                tresh3 = 2 * tcr.shape[1]/n_specs
                from scipy.stats import chi2

                # Upper-tail (95 %) critical value, so that only unusually large
                # residuals exceed the limit; ppf(0.05) would give the lower tail.
                tresh4 = chi2.ppf(0.95, df=len(axis))
    
    DIANE's avatar
    DIANE committed
    
                # Index labels of the samples lying beyond BOTH thresholds
                high_leverage = tcr['leverage'] > tresh3
                high_residual = tcr['residuals'] > tresh4
                exceed_lev = tcr[high_leverage & high_residual].index.tolist()

                # plot results
                influence_plot = px.scatter(
                    tcr, x="leverage", y="residuals", color='color',
                    color_discrete_map=cmap, hover_data=hover1)

                # overlay the selected samples as black markers
                picked = tcr.loc[selected, :]
                influence_plot.add_scatter(
                    x=picked["leverage"], y=picked["residuals"],
                    mode='markers', marker=dict(size=5, color='black'),
                    name='selected samples', hovertext=picked.index)

                # dashed red lines marking the two thresholds
                threshold_line = dict(
                    line_width=1, line_dash='dash', line_color='red')
                influence_plot.add_vline(x=tresh3, **threshold_line)
                influence_plot.add_hline(y=tresh4, **threshold_line)

                # add labels for the outliers
                for sample_id in exceed_lev:
                    influence_plot.add_annotation(dict(
                        x=tcr.loc[sample_id, 'leverage'],
                        y=tcr.loc[sample_id, 'residuals'],
                        showarrow=True, text=sample_id,
                        font=dict(color="black", size=15),
                        xanchor='auto', yanchor='auto'))

                # update_traces applies to every trace already on the figure,
                # so it has to stay after add_scatter.
                influence_plot.update_traces(marker=dict(size=6), showlegend=True)
                influence_plot.update_layout(
                    xaxis_title="Leverage", yaxis_title="Q-residuals",
                    font=dict(size=20), width=800, height=600)

                st.plotly_chart(influence_plot, use_container_width=True)
    
    DIANE's avatar
    DIANE committed
    
    
    #             influence_plot.update_traces(marker=dict(size= 6), showlegend= True)
    #             influence_plot.update_layout(font=dict(size=23), width=800, height=500)
    #             for annotation in influence_plot.layout.annotations:
    #                 annotation.font.size = 35
    #             influence_plot.update_traces(marker=dict(size= 10), showlegend= False)
    #             influence_plot.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
    #                                              font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
    #             # influence_plot.write_image('./report/results/figures/influence_plot.png', engine = 'kaleido')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            with c15:
                st.write('T²-Hotelling vs Q-residuals plot')
                # Hotelling
    
    DIANE's avatar
    DIANE committed
                # Hotelling T²: squared scores scaled by the variance (not the
                # standard deviation) of each component, summed over components.
                tcr['hotelling'] = (t**2/t.var()).sum(axis=1)
    
    DIANE's avatar
    DIANE committed
    
                # compute thresholds
                from scipy.stats import f, chi2

                # Use the number of components actually retained instead of a
                # hard-coded 3, so the limits stay valid for any length of `axis`.
                n_comp = len(axis)

                # T² limit: A(N² - 1) / (N(N - A)) * F(0.95; A, N - A)
                fcri = f.isf(0.05, n_comp, n_specs - n_comp)
                tresh0 = (n_comp * (n_specs ** 2 - 1) * fcri) / \
                    (n_specs * (n_specs - n_comp))

                # Upper-tail (95 %) critical value for the Q residuals
                tresh1 = chi2.ppf(0.95, df=n_comp)
    
    DIANE's avatar
    DIANE committed
    
                # Index labels of the samples lying beyond BOTH thresholds
                beyond_t2 = tcr['hotelling'] > tresh0
                beyond_q = tcr['residuals'] > tresh1
                exceed_hot = tcr[beyond_t2 & beyond_q].index.tolist()

                # plot results
                hotelling_plot = px.scatter(
                    tcr, x='hotelling', y='residuals', color="color",
                    color_discrete_map=cmap, hover_data=hover1)

                # overlay the selected samples as black markers
                chosen = tcr.loc[selected, :]
                hotelling_plot.add_scatter(
                    x=chosen['hotelling'], y=chosen['residuals'],
                    mode='markers', marker=dict(size=5, color='black'),
                    name='selected samples', hovertext=chosen.index)

                hotelling_plot.update_layout(
                    xaxis_title="Hotelling-T² distance", yaxis_title="Q-residuals",
                    font=dict(size=20), width=800, height=600)

                # dashed red lines marking the two thresholds
                limit_style = dict(
                    line_width=1, line_dash='dash', line_color='red')
                hotelling_plot.add_vline(x=tresh0, **limit_style)
                hotelling_plot.add_hline(y=tresh1, **limit_style)

                # add labels for the outliers
                for sample_id in exceed_hot:
                    hotelling_plot.add_annotation(dict(
                        x=tcr.loc[sample_id, 'hotelling'],
                        y=tcr.loc[sample_id, 'residuals'],
                        showarrow=True, text=sample_id,
                        font=dict(color="black", size=15),
                        xanchor='auto', yanchor='auto'))

                hotelling_plot.update_traces(marker=dict(size=6), showlegend=True)
                # NOTE: this second layout update overrides the font size and the
                # height given in the first update_layout call.
                hotelling_plot.update_layout(
                    font=dict(size=23), width=800, height=500)
                st.plotly_chart(hotelling_plot, use_container_width=True)
    
    DIANE's avatar
    DIANE committed
    
    
    #             # for annotation in hotelling_plot.layout.annotations:
    #             #     annotation.font.size = 35
    #             # hotelling_plot.update_layout(font=dict(size=23), width=800, height=600)
    #             # hotelling_plot.update_traces(marker=dict(size= 10), showlegend= False)
    #             # hotelling_plot.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
    #             #                                  font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
    # #             # hotelling_plot.write_image("./report/results/figures/hotelling_plot.png", format="png")
    
    
    DIANE's avatar
    DIANE committed
    st.subheader('III - Selected Samples for Reference Analysis', divider='blue')
    if selected:
        c16, c17 = st.columns([3, 1])
    
        with c16:
            st.write("Tabular identifiers of selected samples for reference analysis:")
    
    DIANE's avatar
    DIANE committed
    
    
            if 'labels' in globals():
                labels['cluster'] = labels.index
                labels.index = labels['names']
                result = DataFrame({'names': selected,
    
    DIANE's avatar
    DIANE committed
                                    'cluster': selected}, index=selected)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                # Show the metadata rows of the selected samples when metadata is
                # available; otherwise fall back to a one-column table of names.
                # (Without the outer `else`, the metadata lookup was always
                # overwritten and `subset` was never set for empty metadata.)
                if not meta_data.empty:
                    if 'name' in meta_data.columns:
                        subset = meta_data.drop('name', axis=1).loc[selected]
                    else:
                        subset = meta_data.loc[selected]
                else:
                    subset = DataFrame(selected, columns=['names'])

                st.write(subset)
    
            with c17:
    
    DIANE's avatar
    DIANE committed
                # The clustering method itself is not offered as a colouring option
                if clus_method in filter:
                    filter.remove(clus_method)

                # Message assembled from adjacent literals: a line break inside a
                # replacement field of a single-quoted f-string only parses on
                # Python >= 3.12. The rendered text is unchanged.
                st.info(
                    'Information !\n'
                    f' - The total number of samples: {n_specs}.\n'
                    f'- The number of samples selected for reference analysis: {len(selected)}.\n'
                    f' - The proportion of samples selected for reference analysis: {round(len(selected)/n_specs*100)}%.')

                # The selector is disabled when only one option is available
                selected_col = st.selectbox('Color by:  ', options=filter, format_func=fmt,
                                            disabled=len(filter) == 1)
    
                if selected_col:
                    # One colour per distinct value of the chosen column among
                    # the selected samples
                    selected_values = md_df_st_.loc[selected][selected_col]
                    categories = set(selected_values)
                    cmap2 = dict(zip(categories, colorslist[:len(categories)]))
                    st.write('The distribution of selected samples across categories')

                    # Horizontal bar plot of the selected samples per category
                    selected_frame = md_df_st_.loc[selected][[selected_col]]
                    barhsel = barhplot(selected_frame, cmap=cmap2)
                    st.pyplot(barhsel)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    
    #         if meta_data.empty:
    #             # clustered: a list of ints
    #             # sam1 = DataFrame({'name': selected_samples_idx,
    #             #                     'cluster':np.array(labels)[selected_samples_idx]},
    #             #                     index = selected_samples_idx)
    #             st.write(selected_samples_idx)
    #             st.write(clustered)
    #         else:
    #             sam1 = meta_data.iloc[clustered,:].loc[selected_samples_idx,:]
    #             sam1.insert(loc=0, column='index', value=selected_samples_idx)
    #             sam1.insert(loc=1, column='cluster', value=np.array(labels)[selected_samples_idx])
    #         sam1.index = np.arange(len(selected_samples_idx))+1
    #         sam = sam1
    
    #         if clus_method =='HDBSCAN':
    #             with c16:
    #                 unclus = st.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
    
    #             if selected_samples_idx:
    #                 if unclus:
    #                     if meta_data.empty:
    #                         sam2 = DataFrame({'name': spectra.index[non_clustered],
    #                                             'cluster':['Non clustered']*len(spectra.index[non_clustered])},
    #                                             index = spectra.index[non_clustered])
    #                     else :
    #                         sam2 = meta_data.iloc[non_clustered,:]
    #                         sam2.insert(loc=0, column='index', value= spectra.index[non_clustered])
    #                         sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered]))
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #                     sam = concat([sam1, sam2], axis = 0)
    #                     sam.index = np.arange(sam.shape[0])+1
    #                     with c17:
    #                         st.info(f'- The number of Non-clustered samples: {sam2.shape[0]}.\n - The proportion of Non-clustered samples: {round(sam2.shape[0]/n_specs*100)}%')
    #         else:
    #             sam = sam1
    #         with c16:
    #             st.write(sam)
    
    
    # if not sam.empty:
    #     zip_data = ""
    #     Nb_ech = str(n_specs)
    #     nb_clu = str(sam1.shape[0])
    #     st.subheader('Download the analysis results')
    #     st.write("**Note:** Please check the box only after you have finished processing your data and are satisfied with the results. Checking the box prematurely may slow down the app and could lead to crashes.")
    #     decis = st.checkbox("Yes, I want to download the results")
    #     if decis:
    #         ###################################################
    #         # ## generate report
    #         @st.cache_data
    #         def export_report(change):
    #             latex_report = report.report('Representative subset selection', file.name, dim_red_method,
    #                                         clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu,tcr, sam)
    
    #         @st.cache_data
    #         def preparing_results_for_downloading(change):
    #             # path_to_report = Path("report")############################### i am here
    #             match file.name.split(".")[-1]:
    #                 # load csv file
    #                 case 'csv':
    #                     imp.to_csv('report/results/dataset/'+ file.name, sep = ';', encoding = 'utf-8', mode = 'a')
    #                 case 'dx':
    #                     with open('report/results/dataset/'+file.name, 'w') as dd:
    #                         dd.write(dxdata)
    
    #             fig_spectra.savefig(report_path_rel/"results/figures/spectra_plot.png", dpi = 400) ## Export report
    
    #             if len(axis) == 3:
    #                 for i in range(len(comb)):
    #                     fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'].write_image(report_path_rel/f'results/figures/scores_pc{str(comb[i][0]+1)}_pc{str(comb[i][1]+1)}.png')
    #             elif len(axis)==2 :
    #                 fig_export['fig'].write_image(report_path_rel/'results/figures/scores_plot2D.png')
    #             elif len(axis)==1 :
    #                 fig_export['fig'].write_image(report_path_rel/'results/figures/scores_plot1D.png')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #             # Export du graphique
    #             if dim_red_method in ['PCA','NMF']:
    #                 import plotly.io as pio
    #                 img = pio.to_image(loadingsplot, format="png")
    #                 with open(report_path_rel/"results/figures/loadings_plot.png", "wb") as f:
    #                     f.write(img)
    
    DIANE's avatar
    DIANE committed
    #             if dim_red_method == 'PCA':
    
    DIANE's avatar
    DIANE committed
    #                 hotelling_plot.write_image(report_path_rel/"results/figures/hotelling_plot.png", format="png")
    #                 influence_plot.write_image(report_path_rel/'results/figures/influence_plot.png', engine = 'kaleido')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #             sam.to_csv(report_path_rel/'results/Selected_subset_for_calib_development.csv', sep = ';')
    #             export_report(change = hash_)
    #             if Path(report_path_rel/"report.tex").exists():
    #                 report.generate_report(change = hash_)
    #             if Path(report_path_rel/"report.pdf").exists():
    #                 move(report_path_rel/"report.pdf", "./report/results/report.pdf")
    #             return change
    
    DIANE's avatar
    DIANE committed
    #         preparing_results_for_downloading(change = hash_)
    #         report.generate_report(change = hash_)
    
    DIANE's avatar
    DIANE committed
    #         @st.cache_data
    #         def tempdir(change):
    #             from tempfile import TemporaryDirectory
    #             with  TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
    #                 tempdirname = os.path.split(temp_dir)[1]
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #                 if len(os.listdir(report_path_rel/'results/figures/'))>=2:
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #                     make_archive(base_name= report_path_rel/"Results", format="zip", base_dir="results", root_dir = "./report")# create a zip file
    #                     move(report_path_rel/"Results.zip", f"./report/{tempdirname}/Results.zip")# put the inside the temp dir
    #                     with open(report_path_rel/f"{tempdirname}/Results.zip", "rb") as f:
    #                         zip_data = f.read()
    #             return tempdirname, zip_data
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #         try :
    #             tempdirname, zip_data = tempdir(change = hash_)
    #             # st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
    #             #             args = None, kwargs = None,type = "primary",use_container_width = True)
    #         except:
    #             pass
    #     date_time = datetime.now().strftime('%y%m%d%H%M')
    #     disabled_down = True if zip_data == '' else False
    #     st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
    #                 args = None, kwargs = None,type = "primary",use_container_width = True, disabled = disabled_down)
    
    
    
    DIANE's avatar
    DIANE committed
    #     HandleItems.delete_files(keep = ['.py', '.pyc','.bib'])