Skip to content
Snippets Groups Projects
1-samples_selection.py 33.9 KiB
Newer Older
  • Learn to ignore specific revisions
  • DIANE's avatar
    DIANE committed
    from common import *
    
    st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
    
    DIANE's avatar
    DIANE committed
    
    
    
    
    
    
    DIANE's avatar
    UI  
    DIANE committed
    # layout
    UiComponents(pagespath = pages_folder, csspath= css_file,imgpath=image_path ,
                 header=True, sidebar= True, bgimg=False, colborders=True)
    
    DIANE's avatar
    DIANE committed
    st.header("Calibration Subset Selection") # page title
    st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")
    c1, c2 = st.columns([3, 1])
    c1.image("./images/sample selection.png", use_column_width=True) # graphical abstract
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    UI  
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    # empty temp figures
    
    DIANE's avatar
    DIANE committed
    report_path = Path("report")
    report_path_rel = Path("./report")
    
    
    DIANE's avatar
    DIANE committed
    def delete_files(keep):
        """Remove generated files found under ``report_path``.

        Every file in the ``report_path`` tree is deleted, except
        ``logo_cefe.png`` and any file whose name ends with one of the
        suffixes listed in ``keep``. Directories themselves are left in place.

        Parameters
        ----------
        keep : iterable of str
            File-name suffixes (e.g. ``'.py'``) of the files to preserve.
        """
        # Import the module itself: the body uses os.walk, os.remove and
        # os.path.join, so importing only `walk` and `remove` is not enough.
        import os

        # str.endswith accepts a tuple of suffixes; an empty tuple matches nothing.
        kept_suffixes = tuple(keep)

        # Walk bottom-up through the report directory
        for root, _dirs, files in os.walk(report_path, topdown=False):
            for file in files:
                if file != 'logo_cefe.png' and not file.endswith(kept_suffixes):
                    os.remove(os.path.join(root, file))
    
    DIANE's avatar
    DIANE committed
    
    
    
    
    
    DIANE's avatar
    DIANE committed
    if Path('report/out/model').exists() and Path('report/out/model').is_dir():
        rmtree(Path('report/out/model'))
    
    DIANE's avatar
    DIANE committed
    
    # algorithms available on our app
    
    DIANE's avatar
    DIANE committed
    match st.session_state["interface"]:
        case 'simple':
    
            dim_red_methods = ['PCA']
    
    DIANE's avatar
    DIANE committed
            cluster_methods = ['KS'] # List of clustering algos
    
    
        case 'advanced':
            dim_red_methods=['PCA','UMAP', 'NMF']  # List of dimensionality reduction algos
            cluster_methods = ['Kmeans','HDBSCAN', 'AP'] # List of clustering algos
            selec_strategy = ['center','random']
    
    
    DIANE's avatar
    DIANE committed
    # ~~~~~~~~~~~~~~~~ clean the analysis results dir ~~~~~~~~~~~~~~~~
    
    DIANE's avatar
    DIANE committed
    delete_files(keep = ['.py', '.pyc','.bib'])
    
    ################################### I - Data Loading and Visualization ########################################
    files_format = ['csv', 'dx'] # Supported files format
    # loader for datafile
    
    DIANE's avatar
    DIANE committed
    file = c2.file_uploader("Data file", type = ["csv", "dx"], help = " :mushroom: select a csv matrix with samples as rows and lambdas as columns", key = 5)
    
    DIANE's avatar
    DIANE committed
    
    
    ## Preallocation of data structure
    
    DIANE's avatar
    DIANE committed
    spectra = DataFrame()
    meta_data = DataFrame()
    tcr=DataFrame()
    sam=DataFrame()
    sam1=DataFrame()
    selected_samples = DataFrame()
    
    DIANE's avatar
    DIANE committed
    non_clustered = None
    
    DIANE's avatar
    DIANE committed
    l1 = []
    
    DIANE's avatar
    DIANE committed
    labels = []
    color_palette = None
    dr_model = None # dimensionality reduction model
    cl_model = None # clustering model
    
    selection = None
    
    selection_number = "None"
    
    DIANE's avatar
    DIANE committed
    samples_df_chem = DataFrame
    
    DIANE's avatar
    DIANE committed
    selected_samples = []
    selected_samples_idx = []
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    hash_ = ''
    
    
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    if not file:
    
    DIANE's avatar
    DIANE committed
        c2.info('Info: Please load data file !')
    
    DIANE's avatar
    DIANE committed
        extension = file.name.split(".")[-1]
        userfilename = file.name.replace(f".{extension}", '')
    
    DIANE's avatar
    DIANE committed
    
        match extension:
    
    DIANE's avatar
    DIANE committed
            case 'csv':# Load .csv file
    
    DIANE's avatar
    DIANE committed
                with c2:
    
    DIANE's avatar
    DIANE committed
                    c2_1, c2_2 = st.columns([.5, .5])
                    with c2_1:
                        dec = st.radio('decimal:', options= [".", ","], horizontal = True)
                        sep = st.radio("separator:", options = [";", ","], horizontal = True)
                    with c2_2:
                        phdr = st.radio("header: ", options = ["yes", "no"], horizontal = True)
                        pnames = st.radio("samples name:", options = ["yes", "no"], horizontal = True)
    
                    hdr = 0 if phdr =="yes" else None
                    names = 0 if pnames =="yes" else None
                    hash_ = ObjectHash(current=hash_, add= [userfilename, hdr, names, dec, sep])
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    from io import StringIO
                    stringio = StringIO(file.getvalue().decode("utf-8"))
                    data_str = str(stringio.read())
                    
                    @st.cache_data
    
    DIANE's avatar
    DIANE committed
                    def read_csv(file = file, change = None):
    
    DIANE's avatar
    DIANE committed
                        from utils.data_parsing import CsvParser
    
    DIANE's avatar
    DIANE committed
                        par = CsvParser(file= file)
    
    DIANE's avatar
    DIANE committed
                        par.parse(decimal = dec, separator = sep, index_col = names, header = hdr)
                        return par.float, par.meta_data, par.meta_data_st_, par.df
    
    DIANE's avatar
    DIANE committed
                    
    
    DIANE's avatar
    DIANE committed
                    spectra, meta_data, md_df_st_, imp = read_csv(file= file, change = hash_)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    try :
    
    DIANE's avatar
    DIANE committed
                        spectra, meta_data, md_df_st_, imp = read_csv(file= file)
    
    DIANE's avatar
    DIANE committed
                        st.success("The data have been loaded successfully", icon="")
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    except:
                        st.error('''Error: The format of the file does not correspond to the expected dialect settings.
                                  To read the file correctly, please adjust the separator parameters.''')
    
    
    DIANE's avatar
    DIANE committed
            ## Load .dx file
    
    DIANE's avatar
    DIANE committed
            case 'dx':
    
    DIANE's avatar
    DIANE committed
                with c2:
    
    DIANE's avatar
    DIANE committed
                    # Create a temporary file to save the uploaded file
    
    DIANE's avatar
    DIANE committed
                    with NamedTemporaryFile(delete = False, suffix = ".dx") as tmp:
    
    DIANE's avatar
    DIANE committed
                        tmp.write(file.read())
                        # Flush the buffered bytes to disk: the file is re-opened by
                        # name below while this handle is still open, and an
                        # unflushed write could be read back truncated or empty.
                        tmp.flush()
                        tmp_path = tmp.name
                        with open(tmp_path, 'r') as dd:
                            dxdata = dd.read()
                            
                        ## load and parse the temp dx file
                        @st.cache_data
    
    DIANE's avatar
    DIANE committed
                        def read_dx(tmp_path, change = None):
    
    DIANE's avatar
    DIANE committed
                            M = JcampParser(path = tmp_path)
    
    DIANE's avatar
    DIANE committed
                            M.parse()
    
    DIANE's avatar
    DIANE committed
                            return M.chem_data, M.specs_df_, M.meta_data, M.meta_data_st_
                        
                        hash_ = ObjectHash(current=hash_, add= dxdata)
    
    DIANE's avatar
    DIANE committed
                        _, spectra, meta_data, md_df_st_ = read_dx(tmp_path = tmp_path)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                        st.success("The data have been loaded successfully", icon="")
    
    DIANE's avatar
    DIANE committed
    ################################################### END : I- Data loading and preparation ####################################################
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
        with c2:
            st.write('Data summary:')
            st.write(f'- the number of spectra:{spectra.shape[0]}')
            st.write(f'- the number of wavelengths:{spectra.shape[1]}')
            st.write(f'- the number of categorical variables:{meta_data.shape[1]}')
    
    DIANE's avatar
    DIANE committed
    ################################################### BEGIN : visualize and split the data ####################################################
    
    DIANE's avatar
    DIANE committed
    st.subheader("I - Spectral Data Visualization", divider='blue')
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
    
    DIANE's avatar
    DIANE committed
        ObjectHash(np.mean(spectra))
    
        n_samples = spectra.shape[0]
        nwl = spectra.shape[1]
    
    DIANE's avatar
    DIANE committed
        # retrieve columns name and rows name of the dataframe
    
        colnames = list(spectra.columns)
        rownames = [str(i) for i in list(spectra.index)]
        spectra.index = rownames
    
    DIANE's avatar
    DIANE committed
    
        @st.cache_data
    
    DIANE's avatar
    DIANE committed
        def spectra_visualize(variable):# this method takes spectra as input
    
    DIANE's avatar
    DIANE committed
            fig = plot_spectra(spectra, xunits = 'Wavelength/Wavenumber', yunits = "Signal intensity")
            
            data_info = DataFrame({'Name': [file.name],
    
    DIANE's avatar
    DIANE committed
                                    'Number of scanned samples': [n_samples]},
                                    index = ['Input file'])
            return fig, data_info
    
    DIANE's avatar
    DIANE committed
        fig_spectra, data_info = spectra_visualize(variable = spectra)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        c3, c4 = st.columns([3, 1])
        with c3:
    
    DIANE's avatar
    DIANE committed
            st.pyplot(fig_spectra)
    
    
    DIANE's avatar
    DIANE committed
        with c4:
    
            st.info('Information on the loaded data file')
    
    DIANE's avatar
    DIANE committed
            st.write(data_info) ## table showing the number of samples in the data file
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    ################################################### END : visualize and split the data ####################################################
    
    
    ############################## Exploratory data analysis ###############################
    
    DIANE's avatar
    DIANE committed
    st.subheader("II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
    
    ###### 1- Dimensionality reduction ######
    
    DIANE's avatar
    DIANE committed
    # Preallocate empty frames (instances, not the DataFrame class itself) so
    # that later `.empty` checks evaluate to a real boolean.
    t = DataFrame()  # scores
    p = DataFrame()  # loadings
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
    
    DIANE's avatar
    DIANE committed
        xc = standardize(spectra, center=True, scale=False)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        c5, c6, c7, c8, c9, c10, c11 = st.columns([1, 1, 0.6, 0.6, 0.6, 1.5, 1.5])
        with c5:
    
            dim_red_method = st.selectbox("Dimensionality reduction techniques: ",
             options = ['']+dim_red_methods if len(dim_red_methods)>2 else dim_red_methods,
             key = 37, format_func = lambda x: x if x else "<Select>", disabled = False if len(dim_red_methods)>2 else True)
    
    DIANE's avatar
    DIANE committed
            if dim_red_method == '':
                st.info('Info: Select a dimensionality reduction technique!')
    
    DIANE's avatar
    DIANE committed
            ObjectHash(dim_red_method)
    
    DIANE's avatar
    DIANE committed
    
    
            if dim_red_method == "UMAP":
    
    DIANE's avatar
    DIANE committed
                if not meta_data.empty:
    
    DIANE's avatar
    DIANE committed
                    filter = md_df_st_.columns.tolist()
                    supervised = st.selectbox('Supervised UMAP by(optional):', options = ['']+filter, format_func = lambda x: x if x else "<Select>", key=108)
                    umapsupervisor = [None if supervised == '' else md_df_st_[supervised]][0]
    
    DIANE's avatar
    DIANE committed
                else:
                    supervised = st.selectbox('Supervised UMAP by:', options = ["Meta-data is not available"], disabled=True, format_func = lambda x: x if x else "<Select>", key=108)
                    umapsupervisor = None
    
    DIANE's avatar
    DIANE committed
                ObjectHash(supervised)
    
            # Clustering widgets are enabled only once a reduction technique is chosen in the advanced interface
            disablewidgets = not (dim_red_method and st.session_state.interface == 'advanced')
            clus_method = st.selectbox("Clustering techniques(optional): ",
             options = ['']+cluster_methods if len(cluster_methods)>2 else cluster_methods,
             key = 38, format_func = lambda x: x if x else "<Select>", disabled= disablewidgets)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            
            # if disablewidgets == False and dim_red_method in dim_red_methods:
            #     inf = st.info('Info: Select a clustering technique!')
    
    DIANE's avatar
    DIANE committed
            if dim_red_method:
                @st.cache_data
    
    DIANE's avatar
    DIANE committed
                def dimensionality_reduction(variable):
    
    DIANE's avatar
    DIANE committed
                    match dim_red_method:
                        case "PCA":
    
    DIANE's avatar
    DIANE committed
                                from utils.dim_reduction import LinearPCA
    
    DIANE's avatar
    DIANE committed
                                dr_model = LinearPCA(xc, Ncomp=8)
                        case "UMAP":
    
    DIANE's avatar
    DIANE committed
                                from utils.dim_reduction import Umap
    
    DIANE's avatar
    DIANE committed
                                dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = umapsupervisor)   
                        case 'NMF':
    
    DIANE's avatar
    DIANE committed
                                from utils.dim_reduction import Nmf
    
    DIANE's avatar
    DIANE committed
                                dr_model = Nmf(spectra, Ncomp= 3)
                    return dr_model
                
    
    DIANE's avatar
    DIANE committed
                dr_model = dimensionality_reduction(variable = xc)
    
    DIANE's avatar
    DIANE committed
                
    
    DIANE's avatar
    DIANE committed
            if dr_model:
    
    DIANE's avatar
    DIANE committed
                axis1 = c7.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
                axis2 = c8.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
                axis3 = c9.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
    
    DIANE's avatar
    DIANE committed
                axis = np.unique([axis1, axis2, axis3])
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                t = dr_model.scores_.loc[:,np.unique(axis)]
                tcr = standardize(t)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    ###### II - clustering #######
    
    if not t.empty:
    
    DIANE's avatar
    DIANE committed
        clustered = np.arange(n_samples)
        non_clustered = None
    
    
    DIANE's avatar
    DIANE committed
            c12 = st.container()
    
    DIANE's avatar
    DIANE committed
            c12, c13 = st.columns([3,3])
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
    
    DIANE's avatar
    DIANE committed
        with c6:
            sel_ratio = st.number_input('Enter the number/fraction of samples to be selected:',min_value=0.01, max_value=float("{:.2f}".format(spectra.shape[0])), value=0.20, format="%.2f", disabled= disablewidgets)
            if sel_ratio:
                # Values >= 1 are read as an absolute number of samples, values
                # below 1 as a fraction of the dataset. Exactly 1.00 previously
                # matched neither branch and left `ratio` undefined.
                # NOTE(review): 1.00 is treated as "one sample" here — confirm
                # that this is the intended meaning rather than "100 %".
                if sel_ratio >= 1.00:
                    ratio = int(sel_ratio)
                else:
                    # Never let a small fraction truncate to zero selected samples
                    ratio = max(1, int(sel_ratio*spectra.shape[0]))
    
    DIANE's avatar
    DIANE committed
                ObjectHash(sel_ratio)
    
    DIANE's avatar
    DIANE committed
            if st.session_state["interface"] =='simple':
                clus_method = 'KS'
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            else:
                if dr_model and not clus_method:
                    clus_method = st.radio('Select samples selection strategy:', options = ['RDM', 'KS'])
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                elif dr_model and clus_method:
                    disabled1 = False if clus_method in cluster_methods else True
                    selection = st.radio('Select samples selection strategy:', options = selec_strategy, disabled = disabled1)
    
    DIANE's avatar
    DIANE committed
    
    
    
    if dr_model and sel_ratio:
    
    DIANE's avatar
    DIANE committed
        # Clustering
        match clus_method:
            case 'Kmeans':
    
    DIANE's avatar
    DIANE committed
                from utils.clustering import Sk_Kmeans
    
    DIANE's avatar
    DIANE committed
                cl_model = Sk_Kmeans(tcr, max_clusters = ratio)
                data, labels, clu_centers = cl_model.fit_optimal_
                ncluster = clu_centers.shape[0]
    
    DIANE's avatar
    DIANE committed
    
            # 2- HDBSCAN clustering
            case 'HDBSCAN':
    
    DIANE's avatar
    DIANE committed
                from utils.clustering import Hdbscan
    
    DIANE's avatar
    DIANE committed
                cl_model = Hdbscan(np.array(tcr))
                labels, clu_centers, non_clustered = cl_model.labels_,cl_model.centers_, cl_model.non_clustered
    
    DIANE's avatar
    DIANE committed
                ncluster = len(clu_centers)
    
            # 3- Affinity propagation
            case 'AP':
    
    DIANE's avatar
    DIANE committed
                from utils.clustering import AP
    
    DIANE's avatar
    DIANE committed
                cl_model = AP(X = tcr)
                data, labels, clu_centers = cl_model.fit_optimal_
                ncluster = len(clu_centers)
    
            case 'KS':
    
    DIANE's avatar
    DIANE committed
                cl_model = KS(x = tcr, rset = ratio)
    
    DIANE's avatar
    DIANE committed
    
            case 'RDM':
    
    DIANE's avatar
    DIANE committed
                cl_model = RDM(x = tcr, rset = ratio)
    
    
    DIANE's avatar
    DIANE committed
        if clus_method in ['KS', 'RDM']:
    
    DIANE's avatar
    DIANE committed
            _, selected_samples_idx = cl_model.calset
            labels = ["ind"]*n_samples
            ncluster = "1"
            selection_number = 'None'
            selection = 'None'
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        new_tcr = tcr.iloc[clustered,:]
    
    DIANE's avatar
    DIANE committed
        
    
    DIANE's avatar
    DIANE committed
    # #################################################### III - Samples selection using the reduced data presentation ######
    
    # Use len() instead of truthiness: `labels` starts as a list but is
    # reassigned from the clustering models, and an array-like value would make
    # `not labels` raise "truth value ... is ambiguous".
    # NOTE(review): presumably the models return NumPy arrays — confirm.
    if len(labels) == 0:
        custom_color_palette = px.colors.qualitative.Plotly[:1]
    else:

        num_clusters = len(np.unique(labels))
        custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
    
    DIANE's avatar
    DIANE committed
        if clus_method:
    
    DIANE's avatar
    DIANE committed
            match selection:
            # Strategy 0
                case 'center':
                    # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
    
    DIANE's avatar
    DIANE committed
                    from sklearn.metrics import pairwise_distances_argmin_min
    
    DIANE's avatar
    DIANE committed
                    closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
                    selected_samples_idx = np.array(new_tcr.index)[list(closest)]
                    selected_samples_idx = selected_samples_idx.tolist()
                    
                #### Strategy 1
                case 'random':
    
    DIANE's avatar
    DIANE committed
                    selection_number = int(ratio/num_clusters)
    
    DIANE's avatar
    DIANE committed
                    ObjectHash(selection_number)
    
    DIANE's avatar
    DIANE committed
                    s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
                    for i in np.unique(s):
                        C = np.where(np.array(labels) == i)[0]
                        if C.shape[0] >= selection_number:
    
    DIANE's avatar
    DIANE committed
                            from sklearn.cluster import KMeans
    
    DIANE's avatar
    DIANE committed
                            km2 = KMeans(n_clusters = selection_number)
                            km2.fit(tcr.iloc[C,:])
    
    DIANE's avatar
    DIANE committed
                            from sklearn.metrics import pairwise_distances_argmin_min
    
    DIANE's avatar
    DIANE committed
                            clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
                            selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
                        else:
                            selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ results visualization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    
    DIANE's avatar
    DIANE committed
        ## Scores
    
    if not t.empty:
    
    DIANE's avatar
    DIANE committed
        if meta_data.empty and clus_method in cluster_methods:
    
    DIANE's avatar
    DIANE committed
            filter = clus_method
    
    DIANE's avatar
    DIANE committed
        elif not meta_data.empty and clus_method in cluster_methods:
    
    DIANE's avatar
    DIANE committed
            filter = [clus_method] + md_df_st_.columns.tolist()
    
    DIANE's avatar
    DIANE committed
        elif not meta_data.empty and clus_method not in cluster_methods:
            filter = [''] + md_df_st_.columns.tolist()
        elif meta_data.empty and not clus_method in cluster_methods:
            filter = []
    
    DIANE's avatar
    DIANE committed
        
        if st.session_state["interface"] =='simple':
            desactivatelist = True
            if meta_data.empty:
                desactivatelist = True
                filter = ['']
            elif not meta_data.empty:
                filter = [''] + md_df_st_.columns.tolist()
                desactivatelist = False
        else:
            desactivatelist = False
        
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        with c12:
    
            st.write('Scores plot')
    
    DIANE's avatar
    DIANE committed
            tcr_plot = tcr.copy()
    
    DIANE's avatar
    DIANE committed
            if len(axis)== 1:
                tcr_plot['1d'] = np.random.uniform(-.5, .5, tcr_plot.shape[0])
    
    
    DIANE's avatar
    DIANE committed
            colfilter = st.selectbox('Color by:', options= filter,format_func = lambda x: x if x else "<Select>", disabled = desactivatelist)
    
    DIANE's avatar
    DIANE committed
            ObjectHash(colfilter)
    
    DIANE's avatar
    DIANE committed
            if colfilter in cluster_methods:
                tcr_plot[colfilter] = labels
            elif not meta_data.empty and colfilter in md_df_st_.columns.tolist():
                tcr_plot[f'{colfilter} :'] = list(map(str.lower,md_df_st_.loc[:,colfilter]))
    
    DIANE's avatar
    DIANE committed
                tcr_plot[f'{colfilter} :'] = ['sample'] * tcr_plot.shape[0]
            
            col_var_name = tcr_plot.columns.tolist()[-1]
            n_categories = len(np.unique(tcr_plot[col_var_name]))
            custom_color_palette = px.colors.qualitative.Plotly[:n_categories]
    
    
    DIANE's avatar
    DIANE committed
            if selected_samples_idx:# color selected samples
                t_selected = tcr_plot.iloc[selected_samples_idx,:]
            match t.shape[1]:
                case 3:
                    fig = px.scatter_3d(tcr_plot, x = axis[0], y = axis[1], z = axis[2], color = col_var_name ,color_discrete_sequence = custom_color_palette)
                    fig.update_traces(marker=dict(size=4))
                    if selected_samples_idx:# color selected samples
                        fig.add_scatter3d(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]], z = t_selected.loc[:,axis[2]],
                                        mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
    
    DIANE's avatar
    DIANE committed
                    
    
    DIANE's avatar
    DIANE committed
                case 2:
                    fig = px.scatter(tcr_plot, x = axis[0], y = axis[1], color = col_var_name ,color_discrete_sequence = custom_color_palette)
                    if selected_samples_idx:# color selected samples
                        fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]],
                                        mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
    
                
    
    DIANE's avatar
    DIANE committed
                case 1:
                    # 1-D case: plot the single score axis against the random jitter column '1d'
                    fig = px.scatter(tcr_plot, x = axis[0], y = '1d', color = col_var_name ,color_discrete_sequence = custom_color_palette)
                    # t_selected only exists when samples were selected; guard as in the 2-D and 3-D cases
                    if selected_samples_idx:# color selected samples
                        fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected['1d'],
                                        mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
    
    DIANE's avatar
    DIANE committed
                    fig.update_layout( yaxis_range=[-1.6, 1.6])
    
    DIANE's avatar
    DIANE committed
                    fig.update_yaxes(visible=False)
    
            st.plotly_chart(fig, use_container_width = True)
    
            if labels:
                fig_export = {}
                # export 2D scores plot
                if len(axis)== 3:
    
    DIANE's avatar
    DIANE committed
                    from itertools import combinations
    
    DIANE's avatar
    DIANE committed
                    comb = [i for i in combinations(np.arange(len(axis)), 2)]
                    subcap = ['a','b','c']
                    for i in range(len(comb)):
                        fig_= px.scatter(tcr_plot, x = axis[(comb[i][0])], y=axis[(comb[i][1])],color = labels if list(labels) else None,color_discrete_sequence = custom_color_palette)
                        fig_.add_scatter(x = t_selected.loc[:,axis[(comb[i][0])]], y = t_selected.loc[:,axis[(comb[i][1])]], mode ='markers', marker = dict(size = 5, color = 'black'),
                                    name = 'selected samples')
                        fig_.update_layout(font=dict(size=23))
                        fig_.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1,
                                                    font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
                        fig_.update_traces(marker=dict(size= 10), showlegend= False)
                        fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'] = fig_
                        # fig_export.write_image(f'./report/out/figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png')
                else:
                    fig_export['fig'] = fig
    
    DIANE's avatar
    DIANE committed
                
    
    DIANE's avatar
    DIANE committed
    if not spectra.empty:
    
    DIANE's avatar
    DIANE committed
        if dim_red_method in ['PCA','NMF']:
    
    DIANE's avatar
    DIANE committed
            with c13:
    
                st.write('Loadings plot')
                p = dr_model.loadings_
    
    DIANE's avatar
    DIANE committed
                freq = DataFrame(colnames, index=p.index)
    
    DIANE's avatar
    DIANE committed
                if extension =='dx':
    
    DIANE's avatar
    DIANE committed
                    if meta_data.loc[:,'xunits'][0] == '1/cm':
                        freq.columns = ['Wavenumber (1/cm)']
    
    DIANE's avatar
    DIANE committed
                        xlab = "Wavenumber (1/cm)"
                        inv = 'reversed'
    
    DIANE's avatar
    DIANE committed
                    else:
                        freq.columns = ['Wavelength (nm)']
    
    DIANE's avatar
    DIANE committed
                        xlab = 'Wavelength (nm)'
                        inv = None
    
    DIANE's avatar
    DIANE committed
                else:
                    freq.columns = ['Wavelength/Wavenumber']
    
    DIANE's avatar
    DIANE committed
                    xlab = 'Wavelength/Wavenumber'
                    inv = None
    
    DIANE's avatar
    DIANE committed
                    
    
    DIANE's avatar
    DIANE committed
                pp = concat([p, freq], axis=1)
    
    DIANE's avatar
    DIANE committed
                #########################################
                df1 = pp.melt(id_vars=freq.columns)
    
    DIANE's avatar
    DIANE committed
                loadingsplot = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
                loadingsplot.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
    
                                            bordercolor="black", borderwidth=2))
    
    DIANE's avatar
    DIANE committed
                loadingsplot.update_layout(xaxis_title = xlab,yaxis_title = "Intensity" ,xaxis = dict(autorange= inv))
    
    DIANE's avatar
    DIANE committed
                st.plotly_chart(loadingsplot, use_container_width=True)
        
    
    DIANE's avatar
    DIANE committed
    #############################################################################################################
    
    DIANE's avatar
    DIANE committed
        if dim_red_method == 'PCA':
    
    DIANE's avatar
    DIANE committed
            c14, c15 = st.columns([3, 3])
            with c14:
    
                st.write('Influence plot')
    
    DIANE's avatar
    DIANE committed
                # Leverage: diagonal of the hat matrix T (T'T)^-1 T' built from the
                # scores `t`, divided by its trace so the leverages sum to one.
                Hat =  t.to_numpy() @ np.linalg.inv(np.transpose(t.to_numpy()) @ t.to_numpy()) @ np.transpose(t.to_numpy())
                leverage = np.diag(Hat) / np.trace(Hat)

                # Loadings restricted to the columns listed in `axis`
                p = dr_model.loadings_.loc[:,axis]

                # Matrix reconstruction from scores and loadings
                xp = np.dot(t,p.T)
                # Q residuals: magnitude of the variation remaining in each sample
                # after projection through the model. The row-wise sum of squares
                # equals diag(E @ E.T) but never materialises the n x n product.
                residuals = np.sum(np.square(xc.to_numpy() - np.asarray(xp)), axis=1)

                from scipy.stats import chi2
                # NOTE(review): ppf(0.05) is the lower 5 % quantile, whereas this value
                # is drawn as an upper limit (`residuals > tresh4`); confirm whether
                # ppf(0.95) / isf(0.05) was intended.
                tresh4 = chi2.ppf(0.05, df = len(axis))
    
    DIANE's avatar
    DIANE committed
    
                # color with metadata
    
    DIANE's avatar
    DIANE committed
                # Pick the per-sample labels `l1` according to the active colour filter.
                if colfilter:
                    # NOTE(review): if colfilter is a str, "" is falsy and can never get
                    # past the enclosing `if colfilter:` - confirm the intended nesting.
                    if colfilter == "":
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    # Colouring by the clustering method: reuse the cluster labels.
                    elif colfilter == clus_method:
    
    DIANE's avatar
    DIANE committed
                        l1 = labels
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
                    # Any other filter: read the tcr_plot column named "<filter> :".
                    else:
    
    DIANE's avatar
    DIANE committed
                        l1 = tcr_plot[f'{colfilter} :']
    
    DIANE's avatar
    DIANE committed
                # Store both diagnostics as columns so plotly can address them by name.
                tcr_plot["leverage"] = leverage
                tcr_plot["residuals"] = residuals

                # Leverage vs Q-residuals scatter, coloured by `col_var_name`.
                influence_plot = px.scatter(
                    data_frame=tcr_plot,
                    x="leverage",
                    y="residuals",
                    color=col_var_name,
                    color_discrete_sequence=custom_color_palette,
                )
                # Selected samples are overlaid as small black markers.
                influence_plot.add_scatter(
                    x=leverage[selected_samples_idx],
                    y=residuals[selected_samples_idx],
                    mode='markers',
                    marker={'size': 5, 'color': 'black'},
                    name='selected samples',
                )

                # Red guide lines at the two thresholds.
                influence_plot.add_vline(x=tresh3, line_width=1, line_dash='solid', line_color='red')
                influence_plot.add_hline(y=tresh4, line_width=1, line_dash='solid', line_color='red')

                # Boolean masks of the samples lying beyond each threshold.
                out3 = leverage > tresh3
                out4 = residuals > tresh4

                # On-screen version: axis titles, small markers, legend shown.
                influence_plot.update_layout(
                    xaxis_title="Leverage",
                    yaxis_title="Q-residuals",
                    font={'size': 23},
                    width=800,
                    height=500,
                )
                influence_plot.update_traces(marker={'size': 6}, showlegend=True)
                st.plotly_chart(influence_plot, use_container_width=True)

                # Export version: larger annotation text and markers, no legend,
                # plus a boxed "(a)" panel tag outside the top-left corner.
                for note in influence_plot.layout.annotations:
                    note.font.size = 35
                influence_plot.update_layout(font={'size': 23}, width=800, height=600)
                influence_plot.update_traces(marker={'size': 10}, showlegend=False)
                influence_plot.add_annotation(
                    text='(a)',
                    align='center',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=-0.125,
                    y=1,
                    font={'color': "black", 'size': 35},
                    bgcolor='white',
                    borderpad=2,
                    bordercolor='black',
                    borderwidth=3,
                )
    
    DIANE's avatar
    DIANE committed
            with c15:
    
    DIANE's avatar
    DIANE committed
                st.write('T²-Hotelling vs Q-residuals plot')
    
    DIANE's avatar
    DIANE committed
                # Value plotted on the "Hotelling" axis: row-wise variance of the scores.
                # NOTE(review): this is not the usual sum of squared standardised
                # scores - confirm that the variance is what is wanted here.
                hotelling  = t.var(axis = 1)
                # Q residuals: magnitude of the variation remaining in each sample
                # after projection through the model (row-wise sum of squares, which
                # equals diag(E @ E.T) without building the n x n product).
                residuals = np.sum(np.square(xc.to_numpy() - np.asarray(xp)), axis=1)

                from scipy.stats import f, chi2
                # The number of retained components is len(axis), the same value the
                # influence-plot threshold uses, instead of a hard-coded 3.
                fcri = f.isf(0.05, len(axis), n_samples)
                tresh0 = (len(axis) * (n_samples ** 2 - 1) * fcri) / (n_samples * (n_samples - len(axis)))
                # NOTE(review): ppf(0.05) is a lower-tail quantile although it is used
                # as an upper limit below; confirm whether isf(0.05) was intended.
                tresh1 = chi2.ppf(0.05, df = len(axis))

                hotelling_plot = px.scatter(t, x = hotelling, y = residuals, color=labels if list(labels) else None,
                                                color_discrete_sequence= custom_color_palette)
                # `hotelling` is a pandas Series, so the selected positions must be
                # taken with .iloc; `residuals` is a plain array and indexes by position.
                hotelling_plot.add_scatter(x = hotelling.iloc[selected_samples_idx] , y = residuals[selected_samples_idx],
                                           mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')

                hotelling_plot.update_layout(xaxis_title="Hotelling-T² distance",yaxis_title="Q-residuals")
                # Red guide lines at the two thresholds
                hotelling_plot.add_vline(x=tresh0, line_width=1, line_dash='solid', line_color='red')
                hotelling_plot.add_hline(y=tresh1, line_width=1, line_dash='solid', line_color='red')
    
    DIANE's avatar
    DIANE committed
    
                # Boolean masks of the samples lying beyond each red threshold line.
                out0 = hotelling > tresh0
                out1 = residuals > tresh1
    
    DIANE's avatar
    DIANE committed
                
    
    DIANE's avatar
    DIANE committed
                    # Label every sample whose Hotelling value exceeds tresh0: use its
                    # 'name' from meta_data when metadata exists, else its index label.
                    if out0[i]:
                        if not meta_data.empty:
                            # NOTE(review): an integer key on a Series is a label lookup;
                            # confirm `.iloc[i]` is not what is meant here.
                            ann =  meta_data.loc[:,'name'][i]
                        else:
                            ann = t.index[i]
    
    DIANE's avatar
    DIANE committed
                        # Arrowed text annotation placed on the flagged point.
                        hotelling_plot.add_annotation(dict(x = hotelling[i], y = residuals[i], showarrow=True, text = str(ann), font= dict(color= "black", size= 15),
    
    DIANE's avatar
    DIANE committed
                                    xanchor = 'auto', yanchor = 'auto'))
    
    DIANE's avatar
    DIANE committed
                        
    
    DIANE's avatar
    DIANE committed
                # On-screen version: small markers, legend shown.
                hotelling_plot.update_traces(marker={'size': 6}, showlegend=True)
                hotelling_plot.update_layout(font={'size': 23}, width=800, height=500)
                st.plotly_chart(hotelling_plot, use_container_width=True)

                # Export version: larger annotation text and markers, no legend,
                # plus a boxed "(b)" panel tag outside the top-left corner.
                for note in hotelling_plot.layout.annotations:
                    note.font.size = 35
                hotelling_plot.update_layout(font={'size': 23}, width=800, height=600)
                hotelling_plot.update_traces(marker={'size': 10}, showlegend=False)
                hotelling_plot.add_annotation(
                    text='(b)',
                    align='center',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=-0.125,
                    y=1,
                    font={'color': "black", 'size': 35},
                    bgcolor='white',
                    borderpad=2,
                    bordercolor='black',
                    borderwidth=3,
                )
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    st.subheader('III - Selected Samples for Reference Analysis', divider='blue')
    
    DIANE's avatar
    DIANE committed
        c16, c17 = st.columns([3, 1])
        c16.write("Tabular identifiers of selected samples for reference analysis:")
    
    DIANE's avatar
    DIANE committed
                sam1 = DataFrame({'name': spectra.index[clustered][selected_samples_idx],
    
                                    'cluster':np.array(labels)[clustered][selected_samples_idx]},
                                    index = selected_samples_idx)
            else:
                # Metadata rows of the selected samples: keep the clustered samples
                # first, then pick the selected positions within that subset.
                sam1 = meta_data.iloc[clustered,:].iloc[selected_samples_idx,:]
                sam1.insert(loc=0, column='index', value=selected_samples_idx)
                # The labels go through the same `clustered` filter as the rows, so
                # each cluster label stays aligned with its sample.
                sam1.insert(loc=1, column='cluster', value=np.array(labels)[clustered][selected_samples_idx])
            sam1.index = np.arange(len(selected_samples_idx))+1
    
    DIANE's avatar
    DIANE committed
            with c17:
                st.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.')
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            # HDBSCAN only: optionally append the samples it left without a cluster.
            if clus_method =='HDBSCAN':
                with c16:
                    unclus = st.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)


                if selected_samples_idx:
                    if unclus:
                        # Build `sam2`, the table of non-clustered samples, in the same
                        # layout as `sam1` (from the spectra index, or from meta_data).
                        if meta_data.empty:
    
    DIANE's avatar
    DIANE committed
                            sam2 = DataFrame({'name': spectra.index[non_clustered],
    
                                                'cluster':['Non clustered']*len(spectra.index[non_clustered])},
                                                index = spectra.index[non_clustered])
                        else :
                            sam2 = meta_data.iloc[non_clustered,:]
                            sam2.insert(loc=0, column='index', value= spectra.index[non_clustered])
                            sam2.insert(loc=1, column='cluster', value=['Non clustered']*len(spectra.index[non_clustered]))
                        
    
    DIANE's avatar
    DIANE committed
                        # Final table: selected samples followed by the non-clustered ones.
                        # NOTE(review): this is the only visible assignment of `sam`;
                        # confirm a fallback exists for the other paths.
                        sam = concat([sam1, sam2], axis = 0)
    
    DIANE's avatar
    DIANE committed
                        with c17:
                            st.info(f'- The number of Non-clustered samples: {sam2.shape[0]}.\n - The proportion of Non-clustered samples: {round(sam2.shape[0]/n_samples*100)}%')
    
    DIANE's avatar
    DIANE committed
            with c16:
                st.write(sam)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    if not sam.empty:
    
    DIANE's avatar
    DIANE committed
        zip_data = ""
    
        Nb_ech = str(n_samples)
        nb_clu = str(sam1.shape[0])
    
    DIANE's avatar
    DIANE committed
        st.subheader('Download the analysis results')
    
    DIANE's avatar
    DIANE committed
        st.write("**Note:** Please check the box only after you have finished processing your data and are satisfied with the results. Checking the box prematurely may slow down the app and could lead to crashes.")
        decis = st.checkbox("Yes, I want to download the results")
        if decis:
            ###################################################
            # ## generate report
            @st.cache_data
            def export_report(change):
                """Call ``report.report`` with the settings and results of this run.

                The call receives the page title, the uploaded file name, the
                dimensionality-reduction and clustering method names, the sample and
                cluster counts, the selection settings, ``tcr`` and the table of
                selected samples. Presumably this writes the LaTeX source of the
                report. Nothing is returned.

                Args:
                    change: Not read by the body; it only forms the
                        ``st.cache_data`` key, so a new value triggers a fresh call.
                """
                report.report(
                    'Representative subset selection',
                    file.name,
                    dim_red_method,
                    clus_method,
                    Nb_ech,
                    ncluster,
                    selection,
                    selection_number,
                    nb_clu,
                    tcr,
                    sam,
                )
    
    DIANE's avatar
    DIANE committed
            @st.cache_data
            def preparing_results_for_downloading(change):
                """Write the dataset copy, figures, sample table and report to ./report.

                Steps, in order: copy the uploaded dataset to ``report/out/dataset``,
                save the spectra, scores, loadings and (for PCA) diagnostic figures to
                ``report/out/figures``, export the selected-samples table as CSV,
                then build the report and move the resulting PDF into ``report/out``.

                Args:
                    change: Only forms the ``st.cache_data`` key; returned unchanged.

                Returns:
                    The ``change`` value that was passed in.
                """
                # Copy of the uploaded dataset, kept in its original format
                if extension == 'csv':
                    imp.to_csv('report/out/dataset/'+ file.name, sep = ';', encoding = 'utf-8', mode = 'a')
                elif extension == 'dx':
                    with open('report/out/dataset/'+file.name, 'w') as dataset_copy:
                        dataset_copy.write(dxdata)

                # Spectra figure
                fig_spectra.savefig(report_path_rel/"out/figures/spectra_plot.png", dpi = 400)

                # Scores figures: one per pair of components in 3-D, a single one otherwise
                n_axes = len(axis)
                if n_axes == 3:
                    for pair in comb:
                        # Dictionary keys are 0-based, file names are 1-based.
                        fig_export[f'scores_pc{pair[0]}_pc{pair[1]}'].write_image(
                            report_path_rel/f'out/figures/scores_pc{str(pair[0]+1)}_pc{str(pair[1]+1)}.png'
                        )
                elif n_axes == 2:
                    fig_export['fig'].write_image(report_path_rel/'out/figures/scores_plot2D.png')
                elif n_axes == 1:
                    fig_export['fig'].write_image(report_path_rel/'out/figures/scores_plot1D.png')

                # Loadings figure, rendered to PNG bytes and written in one go
                if dim_red_method in ('PCA', 'NMF'):
                    import plotly.io as pio

                    (report_path_rel/"out/figures/loadings_plot.png").write_bytes(
                        pio.to_image(loadingsplot, format="png")
                    )

                # PCA-only diagnostic figures
                if dim_red_method == 'PCA':
                    hotelling_plot.write_image(report_path_rel/"out/figures/hotelling_plot.png", format="png")
                    influence_plot.write_image(report_path_rel/'out/figures/influence_plot.png', engine = 'kaleido')

                # Table of the selected samples
                sam.to_csv(report_path_rel/'out/Selected_subset_for_calib_development.csv', sep = ';')

                # Report: build it only once its .tex source exists, then move the
                # PDF (if one was produced) next to the other outputs.
                export_report(change = hash_)
                if (report_path_rel/"report.tex").exists():
                    report.generate_report(change = hash_)
                if (report_path_rel/"report.pdf").exists():
                    move(report_path_rel/"report.pdf", "./report/out/report.pdf")

                return change
    
    DIANE's avatar
    DIANE committed
            preparing_results_for_downloading(change = hash_)
            report.generate_report(change = hash_)
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
            @st.cache_data
            def tempdir(change):
                """Zip ``./report/out`` and return the archive content.

                The archive is built only when ``report/out/figures`` holds at least
                two entries. It is moved into a scratch directory created under
                ``./report`` and read back from there; the scratch directory is
                removed when the ``with`` block ends.

                Args:
                    change: Not read by the body; it only forms the
                        ``st.cache_data`` key.

                Returns:
                    ``(tempdirname, zip_data)``: the scratch directory's base name
                    and the zip file's bytes, or ``""`` when no archive was built.
                """
                from tempfile import TemporaryDirectory

                # Start from the "nothing to download" value so the return below is
                # always defined, even when the figure check fails.
                zip_data = ""
                with TemporaryDirectory(prefix="results", dir="./report") as temp_dir:
                    tempdirname = os.path.split(temp_dir)[1]

                    if len(os.listdir(report_path_rel/'out/figures/')) >= 2:
                        # Create the zip next to ./report/out, then move it into the
                        # scratch directory so it is cleaned up with it.
                        make_archive(base_name= report_path_rel/"Results", format="zip", base_dir="out", root_dir = "./report")
                        move(report_path_rel/"Results.zip", f"./report/{tempdirname}/Results.zip")
                        with open(report_path_rel/f"{tempdirname}/Results.zip", "rb") as archive:
                            zip_data = archive.read()
                return tempdirname, zip_data
    
    DIANE's avatar
    DIANE committed
            
    
    DIANE's avatar
    DIANE committed
            # Build the archive on a best-effort basis: on failure `zip_data` keeps
            # its empty default and the download button stays disabled.
            try:
                tempdirname, zip_data = tempdir(change = hash_)
            except Exception:
                # Narrower than a bare `except:` so that KeyboardInterrupt and
                # SystemExit are not swallowed along with real errors.
                pass
    
    DIANE's avatar
    DIANE committed
        # Timestamp (yymmddHHMM) embedded in the archive name
        date_time = datetime.now().strftime('%y%m%d%H%M')
        # The button is greyed out for as long as no archive has been produced
        disabled_down = zip_data == ''
        st.download_button(
            label='Download',
            data=zip_data,
            file_name=f'Nirs_Workflow_{date_time}_SamSel_.zip',
            mime="application/zip",
            args=None,
            kwargs=None,
            type="primary",
            use_container_width=True,
            disabled=disabled_down,
        )
    
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
        delete_files(keep = ['.py', '.pyc','.bib'])