# 1-samples_selection.py
from common import *
st.set_page_config(
    page_title="NIRS Utils",
    page_icon=":goat:",
    layout="wide",
)

# layout: page chrome built by the project's UiComponents helper
UiComponents(
    pagespath=pages_folder,
    csspath=css_file,
    imgpath=image_path,
    header=True,
    sidebar=True,
    bgimg=False,
    colborders=True,
)

# page title and short description
st.header("Calibration Subset Selection")
st.markdown("Create a predictive model, then use it for predicting your target variable (chemical data) from NIRS spectra")

# c1 shows the graphical abstract; c2 is used further down for the data loader
c1, c2 = st.columns([3, 1])
c1.image("./images/sample selection.png", use_column_width=True)

# empty temp figures
report_path = Path("report")
report_path_rel = Path("./report")


def delete_files(keep):
    """Delete every file found under ``report_path`` unless it must be kept.

    Parameters
    ----------
    keep : iterable of str
        File-name suffixes (e.g. ``'.py'``); a file whose name ends with one
        of them is preserved.

    Notes
    -----
    ``logo_cefe.png`` is always preserved. Only files are removed; the
    directory tree itself is left in place.
    """
    # Imported here because the original imported only `walk`/`remove` while
    # calling `os.walk`/`os.remove`, i.e. it depended on `os` being leaked by
    # `from common import *`.
    import os

    kept_suffixes = tuple(keep)
    # Walk through the directory
    for root, _dirs, files in os.walk(report_path, topdown=False):
        for file in files:
            if file == 'logo_cefe.png' or file.endswith(kept_suffixes):
                continue
            os.remove(os.path.join(root, file))


# Drop the model output directory if it exists (and is a directory).
_model_dir = Path('report/out/model')
if _model_dir.is_dir():
    rmtree(_model_dir)
DIANE's avatar
DIANE committed

# algorithms available on our app
# The option lists depend on the interface mode stored in the session state;
# `selec_strategy` is only defined for the advanced interface.
if st.session_state["interface"] == 'simple':
    dim_red_methods = ['PCA']
    cluster_methods = ['KS']  # List of clustering algos

elif st.session_state["interface"] == 'advanced':
    dim_red_methods = ['PCA', 'UMAP', 'NMF']  # List of dimensionality reduction algos
    cluster_methods = ['Kmeans', 'HDBSCAN', 'AP']  # List of clustering algos
    selec_strategy = ['center', 'random']

# ~~~~~~~~~~~~~~~~ clean the analysis results dir ~~~~~~~~~~~~~~~~
delete_files(keep=['.py', '.pyc', '.bib'])

################################### I - Data Loading and Visualization ########################################
files_format = ['csv', 'dx']  # Supported files format

# loader for datafile (placed in the right-hand column)
file = c2.file_uploader(
    "Data file",
    type=files_format,
    help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns",
    key=5,
)

## Preallocation of data structure
# Defaults for the names that the rest of the script tests before the user has
# loaded data or made a choice.
spectra = DataFrame()
meta_data = DataFrame()
tcr = DataFrame()
sam = DataFrame()
sam1 = DataFrame()
non_clustered = None
l1 = []
labels = []
color_palette = None
dr_model = None  # dimensionality reduction model
cl_model = None  # clustering model
selection = None
selection_number = "None"
# Instantiate: the original bound the DataFrame *class* here, not an empty frame.
samples_df_chem = DataFrame()
# `selected_samples` was first preallocated as a DataFrame and then rebound to
# a list a few lines later; only the final (list) binding is kept.
selected_samples = []
selected_samples_idx = []

hash_ = ''

if not file:
DIANE's avatar
DIANE committed
    c2.info('Info: Please load data file !')
DIANE's avatar
DIANE committed
    extension = file.name.split(".")[-1]
    userfilename = file.name.replace(f".{extension}", '')
DIANE's avatar
DIANE committed

    match extension:
DIANE's avatar
DIANE committed
        case 'csv':# Load .csv file
DIANE's avatar
DIANE committed
            with c2:
DIANE's avatar
DIANE committed
                c2_1, c2_2 = st.columns([.5, .5])
                with c2_1:
                    dec = st.radio('decimal:', options= [".", ","], horizontal = True)
                    sep = st.radio("separator:", options = [";", ","], horizontal = True)
                with c2_2:
                    phdr = st.radio("header: ", options = ["yes", "no"], horizontal = True)
                    pnames = st.radio("samples name:", options = ["yes", "no"], horizontal = True)

                hdr = 0 if phdr =="yes" else None
                names = 0 if pnames =="yes" else None
                hash_ = ObjectHash(current=hash_, add= [userfilename, hdr, names, dec, sep])
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                from io import StringIO
                stringio = StringIO(file.getvalue().decode("utf-8"))
                data_str = str(stringio.read())
                
                @st.cache_data
DIANE's avatar
DIANE committed
                def read_csv(file = file, change = None):
DIANE's avatar
DIANE committed
                    from utils.data_parsing import CsvParser
DIANE's avatar
DIANE committed
                    par = CsvParser(file= file)
DIANE's avatar
DIANE committed
                    par.parse(decimal = dec, separator = sep, index_col = names, header = hdr)
                    return par.float, par.meta_data, par.meta_data_st_, par.df
DIANE's avatar
DIANE committed
                
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                try :
                    spectra, meta_data, md_df_st_, imp = read_csv(file= file, change = hash_)
DIANE's avatar
DIANE committed
                    st.success("The data have been loaded successfully", icon="")
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                except:
                    st.error('''Error: The format of the file does not correspond to the expected dialect settings.
                              To read the file correctly, please adjust the separator parameters.''')

DIANE's avatar
DIANE committed
        ## Load .dx file
DIANE's avatar
DIANE committed
        case 'dx':
DIANE's avatar
DIANE committed
            with c2:
DIANE's avatar
DIANE committed
                # Create a temporary file to save the uploaded file
DIANE's avatar
DIANE committed
                with NamedTemporaryFile(delete = False, suffix = ".dx") as tmp:
DIANE's avatar
DIANE committed
                    tmp.write(file.read())
                    tmp_path = tmp.name
                    with open(tmp.name, 'r') as dd:
                        dxdata = dd.read()
                        
                    ## load and parse the temp dx file
                    @st.cache_data
DIANE's avatar
DIANE committed
                    def read_dx(tmp_path, change = None):
DIANE's avatar
DIANE committed
                        M = JcampParser(path = tmp_path)
DIANE's avatar
DIANE committed
                        M.parse()
DIANE's avatar
DIANE committed
                        return M.chem_data, M.specs_df_, M.meta_data, M.meta_data_st_
                    
                    hash_ = ObjectHash(current=hash_, add= dxdata)
DIANE's avatar
DIANE committed
                    _, spectra, meta_data, md_df_st_ = read_dx(tmp_path = tmp_path)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                    st.success("The data have been loaded successfully", icon="")
DIANE's avatar
DIANE committed
################################################### END : I- Data loading and preparation ####################################################
# Short textual summary of the loaded data, shown under the loader.
if not spectra.empty:
    _summary_lines = [
        'Data summary:',
        f'- the number of spectra:{spectra.shape[0]}',
        f'- the number of wavelengths:{spectra.shape[1]}',
        f'- the number of categorical variables:{meta_data.shape[1]}',
    ]
    with c2:
        for _summary_line in _summary_lines:
            st.write(_summary_line)
################################################### BEGIN : visualize and split the data ####################################################
st.subheader("I - Spectral Data Visualization", divider='blue')
if not spectra.empty:
    ObjectHash(np.mean(spectra))
    n_samples = spectra.shape[0]
    nwl = spectra.shape[1]

    # retrieve columns name and rows name of the dataframe;
    # row names are converted to strings and written back as the index
    colnames = list(spectra.columns)
    rownames = list(map(str, list(spectra.index)))
    spectra.index = rownames

    @st.cache_data
    def spectra_visualize(variable):  # this method takes spectra as input
        """Return the spectra figure and a one-row table describing the input file.

        `variable` is not read in the body; `spectra`, `file` and `n_samples`
        are taken from the enclosing script scope.
        """
        info_table = DataFrame(
            {'Name': [file.name], 'Number of scanned samples': [n_samples]},
            index=['Input file'],
        )
        spectra_fig = plot_spectra(spectra, xunits='Wavelength/Wavenumber', yunits="Signal intensity")
        return spectra_fig, info_table

    fig_spectra, data_info = spectra_visualize(variable=spectra)

    # plot on the left, file information on the right
    c3, c4 = st.columns([3, 1])
    with c3:
        st.pyplot(fig_spectra)

    with c4:
        st.info('Information on the loaded data file')
        st.write(data_info)  ## table showing the number of samples in the data file

################################################### END : visualize and split the data ####################################################

############################## Exploratory data analysis ###############################
DIANE's avatar
DIANE committed
st.subheader("II - Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
DIANE's avatar
DIANE committed
###### 1- Dimensionality reduction ######
DIANE's avatar
DIANE committed
t = DataFrame # scores
p = DataFrame # loadings
DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    xc = standardize(spectra, center=True, scale=False)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    c5, c6, c7, c8, c9, c10, c11 = st.columns([1, 1, 0.6, 0.6, 0.6, 1.5, 1.5])
    with c5:
        dim_red_method = st.selectbox("Dimensionality reduction techniques: ",
         options = ['']+dim_red_methods if len(dim_red_methods)>2 else dim_red_methods,
         key = 37, format_func = lambda x: x if x else "<Select>", disabled = False if len(dim_red_methods)>2 else True)
DIANE's avatar
DIANE committed
        if dim_red_method == '':
            st.info('Info: Select a dimensionality reduction technique!')
DIANE's avatar
DIANE committed
        ObjectHash(dim_red_method)
DIANE's avatar
DIANE committed


        if dim_red_method == "UMAP":
DIANE's avatar
DIANE committed
            if not meta_data.empty:
DIANE's avatar
DIANE committed
                filter = md_df_st_.columns.tolist()
                supervised = st.selectbox('Supervised UMAP by(optional):', options = ['']+filter, format_func = lambda x: x if x else "<Select>", key=108)
                umapsupervisor = [None if supervised == '' else md_df_st_[supervised]][0]
DIANE's avatar
DIANE committed
            else:
                supervised = st.selectbox('Supervised UMAP by:', options = ["Meta-data is not available"], disabled=True, format_func = lambda x: x if x else "<Select>", key=108)
                umapsupervisor = None
DIANE's avatar
DIANE committed
            ObjectHash(supervised)
        disablewidgets = [False if (dim_red_method and st.session_state.interface == 'advanced') else True][0]
        clus_method = st.selectbox("Clustering techniques(optional): ",
         options = ['']+cluster_methods if len(cluster_methods)>2 else cluster_methods,
         key = 38, format_func = lambda x: x if x else "<Select>", disabled= disablewidgets)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        
        # if disablewidgets == False and dim_red_method in dim_red_methods:
        #     inf = st.info('Info: Select a clustering technique!')
DIANE's avatar
DIANE committed
        if dim_red_method:
            @st.cache_data
DIANE's avatar
DIANE committed
            def dimensionality_reduction(variable):
DIANE's avatar
DIANE committed
                match dim_red_method:
                    case "PCA":
DIANE's avatar
DIANE committed
                            from utils.dim_reduction import LinearPCA
DIANE's avatar
DIANE committed
                            dr_model = LinearPCA(xc, Ncomp=8)
                    case "UMAP":
DIANE's avatar
DIANE committed
                            from utils.dim_reduction import Umap
DIANE's avatar
DIANE committed
                            dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = umapsupervisor)   
                    case 'NMF':
DIANE's avatar
DIANE committed
                            from utils.dim_reduction import Nmf
DIANE's avatar
DIANE committed
                            dr_model = Nmf(spectra, Ncomp= 3)
                return dr_model
            
DIANE's avatar
DIANE committed
            dr_model = dimensionality_reduction(variable = xc)
DIANE's avatar
DIANE committed
            
DIANE's avatar
DIANE committed
        if dr_model:
DIANE's avatar
DIANE committed
            axis1 = c7.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
            axis2 = c8.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
            axis3 = c9.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
DIANE's avatar
DIANE committed
            axis = np.unique([axis1, axis2, axis3])
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
            t = dr_model.scores_.loc[:,np.unique(axis)]
            tcr = standardize(t)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
###### II - clustering #######
if not t.empty:
    # By default every sample is considered clustered.
    clustered = np.arange(n_samples)
    non_clustered = None

    # NOTE(review): in the source as received the two assignments below were
    # over-indented with no enclosing statement (an IndentationError). The
    # condition is reconstructed from the later code, which only uses `c13`
    # (loadings plot) for PCA and NMF — confirm against the repository.
    if dim_red_method == 'UMAP':
        c12 = st.container()
    else:
        c12, c13 = st.columns([3,3])
if not spectra.empty:
    with c6:
        # One input serves two purposes: a value below 1 is a fraction of the
        # samples, a value of 1 or more is an absolute number of samples.
        sel_ratio = st.number_input('Enter the number/fraction of samples to be selected:',
                                    min_value=0.01, max_value=float(spectra.shape[0]),
                                    value=0.20, format="%.2f", disabled= disablewidgets)
        if sel_ratio:
            if sel_ratio >= 1.00:
                # NOTE(review): exactly 1.00 matched neither original branch and
                # left `ratio` undefined; it is treated as a count here — confirm.
                ratio = int(sel_ratio)
            else:
                # at least one sample, even when the fraction truncates to 0
                ratio = max(1, int(sel_ratio*spectra.shape[0]))
            ObjectHash(sel_ratio)

        if st.session_state["interface"] =='simple':
            clus_method = 'KS'
        elif dr_model:
            if not clus_method:
                # no clustering technique chosen: pick a direct sampling method
                clus_method = st.radio('Select samples selection strategy:', options = ['RDM', 'KS'])
            else:
                disabled1 = clus_method not in cluster_methods
                selection = st.radio('Select samples selection strategy:', options = selec_strategy, disabled = disabled1)



if dr_model and sel_ratio:
    # Clustering (Kmeans / HDBSCAN / AP) or direct sampling (KS / RDM) on the
    # standardized scores.
    if clus_method == 'Kmeans':
        from utils.clustering import Sk_Kmeans
        cl_model = Sk_Kmeans(tcr, max_clusters = ratio)
        data, labels, clu_centers = cl_model.fit_optimal_
        ncluster = clu_centers.shape[0]

    # 2- HDBSCAN clustering
    elif clus_method == 'HDBSCAN':
        from utils.clustering import Hdbscan
        cl_model = Hdbscan(np.array(tcr))
        labels = cl_model.labels_
        clu_centers = cl_model.centers_
        non_clustered = cl_model.non_clustered
        ncluster = len(clu_centers)

    # 3- Affinity propagation
    elif clus_method == 'AP':
        from utils.clustering import AP
        cl_model = AP(X = tcr)
        data, labels, clu_centers = cl_model.fit_optimal_
        ncluster = len(clu_centers)

    # 4- Direct sampling: the model's `calset` provides the selected indexes,
    #    and all samples share a single "ind" label.
    elif clus_method in ('KS', 'RDM'):
        if clus_method == 'KS':
            cl_model = KS(x = tcr, rset = ratio)
        else:
            cl_model = RDM(x = tcr, rset = ratio)
        _, selected_samples_idx = cl_model.calset
        labels = ["ind"]*n_samples
        ncluster = "1"
        selection_number = 'None'
        selection = 'None'

    new_tcr = tcr.iloc[clustered,:]

# #################################################### III - Samples selection using the reduced data presentation ######
if not labels:
    custom_color_palette = px.colors.qualitative.Plotly[:1]
elif labels:
    num_clusters = len(np.unique(labels))
    custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
DIANE's avatar
DIANE committed
    if clus_method:
DIANE's avatar
DIANE committed
        match selection:
        # Strategy 0
            case 'center':
                # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
DIANE's avatar
DIANE committed
                from sklearn.metrics import pairwise_distances_argmin_min
DIANE's avatar
DIANE committed
                closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
                selected_samples_idx = np.array(new_tcr.index)[list(closest)]
                selected_samples_idx = selected_samples_idx.tolist()
                
            #### Strategy 1
            case 'random':
DIANE's avatar
DIANE committed
                selection_number = int(ratio/num_clusters)
DIANE's avatar
DIANE committed
                ObjectHash(selection_number)
DIANE's avatar
DIANE committed
                s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
                for i in np.unique(s):
                    C = np.where(np.array(labels) == i)[0]
                    if C.shape[0] >= selection_number:
DIANE's avatar
DIANE committed
                        from sklearn.cluster import KMeans
DIANE's avatar
DIANE committed
                        km2 = KMeans(n_clusters = selection_number)
                        km2.fit(tcr.iloc[C,:])
DIANE's avatar
DIANE committed
                        from sklearn.metrics import pairwise_distances_argmin_min
DIANE's avatar
DIANE committed
                        clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
                        selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
                    else:
                        selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ results visualization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DIANE's avatar
DIANE committed
    ## Scores
if not t.empty:
DIANE's avatar
DIANE committed
    if meta_data.empty and clus_method in cluster_methods:
DIANE's avatar
DIANE committed
        filter = clus_method
DIANE's avatar
DIANE committed
    elif not meta_data.empty and clus_method in cluster_methods:
DIANE's avatar
DIANE committed
        filter = [clus_method] + md_df_st_.columns.tolist()
DIANE's avatar
DIANE committed
    elif not meta_data.empty and clus_method not in cluster_methods:
        filter = [''] + md_df_st_.columns.tolist()
    elif meta_data.empty and not clus_method in cluster_methods:
        filter = []
DIANE's avatar
DIANE committed
    
    if st.session_state["interface"] =='simple':
        desactivatelist = True
        if meta_data.empty:
            desactivatelist = True
            filter = ['']
        elif not meta_data.empty:
            filter = [''] + md_df_st_.columns.tolist()
            desactivatelist = False
    else:
        desactivatelist = False
    
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    with c12:
        st.write('Scores plot')
DIANE's avatar
DIANE committed
        tcr_plot = tcr.copy()
DIANE's avatar
DIANE committed
        if len(axis)== 1:
            tcr_plot['1d'] = np.random.uniform(-.5, .5, tcr_plot.shape[0])

DIANE's avatar
DIANE committed
        colfilter = st.selectbox('Color by:', options= filter,format_func = lambda x: x if x else "<Select>", disabled = desactivatelist)
DIANE's avatar
DIANE committed
        ObjectHash(colfilter)
DIANE's avatar
DIANE committed
        if colfilter in cluster_methods:
            tcr_plot[colfilter] = labels
        elif not meta_data.empty and colfilter in md_df_st_.columns.tolist():
            tcr_plot[f'{colfilter} :'] = list(map(str.lower,md_df_st_.loc[:,colfilter]))
DIANE's avatar
DIANE committed
            tcr_plot[f'{colfilter} :'] = ['sample'] * tcr_plot.shape[0]
        
        col_var_name = tcr_plot.columns.tolist()[-1]
        n_categories = len(np.unique(tcr_plot[col_var_name]))
        custom_color_palette = px.colors.qualitative.Plotly[:n_categories]

DIANE's avatar
DIANE committed
        if selected_samples_idx:# color selected samples
            t_selected = tcr_plot.iloc[selected_samples_idx,:]
        match t.shape[1]:
            case 3:
                fig = px.scatter_3d(tcr_plot, x = axis[0], y = axis[1], z = axis[2], color = col_var_name ,color_discrete_sequence = custom_color_palette)
                fig.update_traces(marker=dict(size=4))
                if selected_samples_idx:# color selected samples
                    fig.add_scatter3d(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]], z = t_selected.loc[:,axis[2]],
                                    mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
DIANE's avatar
DIANE committed
                
DIANE's avatar
DIANE committed
            case 2:
                fig = px.scatter(tcr_plot, x = axis[0], y = axis[1], color = col_var_name ,color_discrete_sequence = custom_color_palette)
                if selected_samples_idx:# color selected samples
                    fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected.loc[:,axis[1]],
                                    mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')

            
DIANE's avatar
DIANE committed
            case 1:
                yy = np.random.uniform(-.5, .5, tcr_plot.shape[0])
                fig = px.scatter(tcr_plot, x = axis[0], y = '1d', color = col_var_name ,color_discrete_sequence = custom_color_palette)
                fig.add_scatter(x = t_selected.loc[:,axis[0]], y = t_selected['1d'],
DIANE's avatar
DIANE committed
                                    mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
DIANE's avatar
DIANE committed
                fig.update_layout( yaxis_range=[-1.6, 1.6])
DIANE's avatar
DIANE committed
                fig.update_yaxes(visible=False)

        st.plotly_chart(fig, use_container_width = True)

        if labels:
            fig_export = {}
            # export 2D scores plot
            if len(axis)== 3:
DIANE's avatar
DIANE committed
                from itertools import combinations
DIANE's avatar
DIANE committed
                comb = [i for i in combinations(np.arange(len(axis)), 2)]
                subcap = ['a','b','c']
                for i in range(len(comb)):
                    fig_= px.scatter(tcr_plot, x = axis[(comb[i][0])], y=axis[(comb[i][1])],color = labels if list(labels) else None,color_discrete_sequence = custom_color_palette)
                    fig_.add_scatter(x = t_selected.loc[:,axis[(comb[i][0])]], y = t_selected.loc[:,axis[(comb[i][1])]], mode ='markers', marker = dict(size = 5, color = 'black'),
                                name = 'selected samples')
                    fig_.update_layout(font=dict(size=23))
                    fig_.add_annotation(text= f'({subcap[i]})', align='center', showarrow= False, xref='paper', yref='paper', x=-0.13, y= 1,
                                                font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
                    fig_.update_traces(marker=dict(size= 10), showlegend= False)
                    fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'] = fig_
                    # fig_export.write_image(f'./report/out/figures/scores_pc{str(comb[i][0])}_pc{str(comb[i][1])}.png')
            else:
                fig_export['fig'] = fig
DIANE's avatar
DIANE committed
            
DIANE's avatar
DIANE committed
if not spectra.empty:
DIANE's avatar
DIANE committed
    if dim_red_method in ['PCA','NMF']:
DIANE's avatar
DIANE committed
        with c13:
            st.write('Loadings plot')
            p = dr_model.loadings_
DIANE's avatar
DIANE committed
            freq = DataFrame(colnames, index=p.index)
DIANE's avatar
DIANE committed
            if extension =='dx':
DIANE's avatar
DIANE committed
                if meta_data.loc[:,'xunits'][0] == '1/cm':
                    freq.columns = ['Wavenumber (1/cm)']
DIANE's avatar
DIANE committed
                    xlab = "Wavenumber (1/cm)"
                    inv = 'reversed'
DIANE's avatar
DIANE committed
                else:
                    freq.columns = ['Wavelength (nm)']
DIANE's avatar
DIANE committed
                    xlab = 'Wavelength (nm)'
                    inv = None
DIANE's avatar
DIANE committed
            else:
                freq.columns = ['Wavelength/Wavenumber']
DIANE's avatar
DIANE committed
                xlab = 'Wavelength/Wavenumber'
                inv = None
DIANE's avatar
DIANE committed
                
DIANE's avatar
DIANE committed
            pp = concat([p, freq], axis=1)
DIANE's avatar
DIANE committed
            #########################################
            df1 = pp.melt(id_vars=freq.columns)
DIANE's avatar
DIANE committed
            loadingsplot = px.line(df1, x=freq.columns, y='value', color='variable', color_discrete_sequence=px.colors.qualitative.Plotly)
            loadingsplot.update_layout(legend=dict(x=1, y=0, font=dict(family="Courier", size=12, color="black"),
                                        bordercolor="black", borderwidth=2))
DIANE's avatar
DIANE committed
            loadingsplot.update_layout(xaxis_title = xlab,yaxis_title = "Intensity" ,xaxis = dict(autorange= inv))
DIANE's avatar
DIANE committed
            st.plotly_chart(loadingsplot, use_container_width=True)
    
DIANE's avatar
DIANE committed
#############################################################################################################
DIANE's avatar
DIANE committed
    if dim_red_method == 'PCA':
DIANE's avatar
DIANE committed
        c14, c15 = st.columns([3, 3])
        with c14:
            st.write('Influence plot')
DIANE's avatar
DIANE committed
            # Laverage
            Hat =  t.to_numpy() @ np.linalg.inv(np.transpose(t.to_numpy()) @ t.to_numpy()) @ np.transpose(t.to_numpy())
            leverage = np.diag(Hat) / np.trace(Hat)
DIANE's avatar
DIANE committed
            # Loadings
DIANE's avatar
DIANE committed
            p = dr_model.loadings_.loc[:,axis]
DIANE's avatar
DIANE committed
            # Matrix reconstruction
            xp = np.dot(t,p.T)
            # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model
            residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T)
DIANE's avatar
DIANE committed
            from scipy.stats import chi2
            tresh4 = chi2.ppf(0.05, df = len(axis))
DIANE's avatar
DIANE committed

            # color with metadata
DIANE's avatar
DIANE committed
            if colfilter:
                if colfilter == "":
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                elif colfilter == clus_method:
DIANE's avatar
DIANE committed
                    l1 = labels
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
                else:
DIANE's avatar
DIANE committed
                    l1 = tcr_plot[f'{colfilter} :']
DIANE's avatar
DIANE committed
            tcr_plot["leverage"] = leverage
            tcr_plot["residuals"] = residuals
            influence_plot = px.scatter(data_frame =tcr_plot, x = "leverage", y = "residuals", color=col_var_name,
DIANE's avatar
DIANE committed
                                            color_discrete_sequence= custom_color_palette)
DIANE's avatar
DIANE committed
            influence_plot.add_scatter(x = leverage[selected_samples_idx] , y = residuals[selected_samples_idx],
                                       mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
            
DIANE's avatar
DIANE committed
            influence_plot.add_vline(x = tresh3, line_width = 1, line_dash = 'solid', line_color = 'red')
            influence_plot.add_hline(y=tresh4, line_width=1, line_dash='solid', line_color='red')
            influence_plot.update_layout(xaxis_title="Leverage", yaxis_title = "Q-residuals", font=dict(size=20), width=800, height=600)
DIANE's avatar
DIANE committed

            out3 = leverage > tresh3
            out4 = residuals > tresh4

DIANE's avatar
DIANE committed
            # for i in range(n_samples):
            #     if out3[i]:
            #         if not meta_data.empty:
            #             ann =  meta_data.loc[:,'name'][i]
            #         else:
            #             ann = t.index[i]
            #         influence_plot.add_annotation(dict(x = leverage[i], y = residuals[i], showarrow=True, text = str(ann),font= dict(color= "black", size= 15),
            #                     xanchor = 'auto', yanchor = 'auto'))
DIANE's avatar
DIANE committed
            influence_plot.update_traces(marker=dict(size= 6), showlegend= True)
            influence_plot.update_layout(font=dict(size=23), width=800, height=500)
            st.plotly_chart(influence_plot, use_container_width=True)
DIANE's avatar
DIANE committed
            for annotation in influence_plot.layout.annotations:
DIANE's avatar
DIANE committed
                annotation.font.size = 35
DIANE's avatar
DIANE committed
            influence_plot.update_layout(font=dict(size=23), width=800, height=600)
            influence_plot.update_traces(marker=dict(size= 10), showlegend= False)
            influence_plot.add_annotation(text= '(a)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
                                             font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
DIANE's avatar
DIANE committed
            # influence_plot.write_image('./report/out/figures/influence_plot.png', engine = 'kaleido')
DIANE's avatar
DIANE committed
        with c15:
DIANE's avatar
DIANE committed
            st.write('T²-Hotelling vs Q-residuals plot')
DIANE's avatar
DIANE committed
            # Hotelling-like statistic: row-wise variance of `t`
            # (presumably the score matrix -- confirm against where `t` is built)
            hotelling  = t.var(axis = 1)
            # Q residuals: magnitude of the variation remaining in each sample after
            # projection through the model, i.e. the squared norm of each residual row.
            # Summing the squares row by row gives the same values as the diagonal of
            # (X - Xp)(X - Xp)^T without allocating an (n_samples x n_samples) matrix.
            q_resid = np.asarray(np.subtract(xc.to_numpy(), xp))
            residuals = np.square(q_resid).sum(axis=1)
DIANE's avatar
DIANE committed
            
            from scipy.stats import f, chi2
            # Outlier limits at the 95 % confidence level (upper-tail critical values).
            # NOTE(review): the usual T² limit uses F(A, n - A) with A components;
            # here the denominator degrees of freedom are n_samples -- confirm.
            fcri = f.isf(0.05, 3, n_samples)
            tresh0 = (3 * (n_samples ** 2 - 1) * fcri) / (n_samples * (n_samples - 3))
            # chi2.ppf(0.05, ...) is the LOWER 5 % quantile, far too small for an
            # outlier limit; isf gives the upper-tail value, consistent with fcri above.
            tresh1 = chi2.isf(0.05, df = 3)
DIANE's avatar
DIANE committed
            hotelling_plot = px.scatter(t, x = hotelling, y = residuals, color=labels if list(labels) else None,
DIANE's avatar
DIANE committed
                                            color_discrete_sequence= custom_color_palette)
DIANE's avatar
DIANE committed
            hotelling_plot.add_scatter(x = hotelling[selected_samples_idx] , y = residuals[selected_samples_idx],
                                       mode ='markers', marker = dict(size = 5, color = 'black'), name = 'selected samples')
DIANE's avatar
DIANE committed
            hotelling_plot.update_layout(xaxis_title="Hotelling-T² distance",yaxis_title="Q-residuals")
            hotelling_plot.add_vline(x=tresh0, line_width=1, line_dash='solid', line_color='red')
            hotelling_plot.add_hline(y=tresh1, line_width=1, line_dash='solid', line_color='red')
DIANE's avatar
DIANE committed

            out0 = hotelling > tresh0
            out1 = residuals > tresh1
DIANE's avatar
DIANE committed
            
DIANE's avatar
DIANE committed
                if out0[i]:
                    if not meta_data.empty:
                        ann =  meta_data.loc[:,'name'][i]
                    else:
                        ann = t.index[i]
DIANE's avatar
DIANE committed
                    hotelling_plot.add_annotation(dict(x = hotelling[i], y = residuals[i], showarrow=True, text = str(ann), font= dict(color= "black", size= 15),
DIANE's avatar
DIANE committed
                                xanchor = 'auto', yanchor = 'auto'))
DIANE's avatar
DIANE committed
            hotelling_plot.update_traces(marker=dict(size= 6), showlegend= True)
            hotelling_plot.update_layout(font=dict(size=23), width=800, height=500)
            st.plotly_chart(hotelling_plot, use_container_width=True)
DIANE's avatar
DIANE committed
            for annotation in hotelling_plot.layout.annotations:
DIANE's avatar
DIANE committed
                annotation.font.size = 35
DIANE's avatar
DIANE committed
            hotelling_plot.update_layout(font=dict(size=23), width=800, height=600)
            hotelling_plot.update_traces(marker=dict(size= 10), showlegend= False)
            hotelling_plot.add_annotation(text= '(b)', align='center', showarrow= False, xref='paper', yref='paper', x=-0.125, y= 1,
                                             font= dict(color= "black", size= 35), bgcolor ='white', borderpad= 2, bordercolor= 'black', borderwidth= 3)
DIANE's avatar
DIANE committed
            # hotelling_plot.write_image("./report/out/figures/hotelling_plot.png", format="png")
DIANE's avatar
DIANE committed
st.subheader('III - Selected Samples for Reference Analysis', divider='blue')
DIANE's avatar
DIANE committed
    c16, c17 = st.columns([3, 1])
    c16.write("Tabular identifiers of selected samples for reference analysis:")
DIANE's avatar
DIANE committed
            sam1 = DataFrame({'name': spectra.index[clustered][selected_samples_idx],
                                'cluster':np.array(labels)[clustered][selected_samples_idx]},
                                index = selected_samples_idx)
        else:
            # Metadata rows of the selected samples; `selected_samples_idx` holds
            # positions inside the clustered subset, hence the two-step selection.
            sam1 = meta_data.iloc[clustered,:].iloc[selected_samples_idx,:]
            sam1.insert(loc=0, column='index', value=selected_samples_idx)
            # Labels are restricted to the clustered subset first, exactly like the
            # rows above; indexing the full label array would pair rows with the
            # labels of other samples whenever some samples are not clustered.
            sam1.insert(loc=1, column='cluster', value=np.array(labels)[clustered][selected_samples_idx])
        sam1.index = np.arange(len(selected_samples_idx))+1
DIANE's avatar
DIANE committed
        with c17:
            st.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.')
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        # HDBSCAN can leave samples unassigned: optionally append them to the table
        if clus_method =='HDBSCAN':
            with c16:
                unclus = st.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)

            if selected_samples_idx:
                if unclus:
                    # Identifiers of the unassigned samples and their placeholder label
                    unassigned_ids = spectra.index[non_clustered]
                    unassigned_tags = ['Non clustered'] * len(unassigned_ids)

                    if meta_data.empty:
                        # No metadata: build the table from the spectra identifiers only
                        sam2 = DataFrame({'name': unassigned_ids, 'cluster': unassigned_tags},
                                         index = unassigned_ids)
                    else:
                        # Metadata available: prepend identifier and label columns to it
                        sam2 = meta_data.iloc[non_clustered,:]
                        sam2.insert(loc=0, column='index', value= unassigned_ids)
                        sam2.insert(loc=1, column='cluster', value=unassigned_tags)

                    # Selected samples first, unassigned samples after
                    sam = concat([sam1, sam2], axis = 0)

                    n_unassigned = sam2.shape[0]
                    with c17:
                        st.info(f'- The number of Non-clustered samples: {n_unassigned}.\n - The proportion of Non-clustered samples: {round(n_unassigned/n_samples*100)}%')
DIANE's avatar
DIANE committed
        with c16:
            st.write(sam)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
if not sam.empty:
DIANE's avatar
DIANE committed
    zip_data = ""
    Nb_ech = str(n_samples)
    nb_clu = str(sam1.shape[0])
DIANE's avatar
DIANE committed
    st.subheader('Download the analysis results')
DIANE's avatar
DIANE committed
    st.write("**Note:** Please check the box only after you have finished processing your data and are satisfied with the results. Checking the box prematurely may slow down the app and could lead to crashes.")
    decis = st.checkbox("Yes, I want to download the results")
    if decis:
        ###################################################
        # ## generate report
        @st.cache_data
        def export_report(change):
            """Call ``report.report`` with the settings of the current analysis.

            The value returned by ``report.report`` is not used and this function
            returns ``None``. Presumably the call writes the LaTeX source of the
            report to disk (the caller checks for ``report.tex`` afterwards) --
            confirm in the ``report`` module.

            Args:
                change: Value passed only so that ``st.cache_data`` can key its
                    cache on it; it is not read by the body.
            """
            report.report(
                'Representative subset selection', file.name, dim_red_method,
                clus_method, Nb_ech, ncluster, selection, selection_number, nb_clu, tcr, sam,
            )
DIANE's avatar
DIANE committed
        @st.cache_data
        def preparing_results_for_downloading(change):
DIANE's avatar
DIANE committed
            # path_to_report = Path("report")############################### i am here
DIANE's avatar
DIANE committed
            match extension:
                # load csv file
                case 'csv':
                    imp.to_csv('report/out/dataset/'+ file.name, sep = ';', encoding = 'utf-8', mode = 'a')
                case 'dx':
                    with open('report/out/dataset/'+file.name, 'w') as dd:
                        dd.write(dxdata)

DIANE's avatar
DIANE committed
            fig_spectra.savefig(report_path_rel/"out/figures/spectra_plot.png", dpi = 400) ## Export report
DIANE's avatar
DIANE committed

            if len(axis) == 3:
                for i in range(len(comb)):
DIANE's avatar
DIANE committed
                    fig_export[f'scores_pc{comb[i][0]}_pc{comb[i][1]}'].write_image(report_path_rel/f'out/figures/scores_pc{str(comb[i][0]+1)}_pc{str(comb[i][1]+1)}.png')
DIANE's avatar
DIANE committed
            elif len(axis)==2 :
DIANE's avatar
DIANE committed
                fig_export['fig'].write_image(report_path_rel/'out/figures/scores_plot2D.png')
DIANE's avatar
DIANE committed
            elif len(axis)==1 :
DIANE's avatar
DIANE committed
                fig_export['fig'].write_image(report_path_rel/'out/figures/scores_plot1D.png')
DIANE's avatar
DIANE committed
                    
            # Export du graphique
            if dim_red_method in ['PCA','NMF']:
DIANE's avatar
DIANE committed
                import plotly.io as pio
DIANE's avatar
DIANE committed
                img = pio.to_image(loadingsplot, format="png")
DIANE's avatar
DIANE committed
                with open(report_path_rel/"out/figures/loadings_plot.png", "wb") as f:
DIANE's avatar
DIANE committed
                    f.write(img)
            if dim_red_method == 'PCA': 
DIANE's avatar
DIANE committed
                hotelling_plot.write_image(report_path_rel/"out/figures/hotelling_plot.png", format="png")
                influence_plot.write_image(report_path_rel/'out/figures/influence_plot.png', engine = 'kaleido')
DIANE's avatar
DIANE committed
            
DIANE's avatar
DIANE committed
            sam.to_csv(report_path_rel/'out/Selected_subset_for_calib_development.csv', sep = ';')
DIANE's avatar
DIANE committed
            export_report(change = hash_)
DIANE's avatar
DIANE committed
            if Path(report_path_rel/"report.tex").exists():
DIANE's avatar
DIANE committed
                report.generate_report(change = hash_)
DIANE's avatar
DIANE committed
            if Path(report_path_rel/"report.pdf").exists():
                move(report_path_rel/"report.pdf", "./report/out/report.pdf")
DIANE's avatar
DIANE committed
            return change
DIANE's avatar
DIANE committed
        preparing_results_for_downloading(change = hash_)
        report.generate_report(change = hash_)
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
        @st.cache_data
        def tempdir(change):
            """Zip ./report/out and return the archive content.

            The archive is built only when ``out/figures`` contains at least two
            entries. It is moved into a temporary directory created under
            ``./report``, read back, and the temporary directory is removed when
            the ``with`` block exits.

            Args:
                change: Value passed so that ``st.cache_data`` can key its cache
                    on it; it is not read by the body.

            Returns:
                tuple: ``(tempdirname, zip_data)`` where ``tempdirname`` is the
                name of the temporary directory and ``zip_data`` is the archive
                as bytes, or ``""`` when the archive was not built.
            """
            from tempfile import TemporaryDirectory

            # Default matches the sentinel the caller tests to disable the download
            # button. Without it the return statement raised UnboundLocalError
            # whenever the archive was not built.
            zip_data = ""
            with  TemporaryDirectory( prefix="results", dir="./report") as temp_dir:# create a temp directory
                tempdirname = os.path.split(temp_dir)[1]

                if len(os.listdir(report_path_rel/'out/figures/'))>=2:
                    make_archive(base_name= report_path_rel/"Results", format="zip", base_dir="out", root_dir = "./report")# create a zip file
                    move(report_path_rel/"Results.zip", f"./report/{tempdirname}/Results.zip")# put the archive inside the temp dir
                    with open(report_path_rel/f"{tempdirname}/Results.zip", "rb") as f:
                        zip_data = f.read()
            return tempdirname, zip_data
DIANE's avatar
DIANE committed
        
DIANE's avatar
DIANE committed
        # Build the archive. On failure `zip_data` keeps the empty default assigned
        # earlier, so the download button below stays disabled.
        try :
            tempdirname, zip_data = tempdir(change = hash_)
            # st.download_button(label = 'Download', data = zip_data, file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip', mime ="application/zip",
            #             args = None, kwargs = None,type = "primary",use_container_width = True)
        except Exception as err:
            # A bare `except:` also traps BaseException (KeyboardInterrupt, SystemExit,
            # ...) and hid every failure; report the problem instead of ignoring it.
            st.warning(f"The results archive could not be prepared: {err}")
DIANE's avatar
DIANE committed
    # Timestamp (yymmddHHMM) embedded in the name of the downloaded archive
    date_time = datetime.now().strftime('%y%m%d%H%M')
    # No archive content yet -> the button is shown but disabled
    disabled_down = zip_data == ''
    st.download_button(
        label = 'Download',
        data = zip_data,
        file_name = f'Nirs_Workflow_{date_time}_SamSel_.zip',
        mime ="application/zip",
        args = None,
        kwargs = None,
        type = "primary",
        use_container_width = True,
        disabled = disabled_down,
    )

DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    delete_files(keep = ['.py', '.pyc','.bib'])