Commit 204a2f34 authored by DIANE

switch case

parent 2cd400f0
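This commit rewrites the if/elif chains keyed on each Streamlit selectbox value (interface mode, file extension, dimensionality reduction method, clustering method, sample selection strategy) as match/case statements. Below is a minimal sketch of that refactoring pattern; it assumes Python >= 3.10 (which introduced structural pattern matching), and the function names and return values are illustrative, not taken from the application code.

# Minimal sketch of the if/elif -> match/case pattern applied in this commit.
# Assumes Python >= 3.10; names below are illustrative, not the app's own objects.
def pick_reducer(method: str) -> str | None:
    # Pre-commit style: a chain of string comparisons
    if method == '':
        return None
    elif method == 'PCA':
        return 'pca'
    elif method == 'UMAP':
        return 'umap'
    else:
        return 'nmf'

def pick_reducer_match(method: str) -> str | None:
    # Post-commit style: one match statement, one case per selectbox option
    match method:
        case '':
            return None  # nothing selected yet
        case 'PCA':
            return 'pca'
        case 'UMAP':
            return 'umap'
        case 'NMF':
            return 'nmf'
        case _:
            raise ValueError(f"unknown method: {method!r}")

# Both styles behave the same for the known options:
assert pick_reducer('UMAP') == pick_reducer_match('UMAP')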
......@@ -24,26 +24,27 @@ dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction al
cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP', 'KS', 'RDM'] # List of clustering algos
selec_strategy = ['center','random']
if st.session_state["interface"] == 'simple':
st.write(':red[Automated Simple Interface]')
# hide_pages("Predictions")
if 37 not in st.session_state:
default_reduction_option = 1
else:
default_reduction_option = dim_red_methods.index(st.session_state.get(37))
if 38 not in st.session_state:
default_clustering_option = 1
else:
default_clustering_option = cluster_methods.index(st.session_state.get(38))
if 102 not in st.session_state:
default_sample_selection_option = 1
else:
default_sample_selection_option = selec_strategy.index(st.session_state.get(102))
if st.session_state["interface"] == 'advanced':
default_reduction_option = 0
default_clustering_option = 0
default_sample_selection_option = 0
match st.session_state["interface"]:
case 'simple':
st.write(':red[Automated Simple Interface]')
# hide_pages("Predictions")
if 37 not in st.session_state:
default_reduction_option = 1
else:
default_reduction_option = dim_red_methods.index(st.session_state.get(37))
if 38 not in st.session_state:
default_clustering_option = 1
else:
default_clustering_option = cluster_methods.index(st.session_state.get(38))
if 102 not in st.session_state:
default_sample_selection_option = 1
else:
default_sample_selection_option = selec_strategy.index(st.session_state.get(102))
case 'advanced':
default_reduction_option = 0
default_clustering_option = 0
default_sample_selection_option = 0
################################### I - Data Loading and Visualization ########################################
st.title("Calibration Subset Selection")
......@@ -74,33 +75,34 @@ if not data_file:
else:
# Retrieve the extension of the file
test = data_file.name[data_file.name.find('.'):]
## Load .csv file
if test== '.csv':
with col1:
# Select list for CSV delimiter
psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
# Select list for CSV header True / False
phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
if phdr == 'yes':
col = 0
else:
col = False
imp = pd.read_csv(data_file, sep=psep, index_col=col)
# spectra = col_cat(imp)[0]
# meta_data = col_cat(imp)[1]
spectra, md_df_st_ = col_cat(imp)
meta_data = md_df_st_
st.success("The data have been loaded successfully", icon="")
## Load .dx file
elif test == '.dx':
# Create a temporary file to save the uploaded file
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
match test:
case '.csv':
with col1:
# Select list for CSV delimiter
psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
# Select list for CSV header True / False
phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
if phdr == 'yes':
col = 0
else:
col = False
imp = pd.read_csv(data_file, sep=psep, index_col=col)
# spectra = col_cat(imp)[0]
# meta_data = col_cat(imp)[1]
spectra, md_df_st_ = col_cat(imp)
meta_data = md_df_st_
st.success("The data have been loaded successfully", icon="")
## Load .dx file
case '.dx':
# Create a temporary file to save the uploaded file
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
with col1:
_, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path)
st.success("The data have been loaded successfully", icon="")
os.unlink(tmp_path)
......@@ -163,28 +165,29 @@ if not spectra.empty:
dim_red_method = bb1.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, index = default_reduction_option, key = 37)
clus_method = bb2.selectbox("Clustering/sampling techniques: ", options = cluster_methods, index = default_clustering_option, key = 38)
xc = standardize(spectra, center=True, scale=False)
if dim_red_method == dim_red_methods[0]:
bb1.warning('⚠️ Please choose an algothithm !')
elif dim_red_method == dim_red_methods[1]:
dr_model = LinearPCA(xc, Ncomp=8)
elif dim_red_method == dim_red_methods[2]:
if not meta_data.empty:
filter = md_df_st_.columns
filter = filter.insert(0, 'Nothing')
col = bb1.selectbox('Supervised UMAP by:', options= filter, key=108)
if col == 'Nothing':
supervised = None
else:
supervised = md_df_st_[col]
else:
supervised = None
dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
elif dim_red_method == dim_red_methods[3]:
dr_model = Nmf(spectra, Ncomp= 3)
match dim_red_method:
case "":
bb1.warning('⚠️ Please choose an algorithm !')
case "PCA":
dr_model = LinearPCA(xc, Ncomp=8)
case "UMAP":
if not meta_data.empty:
filter = md_df_st_.columns
filter = filter.insert(0, 'Nothing')
col = bb1.selectbox('Supervised UMAP by:', options= filter, key=108)
if col == 'Nothing':
supervised = None
else:
supervised = md_df_st_[col]
else:
supervised = None
dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
case 'NMF':
dr_model = Nmf(spectra, Ncomp= 3)
if dr_model:
axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
......@@ -196,69 +199,56 @@ if not spectra.empty:
###### II - clustering #######
if not t.empty:
clustered = np.arange(n_samples)
non_clustered = None
if dim_red_method == 'UMAP':
scores = st.container()
else:
scores, loadings= st.columns([3,3])
tcr = standardize(t)
# Clustering
# 1- K-MEANS Clustering
if clus_method == cluster_methods[0]:
bb2.warning('⚠️ Please choose an algothithm !')
if clus_method == cluster_methods[1]:
cl_model = Sk_Kmeans(tcr, max_clusters = 25)
ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
# fig2 = px.bar(cl_model.inertia_.T, y = 'inertia')
# scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}")
# scores.plotly_chart(fig2,use_container_width=True)
# img = pio.to_image(fig2, format="png")
# with open("./Report/figures/Elbow.png", "wb") as f:
# f.write(img)
data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
# 2- HDBSCAN clustering
elif clus_method == cluster_methods[2]:
optimized_hdbscan = Hdbscan(np.array(tcr))
# all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_
labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
ncluster = len(clu_centers)
# 3- Affinity propagation
elif clus_method == cluster_methods[3]:
cl_model = AP(X = tcr)
data, labels, clu_centers = cl_model.fit_optimal_
ncluster = len(clu_centers)
elif clus_method == cluster_methods[4]:
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = KS(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
elif clus_method == cluster_methods[5]:
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = RDM(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
if clus_method == cluster_methods[2]:
#clustered = np.where(np.array(labels) != 'Non clustered')[0]
clustered = np.arange(n_samples)
non_clustered = np.where(np.array(labels) == 'Non clustered')[0]
else:
clustered = np.arange(n_samples)
non_clustered = None
# Clustering
match clus_method:
case '':
bb2.warning('⚠️ Please choose an algorithm !')
case 'Kmeans':
cl_model = Sk_Kmeans(tcr, max_clusters = 25)
ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
# 2- HDBSCAN clustering
case 'HDBSCAN':
optimized_hdbscan = Hdbscan(np.array(tcr))
all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_
labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
ncluster = len(clu_centers)
non_clustered = np.where(np.array(labels) == 'Non clustered')[0]
# 3- Affinity propagation
case 'AP':
cl_model = AP(X = tcr)
data, labels, clu_centers = cl_model.fit_optimal_
ncluster = len(clu_centers)
case 'KS':
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = KS(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
case 'RDM':
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = RDM(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
new_tcr = tcr.iloc[clustered,:]
......@@ -273,35 +263,37 @@ elif labels:
num_clusters = len(np.unique(labels))
custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
if clus_method:
if clus_method == cluster_methods[4] or clus_method == cluster_methods[5]:
if clus_method in ['KS', 'RDM']:
selected_samples_idx = calset[1]
selection = 'None'
else:
selection = scores.radio('Select samples selection strategy:',
options = selec_strategy, index = default_sample_selection_option, key=102)
# Strategy 0
if selection == selec_strategy[0]:
# list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
selected_samples_idx = np.array(new_tcr.index)[list(closest)]
selected_samples_idx = selected_samples_idx.tolist()
#### Strategy 1
elif selection == selec_strategy[1]:
selection_number = scores.number_input('How many samples per cluster?',
min_value = 1, step=1, value = 3)
s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
for i in np.unique(s):
C = np.where(np.array(labels) == i)[0]
if C.shape[0] >= selection_number:
# scores.write(list(tcr.index)[labels== i])
km2 = KMeans(n_clusters = selection_number)
km2.fit(tcr.iloc[C,:])
clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
else:
selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
# list indexes of selected samples for colored plot
match selection:
case 'center':
# list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
selected_samples_idx = np.array(new_tcr.index)[list(closest)]
selected_samples_idx = selected_samples_idx.tolist()
#### Strategy 1
case 'random':
selection_number = scores.number_input('How many samples per cluster?',
min_value = 1, step=1, value = 3)
s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
for i in np.unique(s):
C = np.where(np.array(labels) == i)[0]
if C.shape[0] >= selection_number:
# scores.write(list(tcr.index)[labels== i])
km2 = KMeans(n_clusters = selection_number)
km2.fit(tcr.iloc[C,:])
clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
else:
selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
# list indexes of selected samples for colored plot
################################ Plots visualization ############################################
......@@ -385,7 +377,7 @@ if not t.empty:
if not spectra.empty:
if dim_red_method == dim_red_methods[1] or dim_red_method == dim_red_methods[3]:
if dim_red_method in ['PCA','NMF']:
with loadings:
st.write('Loadings plot')
p = dr_model.loadings_
......@@ -421,7 +413,7 @@ if not spectra.empty:
with open("./Report/figures/loadings_plot.png", "wb") as f:
f.write(img)
#############################################################################################################
if dim_red_method == dim_red_methods[1]:
if dim_red_method == 'PCA':
influence, hotelling = st.columns([3, 3])
with influence:
st.write('Influence plot')
......@@ -549,10 +541,12 @@ if labels:
sam1.index = np.arange(len(selected_samples_idx))+1
info.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.')
sam = sam1
# if clus_method == cluster_methods[2]:
# unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
if clus_method == cluster_methods[2]:
unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
if clus_method == cluster_methods[2]:
if selected_samples_idx:
if unclus:
if meta_data.empty: