Commit 204a2f34 authored by DIANE

switch case

parent 2cd400f0
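This commit rewrites the if/elif chains keyed on each Streamlit selectbox value (interface mode, file extension, dimensionality reduction method, clustering method, sample selection strategy) as match/case statements. Below is a minimal sketch of that refactoring pattern; it assumes Python >= 3.10 (which introduced structural pattern matching), and the function names and return values are illustrative, not taken from the application code.

# Minimal sketch of the if/elif -> match/case pattern applied in this commit.
# Assumes Python >= 3.10; names below are illustrative, not the app's own objects.
def pick_reducer(method: str) -> str | None:
    # Pre-commit style: a chain of string comparisons
    if method == '':
        return None
    elif method == 'PCA':
        return 'pca'
    elif method == 'UMAP':
        return 'umap'
    else:
        return 'nmf'

def pick_reducer_match(method: str) -> str | None:
    # Post-commit style: one match statement, one case per selectbox option
    match method:
        case '':
            return None  # nothing selected yet
        case 'PCA':
            return 'pca'
        case 'UMAP':
            return 'umap'
        case 'NMF':
            return 'nmf'
        case _:
            raise ValueError(f"unknown method: {method!r}")

# Both styles behave the same for the known options:
assert pick_reducer('UMAP') == pick_reducer_match('UMAP')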
......@@ -24,26 +24,27 @@ dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction al
cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP', 'KS', 'RDM'] # List of clustering algos
selec_strategy = ['center','random']
if st.session_state["interface"] == 'simple':
st.write(':red[Automated Simple Interface]')
# hide_pages("Predictions")
if 37 not in st.session_state:
default_reduction_option = 1
else:
default_reduction_option = dim_red_methods.index(st.session_state.get(37))
if 38 not in st.session_state:
default_clustering_option = 1
else:
default_clustering_option = cluster_methods.index(st.session_state.get(38))
if 102 not in st.session_state:
default_sample_selection_option = 1
else:
default_sample_selection_option = selec_strategy.index(st.session_state.get(102))
if st.session_state["interface"] == 'advanced':
default_reduction_option = 0
default_clustering_option = 0
default_sample_selection_option = 0
match st.session_state["interface"]:
case 'simple':
st.write(':red[Automated Simple Interface]')
# hide_pages("Predictions")
if 37 not in st.session_state:
default_reduction_option = 1
else:
default_reduction_option = dim_red_methods.index(st.session_state.get(37))
if 38 not in st.session_state:
default_clustering_option = 1
else:
default_clustering_option = cluster_methods.index(st.session_state.get(38))
if 102 not in st.session_state:
default_sample_selection_option = 1
else:
default_sample_selection_option = selec_strategy.index(st.session_state.get(102))
case 'advanced':
default_reduction_option = 0
default_clustering_option = 0
default_sample_selection_option = 0
################################### I - Data Loading and Visualization ########################################
st.title("Calibration Subset Selection")
......@@ -74,33 +75,34 @@ if not data_file:
else:
# Retrieve the extension of the file
test = data_file.name[data_file.name.find('.'):]
## Load .csv file
if test== '.csv':
with col1:
# Select list for CSV delimiter
psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
# Select list for CSV header True / False
phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
if phdr == 'yes':
col = 0
else:
col = False
imp = pd.read_csv(data_file, sep=psep, index_col=col)
# spectra = col_cat(imp)[0]
# meta_data = col_cat(imp)[1]
spectra, md_df_st_ = col_cat(imp)
meta_data = md_df_st_
st.success("The data have been loaded successfully", icon="")
## Load .dx file
elif test == '.dx':
# Create a temporary file to save the uploaded file
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
match test:
case '.csv':
with col1:
# Select list for CSV delimiter
psep = st.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
# Select list for CSV header True / False
phdr = st.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
if phdr == 'yes':
col = 0
else:
col = False
imp = pd.read_csv(data_file, sep=psep, index_col=col)
# spectra = col_cat(imp)[0]
# meta_data = col_cat(imp)[1]
spectra, md_df_st_ = col_cat(imp)
meta_data = md_df_st_
st.success("The data have been loaded successfully", icon="")
## Load .dx file
case '.dx':
# Create a temporary file to save the uploaded file
with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
tmp.write(data_file.read())
tmp_path = tmp.name
with col1:
_, spectra, meta_data, md_df_st_ = read_dx(file = tmp_path)
st.success("The data have been loaded successfully", icon="")
os.unlink(tmp_path)
......@@ -163,28 +165,29 @@ if not spectra.empty:
dim_red_method = bb1.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, index = default_reduction_option, key = 37)
clus_method = bb2.selectbox("Clustering/sampling techniques: ", options = cluster_methods, index = default_clustering_option, key = 38)
xc = standardize(spectra, center=True, scale=False)
if dim_red_method == dim_red_methods[0]:
bb1.warning('⚠️ Please choose an algothithm !')
elif dim_red_method == dim_red_methods[1]:
dr_model = LinearPCA(xc, Ncomp=8)
elif dim_red_method == dim_red_methods[2]:
if not meta_data.empty:
filter = md_df_st_.columns
filter = filter.insert(0, 'Nothing')
col = bb1.selectbox('Supervised UMAP by:', options= filter, key=108)
if col == 'Nothing':
supervised = None
else:
supervised = md_df_st_[col]
else:
supervised = None
dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
elif dim_red_method == dim_red_methods[3]:
dr_model = Nmf(spectra, Ncomp= 3)
match dim_red_method:
case "":
bb1.warning('⚠️ Please choose an algorithm !')
case "PCA":
dr_model = LinearPCA(xc, Ncomp=8)
case "UMAP":
if not meta_data.empty:
filter = md_df_st_.columns
filter = filter.insert(0, 'Nothing')
col = bb1.selectbox('Supervised UMAP by:', options= filter, key=108)
if col == 'Nothing':
supervised = None
else:
supervised = md_df_st_[col]
else:
supervised = None
dr_model = Umap(numerical_data = MinMaxScale(spectra), cat_data = supervised)
case 'NMF':
dr_model = Nmf(spectra, Ncomp= 3)
if dr_model:
axis1 = bb3.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
......@@ -196,69 +199,56 @@ if not spectra.empty:
###### II - clustering #######
if not t.empty:
clustered = np.arange(n_samples)
non_clustered = None
if dim_red_method == 'UMAP':
scores = st.container()
else:
scores, loadings= st.columns([3,3])
tcr = standardize(t)
# Clustering
# 1- K-MEANS Clustering
if clus_method == cluster_methods[0]:
bb2.warning('⚠️ Please choose an algothithm !')
if clus_method == cluster_methods[1]:
cl_model = Sk_Kmeans(tcr, max_clusters = 25)
ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
# fig2 = px.bar(cl_model.inertia_.T, y = 'inertia')
# scores.write(f"Suggested n_clusters : {cl_model.suggested_n_clusters_}")
# scores.plotly_chart(fig2,use_container_width=True)
# img = pio.to_image(fig2, format="png")
# with open("./Report/figures/Elbow.png", "wb") as f:
# f.write(img)
data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
# 2- HDBSCAN clustering
elif clus_method == cluster_methods[2]:
optimized_hdbscan = Hdbscan(np.array(tcr))
# all_labels, hdbscan_score, clu_centers = optimized_hdbscan.HDBSCAN_scores_
all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_
labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
ncluster = len(clu_centers)
# 3- Affinity propagation
elif clus_method == cluster_methods[3]:
cl_model = AP(X = tcr)
data, labels, clu_centers = cl_model.fit_optimal_
ncluster = len(clu_centers)
elif clus_method == cluster_methods[4]:
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = KS(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
elif clus_method == cluster_methods[5]:
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = RDM(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
if clus_method == cluster_methods[2]:
#clustered = np.where(np.array(labels) != 'Non clustered')[0]
clustered = np.arange(n_samples)
non_clustered = np.where(np.array(labels) == 'Non clustered')[0]
else:
clustered = np.arange(n_samples)
non_clustered = None
# Clustering
match clus_method:
case '':
bb2.warning('⚠️ Please choose an algorithm !')
case 'Kmeans':
cl_model = Sk_Kmeans(tcr, max_clusters = 25)
ncluster = scores.number_input(min_value=2, max_value=25, value=cl_model.suggested_n_clusters_, label = 'Select the desired number of clusters')
data, labels, clu_centers = cl_model.fit_optimal(nclusters = ncluster)
# 2- HDBSCAN clustering
case 'HDBSCAN':
optimized_hdbscan = Hdbscan(np.array(tcr))
all_labels, clu_centers = optimized_hdbscan.HDBSCAN_scores_
labels = [f'cluster#{i+1}' if i !=-1 else 'Non clustered' for i in all_labels]
ncluster = len(clu_centers)
non_clustered = np.where(np.array(labels) == 'Non clustered')[0]
# 3- Affinity propagation
case 'AP':
cl_model = AP(X = tcr)
data, labels, clu_centers = cl_model.fit_optimal_
ncluster = len(clu_centers)
case 'KS':
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = KS(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
case 'RDM':
rset = scores.number_input(min_value=0, max_value=100, value=20, label = 'The ratio of data to be sampled (%)')
cl_model = RDM(x = tcr, rset = rset)
calset = cl_model.calset
labels = ["ind"]*n_samples
ncluster = "1"
selection_number = 'None'
new_tcr = tcr.iloc[clustered,:]
......@@ -273,35 +263,37 @@ elif labels:
num_clusters = len(np.unique(labels))
custom_color_palette = px.colors.qualitative.Plotly[:num_clusters]
if clus_method:
if clus_method == cluster_methods[4] or clus_method == cluster_methods[5]:
if clus_method in ['KS', 'RDM']:
selected_samples_idx = calset[1]
selection = 'None'
else:
selection = scores.radio('Select samples selection strategy:',
options = selec_strategy, index = default_sample_selection_option, key=102)
# Strategy 0
if selection == selec_strategy[0]:
# list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
selected_samples_idx = np.array(new_tcr.index)[list(closest)]
selected_samples_idx = selected_samples_idx.tolist()
#### Strategy 1
elif selection == selec_strategy[1]:
selection_number = scores.number_input('How many samples per cluster?',
min_value = 1, step=1, value = 3)
s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
for i in np.unique(s):
C = np.where(np.array(labels) == i)[0]
if C.shape[0] >= selection_number:
# scores.write(list(tcr.index)[labels== i])
km2 = KMeans(n_clusters = selection_number)
km2.fit(tcr.iloc[C,:])
clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
else:
selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
# list indexes of selected samples for colored plot
match selection:
case 'center':
# list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster
closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr)
selected_samples_idx = np.array(new_tcr.index)[list(closest)]
selected_samples_idx = selected_samples_idx.tolist()
#### Strategy 1
case 'random':
selection_number = scores.number_input('How many samples per cluster?',
min_value = 1, step=1, value = 3)
s = np.array(labels)[np.where(np.array(labels) !='Non clustered')[0]]
for i in np.unique(s):
C = np.where(np.array(labels) == i)[0]
if C.shape[0] >= selection_number:
# scores.write(list(tcr.index)[labels== i])
km2 = KMeans(n_clusters = selection_number)
km2.fit(tcr.iloc[C,:])
clos, _ = pairwise_distances_argmin_min(km2.cluster_centers_, tcr.iloc[C,:])
selected_samples_idx.extend(tcr.iloc[C,:].iloc[list(clos)].index)
else:
selected_samples_idx.extend(new_tcr.iloc[C,:].index.to_list())
# list indexes of selected samples for colored plot
################################ Plots visualization ############################################
......@@ -385,7 +377,7 @@ if not t.empty:
if not spectra.empty:
if dim_red_method == dim_red_methods[1] or dim_red_method == dim_red_methods[3]:
if dim_red_method in ['PCA','NMF']:
with loadings:
st.write('Loadings plot')
p = dr_model.loadings_
......@@ -421,7 +413,7 @@ if not spectra.empty:
with open("./Report/figures/loadings_plot.png", "wb") as f:
f.write(img)
#############################################################################################################
if dim_red_method == dim_red_methods[1]:
if dim_red_method == 'PCA':
influence, hotelling = st.columns([3, 3])
with influence:
st.write('Influence plot')
......@@ -549,10 +541,12 @@ if labels:
sam1.index = np.arange(len(selected_samples_idx))+1
info.info(f'Information !\n - The total number of samples: {n_samples}.\n- The number of samples selected for reference analysis: {sam1.shape[0]}.\n - The proportion of samples selected for reference analysis: {round(sam1.shape[0]/n_samples*100)}%.')
sam = sam1
# if clus_method == cluster_methods[2]:
# unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
if clus_method == cluster_methods[2]:
unclus = sel.checkbox("Include non clustered samples (for HDBSCAN clustering)", value=True)
if clus_method == cluster_methods[2]:
if selected_samples_idx:
if unclus:
if meta_data.empty: