diff --git a/Class_Mod/HDBSCAN_Clustering.py b/Class_Mod/HDBSCAN_Clustering.py index f01928254a72c2516d8f2093011b06130aaaea27..1a9df2d72833121b79f19a6d9c0618868fc0ffc3 100644 --- a/Class_Mod/HDBSCAN_Clustering.py +++ b/Class_Mod/HDBSCAN_Clustering.py @@ -1,299 +1,308 @@ from Packages import * -from scipy.spatial.distance import euclidean, cdist -from scipy.sparse.csgraph import minimum_spanning_tree -from scipy.sparse import csgraph - - -def DBCV(X, labels, dist_function=euclidean): - """ - Implimentation of Density-Based Clustering Validation "DBCV" - - Citation: - Moulavi, Davoud, et al. "Density-based clustering validation." - Proceedings of the 2014 SIAM International Conference on Data Mining. - Society for Industrial and Applied Mathematics, 2014. - - Density Based clustering validation - - Args: - X (np.ndarray): ndarray with dimensions [n_samples, n_features] - data to check validity of clustering - labels (np.array): clustering assignments for data X - dist_dunction (func): function to determine distance between objects - func args must be [np.array, np.array] where each array is a point - - Returns: cluster_validity (float) - score in range[-1, 1] indicating validity of clustering assignments - """ - graph = _mutual_reach_dist_graph(X, labels, dist_function) - mst = _mutual_reach_dist_MST(graph) - cluster_validity = _clustering_validity_index(mst, labels) - return cluster_validity - - -def _core_dist(point, neighbors, dist_function): - """ - Computes the core distance of a point. - Core distance is the inverse density of an object. 
- - Args: - point (np.array): array of dimensions (n_features,) - point to compute core distance of - neighbors (np.ndarray): array of dimensions (n_neighbors, n_features): - array of all other points in object class - dist_dunction (func): function to determine distance between objects - func args must be [np.array, np.array] where each array is a point - - Returns: core_dist (float) - inverse density of point - """ - n_features = np.shape(point)[0] - n_neighbors = np.shape(neighbors)[0] - - distance_vector = cdist(point.reshape(1, -1), neighbors) - distance_vector = distance_vector[distance_vector != 0] - numerator = ((1/distance_vector)**n_features).sum() - core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features) - return core_dist - - -def _mutual_reachability_dist(point_i, point_j, neighbors_i, - neighbors_j, dist_function): - """. - Computes the mutual reachability distance between points - - Args: - point_i (np.array): array of dimensions (n_features,) - point i to compare to point j - point_j (np.array): array of dimensions (n_features,) - point i to compare to point i - neighbors_i (np.ndarray): array of dims (n_neighbors, n_features): - array of all other points in object class of point i - neighbors_j (np.ndarray): array of dims (n_neighbors, n_features): - array of all other points in object class of point j - dist_dunction (func): function to determine distance between objects - func args must be [np.array, np.array] where each array is a point - - Returns: mutual_reachability (float) - mutual reachability between points i and j - +class Hdbscan: """ - core_dist_i = _core_dist(point_i, neighbors_i, dist_function) - core_dist_j = _core_dist(point_j, neighbors_j, dist_function) - dist = dist_function(point_i, point_j) - mutual_reachability = np.max([core_dist_i, core_dist_j, dist]) - return mutual_reachability - - -def _mutual_reach_dist_graph(X, labels, dist_function): - """ - Computes the mutual reach distance complete graph. 
- Graph of all pair-wise mutual reachability distances between points - - Args: - X (np.ndarray): ndarray with dimensions [n_samples, n_features] - data to check validity of clustering - labels (np.array): clustering assignments for data X - dist_dunction (func): function to determine distance between objects - func args must be [np.array, np.array] where each array is a point - - Returns: graph (np.ndarray) - array of dimensions (n_samples, n_samples) - Graph of all pair-wise mutual reachability distances between points. - - """ - n_samples = np.shape(X)[0] - graph = [] - counter = 0 - for row in range(n_samples): - graph_row = [] - for col in range(n_samples): - point_i = X[row] - point_j = X[col] - class_i = labels[row] - class_j = labels[col] - members_i = _get_label_members(X, labels, class_i) - members_j = _get_label_members(X, labels, class_j) - dist = _mutual_reachability_dist(point_i, point_j, - members_i, members_j, - dist_function) - graph_row.append(dist) - counter += 1 - graph.append(graph_row) - graph = np.array(graph) - return graph - - -def _mutual_reach_dist_MST(dist_tree): - """ - Computes minimum spanning tree of the mutual reach distance complete graph - - Args: - dist_tree (np.ndarray): array of dimensions (n_samples, n_samples) - Graph of all pair-wise mutual reachability distances - between points. - - Returns: minimum_spanning_tree (np.ndarray) - array of dimensions (n_samples, n_samples) - minimum spanning tree of all pair-wise mutual reachability - distances between points. - """ - mst = minimum_spanning_tree(dist_tree).toarray() - return mst + np.transpose(mst) - - -def _cluster_density_sparseness(MST, labels, cluster): - """ - Computes the cluster density sparseness, the minimum density - within a cluster - - Args: - MST (np.ndarray): minimum spanning tree of all pair-wise - mutual reachability distances between points. 
- labels (np.array): clustering assignments for data X - cluster (int): cluster of interest - - Returns: cluster_density_sparseness (float) - value corresponding to the minimum density within a cluster - """ - indices = np.where(labels == cluster)[0] - cluster_MST = MST[indices][:, indices] - cluster_density_sparseness = np.max(cluster_MST) - return cluster_density_sparseness - - -def _cluster_density_separation(MST, labels, cluster_i, cluster_j): - """ - Computes the density separation between two clusters, the maximum - density between clusters. - - Args: - MST (np.ndarray): minimum spanning tree of all pair-wise - mutual reachability distances between points. - labels (np.array): clustering assignments for data X - cluster_i (int): cluster i of interest - cluster_j (int): cluster j of interest - - Returns: density_separation (float): - value corresponding to the maximum density between clusters - """ - indices_i = np.where(labels == cluster_i)[0] - indices_j = np.where(labels == cluster_j)[0] - shortest_paths = csgraph.dijkstra(MST, indices=indices_i) - relevant_paths = shortest_paths[:, indices_j] - density_separation = np.min(relevant_paths) - return density_separation - - -def _cluster_validity_index(MST, labels, cluster): - """ - Computes the validity of a cluster (validity of assignmnets) - - Args: - MST (np.ndarray): minimum spanning tree of all pair-wise - mutual reachability distances between points. 
- labels (np.array): clustering assignments for data X - cluster (int): cluster of interest - - Returns: cluster_validity (float) - value corresponding to the validity of cluster assignments - """ - min_density_separation = np.inf - for cluster_j in np.unique(labels): - if cluster_j != cluster: - cluster_density_separation = _cluster_density_separation(MST, - labels, - cluster, - cluster_j) - if cluster_density_separation < min_density_separation: - min_density_separation = cluster_density_separation - cluster_density_sparseness = _cluster_density_sparseness(MST, - labels, - cluster) - numerator = min_density_separation - cluster_density_sparseness - denominator = np.max([min_density_separation, cluster_density_sparseness]) - cluster_validity = numerator / denominator - return cluster_validity - - -def _clustering_validity_index(MST, labels): - """ - Computes the validity of all clustering assignments for a - clustering algorithm - - Args: - MST (np.ndarray): minimum spanning tree of all pair-wise - mutual reachability distances between points. - labels (np.array): clustering assignments for data X - - Returns: validity_index (float): - score in range[-1, 1] indicating validity of clustering assignments - """ - n_samples = len(labels) - validity_index = 0 - for label in np.unique(labels): - fraction = np.sum(labels == label) / float(n_samples) - cluster_validity = _cluster_validity_index(MST, labels, label) - validity_index += fraction * cluster_validity - return validity_index - - -def _get_label_members(X, labels, cluster): - """ - Helper function to get samples of a specified cluster. - - Args: - X (np.ndarray): ndarray with dimensions [n_samples, n_features] - data to check validity of clustering - labels (np.array): clustering assignments for data X - cluster (int): cluster of interest - - Returns: members (np.ndarray) - array of dimensions (n_samples, n_features) of samples of the - specified cluster. 
+ Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reducted space. + Vars: + data: the Dimensionality reducted space, raw result of the UMAP.fit() + param_dist: the HDBSCAN optimization parameters to test + Density-Based Clustering Validation - DBCV (https://github.com/christopherjenness/DBCV/tree/master ; + Moulavi, Davoud, et al. "Density-based clustering validation." Proceedings of the 2014 SIAM + International Conference on Data Mining. Society for Industrial and Applied Mathematics, 2014.) + is used as a metric to optimize HDBSCAN algorithm. + Functions DBCV, _core_dist, _mutual_reachability_dist, _mutual_reach_dist_graph, _mutual_reach_dist_graph, + _mutual_reach_dist_MST, _cluster_density_sparseness, _cluster_density_separation, _cluster_validity_index, + _clustering_validity_index and _get_label_members aim at DBCV computing. + _score is a dataframe with the DBCV value for each combination of param_dist. We search for the higher value and + compute an HDBSCAN with the best parameters. + The HDBSCAN_scores_ @property return the cluster number of each sample (_labels) and the DBCV best score. """ - indices = np.where(labels == cluster)[0] - members = X[indices] - return members - -def HDBSCAN_function(data): - # param_dist = {'min_samples': [1,5,10,30], - # 'min_cluster_size':[5,10,20,30,50,75,100], - # # 'cluster_selection_method' : ['eom','leaf'], - # # 'metric' : ['euclidean','manhattan'] - # } - # param_dist = {'min_samples': [1,5,10,50], - # 'min_cluster_size':[5,10,30,50,100,300,500], - # } - param_dist = {'min_samples': [1,5, 10,], - 'min_cluster_size':[5,10,30,50,100], - 'metric' : ['euclidean','manhattan'], - } - - clusterable_embedding = UMAP( - n_neighbors=20, - min_dist=0.0, - n_components=5, - random_state=42, - ).fit_transform(data) - - # RandomizedSearchCV not working... 
- # def scoring(model, clusterable_embedding): - # label = HDBSCAN().fit_predict(clusterable_embedding) - # hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean) - # return hdbscan_score - # tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist, scoring=scoring) - # tunning.fit(clusterable_embedding) - # return tunning - min_score = pd.DataFrame() - for i in param_dist.get('min_samples'): - for j in param_dist.get('min_cluster_size'): - ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(clusterable_embedding) - ij_hdbscan_score = DBCV(clusterable_embedding, ij_label, dist_function=euclidean) - min_score.at[i,j] = ij_hdbscan_score - hdbscan_score = max(min_score.max()) - # get the coordinates of the best clustering parameters and run HDBSCAN below - bparams = np.where(min_score == hdbscan_score) - # run HDBSCAN with best params - labels = HDBSCAN(min_samples=param_dist['min_samples'][bparams[0][0]], min_cluster_size=param_dist['min_cluster_size'][bparams[1][0]], metric=param_dist['metric'][bparams[1][0]]).fit_predict(clusterable_embedding) - return labels, hdbscan_score + def __init__(self, data): + # self._param_dist = {'min_samples': [1], + # 'min_cluster_size':[5,10], + # 'metric' : ['euclidean','manhattan'], + # } + self._param_dist = {'min_samples': [1,5,10,], + 'min_cluster_size':[5,25,50,], + 'metric' : ['euclidean','manhattan'], + } + + self._clusterable_embedding = data + + # RandomizedSearchCV not working... + # def scoring(model, clusterable_embedding): + # label = HDBSCAN().fit_predict(clusterable_embedding) + # hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean) + # return hdbscan_score + # tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist, scoring=scoring) + # tunning.fit(clusterable_embedding) + # return tunning + + # compute optimization. Test each combination of parameters and store DBCV score into _score. 
+ self._score = pd.DataFrame() + for i in self._param_dist.get('min_samples'): + for j in self._param_dist.get('min_cluster_size'): + self._ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(self._clusterable_embedding) + self._ij_hdbscan_score = self.DBCV(self._clusterable_embedding, self._ij_label,)# dist_function=euclidean) + self._score.at[i,j] = self._ij_hdbscan_score + # get the best DBCV score (pandas max skips NaN scores by default) + self._hdbscan_score = self._score.max().max() + # find the coordinates of the best clustering parameters and run HDBSCAN below + self._bparams = np.where(self._score == self._hdbscan_score) + # run HDBSCAN with best params; 'metric' was never part of the search above, so keep the default used for scoring instead of indexing the 2-item metric list with the min_cluster_size position (IndexError on the 3rd candidate) + self._labels = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]]).fit_predict(self._clusterable_embedding) + + def DBCV(self, X, labels, dist_function=euclidean): + """ + Implementation of Density-Based Clustering Validation "DBCV" + + Citation: + Moulavi, Davoud, et al. "Density-based clustering validation." + Proceedings of the 2014 SIAM International Conference on Data Mining. + Society for Industrial and Applied Mathematics, 2014.
+ + Density Based clustering validation + + Args: + X (np.ndarray): ndarray with dimensions [n_samples, n_features] + data to check validity of clustering + labels (np.array): clustering assignments for data X + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: cluster_validity (float) + score in range[-1, 1] indicating validity of clustering assignments + """ + graph = self._mutual_reach_dist_graph(X, labels, dist_function) + mst = self._mutual_reach_dist_MST(graph) + cluster_validity = self._clustering_validity_index(mst, labels) + return cluster_validity + + + def _core_dist(self, point, neighbors, dist_function): + """ + Computes the core distance of a point. + Core distance is the inverse density of an object. + + Args: + point (np.array): array of dimensions (n_features,) + point to compute core distance of + neighbors (np.ndarray): array of dimensions (n_neighbors, n_features): + array of all other points in object class + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: core_dist (float) + inverse density of point + """ + n_features = np.shape(point)[0] + n_neighbors = np.shape(neighbors)[0] + + distance_vector = cdist(point.reshape(1, -1), neighbors) + distance_vector = distance_vector[distance_vector != 0] + numerator = ((1/distance_vector)**n_features).sum() + core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features) + return core_dist + + + def _mutual_reachability_dist(self, point_i, point_j, neighbors_i, + neighbors_j, dist_function): + """. 
+ Computes the mutual reachability distance between points + + Args: + point_i (np.array): array of dimensions (n_features,) + point i to compare to point j + point_j (np.array): array of dimensions (n_features,) + point i to compare to point i + neighbors_i (np.ndarray): array of dims (n_neighbors, n_features): + array of all other points in object class of point i + neighbors_j (np.ndarray): array of dims (n_neighbors, n_features): + array of all other points in object class of point j + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: mutual_reachability (float) + mutual reachability between points i and j + + """ + core_dist_i = self._core_dist(point_i, neighbors_i, dist_function) + core_dist_j = self._core_dist(point_j, neighbors_j, dist_function) + dist = dist_function(point_i, point_j) + mutual_reachability = np.max([core_dist_i, core_dist_j, dist]) + return mutual_reachability + + + def _mutual_reach_dist_graph(self, X, labels, dist_function): + """ + Computes the mutual reach distance complete graph. + Graph of all pair-wise mutual reachability distances between points + + Args: + X (np.ndarray): ndarray with dimensions [n_samples, n_features] + data to check validity of clustering + labels (np.array): clustering assignments for data X + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: graph (np.ndarray) + array of dimensions (n_samples, n_samples) + Graph of all pair-wise mutual reachability distances between points. 
+ + """ + n_samples = np.shape(X)[0] + graph = [] + counter = 0 + for row in range(n_samples): + graph_row = [] + for col in range(n_samples): + point_i = X[row] + point_j = X[col] + class_i = labels[row] + class_j = labels[col] + members_i = self._get_label_members(X, labels, class_i) + members_j = self._get_label_members(X, labels, class_j) + dist = self._mutual_reachability_dist(point_i, point_j, + members_i, members_j, + dist_function) + graph_row.append(dist) + counter += 1 + graph.append(graph_row) + graph = np.array(graph) + return graph + + + def _mutual_reach_dist_MST(self, dist_tree): + """ + Computes minimum spanning tree of the mutual reach distance complete graph + + Args: + dist_tree (np.ndarray): array of dimensions (n_samples, n_samples) + Graph of all pair-wise mutual reachability distances + between points. + + Returns: minimum_spanning_tree (np.ndarray) + array of dimensions (n_samples, n_samples) + minimum spanning tree of all pair-wise mutual reachability + distances between points. + """ + mst = minimum_spanning_tree(dist_tree).toarray() + return mst + np.transpose(mst) + + + def _cluster_density_sparseness(self, MST, labels, cluster): + """ + Computes the cluster density sparseness, the minimum density + within a cluster + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + cluster (int): cluster of interest + + Returns: cluster_density_sparseness (float) + value corresponding to the minimum density within a cluster + """ + indices = np.where(labels == cluster)[0] + cluster_MST = MST[indices][:, indices] + cluster_density_sparseness = np.max(cluster_MST) + return cluster_density_sparseness + + + def _cluster_density_separation(self, MST, labels, cluster_i, cluster_j): + """ + Computes the density separation between two clusters, the maximum + density between clusters. 
+ + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + cluster_i (int): cluster i of interest + cluster_j (int): cluster j of interest + + Returns: density_separation (float): + value corresponding to the maximum density between clusters + """ + indices_i = np.where(labels == cluster_i)[0] + indices_j = np.where(labels == cluster_j)[0] + shortest_paths = csgraph.dijkstra(MST, indices=indices_i) + relevant_paths = shortest_paths[:, indices_j] + density_separation = np.min(relevant_paths) + return density_separation + + + def _cluster_validity_index(self, MST, labels, cluster): + """ + Computes the validity of a cluster (validity of assignmnets) + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + cluster (int): cluster of interest + + Returns: cluster_validity (float) + value corresponding to the validity of cluster assignments + """ + min_density_separation = np.inf + for cluster_j in np.unique(labels): + if cluster_j != cluster: + cluster_density_separation = self._cluster_density_separation(MST, + labels, + cluster, + cluster_j) + if cluster_density_separation < min_density_separation: + min_density_separation = cluster_density_separation + cluster_density_sparseness = self._cluster_density_sparseness(MST, + labels, + cluster) + numerator = min_density_separation - cluster_density_sparseness + denominator = np.max([min_density_separation, cluster_density_sparseness]) + cluster_validity = numerator / denominator + return cluster_validity + + + def _clustering_validity_index(self, MST, labels): + """ + Computes the validity of all clustering assignments for a + clustering algorithm + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. 
+ labels (np.array): clustering assignments for data X + + Returns: validity_index (float): + score in range[-1, 1] indicating validity of clustering assignments + """ + n_samples = len(labels) + validity_index = 0 + for label in np.unique(labels): + fraction = np.sum(labels == label) / float(n_samples) + cluster_validity = self._cluster_validity_index(MST, labels, label) + validity_index += fraction * cluster_validity + return validity_index + + + def _get_label_members(self, X, labels, cluster): + """ + Helper function to get samples of a specified cluster. + + Args: + X (np.ndarray): ndarray with dimensions [n_samples, n_features] + data to check validity of clustering + labels (np.array): clustering assignments for data X + cluster (int): cluster of interest + + Returns: members (np.ndarray) + array of dimensions (n_samples, n_features) of samples of the + specified cluster. + """ + indices = np.where(labels == cluster)[0] + members = X[indices] + return members + + @property + def HDBSCAN_scores_(self): + return self._labels, self._hdbscan_score diff --git a/Class_Mod/UMAP_.py b/Class_Mod/UMAP_.py index e9ae0dc4c930947d47cf8b47660f5ea8d749905a..8d415ebb9b32761ea9c53c06a88363e0300206da 100644 --- a/Class_Mod/UMAP_.py +++ b/Class_Mod/UMAP_.py @@ -4,17 +4,28 @@ from Class_Mod.DATA_HANDLING import * class Umap: - def __init__(self, x, n_components, n_neighbors, min_dist): - self.numerical_data, categorical_data, scaled_values = col_cat(x) - self.catdata = list(categorical_data.columns) + """ + The UMAP dimension reduction algorithm from scikit learn + """ + def __init__(self, data_import, numerical_data, cat_data): + self.x = data_import + self.numerical_data = numerical_data + if len(cat_data) > 0: + self.categorical_data = cat_data + self.le = LabelEncoder() + self.categorical_data_encoded = self.le.fit_transform(self.categorical_data) - self.x = scaled_values - - self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,) - 
self.model.fit(self.x) - self.scores = self.model.transform(self.x) - self.scores = pd.DataFrame(self.scores, index = self.numerical_data.index) + else: + self.categorical_data, self.categorical_data_encoded = False, None # no categorical labels: define the attribute so fit() runs unsupervised (y=None) instead of raising AttributeError + + self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,) + self.model.fit(self.numerical_data, y = self.categorical_data_encoded) + self.scores_raw = self.model.transform(self.numerical_data) + self.scores = pd.DataFrame(self.scores_raw, index = self.x.index) @property def scores_(self): - return self.scores \ No newline at end of file + return self.scores + @property + def scores_raw_(self): + return self.scores_raw \ No newline at end of file diff --git a/Class_Mod/__init__.py b/Class_Mod/__init__.py index eb2dbb5b6b3a030cfa727730bf21e84ba9ed0948..c684862836ba8af35807b889e3b822f091dad3d6 100644 --- a/Class_Mod/__init__.py +++ b/Class_Mod/__init__.py @@ -8,3 +8,5 @@ from .Regression_metrics import metrics from .VarSel import TpeIpls from .Miscellaneous import resid_plot, reg_plot from .DxReader import DxRead +from .HDBSCAN_Clustering import Hdbscan + diff --git a/Modules.py b/Modules.py index 54399173517fa1fbd82e19b0df1cca4a63e380a2..0076fb22adc7da0d1aec6530ee3f6ab0a754d370 100644 --- a/Modules.py +++ b/Modules.py @@ -1,4 +1,4 @@ -from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead +from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan # find_col_index from Class_Mod.Miscellaneous import prediction, download_results diff --git a/Packages.py b/Packages.py index 9cad07bc174a70c11930d36b1a3e9f5c6d0ef109..b0d939baa8021ba8dfa14088d1b33d972500954d 100644 --- a/Packages.py +++ b/Packages.py @@ -1,4 +1,3 @@ - ## Data loading, handling, and preprocessing import os import sys @@ -10,14 +9,18 @@ import numpy as np import pandas as pd from os import listdir from
os.path import isfile, join -from sklearn.preprocessing import StandardScaler, MinMaxScaler +from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder import time + ### Exploratory data analysis-Dimensionality reduction from umap.umap_ import UMAP from sklearn.decomposition import PCA, NMF # Clustering from sklearn.cluster import KMeans, HDBSCAN +from scipy.spatial.distance import euclidean, cdist +from scipy.sparse.csgraph import minimum_spanning_tree +from scipy.sparse import csgraph # Modelling # import julia @@ -38,6 +41,7 @@ from PIL import Image import plotly.express as px import matplotlib.pyplot as plt import seaborn as sns + ### Important Metrics from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score @@ -49,6 +53,7 @@ from tempfile import NamedTemporaryFile #Library for connecting to SQL DB import pyodbc + #Library for reading the config file, which is in JSON import json diff --git a/pages/1-samples_selection.py b/pages/1-samples_selection.py index cb1348dca6451bbf023afb0051ff68b4a8963dbc..ffb4d81631eab0beda7d3fd473b21e004a6704f4 100644 --- a/pages/1-samples_selection.py +++ b/pages/1-samples_selection.py @@ -36,6 +36,7 @@ with container1: else: col = False data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) + data_import, categorical_data, scaled_values = col_cat(data_import) st.success("The data have been loaded successfully", icon="✅") ## Visualize spectra @@ -103,23 +104,27 @@ with container2: if type_plot == 'PCA': model = LinearPCA(data_import, Ncomp=5) elif type_plot =='UMAP': - model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0) - + model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data) if type_plot in ['PCA', 'UMAP']: - # add 2 select lists to choose which component to plot - axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0) - axis2 = pc.selectbox("y-axis", options = 
model.scores_.columns, index=1) - axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2) + if type_plot in ['PCA']: + # add 2 select lists to choose which component to plot + axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0) + axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1) + axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2) + elif type_plot in ['UMAP']: + axis1 = 0 + axis2 = 1 + axis3 = 2 if type_cluster == 'Kmeans': scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1) cl = Sk_Kmeans(scsc, max_clusters = 30) elif type_cluster == 'HDBSCAN': - from Class_Mod.HDBSCAN_Clustering import HDBSCAN_function - labels, hdbscan_score = HDBSCAN_function(data_import) + optimized_hdbscan = Hdbscan(model.scores_raw_) + labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_ with scores: t = model.scores_ if type_cluster in ['AP', 'Kmeans']: @@ -140,7 +145,9 @@ with container2: fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels) fig.update_traces(marker=dict(size=4)) # st.plotly_chart(fig_hdbscan) - st.write('DBCV score (-1:1) = ' + str(hdbscan_score)) + st.write('Optimal number of clusters = ' + str(len(set(labels) - {-1}))) # -1 is the noise label reported as "Unclassified" below, not a cluster + st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3))) + st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).') else: if test == '.dx': @@ -190,7 +197,6 @@ with container2: fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals") st.plotly_chart(fig) - - else: - st.markdown('Select a dimensionality reduction technique from the dropdown list') + else: + st.markdown('Select a dimensionality reduction technique from the dropdown list')