updated HDBSCAN becomes a class and supervised UMAP

f98b67f8 · Nicolas Barthes · 4b97a50f · f98b67f8 · f98b67f8 · f98b67f8
Commit f98b67f8 authored 11 months ago by Nicolas Barthes
--- a/Class_Mod/HDBSCAN_Clustering.py
+++ b/Class_Mod/HDBSCAN_Clustering.py
 from Packages import *
-from scipy.spatial.distance import euclidean, cdist
+class Hdbscan:
-from scipy.sparse.csgraph import minimum_spanning_tree
-from scipy.sparse import csgraph
-def DBCV(X, labels, dist_function=euclidean):
-    """
-    Implimentation of Density-Based Clustering Validation "DBCV"
-    Citation:
-    Moulavi, Davoud, et al. "Density-based clustering validation."
-    Proceedings of the 2014 SIAM International Conference on Data Mining.
-    Society for Industrial and Applied Mathematics, 2014.
-    Density Based clustering validation
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-    Returns: cluster_validity (float)
-        score in range[-1, 1] indicating validity of clustering assignments
-    """
-    graph = _mutual_reach_dist_graph(X, labels, dist_function)
-    mst = _mutual_reach_dist_MST(graph)
-    cluster_validity = _clustering_validity_index(mst, labels)
-    return cluster_validity
-def _core_dist(point, neighbors, dist_function):
-    """
-    Computes the core distance of a point.
-    Core distance is the inverse density of an object.
-    Args:
-        point (np.array): array of dimensions (n_features,)
-            point to compute core distance of
-        neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
-            array of all other points in object class
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-    Returns: core_dist (float)
-        inverse density of point
-    """
-    n_features = np.shape(point)[0]
-    n_neighbors = np.shape(neighbors)[0]
-    distance_vector = cdist(point.reshape(1, -1), neighbors)
-    distance_vector = distance_vector[distance_vector != 0]
-    numerator = ((1/distance_vector)**n_features).sum()
-    core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features)
-    return core_dist
-def _mutual_reachability_dist(point_i, point_j, neighbors_i,
-                              neighbors_j, dist_function):
-    """.
-    Computes the mutual reachability distance between points
-    Args:
-        point_i (np.array): array of dimensions (n_features,)
-            point i to compare to point j
-        point_j (np.array): array of dimensions (n_features,)
-            point i to compare to point i
-        neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
-            array of all other points in object class of point i
-        neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
-            array of all other points in object class of point j
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-    Returns: mutual_reachability (float)
-        mutual reachability between points i and j
    """
-    core_dist_i = _core_dist(point_i, neighbors_i, dist_function)
+    Runs an automatically optimized sklearn.cluster.HDBSCAN clustering on the dimensionality-reduced space.
-    core_dist_j = _core_dist(point_j, neighbors_j, dist_function)
+    Vars:
-    dist = dist_function(point_i, point_j)
+        data: the dimensionality-reduced space, i.e. the raw output of the fitted UMAP model's transform()
-    mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
+        param_dist: the HDBSCAN optimization parameters to test
-    return mutual_reachability
+        Density-Based Clustering Validation - DBCV (https://github.com/christopherjenness/DBCV/tree/master ;
+            Moulavi, Davoud, et al. "Density-based clustering validation." Proceedings of the 2014 SIAM
+            International Conference on Data Mining. Society for Industrial and Applied Mathematics, 2014.)
-def _mutual_reach_dist_graph(X, labels, dist_function):
+            is used as a metric to optimize HDBSCAN algorithm.
-    """
+            Functions DBCV, _core_dist, _mutual_reachability_dist, _mutual_reach_dist_graph,
-    Computes the mutual reach distance complete graph.
+            _mutual_reach_dist_MST, _cluster_density_sparseness, _cluster_density_separation, _cluster_validity_index,
-    Graph of all pair-wise mutual reachability distances between points
+            _clustering_validity_index and _get_label_members aim at DBCV computing.
+        _score is a dataframe with the DBCV value for each combination of param_dist. We search for the highest value and
-    Args:
+            compute an HDBSCAN with the best parameters.
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+        The HDBSCAN_scores_ @property returns the cluster label of each sample (_labels) and the best DBCV score.
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-    Returns: graph (np.ndarray)
-        array of dimensions (n_samples, n_samples)
-        Graph of all pair-wise mutual reachability distances between points.
-    """
-    n_samples = np.shape(X)[0]
-    graph = []
-    counter = 0
-    for row in range(n_samples):
-        graph_row = []
-        for col in range(n_samples):
-            point_i = X[row]
-            point_j = X[col]
-            class_i = labels[row]
-            class_j = labels[col]
-            members_i = _get_label_members(X, labels, class_i)
-            members_j = _get_label_members(X, labels, class_j)
-            dist = _mutual_reachability_dist(point_i, point_j,
-                                             members_i, members_j,
-                                             dist_function)
-            graph_row.append(dist)
-        counter += 1
-        graph.append(graph_row)
-    graph = np.array(graph)
-    return graph
-def _mutual_reach_dist_MST(dist_tree):
-    """
-    Computes minimum spanning tree of the mutual reach distance complete graph
-    Args:
-        dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
-            Graph of all pair-wise mutual reachability distances
-            between points.
-    Returns: minimum_spanning_tree (np.ndarray)
-        array of dimensions (n_samples, n_samples)
-        minimum spanning tree of all pair-wise mutual reachability
-            distances between points.
-    """
-    mst = minimum_spanning_tree(dist_tree).toarray()
-    return mst + np.transpose(mst)
-def _cluster_density_sparseness(MST, labels, cluster):
-    """
-    Computes the cluster density sparseness, the minimum density
-        within a cluster
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-    Returns: cluster_density_sparseness (float)
-        value corresponding to the minimum density within a cluster
-    """
-    indices = np.where(labels == cluster)[0]
-    cluster_MST = MST[indices][:, indices]
-    cluster_density_sparseness = np.max(cluster_MST)
-    return cluster_density_sparseness
-def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
-    """
-    Computes the density separation between two clusters, the maximum
-        density between clusters.
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster_i (int): cluster i of interest
-        cluster_j (int): cluster j of interest
-    Returns: density_separation (float):
-        value corresponding to the maximum density between clusters
-    """
-    indices_i = np.where(labels == cluster_i)[0]
-    indices_j = np.where(labels == cluster_j)[0]
-    shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
-    relevant_paths = shortest_paths[:, indices_j]
-    density_separation = np.min(relevant_paths)
-    return density_separation
-def _cluster_validity_index(MST, labels, cluster):
-    """
-    Computes the validity of a cluster (validity of assignmnets)
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-    Returns: cluster_validity (float)
-        value corresponding to the validity of cluster assignments
-    """
-    min_density_separation = np.inf
-    for cluster_j in np.unique(labels):
-        if cluster_j != cluster:
-            cluster_density_separation = _cluster_density_separation(MST,
-                                                                     labels,
-                                                                     cluster,
-                                                                     cluster_j)
-            if cluster_density_separation < min_density_separation:
-                min_density_separation = cluster_density_separation
-    cluster_density_sparseness = _cluster_density_sparseness(MST,
-                                                             labels,
-                                                             cluster)
-    numerator = min_density_separation - cluster_density_sparseness
-    denominator = np.max([min_density_separation, cluster_density_sparseness])
-    cluster_validity = numerator / denominator
-    return cluster_validity
-def _clustering_validity_index(MST, labels):
-    """
-    Computes the validity of all clustering assignments for a
-    clustering algorithm
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-    Returns: validity_index (float):
-        score in range[-1, 1] indicating validity of clustering assignments
-    """
-    n_samples = len(labels)
-    validity_index = 0
-    for label in np.unique(labels):
-        fraction = np.sum(labels == label) / float(n_samples)
-        cluster_validity = _cluster_validity_index(MST, labels, label)
-        validity_index += fraction * cluster_validity
-    return validity_index
-def _get_label_members(X, labels, cluster):
-    """
-    Helper function to get samples of a specified cluster.
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-    Returns: members (np.ndarray)
-        array of dimensions (n_samples, n_features) of samples of the
-        specified cluster.
    """
-    indices = np.where(labels == cluster)[0]
+    def __init__(self, data):
-    members = X[indices]
+        # self._param_dist = {'min_samples': [1],
-    return members
+        #               'min_cluster_size':[5,10],
+        #               'metric' : ['euclidean','manhattan'],
-def HDBSCAN_function(data):
+        #               }
-    # param_dist = {'min_samples': [1,5,10,30],
+        self._param_dist = {'min_samples': [1,5,10,],
-    #               'min_cluster_size':[5,10,20,30,50,75,100],
+                      'min_cluster_size':[5,25,50,],
-    #               # 'cluster_selection_method' : ['eom','leaf'],
+                      'metric' : ['euclidean','manhattan'],
-    #               # 'metric' : ['euclidean','manhattan']
+                      }
-    #               }
-    # param_dist = {'min_samples': [1,5,10,50],
+        self._clusterable_embedding = data
-    #               'min_cluster_size':[5,10,30,50,100,300,500],
-    #               }
+        # RandomizedSearchCV not working...
-    param_dist = {'min_samples': [1,5, 10,],
+        # def scoring(model, clusterable_embedding):
-                  'min_cluster_size':[5,10,30,50,100],
+        #     label = HDBSCAN().fit_predict(clusterable_embedding)
-                  'metric' : ['euclidean','manhattan'],
+        #     hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean)
-                  }
+        #     return hdbscan_score
+        # tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist,  scoring=scoring)
-    clusterable_embedding = UMAP(
+        # tunning.fit(clusterable_embedding)
-        n_neighbors=20,
+        # return tunning
-        min_dist=0.0,
-        n_components=5,
+        # compute optimization. Test each combination of parameters and store DBCV score into _score.
-        random_state=42,
+        self._score = pd.DataFrame()
-    ).fit_transform(data)
+        for i in self._param_dist.get('min_samples'):
+            for j in self._param_dist.get('min_cluster_size'):
-    # RandomizedSearchCV not working...
+                self._ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(self._clusterable_embedding)
-    # def scoring(model, clusterable_embedding):
+                self._ij_hdbscan_score = self.DBCV(self._clusterable_embedding, self._ij_label,)# dist_function=euclidean)
-    #     label = HDBSCAN().fit_predict(clusterable_embedding)
+                self._score.at[i,j] = self._ij_hdbscan_score
-    #     hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean)
+        # get the best DBCV score
-    #     return hdbscan_score
+        self._hdbscan_score  = max(self._score.max())
-    # tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist,  scoring=scoring)
+        # find the coordinates of the best clustering parameters and run HDBSCAN below
-    # tunning.fit(clusterable_embedding)
+        self._bparams = np.where(self._score == self._hdbscan_score)
-    # return tunning
+        # run HDBSCAN with best params
-    min_score = pd.DataFrame()
+        self._labels = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]], metric=self._param_dist['metric'][self._bparams[1][0]]).fit_predict(self._clusterable_embedding)
-    for i in param_dist.get('min_samples'):
-        for j in param_dist.get('min_cluster_size'):
+    def DBCV(self, X, labels, dist_function=euclidean):
-            ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(clusterable_embedding)
+        """
-            ij_hdbscan_score = DBCV(clusterable_embedding, ij_label, dist_function=euclidean)
+        Implementation of Density-Based Clustering Validation "DBCV"
-            min_score.at[i,j] = ij_hdbscan_score
-    hdbscan_score  = max(min_score.max())
+        Citation:
-    # get the coordinates of the best clustering parameters and run HDBSCAN below
+        Moulavi, Davoud, et al. "Density-based clustering validation."
-    bparams = np.where(min_score == hdbscan_score)
+        Proceedings of the 2014 SIAM International Conference on Data Mining.
-    # run HDBSCAN with best params
+        Society for Industrial and Applied Mathematics, 2014.
-    labels = HDBSCAN(min_samples=param_dist['min_samples'][bparams[0][0]], min_cluster_size=param_dist['min_cluster_size'][bparams[1][0]], metric=param_dist['metric'][bparams[1][0]]).fit_predict(clusterable_embedding)
-    return labels, hdbscan_score
+        Density Based clustering validation
+        Args:
+            X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+                data to check validity of clustering
+            labels (np.array): clustering assignments for data X
+            dist_function (func): function to determine distance between objects
+                func args must be [np.array, np.array] where each array is a point
+        Returns: cluster_validity (float)
+            score in range[-1, 1] indicating validity of clustering assignments
+        """
+        graph = self._mutual_reach_dist_graph(X, labels, dist_function)
+        mst = self._mutual_reach_dist_MST(graph)
+        cluster_validity = self._clustering_validity_index(mst, labels)
+        return cluster_validity
+    def _core_dist(self, point, neighbors, dist_function):
+        """
+        Computes the core distance of a point.
+        Core distance is the inverse density of an object.
+        Args:
+            point (np.array): array of dimensions (n_features,)
+                point to compute core distance of
+            neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
+                array of all other points in object class
+            dist_function (func): function to determine distance between objects
+                func args must be [np.array, np.array] where each array is a point
+        Returns: core_dist (float)
+            inverse density of point
+        """
+        n_features = np.shape(point)[0]
+        n_neighbors = np.shape(neighbors)[0]
+        distance_vector = cdist(point.reshape(1, -1), neighbors)
+        distance_vector = distance_vector[distance_vector != 0]
+        numerator = ((1/distance_vector)**n_features).sum()
+        core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features)
+        return core_dist
+    def _mutual_reachability_dist(self, point_i, point_j, neighbors_i,
+                                  neighbors_j, dist_function):
+        """.
+        Computes the mutual reachability distance between points
+        Args:
+            point_i (np.array): array of dimensions (n_features,)
+                point i to compare to point j
+            point_j (np.array): array of dimensions (n_features,)
+                point j to compare to point i
+            neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
+                array of all other points in object class of point i
+            neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
+                array of all other points in object class of point j
+            dist_function (func): function to determine distance between objects
+                func args must be [np.array, np.array] where each array is a point
+        Returns: mutual_reachability (float)
+            mutual reachability between points i and j
+        """
+        core_dist_i = self._core_dist(point_i, neighbors_i, dist_function)
+        core_dist_j = self._core_dist(point_j, neighbors_j, dist_function)
+        dist = dist_function(point_i, point_j)
+        mutual_reachability = np.max([core_dist_i, core_dist_j, dist])
+        return mutual_reachability
+    def _mutual_reach_dist_graph(self, X, labels, dist_function):
+        """
+        Computes the mutual reach distance complete graph.
+        Graph of all pair-wise mutual reachability distances between points
+        Args:
+            X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+                data to check validity of clustering
+            labels (np.array): clustering assignments for data X
+            dist_function (func): function to determine distance between objects
+                func args must be [np.array, np.array] where each array is a point
+        Returns: graph (np.ndarray)
+            array of dimensions (n_samples, n_samples)
+            Graph of all pair-wise mutual reachability distances between points.
+        """
+        n_samples = np.shape(X)[0]
+        graph = []
+        counter = 0
+        for row in range(n_samples):
+            graph_row = []
+            for col in range(n_samples):
+                point_i = X[row]
+                point_j = X[col]
+                class_i = labels[row]
+                class_j = labels[col]
+                members_i = self._get_label_members(X, labels, class_i)
+                members_j = self._get_label_members(X, labels, class_j)
+                dist = self._mutual_reachability_dist(point_i, point_j,
+                                                 members_i, members_j,
+                                                 dist_function)
+                graph_row.append(dist)
+            counter += 1
+            graph.append(graph_row)
+        graph = np.array(graph)
+        return graph
+    def _mutual_reach_dist_MST(self, dist_tree):
+        """
+        Computes minimum spanning tree of the mutual reach distance complete graph
+        Args:
+            dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
+                Graph of all pair-wise mutual reachability distances
+                between points.
+        Returns: minimum_spanning_tree (np.ndarray)
+            array of dimensions (n_samples, n_samples)
+            minimum spanning tree of all pair-wise mutual reachability
+                distances between points.
+        """
+        mst = minimum_spanning_tree(dist_tree).toarray()
+        return mst + np.transpose(mst)
+    def _cluster_density_sparseness(self, MST, labels, cluster):
+        """
+        Computes the cluster density sparseness, the minimum density
+            within a cluster
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+            cluster (int): cluster of interest
+        Returns: cluster_density_sparseness (float)
+            value corresponding to the minimum density within a cluster
+        """
+        indices = np.where(labels == cluster)[0]
+        cluster_MST = MST[indices][:, indices]
+        cluster_density_sparseness = np.max(cluster_MST)
+        return cluster_density_sparseness
+    def _cluster_density_separation(self, MST, labels, cluster_i, cluster_j):
+        """
+        Computes the density separation between two clusters, the maximum
+            density between clusters.
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+            cluster_i (int): cluster i of interest
+            cluster_j (int): cluster j of interest
+        Returns: density_separation (float):
+            value corresponding to the maximum density between clusters
+        """
+        indices_i = np.where(labels == cluster_i)[0]
+        indices_j = np.where(labels == cluster_j)[0]
+        shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
+        relevant_paths = shortest_paths[:, indices_j]
+        density_separation = np.min(relevant_paths)
+        return density_separation
+    def _cluster_validity_index(self, MST, labels, cluster):
+        """
+        Computes the validity of a cluster (validity of assignments)
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+            cluster (int): cluster of interest
+        Returns: cluster_validity (float)
+            value corresponding to the validity of cluster assignments
+        """
+        min_density_separation = np.inf
+        for cluster_j in np.unique(labels):
+            if cluster_j != cluster:
+                cluster_density_separation = self._cluster_density_separation(MST,
+                                                                         labels,
+                                                                         cluster,
+                                                                         cluster_j)
+                if cluster_density_separation < min_density_separation:
+                    min_density_separation = cluster_density_separation
+        cluster_density_sparseness = self._cluster_density_sparseness(MST,
+                                                                 labels,
+                                                                 cluster)
+        numerator = min_density_separation - cluster_density_sparseness
+        denominator = np.max([min_density_separation, cluster_density_sparseness])
+        cluster_validity = numerator / denominator
+        return cluster_validity
+    def _clustering_validity_index(self, MST, labels):
+        """
+        Computes the validity of all clustering assignments for a
+        clustering algorithm
+        Args:
+            MST (np.ndarray): minimum spanning tree of all pair-wise
+                mutual reachability distances between points.
+            labels (np.array): clustering assignments for data X
+        Returns: validity_index (float):
+            score in range[-1, 1] indicating validity of clustering assignments
+        """
+        n_samples = len(labels)
+        validity_index = 0
+        for label in np.unique(labels):
+            fraction = np.sum(labels == label) / float(n_samples)
+            cluster_validity = self._cluster_validity_index(MST, labels, label)
+            validity_index += fraction * cluster_validity
+        return validity_index
+    def _get_label_members(self, X, labels, cluster):
+        """
+        Helper function to get samples of a specified cluster.
+        Args:
+            X (np.ndarray): ndarray with dimensions [n_samples, n_features]
+                data to check validity of clustering
+            labels (np.array): clustering assignments for data X
+            cluster (int): cluster of interest
+        Returns: members (np.ndarray)
+            array of dimensions (n_samples, n_features) of samples of the
+            specified cluster.
+        """
+        indices = np.where(labels == cluster)[0]
+        members = X[indices]
+        return members
+    @property
+    def HDBSCAN_scores_(self):
+         return self._labels, self._hdbscan_score
--- a/Class_Mod/UMAP_.py
+++ b/Class_Mod/UMAP_.py
@@ -4,17 +4,28 @@ from Class_Mod.DATA_HANDLING import *
 class Umap:
-    def __init__(self, x, n_components, n_neighbors, min_dist):
+    """
-        self.numerical_data, categorical_data, scaled_values = col_cat(x)
+    The UMAP dimension reduction algorithm from the umap-learn package
-        self.catdata = list(categorical_data.columns)
+    """
+    def __init__(self, data_import, numerical_data, cat_data):
+        self.x = data_import
+        self.numerical_data = numerical_data
+        if len(cat_data) > 0:
+            self.categorical_data = cat_data
+            self.le = LabelEncoder()
+            self.categorical_data_encoded = self.le.fit_transform(self.categorical_data)
-        self.x = scaled_values
+        else:
+            self.categorical_data = False
-        self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,)
-        self.model.fit(self.x)
+        self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,)
-        self.scores = self.model.transform(self.x)
+        self.model.fit(self.numerical_data, y = self.categorical_data_encoded)
-        self.scores = pd.DataFrame(self.scores, index = self.numerical_data.index)
+        self.scores_raw = self.model.transform(self.numerical_data)
+        self.scores = pd.DataFrame(self.scores_raw, index = self.x.index)
    @property
    def scores_(self):
        return self.scores
\ No newline at end of file
+    @property
+    def scores_raw_(self):
+        return self.scores_raw
\ No newline at end of file
--- a/Class_Mod/__init__.py
+++ b/Class_Mod/__init__.py
@@ -8,3 +8,5 @@ from .Regression_metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
 from .DxReader import DxRead
+from .HDBSCAN_Clustering import Hdbscan
--- a/Modules.py
+++ b/Modules.py
-from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead
+from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan
 # find_col_index
 from Class_Mod.Miscellaneous import prediction, download_results
--- a/Packages.py
+++ b/Packages.py
 ## Data loading, handling, and preprocessing
 import os
 import sys
@@ -10,14 +9,18 @@ import numpy as np
 import pandas as pd
 from os import listdir
 from os.path import isfile, join
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
 import time
 ### Exploratory data analysis-Dimensionality reduction
 from umap.umap_ import UMAP
 from sklearn.decomposition import PCA, NMF
 # Clustering
 from sklearn.cluster import KMeans, HDBSCAN
+from scipy.spatial.distance import euclidean, cdist
+from scipy.sparse.csgraph import minimum_spanning_tree
+from scipy.sparse import csgraph
 # Modelling
 # import julia
@@ -38,6 +41,7 @@ from PIL import Image
 import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
 ### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
@@ -49,6 +53,7 @@ from tempfile import NamedTemporaryFile
 #Library for connecting to SQL DB
 import pyodbc
 #Library for reading the config file, which is in JSON
 import json

--- a/pages/1-samples_selection.py
+++ b/pages/1-samples_selection.py
@@ -36,6 +36,7 @@ with container1:
                else:
                    col = False
                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
+                data_import, categorical_data, scaled_values = col_cat(data_import)
                st.success("The data have been loaded successfully", icon="✅")
                ## Visualize spectra
@@ -103,23 +104,27 @@ with container2:
            if type_plot == 'PCA':
                model = LinearPCA(data_import, Ncomp=5)
            elif type_plot =='UMAP':
-                model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0)
+                model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
        if type_plot in ['PCA', 'UMAP']:
-            # add 2 select lists to choose which component to plot
+            if type_plot in ['PCA']:
-            axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
+                # add 2 select lists to choose which component to plot
-            axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
+                axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
-            axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
+                axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
+                axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
+            elif type_plot in ['UMAP']:
+                axis1 = 0
+                axis2 = 1
+                axis3 = 2
            if type_cluster == 'Kmeans':
                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
                cl = Sk_Kmeans(scsc, max_clusters = 30)
            elif type_cluster == 'HDBSCAN':
-                from Class_Mod.HDBSCAN_Clustering import HDBSCAN_function
+                optimized_hdbscan = Hdbscan(model.scores_raw_)
-                labels, hdbscan_score = HDBSCAN_function(data_import)
+                labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
            with scores:
                t = model.scores_
                if type_cluster in ['AP', 'Kmeans']:
@@ -140,7 +145,9 @@ with container2:
                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
                    fig.update_traces(marker=dict(size=4))
                    # st.plotly_chart(fig_hdbscan)
-                    st.write('DBCV score (-1:1) = ' + str(hdbscan_score))
+                    st.write('Optimal number of clusters = ' + str(len(set(labels))))
+                    st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
+                    st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
                else:
                    if test == '.dx':
@@ -190,7 +197,6 @@ with container2:
                    fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="T²",yaxis_title="Residuals")
                    st.plotly_chart(fig)
+        else:
-            else:
+            st.markdown('Select a dimensionality reduction technique from the dropdown list')
-                st.markdown('Select a dimensionality reduction technique from the dropdown list')