diff --git a/hdbscan.py b/hdbscan.py new file mode 100644 index 0000000000000000000000000000000000000000..0d9268bfed8e6c97db22f7165424052a13110f60 --- /dev/null +++ b/hdbscan.py @@ -0,0 +1,285 @@ +from Packages import * +from scipy.spatial.distance import euclidean, cdist +from scipy.sparse.csgraph import minimum_spanning_tree +from scipy.sparse import csgraph + + +def DBCV(X, labels, dist_function=euclidean): + """ + Implimentation of Density-Based Clustering Validation "DBCV" + + Citation: + Moulavi, Davoud, et al. "Density-based clustering validation." + Proceedings of the 2014 SIAM International Conference on Data Mining. + Society for Industrial and Applied Mathematics, 2014. + + Density Based clustering validation + + Args: + X (np.ndarray): ndarray with dimensions [n_samples, n_features] + data to check validity of clustering + labels (np.array): clustering assignments for data X + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: cluster_validity (float) + score in range[-1, 1] indicating validity of clustering assignments + """ + graph = _mutual_reach_dist_graph(X, labels, dist_function) + mst = _mutual_reach_dist_MST(graph) + cluster_validity = _clustering_validity_index(mst, labels) + return cluster_validity + + +def _core_dist(point, neighbors, dist_function): + """ + Computes the core distance of a point. + Core distance is the inverse density of an object. + + Args: + point (np.array): array of dimensions (n_features,) + point to compute core distance of + neighbors (np.ndarray): array of dimensions (n_neighbors, n_features): + array of all other points in object class + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: core_dist (float) + inverse density of point + """ + n_features = np.shape(point)[0] + n_neighbors = np.shape(neighbors)[0] + + distance_vector = cdist(point.reshape(1, -1), neighbors) + distance_vector = distance_vector[distance_vector != 0] + numerator = ((1/distance_vector)**n_features).sum() + core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features) + return core_dist + + +def _mutual_reachability_dist(point_i, point_j, neighbors_i, + neighbors_j, dist_function): + """. + Computes the mutual reachability distance between points + + Args: + point_i (np.array): array of dimensions (n_features,) + point i to compare to point j + point_j (np.array): array of dimensions (n_features,) + point i to compare to point i + neighbors_i (np.ndarray): array of dims (n_neighbors, n_features): + array of all other points in object class of point i + neighbors_j (np.ndarray): array of dims (n_neighbors, n_features): + array of all other points in object class of point j + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: mutual_reachability (float) + mutual reachability between points i and j + + """ + core_dist_i = _core_dist(point_i, neighbors_i, dist_function) + core_dist_j = _core_dist(point_j, neighbors_j, dist_function) + dist = dist_function(point_i, point_j) + mutual_reachability = np.max([core_dist_i, core_dist_j, dist]) + return mutual_reachability + + +def _mutual_reach_dist_graph(X, labels, dist_function): + """ + Computes the mutual reach distance complete graph. + Graph of all pair-wise mutual reachability distances between points + + Args: + X (np.ndarray): ndarray with dimensions [n_samples, n_features] + data to check validity of clustering + labels (np.array): clustering assignments for data X + dist_dunction (func): function to determine distance between objects + func args must be [np.array, np.array] where each array is a point + + Returns: graph (np.ndarray) + array of dimensions (n_samples, n_samples) + Graph of all pair-wise mutual reachability distances between points. + + """ + n_samples = np.shape(X)[0] + graph = [] + counter = 0 + for row in range(n_samples): + graph_row = [] + for col in range(n_samples): + point_i = X[row] + point_j = X[col] + class_i = labels[row] + class_j = labels[col] + members_i = _get_label_members(X, labels, class_i) + members_j = _get_label_members(X, labels, class_j) + dist = _mutual_reachability_dist(point_i, point_j, + members_i, members_j, + dist_function) + graph_row.append(dist) + counter += 1 + graph.append(graph_row) + graph = np.array(graph) + return graph + + +def _mutual_reach_dist_MST(dist_tree): + """ + Computes minimum spanning tree of the mutual reach distance complete graph + + Args: + dist_tree (np.ndarray): array of dimensions (n_samples, n_samples) + Graph of all pair-wise mutual reachability distances + between points. + + Returns: minimum_spanning_tree (np.ndarray) + array of dimensions (n_samples, n_samples) + minimum spanning tree of all pair-wise mutual reachability + distances between points. + """ + mst = minimum_spanning_tree(dist_tree).toarray() + return mst + np.transpose(mst) + + +def _cluster_density_sparseness(MST, labels, cluster): + """ + Computes the cluster density sparseness, the minimum density + within a cluster + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + cluster (int): cluster of interest + + Returns: cluster_density_sparseness (float) + value corresponding to the minimum density within a cluster + """ + indices = np.where(labels == cluster)[0] + cluster_MST = MST[indices][:, indices] + cluster_density_sparseness = np.max(cluster_MST) + return cluster_density_sparseness + + +def _cluster_density_separation(MST, labels, cluster_i, cluster_j): + """ + Computes the density separation between two clusters, the maximum + density between clusters. + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + cluster_i (int): cluster i of interest + cluster_j (int): cluster j of interest + + Returns: density_separation (float): + value corresponding to the maximum density between clusters + """ + indices_i = np.where(labels == cluster_i)[0] + indices_j = np.where(labels == cluster_j)[0] + shortest_paths = csgraph.dijkstra(MST, indices=indices_i) + relevant_paths = shortest_paths[:, indices_j] + density_separation = np.min(relevant_paths) + return density_separation + + +def _cluster_validity_index(MST, labels, cluster): + """ + Computes the validity of a cluster (validity of assignmnets) + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + cluster (int): cluster of interest + + Returns: cluster_validity (float) + value corresponding to the validity of cluster assignments + """ + min_density_separation = np.inf + for cluster_j in np.unique(labels): + if cluster_j != cluster: + cluster_density_separation = _cluster_density_separation(MST, + labels, + cluster, + cluster_j) + if cluster_density_separation < min_density_separation: + min_density_separation = cluster_density_separation + cluster_density_sparseness = _cluster_density_sparseness(MST, + labels, + cluster) + numerator = min_density_separation - cluster_density_sparseness + denominator = np.max([min_density_separation, cluster_density_sparseness]) + cluster_validity = numerator / denominator + return cluster_validity + + +def _clustering_validity_index(MST, labels): + """ + Computes the validity of all clustering assignments for a + clustering algorithm + + Args: + MST (np.ndarray): minimum spanning tree of all pair-wise + mutual reachability distances between points. + labels (np.array): clustering assignments for data X + + Returns: validity_index (float): + score in range[-1, 1] indicating validity of clustering assignments + """ + n_samples = len(labels) + validity_index = 0 + for label in np.unique(labels): + fraction = np.sum(labels == label) / float(n_samples) + cluster_validity = _cluster_validity_index(MST, labels, label) + validity_index += fraction * cluster_validity + return validity_index + + +def _get_label_members(X, labels, cluster): + """ + Helper function to get samples of a specified cluster. + + Args: + X (np.ndarray): ndarray with dimensions [n_samples, n_features] + data to check validity of clustering + labels (np.array): clustering assignments for data X + cluster (int): cluster of interest + + Returns: members (np.ndarray) + array of dimensions (n_samples, n_features) of samples of the + specified cluster. + """ + indices = np.where(labels == cluster)[0] + members = X[indices] + return members + +def HDBSCAN_function(data, min_cluster_size): + # param_dist = {'min_samples': [1,5,10,30], + # 'min_cluster_size':[5,10,20,30,50,75,100], + # # 'cluster_selection_method' : ['eom','leaf'], + # # 'metric' : ['euclidean','manhattan'] + # } + param_dist = {'min_samples': [1,5], + 'min_cluster_size':[5,10], + } + + clusterable_embedding = UMAP( + n_neighbors=20, + min_dist=0.0, + n_components=5, + random_state=42, + ).fit_transform(data) + min_score = pd.DataFrame() + for i in param_dist.get('min_samples'): + for j in param_dist.get('min_cluster_size'): + ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(clusterable_embedding) + ij_hdbscan_score = DBCV(clusterable_embedding, ij_label, dist_function=euclidean) + min_score.at[i,j] = ij_hdbscan_score + hdbscan_score = max(min_score.max()) + # get the coordinates of the best clustering paramters and run HDBSCAN below + + labels = HDBSCAN(min_samples=1, min_cluster_size=min_cluster_size).fit_predict(clusterable_embedding) + return labels, hdbscan_score