# NOTE: removed a web-scrape pagination artifact ("Newer"/"Older" links plus a
# bare 1-410 line-number gutter) that was copied in with the source and is not
# valid Python.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kmeans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class Sk_Kmeans:
    """K-Means clustering for samples selection.

    Fits one KMeans model per cluster count from 1 to ``max_clusters`` and
    uses the relative inertia drop between consecutive counts (elbow rule)
    to suggest an optimal number of clusters.

    Attributes:
        x (DataFrame): the original reduced data to cluster.
        max_clusters (int): the maximum number of clusters tested.
        inertia (DataFrame): one column per cluster count ('1_clust', ...),
            single row 'inertia' holding the fitted model's inertia.
    """
    def __init__(self, x, max_clusters):
        """Fit KMeans for every cluster count and record inertias.

        Args:
            x (DataFrame): the original reduced data to cluster.
            max_clusters (int): the max number of desired clusters.
        """
        self.x = x
        self.max_clusters = max_clusters
        self.inertia = DataFrame()
        for i in range(1, max_clusters + 1):
            model = KMeans(n_clusters=i, init='k-means++', random_state=42)
            model.fit(x)
            self.inertia[f'{i}_clust'] = [model.inertia_]
        self.inertia.index = ['inertia']

    @property
    def inertia_(self):
        """DataFrame of inertia per tested cluster count."""
        return self.inertia

    @property
    def suggested_n_clusters_(self):
        """Suggested cluster count: last count whose inertia drop exceeds 5%.

        Returns:
            int: suggested number of clusters; falls back to 1 when no
            consecutive inertia drop exceeds 5% (the original code crashed
            on ``np.max`` of an empty selection in that case).
        """
        s = self.inertia.to_numpy().ravel()
        # Percentage drop of inertia between consecutive cluster counts.
        drops = [(s[i] - s[i + 1]) * 100 / s[i] for i in range(self.max_clusters - 1)]
        significant = np.where(np.array(drops) > 5)[0]
        if significant.size == 0:
            # No elbow detected: a single cluster is the safest default.
            return 1
        # Index i of drops compares i+1 vs i+2 clusters, hence the +2 offset.
        return int(np.max(significant)) + 2

    @property
    def fit_optimal_(self):
        """Fit KMeans with the suggested cluster count.

        Returns:
            tuple: (x, clu, centers) where ``x`` is the input data, ``clu``
            the 'cluster#N' label per sample, and ``centers`` the cluster
            center coordinates.
        """
        model = KMeans(n_clusters=self.suggested_n_clusters_, init='k-means++', random_state=42)
        model.fit(self.x)
        yp = model.predict(self.x) + 1
        clu = [f'cluster#{i}' for i in yp]
        return self.x, clu, model.cluster_centers_
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~hdbscan ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class Hdbscan:
    """Runs sklearn.HDBSCAN clustering on a dimensionality-reduced space.

    The model is fitted with fixed parameters (the "really fast" preset
    below); labels and medoids are exposed through properties.

    Attributes:
        _labels (np.ndarray): cluster number per sample (-1 means noise).
        _centers (np.ndarray): medoid coordinates of each cluster.

    Examples:
        - clustering = Hdbscan(data)
        - labels = clustering.labels_
    """
    def __init__(self, data):
        """Run HDBSCAN on the reduced space.

        Args:
            data (DataFrame): the dimensionality-reduced space, raw result
                of UMAP.fit().
        """
        # "Really fast" fixed preset. Wider grids previously experimented with:
        #   medium:   {'min_samples': [1, 10], 'min_cluster_size': [5, 50],
        #              'metric': ['euclidean', 'manhattan']}
        #   complete: {'min_samples': [1, 5, 10], 'min_cluster_size': [5, 25, 50],
        #              'metric': ['euclidean', 'manhattan']}
        self._param_dist = {'min_samples': [8],
                            'min_cluster_size': [10],
                            'metric': ['euclidean'],
                            }
        self._clusterable_embedding = data
        # NOTE(review): a large DBCV-based hyperparameter search (grid loop +
        # a full DBCV implementation after Moulavi et al., SIAM SDM 2014) was
        # entirely commented out here (dead code) and has been removed; the
        # model is fitted directly with the fixed parameters above. Recover it
        # from version control if the optimization is ever reinstated.
        self.best_hdbscan = HDBSCAN(min_samples=self._param_dist['min_samples'][0],
                                    min_cluster_size=self._param_dist['min_cluster_size'][0],
                                    metric=self._param_dist['metric'][0],
                                    store_centers="medoid", )
        # fit_predict's return value was discarded in the original; fit() is
        # equivalent since labels are read from labels_ afterwards.
        self.best_hdbscan.fit(self._clusterable_embedding)
        self._labels = self.best_hdbscan.labels_
        self._centers = self.best_hdbscan.medoids_

    @property
    def centers_(self):
        """Medoid coordinates of each cluster (np.ndarray)."""
        return self._centers

    @property
    def labels_(self):
        """Human-readable cluster name per sample; noise (-1) is 'Non clustered'."""
        return [f'cluster#{i+1}' if i != -1 else 'Non clustered' for i in self._labels]

    @property
    def non_clustered(self):
        """Indices of samples HDBSCAN left unclustered (label -1)."""
        # Reuse labels_ instead of duplicating the mapping expression.
        return np.where(np.array(self.labels_) == 'Non clustered')[0]
# ~~~~~~~~~~~~~~~~~~~~~~~~~ ap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
class AP:
    """Affinity Propagation clustering wrapper.

    Fits sklearn's AffinityPropagation on the input matrix and exposes the
    data, per-sample cluster names, and cluster centers via ``fit_optimal_``.
    """
    def __init__(self, X):
        # Private copy of the input matrix.
        self.__x = np.array(X)
        # Fit the Affinity Propagation model (all keyword values are the
        # sklearn defaults, spelled out explicitly).
        self.M = AffinityPropagation(damping=0.5, max_iter=200, convergence_iter=15,
                                     copy=True, preference=None, affinity='euclidean',
                                     verbose=False, random_state=None)
        self.M.fit(self.__x)
        # 1-based cluster index for each sample.
        self.yp = self.M.predict(self.__x) + 1

    @property
    def fit_optimal_(self):
        """Return (data, 'cluster#N' label per sample, cluster centers)."""
        names = [f'cluster#{label}' for label in self.yp]
        return self.__x, names, self.M.cluster_centers_