Skip to content
Snippets Groups Projects
Commit f98b67f8 authored by Nicolas Barthes's avatar Nicolas Barthes
Browse files

updated HDBSCAN becomes a class and supervised UMAP

parent 4b97a50f
No related branches found
No related tags found
No related merge requests found
from Packages import * from Packages import *
from scipy.spatial.distance import euclidean, cdist class Hdbscan:
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csgraph
def DBCV(X, labels, dist_function=euclidean):
    """
    Score a clustering with Density-Based Clustering Validation (DBCV).

    Citation:
        Moulavi, Davoud, et al. "Density-based clustering validation."
        Proceedings of the 2014 SIAM International Conference on Data Mining.
        Society for Industrial and Applied Mathematics, 2014.

    The score is assembled in three steps: the complete mutual reachability
    graph, its minimum spanning tree, and finally the size-weighted validity
    of every label computed on that tree.

    Args:
        X (np.ndarray): ndarray with dimensions [n_samples, n_features],
            data whose clustering is being validated
        labels (np.array): clustering assignments for data X
        dist_function (func): function to determine distance between objects;
            func args must be [np.array, np.array] where each array is a point

    Returns: cluster_validity (float)
        score in range [-1, 1] indicating validity of clustering assignments
    """
    reach_graph = _mutual_reach_dist_graph(X, labels, dist_function)
    spanning_tree = _mutual_reach_dist_MST(reach_graph)
    return _clustering_validity_index(spanning_tree, labels)
def _core_dist(point, neighbors, dist_function):
"""
Computes the core distance of a point.
Core distance is the inverse density of an object.
Args:
point (np.array): array of dimensions (n_features,)
point to compute core distance of
neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
array of all other points in object class
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: core_dist (float)
inverse density of point
"""
n_features = np.shape(point)[0]
n_neighbors = np.shape(neighbors)[0]
distance_vector = cdist(point.reshape(1, -1), neighbors)
distance_vector = distance_vector[distance_vector != 0]
numerator = ((1/distance_vector)**n_features).sum()
core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features)
return core_dist
def _mutual_reachability_dist(point_i, point_j, neighbors_i,
neighbors_j, dist_function):
""".
Computes the mutual reachability distance between points
Args:
point_i (np.array): array of dimensions (n_features,)
point i to compare to point j
point_j (np.array): array of dimensions (n_features,)
point i to compare to point i
neighbors_i (np.ndarray): array of dims (n_neighbors, n_features):
array of all other points in object class of point i
neighbors_j (np.ndarray): array of dims (n_neighbors, n_features):
array of all other points in object class of point j
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: mutual_reachability (float)
mutual reachability between points i and j
""" """
core_dist_i = _core_dist(point_i, neighbors_i, dist_function) Runs an automatic optimized sklearn.HDBSCAN clustering on Dimensionality reducted space.
core_dist_j = _core_dist(point_j, neighbors_j, dist_function) Vars:
dist = dist_function(point_i, point_j) data: the Dimensionality reducted space, raw result of the UMAP.fit()
mutual_reachability = np.max([core_dist_i, core_dist_j, dist]) param_dist: the HDBSCAN optimization parameters to test
return mutual_reachability Density-Based Clustering Validation - DBCV (https://github.com/christopherjenness/DBCV/tree/master ;
Moulavi, Davoud, et al. "Density-based clustering validation." Proceedings of the 2014 SIAM
International Conference on Data Mining. Society for Industrial and Applied Mathematics, 2014.)
def _mutual_reach_dist_graph(X, labels, dist_function): is used as a metric to optimize HDBSCAN algorithm.
""" Functions DBCV, _core_dist, _mutual_reachability_dist, _mutual_reach_dist_graph, _mutual_reach_dist_graph,
Computes the mutual reach distance complete graph. _mutual_reach_dist_MST, _cluster_density_sparseness, _cluster_density_separation, _cluster_validity_index,
Graph of all pair-wise mutual reachability distances between points _clustering_validity_index and _get_label_members aim at DBCV computing.
_score is a dataframe with the DBCV value for each combination of param_dist. We search for the higher value and
Args: compute an HDBSCAN with the best parameters.
X (np.ndarray): ndarray with dimensions [n_samples, n_features] The HDBSCAN_scores_ @property return the cluster number of each sample (_labels) and the DBCV best score.
data to check validity of clustering
labels (np.array): clustering assignments for data X
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: graph (np.ndarray)
array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances between points.
"""
n_samples = np.shape(X)[0]
graph = []
counter = 0
for row in range(n_samples):
graph_row = []
for col in range(n_samples):
point_i = X[row]
point_j = X[col]
class_i = labels[row]
class_j = labels[col]
members_i = _get_label_members(X, labels, class_i)
members_j = _get_label_members(X, labels, class_j)
dist = _mutual_reachability_dist(point_i, point_j,
members_i, members_j,
dist_function)
graph_row.append(dist)
counter += 1
graph.append(graph_row)
graph = np.array(graph)
return graph
def _mutual_reach_dist_MST(dist_tree):
"""
Computes minimum spanning tree of the mutual reach distance complete graph
Args:
dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances
between points.
Returns: minimum_spanning_tree (np.ndarray)
array of dimensions (n_samples, n_samples)
minimum spanning tree of all pair-wise mutual reachability
distances between points.
"""
mst = minimum_spanning_tree(dist_tree).toarray()
return mst + np.transpose(mst)
def _cluster_density_sparseness(MST, labels, cluster):
"""
Computes the cluster density sparseness, the minimum density
within a cluster
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: cluster_density_sparseness (float)
value corresponding to the minimum density within a cluster
"""
indices = np.where(labels == cluster)[0]
cluster_MST = MST[indices][:, indices]
cluster_density_sparseness = np.max(cluster_MST)
return cluster_density_sparseness
def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
"""
Computes the density separation between two clusters, the maximum
density between clusters.
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster_i (int): cluster i of interest
cluster_j (int): cluster j of interest
Returns: density_separation (float):
value corresponding to the maximum density between clusters
"""
indices_i = np.where(labels == cluster_i)[0]
indices_j = np.where(labels == cluster_j)[0]
shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
relevant_paths = shortest_paths[:, indices_j]
density_separation = np.min(relevant_paths)
return density_separation
def _cluster_validity_index(MST, labels, cluster):
    """
    Compute the validity of a single cluster (validity of its assignments).

    The value is (separation - sparseness) / max(separation, sparseness),
    where separation is the smallest density separation to any other label.

    Args:
        MST (np.ndarray): minimum spanning tree of all pair-wise
            mutual reachability distances between points.
        labels (np.array): clustering assignments for data X
        cluster (int): cluster of interest

    Returns: cluster_validity (float)
        value corresponding to the validity of cluster assignments.
        When `cluster` is the only label present, the separation stays at
        infinity and the ratio evaluates to inf / inf (NaN).
    """
    separations = [
        _cluster_density_separation(MST, labels, cluster, other)
        for other in np.unique(labels)
        if other != cluster
    ]
    # Seeding with infinity reproduces the "no other cluster" case.
    min_density_separation = min([np.inf, *separations])
    sparseness = _cluster_density_sparseness(MST, labels, cluster)
    spread = min_density_separation - sparseness
    scale = np.max([min_density_separation, sparseness])
    return spread / scale
def _clustering_validity_index(MST, labels):
    """
    Compute the validity of all clustering assignments: the sum of each
    label's validity weighted by the fraction of samples carrying it.

    Every distinct value in `labels` is treated as a cluster.

    Args:
        MST (np.ndarray): minimum spanning tree of all pair-wise
            mutual reachability distances between points.
        labels (np.array): clustering assignments for data X

    Returns: validity_index (float)
        score in range [-1, 1] indicating validity of clustering assignments
    """
    n_samples = float(len(labels))
    return sum(
        (np.sum(labels == label) / n_samples)
        * _cluster_validity_index(MST, labels, label)
        for label in np.unique(labels)
    )
def _get_label_members(X, labels, cluster):
"""
Helper function to get samples of a specified cluster.
Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: members (np.ndarray)
array of dimensions (n_samples, n_features) of samples of the
specified cluster.
""" """
indices = np.where(labels == cluster)[0] def __init__(self, data):
members = X[indices] # self._param_dist = {'min_samples': [1],
return members # 'min_cluster_size':[5,10],
# 'metric' : ['euclidean','manhattan'],
def HDBSCAN_function(data): # }
# param_dist = {'min_samples': [1,5,10,30], self._param_dist = {'min_samples': [1,5,10,],
# 'min_cluster_size':[5,10,20,30,50,75,100], 'min_cluster_size':[5,25,50,],
# # 'cluster_selection_method' : ['eom','leaf'], 'metric' : ['euclidean','manhattan'],
# # 'metric' : ['euclidean','manhattan'] }
# }
# param_dist = {'min_samples': [1,5,10,50], self._clusterable_embedding = data
# 'min_cluster_size':[5,10,30,50,100,300,500],
# } # RandomizedSearchCV not working...
param_dist = {'min_samples': [1,5, 10,], # def scoring(model, clusterable_embedding):
'min_cluster_size':[5,10,30,50,100], # label = HDBSCAN().fit_predict(clusterable_embedding)
'metric' : ['euclidean','manhattan'], # hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean)
} # return hdbscan_score
# tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist, scoring=scoring)
clusterable_embedding = UMAP( # tunning.fit(clusterable_embedding)
n_neighbors=20, # return tunning
min_dist=0.0,
n_components=5, # compute optimization. Test each combination of parameters and store DBCV score into _score.
random_state=42, self._score = pd.DataFrame()
).fit_transform(data) for i in self._param_dist.get('min_samples'):
for j in self._param_dist.get('min_cluster_size'):
# RandomizedSearchCV not working... self._ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(self._clusterable_embedding)
# def scoring(model, clusterable_embedding): self._ij_hdbscan_score = self.DBCV(self._clusterable_embedding, self._ij_label,)# dist_function=euclidean)
# label = HDBSCAN().fit_predict(clusterable_embedding) self._score.at[i,j] = self._ij_hdbscan_score
# hdbscan_score = DBCV(clusterable_embedding, label, dist_function=euclidean) # get the best DBCV score
# return hdbscan_score self._hdbscan_score = max(self._score.max())
# tunning = RandomizedSearchCV(estimator=HDBSCAN(), param_distributions=param_dist, scoring=scoring) # find the coordinates of the best clustering parameters and run HDBSCAN below
# tunning.fit(clusterable_embedding) self._bparams = np.where(self._score == self._hdbscan_score)
# return tunning # run HDBSCAN with best params
min_score = pd.DataFrame() self._labels = HDBSCAN(min_samples=self._param_dist['min_samples'][self._bparams[0][0]], min_cluster_size=self._param_dist['min_cluster_size'][self._bparams[1][0]], metric=self._param_dist['metric'][self._bparams[1][0]]).fit_predict(self._clusterable_embedding)
for i in param_dist.get('min_samples'):
for j in param_dist.get('min_cluster_size'): def DBCV(self, X, labels, dist_function=euclidean):
ij_label = HDBSCAN(min_samples=i, min_cluster_size=j).fit_predict(clusterable_embedding) """
ij_hdbscan_score = DBCV(clusterable_embedding, ij_label, dist_function=euclidean) Implimentation of Density-Based Clustering Validation "DBCV"
min_score.at[i,j] = ij_hdbscan_score
hdbscan_score = max(min_score.max()) Citation:
# get the coordinates of the best clustering parameters and run HDBSCAN below Moulavi, Davoud, et al. "Density-based clustering validation."
bparams = np.where(min_score == hdbscan_score) Proceedings of the 2014 SIAM International Conference on Data Mining.
# run HDBSCAN with best params Society for Industrial and Applied Mathematics, 2014.
labels = HDBSCAN(min_samples=param_dist['min_samples'][bparams[0][0]], min_cluster_size=param_dist['min_cluster_size'][bparams[1][0]], metric=param_dist['metric'][bparams[1][0]]).fit_predict(clusterable_embedding)
return labels, hdbscan_score Density Based clustering validation
Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: cluster_validity (float)
score in range[-1, 1] indicating validity of clustering assignments
"""
graph = self._mutual_reach_dist_graph(X, labels, dist_function)
mst = self._mutual_reach_dist_MST(graph)
cluster_validity = self._clustering_validity_index(mst, labels)
return cluster_validity
def _core_dist(self, point, neighbors, dist_function):
"""
Computes the core distance of a point.
Core distance is the inverse density of an object.
Args:
point (np.array): array of dimensions (n_features,)
point to compute core distance of
neighbors (np.ndarray): array of dimensions (n_neighbors, n_features):
array of all other points in object class
dist_dunction (func): function to determine distance between objects
func args must be [np.array, np.array] where each array is a point
Returns: core_dist (float)
inverse density of point
"""
n_features = np.shape(point)[0]
n_neighbors = np.shape(neighbors)[0]
distance_vector = cdist(point.reshape(1, -1), neighbors)
distance_vector = distance_vector[distance_vector != 0]
numerator = ((1/distance_vector)**n_features).sum()
core_dist = (numerator / (n_neighbors - 1)) ** (-1/n_features)
return core_dist
def _mutual_reachability_dist(self, point_i, point_j, neighbors_i,
                              neighbors_j, dist_function):
    """
    Compute the mutual reachability distance between two points: the
    largest of their two core distances and their direct distance.

    Args:
        point_i (np.array): array of dimensions (n_features,),
            point i to compare to point j
        point_j (np.array): array of dimensions (n_features,),
            point j to compare to point i
        neighbors_i (np.ndarray): array of dims (n_neighbors, n_features),
            array of all other points in object class of point i
        neighbors_j (np.ndarray): array of dims (n_neighbors, n_features),
            array of all other points in object class of point j
        dist_function (func): function to determine distance between objects;
            func args must be [np.array, np.array] where each array is a point

    Returns: mutual_reachability (float)
        mutual reachability between points i and j
    """
    candidates = [
        self._core_dist(point_i, neighbors_i, dist_function),
        self._core_dist(point_j, neighbors_j, dist_function),
        dist_function(point_i, point_j),
    ]
    return np.max(candidates)
def _mutual_reach_dist_graph(self, X, labels, dist_function):
    """
    Compute the mutual reach distance complete graph.
    Graph of all pair-wise mutual reachability distances between points.

    Entry (i, j) is max(core_dist(i), core_dist(j), dist(i, j)), where each
    core distance is taken inside the class the sample is labelled with.
    Core distances depend on the sample only, so they are computed once per
    sample rather than once per (row, col) pair.

    Args:
        X (np.ndarray): ndarray with dimensions [n_samples, n_features],
            data to check validity of clustering
        labels (np.array): clustering assignments for data X
        dist_function (func): function to determine distance between objects;
            func args must be [np.array, np.array] where each array is a point

    Returns: graph (np.ndarray)
        array of dimensions (n_samples, n_samples),
        graph of all pair-wise mutual reachability distances between points.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    n_samples = np.shape(X)[0]

    # One core distance per sample, evaluated against its own class members.
    core_dists = np.empty(n_samples, dtype=float)
    for label in np.unique(labels):
        members = self._get_label_members(X, labels, label)
        for idx in np.nonzero(labels == label)[0]:
            core_dists[idx] = self._core_dist(X[idx], members, dist_function)

    # The string form keeps scipy's compiled path for the default metric;
    # any other callable is evaluated pair by pair by cdist.
    metric = "euclidean" if dist_function is euclidean else dist_function
    pairwise = cdist(X, X, metric=metric)

    # Element-wise max of the direct distance and both core distances.
    return np.maximum(pairwise, np.maximum.outer(core_dists, core_dists))
def _mutual_reach_dist_MST(self, dist_tree):
"""
Computes minimum spanning tree of the mutual reach distance complete graph
Args:
dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
Graph of all pair-wise mutual reachability distances
between points.
Returns: minimum_spanning_tree (np.ndarray)
array of dimensions (n_samples, n_samples)
minimum spanning tree of all pair-wise mutual reachability
distances between points.
"""
mst = minimum_spanning_tree(dist_tree).toarray()
return mst + np.transpose(mst)
def _cluster_density_sparseness(self, MST, labels, cluster):
"""
Computes the cluster density sparseness, the minimum density
within a cluster
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: cluster_density_sparseness (float)
value corresponding to the minimum density within a cluster
"""
indices = np.where(labels == cluster)[0]
cluster_MST = MST[indices][:, indices]
cluster_density_sparseness = np.max(cluster_MST)
return cluster_density_sparseness
def _cluster_density_separation(self, MST, labels, cluster_i, cluster_j):
"""
Computes the density separation between two clusters, the maximum
density between clusters.
Args:
MST (np.ndarray): minimum spanning tree of all pair-wise
mutual reachability distances between points.
labels (np.array): clustering assignments for data X
cluster_i (int): cluster i of interest
cluster_j (int): cluster j of interest
Returns: density_separation (float):
value corresponding to the maximum density between clusters
"""
indices_i = np.where(labels == cluster_i)[0]
indices_j = np.where(labels == cluster_j)[0]
shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
relevant_paths = shortest_paths[:, indices_j]
density_separation = np.min(relevant_paths)
return density_separation
def _cluster_validity_index(self, MST, labels, cluster):
    """
    Compute the validity of a single cluster (validity of its assignments).

    The value is (separation - sparseness) / max(separation, sparseness),
    where separation is the smallest density separation to any other label.

    Args:
        MST (np.ndarray): minimum spanning tree of all pair-wise
            mutual reachability distances between points.
        labels (np.array): clustering assignments for data X
        cluster (int): cluster of interest

    Returns: cluster_validity (float)
        value corresponding to the validity of cluster assignments.
        When `cluster` is the only label present, the separation stays at
        infinity and the ratio evaluates to inf / inf (NaN).
    """
    separations = [
        self._cluster_density_separation(MST, labels, cluster, other)
        for other in np.unique(labels)
        if other != cluster
    ]
    # Seeding with infinity reproduces the "no other cluster" case.
    min_density_separation = min([np.inf, *separations])
    sparseness = self._cluster_density_sparseness(MST, labels, cluster)
    spread = min_density_separation - sparseness
    scale = np.max([min_density_separation, sparseness])
    return spread / scale
def _clustering_validity_index(self, MST, labels):
    """
    Compute the validity of all clustering assignments: the sum of each
    label's validity weighted by the fraction of samples carrying it.

    Every distinct value in `labels` is treated as a cluster.

    Args:
        MST (np.ndarray): minimum spanning tree of all pair-wise
            mutual reachability distances between points.
        labels (np.array): clustering assignments for data X

    Returns: validity_index (float)
        score in range [-1, 1] indicating validity of clustering assignments
    """
    n_samples = float(len(labels))
    return sum(
        (np.sum(labels == label) / n_samples)
        * self._cluster_validity_index(MST, labels, label)
        for label in np.unique(labels)
    )
def _get_label_members(self, X, labels, cluster):
"""
Helper function to get samples of a specified cluster.
Args:
X (np.ndarray): ndarray with dimensions [n_samples, n_features]
data to check validity of clustering
labels (np.array): clustering assignments for data X
cluster (int): cluster of interest
Returns: members (np.ndarray)
array of dimensions (n_samples, n_features) of samples of the
specified cluster.
"""
indices = np.where(labels == cluster)[0]
members = X[indices]
return members
@property
def HDBSCAN_scores_(self):
    """
    Expose the clustering result as a ``(labels, score)`` pair.

    Returns: tuple
        ``self._labels`` (cluster number of each sample) followed by
        ``self._hdbscan_score`` (the best DBCV score found).
    """
    return (self._labels, self._hdbscan_score)
...@@ -4,17 +4,28 @@ from Class_Mod.DATA_HANDLING import * ...@@ -4,17 +4,28 @@ from Class_Mod.DATA_HANDLING import *
class Umap: class Umap:
def __init__(self, x, n_components, n_neighbors, min_dist): """
self.numerical_data, categorical_data, scaled_values = col_cat(x) The UMAP dimension reduction algorithm from scikit learn
self.catdata = list(categorical_data.columns) """
def __init__(self, data_import, numerical_data, cat_data):
self.x = data_import
self.numerical_data = numerical_data
if len(cat_data) > 0:
self.categorical_data = cat_data
self.le = LabelEncoder()
self.categorical_data_encoded = self.le.fit_transform(self.categorical_data)
self.x = scaled_values else:
self.categorical_data = False
self.model = UMAP(n_neighbors=20, n_components=4, min_dist=0.0,) # random_state=42,)
self.model.fit(self.x) self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, random_state=42,)
self.scores = self.model.transform(self.x) self.model.fit(self.numerical_data, y = self.categorical_data_encoded)
self.scores = pd.DataFrame(self.scores, index = self.numerical_data.index) self.scores_raw = self.model.transform(self.numerical_data)
self.scores = pd.DataFrame(self.scores_raw, index = self.x.index)
@property @property
def scores_(self): def scores_(self):
return self.scores return self.scores
\ No newline at end of file @property
def scores_raw_(self):
return self.scores_raw
\ No newline at end of file
...@@ -8,3 +8,5 @@ from .Regression_metrics import metrics ...@@ -8,3 +8,5 @@ from .Regression_metrics import metrics
from .VarSel import TpeIpls from .VarSel import TpeIpls
from .Miscellaneous import resid_plot, reg_plot from .Miscellaneous import resid_plot, reg_plot
from .DxReader import DxRead from .DxReader import DxRead
from .HDBSCAN_Clustering import Hdbscan
from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan
# find_col_index # find_col_index
from Class_Mod.Miscellaneous import prediction, download_results from Class_Mod.Miscellaneous import prediction, download_results
## Data loading, handling, and preprocessing ## Data loading, handling, and preprocessing
import os import os
import sys import sys
...@@ -10,14 +9,18 @@ import numpy as np ...@@ -10,14 +9,18 @@ import numpy as np
import pandas as pd import pandas as pd
from os import listdir from os import listdir
from os.path import isfile, join from os.path import isfile, join
from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import time import time
### Exploratory data analysis-Dimensionality reduction ### Exploratory data analysis-Dimensionality reduction
from umap.umap_ import UMAP from umap.umap_ import UMAP
from sklearn.decomposition import PCA, NMF from sklearn.decomposition import PCA, NMF
# Clustering # Clustering
from sklearn.cluster import KMeans, HDBSCAN from sklearn.cluster import KMeans, HDBSCAN
from scipy.spatial.distance import euclidean, cdist
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import csgraph
# Modelling # Modelling
# import julia # import julia
...@@ -38,6 +41,7 @@ from PIL import Image ...@@ -38,6 +41,7 @@ from PIL import Image
import plotly.express as px import plotly.express as px
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
### Important Metrics ### Important Metrics
from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
...@@ -49,6 +53,7 @@ from tempfile import NamedTemporaryFile ...@@ -49,6 +53,7 @@ from tempfile import NamedTemporaryFile
#Library for connecting to SQL DB #Library for connecting to SQL DB
import pyodbc import pyodbc
#Library for reading the config file, which is in JSON #Library for reading the config file, which is in JSON
import json import json
......
...@@ -36,6 +36,7 @@ with container1: ...@@ -36,6 +36,7 @@ with container1:
else: else:
col = False col = False
data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col) data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
data_import, categorical_data, scaled_values = col_cat(data_import)
st.success("The data have been loaded successfully", icon="") st.success("The data have been loaded successfully", icon="")
## Visualize spectra ## Visualize spectra
...@@ -103,23 +104,27 @@ with container2: ...@@ -103,23 +104,27 @@ with container2:
if type_plot == 'PCA': if type_plot == 'PCA':
model = LinearPCA(data_import, Ncomp=5) model = LinearPCA(data_import, Ncomp=5)
elif type_plot =='UMAP': elif type_plot =='UMAP':
model = Umap(x = data_import, n_components = 5, n_neighbors = 20 , min_dist = 0) model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
if type_plot in ['PCA', 'UMAP']: if type_plot in ['PCA', 'UMAP']:
# add 2 select lists to choose which component to plot if type_plot in ['PCA']:
axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0) # add 2 select lists to choose which component to plot
axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1) axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2) axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
elif type_plot in ['UMAP']:
axis1 = 0
axis2 = 1
axis3 = 2
if type_cluster == 'Kmeans': if type_cluster == 'Kmeans':
scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1) scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
cl = Sk_Kmeans(scsc, max_clusters = 30) cl = Sk_Kmeans(scsc, max_clusters = 30)
elif type_cluster == 'HDBSCAN': elif type_cluster == 'HDBSCAN':
from Class_Mod.HDBSCAN_Clustering import HDBSCAN_function optimized_hdbscan = Hdbscan(model.scores_raw_)
labels, hdbscan_score = HDBSCAN_function(data_import) labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
with scores: with scores:
t = model.scores_ t = model.scores_
if type_cluster in ['AP', 'Kmeans']: if type_cluster in ['AP', 'Kmeans']:
...@@ -140,7 +145,9 @@ with container2: ...@@ -140,7 +145,9 @@ with container2:
fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels) fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
fig.update_traces(marker=dict(size=4)) fig.update_traces(marker=dict(size=4))
# st.plotly_chart(fig_hdbscan) # st.plotly_chart(fig_hdbscan)
st.write('DBCV score (-1:1) = ' + str(hdbscan_score)) st.write('Optimal number of clusters = ' + str(len(set(labels))))
st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
else: else:
if test == '.dx': if test == '.dx':
...@@ -190,7 +197,6 @@ with container2: ...@@ -190,7 +197,6 @@ with container2:
fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals") fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
st.plotly_chart(fig) st.plotly_chart(fig)
else:
else: st.markdown('Select a dimensionality reduction technique from the dropdown list')
st.markdown('Select a dimensionality reduction technique from the dropdown list')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment