Skip to content
Snippets Groups Projects
clustering.py 2.82 KiB
Newer Older
DIANE's avatar
DIANE committed
import numpy as np
from pandas import DataFrame
from sklearn.cluster import KMeans
DIANE's avatar
DIANE committed
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
DIANE's avatar
DIANE committed

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~  kmeans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

DIANE's avatar
DIANE committed
from sklearn.cluster import KMeans, AffinityPropagation, HDBSCAN
from sklearn.metrics import silhouette_score
import pandas as pd
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
def _n_samples(X):
    """Return the number of samples (rows) in X."""
    return X.shape[0] if hasattr(X, 'shape') else len(X)


def _silhouette_best_k(X, max_k):
    """Return the k in [2, max_k] whose K-Means partition has the best silhouette score."""
    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1,
    # so never try more clusters than the data can support.
    upper = min(max_k, _n_samples(X) - 1)
    best_k, best_score = 2, -1
    for k in range(2, upper + 1):
        model = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
        score = silhouette_score(X, model.fit_predict(X))
        if score > best_score:
            best_k, best_score = k, score
    return best_k


def _labels_to_frame(X, labels):
    """Build the result frame: index = cluster label text, column 'names' = sample names."""
    if isinstance(X, (pd.DataFrame, pd.Series)):
        names = X.index
    else:
        # Plain array-likes carry no sample names; fall back to positions.
        names = pd.RangeIndex(_n_samples(X))
    # scikit-learn uses the label -1 for samples that belong to no cluster.
    cluster_ids = [f'cluster#{i+1}' if i != -1 else 'Non clustered' for i in labels]
    return pd.DataFrame({'names': names}, index=cluster_ids)


def clustering(X, method='kmeans', **kwargs):
    """
    Perform clustering on the given dataset using the specified method.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_samples, n_features)
        The input data for clustering. When X is a pandas object its index
        provides the sample names; otherwise samples are named 0..n_samples-1.

    method : str, optional, default='kmeans'
        The clustering method to use (matched case-insensitively). Options are:
        - 'kmeans': K-Means clustering; the number of clusters is chosen by
          maximising the silhouette score over k = 2..max_k.
        - 'ap': Affinity Propagation clustering.
        - 'hdbscan': HDBSCAN clustering.

    kwargs : dict
        Additional arguments specific to the clustering method:
        - max_k (kmeans, default 10): largest number of clusters to try. It is
          capped at n_samples - 1, the largest value the silhouette score accepts.
        - min_samples (hdbscan, default 2), min_cluster_size (hdbscan, default 5),
          metric (hdbscan, default 'euclidean'): forwarded to HDBSCAN.

    Returns
    -------
    res : DataFrame
        One row per sample. The index holds the cluster label of the sample
        ('cluster#1', 'cluster#2', ... or 'Non clustered' for label -1) and the
        column "names" holds the sample names.
    n_clusters : int
        Number of distinct clusters found, not counting the -1 (unclustered) label.

    Raises
    ------
    ValueError
        If `method` is not one of the supported options.
    """
    # Accept both the documented lowercase names and the uppercase spellings.
    key = method.lower() if isinstance(method, str) else method

    if key == 'kmeans':
        optimal_k = _silhouette_best_k(X, kwargs.get('max_k', 10))
        model = KMeans(n_clusters=optimal_k, random_state=42, n_init=10, max_iter=300)
        labels = model.fit_predict(X)

    elif key == 'ap':
        model = AffinityPropagation(random_state=42)
        model.fit(X)
        labels = model.predict(X)

    elif key == 'hdbscan':
        # Defaults match the values this function has always run with, so
        # callers that pass no kwargs get unchanged results.
        model = HDBSCAN(
            min_samples=kwargs.get('min_samples', 2),
            min_cluster_size=kwargs.get('min_cluster_size', 5),
            metric=kwargs.get('metric', 'euclidean'),
        )
        labels = model.fit_predict(X)

    else:
        raise ValueError(f"Unknown clustering method: {method}")

    # Count real clusters only; -1 marks unclustered samples and may be absent.
    n_clusters = len(set(labels) - {-1})
    return _labels_to_frame(X, labels), n_clusters