From d836044e88139a838d429390c0df261778348bb8 Mon Sep 17 00:00:00 2001 From: DIANE <abderrahim.diane@cefe.cnrs.fr> Date: Fri, 6 Dec 2024 15:45:42 +0100 Subject: [PATCH] documentation --- docs/Clustering.md | 8 ++---- docs/index.md | 20 ++++++++++---- docs/prediction.md | 3 ++ src/utils/clustering.py | 61 +++++++++++++++++++++++++++++++++-------- 4 files changed, 69 insertions(+), 23 deletions(-) create mode 100644 docs/prediction.md diff --git a/docs/Clustering.md b/docs/Clustering.md index 374fc12..fc2e05e 100644 --- a/docs/Clustering.md +++ b/docs/Clustering.md @@ -1,7 +1,3 @@ # Clustering Methods - -## K-Means clustering -::: src.utils.clustering.Sk_Kmeans - -## HDBSCAN clustering -::: src.utils.clustering.Hdbscan +::: src.utils.clustering.clustering + \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 10504b2..ab4c721 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,13 +2,23 @@ This workflow aims at ... -## Samples Selection +## I - Samples Selection -## Dimension Reduction +### Dimension Reduction -## Clustering +### Clustering analysis [K-Means](Clustering.md#k-means-clustering) [HDBSCAN](Clustering.md#hdbscan-clustering) -## Models Creation -[lwPlsR from Jchemo (Julia)](model_creation.md) \ No newline at end of file +### Representative subset selection + +## II - Models Creation + +### Data split into train/test subsets +### Predictive model creation +[lwPlsR from Jchemo (Julia)](model_creation.md) +### Predictive model evaluation + +## III - Prediction making + +## IV - Reporting \ No newline at end of file diff --git a/docs/prediction.md b/docs/prediction.md new file mode 100644 index 0000000..f301f29 --- /dev/null +++ b/docs/prediction.md @@ -0,0 +1,3 @@ +# Prediction making + +## This section uses the result files obtained from the model creation step and makes predictions on a new set of data \ No newline at end of file diff --git a/src/utils/clustering.py b/src/utils/clustering.py index 18b34e1..9b36813 100644 ---
a/src/utils/clustering.py +++ b/src/utils/clustering.py @@ -12,28 +12,65 @@ import pandas as pd def clustering(X, method='kmeans', **kwargs): """ - Perform clustering on the given dataset using the specified method. + Perform clustering on the given dataset using the specified method. + Available clustering methods are: + - **'kmeans'**: K-Means clustering algorithm, which partitions the data into `k` clusters. + - **'ap'**: Affinity Propagation clustering, a graph-based algorithm that identifies clusters without predefining the number of clusters. + - **'hdbscan'**: HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise), a method that identifies clusters of varying densities and can detect noise points. Parameters ---------- X : DataFrame or array-like, shape (n_samples, n_features) - The input data for clustering. + The input data for clustering. It can be a pandas DataFrame or a numpy array where + each row represents a sample, and each column represents a feature. - method : str, optional, default='kmeans' - The clustering method to use. Options are: - - 'kmeans': K-Means clustering. - - 'ap': Affinity Propagation clustering. - - 'hdbscan': HDBSCAN clustering. + method : str, default='kmeans' + The clustering method to use. The following clustering methods are available: + - 'kmeans': K-Means clustering. A method for partitioning data into `k` clusters. + - 'ap': Affinity Propagation clustering. A graph-based clustering method. + - 'hdbscan': HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise). A method that works well for data with varying densities. - kwargs : dict - Additional arguments specific to the clustering method. + kwargs : dict, optional + Additional keyword arguments that can be passed to the specific clustering method. 
+ The following parameters are accepted for each method: + - For 'kmeans': + - 'max_k': int, the maximum number of clusters to consider when finding the optimal number of clusters using the Silhouette Score (default is 10). + - For 'hdbscan': + - 'min_samples': int, the number of samples in a neighborhood for a point to be considered a core point (default is 8). + - 'min_cluster_size': int, the minimum size of clusters (default is 10). + - 'metric': str, the distance metric to use (default is 'euclidean'). Returns ------- - DataFrame - A DataFrame containing the cluster assignments for each sample. The index corresponds - to the sample names (from X), and a column "names" lists the cluster labels. + tuple + A tuple containing: + - A pandas DataFrame with the cluster assignments for each sample. The index corresponds to the sample names (from X), + and a column "names" lists the cluster labels. + - An integer representing the number of clusters found. + + Raises + ------ + ValueError + If an unknown clustering method is specified, a ValueError will be raised. + + Notes + ----- + - The KMeans method uses Silhouette Score to determine the optimal number of clusters. + - Affinity Propagation uses a preference value (default -50) to determine cluster centers. + - HDBSCAN can assign some points to a 'Non clustered' category if they are considered noise or outliers (denoted by -1). + + Examples + -------- + # Example using K-Means clustering: + result, num_clusters = clustering(X, method='kmeans', max_k=12) + + # Example using Affinity Propagation clustering: + result, num_clusters = clustering(X, method='ap') + + # Example using HDBSCAN clustering: + result, num_clusters = clustering(X, method='hdbscan', min_samples=10, min_cluster_size=15) """ + if method == 'KMEANS': max_k = kwargs.get('max_k', 10) -- GitLab