From d836044e88139a838d429390c0df261778348bb8 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Fri, 6 Dec 2024 15:45:42 +0100
Subject: [PATCH] documentation

---
 docs/Clustering.md      |  8 ++----
 docs/index.md           | 20 ++++++++++----
 docs/prediction.md      |  3 ++
 src/utils/clustering.py | 61 +++++++++++++++++++++++++++++++++--------
 4 files changed, 69 insertions(+), 23 deletions(-)
 create mode 100644 docs/prediction.md

diff --git a/docs/Clustering.md b/docs/Clustering.md
index 374fc12..fc2e05e 100644
--- a/docs/Clustering.md
+++ b/docs/Clustering.md
@@ -1,7 +1,3 @@
 # Clustering Methods
-
-## K-Means clustering
-::: src.utils.clustering.Sk_Kmeans
-  
-## HDBSCAN clustering
-::: src.utils.clustering.Hdbscan
+::: src.utils.clustering.clustering
+  
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index 10504b2..ab4c721 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -2,13 +2,23 @@
 
 This workflow aims at ...
 
-## Samples Selection
+## I - Samples Selection
 
-## Dimension Reduction
+### Dimension Reduction
 
-## Clustering
+### Clustering analysis
 [K-Means](Clustering.md#k-means-clustering)  
 [HDBSCAN](Clustering.md#hdbscan-clustering)
 
-## Models Creation
-[lwPlsR from Jchemo (Julia)](model_creation.md)
\ No newline at end of file
+### Representative subset selection
+
+## II - Models Creation
+
+### Data split into train/test subsets
+### Predictive model creation
+[lwPlsR from Jchemo (Julia)](model_creation.md)
+### Predictive model evaluation
+
+## III - Prediction making
+
+## IV - Reporting
\ No newline at end of file
diff --git a/docs/prediction.md b/docs/prediction.md
new file mode 100644
index 0000000..f301f29
--- /dev/null
+++ b/docs/prediction.md
@@ -0,0 +1,3 @@
+# Prediction making
+
+## This section uses the result files obtained from the model creation step and makes predictions on a new set of data
\ No newline at end of file
diff --git a/src/utils/clustering.py b/src/utils/clustering.py
index 18b34e1..9b36813 100644
--- a/src/utils/clustering.py
+++ b/src/utils/clustering.py
@@ -12,28 +12,65 @@ import pandas as pd
 
 def clustering(X, method='kmeans', **kwargs):
     """
-    Perform clustering on the given dataset using the specified method.
+    Perform clustering on the given dataset using the specified method. 
+    Available clustering methods are:
+    - **'kmeans'**: K-Means clustering algorithm, which partitions the data into `k` clusters.
+    - **'ap'**: Affinity Propagation clustering, a graph-based algorithm that identifies clusters without predefining the number of clusters.
+    - **'hdbscan'**: HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise), a method that identifies clusters of varying densities and can detect noise points.
 
     Parameters
     ----------
     X : DataFrame or array-like, shape (n_samples, n_features)
-        The input data for clustering.
+        The input data for clustering. It can be a pandas DataFrame or a numpy array where 
+        each row represents a sample, and each column represents a feature.
 
-    method : str, optional, default='kmeans'
-        The clustering method to use. Options are:
-        - 'kmeans': K-Means clustering.
-        - 'ap': Affinity Propagation clustering.
-        - 'hdbscan': HDBSCAN clustering.
+    method : str, default='kmeans'
+        The clustering method to use. The following clustering methods are available:
+        - 'kmeans': K-Means clustering. A method for partitioning data into `k` clusters.
+        - 'ap': Affinity Propagation clustering. A graph-based clustering method.
+        - 'hdbscan': HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise). A method that works well for data with varying densities.
 
-    kwargs : dict
-        Additional arguments specific to the clustering method.
+    kwargs : dict, optional
+        Additional keyword arguments that can be passed to the specific clustering method. 
+        The following parameters are accepted for each method:
+        - For 'kmeans': 
+            - 'max_k': int, the maximum number of clusters to consider when finding the optimal number of clusters using the Silhouette Score (default is 10).
+        - For 'hdbscan': 
+            - 'min_samples': int, the number of samples in a neighborhood for a point to be considered a core point (default is 8).
+            - 'min_cluster_size': int, the minimum size of clusters (default is 10).
+            - 'metric': str, the distance metric to use (default is 'euclidean').
 
     Returns
     -------
-    DataFrame
-        A DataFrame containing the cluster assignments for each sample. The index corresponds
-        to the sample names (from X), and a column "names" lists the cluster labels.
+    tuple
+        A tuple containing:
+        - A pandas DataFrame with the cluster assignments for each sample. The index corresponds to the sample names (from X), 
+          and a column "names" lists the cluster labels.
+        - An integer representing the number of clusters found.
+
+    Raises
+    ------
+    ValueError
+        If an unknown clustering method is specified.
+
+    Notes
+    -----
+    - The KMeans method uses Silhouette Score to determine the optimal number of clusters.
+    - Affinity Propagation uses a preference value (default -50) to determine cluster centers.
+    - HDBSCAN can assign some points to a 'Non clustered' category if they are considered noise or outliers (denoted by -1).
+
+    Examples
+    --------
+    >>> # K-Means clustering:
+    >>> result, num_clusters = clustering(X, method='kmeans', max_k=12)
+
+    >>> # Affinity Propagation clustering:
+    >>> result, num_clusters = clustering(X, method='ap')
+
+    >>> # HDBSCAN clustering:
+    >>> result, num_clusters = clustering(X, method='hdbscan', min_samples=10, min_cluster_size=15)
     """
+    
     if method == 'KMEANS':
         max_k = kwargs.get('max_k', 10)
 
-- 
GitLab