Skip to content
Snippets Groups Projects
samsel.py 4.78 KiB
Newer Older
DIANE's avatar
DIANE committed
from typing import Optional, Union, Tuple
from pandas import DataFrame
from numpy import ndarray
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed



class Samplers:
    """Namespace for sample-selection helpers; instances carry no state."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def ksrdm(X: DataFrame, rset: Union[int, float], method: str = 'rdm') -> DataFrame:
        """
        Split ``X`` into selected / not-selected samples.

        Parameters
        ----------
        X : DataFrame
            Samples in rows; its index labels identify the samples.
        rset : int or float
            Forwarded as ``train_size`` to ``train_test_split``.
        method : str, default 'rdm'
            ``'ks'`` uses ``kennard_stone.train_test_split``; any other value
            uses ``sklearn.model_selection.train_test_split`` (random split).

        Returns
        -------
        DataFrame
            One row per sample of ``X``, indexed by ``'Selected'`` or
            ``'Not-Selected'``, with columns ``'names'`` (the sample's label in
            ``X``) and ``'cluster'`` (always ``'cluster1'``).

        Notes
        -----
        Requires the `kennard_stone` library for ``method='ks'`` and
        scikit-learn otherwise.
        """
        if method == 'ks':
            from kennard_stone import train_test_split
        else:
            # Every value other than 'ks' has always fallen back to the random
            # split; keep that behaviour, but state it explicitly.
            from sklearn.model_selection import train_test_split
        train, test = train_test_split(X, train_size=rset)

        calset = DataFrame(index=X.index, columns=['calset'])
        calset['names'] = X.index
        # Assign with a single .loc call: the chained form
        # `calset['calset'].loc[...] = ...` writes to a temporary object when
        # pandas Copy-on-Write is active and leaves `calset` unchanged.
        calset.loc[train.index, 'calset'] = 'Selected'
        calset.loc[test.index, 'calset'] = 'Not-Selected'
        calset.index = calset['calset'].to_numpy()
        calset['cluster'] = 'cluster1'
        return calset.drop(columns=['calset'])

    @staticmethod
    def medoid(X: DataFrame, t: DataFrame) -> list:
        """
        Pick one representative sample per group listed in ``t``.

        For every distinct label in ``t.index``, the sample names stored under
        that label are looked up in ``X`` and the one with the smallest summed
        Euclidean distance to the rows of ``X`` is kept.

        Parameters
        ----------
        X : DataFrame
            Rows are samples, columns are variables.
        t : DataFrame
            Indexed by group label; its cell values are index labels of ``X``.
            A label may own one row with several names or several rows.

        Returns
        -------
        list
            One index label of ``X`` per distinct label of ``t.index``, in
            order of first appearance.
        """
        sname = []
        # Index.unique() keeps first-appearance order; iterating a set would
        # make the output order depend on hashing.
        for i in t.index.unique():
            # Flatten so both a single row (Series) and several rows
            # (DataFrame) yield a 1-D array of sample names.
            members = np.ravel(t.loc[i, :].to_numpy())
            group = X.loc[members, :]
            # NOTE(review): distances are measured against every row of X, as
            # in the original code; a strictly within-group medoid would pass
            # `group.values` as the second argument - confirm the intent.
            distances = cdist(group.values, X.values, metric='euclidean')
            sum_distances = distances.sum(axis=1)
            medoid_index = int(np.argmin(sum_distances))
            # The position refers to `group`, so the name must come from
            # `group.index` (not from `X.index`).
            sname.append(group.index[medoid_index])
        return sname









def _choose_n_clusters(X, k_max=10):
    """Return the KMeans cluster count in ``range(2, min(k_max, n_samples))``
    with the highest silhouette score (2 when that range is empty)."""
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    best_k, best_score = 2, -1
    for k in range(2, min(k_max, X.shape[0])):
        # Fit with the candidate k (not with the current best), otherwise
        # every iteration evaluates the same 2-cluster model.
        model = KMeans(n_clusters=k, random_state=42, init='random', n_init=1, max_iter=100)
        labels = model.fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k


def _medoid_label(t):
    """Return the index label of the row of ``t`` whose summed Euclidean
    distance to all rows of ``t`` is smallest."""
    from scipy.spatial.distance import cdist

    distances = cdist(t.values, t.values, metric='euclidean')
    # The argmin position refers to rows of `t`, so map it through `t.index`.
    return t.index[int(np.argmin(distances.sum(axis=1)))]


def selection_method(X, method, **kwargs):
    """
    Select sample names from ``X`` with the requested strategy.

    Parameters
    ----------
    X : DataFrame
        Rows are samples; the index labels are the sample names returned.
    method : str
        - ``'random'``: random split (``random_state=42``), keeps the train part.
        - ``'kennard-stone'``: Kennard-Stone split, keeps the train part.
        - ``'meta-medoids'``: KMeans clustering, then one medoid per cluster.
        - ``'meta-ks'``: KMeans clustering, then a Kennard-Stone split inside
          every cluster with more than 5 samples (smaller clusters are kept
          whole).
    **kwargs
        ``rset`` is the ``train_size`` for ``'random'`` / ``'kennard-stone'``;
        ``rset_meta`` is the per-cluster ``train_size`` for ``'meta-ks'``.

    Returns
    -------
    list
        Index labels of the selected samples.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names.
    KeyError
        If the keyword argument required by ``method`` is missing.
    """
    if method == 'random':
        from sklearn.model_selection import train_test_split
        selected, _ = train_test_split(X, train_size=kwargs['rset'], random_state=42)
        return list(selected.index)

    if method == 'kennard-stone':
        from kennard_stone import train_test_split
        selected, _ = train_test_split(X, train_size=kwargs['rset'])
        return list(selected.index)

    if method not in ('meta-ks', 'meta-medoids'):
        # Previously this fell through to `return sname` with `sname` unbound.
        raise ValueError(
            f"Unknown selection method {method!r}; expected 'random', "
            "'kennard-stone', 'meta-ks' or 'meta-medoids'."
        )

    from sklearn.cluster import KMeans
    if method == 'meta-ks':
        from kennard_stone import train_test_split

    best_k = _choose_n_clusters(X)
    model = KMeans(n_clusters=best_k, random_state=42, init='random', n_init=1, max_iter=100)
    model.fit(X)
    yp = model.predict(X)

    sname = []
    for i in range(best_k):
        t = X.loc[yp == i]
        if t.empty:
            # Nothing to pick from; argmin on an empty array would raise.
            continue
        if method == 'meta-medoids':
            sname.append(_medoid_label(t))
        else:
            if t.shape[0] > 5:
                selected, _ = train_test_split(t, train_size=kwargs['rset_meta'])
            else:
                selected = t
            sname += list(selected.index)
    return sname