# samsel.py

from typing import Optional, Union, Tuple
from pandas import DataFrame
from numpy import ndarray
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist


class Samplers:
    """Collection of static helpers for selecting calibration samples."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def ksrdm(X, rset, method='rdm') -> DataFrame:
        """
        Split ``X`` into selected / not-selected samples.

        Parameters
        ----------
        X : DataFrame
            Samples in rows; its index labels identify the samples.
        rset : float or int
            Forwarded as ``train_size`` to ``train_test_split``.
        method : str, default 'rdm'
            ``'ks'`` uses ``kennard_stone.train_test_split`` (Kennard-Stone
            selection); any other value uses
            ``sklearn.model_selection.train_test_split`` (random split).

        Returns
        -------
        DataFrame
            One row per sample of ``X`` (same order), indexed by the label
            ``'Selected'`` or ``'Not-Selected'``, with two columns:
            ``'names'`` (the original index labels of ``X``) and
            ``'cluster'`` (the constant string ``"cluster1"``).

        Notes
        -----
        ``kennard_stone`` is only required when ``method == 'ks'``;
        scikit-learn is required otherwise.
        """
        if method == 'ks':
            from kennard_stone import train_test_split
        else:
            # Every value other than 'ks' falls back to the random split,
            # which is what the original always-true `elif 'rdm'` did.
            from sklearn.model_selection import train_test_split
        train, test = train_test_split(X, train_size=rset)

        calset = DataFrame(index=X.index, columns=['calset'])
        calset['names'] = X.index
        # Single .loc[rows, column] assignment: chained
        # `calset['calset'].loc[...] = ...` writes to a temporary under
        # pandas copy-on-write and would leave the column untouched.
        calset.loc[train.index, 'calset'] = 'Selected'
        calset.loc[test.index, 'calset'] = 'Not-Selected'
        calset.index = calset['calset'].to_numpy()
        calset['cluster'] = ["cluster1"] * X.shape[0]
        return calset.drop(['calset'], axis=1)

    @staticmethod
    def medoid(X, t) -> list:
        """
        Pick one representative sample per cluster.

        Parameters
        ----------
        X : DataFrame
            Samples in rows, variables in columns; indexed by sample name.
        t : DataFrame
            Indexed by cluster label; the values found under a label are the
            index labels (in ``X``) of that cluster's members. Both a single
            row per cluster and repeated labels with one name per row are
            accepted, because the values are flattened.
            NOTE(review): the exact layout callers use is not visible here -
            confirm.

        Returns
        -------
        list
            For each distinct cluster label (in order of first appearance),
            the index label of the member whose summed Euclidean distance to
            all samples of ``X`` is smallest.
        """
        sname = []
        # Index.unique() keeps first-appearance order, so the output order is
        # deterministic (iterating a set of labels is not).
        for label in t.index.unique():
            members = np.asarray(t.loc[label, :].values).ravel()
            subset = X.loc[members, :]
            # Rows: cluster members; columns: every sample of X.
            distances = cdist(subset.values, X.values, metric='euclidean')
            sum_distances = np.sum(distances, axis=1)
            # argmin is a row position inside `subset`, so it must be mapped
            # through subset.index - not X.index - to get the sample name.
            medoid_pos = np.argmin(sum_distances)
            sname.append(subset.index[medoid_pos])
        return sname


def selection_method(X, method, **kwargs):
    """
    Select a subset of samples from ``X`` and return their index labels.

    Parameters
    ----------
    X : DataFrame
        Samples in rows, variables in columns.
    method : str
        One of:
        - ``'random'``: random split via scikit-learn (``random_state=42``).
        - ``'kennard-stone'``: Kennard-Stone split via ``kennard_stone``.
        - ``'meta-medoids'``: KMeans clustering, then one medoid per cluster.
        - ``'meta-ks'``: KMeans clustering, then a Kennard-Stone split inside
          every cluster with more than 5 samples (smaller clusters are kept
          whole).
    **kwargs
        ``rset`` - ``train_size`` for ``'random'`` / ``'kennard-stone'``.
        ``rset_meta`` - ``train_size`` used inside clusters for ``'meta-ks'``.

    Returns
    -------
    list
        Index labels of the selected samples.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    KeyError
        If the keyword argument required by ``method`` is missing.
    """
    supported = ('random', 'kennard-stone', 'meta-ks', 'meta-medoids')
    if method not in supported:
        # Previously an unknown method reached `return sname` with the name
        # unbound and surfaced as an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown selection method {method!r}; expected one of {supported}"
        )

    if method == 'random':
        from sklearn.model_selection import train_test_split
        selected, _ = train_test_split(X, train_size=kwargs['rset'], random_state=42)
        return list(selected.index)

    if method == 'kennard-stone':
        from kennard_stone import train_test_split
        selected, _ = train_test_split(X, train_size=kwargs['rset'])
        return list(selected.index)

    # --- meta methods: cluster first, then select inside each cluster ---
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    if method == 'meta-ks':
        from kennard_stone import train_test_split

    # Choose the number of clusters (2..9, fewer than the sample count) with
    # the highest silhouette score.
    best_k = 2
    best_score = -1
    for k in range(2, min(10, X.shape[0])):
        # Fit the candidate k itself; fitting `best_k` here made every
        # iteration score the same 2-cluster model.
        model = KMeans(n_clusters=k, random_state=42, init='random', n_init=1, max_iter=100)
        labels = model.fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_k = k

    model = KMeans(n_clusters=best_k, random_state=42, init='random', n_init=1, max_iter=100)
    yp = model.fit_predict(X)

    sname = []
    for i in range(best_k):
        t = X.loc[yp == i]
        if t.empty:
            # Nothing to pick from an empty cluster.
            continue
        if method == 'meta-medoids':
            distances = cdist(t.values, t.values, metric='euclidean')
            sum_distances = np.sum(distances, axis=1)
            medoid_index = np.argmin(sum_distances)
            # The position refers to a row of the cluster `t`, so the label
            # must come from t.index rather than X.index.
            sname.append(t.index[medoid_index])
        else:  # 'meta-ks'
            if t.shape[0] > 5:
                selected, _ = train_test_split(t, train_size=kwargs['rset_meta'])
            else:
                # Clusters of 5 or fewer samples are kept entirely.
                selected = t
            sname += list(selected.index)
    return sname