from typing import Optional, Union, Tuple

import numpy as np
import pandas as pd
from numpy import ndarray
from pandas import DataFrame
from scipy.spatial.distance import cdist


def _closest_row(candidates, reference) -> int:
    """Return the position of the row of `candidates` whose summed Euclidean
    distance to all rows of `reference` is smallest."""
    distances = cdist(candidates, reference, metric='euclidean')
    return int(np.argmin(distances.sum(axis=1)))


def _best_cluster_count(X, k_max: int = 10) -> int:
    """Return the KMeans cluster count in [2, min(k_max, n_samples) - 1] with
    the highest silhouette score (2 when that range is empty)."""
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    best_k = 2
    best_score = -1
    for k in range(2, min(k_max, X.shape[0])):
        # Fit with the candidate `k`; fitting with `best_k` here would
        # re-evaluate the same 2-cluster model on every iteration.
        model = KMeans(n_clusters=k, random_state=42, init='random',
                       n_init=1, max_iter=100)
        labels = model.fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_k = k
    return best_k


class Samplers:
    """Sample-selection helpers (random, Kennard-Stone, cluster based)."""

    _METHODS = ('random', 'kennard-stone', 'meta-ks', 'meta-medoids')

    def __init__(self) -> None:
        pass

    @staticmethod
    def ksrdm(X, rset, method='rdm') -> DataFrame:
        """
        Split `X` into selected / not-selected samples.

        Parameters
        ----------
        X : DataFrame
            Samples in rows; its index provides the sample names.
        rset : float or int
            Forwarded as `train_size` to `train_test_split`.
        method : str, default 'rdm'
            'ks' uses `kennard_stone.train_test_split`; any other value uses
            `sklearn.model_selection.train_test_split` (random split).

        Returns
        -------
        DataFrame
            One row per sample of `X`, indexed by 'Selected' / 'Not-Selected',
            with columns 'names' (the sample name) and 'cluster' (always
            'cluster1').

        Notes
        -----
        Requires the `kennard_stone` library when ``method == 'ks'`` and
        scikit-learn otherwise.
        """
        if method == 'ks':
            from kennard_stone import train_test_split
        else:
            from sklearn.model_selection import train_test_split

        train, test = train_test_split(X, train_size=rset)

        calset = DataFrame(index=X.index, columns=['calset'])
        calset['names'] = X.index
        # Single .loc[rows, col] assignment: the chained form
        # calset['calset'].loc[...] = ... writes to a temporary under pandas
        # Copy-on-Write and leaves `calset` untouched.
        calset.loc[train.index, 'calset'] = 'Selected'
        calset.loc[test.index, 'calset'] = 'Not-Selected'
        calset.index = calset['calset'].to_numpy()
        calset['cluster'] = ["cluster1"] * X.shape[0]
        return calset.drop(['calset'], axis=1)

    @staticmethod
    def medoid(X, t):
        """
        Pick one representative sample per group listed in `t`.

        Parameters
        ----------
        X : DataFrame
            Samples in rows, numeric variables in columns.
        t : DataFrame
            Indexed by group label; the values of the row(s) carrying a label
            are the `X` index names of that group's members.

        Returns
        -------
        list
            For each distinct label of `t.index`, in order of first
            appearance, the name of the member whose summed Euclidean distance
            to all rows of `X` is smallest.
        """
        # NOTE(review): distances are summed against every row of X, not only
        # against the group's own members — confirm this is the intended
        # notion of "medoid".
        reference = X.values
        sname = []
        # Index.unique() keeps first-appearance order; set() order is
        # hash-dependent for string labels.
        for label in t.index.unique():
            # ravel() accepts both a single row (Series) and several rows
            # sharing the same label (DataFrame).
            members = np.ravel(t.loc[label, :].values)
            subset = X.loc[members, :]
            position = _closest_row(subset.values, reference)
            # `position` indexes the member subset, not X.
            sname.append(subset.index[position])
        return sname

    @staticmethod
    def selection_method(X, method, **kwargs):
        """
        Select sample names from `X` with the requested strategy.

        Parameters
        ----------
        X : DataFrame
            Samples in rows, numeric variables in columns.
        method : str
            One of 'random', 'kennard-stone', 'meta-ks', 'meta-medoids'.
        **kwargs
            rset : `train_size` for 'random' and 'kennard-stone'.
            rset_meta : `train_size` applied inside each cluster for
            'meta-ks' (only to clusters with more than 5 samples).

        Returns
        -------
        list
            Index labels of the selected samples.

        Raises
        ------
        ValueError
            If `method` is not one of the supported names.
        KeyError
            If the keyword argument required by `method` is missing.
        """
        if method not in Samplers._METHODS:
            raise ValueError(
                f"Unknown selection method {method!r}; "
                f"expected one of {Samplers._METHODS}"
            )

        if method == 'random':
            from sklearn.model_selection import train_test_split
            selected, _ = train_test_split(
                X, train_size=kwargs['rset'], random_state=42)
            return list(selected.index)

        if method == 'kennard-stone':
            from kennard_stone import train_test_split
            selected, _ = train_test_split(X, train_size=kwargs['rset'])
            return list(selected.index)

        # 'meta-ks' / 'meta-medoids': cluster first, then pick inside clusters.
        from sklearn.cluster import KMeans

        best_k = _best_cluster_count(X)
        model = KMeans(n_clusters=best_k, random_state=42, init='random',
                       n_init=1, max_iter=100)
        yp = model.fit_predict(X)

        sname = []
        for i in range(best_k):
            t = X.loc[yp == i]
            if t.shape[0] == 0:
                # Nothing to pick from an empty cluster.
                continue
            if method == 'meta-medoids':
                # Position is relative to the cluster subset `t`, so the name
                # must be read from t.index, not X.index.
                sname.append(t.index[_closest_row(t.values, t.values)])
            else:  # 'meta-ks'
                if t.shape[0] > 5:
                    from kennard_stone import train_test_split
                    selected, _ = train_test_split(
                        t, train_size=kwargs['rset_meta'])
                else:
                    # Small clusters are kept whole.
                    selected = t
                sname += list(selected.index)
        return sname


# Module-level aliases so both `Samplers.medoid(...)` and `medoid(...)`
# call styles resolve.
medoid = Samplers.medoid
selection_method = Samplers.selection_method