{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from pathlib import Path\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "path = 'C:/Users/diane/Desktop/xcalxcal.csv'\n", "df = pd.read_csv(path,decimal = '.', sep = \";\", index_col=0)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " name\n", "cluster#2 s126\n", "cluster#2 s256\n", "cluster#1 s27\n", "cluster#2 s166\n", "cluster#2 s27\n", "... ...\n", "cluster#2 s356\n", "cluster#2 s357\n", "cluster#1 s358\n", "cluster#2 s359\n", "cluster#1 s360\n", "\n", "[361 rows x 1 columns]\n" ] }, { "ename": "ValueError", "evalue": "not enough values to unpack (expected 2, got 1)", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[32], line 87\u001b[0m\n\u001b[0;32m 84\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[0;32m 86\u001b[0m m\u001b[38;5;241m=\u001b[39m SkKmeans()\n\u001b[1;32m---> 87\u001b[0m a,b \u001b[38;5;241m=\u001b[39m m\u001b[38;5;241m.\u001b[39mkmeans1layer(x \u001b[38;5;241m=\u001b[39m df, ratio \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m, max_k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m)\n", "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)" ] } ], "source": [ " def kmeans(x, max_k):\n", " k = Cluster.find_optimal_k(X=x)\n", " model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)\n", " model.fit(x)\n", " clu = [f'cluster#{i}' for i in model.predict(x)+1]\n", " res = tuple(zip(clu, x.index))\n", " centers = model.cluster_centers_\n", " for i in set(clu):\n", " # search the closest points of the cluster members to center of the cluster\n", " medoids[i], _ = pairwise_distances_argmin_min(tcr.iloc[clustered,:], clu_centers)\n", "\n", " return res, medoids" ] }, { "cell_type": "code", "execution_count": 288, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'cluster#1': 's315', 'cluster#2': 's303'}" ] }, "execution_count": 288, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", " \n", "\n", "M = SkKmeans()\n", "res, medoids = M.kmeans(df, max_k=40)\n", "medoids" ] }, { "cell_type": "code", "execution_count": 309, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>names</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s126</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s256</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#1</th>\n", " <td>s27</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s166</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s27</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s356</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s357</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#1</th>\n", " <td>s358</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#2</th>\n", " <td>s359</td>\n", " </tr>\n", " <tr>\n", " <th>cluster#1</th>\n", " <td>s360</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>361 rows × 1 columns</p>\n", "</div>" ], "text/plain": [ " names\n", "cluster#2 s126\n", "cluster#2 s256\n", "cluster#1 s27\n", "cluster#2 s166\n", "cluster#2 s27\n", "... ...\n", "cluster#2 s356\n", "cluster#2 s357\n", "cluster#1 s358\n", "cluster#2 s359\n", "cluster#1 s360\n", "\n", "[361 rows x 1 columns]" ] }, "execution_count": 309, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 312, "metadata": {}, "outputs": [], "source": [ "X = df" ] }, { "cell_type": "code", "execution_count": 430, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['s116', 's212']" ] }, "execution_count": 430, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def selection_method(X, method, **kwargs):\n", " #['random', 'kennard-stone', 'medoids', 'meta-clusters']\n", " if method =='random':\n", " from sklearn.model_selection import train_test_split\n", " elif method == 'kennard-stone':\n", " from kennard_stone import train_test_split\n", " if method in ['random','kennard-stone']:\n", " selected, _ = train_test_split(X, train_size= kwargs['rset'])\n", " sname = selected.index\n", "\n", " if method in ['meta-ks','meta-medoids']:\n", " best_k = 2\n", " best_score = -1\n", " for k in range(2, min(10,X.shape[0])):\n", " from sklearn.cluster import KMeans\n", " from sklearn.metrics import silhouette_score\n", " model = KMeans(n_clusters=best_k, random_state=42, init='random', n_init=1, max_iter=100)\n", " labels = model.fit_predict(X)\n", " score = silhouette_score(X, labels)\n", " if score > best_score:\n", " best_score = score\n", " best_k = k \n", " model = KMeans(n_clusters=best_k, random_state=42, init='random', n_init=1, max_iter=100)\n", " model.fit(X)\n", " yp = model.predict(X)\n", "\n", " sname = []\n", " for i in range(best_k):\n", " t = X.loc[yp==i]\n", " if method == \"meta-medoids\":\n", " from scipy.spatial.distance import cdist\n", " distances = cdist(t.values, t.values, metric='euclidean') \n", " sum_distances = np.sum(distances, axis=1)\n", " medoid_index = np.argmin(sum_distances)\n", " sname.append(X.index[medoid_index])\n", " \n", " elif method == 'meta-ks':\n", " from kennard_stone import train_test_split\n", " selected, _ = train_test_split(t, train_size= kwargs['rset_meta'])\n", " sname.append(selected.index)\n", " return sname\n", "l = ['random', 'kennard-stone', 'meta-medoids', 'meta-ks']\n", "selection_method(X=df, method= l[2], rset = 0.01, rset_meta = 0.2)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n", "1\n" ] }, { "data": { "text/plain": [ "{}" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mask = df.index.duplicated(keep=False) # Keep all duplicates (True for replicated)\n", "\n", "# For the duplicated sample_ids, apply suffix (_1, _2, etc.)\n", "df.index = df.index.where(~mask, \n", " df.groupby(df.index).cumcount().add(1).astype(str).radd(df.index + '#'))\n", "len(set(df.index))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x.T.plot(y=x.index, kind='line', legend=False, )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "267 " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }