Untitled-1.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from pathlib import Path\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = 'C:/Users/diane/Desktop/xcalxcal.csv'\n",
    "# Semicolon-separated file with '.' as the decimal mark; the first column becomes the index.\n",
    "df = pd.read_csv(path, sep=\";\", decimal='.', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           name\n",
      "cluster#2  s126\n",
      "cluster#2  s256\n",
      "cluster#1   s27\n",
      "cluster#2  s166\n",
      "cluster#2   s27\n",
      "...         ...\n",
      "cluster#2  s356\n",
      "cluster#2  s357\n",
      "cluster#1  s358\n",
      "cluster#2  s359\n",
      "cluster#1  s360\n",
      "\n",
      "[361 rows x 1 columns]\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "not enough values to unpack (expected 2, got 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[32], line 87\u001b[0m\n\u001b[0;32m     84\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[0;32m     86\u001b[0m m\u001b[38;5;241m=\u001b[39m SkKmeans()\n\u001b[1;32m---> 87\u001b[0m a,b \u001b[38;5;241m=\u001b[39m m\u001b[38;5;241m.\u001b[39mkmeans1layer(x \u001b[38;5;241m=\u001b[39m df, ratio \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m, max_k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m)\n",
      "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
     ]
    }
   ],
   "source": [
    "    def kmeans(x, max_k):\n",
    "        \"\"\"Cluster the rows of ``x`` with k-means and pick one medoid per cluster.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        x : pandas.DataFrame\n",
    "            One row per sample; ``x.index`` supplies the sample names.\n",
    "        max_k : int\n",
    "            Currently unused: the number of clusters comes from\n",
    "            ``Cluster.find_optimal_k(X=x)``.\n",
    "            NOTE(review): presumably meant to cap that search -- confirm against Cluster.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        res : tuple of (cluster label, sample name) pairs, one per row of ``x``.\n",
    "        medoids : dict mapping each cluster label to the name of the member\n",
    "            closest to that cluster's center.\n",
    "\n",
    "        NOTE(review): defined at method indentation without ``self``; presumably\n",
    "        decorated as a static method in the enclosing class (not visible here) -- confirm.\n",
    "        \"\"\"\n",
    "        k = Cluster.find_optimal_k(X=x)\n",
    "        model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)\n",
    "        model.fit(x)\n",
    "        labels = model.predict(x)\n",
    "        # Cluster labels are shown 1-based: cluster#1, cluster#2, ...\n",
    "        clu = [f'cluster#{i}' for i in labels + 1]\n",
    "        res = tuple(zip(clu, x.index))\n",
    "        centers = model.cluster_centers_\n",
    "\n",
    "        medoids = {}\n",
    "        for j in range(centers.shape[0]):\n",
    "            # Row positions (within x) of the samples assigned to cluster j.\n",
    "            members = np.flatnonzero(labels == j)\n",
    "            if members.size == 0:\n",
    "                continue\n",
    "            # For the single center row, find the position of the closest cluster member.\n",
    "            nearest, _ = pairwise_distances_argmin_min(centers[j:j + 1], x.iloc[members, :])\n",
    "            medoids[f'cluster#{j + 1}'] = x.index[members[nearest[0]]]\n",
    "\n",
    "        return res, medoids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cluster#1': 's315', 'cluster#2': 's303'}"
      ]
     },
     "execution_count": 288,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "    \n",
    "\n",
    "# NOTE(review): SkKmeans is not defined in any cell visible in this notebook -- presumably it\n",
    "# came from a deleted or unsaved cell; confirm before relying on Restart & Run All.\n",
    "M = SkKmeans()\n",
    "# The call result is unpacked into two names: res and medoids.\n",
    "res, medoids = M.kmeans(df, max_k=40)\n",
    "medoids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 309,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>names</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s126</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s256</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#1</th>\n",
       "      <td>s27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s356</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#1</th>\n",
       "      <td>s358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#2</th>\n",
       "      <td>s359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cluster#1</th>\n",
       "      <td>s360</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>361 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          names\n",
       "cluster#2  s126\n",
       "cluster#2  s256\n",
       "cluster#1   s27\n",
       "cluster#2  s166\n",
       "cluster#2   s27\n",
       "...         ...\n",
       "cluster#2  s356\n",
       "cluster#2  s357\n",
       "cluster#1  s358\n",
       "cluster#2  s359\n",
       "cluster#1  s360\n",
       "\n",
       "[361 rows x 1 columns]"
      ]
     },
     "execution_count": 309,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias only: X and df refer to the same object (no copy is made).\n",
    "X = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 430,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['s116', 's212']"
      ]
     },
     "execution_count": 430,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def selection_method(X, method, **kwargs):\n",
    "    \"\"\"Select representative sample names from the rows of ``X``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : pandas.DataFrame\n",
    "        One row per sample; the index labels are the sample names returned.\n",
    "    method : str\n",
    "        'random'        -- random split via sklearn ``train_test_split``.\n",
    "        'kennard-stone' -- Kennard-Stone split via ``kennard_stone.train_test_split``.\n",
    "        'meta-medoids'  -- k-means clustering, then the medoid of each cluster.\n",
    "        'meta-ks'       -- k-means clustering, then a Kennard-Stone split inside each cluster.\n",
    "    **kwargs\n",
    "        rset : ``train_size`` for 'random' / 'kennard-stone' (required by those methods).\n",
    "        rset_meta : per-cluster ``train_size`` for 'meta-ks' (required by that method).\n",
    "        random_state : optional seed for the 'random' split (default None).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Index for 'random' / 'kennard-stone'; a flat list of sample names\n",
    "    for the two meta methods.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``method`` is not one of the four supported names.\n",
    "    KeyError\n",
    "        If the keyword required by the chosen method is missing.\n",
    "    \"\"\"\n",
    "    direct_methods = ('random', 'kennard-stone')\n",
    "    meta_methods = ('meta-ks', 'meta-medoids')\n",
    "    if method not in direct_methods + meta_methods:\n",
    "        raise ValueError(\n",
    "            f'unknown method {method!r}; expected one of {direct_methods + meta_methods}'\n",
    "        )\n",
    "\n",
    "    if method == 'random':\n",
    "        from sklearn.model_selection import train_test_split\n",
    "        selected, _ = train_test_split(\n",
    "            X, train_size=kwargs['rset'], random_state=kwargs.get('random_state')\n",
    "        )\n",
    "        return selected.index\n",
    "\n",
    "    if method == 'kennard-stone':\n",
    "        from kennard_stone import train_test_split\n",
    "        selected, _ = train_test_split(X, train_size=kwargs['rset'])\n",
    "        return selected.index\n",
    "\n",
    "    # Meta methods: cluster first, then pick representatives inside each cluster.\n",
    "    from sklearn.cluster import KMeans\n",
    "    from sklearn.metrics import silhouette_score\n",
    "\n",
    "    def _make_model(n_clusters):\n",
    "        # Same estimator settings for the search and for the final fit.\n",
    "        return KMeans(n_clusters=n_clusters, random_state=42, init='random', n_init=1, max_iter=100)\n",
    "\n",
    "    # Pick the number of clusters with the highest silhouette score.\n",
    "    best_k = 2\n",
    "    best_score = -1\n",
    "    for k in range(2, min(10, X.shape[0])):\n",
    "        # Fit with the candidate k (the original fitted with best_k, so the search never moved).\n",
    "        labels = _make_model(k).fit_predict(X)\n",
    "        score = silhouette_score(X, labels)\n",
    "        if score > best_score:\n",
    "            best_score = score\n",
    "            best_k = k\n",
    "\n",
    "    model = _make_model(best_k)\n",
    "    model.fit(X)\n",
    "    yp = model.predict(X)\n",
    "\n",
    "    if method == 'meta-medoids':\n",
    "        from scipy.spatial.distance import cdist\n",
    "    else:\n",
    "        from kennard_stone import train_test_split\n",
    "\n",
    "    sname = []\n",
    "    for i in range(best_k):\n",
    "        t = X.loc[yp == i]\n",
    "        if t.empty:\n",
    "            continue\n",
    "        if method == 'meta-medoids':\n",
    "            # Medoid = member with the smallest summed distance to all other members.\n",
    "            distances = cdist(t.values, t.values, metric='euclidean')\n",
    "            medoid_pos = int(np.argmin(np.sum(distances, axis=1)))\n",
    "            # The position is relative to the cluster subset, so look it up in t, not in X.\n",
    "            sname.append(t.index[medoid_pos])\n",
    "        else:\n",
    "            selected, _ = train_test_split(t, train_size=kwargs['rset_meta'])\n",
    "            # Extend (not append) so the result stays a flat list of names.\n",
    "            sname.extend(selected.index)\n",
    "    return sname\n",
    "# Supported selection methods; the call below picks index 2 ('meta-medoids').\n",
    "l = ['random', 'kennard-stone', 'meta-medoids', 'meta-ks']\n",
    "# rset is read only by 'random' / 'kennard-stone', and rset_meta only by 'meta-ks'.\n",
    "selection_method(X=df, method= l[2], rset = 0.01, rset_meta = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 355,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Scratch check of min(); evaluates to 2.\n",
    "min(15,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag every row whose index label occurs more than once (all occurrences, not only the repeats).\n",
    "mask = df.index.duplicated(keep=False)\n",
    "\n",
    "# 1-based occurrence counter within each label, rendered as '<label>#<n>'.\n",
    "occurrence = df.groupby(df.index).cumcount().add(1).astype(str)\n",
    "suffixed = occurrence.radd(df.index + '#')\n",
    "\n",
    "# Labels that occur once are kept as-is; replicated ones are swapped for their suffixed form.\n",
    "df.index = df.index.where(~mask, suffixed)\n",
    "len(set(df.index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the transposed frame with legend disabled, using the labels in x.index as the y columns.\n",
    "# NOTE(review): lowercase x is not assigned in any cell visible here (only X and df are) --\n",
    "# confirm which frame is intended.\n",
    "x.T.plot(y=x.index, kind='line', legend=False, )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): bare literal, presumably leftover scratch -- it only echoes 267 as the cell output.\n",
    "267 "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}