Skip to content
Snippets Groups Projects
Untitled-1.ipynb 10 KiB
Newer Older
  • Learn to ignore specific revisions
  • DIANE's avatar
    DIANE committed
    {
     "cells": [
      {
       "cell_type": "code",
       "execution_count": 1,
       "metadata": {},
       "outputs": [],
       "source": [
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "from pathlib import Path\n",
        "import numpy as np"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 2,
       "metadata": {},
       "outputs": [],
       "source": [
        "# Semicolon-separated table; the first column becomes the row index.\n",
        "# NOTE(review): absolute, machine-specific location -- the notebook only runs where this file exists.\n",
        "path = 'C:/Users/diane/Desktop/xcalxcal.csv'\n",
        "df = pd.read_csv(path, sep=';', decimal='.', index_col=0)"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 32,
       "metadata": {},
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "           name\n",
          "cluster#2  s126\n",
          "cluster#2  s256\n",
          "cluster#1   s27\n",
          "cluster#2  s166\n",
          "cluster#2   s27\n",
          "...         ...\n",
          "cluster#2  s356\n",
          "cluster#2  s357\n",
          "cluster#1  s358\n",
          "cluster#2  s359\n",
          "cluster#1  s360\n",
          "\n",
          "[361 rows x 1 columns]\n"
         ]
        },
        {
         "ename": "ValueError",
         "evalue": "not enough values to unpack (expected 2, got 1)",
         "output_type": "error",
         "traceback": [
          "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
          "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
          "Cell \u001b[1;32mIn[32], line 87\u001b[0m\n\u001b[0;32m     84\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[0;32m     86\u001b[0m m\u001b[38;5;241m=\u001b[39m SkKmeans()\n\u001b[1;32m---> 87\u001b[0m a,b \u001b[38;5;241m=\u001b[39m m\u001b[38;5;241m.\u001b[39mkmeans1layer(x \u001b[38;5;241m=\u001b[39m df, ratio \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m, max_k\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m)\n",
          "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
         ]
        }
       ],
       "source": [
        "    def kmeans(x, max_k):\n",
        "        \"\"\"Cluster the rows of ``x`` with k-means and pick one medoid per cluster.\n",
        "\n",
        "        Returns ``(res, medoids)``: ``res`` is a tuple of ``(cluster label, row label)``\n",
        "        pairs in row order; ``medoids`` maps each ``'cluster#<n>'`` label to the label\n",
        "        of the member row lying closest to that cluster's centre.\n",
        "        \"\"\"\n",
        "        # NOTE(review): there is no ``self`` parameter although the method is called on an\n",
        "        # instance elsewhere -- presumably decorated as a staticmethod outside this cell; confirm.\n",
        "        # NOTE(review): max_k is accepted but never forwarded to Cluster.find_optimal_k --\n",
        "        # confirm whether that helper takes an upper bound.\n",
        "        k = Cluster.find_optimal_k(X=x)\n",
        "        model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)\n",
        "        model.fit(x)\n",
        "        labels = model.predict(x)\n",
        "        # Cluster numbering shown to the user is 1-based.\n",
        "        clu = [f'cluster#{i}' for i in labels + 1]\n",
        "        res = tuple(zip(clu, x.index))\n",
        "        centers = model.cluster_centers_\n",
        "        medoids = {}\n",
        "        for pos, center in enumerate(centers):\n",
        "            members = x.loc[labels == pos]\n",
        "            if members.empty:\n",
        "                continue\n",
        "            # search the closest point of the cluster members to the center of the cluster;\n",
        "            # the returned position refers to rows of members, so the label is read from members.index\n",
        "            nearest, _ = pairwise_distances_argmin_min(center.reshape(1, -1), members)\n",
        "            medoids[f'cluster#{pos + 1}'] = members.index[nearest[0]]\n",
        "\n",
        "        return res, medoids"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 288,
       "metadata": {},
       "outputs": [
        {
         "data": {
          "text/plain": [
           "{'cluster#1': 's315', 'cluster#2': 's303'}"
          ]
         },
         "execution_count": 288,
         "metadata": {},
         "output_type": "execute_result"
        }
       ],
       "source": [
        "\n",
        "    \n",
        "\n",
        "# NOTE(review): SkKmeans is not defined in any cell of this notebook as saved -- confirm where it comes from.\n",
        "M = SkKmeans()\n",
        "res, medoids = M.kmeans(df, max_k=40)\n",
        "medoids"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 309,
       "metadata": {},
       "outputs": [
        {
         "data": {
          "text/html": [
           "<div>\n",
           "<style scoped>\n",
           "    .dataframe tbody tr th:only-of-type {\n",
           "        vertical-align: middle;\n",
           "    }\n",
           "\n",
           "    .dataframe tbody tr th {\n",
           "        vertical-align: top;\n",
           "    }\n",
           "\n",
           "    .dataframe thead th {\n",
           "        text-align: right;\n",
           "    }\n",
           "</style>\n",
           "<table border=\"1\" class=\"dataframe\">\n",
           "  <thead>\n",
           "    <tr style=\"text-align: right;\">\n",
           "      <th></th>\n",
           "      <th>names</th>\n",
           "    </tr>\n",
           "  </thead>\n",
           "  <tbody>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s126</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s256</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#1</th>\n",
           "      <td>s27</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s166</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s27</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>...</th>\n",
           "      <td>...</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s356</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s357</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#1</th>\n",
           "      <td>s358</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#2</th>\n",
           "      <td>s359</td>\n",
           "    </tr>\n",
           "    <tr>\n",
           "      <th>cluster#1</th>\n",
           "      <td>s360</td>\n",
           "    </tr>\n",
           "  </tbody>\n",
           "</table>\n",
           "<p>361 rows × 1 columns</p>\n",
           "</div>"
          ],
          "text/plain": [
           "          names\n",
           "cluster#2  s126\n",
           "cluster#2  s256\n",
           "cluster#1   s27\n",
           "cluster#2  s166\n",
           "cluster#2   s27\n",
           "...         ...\n",
           "cluster#2  s356\n",
           "cluster#2  s357\n",
           "cluster#1  s358\n",
           "cluster#2  s359\n",
           "cluster#1  s360\n",
           "\n",
           "[361 rows x 1 columns]"
          ]
         },
         "execution_count": 309,
         "metadata": {},
         "output_type": "execute_result"
        }
       ],
       "source": []
      },
      {
       "cell_type": "code",
       "execution_count": 312,
       "metadata": {},
       "outputs": [],
       "source": [
        "# X is a second name for the same DataFrame object as df (no copy is made).\n",
        "X = df"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 430,
       "metadata": {},
       "outputs": [
        {
         "data": {
          "text/plain": [
           "['s116', 's212']"
          ]
         },
         "execution_count": 430,
         "metadata": {},
         "output_type": "execute_result"
        }
       ],
       "source": [
        "def selection_method(X, method, **kwargs):\n",
        "    \"\"\"Pick representative row labels from ``X`` with the requested strategy.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    X : pandas.DataFrame\n",
        "        Candidate rows; the labels of the selected rows are returned.\n",
        "    method : str\n",
        "        'random' or 'kennard-stone' split the whole table once;\n",
        "        'meta-medoids' or 'meta-ks' first cluster the rows with k-means.\n",
        "    **kwargs\n",
        "        rset : ``train_size`` used by 'random' and 'kennard-stone'.\n",
        "        rset_meta : ``train_size`` applied inside each cluster by 'meta-ks'.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    'random' / 'kennard-stone': ``selected.index`` of the retained split.\n",
        "    'meta-medoids': list holding one row label per cluster.\n",
        "    'meta-ks': list holding one index object per cluster.\n",
        "\n",
        "    Raises\n",
        "    ------\n",
        "    ValueError\n",
        "        If ``method`` is not one of the four supported names.\n",
        "    KeyError\n",
        "        If the keyword required by the chosen strategy is missing.\n",
        "    \"\"\"\n",
        "    supported = ('random', 'kennard-stone', 'meta-medoids', 'meta-ks')\n",
        "    if method not in supported:\n",
        "        raise ValueError(f'unknown method {method!r}; expected one of {supported}')\n",
        "\n",
        "    if method in ('random', 'kennard-stone'):\n",
        "        # Both packages expose train_test_split with the same call shape, so only\n",
        "        # the import differs between the two strategies.\n",
        "        if method == 'random':\n",
        "            from sklearn.model_selection import train_test_split\n",
        "        else:\n",
        "            from kennard_stone import train_test_split\n",
        "        selected, _ = train_test_split(X, train_size=kwargs['rset'])\n",
        "        return selected.index\n",
        "\n",
        "    from sklearn.cluster import KMeans\n",
        "    from sklearn.metrics import silhouette_score\n",
        "\n",
        "    def _make_model(n_clusters):\n",
        "        # Single place for the k-means settings shared by the search and the final fit.\n",
        "        return KMeans(n_clusters=n_clusters, random_state=42, init='random', n_init=1, max_iter=100)\n",
        "\n",
        "    best_k = 2\n",
        "    best_score = -1\n",
        "    for k in range(2, min(10, X.shape[0])):\n",
        "        # Fit with the candidate k itself; fitting with best_k here would score\n",
        "        # k=2 on every pass and the search could never move away from 2.\n",
        "        labels = _make_model(k).fit_predict(X)\n",
        "        score = silhouette_score(X, labels)\n",
        "        if score > best_score:\n",
        "            best_score = score\n",
        "            best_k = k\n",
        "\n",
        "    yp = _make_model(best_k).fit(X).predict(X)\n",
        "\n",
        "    if method == 'meta-ks':\n",
        "        from kennard_stone import train_test_split\n",
        "    else:\n",
        "        from scipy.spatial.distance import cdist\n",
        "\n",
        "    sname = []\n",
        "    for i in range(best_k):\n",
        "        t = X.loc[yp == i]\n",
        "        if t.empty:\n",
        "            # Nothing to choose from in an empty cluster.\n",
        "            continue\n",
        "        if method == 'meta-medoids':\n",
        "            distances = cdist(t.values, t.values, metric='euclidean')\n",
        "            # The medoid is the member with the smallest summed distance to the others.\n",
        "            medoid_index = np.argmin(distances.sum(axis=1))\n",
        "            # medoid_index is a position inside the cluster subset t, so the label\n",
        "            # must be read from t.index rather than X.index.\n",
        "            sname.append(t.index[medoid_index])\n",
        "        else:\n",
        "            selected, _ = train_test_split(t, train_size=kwargs['rset_meta'])\n",
        "            # One index object per cluster is appended (the result is not flattened).\n",
        "            sname.append(selected.index)\n",
        "    return sname\n",
        "# Strategy names accepted by selection_method.\n",
        "l = ['random', 'kennard-stone', 'meta-medoids', 'meta-ks']\n",
        "# l[2] is 'meta-medoids'; rset and rset_meta are both passed although a strategy reads at most one of them.\n",
        "selection_method(X=df, method= l[2], rset = 0.01, rset_meta = 0.2)"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": 355,
       "metadata": {},
       "outputs": [
        {
         "data": {
          "text/plain": [
           "2"
          ]
         },
         "execution_count": 355,
         "metadata": {},
         "output_type": "execute_result"
        }
       ],
       "source": [
        "# Scratch check of min(); evaluates to 2.\n",
        "min(15,2)"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
        "# Flag every row whose label occurs more than once (keep=False marks all occurrences).\n",
        "mask = df.index.duplicated(keep=False)  # Keep all duplicates (True for replicated)\n",
        "\n",
        "# For the duplicated labels, append '#' plus a 1-based running count ('#1', '#2', ...);\n",
        "# labels that occur once are left unchanged. df.index is rebound in place.\n",
        "df.index = df.index.where(~mask, \n",
        "                           df.groupby(df.index).cumcount().add(1).astype(str).radd(df.index + '#'))\n",
        "# Number of distinct labels after renaming.\n",
        "len(set(df.index))"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": []
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
        "# Draw one line per row of X (rows become columns after the transpose).\n",
        "# NOTE(review): the lowercase name used here before is never bound at top level in this notebook;\n",
        "# X is the only frame defined -- confirm it is the intended one.\n",
        "X.T.plot(y=X.index, kind='line', legend=False);"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": []
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
        "# Bare literal; the cell only echoes 267.\n",
        "267 "
       ]
      }
     ],
     "metadata": {
      "kernelspec": {
       "display_name": "Python 3",
       "language": "python",
       "name": "python3"
      },
      "language_info": {
       "codemirror_mode": {
        "name": "ipython",
        "version": 3
       },
       "file_extension": ".py",
       "mimetype": "text/x-python",
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
       "version": "3.12.7"
      }
     },
     "nbformat": 4,
     "nbformat_minor": 2
    }