Skip to content
Snippets Groups Projects
Commit 5748a0e9 authored by DIANE's avatar DIANE
Browse files

update

parent 0565c2d0
No related branches found
No related tags found
No related merge requests found
{"meta_project": "ds", "meta_sample_species": "ds", "meta_sample_category": "Other", "meta_sample_pretreatment": "Pastile", "meta_machine_ID": "ds", "meta_sample_sub_category": "Animal part", "meta_sample_humidity": "Wet", "meta_scan_place": "Pace"}
\ No newline at end of file
{"meta_project": "abderrahim", "meta_sample_species": "diane", "meta_sample_category": "Soil", "meta_sample_pretreatment": "Powder", "meta_machine_ID": "diane", "meta_sample_sub_category": "Leaf litter", "meta_sample_humidity": "Dry", "meta_scan_place": "Pace"}
\ No newline at end of file
File added
%% Cell type:code id: tags:
``` python
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
```
%% Cell type:code id: tags:
``` python
# Semicolon-separated file with '.' as the decimal mark; the first column
# is used as the row index.
path = 'C:/Users/diane/Desktop/xcalxcal.csv'
df = pd.read_csv(
    path,
    sep=";",
    decimal='.',
    index_col=0,
)
```
%% Cell type:code id: tags:
``` python
def kmeans(x, max_k):
    """Cluster the rows of *x* with k-means and pick one medoid per cluster.

    Parameters
    ----------
    x : pandas.DataFrame
        One row per sample; the index labels are used as sample names.
    max_k : int
        Accepted for interface compatibility.
        NOTE(review): it is not forwarded to ``Cluster.find_optimal_k``
        because that helper's signature is not visible here — confirm
        whether it should be.

    Returns
    -------
    res : tuple of (str, label)
        ``('cluster#<n>', sample_name)`` pairs, one per row of *x*, with
        1-based cluster numbers.
    medoids : dict
        Maps ``'cluster#<n>'`` to the name of the cluster member that lies
        closest to that cluster's centre.
    """
    k = Cluster.find_optimal_k(X=x)
    model = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    model.fit(x)
    labels = model.predict(x)

    # Cluster labels are shifted by one so that names start at cluster#1.
    clu = [f'cluster#{i}' for i in labels + 1]
    res = tuple(zip(clu, x.index))
    centers = model.cluster_centers_

    medoids = {}
    for j, center in enumerate(centers):
        members = x.loc[labels == j]
        if members.empty:
            # Nothing was assigned to this centre, so it has no medoid.
            continue
        # Search the member of the cluster that is closest to its centre.
        nearest, _ = pairwise_distances_argmin_min(center.reshape(1, -1), members.values)
        medoids[f'cluster#{j + 1}'] = members.index[nearest[0]]
    return res, medoids
```
%% Output
name
cluster#2 s126
cluster#2 s256
cluster#1 s27
cluster#2 s166
cluster#2 s27
... ...
cluster#2 s356
cluster#2 s357
cluster#1 s358
cluster#2 s359
cluster#1 s360
[361 rows x 1 columns]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[32], line 87
84 return result
86 m= SkKmeans()
---> 87 a,b = m.kmeans1layer(x = df, ratio = 0.2, max_k=10)
ValueError: not enough values to unpack (expected 2, got 1)
%% Cell type:code id: tags:
``` python
# NOTE(review): SkKmeans is not defined in the visible cells — presumably a
# class exposing a `kmeans` method; confirm against its definition.
M = SkKmeans()
# Unpack the two values returned by the call.
res, medoids = M.kmeans(df, max_k=40)
# Display the medoids as the cell output.
medoids
```
%% Output
{'cluster#1': 's315', 'cluster#2': 's303'}
%% Cell type:code id: tags:
``` python
```
%% Output
names
cluster#2 s126
cluster#2 s256
cluster#1 s27
cluster#2 s166
cluster#2 s27
... ...
cluster#2 s356
cluster#2 s357
cluster#1 s358
cluster#2 s359
cluster#1 s360
[361 rows x 1 columns]
%% Cell type:code id: tags:
``` python
X = df
```
%% Cell type:code id: tags:
``` python
def _best_k_by_silhouette(X, k_stop=10):
    """Return the cluster count in ``[2, min(k_stop, n_rows))`` with the best silhouette score."""
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    best_k = 2
    best_score = -1
    for k in range(2, min(k_stop, X.shape[0])):
        # Fit with the candidate k (not the current best) so that every
        # candidate is actually evaluated.
        model = KMeans(n_clusters=k, random_state=42, init='random', n_init=1, max_iter=100)
        labels = model.fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_k = k
    return best_k


def selection_method(X, method, **kwargs):
    """Select a subset of sample names from *X* with the requested strategy.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per sample; the index labels are the sample names.
    method : str
        One of ``'random'``, ``'kennard-stone'``, ``'meta-ks'`` or
        ``'meta-medoids'``.
    **kwargs
        ``rset``: ``train_size`` passed to ``train_test_split`` for
        ``'random'`` and ``'kennard-stone'``.
        ``rset_meta``: ``train_size`` applied inside each cluster for
        ``'meta-ks'``.

    Returns
    -------
    pandas.Index or list
        For ``'random'`` / ``'kennard-stone'``: the index of the selected
        rows. For the ``'meta-*'`` methods: a flat list of selected names.

    Raises
    ------
    ValueError
        If *method* is not one of the supported strategies.
    KeyError
        If the keyword argument required by *method* is missing.
    """
    supported = ('random', 'kennard-stone', 'meta-ks', 'meta-medoids')
    if method not in supported:
        raise ValueError(f"Unknown selection method {method!r}; expected one of {supported}")

    if method in ('random', 'kennard-stone'):
        if method == 'random':
            from sklearn.model_selection import train_test_split
        else:
            from kennard_stone import train_test_split
        selected, _ = train_test_split(X, train_size=kwargs['rset'])
        return selected.index

    # 'meta-*' methods: cluster first, then select inside each cluster.
    from sklearn.cluster import KMeans

    if method == 'meta-medoids':
        from scipy.spatial.distance import cdist
    else:
        from kennard_stone import train_test_split

    best_k = _best_k_by_silhouette(X)
    model = KMeans(n_clusters=best_k, random_state=42, init='random', n_init=1, max_iter=100)
    model.fit(X)
    yp = model.predict(X)

    sname = []
    for i in range(best_k):
        t = X.loc[yp == i]
        if t.empty:
            # No row was assigned to this cluster; nothing to select.
            continue
        if method == 'meta-medoids':
            # The medoid is the member with the smallest summed distance
            # to all other members of the same cluster.
            distances = cdist(t.values, t.values, metric='euclidean')
            sum_distances = np.sum(distances, axis=1)
            medoid_index = int(np.argmin(sum_distances))
            # The position refers to the cluster subset, so resolve the
            # name through the subset's own index.
            sname.append(t.index[medoid_index])
        else:
            selected, _ = train_test_split(t, train_size=kwargs['rset_meta'])
            # Extend rather than append so the result stays a flat list
            # of names, like the 'meta-medoids' result.
            sname.extend(selected.index)
    return sname
# Names of the selection strategies that can be passed as `method`.
l = ['random', 'kennard-stone', 'meta-medoids', 'meta-ks']
# l[2] is 'meta-medoids'; both ratio keyword arguments are supplied so the
# same call works whichever entry of `l` is picked.
selection_method(X=df, method= l[2], rset = 0.01, rset_meta = 0.2)
```
%% Output
['s116', 's212']
%% Cell type:code id: tags:
``` python
min(15,2)
```
%% Output
2
4
1
{}
%% Cell type:code id: tags:
``` python
# Flag every row whose index label occurs more than once
# (keep=False marks all occurrences, not only the repeats).
mask = df.index.duplicated(keep=False)
# Rename only the flagged labels to "<label>#<n>", where n is the 1-based
# occurrence count within that label's group; unique labels are unchanged.
df.index = df.index.where(~mask,
df.groupby(df.index).cumcount().add(1).astype(str).radd(df.index + '#'))
# Number of distinct index labels after renaming.
len(set(df.index))
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
# Transpose so that each row of x becomes a column, then draw every one of
# those columns as a line, without a legend.
# NOTE(review): x is not defined in the visible cells — confirm which
# DataFrame it refers to.
x.T.plot(y=x.index, kind='line', legend=False, )
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
267
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment