diff --git a/src/common.py b/src/common.py index 5720c506b695ef4928302d66c2bdf4205e56ae5e..92d27f25c22f702ce991850b649e690bcf29e552 100644 --- a/src/common.py +++ b/src/common.py @@ -18,9 +18,12 @@ image_path = Path('./images/img-sky.jpg') import os from shutil import rmtree -from pandas import DataFrame +from pandas import DataFrame, concat from PIL import Image import plotly.express as px +from tempfile import NamedTemporaryFile +import numpy as np +from datetime import datetime from utils.data_parsing import JcampParser, CsvParser diff --git a/src/pages/1-samples_selection.py b/src/pages/1-samples_selection.py index 4ec60a16da8f49c52356cafefde96e7826c9c6f2..3314d99a0c20720b5e245e18b8cc15ff50d0dc04 100644 --- a/src/pages/1-samples_selection.py +++ b/src/pages/1-samples_selection.py @@ -312,6 +312,7 @@ elif labels: # Strategy 0 case 'center': # list samples at clusters centers - Use sklearn.metrics.pairwise_distances_argmin if you want more than 1 sample per cluster + from sklearn.metrics import pairwise_distances_argmin_min closest, _ = pairwise_distances_argmin_min(clu_centers, new_tcr) selected_samples_idx = np.array(new_tcr.index)[list(closest)] selected_samples_idx = selected_samples_idx.tolist() @@ -393,6 +394,7 @@ if not t.empty: fig_export = {} # export 2D scores plot if len(axis)== 3: + from itertools import combinations comb = [i for i in combinations(np.arange(len(axis)), 2)] subcap = ['a','b','c'] for i in range(len(comb)): @@ -456,7 +458,8 @@ if not spectra.empty: xp = np.dot(t,p.T) # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T) - tresh4 = sc.stats.chi2.ppf(0.05, df = len(axis)) + from scipy.stats import chi2 + tresh4 = chi2.ppf(0.05, df = len(axis)) # color with metadata if colfilter: @@ -514,10 +517,11 @@ if not spectra.empty: hotelling = t.var(axis = 1) # Q residuals: Q residuals represent the magnitude of the variation remaining in each sample after projection through the model residuals = np.diag(np.subtract(xc.to_numpy(), xp)@ np.subtract(xc.to_numpy(), xp).T) - - fcri = sc.stats.f.isf(0.05, 3, n_samples) + + from scipy.stats import f, chi2 + fcri = f.isf(0.05, 3, n_samples) tresh0 = (3 * (n_samples ** 2 - 1) * fcri) / (n_samples * (n_samples - 3)) - tresh1 = sc.stats.chi2.ppf(0.05, df = 3) + tresh1 = chi2.ppf(0.05, df = 3) hotelling_plot = px.scatter(t, x = hotelling, y = residuals, color=labels if list(labels) else None, color_discrete_sequence= custom_color_palette) hotelling_plot.add_scatter(x = hotelling[selected_samples_idx] , y = residuals[selected_samples_idx], diff --git a/src/utils/clustering.py b/src/utils/clustering.py index 4f9a9c5a7aaf00d2eb989419187274c2a671d5d0..d367d042d3ce61fb78be16703dab7b24f41cfaaa 100644 --- a/src/utils/clustering.py +++ b/src/utils/clustering.py @@ -1,3 +1,8 @@ +import numpy as np +from pandas import DataFrame +from sklearn.cluster import KMeans + + #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kmeans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ class Sk_Kmeans: diff --git a/src/utils/data_handling.py b/src/utils/data_handling.py index 28d2029dfaae47af75e70c6b4bbe41c4d6bd6128..1c222742d66d71f3b931fe1279f4d6960eed0004 100644 --- a/src/utils/data_handling.py +++ b/src/utils/data_handling.py @@ -54,6 +54,8 @@ def list_files(mypath, import_type): def standardize(X, center = True, scale = False): + from pandas import DataFrame + from sklearn.preprocessing import StandardScaler sk = StandardScaler(with_mean=center, with_std = scale) sc = DataFrame(sk.fit_transform(X), index = X.index, columns = X.columns) return sc diff --git a/src/utils/data_parsing.py b/src/utils/data_parsing.py index d78ef2b0cffe787a64fd34a858e954382dc88c5b..9e536071725fd6f4647c34e80be24bfd6f0c4df4 100644 --- a/src/utils/data_parsing.py +++ b/src/utils/data_parsing.py @@ -13,6 +13,7 @@ class JcampParser: self.__wl = self.__list_of_blocks[0]["x"] # Wavelengths/frequencies/range def parse(self): + import numpy as np # Start retreiving the data specs = np.zeros((self.__nb, len(self.__list_of_blocks[0]["y"])), dtype=float) # preallocate a np matrix for sotoring spectra self.idx = np.arange(self.__nb) # This list is designed to store samples name @@ -43,6 +44,7 @@ class JcampParser: } self.__met[f'{i}'] = block_met + from pandas import DataFrame self.metadata_ = DataFrame(self.__met).T self.spectra = DataFrame(np.fliplr(specs), columns= self.__wl[::-1], index = self.metadata_['name']) # Storing spectra in a dataframe @@ -55,6 +57,7 @@ class JcampParser: n_elements = a.count('(') ## Get the name of analyzed chamical elements + import re elements_name = [] for match in re.findall(self.pattern, a): elements_name.append(match[0]) @@ -71,6 +74,8 @@ class JcampParser: ### Method for retrieving the concentration of a single sample def conc(self,sample): + import re + import numpy as np prep = '\n'.join(line for line in sample.split('\n') if "NCU" not in line and "<<undef>>" not in line) c = [] for match in re.findall(self.pattern, prep): diff --git a/src/utils/dim_reduction.py b/src/utils/dim_reduction.py index b6a13bd8eead619710ecbeb05440ae1587ec85c4..0c112221aa1057e72407544241e96670ec7f8fcd 100644 --- a/src/utils/dim_reduction.py +++ b/src/utils/dim_reduction.py @@ -1,10 +1,12 @@ from utils.data_handling import * - +from pandas import DataFrame +import numpy as np # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pca ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# class LinearPCA: def __init__(self, X, Ncomp=10): + from sklearn.decomposition import PCA ## input matrix self.__x = np.array(X) ## set the number of components to compute and fit the model @@ -67,11 +69,13 @@ class Umap: self.categorical_data_encoded = cat_data elif len(cat_data) > 0: self.categorical_data = cat_data + from sklearn.preprocessing import LabelEncoder self.le = LabelEncoder() self.categorical_data_encoded = self.le.fit_transform(self.categorical_data) else: self.categorical_data_encoded = None + from umap import UMAP self.model = UMAP(n_neighbors=20, n_components=3, min_dist=0.0, )#random_state=42,) self.model.fit(self.numerical_data, y = self.categorical_data_encoded) self.scores_raw = self.model.transform(self.numerical_data) @@ -97,6 +101,7 @@ class Nmf: self.__ncp = Ncomp # Fit PCA model + from sklearn.decomposition import NMF Mo = NMF(n_components=self.__ncp, init=None, solver='cd', beta_loss='frobenius', tol=0.0001, max_iter=300, random_state=None, alpha_W=0.0, alpha_H='same', l1_ratio=0.0, verbose=0, shuffle=False) diff --git a/src/utils/visualize.py b/src/utils/visualize.py index b560be60548a90cdf9a4a728ac1de63632b6a5de..5ac80c34c96ab0bd1df4308a4f9ee36581d50e37 100644 --- a/src/utils/visualize.py +++ b/src/utils/visualize.py @@ -26,6 +26,9 @@ def pred_hist(pred): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ predictions histogram ~~~~~~~~~~~~~~~~~~~~~~~~~~ @st.cache_data def plot_spectra(specdf, xunits, yunits): + import matplotlib.pyplot as plt + import numpy as np + fig, ax = plt.subplots(figsize = (30,7)) if isinstance(specdf.columns[0], str): specdf.T.plot(legend=False, ax = ax, color = '#2474b4')