Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from utils.data_handling import *
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pca ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class LinearPCA:
    """Fit a PCA on a data matrix and expose scores, loadings and residuals.

    The model is fitted once, at construction time. For every rank
    ``k = 1 .. Ncomp`` the reconstruction obtained from the first ``k``
    components is stored, together with the per-sample squared
    reconstruction error (Q residual).

    Parameters
    ----------
    X : array-like
        Input matrix; it is copied into a NumPy array with ``np.array``.
        Presumably samples are in rows and variables in columns
        (the layout the ``PCA`` estimator is fitted on) -- confirm with callers.
    Ncomp : int, default 10
        Number of principal components to compute.

    Attributes
    ----------
    eigvals : ndarray
        Squared singular values of the fitted PCA.
        NOTE(review): these are not divided by ``n_samples - 1``; confirm
        whether ``explained_variance_`` was intended instead.
    Lambda : ndarray
        Diagonal matrix built from ``eigvals``.
    T2, leverage : dict
        Initialised empty; this class does not populate them.
    """

    def __init__(self, X, Ncomp=10):
        # Input matrix and requested number of components.
        self.__x = np.array(X)
        self.__ncp = Ncomp

        # Fit the PCA model.
        M = PCA(n_components=self.__ncp)
        M.fit(self.__x)

        # Results
        ratios = M.explained_variance_ratio_
        self.__pcnames = self._pc_labels(ratios)
        self._Qexp_ratio = DataFrame(
            100 * ratios,
            columns=["Qexp"],
            index=[f'PC{i+1}' for i in range(self.__ncp)],
        )
        self._p = M.components_.T
        self._t = M.transform(self.__x)
        self.eigvals = M.singular_values_**2
        self.Lambda = np.diag(self.eigvals)

        # Containers keyed by the 0-based index of the last component used.
        self.T2 = {}
        self._xp = {}
        self._qres = {}
        self.leverage = {}

        # The scores are computed from mean-centred data, so the
        # reconstruction below lives in the centred space as well; residuals
        # must therefore be taken against the centred input.
        centered = self.__x - M.mean_
        for i in range(self.__ncp):
            # Rank-(i+1) reconstruction from the first i+1 components.
            self._xp[i] = np.dot(self._t[:, :i+1], self._p.T[:i+1, :])
            # Without this entry `residuals_` has no columns to label and
            # raises a length-mismatch error.
            self._qres[i] = self._q_residuals(centered, self._xp[i])

    @staticmethod
    def _pc_labels(ratios):
        """Return labels such as ``'PC1(56.0%)'`` for explained-variance ratios."""
        labels = []
        for i, ratio in enumerate(ratios):
            # Rounding again after the multiplication removes float noise
            # such as 56.00000000000001 from the label.
            percent = round(100 * float(np.round(ratio, 2)), 1)
            labels.append(f'PC{i+1}({percent}%)')
        return labels

    @staticmethod
    def _q_residuals(centered, reconstruction):
        """Return the per-row sum of squared differences between two matrices."""
        diff = np.asarray(centered) - np.asarray(reconstruction)
        return np.sum(diff ** 2, axis=1)

    @property
    def scores_(self):
        """Scores as a DataFrame with one labelled column per component."""
        return DataFrame(self._t, columns=self.__pcnames)

    @property
    def loadings_(self):
        """Loadings as a DataFrame with one labelled column per component."""
        return DataFrame(self._p, columns=self.__pcnames)

    @property
    def residuals_(self):
        """Q residuals as a DataFrame: one row per sample, one column per rank."""
        res = DataFrame(self._qres)
        res.columns = self.__pcnames
        return res
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ umap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Umap:
    """
    UMAP dimension reduction, optionally supervised by categorical labels.

    Wraps the ``UMAP`` estimator, which exposes a scikit-learn style
    ``fit`` / ``transform`` API (scikit-learn itself does not ship UMAP;
    presumably this is umap-learn's class -- confirm against the imports).
    The model is fitted at construction time and the embedding of the
    training data is stored.

    Parameters
    ----------
    numerical_data : array-like
        Data to embed.
    cat_data : array-like or None
        Labels used as the ``y`` target of the fit. ``None`` or an empty
        container means an unsupervised fit.
    n_neighbors : int, default 20
    n_components : int, default 3
    min_dist : float, default 0.0
    random_state : int or None, default None
        All four are forwarded unchanged to ``UMAP``. Pass an integer
        ``random_state`` for a reproducible embedding.

    Attributes
    ----------
    le : LabelEncoder or None
        Encoder fitted on ``cat_data``; ``None`` for an unsupervised fit.
    categorical_data_encoded : ndarray or None
        Integer-encoded labels passed to the fit.
    scores_raw : ndarray
        Embedding returned by ``transform``.
    scores : DataFrame
        Same embedding with columns ``axis_1``, ``axis_2``, ...
    """

    def __init__(self, numerical_data, cat_data, n_neighbors=20, n_components=3,
                 min_dist=0.0, random_state=None):
        self.numerical_data = numerical_data
        # Set on every code path so the attributes always exist.
        self.categorical_data = cat_data
        self.le, self.categorical_data_encoded = self._encode_labels(cat_data)

        self.model = UMAP(n_neighbors=n_neighbors, n_components=n_components,
                          min_dist=min_dist, random_state=random_state)
        self.model.fit(self.numerical_data, y=self.categorical_data_encoded)

        self.scores_raw = self.model.transform(self.numerical_data)
        self.scores = DataFrame(self.scores_raw)
        self.scores.columns = [f'axis_{i+1}' for i in range(self.scores_raw.shape[1])]

    @staticmethod
    def _encode_labels(cat_data):
        """Return ``(encoder, encoded_labels)``, or ``(None, None)`` without labels."""
        # `len` rather than truthiness: arrays and Series have no unambiguous
        # boolean value.
        if cat_data is None or len(cat_data) == 0:
            return None, None
        encoder = LabelEncoder()
        return encoder, encoder.fit_transform(cat_data)

    @property
    def scores_(self):
        """Embedding as a DataFrame with ``axis_<n>`` column names."""
        return self.scores

    @property
    def scores_raw_(self):
        """Embedding as returned by the model's ``transform``."""
        return self.scores_raw
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nmf ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Nmf:
    """Non-negative matrix factorisation of a data matrix.

    The model is fitted at construction time. Input containing negative
    values is shifted by its global minimum first, so that the matrix
    handed to ``NMF`` is non-negative.

    Parameters
    ----------
    X : array-like
        Input matrix (nested list, ndarray, DataFrame, ...).
    Ncomp : int, default 3
        Number of components to compute.
    """

    def __init__(self, X, Ncomp=3):
        # Input matrix, made non-negative if necessary.
        self.__x = self._shift_non_negative(X)
        # Number of components to compute.
        self.__ncp = Ncomp

        # Fit the NMF model.
        Mo = NMF(n_components=self.__ncp, init=None, solver='cd', beta_loss='frobenius',
                 tol=0.0001, max_iter=300, random_state=None, alpha_W=0.0, alpha_H='same',
                 l1_ratio=0.0, verbose=0, shuffle=False)
        Mo.fit(self.__x)

        # Results
        self._p = Mo.components_.T
        self._t = Mo.transform(self.__x)

    @staticmethod
    def _shift_non_negative(X):
        """Return ``X`` as an ndarray, shifted so that its minimum is >= 0."""
        # Converting first makes the minimum a single scalar for every input
        # type: lists do not support `X - scalar`, and a DataFrame may reduce
        # per column instead of globally depending on the pandas version.
        x = np.array(X)
        lowest = x.min()
        if lowest < 0:
            x = x - lowest
        return x

    @property
    def scores_(self):
        """Result of the model's ``transform`` on the input, as a DataFrame."""
        return DataFrame(self._t)

    @property
    def loadings_(self):
        """Transposed ``components_`` of the fitted model, as a DataFrame."""
        return DataFrame(self._p)