Commit 4c31b136 authored by DIANE

PCA update

parent 084f7d2d
from Packages import *
from Class_Mod.DATA_HANDLING import *
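# NOTE (assumption): the star imports above are expected to provide pd (pandas),
# np (numpy), PCA (sklearn.decomposition.PCA) and the col_cat() helper used below.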
class LinearPCA:
    def __init__(self, X, Ncomp=10):
        ## define the color palette to use for plotting
        self.__palette = 'YlGn'
        numerical_data, categorical_data, scaled_values = col_cat(X)
        self.catdata = list(categorical_data.columns)
        # NOTE: helper defined here but not called anywhere else in __init__
        def pca_maker(data_import):
            numerical_data, categorical_data, scaled_values = col_cat(data_import)
            # Compute a 6-component PCA on the scaled values
            pca = PCA(n_components=6)
            pca_fit = pca.fit(scaled_values)
            pca_data = pca_fit.transform(scaled_values)
            pca_data = pd.DataFrame(pca_data, index=numerical_data.index)
            # Name each PCA column with its component number and explained variance (%)
            new_column_names = ["PCA_" + str(i) + ' - ' + str(round(100 * pca_fit.explained_variance_ratio_[i-1], 1)) + '%' for i in range(1, len(pca_data.columns) + 1)]
            # Format the output
            column_mapper = dict(zip(list(pca_data.columns), new_column_names))
            pca_data = pca_data.rename(columns=column_mapper)
            output = pd.concat([data_import, pca_data], axis=1)
            return output, list(categorical_data.columns), new_column_names
        ## input matrix
        self.__x = pd.DataFrame(scaled_values)
        self._varnames = X.columns
        self._rownames = X.index

        ## set the number of components to compute and fit the model
        self.__ncp = Ncomp
        M = PCA(n_components=self.__ncp)
        M.fit(self.__x)
        ####################################################################################################################
        ######## results ########
        # Explained variability
        self.__pcnames = [f'PC{i+1}({round(100 * M.explained_variance_ratio_[i], 2)}%)' for i in range(self.__ncp)]
        self._Qexp_ratio = pd.DataFrame(100 * M.explained_variance_ratio_, columns=["Qexp"], index=[f'PC{i+1}' for i in range(self.__ncp)])
        # Loadings and scores
        # scores
        s = M.transform(self.__x)
        self.__t = s
        self._t = s
        # min-max scale each score column to the range [-1, 1]
        self._r = pd.DataFrame(2 * (s - s.min(axis=0)) / (s.max(axis=0) - s.min(axis=0)) - 1, index=self._rownames)
        self._r.columns = self.__pcnames
        # Normalize each loading vector (row of components_) to unit length, then transpose to (n_variables, n_components)
        self._p = (M.components_ / np.linalg.norm(M.components_, axis=1, keepdims=True)).T
        # Matrix reconstruction / prediction:
        # residual sum of squares of each observation, per individual component
        self.res = pd.DataFrame()
        for i in range(self.__ncp):
            # rank-one reconstruction from component i alone
            self._xp = np.dot(self.__t[:, i].reshape((-1, 1)), self._p[:, i].reshape((1, -1)))
            # residuals
            self._e = self.__x - self._xp
            self.res[self.__pcnames[i]] = np.diag(self._e @ self._e.T)
        #self._res = pd.DataFrame(self._e, columns=self._varnames, index=self._rownames)
        # full reconstruction from all computed components
        self._xp = self.__t @ self._p.T
        ## Leverage: diagonal of the hat matrix of each score vector; values range between 0 and 1
        self.lev = {}
        for i in range(self._t.shape[1]):
            ti = self._t[:, i].reshape((-1, 1))
            Hat = ti @ np.linalg.pinv(np.transpose(ti) @ ti) @ np.transpose(ti)
            self.lev[self._r.columns[i]] = np.diag(Hat)
        self.leverage = pd.DataFrame(self.lev)

        ## Hotelling T2
        #self.eigvals = M.singular_values_**2
        #self.Lambda = np.diag(self.eigvals)
        #self.T2 = self.__t @ np.linalg.inv(self.Lambda) @ self.__t.T
    @property
    def scores_(self):
        return pd.DataFrame(self._r)

    @property
    def loadings_(self):
        return pd.DataFrame(self._p, columns=self.__pcnames, index=self._varnames)

    @property
    def leverage_(self):
        return self.leverage

    @property
    def residuals(self):
        return self.res
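
# --- Usage sketch (illustrative only, not part of the original commit) ---
# Assumes col_cat(X) accepts a DataFrame of numeric columns; the random data
# below is hypothetical and only meant to show the class's public API.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    # hypothetical 50 x 8 numeric matrix
    X = pd.DataFrame(np.random.default_rng(0).normal(size=(50, 8)),
                     columns=[f"var{j+1}" for j in range(8)])

    model = LinearPCA(X, Ncomp=5)
    print(model.scores_.head())      # min-max scaled scores, one column per PC
    print(model.loadings_.head())    # loadings, indexed by variable name
    print(model.leverage_.head())    # per-component leverage of each observation
    print(model.residuals.head())    # per-component residual sum of squares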