From 4c31b136039fdd319aeda7aa61a671f713fcbd39 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Thu, 4 Apr 2024 09:57:40 +0200
Subject: [PATCH] PCA update

---
 Class_Mod/PCA_.py | 89 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 15 deletions(-)

diff --git a/Class_Mod/PCA_.py b/Class_Mod/PCA_.py
index cb578c9..982bc2d 100644
--- a/Class_Mod/PCA_.py
+++ b/Class_Mod/PCA_.py
@@ -1,22 +1,81 @@
 from Packages import *
 from Class_Mod.DATA_HANDLING import *
+class LinearPCA:
+    def __init__(self, X, Ncomp=10):
+        ## define color palette to use for plotting
+        self.__palette = 'YlGn'
+        numerical_data, categorical_data, scaled_values = col_cat(X)
+        self.catdata = list(categorical_data.columns)
 
 
 
-def pca_maker(data_import):
-    numerical_data, categorical_data, scaled_values = col_cat(data_import)
-    # Compute a 6 components PCA on scaled values
-    pca = PCA(n_components=6)
-    pca_fit = pca.fit(scaled_values)
-    pca_data = pca_fit.transform(scaled_values)
-    pca_data = pd.DataFrame(pca_data, index=numerical_data.index)
-    # Set PCA column names with component number and explained variance %
-    new_column_names = ["PCA_" + str(i) + ' - ' + str(round(pca_fit.explained_variance_ratio_[i-1], 3) *100) + '%' for i in range(1, len(pca_data.columns) + 1)]
-    # Format the output
-    column_mapper = dict(zip(list(pca_data.columns), new_column_names))
-    pca_data = pca_data.rename(columns=column_mapper)
-    output = pd.concat([data_import, pca_data], axis=1)
-    return output, list(categorical_data.columns), new_column_names
+        ## input matrix
+        self.__x = pd.DataFrame(scaled_values)
+        self._varnames = X.columns
+        self._rownames = X.index
+        ## set the number of components to compute and fit the model
+        self.__ncp = Ncomp
+        M = PCA(n_components = self.__ncp)
+        M.fit(self.__x)
 
 
-####################################################################################################################################################################
+        ######## results ########
+        # Explained variability
+
+        self.__pcnames = [f'PC{i+1}({100 * M.explained_variance_ratio_[i].round(2)}%)' for i in range(self.__ncp)]
+
+        self._Qexp_ratio = pd.DataFrame(100 * M.explained_variance_ratio_, columns = ["Qexp"], index= [f'PC{i+1}' for i in range(self.__ncp)])
+        # Loadings and scores
+        # scores
+        s = M.transform(self.__x)
+        self.__t = s
+        self._t = s
+        self._r = pd.DataFrame(2*(s-s.min(axis=0))/(s.max(axis=0)-s.min(axis=0)) -1, index= self._rownames)
+        self._r.columns = self.__pcnames
+
+        # Normalize each loading vector to have unit length
+        self._p = (M.components_ / np.linalg.norm(M.components_, axis=0)).T
+
+        # Matrix reconstruction or prediction making
+        # squared reconstruction residual of each sample, one column per component
+        self.res = pd.DataFrame()
+        for i in range(self.__ncp):
+            self._xp = np.dot(self.__t[:,i].reshape((-1,1)), self._p[:,i].reshape((1,-1)))
+            # residuals
+            self._e = self.__x - self._xp
+            self.res[self.__pcnames[i]] = np.diag(self._e @ self._e.T)
+        #self._res = pd.DataFrame( self._e, columns = self._varnames, index = self._rownames )
+
+        self._xp = self.__t @ self._p.T
+
+        # Leverage: diagonal of the hat matrix, computed component by component
+        self.lev = {}
+        ## leverage values range between 0 and 1
+        for i in range(self._t.shape[1]):
+            ti = self._t[:,i].reshape((-1,1))
+            Hat = ti @ np.linalg.pinv(np.transpose(ti) @ ti) @ np.transpose(ti)
+            self.lev[self._r.columns[i]] = np.diag(Hat)
+        self.leverage = pd.DataFrame(self.lev)
+        ## Hotelling T2
+        #self.eigvals = M.singular_values_**2
+        #self.Lambda = np.diag(self.eigvals)
+
+        #self.T2 = self.__t @ np.linalg.inv(self.Lambda) @ self.__t.T
+
+
+
+    @property
+    def scores_(self):
+        return pd.DataFrame(self._r)
+
+    @property
+    def loadings_(self):
+        return pd.DataFrame(self._p, columns=self.__pcnames, index=self._varnames)
+
+    @property
+    def leverage_(self):
+        return self.leverage
+
+    @property
+    def residuals(self):
+        return self.res
\ No newline at end of file
-- 
GitLab
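
Editor's note (illustration only, not part of the patch): a minimal usage sketch of the new LinearPCA class, assuming the repository's Packages module re-exports pandas, numpy and sklearn's PCA, that col_cat() accepts the DataFrame passed in, and that "spectra.csv" is a hypothetical input file with samples as rows and variables as columns.

    import pandas as pd
    from Class_Mod.PCA_ import LinearPCA

    # hypothetical input table: samples x variables
    X = pd.read_csv("spectra.csv", index_col=0)

    pca = LinearPCA(X, Ncomp=5)

    scores    = pca.scores_     # samples x components, min-max rescaled to [-1, 1]
    loadings  = pca.loadings_   # variables x components
    leverage  = pca.leverage_   # per-component leverage of each sample
    residuals = pca.residuals   # per-sample squared residual when X is rebuilt from a single PC

    print(scores.head())
    print(leverage.head())

The leverage column for component i is the diagonal of the hat matrix H = t (t't)^-1 t' built from that component's score vector t; the Hotelling T2 statistic, still commented out in the patch, would follow as t Lambda^-1 t' with Lambda the diagonal matrix of eigenvalues.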