From 4c31b136039fdd319aeda7aa61a671f713fcbd39 Mon Sep 17 00:00:00 2001
From: DIANE <abderrahim.diane@cefe.cnrs.fr>
Date: Thu, 4 Apr 2024 09:57:40 +0200
Subject: [PATCH] PCA update

---
 Class_Mod/PCA_.py | 89 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 15 deletions(-)

diff --git a/Class_Mod/PCA_.py b/Class_Mod/PCA_.py
index cb578c9..982bc2d 100644
--- a/Class_Mod/PCA_.py
+++ b/Class_Mod/PCA_.py
@@ -1,22 +1,81 @@
 from Packages import *
 from Class_Mod.DATA_HANDLING import *
+class LinearPCA:
+    """Linear PCA fitted at construction time; exposes scores, loadings, leverage and residuals."""
 
+    def __init__(self, X, Ncomp=10):
+        """Fit the PCA on the scaled values that ``col_cat(X)`` returns.
+
+        :param X: input table; its ``columns`` and ``index`` are kept as variable and row names.
+        :param Ncomp: requested number of components, capped at the smallest dimension of the data.
+        """
+        ## name of the colour palette to use for plotting
+        self.__palette = 'YlGn'
+        numerical_data, categorical_data, scaled_values = col_cat(X)
+        self.catdata = list(categorical_data.columns)
 
-def pca_maker(data_import):
-    numerical_data, categorical_data, scaled_values = col_cat(data_import)
-    # Compute a 6 components PCA on scaled values
-    pca = PCA(n_components=6)
-    pca_fit = pca.fit(scaled_values)
-    pca_data = pca_fit.transform(scaled_values)
-    pca_data = pd.DataFrame(pca_data, index=numerical_data.index)
-    # Set PCA column names with component number and explained variance %
-    new_column_names = ["PCA_" + str(i) + ' - ' + str(round(pca_fit.explained_variance_ratio_[i-1], 3) *100) + '%' for i in range(1, len(pca_data.columns) + 1)]
-    # Format the output
-    column_mapper = dict(zip(list(pca_data.columns), new_column_names))
-    pca_data = pca_data.rename(columns=column_mapper)
-    output = pd.concat([data_import, pca_data], axis=1)
-    return output, list(categorical_data.columns), new_column_names
 
+        ## input matrix
+        self.__x = pd.DataFrame(scaled_values)
+        # NOTE(review): loadings_ uses these names as its index, which assumes col_cat scales every column of X - confirm
+        self._varnames = X.columns
+        self._rownames = X.index
 
+        ## number of components: sklearn's PCA rejects more than min(n_samples, n_features), so cap the request
+        self.__ncp = min(Ncomp, *self.__x.shape)
+        M = PCA(n_components = self.__ncp)
+        M.fit(self.__x)
 
-####################################################################################################################################################################
+        ######## results ########
+        # Explained variability; the percentage is formatted so float noise (e.g. 56.99999999999999) stays out of the labels
+        self.__pcnames = [f'PC{i+1}({100 * M.explained_variance_ratio_[i].round(2):.1f}%)' for i in range(self.__ncp)]
+        self._Qexp_ratio = pd.DataFrame(100 *  M.explained_variance_ratio_, columns = ["Qexp"], index= [f'PC{i+1}' for i in range(self.__ncp)])
+
+        # Scores, plus a copy rescaled column-wise to [-1, 1]
+        s = M.transform(self.__x)
+        self.__t = s
+        self._t = s
+        self._r = pd.DataFrame(2*(s-s.min(axis=0))/(s.max(axis=0)-s.min(axis=0)) -1, index= self._rownames)
+        self._r.columns = self.__pcnames
+
+        # Loadings (variables x components): components_ stores one loading vector per ROW, hence the norm along axis=1
+        self._p = (M.components_ / np.linalg.norm(M.components_, axis=1, keepdims=True)).T
+
+        # Squared residual of every row after removing each component taken on its own
+        self.res = pd.DataFrame()
+        for i in range(self.__ncp):
+            self._xp = np.dot(self.__t[:,i].reshape((-1,1)), self._p[:,i].reshape((1,-1)))
+            self._e = self.__x - self._xp
+            # row-wise sum of squares equals diag(E @ E.T) without building the n x n product
+            self.res[self.__pcnames[i]] = np.square(np.asarray(self._e)).sum(axis=1)
+
+        # Reconstruction of the input matrix from all retained components
+        self._xp = self.__t @ self._p.T
+
+        ## Leverage: diagonal of the hat matrix t (t't)^-1 t' of each score vector; values range between 0 and 1
+        self.lev = {}
+        for i in range(self._t.shape[1]):
+            ti = self._t[:,i]
+            ss = ti @ ti
+            self.lev[self._r.columns[i]] = ti**2 / ss if ss > 0 else np.zeros_like(ti)
+        self.leverage = pd.DataFrame(self.lev)
+
+    @property
+    def scores_(self):
+        """Scores rescaled column-wise to [-1, 1], indexed by the rows of X."""
+        return pd.DataFrame(self._r)
+
+    @property
+    def loadings_(self):
+        """Loadings as a (variables x components) DataFrame."""
+        return pd.DataFrame(self._p, columns=self.__pcnames, index=self._varnames)
+
+    @property
+    def leverage_(self):
+        """Leverage of every row on each component, one column per component."""
+        return self.leverage
+
+    @property
+    def residuals(self):
+        """Squared residual of every row, one column per component."""
+        return self.res
\ No newline at end of file
-- 
GitLab