Skip to content
Snippets Groups Projects
dim_reduction.py 14.6 KiB
Newer Older
DIANE's avatar
DIANE committed
from utils.data_handling import *
DIANE's avatar
DIANE committed
from pandas import DataFrame
import numpy as np
DIANE's avatar
DIANE committed
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
from pandas import DataFrame
from sklearn.preprocessing import LabelEncoder
from umap import UMAP
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pca ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
DIANE's avatar
DIANE committed


class LinearPCA:
    """
    A class for performing Principal Component Analysis (PCA) on a given data matrix X.

    The PCA model (scikit-learn ``PCA``) is fitted once in ``__init__``. The class then
    exposes the projections of the data onto the principal components (scores), the
    contribution of each feature to the principal components (loadings), and the
    residuals (reconstruction errors) for a chosen subset of components.

    Attributes:
    - model (sklearn.decomposition.PCA): The fitted PCA model.
    - eig_val (tuple): The eigenvalues and the diagonal matrix (Lambda) of eigenvalues.
    - qexp_ratio (pandas.DataFrame): The explained variance ratio (in %) for each principal component.
    - scores_ (pandas.DataFrame): The projections of the data onto the principal components.
    - loadings_ (pandas.DataFrame): The contribution of each feature to the principal components.

    Methods:
    - __init__(self, X, Ncomp=10): Fits the PCA model to the data matrix X.
    - residuals_(self, components): Returns the reconstruction errors obtained with the given components.
    """

    def __init__(self, X, Ncomp=10):
        """
        Initialize the LinearPCA class with the data matrix X and the number of principal components Ncomp.

        Parameters:
        X (pandas.DataFrame): The input data matrix where rows represent samples and columns represent features.
        Ncomp (int): The number of principal components to compute (default is 10).
        """
        # Keep the input matrix: scores and residuals are computed from it on demand
        self.__x = X

        # Requested number of principal components (passed as-is to scikit-learn)
        self.__ncp = Ncomp

        # Initialize and fit the PCA model
        from sklearn.decomposition import PCA
        self.model = PCA(n_components=self.__ncp)
        self.model.fit(X)

    def _pc_labels(self):
        """
        Build one label per fitted component, e.g. ``'PC1(57.0%)'``.

        The percentage is rounded to a whole percent *after* scaling by 100, so
        that floating-point artefacts such as ``56.99999999999999`` never leak
        into the labels. The number of labels follows the fitted model rather
        than the requested ``Ncomp``.

        Returns:
        list: The component labels, in component order.
        """
        ratios = self.model.explained_variance_ratio_
        return [f'PC{i + 1}({np.rint(100 * ratio):.1f}%)' for i, ratio in enumerate(ratios)]

    @property
    def eig_val(self):
        """
        Returns the eigenvalues and the diagonal matrix (Lambda) of eigenvalues.

        Eigenvalues are computed as the squared singular values of the PCA model
        divided by the number of rows of X.
        NOTE(review): this divides by n, not n - 1 — confirm this is the intended estimator.

        Returns:
        tuple: A tuple containing eigenvalues (eigvals) and the Lambda matrix (diagonal matrix of eigenvalues).
        """
        eigvals = self.model.singular_values_ ** 2 / self.__x.shape[0]
        labels = self._pc_labels()
        Lambda = pd.DataFrame(np.diag(eigvals), index=labels, columns=labels)
        return eigvals, Lambda

    @property
    def qexp_ratio(self):
        """
        Returns the explained variance ratio for each principal component.

        Returns:
        pandas.DataFrame: One row per component ('PC1', 'PC2', ...) and a single
        column 'Qexp' holding the explained variance ratio multiplied by 100.
        """
        ratios = self.model.explained_variance_ratio_
        return pd.DataFrame(100 * ratios,
                            columns=["Qexp"],
                            index=[f'PC{i + 1}' for i in range(len(ratios))])

    @property
    def scores_(self):
        """
        Returns the scores matrix, which is the projection of the original data onto the principal components.

        Returns:
        pandas.DataFrame: The transformed data, indexed like X, with one labelled column per component.
        """
        return pd.DataFrame(self.model.transform(self.__x),
                            index=self.__x.index,
                            columns=self._pc_labels())

    @property
    def loadings_(self):
        """
        Returns the loadings matrix, which contains the contribution of each feature to the principal components.

        Returns:
        pandas.DataFrame: The model components, one labelled row per component and one column per feature of X.
        """
        return pd.DataFrame(self.model.components_,
                            columns=self.__x.columns,
                            index=self._pc_labels())

    def residuals_(self, components):
        """
        Returns the residuals (reconstruction errors) between the original data and its reconstruction
        using a subset of principal components.

        Parameters:
        components (list): Principal component names used to reconstruct the data. Any string
            containing 'PC<number>' is accepted (e.g., 'PC1' or 'PC1(57.0%)'). Duplicates are ignored.

        Returns:
        pandas.DataFrame: The residuals matrix, showing the difference between the original data and its reconstruction.

        Raises:
        ValueError: If a name contains no 'PC<number>' pattern or refers to 'PC0'.
        """
        import re

        selected = set()
        for name in components:
            match = re.search(r'PC(\d+)', name)
            if match is None:
                raise ValueError(f"Invalid principal component name: {name!r}")
            position = int(match.group(1)) - 1
            if position < 0:
                # 'PC0' would otherwise silently select the last component via index -1
                raise ValueError(f"Principal components are numbered from 1: {name!r}")
            selected.add(position)
        axis = sorted(selected)

        # Project once, keep only the requested components, and map back to feature space
        scores = self.model.transform(self.__x)[:, axis]
        reconstruction = np.dot(scores, self.model.components_[axis, :])

        # PCA centres the data before projecting, so the mean must be restored
        # for the reconstruction to be comparable with the original matrix
        reconstruction = reconstruction + self.model.mean_

        return self.__x - reconstruction
DIANE's avatar
DIANE committed

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ umap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Umap:
    """
    The UMAP (Uniform Manifold Approximation and Projection) algorithm for dimensionality reduction.

    This class fits a UMAP model on numerical data, with an option to pass categorical labels
    (encoded with `LabelEncoder`) as the target `y` of the fit.

    Attributes:
    -----------
    numerical_data : pandas DataFrame
        The numerical features (data) on which the dimensionality reduction is performed.

    categorical_data : list or None
        The categorical values given at construction (None when not provided).

    categorical_data_encoded : array-like or None
        The encoded version of `categorical_data`, produced by `LabelEncoder`. None when no
        (or empty) categorical data was provided.

    le : LabelEncoder or None
        The fitted encoder, or None when no categorical data was encoded.

    model : UMAP
        The fitted UMAP model.

    Methods:
    --------
    __init__(numerical_data, cat_data=None, *, n_neighbors=20, n_components=3, min_dist=0.0)
        Initializes and fits the UMAP model using the provided numerical and optional categorical data.

    scores_ : property
        Returns the embedding of the fitted data in the lower-dimensional space.
    """

    def __init__(self, numerical_data, cat_data=None, *, n_neighbors=20, n_components=3, min_dist=0.0):
        """
        Initializes and fits the UMAP model using the provided numerical data and optional categorical data.

        Parameters:
        -----------
        numerical_data : pandas DataFrame
            The numerical data (features) to be used for dimensionality reduction.

        cat_data : list or None, optional
            A list of categorical values associated with the data. If provided and non-empty, it is
            encoded and passed as `y` when fitting. Default is None.

        n_neighbors : int, optional, default=20
            Forwarded to `UMAP(n_neighbors=...)`.

        n_components : int, optional, default=3
            Forwarded to `UMAP(n_components=...)`; the dimension of the embedding.

        min_dist : float, optional, default=0.0
            Forwarded to `UMAP(min_dist=...)`.

        Raises:
        -------
        TypeError
            If `numerical_data` is not a pandas DataFrame.
        """
        # Ensure that the numerical data is a pandas DataFrame
        if not isinstance(numerical_data, pd.DataFrame):
            raise TypeError("numerical_data must be a pandas DataFrame")

        # Store numerical data
        self.numerical_data = numerical_data

        # Always define the documented attributes, even when no categories are given,
        # so that reading them never raises AttributeError
        self.categorical_data = cat_data
        self.categorical_data_encoded = None
        self.le = None

        # Encode the categorical data only when there is something to encode
        if cat_data is not None and len(cat_data) > 0:
            from sklearn.preprocessing import LabelEncoder
            self.le = LabelEncoder()
            self.categorical_data_encoded = self.le.fit_transform(self.categorical_data)

        # Initialize the UMAP model with the requested hyperparameters
        from umap import UMAP
        self.model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist)

        # Fit the model using the numerical data, with the optional encoded categories as target
        self.model.fit(self.numerical_data, y=self.categorical_data_encoded)

    @property
    def scores_(self):
        """
        Returns the lower-dimensional representation (embedding) of the numerical data.

        The embedding is read from the fitted model's `embedding_` attribute (the embedding of
        the training data computed during `fit`), so accessing this property does not re-run
        a projection of the data.

        Returns:
        --------
        pandas DataFrame
            The embedding, indexed like `numerical_data`, with columns 'UMAP_1', 'UMAP_2', ...
        """
        embedding = self.model.embedding_
        return pd.DataFrame(embedding,
                            index=self.numerical_data.index,
                            columns=[f'UMAP_{i + 1}' for i in range(embedding.shape[1])])
DIANE's avatar
DIANE committed

DIANE's avatar
DIANE committed
    
DIANE's avatar
DIANE committed
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nmf ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
class Nmf:
    """
    A class that performs Non-negative Matrix Factorization (NMF) on an input matrix (pandas DataFrame) to extract
    latent components and their associated scores. The NMF model is fitted using scikit-learn's NMF implementation,
    and the results include the transformed data (scores) and the components (loadings).

    Parameters:
    -----------
    X : pandas DataFrame
        The input matrix (data) to be decomposed. Rows represent samples, and columns represent features.
        If the matrix contains negative values, it is shifted by its global minimum before fitting.

    Ncomp : int, optional, default=3
        The number of components to compute. This is the rank of the factorization.

    Attributes:
    -----------
    Mo : sklearn.decomposition.NMF
        The fitted NMF model.

    scores_ : pandas DataFrame
        The transformed data (scores). Rows correspond to the samples, and columns to the
        components ('H1', 'H2', ...).

    loadings_ : pandas DataFrame
        The components (loadings). Rows represent the components ('H1', 'H2', ...), and columns
        correspond to the original features in the input matrix.

    Notes:
    ------
    The NMF is fitted with the coordinate descent solver ('cd') and a Frobenius loss function.
    """

    def __init__(self, X: pd.DataFrame, Ncomp: int = 3):
        """
        Initializes the NMF model and fits it to the input pandas DataFrame.

        Parameters:
        -----------
        X : pandas DataFrame
            The input matrix (data) to be decomposed. Rows represent samples, and columns represent features.

        Ncomp : int, optional, default=3
            The number of components to compute. This is the rank of the factorization.
        """
        # Ensure input matrix has non-negative values
        self.__x = self._shift_non_negative(X)

        # Set the number of components to compute
        self.__ncp = Ncomp

        # Fit NMF model
        from sklearn.decomposition import NMF
        self.Mo = NMF(n_components=self.__ncp, init=None, solver='cd', beta_loss='frobenius',
                      tol=0.0001, max_iter=300, random_state=None, alpha_W=0.0, alpha_H='same',
                      l1_ratio=0.0, verbose=0, shuffle=False)
        self.Mo.fit(self.__x)

    @staticmethod
    def _shift_non_negative(X):
        """
        Return X shifted so that its smallest value is 0, or X itself if it has no negative value.

        The global minimum is obtained by reducing twice (per column, then overall). This yields
        a scalar regardless of how the installed pandas version reduces a DataFrame passed to
        `np.min`, which may return a per-column Series and make the comparison ambiguous.
        """
        lowest = X.min().min()
        if lowest < 0:
            return X - lowest
        return X

    def _component_labels(self):
        """Return the component labels 'H1', 'H2', ... for the fitted components."""
        return [f'H{i + 1}' for i in range(self.Mo.components_.shape[0])]

    @property
    def scores_(self) -> pd.DataFrame:
        """
        Returns the transformed data (scores) from the NMF model as a pandas DataFrame.

        The rows correspond to the samples, and the columns correspond to the latent components.

        Returns:
        --------
        pandas DataFrame
            A DataFrame of the transformed data with the components as columns.
        """
        return pd.DataFrame(self.Mo.transform(self.__x),
                            columns=self._component_labels(),
                            index=self.__x.index)

    @property
    def loadings_(self) -> pd.DataFrame:
        """
        Returns the loadings (components) from the NMF model as a pandas DataFrame.

        The rows correspond to the components, and the columns correspond to the original features.

        Returns:
        --------
        pandas DataFrame
            A DataFrame of the loadings with the components as rows.
        """
        return pd.DataFrame(self.Mo.components_,
                            columns=self.__x.columns,
                            index=self._component_labels())