Source code for robpy.pca.base

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from abc import abstractmethod
from sklearn.decomposition._base import _BasePCA
from scipy.stats import chi2, norm
from robpy.univariate import UnivariateMCD



[docs]
class RobustPCA(_BasePCA):
    def __init__(self, *, n_components: int | None = None):
        """Base class for robust PCA estimators.

        Args:
            n_components (int | None, optional):
                Number of components to select. If None, it is set during fit.

        """
        self.n_components = n_components


[docs]
    @abstractmethod
    def fit(self, X: np.ndarray):
        """Fit the robust PCA model to the data.

        Args:
            X (np.ndarray): Data to fit the model to.
        """
        pass



[docs]
    def transform(self, X: np.ndarray) -> np.ndarray:
        """Apply dimensionality reduction to X.

        X is projected on the first principal components previously extracted
        from a training set.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        Returns
        -------
        X_new : np.ndarray of shape (n_samples, n_components)
            Projection of X on the first principal components, where `n_samples`
            is the number of samples and `n_components` is the number of components.
        """
        if self.location_ is None:
            self.location_ = np.mean(X, axis=0)
        return (X - self.location_) @ self.components_



[docs]
    def project(self, X: np.ndarray) -> np.ndarray:
        """Project the data onto the subspace spanned by the principal components.

        Args:
            X (np.ndarray): Data to project.

        Returns:
            np.ndarray: Projected data.
        """
        return self.transform(X) @ self.components_.T



[docs]
    def plot_outlier_map(
        self,
        X: np.ndarray | pd.DataFrame,
        figsize: tuple[int, int] = (4, 4),
        return_distances: bool = False,
    ) -> None | tuple[np.ndarray, np.ndarray, float, float]:
        """Plot Orthogonal distances vs Score distances to identify different types of outliers.

        Args:
            X (np.ndarray): Data  matrix (n x p).
            figsize (tuple[int, int], optional): Size of the plot. Defaults to (4, 4).
            return_distances (bool, optional):
                Whether to return the distances and cutoff values. Defaults to False.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        orthogonal_distances = np.linalg.norm((X - self.location_) - self.project(X), axis=1)
        score_distances = np.sqrt(
            np.sum(np.square(self.transform(X)) / self.explained_variance_, axis=1)
        )
        _, ax = plt.subplots(1, 1, figsize=figsize)
        ax.scatter(score_distances, orthogonal_distances)
        ax.set_xlabel("Score distance")
        ax.set_ylabel("Orthogonal distance")
        score_cutoff = np.sqrt(float(chi2.ppf(0.975, self.n_components)))
        od_cutoff = get_od_cutoff(orthogonal_distances)
        ax.axvline(score_cutoff, color="r", linestyle="--")
        ax.axhline(od_cutoff, color="r", linestyle="--")
        if return_distances:
            return (score_distances, orthogonal_distances, score_cutoff, od_cutoff)





[docs]
def get_od_cutoff(orthogonal_distances: np.ndarray) -> float:
    mcd = UnivariateMCD().fit(orthogonal_distances ** (2 / 3))
    return float(mcd.location + mcd.scale * norm.ppf(0.975)) ** (3 / 2)