Source code for robpy.covariance.ogk

import numpy as np
from scipy.stats import median_abs_deviation, chi2

from robpy.utils.distance import mahalanobis_distance
from robpy.covariance.base import RobustCovariance
from robpy.univariate import LocationOrScaleEstimator



[docs]
class OGK(RobustCovariance):
    def __init__(
        self,
        *,
        store_precision=True,
        assume_centered=False,
        location_estimator: LocationOrScaleEstimator = np.median,
        scale_estimator: LocationOrScaleEstimator = median_abs_deviation,
        n_iterations: int = 2,
        reweighting: bool = False,
        reweighting_beta: float = 0.9
    ):
        """
        Implementation of the Orthogonalized Gnanadesikan-Kettenring estimator for location and
        dispersion proposed in Maronna, R. A., & Zamar, R. H. (2002).

        Args:
            store_precision (boolean, optional):
                Whether to store the precision matrix. Defaults to True.
            assume_centered (boolean, optional):
                Whether the data is already centered. Defaults to False.
            location_estimator (LocationOrScaleEstimator, optional):
                Function to estimate the location of the data, should accept an array like input as
                first value and a named argument axis. Defaults to np.median.
            scale_estimator (LocationOrScaleEstimator, optional):
                Function to estimate the scale of the data, should accept an array like input as
                first value and a named argument axis. Defaults to median_abs_deviation.
            n_iterations (int, optional):
                Number of iterations for the orthogonalization step. Defaults to 2.
            reweighting (boolean, optional):
                Whether to apply reweighting at the end (i.e. calculating regular location and
                covariance after filtering outliers based on Mahalanobis distance using OGK
                estimates). Defaults to False.
            reweighting_beta (float, optional):
                Quantile of chi-squared distribution to use as cutoff for the reweighting. Defaults
                to 0.9.

        References:
            - Maronna, R. A., & Zamar, R. H. (2002). Robust estimates of location and dispersion for
              high-dimensional datasets. Technometrics, 44(4), 307-317.

        """
        super().__init__(store_precision=store_precision, assume_centered=assume_centered)
        self.location_estimator = location_estimator
        self.scale_estimator = scale_estimator
        self.n_iterations = n_iterations
        self.reweighting = reweighting
        self.reweighting_beta = reweighting_beta


[docs]
    def calculate_covariance(self, X) -> np.ndarray:
        p = X.shape[1]
        Z = np.copy(X)
        DE = []
        for _ in range(self.n_iterations):
            s = np.array(self.scale_estimator(Z, axis=0))
            D = np.diag(s)
            Dinv = np.diag(1.0 / s)
            Y = Z @ Dinv  # (n x p)
            U = np.ones(shape=(p, p))
            # Loop over pairs of variables, lower triangle suffises as U is symmetric
            for i in range(p):
                for j in range(i):
                    scale_sum = self.scale_estimator(Y[:, i] + Y[:, j])
                    scale_diff = self.scale_estimator(Y[:, i] - Y[:, j])
                    cor = (scale_sum**2 - scale_diff**2) / (scale_sum**2 + scale_diff**2)
                    U[i, j] = U[j, i] = cor
            _, E = np.linalg.eigh(U)  # (p x p)
            E = E[:, ::-1]
            Z = Y @ E  # (n x p)
            DE.append(D @ E)
        cov_X = np.diag(np.power(self.scale_estimator(Z, axis=0), 2))  # (p x p)
        mu_X = self.location_estimator(Z, axis=0)  # (p, )

        for mat in reversed(DE):
            mu_X = mat @ mu_X.reshape(-1, 1)
            cov_X = mat @ cov_X @ mat.T

        mu_X = mu_X.flatten()
        if self.reweighting:
            mahalanobis = mahalanobis_distance(X, location=mu_X, covariance=cov_X)
            cutoff = np.sqrt(chi2.ppf(self.reweighting_beta, p) / chi2.ppf(0.5, p)) * np.median(
                mahalanobis
            )  # mahalanobis is the sqrt distance, so we need to take the sqrt of the chi2 quantiles
            mask = mahalanobis < cutoff
            cov_X = np.cov(X[mask], rowvar=False)
            mu_X = np.mean(X[mask], axis=0)
        self.location_ = mu_X.flatten()

        return cov_X