Source code for robpy.covariance.ogk

import numpy as np
from scipy.stats import median_abs_deviation, chi2

from robpy.utils.distance import mahalanobis_distance
from robpy.covariance.base import RobustCovarianceEstimator
from robpy.univariate import LocationOrScaleEstimator


[docs] class OGKEstimator(RobustCovarianceEstimator): def __init__( self, *, store_precision=True, assume_centered=False, location_estimator: LocationOrScaleEstimator = np.median, scale_estimator: LocationOrScaleEstimator = median_abs_deviation, n_iterations: int = 2, reweighting: bool = False, reweighting_beta: float = 0.9 ): """Implementation of the Orthogonalized Gnanadesikan-Kettenring estimator for location dispersion proposed in Maronna, R. A., & Zamar, R. H. (2002) Args: store_precision (boolean, optional): whether to store the precision matrix assume_centered (boolean, optional): whether the data is already centered location_estimator (LocationOrScaleEstimator, optional): function to estimate the location of the data, should accept an array like input as first value and a named argument axis scale_estimator (LocationOrScaleEstimator, optional): function to estimate the scale of the data, should accept an array like input as first value and a named argument axis n_iterations (int, optional): number of iteration for orthogonalization step reweighting (boolean, optional): whether to apply reweighting at the end (i.e. calculating regular location and covariance after filtering outliers based on Mahalanobis distance using OGK estimates) reweighting_beta (float, optional): quantile of chi2 distribution to use as cutoff for reweighting References: Maronna, R. A., & Zamar, R. H. (2002). Robust Estimates of Location and Dispersion for High-Dimensional Datasets. Technometrics, 44(4), 307–317. http://www.jstor.org/stable/1271538 """ super().__init__(store_precision=store_precision, assume_centered=assume_centered) self.location_estimator = location_estimator self.scale_estimator = scale_estimator self.n_iterations = n_iterations self.reweighting = reweighting self.reweighting_beta = reweighting_beta
[docs] def calculate_covariance(self, X) -> np.ndarray: """Calculate location and covariance with the algorithm described in Maronna & Zamar (2002). Covariance is returned, location is overwritten. """ p = X.shape[1] Z = np.copy(X) DE = [] for _ in range(self.n_iterations): s = np.array(self.scale_estimator(Z, axis=0)) D = np.diag(s) Dinv = np.diag(1.0 / s) Y = Z @ Dinv # (n x p) U = np.ones(shape=(p, p)) # Loop over pairs of variables, lower triangle suffises as U is symmetric for i in range(p): for j in range(i): scale_sum = self.scale_estimator(Y[:, i] + Y[:, j]) scale_diff = self.scale_estimator(Y[:, i] - Y[:, j]) cor = (scale_sum**2 - scale_diff**2) / (scale_sum**2 + scale_diff**2) U[i, j] = U[j, i] = cor _, E = np.linalg.eigh(U) # (p x p) E = E[:, ::-1] Z = Y @ E # (n x p) DE.append(D @ E) cov_X = np.diag(np.power(self.scale_estimator(Z, axis=0), 2)) # (p x p) mu_X = self.location_estimator(Z, axis=0) # (p, ) for mat in reversed(DE): mu_X = mat @ mu_X.reshape(-1, 1) cov_X = mat @ cov_X @ mat.T mu_X = mu_X.flatten() if self.reweighting: mahalanobis = mahalanobis_distance(X, location=mu_X, covariance=cov_X) cutoff = np.sqrt(chi2.ppf(self.reweighting_beta, p) / chi2.ppf(0.5, p)) * np.median( mahalanobis ) # mahalanobis is the sqrt distance, so we need to take the sqrt of the chi2 quantiles mask = mahalanobis < cutoff cov_X = np.cov(X[mask], rowvar=False) mu_X = np.mean(X[mask], axis=0) self.location_ = mu_X.flatten() return cov_X