Source code for robpy.utils.outlyingness

import numpy as np
from scipy.stats import median_abs_deviation


[docs] def stahel_donoho( X: np.ndarray, n_points: int = 2, n_dir: int = 250, random_seed: int | None = None, ) -> np.ndarray: """Calculate the degree of outlyingness for multivariate points. Based on the algorithm proposed by Stahel (1981) and Donoho (1982). Args: X (np.ndarray): data matric of shape (n_obs, n_features) n_points (int, optional): number of points to determine the hyperplane. Defaults to 2. n_dir (int, optional): number of random directions to consider. Defaults to 250. random_seed (int | None, optional): can be used to provide a random seed. Returns: np.ndarray: single column of outlyingness values References: Stahel W.A. (1981). Robuste Schatzungen: infinitesimale Optimalitat und Schatzungen von Kovarianzmatrizen. PhD Thesis, ETH Zurich. Donoho D.L. (1982). Breakdown properties of multivariate location estimators. Ph.D. Qualifying paper, Dept. Statistics, Harvard University, Boston. """ # step 1: get random directions D = np.hstack( [ _get_random_direction(X, n_points=n_points, random_seed=random_seed).reshape(-1, 1) for _ in range(n_dir) ] ) # (n_features, n_dir) # step 2: projections projections = X @ D # (n_obs, n_dir) # step 3: outlyingness # to do: let scale and loc estimators be passed as arguments return np.max( np.abs(projections - np.median(projections, axis=0)) / median_abs_deviation(projections, axis=0), axis=1, )
def _get_random_direction( X: np.ndarray, n_points: int = 2, random_seed: int | None = None, ) -> np.ndarray: """Get direction orthogonal to the hyperplane spanned by random points in X Args: X (np.ndarray): data matrix (n x p) n_points (int, optional): number of points to determine the hyperplane. Defaults to 2. random_seed (int | None, optional): can be used to provide a random seed. Returns: np.ndarray: vector of shape (p, ) indicating the direction """ rng = np.random.default_rng(random_seed) points = X[rng.choice(X.shape[0], n_points, replace=False)] if n_points == 2: d = points[0] - points[1] else: # vectors spanning hyperplane vectors = points - points[0] # direction perpendicular to hyperplane # U of SVD gives orthogonal basis, last vector is perpendicular to hyperplane d = np.linalg.svd(vectors)[-1][-1] return d / np.linalg.norm(d)