import numpy as np
from scipy.stats import median_abs_deviation
[docs]
def stahel_donoho(
X: np.ndarray,
n_points: int = 2,
n_dir: int = 250,
random_seed: int | None = None,
) -> np.ndarray:
"""Calculate the degree of outlyingness for multivariate points.
Based on the algorithm proposed by Stahel (1981) and Donoho (1982).
Args:
X (np.ndarray): data matric of shape (n_obs, n_features)
n_points (int, optional): number of points to determine the hyperplane. Defaults to 2.
n_dir (int, optional): number of random directions to consider. Defaults to 250.
random_seed (int | None, optional): can be used to provide a random seed.
Returns:
np.ndarray: single column of outlyingness values
References:
Stahel W.A. (1981). Robuste Schatzungen: infinitesimale Optimalitat und Schatzungen von
Kovarianzmatrizen. PhD Thesis, ETH Zurich.
Donoho D.L. (1982). Breakdown properties of multivariate location estimators.
Ph.D. Qualifying paper, Dept. Statistics, Harvard University, Boston.
"""
# step 1: get random directions
D = np.hstack(
[
_get_random_direction(X, n_points=n_points, random_seed=random_seed).reshape(-1, 1)
for _ in range(n_dir)
]
) # (n_features, n_dir)
# step 2: projections
projections = X @ D # (n_obs, n_dir)
# step 3: outlyingness
# to do: let scale and loc estimators be passed as arguments
return np.max(
np.abs(projections - np.median(projections, axis=0))
/ median_abs_deviation(projections, axis=0),
axis=1,
)
def _get_random_direction(
X: np.ndarray,
n_points: int = 2,
random_seed: int | None = None,
) -> np.ndarray:
"""Get direction orthogonal to the hyperplane spanned by random points in X
Args:
X (np.ndarray): data matrix (n x p)
n_points (int, optional): number of points to determine the hyperplane. Defaults to 2.
random_seed (int | None, optional): can be used to provide a random seed.
Returns:
np.ndarray: vector of shape (p, ) indicating the direction
"""
rng = np.random.default_rng(random_seed)
points = X[rng.choice(X.shape[0], n_points, replace=False)]
if n_points == 2:
d = points[0] - points[1]
else:
# vectors spanning hyperplane
vectors = points - points[0]
# direction perpendicular to hyperplane
# U of SVD gives orthogonal basis, last vector is perpendicular to hyperplane
d = np.linalg.svd(vectors)[-1][-1]
return d / np.linalg.norm(d)