Source code for robpy.regression.base

from __future__ import annotations
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import median_abs_deviation, chi2
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from robpy.utils.distance import mahalanobis_distance
from robpy.covariance import FastMCD

logger = logging.getLogger(__name__)



[docs]
class RobustRegression(RegressorMixin, BaseEstimator):
    def __init__(
        self,
    ):
        """
        Base class for robust regression estimators.
        """
        super().__init__()


[docs]
    def fit(self, X, y) -> RobustRegression:
        raise NotImplementedError



[docs]
    def predict(self, X):
        raise NotImplementedError


    @property
    def scale(self) -> float:
        scale = getattr(self, "_scale", None)
        if scale is None:
            raise NotFittedError("Model has not been fitted yet.")
        return scale


[docs]
    def outlier_map(
        self,
        X,
        y,
        robust_scaling: bool = True,
        robust_distance: bool = True,
        vertical_outlier_threshold: float = 2.5,
        leverage_threshold_percentile: float = 0.975,
        figsize: tuple[int, int] = (4, 4),
        return_data: bool = False,
    ) -> None | tuple[np.ndarray, np.ndarray, np.ndarray, float, float]:
        """
        Creates a diagnostic plot where the robust residuals of the target are plotted against the
        robust Mahalanobis distances of the features.

        Args:
            X (array like of shape (n_samples, n_features)):
                Training features.
            y (array like of shape (n_samples, )):
                Training targets.
            robust_scaling (bool, optional):
                Whether to scale the residuals using MAD instead of std. Defaults to True.
            robust_distance (bool, optional):
                Whether to use the MCD as loc/scale estimator instead of mean/cov for calculating
                the Mahalanobis distances. Defaults to True.
            vertical_outlier_threshold (float, optional):
                Where to draw the upper (and lower) limit for the standardized residuals to indicate
                outliers. Defaults to 2.5.
            leverage_threshold_percentile (float, optional):
                Which percentile from the chi-squared distribution to use to set as threshold for
                leverage points. Defaults to 0.975.
            figsize (tuple[int, int], optional):
                Size of the plot. Defaults to (4, 4).
            return_data (bool, optional):
                Whether to return the residuals, the standardized residuals and the distances.
                Defaults to False.
        References:
            - Rousseeuw P.J., Hubert M. (2018). Anomaly detection by robust statistics.
              Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery,
              8(2), 1–14.
            - Rousseeuw P.J., van Zomeren B.C. (1990). Unmasking multivariate outliers and leverage
              points. Journal of the American Statistical Association,  85(411), 633–651.
        """

        residuals = y.reshape(-1, 1) - self.predict(X).reshape(-1, 1)
        standardized_residuals = (
            residuals
            / (
                median_abs_deviation(residuals, scale="normal")
                if robust_scaling
                else np.std(residuals)
            )
        ).flatten()

        if robust_distance:
            mcd = FastMCD().fit(X)
            covariance = mcd.covariance_
            location = mcd.location_
        else:
            covariance = np.cov(X, rowvar=False)
            location = np.mean(X, axis=0)
        distances = mahalanobis_distance(X, location=location, covariance=covariance)

        _, ax = plt.subplots(1, 1, figsize=figsize)

        ax.scatter(x=distances, y=standardized_residuals)
        ax.set_xlabel(f"{'Robust' if robust_distance else 'Non-robust'} distance of X")
        ax.set_ylabel(f"{'Robust' if robust_scaling else 'Non-robust'} standardized residuals of y")

        ax.axhline(vertical_outlier_threshold, ls="--", c="grey")
        ax.axhline(-vertical_outlier_threshold, ls="--", c="grey")

        df = X.shape[-1]
        leverage_threshold = float(np.sqrt(chi2.ppf(leverage_threshold_percentile, df)))
        ax.axvline(leverage_threshold, ls="--", c="grey")

        if return_data:
            return (
                residuals,
                standardized_residuals,
                distances,
                vertical_outlier_threshold,
                leverage_threshold,
            )




def _convert_input_to_array(X, y=None) -> tuple[np.ndarray, np.ndarray | None]:
    if isinstance(X, pd.DataFrame):
        X = X.values
    if (len(X.shape) == 1) or (X.shape[1] == 1):
        X = X.reshape(-1, 1)
    if isinstance(y, pd.Series):
        y = y.values

    return X, y