Source code for robpy.datasets.base

import pathlib
import pandas as pd
import numpy as np
from sklearn.utils import Bunch

# Folder with the bundled dataset .csv files, located next to this module.
DATA_FOLDER = pathlib.Path(__file__).parent / "data"
# Folder with the dataset description .rst files, located next to this module.
DESCR_FOLDER = pathlib.Path(__file__).parent / "descr"


def _load_data_and_descr(
    data_file_name: str, descr_file_name: str, as_frame: bool, feature_names: list[str]
) -> tuple[pd.DataFrame | np.ndarray, str]:
    """
    Helper function to load data and description files.

    The data file is looked up in ``DATA_FOLDER`` and the description file in
    ``DESCR_FOLDER``.

    Args:
        data_file_name (str):
            Filename of the .csv file (must end in .csv).
        descr_file_name (str):
            Filename of the .rst file (must end in .rst).
        as_frame (bool):
            Whether data should be stored as a pandas DataFrame.
            If False, it will be stored as a numpy array.
        feature_names (list[str]):
            Columns to be selected, all other columns are ignored.
            The columns are returned in the order given here.

    Returns:
        tuple[pd.DataFrame | np.ndarray, str]: The data matrix/dataframe and the description string.
    """

    df = pd.read_csv(DATA_FOLDER / data_file_name)

    # Decode explicitly as UTF-8: without an encoding, open() falls back to the
    # platform locale, so the same description file could decode differently
    # (or fail) depending on the machine.
    fdescr = (DESCR_FOLDER / descr_file_name).read_text(encoding="utf-8")

    data = df[feature_names]

    if not as_frame:
        # to_numpy() is the pandas-recommended replacement for the .values attribute.
        data = data.to_numpy()

    return data, fdescr


def load_telephone(*, as_frame=False):
    """Load and return the telephone dataset (regression with outliers).

    The telephone dataset is a well-known univariate regression problem
    with outliers.

    =================   ==============
    Samples                         24
    Dimensionality                   2
    Features            real, positive
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=False
        If True, the data is returned as a pandas DataFrame.
        Otherwise it is returned as a numpy array.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes:

        * data : {ndarray, dataframe} of shape (24, 2)
            The data matrix. If `as_frame=True`, `data` will be a pandas DataFrame.
        * DESCR: str
            The full description of the dataset.
        * feature_names: list
            The names of the dataset columns.
        * filename: str
            The name of the csv file holding the data.
        * data_folder: pathlib.Path
            The folder in which the data file is located.

    Examples
    --------
    Fitting a robust regression:

    >>> from robpy.datasets import load_telephone
    >>> from robpy.regression import MMRegression
    >>> data = load_telephone()
    >>> mm = MMRegression().fit(data.data[:, 0], data.data[:, 1])
    """
    columns = ["Year", "Calls"]
    csv_name = "telephone.csv"
    rst_name = "telephone.rst"

    values, description = _load_data_and_descr(csv_name, rst_name, as_frame, columns)

    return Bunch(
        data=values,
        DESCR=description,
        feature_names=columns,
        filename=csv_name,
        data_folder=DATA_FOLDER,
    )
def load_stars(*, as_frame=False):
    """Load and return the Hertzsprung-Russell Diagram Data of Star Cluster CYG OB1.

    The stars dataset is a well-known bivariate dataset used for demonstrating
    robust covariance and regression estimators.

    =================   ==============
    Samples                         47
    Dimensionality                   2
    Features            real, positive
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=False
        If True, the data is returned as a pandas DataFrame.
        Otherwise it is returned as a numpy array.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes:

        * data : {ndarray, dataframe} of shape (47, 2)
            The data matrix. If `as_frame=True`, `data` will be a pandas DataFrame.
        * DESCR: str
            The full description of the dataset.
        * feature_names: list
            The names of the dataset columns.
        * filename: str
            The name of the csv file holding the data.
        * data_folder: pathlib.Path
            The folder in which the data file is located.

    Examples
    --------
    Fitting a robust covariance estimator:

    >>> from robpy.datasets import load_stars
    >>> from robpy.covariance import FastMCD
    >>> data = load_stars()
    >>> mcd = FastMCD().fit(data.data)
    """
    csv_name = "stars.csv"
    columns = ["Te", "light"]

    values, description = _load_data_and_descr(
        data_file_name=csv_name,
        descr_file_name="stars.rst",
        as_frame=as_frame,
        feature_names=columns,
    )

    return Bunch(
        data=values,
        DESCR=description,
        feature_names=columns,
        filename=csv_name,
        data_folder=DATA_FOLDER,
    )
def load_animals(*, as_frame=False):
    """Load and return the Animals dataset from MASS (R) (covariance / regression).

    The animals dataset is a bivariate dataset used for demonstrating robust
    covariance estimators.

    =================   ==============
    Samples                         28
    Dimensionality                   2
    Features            real, positive
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=False
        If True, the data is returned as a pandas DataFrame.
        Otherwise it is returned as a numpy array.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes:

        * data : {ndarray, dataframe} of shape (28, 2)
            The data matrix. If `as_frame=True`, `data` will be a pandas DataFrame.
        * DESCR: str
            The full description of the dataset.
        * feature_names: list
            The names of the dataset columns.
        * filename: str
            The name of the csv file holding the data.
        * data_folder: pathlib.Path
            The folder in which the data file is located.

    Examples
    --------
    Fitting a robust covariance estimator:

    >>> from robpy.datasets import load_animals
    >>> from robpy.covariance import FastMCD
    >>> data = load_animals()
    >>> mcd = FastMCD().fit(data.data)
    """
    csv_name = "animals.csv"
    columns = ["body", "brain"]

    values, description = _load_data_and_descr(
        data_file_name=csv_name,
        descr_file_name="animals.rst",
        as_frame=as_frame,
        feature_names=columns,
    )

    return Bunch(
        data=values,
        DESCR=description,
        feature_names=columns,
        filename=csv_name,
        data_folder=DATA_FOLDER,
    )
def load_topgear(*, as_frame=False):
    """Load and return the TopGear dataset from robustHD (R) (regression).

    The TopGear dataset is a mixed variable dataset used for demonstrating
    robust regression estimators.

    =================   ==============
    Samples                        297
    Dimensionality                  32
    Features                     mixed
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=False
        If True, the data is returned as a pandas DataFrame.
        Otherwise it is returned as a numpy array.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes:

        * data : {ndarray, dataframe} of shape (297, 32)
            The data matrix. If `as_frame=True`, `data` will be a pandas DataFrame.
        * DESCR: str
            The full description of the dataset.
        * feature_names: list
            The names of the dataset columns.
        * categorical_features: list
            The names of the categorical features.
        * filename: str
            The name of the csv file holding the data.
        * data_folder: pathlib.Path
            The folder in which the data file is located.

    Examples
    --------
    Fitting a robust regression estimator:

    >>> from robpy.datasets import load_topgear
    >>> from robpy.regression import FastLTSRegression
    >>> data = load_topgear(as_frame=True)
    >>> data.data = data.data.dropna(subset=["Cylinders", "Torque", "TopSpeed", "Price"])
    >>> lts = FastLTSRegression().fit(
    ...     data.data[["Cylinders", "Torque", "TopSpeed"]], data.data["Price"]
    ... )
    """
    csv_name = "topgear.csv"
    rst_name = "topgear.rst"

    # All columns of the dataset, in the order they are returned.
    columns = [
        "Make",
        "Model",
        "Type",
        "Fuel",
        "Price",
        "Cylinders",
        "Displacement",
        "DriveWheel",
        "BHP",
        "Torque",
        "Acceleration",
        "TopSpeed",
        "MPG",
        "Weight",
        "Length",
        "Width",
        "Height",
        "AdaptiveHeadlights",
        "AdjustableSteering",
        "AlarmSystem",
        "Automatic",
        "Bluetooth",
        "ClimateControl",
        "CruiseControl",
        "ElectricSeats",
        "Leather",
        "ParkingSensors",
        "PowerSteering",
        "SatNav",
        "ESP",
        "Verdict",
        "Origin",
    ]

    # Subset of the columns above that is reported as categorical.
    categorical_columns = [
        "Make",
        "Model",
        "Type",
        "Fuel",
        "DriveWheel",
        "AdaptiveHeadlights",
        "AdjustableSteering",
        "AlarmSystem",
        "Automatic",
        "Bluetooth",
        "ClimateControl",
        "CruiseControl",
        "ElectricSeats",
        "Leather",
        "ParkingSensors",
        "PowerSteering",
        "SatNav",
        "ESP",
        "Origin",
    ]

    values, description = _load_data_and_descr(
        data_file_name=csv_name,
        descr_file_name=rst_name,
        as_frame=as_frame,
        feature_names=columns,
    )

    return Bunch(
        data=values,
        DESCR=description,
        feature_names=columns,
        categorical_features=categorical_columns,
        filename=csv_name,
        data_folder=DATA_FOLDER,
    )
def load_glass(*, as_frame=False):
    """Load and return the glass dataset from cellWise (R) (outlier detection).

    The glass dataset is a high dimensional dataset used for demonstrating
    outlier detection.

    =================   ==============
    Samples                        180
    Dimensionality                 750
    Features            real, positive
    =================   ==============

    Parameters
    ----------
    as_frame : bool, default=False
        If True, the data is returned as a pandas DataFrame.
        Otherwise it is returned as a numpy array.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes:

        * data : {ndarray, dataframe} of shape (180, 750)
            The data matrix. If `as_frame=True`, `data` will be a pandas DataFrame.
        * DESCR: str
            The full description of the dataset.
        * feature_names: list
            The names of the dataset columns (``"V1"`` up to ``"V750"``).
        * filename: str
            The name of the csv file holding the data.
        * data_folder: pathlib.Path
            The folder in which the data file is located.
    """
    csv_name = "glass.csv"
    # Column names are V1, V2, ..., V750.
    columns = ["V" + str(index) for index in range(1, 751)]

    values, description = _load_data_and_descr(
        data_file_name=csv_name,
        descr_file_name="glass.rst",
        as_frame=as_frame,
        feature_names=columns,
    )

    return Bunch(
        data=values,
        DESCR=description,
        feature_names=columns,
        filename=csv_name,
        data_folder=DATA_FOLDER,
    )