"""Source code for kenchi.datasets.base: WDBC and pendigits data set loaders."""

import os

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.utils import check_random_state, shuffle as _shuffle

# Public API of this module: the names exported by a star-import.
__all__ = ['load_wdbc', 'load_pendigits']


def load_wdbc(contamination=0.0272, random_state=None, shuffle=True):
    """Load and return the breast cancer wisconsin dataset.

    All benign samples (original label 1) are kept as inliers. Malignant
    samples (original label 0) are randomly downsampled so that outliers
    make up approximately ``contamination`` of the returned samples.

    Parameters
    ----------
    contamination : float, default 0.0272
        Proportion of outliers in the data set. Must be in [0, 1).

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Data.

    y : ndarray of shape (n_samples,)
        Return -1 (malignant) for outliers and +1 (benign) for inliers.

    Raises
    ------
    ValueError
        If ``contamination`` is not in [0, 1), or if it asks for more
        outliers than there are malignant samples.

    References
    ----------
    .. [#kriegel11] Kriegel, H.-P., Kroger, P., Schubert E., and Zimek, A.,
        "Interpreting and unifying outlier scores,"
        In Proceedings of SDM'11, pp. 13-24, 2011.
    """

    # Reject values for which the odds ratio below is undefined (1.0) or
    # negative (< 0 or > 1); a negative count would otherwise be used as a
    # slice bound and silently return the wrong number of outliers.
    # NaN fails the comparison and is rejected too.
    if not 0. <= contamination < 1.:
        raise ValueError(
            'contamination must be in [0, 1), got %r' % (contamination,)
        )

    rnd = check_random_state(random_state)
    X, y = load_breast_cancer(return_X_y=True)

    # Inliers already carry the label +1 in the original encoding.
    is_inlier = y != 0
    n_inliers = np.sum(is_inlier)
    X_inliers = X[is_inlier]
    y_inliers = y[is_inlier]

    # Solve n_outliers / (n_outliers + n_inliers) = contamination.
    n_outliers = int(
        np.round(contamination / (1. - contamination) * n_inliers)
    )
    X_outliers = X[~is_inlier]
    y_outliers = y[~is_inlier]

    if n_outliers > len(y_outliers):
        raise ValueError(
            'cannot sample %d outliers out of %d available; '
            'decrease contamination' % (n_outliers, len(y_outliers))
        )

    # Shuffle every candidate and keep the first n_outliers. This draws the
    # same samples as asking the helper for n_samples=n_outliers, without
    # depending on it accepting a sample count of zero.
    X_outliers, y_outliers = _shuffle(
        X_outliers, y_outliers, random_state=rnd
    )
    X_outliers = X_outliers[:n_outliers]
    y_outliers = y_outliers[:n_outliers]
    y_outliers[:] = -1

    X = np.concatenate([X_outliers, X_inliers])
    y = np.concatenate([y_outliers, y_inliers])

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y


def load_pendigits(contamination=0.002, random_state=None, shuffle=True):
    """Load and return the pendigits dataset.

    The data are read from ``data/pendigits.csv.gz`` next to this module;
    the last column holds the digit label. Samples of digit 4 are randomly
    downsampled so that outliers make up approximately ``contamination``
    of the returned samples; all other digits are kept as inliers.

    Parameters
    ----------
    contamination : float, default 0.002
        Proportion of outliers in the data set. Must be in [0, 1).

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Data.

    y : ndarray of shape (n_samples,)
        Return -1 (digit 4) for outliers and +1 (otherwise) for inliers.

    Raises
    ------
    ValueError
        If ``contamination`` is not in [0, 1), or if it asks for more
        outliers than there are samples of digit 4.

    References
    ----------
    .. [#kriegel11] Kriegel, H.-P., Kroger, P., Schubert E., and Zimek, A.,
        "Interpreting and unifying outlier scores,"
        In Proceedings of SDM'11, pp. 13-24, 2011.
    """

    # Reject values for which the odds ratio below is undefined (1.0) or
    # negative (< 0 or > 1); a negative count would otherwise be used as a
    # slice bound and silently return the wrong number of outliers.
    # NaN fails the comparison and is rejected too.
    if not 0. <= contamination < 1.:
        raise ValueError(
            'contamination must be in [0, 1), got %r' % (contamination,)
        )

    rnd = check_random_state(random_state)
    module_path = os.path.dirname(__file__)
    data = np.loadtxt(
        os.path.join(module_path, 'data', 'pendigits.csv.gz'), delimiter=','
    )
    X = data[:, :-1]
    # Builtin int instead of np.int: the latter was only an alias for it
    # and is no longer available in recent NumPy releases.
    y = data[:, -1].astype(int)

    is_inlier = y != 4
    n_inliers = np.sum(is_inlier)
    X_inliers = X[is_inlier]
    # Boolean-mask indexing returns a copy, so relabelling it leaves y intact.
    y_inliers = y[is_inlier]
    y_inliers[:] = 1

    # Solve n_outliers / (n_outliers + n_inliers) = contamination.
    n_outliers = int(
        np.round(contamination / (1. - contamination) * n_inliers)
    )
    X_outliers = X[~is_inlier]
    y_outliers = y[~is_inlier]

    if n_outliers > len(y_outliers):
        raise ValueError(
            'cannot sample %d outliers out of %d available; '
            'decrease contamination' % (n_outliers, len(y_outliers))
        )

    # Shuffle every candidate and keep the first n_outliers. This draws the
    # same samples as asking the helper for n_samples=n_outliers, without
    # depending on it accepting a sample count of zero.
    X_outliers, y_outliers = _shuffle(
        X_outliers, y_outliers, random_state=rnd
    )
    X_outliers = X_outliers[:n_outliers]
    y_outliers = y_outliers[:n_outliers]
    y_outliers[:] = -1

    X = np.concatenate([X_outliers, X_inliers])
    y = np.concatenate([y_outliers, y_inliers])

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y