Source code for kenchi.datasets.base

import os

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.utils import check_random_state, shuffle as _shuffle

__all__ = ['load_wdbc', 'load_pendigits']


def load_wdbc(contamination=0.0272, random_state=None, shuffle=True):
    """Load and return the breast cancer wisconsin dataset.

    Benign samples are kept as inliers and a random subset of the malignant
    samples is kept as outliers, sized so that outliers make up roughly
    ``contamination`` of the returned data.

    Parameters
    ----------
    contamination : float, default 0.0272
        Proportion of outliers in the data set. Must satisfy
        ``0 <= contamination < 1``.

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Data.

    y : ndarray of shape (n_samples,)
        Return -1 (malignant) for outliers and +1 (benign) for inliers.

    Raises
    ------
    ValueError
        If ``contamination`` is outside ``[0, 1)`` or requires more outliers
        than the data set contains.

    References
    ----------
    .. [#kriegel11] Kriegel, H.-P., Kroger, P., Schubert E., and Zimek, A.,
        "Interpreting and unifying outlier scores,"
        In Proceedings of SDM'11, pp. 13-24, 2011.
    """

    # Reject values that would divide by zero (1.0), produce a negative
    # sample count (< 0) or are not comparable at all (NaN).
    if not 0. <= contamination < 1.:
        raise ValueError(
            'contamination must be in [0, 1) but was {0}'.format(
                contamination
            )
        )

    rnd = check_random_state(random_state)

    X, y = load_breast_cancer(return_X_y=True)

    # Label 0 is the class treated as outliers (malignant); the remaining
    # samples keep their original label and are returned as inliers.
    is_inlier = y != 0
    n_inliers = np.sum(is_inlier)
    X_inliers = X[is_inlier]
    y_inliers = y[is_inlier]

    # Solve n_outliers / (n_outliers + n_inliers) == contamination.
    n_outliers = int(
        np.round(contamination / (1. - contamination) * n_inliers)
    )

    X_outliers = X[~is_inlier]
    y_outliers = y[~is_inlier]
    n_available = X_outliers.shape[0]

    if n_outliers > n_available:
        raise ValueError(
            'contamination={0} requires {1} outliers but only {2} are '
            'available'.format(contamination, n_outliers, n_available)
        )

    # Shuffle every outlier and slice afterwards rather than passing
    # ``n_samples``, so that ``n_outliers == 0`` stays valid.
    # NOTE(review): recent scikit-learn releases appear to reject
    # ``n_samples=0``; slicing is expected to pick the same rows -- confirm
    # against the installed version.
    X_outliers, y_outliers = _shuffle(
        X_outliers, y_outliers, random_state=rnd
    )
    X_outliers = X_outliers[:n_outliers]
    y_outliers = y_outliers[:n_outliers]
    y_outliers[:] = -1

    X = np.concatenate([X_outliers, X_inliers])
    y = np.concatenate([y_outliers, y_inliers])

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y
def load_pendigits(contamination=0.002, random_state=None, shuffle=True):
    """Load and return the pendigits dataset.

    Samples of digit 4 are treated as outliers and every other digit as an
    inlier. A random subset of the outliers is kept, sized so that outliers
    make up roughly ``contamination`` of the returned data.

    Parameters
    ----------
    contamination : float, default 0.002
        Proportion of outliers in the data set. Must satisfy
        ``0 <= contamination < 1``.

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Data.

    y : ndarray of shape (n_samples,)
        Return -1 (digit 4) for outliers and +1 (otherwise) for inliers.

    Raises
    ------
    ValueError
        If ``contamination`` is outside ``[0, 1)`` or requires more outliers
        than the data set contains.

    References
    ----------
    .. [#kriegel11] Kriegel, H.-P., Kroger, P., Schubert E., and Zimek, A.,
        "Interpreting and unifying outlier scores,"
        In Proceedings of SDM'11, pp. 13-24, 2011.
    """

    # Reject values that would divide by zero (1.0), produce a negative
    # sample count (< 0) or are not comparable at all (NaN).
    if not 0. <= contamination < 1.:
        raise ValueError(
            'contamination must be in [0, 1) but was {0}'.format(
                contamination
            )
        )

    rnd = check_random_state(random_state)

    # The data file is looked up relative to this module.
    module_path = os.path.dirname(__file__)
    data = np.loadtxt(
        os.path.join(module_path, 'data', 'pendigits.csv.gz'),
        delimiter=','
    )

    # The last column holds the label; the others are the features.
    X = data[:, :-1]
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases, so use the builtin directly.
    y = data[:, -1].astype(int)

    is_inlier = y != 4
    n_inliers = np.sum(is_inlier)
    X_inliers = X[is_inlier]
    y_inliers = y[is_inlier]
    y_inliers[:] = 1

    # Solve n_outliers / (n_outliers + n_inliers) == contamination.
    n_outliers = int(
        np.round(contamination / (1. - contamination) * n_inliers)
    )

    X_outliers = X[~is_inlier]
    y_outliers = y[~is_inlier]
    n_available = X_outliers.shape[0]

    if n_outliers > n_available:
        raise ValueError(
            'contamination={0} requires {1} outliers but only {2} are '
            'available'.format(contamination, n_outliers, n_available)
        )

    # Shuffle every outlier and slice afterwards rather than passing
    # ``n_samples``, so that ``n_outliers == 0`` stays valid.
    # NOTE(review): recent scikit-learn releases appear to reject
    # ``n_samples=0``; slicing is expected to pick the same rows -- confirm
    # against the installed version.
    X_outliers, y_outliers = _shuffle(
        X_outliers, y_outliers, random_state=rnd
    )
    X_outliers = X_outliers[:n_outliers]
    y_outliers = y_outliers[:n_outliers]
    y_outliers[:] = -1

    X = np.concatenate([X_outliers, X_inliers])
    y = np.concatenate([y_outliers, y_inliers])

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y