Source code for kenchi.datasets.samples_generator

import numpy as np
from sklearn.datasets import make_blobs as _make_blobs
from sklearn.utils import check_random_state, shuffle as _shuffle

from .base import NEG_LABEL, POS_LABEL
from ..utils import check_contamination

__all__ = ['make_blobs']


def make_blobs(
    centers=5,
    center_box=(-10., 10.),
    cluster_std=1.,
    contamination=0.02,
    n_features=25,
    n_samples=500,
    random_state=None,
    shuffle=True
):
    """Generate isotropic Gaussian blobs with outliers.

    Inliers are drawn with scikit-learn's ``make_blobs``; outliers are drawn
    uniformly, feature by feature, from a box that covers both ``center_box``
    and the observed range of the generated inliers.

    Parameters
    ----------
    centers : int or array-like of shape (n_centers, n_features), default 5
        Number of centers to generate, or the fixed center locations.

    center_box : pair of floats (min, max), default (-10.0, 10.0)
        Bounding box for each cluster center when centers are generated at
        random. It is also used as the minimum extent of the box from which
        outliers are sampled.

    cluster_std : float or array-like of shape (n_centers,), default 1.0
        Standard deviation of the clusters.

    contamination : float, default 0.02
        Proportion of outliers in the data set. The value is passed to
        ``check_contamination`` before anything is generated.

    n_features : int, default 25
        Number of features for each sample. The outliers always take the
        number of columns of the generated inliers, so that both parts can be
        stacked even when fixed ``centers`` imply a different width.

    n_samples : int, default 500
        Number of samples.

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples. Otherwise all inliers come first, followed
        by all outliers.

    Returns
    -------
    X : array-like of shape (n_samples, n_features)
        Generated data.

    y : array-like of shape (n_samples,)
        Return -1 for outliers and +1 for inliers.

    References
    ----------
    .. [#kriegel08] Kriegel, H.-P., Schubert, M., and Zimek, A.,
        "Angle-based outlier detection in high-dimensional data,"
        In Proceedings of SIGKDD, pp. 444-452, 2008.

    .. [#sugiyama13] Sugiyama, M., and Borgwardt, K.,
        "Rapid distance-based outlier detection via sampling,"
        Advances in NIPS, pp. 467-475, 2013.

    Examples
    --------
    >>> from kenchi.datasets import make_blobs
    >>> X, y = make_blobs(n_samples=10, n_features=2, contamination=0.1)
    >>> X.shape
    (10, 2)
    >>> y.shape
    (10,)
    """

    check_contamination(contamination)

    rnd = check_random_state(random_state)

    # Split the requested sample count into inliers and outliers; the
    # outliers receive whatever is left after rounding.
    n_inliers = int(np.round((1. - contamination) * n_samples))
    n_outliers = n_samples - n_inliers

    # The same random state object is threaded through every draw below so
    # that a single seed reproduces the whole data set.
    X_inlier, _ = _make_blobs(
        centers=centers,
        center_box=center_box,
        cluster_std=cluster_std,
        n_features=n_features,
        n_samples=n_inliers,
        random_state=rnd,
        shuffle=False
    )

    # Size the outliers from the inliers actually produced rather than from
    # the ``n_features`` argument: the two disagree when ``centers`` is an
    # array of a different width, and the per-feature bounds below would
    # then fail to broadcast.
    n_features = X_inlier.shape[1]

    data_max = np.max(X_inlier, axis=0)
    data_min = np.min(X_inlier, axis=0)

    # Per-feature sampling box: at least ``center_box``, widened wherever the
    # inliers extend beyond it.
    X_outlier = rnd.uniform(
        low=np.minimum(center_box[0], data_min),
        high=np.maximum(center_box[1], data_max),
        size=(n_outliers, n_features)
    )

    X = np.concatenate([X_inlier, X_outlier])

    # Labels follow the stacking order: inliers first, outliers last.
    y = np.empty(n_samples, dtype=int)
    y[:n_inliers] = POS_LABEL
    y[n_inliers:] = NEG_LABEL

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y