# Source code for kenchi.datasets.sample_generator

import numpy as np
from sklearn.datasets import make_blobs as _make_blobs
from sklearn.utils import check_random_state, shuffle as _shuffle

# Public names exported by a star-import of this module.
__all__ = ['make_blobs']


def make_blobs(
    centers=5,
    center_box=(-10., 10.),
    cluster_std=1.,
    contamination=0.02,
    n_features=25,
    n_samples=500,
    random_state=None,
    shuffle=True
):
    """Generate isotropic Gaussian blobs with outliers.

    Inliers are drawn from Gaussian blobs; outliers are drawn uniformly from
    the axis-aligned bounding box of the generated inliers.

    Parameters
    ----------
    centers : int or array-like of shape (n_centers, n_features), default 5
        Number of centers to generate, or the fixed center locations.

    center_box : pair of floats (min, max), default (-10.0, 10.0)
        Bounding box for each cluster center when centers are generated at
        random.

    cluster_std : float or array-like of shape (n_centers,), default 1.0
        Standard deviation of the clusters.

    contamination : float, default 0.02
        Proportion of outliers in the data set. Must lie in [0, 1].

    n_features : int, default 25
        Number of features for each sample. When ``centers`` is an array of
        fixed locations, the number of features is taken from that array.

    n_samples : int, default 500
        Number of samples.

    random_state : int, RandomState instance, default None
        Seed of the pseudo random number generator.

    shuffle : bool, default True
        If True, shuffle samples.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
        Generated data.

    y : ndarray of shape (n_samples,)
        Return -1 for outliers and +1 for inliers.

    Raises
    ------
    ValueError
        If ``contamination`` is not in the range [0, 1].

    References
    ----------
    .. [#kriegel11] Kriegel, H.-P., Kroger, P., Schubert E., and Zimek, A.,
        "Interpreting and unifying outlier scores,"
        In Proceedings of SDM'11, pp. 13-24, 2011.
    """

    # Fail early with a clear message instead of an obscure downstream error
    # about negative dimensions or sample counts.
    if not 0. <= contamination <= 1.:
        raise ValueError(
            'contamination must be in [0, 1], got {0!r}'.format(contamination)
        )

    rnd = check_random_state(random_state)

    n_inliers = int(np.round((1. - contamination) * n_samples))
    X_inliers, _ = _make_blobs(
        centers=centers,
        center_box=center_box,
        cluster_std=cluster_std,
        n_features=n_features,
        n_samples=n_inliers,
        random_state=rnd,
        shuffle=False
    )

    n_outliers = n_samples - n_inliers
    # Use the width of the generated inliers rather than ``n_features``: with
    # fixed ``centers`` the two can differ, and the bounds below would then
    # fail to broadcast against the requested size.
    X_outliers = rnd.uniform(
        low=np.min(X_inliers, axis=0),
        high=np.max(X_inliers, axis=0),
        size=(n_outliers, X_inliers.shape[1])
    )

    # Outliers occupy the first ``n_outliers`` rows before any shuffling.
    X = np.concatenate([X_outliers, X_inliers])
    # Builtin ``int`` replaces the ``np.int`` alias removed in NumPy 1.24.
    y = np.ones(n_samples, dtype=int)
    y[:n_outliers] = -1

    if shuffle:
        X, y = _shuffle(X, y, random_state=rnd)

    return X, y