Source code for kenchi.outlier_detection.reconstruction_based

import numpy as np
from sklearn.decomposition import PCA as _PCA
from sklearn.utils.validation import check_is_fitted

from .base import BaseOutlierDetector

__all__ = ['PCA']


class PCA(BaseOutlierDetector):
    """Outlier detector using Principal Component Analysis (PCA).

    Parameters
    ----------
    contamination : float, default 0.1
        Proportion of outliers in the data set. Used to define the
        threshold.

    iterated_power : int or 'auto', default 'auto'
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.

    n_components : int, float, or 'mle', default None
        Number of components to keep.

    random_state : int or RandomState instance, default None
        Seed of the pseudo random number generator.

    svd_solver : string, default 'auto'
        SVD solver to use. Valid solvers are
        ['auto'|'full'|'arpack'|'randomized'].

    tol : float, default 0.0
        Tolerance to declare convergence for singular values computed by
        svd_solver == 'arpack'.

    whiten : bool, default False
        If True, the ``components_`` vectors are multiplied by the square
        root of n_samples and then divided by the singular values to
        ensure uncorrelated outputs with unit component-wise variances.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training sample.

    contamination_ : float
        Actual proportion of outliers in the data set.

    threshold_ : float
        Threshold.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import PCA
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = PCA()
    >>> det.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    @property
    def components_(self):
        """array-like of shape (n_components, n_features): Principal axes
        in feature space, representing the directions of maximum variance
        in the data.
        """
        return self.estimator_.components_

    @property
    def explained_variance_(self):
        """array-like of shape (n_components,): Amount of variance
        explained by each of the selected components.
        """
        return self.estimator_.explained_variance_

    @property
    def explained_variance_ratio_(self):
        """array-like of shape (n_components,): Percentage of variance
        explained by each of the selected components.
        """
        return self.estimator_.explained_variance_ratio_

    @property
    def mean_(self):
        """array-like of shape (n_features,): Per-feature empirical mean,
        estimated from the training set.
        """
        return self.estimator_.mean_

    @property
    def noise_variance_(self):
        """float: Estimated noise covariance following the Probabilistic
        PCA model from Tipping and Bishop 1999.
        """
        return self.estimator_.noise_variance_

    @property
    def n_components_(self):
        """int: Estimated number of components.
        """
        return self.estimator_.n_components_

    @property
    def singular_values_(self):
        """array-like of shape (n_components,): Singular values
        corresponding to each of the selected components.
        """
        return self.estimator_.singular_values_

    def __init__(
        self, contamination=0.1, iterated_power='auto', n_components=None,
        random_state=None, svd_solver='auto', tol=0., whiten=False
    ):
        self.contamination  = contamination
        self.iterated_power = iterated_power
        self.n_components   = n_components
        self.random_state   = random_state
        self.svd_solver     = svd_solver
        self.tol            = tol
        self.whiten         = whiten

    def _check_is_fitted(self):
        super()._check_is_fitted()

        check_is_fitted(
            self, [
                'components_', 'explained_variance_',
                'explained_variance_ratio_', 'mean_', 'noise_variance_',
                'n_components_', 'singular_values_'
            ]
        )

    def _fit(self, X):
        self.estimator_ = _PCA(
            iterated_power = self.iterated_power,
            n_components   = self.n_components,
            random_state   = self.random_state,
            svd_solver     = self.svd_solver,
            tol            = self.tol,
            whiten         = self.whiten
        ).fit(X)

        return self

    def _anomaly_score(self, X):
        return np.sum((X - self._reconstruct(X)) ** 2, axis=1)

    def _reconstruct(self, X):
        """Apply dimensionality reduction to the given data, and transform
        the data back to its original space.
        """
        return self.estimator_.inverse_transform(self.estimator_.transform(X))