# Source code for kenchi.outlier_detection.classification_based

from sklearn.metrics.pairwise import rbf_kernel
from sklearn.svm import OneClassSVM
from sklearn.utils.validation import check_is_fitted

from .base import BaseOutlierDetector

__all__ = ['OCSVM']


class OCSVM(BaseOutlierDetector):
    """One Class Support Vector Machines (only RBF kernel).

    Parameters
    ----------
    cache_size : float, default 200
        Specify the size of the kernel cache (in MB).

    gamma : float or 'scale', default 'scale'
        Kernel coefficient. If gamma is 'scale', 1 / (n_features * np.std(X))
        will be used instead.

    max_iter : int, optional, default -1
        Maximum number of iterations.

    nu : float, default 0.5
        An upper bound on the fraction of training errors and a lower bound of
        the fraction of support vectors. Should be in the interval (0, 1].

    shrinking : bool, default True
        If True, use the shrinking heuristic.

    tol : float, default 0.001
        Tolerance to declare convergence.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training data.

    contamination_ : float
        Actual proportion of outliers in the data set.

    threshold_ : float
        Threshold.

    estimator_ : sklearn.svm.OneClassSVM
        Fitted underlying estimator.

    nu_l_ : float
        ``nu`` times the number of training samples. The raw dual
        coefficients and intercept of ``estimator_`` are divided by this
        value.

    R2_ : float
        Value returned as the threshold on the anomaly score.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import OCSVM
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = OCSVM(gamma=1e-03, nu=0.25)
    >>> det.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    @property
    def dual_coef_(self):
        """array-like of shape (1, n_SV): Coefficients of the support vectors
        in the decision function.
        """

        # Rescale the raw coefficients of the wrapped estimator by nu_l_.
        return self.estimator_.dual_coef_ / self.nu_l_

    @property
    def support_(self):
        """array-like of shape (n_SV): Indices of support vectors.
        """

        return self.estimator_.support_

    @property
    def support_vectors_(self):
        """array-like of shape (n_SV, n_features): Support vectors.
        """

        return self.estimator_.support_vectors_

    @property
    def intercept_(self):
        """array-like of shape (1,): Constant in the decision function.
        """

        # Same rescaling as ``dual_coef_`` so both stay consistent.
        return self.estimator_.intercept_ / self.nu_l_

    def __init__(
        self, cache_size=200, gamma='scale', max_iter=-1,
        nu=0.5, shrinking=True, tol=0.001
    ):
        self.cache_size = cache_size
        self.gamma = gamma
        self.max_iter = max_iter
        self.nu = nu
        self.shrinking = shrinking
        self.tol = tol

    def _check_is_fitted(self):
        """Run the parent check, then verify the SVM-specific attributes."""

        super()._check_is_fitted()

        # These names are properties that read ``estimator_``, so they are
        # only available once ``_fit`` has run.
        check_is_fitted(
            self, ['dual_coef_', 'intercept_', 'support_', 'support_vectors_']
        )

    def _get_threshold(self):
        """Return ``R2_``, the threshold computed in ``_fit``."""

        return self.R2_

    def _fit(self, X):
        """Fit the underlying ``OneClassSVM`` on ``X`` and compute ``R2_``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, passed unchanged to ``OneClassSVM.fit``.

        Returns
        -------
        self : OCSVM
            The fitted detector.
        """

        self.estimator_ = OneClassSVM(
            cache_size=self.cache_size,
            gamma=self.gamma,
            max_iter=self.max_iter,
            nu=self.nu,
            shrinking=self.shrinking,
            tol=self.tol
        ).fit(X)

        # libsvm solves the one-class problem in a scaled form in which the
        # dual coefficients sum to nu * (number of training samples). Dividing
        # by that quantity makes the rescaled coefficients sum to one, which
        # the expressions for ``R2_`` and the anomaly score below rely on.
        # The normaliser therefore has to use the number of training samples,
        # not the number of support vectors.
        n_samples = self.estimator_.shape_fit_[0]
        self.nu_l_ = self.nu * n_samples

        # Gram matrix of the support vectors, using the gamma value that the
        # fitted estimator actually resolved (e.g. from 'scale').
        Q = rbf_kernel(
            self.support_vectors_, gamma=self.estimator_._gamma
        )

        # Quadratic form alpha Q alpha^T of the rescaled coefficients.
        c2 = (self.dual_coef_ @ Q @ self.dual_coef_.T)[0, 0]

        # The constant 1 is k(x, x) for the RBF kernel.
        self.R2_ = c2 + 2. * self.intercept_[0] + 1.

        return self

    def _anomaly_score(self, X):
        """Compute the anomaly score of each sample in ``X``.

        The score is ``R2_`` minus twice the rescaled decision function, so a
        sample scores above ``R2_`` exactly when the decision function of the
        underlying estimator is negative.
        """

        scaled_decision = self.estimator_.decision_function(X) / self.nu_l_

        return self.R2_ - 2. * scaled_decision