# Source code for kenchi.outlier_detection.density_based

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

from .base import BaseOutlierDetector

# Names exported by ``from <this module> import *``.
__all__ = ['LOF']


class LOF(BaseOutlierDetector):
    """Local Outlier Factor.

    Thin wrapper that fits a scikit-learn ``LocalOutlierFactor`` and exposes
    its (sign-flipped) factor as an anomaly score.

    Parameters
    ----------
    algorithm : str, default 'auto'
        Tree algorithm to use. Valid algorithms are
        ['kd_tree'|'ball_tree'|'auto'].

    contamination : float, default 0.1
        Proportion of outliers in the data set. Used to define the threshold.

    leaf_size : int, default 30
        Leaf size of the underlying tree.

    metric : str or callable, default 'minkowski'
        Distance metric to use.

    novelty : bool, default False
        If True, you can use predict, decision_function and anomaly_score on
        new unseen data and not on the training data. The value is forwarded
        to the underlying ``LocalOutlierFactor``.

    n_jobs : int, default 1
        Number of jobs to run in parallel. If -1, then the number of jobs is
        set to the number of CPU cores.

    n_neighbors : int, default 20
        Number of neighbors.

    p : int, default 2
        Power parameter for the Minkowski metric.

    metric_params : dict, default None
        Additional parameters passed to the requested metric.

    Attributes
    ----------
    anomaly_score_ : array-like of shape (n_samples,)
        Anomaly score for each training data.

    threshold_ : float
        Threshold.

    negative_outlier_factor_ : array-like of shape (n_samples,)
        Opposite LOF of the training samples.

    n_neighbors_ : int
        Actual number of neighbors used for `kneighbors` queries.

    X_ : array-like of shape (n_samples, n_features)
        Training data.

    References
    ----------
    .. [#breunig00] Breunig, M. M., Kriegel, H.-P., Ng, R. T., and Sander, J.,
        "LOF: identifying density-based local outliers,"
        In ACM sigmod record, pp. 93-104, 2000.

    .. [#kriegel11] Kriegel, H.-P., Kroger, P., Schubert E., and Zimek, A.,
        "Interpreting and unifying outlier scores,"
        In Proceedings of SDM'11, pp. 13-24, 2011.
    """

    # The three properties below delegate to the wrapped estimator, which is
    # only created in ``_fit``; accessing them earlier raises AttributeError.

    @property
    def negative_outlier_factor_(self):
        """Opposite LOF of the training samples (from the fitted estimator)."""
        return self._estimator.negative_outlier_factor_

    @property
    def n_neighbors_(self):
        """Number of neighbors reported by the fitted estimator."""
        return self._estimator.n_neighbors_

    @property
    def X_(self):
        """Training data as stored on the fitted estimator (``_fit_X``)."""
        return self._estimator._fit_X

    def __init__(
        self, algorithm='auto', contamination=0.1, leaf_size=30,
        metric='minkowski', novelty=False, n_jobs=1, n_neighbors=20,
        p=2, metric_params=None
    ):
        super().__init__(contamination=contamination)

        # Hyper-parameters are stored verbatim; the estimator itself is only
        # built when ``_fit`` is called.
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.novelty = novelty
        self.n_jobs = n_jobs
        self.n_neighbors = n_neighbors
        self.p = p
        self.metric_params = metric_params

    def _fit(self, X):
        """Fit the underlying ``LocalOutlierFactor`` on ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        self : LOF
            The fitted detector.
        """

        self._estimator = LocalOutlierFactor(
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            # Previously accepted by ``__init__`` but silently dropped here,
            # so the wrapped estimator never saw the user's choice.
            novelty=self.novelty,
            n_jobs=self.n_jobs,
            n_neighbors=self.n_neighbors,
            p=self.p,
            metric_params=self.metric_params,
        ).fit(X)

        return self

    def _anomaly_score(self, X, regularize=True):
        """Compute the anomaly score for each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to score.

        regularize : bool, default True
            If True, return ``max(0, LOF - 1)`` element-wise; otherwise
            return the raw LOF.

        Returns
        -------
        anomaly_score : array-like of shape (n_samples,)
            Anomaly score for each sample.
        """

        lof = self._lof(X)

        if not regularize:
            return lof

        # Shift by one and clip at zero so that scores are non-negative.
        return np.maximum(0., lof - 1.)

    def _lof(self, X):
        """Compute the Local Outlier Factor (LOF) for each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to score.

        Returns
        -------
        lof : array-like of shape (n_samples,)
            Sign-flipped output of the underlying estimator.
        """

        # Identity (not equality) check: only the very same object that the
        # estimator stored as training data reuses the factors computed
        # during fit.
        if X is self.X_:
            return -self.negative_outlier_factor_

        # NOTE(review): this relies on a private scikit-learn method; its
        # exact return value (e.g. whether an offset is subtracted) may
        # differ between releases -- confirm against the pinned version.
        return -self._estimator._decision_function(X)