Source code for kenchi.metrics

import numpy as np
from sklearn.metrics import auc, recall_score
from sklearn.utils import check_random_state

__all__ = ['LeeLiuScorer', 'NegativeMVAUCScorer']


def _lebesgue_measure(score_samples, offset, data_volume):
    """Monte Carlo estimate of the Lebesgue measure of the level set
    {x : score(x) >= offset}: the fraction of uniformly drawn samples whose
    score exceeds the offset, scaled by the volume of the hypercube
    enclosing the data.
    """

    return np.mean(score_samples >= offset) * data_volume
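

# A minimal sketch of the Monte Carlo estimate above (illustrative only,
# not part of this module): with samples drawn uniformly over the unit
# square and score(x) = x[0], the level set {x : x[0] >= 0.5} has true
# Lebesgue measure 0.5, which the estimate recovers as n grows.
def _demo_lebesgue_measure():
    rnd = np.random.RandomState(0)
    U = rnd.uniform(size=(100000, 2))

    # Fraction of uniform samples whose score exceeds the offset, scaled
    # by the volume of the enclosing hypercube (1.0 here); tends to 0.5.
    return _lebesgue_measure(U[:, 0], 0.5, 1.0)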


class LeeLiuScorer:
    """Lee-Liu scorer.

    References
    ----------
    .. [#lee03] Lee, W. S., and Liu, B.,
        "Learning with positive and unlabeled examples using weighted
        logistic regression,"
        In Proceedings of ICML, pp. 448-455, 2003.

    """

    def __call__(self, det, X, y=None):
        """Compute the Lee-Liu metric.

        Parameters
        ----------
        det : object
            Detector.

        X : array-like of shape (n_samples, n_features)
            Data.

        y : array-like of shape (n_samples,), default None
            Labels. If None, assume that all samples are positive.

        Returns
        -------
        score : float
            Lee-Liu metric.
        """

        y_pred = det.predict(X)

        if y is None:
            y = np.ones_like(y_pred)

        r = recall_score(y, y_pred)

        return r ** 2 / (1. - det.contamination_)
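
# A minimal usage sketch (illustrative only; ``_MockDetector`` is a
# hypothetical stand-in for a fitted kenchi detector exposing ``predict``
# and ``contamination_``, not part of this module).
def _demo_lee_liu_scorer():
    class _MockDetector:
        contamination_ = 0.1

        def predict(self, X):
            # Label every sample as an inlier (1) for demonstration.
            return np.ones(len(X), dtype=int)

    X = np.zeros((100, 2))
    scorer = LeeLiuScorer()

    # With all samples treated as positive and all predicted positive,
    # recall is 1, so the metric is 1 ** 2 / (1 - 0.1).
    return scorer(_MockDetector(), X)
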
class NegativeMVAUCScorer:
    """Negative MV AUC scorer.

    Parameters
    ----------
    data_max : array-like of shape (n_features,)
        Per-feature maximum seen in the data.

    data_min : array-like of shape (n_features,)
        Per-feature minimum seen in the data.

    interval : tuple, default (0.9, 0.999)
        Interval of probabilities.

    n_offsets : int, default 1000
        Number of offsets.

    n_uniform_samples : int, default 1000
        Number of samples drawn from the uniform distribution over the
        hypercube enclosing the data.

    random_state : int or RandomState instance, default None
        Seed of the pseudo random number generator.

    References
    ----------
    .. [#goix16] Goix, N.,
        "How to evaluate the quality of unsupervised anomaly detection
        algorithms?"
        In ICML Anomaly Detection Workshop, 2016.

    """

    def __init__(
        self, data_max, data_min, interval=(0.9, 0.999),
        n_offsets=1000, n_uniform_samples=1000, random_state=None
    ):
        self.data_max = data_max
        self.data_min = data_min
        self.interval = interval
        self.n_offsets = n_offsets
        self.n_uniform_samples = n_uniform_samples
        self.random_state = random_state
        self.internal_state = check_random_state(random_state).get_state()

    def __call__(self, det, X, y=None):
        """Compute the opposite of the area under the Mass-Volume (MV) curve.

        Parameters
        ----------
        det : object
            Detector.

        X : array-like of shape (n_samples, n_features)
            Data.

        y : ignored

        Returns
        -------
        score : float
            Opposite of the area under the MV curve.
        """

        # Restore the captured state so that repeated calls draw the same
        # uniform samples.
        rnd = np.random.RandomState()
        rnd.set_state(self.internal_state)

        U = rnd.uniform(
            low=self.data_min,
            high=self.data_max,
            size=(self.n_uniform_samples, det.n_features_)
        )

        score_samples = det.score_samples(X)
        score_uniform_samples = det.score_samples(U)
        mass, volume, _ = self._mv_curve(
            score_samples, score_uniform_samples
        )
        is_in_range = \
            (self.interval[0] <= mass) & (mass <= self.interval[1])

        # ``mass`` is monotonically increasing, so no reordering is needed
        # before computing the area under the curve.
        return -auc(mass[is_in_range], volume[is_in_range])

    def _mv_curve(self, score_samples, score_uniform_samples):
        """Compute mass-volume pairs for different offsets.

        Parameters
        ----------
        score_samples : array-like of shape (n_samples,)
            Opposite of the anomaly score for each sample.

        score_uniform_samples : array-like of shape (n_uniform_samples,)
            Opposite of the anomaly score for each sample drawn from the
            uniform distribution over the hypercube enclosing the data.

        Returns
        -------
        mass : array-like of shape (n_offsets,)

        volume : array-like of shape (n_offsets,)

        offsets : array-like of shape (n_offsets,)
        """

        data_volume = np.prod(self.data_max - self.data_min)
        mass = np.linspace(0., 1., self.n_offsets)

        # The offset achieving mass m is the (1 - m)-quantile of the scores
        # on the data.
        offsets = np.percentile(score_samples, 100. * (1. - mass))
        volume = np.vectorize(
            _lebesgue_measure, excluded=[0, 2]
        )(score_uniform_samples, offsets, data_volume)

        return mass, volume, offsets
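
# A minimal usage sketch (illustrative only; ``_MockMVDetector`` is a
# hypothetical stand-in for a fitted kenchi detector exposing
# ``score_samples`` and ``n_features_``, not part of this module).
def _demo_negative_mv_auc_scorer():
    class _MockMVDetector:
        n_features_ = 2

        def score_samples(self, X):
            # Opposite of the anomaly score: points far from the origin
            # are more anomalous, so they receive lower values.
            return -np.linalg.norm(X, axis=1)

    rnd = np.random.RandomState(0)
    X = rnd.randn(1000, 2)
    scorer = NegativeMVAUCScorer(
        data_max=X.max(axis=0), data_min=X.min(axis=0), random_state=0
    )

    # Closer to zero is better: a sharper detector encloses the same mass
    # in a smaller volume, shrinking the area under the MV curve.
    return scorer(_MockMVDetector(), X)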