Source code for kenchi.pipeline

from sklearn.externals.joblib import dump
from sklearn.pipeline import _name_estimators, Pipeline as _Pipeline
from sklearn.utils.metaestimators import if_delegate_has_method

__all__ = ['make_pipeline', 'Pipeline']


[docs]def make_pipeline(*steps):
    """Construct a Pipeline from the given estimators. This is a shorthand for
    the Pipeline constructor; it does not require, and does not permit, naming
    the estimators. Instead, their names will be set to the lowercase of their
    types automatically.

    Parameters
    ----------
    *steps : list
        List of estimators.

    Returns
    -------
    p : Pipeline

    Examples
    --------
    >>> from kenchi.outlier_detection import MiniBatchKMeans
    >>> from kenchi.pipeline import make_pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> scaler = StandardScaler()
    >>> det = MiniBatchKMeans()
    >>> pipeline = make_pipeline(scaler, det)
    """

    return Pipeline(_name_estimators(steps))


[docs]class Pipeline(_Pipeline):
    """Pipeline of transforms with a final estimator.

    Parameters
    ----------
    steps : list
        List of (name, transform) tuples (implementing fit/transform) that are
        chained, in the order in which they are chained, with the last object
        an estimator.

    memory : instance of joblib.Memory or string, default None
        Used to cache the fitted transformers of the pipeline. By default, no
        caching is performed. If a string is given, it is the path to the
        caching directory. Enabling caching triggers a clone of the
        transformers before fitting. Therefore, the transformer instance given
        to the pipeline cannot be inspected directly. Use the attribute
        ``named_steps`` or ``steps`` to inspect estimators within the pipeline.
        Caching the transformers is advantageous when fitting is time
        consuming.

    Attributes
    ----------
    named_steps : dict
        Read-only attribute to access any step parameter by user given name.
        Keys are step names and values are steps parameters.

    Examples
    --------
    >>> import numpy as np
    >>> from kenchi.outlier_detection import MiniBatchKMeans
    >>> from kenchi.pipeline import Pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> X = np.array([
    ...     [0., 0.], [1., 1.], [2., 0.], [3., -1.], [4., 0.],
    ...     [5., 1.], [6., 0.], [7., -1.], [8., 0.], [1000., 1.]
    ... ])
    >>> det = MiniBatchKMeans(n_clusters=1, random_state=0)
    >>> scaler = StandardScaler()
    >>> pipeline = Pipeline([('scaler', scaler), ('det', det)])
    >>> pipeline.fit_predict(X)
    array([ 1,  1,  1,  1,  1,  1,  1,  1,  1, -1])
    """

    def __len__(self):
        return len(self.named_steps)

    def __getitem__(self, key):
        return self.named_steps[key]

    def __iter__(self):
        return iter(self.named_steps)

    def _pre_transform(self, X):
        if X is None:
            return X

        for _, transform in self.steps[:-1]:
            if transform is not None:
                X = transform.transform(X)

        return X

[docs]    @if_delegate_has_method(delegate='_final_estimator')
    def score_samples(self, X=None):
        """Apply transforms, and compute the opposite of the anomaly score for
        each sample with the final estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features), default None
            Data. If None, compute the opposite of the anomaly score for each
            training sample.

        Returns
        -------
        score_samples : array-like of shape (n_samples,)
            Opposite of the anomaly score for each sample.
        """

        return -self.anomaly_score(X)

[docs]    @if_delegate_has_method(delegate='_final_estimator')
    def anomaly_score(self, X=None, **kwargs):
        """Apply transforms, and compute the anomaly score for each sample with
        the final estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data. If None, compute the anomaly score for each training samples.

        normalize : bool, default False
            If True, return the normalized anomaly score.

        Returns
        -------
        anomaly_score : array-like of shape (n_samples,)
            Anomaly score for each sample.
        """

        X = self._pre_transform(X)

        return self._final_estimator.anomaly_score(X, **kwargs)

[docs]    @if_delegate_has_method(delegate='_final_estimator')
    def featurewise_anomaly_score(self, X):
        """Apply transforms, and compute the feature-wise anomaly scores for
        each sample with the final estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data.

        Returns
        -------
        anomaly_score : array-like of shape (n_samples, n_features)
            Feature-wise anomaly scores for each sample.
        """

        X = self._pre_transform(X)

        return self._final_estimator.featurewise_anomaly_score(X)

[docs]    def to_pickle(self, filename, **kwargs):
        """Persist a pipeline object.

        Parameters
        ----------
        filename : str or pathlib.Path
            Path of the file in which it is to be stored.

        kwargs : dict
            Other keywords passed to ``sklearn.externals.joblib.dump``.

        Returns
        -------
        filenames : list
            List of file names in which the data is stored.
        """

        return dump(self, filename, **kwargs)

[docs]    @if_delegate_has_method(delegate='_final_estimator')
    def plot_anomaly_score(self, X=None, **kwargs):
        """Apply transoforms, and plot the anomaly score for each sample with
        the final estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features), default None
            Data. If None, plot the anomaly score for each training samples.

        normalize : bool, default False
            If True, plot the normalized anomaly score.

        ax : matplotlib Axes, default None
            Target axes instance.

        bins : int, str or array-like, default 'auto'
            Number of hist bins.

        figsize : tuple, default None
            Tuple denoting figure size of the plot.

        filename : str, default None
            If provided, save the current figure.

        hist : bool, default True
            If True, plot a histogram of anomaly scores.

        kde : bool, default True
            If True, plot a gaussian kernel density estimate.

        title : string, default None
            Axes title. To disable, pass None.

        xlabel : string, default 'Samples'
            X axis title label. To disable, pass None.

        xlim : tuple, default None
            Tuple passed to ``ax.xlim``.

        ylabel : string, default 'Anomaly score'
            Y axis title label. To disable, pass None.

        ylim : tuple, default None
            Tuple passed to ``ax.ylim``.

        **kwargs : dict
            Other keywords passed to ``ax.plot``.

        Returns
        -------
        ax : matplotlib Axes
            Axes on which the plot was drawn.
        """

        X = self._pre_transform(X)

        return self._final_estimator.plot_anomaly_score(X, **kwargs)

[docs]    @if_delegate_has_method(delegate='_final_estimator')
    def plot_roc_curve(self, X, y, **kwargs):
        """Apply transoforms, and plot the Receiver Operating Characteristic
        (ROC) curve with the final estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data.

        y : array-like of shape (n_samples,)
            Labels.

        ax : matplotlib Axes, default None
            Target axes instance.

        figsize: tuple, default None
            Tuple denoting figure size of the plot.

        filename : str, default None
            If provided, save the current figure.

        title : string, default 'ROC curve'
            Axes title. To disable, pass None.

        xlabel : string, default 'FPR'
            X axis title label. To disable, pass None.

        ylabel : string, default 'TPR'
            Y axis title label. To disable, pass None.

        **kwargs : dict
            Other keywords passed to ``ax.plot``.

        Returns
        -------
        ax : matplotlib Axes
            Axes on which the plot was drawn.
        """

        X = self._pre_transform(X)

        return self._final_estimator.plot_roc_curve(X, y, **kwargs)

    @property
    def plot_graphical_model(self):
        """Apply transforms, and plot the Gaussian Graphical Model (GGM) with
        the final estimator.

        Parameters
        ----------
        ax : matplotlib Axes, default None
            Target axes instance.

        figsize : tuple, default None
            Tuple denoting figure size of the plot.

        filename : str, default None
            If provided, save the current figure.

        random_state : int, RandomState instance, default None
            Seed of the pseudo random number generator.

        title : string, default 'GGM (n_clusters, n_features, n_isolates)'
            Axes title. To disable, pass None.

        **kwargs : dict
            Other keywords passed to ``nx.draw_networkx``.

        Returns
        -------
        ax : matplotlib Axes
            Axes on which the plot was drawn.
        """

        return self._final_estimator.plot_graphical_model

    @property
    def plot_partial_corrcoef(self):
        """Apply transforms, and plot the partial correlation coefficient
        matrix with the final estimator.

        Parameters
        ----------
        ax : matplotlib Axes, default None
            Target axes instance.

        cbar : bool, default True.
            If True, draw a colorbar.

        figsize : tuple, default None
            Tuple denoting figure size of the plot.

        filename : str, default None
            If provided, save the current figure.

        title : string, default 'Partial correlation'
            Axes title. To disable, pass None.

        **kwargs : dict
            Other keywords passed to ``ax.pcolormesh``.

        Returns
        -------
        ax : matplotlib Axes
            Axes on which the plot was drawn.
        """

        return self._final_estimator.plot_partial_corrcoef