# Copyright (C) 2017-2022 Cleanlab Inc.
# This file is part of cleanlab.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cleanlab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with cleanlab. If not, see <https://www.gnu.org/licenses/>.
# Baseline methods
#
# Contains baseline methods for estimating label errors.
#
# These methods ONLY work for single-label (not multi-label) data.
from __future__ import (
print_function, absolute_import, division, unicode_literals, with_statement
)
from sklearn.metrics import confusion_matrix
from cleanlab.pruning import get_noise_indices
from cleanlab.latent_estimation import calibrate_confident_joint
import numpy as np


def baseline_argmax(psx, s):
    '''This is the simplest baseline approach. Just consider
    anywhere argmax != s as a label error.

    Parameters
    ----------
    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k P(s=k|x).
        psx should have been computed using 3 (or higher) fold cross-validation.

    Returns
    -------
    A boolean mask that is true if the example belonging to that index is a
    label error.'''

    return np.argmax(psx, axis=1) != np.asarray(s)
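
# A minimal usage sketch (toy values invented for illustration, not taken from
# the library's tests): given out-of-sample predicted probabilities psx and
# noisy labels s, baseline_argmax flags every example whose argmax prediction
# disagrees with its given label.
#
# >>> import numpy as np
# >>> psx = np.array([[0.9, 0.1],   # argmax 0, labeled 0 -> not flagged
# ...                 [0.2, 0.8],   # argmax 1, labeled 0 -> flagged
# ...                 [0.6, 0.4]])  # argmax 0, labeled 1 -> flagged
# >>> s = np.array([0, 0, 1])
# >>> baseline_argmax(psx, s)
# array([False,  True,  True])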


def baseline_argmax_confusion_matrix(
    psx,
    s,
    calibrate=False,
    prune_method='prune_by_noise_rate',
):
    '''This is a baseline approach that uses the confusion matrix
    of argmax(psx) and s as the confident joint and then uses cleanlab
    (confident learning) to find the label errors using this matrix.

    Parameters
    ----------
    s : np.array
        A discrete vector of noisy labels, i.e. some labels may be erroneous.

    psx : np.array (shape (N, K))
        P(label=k|x) is a matrix with K (noisy) probabilities for each of the
        N examples x. This is the probability distribution over all K classes,
        for each example, regarding whether the example has label s==k P(s=k|x).
        psx should have been computed using 3 (or higher) fold cross-validation.

    calibrate : bool
        If True, calibrate the confusion-matrix-based confident joint with
        calibrate_confident_joint before finding label errors.

    prune_method : str
        Passed through to cleanlab.pruning.get_noise_indices.

    Returns
    -------
    A boolean mask that is true if the example belonging to that index is a
    label error.'''

    confident_joint = confusion_matrix(np.argmax(psx, axis=1), s).T
    if calibrate:
        confident_joint = calibrate_confident_joint(confident_joint, s)
    return get_noise_indices(
        s=s,
        psx=psx,
        confident_joint=confident_joint,
        prune_method=prune_method,
    )
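
# A minimal sketch of how the confident joint is built here (toy values for
# illustration only). The confusion matrix of argmax(psx) vs. s is transposed
# so that rows index the noisy label s and columns index the argmax prediction,
# and that matrix is then handed to get_noise_indices as the confident joint.
#
# >>> import numpy as np
# >>> from sklearn.metrics import confusion_matrix
# >>> psx = np.array([[0.9, 0.1],
# ...                 [0.8, 0.2],
# ...                 [0.4, 0.6],
# ...                 [0.2, 0.8],
# ...                 [0.3, 0.7],
# ...                 [0.7, 0.3]])
# >>> s = np.array([0, 0, 0, 1, 1, 1])
# >>> confusion_matrix(np.argmax(psx, axis=1), s).T
# array([[2, 1],
#        [1, 2]])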


def baseline_argmax_calibrated_confusion_matrix(
    psx,
    s,
    prune_method='prune_by_noise_rate',
):
    '''Same as baseline_argmax_confusion_matrix, except that this method
    calibrates the confident joint created from the confusion matrix before
    using cleanlab to find the label errors.'''

    return baseline_argmax_confusion_matrix(
        s=s,
        psx=psx,
        calibrate=True,
        prune_method=prune_method,
    )
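
# Usage note (a sketch, not part of the library API): the wrapper above is just
# the uncalibrated baseline with calibrate=True, so these two calls are
# equivalent. calibrate_confident_joint rescales the joint so its rows sum to
# the observed counts of each noisy label in s (and the total sums to len(s)).
#
# >>> errors = baseline_argmax_calibrated_confusion_matrix(psx, s)
# >>> # ...is equivalent to:
# >>> errors = baseline_argmax_confusion_matrix(psx, s, calibrate=True)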