Skip to content

Util

add_asymmetric_noise

Util function to add asymmetric noise to labels for simulation of noisy label scenarios.

Parameters:

Name Type Description Default
labels Series

Input pandas series with integer values ranging from 0 to n - 1.

required
noise_prob float

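Fraction of labels to corrupt. Exactly floor(len(labels) * noise_prob) + 1 labels, capped at len(labels), are replaced with a different label, so this is a target noise rate rather than an independent per-value probability.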

required
random_state Union[RandomState, None]

Random seed for reproducibility

42

Returns: a tuple (pd.Series, float). The pd.Series is a copy of labels with asymmetric noise added to it. The float is the fraction of positions at which labels and noisy_labels disagree, returned as a parity check.

Source code in dqc/utils/noise_utils.py
def add_asymmetric_noise(
    labels: pd.Series,
    noise_prob: float,
    random_state: Union[int, RandomState, None] = 42,
) -> Tuple[pd.Series, float]:
    """
    Util function to add asymmetric noise to labels
    for simulation of noisy label scenarios.

    ``floor(len(labels) * noise_prob) + 1`` positions (capped at
    ``len(labels)``) are drawn without replacement, and the label at each
    drawn position is replaced by a different label chosen uniformly from
    the other labels present in ``labels``. The input series is not modified.

    Args:
        labels (pd.Series): Input pandas series with integer values
                        ranging from 0 to n - 1. Any index is accepted;
                        rows are addressed by position and the index is
                        carried over unchanged.
        noise_prob (float): Fraction of labels to corrupt (see the sample
                        count formula above).
        random_state (Union[int, RandomState, None]): Seed for reproducibility.
                        An int or None seeds NumPy's global generator;
                        a RandomState instance is used directly and the
                        global generator is left untouched.
    Returns:
        pd.Series: Series with asymmetric noise added to it.
        float: Normalized quantification of pairwise disagreement between `labels` and `noisy_labels` for parity check
               (0.0 for an empty input).
    Raises:
        ValueError: If noise has to be added but `labels` holds fewer than
                    two distinct values, so no replacement label exists.
    """
    # Set seed
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # NOTE: int/None seeds keep seeding the global generator so existing
        # callers get exactly the same random stream as before.
        np.random.seed(random_state)
        rng = np.random

    # Avoid modifying the original data
    noisy_labels = labels.copy()
    num_labels = len(noisy_labels)

    # Nothing to corrupt, and the ratio below would divide by zero
    if num_labels == 0:
        return noisy_labels, 0.0

    # Build a replacement dictionary. Sorting makes the candidate order (and
    # therefore the seeded result) independent of set/hash iteration order.
    try:
        unique_labels = sorted(set(noisy_labels))
    except TypeError:
        # Unorderable label values: fall back to set iteration order
        unique_labels = list(set(noisy_labels))
    replacement_dict = {
        label: [candidate for candidate in unique_labels if candidate != label]
        for label in unique_labels
    }

    # Determine the number of samples to modify based on the noise probability
    num_samples = min(num_labels, int(num_labels * noise_prob + 1))

    if num_samples > 0 and len(unique_labels) < 2:
        raise ValueError(
            "Cannot add noise: `labels` must contain at least two distinct values."
        )

    # Sample random positions from the labels to introduce noise
    target_positions = rng.choice(num_labels, num_samples, replace=False)

    for position in target_positions:
        # Use .iloc: the sampled values are positions, not index labels, so
        # plain `[]` indexing breaks on any non-default index.
        current_label = noisy_labels.iloc[position]
        noisy_labels.iloc[position] = rng.choice(replacement_dict[current_label])

    # Parity check
    num_mismatches = int(
        np.count_nonzero(labels.to_numpy() != noisy_labels.to_numpy())
    )
    observed_noise_ratio = num_mismatches / num_labels

    return noisy_labels, observed_noise_ratio

show_versions

Collect useful debugging information. The information is returned as a dictionary rather than printed.

Returns:

Name Type Description
dict

Dictionary object containing system information

Source code in dqc/version.py
def show_versions():
    """Collect useful debugging information.

    Gathers the operating system type and release, the running Python
    version, and the versions of this package and of the transformers,
    sentence_transformers and scikit-learn packages. Nothing is printed;
    the caller receives the mapping.

    Returns:
        dict: Dictionary object containing system information
    """
    # Host platform details
    info = {}
    info["os_type"] = platform.system()
    info["os_version"] = platform.release()
    info["python_version"] = platform.python_version()

    # Package versions, in the same key order as before
    info["dqc-toolkit"] = __version__
    info["transformers"] = transformers.__version__
    info["sentence_transformers"] = sentence_transformers.__version__
    info["scikit-learn"] = sklearn.__version__

    return info