Source code for pyrit.score.scorer_evaluation.scorer_metrics

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

import json
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Generic, Optional, Type, TypeVar, Union

import numpy as np

from pyrit.common.utils import verify_and_resolve_path

if TYPE_CHECKING:
    from pyrit.models.harm_definition import HarmDefinition
    from pyrit.score.scorer_identifier import ScorerIdentifier

T = TypeVar("T", bound="ScorerMetrics")
M = TypeVar("M", bound="ScorerMetrics")


@dataclass
class ScorerMetrics:
    """
    Base dataclass for storing scorer evaluation metrics.

    This class provides methods for serializing metrics to JSON and loading them from JSON files.

    Args:
        num_responses (int): Total number of responses evaluated.
        num_human_raters (int): Number of human raters who scored the responses.
        num_scorer_trials (int): Number of times the model scorer was run. Defaults to 1.
        dataset_name (str, optional): Name of the dataset used for evaluation.
        dataset_version (str, optional): Version of the dataset for reproducibility.
        trial_scores (np.ndarray, optional): Raw scores from each trial for debugging.
        average_score_time_seconds (float): Average time in seconds to score a single item.
            Defaults to 0.0.
    """

    num_responses: int
    num_human_raters: int
    num_scorer_trials: int = field(default=1, kw_only=True)
    dataset_name: Optional[str] = field(default=None, kw_only=True)
    dataset_version: Optional[str] = field(default=None, kw_only=True)
    trial_scores: Optional[np.ndarray] = field(default=None, kw_only=True)  # type: ignore[type-arg, unused-ignore]
    average_score_time_seconds: float = field(default=0.0, kw_only=True)

    def to_json(self) -> str:
        """
        Convert the metrics to a JSON string.

        Returns:
            str: The JSON string representation of the metrics.
        """
        return json.dumps(asdict(self))

    @classmethod
    def from_json(cls: Type[T], file_path: Union[str, Path]) -> T:
        """
        Load the metrics from a JSON file.

        Args:
            file_path (Union[str, Path]): The path to the JSON file.

        Returns:
            ScorerMetrics: An instance of ScorerMetrics with the loaded data.

        Raises:
            FileNotFoundError: If the specified file does not exist.
        """
        file_path = verify_and_resolve_path(file_path)
        with open(file_path, "r") as f:
            data = json.load(f)
        # Extract metrics from nested structure (always under "metrics" key in evaluation result files)
        metrics_data = data.get("metrics", data)
        # Filter out internal fields that shouldn't be passed to __init__
        # (e.g., _harm_definition_obj is a cached field with init=False)
        filtered_data = {k: v for k, v in metrics_data.items() if not k.startswith("_")}
        return cls(**filtered_data)
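

# Illustrative usage sketch (not part of the original module): round-tripping metrics through
# JSON. The file name below is hypothetical, and this assumes verify_and_resolve_path resolves
# relative paths against the working directory. from_json() accepts either a flat metrics dict
# or an evaluation result file that nests the fields under a "metrics" key.
#
#     from pathlib import Path
#
#     metrics = ScorerMetrics(num_responses=200, num_human_raters=3, num_scorer_trials=2)
#     Path("scorer_metrics.json").write_text(metrics.to_json())
#     loaded = ScorerMetrics.from_json("scorer_metrics.json")
#     assert loaded.num_responses == 200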


@dataclass
class HarmScorerMetrics(ScorerMetrics):
    """
    Metrics for evaluating a harm scorer against a HumanLabeledDataset.

    Args:
        mean_absolute_error (float): The mean absolute error between the model scores and the gold scores.
        mae_standard_error (float): The standard error of the mean absolute error. This can be used to
            calculate a confidence interval for the mean absolute error.
        t_statistic (float): The t-statistic for the one-sample t-test comparing model scores to human
            scores with a null hypothesis that the mean difference is 0. A high positive t-statistic
            (along with a low p-value) indicates that the model scores are typically higher than the
            human scores.
        p_value (float): The p-value for the one-sample t-test above. It represents the probability of
            obtaining a difference in means as extreme as the observed difference, assuming the null
            hypothesis is true.
        krippendorff_alpha_combined (float): Krippendorff's alpha for the reliability data, which includes
            both human and model scores. This measures the agreement between all the human raters and
            model scoring trials and ranges from -1.0 to 1.0, where 1.0 indicates perfect agreement,
            0.0 indicates no agreement, and negative values indicate systematic disagreement.
        harm_category (str, optional): The harm category being evaluated (e.g., "hate_speech", "violence").
        harm_definition (str, optional): Path to the YAML file containing the harm definition (scale
            descriptions). Use get_harm_definition() to load the full HarmDefinition object.
        harm_definition_version (str, optional): Version of the harm definition YAML file that the human
            labels were created against. Used for reproducibility and to ensure scoring criteria
            consistency.
        krippendorff_alpha_humans (float, optional): Krippendorff's alpha for human scores, if there are
            multiple human raters. This measures the agreement between human raters.
        krippendorff_alpha_model (float, optional): Krippendorff's alpha for model scores, if there are
            multiple model scoring trials. This measures the agreement between model scoring trials.
    """

    mean_absolute_error: float
    mae_standard_error: float
    t_statistic: float
    p_value: float
    krippendorff_alpha_combined: float
    harm_category: Optional[str] = field(default=None, kw_only=True)
    harm_definition: Optional[str] = field(default=None, kw_only=True)
    harm_definition_version: Optional[str] = field(default=None, kw_only=True)
    krippendorff_alpha_humans: Optional[float] = None
    krippendorff_alpha_model: Optional[float] = None
    _harm_definition_obj: Optional["HarmDefinition"] = field(default=None, init=False, repr=False)

    def get_harm_definition(self) -> Optional["HarmDefinition"]:
        """
        Load and return the HarmDefinition object for this metrics instance.

        Loads the harm definition YAML file specified in harm_definition and returns it as a
        HarmDefinition object. The result is cached after the first load.

        Returns:
            HarmDefinition: The loaded harm definition object, or None if harm_definition is not set.

        Raises:
            FileNotFoundError: If the harm definition file does not exist.
            ValueError: If the harm definition file is invalid.
        """
        if not self.harm_definition:
            return None
        if self._harm_definition_obj is None:
            from pyrit.models.harm_definition import HarmDefinition

            self._harm_definition_obj = HarmDefinition.from_yaml(self.harm_definition)
        return self._harm_definition_obj
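

# Illustrative usage sketch (not part of the original module): the metric values and the YAML
# path below are hypothetical. get_harm_definition() lazily loads the HarmDefinition from the
# harm_definition path on first call and caches the result.
#
#     harm_metrics = HarmScorerMetrics(
#         num_responses=150,
#         num_human_raters=3,
#         mean_absolute_error=0.42,
#         mae_standard_error=0.05,
#         t_statistic=1.8,
#         p_value=0.07,
#         krippendorff_alpha_combined=0.81,
#         harm_category="hate_speech",
#         harm_definition="path/to/hate_speech.yaml",
#     )
#     definition = harm_metrics.get_harm_definition()  # returns None if harm_definition is unset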


@dataclass
class ObjectiveScorerMetrics(ScorerMetrics):
    """
    Metrics for evaluating an objective scorer against a HumanLabeledDataset.

    Args:
        accuracy (float): The accuracy of the model scores when using the majority vote of human scores
            as the gold label.
        accuracy_standard_error (float): The standard error of the accuracy estimate.
        f1_score (float): The F1 score of the model scores, an indicator of the performance of the LLM
            scorer in its alignment with human scores.
        precision (float): The precision of the model scores, an indicator of the model's accuracy in its
            positive predictions.
        recall (float): The recall of the model scores, an indicator of the model's ability to correctly
            identify positive labels.
        trial_scores (Optional[np.ndarray]): The raw scores from each trial. Shape is
            (num_trials, num_responses). Useful for debugging and analyzing scorer variance.
    """

    accuracy: float
    accuracy_standard_error: float
    f1_score: float
    precision: float
    recall: float
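

# Illustrative usage sketch (not part of the original module): constructing objective-scorer
# metrics directly. In practice these values come out of an evaluation run; the numbers and
# dataset name here are made up.
#
#     objective_metrics = ObjectiveScorerMetrics(
#         num_responses=100,
#         num_human_raters=3,
#         accuracy=0.92,
#         accuracy_standard_error=0.03,
#         f1_score=0.90,
#         precision=0.88,
#         recall=0.93,
#         dataset_name="example_objective_dataset",
#         dataset_version="1.0",
#     )
#     print(objective_metrics.to_json())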


@dataclass
class ScorerMetricsWithIdentity(Generic[M]):
    """
    Wrapper that combines scorer metrics with the scorer's identity information.

    This class provides a clean interface for working with evaluation results, allowing access to both
    the scorer configuration and its performance metrics.

    Generic over the metrics type M, so:
    - ScorerMetricsWithIdentity[ObjectiveScorerMetrics] has metrics: ObjectiveScorerMetrics
    - ScorerMetricsWithIdentity[HarmScorerMetrics] has metrics: HarmScorerMetrics

    Args:
        scorer_identifier (ScorerIdentifier): The scorer's configuration identifier.
        metrics (M): The evaluation metrics (ObjectiveScorerMetrics or HarmScorerMetrics).
    """

    scorer_identifier: "ScorerIdentifier"
    metrics: M

    def __repr__(self) -> str:
        """Return a string representation of the ScorerMetricsWithIdentity."""
        metrics_type = type(self.metrics).__name__
        scorer_type = self.scorer_identifier.type
        return f"ScorerMetricsWithIdentity(scorer={scorer_type}, metrics_type={metrics_type})"
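

# Illustrative usage sketch (not part of the original module): pairing metrics with a scorer's
# identity. Construction of the ScorerIdentifier is omitted because its fields are defined in
# pyrit.score.scorer_identifier; `identifier` and `objective_metrics` below are assumed to
# already exist.
#
#     result: ScorerMetricsWithIdentity[ObjectiveScorerMetrics] = ScorerMetricsWithIdentity(
#         scorer_identifier=identifier,
#         metrics=objective_metrics,
#     )
#     print(result)  # -> ScorerMetricsWithIdentity(scorer=..., metrics_type=ObjectiveScorerMetrics)
#     print(result.metrics.f1_score)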