Source code for pyrit.executor.attack.component.objective_evaluator

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from typing import Optional

from pyrit.models import Score
from pyrit.score import Scorer

logger = logging.getLogger(__name__)


[docs] class ObjectiveEvaluator: """ Evaluates scores from a Scorer to determine objective achievement and provide feedback. This component wraps a Scorer and provides logic for: - Determining if an objective is achieved based on score values and thresholds. - Extracting feedback from score rationales for use in subsequent prompts. - Supporting both true/false and numeric score types. """
[docs] def __init__( self, *, scorer: Scorer, use_score_as_feedback: bool = True, successful_objective_threshold: float = 0.8 ): """ Initialize the objective evaluator. Args: scorer (Scorer): The scorer to use for evaluating responses use_score_as_feedback (bool): Whether to use the score as feedback for future prompts successful_objective_threshold (float): The score threshold to consider an objective successful [0.0 to 1.0, inclusive]. Raises: ValueError: If successful_objective_threshold is not between 0.0 and 1.0 (inclusive). """ self._scorer = scorer self._use_score_as_feedback = use_score_as_feedback if not 0.0 <= successful_objective_threshold <= 1.0: raise ValueError("successful_objective_threshold must be between 0.0 and 1.0 (inclusive)") self._successful_objective_threshold = successful_objective_threshold
[docs] def is_objective_achieved(self, score: Optional[Score] = None) -> bool: """ Determine if the objective is achieved based on score. Args: score (Optional[Score]): The score to check. Returns: bool: True if the objective is achieved, False otherwise. Raises: ValueError: If score value cannot be converted to the expected type. """ if not score: return False score_type = score.score_type score_value = score.get_value() # Handle true_false scores if score_type == "true_false": return bool(score_value) # Handle numeric scores if score_type == "float_scale": return float(score_value) >= self._successful_objective_threshold # For other score types, assume not achieved return False
[docs] def get_feedback(self, score: Score) -> Optional[str]: """ Get feedback from a score for use in future prompts. Args: score (Optional[Score]): The score to get feedback from. Returns: Optional[str]: Feedback string, or None if no suitable feedback exists. """ if not score or not self._use_score_as_feedback: return None return score.score_rationale
@property def scorer_type(self) -> str: """ Get the type of the scorer. Returns: str: The type identifier of the scorer. """ return self._scorer.get_identifier()["__type__"]