Source code for pyrit.score.true_false.true_false_scorer
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import TYPE_CHECKING, Optional
from pyrit.models import Message, Score
from pyrit.score.scorer import Scorer
from pyrit.score.scorer_prompt_validator import ScorerPromptValidator
from pyrit.score.true_false.true_false_score_aggregator import (
TrueFalseAggregatorFunc,
TrueFalseScoreAggregator,
)
if TYPE_CHECKING:
from pyrit.score.scorer_evaluation.scorer_evaluator import ScorerEvalDatasetFiles
from pyrit.score.scorer_evaluation.scorer_metrics import ObjectiveScorerMetrics
class TrueFalseScorer(Scorer):
"""
Base class for scorers that return true/false binary scores.
This scorer evaluates a message and returns a single boolean score indicating whether
the response meets a specific criterion. When a message contains multiple pieces, the
individual piece scores are aggregated with a TrueFalseAggregatorFunc
(default: TrueFalseScoreAggregator.OR).
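
Example:
    A minimal usage sketch. MyTrueFalseScorer is a hypothetical concrete subclass, and
    score_async with an objective keyword is assumed to be the public entry point
    inherited from Scorer; the sketch only illustrates that scoring always yields
    exactly one true/false score:

        scorer = MyTrueFalseScorer(score_aggregator=TrueFalseScoreAggregator.OR)
        scores = await scorer.score_async(message, objective="...")
        assert len(scores) == 1
        assert scores[0].score_value in ("true", "false")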
"""
# Default evaluation configuration - evaluates against all objective CSVs
evaluation_file_mapping: Optional["ScorerEvalDatasetFiles"] = None
def __init__(
self,
*,
validator: ScorerPromptValidator,
score_aggregator: TrueFalseAggregatorFunc = TrueFalseScoreAggregator.OR,
) -> None:
"""
Initialize the TrueFalseScorer.
Args:
validator (ScorerPromptValidator): Custom validator.
score_aggregator (TrueFalseAggregatorFunc): The aggregator function to use.
Defaults to TrueFalseScoreAggregator.OR.
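
Example:
    A sketch of typical subclass wiring. MyTrueFalseScorer is hypothetical and the
    no-argument ScorerPromptValidator() construction is an assumption made for brevity:

        class MyTrueFalseScorer(TrueFalseScorer):
            def __init__(self) -> None:
                super().__init__(
                    validator=ScorerPromptValidator(),
                    score_aggregator=TrueFalseScoreAggregator.OR,
                )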
"""
self._score_aggregator = score_aggregator
# Set default evaluation file mapping if not already set by subclass
if self.evaluation_file_mapping is None:
from pyrit.score.scorer_evaluation.scorer_evaluator import (
ScorerEvalDatasetFiles,
)
self.evaluation_file_mapping = ScorerEvalDatasetFiles(
human_labeled_datasets_files=["objective/*.csv"],
result_file="objective/objective_achieved_metrics.jsonl",
)
super().__init__(validator=validator)
def validate_return_scores(self, scores: list[Score]) -> None:
"""
Validate the scores returned by the scorer.
Args:
scores (list[Score]): The scores to be validated.
Raises:
ValueError: If the number of scores is not exactly one.
ValueError: If the score value is not "true" or "false".
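
Example:
    A sketch of the validation contract, assuming scorer is a concrete TrueFalseScorer
    and score is a Score whose score_value is "true":

        scorer.validate_return_scores([score])         # passes
        scorer.validate_return_scores([])              # raises ValueError (not exactly one score)
        scorer.validate_return_scores([score, score])  # raises ValueError (not exactly one score)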
"""
if len(scores) != 1:
raise ValueError("TrueFalseScorer should return exactly one score.")
if scores[0].score_value.lower() not in ["true", "false"]:
raise ValueError("TrueFalseScorer score value must be True or False.")
def get_scorer_metrics(self) -> Optional["ObjectiveScorerMetrics"]:
"""
Get evaluation metrics for this scorer from the configured evaluation result file.
Returns:
Optional[ObjectiveScorerMetrics]: The metrics for this scorer, or None if evaluation is
not configured or no metrics are found for this scorer's hash.
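
Example:
    A sketch of typical use, assuming evaluation metrics for this scorer's configuration
    hash have already been written to the configured result file under SCORER_EVALS_PATH:

        metrics = scorer.get_scorer_metrics()
        if metrics is None:
            print("No evaluation metrics recorded for this scorer configuration.")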
"""
from pyrit.common.path import SCORER_EVALS_PATH
from pyrit.score.scorer_evaluation.scorer_metrics_io import (
find_objective_metrics_by_hash,
)
if self.evaluation_file_mapping is None:
return None
scorer_hash = self.scorer_identifier.compute_hash()
result_file = SCORER_EVALS_PATH / self.evaluation_file_mapping.result_file
if not result_file.exists():
return None
return find_objective_metrics_by_hash(hash=scorer_hash, file_path=result_file)
async def _score_async(self, message: Message, *, objective: Optional[str] = None) -> list[Score]:
"""
Score the given message asynchronously.
For TrueFalseScorer, multiple piece scores are aggregated into a single true/false score.
Args:
message (Message): The message to score.
objective (Optional[str]): The objective to evaluate against. Defaults to None.
Returns:
list[Score]: A list containing a single true/false Score object.
Raises:
ValueError: If no pieces are scored and a piece ID for the fallback score cannot be determined.
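
Example:
    A sketch of the aggregation behavior under the default OR aggregator (the piece-level
    results are hypothetical): a two-piece message whose pieces score "true" and "false"
    produces a single aggregated Score of "true".

        scores = await scorer._score_async(message, objective=objective)
        assert len(scores) == 1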
"""
# Get individual scores for all supported pieces using base implementation logic
score_list = await super()._score_async(message, objective=objective)
if not score_list:
# If no pieces were scored (e.g., due to role filtering or because every piece was filtered out), return false
# Use the first message piece's ID (or original_prompt_id as fallback)
first_piece = message.message_pieces[0]
piece_id = first_piece.id or first_piece.original_prompt_id
if piece_id is None:
raise ValueError("Cannot create score: message piece has no id or original_prompt_id")
# Determine specific rationale based on message piece status
if first_piece.is_blocked():
rationale = "The request was blocked by the target; returning false."
description = "Blocked response; returning false."
elif first_piece.has_error():
rationale = f"Response had an error: {first_piece.response_error}; returning false."
description = "Error response; returning false."
else:
# This can happen with multi-modal responses when none of the pieces are of a supported type
rationale = "No supported pieces to score after filtering; returning false."
description = "No pieces to score after filtering; returning false."
return_score = Score(
score_value=str(False).lower(),
score_value_description=description,
score_type="true_false",
score_category=None,
score_metadata=None,
score_rationale=rationale,
scorer_class_identifier=self.get_identifier(),
message_piece_id=piece_id,
objective=objective,
)
return [return_score]
# Use score aggregator to combine multiple piece scores into a single score
result = self._score_aggregator(score_list)
# Use the message_piece_id from the first score
return_score = Score(
score_value=str(result.value).lower(),
score_value_description=result.description,
score_type="true_false",
score_category=result.category,
score_metadata=result.metadata,
score_rationale=result.rationale,
scorer_class_identifier=self.get_identifier(),
message_piece_id=score_list[0].message_piece_id,
objective=objective,
)
return [return_score]