# Source code for pyrit.models.score

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Literal, Optional, Union, get_args

if TYPE_CHECKING:
    from pyrit.identifiers.component_identifier import ComponentIdentifier

# Closed set of score-type tags; Score.__init__ raises ValueError for any value outside this Literal.
ScoreType = Literal["true_false", "float_scale", "unknown"]


class Score:
    """
    Represents a normalized score generated by a scorer component.

    The score value is stored as a string and interpreted according to
    ``score_type``: ``"true_false"`` values must be ``"true"``/``"false"``
    (case-insensitive) and ``"float_scale"`` values must parse as a float in
    the closed interval [0, 1]. Use :meth:`get_value` to obtain the parsed value.
    """

    id: uuid.UUID | str

    # The value the scorer ended up with; e.g. True (if true_false) or 0 (if float_scale)
    score_value: str

    # Value that can include a description of the score value
    score_value_description: str

    # The type of the scorer; e.g. "true_false" or "float_scale"
    score_type: ScoreType

    # The harms categories (e.g. ["hate", "violence"]) - can be multiple
    score_category: Optional[list[str]]

    # Extra data the scorer provides around the rationale of the score
    score_rationale: str

    # Custom metadata a scorer might use. This can vary by scorer.
    score_metadata: Optional[dict[str, Union[str, int, float]]]

    # The identifier of the scorer class, including relevant information
    scorer_class_identifier: ComponentIdentifier

    # This is the ID of the MessagePiece that the score is scoring
    # Note a scorer can generate an additional request. This is NOT that, but
    # the ID associated with what we're scoring.
    message_piece_id: uuid.UUID | str

    # Timestamp of when the score was created
    timestamp: datetime

    # The objective based on which the text is scored (the original attacker model's objective).
    objective: Optional[str]

    def __init__(
        self,
        *,
        score_value: str,
        score_value_description: str,
        score_type: ScoreType,
        score_rationale: str,
        message_piece_id: str | uuid.UUID,
        id: Optional[uuid.UUID | str] = None,  # noqa: A002
        score_category: Optional[list[str]] = None,
        score_metadata: Optional[dict[str, Union[str, int, float]]] = None,
        scorer_class_identifier: Union[ComponentIdentifier, dict[str, Any]],
        timestamp: Optional[datetime] = None,
        objective: Optional[str] = None,
    ):
        """
        Initialize a score object.

        Args:
            score_value (str): Normalized score value.
            score_value_description (str): Human-readable score value description.
            score_type (ScoreType): Score type (true_false or float_scale).
            score_rationale (str): Rationale for the score.
            message_piece_id (str | uuid.UUID): ID of the scored message piece.
            id (Optional[uuid.UUID | str]): Optional score ID. A random UUID4 is
                generated when omitted (or falsy).
            score_category (Optional[List[str]]): Optional score categories.
            score_metadata (Optional[Dict[str, Union[str, int, float]]]): Optional metadata.
                Stored as an empty dict when omitted.
            scorer_class_identifier (Union[ComponentIdentifier, Dict[str, Any]]): Scorer
                identifier; passed through ``ComponentIdentifier.normalize``.
            timestamp (Optional[datetime]): Optional creation timestamp. Defaults to the
                current UTC time; a naive timestamp is tagged as UTC.
            objective (Optional[str]): Optional task objective.

        Raises:
            ValueError: If score value or score type is invalid.
        """
        # Import at runtime to avoid circular import
        from pyrit.identifiers.component_identifier import ComponentIdentifier

        self.id = id if id else uuid.uuid4()

        if timestamp is None:
            self.timestamp = datetime.now(tz=timezone.utc)
        elif timestamp.tzinfo is None:
            # Naive timestamps are labelled as UTC (no clock conversion is applied).
            self.timestamp = timestamp.replace(tzinfo=timezone.utc)
        else:
            self.timestamp = timestamp

        self.validate(score_type, score_value)
        self.score_value = score_value
        self.score_value_description = score_value_description

        if score_type not in get_args(ScoreType):
            raise ValueError(f"Score type {score_type} is not a valid score type.")

        self.score_type = score_type
        self.score_category = score_category
        self.score_rationale = score_rationale
        self.score_metadata = score_metadata or {}
        self.message_piece_id = message_piece_id
        self.objective = objective

        # Normalize to ComponentIdentifier (handles dict)
        self.scorer_class_identifier = ComponentIdentifier.normalize(scorer_class_identifier)

    def get_value(self) -> bool | float:
        """
        Return the value of the score based on its type.

        If the score type is "true_false", it returns True if the score value is "true"
        (case-insensitive), otherwise it returns False.

        If the score type is "float_scale", it returns the score value as a float.

        Raises:
            ValueError: If the score type is unknown.

        Returns:
            bool | float: Parsed score value.
        """
        if self.score_type == "true_false":
            # str() mirrors validate(), which also accepts non-string values such as bool.
            return str(self.score_value).lower() == "true"
        if self.score_type == "float_scale":
            return float(self.score_value)

        raise ValueError(f"Unknown scorer type: {self.score_type}")

    def validate(self, scorer_type: str, score_value: str) -> None:
        """
        Validate score value against scorer type constraints.

        Scorer types other than "true_false" and "float_scale" are not checked here.

        Args:
            scorer_type (str): Scorer type to validate against.
            score_value (str): Raw score value.

        Raises:
            ValueError: If value is incompatible with scorer type constraints.
        """
        if scorer_type == "true_false" and str(score_value).lower() not in ["true", "false"]:
            raise ValueError(f"True False scorers must have a score value of 'true' or 'false' not {score_value}")

        if scorer_type == "float_scale":
            try:
                score = float(score_value)
            except ValueError as e:
                raise ValueError(f"Float scale scorers require a numeric score value. Got {score_value}") from e

            # Checked outside the try block so the range error is not swallowed by the
            # handler above and re-reported as a "numeric value" error.
            if not (0 <= score <= 1):
                raise ValueError(f"Float scale scorers must have a score value between 0 and 1. Got {score_value}")

    def to_dict(self) -> dict[str, Any]:
        """
        Convert this score to a dictionary.

        IDs are stringified and the timestamp is rendered in ISO 8601 format.

        Returns:
            Dict[str, Any]: Serialized score payload.
        """
        return {
            "id": str(self.id),
            "score_value": self.score_value,
            "score_value_description": self.score_value_description,
            "score_type": self.score_type,
            "score_category": self.score_category,
            "score_rationale": self.score_rationale,
            "score_metadata": self.score_metadata,
            "scorer_class_identifier": self.scorer_class_identifier.to_dict()
            if self.scorer_class_identifier
            else None,
            "message_piece_id": str(self.message_piece_id),
            "timestamp": self.timestamp.isoformat(),
            "objective": self.objective,
        }

    def __str__(self) -> str:
        """
        Return a concise text representation of this score.

        Returns:
            str: Human-readable score summary.
        """
        category_str = f": {', '.join(self.score_category) if self.score_category else ''}"
        if self.scorer_class_identifier:
            scorer_type = self.scorer_class_identifier.class_name or "Unknown"
            return f"{scorer_type}{category_str}: {self.score_value}"
        return f"{category_str}: {self.score_value}"

    __repr__ = __str__
@dataclass
class UnvalidatedScore:
    """
    Carrier for a raw, not-yet-normalized score.

    Score validates its fields on construction, so scorers need a plain data
    holder for the raw value before it has been normalized and validated.
    Call ``to_score`` with the normalized value and score type to obtain a Score.
    """

    # The raw score value; has no scale. E.g. in likert could be 1-5
    raw_score_value: str

    score_value_description: str
    score_category: Optional[list[str]]
    score_rationale: str
    score_metadata: Optional[dict[str, Union[str, int, float]]]
    scorer_class_identifier: ComponentIdentifier
    message_piece_id: uuid.UUID | str
    objective: Optional[str]
    id: Optional[uuid.UUID | str] = None
    timestamp: Optional[datetime] = None

    def to_score(self, *, score_value: str, score_type: ScoreType) -> Score:
        """
        Build a validated Score from this record.

        All fields except ``raw_score_value`` are copied from this instance; the
        normalized value and the score type are supplied by the caller.

        Args:
            score_value (str): Normalized score value.
            score_type (ScoreType): Score type.

        Returns:
            Score: Validated score object.
        """
        score_kwargs: dict[str, Any] = {
            "id": self.id,
            "score_value": score_value,
            "score_value_description": self.score_value_description,
            "score_type": score_type,
            "score_category": self.score_category,
            "score_rationale": self.score_rationale,
            "score_metadata": self.score_metadata,
            "scorer_class_identifier": self.scorer_class_identifier,
            "message_piece_id": self.message_piece_id,
            "timestamp": self.timestamp,
            "objective": self.objective,
        }
        return Score(**score_kwargs)