# Source code for pyrit.score.self_ask_scale_scorer
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import enum
from pathlib import Path
from typing import Optional
import yaml
from pyrit.common.path import SCALES_PATH
from pyrit.models import PromptRequestPiece, Score, SeedPrompt, UnvalidatedScore
from pyrit.prompt_target import PromptChatTarget
from pyrit.score.scorer import Scorer
# [docs] (link marker left over from the rendered documentation page)
class SelfAskScaleScorer(Scorer):
    """
    A class that represents a "self-ask" score for text scoring for a customizable numeric scale.

    The scale (minimum value, maximum value and category) is read from a YAML file and is also
    used to render the system prompt that is handed to the chat target when scoring.
    """

    class ScalePaths(enum.Enum):
        """Resolved paths of the scale definition YAML files located under ``SCALES_PATH``."""

        TREE_OF_ATTACKS_SCALE = Path(SCALES_PATH, "tree_of_attacks_scale.yaml").resolve()
        TASK_ACHIEVED_SCALE = Path(SCALES_PATH, "task_achieved_scale.yaml").resolve()

    class SystemPaths(enum.Enum):
        """Resolved paths of the system prompt YAML templates located under ``SCALES_PATH``."""

        GENERAL_SYSTEM_PROMPT = Path(SCALES_PATH, "general_system_prompt.yaml").resolve()
        RED_TEAMER_SYSTEM_PROMPT = Path(SCALES_PATH, "red_teamer_system_prompt.yaml").resolve()

    def __init__(
        self,
        *,
        chat_target: PromptChatTarget,
        scale_arguments_path: Optional[Path] = None,
        system_prompt_path: Optional[Path] = None,
    ) -> None:
        """
        Initializes the scorer from a scale definition and a system prompt template.

        Args:
            chat_target (PromptChatTarget): The chat target that is asked to produce the score.
            scale_arguments_path (Optional[Path]): Path to a YAML file providing ``minimum_value``,
                ``maximum_value`` and ``category``. When omitted (or falsy),
                ``ScalePaths.TREE_OF_ATTACKS_SCALE`` is used.
            system_prompt_path (Optional[Path]): Path to the YAML system prompt template. When
                omitted (or falsy), ``SystemPaths.GENERAL_SYSTEM_PROMPT`` is used.

        Raises:
            ValueError: If the loaded scale definition is not a mapping, lacks a required key,
                or contains invalid values.
        """
        self._prompt_target = chat_target
        self.scorer_type = "float_scale"

        # Both paths are optional; fall back to the defaults declared on the nested enums.
        if not system_prompt_path:
            system_prompt_path = self.SystemPaths.GENERAL_SYSTEM_PROMPT.value
        if not scale_arguments_path:
            scale_arguments_path = self.ScalePaths.TREE_OF_ATTACKS_SCALE.value

        scale_args = yaml.safe_load(scale_arguments_path.read_text(encoding="utf-8"))
        self._validate_scale_arguments_set(scale_args)

        self._minimum_value = scale_args["minimum_value"]
        self._maximum_value = scale_args["maximum_value"]
        self._category = scale_args["category"]

        # The complete scale definition is passed to the template as rendering parameters.
        scoring_instructions_template = SeedPrompt.from_yaml_file(system_prompt_path)
        self._system_prompt = scoring_instructions_template.render_template_value(**scale_args)

    async def score_async(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> list[Score]:
        """
        Scores the given request_response using "self-ask" for the chat target and adds score to memory.

        Args:
            request_response (PromptRequestPiece): The prompt request piece containing the text to be scored.
            task (Optional[str]): The task based on which the text should be scored (the original
                attacker model's objective). Although the parameter defaults to None, a non-empty
                task is required; validation rejects a missing one.

        Returns:
            list[Score]: The request_response scored.
                The score_value is a value from [0,1] that is scaled based on the scorer's scale.

        Raises:
            ValueError: If the original value data type of request_response is not text, or if
                no task is provided.
        """
        self.validate(request_response, task=task)

        scoring_prompt = f"task: {task}\nresponse: {request_response.converted_value}"

        unvalidated_score: UnvalidatedScore = await self._score_value_with_llm(
            prompt_target=self._prompt_target,
            system_prompt=self._system_prompt,
            prompt_request_value=scoring_prompt,
            prompt_request_data_type=request_response.converted_value_data_type,
            scored_prompt_id=request_response.id,
            category=self._category,
            task=task,
        )

        # The raw score is parsed as a float and rescaled using this scorer's scale bounds;
        # the final score value is stored as a string.
        score = unvalidated_score.to_score(
            score_value=str(
                self.scale_value_float(
                    float(unvalidated_score.raw_score_value), self._minimum_value, self._maximum_value
                )
            )
        )

        self._memory.add_scores_to_memory(scores=[score])
        return [score]

    def validate(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> None:
        """
        Checks that the request piece and the task are acceptable for this scorer.

        Args:
            request_response (PromptRequestPiece): The prompt request piece to be scored.
            task (Optional[str]): The task the response is scored against.

        Raises:
            ValueError: If the original value data type is not "text", or if task is None or empty.
        """
        if request_response.original_value_data_type != "text":
            raise ValueError("The original value data type must be text.")
        if not task:
            raise ValueError("Task must be provided.")

    def _validate_scale_arguments_set(self, scale_args: dict) -> None:
        """
        Checks that a loaded scale definition provides everything the scorer needs.

        Args:
            scale_args (dict): The scale definition, as loaded from the scale YAML file.

        Raises:
            ValueError: If scale_args is not a mapping, if ``minimum_value``, ``maximum_value``
                or ``category`` is missing, if either bound is not an integer, if the minimum
                exceeds the maximum, or if the category is empty.
        """
        # An empty YAML document loads as None and a scalar/list document is not a mapping either;
        # report that explicitly instead of failing with a TypeError on the subscripts below.
        if not isinstance(scale_args, dict):
            raise ValueError(f"Scale arguments must be a mapping, got {type(scale_args).__name__}.")

        try:
            minimum_value = scale_args["minimum_value"]
            maximum_value = scale_args["maximum_value"]
            category = scale_args["category"]
        except KeyError as e:
            # Suppress the KeyError context; the message already names the missing key.
            raise ValueError(f"Missing key in scale_args: {e.args[0]}") from None

        if not isinstance(minimum_value, int):
            raise ValueError(f"Minimum value must be an integer, got {type(minimum_value).__name__}.")
        if not isinstance(maximum_value, int):
            raise ValueError(f"Maximum value must be an integer, got {type(maximum_value).__name__}.")
        if minimum_value > maximum_value:
            raise ValueError("Minimum value must be less than or equal to the maximum value.")
        if not category:
            raise ValueError("Category must be set and cannot be empty.")