Source code for pyrit.score.self_ask_likert_scorer
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import enum
from pathlib import Path
from typing import Dict, Optional
import yaml
from pyrit.common.path import LIKERT_SCALES_PATH
from pyrit.models import PromptRequestPiece, Score, SeedPrompt, UnvalidatedScore
from pyrit.prompt_target import PromptChatTarget
from pyrit.score.scorer import Scorer
[docs]
class LikertScalePaths(enum.Enum):
    """
    Resolved file paths of the Likert scale YAML definitions found under LIKERT_SCALES_PATH.

    Each member's value is the absolute path to one scale definition file.
    """

    # Member order is preserved: it determines the iteration order of the enum.
    CYBER_SCALE = Path(LIKERT_SCALES_PATH).joinpath("cyber.yaml").resolve()
    FAIRNESS_BIAS_SCALE = Path(LIKERT_SCALES_PATH).joinpath("fairness_bias.yaml").resolve()
    HARM_SCALE = Path(LIKERT_SCALES_PATH).joinpath("harm.yaml").resolve()
    HATE_SPEECH_SCALE = Path(LIKERT_SCALES_PATH).joinpath("hate_speech.yaml").resolve()
    MISINFORMATION_SCALE = Path(LIKERT_SCALES_PATH).joinpath("misinformation.yaml").resolve()
    PERSUASION_SCALE = Path(LIKERT_SCALES_PATH).joinpath("persuasion.yaml").resolve()
    PHISHING_SCALE = Path(LIKERT_SCALES_PATH).joinpath("phishing.yaml").resolve()
    SEXUAL_SCALE = Path(LIKERT_SCALES_PATH).joinpath("sexual.yaml").resolve()
    VIOLENCE_SCALE = Path(LIKERT_SCALES_PATH).joinpath("violence.yaml").resolve()
    DECEPTION_SCALE = Path(LIKERT_SCALES_PATH).joinpath("deception.yaml").resolve()
[docs]
class SelfAskLikertScorer(Scorer):
    """
    A class that represents a "self-ask" score for text scoring for a likert scale.

    The scale is read from a YAML file that provides a ``category`` and a list of
    ``scale_descriptions`` (each with a ``score_value`` and a ``description``). The
    descriptions are rendered into a system prompt that is sent to the chat target.
    """

    # Inclusive bounds of an accepted Likert value. Used both when validating the
    # YAML scale and when scaling the raw score, so the two cannot drift apart.
    _MIN_LIKERT_VALUE = 1
    _MAX_LIKERT_VALUE = 5

    def __init__(self, chat_target: PromptChatTarget, likert_scale_path: Path) -> None:
        """
        Loads the Likert scale definition and builds the scoring system prompt.

        Args:
            chat_target (PromptChatTarget): The chat target used to produce the score.
            likert_scale_path (Path): Path to the YAML file describing the Likert scale.

        Raises:
            ValueError: If the YAML file has no category, has no scale descriptions,
                or contains a scale value outside of the 1 to 5 range.
        """
        self._prompt_target = chat_target
        self.scorer_type = "float_scale"

        likert_scale = yaml.safe_load(likert_scale_path.read_text(encoding="utf-8"))

        # An empty or non-mapping YAML document cannot carry a category; report it the
        # same way as a missing/empty category instead of leaking a KeyError/TypeError.
        category = likert_scale.get("category") if isinstance(likert_scale, dict) else None
        if not category:
            raise ValueError(f"Improperly formatted likert scale yaml file. Missing category in {likert_scale_path}.")
        self._score_category = category

        # A missing "scale_descriptions" key is handled by the helper's emptiness check.
        scale_text = self._likert_scale_description_to_string(likert_scale.get("scale_descriptions", []))

        scoring_instructions_template = SeedPrompt.from_yaml_file(LIKERT_SCALES_PATH / "likert_system_prompt.yaml")
        self._system_prompt = scoring_instructions_template.render_template_value(
            likert_scale=scale_text, category=self._score_category
        )

    def _likert_scale_description_to_string(self, descriptions: list[Dict[str, str]]) -> str:
        """
        Converts the Likert scales to a string representation to be put in a system prompt.

        Args:
            descriptions: list[Dict[str, str]]: The Likert scale to use. Each entry must
                provide a "score_value" and a "description".

        Returns:
            str: The string representation of the Likert scale, one "'<value>': <description>"
                line per entry, each terminated by a newline.

        Raises:
            ValueError: If no descriptions are provided or a score value is not between 1 and 5.
        """
        if not descriptions:
            raise ValueError("Impropoerly formated Likert scale yaml file. No likert scale_descriptions provided")

        rendered_lines = []
        for description in descriptions:
            name = description["score_value"]
            desc = description["description"]

            # The raw score is later scaled from the 1-5 range, so 0 must be rejected as well.
            if not self._MIN_LIKERT_VALUE <= int(name) <= self._MAX_LIKERT_VALUE:
                raise ValueError(
                    "Impropoerly formated Likert scale yaml file. Likert scale values must be between 1 and 5"
                )

            rendered_lines.append(f"'{name}': {desc}\n")

        return "".join(rendered_lines)

    async def score_async(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> list[Score]:
        """
        Scores the given request_response using "self-ask" for the chat target and adds score to memory.

        Args:
            request_response (PromptRequestPiece): The prompt request piece containing the text to be scored.
            task (str): The task based on which the text should be scored (the original attacker model's objective).
                Currently not supported for this scorer.

        Returns:
            list[Score]: The request_response scored.
                The category is configured from the likert_scale.
                The score_value is a value from [0,1] that is scaled from the likert scale.
        """
        self.validate(request_response, task=task)

        unvalidated_score: UnvalidatedScore = await self._score_value_with_llm(
            prompt_target=self._prompt_target,
            system_prompt=self._system_prompt,
            prompt_request_value=request_response.converted_value,
            prompt_request_data_type=request_response.converted_value_data_type,
            scored_prompt_id=request_response.id,
            category=self._score_category,
            task=task,
        )

        # Scale the raw Likert value using the same bounds enforced on the YAML scale.
        scaled_value = self.scale_value_float(
            float(unvalidated_score.raw_score_value), self._MIN_LIKERT_VALUE, self._MAX_LIKERT_VALUE
        )
        score = unvalidated_score.to_score(
            score_value=str(scaled_value),
        )

        # Keep the original (unscaled) Likert value alongside the scaled score.
        score.score_metadata = str({"likert_value": str(unvalidated_score.raw_score_value)})

        self._memory.add_scores_to_memory(scores=[score])
        return [score]

    def validate(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> None:
        """
        Performs no validation; every request_response and task is accepted as-is.

        Args:
            request_response (PromptRequestPiece): The prompt request piece that would be scored.
            task (str): The task based on which the text should be scored. Unused.
        """
        pass