Source code for pyrit.score.look_back_scorer

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import re
from pathlib import Path
from typing import Optional

import yaml

from pyrit.common.path import DATASETS_PATH
from pyrit.models import PromptRequestPiece, SeedPrompt
from pyrit.models.score import Score, UnvalidatedScore
from pyrit.prompt_target import PromptChatTarget
from pyrit.score.scorer import Scorer


class LookBackScorer(Scorer):
    """
    Creates a score by analyzing the entire conversation and adds it to the database.

    Parameters:
        chat_target (PromptChatTarget): The chat target to use for scoring.
        exclude_instruction_prompts (bool): If True, excludes instruction prompts from the conversation.
            Must be passed as a keyword argument.
    """
    def __init__(self, chat_target: PromptChatTarget, *, exclude_instruction_prompts: bool) -> None:
        self._prompt_target = chat_target
        self.scorer_type = "float_scale"
        self.exclude_instruction_prompts = exclude_instruction_prompts

        behavior_change_prompt_path = Path(
            DATASETS_PATH, "score", "scales", "behavior_change_system_prompt.yaml"
        ).resolve()
        behavior_change_scale_path = Path(DATASETS_PATH, "score", "likert_scales", "behavior_change.yaml")
        behavior_change_scale = yaml.safe_load(behavior_change_scale_path.read_text(encoding="utf-8"))

        scoring_instructions_template = SeedPrompt.from_yaml_file(behavior_change_prompt_path)
        self._system_prompt = scoring_instructions_template.render_template_value(
            step_description=behavior_change_scale
        )
    async def score_async(self, request_piece: PromptRequestPiece, *, task: Optional[str] = None) -> list[Score]:
        """
        Scores the entire conversation based on detected behavior change.

        Args:
            request_piece (PromptRequestPiece): A piece of the conversation to be scored.
                The conversation ID is used to retrieve the full conversation from memory.
            task (str): The task based on which the text should be scored (the original attacker model's
                objective). Currently not supported for this scorer.

        Returns:
            list[Score]: The detected amount of behavior change throughout the conversation.
        """

        # Retrieve the full conversation from memory using the provided PromptRequestPiece
        conversation = self._memory.get_conversation(conversation_id=request_piece.conversation_id)

        if not conversation:
            raise ValueError(f"Conversation with ID {request_piece.conversation_id} not found in memory.")

        # Loop through each request to create a single string containing the conversation text
        conversation_as_text = ""
        for request in conversation:
            # Skip instruction prompts if the exclude_instruction_prompts flag is set
            if self.exclude_instruction_prompts:
                if re.search(r"#\s*Instructions", request.request_pieces[0].original_value, re.IGNORECASE):
                    continue

            # Append only user and assistant messages to the conversation string
            if request.request_pieces[0].role in ["user", "assistant"]:
                conversation_as_text += (
                    f"{request.request_pieces[0].role}: {request.request_pieces[0].original_value}\n"
                )

        # Send the full conversation to the LLM for scoring
        unvalidated_score: UnvalidatedScore = await self._score_value_with_llm(
            prompt_target=self._prompt_target,
            system_prompt=self._system_prompt,
            prompt_request_value=conversation_as_text,
            prompt_request_data_type="text",
            scored_prompt_id=conversation[0].request_pieces[0].original_prompt_id,
            category="illegal",
            orchestrator_identifier=conversation[0].request_pieces[0].orchestrator_identifier,
        )

        score = unvalidated_score.to_score(score_value=unvalidated_score.raw_score_value)
        self._memory.add_scores_to_memory(scores=[score])

        # Output score results
        print(
            "LookBackScorer:",
            score.score_value,
            score.score_value_description,
            "Rationale: ",
            score.score_rationale,
        )
        return [score]
    def validate(self, request_response: PromptRequestPiece, *, task: Optional[str] = None):
        pass
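
The listing above is only the scorer definition; the short sketch below shows one way it might be driven. This is an illustrative assumption rather than part of this module: the OpenAIChatTarget, the in-memory initialization helper, and the placeholder conversation ID stand in for whatever chat target, memory backend, and prior orchestrator run an actual pipeline would supply, and initialization entry points can vary between PyRIT versions.

import asyncio

from pyrit.common import IN_MEMORY, initialize_pyrit
from pyrit.memory import CentralMemory
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score import LookBackScorer


async def main():
    # Register a memory backend so the scorer can look up conversations (assumed setup).
    initialize_pyrit(memory_db_type=IN_MEMORY)

    scorer = LookBackScorer(
        chat_target=OpenAIChatTarget(),
        exclude_instruction_prompts=True,
    )

    # Any piece from the target conversation works; score_async only uses its
    # conversation_id to pull the full conversation back out of memory.
    # The conversation ID below is a placeholder for one produced by a prior run.
    memory = CentralMemory.get_memory_instance()
    piece = memory.get_conversation(conversation_id="<conversation-id-from-a-prior-run>")[0].request_pieces[0]

    scores = await scorer.score_async(piece)
    print(scores[0].score_value, scores[0].score_rationale)


asyncio.run(main())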