Source code for pyrit.orchestrator.single_turn.role_play_orchestrator

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import enum
import logging
import pathlib
from typing import Optional

from pyrit.common.path import DATASETS_PATH
from pyrit.models import PromptRequestResponse, SeedPromptDataset
from pyrit.orchestrator import OrchestratorResult, PromptSendingOrchestrator
from pyrit.prompt_converter import LLMGenericTextConverter
from pyrit.prompt_normalizer import PromptConverterConfiguration
from pyrit.prompt_target import PromptChatTarget
from pyrit.score import Scorer

logger = logging.getLogger(__name__)


class RolePlayPaths(enum.Enum):
    VIDEO_GAME = pathlib.Path(DATASETS_PATH) / "orchestrators" / "role_play" / "video_game.yaml"
    MOVIE_SCRIPT = pathlib.Path(DATASETS_PATH) / "orchestrators" / "role_play" / "movie_script.yaml"
    TRIVIA_GAME = pathlib.Path(DATASETS_PATH) / "orchestrators" / "role_play" / "trivia_game.yaml"
    PERSUASION_SCRIPT = pathlib.Path(DATASETS_PATH) / "orchestrators" / "role_play" / "persuasion_script.yaml"


[docs] class RolePlayOrchestrator(PromptSendingOrchestrator): """ This orchestrator implements a role-playing attack where the objective is rephrased into a game or script context. It uses an adversarial chat target to rephrase the objective into a more benign form that fits within the role-play scenario, making it harder for the target to detect the true intent. """
[docs] def __init__( self, objective_target: PromptChatTarget, adversarial_chat: PromptChatTarget, role_play_definition_path: pathlib.Path, request_converter_configurations: Optional[list[PromptConverterConfiguration]] = None, response_converter_configurations: Optional[list[PromptConverterConfiguration]] = None, objective_scorer: Optional[Scorer] = None, auxiliary_scorers: Optional[list[Scorer]] = None, should_convert_prepended_conversation: bool = True, batch_size: int = 10, retries_on_objective_failure: int = 0, verbose: bool = False, ) -> None: """ Args: objective_target (PromptChatTarget): The target for sending prompts. adversarial_chat (PromptChatTarget): The target used to rephrase objectives into role-play scenarios. role_play_definition_path (pathlib.Path): Path to the YAML file containing role-play definitions. request_converter_configurations (list[PromptConverterConfiguration], Optional): List of prompt converters. response_converter_configurations (list[PromptConverterConfiguration], Optional): List of response converters. objective_scorer (Scorer, Optional): Scorer to use for evaluating if the objective was achieved. auxiliary_scorers (list[Scorer], Optional): List of additional scorers to use for each prompt request response. should_convert_prepended_conversation (bool, Optional): Whether to convert the prepended conversation. batch_size (int, Optional): The (max) batch size for sending prompts. Defaults to 10. Note: If providing max requests per minute on the prompt_target, this should be set to 1 to ensure proper rate limit management. retries_on_objective_failure (int, Optional): Number of retries to attempt if objective fails. Defaults to 0. verbose (bool, Optional): Whether to log debug information. Defaults to False. """ self._adversarial_chat = adversarial_chat role_play_definition: SeedPromptDataset = SeedPromptDataset.from_yaml_file(role_play_definition_path) self._rephrase_instructions = role_play_definition.prompts[0] self._user_start_turn = role_play_definition.prompts[1] self._assistant_start_turn = role_play_definition.prompts[2] rephrase_turn_converter = PromptConverterConfiguration.from_converters( converters=[ LLMGenericTextConverter( converter_target=adversarial_chat, user_prompt_template_with_objective=self._rephrase_instructions, ) ] ) super().__init__( objective_target=objective_target, request_converter_configurations=rephrase_turn_converter + (request_converter_configurations or []), response_converter_configurations=response_converter_configurations, objective_scorer=objective_scorer, auxiliary_scorers=auxiliary_scorers, should_convert_prepended_conversation=should_convert_prepended_conversation, batch_size=batch_size, retries_on_objective_failure=retries_on_objective_failure, verbose=verbose, )
[docs] async def run_attack_async( # type: ignore[override] self, *, objective: str, memory_labels: Optional[dict[str, str]] = None, ) -> OrchestratorResult: prepended_conversation = await self._get_conversation_start(objective=objective) return await super().run_attack_async( objective=objective, prepended_conversation=prepended_conversation, memory_labels=memory_labels, )
[docs] async def run_attacks_async( # type: ignore[override] self, *, objectives: list[str], memory_labels: Optional[dict[str, str]] = None, ) -> list[OrchestratorResult]: return await super()._run_attacks_with_only_objectives_async( objectives=objectives, memory_labels=memory_labels, )
async def _get_conversation_start(self, objective: str = None) -> Optional[list[PromptRequestResponse]]: return [ PromptRequestResponse.from_prompt( prompt=self._user_start_turn.value, role="user", ), PromptRequestResponse.from_prompt( prompt=self._assistant_start_turn.value, role="assistant", ), ]