Source code for pyrit.orchestrator.multi_turn.pair_orchestrator
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
from pathlib import Path
from typing import Optional
from pyrit.common.path import DATASETS_PATH
from pyrit.models import SeedPrompt
from pyrit.orchestrator import TreeOfAttacksWithPruningOrchestrator
from pyrit.prompt_converter import PromptConverter
from pyrit.prompt_target import PromptChatTarget
logger = logging.getLogger(__name__)
[docs]
class PAIROrchestrator(TreeOfAttacksWithPruningOrchestrator):
"""
This orchestrator implements the Prompt Automatic Iterative Refinement (PAIR) algorithm
This algorithm was published and described in the paper: Chao, Patrick, et al. Jailbreaking Black Box Large
Language Models in Twenty Queries. arXiv:2310.08419, arXiv, 13 Oct. 2023. arXiv.org, http://arxiv.org/abs/2310.08419
The authors published a reference implementation in the following repository: https://github.com/patrickrchao/\
JailbreakingLLMs/blob/main/system_prompts.py
"""
[docs]
def __init__(
self,
*,
objective_target: PromptChatTarget,
adversarial_chat: PromptChatTarget,
scoring_target: PromptChatTarget,
adversarial_chat_seed_prompt: Optional[SeedPrompt] = None,
adversarial_chat_system_prompt_path: Optional[Path] = None,
depth: int = 3,
prompt_converters: Optional[list[PromptConverter]] = None,
objective_achieved_score_threshold: float = 0.8,
desired_response_prefix="Sure, here is",
verbose: bool = False,
) -> None:
adversarial_chat_system_prompt_path = adversarial_chat_system_prompt_path or Path(
DATASETS_PATH / "orchestrators" / "pair" / "attacker_system_prompt.yaml"
)
super().__init__(
objective_target=objective_target,
adversarial_chat=adversarial_chat,
scoring_target=scoring_target,
adversarial_chat_seed_prompt=adversarial_chat_seed_prompt,
adversarial_chat_system_prompt_path=adversarial_chat_system_prompt_path,
width=1,
depth=depth,
branching_factor=1,
on_topic_checking_enabled=False,
prompt_converters=prompt_converters,
objective_achieved_score_threshold=objective_achieved_score_threshold,
desired_response_prefix=desired_response_prefix,
verbose=verbose,
)
[docs]
def set_prepended_conversation(self, *, prepended_conversation):
return super().set_prepended_conversation(prepended_conversation=prepended_conversation)