# Source code for pyrit.orchestrator.skeleton_key_orchestrator

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from pathlib import Path

from typing import Optional
from uuid import uuid4


from pyrit.common.batch_helper import batch_task_async
from pyrit.models import SeedPromptDataset, PromptRequestResponse
from pyrit.common.path import DATASETS_PATH
from pyrit.orchestrator import Orchestrator
from pyrit.prompt_normalizer import PromptNormalizer
from pyrit.prompt_target import PromptTarget
from pyrit.prompt_converter import PromptConverter
from colorama import Style, Fore

logger = logging.getLogger(__name__)


class SkeletonKeyOrchestrator(Orchestrator):
    """
    Creates an orchestrator that executes a skeleton key jailbreak.

    The orchestrator sends an initial skeleton key prompt to the target, and then follows
    up with a separate attack prompt. If successful, the first prompt makes the target comply
    even with malicious follow-up prompts. In our experiments, using two separate prompts was
    significantly more effective than using a single combined prompt.

    Learn more about the attack at the link below:
    https://www.microsoft.com/en-us/security/blog/2024/06/26/mitigating-skeleton-key-a-new-type-of-generative-ai-jailbreak-technique/
    """

    def __init__(
        self,
        *,
        skeleton_key_prompt: Optional[str] = None,
        prompt_target: PromptTarget,
        prompt_converters: Optional[list[PromptConverter]] = None,
        batch_size: int = 10,
        verbose: bool = False,
    ) -> None:
        """
        Args:
            skeleton_key_prompt (str, Optional): The skeleton key sent to the target. When omitted
                (or empty), the first prompt of the bundled ``skeleton_key.prompt`` dataset is used.
            prompt_target (PromptTarget): The target for sending prompts.
            prompt_converters (list[PromptConverter], Optional): List of prompt converters. These are
                stacked in the order they are provided. E.g. the output of converter1 is the input
                of converter2.
            batch_size (int, Optional): The (max) batch size for sending prompts. Defaults to 10.
                Note: If providing max requests per minute on the prompt_target, this should be set
                to 1 to ensure proper rate limit management.
            verbose (bool, Optional): If set to True, verbose output will be enabled. Defaults to False.
        """
        super().__init__(prompt_converters=prompt_converters, verbose=verbose)

        self._prompt_normalizer = PromptNormalizer()

        # Only touch the dataset file when the caller did not supply a (non-empty) skeleton key.
        self._skeleton_key_prompt = (
            skeleton_key_prompt
            if skeleton_key_prompt
            else SeedPromptDataset.from_yaml_file(
                Path(DATASETS_PATH) / "orchestrators" / "skeleton_key" / "skeleton_key.prompt"
            )
            .prompts[0]
            .value
        )

        self._prompt_target = prompt_target
        self._batch_size = batch_size

    async def _send_in_conversation_async(
        self,
        *,
        prompt_text: str,
        conversation_id: str,
    ) -> PromptRequestResponse:
        """Builds a normalizer request for ``prompt_text`` and sends it to the target in the given conversation."""
        normalizer_request = self._create_normalizer_request(
            prompt_text=prompt_text,
            conversation_id=conversation_id,
            converters=self._prompt_converters,
        )

        return await self._prompt_normalizer.send_prompt_async(
            normalizer_request=normalizer_request,
            target=self._prompt_target,
            labels=self._global_memory_labels,
            orchestrator_identifier=self.get_identifier(),
        )

    async def send_skeleton_key_with_prompt_async(
        self,
        *,
        prompt: str,
    ) -> PromptRequestResponse:
        """
        Sends a skeleton key, followed by the attack prompt to the target.

        Both prompts are sent under the same, freshly generated conversation ID.

        Args:
            prompt (str): The attack prompt to be sent after the skeleton key.

        Returns:
            PromptRequestResponse: The response from the prompt target to the attack prompt.
                The response to the skeleton key itself is not returned.
        """
        conversation_id = str(uuid4())

        # First turn: the skeleton key. Its response is intentionally discarded.
        await self._send_in_conversation_async(
            prompt_text=self._skeleton_key_prompt,
            conversation_id=conversation_id,
        )

        # Second turn: the actual attack prompt, in the same conversation.
        return await self._send_in_conversation_async(
            prompt_text=prompt,
            conversation_id=conversation_id,
        )

    async def send_skeleton_key_with_prompts_async(
        self,
        *,
        prompt_list: list[str],
    ) -> list[PromptRequestResponse]:
        """
        Sends a skeleton key and prompt to the target for each prompt in a list of prompts.

        Each prompt is handed to ``send_skeleton_key_with_prompt_async`` via ``batch_task_async``,
        using the batch size configured on this orchestrator.

        Args:
            prompt_list (list[str]): The list of prompts to be sent.

        Returns:
            list[PromptRequestResponse]: The responses from the prompt target.
        """
        return await batch_task_async(
            task_func=self.send_skeleton_key_with_prompt_async,
            task_arguments=["prompt"],
            prompt_target=self._prompt_target,
            batch_size=self._batch_size,
            items_to_batch=[prompt_list],
        )

    def print_conversation(self) -> None:
        """Prints all the conversations that have occurred with the prompt target."""
        target_messages = self.get_memory()

        if not target_messages:
            print("No conversation with the target")
            return

        for message in target_messages:
            # User turns are printed in red, every other role in green.
            color = Fore.RED if message.role == "user" else Fore.GREEN
            # Reset after each message so the colour/brightness does not leak into later output.
            print(f"{Style.BRIGHT}{color}{message.role}: {message.converted_value}{Style.RESET_ALL}\n")