Source code for pyrit.orchestrator.fuzzer_orchestrator

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

import logging
import random
import uuid
from dataclasses import dataclass
from typing import Optional, Union

import numpy as np
from colorama import Fore, Style

from pyrit.exceptions import MissingPromptPlaceholderException, pyrit_placeholder_retry
from pyrit.memory import CentralMemory, MemoryInterface
from pyrit.models import SeedPrompt
from pyrit.orchestrator import Orchestrator
from pyrit.prompt_converter import FuzzerConverter, PromptConverter
from pyrit.prompt_normalizer import NormalizerRequest, PromptNormalizer
from pyrit.prompt_target import PromptChatTarget, PromptTarget
from pyrit.score import FloatScaleThresholdScorer, SelfAskScaleScorer

TEMPLATE_PLACEHOLDER = "{{ prompt }}"
logger = logging.getLogger(__name__)


class PromptNode:
    """
    Class to maintain the tree information for each prompt template
    """

    def __init__(
        self,
        template: str,
        parent: Optional[PromptNode] = None,
    ):
        """
        Creates the PromptNode instance.

        Args:
            template: Prompt template.
            parent: Parent node.
        """

        self.id = uuid.uuid4()
        self.template: str = template
        self.children: list[PromptNode] = []
        self.level: int = 0 if parent is None else parent.level + 1
        self.visited_num = 0
        self.rewards: float = 0
        self.parent: Optional[PromptNode] = None
        if parent is not None:
            self.add_parent(parent)

    def add_parent(self, parent: PromptNode):
        self.parent = parent
        parent.children.append(self)


@dataclass
class FuzzerResult:
    success: bool
    templates: list[str]
    description: str
    prompt_target_conversation_ids: Optional[list[Union[str, uuid.UUID]]] = None

    def __str__(self) -> str:
        return (
            "FuzzerResult("
            f"success={self.success},"
            f"templates={self.templates},"
            f"description={self.description},"
            f"prompt_target_conversation_ids={self.prompt_target_conversation_ids})"
        )

    def __repr__(self) -> str:
        return self.__str__()

    def print_templates(self):
        """
        Prints the templates that were successful in jailbreaking the target.
        """
        if self.templates:
            print("Successful Templates:")
            for template in self.templates:
                print(f"---\n{template}")
        else:
            print("No successful templates found.")

    def print_conversations(self):
        """
        Prints the conversations of the successful jailbreaks.

        Args:
            result: The result of the fuzzer.
        """
        memory = CentralMemory.get_memory_instance()
        for conversation_id in self.prompt_target_conversation_ids:
            print(f"\nConversation ID: {conversation_id}")

            target_messages = memory.get_prompt_request_pieces(conversation_id=str(conversation_id))

            if not target_messages or len(target_messages) == 0:
                print("No conversation with the target")
                return

            for message in target_messages:
                if message.role == "user":
                    print(f"{Style.BRIGHT}{Fore.BLUE}{message.role}: {message.converted_value}")
                else:
                    print(f"{Style.NORMAL}{Fore.YELLOW}{message.role}: {message.converted_value}")

                scores = memory.get_prompt_scores(prompt_ids=[str(message.id)])
                if scores and len(scores) > 0:
                    score = scores[0]
                    print(f"{Style.RESET_ALL}score: {score} : {score.score_rationale}")



[docs]
class FuzzerOrchestrator(Orchestrator):
    """
    An orchestrator that explores a variety of jailbreak options via fuzzing.

    Paper - GPTFUZZER - Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts.
    Link - https://arxiv.org/pdf/2309.10253
    Authors - Jiahao Yu, Xingwei Lin, Zheng Yu, Xinyu Xing
    GitHub - https://github.com/sherdencooper/GPTFuzz
    """

    _memory: MemoryInterface


[docs]
    def __init__(
        self,
        *,
        prompts: list[str],
        prompt_target: PromptTarget,
        prompt_templates: list[str],
        prompt_converters: Optional[list[PromptConverter]] = None,
        template_converters: list[FuzzerConverter],
        scoring_target: PromptChatTarget,
        verbose: bool = False,
        frequency_weight: float = 0.5,
        reward_penalty: float = 0.1,
        minimum_reward: float = 0.2,
        non_leaf_node_probability: float = 0.1,
        batch_size: int = 10,
        target_jailbreak_goal_count: int = 1,
        max_query_limit: Optional[int] = None,
    ) -> None:
        """
        Creates the FuzzerOrchestrator instance.

        Args:
            prompts: The prompts will be the questions to the target.
            prompt_target: The target to send the prompts to.
            prompt_templates: List of all the jailbreak templates which will act as the seed pool.
                At each iteration, a seed will be selected using the MCTS-explore algorithm which will be sent to the
                shorten/expand prompt converter. The converted template along with the prompt will be sent to the
                target.
            prompt_converters: The prompt_converters to use to convert the prompts before sending
                them to the prompt target.
            template_converters: The converters that will be applied on the jailbreak template that was
                selected by MCTS-explore. The converters will not be applied to the prompts.
                In each iteration of the algorithm, one converter is chosen at random.
            verbose: Whether to print debug information.
            frequency_weight: constant that balances between the seed with high reward and the seed that is
                selected fewer times.
            reward_penalty: Reward penalty diminishes the reward for the current node and its ancestors
                when the path lengthens.
            minimum_reward: Minimal reward prevents the reward of the current node and its ancestors
                from being too small or negative.
            non_leaf_node_probability: parameter which decides the likelihood of selecting a non-leaf node.
            batch_size (int, Optional): The (max) batch size for sending prompts. Defaults to 10.
            target_jailbreak_goal_count: target number of the jailbreaks after which the fuzzer will stop.
            max_query_limit: Maximum number of times the fuzzer will run. By default, it calculates the product
                of prompts and prompt templates and multiplies it by 10. Each iteration makes as many calls as
                the number of prompts.
        """

        super().__init__(prompt_converters=prompt_converters, verbose=verbose)

        if not prompt_templates:
            raise ValueError("The initial set of prompt templates cannot be empty.")
        if not prompts:
            raise ValueError("The initial prompts cannot be empty.")
        if not template_converters:
            raise ValueError("Template converters cannot be empty.")

        if batch_size < 1:
            raise ValueError("Batch size must be at least 1.")

        self._prompt_target = prompt_target
        self._prompts = prompts
        self._prompt_normalizer = PromptNormalizer()
        self._prompt_templates = prompt_templates
        self._template_converters = template_converters
        self._frequency_weight = frequency_weight
        self._reward_penalty = reward_penalty
        self._minimum_reward = minimum_reward
        self._non_leaf_node_probability = non_leaf_node_probability
        # Target number of the jailbreaks after which the fuzzer will stop
        self._target_jailbreak_goal_count = target_jailbreak_goal_count
        # Maximum number of times the fuzzer will query the target (not including scoring calls)
        if max_query_limit:
            if max_query_limit < len(prompts):
                raise ValueError("The query limit must be at least the number of prompts to run a single iteration.")
            self._max_query_limit = max_query_limit
        else:
            self._max_query_limit = len(self._prompt_templates) * len(self._prompts) * 10
        self._total_target_query_count = 0
        self._total_jailbreak_count = 0
        self._jailbreak_conversation_ids: list[Union[str, uuid.UUID]] = []
        self._batch_size = batch_size
        self._new_prompt_nodes: list[PromptNode] = []
        self._step = 0
        # keeps track of the path that has been currently selected
        self._mcts_selected_path: list[PromptNode] = []  # type: ignore

        scale_scorer = SelfAskScaleScorer(
            chat_target=scoring_target,
            scale_arguments_path=SelfAskScaleScorer.ScalePaths.TREE_OF_ATTACKS_SCALE.value,
            system_prompt_path=SelfAskScaleScorer.SystemPaths.GENERAL_SYSTEM_PROMPT.value,
        )
        self._scorer = FloatScaleThresholdScorer(
            scorer=scale_scorer,
            threshold=0.8,
        )

        for template in self._prompt_templates:
            if TEMPLATE_PLACEHOLDER not in template:
                raise MissingPromptPlaceholderException(message="Prompt placeholder is empty.")

        # convert each template into a node and maintain the node information parent, child, etc.
        self._initial_prompt_nodes: list[PromptNode] = [PromptNode(prompt) for prompt in prompt_templates]

        self._last_choice_node: Optional[PromptNode] = None



[docs]
    async def execute_fuzzer(self) -> FuzzerResult:
        """
        Generates new templates by applying transformations to existing templates and returns successful ones.

        This method uses the MCTS-explore algorithm to select a template in each iteration and
        applies a randomly chosen template converter to generate a new template.

        Subsequently, it creates a set of prompts by populating instances of the new template with all the prompts.

        These prompts are sent to the target and the responses scored.

        A new template is considered successful if this resulted in at least one successful jailbreak
        which is identified by having a high enough score.

        Successful templates are added to the initial list of templates and may be selected again in
        subsequent iterations.

        Finally, rewards for all nodes are updated.

        The algorithm stops when a sufficient number of jailbreaks are found with new templates or
        when the query limit is reached.
        """
        while True:
            # stopping criteria
            if (self._total_target_query_count + len(self._prompts)) > self._max_query_limit:
                query_limit_reached_message = "Query limit reached."
                logger.info(query_limit_reached_message)
                return FuzzerResult(
                    success=False,
                    templates=[node.template for node in self._new_prompt_nodes],
                    description=query_limit_reached_message,
                    prompt_target_conversation_ids=self._jailbreak_conversation_ids,
                )

            if self._total_jailbreak_count >= self._target_jailbreak_goal_count:
                target_jailbreak_goal_count_reached_message = "Maximum number of jailbreaks reached."
                logger.info(target_jailbreak_goal_count_reached_message)
                return FuzzerResult(
                    success=True,
                    templates=[node.template for node in self._new_prompt_nodes],
                    description=target_jailbreak_goal_count_reached_message,
                    prompt_target_conversation_ids=self._jailbreak_conversation_ids,
                )

            # 1. Select a seed from the list of the templates using the MCTS
            current_seed = self._select_template_with_mcts()

            # 2. Apply seed converter to the selected template.
            try:
                other_templates = []
                node_ids_on_mcts_selected_path = [node.id for node in self._mcts_selected_path]
                for prompt_node in self._initial_prompt_nodes + self._new_prompt_nodes:
                    if prompt_node.id not in node_ids_on_mcts_selected_path:
                        other_templates.append(prompt_node.template)
                target_seed = await self._apply_template_converter(
                    template=current_seed.template,
                    other_templates=other_templates,
                )
            except MissingPromptPlaceholderException as e:
                error_message = (
                    "Tried to apply to and failed even after retries as it didn't preserve the "
                    f"prompt placeholder: {e}"
                )
                logger.error(error_message)
                return FuzzerResult(
                    success=False,
                    templates=[node.template for node in self._new_prompt_nodes],
                    description=error_message,
                    prompt_target_conversation_ids=self._jailbreak_conversation_ids,
                )

            target_template = SeedPrompt(value=target_seed, data_type="text", parameters=["prompt"])

            # convert the target_template into a prompt_node to maintain the tree information
            target_template_node = PromptNode(template=target_seed, parent=None)

            # 3. Fill in prompts into the newly generated template.
            jailbreak_prompts = []
            for prompt in self._prompts:
                jailbreak_prompts.append(target_template.render_template_value(prompt=prompt))

            # 4. Apply prompt converter if any and send request to the target
            requests: list[NormalizerRequest] = []
            for jailbreak_prompt in jailbreak_prompts:
                request = self._create_normalizer_request(
                    prompt_text=jailbreak_prompt,
                    prompt_type="text",
                    converters=self._prompt_converters,
                )
                requests.append(request)

            responses = await self._prompt_normalizer.send_prompt_batch_to_target_async(
                requests=requests,
                target=self._prompt_target,
                labels=self._global_memory_labels,
                orchestrator_identifier=self.get_identifier(),
                batch_size=self._batch_size,
            )

            response_pieces = [response.request_pieces[0] for response in responses]

            # 5. Score responses.
            scores = await self._scorer.score_prompts_with_tasks_batch_async(
                request_responses=response_pieces, tasks=self._prompts
            )
            score_values = [score.get_value() for score in scores]

            # 6. Update the rewards for each of the nodes.
            jailbreak_count = 0
            for index, score in enumerate(score_values):
                if score is True:
                    jailbreak_count += 1
                    self._jailbreak_conversation_ids.append(response_pieces[index].conversation_id)
            num_executed_queries = len(score_values)

            self._total_jailbreak_count += jailbreak_count
            self._total_target_query_count += num_executed_queries

            if jailbreak_count > 0:
                # The template resulted in at least one jailbreak so it will be part of the results
                # and a potential starting template for future iterations.
                self._new_prompt_nodes.append(target_template_node)
                target_template_node.add_parent(current_seed)

            # update the rewards for the target node and others on its path
            self._update(jailbreak_count=jailbreak_count)


    def _select_template_with_mcts(self) -> PromptNode:
        """
        This method selects the template from the list of templates using the MCTS algorithm.
        """
        self._step += 1

        current = max(self._initial_prompt_nodes, key=self._best_UCT_score())  # initial path
        self._mcts_selected_path = [current]

        # while node is not a leaf
        while len(current.children) > 0:
            if np.random.rand() < self._non_leaf_node_probability:
                break
            # compute the node with the best UCT score
            current = max(current.children, key=self._best_UCT_score())
            # append node to path
            self._mcts_selected_path.append(current)

        for prompt_node in self._mcts_selected_path:
            # keep track of number of visited nodes
            prompt_node.visited_num += 1

        self._last_choice_node = current
        # returns the best child
        return current

    def _best_UCT_score(self):
        """Function to compute the Upper Confidence Bounds for Trees(UCT) score for each seed.
        The highest-scoring seed will be selected as the next seed.

        This is an extension of the Monte carlo tree search (MCTS) algorithm which applies Upper
        Confidence Bound (UCB) to the node.

        The UCB function determines the confidence interval for each node and returns the highest value
        which will be selected as the next seed.
        """
        return lambda pn: pn.rewards / (pn.visited_num + 1) + self._frequency_weight * np.sqrt(
            2 * np.log(self._step) / (pn.visited_num + 0.01)
        )  # self._frequency_weight - constant that balances between the seed with high reward
        # and the seed that is selected fewer times.

    def _update(self, jailbreak_count: int):
        """
        Updates the reward of all the nodes in the last chosen path.
        """
        last_chosen_node = self._last_choice_node
        for prompt_node in reversed(self._mcts_selected_path):
            # The output from the template converter in this version will always be a single template so
            # the formula always contains a fixed 1. If this ever gets extended to multiple templates
            # being converted at the same time we need to adjust this formula.
            reward = jailbreak_count / (len(self._prompts) * 1)
            prompt_node.rewards += reward * max(
                self._minimum_reward, (1 - self._reward_penalty * last_chosen_node.level)
            )

    @pyrit_placeholder_retry
    async def _apply_template_converter(self, *, template: str, other_templates: list[str]) -> str:
        """
        Asynchronously applies template converter.

        Args:
            template: The template that is selected.
            other_templates: Other templates that are available. Some fuzzer converters require multiple templates.

        Returns:
            converted template with placeholder for prompt.

        Raises:
            MissingPromptPlaceholderException: If the prompt placeholder is still missing.
        """
        template_converter = random.choice(self._template_converters)

        # This is only required for template converters that require multiple templates,
        # e.g. crossover converter. For others, this is a no op.
        template_converter.update(prompt_templates=other_templates)

        target_seed_obj = await template_converter.convert_async(prompt=template)
        if TEMPLATE_PLACEHOLDER not in target_seed_obj.output_text:
            raise MissingPromptPlaceholderException(message="Prompt placeholder is empty.")
        return target_seed_obj.output_text