# Source code for pyrit.orchestrator.single_turn.context_compliance_orchestrator

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import enum
import logging
import pathlib
from typing import Optional, cast

from typing_extensions import LiteralString, deprecated

from pyrit.common import deprecation_message
from pyrit.common.path import DATASETS_PATH
from pyrit.executor.attack import (
    AttackAdversarialConfig,
    AttackConverterConfig,
    AttackScoringConfig,
    ContextComplianceAttack,
)
from pyrit.orchestrator import OrchestratorResult, PromptSendingOrchestrator
from pyrit.prompt_normalizer.prompt_converter_configuration import (
    PromptConverterConfiguration,
)
from pyrit.prompt_target import PromptChatTarget
from pyrit.score import Scorer

logger = logging.getLogger(__name__)


class ContextDescriptionPaths(enum.Enum):
    """Known locations of context-description instruction files.

    Each member's value is a ``pathlib.Path`` rooted at ``DATASETS_PATH``.
    """

    # <DATASETS_PATH>/executors/context_compliance/context_description.yaml
    GENERAL = pathlib.Path(
        DATASETS_PATH,
        "executors",
        "context_compliance",
        "context_description.yaml",
    )


@deprecated(
    cast(
        LiteralString,
        deprecation_message(
            old_item="ContextComplianceOrchestrator",
            new_item=ContextComplianceAttack,
            removed_in="v0.12.0",
        ),
    ),
)
class ContextComplianceOrchestrator(PromptSendingOrchestrator):
    """
    .. warning::
        `ContextComplianceOrchestrator` is deprecated and will be removed in **v0.12.0**;
        use `pyrit.executor.attack.ContextComplianceAttack` instead.

    Orchestrator for the context compliance attack, which tries to get past safety
    measures by restating the objective in a more benign context.

    An adversarial chat target is used to:

    1. Rephrase the objective as a more benign question.
    2. Generate a response to that benign question.
    3. Rephrase the original objective as a follow-up question.

    The resulting context makes it harder for the target to detect the true intent.
    """

    def __init__(
        self,
        objective_target: PromptChatTarget,
        adversarial_chat: PromptChatTarget,
        affirmative_response: str = "yes.",
        context_description_instructions_path: Optional[pathlib.Path] = None,
        request_converter_configurations: Optional[list[PromptConverterConfiguration]] = None,
        response_converter_configurations: Optional[list[PromptConverterConfiguration]] = None,
        objective_scorer: Optional[Scorer] = None,
        auxiliary_scorers: Optional[list[Scorer]] = None,
        batch_size: int = 10,
        retries_on_objective_failure: int = 0,
        verbose: bool = False,
    ) -> None:
        """
        Set up the base orchestrator and the wrapped ``ContextComplianceAttack``.

        Args:
            objective_target (PromptChatTarget): The target for sending prompts.
            adversarial_chat (PromptChatTarget): The target used to rephrase objectives
                into benign contexts.
            affirmative_response (str, Optional): The affirmative response to be used in
                the conversation history. Defaults to "yes.".
            context_description_instructions_path (pathlib.Path, Optional): Path to the
                context description instructions YAML file. When None,
                ``ContextDescriptionPaths.GENERAL`` is used.
            request_converter_configurations (list[PromptConverterConfiguration], Optional):
                List of prompt converters.
            response_converter_configurations (list[PromptConverterConfiguration], Optional):
                List of response converters.
            objective_scorer (Scorer, Optional): Scorer to use for evaluating if the
                objective was achieved.
            auxiliary_scorers (list[Scorer], Optional): List of additional scorers to use
                for each prompt request response.
            batch_size (int, Optional): The (max) batch size for sending prompts.
                Defaults to 10. Note: if a max-requests-per-minute limit is configured on
                the target, this should be set to 1 to ensure proper rate limit management.
            retries_on_objective_failure (int, Optional): Number of retries to attempt if
                the objective fails. Defaults to 0.
            verbose (bool, Optional): Whether to log debug information. Defaults to False.
        """
        super().__init__(
            objective_target=objective_target,
            request_converter_configurations=request_converter_configurations,
            response_converter_configurations=response_converter_configurations,
            objective_scorer=objective_scorer,
            auxiliary_scorers=auxiliary_scorers,
            should_convert_prepended_conversation=True,
            batch_size=batch_size,
            retries_on_objective_failure=retries_on_objective_failure,
            verbose=verbose,
        )

        # Fall back to the bundled general-purpose instructions when no path is given.
        instructions_path = (
            ContextDescriptionPaths.GENERAL.value
            if context_description_instructions_path is None
            else context_description_instructions_path
        )

        # The underscore-prefixed attributes read below are not assigned in this class;
        # presumably the base-class initializer above sets them -- verify against
        # PromptSendingOrchestrator.
        adversarial_config = AttackAdversarialConfig(target=adversarial_chat)
        converter_config = AttackConverterConfig(
            request_converters=self._request_converter_configurations,
            response_converters=self._response_converter_configurations,
        )
        scoring_config = AttackScoringConfig(
            objective_scorer=self._objective_scorer,
            auxiliary_scorers=self._auxiliary_scorers,
        )

        self._attack = ContextComplianceAttack(
            objective_target=objective_target,
            attack_adversarial_config=adversarial_config,
            attack_converter_config=converter_config,
            attack_scoring_config=scoring_config,
            prompt_normalizer=self._prompt_normalizer,
            context_description_instructions_path=instructions_path,
            affirmative_response=affirmative_response,
            max_attempts_on_failure=self._retries_on_objective_failure,
        )

    async def run_attack_async(  # type: ignore[override]
        self,
        *,
        objective: str,
        memory_labels: Optional[dict[str, str]] = None,
    ) -> OrchestratorResult:
        """
        Run the attack for one objective by delegating to the parent implementation.

        Args:
            objective (str): The objective to pursue.
            memory_labels (dict[str, str], Optional): Labels forwarded unchanged to the
                parent's ``run_attack_async``.

        Returns:
            OrchestratorResult: Whatever the parent's ``run_attack_async`` returns.
        """
        outcome = await super().run_attack_async(
            objective=objective,
            memory_labels=memory_labels,
        )
        return outcome

    async def run_attacks_async(  # type: ignore[override]
        self,
        *,
        objectives: list[str],
        memory_labels: Optional[dict[str, str]] = None,
    ) -> list[OrchestratorResult]:
        """
        Run the attack for several objectives via the parent's objectives-only helper.

        Args:
            objectives (list[str]): The objectives to pursue.
            memory_labels (dict[str, str], Optional): Labels forwarded unchanged to the
                parent's ``_run_attacks_with_only_objectives_async``.

        Returns:
            list[OrchestratorResult]: Whatever the parent helper returns.
        """
        outcomes = await super()._run_attacks_with_only_objectives_async(
            objectives=objectives,
            memory_labels=memory_labels,
        )
        return outcomes