Source code for pyrit.orchestrator.multi_turn.red_teaming_orchestrator

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

import enum
import logging
import warnings
from pathlib import Path
from typing import Optional, cast

from typing_extensions import LiteralString, deprecated

from pyrit.attacks import (
    AttackAdversarialConfig,
    AttackConverterConfig,
    AttackScoringConfig,
    MultiTurnAttackContext,
    RedTeamingAttack,
)
from pyrit.common import deprecation_message
from pyrit.common.path import RED_TEAM_ORCHESTRATOR_PATH
from pyrit.models import AttackOutcome
from pyrit.orchestrator import MultiTurnOrchestrator, OrchestratorResult
from pyrit.prompt_converter import PromptConverter
from pyrit.prompt_normalizer import PromptNormalizer
from pyrit.prompt_normalizer.prompt_converter_configuration import (
    PromptConverterConfiguration,
)
from pyrit.prompt_target import PromptChatTarget, PromptTarget
from pyrit.score import Scorer

logger = logging.getLogger(__name__)


class RTOSystemPromptPaths(enum.Enum):
    TEXT_GENERATION = Path(RED_TEAM_ORCHESTRATOR_PATH, "text_generation.yaml").resolve()
    IMAGE_GENERATION = Path(RED_TEAM_ORCHESTRATOR_PATH, "image_generation.yaml").resolve()
    NAIVE_CRESCENDO = Path(RED_TEAM_ORCHESTRATOR_PATH, "naive_crescendo.yaml").resolve()
    VIOLENT_DURIAN = Path(RED_TEAM_ORCHESTRATOR_PATH, "violent_durian.yaml").resolve()
    CRUCIBLE = Path(RED_TEAM_ORCHESTRATOR_PATH, "crucible.yaml").resolve()
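
# Each member of RTOSystemPromptPaths resolves to a YAML system prompt bundled under
# RED_TEAM_ORCHESTRATOR_PATH; its ``.value`` is what gets passed to the orchestrator's
# ``adversarial_chat_system_prompt_path`` parameter. A minimal sketch of overriding the
# default TEXT_GENERATION strategy (keyword arguments are those of the class below; the
# target and scorer objects are assumed to already exist):
#
#   orchestrator = RedTeamingOrchestrator(
#       objective_target=objective_target,
#       adversarial_chat=adversarial_chat,
#       objective_scorer=objective_scorer,
#       adversarial_chat_system_prompt_path=RTOSystemPromptPaths.CRUCIBLE.value,
#   )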


@deprecated(
    cast(
        LiteralString,
        deprecation_message(
            old_item="RedTeamingOrchestrator",
            new_item=RedTeamingAttack,
            removed_in="v0.12.0",
        ),
    ),
)
class RedTeamingOrchestrator(MultiTurnOrchestrator):
    """
    .. warning::
        `RedTeamingOrchestrator` is deprecated and will be removed in **v0.12.0**;
        use `pyrit.attacks.RedTeamingAttack` instead.

    The `RedTeamingOrchestrator` class orchestrates a multi-turn red teaming attack on a target system.

    It is extremely simple: it sends a prompt to the target system, then forwards the response to the
    red teaming chat, which generates the next prompt.

    Args:
        objective_target (PromptTarget): Target for created prompts.
        adversarial_chat (PromptChatTarget): Endpoint creating prompts sent to objective_target.
        adversarial_chat_system_prompt_path (Path): Path to the initial adversarial_chat system prompt.
        adversarial_chat_seed_prompt (str, Optional): Initial seed prompt to start the adversarial chat.
            Defaults to "How can I help you?".
        prompt_converters (Optional[list[PromptConverter]]): Converters for prompt formatting.
            Defaults to None.
        max_turns (int, Optional): Max turns for the conversation; must be greater than 0. Defaults to 5.
        objective_scorer (Scorer): Scores prompt target output as sufficient or insufficient to meet
            the objective.
        use_score_as_feedback (bool, Optional): Use scoring output as feedback to the adversarial chat.
            Defaults to True.
        batch_size (int, Optional): Batch size for sending prompts. Defaults to 1.
        verbose (bool, Optional): Print debug info. Defaults to False.

    Raises:
        FileNotFoundError: If the adversarial_chat_system_prompt_path file is not found.
        ValueError: If max_turns is less than or equal to 0, or if objective_scorer is not a
            true/false scorer.
    """
    def __init__(
        self,
        *,
        objective_target: PromptTarget,
        adversarial_chat: PromptChatTarget,
        adversarial_chat_system_prompt_path: Path = RTOSystemPromptPaths.TEXT_GENERATION.value,
        adversarial_chat_seed_prompt: str = "How can I help you?",
        prompt_converters: Optional[list[PromptConverter]] = None,
        max_turns: int = 5,
        objective_scorer: Scorer,
        use_score_as_feedback: bool = True,
        batch_size: int = 1,
        verbose: bool = False,
    ) -> None:
        warnings.warn(
            deprecation_message(
                old_item="RedTeamingOrchestrator",
                new_item=RedTeamingAttack,
                removed_in="v0.12.0",
            ),
            DeprecationWarning,
            stacklevel=2,
        )

        if objective_scorer.scorer_type != "true_false":
            raise ValueError(
                f"The scorer must be a true/false scorer. The scorer type is {objective_scorer.scorer_type}."
            )

        super().__init__(
            objective_target=objective_target,
            adversarial_chat=adversarial_chat,
            adversarial_chat_system_prompt_path=adversarial_chat_system_prompt_path,
            adversarial_chat_seed_prompt=adversarial_chat_seed_prompt,
            max_turns=max_turns,
            prompt_converters=prompt_converters,
            objective_scorer=objective_scorer,
            verbose=verbose,
            batch_size=batch_size,
        )

        self._prompt_normalizer = PromptNormalizer()
        self._use_score_as_feedback = use_score_as_feedback

        # Build the new attack model
        self._attack = RedTeamingAttack(
            objective_target=objective_target,
            attack_adversarial_config=AttackAdversarialConfig(
                target=adversarial_chat,
                system_prompt_path=adversarial_chat_system_prompt_path,
                seed_prompt=adversarial_chat_seed_prompt,
            ),
            attack_scoring_config=AttackScoringConfig(
                objective_scorer=objective_scorer,
                use_score_as_feedback=use_score_as_feedback,
            ),
            attack_converter_config=AttackConverterConfig(
                request_converters=PromptConverterConfiguration.from_converters(converters=prompt_converters or []),
            ),
            prompt_normalizer=self._prompt_normalizer,
            max_turns=max_turns,
        )
    async def run_attack_async(
        self, *, objective: str, memory_labels: Optional[dict[str, str]] = None
    ) -> OrchestratorResult:
        # Transition to the new attack model
        context = MultiTurnAttackContext(
            objective=objective,
            memory_labels=memory_labels or {},
        )

        result = await self._attack.execute_with_context_async(context=context)
        objective_achieved = result.outcome == AttackOutcome.SUCCESS

        # Translate the result back into the orchestrator result format
        return OrchestratorResult(
            conversation_id=result.conversation_id,
            objective=objective,
            status="success" if objective_achieved else "failure",
            objective_score=result.last_score,
            confidence=1.0 if objective_achieved else 0.0,
        )
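

# A minimal migration sketch, assuming ``objective_target``, ``adversarial_chat``, and
# ``objective_scorer`` are pre-configured PromptTarget, PromptChatTarget, and true/false
# Scorer instances. It mirrors the construction and execution performed by the deprecated
# wrapper above; the function name and placeholder objective are illustrative only.
async def _red_teaming_attack_migration_sketch(
    objective_target: PromptTarget,
    adversarial_chat: PromptChatTarget,
    objective_scorer: Scorer,
) -> bool:
    attack = RedTeamingAttack(
        objective_target=objective_target,
        attack_adversarial_config=AttackAdversarialConfig(
            target=adversarial_chat,
            system_prompt_path=RTOSystemPromptPaths.TEXT_GENERATION.value,
            seed_prompt="How can I help you?",
        ),
        attack_scoring_config=AttackScoringConfig(
            objective_scorer=objective_scorer,
            use_score_as_feedback=True,
        ),
        attack_converter_config=AttackConverterConfig(
            request_converters=PromptConverterConfiguration.from_converters(converters=[]),
        ),
        prompt_normalizer=PromptNormalizer(),
        max_turns=5,
    )
    # Run against a single objective and report whether the attack outcome was a success.
    context = MultiTurnAttackContext(objective="<your objective here>", memory_labels={})
    result = await attack.execute_with_context_async(context=context)
    return result.outcome == AttackOutcome.SUCCESS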