Source code for pyrit.executor.attack.component.simulated_conversation

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
Utility functions for generating simulated conversations using adversarial chat.

These utilities help create prepended_conversation content by running an adversarial chat
against a simulated (compliant) target before executing the actual attack.
"""

from __future__ import annotations

import enum
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Union

from pyrit.common.path import EXECUTOR_SIMULATED_TARGET_PATH
from pyrit.executor.attack.core import (
    AttackAdversarialConfig,
    AttackConverterConfig,
    AttackScoringConfig,
)
from pyrit.executor.attack.multi_turn.red_teaming import RedTeamingAttack
from pyrit.memory import CentralMemory
from pyrit.models import Message, Score, SeedPrompt
from pyrit.prompt_target import PromptChatTarget
from pyrit.score import TrueFalseScorer

logger = logging.getLogger(__name__)


@dataclass
class SimulatedConversationResult:
    """
    Result from generating a simulated conversation.

    Stores the full conversation and provides properties to access different views
    of it for various attack strategy use cases.

    The `conversation` attribute contains the complete conversation as a list of
    Messages (user/assistant only, no system messages). The `score` attribute holds
    the score from evaluating the final turn. The `turn_index` attribute is a
    1-based index of the turn to treat as the "final" turn for splitting. If None
    (the default), the last turn is used. It can be set after creation to select an
    earlier turn (e.g., if the last turn's attack didn't work).
    """

    conversation: List[Message]
    score: Optional[Score]
    turn_index: Optional[int] = None

    @property
    def _effective_turn_index(self) -> int:
        """
        Get the effective 1-based turn index.

        Returns:
            int: The turn index to use, bounded by available turns.
        """
        if not self.conversation:
            return 0

        # Calculate total complete turns (user+assistant pairs)
        total_turns = len(self.conversation) // 2

        # Account for a trailing user message (incomplete turn)
        if len(self.conversation) % 2 == 1 and self.conversation[-1].api_role == "user":
            total_turns += 1

        if self.turn_index is None:
            return total_turns

        return max(1, min(self.turn_index, total_turns))

    @property
    def prepended_messages(self) -> List[Message]:
        """
        Get all messages before the selected turn with new IDs.

        This returns the completed turns before the turn specified by `turn_index`,
        suitable for use as `prepended_conversation` in attack strategies. Each
        message is duplicated with new IDs to avoid database conflicts when the
        messages are inserted into memory by a subsequent attack.

        Returns:
            List[Message]: All messages before the selected turn, with fresh IDs.
        """
        turn = self._effective_turn_index
        if turn <= 1:
            return []

        # Each complete turn is 2 messages (user + assistant), so the messages
        # before turn N are the first (N - 1) * 2 messages.
        messages = self.conversation[: (turn - 1) * 2]
        return [msg.duplicate_message() for msg in messages]

    @property
    def next_message(self) -> Optional[Message]:
        """
        Get the user message at the selected turn with a new ID.

        This is the user message from the turn specified by `turn_index`, which can
        be used as the initial prompt/next_message for an attack strategy. The
        message is duplicated with a new ID to avoid database conflicts.

        Returns:
            Optional[Message]: The user message at the selected turn with a fresh ID,
                or None if not found.
        """
        turn = self._effective_turn_index
        if turn < 1:
            return None

        # The user message for turn N is at index (N - 1) * 2
        user_idx = (turn - 1) * 2
        if user_idx < len(self.conversation) and self.conversation[user_idx].api_role == "user":
            return self.conversation[user_idx].duplicate_message()
        return None
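# The turn-splitting arithmetic above is easiest to see on a concrete example.
# This is an illustrative sketch, not part of the module: string placeholders
# stand in for real Message objects, whose construction is omitted here.
#
#   conversation = ["u1", "a1", "u2", "a2", "u3", "a3"]   # 3 complete turns
#
#   turn_index=None -> effective turn 3:
#       prepended_messages == ["u1", "a1", "u2", "a2"]    # first (3 - 1) * 2 items
#       next_message       == "u3"                        # item at index (3 - 1) * 2
#
#   turn_index=2 -> effective turn 2:
#       prepended_messages == ["u1", "a1"]
#       next_message       == "u2"
#
# A trailing unanswered user message counts as an extra (incomplete) turn, so
# ["u1", "a1", "u2"] has total_turns == 2 and next_message == "u2".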
class SimulatedTargetSystemPromptPaths(enum.Enum):
    """Enum for predefined simulated target system prompt paths."""

    COMPLIANT = Path(EXECUTOR_SIMULATED_TARGET_PATH, "compliant.yaml").resolve()
async def generate_simulated_conversation_async(
    *,
    objective: str,
    adversarial_chat: PromptChatTarget,
    objective_scorer: TrueFalseScorer,
    num_turns: int = 3,
    adversarial_chat_system_prompt_path: Union[str, Path],
    simulated_target_system_prompt_path: Optional[Union[str, Path]] = None,
    attack_converter_config: Optional[AttackConverterConfig] = None,
    memory_labels: Optional[dict[str, str]] = None,
) -> SimulatedConversationResult:
    """
    Generate a simulated conversation between an adversarial chat and a compliant target.

    This utility runs a RedTeamingAttack with `score_last_turn_only=True` against a
    simulated target (the same LLM as `adversarial_chat`, but configured with a
    compliant system prompt). The resulting conversation can be used as
    `prepended_conversation` for subsequent attacks against real targets.

    Use cases:
        - Creating role-play scenarios dynamically (e.g., movie script, video game)
        - Establishing conversational context before attacking a real target
        - Generating multi-turn jailbreak setups without hardcoded responses

    Args:
        objective (str): The objective for the adversarial chat to work toward.
        adversarial_chat (PromptChatTarget): The adversarial LLM that generates attack
            prompts. This same LLM is also used as the simulated target with a
            compliant system prompt.
        objective_scorer (TrueFalseScorer): Scorer to evaluate the final turn.
        num_turns (int): Number of conversation turns to generate. Defaults to 3.
        adversarial_chat_system_prompt_path (Union[str, Path]): Path to the system
            prompt for the adversarial chat. This is required.
        simulated_target_system_prompt_path (Optional[Union[str, Path]]): Path to the
            system prompt for the simulated target. If not provided, uses the default
            compliant prompt. The template should accept `objective` and `num_turns`
            parameters.
        attack_converter_config (Optional[AttackConverterConfig]): Converter
            configuration for the attack. Defaults to None.
        memory_labels (Optional[dict[str, str]]): Labels to associate with the
            conversation in memory. Defaults to None.

    Returns:
        SimulatedConversationResult: The result containing the generated conversation
            and score. Use `prepended_messages` to get the completed turns before the
            selected turn, `next_message` to get the user message at the selected turn
            for use as an attack's initial prompt, or access `conversation` directly
            for all messages. Set `turn_index` to select an earlier turn if the final
            turn wasn't successful.

    Raises:
        ValueError: If num_turns is not a positive integer.
    """
    if num_turns <= 0:
        raise ValueError("num_turns must be a positive integer")

    # Use the same LLM for both the adversarial chat and the simulated target;
    # they get different system prompts to play different roles.
    simulated_target = adversarial_chat

    # Load and configure the simulated target system prompt
    simulated_target_prompt_path = (
        simulated_target_system_prompt_path or SimulatedTargetSystemPromptPaths.COMPLIANT.value
    )
    simulated_target_system_prompt_template = SeedPrompt.from_yaml_with_required_parameters(
        template_path=simulated_target_prompt_path,
        required_parameters=["objective", "num_turns"],
        error_message="Simulated target system prompt must have objective and num_turns parameters",
    )
    simulated_target_system_prompt = simulated_target_system_prompt_template.render_template_value(
        objective=objective,
        num_turns=num_turns,
    )

    # Create the adversarial config for the simulation
    adversarial_config = AttackAdversarialConfig(
        target=adversarial_chat,
        system_prompt_path=adversarial_chat_system_prompt_path,
    )

    # Create the scoring config
    scoring_config = AttackScoringConfig(
        objective_scorer=objective_scorer,
        use_score_as_feedback=False,  # Feedback is not needed for last-turn-only scoring
    )

    # Create the RedTeamingAttack against the simulated target, scoring only the last turn
    attack = RedTeamingAttack(
        objective_target=simulated_target,
        attack_adversarial_config=adversarial_config,
        attack_converter_config=attack_converter_config,
        attack_scoring_config=scoring_config,
        max_turns=num_turns,
        score_last_turn_only=True,
    )

    # Execute the simulated attack
    logger.info(f"Generating {num_turns}-turn simulated conversation for objective: {objective[:50]}...")

    # Create a system message to prepend - this sets the simulated target's behavior
    system_message = Message.from_system_prompt(simulated_target_system_prompt)

    result = await attack.execute_async(
        objective=objective,
        prepended_conversation=[system_message],
        memory_labels=memory_labels,
    )

    # Extract the conversation from memory and filter it for prepended_conversation use
    memory = CentralMemory.get_memory_instance()
    raw_messages = list(memory.get_conversation(conversation_id=result.conversation_id))

    # Filter out system messages - prepended_conversation should only contain
    # user/assistant turns; system prompts are set separately on each target during
    # attack execution. Also mark assistant messages as simulated for traceability.
    filtered_messages: List[Message] = []
    for message in raw_messages:
        if message.api_role != "system":
            # Mark assistant responses as simulated since this is a simulated conversation
            if message.api_role == "assistant":
                for piece in message.message_pieces:
                    piece._role = "simulated_assistant"
            filtered_messages.append(message)

    # Get the score from the result (there should be one score, for the last turn)
    final_score = result.last_score

    logger.info(
        f"Generated simulated conversation with {len(filtered_messages)} messages "
        f"(outcome: {result.outcome.name})"
    )

    return SimulatedConversationResult(
        conversation=filtered_messages,
        score=final_score,
    )