# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
from typing import Optional, cast
from typing_extensions import LiteralString, deprecated
from pyrit.common import deprecation_message
from pyrit.executor.attack import AttackConverterConfig, AttackScoringConfig, FlipAttack
from pyrit.orchestrator import PromptSendingOrchestrator
from pyrit.orchestrator.models.orchestrator_result import OrchestratorResult
from pyrit.prompt_normalizer import PromptConverterConfiguration
from pyrit.prompt_target import PromptChatTarget
from pyrit.score import Scorer
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
@deprecated(
    cast(
        LiteralString,
        deprecation_message(
            old_item="FlipAttackOrchestrator",
            new_item=FlipAttack,
            removed_in="v0.12.0",
        ),
    ),
)
class FlipAttackOrchestrator(PromptSendingOrchestrator):
    """
    Deprecated orchestrator front-end for the Flip Attack.

    .. warning::
        `FlipAttackOrchestrator` is deprecated and will be removed in **v0.12.0**;
        use `pyrit.executor.attack.FlipAttack` instead.

    This orchestrator implements the Flip Attack method found here:
    https://arxiv.org/html/2410.02832v1.

    Essentially, it adds a system prompt to the beginning of the conversation to flip each word in the prompt.
    """

    def __init__(
        self,
        objective_target: PromptChatTarget,
        request_converter_configurations: Optional[list[PromptConverterConfiguration]] = None,
        response_converter_configurations: Optional[list[PromptConverterConfiguration]] = None,
        objective_scorer: Optional[Scorer] = None,
        auxiliary_scorers: Optional[list[Scorer]] = None,
        batch_size: int = 10,
        retries_on_objective_failure: int = 0,
        verbose: bool = False,
    ) -> None:
        """
        Initialize the parent orchestrator, then build the underlying ``FlipAttack``.

        Args:
            objective_target (PromptChatTarget): The target for sending prompts.
            request_converter_configurations (list[PromptConverterConfiguration], Optional): List of prompt
                converters. These are stacked in order on top of the flip converter.
            response_converter_configurations (list[PromptConverterConfiguration], Optional): List of response
                converters.
            objective_scorer (Scorer, Optional): Scorer to use for evaluating if the objective was achieved.
            auxiliary_scorers (list[Scorer], Optional): List of additional scorers to use for each prompt
                request response.
            batch_size (int, Optional): The (max) batch size for sending prompts. Defaults to 10.
                Note: If providing max requests per minute on the objective_target, this should be set to 1
                to ensure proper rate limit management.
            retries_on_objective_failure (int, Optional): Number of retries to attempt if objective fails.
                Defaults to 0. The same value is handed to ``FlipAttack`` as ``max_attempts_on_failure``.
            verbose (bool, Optional): Whether to log debug information. Defaults to False.
        """
        # The parent is always told not to convert the prepended conversation.
        super().__init__(
            objective_target=objective_target,
            request_converter_configurations=request_converter_configurations,
            response_converter_configurations=response_converter_configurations,
            objective_scorer=objective_scorer,
            auxiliary_scorers=auxiliary_scorers,
            should_convert_prepended_conversation=False,
            batch_size=batch_size,
            retries_on_objective_failure=retries_on_objective_failure,
            verbose=verbose,
        )

        # The underscore-prefixed attributes read below are not assigned in this class;
        # presumably super().__init__() sets them, which is why it has to run first.
        converter_config = AttackConverterConfig(
            request_converters=self._request_converter_configurations,
            response_converters=self._response_converter_configurations,
        )
        scoring_config = AttackScoringConfig(
            objective_scorer=self._objective_scorer,
            auxiliary_scorers=self._auxiliary_scorers,
        )

        self._attack = FlipAttack(
            objective_target=objective_target,
            attack_converter_config=converter_config,
            attack_scoring_config=scoring_config,
            prompt_normalizer=self._prompt_normalizer,
            max_attempts_on_failure=retries_on_objective_failure,
        )

    async def run_attack_async(  # type: ignore[override]
        self,
        *,
        objective: str,
        memory_labels: Optional[dict[str, str]] = None,
    ) -> OrchestratorResult:
        """
        Run the attack for a single objective by delegating to the parent implementation.

        Args:
            objective (str): The objective to pass to the parent's ``run_attack_async``.
            memory_labels (dict[str, str], Optional): Labels forwarded unchanged to the parent call.

        Returns:
            OrchestratorResult: The result produced by the parent's ``run_attack_async``.
        """
        result = await super().run_attack_async(objective=objective, memory_labels=memory_labels)
        return result

    async def run_attacks_async(  # type: ignore[override]
        self,
        *,
        objectives: list[str],
        memory_labels: Optional[dict[str, str]] = None,
    ) -> list[OrchestratorResult]:
        """
        Run the attack for several objectives via the parent's objectives-only helper.

        Args:
            objectives (list[str]): The objectives to pass to the parent's
                ``_run_attacks_with_only_objectives_async``.
            memory_labels (dict[str, str], Optional): Labels forwarded unchanged to the parent call.

        Returns:
            list[OrchestratorResult]: The results produced by the parent helper.
        """
        results = await super()._run_attacks_with_only_objectives_async(
            objectives=objectives,
            memory_labels=memory_labels,
        )
        return results