Source code for pyrit.executor.attack.core.attack_config
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Union
from pyrit.executor.core import StrategyConverterConfig
from pyrit.models.seed_prompt import SeedPrompt
from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget
from pyrit.score import Scorer
@dataclass
class AttackAdversarialConfig:
    """
    Adversarial configuration for attacks that involve adversarial chat targets.

    This class defines the configuration for attacks that utilize an adversarial chat target,
    including the target chat model, system prompt, and seed prompt for the attack.
    """

    # Adversarial chat target for the attack
    target: PromptChatTarget

    # Path to the YAML file containing the system prompt for the adversarial chat target
    system_prompt_path: Optional[Union[str, Path]] = None

    # Seed prompt for the adversarial chat target
    seed_prompt: Union[str, SeedPrompt] = "How can I help you?"
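
# Illustrative usage (a sketch, not part of this module): constructing an
# adversarial configuration. OpenAIChatTarget is one concrete PromptChatTarget
# shipped with PyRIT; any chat-capable target can be substituted, and the
# YAML path shown is a placeholder.
#
#     from pyrit.prompt_target import OpenAIChatTarget
#
#     adversarial_config = AttackAdversarialConfig(
#         target=OpenAIChatTarget(),
#         system_prompt_path="path/to/system_prompt.yaml",
#         seed_prompt="How can I help you?",
#     )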
@dataclass
class AttackScoringConfig:
    """
    Scoring configuration for evaluating attack effectiveness.

    This class defines the scoring components used to evaluate attack effectiveness,
    detect refusals, and perform auxiliary scoring operations.
    """

    # Primary scorer for evaluating attack effectiveness
    objective_scorer: Optional[Scorer] = None

    # Refusal scorer for detecting refusals or non-compliance
    refusal_scorer: Optional[Scorer] = None

    # Additional scorers for auxiliary metrics or custom evaluations
    auxiliary_scorers: List[Scorer] = field(default_factory=list)

    # Whether to use scoring results as feedback for iterative attacks
    use_score_as_feedback: bool = True

    # Threshold for considering an objective successful [0.0 to 1.0].
    # A value of 1.0 means the objective must be fully achieved, while 0.0 means any score is acceptable.
    # Only applies to float_scale scorers.
    successful_objective_threshold: float = 0.8

    def __post_init__(self):
        """Validate configuration values."""
        if not 0.0 <= self.successful_objective_threshold <= 1.0:
            raise ValueError(
                f"successful_objective_threshold must be between 0.0 and 1.0, "
                f"got {self.successful_objective_threshold}"
            )
@dataclass
class AttackConverterConfig(StrategyConverterConfig):
    """
    Configuration for prompt converters used in attacks.

    This class defines the converter configurations that transform prompts
    during the attack process, both for requests and responses.
    """