# Source code for pyrit.datasets.babelscape_alert_dataset

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Literal, Optional

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt



# [docs]
def fetch_babelscape_alert_dataset(
    category: Optional[Literal["alert", "alert_adversarial"]] = "alert_adversarial",
) -> SeedPromptDataset:
    """
    Fetch the Babelscape/ALERT dataset and create a SeedPromptDataset.

    The "prompt" field of every row in the "test" split of each requested
    subset is wrapped in a SeedPrompt. When both subsets are requested, the
    "alert_adversarial" prompts come first, followed by the "alert" prompts.

    Args:
        category (str, Optional): The dataset category, "alert" or "alert_adversarial".
            If None, both categories will be loaded. Defaults to "alert_adversarial".

    Returns:
        SeedPromptDataset: A SeedPromptDataset containing the examples.

    Raises:
        ValueError: If category is not None, "alert" or "alert_adversarial".
    """
    valid_categories = ("alert_adversarial", "alert")

    # Validate before touching the dataset so a bad argument fails fast.
    if category is None:  # explicitly None means: read both subsets
        data_categories = list(valid_categories)
    elif category in valid_categories:
        data_categories = [category]
    else:
        raise ValueError(f"Invalid Parameter: {category}. Expected 'alert_adversarial' or 'alert'")

    # Collect the raw prompt strings from the "test" split of each subset.
    prompts: list[str] = []
    for name in data_categories:
        data = load_dataset("Babelscape/ALERT", name)
        prompts.extend(item["prompt"] for item in data["test"])

    # Metadata shared by every SeedPrompt. The description is assembled from
    # adjacent literals so that source indentation and line breaks do not end
    # up inside the runtime string.
    dataset_name = "Babelscape/ALERT"
    source = "https://huggingface.co/datasets/Babelscape/ALERT"
    description = (
        "ALERT by Babelscape is a dataset that consists "
        "of two different categories, 'alert' with 15k red teaming prompts, "
        "and 'alert_adversarial' with 30k adversarial red teaming prompts."
    )

    # Create SeedPrompt instances from each example in 'prompts'
    seed_prompts = [
        SeedPrompt(
            value=prompt,
            data_type="text",
            name="",
            dataset_name=dataset_name,
            description=description,
            source=source,
        )
        for prompt in prompts
    ]

    return SeedPromptDataset(prompts=seed_prompts)