Source code for pyrit.datasets.babelscape_alert_dataset
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Literal, Optional
from datasets import load_dataset
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
[docs]
def fetch_babelscape_alert_dataset(
    category: Optional[Literal["alert", "alert_adversarial"]] = "alert_adversarial",
) -> SeedPromptDataset:
    """
    Build a SeedPromptDataset from the prompts of the Babelscape/ALERT dataset.

    Args:
        category (Optional[str]): Which subset to read, either "alert" or
            "alert_adversarial". Passing None reads both subsets, with
            "alert_adversarial" first. Defaults to "alert_adversarial".

    Returns:
        SeedPromptDataset: One text SeedPrompt per "prompt" entry found in the
            "test" split of every selected subset.

    Raises:
        ValueError: If category is neither None nor one of the two subset names.
    """
    known_subsets = ["alert_adversarial", "alert"]

    # Resolve the requested category into the list of subsets to download.
    if category is None:
        # An explicit None means "everything": read both subsets.
        subsets = known_subsets
    elif category in known_subsets:
        subsets = [category]
    else:
        raise ValueError(f"Invalid Parameter: {category}. Expected 'alert_adversarial' or 'alert'")

    # Gather the raw prompt text of every selected subset before building any
    # SeedPrompt, so all downloads complete first.
    texts: list[str] = []
    for subset in subsets:
        records = load_dataset("Babelscape/ALERT", subset)["test"]
        for record in records:
            texts.append(record["prompt"])

    # Identical for every prompt, so it is built once. The line breaks are part
    # of the stored description text.
    description = (
        "ALERT by Babelscape is a dataset that consists\n"
        "of two different categories, 'alert' with 15k red teaming prompts,\n"
        "and 'alert_adversarial' with 30k adversarial red teaming prompts."
    )

    # Wrap each prompt text in a SeedPrompt carrying the dataset metadata.
    seeds = [
        SeedPrompt(
            value=text,
            data_type="text",
            name="",
            dataset_name="Babelscape/ALERT",
            description=description,
            source="https://huggingface.co/datasets/Babelscape/ALERT",
        )
        for text in texts
    ]
    return SeedPromptDataset(prompts=seeds)