# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
from typing import Optional
from datasets import load_dataset
from pyrit.models import SeedPrompt, SeedPromptDataset
logger = logging.getLogger(__name__)
def fetch_jbb_behaviors_dataset(
source: str = "JailbreakBench/JBB-Behaviors",
data_home: Optional[str] = None,
) -> SeedPromptDataset:
"""
Fetch the JailbreakBench JBB-Behaviors dataset from HuggingFace and create a SeedPromptDataset.
This dataset contains harmful behaviors for jailbreaking evaluation,
as described in the paper: https://arxiv.org/abs/2404.01318
Args:
source (str): The HuggingFace dataset identifier. Defaults to "JailbreakBench/JBB-Behaviors".
data_home (str, optional): The directory to cache the dataset. If None, uses default cache.
Returns:
SeedPromptDataset: A SeedPromptDataset containing the JBB behaviors with harm_categories set.
Raises:
Exception: If the dataset cannot be loaded or processed.
Note:
Content Warning: This dataset contains prompts aimed at provoking harmful responses
and may contain offensive content. Users should check with their legal department
before using these prompts against production LLMs.
For more information and access to the original dataset and related materials, visit:
https://arxiv.org/abs/2404.01318
"""
try:
logger.info(f"Loading JBB-Behaviors dataset from {source}")
# Load the dataset from HuggingFace
data = load_dataset(source, "behaviors", cache_dir=data_home)
# Get the appropriate split (typically 'train' or the first available split)
dataset_split = data["train"] if "train" in data else data[list(data.keys())[0]]
# Define common metadata that will be used across all seed prompts
common_metadata = {
"dataset_name": "JailbreakBench JBB-Behaviors",
"authors": ["JailbreakBench Team"],
"description": (
"A dataset of harmful behaviors for jailbreaking evaluation from JailbreakBench. "
"Contains behaviors designed to test AI safety measures."
),
"source": source,
"data_type": "text",
"name": "JBB-Behaviors",
}
seed_prompts = []
for item in dataset_split:
# Extract the required fields
behavior = item.get("Behavior", "").strip()
category = item.get("Category", "")
if not behavior:
logger.warning("Skipping item with empty behavior")
continue
# Map JBB categories to PyRIT harm categories
harm_categories = _map_jbb_category_to_harm_category(category)
# Create SeedPrompt object with all metadata
seed_prompt = SeedPrompt(
value=behavior,
harm_categories=harm_categories,
groups=[category] if category else [],
metadata={
"jbb_category": category,
"original_source": "JailbreakBench",
},
**common_metadata, # type: ignore[arg-type]
)
seed_prompts.append(seed_prompt)
if not seed_prompts:
raise ValueError("SeedPromptDataset cannot be empty.")
logger.info(f"Successfully loaded {len(seed_prompts)} behaviors from JBB-Behaviors dataset")
# Create and return SeedPromptDataset
seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
return seed_prompt_dataset
    except Exception as e:
        logger.error(f"Failed to load JBB-Behaviors dataset: {e}")
        raise Exception(f"Error loading JBB-Behaviors dataset: {e}") from e
def _map_jbb_category_to_harm_category(jbb_category: str) -> list[str]:
"""
Maps JBB categories to PyRIT harm categories.
Args:
jbb_category (str): The category from the JBB dataset.
Returns:
list[str]: List of corresponding PyRIT harm categories.
"""
# Based on common harm taxonomies and JailbreakBench paper categories
category_mapping = {
# Violence and harm
"violence": ["violence"],
"physical_harm": ["violence", "physical_harm"],
"weapons": ["violence", "weapons"],
"terrorism": ["violence", "terrorism"],
# Hate and harassment
"hate": ["hate"],
"harassment": ["harassment"],
"bullying": ["harassment", "bullying"],
"discrimination": ["discrimination"],
"bias": ["bias", "discrimination"],
# Sexual content
"sexual": ["sexual"],
"adult_content": ["sexual"],
"non_consensual": ["sexual", "non_consensual"],
# Self-harm and suicide
"self_harm": ["self_harm"],
"suicide": ["self_harm", "suicide"],
"eating_disorders": ["self_harm", "eating_disorders"],
# Criminal activities
"criminal": ["criminal_planning"],
"illegal": ["illegal_activity"],
"fraud": ["criminal_planning", "fraud"],
"theft": ["criminal_planning", "theft"],
"cybercrime": ["criminal_planning", "cybercrime"],
# Privacy and personal information
"privacy": ["privacy"],
"personal_info": ["privacy", "personal_information"],
"doxxing": ["privacy", "doxxing"],
# Deception and misinformation
"deception": ["deception"],
"misinformation": ["misinformation"],
"conspiracy": ["misinformation", "conspiracy"],
"fake_news": ["misinformation"],
# Unethical behavior
"unethical": ["unethical"],
"manipulation": ["unethical", "manipulation"],
"exploitation": ["unethical", "exploitation"],
# Substance abuse
"drugs": ["substance_abuse"],
"alcohol": ["substance_abuse"],
# Financial harm
"financial_harm": ["financial_harm"],
"scams": ["financial_harm", "scams"],
# Child safety
"child_abuse": ["child_safety"],
"grooming": ["child_safety", "grooming"],
# Government and political
"government": ["government"],
"political": ["political"],
"election": ["political", "election"],
}
if not jbb_category or not jbb_category.strip():
return ["unknown"]
# Convert to lowercase for case-insensitive matching
jbb_category_lower = jbb_category.lower().strip()
# Try exact match first
if jbb_category_lower in category_mapping:
return category_mapping[jbb_category_lower]
# Try partial matching for compound categories
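    # For example, a hypothetical compound label such as "cybercrime_fraud" would
    # match both the "cybercrime" and "fraud" keys and yield
    # {"criminal_planning", "cybercrime", "fraud"}.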
matched_categories = set()
for key, harm_cats in category_mapping.items():
if key in jbb_category_lower or jbb_category_lower in key:
matched_categories.update(harm_cats)
if matched_categories:
return list(matched_categories)
# Special handling for common patterns
if any(term in jbb_category_lower for term in ["violent", "kill", "murder", "bomb"]):
return ["violence"]
elif any(term in jbb_category_lower for term in ["hate", "racist", "sexist"]):
return ["hate", "discrimination"]
elif any(term in jbb_category_lower for term in ["sexual", "porn", "nsfw"]):
return ["sexual"]
elif any(term in jbb_category_lower for term in ["illegal", "crime", "criminal"]):
return ["criminal_planning", "illegal_activity"]
elif any(term in jbb_category_lower for term in ["harm", "hurt", "damage"]):
return ["violence", "harm"]
# Default fallback - log unknown categories for future mapping
logger.warning(f"Unknown JBB category '{jbb_category}', using default harm category")
return ["unknown"]
def fetch_jbb_behaviors_by_harm_category(harm_category: str, **kwargs) -> SeedPromptDataset:
"""
Fetch JBB-Behaviors filtered by a specific harm category.
Args:
harm_category (str): The harm category to filter by.
**kwargs: Additional arguments passed to fetch_jbb_behaviors_dataset.
Returns:
SeedPromptDataset: Filtered SeedPromptDataset containing only prompts with the specified harm category.
"""
# Get all prompts
all_dataset = fetch_jbb_behaviors_dataset(**kwargs)
# Filter by harm category (case-insensitive)
filtered_prompts = [
prompt
for prompt in all_dataset.prompts
if any(harm_category.lower() in harm_cat.lower() for harm_cat in prompt.harm_categories)
]
    if not filtered_prompts:
        raise ValueError(f"No prompts found for harm category '{harm_category}'; SeedPromptDataset cannot be empty.")
logger.info(f"Filtered {len(filtered_prompts)} prompts for harm category '{harm_category}'")
# Create new dataset with filtered prompts
filtered_dataset = SeedPromptDataset(prompts=filtered_prompts)
return filtered_dataset
def fetch_jbb_behaviors_by_jbb_category(jbb_category: str, **kwargs) -> SeedPromptDataset:
"""
Fetch JBB-Behaviors filtered by the original JBB category.
Args:
jbb_category (str): The original JBB category to filter by.
**kwargs: Additional arguments passed to fetch_jbb_behaviors_dataset.
Returns:
SeedPromptDataset: Filtered SeedPromptDataset containing only prompts with the specified JBB category.
"""
# Get all prompts
all_dataset = fetch_jbb_behaviors_dataset(**kwargs)
# Filter by JBB category (case-insensitive)
filtered_prompts = [
prompt
for prompt in all_dataset.prompts
if (prompt.metadata and str(prompt.metadata.get("jbb_category", "")).lower() == jbb_category.lower())
]
    if not filtered_prompts:
        raise ValueError(f"No prompts found for JBB category '{jbb_category}'; SeedPromptDataset cannot be empty.")
logger.info(f"Filtered {len(filtered_prompts)} prompts for JBB category '{jbb_category}'")
# Create new dataset with filtered prompts
filtered_dataset = SeedPromptDataset(prompts=filtered_prompts)
return filtered_dataset