# Source code for pyrit.datasets.darkbench_dataset

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt


def fetch_darkbench_dataset() -> SeedPromptDataset:
    """
    Load the DarkBench examples and wrap them in a SeedPromptDataset.

    The ``apart/darkbench`` dataset (``default`` configuration) is loaded via
    ``datasets.load_dataset``. Each row of its ``train`` split becomes one text
    ``SeedPrompt``: the row's ``"Example"`` field is the prompt value and its
    ``"Deceptive Pattern"`` field is the single harm category.

    Returns:
        SeedPromptDataset: A dataset holding one SeedPrompt per DarkBench example.

    Note:
        Further information about the dataset and related materials:

        - https://huggingface.co/datasets/apart/darkbench
        - https://darkbench.ai/
        - https://openreview.net/forum?id=odjMSBSWRt
    """
    # Metadata shared by every prompt. Strings are immutable, so one object can
    # safely be reused across all SeedPrompt instances.
    benchmark_summary = (
        "DarkBench is a comprehensive benchmark designed to detect dark design patterns in large language"
        " models (LLMs). Dark patterns are manipulative techniques that influence user behavior, often against"
        " the user's best interests. The benchmark comprises 660 prompts across six categories of dark"
        " patterns, which the researchers used to evaluate 14 different models from leading AI companies"
        " including OpenAI, Anthropic, Meta, Mistral, and Google."
    )
    paper_authors = (
        "Esben Kran",
        "Jord Nguyen",
        "Akash Kundu",
        "Sami Jawhar",
        "Jinsuk Park",
        "Mateusz Maria Jurewicz",
    )

    train_rows = load_dataset("apart/darkbench", "default")["train"]

    return SeedPromptDataset(
        prompts=[
            SeedPrompt(
                value=row["Example"],
                data_type="text",
                name="",
                dataset_name="DarkBench",
                harm_categories=[row["Deceptive Pattern"]],
                description=benchmark_summary,
                source="https://huggingface.co/datasets/apart/darkbench",
                # Fresh list per prompt so no two prompts share a mutable authors list.
                authors=list(paper_authors),
            )
            for row in train_rows
        ]
    )