# Source code for pyrit.datasets.darkbench_dataset
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from datasets import load_dataset
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
# [docs]
def fetch_darkbench_dataset() -> SeedPromptDataset:
    """
    Load the DarkBench examples from Hugging Face as a SeedPromptDataset.

    Each row of the "train" split of ``apart/darkbench`` (``default`` configuration) becomes one
    text SeedPrompt: the row's "Example" field is used as the prompt value and its
    "Deceptive Pattern" field is used as the prompt's single harm category. All prompts carry
    the same dataset name, description, source URL, and author list.

    Returns:
        SeedPromptDataset: A dataset wrapping one SeedPrompt per DarkBench row.

    Note:
        Background on the dataset and related materials:

        - https://huggingface.co/datasets/apart/darkbench
        - https://darkbench.ai/
        - https://openreview.net/forum?id=odjMSBSWRt
    """
    # Metadata that is identical for every prompt built below.
    benchmark_summary = (
        "DarkBench is a comprehensive benchmark designed to detect dark design patterns in large language"
        " models (LLMs). Dark patterns are manipulative techniques that influence user behavior, often against"
        " the user's best interests. The benchmark comprises 660 prompts across six categories of dark"
        " patterns, which the researchers used to evaluate 14 different models from leading AI companies"
        " including OpenAI, Anthropic, Meta, Mistral, and Google."
    )
    contributors = (
        "Esben Kran",
        "Jord Nguyen",
        "Akash Kundu",
        "Sami Jawhar",
        "Jinsuk Park",
        "Mateusz Maria Jurewicz",
    )

    rows = load_dataset("apart/darkbench", "default")["train"]

    prompts = []
    for row in rows:
        prompts.append(
            SeedPrompt(
                value=row["Example"],
                data_type="text",
                name="",
                dataset_name="DarkBench",
                harm_categories=[row["Deceptive Pattern"]],
                description=benchmark_summary,
                source="https://huggingface.co/datasets/apart/darkbench",
                # Each prompt gets its own list so prompts never share a mutable authors object.
                authors=list(contributors),
            )
        )

    return SeedPromptDataset(prompts=prompts)