Source code for pyrit.datasets.sosbench_dataset

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from datasets import load_dataset

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt


[docs] def fetch_sosbench_dataset() -> SeedPromptDataset: """ Fetch SOSBench dataset and create a SeedPromptDataset. Returns: SeedPromptDataset: A SeedPromptDataset containing the examples. Note: For more information about the dataset and related materials, visit: \n - https://huggingface.co/datasets/SOSBench/SOSBench \n - https://arxiv.org/abs/2505.21605 \n """ data = load_dataset("SOSBench/SOSBench", "default") seed_prompts = [ SeedPrompt( value=item["goal"], data_type="text", name="", dataset_name="SOSBench", harm_categories=[item["subject"]], description=( "SOSBench is a regulation-grounded, hazard-focused benchmark encompassing " "six high-risk scientific domains: chemistry, biology, medicine, pharmacology, " "physics, and psychology. The benchmark comprises 3,000 prompts derived from " "real-world regulations and laws, systematically expanded via an LLM-assisted " "evolutionary pipeline that introduces diverse, realistic misuse scenarios" " (e.g., detailed explosive synthesis instructions involving advanced" " chemical formulas)." ), source="https://huggingface.co/datasets/SOSBench/SOSBench", authors=[ "Fengqing Jiang", "Fengbo Ma", "Zhangchen Xu", "Yuetai Li", "Bhaskar Ramasubramanian", "Luyao Niu", "Bo Li", "Xianyan Chen", "Zhen Xiang", "Radha Poovendran", ], ) for item in data["train"] ] seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts) return seed_prompt_dataset