Source code for pyrit.datasets.multilingual_vulnerability_dataset

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import pandas as pd

from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt


def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
    """
    Load the prompt CSV published with "A Framework to Assess Multilingual Vulnerabilities of LLMs"
    and wrap its rows in a SeedPromptDataset.

    The CSV is read with pandas directly from the project's GitHub repository on every call.
    Each row becomes one text SeedPrompt: the ``en`` column supplies the prompt value, the
    ``id`` column (converted to ``str``) supplies the name, and the ``type`` column is used as
    the single harm category.

    Returns:
        SeedPromptDataset: A dataset built from one SeedPrompt per CSV row.
    """
    csv_url = "https://raw.githubusercontent.com/CarsonDon/Multilingual-Vuln-LLMs/main/prompts/allprompt.csv"
    frame = pd.read_csv(csv_url)

    # Metadata that is identical for every prompt in this dataset.
    dataset_description = (
        "Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
        "Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
        "Paper: https://arxiv.org/pdf/2503.13081"
    )
    dataset_authors = (
        "Likai Tang, Niruth Bogahawatta, Yasod Ginige, "
        "Jiarui Xu, Shixuan Sun, Surangika Ranathunga, Suranga Seneviratne"
    )
    dataset_source = "https://github.com/CarsonDon/Multilingual-Vuln-LLMs"

    prompts = []
    for _, record in frame.iterrows():
        prompt = SeedPrompt(
            value=record["en"],
            data_type="text",
            name=str(record["id"]),
            dataset_name="Multilingual-Vulnerability",
            harm_categories=[record["type"]],
            description=dataset_description,
            authors=dataset_authors,
            source=dataset_source,
        )
        prompts.append(prompt)

    return SeedPromptDataset(prompts=prompts)