# Source code for pyrit.datasets.multilingual_vulnerability_dataset
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import pandas as pd
from pyrit.models import SeedPromptDataset
from pyrit.models.seed_prompt import SeedPrompt
# [docs]
def fetch_multilingual_vulnerability_dataset() -> SeedPromptDataset:
    """
    Download the prompt CSV published alongside "A Framework to Assess Multilingual
    Vulnerabilities of LLMs" and wrap its rows in a SeedPromptDataset.

    One SeedPrompt is built per CSV row: the ``en`` column supplies the prompt value,
    the ``id`` column (converted to ``str``) supplies the name, and the ``type`` column
    becomes the single entry of ``harm_categories``. The description, authors and
    source metadata are the same for every prompt.

    Returns:
        SeedPromptDataset: A SeedPromptDataset containing the examples.
    """
    csv_location = "https://raw.githubusercontent.com/CarsonDon/Multilingual-Vuln-LLMs/main/prompts/allprompt.csv"
    frame = pd.read_csv(csv_location)

    # Metadata shared by every prompt in the dataset; built once up front.
    shared_description = (
        "Dataset from 'A Framework to Assess Multilingual Vulnerabilities of LLMs'. "
        "Multilingual prompts demonstrating LLM vulnerabilities, labeled by type. "
        "Paper: https://arxiv.org/pdf/2503.13081"
    )
    shared_authors = (
        "Likai Tang, Niruth Bogahawatta, Yasod Ginige, "
        "Jiarui Xu, Shixuan Sun, Surangika Ranathunga, Suranga Seneviratne"
    )

    prompts = []
    for _, record in frame.iterrows():
        prompts.append(
            SeedPrompt(
                value=record["en"],
                data_type="text",
                name=str(record["id"]),
                dataset_name="Multilingual-Vulnerability",
                harm_categories=[record["type"]],
                description=shared_description,
                authors=shared_authors,
                source="https://github.com/CarsonDon/Multilingual-Vuln-LLMs",
            )
        )

    return SeedPromptDataset(prompts=prompts)