Source code for pyrit.score.self_ask_general_scorer

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
from typing import Literal, Optional

from pyrit.models import PromptRequestPiece
from pyrit.models.score import Score, UnvalidatedScore
from pyrit.prompt_target import PromptChatTarget
from pyrit.score.scorer import Scorer


[docs] class SelfAskGeneralScorer(Scorer): """ A general scorer that uses a chat target to score a prompt request piece. It can be configured to use different scoring types (e.g., true/false, float scale) It can also format the prompt using a system-level prompt and a format string. Parameters: chat_target (PromptChatTarget): The chat target to use for scoring. system_prompt (str): The system-level prompt that guides the behavior of the target LLM. Defaults to None. This can be a string or a format string template with placeholders for task and request_response. The system prompt or prompt or request_response needs to specify a JSON output format for the response prompt_format_string (str): A format string template for the prompt. Defaults to None. This is a string that can be formatted with task, request_response, and prompt scorer_type (str): The type of scorer (e.g., "true_false", "float_scale"). Defaults to float_scale. score_value_output_key (str): The key in the JSON response that contains the score value. Defaults to "score_value". rationale_output_key (str): The key in the JSON response that contains the rationale. Defaults to "rationale". description_output_key (str): The key in the JSON response that contains the description. Defaults to "description". metadata_output_key (str): The key in the JSON response that contains the metadata. Defaults to "metadata". category_output_key (str): The key in the JSON response that contains the category. Defaults to "category". category (list): A list of categories for the score. Defaults to None. labels (list): A list of labels for the score. Defaults to None. min_value (int): The minimum value for float scale scoring. Defaults to 0. max_value (int): The maximum value for float scale scoring. Defaults to 100. """
[docs] def __init__( self, chat_target: PromptChatTarget, system_prompt_format_string: Optional[str] = None, prompt_format_string: Optional[str] = None, scorer_type: Literal["true_false", "float_scale"] = "float_scale", score_value_output_key: str = "score_value", rationale_output_key: str = "rationale", description_output_key: str = "description", metadata_output_key: str = "metadata", category_output_key: str = "category", category: Optional[list] = None, labels: Optional[list] = None, min_value: int = 0, max_value: int = 100, ) -> None: self._prompt_target = chat_target self._system_prompt = system_prompt_format_string self._prompt_format_string = prompt_format_string if scorer_type != "true_false" and scorer_type != "float_scale": raise ValueError( f"Scorer type {scorer_type} is not a valid scorer type. Options are true_false or float_scale." ) self.scorer_type = scorer_type # Convert category list to string for storage compatibility self._score_category = None if category is not None: self._score_category = json.dumps(category) if isinstance(category, list) else str(category) self.labels = labels self._min_value = min_value self._max_value = max_value self._score_value_output_key = score_value_output_key self._rationale_output_key = rationale_output_key self._description_output_key = description_output_key self._metadata_output_key = metadata_output_key self._category_output_key = category_output_key
async def _score_async(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> list[Score]: prompt = request_response.converted_value system_prompt = self._system_prompt.format(task=task, prompt=prompt, request_response=request_response) if self._prompt_format_string: prompt = self._prompt_format_string.format(task=task, prompt=prompt, request_response=request_response) unvalidated_score: UnvalidatedScore = await self._score_value_with_llm( prompt_target=self._prompt_target, system_prompt=system_prompt, prompt_request_value=prompt, prompt_request_data_type=request_response.converted_value_data_type, scored_prompt_id=request_response.id, category=self._score_category, task=task, orchestrator_identifier=request_response.orchestrator_identifier, score_value_output_key=self._score_value_output_key, rationale_output_key=self._rationale_output_key, description_output_key=self._description_output_key, metadata_output_key=self._metadata_output_key, category_output_key=self._category_output_key, ) if self.scorer_type == "true_false": score = unvalidated_score.to_score(score_value=unvalidated_score.raw_score_value) elif self.scorer_type == "float_scale": score = unvalidated_score.to_score( score_value=str( self.scale_value_float(float(unvalidated_score.raw_score_value), self._min_value, self._max_value) ) ) return [score]
[docs] def validate(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> None: pass