Source code for pyrit.score.scorer_identifier
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations

import copy
import hashlib
import json
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
[docs]
@dataclass
class ScorerIdentifier:
    """
    Configuration class for Scorers.

    This class encapsulates the modifiable parameters that can be used to create a
    complete scoring configuration. These parameters can be modified, and
    configurations can be compared to each other via scorer evaluations.

    The compact/hashable dictionary form produced by this class never shares
    mutable state with the instance: nested dictionaries are deep-copied on the
    way out (``to_compact_dict``) and on the way in (``from_compact_dict``).
    """

    # Scorer type name; serialized under the "__type__" key.
    type: str
    # Prompt templates; values longer than 100 characters are replaced by a
    # truncated SHA-256 marker in the compact form.
    system_prompt_template: Optional[str] = None
    user_prompt_template: Optional[str] = None
    # Nested identifiers, serialized recursively.
    sub_identifier: Optional[List["ScorerIdentifier"]] = None
    target_info: Optional[Dict[str, Any]] = None
    score_aggregator: Optional[str] = None
    scorer_specific_params: Optional[Dict[str, Any]] = None

    def compute_hash(self, hashable_dict: Optional[Dict[str, Any]] = None) -> str:
        """
        Compute a hash representing the current configuration.

        Args:
            hashable_dict: Pre-computed hashable dict to avoid recomputation.
                If None, _to_hashable_dict() will be called.

        Returns:
            str: The SHA-256 hex digest of the JSON-encoded configuration.

        Raises:
            TypeError: If the dictionary contains values that ``json.dumps``
                cannot serialize.
        """
        if hashable_dict is None:
            hashable_dict = self._to_hashable_dict()

        # Sorted keys and fixed separators make the JSON text (and therefore the
        # digest) independent of dict insertion order.
        config_json = json.dumps(hashable_dict, sort_keys=True, separators=(",", ":"))
        return hashlib.sha256(config_json.encode("utf-8")).hexdigest()

    def to_compact_dict(self) -> Dict[str, Any]:
        """
        Convert the ScorerIdentifier to a compact dictionary for storage.

        Long prompts (>100 characters) are hashed to sha256:{hash[:16]} format.
        Nested sub_identifiers are recursively compacted (without their own
        "hash" entries). Includes the computed hash of the configuration under
        the "hash" key.

        Returns:
            Dict[str, Any]: A compact dictionary representation with hash. The
            result is independent of this instance; mutating it does not change
            the identifier.
        """
        result = self._to_hashable_dict()
        # The hash is computed before the "hash" key is added, so it covers only
        # the configuration fields.
        result["hash"] = self.compute_hash(hashable_dict=result)
        return result

    @staticmethod
    def _compact_prompt(prompt: Optional[str]) -> Optional[str]:
        """
        Return the prompt unchanged, or a short hash marker if it is long.

        Args:
            prompt: The prompt template text, or None.

        Returns:
            Optional[str]: ``prompt`` itself when it is None, empty, or at most
            100 characters; otherwise ``"sha256:"`` followed by the first 16 hex
            characters of the prompt's SHA-256 digest.
        """
        if prompt and len(prompt) > 100:
            digest = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
            return f"sha256:{digest[:16]}"
        return prompt

    def _to_hashable_dict(self) -> Dict[str, Any]:
        """
        Convert to a dictionary suitable for hashing (without the hash field).

        Long prompts (>100 characters) are hashed to sha256:{hash[:16]} format.
        Nested sub_identifiers are recursively compacted.

        Returns:
            Dict[str, Any]: A dictionary representation without hash. Nested
            dictionaries are deep copies, so the result does not alias this
            instance's ``target_info`` or ``scorer_specific_params``.
        """
        # Sub-identifiers are serialized without their own hash so the parent
        # hash depends only on configuration values.
        sub_id_serialized: Any = None
        if self.sub_identifier is not None:
            sub_id_serialized = [si._to_hashable_dict() for si in self.sub_identifier]

        return {
            "__type__": self.type,
            "system_prompt_template": self._compact_prompt(self.system_prompt_template),
            "user_prompt_template": self._compact_prompt(self.user_prompt_template),
            "sub_identifier": sub_id_serialized,
            # Deep copies keep callers that edit the returned dict from silently
            # altering this identifier (and thereby its hash).
            "target_info": copy.deepcopy(self.target_info),
            "score_aggregator": self.score_aggregator,
            "scorer_specific_params": copy.deepcopy(self.scorer_specific_params),
        }

    @classmethod
    def from_compact_dict(cls, data: Dict[str, Any]) -> "ScorerIdentifier":
        """
        Create a ScorerIdentifier from a compact dictionary (as stored in JSONL).

        Handles the __type__ -> type field mapping and recursively reconstructs
        nested sub_identifiers. Any "hash" entry in ``data`` is ignored.

        Note:
            This is a LOSSY reconstruction. Long prompts (>100 chars) are stored as
            "sha256:{hash[:16]}" and the original text is not recoverable. Hash-based
            lookups still work correctly since compute_hash() is deterministic and
            the stored marker is short enough to pass through unchanged when the
            reconstructed identifier is hashed again.

        Args:
            data (Dict[str, Any]): The compact dictionary representation.

        Returns:
            ScorerIdentifier: A new ScorerIdentifier instance (with possibly hashed
            prompt fields). It does not share nested dictionaries with ``data``.
        """
        # Prefer "__type__", fall back to "type"; a missing or None value in
        # both yields an empty string so the field is always a str.
        scorer_type = data.get("__type__") or data.get("type") or ""

        sub_identifier = None
        sub_id_data = data.get("sub_identifier")
        if sub_id_data is not None:
            sub_identifier = [cls.from_compact_dict(si) for si in sub_id_data]

        return cls(
            type=scorer_type,
            system_prompt_template=data.get("system_prompt_template"),
            user_prompt_template=data.get("user_prompt_template"),
            sub_identifier=sub_identifier,
            # Copy nested dicts so later edits to `data` cannot change this instance.
            target_info=copy.deepcopy(data.get("target_info")),
            score_aggregator=data.get("score_aggregator"),
            scorer_specific_params=copy.deepcopy(data.get("scorer_specific_params")),
        )