# Source code for pyrit.prompt_converter.selective_text_converter

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.


from pyrit.models import PromptDataType
from pyrit.prompt_converter import PromptConverter
from pyrit.prompt_converter.prompt_converter import ConverterResult
from pyrit.prompt_converter.text_selection_strategy import (
    AllWordsSelectionStrategy,
    TextSelectionStrategy,
    TokenSelectionStrategy,
    WordSelectionStrategy,
)
from pyrit.prompt_converter.word_level_converter import WordLevelConverter


class SelectiveTextConverter(PromptConverter):
    """
    A wrapper converter that applies another converter to selected portions of text.

    This converter supports multiple selection strategies:

    - Character-level: Selects a contiguous character range
      (e.g., IndexSelectionStrategy, RegexSelectionStrategy)
    - Word-level: Selects specific words
      (e.g., WordIndexSelectionStrategy, WordPositionSelectionStrategy)
    - Token-based: Auto-detects and converts text between ⟪⟫ tokens
      (TokenSelectionStrategy)

    Most use cases will use word-level strategies for more intuitive selection.

    Example:
        >>> from pyrit.prompt_converter import Base64Converter, SelectiveTextConverter
        >>> from pyrit.prompt_converter.text_selection_strategy import WordRegexSelectionStrategy
        >>>
        >>> # Convert only words matching a pattern
        >>> strategy = WordRegexSelectionStrategy(pattern=r"\\d+")
        >>> converter = SelectiveTextConverter(
        ...     converter=Base64Converter(),
        ...     selection_strategy=strategy,
        ...     preserve_tokens=True
        ... )
        >>> result = await converter.convert_async(
        ...     prompt="The code is 12345 here"
        ... )
        >>> # Result: "The code is ⟪MTIzNDU=⟫ here"
    """

    def __init__(
        self,
        *,
        converter: PromptConverter,
        selection_strategy: TextSelectionStrategy,
        preserve_tokens: bool = False,
        start_token: str = "⟪",
        end_token: str = "⟫",
        word_separator: str = " ",
    ) -> None:
        """
        Initializes the selective text converter.

        Args:
            converter (PromptConverter): The converter to apply to the selected text.
            selection_strategy (TextSelectionStrategy): The strategy for selecting which text
                to convert. Can be character-level or word-level strategy.
            preserve_tokens (bool): If True, wraps converted text with start/end tokens.
                This allows subsequent converters in a chain to target different regions.
                Defaults to False.
            start_token (str): The token to place before converted text when
                preserve_tokens=True. Defaults to "⟪".
            end_token (str): The token to place after converted text when
                preserve_tokens=True. Defaults to "⟫".
            word_separator (str): The separator to use when working with word-level
                strategies. Defaults to " ".

        Raises:
            ValueError: If the wrapped converter does not support text input/output.
            ValueError: If a word-level selection_strategy is used with a WordLevelConverter
                that has a non-default word_selection_strategy. When SelectiveTextConverter
                uses a WordSelectionStrategy, it passes individual words to the wrapped
                converter, making the wrapped converter's word selection strategy meaningless.
        """
        super().__init__()

        # Reject unusable converter/strategy combinations before storing any state.
        self._validate_converter(converter=converter, selection_strategy=selection_strategy)

        self._converter = converter
        self._selection_strategy = selection_strategy
        self._preserve_tokens = preserve_tokens
        self._start_token = start_token
        self._end_token = end_token
        self._word_separator = word_separator

        # Cache the strategy kind once; convert_async dispatches on these flags.
        self._is_word_level = isinstance(selection_strategy, WordSelectionStrategy)
        self._is_token_based = isinstance(selection_strategy, TokenSelectionStrategy)

    def _validate_converter(
        self,
        *,
        converter: PromptConverter,
        selection_strategy: TextSelectionStrategy,
    ) -> None:
        """
        Validates the converter and selection strategy combination.

        Args:
            converter (PromptConverter): The converter to validate.
            selection_strategy (TextSelectionStrategy): The selection strategy to validate against.

        Raises:
            ValueError: If the converter does not support text input/output.
            ValueError: If a word-level selection strategy is used with a WordLevelConverter
                that has a non-default word_selection_strategy.
        """
        if not converter.input_supported("text"):
            raise ValueError(f"The converter {converter.__class__.__name__} does not support text input")

        if not converter.output_supported("text"):
            raise ValueError(f"The converter {converter.__class__.__name__} does not support text output")

        # Check for conflicting word selection strategies
        is_word_level_selection = isinstance(selection_strategy, WordSelectionStrategy)
        if is_word_level_selection and isinstance(converter, WordLevelConverter):
            has_non_default_strategy = not isinstance(converter._word_selection_strategy, AllWordsSelectionStrategy)
            if has_non_default_strategy:
                raise ValueError(
                    f"Cannot use a WordSelectionStrategy with a {converter.__class__.__name__} that has a "
                    f"non-default word_selection_strategy. When SelectiveTextConverter uses a word-level "
                    f"strategy, it passes individual words to the wrapped converter, making the wrapped "
                    f"converter's word selection strategy meaningless. Either use a character-level "
                    f"selection strategy, or remove the word_selection_strategy from the wrapped converter."
                )

    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
        """
        Converts selected portions of the prompt using the wrapped converter.

        Args:
            prompt (str): The prompt to be converted.
            input_type (PromptDataType): The type of input data. Must be "text".

        Returns:
            ConverterResult: The result containing the converted output and its type.

        Raises:
            ValueError: If the input type is not "text".
        """
        if input_type != "text":
            raise ValueError(f"SelectiveTextConverter only supports text input, got {input_type}")

        # If using TokenSelectionStrategy, delegate to convert_tokens_async
        if self._is_token_based:
            result = await self._converter.convert_tokens_async(
                prompt=prompt,
                input_type="text",
                start_token=self._start_token,
                end_token=self._end_token,
            )
            # When tokens must be preserved but the start token is absent from the
            # converted output, re-wrap the whole output with the start/end tokens.
            if self._preserve_tokens and self._start_token not in result.output_text:
                result = ConverterResult(
                    output_text=f"{self._start_token}{result.output_text}{self._end_token}",
                    output_type="text",
                )
            return result

        if self._is_word_level:
            return await self._convert_word_level_async(prompt=prompt)
        return await self._convert_char_level_async(prompt=prompt)

    async def _convert_word_level_async(self, *, prompt: str) -> ConverterResult:
        """
        Converts selected words using word-level selection strategy.

        The prompt is split on the configured word separator, the strategy picks
        word indices, each picked word is converted individually, and the words are
        re-joined with the same separator.

        Args:
            prompt (str): The prompt whose words are candidates for conversion.

        Returns:
            ConverterResult: The prompt with the selected words replaced by their
            converted (and optionally token-wrapped) form, or the unchanged prompt
            when the strategy selects nothing.
        """
        words = prompt.split(self._word_separator)

        # Get selected word indices
        selected_indices = self._selection_strategy.select_words(words=words)  # type: ignore

        # If no words selected, return original prompt
        if not selected_indices:
            return ConverterResult(output_text=prompt, output_type="text")

        # Convert selected words in place so unselected words keep their position
        for idx in selected_indices:
            conversion_result = await self._converter.convert_async(prompt=words[idx], input_type="text")
            converted_word = conversion_result.output_text
            if self._preserve_tokens:
                converted_word = f"{self._start_token}{converted_word}{self._end_token}"
            words[idx] = converted_word

        final_text = self._word_separator.join(words)
        return ConverterResult(output_text=final_text, output_type="text")

    async def _convert_char_level_async(self, *, prompt: str) -> ConverterResult:
        """
        Converts a character range using character-level selection strategy.

        Args:
            prompt (str): The prompt containing the region to convert.

        Returns:
            ConverterResult: The prompt with the selected range replaced by its
            converted (and optionally token-wrapped) form, or the unchanged prompt
            when the selected range contains no characters.
        """
        start_idx, end_idx = self._selection_strategy.select_range(text=prompt)
        selected_text = prompt[start_idx:end_idx]

        # Nothing selected: return the original prompt. Checking the slice itself
        # (rather than start_idx == end_idx) also covers inverted or out-of-bounds
        # ranges, where prompt[:start_idx] and prompt[end_idx:] would otherwise
        # overlap and duplicate text around a conversion of the empty string.
        if not selected_text:
            return ConverterResult(output_text=prompt, output_type="text")

        before_text = prompt[:start_idx]
        after_text = prompt[end_idx:]

        # Convert the selected region
        conversion_result = await self._converter.convert_async(prompt=selected_text, input_type="text")
        converted_text = conversion_result.output_text

        if self._preserve_tokens:
            converted_text = f"{self._start_token}{converted_text}{self._end_token}"

        final_text = f"{before_text}{converted_text}{after_text}"
        return ConverterResult(output_text=final_text, output_type="text")

    def input_supported(self, input_type: PromptDataType) -> bool:
        """
        Checks if the input type is supported.

        Args:
            input_type (PromptDataType): The input type to check.

        Returns:
            bool: True if the input type is "text", False otherwise.
        """
        return input_type == "text"

    def output_supported(self, output_type: PromptDataType) -> bool:
        """
        Checks if the output type is supported.

        Args:
            output_type (PromptDataType): The output type to check.

        Returns:
            bool: True if the output type is "text", False otherwise.
        """
        return output_type == "text"