Source code for pyrit.prompt_converter.token_smuggling.ascii_smuggler_converter

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from typing import Literal

from pyrit.prompt_converter.token_smuggling.base import SmugglerConverter

logger = logging.getLogger(__name__)


[docs] class AsciiSmugglerConverter(SmugglerConverter): """ Implements encoding and decoding using Unicode Tags. If 'control' is True, the encoded output is wrapped with: - U+E0001 (start control tag) - U+E007F (end control tag) Replicates the functionality detailed in the following blog post: https://embracethered.com/blog/posts/2024/hiding-and-finding-text-with-unicode-tags/ """
[docs] def __init__(self, action: Literal["encode", "decode"] = "encode", unicode_tags: bool = False): """ Initialize the converter with options for encoding/decoding. Args: unicode_tags (bool): Whether to add Unicode tags during encoding. """ self.unicode_tags = unicode_tags super().__init__(action=action)
[docs] def encode_message(self, *, message: str): """ Encodes the message using Unicode Tags. Each ASCII printable character (0x20-0x7E) is mapped to a corresponding Unicode Tag (by adding 0xE0000). If control mode is enabled, wraps the output. Args: message (str): The message to encode. Returns: Tuple[str, str]: A tuple with a summary of code points and the encoded message. """ encoded = "" code_points = "" invalid_chars = "" if self.unicode_tags: encoded += chr(0xE0001) code_points += "U+E0001 " for char in message: if 0x20 <= ord(char) <= 0x7E: code_point = 0xE0000 + ord(char) encoded += chr(code_point) code_points += f"U+{code_point:X} " else: invalid_chars += char if self.unicode_tags: encoded += chr(0xE007F) code_points += "U+E007F" if invalid_chars: logger.error(f"Invalid characters detected: {invalid_chars}") return code_points, encoded
[docs] def decode_message(self, *, message: str): """ Decodes a message encoded with Unicode Tags. For each character in the Unicode Tags range, subtracts 0xE0000. Skips control tags if present. Args: message (str): The encoded message. Returns: str: The decoded message. """ decoded_message = "" for char in message: code_point = ord(char) if 0xE0000 <= code_point <= 0xE007F: decoded_char = chr(code_point - 0xE0000) if not 0x20 <= ord(decoded_char) <= 0x7E: logger.info(f"Potential unicode tag detected: {decoded_char}") else: decoded_message += decoded_char if len(decoded_message) != len(message): logger.info("Hidden Unicode Tags discovered.") else: logger.info("No hidden Unicode Tag characters discovered.") return decoded_message