Source code for pyrit.prompt_converter.token_smuggling.variation_selector_smuggler_converter

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from typing import Literal, Optional, Tuple

from pyrit.prompt_converter.token_smuggling.base import SmugglerConverter

logger = logging.getLogger(__name__)


[docs] class VariationSelectorSmugglerConverter(SmugglerConverter): """ Encodes and decodes text using Unicode Variation Selectors. Each UTF-8 byte is mapped as follows: - Bytes 0x00-0x0F are mapped to U+FE00-U+FE0F. - Bytes 0x10-0xFF are mapped to U+E0100-U+E01EF. If ``embed_in_base`` is True, the payload is concatenated with a base character (default: 😊); otherwise, a space separator is inserted. Replicates functionality detailed in: - https://paulbutler.org/2025/smuggling-arbitrary-data-through-an-emoji/ Extension: In addition to embedding into a base character, we also support appending invisible variation selectors directly to visible text—enabling mixed visible and hidden content within a single string. """
[docs] def __init__( self, action: Literal["encode", "decode"] = "encode", base_char_utf8: Optional[str] = None, embed_in_base: bool = True, ): """ Initializes the converter with options for encoding/decoding. Args: action (Literal["encode", "decode"]): The action to perform. base_char_utf8 (Optional[str]): Base character for ``variation_selector_smuggler`` mode (default: 😊). embed_in_base (bool): If True, the hidden payload is embedded directly into the base character. If False, a visible separator (space) is inserted between the base and payload. Default is True. Raises: ValueError: If an unsupported action or ``encoding_mode`` is provided. """ super().__init__(action=action) self.utf8_base_char = base_char_utf8 if base_char_utf8 is not None else "😊" self.embed_in_base = embed_in_base
[docs] def encode_message(self, message: str) -> Tuple[str, str]: """ Encodes the message using Unicode variation selectors. The message is converted to UTF-8 bytes, and each byte is mapped to a variation selector: - 0x00-0x0F => U+FE00 to U+FE0F. - 0x10-0xFF => U+E0100 to U+E01EF. If ``embed_in_base`` is True, the payload is embedded directly into the base character; otherwise, a visible separator (a space) is inserted between the base and payload. """ payload = "" data = message.encode("utf-8") for byte in data: if byte < 16: code_point = 0xFE00 + byte else: code_point = 0xE0100 + (byte - 16) payload += chr(code_point) if self.embed_in_base: encoded = self.utf8_base_char + payload else: encoded = self.utf8_base_char + " " + payload summary_parts = [f"Base char: U+{ord(self.utf8_base_char):X}"] for byte in data: if byte < 16: summary_parts.append(f"U+{(0xFE00 + byte):X}") else: summary_parts.append(f"U+{(0xE0100 + (byte - 16)):X}") code_points_summary = " ".join(summary_parts) logger.info(f"Variation Selector Smuggler encoding complete: {len(data)} bytes encoded.") return code_points_summary.strip(), encoded
[docs] def decode_message(self, message: str) -> str: """ Decodes a message encoded using Unicode variation selectors. The decoder scans the string for variation selectors, ignoring any visible separator. """ bytes_out = bytearray() started = False for char in message: # If not embedding, skip visible separators (e.g., spaces) if not self.embed_in_base and char == " ": continue code = ord(char) if 0xFE00 <= code <= 0xFE0F: started = True byte = code - 0xFE00 bytes_out.append(byte) elif 0xE0100 <= code <= 0xE01EF: started = True byte = (code - 0xE0100) + 16 bytes_out.append(byte) else: if started: break try: decoded_text = bytes_out.decode("utf-8") except UnicodeDecodeError: decoded_text = bytes_out.decode("utf-8", errors="replace") logger.error("Decoded byte sequence is not valid UTF-8; some characters may be replaced.") logger.info(f"Variation Selector Smuggler decoding complete: {len(decoded_text)} characters decoded.") return decoded_text
# Extension of Paul Butler's method
[docs] def encode_visible_hidden(self, visible: str, hidden: str) -> Tuple[str, str]: """ Combines visible text with hidden text by encoding the hidden text using ``variation_selector_smuggler`` mode. The hidden payload is generated as a composite using the current embedding setting and then appended to the visible text. Args: visible (str): The visible text. hidden (str): The secret/hidden text to encode. Returns: Tuple[str, str]: A tuple containing a summary and the combined text. """ summary, encoded_hidden = self.encode_message(hidden) combined = visible + encoded_hidden return summary, combined
# Extension of Paul Butler's method
[docs] def decode_visible_hidden(self, combined: str) -> Tuple[str, str]: """ Extracts the visible text and decodes the hidden text from a combined string. It searches for the first occurrence of the base character (``self.utf8_base_char``) and treats everything from that point on as the hidden payload. Args: combined (str): The combined text containing visible and hidden parts. Returns: Tuple[str, str]: A tuple with the visible text and the decoded hidden text. """ base_char = self.utf8_base_char index = combined.find(base_char) if index == -1: return combined, "" visible = combined[:index] hidden_encoded = combined[index:] hidden = self.decode_message(hidden_encoded) return visible, hidden