# Source code for pyrit.prompt_converter.transparency_attack_converter

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import base64
import logging
from io import BytesIO
from pathlib import Path
from typing import Tuple

import numpy
from PIL import Image

from pyrit.models import PromptDataType, data_serializer_factory
from pyrit.prompt_converter import ConverterResult, PromptConverter

logger = logging.getLogger(__name__)


class _AdamOptimizer:
    """
    Implementation of the Adam Optimizer using NumPy. Adam optimization is a stochastic gradient
    descent method that is based on adaptive estimation of first-order and second-order moments.
    For further details, see the original paper: `"Adam: A Method for Stochastic Optimization"`
    by D. P. Kingma and J. Ba, 2014: https://arxiv.org/abs/1412.6980

    Note:
        The code is inspired by the implementation found at:
        https://github.com/xbeat/Machine-Learning/blob/main/Adam%20Optimizer%20in%20Python.md
    """

    def __init__(
        self, *, learning_rate: float = 0.001, beta_1: float = 0.9, beta_2: float = 0.999, epsilon: float = 1e-8
    ):
        """
        Initializes the Adam optimizer with specified hyperparameters.

        Args:
            learning_rate (float): The step size for each update/iteration. Default is 0.001
            beta1 (float): The exponential decay rate for the first moment estimates. Default is 0.9
            beta2 (float): The exponential decay rate for the second moment estimates. Default is 0.999
            epsilon (float): A small constant for numerical stability (to prevent division by zero).
        """
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.m: numpy.ndarray  # first moment vector
        self.v: numpy.ndarray  # second moment vector
        self.t = 0  # initialize timestep

    def update(self, *, params: numpy.ndarray, grads: numpy.ndarray) -> numpy.ndarray:
        """
        Performs a single update step using the Adam optimization algorithm.

        Args:
            params (numpy.ndarray): Current parameter values to be optimized.
            grads (numpy.ndarray): Gradients w.r.t. stochastic objective.

        Returns:
            numpy.ndarray: Updated parameter values after applying the Adam optimization step.
        """
        if self.t == 0:
            self.m = numpy.zeros_like(params)
            self.v = numpy.zeros_like(params)
        self.t += 1

        # Update biased first and second raw moment estimates
        self.m = self.beta_1 * self.m + (1 - self.beta_1) * grads
        self.v = self.beta_2 * self.v + (1 - self.beta_2) * (grads**2)

        # Compute bias-corrected first and second raw moment estimates
        m_hat = self.m / (1 - self.beta_1**self.t)
        v_hat = self.v / (1 - self.beta_2**self.t)

        params -= self.learning_rate * m_hat / (numpy.sqrt(v_hat) + self.epsilon)
        return params


class TransparencyAttackConverter(PromptConverter):
    """
    Creates a transparency attack by optimizing an alpha channel to blend attack and benign images.

    This converter takes two inputs:
        - Benign image (foreground/target): The harmless image specified during initialization.
        - Attack image (background/harmful): The potentially harmful image passed via the prompt parameter.

    The algorithm optimizes a transparency pattern so that the output PNG exhibits dual perception:
        - On white/light backgrounds: appears as the benign image.
        - On dark backgrounds: reveals the attack image content.
        - AI systems may perceive either image depending on their background processing assumptions.

    Currently, only JPEG images are supported as input. Output images will always be saved as PNG
    with transparency.

    Note:
        This converter implements the transparency attack as described in:
        `"Transparency Attacks: How Imperceptible Image Layers Can Fool AI Perception"`
        by McKee, F. and Noever, D., 2024: https://arxiv.org/abs/2401.15817

        As stated in the paper:
        `"The major limitation of the transparency attack is the low success rate when the human
        viewer's background theme is not light by default or at least a close match to the
        transparent foreground and hidden background layers. When mismatched, the background
        becomes visible to the human eye and the vision algorithm."`
    """

    @staticmethod
    def _validate_input_image(path: str) -> None:
        """
        Validates input image to ensure it is a valid JPEG file.

        Only the path is inspected (non-empty, ``.jpg``/``.jpeg`` suffix, exists on disk);
        the file content is not opened here.

        Args:
            path (str): Path of the image file to check.

        Raises:
            ValueError: If the path is empty or does not end in ``.jpg``/``.jpeg`` (case-insensitive).
            FileNotFoundError: If nothing exists at the given path.
        """
        if not path:
            raise ValueError("The image path cannot be empty.")
        if not path.lower().endswith((".jpg", ".jpeg")):
            raise ValueError(f"The file is not a JPEG: {path}")
        if not Path(path).exists():
            raise FileNotFoundError(f"The file does not exist: {path}")

    def __init__(
        self,
        *,
        benign_image_path: Path,
        size: Tuple[int, int] = (150, 150),
        steps: int = 1500,
        learning_rate: float = 0.001,
        convergence_threshold: float = 1e-6,
        convergence_patience: int = 10,
    ):
        """
        Initializes the converter with the path to a benign image and parameters for blending.

        Args:
            benign_image_path (Path): Path to the benign image file. Must be a JPEG file (.jpg or .jpeg).
            size (tuple): Size that the images will be resized to (width, height).
                It is recommended to use a size that matches aspect ratio of both attack and benign images.
                Since the original study resizes images to 150x150 pixels, this is the default size used.
                Bigger values may significantly increase computation time.
            steps (int): Number of optimization steps to perform.
                Recommended range: 100-2000 steps. Default is 1500. Generally, the higher the steps, the
                better end result you can achieve, but at the cost of increased computation time.
            learning_rate (float): Controls the magnitude of adjustments in each step (used by the Adam
                optimizer). Recommended range: 0.0001-0.01. Default is 0.001. Values close to 1 may lead to
                instability and lower quality blending, while values too low may require more steps to
                achieve a good blend.
            convergence_threshold (float): Minimum change in loss required to consider improvement.
                If the change in loss between steps is below this value, it's counted as no improvement.
                Default is 1e-6. Recommended range: 1e-6 to 1e-4.
            convergence_patience (int): Number of consecutive steps with no improvement before stopping.
                Default is 10.

        Raises:
            ValueError: If the benign image is invalid or is not in JPEG format.
            FileNotFoundError: If the benign image file does not exist.
            ValueError: If the learning rate is outside the valid range (0, 1).
            ValueError: If the size is not a tuple of two positive integers (width, height).
            ValueError: If the steps is not a positive integer.
            ValueError: If convergence threshold is not a float between 0 and 1.
            ValueError: If convergence patience is not a positive integer.
        """
        self._validate_input_image(str(benign_image_path))

        if not (0 < learning_rate < 1):
            raise ValueError(f"Learning rate must be between 0 and 1, got {learning_rate}")

        # Each dimension must really be an integer: a float such as 150.0 would pass a bare
        # positivity test and only fail later inside the image resize with an unrelated message.
        if (
            not isinstance(size, tuple)
            or len(size) != 2
            or any(not isinstance(dim, (int, numpy.integer)) or dim <= 0 for dim in size)
        ):
            raise ValueError(f"Size must be a tuple of two positive integers (width, height). Received {size}")

        if not isinstance(steps, int) or steps <= 0:
            raise ValueError(f"Steps must be a positive integer, got {steps}")

        if not (0 < convergence_threshold < 1):
            raise ValueError(f"Convergence threshold must be a float between 0 and 1, got {convergence_threshold}")

        if not isinstance(convergence_patience, int) or convergence_patience <= 0:
            raise ValueError(f"Convergence patience must be a positive integer, got {convergence_patience}")

        self.benign_image_path = benign_image_path
        self.learning_rate = learning_rate
        self.size = size
        self.steps = steps
        self.convergence_threshold = convergence_threshold
        self.convergence_patience = convergence_patience

        # The benign (target) image is the same for every conversion, so it is decoded once here.
        self._cached_benign_image = self._load_and_preprocess_image(str(benign_image_path))

    def _load_and_preprocess_image(self, path: str) -> numpy.ndarray:
        """
        Loads image, converts to grayscale, resizes, and normalizes for optimization.

        Args:
            path (str): Path of the image file to open.

        Returns:
            numpy.ndarray: float32 array of the grayscale image resized to ``self.size``,
            with pixel values divided by 255.

        Raises:
            ValueError: If opening, converting or resizing the image fails for any reason.
        """
        try:
            with Image.open(path) as img:
                img_gray = img.convert("L") if img.mode != "L" else img  # read as grayscale
                img_resized = img_gray.resize(self.size, Image.Resampling.LANCZOS)
                return numpy.array(img_resized, dtype=numpy.float32) / 255.0  # normalize to [0, 1]
        except Exception as e:
            raise ValueError(f"Failed to load and preprocess image from {path}: {e}") from e

    def _compute_mse_loss(self, blended_image: numpy.ndarray, target_tensor: numpy.ndarray) -> float:
        """Computes Mean Squared Error (MSE) loss between blended and target images."""
        return float(numpy.mean(numpy.square(blended_image - target_tensor)))

    def _create_blended_image(self, attack_image: numpy.ndarray, alpha: numpy.ndarray) -> numpy.ndarray:
        """
        Creates a blended image using the attack image and alpha transparency.

        Both inputs are scaled by 255 and cast to uint8 (the cast truncates the fractional part).

        Args:
            attack_image (numpy.ndarray): Attack image values; written to the luminance channel.
            alpha (numpy.ndarray): Transparency values; written to the alpha channel.

        Returns:
            numpy.ndarray: uint8 array of shape (height, width, 2) holding the L and A channels.
        """
        attack_image_uint8 = (attack_image * 255).astype(numpy.uint8)
        transparency_uint8 = (alpha * 255).astype(numpy.uint8)

        # Create LA image: Luminance + Alpha (grayscale with transparency)
        height, width = attack_image_uint8.shape[:2]
        la_image = numpy.zeros((height, width, 2), dtype=numpy.uint8)
        la_image[:, :, 0] = attack_image_uint8  # L (Luminance)
        la_image[:, :, 1] = transparency_uint8  # A (Alpha)

        return la_image

    async def _save_blended_image(self, attack_image: numpy.ndarray, alpha: numpy.ndarray) -> str:
        """
        Saves the blended image with transparency as a PNG file.

        Args:
            attack_image (numpy.ndarray): Attack image values used for the luminance channel.
            alpha (numpy.ndarray): Transparency values used for the alpha channel.

        Returns:
            str: The ``value`` of the serializer after the image has been saved.

        Raises:
            ValueError: If building, encoding or storing the image fails for any reason.
        """
        try:
            img_serializer = data_serializer_factory(category="prompt-memory-entries", data_type="image_path")
            img_serializer.file_extension = "png"

            la_image = self._create_blended_image(attack_image, alpha)
            la_pil = Image.fromarray(la_image, mode="LA")
            image_buffer = BytesIO()
            la_pil.save(image_buffer, format="PNG")
            image_str = base64.b64encode(image_buffer.getvalue())

            await img_serializer.save_b64_image(data=image_str.decode())
            return img_serializer.value
        except Exception as e:
            raise ValueError(f"Failed to save blended image: {e}") from e

    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "image_path") -> ConverterResult:
        """
        Converts the given prompt by blending an attack image (potentially harmful) with a benign image.
        Uses the Novel Image Blending Algorithm from: https://arxiv.org/abs/2401.15817

        Args:
            prompt (str): The image file path to the attack image.
            input_type (PromptDataType): The type of input data. Must be "image_path".

        Returns:
            ConverterResult: The result containing path to the manipulated image with transparency.

        Raises:
            ValueError: If the input type is not supported or if the prompt is invalid.
            FileNotFoundError: If the attack image file does not exist.
        """
        if not self.input_supported(input_type):
            raise ValueError(f"Input type '{input_type}' not supported. Only 'image_path' is supported.")

        self._validate_input_image(prompt)

        background_image = self._load_and_preprocess_image(prompt)
        background_tensor = background_image * 0.5  # darkening for better blending optimization

        alpha = numpy.ones_like(background_tensor)  # optimized to determine transparency pattern
        white_background = numpy.ones_like(background_tensor)  # white canvas for blending simulation

        optimizer = _AdamOptimizer(learning_rate=self.learning_rate)
        # d(blended)/d(alpha) for blended = alpha * background + (1 - alpha) * white; constant across steps.
        grad_blended_alpha_constant = background_tensor - white_background

        prev_loss = float("inf")
        no_improvement_count = 0

        for step in range(self.steps):
            # Simulate blending: alpha=1 uses darkened attack image, alpha=0 uses white
            blended_image = alpha * background_tensor + (1 - alpha) * white_background
            current_loss = self._compute_mse_loss(blended_image, self._cached_benign_image)

            if step % 100 == 0:
                logger.debug("Step %d/%d, Loss: %.6f", step, self.steps, current_loss)

            if abs(prev_loss - current_loss) < self.convergence_threshold:
                no_improvement_count += 1
                if no_improvement_count >= self.convergence_patience:
                    logger.info(
                        "Convergence detected at step %d with loss %.8f. Stopping optimization.",
                        step,
                        current_loss,
                    )
                    break
            else:
                no_improvement_count = 0  # count only consecutive steps with no improvement

            prev_loss = current_loss

            # Chain rule: dLoss/dAlpha = dMSE/dBlended * dBlended/dAlpha
            grad_loss_blended = 2 * (blended_image - self._cached_benign_image) / blended_image.size
            grad_alpha = grad_loss_blended * grad_blended_alpha_constant

            alpha = optimizer.update(params=alpha, grads=grad_alpha)
            alpha = numpy.clip(alpha, 0.0, 1.0)  # keep alpha a valid opacity

        image_path = await self._save_blended_image(background_tensor, alpha)
        return ConverterResult(output_text=image_path, output_type="image_path")

    def input_supported(self, input_type: PromptDataType) -> bool:
        """Returns True only when ``input_type`` is "image_path"."""
        return input_type == "image_path"

    def output_supported(self, output_type: PromptDataType) -> bool:
        """Returns True only when ``output_type`` is "image_path"."""
        return output_type == "image_path"