# Source code for pyrit.prompt_converter.audio_echo_converter

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import io
import logging
from typing import Any, Literal

import numpy as np
from scipy.io import wavfile

from pyrit.models import PromptDataType, data_serializer_factory
from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter

logger = logging.getLogger(__name__)



# [docs]
class AudioEchoConverter(PromptConverter):
    """
    Adds an echo effect to an audio file.

    The echo is created by mixing a single delayed, attenuated copy of the
    signal back into the original. The delay and decay parameters control the
    timing and loudness of the echo respectively. Sample rate, bit depth, and
    channel count are preserved.

    Notes:
        * Only WAV input/output is handled; the audio is parsed and written
          with ``scipy.io.wavfile``.
        * The output has the same number of samples as the input, so any part
          of the echo that would extend past the end of the clip is dropped.
        * For integer sample formats the mixed signal is clipped to the range
          of the original dtype before being cast back.
        * Each channel of a multi-channel file is processed independently.
    """

    # Prompt data types this converter accepts and produces. Presumably these
    # are what the base class's ``input_supported()`` consults -- confirm
    # against ``PromptConverter``.
    SUPPORTED_INPUT_TYPES = ("audio_path",)
    SUPPORTED_OUTPUT_TYPES = ("audio_path",)

    #: Accepted audio formats for conversion. Used as the type of the
    #: ``output_format`` constructor argument; "wav" is the only member.
    AcceptedAudioFormats = Literal["wav"]


    # [docs]
    def __init__(
        self,
        *,
        output_format: AcceptedAudioFormats = "wav",
        delay: float = 0.3,
        decay: float = 0.5,
    ) -> None:
        """
        Initialize the converter with echo parameters.

        Args:
            output_format (str): The format of the audio file, defaults to "wav".
            delay (float): The echo delay in seconds. Must be greater than 0. Defaults to 0.3.
            decay (float): The decay factor for the echo (0.0 to 1.0).
                A value of 0.0 means no echo, 1.0 means the echo is as loud as
                the original. Must be between 0 and 1 (exclusive of both).
                Defaults to 0.5.

        Raises:
            ValueError: If delay is not positive or decay is not in (0, 1).
        """
        if delay <= 0:
            raise ValueError("delay must be greater than 0.")
        if decay <= 0 or decay >= 1:
            raise ValueError("decay must be between 0 and 1 (exclusive).")
        self._output_format = output_format
        self._delay = delay
        self._decay = decay


    def _apply_echo(self, data: np.ndarray[Any, Any], sample_rate: int) -> np.ndarray[Any, Any]:
        """
        Apply echo effect to a 1-D audio signal.

        Args:
            data: 1-D numpy array of audio samples.
            sample_rate: The sample rate of the audio.

        Returns:
            numpy array with the echo applied, same length as input.
        """
        delay_samples = int(self._delay * sample_rate)
        output = data.astype(np.float64).copy()

        # Add the delayed, decayed copy
        if delay_samples < len(data):
            output[delay_samples:] += self._decay * data[: len(data) - delay_samples].astype(np.float64)

        # Clip to the valid range for the original dtype
        if np.issubdtype(data.dtype, np.integer):
            info = np.iinfo(data.dtype)
            output = np.clip(output, info.min, info.max)

        return output


    # [docs]
    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "audio_path") -> ConverterResult:
        """
        Add an echo effect to the WAV file referenced by ``prompt``.

        The file is read through a data serializer, decoded with
        ``scipy.io.wavfile``, processed channel by channel, re-encoded as WAV
        with the original sample rate and sample dtype, and saved back through
        the same serializer.

        Args:
            prompt (str): File path to the audio file to be converted.
            input_type (PromptDataType): The type of input data.

        Returns:
            ConverterResult: The result containing the converted audio file path.

        Raises:
            ValueError: If the input type is not supported.
            Exception: If there is an error during the conversion process. The
                error is logged and then re-raised unchanged.
        """
        if not self.input_supported(input_type):
            raise ValueError("Input type not supported")
        try:
            # The serializer is used both to load the source bytes and to
            # persist the processed result.
            serializer = data_serializer_factory(
                category="prompt-memory-entries",
                data_type="audio_path",
                extension=self._output_format,
                value=prompt,
            )
            raw_wav = await serializer.read_data()

            # Decode the WAV container into a sample array.
            sample_rate, samples = wavfile.read(io.BytesIO(raw_wav))
            sample_dtype = samples.dtype

            # Mono data is 1-D; otherwise each column is one channel and is
            # processed on its own before being stacked back together.
            if samples.ndim == 1:
                processed = self._apply_echo(samples, sample_rate)
            else:
                processed = np.column_stack(
                    [self._apply_echo(samples[:, ch], sample_rate) for ch in range(samples.shape[1])]
                )
            # Cast back so the written file keeps the original bit depth.
            echoed = processed.astype(sample_dtype)

            # Encode to an in-memory WAV and hand the bytes to the serializer.
            wav_buffer = io.BytesIO()
            wavfile.write(wav_buffer, sample_rate, echoed)
            await serializer.save_data(data=wav_buffer.getvalue())

            saved_path = str(serializer.value)
            logger.info(
                "Echo effect (delay=%.3fs, decay=%.2f) applied to [%s], saved to [%s]",
                self._delay,
                self._decay,
                prompt,
                saved_path,
            )

        except Exception as e:
            logger.error("Failed to apply echo effect: %s", str(e))
            raise
        return ConverterResult(output_text=saved_path, output_type=input_type)