Source code for pyrit.prompt_target.openai.openai_chat_audio_config

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from dataclasses import dataclass
from typing import Any, Literal

# Voices supported by OpenAI Chat Completions API audio output.
# OpenAI SDK: openai/types/chat/chat_completion_audio_param.py voice field
# SDK Literal includes: alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, cedar
# SDK docstring also lists: fable, nova, onyx (we include these for completeness)
# Note: SDK uses Union[str, Literal[...]] so any string is accepted by the API.
ChatAudioVoice = Literal[
    "alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer", "verse", "marin", "cedar"
]

# Audio output formats supported by OpenAI Chat Completions API.
# OpenAI SDK: openai/types/chat/chat_completion_audio_param.py format field
# defines format: Required[Literal["wav", "aac", "mp3", "flac", "opus", "pcm16"]]
ChatAudioFormat = Literal["wav", "aac", "mp3", "flac", "opus", "pcm16"]


@dataclass
class OpenAIChatAudioConfig:
    """
    Configuration for audio output from the OpenAI Chat Completions API.

    When provided to OpenAIChatTarget, this enables audio output from models that
    support it (e.g., gpt-4o-audio-preview).

    Note: This is specific to the Chat Completions API. The Responses API does not
    support audio input or output. For real-time audio, use RealtimeTarget instead.
    """

    # The voice to use for audio output; supported values are listed in ChatAudioVoice.
    voice: ChatAudioVoice

    # The audio format for the response; supported values are listed in ChatAudioFormat.
    audio_format: ChatAudioFormat = "wav"

    # If True, historical user messages that contain both audio and text will only send
    # the text (transcript) to reduce bandwidth and token usage. The current (last) user
    # message will still include audio. Defaults to True.
    prefer_transcript_for_history: bool = True

    def to_extra_body_parameters(self) -> dict[str, Any]:
        """
        Convert the config to extra_body_parameters format for the OpenAI API.

        Returns:
            dict: Parameters to include in the request body for audio output.
        """
        return {
            "modalities": ["text", "audio"],
            "audio": {"voice": self.voice, "format": self.audio_format},
        }
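
Illustrative usage sketch (not part of the module source above): it constructs the config and shows the request-body parameters produced by to_extra_body_parameters(). The audio_config keyword on OpenAIChatTarget is an assumption for illustration; this module does not show how the target actually consumes the config.

    # Sketch only; the OpenAIChatTarget parameter name below is assumed, not taken from this module.
    from pyrit.prompt_target.openai.openai_chat_audio_config import OpenAIChatAudioConfig

    config = OpenAIChatAudioConfig(voice="alloy", audio_format="mp3")

    # Parameters merged into the Chat Completions request body:
    print(config.to_extra_body_parameters())
    # {'modalities': ['text', 'audio'], 'audio': {'voice': 'alloy', 'format': 'mp3'}}

    # Hypothetical wiring into the target (keyword name assumed):
    # target = OpenAIChatTarget(audio_config=config)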