Source code for pyrit.prompt_converter.word_doc_converter

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from __future__ import annotations

import ast
import hashlib
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from typing import Any, Optional

from docx import Document

from pyrit.common.logger import logger
from pyrit.identifiers import ComponentIdentifier
from pyrit.models import PromptDataType, SeedPrompt, data_serializer_factory
from pyrit.models.data_type_serializer import DataTypeSerializer
from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter


@dataclass
class _WordDocInjectionConfig:
    """Configuration for how to inject content into a Word document."""

    existing_docx: Optional[Path]
    placeholder: str


[docs] class WordDocConverter(PromptConverter): """ Convert a text prompt into a Word (.docx) document. This converter supports two main modes: 1. **New document generation** If no existing document is provided, the converter creates a simple `.docx` containing the rendered prompt content in a single paragraph. 2. **Placeholder-based injection into an existing document** If an ``existing_docx`` is provided, the converter searches for a literal placeholder string (for example ``{{INJECTION_PLACEHOLDER}}``) in the document's paragraphs. When the placeholder is found fully inside a single run, it is replaced with the rendered prompt content while preserving the rest of the paragraph and its formatting. .. important:: Placeholders must be fully contained within a single run. If a placeholder spans multiple runs (for example due to mixed formatting), this converter will not replace it. This limitation is intentional to avoid collapsing mixed formatting or rewriting complex run structures. Security note: This converter does **not** render Jinja2 templates from arbitrary ``.docx`` content. Templating is handled via ``SeedPrompt`` (if provided), and only the already-rendered text is injected into the document. This avoids executing untrusted Jinja2 templates from document bodies. """ SUPPORTED_INPUT_TYPES = ("text",) SUPPORTED_OUTPUT_TYPES = ("binary_path",)
[docs] def __init__( self, *, prompt_template: Optional[SeedPrompt] = None, existing_docx: Optional[Path] = None, placeholder: str = "{{INJECTION_PLACEHOLDER}}", ) -> None: """ Initialize the Word document converter. Args: prompt_template: Optional ``SeedPrompt`` template used to render the final content before injection. If provided, ``prompt`` passed to ``convert_async`` must be a string whose contents can be interpreted as the template parameters (for example, a JSON-encoded or other parseable mapping of keys to values). existing_docx: Optional path to an existing `.docx` file. When provided, the converter will search for ``placeholder`` inside the document paragraphs and replace it with the rendered content. If not provided, a new document is generated instead. placeholder: Literal placeholder text to search for in the existing document. This value must be fully contained within a single run for the replacement to succeed. Raises: FileNotFoundError: If ``existing_docx`` is provided but does not exist. ValueError: If ``placeholder`` is empty. """ super().__init__() if not placeholder: raise ValueError("Placeholder must be a non-empty string.") if existing_docx is not None and not existing_docx.is_file(): raise FileNotFoundError(f"Word document not found at: {existing_docx}") self._prompt_template = prompt_template self._injection_config = _WordDocInjectionConfig( existing_docx=existing_docx, placeholder=placeholder, )
def _build_identifier(self) -> ComponentIdentifier: """ Build identifier with template and document parameters. Returns: ComponentIdentifier: The identifier with converter-specific parameters. """ template_hash: Optional[str] = None if self._prompt_template: template_hash = hashlib.sha256(str(self._prompt_template.value).encode("utf-8")).hexdigest()[:16] existing_docx_path = None if self._injection_config.existing_docx: existing_docx_path = str(self._injection_config.existing_docx) return self._create_identifier( params={ "prompt_template_hash": template_hash, "existing_docx_path": existing_docx_path, "placeholder": self._injection_config.placeholder, } )
[docs] async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult: """ Convert the given prompt into a Word document (.docx). If ``prompt_template`` is provided, the prompt is first used to render the template via ``SeedPrompt.render_template_value``. Otherwise, the raw ``prompt`` string is used as the content. - When ``existing_docx`` is set, this content is injected into the document by replacing the configured placeholder string. - When no ``existing_docx`` is provided, a new document with a single paragraph containing the content is created. Args: prompt: The prompt or dynamic data used to generate the content. input_type: The type of input data. Must be ``"text"``. Returns: ConverterResult: Contains the path to the generated `.docx` file in ``output_text`` and ``output_type="binary_path"``. Raises: ValueError: If the input type is not supported. """ if not self.input_supported(input_type): raise ValueError("Input type not supported") content = self._prepare_content(prompt) if self._injection_config.existing_docx: doc_bytes = self._inject_into_existing_docx(content) else: doc_bytes = self._generate_new_docx(content) serializer = await self._serialize_docx_async(doc_bytes) return ConverterResult(output_text=serializer.value, output_type="binary_path")
def _prepare_content(self, prompt: str) -> str: """ Prepare the content to be injected or written to the document. If a ``SeedPrompt`` template is provided, the ``prompt`` is parsed (if necessary) as a dictionary and used to render the template. Otherwise, the raw prompt string is used. Returns: str: The rendered or raw content string. Raises: ValueError: If the prompt cannot be parsed or rendered with the template. """ if self._prompt_template: logger.debug(f"Preparing Word content with template: {self._prompt_template.value}") try: dynamic_data: dict[str, Any] = ast.literal_eval(prompt) if not isinstance(dynamic_data, dict): raise ValueError("Prompt must be a dictionary-compatible object after parsing.") rendered_content = self._prompt_template.render_template_value(**dynamic_data) logger.debug("Rendered Word template content successfully.") return rendered_content except (ValueError, SyntaxError, KeyError) as exc: logger.error("Error rendering Word template content: %s", exc) raise ValueError(f"Failed to render the prompt for Word document: {exc}") from exc logger.debug("No template provided for Word document. Using raw prompt content.") return prompt def _generate_new_docx(self, content: str) -> bytes: """ Generate a new `.docx` document containing the given content. Returns: bytes: The serialized `.docx` document bytes. """ document = Document() document.add_paragraph(content) buffer = BytesIO() document.save(buffer) buffer.seek(0) return buffer.getvalue() def _inject_into_existing_docx(self, content: str) -> bytes: """ Inject content into an existing document by replacing the placeholder. The placeholder must appear fully inside a single run; if it only exists across multiple runs, it will not be replaced. Returns: bytes: The serialized `.docx` document bytes. Raises: ValueError: If no existing_docx path was provided. """ if self._injection_config.existing_docx is None: raise ValueError( "Cannot inject into an existing Word document because no 'existing_docx' " "path was provided in the injection configuration." ) document = Document(str(self._injection_config.existing_docx)) placeholder = self._injection_config.placeholder replaced_any = False for paragraph in document.paragraphs: if placeholder not in paragraph.text: continue if self._replace_placeholder_in_paragraph(paragraph, placeholder, content): replaced_any = True if not replaced_any: logger.warning( "No placeholder '%s' found in document '%s' or placeholder spanned multiple runs.", placeholder, self._injection_config.existing_docx, ) buffer = BytesIO() document.save(buffer) buffer.seek(0) return buffer.getvalue() @staticmethod def _replace_placeholder_in_paragraph(paragraph: Any, placeholder: str, content: str) -> bool: """ Replace a placeholder inside a single run of a paragraph. This function searches all runs of a paragraph and performs a string replacement in the first run whose text contains the placeholder. It does not modify other runs, which helps preserve existing formatting. Returns: bool: True if a replacement was made, False otherwise. """ for run in paragraph.runs: if placeholder in run.text: run.text = run.text.replace(placeholder, content) return True return False async def _serialize_docx_async(self, docx_bytes: bytes) -> DataTypeSerializer: """ Serialize the generated document using a data serializer. Returns: DataTypeSerializer: The serializer containing the saved document path. """ extension = "docx" serializer = data_serializer_factory( category="prompt-memory-entries", data_type="binary_path", extension=extension, ) await serializer.save_data(docx_bytes) return serializer