# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import abc
import random
import re
from typing import List, Optional, Pattern, Union
[docs]
class TextSelectionStrategy(abc.ABC):
    """
    Abstract interface for choosing which part of a text should be converted.

    According to the module's own documentation, SelectiveTextConverter and
    WordLevelConverter rely on implementations of this interface to decide
    which region of text (or which words) to convert.
    """

    @abc.abstractmethod
    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Pick the span of characters in ``text`` that should be converted.

        Args:
            text (str): The text to choose a span from.

        Returns:
            tuple[int, int]: ``(start_index, end_index)`` of the chosen span, where
                ``start_index`` is inclusive and ``end_index`` is exclusive.
        """
[docs]
class TokenSelectionStrategy(TextSelectionStrategy):
    """
    Marker strategy telling SelectiveTextConverter to auto-detect and convert the
    text found between start/end tokens (e.g., ⟪ and ⟫).

    It is intended for chaining converters with preserve_tokens=True: rather than
    computing a selection itself, it relies on tokens that a previous converter
    already left in the text.

    Example:
        >>> first_converter = SelectiveTextConverter(
        ...     converter=Base64Converter(),
        ...     selection_strategy=WordPositionSelectionStrategy(start_proportion=0.5, end_proportion=1.0),
        ...     preserve_tokens=True
        ... )
        >>> # Text after first converter: "hello world ⟪Y29udmVydGVk⟫"
        >>>
        >>> second_converter = SelectiveTextConverter(
        ...     converter=ROT13Converter(),
        ...     selection_strategy=TokenSelectionStrategy(),  # Auto-detect tokens
        ...     preserve_tokens=True
        ... )
    """

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Return a fixed empty range; this strategy does not compute a selection.

        Token detection is documented as being handled separately by
        SelectiveTextConverter, so the input is not inspected here.

        Args:
            text (str): The input text (ignored).

        Returns:
            tuple[int, int]: Always ``(0, 0)``.
        """
        empty_range = (0, 0)
        return empty_range
[docs]
class WordSelectionStrategy(TextSelectionStrategy):
    """
    Base class for strategies that select whole words rather than raw characters.

    Subclasses implement ``select_words()`` to choose word indices. This class
    implements ``select_range()`` on top of it by translating the chosen word
    indices into a single character range.
    """

    @abc.abstractmethod
    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Choose which words should be converted.

        Args:
            words (List[str]): The words to choose from.

        Returns:
            List[int]: Indices into ``words`` identifying the words to convert.
        """

    def select_range(self, *, text: str, word_separator: str = " ") -> tuple[int, int]:
        """
        Select the character range spanning the words chosen by ``select_words()``.

        The text is split on ``word_separator`` and the resulting words are passed to
        ``select_words()``. The returned range runs from the first character of the
        lowest selected word to the last character of the highest selected word, so
        any unselected words lying between them are included as well.

        Indices returned by ``select_words()`` that do not address an existing word
        (negative, or >= the number of words) are ignored. Without this, an
        out-of-bounds index would yield a range whose end precedes its start.

        Args:
            text (str): The input text to select from.
            word_separator (str): The separator used to split words. Defaults to " ".

        Returns:
            tuple[int, int]: ``(start_index, end_index)`` covering all selected words,
                or ``(0, 0)`` when no valid word index is selected.
        """
        words = text.split(word_separator)
        word_count = len(words)

        # Keep only indices that actually point at a word.
        in_bounds = [idx for idx in self.select_words(words=words) if 0 <= idx < word_count]
        if not in_bounds:
            return (0, 0)

        first_idx = min(in_bounds)
        last_idx = max(in_bounds)
        separator_len = len(word_separator)

        # Every word before an index contributes its own length plus one separator.
        start_char = sum(len(word) for word in words[:first_idx]) + first_idx * separator_len
        # The end offset includes the last selected word itself but not a trailing separator.
        end_char = sum(len(word) for word in words[: last_idx + 1]) + last_idx * separator_len
        return (start_char, end_char)
[docs]
class IndexSelectionStrategy(TextSelectionStrategy):
    """
    Strategy that selects a fixed span given by absolute character indices.
    """

    def __init__(self, *, start: int = 0, end: Optional[int] = None) -> None:
        """
        Store the requested character bounds.

        Args:
            start (int): Index of the first selected character (inclusive). Defaults to 0.
            end (Optional[int]): Index one past the last selected character (exclusive).
                ``None`` means "up to the end of the text".
        """
        self._start, self._end = start, end

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Return the configured bounds, clamped so they fit inside ``text``.

        Args:
            text (str): The input text to select from.

        Returns:
            tuple[int, int]: ``(start_index, end_index)`` with
                ``0 <= start_index <= end_index <= len(text)``.
        """
        text_len = len(text)
        requested_end = text_len if self._end is None else self._end

        # Clamp the start into [0, len(text)].
        lower = min(self._start, text_len)
        if lower < 0:
            lower = 0

        # Clamp the end into [start, len(text)] so the range is never inverted.
        upper = min(requested_end, text_len)
        if upper < lower:
            upper = lower

        return (lower, upper)
[docs]
class RegexSelectionStrategy(TextSelectionStrategy):
    """
    Strategy that selects the span of the first match of a regular expression.
    """

    def __init__(self, *, pattern: Union[str, Pattern]) -> None:
        """
        Store the pattern, compiling it first when it is given as a string.

        Args:
            pattern (Union[str, Pattern]): The regex to search for, either as source
                text or as an already compiled pattern.
        """
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        self._pattern = pattern

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Locate the first match of the pattern in ``text``.

        Args:
            text (str): The input text to select from.

        Returns:
            tuple[int, int]: ``(start_index, end_index)`` of the first match,
                or ``(0, 0)`` when the pattern does not match.
        """
        found = self._pattern.search(text)
        if not found:
            return (0, 0)
        return (found.start(), found.end())
[docs]
class KeywordSelectionStrategy(TextSelectionStrategy):
    """
    Selects the first occurrence of a keyword, optionally widened by surrounding context.
    """

    def __init__(
        self,
        *,
        keyword: str,
        context_before: int = 0,
        context_after: int = 0,
        case_sensitive: bool = True,
    ) -> None:
        """
        Initializes the keyword selection strategy.

        Args:
            keyword (str): The keyword to search for.
            context_before (int): Number of characters to include before the keyword. Defaults to 0.
            context_after (int): Number of characters to include after the keyword. Defaults to 0.
            case_sensitive (bool): Whether the keyword search is case-sensitive. Defaults to True.
        """
        self._keyword = keyword
        self._context_before = context_before
        self._context_after = context_after
        self._case_sensitive = case_sensitive

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Selects the range around the first occurrence of the keyword.

        Case-insensitive searches are run against the original text using an escaped
        ``re.IGNORECASE`` pattern. Searching a lowercased copy instead is unsafe:
        lowercasing can change the length of a string for some characters, so an
        offset found in the copy may not line up with the original text.

        Args:
            text (str): The input text to select from.

        Returns:
            tuple[int, int]: A tuple of (start_index, end_index) including context,
                clamped to the bounds of ``text``, or (0, 0) if the keyword is not found.
        """
        if self._case_sensitive:
            match_start = text.find(self._keyword)
            if match_start == -1:
                return (0, 0)
            match_end = match_start + len(self._keyword)
        else:
            # re.escape makes the keyword match literally, never as regex syntax.
            found = re.search(re.escape(self._keyword), text, flags=re.IGNORECASE)
            if found is None:
                return (0, 0)
            # Use the span of what actually matched in the original text.
            match_start, match_end = found.start(), found.end()

        start = max(0, match_start - self._context_before)
        end = min(len(text), match_end + self._context_after)
        return (start, end)
[docs]
class PositionSelectionStrategy(TextSelectionStrategy):
    """
    Strategy that selects the characters lying between two relative positions in the text.
    """

    def __init__(self, *, start_proportion: float, end_proportion: float) -> None:
        """
        Validate and store the relative bounds of the selection.

        Args:
            start_proportion (float): Where the selection begins, as a fraction of the
                text length (0.0 to 1.0).
            end_proportion (float): Where the selection ends, as a fraction of the
                text length (0.0 to 1.0).

        Raises:
            ValueError: If either proportion lies outside 0.0 to 1.0, or if
                ``start_proportion`` is not strictly less than ``end_proportion``.
        """
        start_in_unit_interval = 0.0 <= start_proportion <= 1.0
        if not start_in_unit_interval:
            raise ValueError(f"start_proportion must be between 0.0 and 1.0, got {start_proportion}")

        end_in_unit_interval = 0.0 <= end_proportion <= 1.0
        if not end_in_unit_interval:
            raise ValueError(f"end_proportion must be between 0.0 and 1.0, got {end_proportion}")

        if start_proportion >= end_proportion:
            raise ValueError(
                f"start_proportion ({start_proportion}) must be less than end_proportion ({end_proportion})"
            )

        self._start_proportion, self._end_proportion = start_proportion, end_proportion

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Scale the stored proportions by the length of ``text``.

        Args:
            text (str): The input text to select from.

        Returns:
            tuple[int, int]: ``(start_index, end_index)``, each obtained by multiplying
                the text length by the corresponding proportion and truncating to int.
        """
        length = len(text)
        return (int(length * self._start_proportion), int(length * self._end_proportion))
[docs]
class ProportionSelectionStrategy(TextSelectionStrategy):
    """
    Selects a proportion of text anchored to a specific position (start, end, middle, or random).
    """

    def __init__(self, *, proportion: float, anchor: str = "start", seed: Optional[int] = None) -> None:
        """
        Initializes the proportion selection strategy.

        Args:
            proportion (float): The proportion of text to select (0.0 to 1.0).
            anchor (str): Where to anchor the selection. Valid values:
                - 'start': Select from the beginning
                - 'end': Select from the end
                - 'middle': Select from the middle
                - 'random': Select from a random position
            seed (Optional[int]): Random seed for reproducible random selections. Defaults to None.

        Raises:
            ValueError: If proportion is not between 0.0 and 1.0, or anchor is invalid.
        """
        if not 0.0 <= proportion <= 1.0:
            raise ValueError(f"Proportion must be between 0.0 and 1.0, got {proportion}")

        # A tuple keeps the anchors in a fixed order, so the error message is deterministic.
        valid_anchors = ("start", "end", "middle", "random")
        if anchor not in valid_anchors:
            raise ValueError(f"Invalid anchor '{anchor}'. Valid anchors are: {', '.join(valid_anchors)}")

        self._proportion = proportion
        self._anchor = anchor
        self._seed = seed

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Selects a proportion of text based on the anchor position.

        For the 'random' anchor with a seed, a private ``random.Random`` instance is
        used, so the same seed and text length always give the same range and the
        process-wide random state is left untouched. Without a seed, the module-level
        generator is used.

        Args:
            text (str): The input text to select from.

        Returns:
            tuple[int, int]: A tuple of (start_index, end_index) whose length is
                ``int(len(text) * proportion)``.
        """
        text_len = len(text)
        selection_len = int(text_len * self._proportion)

        if self._anchor == "start":
            return (0, selection_len)
        if self._anchor == "end":
            return (text_len - selection_len, text_len)
        if self._anchor == "middle":
            start = (text_len - selection_len) // 2
            return (start, start + selection_len)

        # anchor == "random"
        max_start = max(0, text_len - selection_len)
        if max_start == 0:
            # The selection fills the whole text, so there is nothing to randomize.
            return (0, selection_len)

        # Seeding a private generator gives the same draw as seeding the global one,
        # without disturbing other users of the ``random`` module.
        rng = random.Random(self._seed) if self._seed is not None else random
        start = rng.randint(0, max_start)
        return (start, start + selection_len)
[docs]
class RangeSelectionStrategy(TextSelectionStrategy):
    """
    Strategy that selects the slice of text between two proportional positions,
    defaulting to the entire text.
    """

    def __init__(self, *, start_proportion: float = 0.0, end_proportion: float = 1.0) -> None:
        """
        Validate and store the proportional bounds.

        Args:
            start_proportion (float): Fraction of the text length at which the selection
                starts (0.0 to 1.0). Defaults to 0.0.
            end_proportion (float): Fraction of the text length at which the selection
                ends (0.0 to 1.0). Defaults to 1.0.

        Raises:
            ValueError: If either proportion lies outside 0.0 to 1.0, or if
                ``start_proportion`` is not strictly less than ``end_proportion``.
        """
        if not (0.0 <= start_proportion <= 1.0):
            raise ValueError(f"start_proportion must be between 0.0 and 1.0, got {start_proportion}")
        if not (0.0 <= end_proportion <= 1.0):
            raise ValueError(f"end_proportion must be between 0.0 and 1.0, got {end_proportion}")
        if start_proportion >= end_proportion:
            raise ValueError(
                f"start_proportion ({start_proportion}) must be less than end_proportion ({end_proportion})"
            )

        self._start_proportion = start_proportion
        self._end_proportion = end_proportion

    def select_range(self, *, text: str) -> tuple[int, int]:
        """
        Convert the stored proportions into character indices for ``text``.

        Args:
            text (str): The input text to select from.

        Returns:
            tuple[int, int]: ``(start_index, end_index)``, each the text length times
                the corresponding proportion, truncated to int.
        """
        total_chars = len(text)
        first = int(total_chars * self._start_proportion)
        last = int(total_chars * self._end_proportion)
        return (first, last)
# ============================================================================
# Word-Level Selection Strategies
# ============================================================================
[docs]
class WordIndexSelectionStrategy(WordSelectionStrategy):
    """
    Strategy that selects the words found at an explicit list of positions.
    """

    def __init__(self, *, indices: List[int]) -> None:
        """
        Store the word positions to select.

        Args:
            indices (List[int]): Positions of the words that should be selected.
        """
        self._indices = indices

    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Return the configured indices after checking them against ``words``.

        Args:
            words (List[str]): The list of words to select from.

        Returns:
            List[int]: The configured indices, in their original order. An empty list
                is returned when ``words`` is empty.

        Raises:
            ValueError: If ``words`` is non-empty and any configured index is negative
                or not smaller than the number of words.
        """
        if not words:
            return []

        word_count = len(words)
        valid_indices: List[int] = []
        invalid_indices: List[int] = []
        # Partition the configured indices in a single pass, preserving order.
        for candidate in self._indices:
            if candidate < 0 or candidate >= word_count:
                invalid_indices.append(candidate)
            elif 0 <= candidate < word_count:
                valid_indices.append(candidate)

        if invalid_indices:
            raise ValueError(f"Invalid word indices {invalid_indices} provided. Valid range is 0 to {len(words) - 1}.")
        return valid_indices
[docs]
class WordKeywordSelectionStrategy(WordSelectionStrategy):
    """
    Strategy that selects every word equal to one of a set of keywords.
    """

    def __init__(self, *, keywords: List[str], case_sensitive: bool = True) -> None:
        """
        Store the keywords and the comparison mode.

        Args:
            keywords (List[str]): Words that should be selected when encountered.
            case_sensitive (bool): Compare with exact case when True; compare
                lowercased forms when False. Defaults to True.
        """
        self._keywords, self._case_sensitive = keywords, case_sensitive

    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Find the positions of all words that equal one of the keywords.

        Args:
            words (List[str]): The list of words to select from.

        Returns:
            List[int]: Indices of the matching words, in ascending order.
        """
        if not words:
            return []

        if self._case_sensitive:
            targets = self._keywords
            candidates = words
        else:
            # Compare lowercased forms of both the keywords and the words.
            targets = [keyword.lower() for keyword in self._keywords]
            candidates = [word.lower() for word in words]

        return [position for position, candidate in enumerate(candidates) if candidate in targets]
[docs]
class WordProportionSelectionStrategy(WordSelectionStrategy):
    """
    Selects a random proportion of words.
    """

    def __init__(self, *, proportion: float, seed: Optional[int] = None) -> None:
        """
        Initializes the word proportion selection strategy.

        Args:
            proportion (float): The proportion of words to select (0.0 to 1.0).
            seed (Optional[int]): Random seed for reproducible selections. Defaults to None.

        Raises:
            ValueError: If proportion is not between 0.0 and 1.0.
        """
        if not 0.0 <= proportion <= 1.0:
            raise ValueError(f"Proportion must be between 0.0 and 1.0, got {proportion}")
        self._proportion = proportion
        self._seed = seed

    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Selects a random proportion of words.

        The number of selected words is ``int(len(words) * proportion)``. When a seed
        was given, a private ``random.Random`` instance is used, so the same seed and
        word count always give the same indices and the process-wide random state is
        left untouched. Without a seed, the module-level generator is used.

        Args:
            words (List[str]): The list of words to select from.

        Returns:
            List[int]: The randomly selected indices (in sample order, not sorted),
                or an empty list when there is nothing to select.
        """
        if not words:
            return []

        num_to_select = int(len(words) * self._proportion)
        if num_to_select <= 0:
            return []

        # Seeding a private generator gives the same sample as seeding the global one,
        # without disturbing other users of the ``random`` module.
        rng = random.Random(self._seed) if self._seed is not None else random
        return rng.sample(range(len(words)), num_to_select)
[docs]
class WordRegexSelectionStrategy(WordSelectionStrategy):
    """
    Strategy that selects every word in which a regular expression finds a match.
    """

    def __init__(self, *, pattern: Union[str, Pattern]) -> None:
        """
        Store the pattern, compiling it first when it is given as a string.

        Args:
            pattern (Union[str, Pattern]): The regex to test each word against, either
                as source text or as an already compiled pattern.
        """
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        self._pattern = pattern

    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Find the positions of all words the pattern matches somewhere within.

        Args:
            words (List[str]): The list of words to select from.

        Returns:
            List[int]: Indices of the words for which ``pattern.search`` succeeds,
                in ascending order.
        """
        matching_positions: List[int] = []
        for position, word in enumerate(words):
            if self._pattern.search(word):
                matching_positions.append(position)
        return matching_positions
[docs]
class WordPositionSelectionStrategy(WordSelectionStrategy):
    """
    Strategy that selects the contiguous run of words between two relative positions.
    """

    def __init__(self, *, start_proportion: float, end_proportion: float) -> None:
        """
        Validate and store the relative bounds of the word selection.

        Args:
            start_proportion (float): Where the selection begins, as a fraction of the
                word count (0.0 to 1.0).
            end_proportion (float): Where the selection ends, as a fraction of the
                word count (0.0 to 1.0).

        Raises:
            ValueError: If either proportion lies outside 0.0 to 1.0, or if
                ``start_proportion`` is not strictly less than ``end_proportion``.
        """
        if not (0.0 <= start_proportion <= 1.0):
            raise ValueError(f"start_proportion must be between 0.0 and 1.0, got {start_proportion}")
        if not (0.0 <= end_proportion <= 1.0):
            raise ValueError(f"end_proportion must be between 0.0 and 1.0, got {end_proportion}")
        if start_proportion >= end_proportion:
            raise ValueError(
                f"start_proportion ({start_proportion}) must be less than end_proportion ({end_proportion})"
            )

        self._start_proportion, self._end_proportion = start_proportion, end_proportion

    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Return the indices of the words lying between the two relative positions.

        Args:
            words (List[str]): The list of words to select from.

        Returns:
            List[int]: Consecutive indices from ``int(n * start_proportion)`` up to, but
                not including, ``int(n * end_proportion)``, where ``n`` is the word count.
        """
        if not words:
            return []

        word_count = len(words)
        first = int(word_count * self._start_proportion)
        stop = int(word_count * self._end_proportion)
        return [*range(first, stop)]
class AllWordsSelectionStrategy(WordSelectionStrategy):
    """
    Strategy that selects every word (described in the source as the default strategy).
    """

    def select_words(self, *, words: List[str]) -> List[int]:
        """
        Return the index of every word.

        Args:
            words (List[str]): The list of words to select from.

        Returns:
            List[int]: The indices ``0 .. len(words) - 1`` in ascending order.
        """
        return [position for position, _ in enumerate(words)]