Skip to content

core

Core phoneme infrastructure — CMU dict access, phoneme utilities, and inverted indices.

phonemenal.core

Core phoneme infrastructure — CMU dict loading, inverted pronunciation index, leader/trailer phoneme indexing, and syllable-aware phoneme splitting.

All dict access is lazy-loaded and cached. First call triggers NLTK's CMU dict download if not already present.

get_dict() -> dict[str, list[list[str]]] cached

Return the CMU pronouncing dictionary as {word: [[phoneme, ...], ...]}.

Source code in phonemenal/core.py
@lru_cache(maxsize=1)
def get_dict() -> dict[str, list[list[str]]]:
    """Return the CMU pronouncing dictionary as a word → pronunciations mapping.

    Each value is a list of pronunciations, and each pronunciation is itself
    a list of phoneme strings. The result is memoized via ``lru_cache``, so
    the mapping is built on the first call only and the same object is
    handed back afterwards.
    """
    # Runs before the corpus is touched; presumably makes sure the CMU dict
    # data is available locally (see _ensure_cmudict for the details).
    _ensure_cmudict()
    from nltk.corpus import cmudict as cmu_corpus

    pronunciations_by_word = cmu_corpus.dict()
    return pronunciations_by_word

get_entries() -> list[tuple[str, list[str]]] cached

Return CMU dict entries as [(word, [phoneme, ...]), ...].

Source code in phonemenal/core.py
@lru_cache(maxsize=1)
def get_entries() -> list[tuple[str, list[str]]]:
    """Return the CMU dict as a flat sequence of (word, phonemes) pairs.

    A word with several pronunciations appears once per pronunciation.
    The result is memoized via ``lru_cache``; every call after the first
    returns the same object.
    """
    # Runs before the corpus is touched; presumably makes sure the CMU dict
    # data is available locally (see _ensure_cmudict for the details).
    _ensure_cmudict()
    from nltk.corpus import cmudict as cmu_corpus

    word_phoneme_pairs = cmu_corpus.entries()
    return word_phoneme_pairs

get_inverted() -> dict[tuple[str, ...], set[str]] cached

Build inverted pronunciation index: phoneme tuple → set of words.

This is the core data structure for finding exact homophones: given a pronunciation, look up all words that share it.

Source code in phonemenal/core.py
@lru_cache(maxsize=1)
def get_inverted() -> dict[tuple[str, ...], set[str]]:
    """Build the inverted pronunciation index: phoneme tuple → set of words.

    Exact homophones share a key, so looking up a pronunciation yields
    every word that is pronounced that way. Phonemes are used exactly as
    they appear in the entries (stress digits included).

    The index is built once from ``get_entries()`` and memoized; callers
    receive the same dict object on every call.
    """
    index: dict[tuple[str, ...], set[str]] = {}
    for entry_word, pronunciation in get_entries():
        # Lists are unhashable, so the pronunciation is frozen into a tuple.
        index.setdefault(tuple(pronunciation), set()).add(entry_word)
    return index

get_leader_trailer() -> tuple[dict[str, set[str]], dict[str, set[str]]] cached

Build leader/trailer phoneme indices for fast candidate filtering.

Returns (leader_index, trailer_index) where:

- leader_index: first phoneme → set of words starting with that phoneme
- trailer_index: last phoneme → set of words ending with that phoneme

Source code in phonemenal/core.py
@lru_cache(maxsize=1)
def get_leader_trailer() -> tuple[dict[str, set[str]], dict[str, set[str]]]:
    """Build first-phoneme and last-phoneme indices for candidate filtering.

    Returns:
        A ``(leader_index, trailer_index)`` pair where

        - ``leader_index`` maps a phoneme to the words whose pronunciation
          starts with it, and
        - ``trailer_index`` maps a phoneme to the words whose pronunciation
          ends with it.

    Phonemes are indexed as they appear in the entries, i.e. stress digits
    are not stripped. Entries with an empty pronunciation are skipped.
    The pair is built once and memoized.
    """
    by_first: dict[str, set[str]] = {}
    by_last: dict[str, set[str]] = {}
    for entry_word, pronunciation in get_entries():
        if not pronunciation:
            # Nothing to index: there is no first or last phoneme.
            continue
        by_first.setdefault(pronunciation[0], set()).add(entry_word)
        by_last.setdefault(pronunciation[-1], set()).add(entry_word)
    return by_first, by_last

get_phonemes(word: str) -> list[list[str]]

Get all pronunciations for a word from CMU dict.

Returns list of pronunciations (each a list of phonemes), or empty list if the word is not in the dictionary.

Source code in phonemenal/core.py
def get_phonemes(word: str) -> list[list[str]]:
    """Look up every pronunciation of ``word`` in the CMU dict.

    The lookup is case-insensitive: the word is lowercased before it is
    used as the key.

    Returns:
        A list of pronunciations (each one a list of phonemes), or an
        empty list when the word has no dictionary entry.
    """
    pronunciations_by_word = get_dict()
    return pronunciations_by_word.get(word.lower(), [])

strip_stress(phoneme: str) -> str

Remove stress marker from a phoneme. 'AH1' → 'AH', 'K' → 'K'.

Source code in phonemenal/core.py
def strip_stress(phoneme: str) -> str:
    """Drop the trailing stress digit(s) from a phoneme.

    'AH1' → 'AH'; a phoneme without a stress marker, such as 'K', is
    returned unchanged. Only the characters 0, 1 and 2 are removed.
    """
    stress_digits = "012"
    return phoneme.rstrip(stress_digits)

is_vowel(phoneme: str) -> bool

Check if a phoneme is a vowel (ends with stress digit in CMU dict).

Source code in phonemenal/core.py
def is_vowel(phoneme: str) -> bool:
    """Tell whether a phoneme is a vowel.

    In the CMU dict vowels carry a trailing stress digit, so a phoneme
    counts as a vowel when it is longer than one character and its last
    character is a digit.
    """
    if len(phoneme) <= 1:
        # Too short to hold both a vowel symbol and a stress digit.
        return False
    return phoneme[-1].isdigit()

split_phonemes_by_syllables(phonemes: list[str]) -> list[list[str]]

Split a phoneme list into syllable groups.

Syllable boundaries are marked by vowel phonemes (those ending in a stress digit, 0/1/2). Each syllable contains its onset consonants + the vowel. Trailing consonants after the last vowel are appended to the final syllable.

Example

['R', 'IY0', 'T', 'R', 'AE1', 'K', 'SH', 'AH0', 'N'] → [['R', 'IY0'], ['T', 'R', 'AE1'], ['K', 'SH', 'AH0', 'N']]

Source code in phonemenal/core.py
def split_phonemes_by_syllables(phonemes: list[str]) -> list[list[str]]:
    """Group a phoneme list into syllables.

    Every vowel phoneme (as decided by ``is_vowel``) closes a syllable, so
    each group holds the onset consonants followed by the vowel. Consonants
    left over after the last vowel are folded into the final syllable. If
    the input contains no vowel at all, the whole input is returned as a
    single group. An empty input yields an empty list.

    Example:
        ['R', 'IY0', 'T', 'R', 'AE1', 'K', 'SH', 'AH0', 'N']
        → [['R', 'IY0'], ['T', 'R', 'AE1'], ['K', 'SH', 'AH0', 'N']]
    """
    if not phonemes:
        return []

    groups: list[list[str]] = []
    pending: list[str] = []

    for symbol in phonemes:
        pending.append(symbol)
        if not is_vowel(symbol):
            continue
        # A vowel ends the syllable that is currently being collected.
        groups.append(pending)
        pending = []

    if not pending:
        return groups

    # Leftover consonants: join them to the last syllable, or let them
    # form the only group when no vowel was seen.
    if groups:
        groups[-1].extend(pending)
    else:
        groups.append(pending)
    return groups

syllable_count(phonemes: list[str]) -> int

Count syllables in a phoneme list (= number of vowel phonemes).

Source code in phonemenal/core.py
def syllable_count(phonemes: list[str]) -> int:
    """Return the number of syllables, i.e. how many phonemes are vowels."""
    vowels = [symbol for symbol in phonemes if is_vowel(symbol)]
    return len(vowels)

phonemes_to_str(phonemes: list[str]) -> str

Join phonemes into a space-separated string.

Source code in phonemenal/core.py
def phonemes_to_str(phonemes: list[str]) -> str:
    """Render a phoneme list as one string with single spaces between phonemes."""
    separator = " "
    return separator.join(phonemes)

find_words_by_pronunciation(phonemes: list[str] | tuple[str, ...]) -> set[str]

Find all words with the exact given pronunciation.

Source code in phonemenal/core.py
def find_words_by_pronunciation(phonemes: list[str] | tuple[str, ...]) -> set[str]:
    """Find all words with the exact given pronunciation.

    Args:
        phonemes: The pronunciation to look up, as a sequence of phoneme
            strings. Lists, tuples and any other iterable of phonemes are
            accepted; stress digits must match the dictionary entry.

    Returns:
        The set of words sharing that pronunciation, or an empty set when
        there is none. On a hit this is the set stored in the memoized
        inverted index itself, so callers should treat it as read-only.
    """
    # The index is keyed by tuples. Converting unconditionally (rather than
    # only for lists) keeps list/tuple behavior unchanged and makes other
    # sequences usable instead of failing as unhashable or never matching.
    key = tuple(phonemes)
    return get_inverted().get(key, set())

find_words_by_leader(phoneme: str) -> set[str]

Find all words whose pronunciation starts with the given phoneme.

Source code in phonemenal/core.py
def find_words_by_leader(phoneme: str) -> set[str]:
    """Return the words whose pronunciation begins with ``phoneme``.

    An empty set is returned when no word starts with that phoneme.
    """
    leader_index = get_leader_trailer()[0]
    return leader_index.get(phoneme, set())

find_words_by_trailer(phoneme: str) -> set[str]

Find all words whose pronunciation ends with the given phoneme.

Source code in phonemenal/core.py
def find_words_by_trailer(phoneme: str) -> set[str]:
    """Return the words whose pronunciation finishes with ``phoneme``.

    An empty set is returned when no word ends with that phoneme.
    """
    trailer_index = get_leader_trailer()[1]
    return trailer_index.get(phoneme, set())

normalize_name(name: str) -> str

Normalize a name for phonetic comparison.

Strips separators (hyphens, underscores, dots) and lowercases.

Source code in phonemenal/core.py
def normalize_name(name: str) -> str:
    """Canonicalize a name before phonetic comparison.

    The name is lowercased and every hyphen, underscore and dot is removed;
    all other characters (including spaces) are kept.
    """
    import re

    lowered = name.lower()
    separator_runs = re.compile(r"[-_.]+")
    return separator_runs.sub("", lowered)

get_phonemes_or_fallback(word: str) -> Optional[list[list[str]]]

Get phonemes from CMU dict, returning None if not found.

Callers should use fallback.phonetic_key() for words not in the dict.

Source code in phonemenal/core.py
def get_phonemes_or_fallback(word: str) -> Optional[list[list[str]]]:
    """Return the CMU dict pronunciations of ``word``, or None if it has none.

    The None result signals that the word is not in the dictionary;
    callers should then use fallback.phonetic_key() instead.
    """
    # An empty lookup result is falsy, so `or` turns it into None.
    return get_phonemes(word) or None