Skip to content

similarity

Phonetic similarity scoring with four complementary algorithms.

All public functions return normalized scores between 0.0 (completely different) and 1.0 (identical). Pass raw=True to get a (score, details) tuple with intermediate computation data.

phonemenal.similarity

Four phonetic similarity algorithms, all normalized to 0.0–1.0.

  • PPC-A (Positional Phoneme Correlation — Absolute): Measures overlap of positional phoneme patterns between two words. Based on building forward and reverse phoneme combinations with positional padding.

  • PLD (Phoneme Levenshtein Distance): Syllable-level edit distance using rapidfuzz. Treats each syllable as an atomic unit so distance reflects how many whole syllables differ.

  • PED (Phoneme Edit Distance): Phoneme-level edit distance using stress- stripped CMU pronunciations. Complements PLD for short or monosyllabic pairs where syllable-level scoring is too coarse.

  • LCS (Longest Common Subsequence): Ratio-based scoring on phonetic keys or raw phoneme sequences.

Composite scoring combines PPC-A, an edit channel, and LCS with configurable weights and edit-mode selection.

ppc(word1: str, word2: str, *, raw: bool = False) -> float | tuple[float, dict]

Positional Phoneme Correlation — Absolute (PPC-A).

Builds positional phoneme combinations by traversing forward and reverse directions with padding, then measures set intersection.

Returns normalized score 0.0–1.0 (higher = more similar). If raw=True, returns (score, details_dict) with intermediate values.

Source code in phonemenal/similarity.py
def ppc(
    word1: str,
    word2: str,
    *,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Positional Phoneme Correlation — Absolute (PPC-A).

    Builds positional phoneme combinations by traversing forward and reverse
    directions with padding, then measures set intersection.

    Returns normalized score 0.0–1.0 (higher = more similar).
    If raw=True, returns (score, details_dict) with intermediate values.
    """
    prons1 = get_phonemes(word1)
    prons2 = get_phonemes(word2)

    # Either word missing from the CMU dictionary -> no basis for comparison.
    if not prons1 or not prons2:
        error = {"error": "word not in CMU dict", "word1": word1, "word2": word2}
        return (0.0, error) if raw else 0.0

    # Score every pronunciation pairing and keep the strongest match.
    top_score = 0.0
    top_details: dict = {}
    for pron1 in prons1:
        for pron2 in prons2:
            pair_score, pair_details = _ppc_pair(pron1, pron2)
            if pair_score > top_score:
                top_score, top_details = pair_score, pair_details

    return (top_score, top_details) if raw else top_score

pld(word1: str, word2: str, *, raw: bool = False) -> float | tuple[float, dict]

Phoneme Levenshtein Distance at syllable level.

Each syllable is treated as an atomic unit (tuple of phonemes). Distance is computed between syllable sequences, then normalized to 0.0–1.0 where 1.0 = identical and 0.0 = maximally different.

If raw=True, returns (score, details_dict).

Source code in phonemenal/similarity.py
def pld(
    word1: str,
    word2: str,
    *,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Phoneme Levenshtein Distance at syllable level.

    Each syllable is treated as an atomic unit (tuple of phonemes). Distance
    is computed between syllable sequences, then normalized to 0.0–1.0 where
    1.0 = identical and 0.0 = maximally different.

    If raw=True, returns (score, details_dict).
    """
    p1_all = get_phonemes(word1)
    p2_all = get_phonemes(word2)

    # Either word missing from the CMU dictionary -> no basis for comparison.
    if not p1_all or not p2_all:
        if raw:
            return 0.0, {"error": "word not in CMU dict", "word1": word1, "word2": word2}
        return 0.0

    # Find the pronunciation pair with the lowest distance
    best_score = 0.0
    best_details: dict = {}

    for p1 in p1_all:
        # Hoisted out of the inner loop: syllabify p1 once per pronunciation
        # instead of once per (p1, p2) pair (mirrors the structure of ped()).
        s1 = [tuple(s) for s in split_phonemes_by_syllables(p1)]
        for p2 in p2_all:
            s2 = [tuple(s) for s in split_phonemes_by_syllables(p2)]

            max_len = max(len(s1), len(s2))
            if max_len == 0:
                continue

            dist = Levenshtein.distance(s1, s2)
            score = 1.0 - (dist / max_len)

            if score > best_score:
                best_score = score
                best_details = {
                    "p1": p1,
                    "p2": p2,
                    "syllables1": s1,
                    "syllables2": s2,
                    "distance": dist,
                    "max_syllables": max_len,
                }

    if raw:
        return best_score, best_details
    return best_score

ped(word1: str, word2: str, *, raw: bool = False) -> float | tuple[float, dict]

Phoneme edit distance on stress-stripped pronunciations.

This operates at the phoneme level rather than the syllable level. It is especially useful for short words and monosyllables where syllable-level PLD often collapses to either 1.0 or 0.0.

Source code in phonemenal/similarity.py
def ped(
    word1: str,
    word2: str,
    *,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Phoneme edit distance on stress-stripped pronunciations.

    This operates at the phoneme level rather than the syllable level. It is
    especially useful for short words and monosyllables where syllable-level
    PLD often collapses to either 1.0 or 0.0.
    """
    prons1 = get_phonemes(word1)
    prons2 = get_phonemes(word2)

    # Either word missing from the CMU dictionary -> no basis for comparison.
    if not prons1 or not prons2:
        error = {"error": "word not in CMU dict", "word1": word1, "word2": word2}
        return (0.0, error) if raw else 0.0

    top_score = 0.0
    top_details: dict = {}

    for pron1 in prons1:
        # Strip stress markers once per outer pronunciation.
        bare1 = [strip_stress(ph) for ph in pron1]
        for pron2 in prons2:
            bare2 = [strip_stress(ph) for ph in pron2]
            longest = max(len(bare1), len(bare2))
            if longest == 0:
                continue

            edits = Levenshtein.distance(bare1, bare2)
            candidate = 1.0 - (edits / longest)

            if candidate > top_score:
                top_score = candidate
                top_details = {
                    "p1": pron1,
                    "p2": pron2,
                    "phonemes1": bare1,
                    "phonemes2": bare2,
                    "distance": edits,
                    "max_phonemes": longest,
                }

    return (top_score, top_details) if raw else top_score

lcs(word1: str, word2: str, *, use_phonemes: bool = True, raw: bool = False) -> float | tuple[float, dict]

Longest Common Subsequence ratio.

When use_phonemes=True (default), compares phoneme sequences from CMU dict. When use_phonemes=False, compares raw character strings (useful as fallback for words not in the dictionary).

Returns 0.0–1.0 where 1.0 = identical sequences. If raw=True, returns (score, details_dict).

Source code in phonemenal/similarity.py
def lcs(
    word1: str,
    word2: str,
    *,
    use_phonemes: bool = True,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Longest Common Subsequence ratio.

    When use_phonemes=True (default), compares phoneme sequences from CMU dict.
    When use_phonemes=False, compares raw character strings (useful as fallback
    for words not in the dictionary).

    Returns 0.0–1.0 where 1.0 = identical sequences.
    If raw=True, returns (score, details_dict).
    """
    if use_phonemes:
        p1_all = get_phonemes(word1)
        p2_all = get_phonemes(word2)

        if not p1_all or not p2_all:
            if raw:
                # Include the words in the error details, consistent with
                # ppc()/pld()/ped() (previously only "error" was reported).
                return 0.0, {"error": "word not in CMU dict", "word1": word1, "word2": word2}
            return 0.0

        # Keep the best-scoring pronunciation pair.
        best_score = 0.0
        best_details: dict = {}

        for p1 in p1_all:
            for p2 in p2_all:
                score = _lcs_ratio(p1, p2)
                if score > best_score:
                    best_score = score
                    best_details = {"p1": p1, "p2": p2}

        if raw:
            return best_score, best_details
        return best_score
    else:
        # Character-level fallback: no dictionary lookup required.
        seq1 = list(word1.lower())
        seq2 = list(word2.lower())
        score = _lcs_ratio(seq1, seq2)
        if raw:
            return score, {"seq1": seq1, "seq2": seq2}
        return score

composite(word1: str, word2: str, *, weights: tuple[float, float, float] = (1.0, 2.0, 1.0), edit_mode: str = 'max', raw: bool = False) -> float | tuple[float, dict]

Weighted composite of PPC-A, edit distance, and LCS scores.

Parameters:

  • word1 (str, required): First word to compare.

  • word2 (str, required): Second word to compare.

  • weights (tuple[float, float, float], default (1.0, 2.0, 1.0)): (ppc_weight, edit_weight, lcs_weight). Default weights emphasize edit similarity.

  • edit_mode (str, default "max"): "max" (default) uses the stronger of PLD/PED. "length" uses PED for monosyllable-vs-monosyllable pairs and PLD otherwise.

  • raw (bool, default False): If True, return (composite_score, details_dict).

Returns:

  • float | tuple[float, dict]: Composite similarity score between 0.0 and 1.0.

Source code in phonemenal/similarity.py
def composite(
    word1: str,
    word2: str,
    *,
    weights: tuple[float, float, float] = (1.0, 2.0, 1.0),
    edit_mode: str = "max",
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Weighted composite of PPC-A, edit distance, and LCS scores.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        weights: (ppc_weight, edit_weight, lcs_weight). Default weights
            emphasize edit similarity.
        edit_mode: "max" (default) uses the stronger of PLD/PED. "length"
            uses PED for monosyllable-vs-monosyllable pairs and PLD otherwise.
        raw: If True, return (composite_score, details_dict).

    Returns:
        Composite similarity score between 0.0 and 1.0.
    """
    weight_ppc, weight_edit, weight_lcs = weights
    denom = weight_ppc + weight_edit + weight_lcs

    # Zero total weight would divide by zero; treat it as an explicit error.
    if denom == 0:
        return (0.0, {"error": "all weights are zero"}) if raw else 0.0

    # Gather the individual channels; pld/ped details feed edit selection.
    ppc_score = ppc(word1, word2)
    pld_score, pld_details = pld(word1, word2, raw=True)
    ped_score, ped_details = ped(word1, word2, raw=True)
    lcs_score = lcs(word1, word2)

    edit_score, edit_source = _select_edit_score(
        pld_score,
        ped_score,
        edit_mode=edit_mode,
        pld_details=pld_details,
        ped_details=ped_details,
    )

    weighted_sum = weight_ppc * ppc_score + weight_edit * edit_score + weight_lcs * lcs_score
    score = weighted_sum / denom

    if not raw:
        return score
    return score, {
        "ppc": ppc_score,
        "pld": pld_score,
        "ped": ped_score,
        "edit": edit_score,
        "edit_mode": edit_mode,
        "edit_source": edit_source,
        "lcs": lcs_score,
        "weights": weights,
        "composite": score,
    }

compare(word1: str, word2: str, *, weights: Optional[tuple[float, float, float]] = None, edit_mode: str = 'max') -> dict

Full comparison report between two words.

Returns a dict with all individual scores, composite score, and pronunciation details.

Source code in phonemenal/similarity.py
def compare(
    word1: str,
    word2: str,
    *,
    weights: Optional[tuple[float, float, float]] = None,
    edit_mode: str = "max",
) -> dict:
    """Full comparison report between two words.

    Returns a dict with all individual scores, composite score,
    and pronunciation details.
    """
    # Collect all four channels with their raw detail dicts.
    ppc_score, ppc_details = ppc(word1, word2, raw=True)
    pld_score, pld_details = pld(word1, word2, raw=True)
    ped_score, ped_details = ped(word1, word2, raw=True)
    lcs_score, lcs_details = lcs(word1, word2, raw=True)

    edit_score, edit_source = _select_edit_score(
        pld_score,
        ped_score,
        edit_mode=edit_mode,
        pld_details=pld_details,
        ped_details=ped_details,
    )

    # Falsy weights (None or empty) fall back to the documented default.
    w = weights or (1.0, 2.0, 1.0)
    total_weight = sum(w)
    if total_weight == 0:
        comp = 0.0
    else:
        weighted = w[0] * ppc_score + w[1] * edit_score + w[2] * lcs_score
        comp = weighted / total_weight

    report = {
        "word1": word1,
        "word2": word2,
        "ppc": {"score": round(ppc_score, 4), **ppc_details},
        "pld": {"score": round(pld_score, 4), **pld_details},
        "ped": {"score": round(ped_score, 4), **ped_details},
        "edit": round(edit_score, 4),
        "edit_mode": edit_mode,
        "edit_source": edit_source,
        "lcs": {"score": round(lcs_score, 4), **lcs_details},
        "composite": round(comp, 4),
        "weights": w,
    }
    return report