Skip to content

similarity

Phonetic similarity scoring with three complementary algorithms.

All public functions return normalized scores between 0.0 (completely different) and 1.0 (identical). Pass raw=True to get a (score, details) tuple with intermediate computation data.

phonemenal.similarity

Three phonetic similarity algorithms, all normalized to 0.0–1.0.

  • PPC-A (Positional Phoneme Correlation — Absolute): Measures overlap of positional phoneme patterns between two words. Based on building forward and reverse phoneme combinations with positional padding.

  • PLD (Phoneme Levenshtein Distance): Syllable-level edit distance using rapidfuzz. Treats each syllable as an atomic unit so distance reflects how many whole syllables differ.

  • LCS (Longest Common Subsequence): Ratio-based scoring on phonetic keys or raw phoneme sequences.

Composite scoring combines all three with configurable weights.

ppc(word1: str, word2: str, *, raw: bool = False) -> float | tuple[float, dict]

Positional Phoneme Correlation — Absolute (PPC-A).

Builds positional phoneme combinations by traversing forward and reverse directions with padding, then measures set intersection.

Returns normalized score 0.0–1.0 (higher = more similar). If raw=True, returns (score, details_dict) with intermediate values.

Source code in phonemenal/similarity.py
def ppc(
    word1: str,
    word2: str,
    *,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Positional Phoneme Correlation — Absolute (PPC-A).

    Builds positional phoneme combinations by traversing forward and reverse
    directions with padding, then measures set intersection.

    Returns normalized score 0.0–1.0 (higher = more similar).
    If raw=True, returns (score, details_dict) with intermediate values.
    """
    pronunciations1 = get_phonemes(word1)
    pronunciations2 = get_phonemes(word2)

    # Out-of-dictionary words cannot be scored phonetically.
    if not pronunciations1 or not pronunciations2:
        error = {"error": "word not in CMU dict", "word1": word1, "word2": word2}
        return (0.0, error) if raw else 0.0

    # Scan every pronunciation pairing and keep the best one. The
    # strictly-greater comparison keeps the first winner on ties and leaves
    # details empty when every pair scores 0.0, matching prior behavior.
    top_score = 0.0
    top_details: dict = {}
    for pron1 in pronunciations1:
        for pron2 in pronunciations2:
            pair_score, pair_details = _ppc_pair(pron1, pron2)
            if pair_score > top_score:
                top_score, top_details = pair_score, pair_details

    return (top_score, top_details) if raw else top_score

pld(word1: str, word2: str, *, raw: bool = False) -> float | tuple[float, dict]

Phoneme Levenshtein Distance at syllable level.

Each syllable is treated as an atomic unit (tuple of phonemes). Distance is computed between syllable sequences, then normalized to 0.0–1.0 where 1.0 = identical and 0.0 = maximally different.

If raw=True, returns (score, details_dict).

Source code in phonemenal/similarity.py
def pld(
    word1: str,
    word2: str,
    *,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Phoneme Levenshtein Distance at syllable level.

    Each syllable is treated as an atomic unit (tuple of phonemes). Distance
    is computed between syllable sequences, then normalized to 0.0–1.0 where
    1.0 = identical and 0.0 = maximally different.

    If raw=True, returns (score, details_dict).
    """
    p1_all = get_phonemes(word1)
    p2_all = get_phonemes(word2)

    if not p1_all or not p2_all:
        if raw:
            return 0.0, {"error": "word not in CMU dict", "word1": word1, "word2": word2}
        return 0.0

    # Pre-split every pronunciation into syllable tuples ONCE. The previous
    # version re-split p1 inside the inner loop, redoing the same work for
    # each p2 (loop-invariant hoist; results are identical).
    syls1 = [(p1, [tuple(s) for s in split_phonemes_by_syllables(p1)]) for p1 in p1_all]
    syls2 = [(p2, [tuple(s) for s in split_phonemes_by_syllables(p2)]) for p2 in p2_all]

    # Find the pronunciation pair with the lowest distance (highest score)
    best_score = 0.0
    best_details: dict = {}

    for p1, s1 in syls1:
        for p2, s2 in syls2:
            max_len = max(len(s1), len(s2))
            if max_len == 0:
                # Both pronunciations yielded no syllables; nothing to score.
                continue

            dist = Levenshtein.distance(s1, s2)
            score = 1.0 - (dist / max_len)

            if score > best_score:
                best_score = score
                best_details = {
                    "p1": p1,
                    "p2": p2,
                    "syllables1": s1,
                    "syllables2": s2,
                    "distance": dist,
                    "max_syllables": max_len,
                }

    if raw:
        return best_score, best_details
    return best_score

lcs(word1: str, word2: str, *, use_phonemes: bool = True, raw: bool = False) -> float | tuple[float, dict]

Longest Common Subsequence ratio.

When use_phonemes=True (default), compares phoneme sequences from CMU dict. When use_phonemes=False, compares raw character strings (useful as fallback for words not in the dictionary).

Returns 0.0–1.0 where 1.0 = identical sequences. If raw=True, returns (score, details_dict).

Source code in phonemenal/similarity.py
def lcs(
    word1: str,
    word2: str,
    *,
    use_phonemes: bool = True,
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Longest Common Subsequence ratio.

    When use_phonemes=True (default), compares phoneme sequences from CMU dict.
    When use_phonemes=False, compares raw character strings (useful as fallback
    for words not in the dictionary).

    Returns 0.0–1.0 where 1.0 = identical sequences.
    If raw=True, returns (score, details_dict).
    """
    if use_phonemes:
        p1_all = get_phonemes(word1)
        p2_all = get_phonemes(word2)

        if not p1_all or not p2_all:
            if raw:
                # Include word1/word2 in the error payload for consistency
                # with ppc() and pld() (previously only "error" was reported).
                return 0.0, {"error": "word not in CMU dict", "word1": word1, "word2": word2}
            return 0.0

        # Keep the best-scoring pronunciation pair.
        best_score = 0.0
        best_details: dict = {}

        for p1 in p1_all:
            for p2 in p2_all:
                score = _lcs_ratio(p1, p2)
                if score > best_score:
                    best_score = score
                    best_details = {"p1": p1, "p2": p2}

        if raw:
            return best_score, best_details
        return best_score
    else:
        # Character-level fallback for out-of-dictionary words.
        seq1 = list(word1.lower())
        seq2 = list(word2.lower())
        score = _lcs_ratio(seq1, seq2)
        if raw:
            return score, {"seq1": seq1, "seq2": seq2}
        return score

composite(word1: str, word2: str, *, weights: tuple[float, float, float] = (1.0, 1.0, 1.0), raw: bool = False) -> float | tuple[float, dict]

Weighted composite of PPC-A, PLD, and LCS scores.

Parameters:

- `word1` (`str`) — First word to compare. *Required.*
- `word2` (`str`) — Second word to compare. *Required.*
- `weights` (`tuple[float, float, float]`) — `(ppc_weight, pld_weight, lcs_weight)`. Default: `(1.0, 1.0, 1.0)` (equal weighting).
- `raw` (`bool`) — If `True`, return `(composite_score, details_dict)`. Default: `False`.

Returns:

- `float | tuple[float, dict]` — Composite similarity score between 0.0 and 1.0.

Source code in phonemenal/similarity.py
def composite(
    word1: str,
    word2: str,
    *,
    weights: tuple[float, float, float] = (1.0, 1.0, 1.0),
    raw: bool = False,
) -> float | tuple[float, dict]:
    """Weighted composite of PPC-A, PLD, and LCS scores.

    Args:
        word1: First word to compare.
        word2: Second word to compare.
        weights: (ppc_weight, pld_weight, lcs_weight). Default equal weighting.
        raw: If True, return (composite_score, details_dict).

    Returns:
        Composite similarity score between 0.0 and 1.0.
    """
    total = weights[0] + weights[1] + weights[2]

    # Degenerate weighting: no algorithm contributes, so the score is 0.
    if total == 0:
        return (0.0, {"error": "all weights are zero"}) if raw else 0.0

    individual = (
        ppc(word1, word2),
        pld(word1, word2),
        lcs(word1, word2),
    )

    combined = sum(w * s for w, s in zip(weights, individual)) / total

    if not raw:
        return combined
    return combined, {
        "ppc": individual[0],
        "pld": individual[1],
        "lcs": individual[2],
        "weights": weights,
        "composite": combined,
    }

compare(word1: str, word2: str, *, weights: Optional[tuple[float, float, float]] = None) -> dict

Full comparison report between two words.

Returns a dict with all three individual scores, composite score, and pronunciation details.

Source code in phonemenal/similarity.py
def compare(
    word1: str,
    word2: str,
    *,
    weights: Optional[tuple[float, float, float]] = None,
) -> dict:
    """Full comparison report between two words.

    Returns a dict with all three individual scores, composite score,
    and pronunciation details.
    """
    # Gather (score, details) from each algorithm; dict order fixes the
    # ppc/pld/lcs ordering used both for weighting and for the report.
    results = {
        "ppc": ppc(word1, word2, raw=True),
        "pld": pld(word1, word2, raw=True),
        "lcs": lcs(word1, word2, raw=True),
    }

    w = weights or (1.0, 1.0, 1.0)
    total_weight = sum(w)
    if total_weight == 0:
        comp = 0.0
    else:
        comp = sum(wt * score for wt, (score, _) in zip(w, results.values())) / total_weight

    report: dict = {"word1": word1, "word2": word2}
    for name, (score, details) in results.items():
        report[name] = {"score": round(score, 4), **details}
    report["composite"] = round(comp, 4)
    report["weights"] = w
    return report