Skip to content

scanning

High-level collision detection pipeline combining the fallback encoder, similarity scoring, and variant generation.

phonemenal.scanning

Scan candidate names against a set of known names for phonetic collisions.

This is the high-level scanning pipeline that combines the fallback phonetic encoder, similarity scoring, and variant generation into a complete workflow.

Two scan modes
  • Forward: check candidates against known names (fast, daemon pipeline)
  • Reverse: also generate variants of each candidate and check whether they exist via a caller-supplied lookup function (e.g. a registry check such as a PyPI HEAD request)

build_phonetic_index(names: list[str]) -> dict[str, list[str]]

Build a mapping from phonetic key → list of names.

Used for fast exact-key lookups before falling back to pairwise scoring.

Source code in phonemenal/scanning.py
def build_phonetic_index(names: list[str]) -> dict[str, list[str]]:
    """Group names by their phonetic key.

    Every name is encoded with ``phonetic_key`` and appended to the bucket for
    that key, so names sharing a key can be found with a single dict lookup
    before any pairwise scoring is attempted.

    Args:
        names: Names to index. Duplicates are kept; each bucket preserves
            the input order.

    Returns:
        Mapping of phonetic key → list of names that encode to that key.
    """
    buckets: dict[str, list[str]] = {}
    for entry in names:
        # setdefault creates the bucket on first sight of a key.
        buckets.setdefault(phonetic_key(entry), []).append(entry)
    return buckets

check_collision(candidate: str, known_names: list[str], phonetic_index: dict[str, list[str]], *, threshold: float = DEFAULT_THRESHOLD, use_composite: bool = False, composite_weights: tuple[float, float, float] = (1.0, 1.0, 1.0)) -> list[dict]

Check a candidate name for phonetic collisions against known names.

Parameters:

Name Type Description Default
candidate str

Name to check.

required
known_names list[str]

List of known/legitimate names.

required
phonetic_index dict[str, list[str]]

Pre-built index from build_phonetic_index().

required
threshold float

Minimum similarity score to flag (0.0–1.0).

DEFAULT_THRESHOLD
use_composite bool

If True, use CMU-dict-backed composite scoring (PPC+PLD+LCS) instead of fallback key similarity. Slower but more accurate.

False
composite_weights tuple[float, float, float]

Weights for composite scoring (ppc, pld, lcs).

(1.0, 1.0, 1.0)
Returns list of match dicts sorted by similarity descending
  • candidate: input name
  • matched_name: the known name
  • similarity: 0.0–1.0 score
  • candidate_key: phonetic key of candidate
  • matched_key: phonetic key of match
  • collision_type: "exact_phonetic" | "near_phonetic"
Source code in phonemenal/scanning.py
def check_collision(
    candidate: str,
    known_names: list[str],
    phonetic_index: dict[str, list[str]],
    *,
    threshold: float = DEFAULT_THRESHOLD,
    use_composite: bool = False,
    composite_weights: tuple[float, float, float] = (1.0, 1.0, 1.0),
) -> list[dict]:
    """Check a candidate name for phonetic collisions against known names.

    Names whose phonetic key equals the candidate's key are reported first as
    "exact_phonetic" matches with similarity 1.0. Remaining names are then
    scored and reported as "near_phonetic" when they reach ``threshold``.
    A known name equal to the candidate (case-insensitive) is never reported,
    and each known name is reported at most once.

    Args:
        candidate: Name to check.
        known_names: List of known/legitimate names.
        phonetic_index: Pre-built index from build_phonetic_index().
        threshold: Minimum similarity score to flag (0.0–1.0).
        use_composite: If True, use CMU-dict-backed composite scoring (PPC+PLD+LCS)
                      instead of fallback key similarity. Slower but more accurate.
        composite_weights: Weights for composite scoring (ppc, pld, lcs).

    Returns list of match dicts sorted by similarity descending:
        - candidate: input name
        - matched_name: the known name
        - similarity: 0.0–1.0 score
        - candidate_key: phonetic key of candidate
        - matched_key: phonetic key of match
        - collision_type: "exact_phonetic" | "near_phonetic"
    """
    candidate_lower = candidate.lower()
    candidate_key = phonetic_key(candidate)
    matches: list[dict] = []
    seen: set[str] = set()

    def record(name: str, matched_key: str, similarity: float, kind: str) -> None:
        # Single place that builds a match dict, so all three code paths
        # emit the same keys in the same order.
        matches.append(
            {
                "candidate": candidate,
                "matched_name": name,
                "similarity": similarity,
                "candidate_key": candidate_key,
                "matched_key": matched_key,
                "collision_type": kind,
            }
        )
        seen.add(name)

    def skip(name: str) -> bool:
        # Ignore the candidate itself and anything already reported.
        return name.lower() == candidate_lower or name in seen

    # Exact phonetic key match
    for name in phonetic_index.get(candidate_key, []):
        if not skip(name):
            record(name, candidate_key, 1.0, "exact_phonetic")

    # Near phonetic matches
    if use_composite:
        # CMU-dict-backed scoring — slower, more accurate
        # Falls back to fallback key similarity when composite returns 0.0
        # (i.e. words missing from CMU dict)
        for name in known_names:
            if skip(name):
                continue
            score = composite_score(candidate, name, weights=composite_weights)
            matched_key = None
            if score == 0.0:
                matched_key = phonetic_key(name)
                score = fallback_similarity(candidate_key, matched_key)
            if score >= threshold:
                # Reuse the key computed for the fallback instead of encoding
                # the same name a second time.
                if matched_key is None:
                    matched_key = phonetic_key(name)
                record(name, matched_key, round(score, 4), "near_phonetic")
    else:
        # Fallback key similarity — fast, good enough for bulk scanning.
        # Scoring is per key, so every name in a bucket shares one score.
        for key, names in phonetic_index.items():
            if key == candidate_key:
                # Already handled by the exact-match pass above.
                continue
            ratio = fallback_similarity(candidate_key, key)
            if ratio < threshold:
                continue
            for name in names:
                if not skip(name):
                    record(name, key, round(ratio, 4), "near_phonetic")

    # list.sort is stable: ties keep discovery order (exact matches first).
    matches.sort(key=lambda m: m["similarity"], reverse=True)
    return matches

scan(candidates: list[str], known_names: list[str], *, threshold: float = DEFAULT_THRESHOLD, use_composite: bool = False, composite_weights: tuple[float, float, float] = (1.0, 1.0, 1.0)) -> list[dict]

Scan candidate names for phonetic collisions with known names.

Forward scan only — checks each candidate against the known set.

Parameters:

Name Type Description Default
candidates list[str]

Names to check.

required
known_names list[str]

Known/legitimate names to compare against.

required
threshold float

Minimum similarity score to flag.

DEFAULT_THRESHOLD
use_composite bool

Use CMU-dict-backed composite scoring.

False
composite_weights tuple[float, float, float]

Weights for composite scoring.

(1.0, 1.0, 1.0)

Returns list of all matches across all candidates.

Source code in phonemenal/scanning.py
def scan(
    candidates: list[str],
    known_names: list[str],
    *,
    threshold: float = DEFAULT_THRESHOLD,
    use_composite: bool = False,
    composite_weights: tuple[float, float, float] = (1.0, 1.0, 1.0),
) -> list[dict]:
    """Scan candidate names for phonetic collisions with known names.

    Forward scan only — checks each candidate against the known set. The
    phonetic index over ``known_names`` is built once and shared by every
    candidate. A warning is logged for each candidate that has at least one
    match.

    Args:
        candidates: Names to check.
        known_names: Known/legitimate names to compare against.
        threshold: Minimum similarity score to flag.
        use_composite: Use CMU-dict-backed composite scoring.
        composite_weights: Weights for composite scoring.

    Returns list of all matches across all candidates, grouped by candidate
    in input order (each candidate's matches are in the order returned by
    check_collision()).
    """
    index = build_phonetic_index(known_names)
    all_matches: list[dict] = []

    for candidate in candidates:
        hits = check_collision(
            candidate,
            known_names,
            index,
            threshold=threshold,
            use_composite=use_composite,
            composite_weights=composite_weights,
        )
        if not hits:
            continue
        # " ~ " separates the candidate from its matches; without it the
        # name ran straight into the list repr in the log line.
        log.warning(
            "Phonetic collision: %s ~ %s",
            candidate,
            [(h["matched_name"], h["similarity"]) for h in hits],
        )
        all_matches.extend(hits)

    return all_matches

scan_with_reverse(candidates: list[str], known_names: list[str], *, exists_fn: Optional[Callable[[str], bool]] = None, threshold: float = DEFAULT_THRESHOLD, use_composite: bool = False, composite_weights: tuple[float, float, float] = (1.0, 1.0, 1.0), include_morphological: bool = True) -> list[dict]

Scan candidates with forward AND reverse checking.

Forward: candidate vs known names (same as scan()). Reverse: generate variants of each candidate, check if they exist via exists_fn, and score any that do against the candidate.

Parameters:

Name Type Description Default
candidates list[str]

Names to check.

required
known_names list[str]

Known/legitimate names.

required
exists_fn Optional[Callable[[str], bool]]

Callable that returns True if a name exists in an external system (e.g. a PyPI HEAD request). If None, reverse scanning is skipped.

None
threshold float

Minimum similarity score.

DEFAULT_THRESHOLD
use_composite bool

Use CMU-dict composite scoring.

False
composite_weights tuple[float, float, float]

Weights for composite scoring.

(1.0, 1.0, 1.0)
include_morphological bool

Include morphological variants in reverse scan.

True

Returns all matches (forward + reverse).

Source code in phonemenal/scanning.py
def scan_with_reverse(
    candidates: list[str],
    known_names: list[str],
    *,
    exists_fn: Optional[Callable[[str], bool]] = None,
    threshold: float = DEFAULT_THRESHOLD,
    use_composite: bool = False,
    composite_weights: tuple[float, float, float] = (1.0, 1.0, 1.0),
    include_morphological: bool = True,
) -> list[dict]:
    """Scan candidates with forward AND reverse checking.

    Forward: candidate vs known names (same as scan()).
    Reverse: generate variants of each candidate, score each variant against
             the candidate, and report those that reach the threshold AND
             exist according to exists_fn.

    The variant is scored before exists_fn is consulted, so the lookup is
    only performed for variants that could actually be reported.

    Args:
        candidates: Names to check.
        known_names: Known/legitimate names.
        exists_fn: Callable that returns True if a name exists in an external system
                  (e.g. a PyPI HEAD request). If None, reverse scanning is skipped.
        threshold: Minimum similarity score.
        use_composite: Use CMU-dict composite scoring. As in check_collision(),
                      a composite score of 0.0 falls back to key similarity.
        composite_weights: Weights for composite scoring.
        include_morphological: Include morphological variants in reverse scan.

    Returns all matches (forward + reverse). Reverse matches carry an extra
    ``reverse_verified: True`` entry. When a reverse scan runs, the combined
    list is sorted by similarity descending; when exists_fn is None the
    forward results are returned in scan() order.
    """
    from .variants import generate, generate_morphological

    # Forward scan
    all_matches = scan(
        candidates,
        known_names,
        threshold=threshold,
        use_composite=use_composite,
        composite_weights=composite_weights,
    )

    if exists_fn is None:
        return all_matches

    # (candidate, matched_name) pairs already reported, in either direction.
    seen = {(m["candidate"], m["matched_name"]) for m in all_matches}

    # Reverse scan
    for candidate in candidates:
        candidate_lower = candidate.lower()
        candidate_key = phonetic_key(candidate)

        # Copy before extending so the collection returned by generate() is
        # never mutated in place.
        variants = set(generate(candidate))
        if include_morphological:
            variants.update(generate_morphological(candidate))

        # Sorted iteration keeps log order and tie order reproducible;
        # plain set iteration order for strings varies between runs.
        for variant in sorted(variants):
            # A name cannot collide with itself (mirrors check_collision()).
            if variant.lower() == candidate_lower:
                continue
            if (candidate, variant) in seen or (variant, candidate) in seen:
                continue

            # Score the variant against the candidate
            variant_key = phonetic_key(variant)
            if use_composite:
                ratio = composite_score(candidate, variant, weights=composite_weights)
                if ratio == 0.0:
                    # Composite scoring yields 0.0 for words missing from the
                    # CMU dict; use key similarity instead of dropping them.
                    ratio = fallback_similarity(candidate_key, variant_key)
            else:
                ratio = fallback_similarity(candidate_key, variant_key)

            # Cheap local score first; only then the external lookup.
            if ratio >= threshold and exists_fn(variant):
                collision_type = "exact_phonetic" if ratio == 1.0 else "near_phonetic"
                match = {
                    "candidate": candidate,
                    "matched_name": variant,
                    "similarity": round(ratio, 4),
                    "candidate_key": candidate_key,
                    "matched_key": variant_key,
                    "collision_type": collision_type,
                    "reverse_verified": True,
                }
                all_matches.append(match)
                seen.add((candidate, variant))
                log.warning(
                    "Reverse phonetic collision: %s ~ %s (score: %.2f, exists)",
                    candidate,
                    variant,
                    ratio,
                )

    all_matches.sort(key=lambda m: m.get("similarity", 0), reverse=True)
    return all_matches

format_matches(matches: list[dict]) -> str

Format collision results for display.

Returns a human-readable string summarizing all matches.

Source code in phonemenal/scanning.py
def format_matches(matches: list[dict]) -> str:
    """Render collision results as a human-readable report.

    Args:
        matches: Match dicts carrying "candidate", "matched_name",
            "similarity", "collision_type", "candidate_key" and
            "matched_key"; an optional truthy "reverse_verified" entry adds
            a " [reverse-verified]" suffix to that line.

    Returns:
        A fixed "no collisions" message for an empty list; otherwise a header
        line with the match count, a blank line, and one line per match.
        Each line starts with a severity marker: "!!!" for scores >= 0.95,
        "!!" for scores >= 0.80, "!" otherwise.
    """
    if not matches:
        return "No phonetic collisions detected."

    def render(entry: dict) -> str:
        # Build the single report line for one match.
        score = entry["similarity"]
        if score >= 0.95:
            icon = "!!!"
        elif score >= 0.80:
            icon = "!!"
        else:
            icon = "!"
        reverse_tag = " [reverse-verified]" if entry.get("reverse_verified") else ""
        return (
            f"  [{icon}] {entry['candidate']!r} ~ {entry['matched_name']!r}  "
            f"(score: {score:.2f}, type: {entry['collision_type']}, "
            f"keys: {entry['candidate_key']} / {entry['matched_key']}){reverse_tag}"
        )

    # The header ends in a newline, so joining leaves a blank line after it.
    header = f"Found {len(matches)} phonetic match(es):\n"
    return "\n".join([header, *(render(entry) for entry in matches)])