"""
Persian -> Latin transliteration engine for trademark watch.

Goal: generate the *commercial* spellings a person would actually type when
searching, not a single academic romanization. For a mark like اعجاز we want to
surface EJAZ / Ejaz / E'jaz / Ejaaz so an opposition search on any of them hits.

Design:
  - A character map with, for several letters, MULTIPLE plausible Latin forms
    (e.g. ث -> s / th; ا -> a / aa). We expand these into a small candidate set.
  - Persian has no written short vowels, so we keep candidates conservative and
    cap the count; the admin curates/corrects per mark on the edit screen.
  - Output is a de-duplicated, ranked list of variants. The first is the "best
    guess" used as the default transliteration.

This is deterministic and dependency-free. The API translator (translator.py)
remains the source for *meaning* translation; this is purely script conversion.
"""
import re
import unicodedata

# Each Persian letter maps to one or more Latin candidates.
# Order matters: the first candidate is preferred for the "best guess".
# Commercial spellings (kh, gh) are preferred over academic (x, q).
_MAP = {
    "ا": ["a", "aa"], "آ": ["a", "aa"], "أ": ["a"], "إ": ["e", "a"],
    "ب": ["b"], "پ": ["p"], "ت": ["t"], "ث": ["s", "th"],
    "ج": ["j"], "چ": ["ch"], "ح": ["h"], "خ": ["kh"],
    "د": ["d"], "ذ": ["z", "dh"], "ر": ["r"], "ز": ["z"], "ژ": ["zh", "j"],
    "س": ["s"], "ش": ["sh"], "ص": ["s"], "ض": ["z"], "ط": ["t"], "ظ": ["z"],
    "ع": ["", "'", "a"], "غ": ["gh"], "ف": ["f"], "ق": ["gh", "gh", "q"],
    "ک": ["k"], "گ": ["g"], "ل": ["l"], "م": ["m"], "ن": ["n"],
    "و": ["o", "u", "v", "w"], "ه": ["h", "e"], "ی": ["i", "y", "ee"],
    "ئ": ["'", "y", "i"], "ء": ["", "'"],
    "َ": ["a"], "ُ": ["o", "u"], "ِ": ["e", "i"],
    " ": [" "], "\u200c": [""],
}

# Letters whose alternate forms branch into extra variants.
_BRANCHING = {"ا", "و", "ی", "ع", "ق", "ه", "ث", "ذ"}

_VOWELS = set("aeiouAEIOU")

_MAX_VARIANTS = 6
_CHAR_UNIFY = str.maketrans({"ي": "ی", "ك": "ک", "ۀ": "ه", "ة": "ه"})


def _clean(s):
    """
    Normalize raw input into the text that gets transliterated.

    Steps, in order:
      1. NFKC-normalize (None or empty input is treated as "").
      2. Apply _CHAR_UNIFY to fold letter variants onto one form.
      3. Delete tatweel (U+0640), which only stretches a word visually.
      4. Turn every character outside the Arabic block U+0600-U+06FF (other
         than whitespace and ZWNJ), and Arabic-script punctuation, into a
         space so it acts as a word break.
      5. Collapse whitespace runs to one space and trim the ends.

    Returns the cleaned string, which may be empty.
    """
    s = unicodedata.normalize("NFKC", s or "")
    s = s.translate(_CHAR_UNIFY)
    # Deleted rather than replaced with a space: tatweel sits INSIDE words, so
    # a space would split the word in two.
    s = s.replace("\u0640", "")
    # First alternative: anything that is not Arabic-block, whitespace or ZWNJ.
    # Second alternative: Arabic comma, semicolon, question mark, percent /
    # decimal / thousands separators, five-pointed star and full stop, which
    # would otherwise survive and show up verbatim in the Latin output.
    s = re.sub(
        r"[^\u0600-\u06FF\s\u200c]|[\u060C\u061B\u061F\u066A-\u066D\u06D4]",
        " ",
        s,
    )
    return re.sub(r"\s+", " ", s).strip()


def _titlecase(s):
    """
    Upper-case the first character of each space-separated word.

    The rest of every word is left exactly as given (no lower-casing), and
    empty pieces produced by repeated or edge spaces are discarded, so the
    result is always joined with single spaces.
    """
    words = []
    for word in s.split(" "):
        if not word:
            continue
        head, tail = word[:1], word[1:]
        words.append(head.upper() + tail)
    return " ".join(words)


def _latin_forms(ch):
    """
    Return the list of Latin candidates for one character of cleaned input.

    A character without a _MAP entry is never passed through verbatim, so no
    Arabic-script residue can end up inside a "Latin" variant: decimal digits
    become their ASCII digit, punctuation becomes a word break, and anything
    else (combining marks, unmapped letters) is dropped.
    """
    forms = _MAP.get(ch)
    if forms is not None:
        return forms
    if ch.isdecimal():
        return [str(unicodedata.decimal(ch))]
    if unicodedata.category(ch).startswith("P"):
        return [" "]
    return [""]


def transliterate(persian, max_variants=_MAX_VARIANTS):
    """
    Return a ranked, de-duplicated list of Latin transliteration variants for a
    Persian word/phrase. Empty input -> [].

    Args:
        persian: Persian text; None is treated as empty.
        max_variants: maximum number of variants to return. None returns every
            variant; zero or a negative number returns [].

    Returns:
        Title-cased variants, best-ranked first. Variants are unique when
        compared case-insensitively and always contain at least one letter or
        digit.
    """
    text = _clean(persian)
    if not text:
        return []

    # Build candidate strings by expanding only the branching letters.
    candidates = [""]
    for ch in text:
        forms = _latin_forms(ch)
        if ch in _BRANCHING and len(forms) > 1:
            # dict.fromkeys drops repeated forms but keeps preference order;
            # at most three alternates per letter are expanded.
            branch = list(dict.fromkeys(forms))[:3]
            candidates = [c + f for c in candidates for f in branch]
            # Different paths can spell the same prefix ("a"+"a" vs "aa"+"");
            # drop those before the cap so it is spent on distinct spellings.
            candidates = list(dict.fromkeys(candidates))
        else:
            primary = forms[0]
            candidates = [c + primary for c in candidates]
        if len(candidates) > 64:  # safety cap before final pruning
            candidates = candidates[:64]

    # Normalize whitespace, collapse repeats, drop unusable candidates.
    out = []
    seen = set()
    for c in candidates:
        c = re.sub(r"\s+", " ", c).strip()
        # Cap triple+ repeats at two. Digits are excluded so that a number
        # such as 1000 is not shortened.
        c = re.sub(r"(\D)\1{2,}", r"\1\1", c)
        # Empty or apostrophe-only strings are not searchable spellings.
        if not any(x.isalnum() for x in c):
            continue
        tc = _titlecase(c)
        key = tc.lower()
        if key not in seen:
            seen.add(key)
            out.append(tc)

    # Rank by readability: prefer a sensible vowel ratio (real words have
    # vowels), penalize apostrophes and very short/consonant-cluster forms.
    def score(s):
        letters = [c for c in s if c.isalpha()]
        if not letters:
            # Digit-only variant: rank after every real word. Same tuple
            # shape as below so sort keys always compare element by element.
            return (9.0, len(s), s)
        vowels = sum(1 for c in letters if c in _VOWELS)
        ratio = vowels / len(letters)
        # ideal vowel ratio ~0.35-0.5; distance from 0.4 is the penalty
        vowel_pen = abs(ratio - 0.4)
        apos_pen = s.count("'") * 0.5
        short_pen = 0.5 if len(letters) < 3 else 0
        return (round(vowel_pen + apos_pen + short_pen, 3), len(s), s)

    out.sort(key=score)
    if max_variants is None:
        return out
    # A negative limit would otherwise slice from the END of the list.
    return out[:max(0, max_variants)]


def best_guess(persian):
    """
    Return the single top-ranked transliteration of *persian*.

    Asks transliterate() for exactly one variant and returns it, or "" when
    no variant is produced.
    """
    for variant in transliterate(persian, max_variants=1):
        return variant
    return ""


# Variant "type" tags for storage/UX
# Machine-generated spelling.
AUTO = "auto"
# Latin text taken from the PDF itself.
LATIN_FROM_PDF = "latin_pdf"
# NOTE(review): presumably set when an admin enters a spelling by hand; not
# referenced in the code shown here - confirm against the edit screen.
MANUAL = "manual"
# NOTE(review): presumably marks a meaning translation rather than a script
# conversion; not referenced in the code shown here - confirm.
TRANSLATION = "translation"


def build_variants(persian, latin_from_pdf=None, api_translit=None, max_rule=4):
    """
    Merge every transliteration source into one ordered, de-duplicated list
    ready for storage.

    Sources are consumed in priority order, and the first one that yields a
    usable value becomes the primary (default report/display name):
      1. latin_from_pdf - Latin text written in the PDF itself
      2. api_translit   - API-provided transliteration
      3. rule-based variants from transliterate(), at most max_rule of them

    Rule-based variants are appended even when a higher-priority source
    exists, so a search on any common spelling still matches.

    Args:
        persian: Persian text handed to transliterate().
        latin_from_pdf: optional Latin spelling; tagged LATIN_FROM_PDF.
        api_translit: optional API spelling; tagged AUTO.
        max_rule: number of rule-based variants requested from transliterate().

    Returns:
        A list of dicts with keys "value" (whitespace-trimmed spelling),
        "type" (source tag) and "is_primary" (True for the first entry only).
        Values are unique when compared case-insensitively; the earliest
        source wins. Returns [] when no source yields a value.
    """
    def tagged_sources():
        # Lazy on purpose: transliterate() runs only after the two
        # authoritative values have been processed.
        yield latin_from_pdf, LATIN_FROM_PDF
        yield api_translit, AUTO
        for rule_variant in transliterate(persian, max_variants=max_rule):
            yield rule_variant, AUTO

    # Keyed by lower-cased spelling; dicts keep insertion order, which is
    # also the priority order.
    unique = {}
    for raw, tag in tagged_sources():
        text = raw.strip() if raw else ""
        if text:
            unique.setdefault(text.lower(), (text, tag))

    return [
        {"value": text, "type": tag, "is_primary": position == 0}
        for position, (text, tag) in enumerate(unique.values())
    ]