"""
PDF extraction for Iran trademark publication PDFs (آگهی تقاضای ثبت علامت تجاری).

Tuned to the real bulletin format issued by مرکز مالکیت معنوی (Iran IP Office).

Key insight: pdfplumber returns the Farsi text in VISUAL order using Arabic
presentation forms. We normalize with NFKC (presentation forms -> base letters)
and reorder each line with the bidi algorithm to recover LOGICAL order, then
parse labelled fields. A handful of لا-ligature artifacts are corrected.
"""
import os
import re
import unicodedata
import datetime as dt

import pdfplumber
from bidi.algorithm import get_display

try:
    import jdatetime
    HAVE_JDATE = True
except Exception:
    HAVE_JDATE = False

# (broken, fixed) word pairs. The lam-alef ligature comes out of the
# visual->logical reordering with its two letters swapped ("ال" instead of
# "لا"); these are the affected words seen in the bulletin labels.
_LIG_FIXES = [
    ("عالمت", "علامت"), ("عالئم", "علائم"), ("اعالم", "اعلام"),
    ("كاالها", "کالاها"), ("كاال", "کالا"),
]
# The kaf patterns above are spelled with Arabic kaf (U+0643). Add the same
# patterns spelled with Persian kaf (U+06A9) so the artifact is also repaired
# in PDFs that already use the Persian letter.
_LIG_FIXES += [
    (broken.replace("\u0643", "\u06a9"), fixed)
    for broken, fixed in _LIG_FIXES
    if "\u0643" in broken
]

# Unify Arabic vs Persian letter variants so label matching is reliable
# regardless of which form the PDF used (yeh: ي vs ی, kaf: ك vs ک).
# NOTE(review): the alef-madda entry appears to be an identity mapping, i.e.
# alef-madda is left unchanged by this table.
_CHAR_UNIFY = str.maketrans({
    "ي": "ی", "ك": "ک", "ﻯ": "ی", "ﻰ": "ی", "ﻲ": "ی", "ﻱ": "ی",
    "ة": "ه", "ﺓ": "ه", "أ": "ا", "إ": "ا", "آ": "آ",
    # Alef maksura (U+0649). NFKC folds the presentation forms U+FEEF/U+FEF0
    # listed above into this base letter, so text that has already been
    # NFKC-normalized needs this entry for a final yeh to be unified.
    "\u0649": "\u06cc",
})

# Translation table for str.translate: Persian digits first, then
# Arabic-Indic digits, each mapped to the matching ASCII digit.
_DIGITS = {
    ord(glyph): str(value)
    for glyphs in ("۰۱۲۳۴۵۶۷۸۹", "٠١٢٣٤٥٦٧٨٩")
    for value, glyph in enumerate(glyphs)
}


def asc_digits(s):
    """Return *s* with Persian and Arabic-Indic digits replaced by ASCII digits.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not s:
        return s
    return s.translate(_DIGITS)


# Lam-alef ligature presentation forms (U+FEF5-U+FEFC) expanded to their two
# base letters in VISUAL order (alef first, then lam). NFKC would expand each
# ligature to logical "lam + alef"; because the surrounding text is in visual
# order, the bidi reordering that follows would flip that pair into the wrong
# "alef + lam". Expanding in visual order up front lets the reordering produce
# the correct "lam + alef" for every word, not only the hard-coded ones.
_LAM_ALEF_VISUAL = {
    0xFEF5: "\u0622\u0644", 0xFEF6: "\u0622\u0644",  # lam + alef with madda
    0xFEF7: "\u0623\u0644", 0xFEF8: "\u0623\u0644",  # lam + alef, hamza above
    0xFEF9: "\u0625\u0644", 0xFEFA: "\u0625\u0644",  # lam + alef, hamza below
    0xFEFB: "\u0627\u0644", 0xFEFC: "\u0627\u0644",  # lam + alef
}


def normalize_lines(raw):
    """Visual-order presentation-form text -> logical-order base-letter lines.

    Steps, in order:
      1. expand lam-alef ligature code points in visual order (see
         ``_LAM_ALEF_VISUAL``);
      2. NFKC-normalize (presentation forms -> base letters);
      3. per line: bidi-reorder, unify Arabic/Persian letter variants, then
         apply the ``_LIG_FIXES`` word repairs as a fallback for ligatures that
         arrived already expanded.

    ``raw`` may be ``None`` or empty, which yields an empty list. Each returned
    line is stripped of surrounding whitespace.
    """
    visual = (raw or "").translate(_LAM_ALEF_VISUAL)
    norm = unicodedata.normalize("NFKC", visual)

    # Unify the repair table with the same letter mapping as the text, so a
    # pattern written with Arabic kaf still matches once the line has been
    # unified to Persian kaf.
    fixes = [
        (broken.translate(_CHAR_UNIFY), fixed.translate(_CHAR_UNIFY))
        for broken, fixed in _LIG_FIXES
    ]

    out = []
    for ln in norm.splitlines():
        ln = get_display(ln).translate(_CHAR_UNIFY)
        for broken, fixed in fixes:
            ln = ln.replace(broken, fixed)
        out.append(ln.strip())
    return out


def jalali_to_gregorian(s):
    """Convert a Jalali date string (YYYY/MM/DD) to a Gregorian date, or None.

    Persian/Arabic-Indic digits are accepted. The first ``YYYY/M/D``-shaped
    run found in *s* is used. ``None`` is returned when no such run exists,
    when the optional ``jdatetime`` package is unavailable, or when the
    conversion raises (e.g. for an impossible date).
    """
    found = re.search(r"(\d{4})/(\d{1,2})/(\d{1,2})", asc_digits(s or ""))
    if found is None or not HAVE_JDATE:
        return None
    year, month, day = (int(part) for part in found.groups())
    try:
        return jdatetime.date(year, month, day).togregorian()
    except Exception:
        return None


def _search(pattern, text, flags=0, group=1):
    """Return the stripped text of *group* from the first match, or None.

    ``None`` is returned both when *pattern* does not match *text* and when it
    matches but the requested group took no part in the match (an optional or
    alternated group), instead of failing on ``None.strip()``.
    """
    m = re.search(pattern, text, flags)
    if m is None:
        return None
    value = m.group(group)
    return value.strip() if value is not None else None


# One digit in any script the bulletin may use: ASCII, Arabic-Indic or Persian.
_NUM = r"[\d\u0660-\u0669\u06f0-\u06f9]"

# A slash-separated three-part date made of such digits.
_DATE_PAT = _NUM + r"+/" + _NUM + r"+/" + _NUM + r"+"

# The word "agahi" (advertisement): alef OR alef-madda, then gaf, heh, Persian
# yeh. Both spellings are accepted because the normalized text may carry
# either one.
_ADV_WORD = r"[\u0627\u0622]\u06af\u0647\u06cc"

# Address label and the rest of its line.
_ADDRESS_RE = re.compile(r"نشانی\s*:\s*([^\n]+)")

# "class <n>" label followed by the goods/services on the same line.
_CLASS_RE = re.compile(r"طبقه\s*:?\s*(" + _NUM + r"+)\s*:?\s*([^\n]*)")


def _class_blocks(text):
    """Return ``(class_number, goods)`` pairs found in *text*.

    The class label is the same word as "floor" in a postal address, so the
    address line is cut out before scanning; otherwise "floor 3" would be
    reported as class 3 and could swallow the following real class line. If
    that leaves no match at all, the whole text is scanned as a fallback.
    """
    addr = _ADDRESS_RE.search(text)
    if addr:
        found = _CLASS_RE.findall(text[:addr.start()] + text[addr.end():])
        if found:
            return found
    return _CLASS_RE.findall(text)


def extract_application(pdf_path, image_out_dir, app_idx=0):
    """Parse one Iran TM publication PDF. Returns (data: dict, confidence_ok: bool).

    Only the first page is read. Its text is normalized with
    ``normalize_lines`` and labelled fields are pulled out with regular
    expressions; the largest image on the page is saved via
    ``_extract_first_image``.

    Args:
        pdf_path: path of the PDF to parse.
        image_out_dir: directory that receives the extracted mark image.
        app_idx: index used in the saved image's file name.

    Returns:
        ``(data, ok)``. ``data`` always has every key listed below; fields
        that were not found stay ``None``. ``ok`` is True when an advert
        number, an application number and at least one of mark text, mark
        description or owner were found. If opening or reading the PDF raises,
        the mostly-empty ``data`` and ``False`` are returned.
    """
    data = {
        "adv_number": None, "adv_date": None, "adv_date_raw": None,
        "app_number": None, "app_date": None, "app_date_raw": None,
        "mark_text": None, "mark_text_en": None, "mark_description": None,
        "owner": None, "owner_entity": None, "owner_reg_no": None,
        "nationality": None, "legal_rep": None, "address": None,
        "nice_class": None, "goods_services": None, "disclaimer": None,
        "mark_image_path": None,
        "source_pdf_path": pdf_path, "source_pdf_name": os.path.basename(pdf_path),
    }
    # Best-effort: any failure while reading the PDF yields "not confident".
    try:
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[0]
            text = "\n".join(normalize_lines(page.extract_text()))
            data["mark_image_path"] = _extract_first_image(
                page, pdf_path, image_out_dir, app_idx)
    except Exception:
        return data, False

    # --- advertisement number / date ---
    data["adv_number"] = asc_digits(
        _search(r"شماره " + _ADV_WORD + r"\s*:\s*(" + _NUM + r"+)", text))
    adv_raw = _search(r"تاریخ " + _ADV_WORD + r"\s*:\s*(" + _DATE_PAT + r")", text)
    data["adv_date_raw"] = asc_digits(adv_raw)
    data["adv_date"] = jalali_to_gregorian(adv_raw)

    # --- application number / date ---
    data["app_number"] = asc_digits(_search(r"شماره\s+(" + _NUM + r"{8,})\s+مورخ", text))
    app_raw = _search(r"مورخ\s*:\s*(" + _DATE_PAT + r")", text)
    data["app_date_raw"] = asc_digits(app_raw)
    data["app_date"] = jalali_to_gregorian(app_raw)

    # --- owner line: name, then optional legal-entity type and registry no. ---
    owner = _search(r"مالک اظهارنامه\s*:\s*(.+)", text)
    if owner:
        ent = re.search(r"نوع شخص حقوقی\s*:\s*([^\n]+?)(?:شماره ثبت|$)", owner)
        reg = re.search(r"شماره ثبت\s*:\s*(" + _NUM + r"+)", owner)
        data["owner_entity"] = ent.group(1).strip() if ent else None
        data["owner_reg_no"] = asc_digits(reg.group(1)) if reg else None
        data["owner"] = re.split(r"\s+نوع شخص حقوقی", owner)[0].strip()

    data["nationality"] = _search(r"تابعیت\s*:\s*([^\n]+)", text)
    data["legal_rep"] = _search(r"نماینده قانونی\s*:\s*([^\n]+)", text)
    data["address"] = _search(r"نشانی\s*:\s*([^\n]+)", text)

    # --- mark components: runs until the disclaimer, goods or class label ---
    comp = re.search(r"اجزاء علامت\s*:\s*(.+?)(?:متقاضی نسبت|کالاها|طبقه\s*:|\Z)", text, re.S)
    if comp:
        desc = " ".join(comp.group(1).split())
        data["mark_description"] = desc          # full "components" description
        latin = re.findall(r"[A-Za-z][A-Za-z]+(?:\s+[A-Za-z]+)*", desc)
        if latin:
            data["mark_text_en"] = " ".join(t.strip() for t in latin)
        # Mark NAME: the words after the "word(s)" keyword, up to a descriptor.
        data["mark_text"] = _extract_mark_name(desc) or None

    # --- Nice classes and their goods/services ---
    classes, goods = [], []
    for cls, items in _class_blocks(text):
        c = asc_digits(cls)
        classes.append(c)
        items = items.strip().rstrip("،").strip()
        if items:
            goods.append(f"[{c}] {items}")
    if classes:
        # dict.fromkeys drops duplicates while keeping first-seen order.
        data["nice_class"] = ", ".join(dict.fromkeys(classes))
    if goods:
        data["goods_services"] = "\n".join(goods)

    # --- disclaimer: full sentence if it can be isolated, else a stock text ---
    if re.search(r"حق استفاده انحصاری ندارد", text):
        dm = re.search(r"(متقاضی نسبت به .+?ندارد)", text, re.S)
        data["disclaimer"] = (" ".join(dm.group(1).split()) if dm
                              else "متقاضی حق استفاده انحصاری ندارد")

    ok = bool(data["adv_number"] and data["app_number"]
              and (data["mark_text"] or data["mark_description"] or data["owner"]))
    return data, ok


# Phrases (and punctuation) whose first occurrence terminates the mark name
# inside the components description. Leading whitespace is part of the match.
_NAME_STOP = re.compile(
    r"\s*(?:به فارسی|به انگلیسی|بفارسی|بانگلیسی|به رنگ|به صورت|به همراه|"
    r"به شکل|داخل|درميان|درمیان|در ميان|در میان|طبق نمونه|و تصویر|به لاتین|"
    r"به لاتين|،|\.)"
)


def _extract_mark_name(desc):
    """
    Derive the worded mark NAME from the components description.

    The text following the first 'word(s)/phrase' keyword is used when such a
    keyword is present, otherwise the whole description. It is truncated at
    the first stop phrase, Latin letter runs are removed, and whitespace and
    edge punctuation are tidied.

    Returns the remaining (non-Latin) name, or None when the input is empty,
    fewer than two characters remain, or more than six words remain (which is
    taken to be a description rather than a name).
    """
    if not desc:
        return None

    def tidy(value):
        # Collapse whitespace runs, then trim spaces and edge punctuation.
        return " ".join(value.split()).strip(" .،-")

    keyword = re.search(r"(?:کلمات|کلمه|عبارت|واژه)\s+(.+)", desc)
    candidate = desc if keyword is None else keyword.group(1)

    cut = _NAME_STOP.search(candidate)
    if cut is not None:
        candidate = candidate[:cut.start()]

    candidate = tidy(candidate)
    # Drop Latin runs (they may be glued directly onto the Farsi letters).
    candidate = tidy(re.sub(r"[A-Za-z][A-Za-z ]*", " ", candidate))

    if len(candidate) < 2:
        return None
    if len(candidate.split()) > 6:
        return None
    return candidate


def _extract_first_image(page, pdf_path, out_dir, app_idx):
    """Save the largest image on the page (the mark logo).

    The image with the largest width*height is rendered at 200 dpi from its
    bounding box (clamped to the page) and written as
    ``mark_<app_idx>_<pdf base name>.png`` inside *out_dir*, which is created
    if it does not exist yet.

    Returns the path of the written file, or None when the page has no images,
    the largest image is under 40 units in width or height, or any step fails
    (best-effort: errors are swallowed).
    """
    try:
        images = page.images
        if not images:
            return None
        img = max(images, key=lambda i: (i["width"] * i["height"]))
        # Anything this small is not treated as the mark image.
        if img["width"] < 40 or img["height"] < 40:
            return None
        bbox = (max(img["x0"], 0), max(img["top"], 0),
                min(img["x1"], page.width), min(img["bottom"], page.height))
        # Without this, a missing directory makes save() fail and the image
        # is silently lost through the blanket except below.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        cropped = page.crop(bbox).to_image(resolution=200)
        fname = f"mark_{app_idx}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
        out = os.path.join(out_dir, fname)
        cropped.save(out)
        return out
    except Exception:
        return None
