#!/usr/bin/env python
"""Audit .po files for destructive auto-translation mistakes.

Focuses on runtime-safety issues:
- placeholder mismatches (printf-style, named, positional)
- brace-format placeholders
- HTML tag mismatches
- newline count mismatches
- accelerator count mismatches (ampersand shortcuts)

Writes a concise summary and can emit JSON for archival.

This tool is intentionally conservative: it does not attempt to judge language quality.
"""

from __future__ import annotations

import argparse
import json
import re
from collections import Counter
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple

import polib


# --- token extraction ------------------------------------------------------

# %(<name>)<flags><type>
RE_PRINTF_NAMED = re.compile(r"%\([^)]+\)[#0\- +]*\d*(?:\.\d+)?[A-Za-z]")
# %<pos>$<flags><type> (e.g. %1$s)
RE_PRINTF_POSITIONAL = re.compile(r"%\d+\$[#0\- +]*\d*(?:\.\d+)?[A-Za-z]")
# %<flags><type> excluding %%
RE_PRINTF_SIMPLE = re.compile(r"%(?!%)[#0\- +]*\d*(?:\.\d+)?[A-Za-z]")

# Brace-format placeholders: {name}, {0}, {} (but avoid {{ / }})
RE_BRACES = re.compile(r"(?<!\{)\{[^{}]*\}(?!\})")

# HTML-like tags
RE_TAGS = re.compile(r"<\s*(/?)\s*([a-zA-Z0-9]+)(?:\s+[^>]*)?>")

# Ampersand accelerators (Qt-style)
RE_ACCEL = re.compile(r"&[A-Za-z]")

# Catastrophic patterns: translated HTML tags (<b> -> <б>), broken % tokens
# Keep this narrow to avoid false positives on normal percent signs like "100%".
RE_BROKEN_PERCENT_TOKEN = re.compile(r"%(?!%)[^\x00-\x7F]")
RE_PRINTF_NAMED_NAME = re.compile(r"%\(([^)]+)\)")


def _placeholders(text: str) -> List[str]:
    if not text:
        return []
    out: List[str] = []
    out.extend(m.group(0) for m in RE_PRINTF_NAMED.finditer(text))
    out.extend(m.group(0) for m in RE_PRINTF_POSITIONAL.finditer(text))

    stripped = RE_PRINTF_NAMED.sub("", text)
    stripped = RE_PRINTF_POSITIONAL.sub("", stripped)
    out.extend(m.group(0) for m in RE_PRINTF_SIMPLE.finditer(stripped))

    out.extend(m.group(0) for m in RE_BRACES.finditer(text))
    return out


def _tags(text: str) -> List[str]:
    """Return a lower-cased signature for each HTML-like tag in *text*.

    Opening tags are reported by name ("b", "br") and closing tags with a
    leading slash ("/b"). Attributes are ignored. Falsy input yields [].
    """
    signatures: List[str] = []
    if not text:
        return signatures

    for match in RE_TAGS.finditer(text):
        slash, tag_name = match.group(1), match.group(2)
        prefix = "/" if slash else ""
        signatures.append(prefix + (tag_name or "").lower())
    return signatures


def _accels(text: str) -> List[str]:
    """Return every ampersand accelerator ("&" plus a letter) found in *text*.

    Falsy input yields an empty list.
    """
    # RE_ACCEL has no capture groups, so findall yields the full matches.
    return RE_ACCEL.findall(text) if text else []


# --- audit -----------------------------------------------------------------


@dataclass
class EntryIssue:
    """One catalogue entry that triggered at least one audit check."""

    # Source string of the entry.
    msgid: str
    # Translated string that was checked against msgid.
    msgstr: str
    # Labels of the checks that fired, e.g. "tag_mismatch".
    issues: List[str]


@dataclass
class FileReport:
    """Aggregated audit counters for a single .po file."""

    # Identification of the audited file.
    locale: str
    path: str

    # Entry totals.
    entries_total: int
    entries_translated: int

    # Number of entries affected by each kind of problem.
    placeholder_mismatch: int
    placeholder_missing: int
    placeholder_extra: int
    tag_mismatch: int
    newline_mismatch: int
    accel_mismatch: int
    catastrophic: int

    # A capped sample of offending entries.
    examples: List[EntryIssue]


def _catastrophic_markers(msgstr: str) -> List[str]:
    """Return issue labels for patterns in *msgstr* that likely break at runtime."""
    markers: List[str] = []

    if RE_BROKEN_PERCENT_TOKEN.search(msgstr):
        markers.append("broken_percent_token")

    # Translated named placeholder keys: %(name)s -> %(नाम)s
    if any(not m.group(1).isascii() for m in RE_PRINTF_NAMED_NAME.finditer(msgstr)):
        markers.append("nonascii_named_placeholder_key")

    # Non-ASCII characters anywhere inside a "<...>" segment.
    # NOTE(review): this only runs when msgstr still contains at least one
    # tag RE_TAGS recognises, so a string whose tags were *all* translated is
    # not flagged here; it also fires on non-ASCII attribute values. Confirm
    # whether either should change.
    if RE_TAGS.search(msgstr) and any(
        not m.group(0).isascii() for m in re.finditer(r"<[^>]*>", msgstr)
    ):
        markers.append("nonascii_in_html_tag")

    return markers


def audit_po(path: Path, max_examples: int = 8) -> FileReport:
    """Audit one .po file and return its per-check counters.

    Obsolete entries and entries with an empty msgid are skipped. Entries
    whose msgstr is blank count towards ``entries_total`` only; every other
    entry is checked for placeholder, tag, newline, accelerator and
    catastrophic-marker problems.

    Args:
        path: Location of the .po file; its stem is used as the locale name.
        max_examples: Upper bound on the offending entries kept as examples.

    Returns:
        A FileReport with one counter per check and the sampled examples.

    NOTE(review): errors raised by ``polib.pofile`` for unreadable or
    malformed files are not handled here and propagate to the caller.
    """
    po = polib.pofile(str(path))
    locale = path.stem

    entries_total = 0
    entries_translated = 0

    placeholder_mismatch = 0
    placeholder_missing = 0
    placeholder_extra = 0
    tag_mismatch = 0
    newline_mismatch = 0
    accel_mismatch = 0
    catastrophic = 0

    examples: List[EntryIssue] = []

    # polib hands back unescaped strings, so a line break is a real "\n"
    # character. The literal two-character form is compared as well so a
    # deliberately escaped backslash-n in the source is still tracked.
    newline_forms = ("\n", "\\n")

    for e in po:
        if e.obsolete:
            continue
        msgid = e.msgid or ""
        # NOTE(review): only e.msgstr is read; plural entries presumably keep
        # their translations in msgstr_plural and would then be treated as
        # untranslated here — confirm against polib's entry model.
        msgstr = e.msgstr or ""
        if not msgid:
            continue

        entries_total += 1

        # Untranslated entries are empty, so there is nothing to compare.
        if not msgstr.strip():
            continue
        entries_translated += 1

        issues: List[str] = []

        a = Counter(_placeholders(msgid))
        b = Counter(_placeholders(msgstr))
        if a != b:
            placeholder_mismatch += 1
            # Counter subtraction keeps only positive counts, so each side
            # is non-empty exactly when that side has surplus tokens.
            if a - b:
                placeholder_missing += 1
                issues.append("missing_placeholders")
            if b - a:
                placeholder_extra += 1
                issues.append("extra_placeholders")

        if Counter(_tags(msgid)) != Counter(_tags(msgstr)):
            tag_mismatch += 1
            issues.append("tag_mismatch")

        if any(msgid.count(nl) != msgstr.count(nl) for nl in newline_forms):
            newline_mismatch += 1
            issues.append("newline_mismatch")

        if len(_accels(msgid)) != len(_accels(msgstr)):
            accel_mismatch += 1
            issues.append("accelerator_mismatch")

        # Catastrophic markers (likely runtime breakage)
        markers = _catastrophic_markers(msgstr)
        if markers:
            catastrophic += 1
            issues.extend(markers)

        if issues and len(examples) < max_examples:
            examples.append(EntryIssue(msgid=msgid, msgstr=msgstr, issues=issues))

    return FileReport(
        locale=locale,
        path=str(path.as_posix()),
        entries_total=entries_total,
        entries_translated=entries_translated,
        placeholder_mismatch=placeholder_mismatch,
        placeholder_missing=placeholder_missing,
        placeholder_extra=placeholder_extra,
        tag_mismatch=tag_mismatch,
        newline_mismatch=newline_mismatch,
        accel_mismatch=accel_mismatch,
        catastrophic=catastrophic,
        examples=examples,
    )


def triage(report: FileReport) -> Tuple[str, List[str]]:
    """Classify a file report as "KEEP" or "RETRANSLATE".

    Any catastrophic entry forces RETRANSLATE. Otherwise the first counter
    that reaches its threshold decides, and only that counter is reported
    as the reason. Files that stay below every threshold are kept, with all
    non-zero minor counters listed as reasons.

    Returns:
        A ``(bucket, reasons)`` pair.
    """
    if report.catastrophic:
        return "RETRANSLATE", [f"catastrophic={report.catastrophic}"]

    # (label, observed count, threshold), checked in priority order.
    # Many "extra placeholders" tend to indicate an unrelated msgstr.
    hard_limits = (
        ("placeholder_mismatch", report.placeholder_mismatch, 10),
        ("tag_mismatch", report.tag_mismatch, 3),
        ("extra_placeholders", report.placeholder_extra, 8),
    )
    for label, observed, limit in hard_limits:
        if observed >= limit:
            return "RETRANSLATE", [f"{label}={observed}"]

    # Below every threshold: keep the file but surface whatever was seen,
    # so the user can still choose to redo it.
    minor = (
        ("placeholder_mismatch", report.placeholder_mismatch),
        ("tag_mismatch", report.tag_mismatch),
        ("newline_mismatch", report.newline_mismatch),
        ("accelerator_mismatch", report.accel_mismatch),
    )
    return "KEEP", [f"{label}={observed}" for label, observed in minor if observed]


def main(argv: List[str] | None = None) -> int:
    """Audit every .po file in a directory and print a triage summary.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) lets argparse read them from ``sys.argv``.

    Returns:
        0 on success, 2 when the translations directory does not exist.
    """
    import sys  # local import: only needed for the stderr diagnostic below

    ap = argparse.ArgumentParser()
    ap.add_argument("--translations", default="translations", help="Path to translations dir")
    ap.add_argument("--json", default="", help="Write full JSON report to path")
    ap.add_argument("--examples", type=int, default=6, help="Max examples per file")
    args = ap.parse_args(argv)

    tdir = Path(args.translations)
    if not tdir.is_dir():
        # An empty glob on a missing directory would otherwise look like a
        # clean run with nothing to retranslate.
        print(f"ERROR: translations directory not found: {tdir.as_posix()}", file=sys.stderr)
        return 2

    # "*.po" cannot match a ".pot" template, so no extra name filter is needed.
    po_files = sorted(tdir.glob("*.po"), key=lambda p: p.name.casefold())

    reports: List[FileReport] = [audit_po(p, max_examples=args.examples) for p in po_files]

    keep: List[Tuple[str, List[str]]] = []
    redo: List[Tuple[str, List[str]]] = []
    for r in reports:
        bucket, reasons = triage(r)
        if bucket == "RETRANSLATE":
            redo.append((r.locale, reasons))
        else:
            keep.append((r.locale, reasons))

    def _fmt(items: List[Tuple[str, List[str]]]) -> str:
        """Join the locale names of *items* for the one-line summary."""
        return ", ".join(loc for loc, _ in items)

    print(f"KEEP ({len(keep)}): {_fmt(keep)}")
    print(f"RETRANSLATE ({len(redo)}): {_fmt(redo)}")

    # Print brief per-locale reason lines for RETRANSLATE
    for loc, reasons in redo:
        print(f"- {loc}: {'; '.join(reasons) if reasons else 'issues'}")

    if args.json:
        out_path = Path(args.json)
        data: Dict[str, Any] = {
            "keep": [{"locale": loc, "reasons": reasons} for loc, reasons in keep],
            "retranslate": [{"locale": loc, "reasons": reasons} for loc, reasons in redo],
            "reports": [asdict(r) for r in reports],
        }
        # Create the destination directory so an archival path that does not
        # exist yet does not abort the run after all the work is done.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"WROTE_JSON: {out_path.as_posix()}")

    return 0


# Script entry point: end the process with main()'s return value as its exit status.
if __name__ == "__main__":
    raise SystemExit(main())
