# OCR-SPRIN-SERVICE/src/ocr_sprint/pipeline/extract/personnel_text.py

"""Text-based fallback personnel extractor.

PP-Structure (Phase 3) is the primary path for personnel rows because it
preserves the table grid. But PP-Structure can fail in two ways on real
sprint scans:

1. The table is not detected at all (low-quality scan, watermark, atypical
   layout) — `extract_personnel` returns an empty list.
2. The table IS detected but the column mapping is too sparse, so each row
   collapses to a single ``nama`` cell with all other fields ``None``. This
   is what was observed on a real Polres Cimahi sprint where the OCR
   produced 24 rows with only ``nama`` populated.

This module provides a regex/heuristic fallback that operates directly on
the flat OCR text. It is deliberately conservative: a row must have BOTH a
recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
generate the kind of "name-only" rows that motivated the fallback in the
first place.
"""

from __future__ import annotations

import re

from ocr_sprint.data.master_pangkat import (
    PANGKAT_VARIANTS,
    is_valid_pangkat,
    normalize_pangkat,
)
from ocr_sprint.schemas.personnel import PersonnelEntry

# Every known rank spelling, de-duplicated and ordered longest first so that
# multi-word ranks like "KOMBES POL" are tried before the single-word
# "KOMBES" in the alternation built below.
# NOTE(review): assumes PANGKAT_VARIANTS maps each rank to an iterable of
# string spellings — confirm against ocr_sprint.data.master_pangkat.
_RANK_TOKENS: tuple[str, ...] = tuple(
    sorted(
        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
        key=lambda v: -len(v),
    )
)
# Regex alternation of all rank tokens, each escaped so it matches literally.
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
# A rank token immediately followed by an 8-digit NRP, with nothing between
# them except optional separators: '/', '.', '-', ',', ':' or whitespace.
# Callers use ``.search``, so the pair may start anywhere in the line, but
# any other text between rank and NRP prevents a match. The rank token is
# word-bounded so "BRIPDA" doesn't match inside e.g. "ABRIPDA-style" text,
# and the trailing ``\b`` rejects digit runs longer than eight.
# Matching is case-insensitive.
_RE_RANK_NRP_LINE = re.compile(
    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
)
# A bare row number marker like "1." or "12)": one to three digits followed
# by '.' or ')', and nothing else on the line. OCR often puts it on its own
# line in tabular layouts.
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
# Lines that should never be interpreted as a personnel name. These are
# section headers, OCR garbage anchors, and column header tokens. We match
# them with a *word-boundary* regex (built from this list) rather than a
# bare ``startswith`` check, because short tokens like ``"NO"`` and
# ``"KET"`` would otherwise reject perfectly valid Indonesian names
# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the
# latter being an extremely common Balinese birth-order name).
_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = (
    "PADA TANGGAL",  # multi-word entries first so they win the alternation
    "SURAT PERINTAH",
    "DASAR",
    "PERIHAL",
    "PERTIMBANGAN",
    "DIPERINTAHKAN",
    "KEPADA",
    "UNTUK",
    "TEMBUSAN",
    "DIKELUARKAN",
    "SELESAI",
    "DAFTAR",
    "LAMPIRAN",
    "NOMOR",
    "TANGGAL",
    "KEPOLISIAN",
    "DAERAH",
    "RESOR",
    "SEKTOR",
    "MABES",
    "NRP",
    "NIP",
    "PANGKAT",
    "JABATAN",
    "NAMA",
    "KETERANGAN",
    "KET",
    "NO",
)
# Matches a line that *starts* with one of the blocklisted tokens as a whole
# word (case-insensitive). The pattern is anchored with ``^``, so callers
# must pass an already-stripped line.
_RE_NAME_BLOCKLIST = re.compile(
    r"^(?:" + "|".join(re.escape(tok) for tok in _NAME_BLOCKLIST_TOKENS) + r")\b",
    re.IGNORECASE,
)
# Minimal "could be a name" test: the line must contain at least one ASCII
# letter. This only rules out empty, pure-numeric and pure-symbol lines; it
# does not check that the line is mostly letters.
_RE_NAME_OK = re.compile(r"[A-Za-z]")


def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.

    After stripping surrounding whitespace, a line qualifies when it

    * contains at least one ASCII letter,
    * does not start with a blocklisted header/column token, and
    * is not itself a rank+NRP line.

    Bare row markers such as ``"1."``, ``"12)"`` or ``"1 ."`` contain no
    letters, so the letter requirement already rejects them. The separate
    row-number checks that used to follow could never trigger and have been
    dropped.

    Args:
        line: One line of OCR text; surrounding whitespace is ignored.

    Returns:
        ``True`` when the line passes every check above, ``False`` otherwise.
    """
    stripped = line.strip()
    # Requiring a letter also rejects empty, pure-numeric and pure-symbol
    # lines (including every row-number marker).
    if not _RE_NAME_OK.search(stripped):
        return False
    # Section headers and column headers are never names.
    if _RE_NAME_BLOCKLIST.match(stripped):
        return False
    # A rank+NRP line is the anchor of a row, not its name.
    return not _RE_RANK_NRP_LINE.search(stripped)


def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Gather the jabatan text that trails the rank+NRP line at ``idx``.

    Only the three lines directly after ``idx`` are inspected. Blank lines
    before any text are skipped; a blank line after some text ends the
    collection. Collection also ends at the next rank+NRP line, a bare
    row-number line, or a line starting with a blocklisted header token.

    Returns the collected lines joined with single spaces (internal
    whitespace collapsed), or ``None`` when nothing was collected.
    """
    # Any of these matching means the line belongs to something else.
    stoppers = (
        _RE_RANK_NRP_LINE.search,
        _RE_ROW_NUMBER.match,
        _RE_NAME_BLOCKLIST.match,
    )
    collected: list[str] = []
    window_end = min(idx + 4, len(lines))
    for pos in range(idx + 1, window_end):
        text = lines[pos].strip()
        if text:
            if any(probe(text) for probe in stoppers):
                break
            collected.append(text)
        elif collected:
            # A gap after real content terminates the jabatan.
            break
    if not collected:
        return None
    return " ".join(" ".join(collected).split()) or None


def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Recover personnel rows from flat OCR text using rank+NRP anchors.

    How it works:

    1. Each line is searched for a known rank directly followed by an
       8-digit NRP; lines without that pair are ignored. Only the leftmost
       match on a line is used.
    2. The rank is normalized and validated; unrecognized ranks are skipped.
    3. The name is the nearest plausible name line among the five lines
       above the anchor that has not already been claimed by an earlier
       row. A name line is claimed at most once.
    4. The jabatan is taken from the lines following the anchor.

    Every emitted ``PersonnelEntry`` carries ``pangkat`` and ``nrp``;
    ``nama`` and ``jabatan_dinas`` may be ``None``. The remaining fields
    are always ``None``.
    """
    lines = raw_text.splitlines()
    used_name_lines: set[int] = set()
    entries: list[PersonnelEntry] = []

    for pos, text in enumerate(lines):
        hit = _RE_RANK_NRP_LINE.search(text.strip())
        if hit is None:
            continue
        pangkat = normalize_pangkat(hit.group("rank"))
        if not (pangkat and is_valid_pangkat(pangkat)):
            continue

        # Nearest unclaimed, plausible name within the five preceding lines.
        name_pos = next(
            (
                j
                for j in range(pos - 1, max(pos - 6, -1), -1)
                if j not in used_name_lines and _is_plausible_name(lines[j].strip())
            ),
            None,
        )
        nama: str | None = None
        if name_pos is not None:
            used_name_lines.add(name_pos)
            nama = lines[name_pos].strip()

        entries.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=hit.group("nrp"),
                nama=nama,
                jabatan_dinas=_following_jabatan(lines, pos),
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return entries


def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Decide whether table-extracted rows are too sparse to be trusted.

    A row counts as informative when its ``pangkat`` or ``nrp`` is truthy.
    An empty result, or one in which fewer than 30% of the rows are
    informative, is reported as low quality so the caller can retry with
    the text-based fallback.
    """
    if not rows:
        return True
    informative = [row for row in rows if row.pangkat or row.nrp]
    # Below a 30% share we assume the column mapper collapsed everything
    # into ``nama`` and a fresh attempt is preferable.
    share = len(informative) / max(1, len(rows))
    return share < 0.3