Fix personnel extraction + header bugs on real Polres Cimahi sprint

This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related improvements (items 5-6): 1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' letterhead line instead of the most-specific issuing unit (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to scan for each level independently and return the deepest available. 2. find_dasar_list dropped numbered items when OCR put the marker on its own line ("1.\n Undang-Undang ..."). Refactored into _collect_numbered_section that buffers a bare-number line and uses the next non-empty line as the body. Also reused for the new find_untuk_list which extracts the previously-empty 'untuk' bullets. 3. find_perihal returned None for documents that use 'Pertimbangan' (very common in Polres-level sprints), forcing the LLM to guess. Added a regex fallback that picks up the first line under a 'Pertimbangan' label so we keep extraction deterministic. 4. Personnel rows were emitted with only nama populated when PP-Structure detected a table but the column mapper degraded. Added a text-based fallback (extract_personnel_from_text) that scans raw OCR for <rank> + <8-digit NRP> patterns. It is triggered when the PP-Structure result has fewer than 30% rank/NRP-bearing rows. Use of the fallback is surfaced for review by raising the new PERSONNEL_TEXT_FALLBACK flag. 5. Validation now flags rows with neither pangkat nor nrp as INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review even when individual nrp/pangkat checks pass on empty values. 6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans). Tests: 229 (was 203) — 26 new tests covering the regex fixes, text-based personnel extractor, low-quality detector, validator behaviour, and orchestrator wiring of the fallback path. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:35:42 +00:00
parent dce77e80e1
commit 58a2bf2648
11 changed files with 747 additions and 39 deletions
--- a/src/ocr_sprint/pipeline/extract/personnel_text.py
+++ b/src/ocr_sprint/pipeline/extract/personnel_text.py
@@ -0,0 +1,203 @@
+"""Text-based fallback personnel extractor.
+
+PP-Structure (Phase 3) is the primary path for personnel rows because it
+preserves the table grid. But PP-Structure can fail in two ways on real
+sprint scans:
+
+1. The table is not detected at all (low-quality scan, watermark, atypical
+   layout) — `extract_personnel` returns an empty list.
+2. The table IS detected but the column mapping is too sparse, so each row
+   collapses to a single ``nama`` cell with all other fields ``None``. This
+   is what was observed on a real Polres Cimahi sprint where the OCR
+   produced 24 rows with only ``nama`` populated.
+
+This module provides a regex/heuristic fallback that operates directly on
+the flat OCR text. It is deliberately conservative: a row must have BOTH a
+recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
+generate the kind of "name-only" rows that motivated the fallback in the
+first place.
+"""
+
+from __future__ import annotations
+
+import re
+
+from ocr_sprint.data.master_pangkat import (
+    PANGKAT_VARIANTS,
+    is_valid_pangkat,
+    normalize_pangkat,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+# Build a single alternation of all known rank tokens (longest first so multi-
+# word ranks like "KOMBES POL" win over the single-word "KOMBES").
+# Every variant listed under every key of PANGKAT_VARIANTS is gathered into a
+# set first, so a spelling that appears more than once is emitted only once.
+_RANK_TOKENS: tuple[str, ...] = tuple(
+    sorted(
+        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
+        key=lambda v: -len(v),
+    )
+)
+# Tokens are regex-escaped so punctuation inside a variant matches literally.
+_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
+# A rank token followed by an 8-digit NRP. Only separator characters may sit
+# between the two: '/', '-', '.', ',', ':' or whitespace (zero or more). Any
+# other text between the rank and the digits prevents a match, so the NRP has
+# to directly follow the rank rather than appear anywhere later on the line.
+# The trailing \b rejects digit runs longer than eight. Rank token must be
+# word-bounded so "BRIPDA" doesn't match inside e.g. "ABRIPDA-style" text.
+# Matching is case-insensitive.
+_RE_RANK_NRP_LINE = re.compile(
+    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
+    re.IGNORECASE,
+)
+# A bare row number marker like "1." or "12)". OCR often puts it on its own
+# line in tabular layouts. The pattern is anchored at both ends, so the line
+# must hold nothing but 1-3 digits, the '.' or ')' marker, and whitespace.
+_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
+# Lines that should never be interpreted as a personnel name. These are
+# section headers, OCR garbage anchors, and column header tokens. Entries are
+# written in upper case and are compared against the start of the upper-cased
+# line by the helpers below.
+_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = (
+    "DASAR",
+    "PERIHAL",
+    "PERTIMBANGAN",
+    "DIPERINTAHKAN",
+    "KEPADA",
+    "UNTUK",
+    "TEMBUSAN",
+    "DIKELUARKAN",
+    "PADA TANGGAL",
+    "SELESAI",
+    "DAFTAR",
+    "LAMPIRAN",
+    "NOMOR",
+    "TANGGAL",
+    "KEPOLISIAN",
+    "DAERAH",
+    "RESOR",
+    "SEKTOR",
+    "MABES",
+    "SURAT PERINTAH",
+    "NRP",
+    "NIP",
+    "PANGKAT",
+    "JABATAN",
+    "NAMA",
+    "KETERANGAN",
+    "KET",
+    "NO",
+)
+# Minimal sanity check for a name line: it must contain at least one ASCII
+# letter, so pure-numeric or pure-symbol lines are rejected. Nothing else
+# about the line's shape is verified by this pattern.
+_RE_NAME_OK = re.compile(r"[A-Za-z]")
+
+
+def _is_plausible_name(line: str) -> bool:
+    """Return True iff ``line`` could plausibly be a personnel name.
+
+    A line is rejected when, after stripping surrounding whitespace, it
+
+    * is empty or contains no ASCII letter,
+    * opens with a blocklisted header token (``_NAME_BLOCKLIST_PREFIXES``),
+    * is a bare row-number marker,
+    * itself carries a rank + NRP pair, or
+    * consists solely of digits, whitespace and ``. ) ( -``.
+
+    Blocklist entries are compared case-insensitively and only as whole
+    leading tokens: the entry must be followed by end-of-line or by a
+    non-letter. A plain prefix test would discard real names that merely
+    begin with a short entry such as ``NO`` or ``KET`` (for example "NOVAL"
+    or "KETUT"), leaving the caller to pick up an unrelated earlier line.
+    """
+    stripped = line.strip()
+    if not stripped or not _RE_NAME_OK.search(stripped):
+        return False
+    upper = stripped.upper()
+    for prefix in _NAME_BLOCKLIST_PREFIXES:
+        if not upper.startswith(prefix):
+            continue
+        # The entry only counts as a header token when it stops at a token
+        # boundary ("NO", "NO.", "NO 1", "NAMA/PANGKAT"), not when it runs
+        # straight into more letters ("NOVAL", "KETUT").
+        tail = upper[len(prefix):]
+        if not tail or not tail[0].isalpha():
+            return False
+    # The two checks below are defensive: while _RE_NAME_OK demands an ASCII
+    # letter they cannot fire, but they keep the function safe if that
+    # pre-filter is ever relaxed.
+    if _RE_ROW_NUMBER.match(stripped):
+        return False
+    if _RE_RANK_NRP_LINE.search(stripped):
+        return False
+    # Reject lines that are nothing but a row number with extra punctuation
+    # ("1 .", "2)") which the bare-number regex above might miss.
+    return not re.fullmatch(r"[\s\d.)(\-]+", stripped)
+
+
+def _following_jabatan(lines: list[str], idx: int) -> str | None:
+    """Collect up to three follow-up lines after the rank+NRP line as the jabatan.
+
+    ``lines`` is the full list of OCR text lines and ``idx`` is the index of
+    the rank+NRP line. Only the three lines directly after ``idx`` are
+    inspected. Blank lines before any content are skipped (they still count
+    towards the three-line window); a blank line after content ends the
+    collection.
+
+    Collection also stops at the next rank+NRP line, the next bare row-number
+    line, or a line that opens with a blocklisted header token. Blocklist
+    entries are matched case-insensitively as whole leading tokens (entry
+    followed by end-of-line or a non-letter), so a role such as "KETUA TIM"
+    is not mistaken for the "KET" column header.
+
+    Returns the collected lines joined with single spaces and with internal
+    whitespace runs collapsed, or ``None`` when nothing was collected.
+    """
+    parts: list[str] = []
+    for raw in lines[idx + 1 : idx + 4]:
+        candidate = raw.strip()
+        if not candidate:
+            if parts:
+                break
+            continue
+        if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
+            break
+        upper = candidate.upper()
+        # The one-character slice is "" at end-of-line, and "".isalpha() is
+        # False, so an entry that spans the whole line is still blocked.
+        if any(
+            upper.startswith(prefix)
+            and not upper[len(prefix) : len(prefix) + 1].isalpha()
+            for prefix in _NAME_BLOCKLIST_PREFIXES
+        ):
+            break
+        parts.append(candidate)
+    if not parts:
+        return None
+    return " ".join(" ".join(parts).split()) or None
+
+
+def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
+    """Recover personnel rows from flat OCR text using rank + NRP anchors.
+
+    The text is split into lines and each line is searched for a known rank
+    token directly followed by an 8-digit NRP. Lines without such a pair are
+    ignored, as are matches whose rank does not normalise to a valid pangkat.
+
+    For every accepted anchor line:
+
+    * the name is the nearest plausible name line among the five lines
+      above it that has not already been assigned to an earlier row;
+    * the jabatan is gathered from the lines that follow it.
+
+    Only the first rank+NRP match on a line is used, and each name line is
+    handed out at most once, so stray ranked text inside a paragraph cannot
+    fan out into several entries. Rows are emitted even when no name or
+    jabatan is found; ``no``, ``jabatan_sprint`` and ``keterangan`` are
+    always left as ``None``.
+    """
+    text_lines = raw_text.splitlines()
+    claimed_name_lines: set[int] = set()
+    entries: list[PersonnelEntry] = []
+
+    for position, source_line in enumerate(text_lines):
+        hit = _RE_RANK_NRP_LINE.search(source_line.strip())
+        if hit is None:
+            continue
+        rank = normalize_pangkat(hit.group("rank"))
+        if not (rank and is_valid_pangkat(rank)):
+            continue
+        nrp_digits = hit.group("nrp")
+
+        # Walk upwards from the line just above the anchor, at most five
+        # lines, never past the start of the text.
+        name: str | None = None
+        for prior in reversed(range(max(position - 5, 0), position)):
+            if prior in claimed_name_lines:
+                continue
+            text = text_lines[prior].strip()
+            if not _is_plausible_name(text):
+                continue
+            name = text
+            claimed_name_lines.add(prior)
+            break
+
+        entry = PersonnelEntry(
+            no=None,
+            pangkat=rank,
+            nrp=nrp_digits,
+            nama=name,
+            jabatan_dinas=_following_jabatan(text_lines, position),
+            jabatan_sprint=None,
+            keterangan=None,
+        )
+        entries.append(entry)
+    return entries
+
+
+def is_low_quality(rows: list[PersonnelEntry]) -> bool:
+    """Tell whether a PP-Structure personnel result is too sparse to trust.
+
+    A row counts as useful when it carries a pangkat or an nrp (either one
+    is enough). An empty result is always low quality. Otherwise the result
+    is low quality when strictly fewer than 30% of its rows are useful,
+    which is taken as a sign that the column mapper collapsed everything
+    into ``nama`` and the text-based fallback should be tried instead.
+    """
+    if not rows:
+        return True
+    total = len(rows)
+    with_signal = sum(bool(row.pangkat or row.nrp) for row in rows)
+    useful_share = with_signal / total
+    # Exactly 30% useful rows is still considered acceptable.
+    return useful_share < 0.3