Fix personnel extraction + header bugs on real Polres Cimahi sprint

This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF and adds 2 related hardening changes (items 5-6): 1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' letterhead line instead of the most-specific issuing unit (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to scan for each level independently and return the deepest available. 2. find_dasar_list dropped numbered items when OCR put the marker on its own line ("1.\n Undang-Undang ..."). Refactored into _collect_numbered_section that buffers a bare-number line and uses the next non-empty line as the body. Also reused for the new find_untuk_list which extracts the previously-empty 'untuk' bullets. 3. find_perihal returned None for documents that use 'Pertimbangan' (very common in Polres-level sprints), forcing the LLM to guess. Added a regex fallback that picks up the first line under a 'Pertimbangan' label so we keep extraction deterministic. 4. Personnel rows were emitted with only nama populated when PP-Structure detected a table but the column mapper degraded. Added a text-based fallback (extract_personnel_from_text) that scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when fewer than 30% of the PP-Structure rows carry a rank or NRP. When the fallback is used, the document is flagged for review by raising the new PERSONNEL_TEXT_FALLBACK flag. 5. Validation now flags rows with neither pangkat nor nrp as INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review even when individual nrp/pangkat checks pass on empty values. 6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans). Tests: 229 (was 203) — 26 new tests covering the regex fixes, text-based personnel extractor, low-quality detector, validator behaviour, and orchestrator wiring of the fallback path. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:35:42 +00:00
parent dce77e80e1
commit 58a2bf2648
11 changed files with 747 additions and 39 deletions
--- a/src/ocr_sprint/data/master_pangkat.py
+++ b/src/ocr_sprint/data/master_pangkat.py
@@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
    # Bintara
    "BRIPDA": ("BRIPDA",),
    "BRIPTU": ("BRIPTU",),
-    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
+    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"),
    "BRIPKA": ("BRIPKA",),
    "AIPDA": ("AIPDA",),
    "AIPTU": ("AIPTU",),
--- a/src/ocr_sprint/pipeline/confidence.py
+++ b/src/ocr_sprint/pipeline/confidence.py
@@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = {
    ReviewFlag.UNKNOWN_PANGKAT: 0.05,
    ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
    ReviewFlag.DATE_PARSE_FAILED: 0.10,
    # Text-based personnel fallback is a recoverable degradation: rank/NRP
    # were extracted via regex from raw OCR rather than from a parsed table
    # grid. Worth flagging for review but not catastrophic.
    ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05,
    # An incomplete personnel row (no pangkat AND no nrp) is a strong
    # signal something went wrong. Penalise heavily so the document
    # routes to needs_review even if the rest of the extraction is fine.
    ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15,
 }
 OCR_WEIGHT = 0.6
--- a/src/ocr_sprint/pipeline/extract/personnel_text.py
+++ b/src/ocr_sprint/pipeline/extract/personnel_text.py
@@ -0,0 +1,203 @@
 """Text-based fallback personnel extractor.
 PP-Structure (Phase 3) is the primary path for personnel rows because it
 preserves the table grid. But PP-Structure can fail in two ways on real
 sprint scans:
 1. The table is not detected at all (low-quality scan, watermark, atypical
   layout) — `extract_personnel` returns an empty list.
 2. The table IS detected but the column mapping is too sparse, so each row
   collapses to a single ``nama`` cell with all other fields ``None``. This
   is what was observed on a real Polres Cimahi sprint where the OCR
   produced 24 rows with only ``nama`` populated.
 This module provides a regex/heuristic fallback that operates directly on
 the flat OCR text. It is deliberately conservative: a row must have BOTH a
 recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
 generate the kind of "name-only" rows that motivated the fallback in the
 first place.
 """
 from __future__ import annotations
 import re
 from ocr_sprint.data.master_pangkat import (
    PANGKAT_VARIANTS,
    is_valid_pangkat,
    normalize_pangkat,
 )
 from ocr_sprint.schemas.personnel import PersonnelEntry
 # Build a single alternation of all known rank tokens (longest first so multi-
 # word ranks like "KOMBES POL" win over the single-word "KOMBES").
 _RANK_TOKENS: tuple[str, ...] = tuple(
    sorted(
        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
        key=lambda v: -len(v),
    )
 )
 _RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
 # A rank token followed by an 8-digit NRP with nothing between them except
 # the common separators '/', '-', '.', ',', ':' or whitespace (so
 # "AIPTU / 75070328" matches but "AIPTU SRI 75070328" does not). The rank
 # token must be word-bounded so "BRIPDA" doesn't match inside e.g.
 # "ABRIPDA-style" text.
 _RE_RANK_NRP_LINE = re.compile(
    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
 )
 # A bare row number marker like "1." or "12)". OCR often puts it on its own
 # line in tabular layouts.
 _RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
 # Lines that should never be interpreted as a personnel name. These are
 # section headers, OCR garbage anchors, and column header tokens.
 _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = (
    "DASAR",
    "PERIHAL",
    "PERTIMBANGAN",
    "DIPERINTAHKAN",
    "KEPADA",
    "UNTUK",
    "TEMBUSAN",
    "DIKELUARKAN",
    "PADA TANGGAL",
    "SELESAI",
    "DAFTAR",
    "LAMPIRAN",
    "NOMOR",
    "TANGGAL",
    "KEPOLISIAN",
    "DAERAH",
    "RESOR",
    "SEKTOR",
    "MABES",
    "SURAT PERINTAH",
    "NRP",
    "NIP",
    "PANGKAT",
    "JABATAN",
    "NAMA",
    "KETERANGAN",
    "KET",
    "NO",
 )
 # A name should look like a name: mostly letters, common punctuation, and
 # at least one alphabetic character. Pure-numeric or pure-symbol lines are
 # rejected.
 _RE_NAME_OK = re.compile(r"[A-Za-z]")
 def _is_plausible_name(line: str) -> bool:
    """Return True iff ``line`` could plausibly be a personnel name.
    A line is rejected when it is empty, contains no ASCII letter, begins
    with a blocklisted header keyword, is a bare row-number marker, or
    itself contains a rank + NRP pair.
    Blocklist keywords are matched as whole leading words: the keyword must
    be followed by end-of-line or by a non-letter. A plain ``startswith``
    test would also reject real names that merely share a prefix with a
    keyword (``"NOVITA SARI"`` vs ``"NO"``, ``"KETUT ARYA"`` vs ``"KET"``).
    """
    stripped = line.strip()
    if not stripped or not _RE_NAME_OK.search(stripped):
        return False
    upper = stripped.upper()
    for prefix in _NAME_BLOCKLIST_PREFIXES:
        if upper.startswith(prefix):
            # Only a keyword that ends on a word boundary marks a header line.
            tail = upper[len(prefix):]
            if not tail or not tail[0].isalpha():
                return False
    if _RE_ROW_NUMBER.match(stripped):
        return False
    if _RE_RANK_NRP_LINE.search(stripped):
        return False
    # Reject lines that are nothing but a row number with extra punctuation
    # ("1 .", "2)") which the bare-number regex above might miss.
    return not re.fullmatch(r"[\s\d.)(\-]+", stripped)
 def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Collect up to three follow-up lines after the rank+NRP line as the jabatan.
    ``lines`` is the full list of OCR lines and ``idx`` the index of the
    rank+NRP line. Leading blank lines are skipped; a blank line after some
    content ends the collection. Collection also stops at the next rank+NRP
    line, the next bare row-number line, or a line that begins with a
    blocklisted keyword (section header / column header).
    Keywords are matched as whole leading words, so a jabatan such as
    ``"KETUA TIM"`` is not cut off by the ``"KET"`` column-header keyword.
    Returns the collected lines joined with single spaces, or ``None`` when
    nothing was collected.
    """
    parts: list[str] = []
    for fwd in range(idx + 1, min(idx + 4, len(lines))):
        candidate = lines[fwd].strip()
        if not candidate:
            if parts:
                break
            continue
        if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
            break
        upper = candidate.upper()
        # The one-character slice is "" at end-of-line, and "".isalpha() is
        # False, so a line that is exactly the keyword is treated as a header.
        if any(
            upper.startswith(p) and not upper[len(p) : len(p) + 1].isalpha()
            for p in _NAME_BLOCKLIST_PREFIXES
        ):
            break
        parts.append(candidate)
    if not parts:
        return None
    return " ".join(" ".join(parts).split()) or None
 def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Best-effort personnel extraction from a flat OCR text stream.
    Strategy:
    1. Iterate every line. Skip lines that don't contain a known rank
       immediately followed by an 8-digit NRP (the only signal we trust).
    2. For each rank+NRP line, look back up to five lines for the nearest
       plausible name line, and forward 1-3 lines for jabatan content.
    3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp;
       ``nama`` and ``jabatan_dinas`` may be ``None``.
    The first matching rank token on a line wins (no multi-match per line)
    and a name line can only be consumed once. The look-back never crosses
    an earlier rank+NRP line or an already-consumed name line: anything
    above those belongs to a previous entry, so a row with a missing name
    gets ``nama=None`` rather than text taken from another entry.
    """
    lines = raw_text.splitlines()
    consumed_names: set[int] = set()
    rows: list[PersonnelEntry] = []
    for idx, raw_line in enumerate(lines):
        match = _RE_RANK_NRP_LINE.search(raw_line.strip())
        if not match:
            continue
        pangkat = normalize_pangkat(match.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue
        nama: str | None = None
        for back in range(idx - 1, max(idx - 6, -1), -1):
            candidate = lines[back].strip()
            # Boundary of the previous entry: stop instead of searching past it.
            if back in consumed_names or _RE_RANK_NRP_LINE.search(candidate):
                break
            if _is_plausible_name(candidate):
                nama = candidate
                consumed_names.add(back)
                break
        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=match.group("nrp"),
                nama=nama,
                jabatan_dinas=_following_jabatan(lines, idx),
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return rows
 def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Report whether the structured personnel rows are too sparse to trust.
    A row counts as identified when it carries a pangkat or an nrp. An empty
    result is always low quality; otherwise the result is low quality when
    fewer than 30% of the rows are identified, which is the signature of a
    column mapper that put everything into ``nama``. Callers are expected
    to retry with the text-based fallback in that case.
    """
    if not rows:
        return True
    identified = [row for row in rows if row.pangkat or row.nrp]
    # ``rows`` is non-empty here, so the division is safe.
    return len(identified) / len(rows) < 0.3
--- a/src/ocr_sprint/pipeline/extract/regex_rules.py
+++ b/src/ocr_sprint/pipeline/extract/regex_rules.py
@@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile(
    re.IGNORECASE,
 )
-# Satuan penerbit usually appears in the document letterhead, prefixed by
+# Polri letterhead pieces. The full letterhead spans multiple lines that are
-# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
+# often broken across separate OCR rows like:
-_RE_SATUAN = re.compile(
+#
-    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
+#     KEPOLISIAN NEGARA REPUBLIK INDONESIA
-    r"[^\n]{0,80}",
+#     DAERAH JAWA BARAT
 #     RESOR CIMAHI
 #
 # We capture each individual level so we can reconstruct the most-specific
 # unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares
 # about *which* unit issued the sprint, not just that some Polri unit did.
 _RE_LEVEL_NEGARA = re.compile(
    r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA",
    re.IGNORECASE,
 )
 _RE_LEVEL_DAERAH = re.compile(
    r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
    re.IGNORECASE | re.MULTILINE,
 )
 _RE_LEVEL_RESOR = re.compile(
    r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
    re.IGNORECASE | re.MULTILINE,
 )
 _RE_LEVEL_SEKTOR = re.compile(
    r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
    re.IGNORECASE | re.MULTILINE,
 )
 _RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE)
 # "Perihal : ...." up to end of line.
 _RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
 # Many sprint docs (especially Polres-level) use 'Pertimbangan' as the
 # single-paragraph rationale block instead of (or alongside) 'Perihal'.
 # When `perihal` is missing we fall back to the first non-empty line under
 # 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream
 # audit trail still has *something* in the perihal slot.
 _RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE)
 # A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
 _RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
 # OCR sometimes splits the number from its content across two lines:
 #     1.
 #      Undang-Undang Nomor 2 Tahun 2002 ...
 # We detect a bare-number line and merge with the next non-empty line.
 _RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$")
 # Generic 'untuk' bullet — same shape as a dasar item.
 # NOTE(review): the pattern is identical to ``_RE_DASAR_ITEM`` and is not
 # referenced by any code visible in this part of the module — confirm it is
 # used elsewhere before relying on it.
 _RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
 # Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
 _RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
@@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None:
        return None
 def _clean_unit_tail(tail: str) -> str:
    """Tidy a captured place name.
    Whitespace runs are collapsed to single spaces, then spaces and
    punctuation noise (``. , ; : ' "``) are trimmed from both ends.
    """
    collapsed = " ".join(tail.split())
    return collapsed.strip(" .,;:'\"")
 def find_satuan(text: str) -> str | None:
    """Return the issuing unit, preferring the most-specific letterhead level.
    Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The
    actual *issuing* unit is the deepest level present in the letterhead, not
    the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. Each
    level is searched independently in the order Sektor, Resor, Daerah,
    'MABES POLRI', Negara, and the first hit wins. Returns ``None`` when no
    level is found.
    Examples
    --------
    >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n"
    ...             "DAERAH JAWA BARAT\\nRESOR CIMAHI")
    'KEPOLISIAN RESOR CIMAHI'
    >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
    'KEPOLISIAN NEGARA REPUBLIK INDONESIA'
    """
    # We only look at the document head — letterheads always sit at the
    # very top, and constraining the search prevents false positives from
    # body text like '... Polres Cimahi ...' deep in a paragraph.
    head = "\n".join(text.splitlines()[:25])
    # Most specific level first; the captured group is the place name.
    for label, pattern in (
        ("SEKTOR", _RE_LEVEL_SEKTOR),
        ("RESOR", _RE_LEVEL_RESOR),
        ("DAERAH", _RE_LEVEL_DAERAH),
    ):
        found = pattern.search(head)
        if found:
            return f"KEPOLISIAN {label} {_clean_unit_tail(found.group(1))}"
    if _RE_LEVEL_MABES.search(head):
        return "MABES POLRI"
    if _RE_LEVEL_NEGARA.search(head):
        return "KEPOLISIAN NEGARA REPUBLIK INDONESIA"
    return None
 def find_perihal(text: str) -> str | None:
    """Return the subject of the sprint, or ``None`` when none is found.
    Lookup order:
    1. The first 'Perihal: ...' line, trimmed to that line only.
    2. Otherwise the text attached to the first 'Pertimbangan' label (a
       common variant in Polres-level surat sprint that has no distinct
       'Perihal' field): the remainder of the label line itself
       ('Pertimbangan : Bahwa ...') when present, else the first non-empty
       line among the four lines that follow the label.
    We deliberately keep this in regex-land rather than deferring to the
    LLM because the LLM tends to hallucinate perihal content from arbitrary
    paragraphs.
    """
    lines = text.splitlines()
    for line in lines:
        m = _RE_PERIHAL.search(line)
        if m:
            return m.group(1).strip()
    for idx, line in enumerate(lines):
        label = _RE_PERTIMBANGAN_LABEL.match(line)
        if not label:
            continue
        # The rationale may share a line with its label; check that remainder
        # first so we do not skip it and return the following line instead.
        for candidate in (line[label.end():], *lines[idx + 1 : idx + 5]):
            stripped = candidate.strip(" :\t")
            if stripped:
                return stripped
        # Only the first 'Pertimbangan' label is considered.
        break
    return None
 def _collect_numbered_section(
    lines: list[str],
    start_idx: int,
    terminators: tuple[str, ...],
 ) -> list[str]:
    """Gather the numbered list items found from ``start_idx`` onwards.
    Handles the OCR artefact where the number marker sits alone on a line:
    a bare marker such as '1.' sets a pending flag and the next non-empty
    line becomes the body of that item. A non-empty line without a leading
    number that follows a started item is appended to that item as a
    continuation; such lines are ignored before the first item.
    Scanning stops at a line whose stripped, upper-cased form starts with
    one of ``terminators``, or at the second consecutive blank line once at
    least one item exists and no marker is pending.
    """
    collected: list[str] = []
    awaiting_body = False
    blanks_seen = 0
    for raw in lines[start_idx:]:
        text = raw.strip()
        if text.upper().startswith(terminators):
            break
        if not text:
            blanks_seen += 1
            # A lone blank line is tolerated because OCR sprinkles them; two
            # in a row end the section.
            if blanks_seen >= 2 and collected and not awaiting_body:
                break
            continue
        blanks_seen = 0
        if _RE_DASAR_BARE_NUMBER.match(text):
            awaiting_body = True
            continue
        inline = _RE_DASAR_ITEM.match(text)
        if inline or awaiting_body:
            collected.append(inline.group(2).strip() if inline else text)
            awaiting_body = False
        elif collected:
            collected[-1] = f"{collected[-1]} {text}".strip()
    return collected
 def find_dasar_list(text: str) -> list[str]:
    """Extract numbered 'Dasar' items from the text.
    Heuristic: locate the first line that starts with 'DASAR' (Indonesian:
    "DASAR :", case-insensitive) and delegate the lines after it to
    ``_collect_numbered_section``, which handles three OCR artefacts:
    1. Inline numbered items: ``"1. Undang-Undang ..."``.
    2. Bare-number lines (the OCR engine puts the number alone on a line):
       ``"1.\\n Undang-Undang ..."``.
    3. Continuation lines (a line that is the wrapped tail of the previous
       item gets appended back onto it).
    Collection stops at the next section keyword. Returns an empty list
    when the text has no 'DASAR' line.
    """
    lines = text.splitlines()
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    for idx, raw_line in enumerate(lines):
        if re.match(r"^\s*DASAR\b", raw_line.strip(), re.IGNORECASE):
            return _collect_numbered_section(lines, idx + 1, section_terminators)
    return []
+def find_untuk_list(text: str) -> list[str]:
-            if items:
+    """Extract numbered 'Untuk' / 'DIPERINTAHKAN' bullets from the text.
-                break
+
-            continue
+    The 'Untuk' section follows 'DIPERINTAHKAN' / 'Kepada' and lists the
-        upper = line.upper()
+    tasks assigned to the personnel. Same OCR shape as Dasar, so we reuse
-        if any(upper.startswith(term) for term in section_terminators):
+    the collector but with different terminators.
-            break
+    """
-        m = _RE_DASAR_ITEM.match(line)
+    lines = text.splitlines()
-        if m:
+    # Stop conditions: 'Selesai' (boilerplate), 'Dikeluarkan di' (signature
-            items.append(m.group(2).strip())
+    # block), 'Tembusan' (carbon-copy section).
-        elif items:
+    terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL")
-            # continuation of the previous dasar item
+    for idx, raw_line in enumerate(lines):
-            items[-1] = (items[-1] + " " + line).strip()
+        if re.match(r"^\s*UNTUK\b", raw_line.strip(), re.IGNORECASE):
-    return items
+            return _collect_numbered_section(lines, idx + 1, terminators)
    return []
 def find_signatory(text: str) -> Signatory:
--- a/src/ocr_sprint/pipeline/extract/validators.py
+++ b/src/ocr_sprint/pipeline/extract/validators.py
@@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
        flags.append(ReviewFlag.INVALID_NRP)
    if entry.pangkat and not is_valid_pangkat(entry.pangkat):
        flags.append(ReviewFlag.UNKNOWN_PANGKAT)
    # Identification of a personnel row requires at least pangkat OR nrp.
    # A row carrying only a name is structurally incomplete - likely a
    # mis-aligned table cell or a leaked tembusan/dasar fragment - and must
    # be flagged for human review even though pangkat/nrp validation
    # individually pass (because they're empty).
    if not entry.pangkat and not entry.nrp:
        flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW)
    return flags
--- a/src/ocr_sprint/pipeline/orchestrator.py
+++ b/src/ocr_sprint/pipeline/orchestrator.py
@@ -19,7 +19,15 @@ from ocr_sprint.llm.extractor import llm_fill_header
 from ocr_sprint.pipeline.confidence import compute_confidence, route
 from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
 from ocr_sprint.pipeline.extract.personnel import extract_personnel
-from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
+from ocr_sprint.pipeline.extract.personnel_text import (
    extract_personnel_from_text,
    is_low_quality,
 )
 from ocr_sprint.pipeline.extract.regex_rules import (
    extract_header,
    find_signatory,
    find_untuk_list,
 )
 from ocr_sprint.pipeline.extract.validators import validate_extraction
 from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
 from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
@@ -112,6 +120,7 @@ def run_pipeline(content: bytes) -> PipelineOutput:
            header = merged
    personel: list[PersonnelEntry] = []
    table_flags: list[ReviewFlag] = []
    if s.tables_enabled and cleaned_pages:
        all_tables: list[DetectedTable] = []
        for img in cleaned_pages:
@@ -126,14 +135,33 @@ def run_pipeline(content: bytes) -> PipelineOutput:
            personel_rows=len(personel),
        )
-    initial_flags: list[ReviewFlag] = list(llm_flags)
+    # Text-based fallback: PP-Structure can succeed structurally but emit
    # rows with only ``nama`` populated (column mapper degraded), or fail to
    # detect the table at all. In both cases the regex fallback that scans
    # raw OCR for rank+NRP pairs produces a much more useful result. We
    # always run it when the structured path is empty or low-quality, and
    # raise a review flag so the operator knows the document didn't go
    # through the preferred path.
    if is_low_quality(personel):
        fallback_rows = extract_personnel_from_text(full_text)
        if fallback_rows:
            personel = fallback_rows
            table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
            _logger.info(
                "pipeline.personnel_text_fallback",
                fallback_rows=len(fallback_rows),
            )
    untuk_items = find_untuk_list(full_text)
    initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags)
    if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
        initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
    result = ExtractionResult(
        header=header,
        personel=personel,
-        untuk=[],
+        untuk=untuk_items,
        ttd=ttd,
        raw_text=full_text,
        confidence=mean_ocr_conf,
--- a/src/ocr_sprint/schemas/extraction.py
+++ b/src/ocr_sprint/schemas/extraction.py
@@ -21,6 +21,8 @@ class ReviewFlag(str, Enum):
    DATE_PARSE_FAILED = "date_parse_failed"
    LLM_FALLBACK = "llm_fallback"
    LLM_UNAVAILABLE = "llm_unavailable"
    PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
    INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"
 class Signatory(BaseModel):
--- a/tests/unit/test_orchestrator_llm.py
+++ b/tests/unit/test_orchestrator_llm.py
@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
    out = run_pipeline(b"%PDF-1.4\n%fake")
    assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
    assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
 def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """When PP-Structure produces low-quality rows (e.g. only ``nama`` filled),
    the orchestrator must run the text fallback against the raw OCR text and
    raise the ``personnel_text_fallback`` flag.
    """
    # Set the LLM_ENABLED override, then clear the settings cache —
    # presumably so the pipeline re-reads settings with the override applied.
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings
    get_settings.cache_clear()
    # Two entries in the "row number / name / rank + NRP / jabatan" line shape
    # that the text fallback parses.
    raw_text = (
        "DAFTAR PERSONIL\n"
        "1.\n"
        "SRI WAHYUNI\n"
        "AIPTU / 75070328\n"
        "INTELKAM POLRES CIMAHI\n"
        "2.\n"
        "AGUNG LUKMAN\n"
        "BRIPTU / 99030245\n"
        "SAT INTELKAM\n"
    )
    # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on
    # the real Polres Cimahi document).
    from ocr_sprint.schemas.personnel import PersonnelEntry
    pp_structure_low_quality = [
        PersonnelEntry(nama="SRI WAHYUNI"),
        PersonnelEntry(nama="AGUNG LUKMAN"),
    ]
    _stub_pipeline_stages(
        monkeypatch,
        raw_text=raw_text,
        regex_header=HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="Polres Cimahi",
            perihal="ok",
            dasar=["UU 2/2002"],
        ),
    )
    # Override extract_personnel to return the broken PP-Structure rows.
    # This runs after _stub_pipeline_stages — presumably so the override is
    # not replaced by whatever the helper installs (TODO confirm).
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality)
    out = run_pipeline(b"%PDF-1.4\n%fake")
    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags
    # Fallback rows must carry pangkat + nrp (the whole point of the path).
    assert all(r.pangkat and r.nrp for r in out.result.personel)
    assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"}
 def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """Healthy PP-Structure output (rank+nrp present on most rows) must NOT
    be replaced by the text fallback.
    """
    # Set the LLM_ENABLED override, then clear the settings cache —
    # presumably so the pipeline re-reads settings with the override applied.
    monkeypatch.setenv("LLM_ENABLED", "false")
    from ocr_sprint.config import get_settings
    get_settings.cache_clear()
    from ocr_sprint.schemas.personnel import PersonnelEntry
    # Every row carries both pangkat and nrp.
    healthy = [
        PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
        PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
        PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
    ]
    _stub_pipeline_stages(
        monkeypatch,
        raw_text="ignored — should not be parsed",
        regex_header=HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="Polres X",
            perihal="ok",
            dasar=["UU 2/2002"],
        ),
    )
    # Make the structured extractor return the healthy rows.
    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy)
    out = run_pipeline(b"%PDF-1.4\n%fake")
    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags
    # The structured rows must come through unchanged and in order.
    assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]
--- a/tests/unit/test_personnel_text_fallback.py
+++ b/tests/unit/test_personnel_text_fallback.py
@@ -0,0 +1,118 @@
 """Tests for the text-based personnel fallback extractor.
 Driven by the real Polres Cimahi sprint document where PP-Structure
 produced 24 rows with only ``nama`` populated. The fallback should
 recover at least the rank + NRP for every row.
 """
 from __future__ import annotations
 from ocr_sprint.pipeline.extract.personnel_text import (
    extract_personnel_from_text,
    is_low_quality,
 )
 from ocr_sprint.schemas.personnel import PersonnelEntry
 _CIMAHI_FIXTURE = """\
 DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
 NO
 NAMA
 PANGKAT / NRP
 JABATAN
 KET
 BAUR SKCK SAT
 1.
 SRI WAHYUNI
 AIPTU / 75070328
 INTELKAM POLRES
 CIMAHI
 BA PELAKSANA SKCK
 2.
 CITRA DWI PUTRI R
 BRIPTU / 95070659
 SAT INTELKAM
 POLRES CIMAHI
 BA PELAKSANA SKCK
 3.
 AGUNG LUKMAN AL
 BRIPTU / 99030245
 SAT INTELKAM
 POLRES CIMAHI
 BA POLSEK
 8.
 ARIEF SYAHRUL ZAMAN
 BRIGPOL /96030446
 MARGAASIH
 """
 class TestExtractPersonnelFromText:
    """Behaviour of ``extract_personnel_from_text`` on the Cimahi fixture and on edge inputs."""
    def test_extracts_rank_nrp_and_name(self) -> None:
        extracted = extract_personnel_from_text(_CIMAHI_FIXTURE)
        assert len(extracted) == 4
        head = extracted[0]
        assert (head.pangkat, head.nrp, head.nama) == ("AIPTU", "75070328", "SRI WAHYUNI")
    def test_normalizes_brigpol_to_brigadir(self) -> None:
        *_, tail = extract_personnel_from_text(_CIMAHI_FIXTURE)
        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
        assert tail.pangkat == "BRIGADIR"
        assert tail.nrp == "96030446"
        assert tail.nama == "ARIEF SYAHRUL ZAMAN"
    def test_skips_header_lines_as_names(self) -> None:
        # No row should ever have a column-header word as nama.
        seen_names = {entry.nama for entry in extract_personnel_from_text(_CIMAHI_FIXTURE)}
        assert not ({"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"} & seen_names)
    def test_jabatan_collected_from_following_lines(self) -> None:
        jabatan = extract_personnel_from_text(_CIMAHI_FIXTURE)[0].jabatan_dinas
        assert jabatan is not None
        assert "INTELKAM" in jabatan
    def test_empty_text_returns_empty(self) -> None:
        assert extract_personnel_from_text("") == []
    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
        prose = "Just a paragraph with no rank or NRP at all.\nAnother line."
        assert extract_personnel_from_text(prose) == []
    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
        # NRP without a recognised rank token must not produce a row.
        assert extract_personnel_from_text("Some line\n12345678\nanother line") == []
    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
        # A "rank-shaped" word that isn't in the master list must not yield a row.
        assert extract_personnel_from_text("Some line\nFAKERANK / 12345678\nanother line") == []
 class TestIsLowQuality:
    """Threshold behaviour of the ``is_low_quality`` detector."""
    def test_empty_list_is_low_quality(self) -> None:
        nothing: list[PersonnelEntry] = []
        assert is_low_quality(nothing) is True
    def test_all_rows_with_only_name_is_low_quality(self) -> None:
        name_only = []
        for i in range(10):
            name_only.append(PersonnelEntry(nama=f"NAMA {i}"))
        assert is_low_quality(name_only) is True
    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
        identified = []
        for i in range(10):
            identified.append(
                PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
            )
        assert is_low_quality(identified) is False
    def test_borderline_30_percent_threshold(self) -> None:
        # 3 identified rows out of 10 give a ratio of exactly 0.3, which is
        # not below the cutoff, so the result is treated as not-low-quality.
        mixed = [
            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
            for i in range(3)
        ]
        mixed += [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
        assert is_low_quality(mixed) is False
--- a/tests/unit/test_regex_rules.py
+++ b/tests/unit/test_regex_rules.py
@@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import (
    find_satuan,
    find_signatory,
    find_tanggal,
    find_untuk_list,
 )
@@ -60,6 +61,36 @@ class TestSatuan:
        result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert result is not None
    def test_prefers_resor_over_negara_when_both_present(self) -> None:
        """The RESOR line wins over the generic NEGARA letterhead line."""
        # A Polri letterhead stacks units from most generic to most specific;
        # the issuing unit is the deepest level present, not the top line.
        letterhead = (
            "KEPOLISIAN NEGARA REPUBLIK INDONESIA",
            "DAERAH JAWA BARAT",
            "RESOR CIMAHI",
            "SURAT PERINTAH",
        )
        document = "".join(f"{line}\n" for line in letterhead)
        satuan = find_satuan(document)
        assert satuan == "KEPOLISIAN RESOR CIMAHI"
    def test_prefers_sektor_over_resor(self) -> None:
        """When a SEKTOR line follows RESOR, the SEKTOR unit is returned."""
        letterhead = (
            "KEPOLISIAN NEGARA REPUBLIK INDONESIA",
            "DAERAH JAWA BARAT",
            "RESOR CIMAHI",
            "SEKTOR PADALARANG",
        )
        document = "".join(f"{line}\n" for line in letterhead)
        satuan = find_satuan(document)
        assert satuan == "KEPOLISIAN SEKTOR PADALARANG"
    def test_handles_daerah_only(self) -> None:
        """With only NEGARA and DAERAH lines, the DAERAH unit is returned."""
        letterhead = ("KEPOLISIAN NEGARA REPUBLIK INDONESIA", "DAERAH JAWA BARAT")
        document = "".join(f"{line}\n" for line in letterhead)
        satuan = find_satuan(document)
        assert satuan == "KEPOLISIAN DAERAH JAWA BARAT"
    def test_returns_none_when_no_letterhead(self) -> None:
        """Text without any police letterhead line yields ``None``."""
        satuan = find_satuan("no police letterhead here")
        assert satuan is None
 class TestPerihal:
    def test_extracts_perihal_line(self) -> None:
@@ -69,6 +100,25 @@ class TestPerihal:
    def test_returns_none_when_absent(self) -> None:
        """Text with no perihal label yields ``None``."""
        perihal = find_perihal("no perihal field")
        assert perihal is None
    def test_falls_back_to_pertimbangan_block(self) -> None:
        """Without a Perihal label, the line under 'Pertimbangan' is used."""
        # Polres-level sprints frequently carry a "Pertimbangan" heading in
        # place of "Perihal"; the first non-empty line beneath it should be
        # picked up as the fallback.
        document_lines = (
            "Pertimbangan",
            "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.",
            "DASAR :",
            "1. ...",
        )
        document = "".join(f"{line}\n" for line in document_lines)
        perihal = find_perihal(document)
        assert perihal is not None
        assert perihal.startswith("Bahwa dalam rangka mendukung")
    def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
        """An explicit Perihal label takes precedence over Pertimbangan."""
        # Both a Pertimbangan paragraph and a PERIHAL line are present; the
        # explicitly labelled value must be the one returned.
        document_lines = (
            "Pertimbangan",
            "Some pertimbangan content.",
            "PERIHAL : The actual perihal.",
        )
        document = "".join(f"{line}\n" for line in document_lines)
        perihal = find_perihal(document)
        assert perihal == "The actual perihal."
 class TestDasar:
    def test_numbered_list(self) -> None:
@@ -88,6 +138,57 @@ class TestDasar:
    def test_empty_when_section_missing(self) -> None:
        """Text without a Dasar section yields an empty list."""
        dasar_items = find_dasar_list("no dasar section")
        assert dasar_items == []
    def test_handles_bare_number_lines_split_by_ocr(self) -> None:
        """Bare-number marker lines are merged with the body on the next line.

        OCR sometimes places the number marker on its own line and the body
        on the next non-empty line. The collector must merge them rather
        than dropping the body or appending it to the previous item (which
        the old implementation did).

        Every item is checked, including the inline-numbered ones that sit
        between bare-number items, so a regression that garbles an inline
        item adjacent to a split one cannot slip through.
        """
        text = (
            "Dasar\n"
            ":\n"
            "1.\n"
            " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n"
            "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n"
            "3.\n"
            "Keterangan Catatan Kepolisian (SKCK);\n"
            "4.\n"
            "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n"
            "5. DIPA Petikan Satker Polres Cimahi.\n"
            "DIPERINTAHKAN\n"
        )
        # One expected prefix per numbered item, in document order.
        expected_prefixes = [
            "Undang - Undang",
            "Peraturan Pemerintah",
            "Keterangan Catatan",
            "Pelayanan dilingkungan",
            "DIPA",
        ]
        items = find_dasar_list(text)
        # Check the count first so the pairwise loop below cannot silently
        # skip trailing items via zip truncation.
        assert len(items) == len(expected_prefixes)
        for position, (item, prefix) in enumerate(zip(items, expected_prefixes), start=1):
            assert item.startswith(prefix), f"item {position}: {item!r}"
 class TestUntuk:
    """Exercises ``find_untuk_list`` on 'Untuk' sections of a sprint."""

    def test_extracts_numbered_untuk_bullets(self) -> None:
        """Bare-number markers followed by body lines give one item each."""
        document_lines = (
            "DIPERINTAHKAN",
            "Kepada",
            "Untuk",
            "1.",
            "melaksanakan tugas A;",
            "2.",
            "melaksanakan tugas B;",
            "Selesai.",
        )
        document = "".join(f"{line}\n" for line in document_lines)
        bullets = find_untuk_list(document)
        assert len(bullets) == 2
        assert (bullets[0], bullets[1]) == (
            "melaksanakan tugas A;",
            "melaksanakan tugas B;",
        )

    def test_returns_empty_when_section_missing(self) -> None:
        """Text without an Untuk section yields an empty list."""
        bullets = find_untuk_list("no untuk section")
        assert bullets == []

    def test_stops_at_dikeluarkan(self) -> None:
        """Numbered lines after 'Dikeluarkan di ...' are not collected."""
        document_lines = (
            "Untuk",
            "1. tugas A;",
            "Dikeluarkan di Cimahi",
            "2. should not be captured",
        )
        document = "".join(f"{line}\n" for line in document_lines)
        bullets = find_untuk_list(document)
        assert bullets == ["tugas A;"]
 class TestSignatory:
    def test_extracts_last_nrp(self) -> None:
--- a/tests/unit/test_validators.py
+++ b/tests/unit/test_validators.py
@@ -62,6 +62,20 @@ class TestPersonnelValidator:
        entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
    def test_row_with_only_name_is_flagged_incomplete(self) -> None:
        """A row with ``nama`` but neither pangkat nor nrp is flagged."""
        # Capturing only a name (no pangkat AND no nrp) is the signature of
        # mis-aligned table extraction, so the row must raise the flag that
        # routes the document to needs_review.
        name_only_row = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
        raised = validate_personnel_entry(name_only_row)
        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in raised
    def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
        """A row with pangkat but no nrp does not raise the incomplete flag."""
        # Missing the NRP is suboptimal, but the rank still identifies the
        # row, so the structural-incompleteness flag must stay off.
        rank_only_row = PersonnelEntry(pangkat="AKP", nama="Test")
        raised = validate_personnel_entry(rank_only_row)
        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in raised
 class TestHeaderValidator:
    def test_complete_header_no_flags(self) -> None: