Fix personnel extraction + header bugs on real Polres Cimahi sprint

This fixes four bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6): 1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' letterhead line instead of the most-specific issuing unit (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to scan for each level independently and return the deepest available. 2. find_dasar_list dropped numbered items when OCR put the marker on its own line ("1.\n Undang-Undang ..."). Refactored into _collect_numbered_section that buffers a bare-number line and uses the next non-empty line as the body. Also reused for the new find_untuk_list which extracts the previously-empty 'untuk' bullets. 3. find_perihal returned None for documents that use 'Pertimbangan' (very common in Polres-level sprints), forcing the LLM to guess. Added a regex fallback that picks up the first line under a 'Pertimbangan' label so we keep extraction deterministic. 4. Personnel rows were emitted with only nama populated when PP-Structure detected a table but the column mapper degraded. Added a text-based fallback (extract_personnel_from_text) that scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when the PP-Structure result is empty or has fewer than 30% rank/NRP-bearing rows. When the fallback rows are used, the new PERSONNEL_TEXT_FALLBACK flag is raised so the document is routed to review. 5. Validation now flags rows with neither pangkat nor nrp as INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review even when individual nrp/pangkat checks pass on empty values. 6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans). Tests: 229 (was 203) — 26 new tests covering the regex fixes, text-based personnel extractor, low-quality detector, validator behaviour, and orchestrator wiring of the fallback path. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-26 05:35:42 +00:00
parent dce77e80e1
commit 58a2bf2648
11 changed files with 747 additions and 39 deletions
--- a/src/ocr_sprint/data/master_pangkat.py
+++ b/src/ocr_sprint/data/master_pangkat.py
@@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
    # Bintara
    "BRIPDA": ("BRIPDA",),
    "BRIPTU": ("BRIPTU",),
-    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
+    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"),
    "BRIPKA": ("BRIPKA",),
    "AIPDA": ("AIPDA",),
    "AIPTU": ("AIPTU",),
--- a/src/ocr_sprint/pipeline/confidence.py
+++ b/src/ocr_sprint/pipeline/confidence.py
@@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = {
    ReviewFlag.UNKNOWN_PANGKAT: 0.05,
    ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
    ReviewFlag.DATE_PARSE_FAILED: 0.10,
+    # Text-based personnel fallback is a recoverable degradation: rank/NRP
+    # were extracted via regex from raw OCR rather than from a parsed table
+    # grid. Worth flagging for review but not catastrophic.
+    ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05,
+    # An incomplete personnel row (no pangkat AND no nrp) is a strong
+    # signal something went wrong. Penalise heavily so the document
+    # routes to needs_review even if the rest of the extraction is fine.
+    ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15,
 }

 OCR_WEIGHT = 0.6
--- a/src/ocr_sprint/pipeline/extract/personnel_text.py
+++ b/src/ocr_sprint/pipeline/extract/personnel_text.py
@@ -0,0 +1,203 @@
+"""Text-based fallback personnel extractor.
+
+PP-Structure (Phase 3) is the primary path for personnel rows because it
+preserves the table grid. But PP-Structure can fail in two ways on real
+sprint scans:
+
+1. The table is not detected at all (low-quality scan, watermark, atypical
+   layout) — `extract_personnel` returns an empty list.
+2. The table IS detected but the column mapping is too sparse, so each row
+   collapses to a single ``nama`` cell with all other fields ``None``. This
+   is what was observed on a real Polres Cimahi sprint where the OCR
+   produced 24 rows with only ``nama`` populated.
+
+This module provides a regex/heuristic fallback that operates directly on
+the flat OCR text. It is deliberately conservative: a row must have BOTH a
+recognizable Polri rank AND an 8-digit NRP to be emitted, so we never
+generate the kind of "name-only" rows that motivated the fallback in the
+first place.
+"""
+
+from __future__ import annotations
+
+import re
+
+from ocr_sprint.data.master_pangkat import (
+    PANGKAT_VARIANTS,
+    is_valid_pangkat,
+    normalize_pangkat,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+# Build a single alternation of all known rank tokens (longest first so multi-
+# word ranks like "KOMBES POL" win over the single-word "KOMBES").
+_RANK_TOKENS: tuple[str, ...] = tuple(
+    sorted(
+        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
+        key=lambda v: -len(v),
+    )
+)
+_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
+# A rank token followed directly by an 8-digit NRP. Only separator characters
+# ('/', '-', '.', ',', ':' or whitespace) may sit between the two; any other
+# text between the rank and the digits prevents a match. The rank token is
+# word-bounded so "BRIPDA" doesn't match inside e.g. "ABRIPDA-style" text,
+# and the trailing \b means a longer digit run (e.g. an 18-digit NIP) is not
+# truncated to its first 8 digits.
+_RE_RANK_NRP_LINE = re.compile(
+    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
+    re.IGNORECASE,
+)
+# A bare row number marker like "1." or "12)". OCR often puts it on its own
+# line in tabular layouts.
+_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
+# Lines that should never be interpreted as a personnel name. These are
+# section headers, OCR garbage anchors, and column header tokens.
+_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = (
+    "DASAR",
+    "PERIHAL",
+    "PERTIMBANGAN",
+    "DIPERINTAHKAN",
+    "KEPADA",
+    "UNTUK",
+    "TEMBUSAN",
+    "DIKELUARKAN",
+    "PADA TANGGAL",
+    "SELESAI",
+    "DAFTAR",
+    "LAMPIRAN",
+    "NOMOR",
+    "TANGGAL",
+    "KEPOLISIAN",
+    "DAERAH",
+    "RESOR",
+    "SEKTOR",
+    "MABES",
+    "SURAT PERINTAH",
+    "NRP",
+    "NIP",
+    "PANGKAT",
+    "JABATAN",
+    "NAMA",
+    "KETERANGAN",
+    "KET",
+    "NO",
+)
+# A name should look like a name: mostly letters, common punctuation, and
+# at least one alphabetic character. Pure-numeric or pure-symbol lines are
+# rejected.
+_RE_NAME_OK = re.compile(r"[A-Za-z]")
+
+
+def _is_plausible_name(line: str) -> bool:
+    """Return True iff ``line`` could plausibly be a personnel name.
+
+    A line is rejected when it:
+
+    * is empty or contains no ASCII letter at all (this also covers bare
+      row numbers such as ``"1."`` and digit/punctuation-only lines);
+    * starts with a blocklisted section/column-header keyword; or
+    * is itself a rank+NRP line.
+
+    Blocklisted keywords are matched on a word boundary: the keyword must be
+    the whole line or be followed by a non-letter. A plain ``startswith``
+    would let the short entries "NO" and "KET" reject real names such as
+    "NOVITA SARI" or "KETUT ARYA".
+
+    Args:
+        line: One raw OCR line (surrounding whitespace is ignored).
+
+    Returns:
+        ``True`` when the line may be used as ``nama``, ``False`` otherwise.
+    """
+    stripped = line.strip()
+    if not stripped or not _RE_NAME_OK.search(stripped):
+        return False
+    upper = stripped.upper()
+    for prefix in _NAME_BLOCKLIST_PREFIXES:
+        if not upper.startswith(prefix):
+            continue
+        # Only treat the keyword as a header when it ends at a word boundary
+        # ("NO", "NO.", "NAMA / PANGKAT"), not when it merely begins a longer
+        # word ("NOVITA", "KETUT").
+        tail = upper[len(prefix) :]
+        if not tail or not tail[0].isalpha():
+            return False
+    # A rank+NRP line is the anchor of a row, never its name.
+    return not _RE_RANK_NRP_LINE.search(stripped)
+
+
+def _following_jabatan(lines: list[str], idx: int) -> str | None:
+    """Collect up to three follow-up lines after the rank+NRP line as the jabatan.
+
+    Scanning stops at the next rank+NRP line, the next bare row-number line,
+    a blocklisted section/column-header keyword, or a blank line that comes
+    after some content was already collected. Blank lines before any content
+    are skipped (they still count towards the three-line window).
+
+    Header keywords are matched on a word boundary so that a jabatan such as
+    "KETUA TIM" is not mistaken for the "KET" column header.
+
+    Args:
+        lines: All OCR lines of the document.
+        idx: Index of the rank+NRP line whose jabatan is wanted.
+
+    Returns:
+        The collected lines joined with single spaces, or ``None`` when no
+        usable line follows.
+    """
+    parts: list[str] = []
+    for raw in lines[idx + 1 : idx + 4]:
+        candidate = raw.strip()
+        if not candidate:
+            # Leading blanks are skipped; a blank after content ends the cell.
+            if parts:
+                break
+            continue
+        # The next row's anchor or row number marks the end of this row.
+        if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
+            break
+        upper = candidate.upper()
+        # Slicing one character (instead of indexing) yields "" at end of
+        # line, and "".isalpha() is False, so an exact keyword also stops.
+        if any(
+            upper.startswith(prefix) and not upper[len(prefix) : len(prefix) + 1].isalpha()
+            for prefix in _NAME_BLOCKLIST_PREFIXES
+        ):
+            break
+        parts.append(candidate)
+    if not parts:
+        return None
+    # Collapse internal runs of whitespace left over from OCR.
+    return " ".join(" ".join(parts).split())
+
+
+def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
+    """Best-effort personnel extraction from a flat OCR text stream.
+
+    Strategy:
+
+    1. Iterate every line. Skip lines that don't contain a known rank
+       directly followed by an 8-digit NRP (the only signal we trust).
+    2. For each rank+NRP line, look back (at most 5 lines) for the nearest
+       plausible name line, and forward up to 3 lines for jabatan content.
+    3. Emit one ``PersonnelEntry`` per rank+NRP line; ``nama`` and
+       ``jabatan_dinas`` stay ``None`` when nothing suitable is found.
+
+    The look-back never crosses a row boundary: it stops at a previous
+    rank+NRP line or a bare row-number line. Without that stop, a row whose
+    name is missing would pick up the previous row's jabatan text as its
+    ``nama``.
+
+    The fallback is intentionally conservative: only the first rank+NRP
+    match on a line is used, and a name line can only be consumed once.
+
+    Args:
+        raw_text: The full OCR text of the document.
+
+    Returns:
+        One entry per recognised rank+NRP line, in document order. Empty
+        when the text contains no such line.
+    """
+    lines = raw_text.splitlines()
+    consumed_names: set[int] = set()
+    rows: list[PersonnelEntry] = []
+
+    for idx, raw_line in enumerate(lines):
+        match = _RE_RANK_NRP_LINE.search(raw_line.strip())
+        if not match:
+            continue
+        pangkat = normalize_pangkat(match.group("rank"))
+        if not pangkat or not is_valid_pangkat(pangkat):
+            continue
+        nrp = match.group("nrp")
+
+        nama: str | None = None
+        for back in range(idx - 1, max(idx - 6, -1), -1):
+            candidate = lines[back].strip()
+            # Anything at or above the previous row's anchor / this row's
+            # number marker belongs to another row - stop searching.
+            if _RE_RANK_NRP_LINE.search(candidate) or _RE_ROW_NUMBER.match(candidate):
+                break
+            if back in consumed_names:
+                continue
+            if _is_plausible_name(candidate):
+                nama = candidate
+                consumed_names.add(back)
+                break
+
+        rows.append(
+            PersonnelEntry(
+                no=None,
+                pangkat=pangkat,
+                nrp=nrp,
+                nama=nama,
+                jabatan_dinas=_following_jabatan(lines, idx),
+                jabatan_sprint=None,
+                keterangan=None,
+            )
+        )
+    return rows
+
+
+def is_low_quality(rows: list[PersonnelEntry]) -> bool:
+    """Decide whether the table-derived personnel rows are too sparse to keep.
+
+    A row counts as informative when it carries a pangkat or an nrp. An
+    empty result is always low quality; otherwise the result is low quality
+    when fewer than 30% of the rows are informative, which is the signature
+    of a column mapper that dumped everything into ``nama``.
+    """
+    if not rows:
+        return True
+    total = len(rows)
+    informative = [entry for entry in rows if entry.pangkat or entry.nrp]
+    # Strict comparison: exactly 30% informative rows is still acceptable.
+    return len(informative) / total < 0.3
--- a/src/ocr_sprint/pipeline/extract/regex_rules.py
+++ b/src/ocr_sprint/pipeline/extract/regex_rules.py
@@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile(
    re.IGNORECASE,
 )

-# Satuan penerbit usually appears in the document letterhead, prefixed by
-# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
-_RE_SATUAN = re.compile(
-    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
-    r"[^\n]{0,80}",
+# Polri letterhead pieces. The full letterhead spans multiple lines that are
+# often broken across separate OCR rows like:
+#
+#     KEPOLISIAN NEGARA REPUBLIK INDONESIA
+#     DAERAH JAWA BARAT
+#     RESOR CIMAHI
+#
+# We capture each individual level so we can reconstruct the most-specific
+# unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares
+# about *which* unit issued the sprint, not just that some Polri unit did.
+_RE_LEVEL_NEGARA = re.compile(
+    r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA",
    re.IGNORECASE,
 )
+_RE_LEVEL_DAERAH = re.compile(
+    r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
+    re.IGNORECASE | re.MULTILINE,
+)
+_RE_LEVEL_RESOR = re.compile(
+    r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
+    re.IGNORECASE | re.MULTILINE,
+)
+_RE_LEVEL_SEKTOR = re.compile(
+    r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
+    re.IGNORECASE | re.MULTILINE,
+)
+_RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE)

 # "Perihal : ...." up to end of line.
 _RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
+# Many sprint docs (especially Polres-level) use 'Pertimbangan' as the
+# single-paragraph rationale block instead of (or alongside) 'Perihal'.
+# When `perihal` is missing we fall back to the first non-empty line under
+# 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream
+# audit trail still has *something* in the perihal slot.
+_RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE)

 # A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
 _RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
+# OCR sometimes splits the number from its content across two lines:
+#     1.
+#      Undang-Undang Nomor 2 Tahun 2002 ...
+# We detect a bare-number line and merge with the next non-empty line.
+_RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$")
+# Generic 'untuk' bullet — same shape as a dasar item.
+_RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")

 # Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
 _RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
@@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None:
        return None


+def _clean_unit_tail(tail: str) -> str:
+    """Normalise a captured place name.
+
+    Collapses every run of whitespace to a single space, then trims spaces
+    and the punctuation characters ``. , ; : ' "`` from both ends.
+    """
+    words = tail.split()
+    collapsed = " ".join(words)
+    return collapsed.strip(" .,;:'\"")
+
+
 def find_satuan(text: str) -> str | None:
-    """Return the first letterhead match (issuing unit), normalized."""
-    match = _RE_SATUAN.search(text)
-    if not match:
-        return None
-    return " ".join(match.group(0).split())
+    """Return the issuing unit, preferring the most-specific letterhead level.
+
+    Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The
+    actual *issuing* unit is the deepest level present in the letterhead, not
+    the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. We scan
+    for each level independently and pick the most specific one available;
+    if only the generic Negara line is present we return that.
+
+    Returns ``None`` when none of the level patterns matches within the
+    first 25 lines of ``text``.
+
+    Examples
+    --------
+    >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n"
+    ...             "DAERAH JAWA BARAT\\nRESOR CIMAHI")
+    'KEPOLISIAN RESOR CIMAHI'
+    >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
+    'KEPOLISIAN NEGARA REPUBLIK INDONESIA'
+    """
+    # We only look at the document head — letterheads always sit at the
+    # very top, and constraining the search prevents false positives from
+    # body text like '... Polres Cimahi ...' deep in a paragraph.
+    head = "\n".join(text.splitlines()[:25])
+
+    # Checked most-specific first: SEKTOR, RESOR, DAERAH, then MABES, and
+    # finally the generic NEGARA line. The first level that matches wins.
+    # NOTE(review): the level patterns are case-insensitive and not anchored
+    # to the start of a line, so a phrase such as 'daerah hukum ...' inside
+    # the first 25 lines would also match — confirm this is acceptable.
+    sektor = _RE_LEVEL_SEKTOR.search(head)
+    if sektor:
+        return f"KEPOLISIAN SEKTOR {_clean_unit_tail(sektor.group(1))}"
+    resor = _RE_LEVEL_RESOR.search(head)
+    if resor:
+        return f"KEPOLISIAN RESOR {_clean_unit_tail(resor.group(1))}"
+    daerah = _RE_LEVEL_DAERAH.search(head)
+    if daerah:
+        return f"KEPOLISIAN DAERAH {_clean_unit_tail(daerah.group(1))}"
+    if _RE_LEVEL_MABES.search(head):
+        return "MABES POLRI"
+    if _RE_LEVEL_NEGARA.search(head):
+        return "KEPOLISIAN NEGARA REPUBLIK INDONESIA"
+    return None


 def find_perihal(text: str) -> str | None:
-    """Return the first 'Perihal: ...' line, trimmed to that line only."""
+    """Return the first 'Perihal: ...' line, trimmed to that line only.
+
+    Falls back to the text of a 'Pertimbangan' block (a common variant in
+    Polres-level surat sprint that doesn't have a distinct 'Perihal' field).
+    The fallback uses, in order:
+
+    1. the text that follows the label on the same line
+       (``"Pertimbangan : Bahwa ..."``), or
+    2. the first non-empty line among the four lines after the label.
+
+    Only the first 'Pertimbangan' label is considered. We deliberately keep
+    this in regex-land rather than deferring to the LLM because the LLM
+    tends to hallucinate perihal content from arbitrary paragraphs.
+
+    Returns ``None`` when neither a 'Perihal' line nor usable
+    'Pertimbangan' content is found.
+    """
     for line in text.splitlines():
         m = _RE_PERIHAL.search(line)
         if m:
             return m.group(1).strip()
+
+    lines = text.splitlines()
+    for idx, line in enumerate(lines):
+        label = _RE_PERTIMBANGAN_LABEL.match(line)
+        if not label:
+            continue
+        # OCR frequently keeps label and content on one line; prefer that
+        # over the following line, which would be the *second* line of the
+        # paragraph in that layout.
+        inline = line[label.end() :].strip(" :\t")
+        if inline:
+            return inline
+        for follow in lines[idx + 1 : idx + 5]:
+            # Stripping ' :\t' also discards a lone ':' separator line.
+            stripped = follow.strip(" :\t")
+            if stripped:
+                return stripped
+        break
     return None


+def _collect_numbered_section(
+    lines: list[str],
+    start_idx: int,
+    terminators: tuple[str, ...],
+) -> list[str]:
+    """Walk forward from ``start_idx`` collecting numbered list items.
+
+    Robust to OCR splitting the number marker onto its own line:
+        '1.'  ->   sets ``pending_marker``
+        the next non-empty line then becomes the item body.
+
+    Continuation lines (non-empty, no leading number, after a started item)
+    are appended to the current item. Non-numbered text that appears before
+    the first item is ignored. Stops at any line whose uppercase form
+    starts with one of ``terminators``, or after two consecutive blank lines
+    once at least one item was collected and no marker is pending.
+
+    Args:
+        lines: All lines of the document.
+        start_idx: Index of the first line to inspect.
+        terminators: Uppercase prefixes that end the section.
+
+    Returns:
+        The item bodies in document order, without their number markers.
+    """
+    items: list[str] = []
+    pending_marker = False
+    blank_run = 0
+    for raw_line in lines[start_idx:]:
+        line = raw_line.strip()
+        upper = line.upper()
+        if any(upper.startswith(term) for term in terminators):
+            break
+        if not line:
+            blank_run += 1
+            # Two consecutive blank lines reliably mark the end of a section.
+            # A single blank line is tolerated because OCR sprinkles them.
+            if blank_run >= 2 and items and not pending_marker:
+                break
+            continue
+        blank_run = 0
+        # A bare "1." / "2)" line: remember it and wait for the body line.
+        bare = _RE_DASAR_BARE_NUMBER.match(line)
+        if bare:
+            pending_marker = True
+            continue
+        # Inline "1. body" item; this also supersedes any pending marker.
+        m = _RE_DASAR_ITEM.match(line)
+        if m:
+            items.append(m.group(2).strip())
+            pending_marker = False
+            continue
+        # First text line after a bare marker starts a new item.
+        if pending_marker:
+            items.append(line)
+            pending_marker = False
+            continue
+        # Otherwise it is the wrapped tail of the current item (if any).
+        if items:
+            items[-1] = (items[-1] + " " + line).strip()
+    return items
+
+
 def find_dasar_list(text: str) -> list[str]:
    """Extract numbered 'Dasar' items from the text.

    Heuristic: locate a line containing 'DASAR' (Indonesian: "DASAR :") and
-    collect subsequent lines that start with a number. Stops at a blank line
-    or a line beginning with another section header keyword.
+    delegate to ``_collect_numbered_section`` which handles three OCR
+    artefacts:
+
+    1. Inline numbered items: ``"1. Undang-Undang ..."``.
+    2. Bare-number lines (the OCR engine puts the number alone on a line):
+       ``"1.\\n Undang-Undang ..."``.
+    3. Continuation lines (a line that is the wrapped tail of the previous
+       item gets appended back onto it).
+
+    Only the first line that starts with 'DASAR' is used as the anchor, and
+    collection begins on the line *after* it: text on the label line itself
+    is not scanned. Returns an empty list when no such line exists.
    """
    lines = text.splitlines()
-    items: list[str] = []
-    in_dasar = False
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
-    for raw_line in lines:
-        line = raw_line.strip()
-        if not in_dasar:
-            if re.match(r"^\s*DASAR\b", line, re.IGNORECASE):
-                in_dasar = True
-            continue
-        if not line:
-            if items:
-                break
-            continue
-        upper = line.upper()
-        if any(upper.startswith(term) for term in section_terminators):
-            break
-        m = _RE_DASAR_ITEM.match(line)
-        if m:
-            items.append(m.group(2).strip())
-        elif items:
-            # continuation of the previous dasar item
-            items[-1] = (items[-1] + " " + line).strip()
-    return items
+    for idx, raw_line in enumerate(lines):
+        if re.match(r"^\s*DASAR\b", raw_line.strip(), re.IGNORECASE):
+            return _collect_numbered_section(lines, idx + 1, section_terminators)
+    return []
+
+
+def find_untuk_list(text: str) -> list[str]:
+    """Extract the numbered 'Untuk' bullets from the text.
+
+    The section is anchored on the first line that starts with 'Untuk'
+    (case-insensitive); collection begins on the line after it. Same OCR
+    shape as Dasar, so we reuse ``_collect_numbered_section`` but with
+    different terminators. Returns an empty list when no such line exists.
+    """
+    lines = text.splitlines()
+    # Stop conditions: 'Selesai' (boilerplate), 'Dikeluarkan di' / 'Pada
+    # tanggal' (signature block), 'Tembusan' (carbon-copy section).
+    terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL")
+    for idx, raw_line in enumerate(lines):
+        if re.match(r"^\s*UNTUK\b", raw_line.strip(), re.IGNORECASE):
+            return _collect_numbered_section(lines, idx + 1, terminators)
+    return []


 def find_signatory(text: str) -> Signatory:
--- a/src/ocr_sprint/pipeline/extract/validators.py
+++ b/src/ocr_sprint/pipeline/extract/validators.py
@@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
        flags.append(ReviewFlag.INVALID_NRP)
    if entry.pangkat and not is_valid_pangkat(entry.pangkat):
        flags.append(ReviewFlag.UNKNOWN_PANGKAT)
+    # Identification of a personnel row requires at least pangkat OR nrp.
+    # A row carrying only a name is structurally incomplete - likely a
+    # mis-aligned table cell or a leaked tembusan/dasar fragment - and must
+    # be flagged for human review even though pangkat/nrp validation
+    # individually pass (because they're empty).
+    if not entry.pangkat and not entry.nrp:
+        flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW)
    return flags


--- a/src/ocr_sprint/pipeline/orchestrator.py
+++ b/src/ocr_sprint/pipeline/orchestrator.py
@@ -19,7 +19,15 @@ from ocr_sprint.llm.extractor import llm_fill_header
 from ocr_sprint.pipeline.confidence import compute_confidence, route
 from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
 from ocr_sprint.pipeline.extract.personnel import extract_personnel
-from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
+from ocr_sprint.pipeline.extract.personnel_text import (
+    extract_personnel_from_text,
+    is_low_quality,
+)
+from ocr_sprint.pipeline.extract.regex_rules import (
+    extract_header,
+    find_signatory,
+    find_untuk_list,
+)
 from ocr_sprint.pipeline.extract.validators import validate_extraction
 from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
 from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
@@ -112,6 +120,7 @@ def run_pipeline(content: bytes) -> PipelineOutput:
            header = merged

    personel: list[PersonnelEntry] = []
+    table_flags: list[ReviewFlag] = []
    if s.tables_enabled and cleaned_pages:
        all_tables: list[DetectedTable] = []
        for img in cleaned_pages:
@@ -126,14 +135,33 @@ def run_pipeline(content: bytes) -> PipelineOutput:
            personel_rows=len(personel),
        )

-    initial_flags: list[ReviewFlag] = list(llm_flags)
+    # Text-based fallback: PP-Structure can succeed structurally but emit
+    # rows with only ``nama`` populated (column mapper degraded), or fail to
+    # detect the table at all. In both cases the regex fallback that scans
+    # raw OCR for rank+NRP pairs produces a much more useful result. We
+    # always run it when the structured path is empty or low-quality, and
+    # raise a review flag so the operator knows the document didn't go
+    # through the preferred path.
+    if is_low_quality(personel):
+        fallback_rows = extract_personnel_from_text(full_text)
+        if fallback_rows:
+            personel = fallback_rows
+            table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
+            _logger.info(
+                "pipeline.personnel_text_fallback",
+                fallback_rows=len(fallback_rows),
+            )
+
+    untuk_items = find_untuk_list(full_text)
+
+    initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags)
    if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
        initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)

    result = ExtractionResult(
        header=header,
        personel=personel,
-        untuk=[],
+        untuk=untuk_items,
        ttd=ttd,
        raw_text=full_text,
        confidence=mean_ocr_conf,
--- a/src/ocr_sprint/schemas/extraction.py
+++ b/src/ocr_sprint/schemas/extraction.py
@@ -21,6 +21,8 @@ class ReviewFlag(str, Enum):
    DATE_PARSE_FAILED = "date_parse_failed"
    LLM_FALLBACK = "llm_fallback"
    LLM_UNAVAILABLE = "llm_unavailable"
+    PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
+    INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"


 class Signatory(BaseModel):
--- a/tests/unit/test_orchestrator_llm.py
+++ b/tests/unit/test_orchestrator_llm.py
@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
    out = run_pipeline(b"%PDF-1.4\n%fake")
    assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
    assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
+
+
+def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """When PP-Structure produces low-quality rows (e.g. only ``nama`` filled),
+    the orchestrator must run the text fallback against the raw OCR text and
+    raise the ``personnel_text_fallback`` flag.
+    """
+    monkeypatch.setenv("LLM_ENABLED", "false")
+    from ocr_sprint.config import get_settings
+
+    # Drop any cached settings so the env override above is picked up
+    # (assumes get_settings is memoised, as cache_clear() implies).
+    get_settings.cache_clear()
+
+    # Two rows in the "number / name / rank+NRP / jabatan" line layout that
+    # the text fallback is designed to parse.
+    raw_text = (
+        "DAFTAR PERSONIL\n"
+        "1.\n"
+        "SRI WAHYUNI\n"
+        "AIPTU / 75070328\n"
+        "INTELKAM POLRES CIMAHI\n"
+        "2.\n"
+        "AGUNG LUKMAN\n"
+        "BRIPTU / 99030245\n"
+        "SAT INTELKAM\n"
+    )
+
+    # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on
+    # the real Polres Cimahi document).
+    from ocr_sprint.schemas.personnel import PersonnelEntry
+
+    pp_structure_low_quality = [
+        PersonnelEntry(nama="SRI WAHYUNI"),
+        PersonnelEntry(nama="AGUNG LUKMAN"),
+    ]
+    _stub_pipeline_stages(
+        monkeypatch,
+        raw_text=raw_text,
+        regex_header=HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="Polres Cimahi",
+            perihal="ok",
+            dasar=["UU 2/2002"],
+        ),
+    )
+    # Override extract_personnel to return the broken PP-Structure rows.
+    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality)
+
+    out = run_pipeline(b"%PDF-1.4\n%fake")
+    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags
+    # Fallback rows must carry pangkat + nrp (the whole point of the path).
+    assert all(r.pangkat and r.nrp for r in out.result.personel)
+    assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"}
+
+
+def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Healthy PP-Structure output (rank+nrp present on most rows) must NOT
+    be replaced by the text fallback.
+    """
+    monkeypatch.setenv("LLM_ENABLED", "false")
+    from ocr_sprint.config import get_settings
+
+    # Drop any cached settings so the env override above is picked up
+    # (assumes get_settings is memoised, as cache_clear() implies).
+    get_settings.cache_clear()
+
+    from ocr_sprint.schemas.personnel import PersonnelEntry
+
+    # Every row carries both pangkat and nrp, so is_low_quality() is False.
+    healthy = [
+        PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
+        PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
+        PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
+    ]
+    _stub_pipeline_stages(
+        monkeypatch,
+        raw_text="ignored — should not be parsed",
+        regex_header=HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="Polres X",
+            perihal="ok",
+            dasar=["UU 2/2002"],
+        ),
+    )
+    monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy)
+
+    out = run_pipeline(b"%PDF-1.4\n%fake")
+    assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags
+    # Rows come back unchanged and in their original order.
+    assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]
--- a/tests/unit/test_personnel_text_fallback.py
+++ b/tests/unit/test_personnel_text_fallback.py
@@ -0,0 +1,118 @@
+"""Tests for the text-based personnel fallback extractor.
+
+Driven by the real Polres Cimahi sprint document where PP-Structure
+produced 24 rows with only ``nama`` populated. The fallback should
+recover at least the rank + NRP for every row.
+"""
+
+from __future__ import annotations
+
+from ocr_sprint.pipeline.extract.personnel_text import (
+    extract_personnel_from_text,
+    is_low_quality,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+_CIMAHI_FIXTURE = """\
+DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
+NO
+NAMA
+PANGKAT / NRP
+JABATAN
+KET
+BAUR SKCK SAT
+1.
+SRI WAHYUNI
+AIPTU / 75070328
+INTELKAM POLRES
+CIMAHI
+BA PELAKSANA SKCK
+2.
+CITRA DWI PUTRI R
+BRIPTU / 95070659
+ SAT INTELKAM
+POLRES CIMAHI
+BA PELAKSANA SKCK
+3.
+AGUNG LUKMAN AL
+BRIPTU / 99030245
+SAT INTELKAM
+POLRES CIMAHI
+BA POLSEK
+8.
+ARIEF SYAHRUL ZAMAN
+BRIGPOL /96030446
+MARGAASIH
+"""
+
+
+class TestExtractPersonnelFromText:
+    """Behaviour of ``extract_personnel_from_text`` on flat OCR text.
+
+    Most cases run against ``_CIMAHI_FIXTURE``, which contains four
+    rank+NRP lines (rows numbered 1, 2, 3 and 8).
+    """
+
+    def test_extracts_rank_nrp_and_name(self) -> None:
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        assert len(rows) == 4
+        first = rows[0]
+        assert first.pangkat == "AIPTU"
+        assert first.nrp == "75070328"
+        assert first.nama == "SRI WAHYUNI"
+
+    def test_normalizes_brigpol_to_brigadir(self) -> None:
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        last = rows[-1]
+        # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
+        assert last.pangkat == "BRIGADIR"
+        assert last.nrp == "96030446"
+        assert last.nama == "ARIEF SYAHRUL ZAMAN"
+
+    def test_skips_header_lines_as_names(self) -> None:
+        # No row should ever have a column-header word as nama.
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        names = [r.nama for r in rows]
+        for blocked in {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}:
+            assert blocked not in names
+
+    def test_jabatan_collected_from_following_lines(self) -> None:
+        rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
+        assert rows[0].jabatan_dinas is not None
+        assert "INTELKAM" in rows[0].jabatan_dinas
+
+    def test_empty_text_returns_empty(self) -> None:
+        assert extract_personnel_from_text("") == []
+
+    def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
+        text = "Just a paragraph with no rank or NRP at all.\nAnother line."
+        assert extract_personnel_from_text(text) == []
+
+    def test_ignores_isolated_8digit_number_without_rank(self) -> None:
+        # NRP without a recognised rank token must not produce a row.
+        text = "Some line\n12345678\nanother line"
+        assert extract_personnel_from_text(text) == []
+
+    def test_rejects_unknown_rank_with_8digit_number(self) -> None:
+        # A "rank-shaped" word that isn't in the master list must not yield a row.
+        text = "Some line\nFAKERANK / 12345678\nanother line"
+        assert extract_personnel_from_text(text) == []
+
+
+class TestIsLowQuality:
+    """Threshold behaviour of ``is_low_quality``.
+
+    A row is "useful" when it has a pangkat or an nrp; the result is low
+    quality when it is empty or fewer than 30% of its rows are useful.
+    """
+
+    def test_empty_list_is_low_quality(self) -> None:
+        assert is_low_quality([]) is True
+
+    def test_all_rows_with_only_name_is_low_quality(self) -> None:
+        rows = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
+        assert is_low_quality(rows) is True
+
+    def test_majority_with_rank_nrp_is_high_quality(self) -> None:
+        rows = [
+            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
+            for i in range(10)
+        ]
+        assert is_low_quality(rows) is False
+
+    def test_borderline_30_percent_threshold(self) -> None:
+        # 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
+        # (3 / 10 evaluates to the same float as the literal 0.3, so the
+        # strict "<" comparison in is_low_quality is False here.)
+        useful = [
+            PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
+            for i in range(3)
+        ]
+        useless = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
+        assert is_low_quality(useful + useless) is False
--- a/tests/unit/test_regex_rules.py
+++ b/tests/unit/test_regex_rules.py
@@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import (
    find_satuan,
    find_signatory,
    find_tanggal,
+    find_untuk_list,
 )


@@ -60,6 +61,36 @@ class TestSatuan:
        result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert result is not None

+    def test_prefers_resor_over_negara_when_both_present(self) -> None:
+        # Polri letterheads stack units from national down to local; the
+        # issuer is the deepest line present (RESOR here), so the generic
+        # "NEGARA" line on top must not be returned.
+        letterhead = (
+            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
+            "DAERAH JAWA BARAT\n"
+            "RESOR CIMAHI\n"
+            "SURAT PERINTAH\n"
+        )
+        assert find_satuan(letterhead) == "KEPOLISIAN RESOR CIMAHI"
+
+    def test_prefers_sektor_over_resor(self) -> None:
+        # SEKTOR is listed below RESOR in this letterhead, so it is the issuing unit.
+        letterhead = (
+            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
+            "DAERAH JAWA BARAT\n"
+            "RESOR CIMAHI\n"
+            "SEKTOR PADALARANG\n"
+        )
+        assert find_satuan(letterhead) == "KEPOLISIAN SEKTOR PADALARANG"
+
+    def test_handles_daerah_only(self) -> None:
+        # With no RESOR/SEKTOR line, the DAERAH level is the deepest one available.
+        letterhead = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n"
+        assert find_satuan(letterhead) == "KEPOLISIAN DAERAH JAWA BARAT"
+
+    def test_returns_none_when_no_letterhead(self) -> None:
+        assert find_satuan("no police letterhead here") is None  # plain text gives None
+

 class TestPerihal:
    def test_extracts_perihal_line(self) -> None:
@@ -69,6 +100,25 @@ class TestPerihal:
    def test_returns_none_when_absent(self) -> None:
        assert find_perihal("no perihal field") is None

+    def test_falls_back_to_pertimbangan_block(self) -> None:
+        # Polres-level sprints often carry a "Pertimbangan" label and no
+        # "Perihal"; the fallback then takes the first line beneath the label.
+        sprint = (
+            "Pertimbangan\n"
+            "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n"
+            "DASAR :\n"
+            "1. ...\n"
+        )
+        perihal = find_perihal(sprint)
+        assert perihal is not None
+        assert perihal.startswith("Bahwa dalam rangka mendukung")
+
+    def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
+        # An explicit PERIHAL label takes priority over the Pertimbangan
+        # fallback, even when the Pertimbangan paragraph appears first.
+        both = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n"
+        assert find_perihal(both) == "The actual perihal."
+

 class TestDasar:
    def test_numbered_list(self) -> None:
@@ -88,6 +138,57 @@ class TestDasar:
    def test_empty_when_section_missing(self) -> None:
        assert find_dasar_list("no dasar section") == []

+    def test_handles_bare_number_lines_split_by_ocr(self) -> None:
+        # OCR may put the number marker on its own line with the body on the
+        # next non-empty line. Merge them instead of dropping the body or gluing
+        # it to the previous item (old behaviour); inline items stay intact.
+        text = (
+            "Dasar\n"
+            ":\n"
+            "1.\n"
+            " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n"
+            "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n"
+            "3.\n"
+            "Keterangan Catatan Kepolisian (SKCK);\n"
+            "4.\n"
+            "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n"
+            "5. DIPA Petikan Satker Polres Cimahi.\n"
+            "DIPERINTAHKAN\n"
+        )
+        items = find_dasar_list(text)
+        assert len(items) == 5
+        assert items[0].startswith("Undang - Undang")
+        assert items[1].startswith("Peraturan Pemerintah")
+        assert items[2].startswith("Keterangan Catatan")
+        assert items[3].startswith("Pelayanan dilingkungan")
+        assert items[4].startswith("DIPA")
+
+
+class TestUntuk:
+    def test_extracts_numbered_untuk_bullets(self) -> None:
+        # Each bare "N." marker is followed by its body on the next line;
+        # every marker/body pair must come back as exactly one bullet, and
+        # the trailing "Selesai." line must not leak into the result.
+        sprint = (
+            "DIPERINTAHKAN\n"
+            "Kepada\n"
+            "Untuk\n"
+            "1.\n"
+            "melaksanakan tugas A;\n"
+            "2.\n"
+            "melaksanakan tugas B;\n"
+            "Selesai.\n"
+        )
+        assert find_untuk_list(sprint) == ["melaksanakan tugas A;", "melaksanakan tugas B;"]
+
+    def test_returns_empty_when_section_missing(self) -> None:
+        assert find_untuk_list("no untuk section") == []
+
+    def test_stops_at_dikeluarkan(self) -> None:
+        # The "Dikeluarkan di ..." line ends the section; later numbers are ignored.
+        sprint = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. should not be captured\n"
+        assert find_untuk_list(sprint) == ["tugas A;"]
+

 class TestSignatory:
    def test_extracts_last_nrp(self) -> None:
--- a/tests/unit/test_validators.py
+++ b/tests/unit/test_validators.py
@@ -62,6 +62,20 @@ class TestPersonnelValidator:
        entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)

+    def test_row_with_only_name_is_flagged_incomplete(self) -> None:
+        # Only `nama` captured (neither pangkat nor nrp) is the signature of
+        # mis-aligned table extraction. The row must carry the structural
+        # flag so the document gets routed to needs_review.
+        name_only = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
+        raised = validate_personnel_entry(name_only)
+        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in raised
+
+    def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
+        # A rank without an NRP is suboptimal, yet the row still identifies
+        # a rank, so the structural-incompleteness flag stays off.
+        rank_only = PersonnelEntry(pangkat="AKP", nama="Test")
+        assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in validate_personnel_entry(rank_only)
+

 class TestHeaderValidator:
    def test_complete_header_no_flags(self) -> None: