From 58a2bf264828eed22cceaeddbbf3c82fde43e837 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 26 Apr 2026 05:35:42 +0000 Subject: [PATCH 1/2] Fix personnel extraction + header bugs on real Polres Cimahi sprint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two related hardening changes (items 5-6): 1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' letterhead line instead of the most-specific issuing unit (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to scan for each level independently and return the deepest available. 2. find_dasar_list dropped numbered items when OCR put the marker on its own line ("1.\n Undang-Undang ..."). Refactored into _collect_numbered_section that buffers a bare-number line and uses the next non-empty line as the body. Also reused for the new find_untuk_list which extracts the previously-empty 'untuk' bullets. 3. find_perihal returned None for documents that use 'Pertimbangan' (very common in Polres-level sprint), forcing the LLM to guess. Added a regex fallback that picks up the first line under a 'Pertimbangan' label so we keep extraction deterministic. 4. Personnel rows were emitted with only nama populated when PP-Structure detected a table but the column mapper degraded. Added a text-based fallback (extract_personnel_from_text) that scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when the PP-Structure result has fewer than 30% rank/NRP-bearing rows. When the fallback is used, the document is flagged for review by raising the new PERSONNEL_TEXT_FALLBACK flag. 5. Validation now flags rows with neither pangkat nor nrp as INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review even when individual nrp/pangkat checks pass on empty values. 6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans). 
Tests: 229 (was 203) — 26 new tests covering the regex fixes, text-based personnel extractor, low-quality detector, validator behaviour, and orchestrator wiring of the fallback path. Co-Authored-By: adrian kuman firmansah --- src/ocr_sprint/data/master_pangkat.py | 2 +- src/ocr_sprint/pipeline/confidence.py | 8 + .../pipeline/extract/personnel_text.py | 203 +++++++++++++++++ .../pipeline/extract/regex_rules.py | 208 +++++++++++++++--- src/ocr_sprint/pipeline/extract/validators.py | 7 + src/ocr_sprint/pipeline/orchestrator.py | 34 ++- src/ocr_sprint/schemas/extraction.py | 2 + tests/unit/test_orchestrator_llm.py | 89 ++++++++ tests/unit/test_personnel_text_fallback.py | 118 ++++++++++ tests/unit/test_regex_rules.py | 101 +++++++++ tests/unit/test_validators.py | 14 ++ 11 files changed, 747 insertions(+), 39 deletions(-) create mode 100644 src/ocr_sprint/pipeline/extract/personnel_text.py create mode 100644 tests/unit/test_personnel_text_fallback.py diff --git a/src/ocr_sprint/data/master_pangkat.py b/src/ocr_sprint/data/master_pangkat.py index 667b47c..554f143 100644 --- a/src/ocr_sprint/data/master_pangkat.py +++ b/src/ocr_sprint/data/master_pangkat.py @@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = { # Bintara "BRIPDA": ("BRIPDA",), "BRIPTU": ("BRIPTU",), - "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"), + "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"), "BRIPKA": ("BRIPKA",), "AIPDA": ("AIPDA",), "AIPTU": ("AIPTU",), diff --git a/src/ocr_sprint/pipeline/confidence.py b/src/ocr_sprint/pipeline/confidence.py index d046a36..048516d 100644 --- a/src/ocr_sprint/pipeline/confidence.py +++ b/src/ocr_sprint/pipeline/confidence.py @@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = { ReviewFlag.UNKNOWN_PANGKAT: 0.05, ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15, ReviewFlag.DATE_PARSE_FAILED: 0.10, + # Text-based personnel fallback is a recoverable degradation: rank/NRP + # were extracted via regex from raw OCR rather than from a parsed table + 
# grid. Worth flagging for review but not catastrophic. + ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05, + # An incomplete personnel row (no pangkat AND no nrp) is a strong + # signal something went wrong. Penalise heavily so the document + # routes to needs_review even if the rest of the extraction is fine. + ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15, } OCR_WEIGHT = 0.6 diff --git a/src/ocr_sprint/pipeline/extract/personnel_text.py b/src/ocr_sprint/pipeline/extract/personnel_text.py new file mode 100644 index 0000000..4360a80 --- /dev/null +++ b/src/ocr_sprint/pipeline/extract/personnel_text.py @@ -0,0 +1,203 @@ +"""Text-based fallback personnel extractor. + +PP-Structure (Phase 3) is the primary path for personnel rows because it +preserves the table grid. But PP-Structure can fail in two ways on real +sprint scans: + +1. The table is not detected at all (low-quality scan, watermark, atypical + layout) — `extract_personnel` returns an empty list. +2. The table IS detected but the column mapping is too sparse, so each row + collapses to a single ``nama`` cell with all other fields ``None``. This + is what was observed on a real Polres Cimahi sprint where the OCR + produced 24 rows with only ``nama`` populated. + +This module provides a regex/heuristic fallback that operates directly on +the flat OCR text. It is deliberately conservative: a row must have BOTH a +recognizable Polri rank AND an 8-digit NRP to be emitted, so we never +generate the kind of "name-only" rows that motivated the fallback in the +first place. +""" + +from __future__ import annotations + +import re + +from ocr_sprint.data.master_pangkat import ( + PANGKAT_VARIANTS, + is_valid_pangkat, + normalize_pangkat, +) +from ocr_sprint.schemas.personnel import PersonnelEntry + +# Build a single alternation of all known rank tokens (longest first so multi- +# word ranks like "KOMBES POL" win over the single-word "KOMBES"). +_RANK_TOKENS: tuple[str, ...] 
= tuple( + sorted( + {variant for variants in PANGKAT_VARIANTS.values() for variant in variants}, + key=lambda v: -len(v), + ) +) +_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS) +# A line that contains a rank token followed (anywhere on the same line) by +# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or +# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match +# inside e.g. "ABRIPDA-style" text. +_RE_RANK_NRP_LINE = re.compile( + rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b", + re.IGNORECASE, +) +# A bare row number marker like "1." or "12)". OCR often puts it on its own +# line in tabular layouts. +_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$") +# Lines that should never be interpreted as a personnel name. These are +# section headers, OCR garbage anchors, and column header tokens. +_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( + "DASAR", + "PERIHAL", + "PERTIMBANGAN", + "DIPERINTAHKAN", + "KEPADA", + "UNTUK", + "TEMBUSAN", + "DIKELUARKAN", + "PADA TANGGAL", + "SELESAI", + "DAFTAR", + "LAMPIRAN", + "NOMOR", + "TANGGAL", + "KEPOLISIAN", + "DAERAH", + "RESOR", + "SEKTOR", + "MABES", + "SURAT PERINTAH", + "NRP", + "NIP", + "PANGKAT", + "JABATAN", + "NAMA", + "KETERANGAN", + "KET", + "NO", +) +# A name should look like a name: mostly letters, common punctuation, and +# at least one alphabetic character. Pure-numeric or pure-symbol lines are +# rejected. 
+_RE_NAME_OK = re.compile(r"[A-Za-z]") + + +def _is_plausible_name(line: str) -> bool: + """Return True iff ``line`` could plausibly be a personnel name.""" + stripped = line.strip() + if not stripped or not _RE_NAME_OK.search(stripped): + return False + upper = stripped.upper() + for prefix in _NAME_BLOCKLIST_PREFIXES: + if upper.startswith(prefix): + return False + if _RE_ROW_NUMBER.match(stripped): + return False + if _RE_RANK_NRP_LINE.search(stripped): + return False + # Reject lines that are nothing but a row number with extra punctuation + # ("1 .", "2)") which the bare-number regex above might miss. + return not re.fullmatch(r"[\s\d.)(\-]+", stripped) + + +def _following_jabatan(lines: list[str], idx: int) -> str | None: + """Collect 1-3 follow-up lines after the rank+NRP line as the jabatan. + + Stops at the next rank+NRP line, the next bare row-number line, or any + blocked prefix (section header / column header). + """ + parts: list[str] = [] + for fwd in range(idx + 1, min(idx + 4, len(lines))): + candidate = lines[fwd].strip() + if not candidate: + if parts: + break + continue + if _RE_RANK_NRP_LINE.search(candidate): + break + if _RE_ROW_NUMBER.match(candidate): + break + upper = candidate.upper() + if any(upper.startswith(p) for p in _NAME_BLOCKLIST_PREFIXES): + break + parts.append(candidate) + if not parts: + return None + joined = " ".join(parts) + return " ".join(joined.split()) or None + + +def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]: + """Best-effort personnel extraction from a flat OCR text stream. + + Strategy: + + 1. Iterate every line. Skip lines that don't contain both a known rank + and an 8-digit NRP (those are the only signal we trust). + 2. For each rank+NRP line, look back for the most recent plausible name + line, and forward 1-3 lines for jabatan content. + 3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp. 
+ + The fallback is intentionally rate-limited: the first matching rank + token on a line wins (no greedy multi-match per line), and a name line + can only be consumed once (so a stray ranked text inside a paragraph + doesn't turn into multiple bogus entries). + """ + lines = raw_text.splitlines() + consumed_names: set[int] = set() + rows: list[PersonnelEntry] = [] + + for idx, raw_line in enumerate(lines): + line = raw_line.strip() + match = _RE_RANK_NRP_LINE.search(line) + if not match: + continue + pangkat = normalize_pangkat(match.group("rank")) + if not pangkat or not is_valid_pangkat(pangkat): + continue + nrp = match.group("nrp") + + nama: str | None = None + for back in range(idx - 1, max(idx - 6, -1), -1): + if back in consumed_names: + continue + candidate = lines[back].strip() + if _is_plausible_name(candidate): + nama = candidate + consumed_names.add(back) + break + + jabatan = _following_jabatan(lines, idx) + rows.append( + PersonnelEntry( + no=None, + pangkat=pangkat, + nrp=nrp, + nama=nama, + jabatan_dinas=jabatan, + jabatan_sprint=None, + keterangan=None, + ) + ) + return rows + + +def is_low_quality(rows: list[PersonnelEntry]) -> bool: + """Heuristic: did PP-Structure produce useless rows? + + A row is useful when it has at least pangkat OR nrp. If most rows have + only ``nama`` (or worse, nothing) the table extraction failed and the + caller should retry with the text-based fallback. + """ + if not rows: + return True + useful = sum(1 for r in rows if r.pangkat or r.nrp) + # Require at least 30% of rows to carry rank/NRP signal. Below that we + # assume the column mapper degraded to "everything is nama" and prefer + # a fresh attempt. 
+ return useful / max(1, len(rows)) < 0.3 diff --git a/src/ocr_sprint/pipeline/extract/regex_rules.py b/src/ocr_sprint/pipeline/extract/regex_rules.py index 88e594f..d63786d 100644 --- a/src/ocr_sprint/pipeline/extract/regex_rules.py +++ b/src/ocr_sprint/pipeline/extract/regex_rules.py @@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile( re.IGNORECASE, ) -# Satuan penerbit usually appears in the document letterhead, prefixed by -# KEPOLISIAN . -_RE_SATUAN = re.compile( - r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)" - r"[^\n]{0,80}", +# Polri letterhead pieces. The full letterhead spans multiple lines that are +# often broken across separate OCR rows like: +# +# KEPOLISIAN NEGARA REPUBLIK INDONESIA +# DAERAH JAWA BARAT +# RESOR CIMAHI +# +# We capture each individual level so we can reconstruct the most-specific +# unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares +# about *which* unit issued the sprint, not just that some Polri unit did. +_RE_LEVEL_NEGARA = re.compile( + r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA", re.IGNORECASE, ) +_RE_LEVEL_DAERAH = re.compile( + r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)", + re.IGNORECASE | re.MULTILINE, +) +_RE_LEVEL_RESOR = re.compile( + r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)", + re.IGNORECASE | re.MULTILINE, +) +_RE_LEVEL_SEKTOR = re.compile( + r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)", + re.IGNORECASE | re.MULTILINE, +) +_RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE) # "Perihal : ...." up to end of line. _RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE) +# Many sprint docs (especially Polres-level) use 'Pertimbangan' as the +# single-paragraph rationale block instead of (or alongside) 'Perihal'. 
+# When `perihal` is missing we fall back to the first non-empty line under +# 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream +# audit trail still has *something* in the perihal slot. +_RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE) # A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..." _RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$") +# OCR sometimes splits the number from its content across two lines: +# 1. +# Undang-Undang Nomor 2 Tahun 2002 ... +# We detect a bare-number line and merge with the next non-empty line. +_RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$") +# Generic 'untuk' bullet — same shape as a dasar item. +_RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$") # Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits. _RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE) @@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None: return None +def _clean_unit_tail(tail: str) -> str: + """Strip trailing punctuation/noise from the captured place name.""" + return " ".join(tail.split()).strip(" .,;:'\"") + + def find_satuan(text: str) -> str | None: - """Return the first letterhead match (issuing unit), normalized.""" - match = _RE_SATUAN.search(text) - if not match: - return None - return " ".join(match.group(0).split()) + """Return the issuing unit, preferring the most-specific letterhead level. + + Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The + actual *issuing* unit is the deepest level present in the letterhead, not + the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. We scan + for each level independently and pick the most specific one available; + if only the generic Negara line is present we return that. + + Examples + -------- + >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n" + ... 
"DAERAH JAWA BARAT\\nRESOR CIMAHI") + 'KEPOLISIAN RESOR CIMAHI' + >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA") + 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' + """ + # We only look at the document head — letterheads always sit at the + # very top, and constraining the search prevents false positives from + # body text like '... Polres Cimahi ...' deep in a paragraph. + head = "\n".join(text.splitlines()[:25]) + + sektor = _RE_LEVEL_SEKTOR.search(head) + if sektor: + return f"KEPOLISIAN SEKTOR {_clean_unit_tail(sektor.group(1))}" + resor = _RE_LEVEL_RESOR.search(head) + if resor: + return f"KEPOLISIAN RESOR {_clean_unit_tail(resor.group(1))}" + daerah = _RE_LEVEL_DAERAH.search(head) + if daerah: + return f"KEPOLISIAN DAERAH {_clean_unit_tail(daerah.group(1))}" + if _RE_LEVEL_MABES.search(head): + return "MABES POLRI" + if _RE_LEVEL_NEGARA.search(head): + return "KEPOLISIAN NEGARA REPUBLIK INDONESIA" + return None def find_perihal(text: str) -> str | None: - """Return the first 'Perihal: ...' line, trimmed to that line only.""" + """Return the first 'Perihal: ...' line, trimmed to that line only. + + Falls back to the first non-empty line under a 'Pertimbangan' label + (a common variant in Polres-level surat sprint that doesn't have a + distinct 'Perihal' field). We deliberately keep this in regex-land + rather than deferring to the LLM because the LLM tends to hallucinate + perihal content from arbitrary paragraphs. + """ for line in text.splitlines(): m = _RE_PERIHAL.search(line) if m: return m.group(1).strip() + + lines = text.splitlines() + for idx, line in enumerate(lines): + if _RE_PERTIMBANGAN_LABEL.match(line): + for follow in lines[idx + 1 : idx + 5]: + stripped = follow.strip(" :\t") + if stripped and stripped != ":": + return stripped + break return None +def _collect_numbered_section( + lines: list[str], + start_idx: int, + terminators: tuple[str, ...], +) -> list[str]: + """Walk forward from ``start_idx`` collecting numbered list items. 
+ + Robust to OCR splitting the number marker onto its own line: + '1.' -> buffer ``pending_marker=True`` + next non-empty line starts the item body. + + Continuation lines (non-empty, no leading number, after a started item) + are appended to the current item. Stops at any line whose uppercase form + starts with one of ``terminators``. + """ + items: list[str] = [] + pending_marker = False + blank_run = 0 + for raw_line in lines[start_idx:]: + line = raw_line.strip() + upper = line.upper() + if any(upper.startswith(term) for term in terminators): + break + if not line: + blank_run += 1 + # Two consecutive blank lines reliably mark the end of a section. + # A single blank line is tolerated because OCR sprinkles them. + if blank_run >= 2 and items and not pending_marker: + break + continue + blank_run = 0 + bare = _RE_DASAR_BARE_NUMBER.match(line) + if bare: + pending_marker = True + continue + m = _RE_DASAR_ITEM.match(line) + if m: + items.append(m.group(2).strip()) + pending_marker = False + continue + if pending_marker: + items.append(line) + pending_marker = False + continue + if items: + items[-1] = (items[-1] + " " + line).strip() + return items + + def find_dasar_list(text: str) -> list[str]: """Extract numbered 'Dasar' items from the text. Heuristic: locate a line containing 'DASAR' (Indonesian: "DASAR :") and - collect subsequent lines that start with a number. Stops at a blank line - or a line beginning with another section header keyword. + delegate to ``_collect_numbered_section`` which handles three OCR + artefacts: + + 1. Inline numbered items: ``"1. Undang-Undang ..."``. + 2. Bare-number lines (the OCR engine puts the number alone on a line): + ``"1.\\n Undang-Undang ..."``. + 3. Continuation lines (a line that is the wrapped tail of the previous + item gets appended back onto it). 
""" lines = text.splitlines() - items: list[str] = [] - in_dasar = False section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL") - for raw_line in lines: - line = raw_line.strip() - if not in_dasar: - if re.match(r"^\s*DASAR\b", line, re.IGNORECASE): - in_dasar = True - continue - if not line: - if items: - break - continue - upper = line.upper() - if any(upper.startswith(term) for term in section_terminators): - break - m = _RE_DASAR_ITEM.match(line) - if m: - items.append(m.group(2).strip()) - elif items: - # continuation of the previous dasar item - items[-1] = (items[-1] + " " + line).strip() - return items + for idx, raw_line in enumerate(lines): + if re.match(r"^\s*DASAR\b", raw_line.strip(), re.IGNORECASE): + return _collect_numbered_section(lines, idx + 1, section_terminators) + return [] + + +def find_untuk_list(text: str) -> list[str]: + """Extract numbered 'Untuk' / 'DIPERINTAHKAN' bullets from the text. + + The 'Untuk' section follows 'DIPERINTAHKAN' / 'Kepada' and lists the + tasks assigned to the personnel. Same OCR shape as Dasar, so we reuse + the collector but with different terminators. + """ + lines = text.splitlines() + # Stop conditions: 'Selesai' (boilerplate), 'Dikeluarkan di' (signature + # block), 'Tembusan' (carbon-copy section). 
+ terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL") + for idx, raw_line in enumerate(lines): + if re.match(r"^\s*UNTUK\b", raw_line.strip(), re.IGNORECASE): + return _collect_numbered_section(lines, idx + 1, terminators) + return [] def find_signatory(text: str) -> Signatory: diff --git a/src/ocr_sprint/pipeline/extract/validators.py b/src/ocr_sprint/pipeline/extract/validators.py index 14d15ef..8b28586 100644 --- a/src/ocr_sprint/pipeline/extract/validators.py +++ b/src/ocr_sprint/pipeline/extract/validators.py @@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]: flags.append(ReviewFlag.INVALID_NRP) if entry.pangkat and not is_valid_pangkat(entry.pangkat): flags.append(ReviewFlag.UNKNOWN_PANGKAT) + # Identification of a personnel row requires at least pangkat OR nrp. + # A row carrying only a name is structurally incomplete - likely a + # mis-aligned table cell or a leaked tembusan/dasar fragment - and must + # be flagged for human review even though pangkat/nrp validation + # individually pass (because they're empty). 
+ if not entry.pangkat and not entry.nrp: + flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW) return flags diff --git a/src/ocr_sprint/pipeline/orchestrator.py b/src/ocr_sprint/pipeline/orchestrator.py index 231aec1..e0a0625 100644 --- a/src/ocr_sprint/pipeline/orchestrator.py +++ b/src/ocr_sprint/pipeline/orchestrator.py @@ -19,7 +19,15 @@ from ocr_sprint.llm.extractor import llm_fill_header from ocr_sprint.pipeline.confidence import compute_confidence, route from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct from ocr_sprint.pipeline.extract.personnel import extract_personnel -from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory +from ocr_sprint.pipeline.extract.personnel_text import ( + extract_personnel_from_text, + is_low_quality, +) +from ocr_sprint.pipeline.extract.regex_rules import ( + extract_header, + find_signatory, + find_untuk_list, +) from ocr_sprint.pipeline.extract.validators import validate_extraction from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest from ocr_sprint.pipeline.ocr import OCRPage, run_ocr @@ -112,6 +120,7 @@ def run_pipeline(content: bytes) -> PipelineOutput: header = merged personel: list[PersonnelEntry] = [] + table_flags: list[ReviewFlag] = [] if s.tables_enabled and cleaned_pages: all_tables: list[DetectedTable] = [] for img in cleaned_pages: @@ -126,14 +135,33 @@ def run_pipeline(content: bytes) -> PipelineOutput: personel_rows=len(personel), ) - initial_flags: list[ReviewFlag] = list(llm_flags) + # Text-based fallback: PP-Structure can succeed structurally but emit + # rows with only ``nama`` populated (column mapper degraded), or fail to + # detect the table at all. In both cases the regex fallback that scans + # raw OCR for rank+NRP pairs produces a much more useful result. 
We + # always run it when the structured path is empty or low-quality, and + # raise a review flag so the operator knows the document didn't go + # through the preferred path. + if is_low_quality(personel): + fallback_rows = extract_personnel_from_text(full_text) + if fallback_rows: + personel = fallback_rows + table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK) + _logger.info( + "pipeline.personnel_text_fallback", + fallback_rows=len(fallback_rows), + ) + + untuk_items = find_untuk_list(full_text) + + initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags) if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD: initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE) result = ExtractionResult( header=header, personel=personel, - untuk=[], + untuk=untuk_items, ttd=ttd, raw_text=full_text, confidence=mean_ocr_conf, diff --git a/src/ocr_sprint/schemas/extraction.py b/src/ocr_sprint/schemas/extraction.py index 5a3cdb0..252d1db 100644 --- a/src/ocr_sprint/schemas/extraction.py +++ b/src/ocr_sprint/schemas/extraction.py @@ -21,6 +21,8 @@ class ReviewFlag(str, Enum): DATE_PARSE_FAILED = "date_parse_failed" LLM_FALLBACK = "llm_fallback" LLM_UNAVAILABLE = "llm_unavailable" + PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback" + INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row" class Signatory(BaseModel): diff --git a/tests/unit/test_orchestrator_llm.py b/tests/unit/test_orchestrator_llm.py index d56af3c..af06ba3 100644 --- a/tests/unit/test_orchestrator_llm.py +++ b/tests/unit/test_orchestrator_llm.py @@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none( out = run_pipeline(b"%PDF-1.4\n%fake") assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags + + +def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When PP-Structure produces low-quality rows (e.g. 
only ``nama`` filled), + the orchestrator must run the text fallback against the raw OCR text and + raise the ``personnel_text_fallback`` flag. + """ + monkeypatch.setenv("LLM_ENABLED", "false") + from ocr_sprint.config import get_settings + + get_settings.cache_clear() + + raw_text = ( + "DAFTAR PERSONIL\n" + "1.\n" + "SRI WAHYUNI\n" + "AIPTU / 75070328\n" + "INTELKAM POLRES CIMAHI\n" + "2.\n" + "AGUNG LUKMAN\n" + "BRIPTU / 99030245\n" + "SAT INTELKAM\n" + ) + + # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on + # the real Polres Cimahi document). + from ocr_sprint.schemas.personnel import PersonnelEntry + + pp_structure_low_quality = [ + PersonnelEntry(nama="SRI WAHYUNI"), + PersonnelEntry(nama="AGUNG LUKMAN"), + ] + _stub_pipeline_stages( + monkeypatch, + raw_text=raw_text, + regex_header=HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + satuan_penerbit="Polres Cimahi", + perihal="ok", + dasar=["UU 2/2002"], + ), + ) + # Override extract_personnel to return the broken PP-Structure rows. + monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality) + + out = run_pipeline(b"%PDF-1.4\n%fake") + assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags + # Fallback rows must carry pangkat + nrp (the whole point of the path). + assert all(r.pangkat and r.nrp for r in out.result.personel) + assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"} + + +def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Healthy PP-Structure output (rank+nrp present on most rows) must NOT + be replaced by the text fallback. 
+ """ + monkeypatch.setenv("LLM_ENABLED", "false") + from ocr_sprint.config import get_settings + + get_settings.cache_clear() + + from ocr_sprint.schemas.personnel import PersonnelEntry + + healthy = [ + PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"), + PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"), + PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"), + ] + _stub_pipeline_stages( + monkeypatch, + raw_text="ignored — should not be parsed", + regex_header=HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + satuan_penerbit="Polres X", + perihal="ok", + dasar=["UU 2/2002"], + ), + ) + monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy) + + out = run_pipeline(b"%PDF-1.4\n%fake") + assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags + assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"] diff --git a/tests/unit/test_personnel_text_fallback.py b/tests/unit/test_personnel_text_fallback.py new file mode 100644 index 0000000..95cdd26 --- /dev/null +++ b/tests/unit/test_personnel_text_fallback.py @@ -0,0 +1,118 @@ +"""Tests for the text-based personnel fallback extractor. + +Driven by the real Polres Cimahi sprint document where PP-Structure +produced 24 rows with only ``nama`` populated. The fallback should +recover at least the rank + NRP for every row. +""" + +from __future__ import annotations + +from ocr_sprint.pipeline.extract.personnel_text import ( + extract_personnel_from_text, + is_low_quality, +) +from ocr_sprint.schemas.personnel import PersonnelEntry + +_CIMAHI_FIXTURE = """\ +DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024 +NO +NAMA +PANGKAT / NRP +JABATAN +KET +BAUR SKCK SAT +1. +SRI WAHYUNI +AIPTU / 75070328 +INTELKAM POLRES +CIMAHI +BA PELAKSANA SKCK +2. +CITRA DWI PUTRI R +BRIPTU / 95070659 + SAT INTELKAM +POLRES CIMAHI +BA PELAKSANA SKCK +3. 
+AGUNG LUKMAN AL +BRIPTU / 99030245 +SAT INTELKAM +POLRES CIMAHI +BA POLSEK +8. +ARIEF SYAHRUL ZAMAN +BRIGPOL /96030446 +MARGAASIH +""" + + +class TestExtractPersonnelFromText: + def test_extracts_rank_nrp_and_name(self) -> None: + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + assert len(rows) == 4 + first = rows[0] + assert first.pangkat == "AIPTU" + assert first.nrp == "75070328" + assert first.nama == "SRI WAHYUNI" + + def test_normalizes_brigpol_to_brigadir(self) -> None: + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + last = rows[-1] + # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'. + assert last.pangkat == "BRIGADIR" + assert last.nrp == "96030446" + assert last.nama == "ARIEF SYAHRUL ZAMAN" + + def test_skips_header_lines_as_names(self) -> None: + # No row should ever have a column-header word as nama. + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + names = [r.nama for r in rows] + for blocked in {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}: + assert blocked not in names + + def test_jabatan_collected_from_following_lines(self) -> None: + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + assert rows[0].jabatan_dinas is not None + assert "INTELKAM" in rows[0].jabatan_dinas + + def test_empty_text_returns_empty(self) -> None: + assert extract_personnel_from_text("") == [] + + def test_text_without_rank_nrp_pattern_returns_empty(self) -> None: + text = "Just a paragraph with no rank or NRP at all.\nAnother line." + assert extract_personnel_from_text(text) == [] + + def test_ignores_isolated_8digit_number_without_rank(self) -> None: + # NRP without a recognised rank token must not produce a row. + text = "Some line\n12345678\nanother line" + assert extract_personnel_from_text(text) == [] + + def test_rejects_unknown_rank_with_8digit_number(self) -> None: + # A "rank-shaped" word that isn't in the master list must not yield a row. 
+ text = "Some line\nFAKERANK / 12345678\nanother line" + assert extract_personnel_from_text(text) == [] + + +class TestIsLowQuality: + def test_empty_list_is_low_quality(self) -> None: + assert is_low_quality([]) is True + + def test_all_rows_with_only_name_is_low_quality(self) -> None: + rows = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)] + assert is_low_quality(rows) is True + + def test_majority_with_rank_nrp_is_high_quality(self) -> None: + rows = [ + PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}") + for i in range(10) + ] + assert is_low_quality(rows) is False + + def test_borderline_30_percent_threshold(self) -> None: + # 3 useful out of 10 = exactly 0.3, treated as not-low-quality. + useful = [ + PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}") + for i in range(3) + ] + useless = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)] + assert is_low_quality(useful + useless) is False diff --git a/tests/unit/test_regex_rules.py b/tests/unit/test_regex_rules.py index 3cd7855..6efca49 100644 --- a/tests/unit/test_regex_rules.py +++ b/tests/unit/test_regex_rules.py @@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import ( find_satuan, find_signatory, find_tanggal, + find_untuk_list, ) @@ -60,6 +61,36 @@ class TestSatuan: result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA") assert result is not None + def test_prefers_resor_over_negara_when_both_present(self) -> None: + # The Polri letterhead lists units hierarchically; the issuing unit + # is the deepest level, not the topmost generic "NEGARA" line. 
+ text = ( + "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n" + "DAERAH JAWA BARAT\n" + "RESOR CIMAHI\n" + "SURAT PERINTAH\n" + ) + result = find_satuan(text) + assert result == "KEPOLISIAN RESOR CIMAHI" + + def test_prefers_sektor_over_resor(self) -> None: + text = ( + "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n" + "DAERAH JAWA BARAT\n" + "RESOR CIMAHI\n" + "SEKTOR PADALARANG\n" + ) + result = find_satuan(text) + assert result == "KEPOLISIAN SEKTOR PADALARANG" + + def test_handles_daerah_only(self) -> None: + text = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n" + result = find_satuan(text) + assert result == "KEPOLISIAN DAERAH JAWA BARAT" + + def test_returns_none_when_no_letterhead(self) -> None: + assert find_satuan("no police letterhead here") is None + class TestPerihal: def test_extracts_perihal_line(self) -> None: @@ -69,6 +100,25 @@ class TestPerihal: def test_returns_none_when_absent(self) -> None: assert find_perihal("no perihal field") is None + def test_falls_back_to_pertimbangan_block(self) -> None: + # Many Polres-level sprints use "Pertimbangan" instead of "Perihal". + # The fallback should pick up the first non-empty line under it. + text = ( + "Pertimbangan\n" + "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n" + "DASAR :\n" + "1. ...\n" + ) + result = find_perihal(text) + assert result is not None + assert result.startswith("Bahwa dalam rangka mendukung") + + def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None: + # If the document has both a Perihal label AND a Pertimbangan + # paragraph, the explicit Perihal wins. + text = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n" + assert find_perihal(text) == "The actual perihal." 
+ class TestDasar: def test_numbered_list(self) -> None: @@ -88,6 +138,57 @@ class TestDasar: def test_empty_when_section_missing(self) -> None: assert find_dasar_list("no dasar section") == [] + def test_handles_bare_number_lines_split_by_ocr(self) -> None: + # OCR sometimes places the number marker on its own line and the + # body on the next non-empty line. The collector must merge them + # rather than dropping the body or appending it to the previous + # item (which the old implementation did). + text = ( + "Dasar\n" + ":\n" + "1.\n" + " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n" + "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n" + "3.\n" + "Keterangan Catatan Kepolisian (SKCK);\n" + "4.\n" + "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n" + "5. DIPA Petikan Satker Polres Cimahi.\n" + "DIPERINTAHKAN\n" + ) + items = find_dasar_list(text) + assert len(items) == 5 + assert items[0].startswith("Undang - Undang") + assert items[2].startswith("Keterangan Catatan") + assert items[3].startswith("Pelayanan dilingkungan") + assert items[4].startswith("DIPA") + + +class TestUntuk: + def test_extracts_numbered_untuk_bullets(self) -> None: + text = ( + "DIPERINTAHKAN\n" + "Kepada\n" + "Untuk\n" + "1.\n" + "melaksanakan tugas A;\n" + "2.\n" + "melaksanakan tugas B;\n" + "Selesai.\n" + ) + items = find_untuk_list(text) + assert len(items) == 2 + assert items[0] == "melaksanakan tugas A;" + assert items[1] == "melaksanakan tugas B;" + + def test_returns_empty_when_section_missing(self) -> None: + assert find_untuk_list("no untuk section") == [] + + def test_stops_at_dikeluarkan(self) -> None: + text = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. 
should not be captured\n" + items = find_untuk_list(text) + assert items == ["tugas A;"] + class TestSignatory: def test_extracts_last_nrp(self) -> None: diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py index 6ff0dcd..2313d2b 100644 --- a/tests/unit/test_validators.py +++ b/tests/unit/test_validators.py @@ -62,6 +62,20 @@ class TestPersonnelValidator: entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test") assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry) + def test_row_with_only_name_is_flagged_incomplete(self) -> None: + # A row that captured only `nama` (no pangkat AND no nrp) is the + # signature of mis-aligned table extraction. Must be flagged so + # the operator routes the document to needs_review. + entry = PersonnelEntry(nama="LEAKED FROM SOMEWHERE") + flags = validate_personnel_entry(entry) + assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in flags + + def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None: + # Having pangkat without NRP is suboptimal but still identifies a + # rank, so we don't raise the structural-incompleteness flag. + entry = PersonnelEntry(pangkat="AKP", nama="Test") + assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in validate_personnel_entry(entry) + class TestHeaderValidator: def test_complete_header_no_flags(self) -> None: From 737f4999dd8be895c026beb840bf208cdac412a4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 26 Apr 2026 05:46:21 +0000 Subject: [PATCH 2/2] Use word-boundary matching for personnel name blocklist Devin Review correctly flagged that the bare "NO" and "KET" entries in the blocklist would silently drop common Indonesian names (KETUT, NOVA, NOOR, NORMAN, NOVIANTI, ...) because the check used startswith rather than a word boundary. 
Replaced the per-prefix loop with a single compiled regex anchored at ^ with a trailing \b, which still matches column headers like "NO" or "KET" on their own line but no longer rejects "NOOR HIDAYAT" or "KETUT WARDANA". Also fixes the same bug in _following_jabatan. Added two regression tests covering both directions: names starting with the offending tokens are kept, bare column headers still rejected. Co-Authored-By: adrian kuman firmansah --- .../pipeline/extract/personnel_text.py | 26 ++++++++++------ tests/unit/test_personnel_text_fallback.py | 31 +++++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/ocr_sprint/pipeline/extract/personnel_text.py b/src/ocr_sprint/pipeline/extract/personnel_text.py index 4360a80..5e37984 100644 --- a/src/ocr_sprint/pipeline/extract/personnel_text.py +++ b/src/ocr_sprint/pipeline/extract/personnel_text.py @@ -50,8 +50,15 @@ _RE_RANK_NRP_LINE = re.compile( # line in tabular layouts. _RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$") # Lines that should never be interpreted as a personnel name. These are -# section headers, OCR garbage anchors, and column header tokens. -_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( +# section headers, OCR garbage anchors, and column header tokens. We match +# them with a *word-boundary* regex (built from this list) rather than a +# bare ``startswith`` check, because short tokens like ``"NO"`` and +# ``"KET"`` would otherwise reject perfectly valid Indonesian names +# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the +# latter being an extremely common Balinese birth-order name). +_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = ( + "PADA TANGGAL", # multi-word entries first so they win the alternation + "SURAT PERINTAH", "DASAR", "PERIHAL", "PERTIMBANGAN", @@ -60,7 +67,6 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] 
= ( "UNTUK", "TEMBUSAN", "DIKELUARKAN", - "PADA TANGGAL", "SELESAI", "DAFTAR", "LAMPIRAN", @@ -71,7 +77,6 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "RESOR", "SEKTOR", "MABES", - "SURAT PERINTAH", "NRP", "NIP", "PANGKAT", @@ -81,6 +86,10 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "KET", "NO", ) +_RE_NAME_BLOCKLIST = re.compile( + r"^(?:" + "|".join(re.escape(tok) for tok in _NAME_BLOCKLIST_TOKENS) + r")\b", + re.IGNORECASE, +) # A name should look like a name: mostly letters, common punctuation, and # at least one alphabetic character. Pure-numeric or pure-symbol lines are # rejected. @@ -92,10 +101,8 @@ def _is_plausible_name(line: str) -> bool: stripped = line.strip() if not stripped or not _RE_NAME_OK.search(stripped): return False - upper = stripped.upper() - for prefix in _NAME_BLOCKLIST_PREFIXES: - if upper.startswith(prefix): - return False + if _RE_NAME_BLOCKLIST.match(stripped): + return False if _RE_ROW_NUMBER.match(stripped): return False if _RE_RANK_NRP_LINE.search(stripped): @@ -122,8 +129,7 @@ def _following_jabatan(lines: list[str], idx: int) -> str | None: break if _RE_ROW_NUMBER.match(candidate): break - upper = candidate.upper() - if any(upper.startswith(p) for p in _NAME_BLOCKLIST_PREFIXES): + if _RE_NAME_BLOCKLIST.match(candidate): break parts.append(candidate) if not parts: diff --git a/tests/unit/test_personnel_text_fallback.py b/tests/unit/test_personnel_text_fallback.py index 95cdd26..884f99c 100644 --- a/tests/unit/test_personnel_text_fallback.py +++ b/tests/unit/test_personnel_text_fallback.py @@ -92,6 +92,37 @@ class TestExtractPersonnelFromText: text = "Some line\nFAKERANK / 12345678\nanother line" assert extract_personnel_from_text(text) == [] + def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None: + # Regression: 'NO' / 'KET' are legitimate column header tokens but + # also prefix common Indonesian names (KETUT, NOVA, NOOR). The + # blocklist must use word boundaries, not a raw startswith check. 
+ text = ( + "DAFTAR PERSONIL\n" + "1.\n" + "KETUT WARDANA\n" + "AIPTU / 11111111\n" + "JABATAN A\n" + "2.\n" + "NOVA SARI\n" + "BRIPTU / 22222222\n" + "JABATAN B\n" + "3.\n" + "NOOR HIDAYAT\n" + "BRIPDA / 33333333\n" + "JABATAN C\n" + ) + rows = extract_personnel_from_text(text) + names = [r.nama for r in rows] + assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"] + + def test_still_blocks_bare_column_header_tokens(self) -> None: + # Word-boundary fix must still reject the actual column-header + # rows that motivated the blocklist in the first place. + text = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n" + rows = extract_personnel_from_text(text) + assert len(rows) == 1 + assert rows[0].nama == "REAL NAME" + class TestIsLowQuality: def test_empty_list_is_low_quality(self) -> None: