From 58a2bf264828eed22cceaeddbbf3c82fde43e837 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 26 Apr 2026 05:35:42 +0000 Subject: [PATCH] Fix personnel extraction + header bugs on real Polres Cimahi sprint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds two supporting changes (items 5-6): 1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' letterhead line instead of the most-specific issuing unit (e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to scan for each level independently and return the deepest available. 2. find_dasar_list dropped numbered items when OCR put the marker on its own line ("1.\n Undang-Undang ..."). Refactored into _collect_numbered_section that buffers a bare-number line and uses the next non-empty line as the body. Also reused for the new find_untuk_list which extracts the previously-empty 'untuk' bullets. 3. find_perihal returned None for documents that use 'Pertimbangan' (very common in Polres-level sprint), forcing the LLM to guess. Added a regex fallback that picks up the first line under a 'Pertimbangan' label so we keep extraction deterministic. 4. Personnel rows were emitted with only nama populated when PP-Structure detected a table but the column mapper degraded. Added a text-based fallback (extract_personnel_from_text) that scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when the PP-Structure result is empty or has fewer than 30% rank/NRP-bearing rows. When the fallback rows are used, the document is flagged for review by raising the new PERSONNEL_TEXT_FALLBACK flag. 5. Validation now flags rows with neither pangkat nor nrp as INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review even when individual nrp/pangkat checks pass on empty values. 6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans). 
Tests: 229 (was 203) — 26 new tests covering the regex fixes, text-based personnel extractor, low-quality detector, validator behaviour, and orchestrator wiring of the fallback path. Co-Authored-By: adrian kuman firmansah --- src/ocr_sprint/data/master_pangkat.py | 2 +- src/ocr_sprint/pipeline/confidence.py | 8 + .../pipeline/extract/personnel_text.py | 203 +++++++++++++++++ .../pipeline/extract/regex_rules.py | 208 +++++++++++++++--- src/ocr_sprint/pipeline/extract/validators.py | 7 + src/ocr_sprint/pipeline/orchestrator.py | 34 ++- src/ocr_sprint/schemas/extraction.py | 2 + tests/unit/test_orchestrator_llm.py | 89 ++++++++ tests/unit/test_personnel_text_fallback.py | 118 ++++++++++ tests/unit/test_regex_rules.py | 101 +++++++++ tests/unit/test_validators.py | 14 ++ 11 files changed, 747 insertions(+), 39 deletions(-) create mode 100644 src/ocr_sprint/pipeline/extract/personnel_text.py create mode 100644 tests/unit/test_personnel_text_fallback.py diff --git a/src/ocr_sprint/data/master_pangkat.py b/src/ocr_sprint/data/master_pangkat.py index 667b47c..554f143 100644 --- a/src/ocr_sprint/data/master_pangkat.py +++ b/src/ocr_sprint/data/master_pangkat.py @@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = { # Bintara "BRIPDA": ("BRIPDA",), "BRIPTU": ("BRIPTU",), - "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"), + "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"), "BRIPKA": ("BRIPKA",), "AIPDA": ("AIPDA",), "AIPTU": ("AIPTU",), diff --git a/src/ocr_sprint/pipeline/confidence.py b/src/ocr_sprint/pipeline/confidence.py index d046a36..048516d 100644 --- a/src/ocr_sprint/pipeline/confidence.py +++ b/src/ocr_sprint/pipeline/confidence.py @@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = { ReviewFlag.UNKNOWN_PANGKAT: 0.05, ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15, ReviewFlag.DATE_PARSE_FAILED: 0.10, + # Text-based personnel fallback is a recoverable degradation: rank/NRP + # were extracted via regex from raw OCR rather than from a parsed table + 
# grid. Worth flagging for review but not catastrophic. + ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05, + # An incomplete personnel row (no pangkat AND no nrp) is a strong + # signal something went wrong. Penalise heavily so the document + # routes to needs_review even if the rest of the extraction is fine. + ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15, } OCR_WEIGHT = 0.6 diff --git a/src/ocr_sprint/pipeline/extract/personnel_text.py b/src/ocr_sprint/pipeline/extract/personnel_text.py new file mode 100644 index 0000000..4360a80 --- /dev/null +++ b/src/ocr_sprint/pipeline/extract/personnel_text.py @@ -0,0 +1,203 @@ +"""Text-based fallback personnel extractor. + +PP-Structure (Phase 3) is the primary path for personnel rows because it +preserves the table grid. But PP-Structure can fail in two ways on real +sprint scans: + +1. The table is not detected at all (low-quality scan, watermark, atypical + layout) — `extract_personnel` returns an empty list. +2. The table IS detected but the column mapping is too sparse, so each row + collapses to a single ``nama`` cell with all other fields ``None``. This + is what was observed on a real Polres Cimahi sprint where the OCR + produced 24 rows with only ``nama`` populated. + +This module provides a regex/heuristic fallback that operates directly on +the flat OCR text. It is deliberately conservative: a row must have BOTH a +recognizable Polri rank AND an 8-digit NRP to be emitted, so we never +generate the kind of "name-only" rows that motivated the fallback in the +first place. +""" + +from __future__ import annotations + +import re + +from ocr_sprint.data.master_pangkat import ( + PANGKAT_VARIANTS, + is_valid_pangkat, + normalize_pangkat, +) +from ocr_sprint.schemas.personnel import PersonnelEntry + +# Build a single alternation of all known rank tokens (longest first so multi- +# word ranks like "KOMBES POL" win over the single-word "KOMBES"). +_RANK_TOKENS: tuple[str, ...] 
= tuple( + sorted( + {variant for variants in PANGKAT_VARIANTS.values() for variant in variants}, + key=lambda v: -len(v), + ) +) +_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS) +# A line that contains a rank token followed (anywhere on the same line) by +# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or +# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match +# inside e.g. "ABRIPDA-style" text. +_RE_RANK_NRP_LINE = re.compile( + rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b", + re.IGNORECASE, +) +# A bare row number marker like "1." or "12)". OCR often puts it on its own +# line in tabular layouts. +_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$") +# Lines that should never be interpreted as a personnel name. These are +# section headers, OCR garbage anchors, and column header tokens. +_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( + "DASAR", + "PERIHAL", + "PERTIMBANGAN", + "DIPERINTAHKAN", + "KEPADA", + "UNTUK", + "TEMBUSAN", + "DIKELUARKAN", + "PADA TANGGAL", + "SELESAI", + "DAFTAR", + "LAMPIRAN", + "NOMOR", + "TANGGAL", + "KEPOLISIAN", + "DAERAH", + "RESOR", + "SEKTOR", + "MABES", + "SURAT PERINTAH", + "NRP", + "NIP", + "PANGKAT", + "JABATAN", + "NAMA", + "KETERANGAN", + "KET", + "NO", +) +# A name should look like a name: mostly letters, common punctuation, and +# at least one alphabetic character. Pure-numeric or pure-symbol lines are +# rejected. 
+_RE_NAME_OK = re.compile(r"[A-Za-z]") + + +def _is_plausible_name(line: str) -> bool: + """Return True iff ``line`` could plausibly be a personnel name.""" + stripped = line.strip() + if not stripped or not _RE_NAME_OK.search(stripped): + return False + upper = stripped.upper() + for prefix in _NAME_BLOCKLIST_PREFIXES: + if upper.startswith(prefix): + return False + if _RE_ROW_NUMBER.match(stripped): + return False + if _RE_RANK_NRP_LINE.search(stripped): + return False + # Reject lines that are nothing but a row number with extra punctuation + # ("1 .", "2)") which the bare-number regex above might miss. + return not re.fullmatch(r"[\s\d.)(\-]+", stripped) + + +def _following_jabatan(lines: list[str], idx: int) -> str | None: + """Collect 1-3 follow-up lines after the rank+NRP line as the jabatan. + + Stops at the next rank+NRP line, the next bare row-number line, or any + blocked prefix (section header / column header). + """ + parts: list[str] = [] + for fwd in range(idx + 1, min(idx + 4, len(lines))): + candidate = lines[fwd].strip() + if not candidate: + if parts: + break + continue + if _RE_RANK_NRP_LINE.search(candidate): + break + if _RE_ROW_NUMBER.match(candidate): + break + upper = candidate.upper() + if any(upper.startswith(p) for p in _NAME_BLOCKLIST_PREFIXES): + break + parts.append(candidate) + if not parts: + return None + joined = " ".join(parts) + return " ".join(joined.split()) or None + + +def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]: + """Best-effort personnel extraction from a flat OCR text stream. + + Strategy: + + 1. Iterate every line. Skip lines that don't contain both a known rank + and an 8-digit NRP (those are the only signal we trust). + 2. For each rank+NRP line, look back for the most recent plausible name + line, and forward 1-3 lines for jabatan content. + 3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp. 
+ + The fallback is intentionally rate-limited: the first matching rank + token on a line wins (no greedy multi-match per line), and a name line + can only be consumed once (so a stray ranked text inside a paragraph + doesn't turn into multiple bogus entries). + """ + lines = raw_text.splitlines() + consumed_names: set[int] = set() + rows: list[PersonnelEntry] = [] + + for idx, raw_line in enumerate(lines): + line = raw_line.strip() + match = _RE_RANK_NRP_LINE.search(line) + if not match: + continue + pangkat = normalize_pangkat(match.group("rank")) + if not pangkat or not is_valid_pangkat(pangkat): + continue + nrp = match.group("nrp") + + nama: str | None = None + for back in range(idx - 1, max(idx - 6, -1), -1): + if back in consumed_names: + continue + candidate = lines[back].strip() + if _is_plausible_name(candidate): + nama = candidate + consumed_names.add(back) + break + + jabatan = _following_jabatan(lines, idx) + rows.append( + PersonnelEntry( + no=None, + pangkat=pangkat, + nrp=nrp, + nama=nama, + jabatan_dinas=jabatan, + jabatan_sprint=None, + keterangan=None, + ) + ) + return rows + + +def is_low_quality(rows: list[PersonnelEntry]) -> bool: + """Heuristic: did PP-Structure produce useless rows? + + A row is useful when it has at least pangkat OR nrp. If most rows have + only ``nama`` (or worse, nothing) the table extraction failed and the + caller should retry with the text-based fallback. + """ + if not rows: + return True + useful = sum(1 for r in rows if r.pangkat or r.nrp) + # Require at least 30% of rows to carry rank/NRP signal. Below that we + # assume the column mapper degraded to "everything is nama" and prefer + # a fresh attempt. 
+ return useful / max(1, len(rows)) < 0.3 diff --git a/src/ocr_sprint/pipeline/extract/regex_rules.py b/src/ocr_sprint/pipeline/extract/regex_rules.py index 88e594f..d63786d 100644 --- a/src/ocr_sprint/pipeline/extract/regex_rules.py +++ b/src/ocr_sprint/pipeline/extract/regex_rules.py @@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile( re.IGNORECASE, ) -# Satuan penerbit usually appears in the document letterhead, prefixed by -# KEPOLISIAN . -_RE_SATUAN = re.compile( - r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)" - r"[^\n]{0,80}", +# Polri letterhead pieces. The full letterhead spans multiple lines that are +# often broken across separate OCR rows like: +# +# KEPOLISIAN NEGARA REPUBLIK INDONESIA +# DAERAH JAWA BARAT +# RESOR CIMAHI +# +# We capture each individual level so we can reconstruct the most-specific +# unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares +# about *which* unit issued the sprint, not just that some Polri unit did. +_RE_LEVEL_NEGARA = re.compile( + r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA", re.IGNORECASE, ) +_RE_LEVEL_DAERAH = re.compile( + r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)", + re.IGNORECASE | re.MULTILINE, +) +_RE_LEVEL_RESOR = re.compile( + r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)", + re.IGNORECASE | re.MULTILINE, +) +_RE_LEVEL_SEKTOR = re.compile( + r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)", + re.IGNORECASE | re.MULTILINE, +) +_RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE) # "Perihal : ...." up to end of line. _RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE) +# Many sprint docs (especially Polres-level) use 'Pertimbangan' as the +# single-paragraph rationale block instead of (or alongside) 'Perihal'. 
+# When `perihal` is missing we fall back to the first non-empty line under +# 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream +# audit trail still has *something* in the perihal slot. +_RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE) # A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..." _RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$") +# OCR sometimes splits the number from its content across two lines: +# 1. +# Undang-Undang Nomor 2 Tahun 2002 ... +# We detect a bare-number line and merge with the next non-empty line. +_RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$") +# Generic 'untuk' bullet — same shape as a dasar item. +_RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$") # Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits. _RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE) @@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None: return None +def _clean_unit_tail(tail: str) -> str: + """Strip trailing punctuation/noise from the captured place name.""" + return " ".join(tail.split()).strip(" .,;:'\"") + + def find_satuan(text: str) -> str | None: - """Return the first letterhead match (issuing unit), normalized.""" - match = _RE_SATUAN.search(text) - if not match: - return None - return " ".join(match.group(0).split()) + """Return the issuing unit, preferring the most-specific letterhead level. + + Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The + actual *issuing* unit is the deepest level present in the letterhead, not + the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. We scan + for each level independently and pick the most specific one available; + if only the generic Negara line is present we return that. + + Examples + -------- + >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n" + ... 
"DAERAH JAWA BARAT\\nRESOR CIMAHI") + 'KEPOLISIAN RESOR CIMAHI' + >>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA") + 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' + """ + # We only look at the document head — letterheads always sit at the + # very top, and constraining the search prevents false positives from + # body text like '... Polres Cimahi ...' deep in a paragraph. + head = "\n".join(text.splitlines()[:25]) + + sektor = _RE_LEVEL_SEKTOR.search(head) + if sektor: + return f"KEPOLISIAN SEKTOR {_clean_unit_tail(sektor.group(1))}" + resor = _RE_LEVEL_RESOR.search(head) + if resor: + return f"KEPOLISIAN RESOR {_clean_unit_tail(resor.group(1))}" + daerah = _RE_LEVEL_DAERAH.search(head) + if daerah: + return f"KEPOLISIAN DAERAH {_clean_unit_tail(daerah.group(1))}" + if _RE_LEVEL_MABES.search(head): + return "MABES POLRI" + if _RE_LEVEL_NEGARA.search(head): + return "KEPOLISIAN NEGARA REPUBLIK INDONESIA" + return None def find_perihal(text: str) -> str | None: - """Return the first 'Perihal: ...' line, trimmed to that line only.""" + """Return the first 'Perihal: ...' line, trimmed to that line only. + + Falls back to the first non-empty line under a 'Pertimbangan' label + (a common variant in Polres-level surat sprint that doesn't have a + distinct 'Perihal' field). We deliberately keep this in regex-land + rather than deferring to the LLM because the LLM tends to hallucinate + perihal content from arbitrary paragraphs. + """ for line in text.splitlines(): m = _RE_PERIHAL.search(line) if m: return m.group(1).strip() + + lines = text.splitlines() + for idx, line in enumerate(lines): + if _RE_PERTIMBANGAN_LABEL.match(line): + for follow in lines[idx + 1 : idx + 5]: + stripped = follow.strip(" :\t") + if stripped and stripped != ":": + return stripped + break return None +def _collect_numbered_section( + lines: list[str], + start_idx: int, + terminators: tuple[str, ...], +) -> list[str]: + """Walk forward from ``start_idx`` collecting numbered list items. 
+ + Robust to OCR splitting the number marker onto its own line: + '1.' -> buffer ``pending_index=1`` + next non-empty line starts the item body. + + Continuation lines (non-empty, no leading number, after a started item) + are appended to the current item. Stops at any line whose uppercase form + starts with one of ``terminators``. + """ + items: list[str] = [] + pending_marker = False + blank_run = 0 + for raw_line in lines[start_idx:]: + line = raw_line.strip() + upper = line.upper() + if any(upper.startswith(term) for term in terminators): + break + if not line: + blank_run += 1 + # Two consecutive blank lines reliably mark the end of a section. + # A single blank line is tolerated because OCR sprinkles them. + if blank_run >= 2 and items and not pending_marker: + break + continue + blank_run = 0 + bare = _RE_DASAR_BARE_NUMBER.match(line) + if bare: + pending_marker = True + continue + m = _RE_DASAR_ITEM.match(line) + if m: + items.append(m.group(2).strip()) + pending_marker = False + continue + if pending_marker: + items.append(line) + pending_marker = False + continue + if items: + items[-1] = (items[-1] + " " + line).strip() + return items + + def find_dasar_list(text: str) -> list[str]: """Extract numbered 'Dasar' items from the text. Heuristic: locate a line containing 'DASAR' (Indonesian: "DASAR :") and - collect subsequent lines that start with a number. Stops at a blank line - or a line beginning with another section header keyword. + delegate to ``_collect_numbered_section`` which handles three OCR + artefacts: + + 1. Inline numbered items: ``"1. Undang-Undang ..."``. + 2. Bare-number lines (the OCR engine puts the number alone on a line): + ``"1.\\n Undang-Undang ..."``. + 3. Continuation lines (a line that is the wrapped tail of the previous + item gets appended back onto it). 
""" lines = text.splitlines() - items: list[str] = [] - in_dasar = False section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL") - for raw_line in lines: - line = raw_line.strip() - if not in_dasar: - if re.match(r"^\s*DASAR\b", line, re.IGNORECASE): - in_dasar = True - continue - if not line: - if items: - break - continue - upper = line.upper() - if any(upper.startswith(term) for term in section_terminators): - break - m = _RE_DASAR_ITEM.match(line) - if m: - items.append(m.group(2).strip()) - elif items: - # continuation of the previous dasar item - items[-1] = (items[-1] + " " + line).strip() - return items + for idx, raw_line in enumerate(lines): + if re.match(r"^\s*DASAR\b", raw_line.strip(), re.IGNORECASE): + return _collect_numbered_section(lines, idx + 1, section_terminators) + return [] + + +def find_untuk_list(text: str) -> list[str]: + """Extract numbered 'Untuk' / 'DIPERINTAHKAN' bullets from the text. + + The 'Untuk' section follows 'DIPERINTAHKAN' / 'Kepada' and lists the + tasks assigned to the personnel. Same OCR shape as Dasar, so we reuse + the collector but with different terminators. + """ + lines = text.splitlines() + # Stop conditions: 'Selesai' (boilerplate), 'Dikeluarkan di' (signature + # block), 'Tembusan' (carbon-copy section). 
+ terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL") + for idx, raw_line in enumerate(lines): + if re.match(r"^\s*UNTUK\b", raw_line.strip(), re.IGNORECASE): + return _collect_numbered_section(lines, idx + 1, terminators) + return [] def find_signatory(text: str) -> Signatory: diff --git a/src/ocr_sprint/pipeline/extract/validators.py b/src/ocr_sprint/pipeline/extract/validators.py index 14d15ef..8b28586 100644 --- a/src/ocr_sprint/pipeline/extract/validators.py +++ b/src/ocr_sprint/pipeline/extract/validators.py @@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]: flags.append(ReviewFlag.INVALID_NRP) if entry.pangkat and not is_valid_pangkat(entry.pangkat): flags.append(ReviewFlag.UNKNOWN_PANGKAT) + # Identification of a personnel row requires at least pangkat OR nrp. + # A row carrying only a name is structurally incomplete - likely a + # mis-aligned table cell or a leaked tembusan/dasar fragment - and must + # be flagged for human review even though pangkat/nrp validation + # individually pass (because they're empty). 
+ if not entry.pangkat and not entry.nrp: + flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW) return flags diff --git a/src/ocr_sprint/pipeline/orchestrator.py b/src/ocr_sprint/pipeline/orchestrator.py index 231aec1..e0a0625 100644 --- a/src/ocr_sprint/pipeline/orchestrator.py +++ b/src/ocr_sprint/pipeline/orchestrator.py @@ -19,7 +19,15 @@ from ocr_sprint.llm.extractor import llm_fill_header from ocr_sprint.pipeline.confidence import compute_confidence, route from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct from ocr_sprint.pipeline.extract.personnel import extract_personnel -from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory +from ocr_sprint.pipeline.extract.personnel_text import ( + extract_personnel_from_text, + is_low_quality, +) +from ocr_sprint.pipeline.extract.regex_rules import ( + extract_header, + find_signatory, + find_untuk_list, +) from ocr_sprint.pipeline.extract.validators import validate_extraction from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest from ocr_sprint.pipeline.ocr import OCRPage, run_ocr @@ -112,6 +120,7 @@ def run_pipeline(content: bytes) -> PipelineOutput: header = merged personel: list[PersonnelEntry] = [] + table_flags: list[ReviewFlag] = [] if s.tables_enabled and cleaned_pages: all_tables: list[DetectedTable] = [] for img in cleaned_pages: @@ -126,14 +135,33 @@ def run_pipeline(content: bytes) -> PipelineOutput: personel_rows=len(personel), ) - initial_flags: list[ReviewFlag] = list(llm_flags) + # Text-based fallback: PP-Structure can succeed structurally but emit + # rows with only ``nama`` populated (column mapper degraded), or fail to + # detect the table at all. In both cases the regex fallback that scans + # raw OCR for rank+NRP pairs produces a much more useful result. 
We + # always run it when the structured path is empty or low-quality, and + # raise a review flag so the operator knows the document didn't go + # through the preferred path. + if is_low_quality(personel): + fallback_rows = extract_personnel_from_text(full_text) + if fallback_rows: + personel = fallback_rows + table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK) + _logger.info( + "pipeline.personnel_text_fallback", + fallback_rows=len(fallback_rows), + ) + + untuk_items = find_untuk_list(full_text) + + initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags) if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD: initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE) result = ExtractionResult( header=header, personel=personel, - untuk=[], + untuk=untuk_items, ttd=ttd, raw_text=full_text, confidence=mean_ocr_conf, diff --git a/src/ocr_sprint/schemas/extraction.py b/src/ocr_sprint/schemas/extraction.py index 5a3cdb0..252d1db 100644 --- a/src/ocr_sprint/schemas/extraction.py +++ b/src/ocr_sprint/schemas/extraction.py @@ -21,6 +21,8 @@ class ReviewFlag(str, Enum): DATE_PARSE_FAILED = "date_parse_failed" LLM_FALLBACK = "llm_fallback" LLM_UNAVAILABLE = "llm_unavailable" + PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback" + INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row" class Signatory(BaseModel): diff --git a/tests/unit/test_orchestrator_llm.py b/tests/unit/test_orchestrator_llm.py index d56af3c..af06ba3 100644 --- a/tests/unit/test_orchestrator_llm.py +++ b/tests/unit/test_orchestrator_llm.py @@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none( out = run_pipeline(b"%PDF-1.4\n%fake") assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags + + +def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When PP-Structure produces low-quality rows (e.g. 
only ``nama`` filled), + the orchestrator must run the text fallback against the raw OCR text and + raise the ``personnel_text_fallback`` flag. + """ + monkeypatch.setenv("LLM_ENABLED", "false") + from ocr_sprint.config import get_settings + + get_settings.cache_clear() + + raw_text = ( + "DAFTAR PERSONIL\n" + "1.\n" + "SRI WAHYUNI\n" + "AIPTU / 75070328\n" + "INTELKAM POLRES CIMAHI\n" + "2.\n" + "AGUNG LUKMAN\n" + "BRIPTU / 99030245\n" + "SAT INTELKAM\n" + ) + + # PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on + # the real Polres Cimahi document). + from ocr_sprint.schemas.personnel import PersonnelEntry + + pp_structure_low_quality = [ + PersonnelEntry(nama="SRI WAHYUNI"), + PersonnelEntry(nama="AGUNG LUKMAN"), + ] + _stub_pipeline_stages( + monkeypatch, + raw_text=raw_text, + regex_header=HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + satuan_penerbit="Polres Cimahi", + perihal="ok", + dasar=["UU 2/2002"], + ), + ) + # Override extract_personnel to return the broken PP-Structure rows. + monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality) + + out = run_pipeline(b"%PDF-1.4\n%fake") + assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags + # Fallback rows must carry pangkat + nrp (the whole point of the path). + assert all(r.pangkat and r.nrp for r in out.result.personel) + assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"} + + +def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Healthy PP-Structure output (rank+nrp present on most rows) must NOT + be replaced by the text fallback. 
+ """ + monkeypatch.setenv("LLM_ENABLED", "false") + from ocr_sprint.config import get_settings + + get_settings.cache_clear() + + from ocr_sprint.schemas.personnel import PersonnelEntry + + healthy = [ + PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"), + PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"), + PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"), + ] + _stub_pipeline_stages( + monkeypatch, + raw_text="ignored — should not be parsed", + regex_header=HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + satuan_penerbit="Polres X", + perihal="ok", + dasar=["UU 2/2002"], + ), + ) + monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy) + + out = run_pipeline(b"%PDF-1.4\n%fake") + assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags + assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"] diff --git a/tests/unit/test_personnel_text_fallback.py b/tests/unit/test_personnel_text_fallback.py new file mode 100644 index 0000000..95cdd26 --- /dev/null +++ b/tests/unit/test_personnel_text_fallback.py @@ -0,0 +1,118 @@ +"""Tests for the text-based personnel fallback extractor. + +Driven by the real Polres Cimahi sprint document where PP-Structure +produced 24 rows with only ``nama`` populated. The fallback should +recover at least the rank + NRP for every row. +""" + +from __future__ import annotations + +from ocr_sprint.pipeline.extract.personnel_text import ( + extract_personnel_from_text, + is_low_quality, +) +from ocr_sprint.schemas.personnel import PersonnelEntry + +_CIMAHI_FIXTURE = """\ +DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024 +NO +NAMA +PANGKAT / NRP +JABATAN +KET +BAUR SKCK SAT +1. +SRI WAHYUNI +AIPTU / 75070328 +INTELKAM POLRES +CIMAHI +BA PELAKSANA SKCK +2. +CITRA DWI PUTRI R +BRIPTU / 95070659 + SAT INTELKAM +POLRES CIMAHI +BA PELAKSANA SKCK +3. 
+AGUNG LUKMAN AL +BRIPTU / 99030245 +SAT INTELKAM +POLRES CIMAHI +BA POLSEK +8. +ARIEF SYAHRUL ZAMAN +BRIGPOL /96030446 +MARGAASIH +""" + + +class TestExtractPersonnelFromText: + def test_extracts_rank_nrp_and_name(self) -> None: + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + assert len(rows) == 4 + first = rows[0] + assert first.pangkat == "AIPTU" + assert first.nrp == "75070328" + assert first.nama == "SRI WAHYUNI" + + def test_normalizes_brigpol_to_brigadir(self) -> None: + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + last = rows[-1] + # 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'. + assert last.pangkat == "BRIGADIR" + assert last.nrp == "96030446" + assert last.nama == "ARIEF SYAHRUL ZAMAN" + + def test_skips_header_lines_as_names(self) -> None: + # No row should ever have a column-header word as nama. + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + names = [r.nama for r in rows] + for blocked in {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}: + assert blocked not in names + + def test_jabatan_collected_from_following_lines(self) -> None: + rows = extract_personnel_from_text(_CIMAHI_FIXTURE) + assert rows[0].jabatan_dinas is not None + assert "INTELKAM" in rows[0].jabatan_dinas + + def test_empty_text_returns_empty(self) -> None: + assert extract_personnel_from_text("") == [] + + def test_text_without_rank_nrp_pattern_returns_empty(self) -> None: + text = "Just a paragraph with no rank or NRP at all.\nAnother line." + assert extract_personnel_from_text(text) == [] + + def test_ignores_isolated_8digit_number_without_rank(self) -> None: + # NRP without a recognised rank token must not produce a row. + text = "Some line\n12345678\nanother line" + assert extract_personnel_from_text(text) == [] + + def test_rejects_unknown_rank_with_8digit_number(self) -> None: + # A "rank-shaped" word that isn't in the master list must not yield a row. 
+ text = "Some line\nFAKERANK / 12345678\nanother line" + assert extract_personnel_from_text(text) == [] + + +class TestIsLowQuality: + def test_empty_list_is_low_quality(self) -> None: + assert is_low_quality([]) is True + + def test_all_rows_with_only_name_is_low_quality(self) -> None: + rows = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)] + assert is_low_quality(rows) is True + + def test_majority_with_rank_nrp_is_high_quality(self) -> None: + rows = [ + PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}") + for i in range(10) + ] + assert is_low_quality(rows) is False + + def test_borderline_30_percent_threshold(self) -> None: + # 3 useful out of 10 = exactly 0.3, treated as not-low-quality. + useful = [ + PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}") + for i in range(3) + ] + useless = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)] + assert is_low_quality(useful + useless) is False diff --git a/tests/unit/test_regex_rules.py b/tests/unit/test_regex_rules.py index 3cd7855..6efca49 100644 --- a/tests/unit/test_regex_rules.py +++ b/tests/unit/test_regex_rules.py @@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import ( find_satuan, find_signatory, find_tanggal, + find_untuk_list, ) @@ -60,6 +61,36 @@ class TestSatuan: result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA") assert result is not None + def test_prefers_resor_over_negara_when_both_present(self) -> None: + # The Polri letterhead lists units hierarchically; the issuing unit + # is the deepest level, not the topmost generic "NEGARA" line. 
+ text = ( + "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n" + "DAERAH JAWA BARAT\n" + "RESOR CIMAHI\n" + "SURAT PERINTAH\n" + ) + result = find_satuan(text) + assert result == "KEPOLISIAN RESOR CIMAHI" + + def test_prefers_sektor_over_resor(self) -> None: + text = ( + "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n" + "DAERAH JAWA BARAT\n" + "RESOR CIMAHI\n" + "SEKTOR PADALARANG\n" + ) + result = find_satuan(text) + assert result == "KEPOLISIAN SEKTOR PADALARANG" + + def test_handles_daerah_only(self) -> None: + text = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n" + result = find_satuan(text) + assert result == "KEPOLISIAN DAERAH JAWA BARAT" + + def test_returns_none_when_no_letterhead(self) -> None: + assert find_satuan("no police letterhead here") is None + class TestPerihal: def test_extracts_perihal_line(self) -> None: @@ -69,6 +100,25 @@ class TestPerihal: def test_returns_none_when_absent(self) -> None: assert find_perihal("no perihal field") is None + def test_falls_back_to_pertimbangan_block(self) -> None: + # Many Polres-level sprints use "Pertimbangan" instead of "Perihal". + # The fallback should pick up the first non-empty line under it. + text = ( + "Pertimbangan\n" + "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n" + "DASAR :\n" + "1. ...\n" + ) + result = find_perihal(text) + assert result is not None + assert result.startswith("Bahwa dalam rangka mendukung") + + def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None: + # If the document has both a Perihal label AND a Pertimbangan + # paragraph, the explicit Perihal wins. + text = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n" + assert find_perihal(text) == "The actual perihal." 
+ class TestDasar: def test_numbered_list(self) -> None: @@ -88,6 +138,57 @@ class TestDasar: def test_empty_when_section_missing(self) -> None: assert find_dasar_list("no dasar section") == [] + def test_handles_bare_number_lines_split_by_ocr(self) -> None: + # OCR sometimes places the number marker on its own line and the + # body on the next non-empty line. The collector must merge them + # rather than dropping the body or appending it to the previous + # item (which the old implementation did). + text = ( + "Dasar\n" + ":\n" + "1.\n" + " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n" + "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n" + "3.\n" + "Keterangan Catatan Kepolisian (SKCK);\n" + "4.\n" + "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n" + "5. DIPA Petikan Satker Polres Cimahi.\n" + "DIPERINTAHKAN\n" + ) + items = find_dasar_list(text) + assert len(items) == 5 + assert items[0].startswith("Undang - Undang") + assert items[2].startswith("Keterangan Catatan") + assert items[3].startswith("Pelayanan dilingkungan") + assert items[4].startswith("DIPA") + + +class TestUntuk: + def test_extracts_numbered_untuk_bullets(self) -> None: + text = ( + "DIPERINTAHKAN\n" + "Kepada\n" + "Untuk\n" + "1.\n" + "melaksanakan tugas A;\n" + "2.\n" + "melaksanakan tugas B;\n" + "Selesai.\n" + ) + items = find_untuk_list(text) + assert len(items) == 2 + assert items[0] == "melaksanakan tugas A;" + assert items[1] == "melaksanakan tugas B;" + + def test_returns_empty_when_section_missing(self) -> None: + assert find_untuk_list("no untuk section") == [] + + def test_stops_at_dikeluarkan(self) -> None: + text = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. 
should not be captured\n" + items = find_untuk_list(text) + assert items == ["tugas A;"] + class TestSignatory: def test_extracts_last_nrp(self) -> None: diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py index 6ff0dcd..2313d2b 100644 --- a/tests/unit/test_validators.py +++ b/tests/unit/test_validators.py @@ -62,6 +62,20 @@ class TestPersonnelValidator: entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test") assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry) + def test_row_with_only_name_is_flagged_incomplete(self) -> None: + # A row that captured only `nama` (no pangkat AND no nrp) is the + # signature of mis-aligned table extraction. Must be flagged so + # the operator routes the document to needs_review. + entry = PersonnelEntry(nama="LEAKED FROM SOMEWHERE") + flags = validate_personnel_entry(entry) + assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in flags + + def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None: + # Having pangkat without NRP is suboptimal but still identifies a + # rank, so we don't raise the structural-incompleteness flag. + entry = PersonnelEntry(pangkat="AKP", nama="Test") + assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in validate_personnel_entry(entry) + class TestHeaderValidator: def test_complete_header_no_flags(self) -> None: