diff --git a/src/ocr_sprint/pipeline/extract/personnel_text.py b/src/ocr_sprint/pipeline/extract/personnel_text.py index 4360a80..5e37984 100644 --- a/src/ocr_sprint/pipeline/extract/personnel_text.py +++ b/src/ocr_sprint/pipeline/extract/personnel_text.py @@ -50,8 +50,15 @@ _RE_RANK_NRP_LINE = re.compile( # line in tabular layouts. _RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$") # Lines that should never be interpreted as a personnel name. These are -# section headers, OCR garbage anchors, and column header tokens. -_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( +# section headers, OCR garbage anchors, and column header tokens. We match +# them with a *word-boundary* regex (built from this list) rather than a +# bare ``startswith`` check, because short tokens like ``"NO"`` and +# ``"KET"`` would otherwise reject perfectly valid Indonesian names +# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the +# latter being an extremely common Balinese birth-order name). +_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = ( + "PADA TANGGAL", # multi-word entries first so they win the alternation + "SURAT PERINTAH", "DASAR", "PERIHAL", "PERTIMBANGAN", @@ -60,7 +67,6 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "UNTUK", "TEMBUSAN", "DIKELUARKAN", - "PADA TANGGAL", "SELESAI", "DAFTAR", "LAMPIRAN", @@ -71,7 +77,6 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "RESOR", "SEKTOR", "MABES", - "SURAT PERINTAH", "NRP", "NIP", "PANGKAT", @@ -81,6 +86,10 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "KET", "NO", ) +_RE_NAME_BLOCKLIST = re.compile( + r"^(?:" + "|".join(re.escape(tok) for tok in _NAME_BLOCKLIST_TOKENS) + r")\b", + re.IGNORECASE, +) # A name should look like a name: mostly letters, common punctuation, and # at least one alphabetic character. Pure-numeric or pure-symbol lines are # rejected. @@ -92,10 +101,8 @@ def _is_plausible_name(line: str) -> bool: stripped = line.strip() if not stripped or not _RE_NAME_OK.search(stripped): return False - upper = stripped.upper() - for prefix in _NAME_BLOCKLIST_PREFIXES: - if upper.startswith(prefix): - return False + if _RE_NAME_BLOCKLIST.match(stripped): + return False if _RE_ROW_NUMBER.match(stripped): return False if _RE_RANK_NRP_LINE.search(stripped): @@ -122,8 +129,7 @@ def _following_jabatan(lines: list[str], idx: int) -> str | None: break if _RE_ROW_NUMBER.match(candidate): break - upper = candidate.upper() - if any(upper.startswith(p) for p in _NAME_BLOCKLIST_PREFIXES): + if _RE_NAME_BLOCKLIST.match(candidate): break parts.append(candidate) if not parts: diff --git a/tests/unit/test_personnel_text_fallback.py b/tests/unit/test_personnel_text_fallback.py index 95cdd26..884f99c 100644 --- a/tests/unit/test_personnel_text_fallback.py +++ b/tests/unit/test_personnel_text_fallback.py @@ -92,6 +92,37 @@ class TestExtractPersonnelFromText: text = "Some line\nFAKERANK / 12345678\nanother line" assert extract_personnel_from_text(text) == [] + def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None: + # Regression: 'NO' / 'KET' are legitimate column header tokens but + # also prefix common Indonesian names (KETUT, NOVA, NOOR). The + # blocklist must use word boundaries, not a raw startswith check. + text = ( + "DAFTAR PERSONIL\n" + "1.\n" + "KETUT WARDANA\n" + "AIPTU / 11111111\n" + "JABATAN A\n" + "2.\n" + "NOVA SARI\n" + "BRIPTU / 22222222\n" + "JABATAN B\n" + "3.\n" + "NOOR HIDAYAT\n" + "BRIPDA / 33333333\n" + "JABATAN C\n" + ) + rows = extract_personnel_from_text(text) + names = [r.nama for r in rows] + assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"] + + def test_still_blocks_bare_column_header_tokens(self) -> None: + # Word-boundary fix must still reject the actual column-header + # rows that motivated the blocklist in the first place. + text = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n" + rows = extract_personnel_from_text(text) + assert len(rows) == 1 + assert rows[0].nama == "REAL NAME" + class TestIsLowQuality: def test_empty_list_is_low_quality(self) -> None: