From 737f4999dd8be895c026beb840bf208cdac412a4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 26 Apr 2026 05:46:21 +0000 Subject: [PATCH] Use word-boundary matching for personnel name blocklist Devin Review correctly flagged that the bare "NO" and "KET" entries in the blocklist would silently drop common Indonesian names (KETUT, NOVA, NOOR, NORMAN, NOVIANTI, ...) because the check used startswith rather than a word boundary. Replaced the per-prefix loop with a single compiled regex anchored at ^ with a trailing \b, which still matches column headers like "NO" or "KET" on their own line but no longer rejects "NOOR HIDAYAT" or "KETUT WARDANA". Also fixes the same bug in _following_jabatan. Added two regression tests covering both directions: names starting with the offending tokens are kept, bare column headers still rejected. Co-Authored-By: adrian kuman firmansah --- .../pipeline/extract/personnel_text.py | 26 ++++++++++------ tests/unit/test_personnel_text_fallback.py | 31 +++++++++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/ocr_sprint/pipeline/extract/personnel_text.py b/src/ocr_sprint/pipeline/extract/personnel_text.py index 4360a80..5e37984 100644 --- a/src/ocr_sprint/pipeline/extract/personnel_text.py +++ b/src/ocr_sprint/pipeline/extract/personnel_text.py @@ -50,8 +50,15 @@ _RE_RANK_NRP_LINE = re.compile( # line in tabular layouts. _RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$") # Lines that should never be interpreted as a personnel name. These are -# section headers, OCR garbage anchors, and column header tokens. -_NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( +# section headers, OCR garbage anchors, and column header tokens. 
We match +# them with a *word-boundary* regex (built from this list) rather than a +# bare ``startswith`` check, because short tokens like ``"NO"`` and +# ``"KET"`` would otherwise reject perfectly valid Indonesian names +# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the +# last being an extremely common Balinese birth-order name). +_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = ( + "PADA TANGGAL", # multi-word entries first so they win the alternation + "SURAT PERINTAH", "DASAR", "PERIHAL", "PERTIMBANGAN", @@ -60,7 +67,6 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "UNTUK", "TEMBUSAN", "DIKELUARKAN", - "PADA TANGGAL", "SELESAI", "DAFTAR", "LAMPIRAN", @@ -71,7 +77,6 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "RESOR", "SEKTOR", "MABES", - "SURAT PERINTAH", "NRP", "NIP", "PANGKAT", @@ -81,6 +86,10 @@ _NAME_BLOCKLIST_PREFIXES: tuple[str, ...] = ( "KET", "NO", ) +_RE_NAME_BLOCKLIST = re.compile( + r"^(?:" + "|".join(re.escape(tok) for tok in _NAME_BLOCKLIST_TOKENS) + r")\b", + re.IGNORECASE, +) # A name should look like a name: mostly letters, common punctuation, and # at least one alphabetic character. Pure-numeric or pure-symbol lines are # rejected. 
@@ -92,10 +101,8 @@ def _is_plausible_name(line: str) -> bool: stripped = line.strip() if not stripped or not _RE_NAME_OK.search(stripped): return False - upper = stripped.upper() - for prefix in _NAME_BLOCKLIST_PREFIXES: - if upper.startswith(prefix): - return False + if _RE_NAME_BLOCKLIST.match(stripped): + return False if _RE_ROW_NUMBER.match(stripped): return False if _RE_RANK_NRP_LINE.search(stripped): @@ -122,8 +129,7 @@ def _following_jabatan(lines: list[str], idx: int) -> str | None: break if _RE_ROW_NUMBER.match(candidate): break - upper = candidate.upper() - if any(upper.startswith(p) for p in _NAME_BLOCKLIST_PREFIXES): + if _RE_NAME_BLOCKLIST.match(candidate): break parts.append(candidate) if not parts: diff --git a/tests/unit/test_personnel_text_fallback.py b/tests/unit/test_personnel_text_fallback.py index 95cdd26..884f99c 100644 --- a/tests/unit/test_personnel_text_fallback.py +++ b/tests/unit/test_personnel_text_fallback.py @@ -92,6 +92,37 @@ class TestExtractPersonnelFromText: text = "Some line\nFAKERANK / 12345678\nanother line" assert extract_personnel_from_text(text) == [] + def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None: + # Regression: 'NO' / 'KET' are legitimate column header tokens but + # also prefix common Indonesian names (KETUT, NOVA, NOOR). The + # blocklist must use word boundaries, not a raw startswith check. + text = ( + "DAFTAR PERSONIL\n" + "1.\n" + "KETUT WARDANA\n" + "AIPTU / 11111111\n" + "JABATAN A\n" + "2.\n" + "NOVA SARI\n" + "BRIPTU / 22222222\n" + "JABATAN B\n" + "3.\n" + "NOOR HIDAYAT\n" + "BRIPDA / 33333333\n" + "JABATAN C\n" + ) + rows = extract_personnel_from_text(text) + names = [r.nama for r in rows] + assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"] + + def test_still_blocks_bare_column_header_tokens(self) -> None: + # Word-boundary fix must still reject the actual column-header + # rows that motivated the blocklist in the first place. 
+ text = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n" + rows = extract_personnel_from_text(text) + assert len(rows) == 1 + assert rows[0].nama == "REAL NAME" + class TestIsLowQuality: def test_empty_list_is_low_quality(self) -> None: