This fixes 4 bugs found on a real Polres Cimahi SPRIN PDF (items 1-4) and adds 2 related improvements (items 5-6):
1. satuan_penerbit captured the generic 'KEPOLISIAN NEGARA REPUBLIK
INDONESIA' letterhead line instead of the most-specific issuing unit
(e.g. RESOR CIMAHI / SEKTOR PADALARANG). Reworked find_satuan to
scan for each level independently and return the deepest available.
2. find_dasar_list dropped numbered items when OCR put the marker on
its own line ("1.\n Undang-Undang ..."). Refactored into
_collect_numbered_section that buffers a bare-number line and uses
the next non-empty line as the body. Also reused for the new
find_untuk_list which extracts the previously-empty 'untuk' bullets.
3. find_perihal returned None for documents that use 'Pertimbangan'
instead of 'Perihal' (very common in Polres-level sprints), forcing
the LLM to guess. Added a regex fallback that picks up the first line
under a 'Pertimbangan' label so we keep extraction deterministic.
4. Personnel rows were emitted with only nama populated when
PP-Structure detected a table but the column mapper degraded.
Added a text-based fallback (extract_personnel_from_text) that
scans raw OCR for <rank> + <8-digit NRP> patterns. Triggered when
the PP-Structure result has fewer than 30% rank/NRP-bearing rows.
Such documents are marked for review by raising the new PERSONNEL_TEXT_FALLBACK flag.
5. Validation now flags rows with neither pangkat nor nrp as
INCOMPLETE_PERSONNEL_ROW, so the document routes to needs_review
even when individual nrp/pangkat checks pass on empty values.
6. Added 'BRIGPOL' as a variant of BRIGADIR (seen in real scans).
Tests: 229 (was 203) — 26 new tests covering the regex fixes,
text-based personnel extractor, low-quality detector, validator
behaviour, and orchestrator wiring of the fallback path.
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
214 lines
7.3 KiB
Python
214 lines
7.3 KiB
Python
"""Tests for regex-based header extraction."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
|
|
from ocr_sprint.pipeline.extract.regex_rules import (
|
|
extract_header,
|
|
find_dasar_list,
|
|
find_nomor_sprint,
|
|
find_perihal,
|
|
find_satuan,
|
|
find_signatory,
|
|
find_tanggal,
|
|
find_untuk_list,
|
|
)
|
|
|
|
|
|
class TestNomorSprint:
    """Tests for ``find_nomor_sprint`` across several number layouts."""

    @pytest.mark.parametrize(
        ("text", "needle"),
        [
            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
            ("Nomor: SPRIN / 7 / I / 2024", "7"),
            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
        ],
    )
    def test_finds_nomor(self, text: str, needle: str) -> None:
        """The match contains the sequence number and starts with SPRIN."""
        nomor = find_nomor_sprint(text)
        assert nomor is not None
        assert needle in nomor
        # Case-insensitive prefix check: inputs use both "Sprin" and "SPRIN".
        assert nomor.upper().startswith("SPRIN")

    def test_returns_none_when_absent(self) -> None:
        """Text without a sprint number yields ``None``."""
        nomor = find_nomor_sprint("no nomor here, just some text")
        assert nomor is None
|
|
|
|
|
|
class TestTanggal:
    """Tests for ``find_tanggal`` date parsing."""

    def test_basic_date(self) -> None:
        """A "<city>, <day> <month> <year>" line parses to a ``date``."""
        parsed = find_tanggal("Bandung, 21 April 2025")
        assert parsed == date(2025, 4, 21)

    def test_with_dashes(self) -> None:
        """Dash-separated day, month and year are accepted."""
        parsed = find_tanggal("Tanggal 1 - Desember - 2024")
        assert parsed == date(2024, 12, 1)

    def test_invalid_month(self) -> None:
        """An unknown month name yields ``None``."""
        parsed = find_tanggal("21 Foo 2025")
        assert parsed is None

    def test_no_date_present(self) -> None:
        """Text without any date yields ``None``."""
        parsed = find_tanggal("nothing here")
        assert parsed is None
|
|
|
|
|
|
class TestSatuan:
    """Tests for ``find_satuan`` issuing-unit detection."""

    def test_polres(self) -> None:
        """A single "KEPOLISIAN RESOR ..." line is picked up."""
        satuan = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
        assert satuan is not None
        assert "RESOR BANDUNG" in satuan.upper()

    def test_polri_pusat(self) -> None:
        """The national letterhead line alone still produces a result."""
        satuan = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert satuan is not None

    def test_prefers_resor_over_negara_when_both_present(self) -> None:
        """The deepest unit (RESOR) beats the generic NEGARA line."""
        # The Polri letterhead lists units hierarchically; the issuing unit
        # is the deepest level, not the topmost generic "NEGARA" line.
        letterhead = (
            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
            "DAERAH JAWA BARAT\n"
            "RESOR CIMAHI\n"
            "SURAT PERINTAH\n"
        )
        satuan = find_satuan(letterhead)
        assert satuan == "KEPOLISIAN RESOR CIMAHI"

    def test_prefers_sektor_over_resor(self) -> None:
        """SEKTOR is returned when it appears below RESOR."""
        letterhead = (
            "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
            "DAERAH JAWA BARAT\n"
            "RESOR CIMAHI\n"
            "SEKTOR PADALARANG\n"
        )
        satuan = find_satuan(letterhead)
        assert satuan == "KEPOLISIAN SEKTOR PADALARANG"

    def test_handles_daerah_only(self) -> None:
        """With only a DAERAH line below NEGARA, the DAERAH unit is returned."""
        letterhead = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n"
        satuan = find_satuan(letterhead)
        assert satuan == "KEPOLISIAN DAERAH JAWA BARAT"

    def test_returns_none_when_no_letterhead(self) -> None:
        """Text without any police letterhead yields ``None``."""
        satuan = find_satuan("no police letterhead here")
        assert satuan is None
|
|
|
|
|
|
class TestPerihal:
    """Tests for ``find_perihal``, including the Pertimbangan fallback."""

    def test_extracts_perihal_line(self) -> None:
        """The text after the "PERIHAL :" label is returned."""
        document = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
        perihal = find_perihal(document)
        assert perihal == "Pelaksanaan penyelidikan kasus."

    def test_returns_none_when_absent(self) -> None:
        """Text without a Perihal field yields ``None``."""
        perihal = find_perihal("no perihal field")
        assert perihal is None

    def test_falls_back_to_pertimbangan_block(self) -> None:
        """Without a Perihal label, the line under "Pertimbangan" is used."""
        # Many Polres-level sprints use "Pertimbangan" instead of "Perihal".
        # The fallback should pick up the first non-empty line under it.
        document = (
            "Pertimbangan\n"
            "Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n"
            "DASAR :\n"
            "1. ...\n"
        )
        perihal = find_perihal(document)
        assert perihal is not None
        assert perihal.startswith("Bahwa dalam rangka mendukung")

    def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
        """An explicit Perihal label takes precedence over Pertimbangan."""
        # If the document has both a Perihal label AND a Pertimbangan
        # paragraph, the explicit Perihal wins.
        document = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n"
        perihal = find_perihal(document)
        assert perihal == "The actual perihal."
|
|
|
|
|
|
class TestDasar:
    """Tests for ``find_dasar_list`` numbered-item extraction."""

    def test_numbered_list(self) -> None:
        """Inline "N. body" items under DASAR are returned in order."""
        document = (
            "DASAR :\n"
            "1. UU No 2 Tahun 2002.\n"
            "2. Peraturan Kapolri Nomor 6.\n"
            "\n"
            "DIPERINTAHKAN :\n"
            "Kepada : ...\n"
        )
        dasar = find_dasar_list(document)
        assert len(dasar) == 2
        assert dasar[0].startswith("UU No 2")
        assert dasar[1].startswith("Peraturan Kapolri")

    def test_empty_when_section_missing(self) -> None:
        """Text without a DASAR section yields an empty list."""
        dasar = find_dasar_list("no dasar section")
        assert dasar == []

    def test_handles_bare_number_lines_split_by_ocr(self) -> None:
        """A bare "N." line is merged with the body on the next line."""
        # OCR sometimes places the number marker on its own line and the
        # body on the next non-empty line. The collector must merge them
        # rather than dropping the body or appending it to the previous
        # item (which the old implementation did).
        document = (
            "Dasar\n"
            ":\n"
            "1.\n"
            " Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n"
            "2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n"
            "3.\n"
            "Keterangan Catatan Kepolisian (SKCK);\n"
            "4.\n"
            "Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n"
            "5. DIPA Petikan Satker Polres Cimahi.\n"
            "DIPERINTAHKAN\n"
        )
        dasar = find_dasar_list(document)
        assert len(dasar) == 5
        assert dasar[0].startswith("Undang - Undang")
        assert dasar[2].startswith("Keterangan Catatan")
        assert dasar[3].startswith("Pelayanan dilingkungan")
        assert dasar[4].startswith("DIPA")
|
|
|
|
|
|
class TestUntuk:
    """Tests for ``find_untuk_list`` bullet extraction."""

    def test_extracts_numbered_untuk_bullets(self) -> None:
        """Bare-number bullets under "Untuk" are paired with their bodies."""
        document = (
            "DIPERINTAHKAN\n"
            "Kepada\n"
            "Untuk\n"
            "1.\n"
            "melaksanakan tugas A;\n"
            "2.\n"
            "melaksanakan tugas B;\n"
            "Selesai.\n"
        )
        untuk = find_untuk_list(document)
        assert len(untuk) == 2
        assert untuk[0] == "melaksanakan tugas A;"
        assert untuk[1] == "melaksanakan tugas B;"

    def test_returns_empty_when_section_missing(self) -> None:
        """Text without an Untuk section yields an empty list."""
        untuk = find_untuk_list("no untuk section")
        assert untuk == []

    def test_stops_at_dikeluarkan(self) -> None:
        """Items after the "Dikeluarkan di ..." line are not collected."""
        document = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. should not be captured\n"
        untuk = find_untuk_list(document)
        assert untuk == ["tugas A;"]
|
|
|
|
|
|
class TestSignatory:
    """Tests for the ``nrp`` attribute of ``find_signatory`` results."""

    def test_extracts_last_nrp(self) -> None:
        """With several 8-digit numbers present, the final NRP is reported."""
        document = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
        signatory = find_signatory(document)
        assert signatory.nrp == "11223344"

    def test_no_nrp(self) -> None:
        """Without any NRP the attribute is ``None``."""
        signatory = find_signatory("no NRP here")
        assert signatory.nrp is None
|
|
|
|
|
|
class TestExtractHeader:
    """End-to-end test for ``extract_header`` on the sample document."""

    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
        """Every header field is populated from the ``sample_sprint_text`` fixture."""
        parsed = extract_header(sample_sprint_text)

        # Sprint number
        assert parsed.nomor_sprint is not None
        assert "Sprin" in parsed.nomor_sprint

        # Issue date
        assert parsed.tanggal == date(2025, 4, 21)

        # Issuing unit
        assert parsed.satuan_penerbit is not None
        assert "KEPOLISIAN" in parsed.satuan_penerbit.upper()

        # Subject
        assert parsed.perihal is not None
        assert "penyelidikan" in parsed.perihal.lower()

        # Legal-basis items
        assert len(parsed.dasar) == 3
|