"""Tests for regex-based header extraction.""" from __future__ import annotations from datetime import date import pytest from ocr_sprint.pipeline.extract.regex_rules import ( extract_header, find_dasar_list, find_nomor_sprint, find_perihal, find_satuan, find_signatory, find_tanggal, ) class TestNomorSprint: @pytest.mark.parametrize( ("text", "needle"), [ ("Nomor : Sprin/123/IV/2025/Reskrim", "123"), ("Nomor: SPRIN / 7 / I / 2024", "7"), ("...Sprin-345-X-2024-Sat Intelkam...", "345"), ], ) def test_finds_nomor(self, text: str, needle: str) -> None: result = find_nomor_sprint(text) assert result is not None assert needle in result assert result.upper().startswith("SPRIN") def test_returns_none_when_absent(self) -> None: assert find_nomor_sprint("no nomor here, just some text") is None class TestTanggal: def test_basic_date(self) -> None: assert find_tanggal("Bandung, 21 April 2025") == date(2025, 4, 21) def test_with_dashes(self) -> None: assert find_tanggal("Tanggal 1 - Desember - 2024") == date(2024, 12, 1) def test_invalid_month(self) -> None: assert find_tanggal("21 Foo 2025") is None def test_no_date_present(self) -> None: assert find_tanggal("nothing here") is None class TestSatuan: def test_polres(self) -> None: result = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya") assert result is not None assert "RESOR BANDUNG" in result.upper() def test_polri_pusat(self) -> None: result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA") assert result is not None class TestPerihal: def test_extracts_perihal_line(self) -> None: text = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore" assert find_perihal(text) == "Pelaksanaan penyelidikan kasus." def test_returns_none_when_absent(self) -> None: assert find_perihal("no perihal field") is None class TestDasar: def test_numbered_list(self) -> None: text = ( "DASAR :\n" "1. UU No 2 Tahun 2002.\n" "2. Peraturan Kapolri Nomor 6.\n" "\n" "DIPERINTAHKAN :\n" "Kepada : ...\n" ) items = find_dasar_list(text) assert len(items) == 2 assert items[0].startswith("UU No 2") assert items[1].startswith("Peraturan Kapolri") def test_empty_when_section_missing(self) -> None: assert find_dasar_list("no dasar section") == [] class TestSignatory: def test_extracts_last_nrp(self) -> None: text = "Some 12345678 NRP earlier 87654321\nNRP. 11223344" sig = find_signatory(text) assert sig.nrp == "11223344" def test_no_nrp(self) -> None: assert find_signatory("no NRP here").nrp is None class TestExtractHeader: def test_full_synthetic_doc(self, sample_sprint_text: str) -> None: header = extract_header(sample_sprint_text) assert header.nomor_sprint is not None assert "Sprin" in header.nomor_sprint assert header.tanggal == date(2025, 4, 21) assert header.satuan_penerbit is not None assert "KEPOLISIAN" in header.satuan_penerbit.upper() assert header.perihal is not None assert "penyelidikan" in header.perihal.lower() assert len(header.dasar) == 3