Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
113 lines
3.4 KiB
Python
113 lines
3.4 KiB
Python
"""Tests for regex-based header extraction."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
|
|
from ocr_sprint.pipeline.extract.regex_rules import (
|
|
extract_header,
|
|
find_dasar_list,
|
|
find_nomor_sprint,
|
|
find_perihal,
|
|
find_satuan,
|
|
find_signatory,
|
|
find_tanggal,
|
|
)
|
|
|
|
|
|
class TestNomorSprint:
    """Exercise ``find_nomor_sprint`` on text with and without a sprint number."""

    @pytest.mark.parametrize(
        ("text", "needle"),
        [
            # Slash-separated, no padding.
            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
            # Upper-case prefix with spaces around the slashes.
            ("Nomor: SPRIN / 7 / I / 2024", "7"),
            # Dash-separated and embedded in surrounding text.
            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
        ],
    )
    def test_finds_nomor(self, text: str, needle: str) -> None:
        """The match starts with "SPRIN" (any case) and contains the number."""
        nomor = find_nomor_sprint(text)

        assert nomor is not None
        assert needle in nomor
        assert nomor.upper().startswith("SPRIN")

    def test_returns_none_when_absent(self) -> None:
        """Text without a sprint number yields ``None``."""
        nomor = find_nomor_sprint("no nomor here, just some text")

        assert nomor is None
|
|
|
|
|
|
class TestTanggal:
    """Exercise ``find_tanggal`` on parseable and unparseable date text."""

    def test_basic_date(self) -> None:
        """A "day month-name year" date after a place name is parsed."""
        expected = date(2025, 4, 21)

        assert find_tanggal("Bandung, 21 April 2025") == expected

    def test_with_dashes(self) -> None:
        """Date parts separated by spaced dashes are still parsed."""
        expected = date(2024, 12, 1)

        assert find_tanggal("Tanggal 1 - Desember - 2024") == expected

    def test_invalid_month(self) -> None:
        """An unrecognised month name yields ``None``."""
        parsed = find_tanggal("21 Foo 2025")

        assert parsed is None

    def test_no_date_present(self) -> None:
        """Text with no date at all yields ``None``."""
        parsed = find_tanggal("nothing here")

        assert parsed is None
|
|
|
|
|
|
class TestSatuan:
    """Exercise ``find_satuan`` on issuing-unit header lines."""

    def test_polres(self) -> None:
        """A "KEPOLISIAN RESOR ..." line followed by other text is found."""
        satuan = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")

        assert satuan is not None
        # Compare case-insensitively by upper-casing the result.
        assert "RESOR BANDUNG" in satuan.upper()

    def test_polri_pusat(self) -> None:
        """The national-level header line produces a non-``None`` result."""
        satuan = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")

        assert satuan is not None
|
|
|
|
|
|
class TestPerihal:
    """Exercise ``find_perihal`` on text with and without a PERIHAL line."""

    def test_extracts_perihal_line(self) -> None:
        """Only the value after "PERIHAL :" on its own line is returned."""
        document = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
        perihal = find_perihal(document)

        assert perihal == "Pelaksanaan penyelidikan kasus."

    def test_returns_none_when_absent(self) -> None:
        """Text without a PERIHAL label yields ``None``."""
        perihal = find_perihal("no perihal field")

        assert perihal is None
|
|
|
|
|
|
class TestDasar:
    """Exercise ``find_dasar_list`` on text with and without a DASAR section."""

    def test_numbered_list(self) -> None:
        """Two numbered DASAR entries are returned, in order, without numbering."""
        document = (
            "DASAR :\n"
            "1. UU No 2 Tahun 2002.\n"
            "2. Peraturan Kapolri Nomor 6.\n"
            "\n"
            "DIPERINTAHKAN :\n"
            "Kepada : ...\n"
        )
        entries = find_dasar_list(document)

        # Check the count first so the unpacking below cannot mask a wrong length.
        assert len(entries) == 2
        first, second = entries
        assert first.startswith("UU No 2")
        assert second.startswith("Peraturan Kapolri")

    def test_empty_when_section_missing(self) -> None:
        """Text without a DASAR section yields an empty list."""
        entries = find_dasar_list("no dasar section")

        assert entries == []
|
|
|
|
|
|
class TestSignatory:
    """Exercise ``find_signatory`` and the ``nrp`` attribute of its result."""

    def test_extracts_last_nrp(self) -> None:
        """With several 8-digit numbers present, the one on the final line is used."""
        document = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
        signatory = find_signatory(document)

        assert signatory.nrp == "11223344"

    def test_no_nrp(self) -> None:
        """Text containing no digits leaves ``nrp`` as ``None``."""
        signatory = find_signatory("no NRP here")

        assert signatory.nrp is None
|
|
|
|
|
|
class TestExtractHeader:
    """Exercise ``extract_header`` on the ``sample_sprint_text`` fixture."""

    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
        """Every header field is populated from the synthetic document."""
        header = extract_header(sample_sprint_text)

        # Sprint number.
        nomor = header.nomor_sprint
        assert nomor is not None
        assert "Sprin" in nomor

        # Document date.
        assert header.tanggal == date(2025, 4, 21)

        # Issuing unit, compared case-insensitively.
        satuan = header.satuan_penerbit
        assert satuan is not None
        assert "KEPOLISIAN" in satuan.upper()

        # Subject line, compared case-insensitively.
        perihal = header.perihal
        assert perihal is not None
        assert "penyelidikan" in perihal.lower()

        # Number of DASAR entries.
        assert len(header.dasar) == 3
|