Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
Devin AI
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""Information extraction layer (regex Phase 1, LLM Phase 5)."""

View File

@@ -0,0 +1,169 @@
"""Regex-based extraction for the deterministic header fields of a surat sprint.
Targets header fields whose layout is highly standardized across Polri units:
- Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
- Tanggal (date the sprint was issued)
- Satuan penerbit (issuing unit)
- Perihal
- Dasar (numbered list of legal/operational basis)
Personnel table extraction is intentionally NOT done here — that needs
PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
"""
from __future__ import annotations
import re
from datetime import date
from ocr_sprint.schemas.extraction import HeaderFields, Signatory
# ---------- regex patterns ----------

# Nomor sprint, tolerant of spacing and OCR noise.
# Examples it should match in full:
#   "Sprin / 123 / IV / 2025 / Reskrim"
#   "SPRIN/345/X/2024"
#   "Nomor : Sprin/12/I/2025/Sat Intelkam"
# The core is SPRIN <number> <roman month> <year> with any mix of whitespace,
# ".", "/" or "-" between the parts.
# The optional unit suffix must be introduced by an explicit "/", "." or "-"
# and stays on the same line, so prose that merely follows the number
# ("Sprin/12/I/2025 tanggal ...") is not swallowed. The suffix is captured as
# whole words; each word starts with a letter and ends on a word character,
# and capture stops before "tanggal"/"tentang", the words that conventionally
# follow a cited document number.
_RE_NOMOR_SPRINT = re.compile(
    r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}"
    r"(?:[ \t]*[./-][ \t]*[^\W\d_](?:[\w.-]*\w)?"
    r"(?:[ \t]+(?!(?:TANGGAL|TENTANG)\b)[^\W\d_](?:[\w.-]*\w)?)*)?",
    re.IGNORECASE,
)

# Indonesian month names (upper-case) mapped to their month number.
_BULAN_MAP: dict[str, int] = {
    "JANUARI": 1,
    "FEBRUARI": 2,
    "MARET": 3,
    "APRIL": 4,
    "MEI": 5,
    "JUNI": 6,
    "JULI": 7,
    "AGUSTUS": 8,
    "SEPTEMBER": 9,
    "OKTOBER": 10,
    "NOVEMBER": 11,
    "DESEMBER": 12,
}

# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025".
# Groups: (day, month name, 4-digit year). At least one separator character
# (whitespace, "-", "." or "/") is required between the parts.
_RE_TANGGAL_ID = re.compile(
    r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP) + r")\s*[-./\s]\s*(\d{4})\b",
    re.IGNORECASE,
)

# Satuan penerbit usually appears in the document letterhead, prefixed by
# KEPOLISIAN <NEGARA REPUBLIK INDONESIA|DAERAH|RESOR|RESORT|SEKTOR>.
# Up to 80 further characters on the same line are captured as the unit name.
_RE_SATUAN = re.compile(
    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESORT?|SEKTOR)"
    r"[^\n]{0,80}",
    re.IGNORECASE,
)

# "Perihal : ...." up to end of line; group 1 is the subject text.
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)

# A dasar entry typically begins with a number and dot (or closing paren),
# e.g. "1. UU No. 2 Tahun 2002 ...". Groups: (item number, item text).
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")

# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
# The pattern itself accepts any run of 8 to 20 digits after the NRP/NIP label.
# Groups: (label, digits).
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
def find_nomor_sprint(text: str) -> str | None:
    """Locate the first nomor sprint in ``text``.

    Returns the matched span with every run of whitespace collapsed to a
    single space, or ``None`` when the pattern does not occur.
    """
    hit = _RE_NOMOR_SPRINT.search(text)
    if hit is None:
        return None
    tokens = hit.group(0).split()
    return " ".join(tokens)
def find_tanggal(text: str) -> date | None:
    """Return the issuance date of the sprint, or ``None``.

    A surat sprint usually carries several dates: the 'Dasar' section cites
    earlier documents, while the actual issuance date sits near the signatory
    at the bottom (typically 'Tempat, DD Month YYYY'). Because that block comes
    after the dasar items in the standard layout, the **last** date occurrence
    in the text is the one that gets parsed.

    ``None`` is returned when no date is present, or when that last occurrence
    is not a real calendar date (for example day 31 in a 30-day month).
    """
    hits = [*_RE_TANGGAL_ID.finditer(text)]
    if not hits:
        return None
    day_text, month_name, year_text = hits[-1].groups()
    try:
        month = _BULAN_MAP[month_name.upper()]
        return date(int(year_text), month, int(day_text))
    except (KeyError, ValueError):
        # Unknown month spelling or an impossible day/month combination.
        return None
def find_satuan(text: str) -> str | None:
    """Return the issuing unit taken from the first letterhead match.

    Whitespace runs inside the match are collapsed to single spaces. ``None``
    is returned when no letterhead line is found.
    """
    letterhead = _RE_SATUAN.search(text)
    if letterhead is None:
        return None
    words = letterhead.group(0).split()
    return " ".join(words)
def find_perihal(text: str) -> str | None:
    """Return the subject from the first 'Perihal: ...' line, or ``None``.

    The text is scanned line by line, so the result never spans more than the
    line on which the label appears. Surrounding whitespace is stripped.
    """
    per_line = (_RE_PERIHAL.search(row) for row in text.splitlines())
    first_hit = next((hit for hit in per_line if hit is not None), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()
# Header of the dasar section: the word DASAR, an optional ":" or "-", and
# any whitespace. Whatever follows on the same line may already be item 1.
_RE_DASAR_HEADER = re.compile(r"\s*DASAR\b\s*[:\-]?\s*", re.IGNORECASE)


def find_dasar_list(text: str) -> list[str]:
    """Extract the numbered 'Dasar' items from the text.

    Heuristic:

    1. Find the first line that starts with 'DASAR' (case-insensitive).
       If a numbered item shares that line ("Dasar : 1. UU No. 2 ..."), it is
       taken as the first item instead of being discarded with the header.
    2. On the following lines, a line starting with "<number>." or
       "<number>)" opens a new item; any other non-blank line is appended to
       the previous item as a continuation (and ignored if there is no item
       yet).
    3. Stop at the first blank line once at least one item was collected, or
       at a line starting with another section keyword.

    Returns the item texts without their numbering, in document order; an
    empty list when no dasar section or no numbered item is found.
    """
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    items: list[str] = []
    in_dasar = False

    for raw_line in text.splitlines():
        line = raw_line.strip()

        if not in_dasar:
            header = _RE_DASAR_HEADER.match(line)
            if header is None:
                continue
            in_dasar = True
            # Only a numbered item is accepted from the header line itself;
            # terminator and continuation rules start on the next line.
            inline_item = _RE_DASAR_ITEM.match(line[header.end():])
            if inline_item is not None:
                items.append(inline_item.group(2).strip())
            continue

        if not line:
            if items:
                break
            # Blank lines between the header and the first item are skipped.
            continue

        if line.upper().startswith(section_terminators):
            break

        numbered = _RE_DASAR_ITEM.match(line)
        if numbered is not None:
            items.append(numbered.group(2).strip())
        elif items:
            # Wrapped text belonging to the previous dasar item.
            items[-1] = (items[-1] + " " + line).strip()

    return items
def find_signatory(text: str) -> Signatory:
    """Best-effort signatory extraction based on the last NRP/NIP in the text.

    Returns a ``Signatory`` whose ``nrp`` is the digit string of the final
    labelled number in the document, or a default ``Signatory()`` when the
    text contains none.
    """
    labelled_numbers = _RE_NRP.findall(text)  # list of (label, digits)
    if labelled_numbers:
        _label, digits = labelled_numbers[-1]
        return Signatory(nrp=digits)
    return Signatory()
def extract_header(text: str) -> HeaderFields:
    """Run every header-level regex extractor over ``text``.

    Each field is extracted independently; a field that cannot be found is
    left as ``None`` (or an empty list for ``dasar``).
    """
    nomor = find_nomor_sprint(text)
    tanggal = find_tanggal(text)
    satuan = find_satuan(text)
    perihal = find_perihal(text)
    dasar_items = find_dasar_list(text)
    return HeaderFields(
        nomor_sprint=nomor,
        tanggal=tanggal,
        satuan_penerbit=satuan,
        perihal=perihal,
        dasar=dasar_items,
    )

View File

@@ -0,0 +1,64 @@
"""Cross-field validation, with structured review-flag output."""
from __future__ import annotations
import re
from ocr_sprint.data.master_pangkat import is_valid_pangkat
from ocr_sprint.schemas.extraction import (
ExtractionResult,
HeaderFields,
ReviewFlag,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
# Polri NRP = exactly 8 ASCII digits. "[0-9]" is used instead of "\d" because
# "\d" on a str pattern also accepts non-ASCII decimal digits (full-width,
# Arabic-Indic, ...), which an OCR engine can emit and which would not match
# the NRP as stored anywhere else.
_RE_NRP_8 = re.compile(r"^[0-9]{8}$")


def validate_nrp(nrp: str | None) -> bool:
    """Return True when the value is a well-formed Polri NRP.

    Well-formed means exactly eight ASCII digits once surrounding whitespace
    is stripped. ``None``, empty strings, other lengths and any non-ASCII or
    non-digit character yield False.
    """
    if nrp is None:
        return False
    return _RE_NRP_8.fullmatch(nrp.strip()) is not None
def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
    """Check one personnel row and list the review flags it raises.

    An empty or missing ``nrp`` / ``pangkat`` is not flagged here; only values
    that are present but fail their validator are.
    """
    raised: list[ReviewFlag] = []

    nrp = entry.nrp
    if nrp:
        if not validate_nrp(nrp):
            raised.append(ReviewFlag.INVALID_NRP)

    pangkat = entry.pangkat
    if pangkat:
        if not is_valid_pangkat(pangkat):
            raised.append(ReviewFlag.UNKNOWN_PANGKAT)

    return raised
def validate_header(header: HeaderFields) -> list[ReviewFlag]:
    """List the review flags raised by the header fields.

    A missing ``nomor_sprint`` raises ``MISSING_FIELD``; a missing ``tanggal``
    raises ``DATE_PARSE_FAILED``. The flags are returned in that order.
    """
    checks = (
        (header.nomor_sprint is None, ReviewFlag.MISSING_FIELD),
        (header.tanggal is None, ReviewFlag.DATE_PARSE_FAILED),
    )
    return [flag for triggered, flag in checks if triggered]
def validate_extraction(
    result: ExtractionResult,
    expected_personnel_count: int | None = None,
) -> list[ReviewFlag]:
    """Validate a whole extraction result and return its distinct review flags.

    Header flags come first, then the flags of each personnel row in order,
    then ``PERSONNEL_COUNT_MISMATCH`` when ``expected_personnel_count`` is
    given and differs from the number of extracted rows. Each flag appears at
    most once, at the position of its first occurrence.
    """
    collected: list[ReviewFlag] = list(validate_header(result.header))
    collected.extend(
        flag for row in result.personel for flag in validate_personnel_entry(row)
    )

    count_known = expected_personnel_count is not None
    if count_known and len(result.personel) != expected_personnel_count:
        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)

    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(collected))