Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:

- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions
--- a/src/ocr_sprint/pipeline/extract/__init__.py
+++ b/src/ocr_sprint/pipeline/extract/__init__.py
@@ -0,0 +1 @@
+"""Information extraction layer (regex Phase 1, LLM Phase 5)."""
--- a/src/ocr_sprint/pipeline/extract/regex_rules.py
+++ b/src/ocr_sprint/pipeline/extract/regex_rules.py
@@ -0,0 +1,169 @@
+"""Regex-based extraction for the deterministic header fields of a surat sprint.
+
+Targets header fields whose layout is highly standardized across Polri units:
+
+  - Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
+  - Tanggal (date the sprint was issued)
+  - Satuan penerbit (issuing unit)
+  - Perihal
+  - Dasar (numbered list of legal/operational basis)
+
+Personnel table extraction is intentionally NOT done here — that needs
+PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
+"""
+
+from __future__ import annotations
+
+import re
+from datetime import date
+
+from ocr_sprint.schemas.extraction import HeaderFields, Signatory
+
+# ---------- regex patterns ----------
+
+# Nomor sprint, tolerant of spacing and OCR noise. The optional trailing unit
+# must follow an explicit separator and stay on the same line. Should match:
+#   "Sprin / 123 / IV / 2025 / Reskrim"
+#   "SPRIN/345/X/2024"
+#   "Nomor : Sprin/12/I/2025/Sat Intelkam"
+_RE_NOMOR_SPRINT = re.compile(
+    r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}"
+    r"(?:[ \t]*[./-][ \t]*[^\W\d_]\w*(?:[ \t.-]+[^\W\d_]\w*)*)?",
+    re.IGNORECASE,
+)
+
+# Indonesian month names (upper case) mapped to their month number.
+_BULAN_MAP: dict[str, int] = {
+    "JANUARI": 1,
+    "FEBRUARI": 2,
+    "MARET": 3,
+    "APRIL": 4,
+    "MEI": 5,
+    "JUNI": 6,
+    "JULI": 7,
+    "AGUSTUS": 8,
+    "SEPTEMBER": 9,
+    "OKTOBER": 10,
+    "NOVEMBER": 11,
+    "DESEMBER": 12,
+}
+
+# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025"
+_RE_TANGGAL_ID = re.compile(
+    r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP.keys()) + r")\s*[-./\s]\s*(\d{4})\b",
+    re.IGNORECASE,
+)
+
+# Satuan penerbit usually appears in the document letterhead, prefixed by
+# KEPOLISIAN <NEGARA REPUBLIK INDONESIA|DAERAH|RESOR(T)|SEKTOR>.
+_RE_SATUAN = re.compile(
+    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESORT?|SEKTOR)[^\n]{0,80}",
+    re.IGNORECASE,
+)
+
+# "Perihal : ...." up to end of line.
+_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
+
+# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
+_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
+
+# Signatory NRP/NIP. Polri NRPs are 8 digits and civil servant NIPs 18; accepts 8-20 digits.
+_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
+
+
+def find_nomor_sprint(text: str) -> str | None:
+    """Return the first nomor sprint with whitespace runs collapsed, else None."""
+    # split() without arguments also drops any leading/trailing whitespace.
+    if (found := _RE_NOMOR_SPRINT.search(text)) is not None:
+        return " ".join(found.group().split())
+    return None
+
+
+def find_tanggal(text: str) -> date | None:
+    """Return the issuance date of the sprint, or None if it cannot be read.
+
+    A surat sprint usually carries several dates: the 'Dasar' section cites
+    earlier documents, while the issuance date sits near the signatory at the
+    bottom (commonly 'Tempat, DD Month YYYY'). Only the **last** date in the
+    text is tried; if it is not a valid calendar date the result is None.
+    """
+    final = None
+    for final in _RE_TANGGAL_ID.finditer(text):
+        pass  # keep advancing; only the last match matters
+    if final is None:
+        return None
+    day_txt, month_txt, year_txt = final.groups()
+    try:
+        return date(int(year_txt), _BULAN_MAP[month_txt.upper()], int(day_txt))
+    except (KeyError, ValueError):
+        return None
+
+
+def find_satuan(text: str) -> str | None:
+    """Return the first issuing-unit (letterhead) match, whitespace-collapsed."""
+    # No KEPOLISIAN letterhead in the text means there is nothing to report.
+    if (letterhead := _RE_SATUAN.search(text)) is None:
+        return None
+    return " ".join(letterhead.group().split())
+
+
+def find_perihal(text: str) -> str | None:
+    """Return the stripped text after the first 'Perihal' label, or None."""
+    # Each line is searched on its own, so the capture never leaves its line;
+    # the generator is lazy, so scanning stops at the first line that matches.
+    hits = (_RE_PERIHAL.search(row) for row in text.splitlines())
+    first = next((hit for hit in hits if hit), None)
+    return first.group(1).strip() if first is not None else None
+
+
+def find_dasar_list(text: str) -> list[str]:
+    """Extract the numbered 'Dasar' items from the text.
+
+    Heuristic: find the first line starting with 'DASAR' (case-insensitive),
+    then collect numbered lines; unnumbered lines extend the previous item.
+    A first item sharing the header line ("Dasar : 1. ...") is kept as well.
+    Stops at a blank line after the first item or at a section keyword.
+    """
+    items: list[str] = []
+    in_dasar = False
+    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
+    for raw_line in text.splitlines():
+        line = raw_line.strip()
+        if not in_dasar:
+            header = re.match(r"DASAR\b\s*[:\-]?\s*", line, re.IGNORECASE)
+            if header is None:
+                continue
+            in_dasar = True
+            line = line[header.end() :]  # rest of header line may hold item 1
+        elif not line:
+            if items:
+                break
+            continue
+        elif line.upper().startswith(section_terminators):
+            break
+        m = _RE_DASAR_ITEM.match(line)
+        if m:
+            items.append(m.group(2).strip())
+        elif items:
+            items[-1] = (items[-1] + " " + line).strip()
+    return items
+
+
+def find_signatory(text: str) -> Signatory:
+    """Build the signatory from the last NRP/NIP number found in the document.
+
+    Best effort: with no such number in *text*, an empty Signatory is returned.
+    """
+    numbers = [hit.group(2) for hit in _RE_NRP.finditer(text)]
+    return Signatory(nrp=numbers[-1]) if numbers else Signatory()
+
+
+def extract_header(text: str) -> HeaderFields:
+    """Populate a HeaderFields schema by running every header-level extractor."""
+    # Each extractor scans the full text on its own; none consumes another's output.
+    nomor, issued = find_nomor_sprint(text), find_tanggal(text)
+    unit, subject = find_satuan(text), find_perihal(text)
+    basis = find_dasar_list(text)
+    return HeaderFields(
+        nomor_sprint=nomor, tanggal=issued, satuan_penerbit=unit, perihal=subject, dasar=basis
+    )
--- a/src/ocr_sprint/pipeline/extract/validators.py
+++ b/src/ocr_sprint/pipeline/extract/validators.py
@@ -0,0 +1,64 @@
+"""Cross-field validation, with structured review-flag output."""
+
+from __future__ import annotations
+
+import re
+
+from ocr_sprint.data.master_pangkat import is_valid_pangkat
+from ocr_sprint.schemas.extraction import (
+    ExtractionResult,
+    HeaderFields,
+    ReviewFlag,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+# Polri NRP = 8 digits; the pattern is anchored at both ends of the string.
+_RE_NRP_8 = re.compile(r"^\d{8}$")
+
+
+def validate_nrp(nrp: str | None) -> bool:
+    """Tell whether *nrp* is exactly eight digits once outer whitespace is stripped."""
+    # A missing value (None) is never well-formed; "" cannot match the pattern.
+    candidate = "" if nrp is None else nrp.strip()
+    return _RE_NRP_8.match(candidate) is not None
+
+
+def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
+    """Collect the review flags raised by one personnel row."""
+    # Falsy (empty or missing) values are skipped rather than flagged.
+    bad_nrp = bool(entry.nrp) and not validate_nrp(entry.nrp)
+    bad_pangkat = bool(entry.pangkat) and not is_valid_pangkat(entry.pangkat)
+    # Flag order is fixed: NRP problem first, then pangkat.
+    checks = ((bad_nrp, ReviewFlag.INVALID_NRP), (bad_pangkat, ReviewFlag.UNKNOWN_PANGKAT))
+    return [flag for failed, flag in checks if failed]
+
+
+def validate_header(header: HeaderFields) -> list[ReviewFlag]:
+    """Return review flags for a header lacking its nomor sprint or its date."""
+    # (field value, flag raised when that value is None), in reporting order.
+    required = (
+        (header.nomor_sprint, ReviewFlag.MISSING_FIELD),
+        (header.tanggal, ReviewFlag.DATE_PARSE_FAILED),
+    )
+    return [flag for value, flag in required if value is None]
+
+
+def validate_extraction(
+    result: ExtractionResult,
+    expected_personnel_count: int | None = None,
+) -> list[ReviewFlag]:
+    """Validate header and personnel rows; return each raised flag once.
+
+    Flags keep first-seen order: header flags, then per-row flags in row
+    order, then PERSONNEL_COUNT_MISMATCH when *expected_personnel_count* is
+    given and differs from the number of extracted rows. With the default
+    None the count check is skipped.
+    """
+    collected = list(validate_header(result.header))
+    for row in result.personel:
+        collected += validate_personnel_entry(row)
+    count_known = expected_personnel_count is not None
+    if count_known and expected_personnel_count != len(result.personel):
+        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)
+    # dict keys are unique and keep insertion order, so this dedupes stably.
+    return list(dict.fromkeys(collected))
				`@@ -0,0 +1 @@`
				`"""Information extraction layer (regex Phase 1, LLM Phase 5)."""`