Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
1
src/ocr_sprint/pipeline/extract/__init__.py
Normal file
1
src/ocr_sprint/pipeline/extract/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Information extraction layer (regex Phase 1, LLM Phase 5)."""
|
||||
169
src/ocr_sprint/pipeline/extract/regex_rules.py
Normal file
169
src/ocr_sprint/pipeline/extract/regex_rules.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""Regex-based extraction for the deterministic header fields of a surat sprint.
|
||||
|
||||
Targets header fields whose layout is highly standardized across Polri units:
|
||||
|
||||
- Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
|
||||
- Tanggal (date the sprint was issued)
|
||||
- Satuan penerbit (issuing unit)
|
||||
- Perihal
|
||||
- Dasar (numbered list of legal/operational basis)
|
||||
|
||||
Personnel table extraction is intentionally NOT done here — that needs
|
||||
PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import date
|
||||
|
||||
from ocr_sprint.schemas.extraction import HeaderFields, Signatory
|
||||
|
||||
# ---------- regex patterns ----------
|
||||
|
||||
# Nomor sprint, tolerant of spacing and OCR noise between its components.
# Examples it should match in full:
#   "Sprin / 123 / IV / 2025 / Reskrim"
#   "SPRIN/345/X/2024"
#   "Nomor : Sprin/12/I/2025/Sat Intelkam"
#
# The optional unit suffix must follow an explicit separator on the same
# line, start and end with a word character, and is matched greedily.
# A lazy quantifier at the very end of a pattern stops after one character,
# which would truncate "Reskrim" to "R"; requiring the suffix to stay on the
# same line keeps it from swallowing the start of the next line.
_RE_NOMOR_SPRINT = re.compile(
    r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}"
    r"(?:[ \t]*[./-][ \t]*\w(?:[\w .-]*\w)?)?",
    re.IGNORECASE,
)
|
||||
|
||||
# Indonesian month names mapped to their month number.  Besides the standard
# spellings, the non-standard variants "Pebruari" and "Nopember" are accepted
# so that dates written with them are not silently dropped.
_BULAN_MAP: dict[str, int] = {
    "JANUARI": 1,
    "FEBRUARI": 2,
    "PEBRUARI": 2,
    "MARET": 3,
    "APRIL": 4,
    "MEI": 5,
    "JUNI": 6,
    "JULI": 7,
    "AGUSTUS": 8,
    "SEPTEMBER": 9,
    "OKTOBER": 10,
    "NOVEMBER": 11,
    "NOPEMBER": 11,
    "DESEMBER": 12,
}

# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025".
# Groups: (1) day, (2) month name as written, (3) four-digit year.
# The month alternation is generated from _BULAN_MAP so the two cannot drift
# apart; names are escaped in case a future key contains a regex metacharacter.
_RE_TANGGAL_ID = re.compile(
    r"\b(\d{1,2})\s*[-./\s]\s*("
    + "|".join(re.escape(nama) for nama in _BULAN_MAP)
    + r")\s*[-./\s]\s*(\d{4})\b",
    re.IGNORECASE,
)
|
||||
|
||||
# Satuan penerbit usually appears in the document letterhead, prefixed by
# KEPOLISIAN <NEGARA REPUBLIK INDONESIA|DAERAH|RESOR|RESORT|SEKTOR>.
# The match then extends over up to 80 further characters of the same line.
# "RESORT?" covers both spellings; a separate RESORT alternative after it
# could never be reached.
_RE_SATUAN = re.compile(
    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESORT?|SEKTOR)"
    r"[^\n]{0,80}",
    re.IGNORECASE,
)
|
||||
|
||||
# "Perihal : ...." up to end of line.
|
||||
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", flags=re.IGNORECASE)

# One numbered dasar entry, e.g. "1. UU No. 2 Tahun 2002 ..." or "2) ...".
# Group 1 is the item number, group 2 the item text.
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$", flags=0)

# Signatory identifier labelled NRP or NIP.  Polri NRPs are 8 digits and civil
# servant NIPs are 18 digits; the pattern itself accepts any run of 8-20 digits.
# Group 1 is the label, group 2 the digits.
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def find_nomor_sprint(text: str) -> str | None:
    """Locate the first nomor sprint in *text*.

    Returns the matched span with every run of whitespace collapsed to a
    single space, or ``None`` when the pattern does not occur.
    """
    found = _RE_NOMOR_SPRINT.search(text)
    if found is None:
        return None
    tokens = found.group(0).split()
    return " ".join(tokens)
|
||||
|
||||
|
||||
def find_tanggal(text: str) -> date | None:
    """Return the issuance date of the document, if one can be parsed.

    A surat sprint usually carries several dates: those cited in the 'Dasar'
    section (references to earlier documents) and the one next to the
    signatory at the bottom ('Tempat, DD Month YYYY'), which is the actual
    issuance date.  Because the signatory block follows the dasar items in
    the standard layout, the **last** date found in the text is used.

    Returns ``None`` when no date is found or when the last one does not form
    a valid calendar date.
    """
    found = tuple(_RE_TANGGAL_ID.finditer(text))
    if len(found) == 0:
        return None
    day_text, month_name, year_text = found[-1].groups()
    try:
        month_number = _BULAN_MAP[month_name.upper()]
        return date(int(year_text), month_number, int(day_text))
    except (KeyError, ValueError):
        # Unknown month key or an impossible date such as 31 Februari.
        return None
|
||||
|
||||
|
||||
def find_satuan(text: str) -> str | None:
    """Return the issuing unit taken from the first letterhead match.

    Whitespace runs inside the match are collapsed to single spaces.
    ``None`` is returned when no letterhead line is recognised.
    """
    hit = _RE_SATUAN.search(text)
    return " ".join(hit.group(0).split()) if hit else None
|
||||
|
||||
|
||||
def find_perihal(text: str) -> str | None:
    """Return the subject of the first non-empty 'Perihal: ...' line.

    Each line is searched on its own, so the value never spills onto the
    following line.  A 'Perihal :' label followed only by whitespace is
    skipped rather than reported as an empty string, so a later line that
    carries the real value can still be found.

    Args:
        text: Full OCR text of the document.

    Returns:
        The stripped subject text, or ``None`` when no line yields a
        non-empty value.
    """
    for line in text.splitlines():
        m = _RE_PERIHAL.search(line)
        if m is None:
            continue
        value = m.group(1).strip()
        # The capture group can consist of whitespace alone; treat that as
        # "no value on this line" and keep looking.
        if value:
            return value
    return None
|
||||
|
||||
|
||||
def find_dasar_list(text: str) -> list[str]:
    """Extract the numbered 'Dasar' items from the text.

    Heuristic: locate the first line that starts with 'DASAR' and collect the
    numbered entries that follow it.  The first entry may share the header
    line ("Dasar : 1. UU No. 2 ..."), so whatever follows the header keyword
    and its optional ':' / '-' separator is parsed as well.  Unnumbered lines
    after an entry are appended to that entry as a continuation.

    Collection stops at a blank line once at least one item has been found,
    or at a line beginning with another section header keyword.

    Args:
        text: Full OCR text of the document.

    Returns:
        The item texts without their numbering, in document order; an empty
        list when no 'Dasar' section or no numbered item is found.
    """
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    items: list[str] = []
    in_dasar = False
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not in_dasar:
            header = re.match(r"DASAR\b\s*[:\-]?\s*(.*)$", line, re.IGNORECASE)
            if header is None:
                continue
            in_dasar = True
            # Keep an item that sits on the header line itself instead of
            # discarding the whole line.
            remainder = header.group(1).strip()
            if remainder:
                first = _RE_DASAR_ITEM.match(remainder)
                if first:
                    items.append(first.group(2).strip())
            continue
        if not line:
            # Blank lines before the first item are tolerated; after it they
            # mark the end of the list.
            if items:
                break
            continue
        if line.upper().startswith(section_terminators):
            break
        m = _RE_DASAR_ITEM.match(line)
        if m:
            items.append(m.group(2).strip())
        elif items:
            # continuation of the previous dasar item
            items[-1] = (items[-1] + " " + line).strip()
    return items
|
||||
|
||||
|
||||
def find_signatory(text: str) -> Signatory:
    """Build a best-effort signatory from the last NRP/NIP number in *text*.

    When the text contains no such number, a ``Signatory`` constructed with
    no arguments is returned.
    """
    numbers = [hit.group(2) for hit in _RE_NRP.finditer(text)]
    if numbers:
        return Signatory(nrp=numbers[-1])
    return Signatory()
|
||||
|
||||
|
||||
def extract_header(text: str) -> HeaderFields:
    """Apply every header-level extractor to *text* and bundle the results."""
    nomor = find_nomor_sprint(text)
    tanggal = find_tanggal(text)
    satuan = find_satuan(text)
    perihal = find_perihal(text)
    dasar = find_dasar_list(text)
    return HeaderFields(
        nomor_sprint=nomor,
        tanggal=tanggal,
        satuan_penerbit=satuan,
        perihal=perihal,
        dasar=dasar,
    )
|
||||
64
src/ocr_sprint/pipeline/extract/validators.py
Normal file
64
src/ocr_sprint/pipeline/extract/validators.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Cross-field validation, with structured review-flag output."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from ocr_sprint.data.master_pangkat import is_valid_pangkat
|
||||
from ocr_sprint.schemas.extraction import (
|
||||
ExtractionResult,
|
||||
HeaderFields,
|
||||
ReviewFlag,
|
||||
)
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
# Polri NRP = exactly 8 ASCII digits.  re.ASCII keeps \d from also accepting
# other Unicode decimal digits (for example full-width forms), which would
# otherwise pass validation while not being a plain "0-9" number.
_RE_NRP_8 = re.compile(r"^\d{8}$", re.ASCII)


def validate_nrp(nrp: str | None) -> bool:
    """Return True when the value is a well-formed Polri NRP (8 digits).

    Surrounding whitespace is ignored.  ``None``, the empty string, values of
    any other length and values containing anything but the ASCII digits 0-9
    are rejected.
    """
    if nrp is None:
        return False
    # fullmatch requires the whole stripped value to be consumed.
    return _RE_NRP_8.fullmatch(nrp.strip()) is not None
|
||||
|
||||
|
||||
def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
    """Check one personnel row and collect the review flags it raises.

    An NRP that is present but malformed yields ``INVALID_NRP``; a pangkat
    that is present but not accepted by ``is_valid_pangkat`` yields
    ``UNKNOWN_PANGKAT``.  Empty or missing values raise no flag here.
    """
    raised: list[ReviewFlag] = []
    if entry.nrp:
        if not validate_nrp(entry.nrp):
            raised.append(ReviewFlag.INVALID_NRP)
    if entry.pangkat:
        if not is_valid_pangkat(entry.pangkat):
            raised.append(ReviewFlag.UNKNOWN_PANGKAT)
    return raised
|
||||
|
||||
|
||||
def validate_header(header: HeaderFields) -> list[ReviewFlag]:
    """Report header problems as review flags.

    A missing nomor sprint yields ``MISSING_FIELD`` and a missing tanggal
    yields ``DATE_PARSE_FAILED``, in that order.
    """
    nomor_missing = header.nomor_sprint is None
    tanggal_missing = header.tanggal is None
    problems: list[ReviewFlag] = []
    if nomor_missing:
        problems.append(ReviewFlag.MISSING_FIELD)
    if tanggal_missing:
        problems.append(ReviewFlag.DATE_PARSE_FAILED)
    return problems
|
||||
|
||||
|
||||
def validate_extraction(
    result: ExtractionResult,
    expected_personnel_count: int | None = None,
) -> list[ReviewFlag]:
    """Validate a complete extraction and return its distinct review flags.

    Header flags come first, then the flags of each personnel row in order,
    then ``PERSONNEL_COUNT_MISMATCH`` when an expected count is given and
    differs from the number of extracted rows.  Each flag appears once, at
    the position of its first occurrence.
    """
    collected: list[ReviewFlag] = [*validate_header(result.header)]
    for person in result.personel:
        collected += validate_personnel_entry(person)

    count_known = expected_personnel_count is not None
    if count_known and len(result.personel) != expected_personnel_count:
        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)

    # dict keys keep insertion order, so this drops repeats but keeps order.
    return list(dict.fromkeys(collected))
|
||||
Reference in New Issue
Block a user