From ca0c0a0428e2cfb2396ab08929f1fb7130a9696f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:58:50 +0000 Subject: [PATCH] Phase 1 MVP: synchronous OCR + regex header extraction Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). 
Co-authored-by: adrian kuman firmansah --- .env.example | 43 +++ .gitignore | 70 +++++ .pre-commit-config.yaml | 19 ++ Dockerfile | 51 ++++ Makefile | 52 ++++ README.md | 123 +++++++++ docker-compose.yml | 23 ++ docs/architecture.md | 259 ++++++++++++++++++ pyproject.toml | 136 +++++++++ samples/README.md | 13 + src/ocr_sprint/__init__.py | 3 + src/ocr_sprint/api/__init__.py | 0 src/ocr_sprint/api/errors.py | 43 +++ src/ocr_sprint/api/routes/__init__.py | 0 src/ocr_sprint/api/routes/documents.py | 58 ++++ src/ocr_sprint/api/routes/health.py | 15 + src/ocr_sprint/config.py | 72 +++++ src/ocr_sprint/data/__init__.py | 0 src/ocr_sprint/data/master_pangkat.py | 66 +++++ src/ocr_sprint/main.py | 42 +++ src/ocr_sprint/pipeline/__init__.py | 1 + src/ocr_sprint/pipeline/confidence.py | 51 ++++ src/ocr_sprint/pipeline/extract/__init__.py | 1 + .../pipeline/extract/regex_rules.py | 169 ++++++++++++ src/ocr_sprint/pipeline/extract/validators.py | 64 +++++ src/ocr_sprint/pipeline/ingest.py | 81 ++++++ src/ocr_sprint/pipeline/ocr.py | 106 +++++++ src/ocr_sprint/pipeline/orchestrator.py | 103 +++++++ src/ocr_sprint/pipeline/preprocess.py | 108 ++++++++ src/ocr_sprint/py.typed | 0 src/ocr_sprint/schemas/__init__.py | 27 ++ src/ocr_sprint/schemas/document.py | 57 ++++ src/ocr_sprint/schemas/extraction.py | 55 ++++ src/ocr_sprint/schemas/personnel.py | 18 ++ src/ocr_sprint/utils/__init__.py | 0 src/ocr_sprint/utils/logging.py | 45 +++ tests/__init__.py | 0 tests/conftest.py | 43 +++ tests/unit/__init__.py | 0 tests/unit/test_api.py | 87 ++++++ tests/unit/test_confidence.py | 46 ++++ tests/unit/test_ingest.py | 50 ++++ tests/unit/test_preprocess.py | 37 +++ tests/unit/test_regex_rules.py | 112 ++++++++ tests/unit/test_validators.py | 108 ++++++++ 45 files changed, 2457 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 Dockerfile create mode 100644 Makefile create mode 100644 README.md create mode 
100644 docker-compose.yml create mode 100644 docs/architecture.md create mode 100644 pyproject.toml create mode 100644 samples/README.md create mode 100644 src/ocr_sprint/__init__.py create mode 100644 src/ocr_sprint/api/__init__.py create mode 100644 src/ocr_sprint/api/errors.py create mode 100644 src/ocr_sprint/api/routes/__init__.py create mode 100644 src/ocr_sprint/api/routes/documents.py create mode 100644 src/ocr_sprint/api/routes/health.py create mode 100644 src/ocr_sprint/config.py create mode 100644 src/ocr_sprint/data/__init__.py create mode 100644 src/ocr_sprint/data/master_pangkat.py create mode 100644 src/ocr_sprint/main.py create mode 100644 src/ocr_sprint/pipeline/__init__.py create mode 100644 src/ocr_sprint/pipeline/confidence.py create mode 100644 src/ocr_sprint/pipeline/extract/__init__.py create mode 100644 src/ocr_sprint/pipeline/extract/regex_rules.py create mode 100644 src/ocr_sprint/pipeline/extract/validators.py create mode 100644 src/ocr_sprint/pipeline/ingest.py create mode 100644 src/ocr_sprint/pipeline/ocr.py create mode 100644 src/ocr_sprint/pipeline/orchestrator.py create mode 100644 src/ocr_sprint/pipeline/preprocess.py create mode 100644 src/ocr_sprint/py.typed create mode 100644 src/ocr_sprint/schemas/__init__.py create mode 100644 src/ocr_sprint/schemas/document.py create mode 100644 src/ocr_sprint/schemas/extraction.py create mode 100644 src/ocr_sprint/schemas/personnel.py create mode 100644 src/ocr_sprint/utils/__init__.py create mode 100644 src/ocr_sprint/utils/logging.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/test_api.py create mode 100644 tests/unit/test_confidence.py create mode 100644 tests/unit/test_ingest.py create mode 100644 tests/unit/test_preprocess.py create mode 100644 tests/unit/test_regex_rules.py create mode 100644 tests/unit/test_validators.py diff --git a/.env.example b/.env.example new file mode 100644 
index 0000000..edd8ef2 --- /dev/null +++ b/.env.example @@ -0,0 +1,43 @@ +# ==== App ==== +APP_ENV=local # local | dev | staging | prod +APP_HOST=0.0.0.0 +APP_PORT=8000 +APP_LOG_LEVEL=INFO + +# ==== Storage (Phase 1: local filesystem) ==== +STORAGE_LOCAL_DIR=./storage + +# ==== OCR ==== +OCR_LANG=latin # PaddleOCR lang code; "latin" works well for Bahasa Indonesia +OCR_USE_GPU=false # set true if running on a GPU host +OCR_DET_MODEL_DIR= # leave empty to use PaddleOCR defaults +OCR_REC_MODEL_DIR= +OCR_CLS_MODEL_DIR= +OCR_MAX_IMAGE_SIDE=2200 # downscale longest side before OCR + +# ==== Preprocessing ==== +PREPROCESS_TARGET_DPI=300 +PREPROCESS_DENOISE=true +PREPROCESS_DESKEW=true +PREPROCESS_ADAPTIVE_THRESHOLD=false # turn on for low-quality phone photos + +# ==== Confidence / routing (Phase 5) ==== +CONFIDENCE_AUTO_APPROVE=0.95 +CONFIDENCE_NEEDS_REVIEW=0.85 + +# ==== LLM (Phase 5, optional) ==== +LLM_ENABLED=false +LLM_PROVIDER=ollama +LLM_MODEL=qwen2.5:1.5b # CPU-friendly default +LLM_BASE_URL=http://localhost:11434 +LLM_TIMEOUT_S=60 + +# ==== Async pipeline (Phase 4, optional) ==== +QUEUE_ENABLED=false +REDIS_URL=redis://localhost:6379/0 +DATABASE_URL=postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint +MINIO_ENDPOINT=localhost:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin +MINIO_BUCKET=ocr-sprint +MINIO_SECURE=false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9897bab --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +dist/ +*.egg-info/ +*.egg +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +.coverage.* +htmlcov/ +coverage.xml +.tox/ +.nox/ + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +.DS_Store + +# Environment / secrets +.env +.env.* +!.env.example + +# Local data & artifacts +samples/*.pdf +samples/*.PDF +samples/*.jpg +samples/*.JPG +samples/*.jpeg +samples/*.png +samples/*.PNG 
+samples/*.tif +samples/*.tiff +!samples/README.md +data/local/ +storage/ +*.db +*.sqlite +*.sqlite3 + +# OCR / model caches +.paddleocr/ +~/.paddleocr/ +models/downloaded/ + +# Logs +logs/ +*.log + +# Docker +.docker/ + +# Misc +*.bak +*.tmp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..1a8beea --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-added-large-files + args: ["--maxkb=1024"] + - id: check-merge-conflict + - id: detect-private-key + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: ["--fix"] + - id: ruff-format diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..110cf97 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,51 @@ +# syntax=docker/dockerfile:1.6 +# CPU-only image for the OCR Sprint API. +# PaddleOCR + PyMuPDF + OpenCV-headless work on plain Debian without poppler. +FROM python:3.11-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_NO_CACHE_DIR=1 \ + DEBIAN_FRONTEND=noninteractive + +# System deps for OpenCV, libmagic, PaddlePaddle, and image format support. 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + libgl1 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + libgomp1 \ + libmagic1 \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# ----- builder layer (installs the package with dev extras; re-runs whenever pyproject.toml, README.md or src/ changes) ----- +FROM base AS builder +COPY pyproject.toml README.md ./ +COPY src/ ./src/ +RUN pip install --upgrade pip && pip install ".[dev]" + +# ----- runtime layer ----- +FROM base AS runtime +COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages +COPY --from=builder /usr/local/bin /usr/local/bin +COPY pyproject.toml README.md ./ +COPY src/ ./src/ + +# Pre-create cache dirs so PaddleOCR can write models on first run. +RUN mkdir -p /home/app/.paddleocr /app/storage \ + && useradd --create-home --uid 1000 app \ + && chown -R app:app /home/app /app + +USER app +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ + CMD curl -fsS http://localhost:8000/api/v1/health || exit 1 + +CMD ["uvicorn", "ocr_sprint.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..65af363 --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ +.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean + +help: + @echo "Targets:" + @echo " install - install runtime + dev deps in current env" + @echo " dev - run FastAPI app with autoreload" + @echo " fmt - format code with ruff" + @echo " lint - lint with ruff" + @echo " typecheck - run mypy" + @echo " test - run pytest" + @echo " test-cov - run pytest with coverage" + @echo " docker-build - build api image" + @echo " docker-up - start docker-compose stack" + @echo " docker-down - stop docker-compose stack" + +install: + python -m pip install --upgrade pip + pip install -e ".[dev]" + pre-commit install || true + +dev: + uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 
--port 8000 + +fmt: + ruff format src tests + ruff check --fix src tests + +lint: + ruff check src tests + ruff format --check src tests + +typecheck: + mypy src + +test: + pytest + +test-cov: + pytest --cov --cov-report=term-missing + +docker-build: + docker compose build + +docker-up: + docker compose up -d + +docker-down: + docker compose down + +clean: + rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov build dist *.egg-info + find . -type d -name __pycache__ -exec rm -rf {} + diff --git a/README.md b/README.md new file mode 100644 index 0000000..6e5558f --- /dev/null +++ b/README.md @@ -0,0 +1,123 @@ +# OCR Sprint Service + +OCR + structured extraction service for Indonesian police "surat sprint" (surat perintah) documents. Built around **FastAPI + PaddleOCR + hybrid extraction (regex → LLM lokal → validation)** with **on-premise** deployment as a hard requirement. + +> **Status:** Phase 1 MVP — synchronous PDF/image OCR with regex header extraction, validation, and confidence scoring. Phase 2–6 (document detection, table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md). + +## Why this stack + +- **PaddleOCR** is the strongest open-source OCR for mixed-language documents and runs fully on-prem (essential for police data). +- **PP-Structure** (Phase 3) handles personnel tables natively. +- **Regex-first, LLM-fallback extraction** keeps deterministic fields fast and predictable while letting an LLM handle format drift across Polri units. +- **CPU-friendly defaults**: a small (1.5B–4B) local LLM via Ollama is the recommended default; the architecture is also GPU-ready. + +See [`docs/architecture.md`](docs/architecture.md) for the full architecture, accuracy expectations, and roadmap. 
+ +## Quickstart + +### Prerequisites + +- Python **3.10–3.12** +- ~3 GB free disk for PaddleOCR model downloads on first run +- Linux/macOS recommended (Windows works but PaddleOCR install can be finicky) + +### Install (local dev) + +```bash +git clone https://github.com/Adriankf59/ocr-sprint-service.git +cd ocr-sprint-service + +python -m venv .venv && source .venv/bin/activate +make install # installs runtime + dev deps + pre-commit +cp .env.example .env # edit if you need GPU / different storage path +``` + +### Run the API + +```bash +make dev +# → http://localhost:8000/docs +``` + +### Try it out + +```bash +curl -F "file=@samples/pdf/example.pdf" http://localhost:8000/api/v1/documents | jq +``` + +Expected response (truncated): + +```json +{ + "job_id": "8f2a...", + "status": "completed", + "confidence": 0.93, + "data": { + "header": { + "nomor_sprint": "Sprin/123/IV/2025/Reskrim", + "tanggal": "2025-04-21", + "satuan_penerbit": "KEPOLISIAN RESOR BANDUNG", + "perihal": "Pelaksanaan penyelidikan kasus pencurian", + "dasar": ["Undang-Undang Nomor 2 Tahun 2002 ...", "..."] + }, + "personel": [], + "ttd": { "nrp": "12345678" } + }, + "review_flags": [] +} +``` + +> **Note:** Phase 1 does not yet populate the `personel[]` table — that requires PP-Structure (Phase 3). Header fields, signatory NRP, confidence, and HITL routing are fully wired. + +### Docker + +```bash +docker compose build +docker compose up -d +docker compose logs -f api +``` + +The first request will trigger PaddleOCR to download its detection/recognition/cls models (~200 MB) into the `paddle-models` volume. + +## Development + +```bash +make fmt # format with ruff +make lint # lint +make typecheck # mypy strict mode +make test # pytest +make test-cov # pytest + coverage +``` + +Pre-commit hooks run ruff on every commit. Install once with `pre-commit install` (already done by `make install`). 
+ +## Project layout + +``` +src/ocr_sprint/ + api/ # FastAPI routes + error handlers + schemas/ # Pydantic v2 models (request/response, extraction, personnel) + pipeline/ # ingest → preprocess → ocr → extract → validate → score + extract/ # regex_rules.py (Phase 1) → llm.py (Phase 5) + data/ # master data (Polri ranks, etc.) + utils/ # logging, helpers + config.py # pydantic-settings + main.py # app factory +tests/unit/ # ~60 unit tests, no PaddleOCR dependency +docs/ # architecture & decision records +``` + +## Roadmap + +| Phase | Scope | Status | +|---|---|---| +| 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **In progress** | +| 2 | DocTR document detection + dewarping for phone photos | Planned | +| 3 | PP-Structure table extraction for personnel rows | Planned | +| 4 | Async pipeline (Celery + Redis), Postgres + MinIO, auth, observability | Planned | +| 5 | LLM hybrid extraction (Ollama + structured output) | Planned | +| 6 | HITL review endpoints + audit trail | Planned | + +## License + +Proprietary — internal use only. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..cd520ff --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,23 @@ +# Phase 1 MVP compose: API only. +# Phase 4 will add redis, postgres, minio, and worker services. +services: + api: + build: + context: . + dockerfile: Dockerfile + image: ocr-sprint-service:dev + container_name: ocr-sprint-api + ports: + - "8000:8000" + environment: + APP_ENV: local + APP_LOG_LEVEL: INFO + OCR_USE_GPU: "false" + STORAGE_LOCAL_DIR: /app/storage + volumes: + - ./storage:/app/storage + - paddle-models:/home/app/.paddleocr + restart: unless-stopped + +volumes: + paddle-models: diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..2cb7977 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,259 @@ +# Plan & Arsitektur — OCR Service Surat Sprint Kepolisian + +## 1. 
Penilaian Jujur Tech Stack yang Diusulkan + +Tech stack Anda (FastAPI + PaddleOCR + OpenCV/Pillow + Regex) **sudah bagus dan layak produksi**, tapi **belum tentu paling optimal akurasinya** untuk kasus surat sprint. Ada beberapa gap yang perlu diisi sebelum bisa disebut "terbaik". + +### Yang sudah tepat +| Komponen | Alasan | +|---|---| +| **FastAPI** | Async native, Pydantic validation, OpenAPI docs otomatis, ideal untuk ML serving. | +| **PaddleOCR (PP-OCRv4/v5)** | Salah satu OCR open-source terbaik untuk dokumen campuran teks + tabel, mendukung Latin (cocok untuk Bahasa Indonesia), bisa jalan on-premise (penting untuk dokumen kepolisian yang sensitif — **cloud OCR seperti Google Vision/AWS Textract sebaiknya dihindari** karena masalah kerahasiaan). | +| **OpenCV + Pillow** | Standar industri untuk preprocessing. | +| **Regex/rule-based** | Cocok untuk dokumen terstruktur seperti sprint yang format-nya relatif baku. | + +### Yang masih kurang / perlu ditambah + +1. **Table extraction belum tertangani** + Daftar personel di surat sprint hampir selalu berbentuk **tabel** (No, Pangkat, NRP, Nama, Jabatan, Keterangan). Regex pada teks linear dari OCR biasa **akan kacau** ketika baris tabel pecah atau kolom bergeser. Solusi: gunakan **PaddleOCR PP-Structure** (modul table recognition bawaan Paddle) atau model khusus seperti **TableTransformer (Microsoft)**. + +2. **Document detection & dewarping untuk foto HP belum eksplisit** + Foto HP bermasalah karena: perspektif miring, lipatan, bayangan, lighting tidak rata, fokus tidak merata. OpenCV crop + perspective transform manual saja sering gagal. Tambahkan: + - **Document corner detection**: `DocTR` / `MobileSAM` / model edge-based, atau heuristik kontur OpenCV sebagai fallback. + - **Dewarping**: `DocTr` / `DewarpNet` untuk halaman yang melengkung (lipatan). + - **Shadow removal**: algoritma background division atau model spesialis. + +3. 
**Strategi ekstraksi 100% regex itu rapuh** + Surat sprint dari satuan berbeda (Polda, Polres, Polsek, Mabes) punya **variasi format**: header berbeda, urutan field berbeda, kadang pangkat disingkat (`AKP`, `IPDA`) kadang ditulis penuh. Regex murni akan butuh ratusan rule dan tetap miss kasus baru. + **Rekomendasi pendekatan hybrid**: + - **Layer 1 — Regex/rule** untuk field deterministik (Nomor sprint, tanggal, dasar hukum) yang format-nya baku. + - **Layer 2 — Schema-aware extraction** menggunakan **LLM lokal** (Llama 3.1 8B / Qwen2.5 7B via Ollama atau vLLM) dengan structured output (JSON schema / Pydantic) untuk field yang variatif (jabatan, keterangan tugas). + - **Layer 3 — Validation** terhadap master data (daftar pangkat valid, format NRP 8 digit, dll). + +4. **Tidak ada confidence scoring & human-in-the-loop** + Untuk dokumen kepolisian, **akurasi 100% otomatis itu mitos**. Sistem harus: + - Mengeluarkan confidence score per field. + - Otomatis flag dokumen low-confidence untuk review manusia. + - Sediakan UI/endpoint koreksi yang feedback-nya bisa dipakai retraining. + +5. **Alternatif end-to-end yang patut dipertimbangkan** + Jika nanti volume dokumen besar dan format relatif stabil, fine-tuning model **Document Understanding** end-to-end bisa lebih akurat: + - **Donut** (OCR-free, langsung image → JSON). + - **LayoutLMv3** (kombinasi teks + layout + visual). + - **Surya OCR** (newer, sangat bagus untuk dokumen). + Untuk MVP, tetap pakai PaddleOCR. Donut/LayoutLM adalah opsi V2 setelah ada labeled dataset cukup (~500–1000 dokumen). + +### Verdict +Stack Anda **bisa mencapai ~85–92% akurasi field-level** untuk surat sprint dengan kualitas scan baik, dan **~70–80%** untuk foto HP, **kalau ditambah** komponen di atas. Tanpa table extraction + dewarping + hybrid extraction, akurasinya akan jatuh di kondisi nyata. + +--- + +## 2. 
Arsitektur yang Direkomendasikan + +### 2.1 Diagram Logis + +``` +┌────────────────────────────────────────────────────────────────────┐ +│ Client (Web/Mobile) │ +└──────────────────────────────┬─────────────────────────────────────┘ + │ HTTPS (multipart upload) + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ FastAPI Gateway (stateless) │ +│ - Auth (JWT/API key) - Rate limit - Request validation │ +└──────────────────────────────┬─────────────────────────────────────┘ + │ enqueue job + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ Job Queue (Redis + Celery / RQ / Dramatiq) │ +└──────────────────────────────┬─────────────────────────────────────┘ + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ OCR Worker Pipeline (GPU/CPU) │ +│ ┌────────────┐ ┌──────────────┐ ┌───────────┐ ┌────────────┐ │ +│ │ 1. Ingest │→ │ 2. Preproc │→ │ 3. OCR + │→ │ 4. Extract │ │ +│ │ & detect │ │ (deskew, │ │ Layout │ │ (regex + │ │ +│ │ PDF/IMG │ │ dewarp, │ │ PP-Struct│ │ LLM + │ │ +│ │ │ │ denoise) │ │ + Table) │ │ validate) │ │ +│ └────────────┘ └──────────────┘ └───────────┘ └─────┬──────┘ │ +│ │ │ +│ ┌──────────────────────────────┘ │ +│ ▼ │ +│ ┌─────────────┐ │ +│ │ 5. Confidence│ → low conf? 
flag for review │ +│ │ scoring │ │ +│ └──────┬───────┘ │ +└──────────────────────────┼─────────────────────────────────────────┘ + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ Storage: PostgreSQL (metadata) + MinIO/S3 (file) │ +│ + Vector store opsional (untuk dedup / search) │ +└────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ Review UI (optional) — koreksi manual + audit trail │ +└────────────────────────────────────────────────────────────────────┘ +``` + +### 2.2 Pipeline Detail per Tahap + +**Tahap 1 — Ingest & Document Detection** +- PDF: render setiap halaman jadi image @ 300 DPI (`pdf2image` / `PyMuPDF`). +- Image (foto HP): deteksi sudut dokumen → crop → perspective transform. + - Library: OpenCV `findContours` (cepat) sebagai fallback, **DocTR document detector** (lebih akurat) sebagai utama. + +**Tahap 2 — Preprocessing** +- Deskew (rotation correction) — Hough transform atau model. +- Dewarp (untuk foto buku/lipatan) — `DewarpNet` atau model RNN. +- Adaptive thresholding (untuk foto dengan lighting tidak rata). +- Shadow removal (background division). +- Denoise (Non-Local Means). +- Resize ke ukuran optimal OCR (~1500–2500 px sisi panjang). + +**Tahap 3 — OCR + Layout Analysis** +- **PaddleOCR PP-Structure** dijalankan sekali → menghasilkan: + - Bounding boxes + teks + confidence per word/line. + - Table region detection + table-to-HTML/JSON. + - Layout type per region (title, paragraph, table, figure). +- Output ditampung sebagai struktur intermediate (mirip hOCR / ALTO XML). + +**Tahap 4 — Information Extraction** +- **4a. Header parsing (regex)**: Nomor sprint, tanggal, satuan penerbit, dasar hukum, perihal. Format relatif baku → regex sangat cocok. +- **4b. Personnel table extraction**: ambil dari hasil PP-Structure table → mapping kolom (Pangkat, NRP, Nama, Jabatan, Keterangan). +- **4c. 
LLM fallback**: untuk field yang regex/table miss, kirim chunk teks + JSON schema ke LLM lokal (Ollama / vLLM) dengan **structured output** (Pydantic via `outlines` / `instructor`). +- **4d. Validation layer**: + - NRP: 8 digit numerik. + - Pangkat: harus ada di daftar master pangkat Polri. + - Tanggal: parse + sanity check. + - Cross-check: jumlah personel di body = jumlah baris tabel. + +**Tahap 5 — Confidence Scoring & Routing** +- Aggregate confidence: weighted average dari OCR confidence + validation pass/fail + LLM logprob (kalau pakai). +- Threshold (mis. < 0.85) → status `NEEDS_REVIEW`. +- Threshold tinggi (≥ 0.95) + semua validasi pass → status `AUTO_APPROVED`. + +### 2.3 API Endpoint (FastAPI) + +``` +POST /api/v1/documents # upload, kembalikan job_id +GET /api/v1/documents/{job_id} # poll status + hasil +GET /api/v1/documents/{job_id}/raw # raw OCR output (debug) +PATCH /api/v1/documents/{job_id} # koreksi manual (HITL) +GET /api/v1/health # liveness +GET /api/v1/metrics # Prometheus +``` + +Response shape (contoh): +```json +{ + "job_id": "uuid", + "status": "completed | processing | needs_review | failed", + "confidence": 0.92, + "data": { + "nomor_sprint": "Sprin/123/IV/2025", + "tanggal": "2025-04-21", + "satuan_penerbit": "Polres Bandung", + "dasar": ["...", "..."], + "perihal": "...", + "personel": [ + {"no": 1, "pangkat": "AKP", "nrp": "12345678", "nama": "...", "jabatan": "Kasat Reskrim", "confidence": 0.97}, + ... 
+ ], + "ttd": {"pejabat": "...", "pangkat": "...", "nrp": "..."} + }, + "review_flags": [] +} +``` + +### 2.4 Tech Stack Final yang Direkomendasikan + +| Layer | Pilihan | Catatan | +|---|---|---| +| API | **FastAPI** + Uvicorn/Gunicorn | sesuai usulan | +| Validation | **Pydantic v2** | wajib | +| Queue | **Redis + Celery** atau **Dramatiq** | OCR berat, jangan blocking request | +| OCR | **PaddleOCR PP-OCRv4 + PP-Structure** | tambah PP-Structure untuk tabel | +| Preprocessing | **OpenCV + Pillow** + **DocTR** (detection) | DocTR untuk foto HP | +| Extraction | **Regex + Ollama (Llama 3.1 8B / Qwen2.5 7B)** + **instructor/outlines** | hybrid | +| Storage | **PostgreSQL** (metadata) + **MinIO** (file blob) | self-hosted, sesuai compliance | +| Observability | **Prometheus + Grafana + Loki** | wajib produksi | +| Container | **Docker + docker-compose** (dev) → **Kubernetes** (prod) | | +| GPU | NVIDIA T4/A10 (1× cukup untuk MVP) | PaddleOCR jauh lebih cepat di GPU | + +--- + +## 3. Roadmap Pengembangan (Bertahap) + +### Fase 0 — Persiapan (1 minggu) +- Kumpulkan **dataset sampel**: minimal 50 surat sprint (campur PDF scan + foto HP) dari beragam satuan. +- Buat **ground truth labelling** untuk 20 dokumen (untuk evaluasi). +- Definisikan **schema output final** (JSON) bersama stakeholder. + +### Fase 1 — MVP Pipeline Sinkron (2 minggu) +- Setup FastAPI skeleton + Pydantic schemas. +- Integrasi PaddleOCR PP-OCRv4 (CPU dulu, GPU menyusul). +- Preprocessing dasar: deskew + denoise + resize. +- Regex extraction untuk field header. +- Endpoint sinkron `POST /documents` (untuk dev/testing saja). +- **Evaluasi akurasi** terhadap 20 ground truth. + +### Fase 2 — Robustness untuk Foto HP (2 minggu) +- Integrasi document detection (DocTR atau OpenCV contour). +- Perspective transform + dewarping. +- Shadow removal. +- Re-evaluasi akurasi pada subset foto HP. + +### Fase 3 — Table Extraction (1.5 minggu) +- Integrasi PP-Structure untuk personnel table. 
+- Mapping kolom + validation (NRP, pangkat). +- Master data tabel pangkat Polri. + +### Fase 4 — Async + Production Ready (1.5 minggu) +- Pindahkan ke arsitektur async dengan Celery + Redis. +- Storage MinIO + PostgreSQL. +- Auth, rate limit, logging, metrics. +- Docker compose untuk deployment. + +### Fase 5 — LLM Hybrid Extraction (2 minggu) +- Setup Ollama / vLLM dengan model lokal. +- Structured output via `instructor`. +- Confidence scoring + routing ke review. + +### Fase 6 — HITL Review UI (opsional, 2 minggu) +- Endpoint koreksi. +- Simple web UI (Next.js) untuk reviewer. +- Audit trail & feedback loop. + +### Fase 7 — Optimasi Lanjutan (ongoing) +- Fine-tune PaddleOCR detection/recognition pada dataset internal. +- Eksplorasi Donut/LayoutLMv3 jika dataset sudah cukup. +- Batch processing & GPU optimization. + +**Total estimasi MVP fungsional (Fase 1–4): ~7 minggu** dengan 1 backend engineer + 1 ML engineer. + +--- + +## 4. Risiko & Mitigasi + +| Risiko | Mitigasi | +|---|---| +| Data sensitif (kepolisian) bocor | Wajib on-prem; tidak ada cloud OCR; enkripsi at-rest (LUKS/pgcrypto) + in-transit (mTLS); audit log lengkap. | +| Variasi format antar satuan | Hybrid extraction (regex + LLM); kumpulkan sample dari banyak satuan sejak awal. | +| Foto HP kualitas buruk | Validasi kualitas image di client (resolusi minimal, blur detection) sebelum upload. | +| Akurasi tidak sampai target | HITL review wajib untuk dokumen low-confidence; jangan deploy fully-automatic. | +| Tanggung jawab hukum atas hasil OCR | Selalu simpan original document + flag bahwa hasil ekstraksi adalah "draft, perlu verifikasi manusia". | + +--- + +## 5. Pertanyaan Sebelum Implementasi + +Sebelum saya lanjut ke implementasi, mohon konfirmasi: + +1. **Volume**: berapa dokumen/hari yang ditargetkan? (mempengaruhi pilihan async vs sync, GPU vs CPU) +2. **Deployment target**: on-prem mutlak, atau private cloud (GovCloud) boleh? +3. 
**Source dokumen**: apakah ada akses ke 20–50 sample surat sprint untuk dijadikan dataset awal? +4. **Integrasi**: service ini akan dipanggil sistem apa? (mempengaruhi auth & API contract) +5. **HITL**: apakah ada SDM untuk review manual dokumen low-confidence? +6. **Hardware**: sudah ada server GPU, atau perlu sizing rekomendasi? +7. **Format output final**: ada schema yang sudah dipakai sistem downstream? diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4ae79a8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,136 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "ocr-sprint-service" +version = "0.1.0" +description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)" +readme = "README.md" +requires-python = ">=3.10,<3.13" +license = { text = "Proprietary" } +authors = [{ name = "Adrian Kuman Firmansah" }] + +dependencies = [ + # Web framework + "fastapi>=0.115,<0.116", + "uvicorn[standard]>=0.30,<0.34", + "python-multipart>=0.0.9", + "pydantic>=2.7,<3", + "pydantic-settings>=2.4,<3", + # Image / PDF + "pillow>=10.4,<12", + "opencv-python-headless>=4.10,<5", + "numpy>=1.26,<2.2", + "PyMuPDF>=1.24,<2", + "python-magic>=0.4.27", + # OCR (CPU build of paddle; GPU users override via extra index) + "paddlepaddle==2.6.1", + "paddleocr>=2.7.5,<3", + # Logging / observability + "structlog>=24.1", + "prometheus-client>=0.20", + # Misc + "httpx>=0.27", + "tenacity>=8.5", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.2", + "pytest-asyncio>=0.23", + "pytest-cov>=5.0", + "ruff>=0.6.9", + "mypy>=1.11", + "types-Pillow", + "pre-commit>=3.7", +] + +# Extraction layer (Phase 5) — kept optional so MVP install stays light +llm = [ + "ollama>=0.3", + "instructor>=1.4", +] + +# Async pipeline (Phase 4) +async-pipeline = [ + "celery[redis]>=5.4", + "redis>=5.0", + "minio>=7.2", + "sqlalchemy>=2.0", + "psycopg[binary]>=3.2", 
+ "alembic>=1.13", +] + +[project.scripts] +ocr-sprint-api = "ocr_sprint.main:run" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +"ocr_sprint" = ["py.typed"] + +# ---------- Tooling ---------- + +[tool.ruff] +line-length = 100 +target-version = "py310" +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", "F", "W", # pycodestyle / pyflakes + "I", # isort + "B", # bugbear + "UP", # pyupgrade + "SIM", # simplify + "RUF", # ruff-specific + "C4", # comprehensions + "PIE", + "PT", # pytest style + "TID", # tidy imports +] +ignore = [ + "E501", # line length handled by formatter + "B008", # FastAPI Depends() pattern +] + +[tool.ruff.format] +quote-style = "double" + +[tool.mypy] +python_version = "3.10" +strict = true +warn_unused_ignores = true +warn_redundant_casts = true +disallow_untyped_defs = true +plugins = ["pydantic.mypy"] +mypy_path = "src" +namespace_packages = true +explicit_package_bases = true + +[[tool.mypy.overrides]] +module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"] +ignore_missing_imports = true + +[tool.pytest.ini_options] +minversion = "8.0" +addopts = "-ra --strict-markers --strict-config" +testpaths = ["tests"] +asyncio_mode = "auto" +filterwarnings = [ + "ignore::DeprecationWarning:paddle.*", + "ignore::DeprecationWarning:paddleocr.*", +] + +[tool.coverage.run] +source = ["src/ocr_sprint"] +branch = true + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "raise NotImplementedError", + "if TYPE_CHECKING:", +] diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 0000000..281c7e7 --- /dev/null +++ b/samples/README.md @@ -0,0 +1,13 @@ +# Samples + +Drop sample surat sprint files here for local testing. **Do NOT commit real documents** — `.gitignore` only excludes common scan/image extensions placed directly in `samples/` (patterns such as `samples/*.pdf`); files inside the subfolders below are NOT matched by those patterns, so check `git status` before every commit. 
+ +Recommended layout: +``` +samples/ + pdf/ # PDF scans + photo/ # phone photos + ground_truth/ # JSON ground-truth labels for evaluation +``` + +For sharing real samples with the team, use the project's secured storage (MinIO/S3 once Phase 4 is live), not git. diff --git a/src/ocr_sprint/__init__.py b/src/ocr_sprint/__init__.py new file mode 100644 index 0000000..711ef75 --- /dev/null +++ b/src/ocr_sprint/__init__.py @@ -0,0 +1,3 @@ +"""OCR Sprint Service — extract structured data from Indonesian police 'surat sprint'.""" + +__version__ = "0.1.0" diff --git a/src/ocr_sprint/api/__init__.py b/src/ocr_sprint/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ocr_sprint/api/errors.py b/src/ocr_sprint/api/errors.py new file mode 100644 index 0000000..81dd321 --- /dev/null +++ b/src/ocr_sprint/api/errors.py @@ -0,0 +1,43 @@ +"""HTTP error handlers.""" + +from __future__ import annotations + +from fastapi import FastAPI, Request, status +from fastapi.responses import JSONResponse + +from ocr_sprint.utils.logging import get_logger + +_logger = get_logger(__name__) + + +class OCRServiceError(Exception): + """Base class for application errors that should map to a 4xx response.""" + + http_status: int = status.HTTP_400_BAD_REQUEST + + +class UnsupportedDocumentError(OCRServiceError): + """Uploaded file is neither a PDF nor a recognized image format.""" + + +class JobNotFoundError(OCRServiceError): + http_status = status.HTTP_404_NOT_FOUND + + +def register_error_handlers(app: FastAPI) -> None: + """Wire OCRServiceError + a final fallback for unexpected exceptions.""" + + @app.exception_handler(OCRServiceError) + async def _ocr_error_handler(_: Request, exc: OCRServiceError) -> JSONResponse: + return JSONResponse( + status_code=exc.http_status, + content={"error": exc.__class__.__name__, "message": str(exc)}, + ) + + @app.exception_handler(Exception) + async def _unexpected_handler(_: Request, exc: Exception) -> JSONResponse: + 
_logger.exception("api.unhandled_exception", error=str(exc)) + return JSONResponse( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + content={"error": "InternalServerError", "message": "Unexpected error"}, + ) diff --git a/src/ocr_sprint/api/routes/__init__.py b/src/ocr_sprint/api/routes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ocr_sprint/api/routes/documents.py b/src/ocr_sprint/api/routes/documents.py new file mode 100644 index 0000000..26dd6eb --- /dev/null +++ b/src/ocr_sprint/api/routes/documents.py @@ -0,0 +1,58 @@ +"""Documents API — Phase 1 synchronous endpoint. + +POST /documents accepts a single PDF or image upload, runs the synchronous +pipeline inline, and returns the structured result. This is suitable for +development and low-traffic production; Phase 4 will introduce an async +queue and a polling-style API at the same path. +""" + +from __future__ import annotations + +from uuid import uuid4 + +from fastapi import APIRouter, File, UploadFile, status + +from ocr_sprint.api.errors import UnsupportedDocumentError +from ocr_sprint.pipeline.orchestrator import run_pipeline +from ocr_sprint.schemas.document import DocumentResponse +from ocr_sprint.utils.logging import get_logger + +router = APIRouter(prefix="/documents", tags=["documents"]) +_logger = get_logger(__name__) + +_MAX_UPLOAD_BYTES = 25 * 1024 * 1024 # 25 MB + + +@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse) +async def create_document(file: UploadFile = File(...)) -> DocumentResponse: + """Run OCR + extraction synchronously on a single upload.""" + job_id = uuid4() + log = _logger.bind(job_id=str(job_id), filename=file.filename or "") + + content = await file.read() + if not content: + raise UnsupportedDocumentError("Uploaded file is empty.") + if len(content) > _MAX_UPLOAD_BYTES: + raise UnsupportedDocumentError( + f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit." 
+ ) + + log.info("documents.received", size=len(content)) + try: + output = run_pipeline(content) + except ValueError as exc: + raise UnsupportedDocumentError(str(exc)) from exc + + log.info( + "documents.completed", + status=output.status.value, + confidence=round(output.confidence, 3), + flags=[f.value for f in output.result.review_flags], + ) + return DocumentResponse( + job_id=job_id, + status=output.status, + confidence=output.confidence, + data=output.result, + review_flags=[f.value for f in output.result.review_flags], + ) diff --git a/src/ocr_sprint/api/routes/health.py b/src/ocr_sprint/api/routes/health.py new file mode 100644 index 0000000..7a01b81 --- /dev/null +++ b/src/ocr_sprint/api/routes/health.py @@ -0,0 +1,15 @@ +"""Liveness / readiness endpoints.""" + +from __future__ import annotations + +from fastapi import APIRouter + +from ocr_sprint import __version__ + +router = APIRouter(tags=["health"]) + + +@router.get("/health") +async def health() -> dict[str, str]: + """Lightweight liveness check — does NOT touch the OCR engine.""" + return {"status": "ok", "version": __version__} diff --git a/src/ocr_sprint/config.py b/src/ocr_sprint/config.py new file mode 100644 index 0000000..18a955c --- /dev/null +++ b/src/ocr_sprint/config.py @@ -0,0 +1,72 @@ +"""Application settings loaded from environment / .env file.""" + +from __future__ import annotations + +from functools import lru_cache +from pathlib import Path + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Runtime configuration. 
Override via environment variables or a .env file.""" + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + extra="ignore", + ) + + # App + app_env: str = "local" + app_host: str = "0.0.0.0" + app_port: int = 8000 + app_log_level: str = "INFO" + + # Storage (Phase 1: local fs) + storage_local_dir: Path = Path("./storage") + + # OCR + ocr_lang: str = "latin" + ocr_use_gpu: bool = False + ocr_det_model_dir: str | None = None + ocr_rec_model_dir: str | None = None + ocr_cls_model_dir: str | None = None + ocr_max_image_side: int = 2200 + + # Preprocessing + preprocess_target_dpi: int = 300 + preprocess_denoise: bool = True + preprocess_deskew: bool = True + preprocess_adaptive_threshold: bool = False + + # Confidence thresholds (Phase 5 routing) + confidence_auto_approve: float = Field(0.95, ge=0.0, le=1.0) + confidence_needs_review: float = Field(0.85, ge=0.0, le=1.0) + + # LLM (Phase 5) + llm_enabled: bool = False + llm_provider: str = "ollama" + llm_model: str = "qwen2.5:1.5b" + llm_base_url: str = "http://localhost:11434" + llm_timeout_s: int = 60 + + # Async pipeline (Phase 4) + queue_enabled: bool = False + redis_url: str = "redis://localhost:6379/0" + database_url: str = "postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint" + minio_endpoint: str = "localhost:9000" + minio_access_key: str = "minioadmin" + minio_secret_key: str = "minioadmin" + minio_bucket: str = "ocr-sprint" + minio_secure: bool = False + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + """Cached accessor so settings are loaded once per process.""" + settings = Settings() + settings.storage_local_dir.mkdir(parents=True, exist_ok=True) + return settings diff --git a/src/ocr_sprint/data/__init__.py b/src/ocr_sprint/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ocr_sprint/data/master_pangkat.py b/src/ocr_sprint/data/master_pangkat.py new file mode 100644 index 0000000..667b47c --- /dev/null +++ 
b/src/ocr_sprint/data/master_pangkat.py @@ -0,0 +1,66 @@ +"""Master data for Polri ranks ('pangkat'). + +Used by the validation layer to: +1. Confirm that a recognized rank string is a real Polri rank. +2. Normalize accepted variants ("akp" → "AKP", "Brig Pol" → "BRIGADIR", "KBP" → "KOMBES POL") to a canonical upper-case form. + +Source: Peraturan Kapolri tentang Pangkat (publicly available, 2024). +Update this file when ranks are reorganized. +""" + +from __future__ import annotations + +# Canonical abbreviation → list of accepted variants (case-insensitive). +PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = { + # Tamtama + "BHARADA": ("BHARADA", "BHRD"), + "BHARATU": ("BHARATU", "BHRT"), + "BHARAKA": ("BHARAKA", "BHRK"), + "ABRIP": ("ABRIP",), + "ABRIPTU": ("ABRIPTU",), + "ABRIPKA": ("ABRIPKA",), + # Bintara + "BRIPDA": ("BRIPDA",), + "BRIPTU": ("BRIPTU",), + "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"), + "BRIPKA": ("BRIPKA",), + "AIPDA": ("AIPDA",), + "AIPTU": ("AIPTU",), + # Perwira Pertama + "IPDA": ("IPDA",), + "IPTU": ("IPTU",), + "AKP": ("AKP",), + # Perwira Menengah + "KOMPOL": ("KOMPOL",), + "AKBP": ("AKBP",), + "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"), + # Perwira Tinggi + "BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"), + "IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"), + "KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"), + "JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"), +} + +# Reverse lookup: any variant (uppercased) → canonical form. +_VARIANT_TO_CANONICAL: dict[str, str] = { + variant.upper(): canonical + for canonical, variants in PANGKAT_VARIANTS.items() + for variant in variants +} + + +def normalize_pangkat(raw: str | None) -> str | None: + """Return canonical Polri rank, or None if input is empty/unknown.""" + if not raw: + return None + cleaned = " ".join(raw.strip().upper().split()) + if cleaned in _VARIANT_TO_CANONICAL: + return _VARIANT_TO_CANONICAL[cleaned] + # tolerate trailing punctuation like "AKP." 
+ stripped = cleaned.rstrip(".,;:") + return _VARIANT_TO_CANONICAL.get(stripped) + + +def is_valid_pangkat(raw: str | None) -> bool: + """True if the string maps to a known Polri rank after normalization.""" + return normalize_pangkat(raw) is not None diff --git a/src/ocr_sprint/main.py b/src/ocr_sprint/main.py new file mode 100644 index 0000000..4b5e9b1 --- /dev/null +++ b/src/ocr_sprint/main.py @@ -0,0 +1,42 @@ +"""FastAPI entrypoint.""" + +from __future__ import annotations + +from fastapi import FastAPI + +from ocr_sprint import __version__ +from ocr_sprint.api.errors import register_error_handlers +from ocr_sprint.api.routes import documents, health +from ocr_sprint.config import get_settings +from ocr_sprint.utils.logging import configure_logging + + +def create_app() -> FastAPI: + """Application factory — keeps top-level state easy to test.""" + settings = get_settings() + configure_logging(settings.app_log_level) + + app = FastAPI( + title="OCR Sprint Service", + version=__version__, + description="OCR + structured extraction for Indonesian police 'surat sprint' documents.", + docs_url="/docs", + redoc_url="/redoc", + openapi_url="/openapi.json", + ) + + register_error_handlers(app) + app.include_router(health.router, prefix="/api/v1") + app.include_router(documents.router, prefix="/api/v1") + return app + + +app = create_app() + + +def run() -> None: + """Console-script entrypoint (`ocr-sprint-api`).""" + import uvicorn + + s = get_settings() + uvicorn.run("ocr_sprint.main:app", host=s.app_host, port=s.app_port, reload=False) diff --git a/src/ocr_sprint/pipeline/__init__.py b/src/ocr_sprint/pipeline/__init__.py new file mode 100644 index 0000000..e389d04 --- /dev/null +++ b/src/ocr_sprint/pipeline/__init__.py @@ -0,0 +1 @@ +"""OCR pipeline: ingest → preprocess → OCR → extract → validate.""" diff --git a/src/ocr_sprint/pipeline/confidence.py b/src/ocr_sprint/pipeline/confidence.py new file mode 100644 index 0000000..d046a36 --- /dev/null +++ 
b/src/ocr_sprint/pipeline/confidence.py @@ -0,0 +1,51 @@ +"""Confidence scoring + routing decision. + +The score is a weighted blend of: + - mean OCR confidence across all detected lines + - validation pass rate (1.0 if no review flags, decreases per flag) + +This is intentionally simple for Phase 1; Phase 5 will add LLM logprob +contributions and per-field confidences. +""" + +from __future__ import annotations + +from ocr_sprint.config import get_settings +from ocr_sprint.schemas.document import DocumentStatus +from ocr_sprint.schemas.extraction import ReviewFlag + +# Per-flag penalty applied to the validation component of the score. +_FLAG_PENALTY: dict[ReviewFlag, float] = { + ReviewFlag.LOW_OCR_CONFIDENCE: 0.10, + ReviewFlag.MISSING_FIELD: 0.20, + ReviewFlag.INVALID_NRP: 0.10, + ReviewFlag.UNKNOWN_PANGKAT: 0.05, + ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15, + ReviewFlag.DATE_PARSE_FAILED: 0.10, +} + +OCR_WEIGHT = 0.6 +VALIDATION_WEIGHT = 0.4 + + +def compute_confidence( + ocr_confidence: float, + flags: list[ReviewFlag], +) -> float: + """Blend OCR confidence with validation penalties into a single 0-1 score.""" + validation_score = 1.0 + for flag in flags: + validation_score -= _FLAG_PENALTY.get(flag, 0.05) + validation_score = max(0.0, validation_score) + blended = OCR_WEIGHT * ocr_confidence + VALIDATION_WEIGHT * validation_score + return max(0.0, min(1.0, blended)) + + +def route(confidence: float) -> DocumentStatus: + """Map a final confidence score onto the job's terminal status.""" + s = get_settings() + if confidence >= s.confidence_auto_approve: + return DocumentStatus.COMPLETED + if confidence >= s.confidence_needs_review: + return DocumentStatus.NEEDS_REVIEW + return DocumentStatus.NEEDS_REVIEW # below review threshold also goes to humans diff --git a/src/ocr_sprint/pipeline/extract/__init__.py b/src/ocr_sprint/pipeline/extract/__init__.py new file mode 100644 index 0000000..b19f4f7 --- /dev/null +++ b/src/ocr_sprint/pipeline/extract/__init__.py @@ 
-0,0 +1 @@ +"""Information extraction layer (regex Phase 1, LLM Phase 5).""" diff --git a/src/ocr_sprint/pipeline/extract/regex_rules.py b/src/ocr_sprint/pipeline/extract/regex_rules.py new file mode 100644 index 0000000..88e594f --- /dev/null +++ b/src/ocr_sprint/pipeline/extract/regex_rules.py @@ -0,0 +1,169 @@ +"""Regex-based extraction for the deterministic header fields of a surat sprint. + +Targets header fields whose layout is highly standardized across Polri units: + + - Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim" + - Tanggal (date the sprint was issued) + - Satuan penerbit (issuing unit) + - Perihal + - Dasar (numbered list of legal/operational basis) + +Personnel table extraction is intentionally NOT done here — that needs +PP-Structure + cell-aware logic and will live in `pipeline/table.py` (Phase 3). +""" + +from __future__ import annotations + +import re +from datetime import date + +from ocr_sprint.schemas.extraction import HeaderFields, Signatory + +# ---------- regex patterns ---------- + +# Nomor sprint, tolerant of spacing and OCR noise. NOTE(review): the optional trailing group ends in a lazy "+?" with nothing after it, so at most one character of the unit suffix is captured — confirm intended. +# Examples it should match: +# "Sprin / 123 / IV / 2025 / Reskrim" +# "SPRIN/345/X/2024" +# "Nomor : Sprin/12/I/2025/Sat Intelkam" +_RE_NOMOR_SPRINT = re.compile( + r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}(?:[\s./-]*[\w .-]+?)?", + re.IGNORECASE, +) + +# Indonesian month names. +_BULAN_MAP: dict[str, int] = { + "JANUARI": 1, + "FEBRUARI": 2, + "MARET": 3, + "APRIL": 4, + "MEI": 5, + "JUNI": 6, + "JULI": 7, + "AGUSTUS": 8, + "SEPTEMBER": 9, + "OKTOBER": 10, + "NOVEMBER": 11, + "DESEMBER": 12, +} + +# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025" +_RE_TANGGAL_ID = re.compile( + r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP.keys()) + r")\s*[-./\s]\s*(\d{4})\b", + re.IGNORECASE, +) + +# Satuan penerbit usually appears in the document letterhead, prefixed by +# KEPOLISIAN plus its level (NEGARA REPUBLIK INDONESIA, DAERAH, RESOR/RESORT, SEKTOR) and the unit name. 
+_RE_SATUAN = re.compile( + r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)" + r"[^\n]{0,80}", + re.IGNORECASE, +) + +# "Perihal : ...." up to end of line. +_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE) + +# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..." +_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$") + +# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits. +_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE) + + +def find_nomor_sprint(text: str) -> str | None: + """Return the first nomor sprint found, normalized (no extra spaces).""" + match = _RE_NOMOR_SPRINT.search(text) + if not match: + return None + return " ".join(match.group(0).split()) + + +def find_tanggal(text: str) -> date | None: + """Find the issuance date. + + Surat sprint typically contains multiple dates: one or more in the 'Dasar' + section (citing prior documents) and one near the signatory at the bottom + (the actual issuance date, usually formatted as 'Tempat, DD Month YYYY'). + We prefer the **last** date in the document since the issuance date appears + after the dasar items in the standard layout. + """ + matches = list(_RE_TANGGAL_ID.finditer(text)) + if not matches: + return None + last = matches[-1] + day_s, bulan, year_s = last.group(1), last.group(2).upper(), last.group(3) + try: + return date(int(year_s), _BULAN_MAP[bulan], int(day_s)) + except (KeyError, ValueError): + return None + + +def find_satuan(text: str) -> str | None: + """Return the first letterhead match (issuing unit), normalized.""" + match = _RE_SATUAN.search(text) + if not match: + return None + return " ".join(match.group(0).split()) + + +def find_perihal(text: str) -> str | None: + """Return the first 'Perihal: ...' 
line, trimmed to that line only.""" + for line in text.splitlines(): + m = _RE_PERIHAL.search(line) + if m: + return m.group(1).strip() + return None + + +def find_dasar_list(text: str) -> list[str]: + """Extract numbered 'Dasar' items from the text. + + Heuristic: locate a line beginning with 'DASAR' (Indonesian: "DASAR :"), skip it, and + collect subsequent lines that start with a number. Stops at the first blank line + after an item has been collected, or at a line beginning with a section header keyword. + """ + lines = text.splitlines() + items: list[str] = [] + in_dasar = False + section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL") + for raw_line in lines: + line = raw_line.strip() + if not in_dasar: + if re.match(r"^\s*DASAR\b", line, re.IGNORECASE): + in_dasar = True + continue + if not line: + if items: + break + continue + upper = line.upper() + if any(upper.startswith(term) for term in section_terminators): + break + m = _RE_DASAR_ITEM.match(line) + if m: + items.append(m.group(2).strip()) + elif items: + # continuation of the previous dasar item + items[-1] = (items[-1] + " " + line).strip() + return items + + +def find_signatory(text: str) -> Signatory: + """Best-effort extraction of the signatory block (last NRP/NIP match in the document).""" + matches = list(_RE_NRP.finditer(text)) + if not matches: + return Signatory() + last = matches[-1] + return Signatory(nrp=last.group(2)) + + +def extract_header(text: str) -> HeaderFields: + """Run all header-level regex extractors and return a populated schema.""" + return HeaderFields( + nomor_sprint=find_nomor_sprint(text), + tanggal=find_tanggal(text), + satuan_penerbit=find_satuan(text), + perihal=find_perihal(text), + dasar=find_dasar_list(text), + ) diff --git a/src/ocr_sprint/pipeline/extract/validators.py b/src/ocr_sprint/pipeline/extract/validators.py new file mode 100644 index 0000000..14d15ef --- /dev/null +++ b/src/ocr_sprint/pipeline/extract/validators.py @@ -0,0 +1,64 @@ +"""Cross-field validation, with structured review-flag output.""" 
+ +from __future__ import annotations + +import re + +from ocr_sprint.data.master_pangkat import is_valid_pangkat +from ocr_sprint.schemas.extraction import ( + ExtractionResult, + HeaderFields, + ReviewFlag, +) +from ocr_sprint.schemas.personnel import PersonnelEntry + +# Polri NRP = 8 digits. +_RE_NRP_8 = re.compile(r"^\d{8}$") + + +def validate_nrp(nrp: str | None) -> bool: + """Return True when the value is a well-formed Polri NRP (8 digits).""" + if nrp is None: + return False + return bool(_RE_NRP_8.match(nrp.strip())) + + +def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]: + """Inspect a single personnel row and return any review flags it triggers.""" + flags: list[ReviewFlag] = [] + if entry.nrp and not validate_nrp(entry.nrp): + flags.append(ReviewFlag.INVALID_NRP) + if entry.pangkat and not is_valid_pangkat(entry.pangkat): + flags.append(ReviewFlag.UNKNOWN_PANGKAT) + return flags + + +def validate_header(header: HeaderFields) -> list[ReviewFlag]: + """Flag missing required fields or unparseable dates in the header.""" + flags: list[ReviewFlag] = [] + if header.nomor_sprint is None: + flags.append(ReviewFlag.MISSING_FIELD) + if header.tanggal is None: + flags.append(ReviewFlag.DATE_PARSE_FAILED) + return flags + + +def validate_extraction( + result: ExtractionResult, + expected_personnel_count: int | None = None, +) -> list[ReviewFlag]: + """Run all validators across the full extraction and dedupe the flags.""" + flags: list[ReviewFlag] = [] + flags.extend(validate_header(result.header)) + for entry in result.personel: + flags.extend(validate_personnel_entry(entry)) + if expected_personnel_count is not None and expected_personnel_count != len(result.personel): + flags.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH) + # dedupe while preserving order + seen: set[ReviewFlag] = set() + deduped: list[ReviewFlag] = [] + for flag in flags: + if flag not in seen: + seen.add(flag) + deduped.append(flag) + return deduped diff --git 
a/src/ocr_sprint/pipeline/ingest.py b/src/ocr_sprint/pipeline/ingest.py new file mode 100644 index 0000000..0fd2db3 --- /dev/null +++ b/src/ocr_sprint/pipeline/ingest.py @@ -0,0 +1,81 @@ +"""Ingest layer: convert uploaded bytes (PDF/IMG) into a list of numpy images.""" + +from __future__ import annotations + +import io +from dataclasses import dataclass +from typing import Any + +import fitz # PyMuPDF +import numpy as np +from PIL import Image + +from ocr_sprint.schemas.document import SourceKind + +# Generic alias used across the pipeline. We don't constrain dtype/shape because +# OpenCV operations accept multiple dtypes and numpy generics are still rough. +NDArrayU8 = np.ndarray[Any, Any] + +PDF_MAGIC = b"%PDF-" +PNG_MAGIC = b"\x89PNG\r\n\x1a\n" +JPEG_MAGIC = b"\xff\xd8\xff" +TIFF_MAGIC_LE = b"II*\x00" +TIFF_MAGIC_BE = b"MM\x00*" + + +@dataclass(frozen=True) +class IngestedPage: + """One page worth of image data ready for preprocessing.""" + + image: NDArrayU8 # HxWx3 BGR uint8 (OpenCV convention) + page_index: int + + +def detect_source_kind(content: bytes) -> SourceKind: + """Best-effort sniff of an uploaded payload.""" + if content.startswith(PDF_MAGIC): + return SourceKind.PDF + if content.startswith((PNG_MAGIC, JPEG_MAGIC, TIFF_MAGIC_LE, TIFF_MAGIC_BE)): + return SourceKind.IMAGE + return SourceKind.UNKNOWN + + +def _pil_to_bgr(img: Image.Image) -> NDArrayU8: + """Convert PIL image to OpenCV BGR numpy array.""" + if img.mode != "RGB": + img = img.convert("RGB") + arr = np.asarray(img, dtype=np.uint8) + # RGB to BGR + return arr[:, :, ::-1].copy() + + +def ingest_pdf(content: bytes, target_dpi: int = 300) -> list[IngestedPage]: + """Render every page of a PDF to a numpy image at the target DPI. + + Uses PyMuPDF (no poppler dependency). DPI is enforced via a transform matrix: + fitz's default is 72 DPI, so the zoom factor is target_dpi / 72. 
+ """ + pages: list[IngestedPage] = [] + zoom = target_dpi / 72.0 + matrix = fitz.Matrix(zoom, zoom) + with fitz.open(stream=content, filetype="pdf") as doc: + for idx, page in enumerate(doc): + pix = page.get_pixmap(matrix=matrix, alpha=False) + img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples) + pages.append(IngestedPage(image=_pil_to_bgr(img), page_index=idx)) + return pages + + +def ingest_image(content: bytes) -> list[IngestedPage]: + """Decode a single image into a one-element page list.""" + img = Image.open(io.BytesIO(content)) + return [IngestedPage(image=_pil_to_bgr(img), page_index=0)] + + +def ingest(content: bytes, kind: SourceKind, target_dpi: int = 300) -> list[IngestedPage]: + """Dispatch to the right loader based on declared source kind.""" + if kind == SourceKind.PDF: + return ingest_pdf(content, target_dpi=target_dpi) + if kind == SourceKind.IMAGE: + return ingest_image(content) + raise ValueError(f"Unsupported source kind: {kind}") diff --git a/src/ocr_sprint/pipeline/ocr.py b/src/ocr_sprint/pipeline/ocr.py new file mode 100644 index 0000000..f5874de --- /dev/null +++ b/src/ocr_sprint/pipeline/ocr.py @@ -0,0 +1,106 @@ +"""PaddleOCR wrapper. + +PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load), +so we keep a process-global instance behind a lazy accessor. + +The wrapper exposes a small, stable surface so the rest of the pipeline does +not depend directly on paddleocr's evolving API. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from threading import Lock +from typing import TYPE_CHECKING + +import numpy as np + +from ocr_sprint.config import get_settings +from ocr_sprint.pipeline.ingest import NDArrayU8 +from ocr_sprint.utils.logging import get_logger + +if TYPE_CHECKING: + from paddleocr import PaddleOCR + +_logger = get_logger(__name__) +_lock = Lock() +_instance: PaddleOCR | None = None + + +@dataclass(frozen=True) +class OCRLine: + """One recognized line with its bounding polygon and confidence.""" + + text: str + confidence: float + box: tuple[tuple[float, float], ...] # 4 (x, y) corner points + + +@dataclass(frozen=True) +class OCRPage: + """OCR output for a single page.""" + + lines: list[OCRLine] + + @property + def text(self) -> str: + """Reconstruct page text by concatenating lines (order = paddle's output order).""" + return "\n".join(line.text for line in self.lines) + + @property + def mean_confidence(self) -> float: + if not self.lines: + return 0.0 + return float(np.mean([line.confidence for line in self.lines])) + + +def _build_paddleocr() -> PaddleOCR: + from paddleocr import PaddleOCR + + s = get_settings() + kwargs: dict[str, object] = { + "lang": s.ocr_lang, + "use_angle_cls": True, + "use_gpu": s.ocr_use_gpu, + "show_log": False, + } + if s.ocr_det_model_dir: + kwargs["det_model_dir"] = s.ocr_det_model_dir + if s.ocr_rec_model_dir: + kwargs["rec_model_dir"] = s.ocr_rec_model_dir + if s.ocr_cls_model_dir: + kwargs["cls_model_dir"] = s.ocr_cls_model_dir + _logger.info("paddleocr.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu) + return PaddleOCR(**kwargs) + + +def get_ocr() -> PaddleOCR: + """Lazy, thread-safe singleton accessor for the PaddleOCR engine.""" + global _instance + if _instance is None: + with _lock: + if _instance is None: + _instance = _build_paddleocr() + return _instance + + +def run_ocr(image: NDArrayU8) -> OCRPage: + """Run OCR on a single BGR image and return a structured 
page result.""" + engine = get_ocr() + raw = engine.ocr(image, cls=True) + # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image. + if not raw or raw[0] is None: + return OCRPage(lines=[]) + page_raw = raw[0] + lines: list[OCRLine] = [] + for item in page_raw: + if not item or len(item) < 2: + continue + box_raw, text_conf = item[0], item[1] + text, conf = text_conf[0], float(text_conf[1]) + try: + box = tuple((float(p[0]), float(p[1])) for p in box_raw) + except (TypeError, ValueError, IndexError): + continue + lines.append(OCRLine(text=text, confidence=conf, box=box)) + return OCRPage(lines=lines) diff --git a/src/ocr_sprint/pipeline/orchestrator.py b/src/ocr_sprint/pipeline/orchestrator.py new file mode 100644 index 0000000..547993b --- /dev/null +++ b/src/ocr_sprint/pipeline/orchestrator.py @@ -0,0 +1,103 @@ +"""Synchronous pipeline orchestrator (Phase 1). + +Wires the individual stages together: + + bytes → ingest → preprocess → OCR → regex extract → validate → score + +Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug +in PP-Structure for tables and an LLM extractor for variant fields. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from ocr_sprint.config import get_settings +from ocr_sprint.pipeline.confidence import compute_confidence, route +from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory +from ocr_sprint.pipeline.extract.validators import validate_extraction +from ocr_sprint.pipeline.ingest import detect_source_kind, ingest +from ocr_sprint.pipeline.ocr import OCRPage, run_ocr +from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess +from ocr_sprint.schemas.document import DocumentStatus, SourceKind +from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag +from ocr_sprint.utils.logging import get_logger + +_logger = get_logger(__name__) + +# Below this OCR confidence we automatically flag for review. 
+_OCR_CONFIDENCE_FLAG_THRESHOLD = 0.80 + + +@dataclass +class PipelineOutput: + """Bundle returned by the orchestrator.""" + + source_kind: SourceKind + status: DocumentStatus + confidence: float + result: ExtractionResult + + +def run_pipeline(content: bytes) -> PipelineOutput: + """Execute the synchronous OCR + extraction pipeline on raw upload bytes.""" + s = get_settings() + + kind = detect_source_kind(content) + if kind == SourceKind.UNKNOWN: + raise ValueError("Unsupported file type — only PDF and common image formats are accepted.") + + pages = ingest(content, kind, target_dpi=s.preprocess_target_dpi) + _logger.info("pipeline.ingested", source_kind=kind.value, pages=len(pages)) + + pre_cfg = PreprocessConfig( + max_side=s.ocr_max_image_side, + denoise=s.preprocess_denoise, + deskew=s.preprocess_deskew, + adaptive_threshold=s.preprocess_adaptive_threshold, + ) + + ocr_pages: list[OCRPage] = [] + for page in pages: + cleaned = preprocess(page.image, pre_cfg) + ocr_pages.append(run_ocr(cleaned)) + + full_text = "\n".join(p.text for p in ocr_pages) + mean_ocr_conf = sum(p.mean_confidence for p in ocr_pages) / len(ocr_pages) if ocr_pages else 0.0 + + header = extract_header(full_text) + ttd = find_signatory(full_text) + + initial_flags: list[ReviewFlag] = [] + if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD: + initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE) + + result = ExtractionResult( + header=header, + personel=[], # Phase 3 will populate from PP-Structure + untuk=[], + ttd=ttd, + raw_text=full_text, + confidence=mean_ocr_conf, + review_flags=list(initial_flags), + ) + + flags = validate_extraction(result) + # merge initial OCR-confidence flag with validation flags, preserving uniqueness + seen = set(flags) + for f in initial_flags: + if f not in seen: + flags.append(f) + seen.add(f) + result.review_flags = flags + + final_conf = compute_confidence(mean_ocr_conf, flags) + result.confidence = final_conf + + status = route(final_conf) + return 
PipelineOutput( + source_kind=kind, + status=status, + confidence=final_conf, + result=result, + ) diff --git a/src/ocr_sprint/pipeline/preprocess.py b/src/ocr_sprint/pipeline/preprocess.py new file mode 100644 index 0000000..c694702 --- /dev/null +++ b/src/ocr_sprint/pipeline/preprocess.py @@ -0,0 +1,108 @@ +"""Image preprocessing for OCR. + +Phase 1 implements the "always-on" steps that work for both clean PDF scans +and reasonable phone photos: + + - resize to a reasonable max side (PaddleOCR runs faster on smaller inputs) + - convert to grayscale for analysis (kept as 3-channel BGR for paddle) + - denoise (Non-Local Means, gentle) + - deskew via Hough line angle estimate + - optional adaptive threshold for low-quality phone photos + +Phase 2 will add document-corner detection + perspective transform + dewarping +for tilted phone shots; those live in `document_detect.py` (added later). +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import cv2 +import numpy as np + +from ocr_sprint.pipeline.ingest import NDArrayU8 + + +@dataclass(frozen=True) +class PreprocessConfig: + """Tunable knobs for the preprocessing pipeline.""" + + max_side: int = 2200 + denoise: bool = True + deskew: bool = True + adaptive_threshold: bool = False + + +def _resize_max_side(img: NDArrayU8, max_side: int) -> NDArrayU8: + h, w = img.shape[:2] + longest = max(h, w) + if longest <= max_side: + return img + scale = max_side / longest + new_w, new_h = round(w * scale), round(h * scale) + return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA) + + +def _estimate_skew_angle(gray: NDArrayU8) -> float: + """Estimate skew using Canny + Hough; returns angle in degrees within [-15, 15].""" + edges = cv2.Canny(gray, 50, 150, apertureSize=3) + lines = cv2.HoughLines(edges, 1, np.pi / 360, threshold=200) + if lines is None or len(lines) == 0: + return 0.0 + angles: list[float] = [] + for line in lines[:200]: + rho, theta = line[0] + del rho + # convert to 
angle relative to horizontal (degrees) + angle = (theta * 180.0 / np.pi) - 90.0 + # only keep nearly-horizontal lines (within ±15°) + if -15.0 < angle < 15.0: + angles.append(angle) + if not angles: + return 0.0 + return float(np.median(angles)) + + +def _rotate(img: NDArrayU8, angle_deg: float) -> NDArrayU8: + if abs(angle_deg) < 0.1: + return img + h, w = img.shape[:2] + center = (w / 2, h / 2) + matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0) + return cv2.warpAffine( + img, + matrix, + (w, h), + flags=cv2.INTER_CUBIC, + borderMode=cv2.BORDER_REPLICATE, + ) + + +def preprocess(img: NDArrayU8, cfg: PreprocessConfig | None = None) -> NDArrayU8: + """Run preprocessing and return a clean BGR uint8 image suitable for OCR.""" + if cfg is None: + cfg = PreprocessConfig() + + out = _resize_max_side(img, cfg.max_side) + + if cfg.deskew: + gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY) + angle = _estimate_skew_angle(gray) + out = _rotate(out, -angle) + + if cfg.denoise: + out = cv2.fastNlMeansDenoisingColored(out, None, 5, 5, 7, 21) + + if cfg.adaptive_threshold: + gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY) + binarized = cv2.adaptiveThreshold( + gray, + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + blockSize=31, + C=15, + ) + out = cv2.cvtColor(binarized, cv2.COLOR_GRAY2BGR) + + return out diff --git a/src/ocr_sprint/py.typed b/src/ocr_sprint/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/ocr_sprint/schemas/__init__.py b/src/ocr_sprint/schemas/__init__.py new file mode 100644 index 0000000..c54dbfa --- /dev/null +++ b/src/ocr_sprint/schemas/__init__.py @@ -0,0 +1,27 @@ +"""Pydantic schemas for input/output of the OCR Sprint service.""" + +from ocr_sprint.schemas.document import ( + DocumentJob, + DocumentResponse, + DocumentStatus, + SourceKind, +) +from ocr_sprint.schemas.extraction import ( + ExtractionResult, + HeaderFields, + ReviewFlag, + Signatory, +) +from ocr_sprint.schemas.personnel import PersonnelEntry + +__all__ = 
[ + "DocumentJob", + "DocumentResponse", + "DocumentStatus", + "ExtractionResult", + "HeaderFields", + "PersonnelEntry", + "ReviewFlag", + "Signatory", + "SourceKind", +] diff --git a/src/ocr_sprint/schemas/document.py b/src/ocr_sprint/schemas/document.py new file mode 100644 index 0000000..c59b8b7 --- /dev/null +++ b/src/ocr_sprint/schemas/document.py @@ -0,0 +1,57 @@ +"""Job-level schemas (request, response, status).""" + +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Any +from uuid import UUID, uuid4 + +from pydantic import BaseModel, ConfigDict, Field + +from ocr_sprint.schemas.extraction import ExtractionResult + + +class SourceKind(str, Enum): + """High-level type of the uploaded document.""" + + PDF = "pdf" + IMAGE = "image" + UNKNOWN = "unknown" + + +class DocumentStatus(str, Enum): + """Lifecycle status of an OCR job.""" + + PENDING = "pending" + PROCESSING = "processing" + COMPLETED = "completed" + NEEDS_REVIEW = "needs_review" + FAILED = "failed" + + +class DocumentJob(BaseModel): + """Internal representation of a job (Phase 1 holds it in-memory).""" + + model_config = ConfigDict(use_enum_values=False) + + job_id: UUID = Field(default_factory=uuid4) + source_kind: SourceKind = SourceKind.UNKNOWN + filename: str + status: DocumentStatus = DocumentStatus.PENDING + created_at: datetime = Field(default_factory=lambda: datetime.utcnow()) + updated_at: datetime = Field(default_factory=lambda: datetime.utcnow()) + error: str | None = None + result: ExtractionResult | None = None + debug: dict[str, Any] = Field(default_factory=dict) + + +class DocumentResponse(BaseModel): + """Public response payload returned by the documents API.""" + + job_id: UUID + status: DocumentStatus + confidence: float | None = None + data: ExtractionResult | None = None + review_flags: list[str] = Field(default_factory=list) + error: str | None = None diff --git a/src/ocr_sprint/schemas/extraction.py 
b/src/ocr_sprint/schemas/extraction.py new file mode 100644 index 0000000..1311faa --- /dev/null +++ b/src/ocr_sprint/schemas/extraction.py @@ -0,0 +1,55 @@ +"""Top-level extraction result schemas.""" + +from __future__ import annotations + +from datetime import date +from enum import Enum + +from pydantic import BaseModel, Field + +from ocr_sprint.schemas.personnel import PersonnelEntry + + +class ReviewFlag(str, Enum): + """Reasons a document was routed to human review.""" + + LOW_OCR_CONFIDENCE = "low_ocr_confidence" + MISSING_FIELD = "missing_field" + INVALID_NRP = "invalid_nrp" + UNKNOWN_PANGKAT = "unknown_pangkat" + PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch" + DATE_PARSE_FAILED = "date_parse_failed" + + +class Signatory(BaseModel): + """The official signing the sprint (Penandatangan).""" + + nama: str | None = None + pangkat: str | None = None + nrp: str | None = None + jabatan: str | None = None + + +class HeaderFields(BaseModel): + """Header fields parsed from the top portion of a sprint.""" + + nomor_sprint: str | None = Field(None, description="e.g. Sprin/123/IV/2025/Reskrim.") + tanggal: date | None = Field(None, description="Date the sprint was issued.") + satuan_penerbit: str | None = Field(None, description="Issuing unit, e.g. 
'Polres Bandung'.") + perihal: str | None = None + dasar: list[str] = Field(default_factory=list, description="List of legal/operational basis.") + + +class ExtractionResult(BaseModel): + """Full structured payload extracted from a single sprint document.""" + + header: HeaderFields = Field(default_factory=HeaderFields) + personel: list[PersonnelEntry] = Field(default_factory=list) + untuk: list[str] = Field( + default_factory=list, + description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.", + ) + ttd: Signatory = Field(default_factory=Signatory) + raw_text: str = Field(default="", description="Concatenated OCR text for debugging.") + confidence: float = Field(0.0, ge=0.0, le=1.0) + review_flags: list[ReviewFlag] = Field(default_factory=list) diff --git a/src/ocr_sprint/schemas/personnel.py b/src/ocr_sprint/schemas/personnel.py new file mode 100644 index 0000000..9eee085 --- /dev/null +++ b/src/ocr_sprint/schemas/personnel.py @@ -0,0 +1,18 @@ +"""Schema for a single personnel row in a surat sprint.""" + +from __future__ import annotations + +from pydantic import BaseModel, Field + + +class PersonnelEntry(BaseModel): + """One row from the personnel table.""" + + no: int | None = Field(None, description="Row number as printed on the document.") + pangkat: str | None = Field(None, description="Rank, normalized when possible.") + nrp: str | None = Field(None, description="8-digit Polri NRP, or blank if not detected.") + nama: str | None = Field(None, description="Full name.") + jabatan_dinas: str | None = Field(None, description="Permanent post (jabatan dalam dinas).") + jabatan_sprint: str | None = Field(None, description="Role within this sprint.") + keterangan: str | None = None + confidence: float = Field(0.0, ge=0.0, le=1.0) diff --git a/src/ocr_sprint/utils/__init__.py b/src/ocr_sprint/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/ocr_sprint/utils/logging.py b/src/ocr_sprint/utils/logging.py new file mode 
100644 index 0000000..79bee44 --- /dev/null +++ b/src/ocr_sprint/utils/logging.py @@ -0,0 +1,45 @@ +"""Structured logging setup using structlog.""" + +from __future__ import annotations + +import logging +import sys +from typing import Any + +import structlog + + +def configure_logging(level: str = "INFO") -> None: + """Configure structlog to emit JSON-friendly key=value records to stdout.""" + log_level = getattr(logging, level.upper(), logging.INFO) + logging.basicConfig( + format="%(message)s", + stream=sys.stdout, + level=log_level, + ) + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.dev.ConsoleRenderer(colors=False), + ], + wrapper_class=structlog.make_filtering_bound_logger(log_level), + context_class=dict, + logger_factory=structlog.PrintLoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def get_logger(name: str | None = None, **initial_values: Any) -> Any: + """Return a bound logger with optional initial context. + + The return type is ``Any`` because structlog's BoundLogger generic typing + is too restrictive in practice; callers treat it as a duck-typed logger. 
+ """ + logger = structlog.get_logger(name) + if initial_values: + logger = logger.bind(**initial_values) + return logger diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..75f48d8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,43 @@ +"""Shared pytest fixtures.""" + +from __future__ import annotations + +import numpy as np +import pytest + + +@pytest.fixture +def blank_bgr_image() -> np.ndarray: + """A 600x800 white BGR image (uint8) — useful for preprocessing smoke tests.""" + return np.full((600, 800, 3), 255, dtype=np.uint8) + + +@pytest.fixture +def sample_sprint_text() -> str: + """Realistic-but-synthetic OCR text for regex extractor tests.""" + return ( + "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n" + "DAERAH JAWA BARAT\n" + "RESOR BANDUNG\n" + "\n" + "SURAT PERINTAH\n" + "Nomor : Sprin/123/IV/2025/Reskrim\n" + "\n" + "DASAR :\n" + "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n" + "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n" + "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n" + "\n" + "DIPERINTAHKAN :\n" + "Kepada : 1. Nama anggota tersebut di bawah ini.\n" + "\n" + "Untuk : Melaksanakan penyelidikan tindak pidana.\n" + "\n" + "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n" + "\n" + "Bandung, 21 April 2025\n" + "KEPALA KEPOLISIAN RESOR BANDUNG\n" + "\n" + "Drs. BUDI SANTOSO\n" + "AKBP NRP 12345678\n" + ) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py new file mode 100644 index 0000000..be8addd --- /dev/null +++ b/tests/unit/test_api.py @@ -0,0 +1,87 @@ +"""API tests with the OCR engine mocked. 
+ +These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator +so we can exercise the FastAPI surface without the heavy ML init cost. +""" + +from __future__ import annotations + +from datetime import date + +import pytest +from fastapi.testclient import TestClient + +from ocr_sprint.main import create_app +from ocr_sprint.pipeline import orchestrator as orch_module +from ocr_sprint.pipeline.orchestrator import PipelineOutput +from ocr_sprint.schemas.document import DocumentStatus, SourceKind +from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields + + +@pytest.fixture +def client() -> TestClient: + return TestClient(create_app()) + + +def test_health_endpoint(client: TestClient) -> None: + response = client.get("/api/v1/health") + assert response.status_code == 200 + assert response.json()["status"] == "ok" + + +def test_documents_rejects_empty_upload(client: TestClient) -> None: + response = client.post( + "/api/v1/documents", + files={"file": ("empty.pdf", b"", "application/pdf")}, + ) + assert response.status_code == 400 + + +def test_documents_rejects_unknown_format( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + response = client.post( + "/api/v1/documents", + files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")}, + ) + assert response.status_code == 400 + + +def test_documents_returns_pipeline_output( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + fake_result = ExtractionResult( + header=HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + satuan_penerbit="POLRES TEST", + ), + confidence=0.97, + ) + fake_output = PipelineOutput( + source_kind=SourceKind.PDF, + status=DocumentStatus.COMPLETED, + confidence=0.97, + result=fake_result, + ) + + def _fake_run(_content: bytes) -> PipelineOutput: + return fake_output + + # Patch the symbol *imported into* the routes module. 
+ monkeypatch.setattr(orch_module, "run_pipeline", _fake_run) + from ocr_sprint.api.routes import documents as docs_module + + monkeypatch.setattr(docs_module, "run_pipeline", _fake_run) + + response = client.post( + "/api/v1/documents", + files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}, + ) + assert response.status_code == 200 + body = response.json() + assert body["status"] == "completed" + assert body["confidence"] == 0.97 + assert body["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025" diff --git a/tests/unit/test_confidence.py b/tests/unit/test_confidence.py new file mode 100644 index 0000000..e1cc18b --- /dev/null +++ b/tests/unit/test_confidence.py @@ -0,0 +1,46 @@ +"""Tests for confidence scoring + routing.""" + +from __future__ import annotations + +from ocr_sprint.pipeline.confidence import compute_confidence, route +from ocr_sprint.schemas.document import DocumentStatus +from ocr_sprint.schemas.extraction import ReviewFlag + + +def test_no_flags_returns_blend_of_ocr_only() -> None: + score = compute_confidence(0.9, []) + # OCR weight 0.6 * 0.9 + validation 0.4 * 1.0 = 0.94 + assert abs(score - 0.94) < 1e-6 + + +def test_flags_reduce_score() -> None: + base = compute_confidence(0.9, []) + with_flags = compute_confidence(0.9, [ReviewFlag.MISSING_FIELD]) + assert with_flags < base + + +def test_score_is_clamped() -> None: + catastrophic = compute_confidence( + 0.0, + [ + ReviewFlag.MISSING_FIELD, + ReviewFlag.LOW_OCR_CONFIDENCE, + ReviewFlag.PERSONNEL_COUNT_MISMATCH, + ReviewFlag.INVALID_NRP, + ReviewFlag.UNKNOWN_PANGKAT, + ReviewFlag.DATE_PARSE_FAILED, + ], + ) + assert 0.0 <= catastrophic <= 1.0 + + +def test_route_high_confidence() -> None: + assert route(0.97) == DocumentStatus.COMPLETED + + +def test_route_mid_goes_to_review() -> None: + assert route(0.88) == DocumentStatus.NEEDS_REVIEW + + +def test_route_low_goes_to_review() -> None: + assert route(0.40) == DocumentStatus.NEEDS_REVIEW diff --git a/tests/unit/test_ingest.py 
b/tests/unit/test_ingest.py new file mode 100644 index 0000000..0a7f0c2 --- /dev/null +++ b/tests/unit/test_ingest.py @@ -0,0 +1,50 @@ +"""Tests for source detection + image ingest.""" + +from __future__ import annotations + +import io + +import numpy as np +from PIL import Image + +from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image +from ocr_sprint.schemas.document import SourceKind + + +def _png_bytes() -> bytes: + img = Image.new("RGB", (100, 80), color="white") + buf = io.BytesIO() + img.save(buf, format="PNG") + return buf.getvalue() + + +def _jpeg_bytes() -> bytes: + img = Image.new("RGB", (100, 80), color="white") + buf = io.BytesIO() + img.save(buf, format="JPEG") + return buf.getvalue() + + +def test_detect_pdf() -> None: + assert detect_source_kind(b"%PDF-1.7\n...") == SourceKind.PDF + + +def test_detect_png() -> None: + assert detect_source_kind(_png_bytes()) == SourceKind.IMAGE + + +def test_detect_jpeg() -> None: + assert detect_source_kind(_jpeg_bytes()) == SourceKind.IMAGE + + +def test_detect_unknown() -> None: + assert detect_source_kind(b"garbage") == SourceKind.UNKNOWN + + +def test_ingest_image_returns_one_page() -> None: + pages = ingest_image(_png_bytes()) + assert len(pages) == 1 + assert pages[0].page_index == 0 + assert isinstance(pages[0].image, np.ndarray) + assert pages[0].image.dtype == np.uint8 + assert pages[0].image.shape == (80, 100, 3) diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py new file mode 100644 index 0000000..56d5bf4 --- /dev/null +++ b/tests/unit/test_preprocess.py @@ -0,0 +1,37 @@ +"""Smoke tests for the preprocessing pipeline.""" + +from __future__ import annotations + +import numpy as np + +from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess + + +def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None: + out = preprocess(blank_bgr_image) + assert out.dtype == np.uint8 + assert out.ndim == 3 + assert out.shape[2] == 3 + + +def 
test_preprocess_resizes_to_max_side() -> None: + big = np.full((4000, 3000, 3), 255, dtype=np.uint8) + cfg = PreprocessConfig(max_side=1000, denoise=False, deskew=False) + out = preprocess(big, cfg) + assert max(out.shape[:2]) == 1000 + + +def test_preprocess_does_not_upscale_small_images() -> None: + small = np.full((400, 300, 3), 255, dtype=np.uint8) + cfg = PreprocessConfig(max_side=2200, denoise=False, deskew=False) + out = preprocess(small, cfg) + assert out.shape[:2] == (400, 300) + + +def test_adaptive_threshold_produces_binary_image() -> None: + img = np.random.randint(0, 256, (200, 200, 3), dtype=np.uint8) + cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True) + out = preprocess(img, cfg) + # adaptive threshold should leave only 0s and 255s + unique = np.unique(out) + assert set(unique.tolist()).issubset({0, 255}) diff --git a/tests/unit/test_regex_rules.py b/tests/unit/test_regex_rules.py new file mode 100644 index 0000000..3cd7855 --- /dev/null +++ b/tests/unit/test_regex_rules.py @@ -0,0 +1,112 @@ +"""Tests for regex-based header extraction.""" + +from __future__ import annotations + +from datetime import date + +import pytest + +from ocr_sprint.pipeline.extract.regex_rules import ( + extract_header, + find_dasar_list, + find_nomor_sprint, + find_perihal, + find_satuan, + find_signatory, + find_tanggal, +) + + +class TestNomorSprint: + @pytest.mark.parametrize( + ("text", "needle"), + [ + ("Nomor : Sprin/123/IV/2025/Reskrim", "123"), + ("Nomor: SPRIN / 7 / I / 2024", "7"), + ("...Sprin-345-X-2024-Sat Intelkam...", "345"), + ], + ) + def test_finds_nomor(self, text: str, needle: str) -> None: + result = find_nomor_sprint(text) + assert result is not None + assert needle in result + assert result.upper().startswith("SPRIN") + + def test_returns_none_when_absent(self) -> None: + assert find_nomor_sprint("no nomor here, just some text") is None + + +class TestTanggal: + def test_basic_date(self) -> None: + assert 
find_tanggal("Bandung, 21 April 2025") == date(2025, 4, 21) + + def test_with_dashes(self) -> None: + assert find_tanggal("Tanggal 1 - Desember - 2024") == date(2024, 12, 1) + + def test_invalid_month(self) -> None: + assert find_tanggal("21 Foo 2025") is None + + def test_no_date_present(self) -> None: + assert find_tanggal("nothing here") is None + + +class TestSatuan: + def test_polres(self) -> None: + result = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya") + assert result is not None + assert "RESOR BANDUNG" in result.upper() + + def test_polri_pusat(self) -> None: + result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA") + assert result is not None + + +class TestPerihal: + def test_extracts_perihal_line(self) -> None: + text = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore" + assert find_perihal(text) == "Pelaksanaan penyelidikan kasus." + + def test_returns_none_when_absent(self) -> None: + assert find_perihal("no perihal field") is None + + +class TestDasar: + def test_numbered_list(self) -> None: + text = ( + "DASAR :\n" + "1. UU No 2 Tahun 2002.\n" + "2. Peraturan Kapolri Nomor 6.\n" + "\n" + "DIPERINTAHKAN :\n" + "Kepada : ...\n" + ) + items = find_dasar_list(text) + assert len(items) == 2 + assert items[0].startswith("UU No 2") + assert items[1].startswith("Peraturan Kapolri") + + def test_empty_when_section_missing(self) -> None: + assert find_dasar_list("no dasar section") == [] + + +class TestSignatory: + def test_extracts_last_nrp(self) -> None: + text = "Some 12345678 NRP earlier 87654321\nNRP. 
11223344" + sig = find_signatory(text) + assert sig.nrp == "11223344" + + def test_no_nrp(self) -> None: + assert find_signatory("no NRP here").nrp is None + + +class TestExtractHeader: + def test_full_synthetic_doc(self, sample_sprint_text: str) -> None: + header = extract_header(sample_sprint_text) + assert header.nomor_sprint is not None + assert "Sprin" in header.nomor_sprint + assert header.tanggal == date(2025, 4, 21) + assert header.satuan_penerbit is not None + assert "KEPOLISIAN" in header.satuan_penerbit.upper() + assert header.perihal is not None + assert "penyelidikan" in header.perihal.lower() + assert len(header.dasar) == 3 diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py new file mode 100644 index 0000000..6ff0dcd --- /dev/null +++ b/tests/unit/test_validators.py @@ -0,0 +1,108 @@ +"""Tests for the validation layer.""" + +from __future__ import annotations + +from datetime import date + +import pytest + +from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat +from ocr_sprint.pipeline.extract.validators import ( + validate_extraction, + validate_header, + validate_nrp, + validate_personnel_entry, +) +from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag +from ocr_sprint.schemas.personnel import PersonnelEntry + + +class TestNRP: + @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"]) + def test_valid_8_digits(self, nrp: str) -> None: + assert validate_nrp(nrp) is True + + @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None]) + def test_invalid(self, nrp: str | None) -> None: + assert validate_nrp(nrp) is False + + +class TestPangkat: + @pytest.mark.parametrize( + ("input_str", "expected"), + [ + ("AKP", "AKP"), + ("akp", "AKP"), + ("AKP.", "AKP"), + ("AKBP", "AKBP"), + ("Brigjen Pol", "BRIGJEN POL"), + ("BRIGJEN", "BRIGJEN POL"), + ("Kombespol", "KOMBES POL"), + ("BRIPDA", "BRIPDA"), + ], + ) + def 
test_normalizes_known_ranks(self, input_str: str, expected: str) -> None: + assert normalize_pangkat(input_str) == expected + + def test_unknown_returns_none(self) -> None: + assert normalize_pangkat("Sersan Mayor") is None + assert is_valid_pangkat("Sersan Mayor") is False + + +class TestPersonnelValidator: + def test_clean_entry_no_flags(self) -> None: + entry = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test") + assert validate_personnel_entry(entry) == [] + + def test_invalid_nrp_flagged(self) -> None: + entry = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test") + assert ReviewFlag.INVALID_NRP in validate_personnel_entry(entry) + + def test_unknown_pangkat_flagged(self) -> None: + entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test") + assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry) + + +class TestHeaderValidator: + def test_complete_header_no_flags(self) -> None: + header = HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + satuan_penerbit="POLRES BANDUNG", + ) + assert validate_header(header) == [] + + def test_missing_nomor_flagged(self) -> None: + header = HeaderFields(tanggal=date(2025, 1, 1)) + assert ReviewFlag.MISSING_FIELD in validate_header(header) + + def test_missing_date_flagged(self) -> None: + header = HeaderFields(nomor_sprint="Sprin/1/I/2025") + assert ReviewFlag.DATE_PARSE_FAILED in validate_header(header) + + +class TestFullValidation: + def test_personnel_count_mismatch(self) -> None: + result = ExtractionResult( + header=HeaderFields( + nomor_sprint="Sprin/1/I/2025", + tanggal=date(2025, 1, 1), + ), + personel=[ + PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A"), + ], + ) + flags = validate_extraction(result, expected_personnel_count=2) + assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in flags + + def test_flags_are_deduped(self) -> None: + result = ExtractionResult( + header=HeaderFields(), # missing both nomor and tanggal + personel=[ + 
PersonnelEntry(nrp="123", pangkat="X"), + PersonnelEntry(nrp="456", pangkat="Y"), + ], + ) + flags = validate_extraction(result) + # each flag type should appear at most once + assert len(flags) == len(set(flags))