Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,43 @@
 # ==== App ====
 APP_ENV=local                 # local | dev | staging | prod
 APP_HOST=0.0.0.0
 APP_PORT=8000
 APP_LOG_LEVEL=INFO
 # ==== Storage (Phase 1: local filesystem) ====
 STORAGE_LOCAL_DIR=./storage
 # ==== OCR ====
 # Comments in this section sit on their own lines on purpose: some dotenv
 # parsers (python-dotenv included) treat "KEY=   # note" as KEY="# note"
 # when the value itself is empty.
 # PaddleOCR lang code; "latin" works well for Bahasa Indonesia.
 OCR_LANG=latin
 # Set to true if running on a GPU host.
 OCR_USE_GPU=false
 # Leave the *_MODEL_DIR values empty to use PaddleOCR defaults.
 OCR_DET_MODEL_DIR=
 OCR_REC_MODEL_DIR=
 OCR_CLS_MODEL_DIR=
 # Downscale longest side before OCR.
 OCR_MAX_IMAGE_SIDE=2200
 # ==== Preprocessing ====
 PREPROCESS_TARGET_DPI=300
 PREPROCESS_DENOISE=true
 PREPROCESS_DESKEW=true
 PREPROCESS_ADAPTIVE_THRESHOLD=false  # turn on for low-quality phone photos
 # ==== Confidence / routing (Phase 5) ====
 CONFIDENCE_AUTO_APPROVE=0.95
 CONFIDENCE_NEEDS_REVIEW=0.85
 # ==== LLM (Phase 5, optional) ====
 LLM_ENABLED=false
 LLM_PROVIDER=ollama
 LLM_MODEL=qwen2.5:1.5b        # CPU-friendly default
 LLM_BASE_URL=http://localhost:11434
 LLM_TIMEOUT_S=60
 # ==== Async pipeline (Phase 4, optional) ====
 QUEUE_ENABLED=false
 REDIS_URL=redis://localhost:6379/0
 DATABASE_URL=postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint
 MINIO_ENDPOINT=localhost:9000
 MINIO_ACCESS_KEY=minioadmin
 MINIO_SECRET_KEY=minioadmin
 MINIO_BUCKET=ocr-sprint
 MINIO_SECURE=false
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,70 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 build/
 dist/
 *.egg-info/
 *.egg
 .pytest_cache/
 .mypy_cache/
 .ruff_cache/
 .coverage
 .coverage.*
 htmlcov/
 coverage.xml
 .tox/
 .nox/
 # Virtual environments
 .venv/
 venv/
 env/
 ENV/
 # IDE
 .idea/
 .vscode/
 *.swp
 *.swo
 .DS_Store
 # Environment / secrets
 .env
 .env.*
 !.env.example
 # Local data & artifacts
 # Real sample documents must never be committed. "**" makes each pattern apply
 # at any depth below samples/, so files in samples/pdf/ and samples/photo/ are
 # covered as well as files placed directly in samples/.
 samples/**/*.pdf
 samples/**/*.PDF
 samples/**/*.jpg
 samples/**/*.JPG
 samples/**/*.jpeg
 samples/**/*.JPEG
 samples/**/*.png
 samples/**/*.PNG
 samples/**/*.tif
 samples/**/*.TIF
 samples/**/*.tiff
 samples/**/*.TIFF
 !samples/README.md
 data/local/
 storage/
 *.db
 *.sqlite
 *.sqlite3
 # OCR / model caches
 # gitignore does not expand "~", and a cache in the user's home directory is
 # outside the repository anyway, so only in-repo locations are listed.
 .paddleocr/
 models/downloaded/
 # Logs
 logs/
 *.log
 # Docker
 .docker/
 # Misc
 *.bak
 *.tmp
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,19 @@
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-toml
      - id: check-added-large-files
        args: ["--maxkb=1024"]
      - id: check-merge-conflict
      - id: detect-private-key
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.9
    hooks:
      - id: ruff
        args: ["--fix"]
      - id: ruff-format
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,51 @@
 # syntax=docker/dockerfile:1.6
 # CPU-only image for the OCR Sprint API.
 # PaddleOCR + PyMuPDF + OpenCV-headless work on plain Debian without poppler.
 FROM python:3.11-slim AS base
 ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1
 # Build-time only: keeps apt non-interactive during the build without leaving
 # DEBIAN_FRONTEND set in the environment of the running container.
 ARG DEBIAN_FRONTEND=noninteractive
 # System deps for OpenCV, libmagic, PaddlePaddle, and image format support.
 # curl is kept in the image because the HEALTHCHECK below relies on it.
 RUN apt-get update && apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender1 \
        libgomp1 \
        libmagic1 \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 # ----- builder layer (install the package and its runtime deps) -----
 FROM base AS builder
 COPY pyproject.toml README.md ./
 COPY src/ ./src/
 # Install runtime dependencies only; the "dev" extra (pytest, mypy, ruff,
 # pre-commit) belongs in a development environment, not the served image.
 RUN pip install --upgrade pip && pip install .
 # ----- runtime layer -----
 FROM base AS runtime
 # Bring over the installed packages and their console scripts (e.g. uvicorn).
 COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
 COPY --from=builder /usr/local/bin /usr/local/bin
 COPY pyproject.toml README.md ./
 COPY src/ ./src/
 # Pre-create cache dirs so PaddleOCR can write models on first run.
 RUN mkdir -p /home/app/.paddleocr /app/storage \
    && useradd --create-home --uid 1000 app \
    && chown -R app:app /home/app /app
 # Drop root before the service starts.
 USER app
 EXPOSE 8000
 HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
    CMD curl -fsS http://localhost:8000/api/v1/health || exit 1
 CMD ["uvicorn", "ocr_sprint.main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,52 @@
 # Every target below is a command, not a file, so all of them are phony.
 .PHONY: help install dev fmt lint typecheck test test-cov docker-build docker-up docker-down clean
 # Print a short description of each available target.
 help:
 	@echo "Targets:"
 	@echo "  install       - install runtime + dev deps in current env"
 	@echo "  dev           - run FastAPI app with autoreload"
 	@echo "  fmt           - format code with ruff"
 	@echo "  lint          - lint with ruff"
 	@echo "  typecheck     - run mypy"
 	@echo "  test          - run pytest"
 	@echo "  test-cov      - run pytest with coverage"
 	@echo "  docker-build  - build api image"
 	@echo "  docker-up     - start docker-compose stack"
 	@echo "  docker-down   - stop docker-compose stack"
 	@echo "  clean         - remove caches and build artifacts"
 install:
 	python -m pip install --upgrade pip
 	pip install -e ".[dev]"
 	pre-commit install || true
 dev:
 	uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
 fmt:
 	ruff format src tests
 	ruff check --fix src tests
 lint:
 	ruff check src tests
 	ruff format --check src tests
 typecheck:
 	mypy src
 test:
 	pytest
 test-cov:
 	pytest --cov --cov-report=term-missing
 docker-build:
 	docker compose build
 docker-up:
 	docker compose up -d
 docker-down:
 	docker compose down
 clean:
 	rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov build dist *.egg-info
 	find . -type d -name __pycache__ -exec rm -rf {} +
--- a/README.md
+++ b/README.md
@@ -0,0 +1,123 @@
 # OCR Sprint Service
 OCR + structured extraction service for Indonesian police "surat sprint" (surat perintah) documents. Built around **FastAPI + PaddleOCR + hybrid extraction (regex → LLM lokal → validation)** with **on-premise** deployment as a hard requirement.
 > **Status:** Phase 1 MVP — synchronous PDF/image OCR with regex header extraction, validation, and confidence scoring. Phases 2–6 (document detection, table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md).
 ## Why this stack
 - **PaddleOCR** is the strongest open-source OCR for mixed-language documents and runs fully on-prem (essential for police data).
 - **PP-Structure** (Phase 3) handles personnel tables natively.
 - **Regex-first, LLM-fallback extraction** keeps deterministic fields fast and predictable while letting an LLM handle format drift across Polri units.
 - **CPU-friendly defaults**: a small (1.5B–4B) local LLM via Ollama is the recommended default; the architecture is also GPU-ready.
 See [`docs/architecture.md`](docs/architecture.md) for the full architecture, accuracy expectations, and roadmap.
 ## Quickstart
 ### Prerequisites
 - Python **3.10–3.12**
 - ~3 GB free disk for PaddleOCR model downloads on first run
 - Linux/macOS recommended (Windows works but PaddleOCR install can be finicky)
 ### Install (local dev)
 ```bash
 git clone https://github.com/Adriankf59/ocr-sprint-service.git
 cd ocr-sprint-service
 python -m venv .venv && source .venv/bin/activate
 make install         # installs runtime + dev deps + pre-commit
 cp .env.example .env # edit if you need GPU / different storage path
 ```
 ### Run the API
 ```bash
 make dev
 # → http://localhost:8000/docs
 ```
 ### Try it out
 ```bash
 curl -F "file=@samples/pdf/example.pdf" http://localhost:8000/api/v1/documents | jq
 ```
 Expected response (truncated):
 ```json
 {
  "job_id": "8f2a...",
  "status": "completed",
  "confidence": 0.93,
  "data": {
    "header": {
      "nomor_sprint": "Sprin/123/IV/2025/Reskrim",
      "tanggal": "2025-04-21",
      "satuan_penerbit": "KEPOLISIAN RESOR BANDUNG",
      "perihal": "Pelaksanaan penyelidikan kasus pencurian",
      "dasar": ["Undang-Undang Nomor 2 Tahun 2002 ...", "..."]
    },
    "personel": [],
    "ttd": { "nrp": "12345678" }
  },
  "review_flags": []
 }
 ```
 > **Note:** Phase 1 does not yet populate the `personel[]` table — that requires PP-Structure (Phase 3). Header fields, signatory NRP, confidence, and HITL routing are fully wired.
 ### Docker
 ```bash
 docker compose build
 docker compose up -d
 docker compose logs -f api
 ```
 The first request will trigger PaddleOCR to download its detection/recognition/cls models (~200 MB) into the `paddle-models` volume.
 ## Development
 ```bash
 make fmt        # format with ruff
 make lint       # lint
 make typecheck  # mypy strict mode
 make test       # pytest
 make test-cov   # pytest + coverage
 ```
 Pre-commit hooks run ruff on every commit. Install once with `pre-commit install` (already done by `make install`).
 ## Project layout
 ```
 src/ocr_sprint/
  api/          # FastAPI routes + error handlers
  schemas/      # Pydantic v2 models (request/response, extraction, personnel)
  pipeline/     # ingest → preprocess → ocr → extract → validate → score
    extract/    # regex_rules.py (Phase 1) → llm.py (Phase 5)
  data/         # master data (Polri ranks, etc.)
  utils/        # logging, helpers
  config.py     # pydantic-settings
  main.py       # app factory
 tests/unit/     # ~60 unit tests, no PaddleOCR dependency
 docs/           # architecture & decision records
 ```
 ## Roadmap
 | Phase | Scope | Status |
 |---|---|---|
 | 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **In progress** |
 | 2 | DocTR document detection + dewarping for phone photos | Planned |
 | 3 | PP-Structure table extraction for personnel rows | Planned |
 | 4 | Async pipeline (Celery + Redis), Postgres + MinIO, auth, observability | Planned |
 | 5 | LLM hybrid extraction (Ollama + structured output) | Planned |
 | 6 | HITL review endpoints + audit trail | Planned |
 ## License
 Proprietary — internal use only.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,23 @@
 # Phase 1 MVP compose: API only.
 # Phase 4 will add redis, postgres, minio, and worker services.
 services:
  api:
    build:
      context: .
      dockerfile: Dockerfile
    image: ocr-sprint-service:dev
    container_name: ocr-sprint-api
    ports:
      - "8000:8000"
    environment:
      APP_ENV: local
      APP_LOG_LEVEL: INFO
      OCR_USE_GPU: "false"
      STORAGE_LOCAL_DIR: /app/storage
    volumes:
      - ./storage:/app/storage
      - paddle-models:/home/app/.paddleocr
    restart: unless-stopped
 volumes:
  paddle-models:
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -0,0 +1,259 @@
 # Plan & Arsitektur — OCR Service Surat Sprint Kepolisian
 ## 1. Penilaian Jujur Tech Stack yang Diusulkan
 Tech stack Anda (FastAPI + PaddleOCR + OpenCV/Pillow + Regex) **sudah bagus dan layak produksi**, tapi **belum tentu paling optimal akurasinya** untuk kasus surat sprint. Ada beberapa gap yang perlu diisi sebelum bisa disebut "terbaik".
 ### Yang sudah tepat
 | Komponen | Alasan |
 |---|---|
 | **FastAPI** | Async native, Pydantic validation, OpenAPI docs otomatis, ideal untuk ML serving. |
 | **PaddleOCR (PP-OCRv4/v5)** | Salah satu OCR open-source terbaik untuk dokumen campuran teks + tabel, mendukung Latin (cocok untuk Bahasa Indonesia), bisa jalan on-premise (penting untuk dokumen kepolisian yang sensitif — **cloud OCR seperti Google Vision/AWS Textract sebaiknya dihindari** karena masalah kerahasiaan). |
 | **OpenCV + Pillow** | Standar industri untuk preprocessing. |
 | **Regex/rule-based** | Cocok untuk dokumen terstruktur seperti sprint yang format-nya relatif baku. |
 ### Yang masih kurang / perlu ditambah
 1. **Table extraction belum tertangani**
   Daftar personel di surat sprint hampir selalu berbentuk **tabel** (No, Pangkat, NRP, Nama, Jabatan, Keterangan). Regex pada teks linear dari OCR biasa **akan kacau** ketika baris tabel pecah atau kolom bergeser. Solusi: gunakan **PaddleOCR PP-Structure** (modul table recognition bawaan Paddle) atau model khusus seperti **TableTransformer (Microsoft)**.
 2. **Document detection & dewarping untuk foto HP belum eksplisit**
   Foto HP bermasalah karena: perspektif miring, lipatan, bayangan, lighting tidak rata, fokus tidak merata. OpenCV crop + perspective transform manual saja sering gagal. Tambahkan:
   - **Document corner detection**: `DocTR` / `MobileSAM` / model edge-based, atau heuristik kontur OpenCV sebagai fallback.
   - **Dewarping**: `DocTr` / `DewarpNet` untuk halaman yang melengkung (lipatan).
   - **Shadow removal**: algoritma background division atau model spesialis.
 3. **Strategi ekstraksi 100% regex itu rapuh**
   Surat sprint dari satuan berbeda (Polda, Polres, Polsek, Mabes) punya **variasi format**: header berbeda, urutan field berbeda, kadang pangkat disingkat (`AKP`, `IPDA`) kadang ditulis penuh. Regex murni akan butuh ratusan rule dan tetap miss kasus baru.
   **Rekomendasi pendekatan hybrid**:
   - **Layer 1 — Regex/rule** untuk field deterministik (Nomor sprint, tanggal, dasar hukum) yang format-nya baku.
   - **Layer 2 — Schema-aware extraction** menggunakan **LLM lokal** (Llama 3.1 8B / Qwen2.5 7B via Ollama atau vLLM) dengan structured output (JSON schema / Pydantic) untuk field yang variatif (jabatan, keterangan tugas).
   - **Layer 3 — Validation** terhadap master data (daftar pangkat valid, format NRP 8 digit, dll).
 4. **Tidak ada confidence scoring & human-in-the-loop**
   Untuk dokumen kepolisian, **akurasi 100% otomatis itu mitos**. Sistem harus:
   - Mengeluarkan confidence score per field.
   - Otomatis flag dokumen low-confidence untuk review manusia.
   - Sediakan UI/endpoint koreksi yang feedback-nya bisa dipakai retraining.
 5. **Alternatif end-to-end yang patut dipertimbangkan**
   Jika nanti volume dokumen besar dan format relatif stabil, fine-tuning model **Document Understanding** end-to-end bisa lebih akurat:
   - **Donut** (OCR-free, langsung image → JSON).
   - **LayoutLMv3** (kombinasi teks + layout + visual).
   - **Surya OCR** (newer, sangat bagus untuk dokumen).
   Untuk MVP, tetap pakai PaddleOCR. Donut/LayoutLM adalah opsi V2 setelah ada labeled dataset cukup (~500–1000 dokumen).
 ### Verdict
 Stack Anda **bisa mencapai ~85–92% akurasi field-level** untuk surat sprint dengan kualitas scan baik, dan **~70–80%** untuk foto HP, **kalau ditambah** komponen di atas. Tanpa table extraction + dewarping + hybrid extraction, akurasinya akan jatuh di kondisi nyata.
 ---
 ## 2. Arsitektur yang Direkomendasikan
 ### 2.1 Diagram Logis
 ```
 ┌────────────────────────────────────────────────────────────────────┐
 │                         Client (Web/Mobile)                        │
 └──────────────────────────────┬─────────────────────────────────────┘
                               │ HTTPS (multipart upload)
                               ▼
 ┌────────────────────────────────────────────────────────────────────┐
 │                    FastAPI Gateway (stateless)                     │
 │   - Auth (JWT/API key)   - Rate limit   - Request validation       │
 └──────────────────────────────┬─────────────────────────────────────┘
                               │ enqueue job
                               ▼
 ┌────────────────────────────────────────────────────────────────────┐
 │              Job Queue (Redis + Celery / RQ / Dramatiq)            │
 └──────────────────────────────┬─────────────────────────────────────┘
                               ▼
 ┌────────────────────────────────────────────────────────────────────┐
 │                    OCR Worker Pipeline (GPU/CPU)                   │
 │  ┌────────────┐  ┌──────────────┐  ┌───────────┐  ┌────────────┐   │
 │  │ 1. Ingest  │→ │ 2. Preproc   │→ │ 3. OCR +  │→ │ 4. Extract │   │
 │  │  & detect  │  │ (deskew,     │  │  Layout   │  │ (regex +   │   │
 │  │  PDF/IMG   │  │  dewarp,     │  │  PP-Struct│  │  LLM +     │   │
 │  │            │  │  denoise)    │  │  + Table) │  │  validate) │   │
 │  └────────────┘  └──────────────┘  └───────────┘  └─────┬──────┘   │
 │                                                         │          │
 │                          ┌──────────────────────────────┘          │
 │                          ▼                                         │
 │                   ┌──────────────┐                                 │
 │                   │ 5. Confidence│ → low conf? flag for review     │
 │                   │   scoring    │                                 │
 │                   └──────┬───────┘                                 │
 └──────────────────────────┼─────────────────────────────────────────┘
                           ▼
 ┌────────────────────────────────────────────────────────────────────┐
 │           Storage: PostgreSQL (metadata) + MinIO/S3 (file)         │
 │           + Vector store opsional (untuk dedup / search)           │
 └────────────────────────────────────────────────────────────────────┘
                           │
                           ▼
 ┌────────────────────────────────────────────────────────────────────┐
 │           Review UI (optional) — koreksi manual + audit trail      │
 └────────────────────────────────────────────────────────────────────┘
 ```
 ### 2.2 Pipeline Detail per Tahap
 **Tahap 1 — Ingest & Document Detection**
 - PDF: render setiap halaman jadi image @ 300 DPI (`pdf2image` / `PyMuPDF`).
 - Image (foto HP): deteksi sudut dokumen → crop → perspective transform.
  - Library: OpenCV `findContours` (cepat) sebagai fallback, **DocTR document detector** (lebih akurat) sebagai utama.
 **Tahap 2 — Preprocessing**
 - Deskew (rotation correction) — Hough transform atau model.
 - Dewarp (untuk foto buku/lipatan) — `DewarpNet` atau model RNN.
 - Adaptive thresholding (untuk foto dengan lighting tidak rata).
 - Shadow removal (background division).
 - Denoise (Non-Local Means).
 - Resize ke ukuran optimal OCR (~1500–2500 px sisi panjang).
 **Tahap 3 — OCR + Layout Analysis**
 - **PaddleOCR PP-Structure** dijalankan sekali → menghasilkan:
  - Bounding boxes + teks + confidence per word/line.
  - Table region detection + table-to-HTML/JSON.
  - Layout type per region (title, paragraph, table, figure).
 - Output ditampung sebagai struktur intermediate (mirip hOCR / ALTO XML).
 **Tahap 4 — Information Extraction**
 - **4a. Header parsing (regex)**: Nomor sprint, tanggal, satuan penerbit, dasar hukum, perihal. Format relatif baku → regex sangat cocok.
 - **4b. Personnel table extraction**: ambil dari hasil PP-Structure table → mapping kolom (Pangkat, NRP, Nama, Jabatan, Keterangan).
 - **4c. LLM fallback**: untuk field yang regex/table miss, kirim chunk teks + JSON schema ke LLM lokal (Ollama / vLLM) dengan **structured output** (Pydantic via `outlines` / `instructor`).
 - **4d. Validation layer**:
  - NRP: 8 digit numerik.
  - Pangkat: harus ada di daftar master pangkat Polri.
  - Tanggal: parse + sanity check.
  - Cross-check: jumlah personel di body = jumlah baris tabel.
 **Tahap 5 — Confidence Scoring & Routing**
 - Aggregate confidence: weighted average dari OCR confidence + validation pass/fail + LLM logprob (kalau pakai).
 - Threshold (mis. < 0.85) → status `NEEDS_REVIEW`.
 - Threshold tinggi (≥ 0.95) + semua validasi pass → status `AUTO_APPROVED`.
 ### 2.3 API Endpoint (FastAPI)
 ```
 POST   /api/v1/documents              # upload, kembalikan job_id
 GET    /api/v1/documents/{job_id}     # poll status + hasil
 GET    /api/v1/documents/{job_id}/raw # raw OCR output (debug)
 PATCH  /api/v1/documents/{job_id}     # koreksi manual (HITL)
 GET    /api/v1/health                 # liveness
 GET    /api/v1/metrics                # Prometheus
 ```
 Response shape (contoh):
 ```json
 {
  "job_id": "uuid",
  "status": "completed | processing | needs_review | failed",
  "confidence": 0.92,
  "data": {
    "nomor_sprint": "Sprin/123/IV/2025",
    "tanggal": "2025-04-21",
    "satuan_penerbit": "Polres Bandung",
    "dasar": ["...", "..."],
    "perihal": "...",
    "personel": [
      {"no": 1, "pangkat": "AKP", "nrp": "12345678", "nama": "...", "jabatan": "Kasat Reskrim", "confidence": 0.97},
      ...
    ],
    "ttd": {"pejabat": "...", "pangkat": "...", "nrp": "..."}
  },
  "review_flags": []
 }
 ```
 ### 2.4 Tech Stack Final yang Direkomendasikan
 | Layer | Pilihan | Catatan |
 |---|---|---|
 | API | **FastAPI** + Uvicorn/Gunicorn | sesuai usulan |
 | Validation | **Pydantic v2** | wajib |
 | Queue | **Redis + Celery** atau **Dramatiq** | OCR berat, jangan blocking request |
 | OCR | **PaddleOCR PP-OCRv4 + PP-Structure** | tambah PP-Structure untuk tabel |
 | Preprocessing | **OpenCV + Pillow** + **DocTR** (detection) | DocTR untuk foto HP |
 | Extraction | **Regex + Ollama (Llama 3.1 8B / Qwen2.5 7B)** + **instructor/outlines** | hybrid |
 | Storage | **PostgreSQL** (metadata) + **MinIO** (file blob) | self-hosted, sesuai compliance |
 | Observability | **Prometheus + Grafana + Loki** | wajib produksi |
 | Container | **Docker + docker-compose** (dev) → **Kubernetes** (prod) | |
 | GPU | NVIDIA T4/A10 (1× cukup untuk MVP) | PaddleOCR jauh lebih cepat di GPU |
 ---
 ## 3. Roadmap Pengembangan (Bertahap)
 ### Fase 0 — Persiapan (1 minggu)
 - Kumpulkan **dataset sampel**: minimal 50 surat sprint (campur PDF scan + foto HP) dari beragam satuan.
 - Buat **ground truth labelling** untuk 20 dokumen (untuk evaluasi).
 - Definisikan **schema output final** (JSON) bersama stakeholder.
 ### Fase 1 — MVP Pipeline Sinkron (2 minggu)
 - Setup FastAPI skeleton + Pydantic schemas.
 - Integrasi PaddleOCR PP-OCRv4 (CPU dulu, GPU menyusul).
 - Preprocessing dasar: deskew + denoise + resize.
 - Regex extraction untuk field header.
 - Endpoint sinkron `POST /documents` (untuk dev/testing saja).
 - **Evaluasi akurasi** terhadap 20 ground truth.
 ### Fase 2 — Robustness untuk Foto HP (2 minggu)
 - Integrasi document detection (DocTR atau OpenCV contour).
 - Perspective transform + dewarping.
 - Shadow removal.
 - Re-evaluasi akurasi pada subset foto HP.
 ### Fase 3 — Table Extraction (1.5 minggu)
 - Integrasi PP-Structure untuk personnel table.
 - Mapping kolom + validation (NRP, pangkat).
 - Master data tabel pangkat Polri.
 ### Fase 4 — Async + Production Ready (1.5 minggu)
 - Pindahkan ke arsitektur async dengan Celery + Redis.
 - Storage MinIO + PostgreSQL.
 - Auth, rate limit, logging, metrics.
 - Docker compose untuk deployment.
 ### Fase 5 — LLM Hybrid Extraction (2 minggu)
 - Setup Ollama / vLLM dengan model lokal.
 - Structured output via `instructor`.
 - Confidence scoring + routing ke review.
 ### Fase 6 — HITL Review UI (opsional, 2 minggu)
 - Endpoint koreksi.
 - Simple web UI (Next.js) untuk reviewer.
 - Audit trail & feedback loop.
 ### Fase 7 — Optimasi Lanjutan (ongoing)
 - Fine-tune PaddleOCR detection/recognition pada dataset internal.
 - Eksplorasi Donut/LayoutLMv3 jika dataset sudah cukup.
 - Batch processing & GPU optimization.
 **Total estimasi MVP fungsional (Fase 1–4): ~7 minggu** dengan 1 backend engineer + 1 ML engineer.
 ---
 ## 4. Risiko & Mitigasi
 | Risiko | Mitigasi |
 |---|---|
 | Data sensitif (kepolisian) bocor | Wajib on-prem; tidak ada cloud OCR; enkripsi at-rest (LUKS/pgcrypto) + in-transit (mTLS); audit log lengkap. |
 | Variasi format antar satuan | Hybrid extraction (regex + LLM); kumpulkan sample dari banyak satuan sejak awal. |
 | Foto HP kualitas buruk | Validasi kualitas image di client (resolusi minimal, blur detection) sebelum upload. |
 | Akurasi tidak sampai target | HITL review wajib untuk dokumen low-confidence; jangan deploy fully-automatic. |
 | Tanggung jawab hukum atas hasil OCR | Selalu simpan original document + flag bahwa hasil ekstraksi adalah "draft, perlu verifikasi manusia". |
 ---
 ## 5. Pertanyaan Sebelum Implementasi
 Sebelum saya lanjut ke implementasi, mohon konfirmasi:
 1. **Volume**: berapa dokumen/hari yang ditargetkan? (mempengaruhi pilihan async vs sync, GPU vs CPU)
 2. **Deployment target**: on-prem mutlak, atau private cloud (GovCloud) boleh?
 3. **Source dokumen**: apakah ada akses ke 20–50 sample surat sprint untuk dijadikan dataset awal?
 4. **Integrasi**: service ini akan dipanggil sistem apa? (mempengaruhi auth & API contract)
 5. **HITL**: apakah ada SDM untuk review manual dokumen low-confidence?
 6. **Hardware**: sudah ada server GPU, atau perlu sizing rekomendasi?
 7. **Format output final**: ada schema yang sudah dipakai sistem downstream?
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,136 @@
 [build-system]
 requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "ocr-sprint-service"
 version = "0.1.0"
 description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
 readme = "README.md"
 requires-python = ">=3.10,<3.13"
 license = { text = "Proprietary" }
 authors = [{ name = "Adrian Kuman Firmansah" }]
 dependencies = [
    # Web framework
    "fastapi>=0.115,<0.116",
    "uvicorn[standard]>=0.30,<0.34",
    "python-multipart>=0.0.9",
    "pydantic>=2.7,<3",
    "pydantic-settings>=2.4,<3",
    # Image / PDF
    "pillow>=10.4,<12",
    "opencv-python-headless>=4.10,<5",
    "numpy>=1.26,<2.2",
    "PyMuPDF>=1.24,<2",
    "python-magic>=0.4.27",
    # OCR (CPU build of paddle; GPU users override via extra index)
    "paddlepaddle==2.6.1",
    "paddleocr>=2.7.5,<3",
    # Logging / observability
    "structlog>=24.1",
    "prometheus-client>=0.20",
    # Misc
    "httpx>=0.27",
    "tenacity>=8.5",
 ]
 [project.optional-dependencies]
 dev = [
    "pytest>=8.2",
    "pytest-asyncio>=0.23",
    "pytest-cov>=5.0",
    "ruff>=0.6.9",
    "mypy>=1.11",
    "types-Pillow",
    "pre-commit>=3.7",
 ]
 # Extraction layer (Phase 5) — kept optional so MVP install stays light
 llm = [
    "ollama>=0.3",
    "instructor>=1.4",
 ]
 # Async pipeline (Phase 4)
 async-pipeline = [
    "celery[redis]>=5.4",
    "redis>=5.0",
    "minio>=7.2",
    "sqlalchemy>=2.0",
    "psycopg[binary]>=3.2",
    "alembic>=1.13",
 ]
 [project.scripts]
 ocr-sprint-api = "ocr_sprint.main:run"
 [tool.setuptools.packages.find]
 where = ["src"]
 [tool.setuptools.package-data]
 "ocr_sprint" = ["py.typed"]
 # ---------- Tooling ----------
 [tool.ruff]
 line-length = 100
 target-version = "py310"
 src = ["src", "tests"]
 [tool.ruff.lint]
 select = [
    "E", "F", "W",       # pycodestyle / pyflakes
    "I",                 # isort
    "B",                 # bugbear
    "UP",                # pyupgrade
    "SIM",               # simplify
    "RUF",               # ruff-specific
    "C4",                # comprehensions
    "PIE",
    "PT",                # pytest style
    "TID",               # tidy imports
 ]
 ignore = [
    "E501",  # line length handled by formatter
    "B008",  # FastAPI Depends() pattern
 ]
 [tool.ruff.format]
 quote-style = "double"
 [tool.mypy]
 python_version = "3.10"
 strict = true
 warn_unused_ignores = true
 warn_redundant_casts = true
 disallow_untyped_defs = true
 plugins = ["pydantic.mypy"]
 mypy_path = "src"
 namespace_packages = true
 explicit_package_bases = true
 [[tool.mypy.overrides]]
 module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"]
 ignore_missing_imports = true
 [tool.pytest.ini_options]
 minversion = "8.0"
 addopts = "-ra --strict-markers --strict-config"
 testpaths = ["tests"]
 asyncio_mode = "auto"
 filterwarnings = [
    "ignore::DeprecationWarning:paddle.*",
    "ignore::DeprecationWarning:paddleocr.*",
 ]
 [tool.coverage.run]
 source = ["src/ocr_sprint"]
 branch = true
 [tool.coverage.report]
 exclude_lines = [
    "pragma: no cover",
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
 ]
--- a/samples/README.md
+++ b/samples/README.md
@@ -0,0 +1,13 @@
 # Samples
 Drop sample surat sprint files here for local testing. **Do NOT commit real documents** — `.gitignore` excludes binary file extensions in this folder.
 Recommended layout:
 ```
 samples/
  pdf/          # PDF scans
  photo/        # phone photos
  ground_truth/ # JSON ground-truth labels for evaluation
 ```
 For sharing real samples with the team, use the project's secured storage (MinIO/S3 once Phase 4 is live), not git.
--- a/src/ocr_sprint/__init__.py
+++ b/src/ocr_sprint/__init__.py
@@ -0,0 +1,3 @@
 """OCR Sprint Service — extract structured data from Indonesian police 'surat sprint'."""
 __version__ = "0.1.0"
--- a/src/ocr_sprint/api/__init__.py
+++ b/src/ocr_sprint/api/__init__.py
--- a/src/ocr_sprint/api/errors.py
+++ b/src/ocr_sprint/api/errors.py
@@ -0,0 +1,43 @@
 """HTTP error handlers."""
 from __future__ import annotations
 from fastapi import FastAPI, Request, status
 from fastapi.responses import JSONResponse
 from ocr_sprint.utils.logging import get_logger
 _logger = get_logger(__name__)
 class OCRServiceError(Exception):
    """Base class for application errors that should map to a 4xx response.

    The handler registered for ``OCRServiceError`` uses ``http_status`` as the
    response status code; subclasses override it to select a different code.
    """
    # Default status for subclasses that do not set their own: 400 Bad Request.
    http_status: int = status.HTTP_400_BAD_REQUEST
 class UnsupportedDocumentError(OCRServiceError):
    """Uploaded file is neither a PDF nor a recognized image format.

    Inherits the default ``http_status`` (400 Bad Request) from ``OCRServiceError``.
    """
 class JobNotFoundError(OCRServiceError):
    """Application error rendered as HTTP 404 Not Found.

    NOTE(review): presumably raised when a job id cannot be resolved — confirm
    at the call sites, none of which are visible in this module.
    """
    http_status = status.HTTP_404_NOT_FOUND
 def register_error_handlers(app: FastAPI) -> None:
    """Attach the service's exception handlers to ``app``.

    Two handlers are installed: one renders any ``OCRServiceError`` with the
    status code carried by the exception, and a catch-all logs the failure and
    answers with a generic 500 payload.
    """
    @app.exception_handler(OCRServiceError)
    async def _ocr_error_handler(_: Request, exc: OCRServiceError) -> JSONResponse:
        # Expose the concrete error class name and its message to the client.
        body = {"error": exc.__class__.__name__, "message": str(exc)}
        return JSONResponse(status_code=exc.http_status, content=body)
    @app.exception_handler(Exception)
    async def _unexpected_handler(_: Request, exc: Exception) -> JSONResponse:
        # Details go to the log only; the response body stays generic.
        _logger.exception("api.unhandled_exception", error=str(exc))
        body = {"error": "InternalServerError", "message": "Unexpected error"}
        return JSONResponse(
            content=body,
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
--- a/src/ocr_sprint/api/routes/__init__.py
+++ b/src/ocr_sprint/api/routes/__init__.py
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -0,0 +1,58 @@
 """Documents API — Phase 1 synchronous endpoint.
 POST /documents accepts a single PDF or image upload, runs the synchronous
 pipeline inline, and returns the structured result. This is suitable for
 development and low-traffic production; Phase 4 will introduce an async
 queue and a polling-style API at the same path.
 """
 from __future__ import annotations
 from uuid import uuid4
 from fastapi import APIRouter, File, UploadFile, status
 from ocr_sprint.api.errors import UnsupportedDocumentError
 from ocr_sprint.pipeline.orchestrator import run_pipeline
 from ocr_sprint.schemas.document import DocumentResponse
 from ocr_sprint.utils.logging import get_logger
 router = APIRouter(prefix="/documents", tags=["documents"])
 _logger = get_logger(__name__)
 _MAX_UPLOAD_BYTES = 25 * 1024 * 1024  # 25 MB
@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
 async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
    """Run OCR + extraction synchronously on a single upload.

    Args:
        file: The multipart upload (a PDF or an image).

    Returns:
        A ``DocumentResponse`` carrying a fresh job id plus the status,
        confidence, extraction result and review flags from the pipeline.

    Raises:
        UnsupportedDocumentError: If the upload is empty, is larger than
            ``_MAX_UPLOAD_BYTES``, or the pipeline raises ``ValueError``.
    """
    job_id = uuid4()
    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")
    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without pulling an arbitrarily large body into memory.
    content = await file.read(_MAX_UPLOAD_BYTES + 1)
    if not content:
        raise UnsupportedDocumentError("Uploaded file is empty.")
    if len(content) > _MAX_UPLOAD_BYTES:
        raise UnsupportedDocumentError(
            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
        )
    log.info("documents.received", size=len(content))
    try:
        output = run_pipeline(content)
    except ValueError as exc:
        # The pipeline signals unreadable/unsupported input with ValueError.
        raise UnsupportedDocumentError(str(exc)) from exc
    # Serialize the flags once; the same list feeds the log line and the response.
    flag_values = [f.value for f in output.result.review_flags]
    log.info(
        "documents.completed",
        status=output.status.value,
        confidence=round(output.confidence, 3),
        flags=flag_values,
    )
    return DocumentResponse(
        job_id=job_id,
        status=output.status,
        confidence=output.confidence,
        data=output.result,
        review_flags=flag_values,
    )
--- a/src/ocr_sprint/api/routes/health.py
+++ b/src/ocr_sprint/api/routes/health.py
@@ -0,0 +1,15 @@
 """Liveness / readiness endpoints."""
 from __future__ import annotations
 from fastapi import APIRouter
 from ocr_sprint import __version__
 router = APIRouter(tags=["health"])
@router.get("/health")
 async def health() -> dict[str, str]:
    """Liveness probe: report a fixed status string and the package version.

    Kept deliberately cheap — the OCR engine is never touched here.
    """
    payload: dict[str, str] = {"status": "ok", "version": __version__}
    return payload
--- a/src/ocr_sprint/config.py
+++ b/src/ocr_sprint/config.py
@@ -0,0 +1,72 @@
 """Application settings loaded from environment / .env file."""
 from __future__ import annotations
 from functools import lru_cache
 from pathlib import Path
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
    """Runtime configuration. Override via environment variables or a .env file.

    Environment names are matched case-insensitively (``case_sensitive=False``)
    and variables that match no field are ignored (``extra="ignore"``).
    """
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )
    # App
    app_env: str = "local"
    app_host: str = "0.0.0.0"
    app_port: int = 8000
    app_log_level: str = "INFO"
    # Storage (Phase 1: local fs)
    storage_local_dir: Path = Path("./storage")
    # OCR
    ocr_lang: str = "latin"
    ocr_use_gpu: bool = False
    # Model directories are optional; the OCR wrapper only forwards them when
    # they are set to a non-empty value.
    ocr_det_model_dir: str | None = None
    ocr_rec_model_dir: str | None = None
    ocr_cls_model_dir: str | None = None
    ocr_max_image_side: int = 2200
    # Preprocessing
    preprocess_target_dpi: int = 300
    preprocess_denoise: bool = True
    preprocess_deskew: bool = True
    preprocess_adaptive_threshold: bool = False
    # Confidence thresholds (Phase 5 routing)
    # Each value is range-checked to [0, 1].
    # NOTE(review): nothing here enforces needs_review <= auto_approve.
    confidence_auto_approve: float = Field(0.95, ge=0.0, le=1.0)
    confidence_needs_review: float = Field(0.85, ge=0.0, le=1.0)
    # LLM (Phase 5)
    llm_enabled: bool = False
    llm_provider: str = "ollama"
    llm_model: str = "qwen2.5:1.5b"
    llm_base_url: str = "http://localhost:11434"
    llm_timeout_s: int = 60
    # Async pipeline (Phase 4)
    # NOTE(review): the database and MinIO credentials below look like local
    # development placeholders — confirm they are overridden in real deployments.
    queue_enabled: bool = False
    redis_url: str = "redis://localhost:6379/0"
    database_url: str = "postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint"
    minio_endpoint: str = "localhost:9000"
    minio_access_key: str = "minioadmin"
    minio_secret_key: str = "minioadmin"
    minio_bucket: str = "ocr-sprint"
    minio_secure: bool = False
@lru_cache(maxsize=1)
 def get_settings() -> Settings:
    """Build the settings object and make sure its storage directory exists.

    The ``lru_cache`` decorator on this function means the body runs once per
    process; later calls get the same object back.
    """
    loaded = Settings()
    storage_root = loaded.storage_local_dir
    # Create the directory (and any missing parents); a no-op when it exists.
    storage_root.mkdir(parents=True, exist_ok=True)
    return loaded
--- a/src/ocr_sprint/data/__init__.py
+++ b/src/ocr_sprint/data/__init__.py
--- a/src/ocr_sprint/data/master_pangkat.py
+++ b/src/ocr_sprint/data/master_pangkat.py
@@ -0,0 +1,66 @@
 """Master data for Polri ranks ('pangkat').
 Used by the validation layer to:
 1. Confirm that a recognized rank string is a real Polri rank.
 2. Normalize variant spellings ("KBP" → "KOMBES POL", "Brig Pol" → "BRIGADIR") to a canonical form.
 Source: Peraturan Kapolri tentang Pangkat (publicly available, 2024).
 Update this file when ranks are reorganized.
 """
 from __future__ import annotations
 # Canonical abbreviation → accepted spellings. Lookups are case-insensitive
 # because the reverse index built from this table uppercases every variant.
 PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
    # Tamtama
    "BHARADA": ("BHARADA", "BHRD"),
    "BHARATU": ("BHARATU", "BHRT"),
    "BHARAKA": ("BHARAKA", "BHRK"),
    "ABRIP": ("ABRIP",),
    # Ajun Brigadir Polisi Dua — was missing from the table.
    "ABRIPDA": ("ABRIPDA",),
    "ABRIPTU": ("ABRIPTU",),
    # NOTE(review): "ABRIPKA" is kept for backward compatibility; confirm it is
    # a real rank abbreviation against the current Polri rank regulation.
    "ABRIPKA": ("ABRIPKA",),
    # Bintara
    "BRIPDA": ("BRIPDA",),
    "BRIPTU": ("BRIPTU",),
    # The spelled-out form and "BRIGPOL" are accepted alongside the short forms.
    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL", "BRIGADIR POLISI"),
    "BRIPKA": ("BRIPKA",),
    "AIPDA": ("AIPDA",),
    "AIPTU": ("AIPTU",),
    # Perwira Pertama
    "IPDA": ("IPDA",),
    "IPTU": ("IPTU",),
    "AKP": ("AKP",),
    # Perwira Menengah
    "KOMPOL": ("KOMPOL",),
    "AKBP": ("AKBP",),
    "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
    # Perwira Tinggi
    "BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
    "IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
    "KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
    "JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
 }
 # Reverse index: every accepted spelling (uppercased) → its canonical rank.
 _VARIANT_TO_CANONICAL: dict[str, str] = dict(
    (alias.upper(), canonical_name)
    for canonical_name, aliases in PANGKAT_VARIANTS.items()
    for alias in aliases
 )
 def normalize_pangkat(raw: str | None) -> str | None:
    """Map a raw rank string onto its canonical Polri rank.

    The input is trimmed, uppercased and has internal whitespace collapsed
    before lookup. If that fails, one more lookup is tried with trailing
    ``. , ; :`` removed (so "AKP." still resolves).

    Returns:
        The canonical rank, or ``None`` when the input is empty or unknown.
    """
    if not raw:
        return None
    tokens = raw.strip().upper().split()
    collapsed = " ".join(tokens)
    # Exact spelling first, then the same text without trailing punctuation.
    for candidate in (collapsed, collapsed.rstrip(".,;:")):
        canonical = _VARIANT_TO_CANONICAL.get(candidate)
        if canonical is not None:
            return canonical
    return None
 def is_valid_pangkat(raw: str | None) -> bool:
    """Tell whether ``raw`` resolves to a known Polri rank once normalized."""
    canonical = normalize_pangkat(raw)
    return canonical is not None
--- a/src/ocr_sprint/main.py
+++ b/src/ocr_sprint/main.py
@@ -0,0 +1,42 @@
 """FastAPI entrypoint."""
 from __future__ import annotations
 from fastapi import FastAPI
 from ocr_sprint import __version__
 from ocr_sprint.api.errors import register_error_handlers
 from ocr_sprint.api.routes import documents, health
 from ocr_sprint.config import get_settings
 from ocr_sprint.utils.logging import configure_logging
 def create_app() -> FastAPI:
    """Application factory: configure logging, build the app, attach routes.

    Keeping construction in a function (rather than at import time only)
    makes it easy to build fresh instances in tests.
    """
    cfg = get_settings()
    configure_logging(cfg.app_log_level)
    metadata = {
        "title": "OCR Sprint Service",
        "version": __version__,
        "description": "OCR + structured extraction for Indonesian police 'surat sprint' documents.",
        "docs_url": "/docs",
        "redoc_url": "/redoc",
        "openapi_url": "/openapi.json",
    }
    application = FastAPI(**metadata)
    register_error_handlers(application)
    # Health first, then documents — both under the versioned API prefix.
    for route_module in (health, documents):
        application.include_router(route_module.router, prefix="/api/v1")
    return application
 # Module-level ASGI application; `run()` hands uvicorn the import string
 # "ocr_sprint.main:app", which resolves to this object.
 app = create_app()
 def run() -> None:
    """Console-script entrypoint (`ocr-sprint-api`): serve the app with uvicorn.

    Host and port come from the application settings; auto-reload is off.
    """
    # Imported here so uvicorn is only needed when the server is actually started.
    import uvicorn
    cfg = get_settings()
    bind_host, bind_port = cfg.app_host, cfg.app_port
    uvicorn.run("ocr_sprint.main:app", host=bind_host, port=bind_port, reload=False)
--- a/src/ocr_sprint/pipeline/__init__.py
+++ b/src/ocr_sprint/pipeline/__init__.py
@@ -0,0 +1 @@
 """OCR pipeline: ingest → preprocess → OCR → extract → validate."""
--- a/src/ocr_sprint/pipeline/confidence.py
+++ b/src/ocr_sprint/pipeline/confidence.py
@@ -0,0 +1,51 @@
 """Confidence scoring + routing decision.
 The score is a weighted blend of:
  - mean OCR confidence across all detected lines
  - validation pass rate (1.0 if no review flags, decreases per flag)
 This is intentionally simple for Phase 1; Phase 5 will add LLM logprob
 contributions and per-field confidences.
 """
 from __future__ import annotations
 from ocr_sprint.config import get_settings
 from ocr_sprint.schemas.document import DocumentStatus
 from ocr_sprint.schemas.extraction import ReviewFlag
 # Per-flag penalty applied to the validation component of the score.
 # Each raised flag subtracts its penalty from a starting validation score of
 # 1.0; flags missing from this table cost 0.05 (see compute_confidence).
 _FLAG_PENALTY: dict[ReviewFlag, float] = {
    ReviewFlag.LOW_OCR_CONFIDENCE: 0.10,
    ReviewFlag.MISSING_FIELD: 0.20,
    ReviewFlag.INVALID_NRP: 0.10,
    ReviewFlag.UNKNOWN_PANGKAT: 0.05,
    ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
    ReviewFlag.DATE_PARSE_FAILED: 0.10,
 }
 # Blend weights for the OCR and validation components of the final score.
 OCR_WEIGHT = 0.6
 VALIDATION_WEIGHT = 0.4
 def compute_confidence(
    ocr_confidence: float,
    flags: list[ReviewFlag],
 ) -> float:
    """Combine OCR confidence and review flags into one score in [0, 1].

    The validation component starts at 1.0 and loses the penalty of every flag
    (0.05 for flags without a table entry), floored at 0. It is then blended
    with the OCR confidence using the module weights and clamped to [0, 1].
    """
    remaining = 1.0
    for raised in flags:
        penalty = _FLAG_PENALTY.get(raised, 0.05)
        remaining -= penalty
    remaining = max(0.0, remaining)
    ocr_part = OCR_WEIGHT * ocr_confidence
    validation_part = VALIDATION_WEIGHT * remaining
    return max(0.0, min(1.0, ocr_part + validation_part))
 def route(confidence: float) -> DocumentStatus:
    """Translate a final confidence score into the job's terminal status.

    Scores at or above the auto-approve threshold complete the job; every
    other score is sent to human review.
    """
    thresholds = get_settings()
    if confidence >= thresholds.confidence_auto_approve:
        return DocumentStatus.COMPLETED
    # Scores inside the review band and scores below it currently share the
    # same outcome: both go to a human.
    return DocumentStatus.NEEDS_REVIEW
--- a/src/ocr_sprint/pipeline/extract/__init__.py
+++ b/src/ocr_sprint/pipeline/extract/__init__.py
@@ -0,0 +1 @@
 """Information extraction layer (regex Phase 1, LLM Phase 5)."""
--- a/src/ocr_sprint/pipeline/extract/regex_rules.py
+++ b/src/ocr_sprint/pipeline/extract/regex_rules.py
@@ -0,0 +1,169 @@
 """Regex-based extraction for the deterministic header fields of a surat sprint.
 Targets header fields whose layout is highly standardized across Polri units:
  - Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
  - Tanggal (date the sprint was issued)
  - Satuan penerbit (issuing unit)
  - Perihal
  - Dasar (numbered list of legal/operational basis)
 Personnel table extraction is intentionally NOT done here — that needs
 PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
 """
 from __future__ import annotations
 import re
 from datetime import date
 from ocr_sprint.schemas.extraction import HeaderFields, Signatory
 # ---------- regex patterns ----------
 # Nomor sprint, tolerant of spacing and OCR noise.
 # Examples it should match:
 #   "Sprin / 123 / IV / 2025 / Reskrim"
 #   "SPRIN/345/X/2024"
 #   "Nomor : Sprin/12/I/2025/Sat Intelkam"
 _RE_NOMOR_SPRINT = re.compile(
    r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}(?:[\s./-]*[\w .-]+?)?",
    re.IGNORECASE,
 )
 # Indonesian month names (uppercase) → month number. The keys are also used
 # to build the month alternation of the date pattern below.
 _BULAN_MAP: dict[str, int] = {
    "JANUARI": 1,
    "FEBRUARI": 2,
    "MARET": 3,
    "APRIL": 4,
    "MEI": 5,
    "JUNI": 6,
    "JULI": 7,
    "AGUSTUS": 8,
    "SEPTEMBER": 9,
    "OKTOBER": 10,
    "NOVEMBER": 11,
    "DESEMBER": 12,
 }
 # Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025"
 # Groups: 1 = day (1-2 digits), 2 = month name, 3 = four-digit year.
 _RE_TANGGAL_ID = re.compile(
    r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP.keys()) + r")\s*[-./\s]\s*(\d{4})\b",
    re.IGNORECASE,
 )
 # Satuan penerbit usually appears in the document letterhead, prefixed by
 # KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
 # The match continues for up to 80 further characters on the same line.
 # The final "RESORT" alternative is already covered by "RESOR(?:T)?".
 _RE_SATUAN = re.compile(
    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
    r"[^\n]{0,80}",
    re.IGNORECASE,
 )
 # "Perihal : ...." up to end of line. Group 1 is the text after the separator.
 _RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
 # A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
 # A closing parenthesis is accepted too ("1) ..."). Groups: 1 = number, 2 = text.
 _RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
 # Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
 # The pattern accepts any run of 8 to 20 digits; group 1 is the label, group 2 the digits.
 _RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
 def find_nomor_sprint(text: str) -> str | None:
    """Locate the first nomor sprint in ``text``.

    Returns:
        The matched text with every whitespace run collapsed to one space,
        or ``None`` when the pattern does not occur.
    """
    found = _RE_NOMOR_SPRINT.search(text)
    if found is None:
        return None
    pieces = found.group(0).split()
    return " ".join(pieces)
 def find_tanggal(text: str) -> date | None:
    """Pick the issuance date out of the document text.

    A surat sprint usually carries several dates: the 'Dasar' section cites
    earlier documents, and the issuance date sits near the signatory at the
    bottom ('Tempat, DD Month YYYY'). The **last** date found is therefore
    the one returned.

    Returns:
        The last date in ``text``, or ``None`` when no date is found or the
        last one is not a valid calendar date.
    """
    final_match = None
    for final_match in _RE_TANGGAL_ID.finditer(text):
        pass  # keep only the last hit
    if final_match is None:
        return None
    day_text, month_name, year_text = final_match.groups()
    month_number = _BULAN_MAP.get(month_name.upper())
    if month_number is None:
        return None
    try:
        return date(int(year_text), month_number, int(day_text))
    except ValueError:
        # e.g. "31 April 2025" — matched by the pattern but not a real date
        return None
 def find_satuan(text: str) -> str | None:
    """Return the first letterhead hit (issuing unit) with whitespace collapsed.

    ``None`` is returned when the letterhead pattern does not occur.
    """
    hit = _RE_SATUAN.search(text)
    return " ".join(hit.group(0).split()) if hit else None
 def find_perihal(text: str) -> str | None:
    """Return the subject from the first 'Perihal: ...' line.

    The text is scanned line by line, so the result never spans more than
    the line on which the label appears. ``None`` when no line matches.
    """
    hits = (_RE_PERIHAL.search(row) for row in text.splitlines())
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1).strip()
 def find_dasar_list(text: str) -> list[str]:
    """Extract numbered 'Dasar' items from the text.

    Heuristic: locate the first line that starts with 'DASAR' (e.g. "DASAR :")
    and collect the numbered entries that follow. Text after the label on the
    header line itself is parsed too, so the common single-line layout
    "Dasar : 1. ..." keeps its first item. Unnumbered lines after an item are
    appended to that item as a continuation.

    Collection stops at the first blank line after at least one item, or at a
    line that begins with another section keyword.

    Returns:
        The item texts in document order, without their numbers. Empty when
        there is no 'Dasar' section or it holds no numbered item.
    """
    items: list[str] = []
    in_dasar = False
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not in_dasar:
            header = re.match(r"DASAR\b\s*[:\-]?\s*", line, re.IGNORECASE)
            if header is None:
                continue
            in_dasar = True
            # Keep whatever follows the label: the first item often shares
            # the header line.
            line = line[header.end():].strip()
            if not line:
                continue
        else:
            if not line:
                if items:
                    break
                continue
            upper = line.upper()
            if any(upper.startswith(term) for term in section_terminators):
                break
        m = _RE_DASAR_ITEM.match(line)
        if m:
            items.append(m.group(2).strip())
        elif items:
            # continuation of the previous dasar item
            items[-1] = (items[-1] + " " + line).strip()
    return items
 def find_signatory(text: str) -> Signatory:
    """Best-effort extraction of the signatory block (last NRP in the document)."""
    matches = list(_RE_NRP.finditer(text))
    if not matches:
        return Signatory()
    last = matches[-1]
    return Signatory(nrp=last.group(2))
 def extract_header(text: str) -> HeaderFields:
    """Run all header-level regex extractors and return a populated schema."""
    return HeaderFields(
        nomor_sprint=find_nomor_sprint(text),
        tanggal=find_tanggal(text),
        satuan_penerbit=find_satuan(text),
        perihal=find_perihal(text),
        dasar=find_dasar_list(text),
    )
--- a/src/ocr_sprint/pipeline/extract/validators.py
+++ b/src/ocr_sprint/pipeline/extract/validators.py
@@ -0,0 +1,64 @@
 """Cross-field validation, with structured review-flag output."""
 from __future__ import annotations
 import re
 from ocr_sprint.data.master_pangkat import is_valid_pangkat
 from ocr_sprint.schemas.extraction import (
    ExtractionResult,
    HeaderFields,
    ReviewFlag,
 )
 from ocr_sprint.schemas.personnel import PersonnelEntry
 # A well-formed Polri NRP is exactly eight digits.
 _RE_NRP_8 = re.compile(r"^\d{8}$")
 def validate_nrp(nrp: str | None) -> bool:
    """Check that ``nrp`` is an 8-digit Polri NRP.

    Surrounding whitespace is ignored. ``None`` is never valid.
    """
    if nrp is None:
        return False
    candidate = nrp.strip()
    return _RE_NRP_8.match(candidate) is not None
 def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
    """Collect the review flags raised by one personnel row.

    A field is only checked when it has a value: a present but malformed NRP
    yields ``INVALID_NRP``, a present but unknown rank yields
    ``UNKNOWN_PANGKAT``. Empty fields raise nothing here.
    """
    checks = (
        (entry.nrp, validate_nrp, ReviewFlag.INVALID_NRP),
        (entry.pangkat, is_valid_pangkat, ReviewFlag.UNKNOWN_PANGKAT),
    )
    return [flag for value, is_ok, flag in checks if value and not is_ok(value)]
 def validate_header(header: HeaderFields) -> list[ReviewFlag]:
    """Report header problems as review flags.

    A missing nomor sprint yields ``MISSING_FIELD``; a missing date yields
    ``DATE_PARSE_FAILED``. The flags come back in that order.
    """
    required = (
        (header.nomor_sprint, ReviewFlag.MISSING_FIELD),
        (header.tanggal, ReviewFlag.DATE_PARSE_FAILED),
    )
    return [flag for value, flag in required if value is None]
 def validate_extraction(
    result: ExtractionResult,
    expected_personnel_count: int | None = None,
 ) -> list[ReviewFlag]:
    """Validate a whole extraction and return its distinct review flags.

    Header flags come first, then the flags of each personnel row, then
    ``PERSONNEL_COUNT_MISMATCH`` when an expected count is given and differs
    from the number of rows. Duplicates are dropped; first-seen order is kept.
    """
    collected: list[ReviewFlag] = list(validate_header(result.header))
    for row in result.personel:
        collected += validate_personnel_entry(row)
    count_known = expected_personnel_count is not None
    if count_known and expected_personnel_count != len(result.personel):
        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)
    # dict keys are unique and keep insertion order → order-preserving dedupe
    return list(dict.fromkeys(collected))
--- a/src/ocr_sprint/pipeline/ingest.py
+++ b/src/ocr_sprint/pipeline/ingest.py
@@ -0,0 +1,81 @@
 """Ingest layer: convert uploaded bytes (PDF/IMG) into a list of numpy images."""
 from __future__ import annotations
 import io
 from dataclasses import dataclass
 from typing import Any
 import fitz  # PyMuPDF
 import numpy as np
 from PIL import Image
 from ocr_sprint.schemas.document import SourceKind
 # Generic alias used across the pipeline. We don't constrain dtype/shape because
 # OpenCV operations accept multiple dtypes and numpy generics are still rough.
 NDArrayU8 = np.ndarray[Any, Any]
 # Leading byte signatures that detect_source_kind compares against the start
 # of an upload: one for PDF, the rest for the accepted image formats.
 PDF_MAGIC = b"%PDF-"
 PNG_MAGIC = b"\x89PNG\r\n\x1a\n"
 JPEG_MAGIC = b"\xff\xd8\xff"
 TIFF_MAGIC_LE = b"II*\x00"
 TIFF_MAGIC_BE = b"MM\x00*"
@dataclass(frozen=True)
 class IngestedPage:
    """One page worth of image data ready for preprocessing.

    Declared as a frozen dataclass, so fields cannot be reassigned after
    construction.
    """
    image: NDArrayU8  # HxWx3 BGR uint8 (OpenCV convention)
    page_index: int  # zero-based position of the page within the source document
 def detect_source_kind(content: bytes) -> SourceKind:
    """Classify an upload by its leading bytes.

    Returns ``PDF`` for the PDF signature, ``IMAGE`` for a PNG, JPEG or TIFF
    signature, and ``UNKNOWN`` for anything else.
    """
    signature_table = (
        ((PDF_MAGIC,), SourceKind.PDF),
        ((PNG_MAGIC, JPEG_MAGIC, TIFF_MAGIC_LE, TIFF_MAGIC_BE), SourceKind.IMAGE),
    )
    for signatures, kind in signature_table:
        if content.startswith(signatures):
            return kind
    return SourceKind.UNKNOWN
 def _pil_to_bgr(img: Image.Image) -> NDArrayU8:
    """Convert PIL image to OpenCV BGR numpy array."""
    if img.mode != "RGB":
        img = img.convert("RGB")
    arr = np.asarray(img, dtype=np.uint8)
    # RGB to BGR
    return arr[:, :, ::-1].copy()
 def ingest_pdf(content: bytes, target_dpi: int = 300) -> list[IngestedPage]:
    """Render every page of a PDF to a numpy image at the target DPI.

    Uses PyMuPDF (no poppler dependency). DPI is enforced via a transform matrix:
    fitz's default is 72 DPI, so the zoom factor is target_dpi / 72.

    Args:
        content: Raw bytes of the PDF.
        target_dpi: Resolution to render each page at.

    Returns:
        One ``IngestedPage`` per page, in document order.

    Raises:
        ValueError: If PyMuPDF cannot open or render the document (for
            example a truncated or corrupt file). Callers already treat
            ``ValueError`` from the pipeline as "unsupported document".
    """
    pages: list[IngestedPage] = []
    zoom = target_dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            for idx, page in enumerate(doc):
                # alpha=False keeps the pixmap at three samples per pixel.
                pix = page.get_pixmap(matrix=matrix, alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                pages.append(IngestedPage(image=_pil_to_bgr(img), page_index=idx))
    except RuntimeError as exc:
        # PyMuPDF reports unreadable documents with RuntimeError subclasses;
        # without this they would surface as an unexpected server error.
        raise ValueError("PDF could not be opened or rendered; the file may be corrupt.") from exc
    return pages
 def ingest_image(content: bytes) -> list[IngestedPage]:
    """Decode a single image into a one-element page list.

    The EXIF orientation tag, when present, is applied before conversion so
    that photos taken with a rotated phone reach OCR upright.

    Args:
        content: Raw bytes of the image file.

    Returns:
        A list holding exactly one ``IngestedPage`` with ``page_index`` 0.

    Raises:
        ValueError: If Pillow cannot identify or decode the data, or rejects
            it as a decompression bomb. Callers already treat ``ValueError``
            from the pipeline as "unsupported document".
    """
    # Local import: ImageOps is only needed here.
    from PIL import ImageOps

    try:
        # The context manager releases the decoder once the pixels are copied out.
        with Image.open(io.BytesIO(content)) as img:
            upright = ImageOps.exif_transpose(img)
            if upright is None:
                upright = img
            bgr = _pil_to_bgr(upright)
    except (OSError, Image.DecompressionBombError) as exc:
        # Pillow's "cannot identify image" error is an OSError subclass.
        raise ValueError("Image could not be decoded; the file may be corrupt.") from exc
    return [IngestedPage(image=bgr, page_index=0)]
 def ingest(content: bytes, kind: SourceKind, target_dpi: int = 300) -> list[IngestedPage]:
    """Route the payload to the loader that matches its declared kind.

    Raises:
        ValueError: If ``kind`` is neither PDF nor IMAGE.
    """
    loaders = (
        (SourceKind.PDF, lambda: ingest_pdf(content, target_dpi=target_dpi)),
        (SourceKind.IMAGE, lambda: ingest_image(content)),
    )
    for expected_kind, load in loaders:
        if kind == expected_kind:
            return load()
    raise ValueError(f"Unsupported source kind: {kind}")
--- a/src/ocr_sprint/pipeline/ocr.py
+++ b/src/ocr_sprint/pipeline/ocr.py
@@ -0,0 +1,106 @@
 """PaddleOCR wrapper.
 PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
 so we keep a process-global instance behind a lazy accessor.
 The wrapper exposes a small, stable surface so the rest of the pipeline does
 not depend directly on paddleocr's evolving API.
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from threading import Lock
 from typing import TYPE_CHECKING
 import numpy as np
 from ocr_sprint.config import get_settings
 from ocr_sprint.pipeline.ingest import NDArrayU8
 from ocr_sprint.utils.logging import get_logger
 if TYPE_CHECKING:
    from paddleocr import PaddleOCR
 _logger = get_logger(__name__)
 # Guards creation of the shared engine in get_ocr().
 _lock = Lock()
 # Process-wide engine instance; stays None until get_ocr() first builds it.
 _instance: PaddleOCR | None = None
@dataclass(frozen=True)
 class OCRLine:
    """One recognized line with its bounding polygon and confidence.

    Declared as a frozen dataclass, so fields cannot be reassigned after
    construction.
    """
    text: str  # recognized text of the line
    confidence: float  # recognition score reported by the engine for this line
    box: tuple[tuple[float, float], ...]  # 4 (x, y) corner points
@dataclass(frozen=True)
 class OCRPage:
    """OCR output for a single page: the recognized lines, in engine order."""
    lines: list[OCRLine]
    @property
    def text(self) -> str:
        """Page text: every line's text, newline-joined, in the stored order."""
        return "\n".join([entry.text for entry in self.lines])
    @property
    def mean_confidence(self) -> float:
        """Average line confidence, or 0.0 for a page without lines."""
        scores = [entry.confidence for entry in self.lines]
        return float(np.mean(scores)) if scores else 0.0
 def _build_paddleocr() -> PaddleOCR:
    from paddleocr import PaddleOCR
    s = get_settings()
    kwargs: dict[str, object] = {
        "lang": s.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": s.ocr_use_gpu,
        "show_log": False,
    }
    if s.ocr_det_model_dir:
        kwargs["det_model_dir"] = s.ocr_det_model_dir
    if s.ocr_rec_model_dir:
        kwargs["rec_model_dir"] = s.ocr_rec_model_dir
    if s.ocr_cls_model_dir:
        kwargs["cls_model_dir"] = s.ocr_cls_model_dir
    _logger.info("paddleocr.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu)
    return PaddleOCR(**kwargs)
 def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, building it on first use.

    The instance is checked once without the lock and once more while
    holding it, so the engine is built at most once.
    """
    global _instance
    if _instance is not None:
        return _instance
    with _lock:
        # Re-check under the lock: the engine may have been built meanwhile.
        if _instance is None:
            _instance = _build_paddleocr()
    return _instance
 def run_ocr(image: NDArrayU8) -> OCRPage:
    """OCR one BGR image and wrap the engine output in an ``OCRPage``.

    Entries that are empty or shorter than two elements are skipped, as are
    entries whose box points cannot be read as (x, y) float pairs.
    """
    raw = get_ocr().ocr(image, cls=True)
    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    if not raw or raw[0] is None:
        return OCRPage(lines=[])
    recognized: list[OCRLine] = []
    for entry in raw[0]:
        if not entry or len(entry) < 2:
            continue
        corners, recognition = entry[0], entry[1]
        line_text = recognition[0]
        score = float(recognition[1])
        try:
            polygon = tuple((float(pt[0]), float(pt[1])) for pt in corners)
        except (TypeError, ValueError, IndexError):
            continue
        recognized.append(OCRLine(text=line_text, confidence=score, box=polygon))
    return OCRPage(lines=recognized)
--- a/src/ocr_sprint/pipeline/orchestrator.py
+++ b/src/ocr_sprint/pipeline/orchestrator.py
@@ -0,0 +1,103 @@
 """Synchronous pipeline orchestrator (Phase 1).
 Wires the individual stages together:
    bytes → ingest → preprocess → OCR → regex extract → validate → score
 Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug
 in PP-Structure for tables and an LLM extractor for variant fields.
 """
 from __future__ import annotations
 from dataclasses import dataclass
 from ocr_sprint.config import get_settings
 from ocr_sprint.pipeline.confidence import compute_confidence, route
 from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
 from ocr_sprint.pipeline.extract.validators import validate_extraction
 from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
 from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
 from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
 from ocr_sprint.schemas.document import DocumentStatus, SourceKind
 from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag
 from ocr_sprint.utils.logging import get_logger
 _logger = get_logger(__name__)
 # Below this OCR confidence we automatically flag for review.
 # Compared against the mean of the per-page mean confidences in run_pipeline.
 _OCR_CONFIDENCE_FLAG_THRESHOLD = 0.80
@dataclass
 class PipelineOutput:
    """Bundle returned by the orchestrator."""
    source_kind: SourceKind  # kind detected from the upload's leading bytes
    status: DocumentStatus  # routing decision derived from the final confidence
    confidence: float  # final blended score (the same value is stored on result.confidence)
    result: ExtractionResult  # extracted fields, raw text and review flags
 def run_pipeline(content: bytes) -> PipelineOutput:
    """Run the whole synchronous pipeline on raw upload bytes.

    Stages: sniff the file type, render pages, preprocess and OCR each page,
    extract header and signatory with regexes, validate, then score and route.

    Raises:
        ValueError: If the bytes are neither a PDF nor a recognized image.
    """
    settings = get_settings()
    source_kind = detect_source_kind(content)
    if source_kind == SourceKind.UNKNOWN:
        raise ValueError("Unsupported file type — only PDF and common image formats are accepted.")
    pages = ingest(content, source_kind, target_dpi=settings.preprocess_target_dpi)
    _logger.info("pipeline.ingested", source_kind=source_kind.value, pages=len(pages))
    pre_cfg = PreprocessConfig(
        max_side=settings.ocr_max_image_side,
        denoise=settings.preprocess_denoise,
        deskew=settings.preprocess_deskew,
        adaptive_threshold=settings.preprocess_adaptive_threshold,
    )
    # Each page is cleaned and recognized before the next one is touched.
    ocr_pages: list[OCRPage] = [run_ocr(preprocess(page.image, pre_cfg)) for page in pages]
    full_text = "\n".join(ocr_page.text for ocr_page in ocr_pages)
    if ocr_pages:
        # Mean of the per-page means: every page weighs the same.
        mean_ocr_conf = sum(ocr_page.mean_confidence for ocr_page in ocr_pages) / len(ocr_pages)
    else:
        mean_ocr_conf = 0.0
    ocr_flags: list[ReviewFlag] = []
    if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
        ocr_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
    result = ExtractionResult(
        header=extract_header(full_text),
        personel=[],  # Phase 3 will populate from PP-Structure
        untuk=[],
        ttd=find_signatory(full_text),
        raw_text=full_text,
        confidence=mean_ocr_conf,
        review_flags=list(ocr_flags),
    )
    flags = validate_extraction(result)
    # Validation flags first, then the OCR-confidence flag unless already present.
    extra_flags = [flag for flag in ocr_flags if flag not in flags]
    flags.extend(extra_flags)
    result.review_flags = flags
    final_conf = compute_confidence(mean_ocr_conf, flags)
    result.confidence = final_conf
    return PipelineOutput(
        source_kind=source_kind,
        status=route(final_conf),
        confidence=final_conf,
        result=result,
    )
--- a/src/ocr_sprint/pipeline/preprocess.py
+++ b/src/ocr_sprint/pipeline/preprocess.py
@@ -0,0 +1,108 @@
 """Image preprocessing for OCR.
 Phase 1 implements the "always-on" steps that work for both clean PDF scans
 and reasonable phone photos:
  - resize to a reasonable max side (PaddleOCR runs faster on smaller inputs)
  - convert to grayscale for analysis (kept as 3-channel BGR for paddle)
  - denoise (Non-Local Means, gentle)
  - deskew via Hough line angle estimate
  - optional adaptive threshold for low-quality phone photos
 Phase 2 will add document-corner detection + perspective transform + dewarping
 for tilted phone shots; those live in `document_detect.py` (added later).
 """
 from __future__ import annotations
 from dataclasses import dataclass
 import cv2
 import numpy as np
 from ocr_sprint.pipeline.ingest import NDArrayU8
@dataclass(frozen=True)
 class PreprocessConfig:
    """Tunable knobs for the preprocessing pipeline."""
    max_side: int = 2200  # longest image side allowed before downscaling
    denoise: bool = True  # run colored Non-Local Means denoising
    deskew: bool = True  # estimate the skew angle and rotate to compensate
    adaptive_threshold: bool = False  # binarize with a Gaussian adaptive threshold
 def _resize_max_side(img: NDArrayU8, max_side: int) -> NDArrayU8:
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest <= max_side:
        return img
    scale = max_side / longest
    new_w, new_h = round(w * scale), round(h * scale)
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
 def _estimate_skew_angle(gray: NDArrayU8) -> float:
    """Estimate skew using Canny + Hough; returns angle in degrees within [-15, 15]."""
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 360, threshold=200)
    if lines is None or len(lines) == 0:
        return 0.0
    angles: list[float] = []
    for line in lines[:200]:
        rho, theta = line[0]
        del rho
        # convert to angle relative to horizontal (degrees)
        angle = (theta * 180.0 / np.pi) - 90.0
        # only keep nearly-horizontal lines (within ±15°)
        if -15.0 < angle < 15.0:
            angles.append(angle)
    if not angles:
        return 0.0
    return float(np.median(angles))
 def _rotate(img: NDArrayU8, angle_deg: float) -> NDArrayU8:
    if abs(angle_deg) < 0.1:
        return img
    h, w = img.shape[:2]
    center = (w / 2, h / 2)
    matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    return cv2.warpAffine(
        img,
        matrix,
        (w, h),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
 def preprocess(img: NDArrayU8, cfg: PreprocessConfig | None = None) -> NDArrayU8:
    """Run preprocessing and return a clean BGR uint8 image suitable for OCR.

    Steps, in order: downscale to ``cfg.max_side``; optionally deskew;
    optionally denoise; optionally binarize with an adaptive threshold
    (converted back to three channels).

    Args:
        img: BGR input image.
        cfg: Which steps to run; defaults to ``PreprocessConfig()``.
    """
    if cfg is None:
        cfg = PreprocessConfig()
    out = _resize_max_side(img, cfg.max_side)
    if cfg.deskew:
        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
        angle = _estimate_skew_angle(gray)
        # The estimate is the tilt of the text lines in image coordinates
        # (y pointing down): positive means the lines run down to the right.
        # cv2.getRotationMatrix2D treats positive angles as counter-clockwise,
        # so rotating by +angle levels the lines. Rotating by -angle would
        # double the skew instead of removing it.
        out = _rotate(out, angle)
    if cfg.denoise:
        out = cv2.fastNlMeansDenoisingColored(out, None, 5, 5, 7, 21)
    if cfg.adaptive_threshold:
        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
        binarized = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=31,
            C=15,
        )
        # back to three channels so the output format does not depend on the step
        out = cv2.cvtColor(binarized, cv2.COLOR_GRAY2BGR)
    return out
--- a/src/ocr_sprint/py.typed
+++ b/src/ocr_sprint/py.typed
--- a/src/ocr_sprint/schemas/__init__.py
+++ b/src/ocr_sprint/schemas/__init__.py
@@ -0,0 +1,27 @@
 """Pydantic schemas for input/output of the OCR Sprint service."""
 from ocr_sprint.schemas.document import (
    DocumentJob,
    DocumentResponse,
    DocumentStatus,
    SourceKind,
 )
 from ocr_sprint.schemas.extraction import (
    ExtractionResult,
    HeaderFields,
    ReviewFlag,
    Signatory,
 )
 from ocr_sprint.schemas.personnel import PersonnelEntry
 __all__ = [
    "DocumentJob",
    "DocumentResponse",
    "DocumentStatus",
    "ExtractionResult",
    "HeaderFields",
    "PersonnelEntry",
    "ReviewFlag",
    "Signatory",
    "SourceKind",
 ]
--- a/src/ocr_sprint/schemas/document.py
+++ b/src/ocr_sprint/schemas/document.py
@@ -0,0 +1,57 @@
 """Job-level schemas (request, response, status)."""
 from __future__ import annotations
 from datetime import datetime
 from enum import Enum
 from typing import Any
 from uuid import UUID, uuid4
 from pydantic import BaseModel, ConfigDict, Field
 from ocr_sprint.schemas.extraction import ExtractionResult
 class SourceKind(str, Enum):
    """High-level type of the uploaded document.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values.
    """
    # Content recognised as a PDF (e.g. bytes starting with ``%PDF-``).
    PDF = "pdf"
    # Content recognised as a raster image (the tests cover PNG and JPEG).
    IMAGE = "image"
    # Anything not recognised; also the default on a new ``DocumentJob``.
    UNKNOWN = "unknown"
 class DocumentStatus(str, Enum):
    """Lifecycle status of an OCR job.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values (the API tests compare against ``"completed"``).
    """
    # Default status of a freshly created ``DocumentJob``.
    PENDING = "pending"
    # NOTE(review): presumably set while the pipeline is running — confirm against the orchestrator.
    PROCESSING = "processing"
    # Result accepted without human review (high-confidence routing outcome).
    COMPLETED = "completed"
    # Result routed to a human reviewer (mid/low-confidence routing outcome).
    NEEDS_REVIEW = "needs_review"
    # NOTE(review): presumably paired with ``DocumentJob.error`` — confirm against callers.
    FAILED = "failed"
 def _utcnow_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.utcnow()``, which is deprecated since
    Python 3.12. ``tzinfo`` is stripped on purpose so the produced values stay
    identical in shape (naive, UTC) to what the model emitted before.
    """
    # Local import: the module only imports ``datetime`` at the top.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)
 class DocumentJob(BaseModel):
    """Internal representation of a job (Phase 1 holds it in-memory).

    Only ``filename`` is required; every other field has a default.
    ``created_at`` and ``updated_at`` are naive datetimes expressed in UTC and
    are generated independently when the model is instantiated.
    """
    # Enum fields keep their Enum members rather than raw string values.
    model_config = ConfigDict(use_enum_values=False)
    # Random UUID4 assigned per job.
    job_id: UUID = Field(default_factory=uuid4)
    source_kind: SourceKind = SourceKind.UNKNOWN
    filename: str
    status: DocumentStatus = DocumentStatus.PENDING
    created_at: datetime = Field(default_factory=_utcnow_naive)
    updated_at: datetime = Field(default_factory=_utcnow_naive)
    # Error message, if any; ``None`` by default.
    error: str | None = None
    # Extraction payload, if any; ``None`` by default.
    result: ExtractionResult | None = None
    # Free-form diagnostic data; a fresh dict per instance.
    debug: dict[str, Any] = Field(default_factory=dict)
 class DocumentResponse(BaseModel):
    """Payload the documents API sends back to the client.

    ``job_id`` and ``status`` are mandatory; the remaining fields default to
    ``None`` (or an empty list for ``review_flags``).
    """
    job_id: UUID
    status: DocumentStatus
    # Overall confidence score, when one is available.
    confidence: float | None = Field(default=None)
    # Structured extraction result, when one is available.
    data: ExtractionResult | None = Field(default=None)
    # Review reasons as plain strings; a fresh list per instance.
    review_flags: list[str] = Field(default_factory=list)
    # Error message, when one is available.
    error: str | None = Field(default=None)
--- a/src/ocr_sprint/schemas/extraction.py
+++ b/src/ocr_sprint/schemas/extraction.py
@@ -0,0 +1,55 @@
 """Top-level extraction result schemas."""
 from __future__ import annotations
 from datetime import date
 from enum import Enum
 from pydantic import BaseModel, Field
 from ocr_sprint.schemas.personnel import PersonnelEntry
 class ReviewFlag(str, Enum):
    """Reasons a document was routed to human review.

    Members are ``str`` subclasses, so they compare equal to their plain
    string values.
    """
    # NOTE(review): presumably raised when the OCR engine's own score is low — confirm against the pipeline.
    LOW_OCR_CONFIDENCE = "low_ocr_confidence"
    # A required header field is absent (the tests cover a missing ``nomor_sprint``).
    MISSING_FIELD = "missing_field"
    # A personnel NRP failed validation (the tests accept exactly 8 digits).
    INVALID_NRP = "invalid_nrp"
    # A rank could not be normalised against the master rank list.
    UNKNOWN_PANGKAT = "unknown_pangkat"
    # Number of extracted personnel rows differs from the expected count.
    PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch"
    # The header has no parsed date (the tests cover a header without ``tanggal``).
    DATE_PARSE_FAILED = "date_parse_failed"
 class Signatory(BaseModel):
    """Signing official of the sprint (Penandatangan).

    All four attributes are optional text values defaulting to ``None``.
    """
    # Name of the signatory.
    nama: str | None = Field(default=None)
    # Rank of the signatory.
    pangkat: str | None = Field(default=None)
    # NRP of the signatory.
    nrp: str | None = Field(default=None)
    # Position/title of the signatory.
    jabatan: str | None = Field(default=None)
 class HeaderFields(BaseModel):
    """Values parsed from the heading of a sprint.

    Every scalar field defaults to ``None`` and ``dasar`` to an empty list, so
    a header can be built with whatever subset was found.
    """
    nomor_sprint: str | None = Field(
        default=None,
        description="e.g. Sprin/123/IV/2025/Reskrim.",
    )
    tanggal: date | None = Field(
        default=None,
        description="Date the sprint was issued.",
    )
    satuan_penerbit: str | None = Field(
        default=None,
        description="Issuing unit, e.g. 'Polres Bandung'.",
    )
    # Subject line of the sprint.
    perihal: str | None = Field(default=None)
    dasar: list[str] = Field(
        default_factory=list,
        description="List of legal/operational basis.",
    )
 class ExtractionResult(BaseModel):
    """Complete structured output for one sprint document.

    Every field has a default, so an empty ``ExtractionResult()`` is valid.
    ``confidence`` is constrained to the closed interval [0.0, 1.0].
    """
    # Parsed heading; an empty ``HeaderFields`` when not supplied.
    header: HeaderFields = Field(default_factory=HeaderFields)
    # Rows of the personnel table.
    personel: list[PersonnelEntry] = Field(default_factory=list)
    untuk: list[str] = Field(
        description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.",
        default_factory=list,
    )
    # Signing official; an empty ``Signatory`` when not supplied.
    ttd: Signatory = Field(default_factory=Signatory)
    raw_text: str = Field(description="Concatenated OCR text for debugging.", default="")
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Reasons for human review attached to this result.
    review_flags: list[ReviewFlag] = Field(default_factory=list)
--- a/src/ocr_sprint/schemas/personnel.py
+++ b/src/ocr_sprint/schemas/personnel.py
@@ -0,0 +1,18 @@
 """Schema for a single personnel row in a surat sprint."""
 from __future__ import annotations
 from pydantic import BaseModel, Field
 class PersonnelEntry(BaseModel):
    """A single row of the personnel table.

    All textual columns are optional and default to ``None``; ``confidence``
    defaults to 0.0 and is constrained to the closed interval [0.0, 1.0].
    """
    no: int | None = Field(
        default=None,
        description="Row number as printed on the document.",
    )
    pangkat: str | None = Field(
        default=None,
        description="Rank, normalized when possible.",
    )
    nrp: str | None = Field(
        default=None,
        description="8-digit Polri NRP, or blank if not detected.",
    )
    nama: str | None = Field(default=None, description="Full name.")
    jabatan_dinas: str | None = Field(
        default=None,
        description="Permanent post (jabatan dalam dinas).",
    )
    jabatan_sprint: str | None = Field(
        default=None,
        description="Role within this sprint.",
    )
    # Free-text remarks column.
    keterangan: str | None = Field(default=None)
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
--- a/src/ocr_sprint/utils/__init__.py
+++ b/src/ocr_sprint/utils/__init__.py
--- a/src/ocr_sprint/utils/logging.py
+++ b/src/ocr_sprint/utils/logging.py
@@ -0,0 +1,45 @@
 """Structured logging setup using structlog."""
 from __future__ import annotations
 import logging
 import sys
 from typing import Any
 import structlog
 def configure_logging(level: str = "INFO") -> None:
    """Configure stdlib logging and structlog to write to stdout.

    Records are rendered by structlog's console renderer with colours
    disabled, i.e. plain-text ``key=value`` lines (not JSON), with the log
    level and an ISO-8601 UTC timestamp added.

    Args:
        level: Name of a stdlib logging level (``"DEBUG"``, ``"info"``, ...).
            Matching is case-insensitive and ignores surrounding whitespace.
            Any value that does not resolve to a numeric level falls back to
            ``logging.INFO``.
    """
    # getattr() alone could hit a non-level attribute of the logging module
    # (e.g. "basic_format" -> logging.BASIC_FORMAT, a str), which would then
    # break make_filtering_bound_logger(); accept integer levels only.
    candidate = getattr(logging, level.strip().upper(), None)
    log_level = candidate if isinstance(candidate, int) else logging.INFO
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=log_level,
    )
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso", utc=True),
            structlog.processors.StackInfoRenderer(),
            structlog.processors.format_exc_info,
            # Must stay last: turns the event dict into the final text line.
            structlog.dev.ConsoleRenderer(colors=False),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(log_level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
 def get_logger(name: str | None = None, **initial_values: Any) -> Any:
    """Fetch a structlog logger, optionally pre-bound with context values.

    Args:
        name: Passed straight to ``structlog.get_logger``.
        **initial_values: Key/value pairs bound to the logger up front; when
            none are given the logger is returned unbound.

    Returns:
        The logger, typed as ``Any``: structlog's BoundLogger generic typing
        is too restrictive in practice, so callers treat the result as a
        duck-typed logger.
    """
    base = structlog.get_logger(name)
    if not initial_values:
        return base
    return base.bind(**initial_values)
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -0,0 +1,43 @@
 """Shared pytest fixtures."""
 from __future__ import annotations
 import numpy as np
 import pytest
@pytest.fixture
 def blank_bgr_image() -> np.ndarray:
    """A 600x800 white BGR image (uint8) — useful for preprocessing smoke tests."""
    return np.full((600, 800, 3), 255, dtype=np.uint8)
@pytest.fixture
 def sample_sprint_text() -> str:
    """Realistic-but-synthetic OCR text for regex extractor tests."""
    return (
        "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
        "DAERAH JAWA BARAT\n"
        "RESOR BANDUNG\n"
        "\n"
        "SURAT PERINTAH\n"
        "Nomor : Sprin/123/IV/2025/Reskrim\n"
        "\n"
        "DASAR :\n"
        "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n"
        "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n"
        "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n"
        "\n"
        "DIPERINTAHKAN :\n"
        "Kepada : 1. Nama anggota tersebut di bawah ini.\n"
        "\n"
        "Untuk : Melaksanakan penyelidikan tindak pidana.\n"
        "\n"
        "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n"
        "\n"
        "Bandung, 21 April 2025\n"
        "KEPALA KEPOLISIAN RESOR BANDUNG\n"
        "\n"
        "Drs. BUDI SANTOSO\n"
        "AKBP NRP 12345678\n"
    )
--- a/tests/unit/__init__.py
+++ b/tests/unit/__init__.py
--- a/tests/unit/test_api.py
+++ b/tests/unit/test_api.py
@@ -0,0 +1,87 @@
 """API tests with the OCR engine mocked.
 These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
 so we can exercise the FastAPI surface without the heavy ML init cost.
 """
 from __future__ import annotations
 from datetime import date
 import pytest
 from fastapi.testclient import TestClient
 from ocr_sprint.main import create_app
 from ocr_sprint.pipeline import orchestrator as orch_module
 from ocr_sprint.pipeline.orchestrator import PipelineOutput
 from ocr_sprint.schemas.document import DocumentStatus, SourceKind
 from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
@pytest.fixture
 def client() -> TestClient:
    return TestClient(create_app())
 def test_health_endpoint(client: TestClient) -> None:
    """GET /api/v1/health answers 200 with ``status == "ok"``."""
    resp = client.get("/api/v1/health")
    assert resp.status_code == 200
    payload = resp.json()
    assert payload["status"] == "ok"
 def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """Uploading a zero-byte file must be answered with HTTP 400."""
    empty_upload = {"file": ("empty.pdf", b"", "application/pdf")}
    resp = client.post("/api/v1/documents", files=empty_upload)
    assert resp.status_code == 400
 def test_documents_rejects_unknown_format(client: TestClient) -> None:
    """Uploading bytes that are neither a PDF nor an image must yield HTTP 400.

    The test patches nothing, so it does not request the ``monkeypatch``
    fixture.
    """
    response = client.post(
        "/api/v1/documents",
        files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")},
    )
    assert response.status_code == 400
 def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
    """The endpoint serialises whatever the (stubbed) pipeline returns.

    ``run_pipeline`` is replaced by a stub returning a canned
    ``PipelineOutput``; the response must expose its status, confidence and
    header data.
    """
    canned_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    canned_result = ExtractionResult(header=canned_header, confidence=0.97)
    canned_output = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.97,
        result=canned_result,
    )

    def _stub_pipeline(_content: bytes) -> PipelineOutput:
        return canned_output

    # Patch the function where it is defined ...
    monkeypatch.setattr(orch_module, "run_pipeline", _stub_pipeline)
    from ocr_sprint.api.routes import documents as docs_module

    # ... and the symbol *imported into* the routes module, which is the one
    # the endpoint actually looks up.
    monkeypatch.setattr(docs_module, "run_pipeline", _stub_pipeline)

    upload = {"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}
    resp = client.post("/api/v1/documents", files=upload)
    assert resp.status_code == 200
    payload = resp.json()
    assert payload["status"] == "completed"
    assert payload["confidence"] == 0.97
    assert payload["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"
--- a/tests/unit/test_confidence.py
+++ b/tests/unit/test_confidence.py
@@ -0,0 +1,46 @@
 """Tests for confidence scoring + routing."""
 from __future__ import annotations
 from ocr_sprint.pipeline.confidence import compute_confidence, route
 from ocr_sprint.schemas.document import DocumentStatus
 from ocr_sprint.schemas.extraction import ReviewFlag
 def test_no_flags_returns_blend_of_ocr_only() -> None:
    """With no flags, an OCR score of 0.9 blends to 0.94."""
    # Expected blend: OCR weight 0.6 * 0.9 + validation weight 0.4 * 1.0 = 0.94
    expected = 0.94
    blended = compute_confidence(0.9, [])
    assert abs(blended - expected) < 1e-6
 def test_flags_reduce_score() -> None:
    """Adding a review flag must lower the score for the same OCR input."""
    unflagged = compute_confidence(0.9, [])
    flagged = compute_confidence(0.9, [ReviewFlag.MISSING_FIELD])
    assert flagged < unflagged
 def test_score_is_clamped() -> None:
    """Zero OCR confidence plus every flag still yields a score within [0, 1]."""
    every_flag = [
        ReviewFlag.MISSING_FIELD,
        ReviewFlag.LOW_OCR_CONFIDENCE,
        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
        ReviewFlag.INVALID_NRP,
        ReviewFlag.UNKNOWN_PANGKAT,
        ReviewFlag.DATE_PARSE_FAILED,
    ]
    worst_case = compute_confidence(0.0, every_flag)
    assert 0.0 <= worst_case <= 1.0
 def test_route_high_confidence() -> None:
    """A score of 0.97 is routed to COMPLETED."""
    outcome = route(0.97)
    assert outcome == DocumentStatus.COMPLETED
 def test_route_mid_goes_to_review() -> None:
    """A score of 0.88 is routed to NEEDS_REVIEW."""
    outcome = route(0.88)
    assert outcome == DocumentStatus.NEEDS_REVIEW
 def test_route_low_goes_to_review() -> None:
    """A score of 0.40 is routed to NEEDS_REVIEW as well."""
    outcome = route(0.40)
    assert outcome == DocumentStatus.NEEDS_REVIEW
--- a/tests/unit/test_ingest.py
+++ b/tests/unit/test_ingest.py
@@ -0,0 +1,50 @@
 """Tests for source detection + image ingest."""
 from __future__ import annotations
 import io
 import numpy as np
 from PIL import Image
 from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
 from ocr_sprint.schemas.document import SourceKind
 def _png_bytes() -> bytes:
    """Encode a white 100x80 RGB image as PNG and return the raw bytes."""
    canvas = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        canvas.save(sink, format="PNG")
        return sink.getvalue()
 def _jpeg_bytes() -> bytes:
    """Encode a white 100x80 RGB image as JPEG and return the raw bytes."""
    canvas = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        canvas.save(sink, format="JPEG")
        return sink.getvalue()
 def test_detect_pdf() -> None:
    """Bytes starting with a ``%PDF-`` header are detected as PDF."""
    detected = detect_source_kind(b"%PDF-1.7\n...")
    assert detected == SourceKind.PDF
 def test_detect_png() -> None:
    """A real PNG payload is detected as an image."""
    payload = _png_bytes()
    assert detect_source_kind(payload) == SourceKind.IMAGE
 def test_detect_jpeg() -> None:
    """A real JPEG payload is detected as an image."""
    payload = _jpeg_bytes()
    assert detect_source_kind(payload) == SourceKind.IMAGE
 def test_detect_unknown() -> None:
    """Arbitrary bytes are reported as UNKNOWN."""
    detected = detect_source_kind(b"garbage")
    assert detected == SourceKind.UNKNOWN
 def test_ingest_image_returns_one_page() -> None:
    """Ingesting a PNG yields exactly one page holding a uint8 (80, 100, 3) array."""
    pages = ingest_image(_png_bytes())
    assert len(pages) == 1
    first = pages[0]
    assert first.page_index == 0
    pixels = first.image
    assert isinstance(pixels, np.ndarray)
    assert pixels.dtype == np.uint8
    # Source image is 100 wide x 80 high, so the array is (rows=80, cols=100, 3).
    assert pixels.shape == (80, 100, 3)
--- a/tests/unit/test_preprocess.py
+++ b/tests/unit/test_preprocess.py
@@ -0,0 +1,37 @@
 """Smoke tests for the preprocessing pipeline."""
 from __future__ import annotations
 import numpy as np
 from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
 def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
    """Default preprocessing returns a 3-channel uint8 array."""
    processed = preprocess(blank_bgr_image)
    assert processed.dtype == np.uint8
    assert processed.ndim == 3
    channel_count = processed.shape[2]
    assert channel_count == 3
 def test_preprocess_resizes_to_max_side() -> None:
    """A 4000x3000 image is shrunk so that its longest side equals ``max_side``."""
    oversized = np.full((4000, 3000, 3), 255, dtype=np.uint8)
    resize_only = PreprocessConfig(max_side=1000, denoise=False, deskew=False)
    shrunk = preprocess(oversized, resize_only)
    longest_side = max(shrunk.shape[:2])
    assert longest_side == 1000
 def test_preprocess_does_not_upscale_small_images() -> None:
    """An image already smaller than ``max_side`` keeps its dimensions."""
    tiny = np.full((400, 300, 3), 255, dtype=np.uint8)
    resize_only = PreprocessConfig(max_side=2200, denoise=False, deskew=False)
    untouched = preprocess(tiny, resize_only)
    height_width = untouched.shape[:2]
    assert height_width == (400, 300)
 def test_adaptive_threshold_produces_binary_image() -> None:
    """With adaptive thresholding on, the output contains only 0 and 255.

    The noise image comes from a seeded generator so that every run (and any
    failure) uses the same input.
    """
    rng = np.random.default_rng(0)
    img = rng.integers(0, 256, size=(200, 200, 3), dtype=np.uint8)
    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)
    out = preprocess(img, cfg)
    # adaptive threshold should leave only 0s and 255s
    unique = np.unique(out)
    assert set(unique.tolist()).issubset({0, 255})
--- a/tests/unit/test_regex_rules.py
+++ b/tests/unit/test_regex_rules.py
@@ -0,0 +1,112 @@
 """Tests for regex-based header extraction."""
 from __future__ import annotations
 from datetime import date
 import pytest
 from ocr_sprint.pipeline.extract.regex_rules import (
    extract_header,
    find_dasar_list,
    find_nomor_sprint,
    find_perihal,
    find_satuan,
    find_signatory,
    find_tanggal,
 )
 class TestNomorSprint:
    """``find_nomor_sprint`` across separator and casing variants."""

    @pytest.mark.parametrize(
        ("text", "needle"),
        [
            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
            ("Nomor: SPRIN / 7 / I / 2024", "7"),
            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
        ],
    )
    def test_finds_nomor(self, text: str, needle: str) -> None:
        """The match contains the running number and starts with 'SPRIN'."""
        found = find_nomor_sprint(text)
        assert found is not None
        assert needle in found
        normalized = found.upper()
        assert normalized.startswith("SPRIN")

    def test_returns_none_when_absent(self) -> None:
        """Text without a sprint number yields ``None``."""
        found = find_nomor_sprint("no nomor here, just some text")
        assert found is None
 class TestTanggal:
    """``find_tanggal`` with Indonesian month names."""

    def test_basic_date(self) -> None:
        """'<day> <month> <year>' after a place name is parsed."""
        parsed = find_tanggal("Bandung, 21 April 2025")
        assert parsed == date(2025, 4, 21)

    def test_with_dashes(self) -> None:
        """Dash-separated components (with spaces) are parsed too."""
        parsed = find_tanggal("Tanggal 1 - Desember - 2024")
        assert parsed == date(2024, 12, 1)

    def test_invalid_month(self) -> None:
        """An unknown month name yields ``None``."""
        parsed = find_tanggal("21 Foo 2025")
        assert parsed is None

    def test_no_date_present(self) -> None:
        """Text without any date yields ``None``."""
        parsed = find_tanggal("nothing here")
        assert parsed is None
 class TestSatuan:
    """``find_satuan`` on letterhead-style lines."""

    def test_polres(self) -> None:
        """A resort-level letterhead line is returned, including the resort name."""
        unit = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
        assert unit is not None
        assert "RESOR BANDUNG" in unit.upper()

    def test_polri_pusat(self) -> None:
        """The national-level letterhead line is recognised as well."""
        unit = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert unit is not None
 class TestPerihal:
    """``find_perihal`` on a labelled subject line."""

    def test_extracts_perihal_line(self) -> None:
        """Only the value after 'PERIHAL :' on that line is returned."""
        ocr_text = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
        subject = find_perihal(ocr_text)
        assert subject == "Pelaksanaan penyelidikan kasus."

    def test_returns_none_when_absent(self) -> None:
        """Text without a PERIHAL label yields ``None``."""
        subject = find_perihal("no perihal field")
        assert subject is None
 class TestDasar:
    """``find_dasar_list`` on the numbered DASAR section."""

    def test_numbered_list(self) -> None:
        """Two numbered items are returned, without their leading numbers."""
        section_lines = (
            "DASAR :\n",
            "1. UU No 2 Tahun 2002.\n",
            "2. Peraturan Kapolri Nomor 6.\n",
            "\n",
            "DIPERINTAHKAN :\n",
            "Kepada : ...\n",
        )
        entries = find_dasar_list("".join(section_lines))
        assert len(entries) == 2
        first, second = entries[0], entries[1]
        assert first.startswith("UU No 2")
        assert second.startswith("Peraturan Kapolri")

    def test_empty_when_section_missing(self) -> None:
        """Text without a DASAR section yields an empty list."""
        entries = find_dasar_list("no dasar section")
        assert entries == []
 class TestSignatory:
    """``find_signatory`` NRP detection."""

    def test_extracts_last_nrp(self) -> None:
        """When several 8-digit numbers appear, the last one is the signatory's NRP."""
        ocr_text = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
        signatory = find_signatory(ocr_text)
        assert signatory.nrp == "11223344"

    def test_no_nrp(self) -> None:
        """Without any NRP digits the ``nrp`` attribute stays ``None``."""
        signatory = find_signatory("no NRP here")
        assert signatory.nrp is None
 class TestExtractHeader:
    """End-to-end header extraction on the synthetic sprint fixture."""

    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
        """All header fields are populated from the sample document."""
        parsed = extract_header(sample_sprint_text)

        nomor = parsed.nomor_sprint
        assert nomor is not None
        assert "Sprin" in nomor

        assert parsed.tanggal == date(2025, 4, 21)

        satuan = parsed.satuan_penerbit
        assert satuan is not None
        assert "KEPOLISIAN" in satuan.upper()

        perihal = parsed.perihal
        assert perihal is not None
        assert "penyelidikan" in perihal.lower()

        # The fixture lists three DASAR items.
        assert len(parsed.dasar) == 3
--- a/tests/unit/test_validators.py
+++ b/tests/unit/test_validators.py
@@ -0,0 +1,108 @@
 """Tests for the validation layer."""
 from __future__ import annotations
 from datetime import date
 import pytest
 from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
 from ocr_sprint.pipeline.extract.validators import (
    validate_extraction,
    validate_header,
    validate_nrp,
    validate_personnel_entry,
 )
 from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
 from ocr_sprint.schemas.personnel import PersonnelEntry
 class TestNRP:
    """``validate_nrp`` accepts exactly eight digits and nothing else."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Eight-digit strings, including leading zeros, are valid."""
        verdict = validate_nrp(nrp)
        assert verdict is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Wrong length, non-digits, empty string and ``None`` are all invalid."""
        verdict = validate_nrp(nrp)
        assert verdict is False
 class TestPangkat:
    """Rank normalisation against the master rank list."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Case, trailing dots and common short forms map to the canonical rank."""
        canonical = normalize_pangkat(input_str)
        assert canonical == expected

    def test_unknown_returns_none(self) -> None:
        """A rank outside the master list normalises to ``None`` and is invalid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False
 class TestPersonnelValidator:
    """``validate_personnel_entry`` flagging per personnel row."""

    def test_clean_entry_no_flags(self) -> None:
        """A known rank with an 8-digit NRP produces no flags."""
        row = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(row)
        assert flags == []

    def test_invalid_nrp_flagged(self) -> None:
        """A too-short NRP raises INVALID_NRP."""
        row = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        flags = validate_personnel_entry(row)
        assert ReviewFlag.INVALID_NRP in flags

    def test_unknown_pangkat_flagged(self) -> None:
        """A rank outside the master list raises UNKNOWN_PANGKAT."""
        row = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(row)
        assert ReviewFlag.UNKNOWN_PANGKAT in flags
 class TestHeaderValidator:
    """``validate_header`` flagging of missing header fields."""

    def test_complete_header_no_flags(self) -> None:
        """Number, date and issuing unit all present: no flags."""
        complete = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES BANDUNG",
        )
        flags = validate_header(complete)
        assert flags == []

    def test_missing_nomor_flagged(self) -> None:
        """A header without ``nomor_sprint`` raises MISSING_FIELD."""
        no_number = HeaderFields(tanggal=date(2025, 1, 1))
        flags = validate_header(no_number)
        assert ReviewFlag.MISSING_FIELD in flags

    def test_missing_date_flagged(self) -> None:
        """A header without ``tanggal`` raises DATE_PARSE_FAILED."""
        no_date = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        flags = validate_header(no_date)
        assert ReviewFlag.DATE_PARSE_FAILED in flags
 class TestFullValidation:
    """``validate_extraction`` over a whole ``ExtractionResult``."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted row against an expected count of two is flagged."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        single_row = [PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A")]
        extraction = ExtractionResult(header=header, personel=single_row)
        flags = validate_extraction(extraction, expected_personnel_count=2)
        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in flags

    def test_flags_are_deduped(self) -> None:
        """Repeated problems across rows yield each flag type at most once."""
        bad_rows = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        extraction = ExtractionResult(
            header=HeaderFields(),  # missing both nomor and tanggal
            personel=bad_rows,
        )
        flags = validate_extraction(extraction)
        # No duplicates: the list is as long as its set.
        distinct = set(flags)
        assert len(flags) == len(distinct)
		`@@ -0,0 +1,3 @@`
							`"""OCR Sprint Service — extract structured data from Indonesian police 'surat sprint'."""`

							`__version__ = "0.1.0"`
		`@@ -0,0 +1 @@`
							`"""OCR pipeline: ingest → preprocess → OCR → extract → validate."""`
		`@@ -0,0 +1 @@`
							`"""Information extraction layer (regex Phase 1, LLM Phase 5)."""`