From ca0c0a0428e2cfb2396ab08929f1fb7130a9696f Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Sat, 25 Apr 2026 14:58:50 +0000
Subject: [PATCH] Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
---
 .env.example                                  |  43 +++
 .gitignore                                    |  70 +++++
 .pre-commit-config.yaml                       |  19 ++
 Dockerfile                                    |  51 ++++
 Makefile                                      |  52 ++++
 README.md                                     | 123 +++++++++
 docker-compose.yml                            |  23 ++
 docs/architecture.md                          | 259 ++++++++++++++++++
 pyproject.toml                                | 136 +++++++++
 samples/README.md                             |  13 +
 src/ocr_sprint/__init__.py                    |   3 +
 src/ocr_sprint/api/__init__.py                |   0
 src/ocr_sprint/api/errors.py                  |  43 +++
 src/ocr_sprint/api/routes/__init__.py         |   0
 src/ocr_sprint/api/routes/documents.py        |  58 ++++
 src/ocr_sprint/api/routes/health.py           |  15 +
 src/ocr_sprint/config.py                      |  72 +++++
 src/ocr_sprint/data/__init__.py               |   0
 src/ocr_sprint/data/master_pangkat.py         |  66 +++++
 src/ocr_sprint/main.py                        |  42 +++
 src/ocr_sprint/pipeline/__init__.py           |   1 +
 src/ocr_sprint/pipeline/confidence.py         |  51 ++++
 src/ocr_sprint/pipeline/extract/__init__.py   |   1 +
 .../pipeline/extract/regex_rules.py           | 169 ++++++++++++
 src/ocr_sprint/pipeline/extract/validators.py |  64 +++++
 src/ocr_sprint/pipeline/ingest.py             |  81 ++++++
 src/ocr_sprint/pipeline/ocr.py                | 106 +++++++
 src/ocr_sprint/pipeline/orchestrator.py       | 103 +++++++
 src/ocr_sprint/pipeline/preprocess.py         | 108 ++++++++
 src/ocr_sprint/py.typed                       |   0
 src/ocr_sprint/schemas/__init__.py            |  27 ++
 src/ocr_sprint/schemas/document.py            |  57 ++++
 src/ocr_sprint/schemas/extraction.py          |  55 ++++
 src/ocr_sprint/schemas/personnel.py           |  18 ++
 src/ocr_sprint/utils/__init__.py              |   0
 src/ocr_sprint/utils/logging.py               |  45 +++
 tests/__init__.py                             |   0
 tests/conftest.py                             |  43 +++
 tests/unit/__init__.py                        |   0
 tests/unit/test_api.py                        |  87 ++++++
 tests/unit/test_confidence.py                 |  46 ++++
 tests/unit/test_ingest.py                     |  50 ++++
 tests/unit/test_preprocess.py                 |  37 +++
 tests/unit/test_regex_rules.py                | 112 ++++++++
 tests/unit/test_validators.py                 | 108 ++++++++
 45 files changed, 2457 insertions(+)
 create mode 100644 .env.example
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 docker-compose.yml
 create mode 100644 docs/architecture.md
 create mode 100644 pyproject.toml
 create mode 100644 samples/README.md
 create mode 100644 src/ocr_sprint/__init__.py
 create mode 100644 src/ocr_sprint/api/__init__.py
 create mode 100644 src/ocr_sprint/api/errors.py
 create mode 100644 src/ocr_sprint/api/routes/__init__.py
 create mode 100644 src/ocr_sprint/api/routes/documents.py
 create mode 100644 src/ocr_sprint/api/routes/health.py
 create mode 100644 src/ocr_sprint/config.py
 create mode 100644 src/ocr_sprint/data/__init__.py
 create mode 100644 src/ocr_sprint/data/master_pangkat.py
 create mode 100644 src/ocr_sprint/main.py
 create mode 100644 src/ocr_sprint/pipeline/__init__.py
 create mode 100644 src/ocr_sprint/pipeline/confidence.py
 create mode 100644 src/ocr_sprint/pipeline/extract/__init__.py
 create mode 100644 src/ocr_sprint/pipeline/extract/regex_rules.py
 create mode 100644 src/ocr_sprint/pipeline/extract/validators.py
 create mode 100644 src/ocr_sprint/pipeline/ingest.py
 create mode 100644 src/ocr_sprint/pipeline/ocr.py
 create mode 100644 src/ocr_sprint/pipeline/orchestrator.py
 create mode 100644 src/ocr_sprint/pipeline/preprocess.py
 create mode 100644 src/ocr_sprint/py.typed
 create mode 100644 src/ocr_sprint/schemas/__init__.py
 create mode 100644 src/ocr_sprint/schemas/document.py
 create mode 100644 src/ocr_sprint/schemas/extraction.py
 create mode 100644 src/ocr_sprint/schemas/personnel.py
 create mode 100644 src/ocr_sprint/utils/__init__.py
 create mode 100644 src/ocr_sprint/utils/logging.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/unit/__init__.py
 create mode 100644 tests/unit/test_api.py
 create mode 100644 tests/unit/test_confidence.py
 create mode 100644 tests/unit/test_ingest.py
 create mode 100644 tests/unit/test_preprocess.py
 create mode 100644 tests/unit/test_regex_rules.py
 create mode 100644 tests/unit/test_validators.py

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..edd8ef2
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,43 @@
+# ==== App ====
+APP_ENV=local                 # local | dev | staging | prod
+APP_HOST=0.0.0.0
+APP_PORT=8000
+APP_LOG_LEVEL=INFO
+
+# ==== Storage (Phase 1: local filesystem) ====
+STORAGE_LOCAL_DIR=./storage
+
+# ==== OCR ==== (leave the OCR_*_MODEL_DIR values empty to use PaddleOCR defaults)
+OCR_LANG=latin                # PaddleOCR lang code; "latin" works well for Bahasa Indonesia
+OCR_USE_GPU=false             # set true if running on a GPU host
+OCR_DET_MODEL_DIR=
+OCR_REC_MODEL_DIR=
+OCR_CLS_MODEL_DIR=
+OCR_MAX_IMAGE_SIDE=2200       # downscale longest side before OCR
+
+# ==== Preprocessing ====
+PREPROCESS_TARGET_DPI=300
+PREPROCESS_DENOISE=true
+PREPROCESS_DESKEW=true
+PREPROCESS_ADAPTIVE_THRESHOLD=false  # turn on for low-quality phone photos
+
+# ==== Confidence / routing thresholds ====
+CONFIDENCE_AUTO_APPROVE=0.95
+CONFIDENCE_NEEDS_REVIEW=0.85
+
+# ==== LLM (Phase 5, optional) ====
+LLM_ENABLED=false
+LLM_PROVIDER=ollama
+LLM_MODEL=qwen2.5:1.5b        # CPU-friendly default
+LLM_BASE_URL=http://localhost:11434
+LLM_TIMEOUT_S=60
+
+# ==== Async pipeline (Phase 4, optional) ====
+QUEUE_ENABLED=false
+REDIS_URL=redis://localhost:6379/0
+DATABASE_URL=postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint
+MINIO_ENDPOINT=localhost:9000
+MINIO_ACCESS_KEY=minioadmin
+MINIO_SECRET_KEY=minioadmin
+MINIO_BUCKET=ocr-sprint
+MINIO_SECURE=false
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9897bab
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,70 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+dist/
+*.egg-info/
+*.egg
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+.tox/
+.nox/
+
+# Virtual environments
+.venv/
+venv/
+env/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+.DS_Store
+
+# Environment / secrets
+.env
+.env.*
+!.env.example
+
+# Local data & artifacts
+samples/*.pdf
+samples/*.PDF
+samples/*.jpg
+samples/*.JPG
+samples/*.jpeg
+samples/*.png
+samples/*.PNG
+samples/*.tif
+samples/*.tiff
+!samples/README.md
+data/local/
+storage/
+*.db
+*.sqlite
+*.sqlite3
+
+# OCR / model caches
+.paddleocr/
+~/.paddleocr/
+models/downloaded/
+
+# Logs
+logs/
+*.log
+
+# Docker
+.docker/
+
+# Misc
+*.bak
+*.tmp
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..1a8beea
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,19 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-toml
+      - id: check-added-large-files
+        args: ["--maxkb=1024"]
+      - id: check-merge-conflict
+      - id: detect-private-key
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
+    hooks:
+      - id: ruff
+        args: ["--fix"]
+      - id: ruff-format
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..110cf97
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,51 @@
+# syntax=docker/dockerfile:1.6
+# CPU-only image for the OCR Sprint API.
+# PaddleOCR + PyMuPDF + OpenCV-headless work on plain Debian without poppler.
+FROM python:3.11-slim AS base
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    DEBIAN_FRONTEND=noninteractive
+
+# System deps for OpenCV, libmagic, PaddlePaddle, and image format support.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libgl1 \
+        libglib2.0-0 \
+        libsm6 \
+        libxext6 \
+        libxrender1 \
+        libgomp1 \
+        libmagic1 \
+        ca-certificates \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# ----- builder stage (installs the package + deps; runtime copies site-packages from it) -----
+FROM base AS builder
+COPY pyproject.toml README.md ./
+COPY src/ ./src/
+RUN pip install --upgrade pip && pip install ".[dev]"
+
+# ----- runtime layer -----
+FROM base AS runtime
+COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+COPY pyproject.toml README.md ./
+COPY src/ ./src/
+
+# Pre-create cache dirs so PaddleOCR can write models on first run.
+RUN mkdir -p /home/app/.paddleocr /app/storage \
+    && useradd --create-home --uid 1000 app \
+    && chown -R app:app /home/app /app
+
+USER app
+EXPOSE 8000
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
+    CMD curl -fsS http://localhost:8000/api/v1/health || exit 1
+
+CMD ["uvicorn", "ocr_sprint.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..65af363
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,52 @@
+.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
+
+help:
+	@echo "Targets:"
+	@echo "  install       - install runtime + dev deps in current env"
+	@echo "  dev           - run FastAPI app with autoreload"
+	@echo "  fmt           - format code with ruff"
+	@echo "  lint          - lint with ruff"
+	@echo "  typecheck     - run mypy"
+	@echo "  test          - run pytest"
+	@echo "  test-cov      - run pytest with coverage"
+	@echo "  docker-build  - build api image"
+	@echo "  docker-up     - start docker-compose stack"
+	@echo "  docker-down   - stop docker-compose stack"
+
+install:
+	python -m pip install --upgrade pip
+	pip install -e ".[dev]"
+	pre-commit install || true
+
+dev:
+	uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
+
+fmt:
+	ruff format src tests
+	ruff check --fix src tests
+
+lint:
+	ruff check src tests
+	ruff format --check src tests
+
+typecheck:
+	mypy src
+
+test:
+	pytest
+
+test-cov:
+	pytest --cov --cov-report=term-missing
+
+docker-build:
+	docker compose build
+
+docker-up:
+	docker compose up -d
+
+docker-down:
+	docker compose down
+
+clean:
+	rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov build dist *.egg-info
+	find . -type d -name __pycache__ -exec rm -rf {} +
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6e5558f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,123 @@
+# OCR Sprint Service
+
+OCR + structured extraction service for Indonesian police "surat sprint" (surat perintah) documents. Built around **FastAPI + PaddleOCR + hybrid extraction (regex → local LLM → validation)** with **on-premise** deployment as a hard requirement.
+
+> **Status:** Phase 1 MVP — synchronous PDF/image OCR with regex header extraction, validation, and confidence scoring. Phases 2–6 (document detection, table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md).
+
+## Why this stack
+
+- **PaddleOCR** is the strongest open-source OCR for mixed-language documents and runs fully on-prem (essential for police data).
+- **PP-Structure** (Phase 3) handles personnel tables natively.
+- **Regex-first, LLM-fallback extraction** keeps deterministic fields fast and predictable while letting an LLM handle format drift across Polri units.
+- **CPU-friendly defaults**: a small (1.5B–4B) local LLM via Ollama is the recommended default; the architecture is also GPU-ready.
+
+See [`docs/architecture.md`](docs/architecture.md) for the full architecture, accuracy expectations, and roadmap.
+
+## Quickstart
+
+### Prerequisites
+
+- Python **3.10–3.12**
+- ~3 GB free disk for the Python dependencies plus the PaddleOCR model downloads (~200 MB) on first run
+- Linux/macOS recommended (Windows works but PaddleOCR install can be finicky)
+
+### Install (local dev)
+
+```bash
+git clone https://github.com/Adriankf59/ocr-sprint-service.git
+cd ocr-sprint-service
+
+python -m venv .venv && source .venv/bin/activate
+make install         # installs runtime + dev deps + pre-commit
+cp .env.example .env # edit if you need GPU / different storage path
+```
+
+### Run the API
+
+```bash
+make dev
+# → http://localhost:8000/docs
+```
+
+### Try it out
+
+```bash
+curl -F "file=@samples/pdf/example.pdf" http://localhost:8000/api/v1/documents | jq
+```
+
+Expected response (truncated):
+
+```json
+{
+  "job_id": "8f2a...",
+  "status": "completed",
+  "confidence": 0.93,
+  "data": {
+    "header": {
+      "nomor_sprint": "Sprin/123/IV/2025/Reskrim",
+      "tanggal": "2025-04-21",
+      "satuan_penerbit": "KEPOLISIAN RESOR BANDUNG",
+      "perihal": "Pelaksanaan penyelidikan kasus pencurian",
+      "dasar": ["Undang-Undang Nomor 2 Tahun 2002 ...", "..."]
+    },
+    "personel": [],
+    "ttd": { "nrp": "12345678" }
+  },
+  "review_flags": []
+}
+```
+
+> **Note:** Phase 1 does not yet populate the `personel[]` table — that requires PP-Structure (Phase 3). Header fields, signatory NRP, confidence, and HITL routing are fully wired.
+
+### Docker
+
+```bash
+docker compose build
+docker compose up -d
+docker compose logs -f api
+```
+
+The first request will trigger PaddleOCR to download its detection/recognition/cls models (~200 MB) into the `paddle-models` volume.
+
+## Development
+
+```bash
+make fmt        # format with ruff
+make lint       # lint
+make typecheck  # mypy strict mode
+make test       # pytest
+make test-cov   # pytest + coverage
+```
+
+Pre-commit hooks run ruff on every commit. Install once with `pre-commit install` (already done by `make install`).
+
+## Project layout
+
+```
+src/ocr_sprint/
+  api/          # FastAPI routes + error handlers
+  schemas/      # Pydantic v2 models (request/response, extraction, personnel)
+  pipeline/     # ingest → preprocess → ocr → extract → validate → score
+    extract/    # regex_rules.py (Phase 1) → llm.py (Phase 5)
+  data/         # master data (Polri ranks, etc.)
+  utils/        # logging, helpers
+  config.py     # pydantic-settings
+  main.py       # app factory
+tests/unit/     # ~60 unit tests, no PaddleOCR dependency
+docs/           # architecture & decision records
+```
+
+## Roadmap
+
+| Phase | Scope | Status |
+|---|---|---|
+| 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **In progress** |
+| 2 | DocTR document detection + dewarping for phone photos | Planned |
+| 3 | PP-Structure table extraction for personnel rows | Planned |
+| 4 | Async pipeline (Celery + Redis), Postgres + MinIO, auth, observability | Planned |
+| 5 | LLM hybrid extraction (Ollama + structured output) | Planned |
+| 6 | HITL review endpoints + audit trail | Planned |
+
+## License
+
+Proprietary — internal use only.
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..cd520ff
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,23 @@
+# Phase 1 MVP compose: API only.
+# Phase 4 will add redis, postgres, minio, and worker services.
+services:
+  api:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: ocr-sprint-service:dev
+    container_name: ocr-sprint-api
+    ports:
+      - "8000:8000"
+    environment:
+      APP_ENV: local
+      APP_LOG_LEVEL: INFO
+      OCR_USE_GPU: "false"
+      STORAGE_LOCAL_DIR: /app/storage
+    volumes:
+      - ./storage:/app/storage
+      - paddle-models:/home/app/.paddleocr
+    restart: unless-stopped
+
+volumes:
+  paddle-models:
diff --git a/docs/architecture.md b/docs/architecture.md
new file mode 100644
index 0000000..2cb7977
--- /dev/null
+++ b/docs/architecture.md
@@ -0,0 +1,259 @@
+# Plan & Arsitektur — OCR Service Surat Sprint Kepolisian
+
+## 1. Penilaian Jujur Tech Stack yang Diusulkan
+
+Tech stack Anda (FastAPI + PaddleOCR + OpenCV/Pillow + Regex) **sudah bagus dan layak produksi**, tapi **belum tentu paling optimal akurasinya** untuk kasus surat sprint. Ada beberapa gap yang perlu diisi sebelum bisa disebut "terbaik".
+
+### Yang sudah tepat
+| Komponen | Alasan |
+|---|---|
+| **FastAPI** | Async native, Pydantic validation, OpenAPI docs otomatis, ideal untuk ML serving. |
+| **PaddleOCR (PP-OCRv4/v5)** | Salah satu OCR open-source terbaik untuk dokumen campuran teks + tabel, mendukung Latin (cocok untuk Bahasa Indonesia), bisa jalan on-premise (penting untuk dokumen kepolisian yang sensitif — **cloud OCR seperti Google Vision/AWS Textract sebaiknya dihindari** karena masalah kerahasiaan). |
+| **OpenCV + Pillow** | Standar industri untuk preprocessing. |
+| **Regex/rule-based** | Cocok untuk dokumen terstruktur seperti sprint yang format-nya relatif baku. |
+
+### Yang masih kurang / perlu ditambah
+
+1. **Table extraction belum tertangani**
+   Daftar personel di surat sprint hampir selalu berbentuk **tabel** (No, Pangkat, NRP, Nama, Jabatan, Keterangan). Regex pada teks linear dari OCR biasa **akan kacau** ketika baris tabel pecah atau kolom bergeser. Solusi: gunakan **PaddleOCR PP-Structure** (modul table recognition bawaan Paddle) atau model khusus seperti **TableTransformer (Microsoft)**.
+
+2. **Document detection & dewarping untuk foto HP belum eksplisit**
+   Foto HP bermasalah karena: perspektif miring, lipatan, bayangan, lighting tidak rata, fokus tidak merata. OpenCV crop + perspective transform manual saja sering gagal. Tambahkan:
+   - **Document corner detection**: `DocTR` / `MobileSAM` / model edge-based, atau heuristik kontur OpenCV sebagai fallback.
+   - **Dewarping**: `DocTr` / `DewarpNet` untuk halaman yang melengkung (lipatan).
+   - **Shadow removal**: algoritma background division atau model spesialis.
+
+3. **Strategi ekstraksi 100% regex itu rapuh**
+   Surat sprint dari satuan berbeda (Polda, Polres, Polsek, Mabes) punya **variasi format**: header berbeda, urutan field berbeda, kadang pangkat disingkat (`AKP`, `IPDA`) kadang ditulis penuh. Regex murni akan butuh ratusan rule dan tetap miss kasus baru.
+   **Rekomendasi pendekatan hybrid**:
+   - **Layer 1 — Regex/rule** untuk field deterministik (Nomor sprint, tanggal, dasar hukum) yang format-nya baku.
+   - **Layer 2 — Schema-aware extraction** menggunakan **LLM lokal** (Llama 3.1 8B / Qwen2.5 7B via Ollama atau vLLM) dengan structured output (JSON schema / Pydantic) untuk field yang variatif (jabatan, keterangan tugas).
+   - **Layer 3 — Validation** terhadap master data (daftar pangkat valid, format NRP 8 digit, dll).
+
+4. **Tidak ada confidence scoring & human-in-the-loop**
+   Untuk dokumen kepolisian, **akurasi 100% otomatis itu mitos**. Sistem harus:
+   - Mengeluarkan confidence score per field.
+   - Otomatis flag dokumen low-confidence untuk review manusia.
+   - Sediakan UI/endpoint koreksi yang feedback-nya bisa dipakai retraining.
+
+5. **Alternatif end-to-end yang patut dipertimbangkan**
+   Jika nanti volume dokumen besar dan format relatif stabil, fine-tuning model **Document Understanding** end-to-end bisa lebih akurat:
+   - **Donut** (OCR-free, langsung image → JSON).
+   - **LayoutLMv3** (kombinasi teks + layout + visual).
+   - **Surya OCR** (newer, sangat bagus untuk dokumen).
+   Untuk MVP, tetap pakai PaddleOCR. Donut/LayoutLM adalah opsi V2 setelah ada labeled dataset cukup (~500–1000 dokumen).
+
+### Verdict
+Stack Anda **bisa mencapai ~85–92% akurasi field-level** untuk surat sprint dengan kualitas scan baik, dan **~70–80%** untuk foto HP, **kalau ditambah** komponen di atas. Tanpa table extraction + dewarping + hybrid extraction, akurasinya akan jatuh di kondisi nyata.
+
+---
+
+## 2. Arsitektur yang Direkomendasikan
+
+### 2.1 Diagram Logis
+
+```
+┌────────────────────────────────────────────────────────────────────┐
+│                         Client (Web/Mobile)                        │
+└──────────────────────────────┬─────────────────────────────────────┘
+                               │ HTTPS (multipart upload)
+                               ▼
+┌────────────────────────────────────────────────────────────────────┐
+│                    FastAPI Gateway (stateless)                     │
+│   - Auth (JWT/API key)   - Rate limit   - Request validation       │
+└──────────────────────────────┬─────────────────────────────────────┘
+                               │ enqueue job
+                               ▼
+┌────────────────────────────────────────────────────────────────────┐
+│              Job Queue (Redis + Celery / RQ / Dramatiq)            │
+└──────────────────────────────┬─────────────────────────────────────┘
+                               ▼
+┌────────────────────────────────────────────────────────────────────┐
+│                    OCR Worker Pipeline (GPU/CPU)                   │
+│  ┌────────────┐  ┌──────────────┐  ┌───────────┐  ┌────────────┐   │
+│  │ 1. Ingest  │→ │ 2. Preproc   │→ │ 3. OCR +  │→ │ 4. Extract │   │
+│  │  & detect  │  │ (deskew,     │  │  Layout   │  │ (regex +   │   │
+│  │  PDF/IMG   │  │  dewarp,     │  │  PP-Struct│  │  LLM +     │   │
+│  │            │  │  denoise)    │  │  + Table) │  │  validate) │   │
+│  └────────────┘  └──────────────┘  └───────────┘  └─────┬──────┘   │
+│                                                         │          │
+│                          ┌──────────────────────────────┘          │
+│                          ▼                                         │
+│                   ┌──────────────┐                                 │
+│                   │ 5. Confidence│ → low conf? flag for review     │
+│                   │   scoring    │                                 │
+│                   └──────┬───────┘                                 │
+└──────────────────────────┼─────────────────────────────────────────┘
+                           ▼
+┌────────────────────────────────────────────────────────────────────┐
+│           Storage: PostgreSQL (metadata) + MinIO/S3 (file)         │
+│           + Vector store opsional (untuk dedup / search)           │
+└────────────────────────────────────────────────────────────────────┘
+                           │
+                           ▼
+┌────────────────────────────────────────────────────────────────────┐
+│           Review UI (optional) — koreksi manual + audit trail      │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+### 2.2 Pipeline Detail per Tahap
+
+**Tahap 1 — Ingest & Document Detection**
+- PDF: render setiap halaman jadi image @ 300 DPI (`pdf2image` / `PyMuPDF`).
+- Image (foto HP): deteksi sudut dokumen → crop → perspective transform.
+  - Library: OpenCV `findContours` (cepat) sebagai fallback, **DocTR document detector** (lebih akurat) sebagai utama.
+
+**Tahap 2 — Preprocessing**
+- Deskew (rotation correction) — Hough transform atau model.
+- Dewarp (untuk foto buku/lipatan) — `DewarpNet` atau model RNN.
+- Adaptive thresholding (untuk foto dengan lighting tidak rata).
+- Shadow removal (background division).
+- Denoise (Non-Local Means).
+- Resize ke ukuran optimal OCR (~1500–2500 px sisi panjang).
+
+**Tahap 3 — OCR + Layout Analysis**
+- **PaddleOCR PP-Structure** dijalankan sekali → menghasilkan:
+  - Bounding boxes + teks + confidence per word/line.
+  - Table region detection + table-to-HTML/JSON.
+  - Layout type per region (title, paragraph, table, figure).
+- Output ditampung sebagai struktur intermediate (mirip hOCR / ALTO XML).
+
+**Tahap 4 — Information Extraction**
+- **4a. Header parsing (regex)**: Nomor sprint, tanggal, satuan penerbit, dasar hukum, perihal. Format relatif baku → regex sangat cocok.
+- **4b. Personnel table extraction**: ambil dari hasil PP-Structure table → mapping kolom (Pangkat, NRP, Nama, Jabatan, Keterangan).
+- **4c. LLM fallback**: untuk field yang regex/table miss, kirim chunk teks + JSON schema ke LLM lokal (Ollama / vLLM) dengan **structured output** (Pydantic via `outlines` / `instructor`).
+- **4d. Validation layer**:
+  - NRP: 8 digit numerik.
+  - Pangkat: harus ada di daftar master pangkat Polri.
+  - Tanggal: parse + sanity check.
+  - Cross-check: jumlah personel di body = jumlah baris tabel.
+
+**Tahap 5 — Confidence Scoring & Routing**
+- Aggregate confidence: weighted average dari OCR confidence + validation pass/fail + LLM logprob (kalau pakai).
+- Threshold (mis. < 0.85) → status `NEEDS_REVIEW`.
+- Threshold tinggi (≥ 0.95) + semua validasi pass → status `AUTO_APPROVED`.
+
+### 2.3 API Endpoint (FastAPI)
+
+```
+POST   /api/v1/documents              # upload, kembalikan job_id
+GET    /api/v1/documents/{job_id}     # poll status + hasil
+GET    /api/v1/documents/{job_id}/raw # raw OCR output (debug)
+PATCH  /api/v1/documents/{job_id}     # koreksi manual (HITL)
+GET    /api/v1/health                 # liveness
+GET    /api/v1/metrics                # Prometheus
+```
+
+Response shape (contoh):
+```json
+{
+  "job_id": "uuid",
+  "status": "completed | processing | needs_review | failed",
+  "confidence": 0.92,
+  "data": {
+    "nomor_sprint": "Sprin/123/IV/2025",
+    "tanggal": "2025-04-21",
+    "satuan_penerbit": "Polres Bandung",
+    "dasar": ["...", "..."],
+    "perihal": "...",
+    "personel": [
+      {"no": 1, "pangkat": "AKP", "nrp": "12345678", "nama": "...", "jabatan": "Kasat Reskrim", "confidence": 0.97},
+      ...
+    ],
+    "ttd": {"pejabat": "...", "pangkat": "...", "nrp": "..."}
+  },
+  "review_flags": []
+}
+```
+
+### 2.4 Tech Stack Final yang Direkomendasikan
+
+| Layer | Pilihan | Catatan |
+|---|---|---|
+| API | **FastAPI** + Uvicorn/Gunicorn | sesuai usulan |
+| Validation | **Pydantic v2** | wajib |
+| Queue | **Redis + Celery** atau **Dramatiq** | OCR berat, jangan blocking request |
+| OCR | **PaddleOCR PP-OCRv4 + PP-Structure** | tambah PP-Structure untuk tabel |
+| Preprocessing | **OpenCV + Pillow** + **DocTR** (detection) | DocTR untuk foto HP |
+| Extraction | **Regex + Ollama (Llama 3.1 8B / Qwen2.5 7B)** + **instructor/outlines** | hybrid |
+| Storage | **PostgreSQL** (metadata) + **MinIO** (file blob) | self-hosted, sesuai compliance |
+| Observability | **Prometheus + Grafana + Loki** | wajib produksi |
+| Container | **Docker + docker-compose** (dev) → **Kubernetes** (prod) | |
+| GPU | NVIDIA T4/A10 (1× cukup untuk MVP) | PaddleOCR jauh lebih cepat di GPU |
+
+---
+
+## 3. Roadmap Pengembangan (Bertahap)
+
+### Fase 0 — Persiapan (1 minggu)
+- Kumpulkan **dataset sampel**: minimal 50 surat sprint (campur PDF scan + foto HP) dari beragam satuan.
+- Buat **ground truth labelling** untuk 20 dokumen (untuk evaluasi).
+- Definisikan **schema output final** (JSON) bersama stakeholder.
+
+### Fase 1 — MVP Pipeline Sinkron (2 minggu)
+- Setup FastAPI skeleton + Pydantic schemas.
+- Integrasi PaddleOCR PP-OCRv4 (CPU dulu, GPU menyusul).
+- Preprocessing dasar: deskew + denoise + resize.
+- Regex extraction untuk field header.
+- Endpoint sinkron `POST /documents` (untuk dev/testing saja).
+- **Evaluasi akurasi** terhadap 20 ground truth.
+
+### Fase 2 — Robustness untuk Foto HP (2 minggu)
+- Integrasi document detection (DocTR atau OpenCV contour).
+- Perspective transform + dewarping.
+- Shadow removal.
+- Re-evaluasi akurasi pada subset foto HP.
+
+### Fase 3 — Table Extraction (1.5 minggu)
+- Integrasi PP-Structure untuk personnel table.
+- Mapping kolom + validation (NRP, pangkat).
+- Master data tabel pangkat Polri.
+
+### Fase 4 — Async + Production Ready (1.5 minggu)
+- Pindahkan ke arsitektur async dengan Celery + Redis.
+- Storage MinIO + PostgreSQL.
+- Auth, rate limit, logging, metrics.
+- Docker compose untuk deployment.
+
+### Fase 5 — LLM Hybrid Extraction (2 minggu)
+- Setup Ollama / vLLM dengan model lokal.
+- Structured output via `instructor`.
+- Confidence scoring + routing ke review.
+
+### Fase 6 — HITL Review UI (opsional, 2 minggu)
+- Endpoint koreksi.
+- Simple web UI (Next.js) untuk reviewer.
+- Audit trail & feedback loop.
+
+### Fase 7 — Optimasi Lanjutan (ongoing)
+- Fine-tune PaddleOCR detection/recognition pada dataset internal.
+- Eksplorasi Donut/LayoutLMv3 jika dataset sudah cukup.
+- Batch processing & GPU optimization.
+
+**Total estimasi MVP fungsional (Fase 1–4): ~7 minggu** dengan 1 backend engineer + 1 ML engineer.
+
+---
+
+## 4. Risiko & Mitigasi
+
+| Risiko | Mitigasi |
+|---|---|
+| Data sensitif (kepolisian) bocor | Wajib on-prem; tidak ada cloud OCR; enkripsi at-rest (LUKS/pgcrypto) + in-transit (mTLS); audit log lengkap. |
+| Variasi format antar satuan | Hybrid extraction (regex + LLM); kumpulkan sample dari banyak satuan sejak awal. |
+| Foto HP kualitas buruk | Validasi kualitas image di client (resolusi minimal, blur detection) sebelum upload. |
+| Akurasi tidak sampai target | HITL review wajib untuk dokumen low-confidence; jangan deploy fully-automatic. |
+| Tanggung jawab hukum atas hasil OCR | Selalu simpan original document + flag bahwa hasil ekstraksi adalah "draft, perlu verifikasi manusia". |
+
+---
+
+## 5. Pertanyaan Sebelum Implementasi
+
+Sebelum saya lanjut ke implementasi, mohon konfirmasi:
+
+1. **Volume**: berapa dokumen/hari yang ditargetkan? (mempengaruhi pilihan async vs sync, GPU vs CPU)
+2. **Deployment target**: on-prem mutlak, atau private cloud (GovCloud) boleh?
+3. **Source dokumen**: apakah ada akses ke 20–50 sample surat sprint untuk dijadikan dataset awal?
+4. **Integrasi**: service ini akan dipanggil sistem apa? (mempengaruhi auth & API contract)
+5. **HITL**: apakah ada SDM untuk review manual dokumen low-confidence?
+6. **Hardware**: sudah ada server GPU, atau perlu sizing rekomendasi?
+7. **Format output final**: ada schema yang sudah dipakai sistem downstream?
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4ae79a8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,136 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ocr-sprint-service"
+version = "0.1.0"
+description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+license = { text = "Proprietary" }
+authors = [{ name = "Adrian Kuman Firmansah" }]
+
+dependencies = [
+    # Web framework
+    "fastapi>=0.115,<0.116",
+    "uvicorn[standard]>=0.30,<0.34",
+    "python-multipart>=0.0.9",
+    "pydantic>=2.7,<3",
+    "pydantic-settings>=2.4,<3",
+    # Image / PDF
+    "pillow>=10.4,<12",
+    "opencv-python-headless>=4.10,<5",
+    "numpy>=1.26,<2",  # stay on NumPy 1.x: the pinned paddlepaddle 2.6.1 predates NumPy 2.0
+    "PyMuPDF>=1.24,<2",
+    "python-magic>=0.4.27",
+    # OCR (CPU build of paddle; GPU users override via extra index)
+    "paddlepaddle==2.6.1",
+    "paddleocr>=2.7.5,<3",
+    # Logging / observability
+    "structlog>=24.1",
+    "prometheus-client>=0.20",
+    # Misc
+    "httpx>=0.27",
+    "tenacity>=8.5",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.2",
+    "pytest-asyncio>=0.23",
+    "pytest-cov>=5.0",
+    "ruff>=0.6.9",
+    "mypy>=1.11",
+    "types-Pillow",
+    "pre-commit>=3.7",
+]
+
+# Extraction layer (Phase 5) — kept optional so MVP install stays light
+llm = [
+    "ollama>=0.3",
+    "instructor>=1.4",
+]
+
+# Async pipeline (Phase 4)
+async-pipeline = [
+    "celery[redis]>=5.4",
+    "redis>=5.0",
+    "minio>=7.2",
+    "sqlalchemy>=2.0",
+    "psycopg[binary]>=3.2",
+    "alembic>=1.13",
+]
+
+[project.scripts]
+ocr-sprint-api = "ocr_sprint.main:run"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+"ocr_sprint" = ["py.typed"]
+
+# ---------- Tooling ----------
+
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+src = ["src", "tests"]
+
+[tool.ruff.lint]
+select = [
+    "E", "F", "W",       # pycodestyle / pyflakes
+    "I",                 # isort
+    "B",                 # bugbear
+    "UP",                # pyupgrade
+    "SIM",               # simplify
+    "RUF",               # ruff-specific
+    "C4",                # comprehensions
+    "PIE",
+    "PT",                # pytest style
+    "TID",               # tidy imports
+]
+ignore = [
+    "E501",  # line length handled by formatter
+    "B008",  # FastAPI Depends() pattern
+]
+
+[tool.ruff.format]
+quote-style = "double"
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+warn_unused_ignores = true
+warn_redundant_casts = true
+disallow_untyped_defs = true
+plugins = ["pydantic.mypy"]
+mypy_path = "src"
+namespace_packages = true
+explicit_package_bases = true
+
+[[tool.mypy.overrides]]
+module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"]
+ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+minversion = "8.0"
+addopts = "-ra --strict-markers --strict-config"
+testpaths = ["tests"]
+asyncio_mode = "auto"
+filterwarnings = [
+    "ignore::DeprecationWarning:paddle.*",
+    "ignore::DeprecationWarning:paddleocr.*",
+]
+
+[tool.coverage.run]
+source = ["src/ocr_sprint"]
+branch = true
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "raise NotImplementedError",
+    "if TYPE_CHECKING:",
+]
diff --git a/samples/README.md b/samples/README.md
new file mode 100644
index 0000000..281c7e7
--- /dev/null
+++ b/samples/README.md
@@ -0,0 +1,13 @@
+# Samples
+
+Drop sample surat sprint files here for local testing. **Do NOT commit real documents** — `.gitignore` excludes binary file extensions in this folder.
+
+Recommended layout:
+```
+samples/
+  pdf/          # PDF scans
+  photo/        # phone photos
+  ground_truth/ # JSON ground-truth labels for evaluation
+```
+
+For sharing real samples with the team, use the project's secured storage (MinIO/S3 once Phase 4 is live), not git.
diff --git a/src/ocr_sprint/__init__.py b/src/ocr_sprint/__init__.py
new file mode 100644
index 0000000..711ef75
--- /dev/null
+++ b/src/ocr_sprint/__init__.py
@@ -0,0 +1,3 @@
+"""OCR Sprint Service — extract structured data from Indonesian police 'surat sprint'."""
+
+__version__ = "0.1.0"
diff --git a/src/ocr_sprint/api/__init__.py b/src/ocr_sprint/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/ocr_sprint/api/errors.py b/src/ocr_sprint/api/errors.py
new file mode 100644
index 0000000..81dd321
--- /dev/null
+++ b/src/ocr_sprint/api/errors.py
@@ -0,0 +1,43 @@
+"""HTTP error handlers."""
+
+from __future__ import annotations
+
+from fastapi import FastAPI, Request, status
+from fastapi.responses import JSONResponse
+
+from ocr_sprint.utils.logging import get_logger
+
+_logger = get_logger(__name__)
+
+
+class OCRServiceError(Exception):
+    """Base class for application errors that should map to a 4xx response."""
+
+    http_status: int = status.HTTP_400_BAD_REQUEST  # default; subclasses may override
+
+
+class UnsupportedDocumentError(OCRServiceError):
+    """Upload rejected as empty, oversized, or not a PDF / recognized image format."""
+
+
+class JobNotFoundError(OCRServiceError):
+    http_status = status.HTTP_404_NOT_FOUND  # reported as 404 instead of the base 400
+
+
+def register_error_handlers(app: FastAPI) -> None:
+    """Register JSON handlers for OCRServiceError and for any other uncaught exception."""
+
+    @app.exception_handler(OCRServiceError)
+    async def _ocr_error_handler(_: Request, exc: OCRServiceError) -> JSONResponse:
+        return JSONResponse(
+            status_code=exc.http_status,
+            content={"error": exc.__class__.__name__, "message": str(exc)},
+        )
+
+    @app.exception_handler(Exception)
+    async def _unexpected_handler(_: Request, exc: Exception) -> JSONResponse:
+        _logger.exception("api.unhandled_exception", error=str(exc))  # details stay in the log
+        return JSONResponse(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            content={"error": "InternalServerError", "message": "Unexpected error"},
+        )
diff --git a/src/ocr_sprint/api/routes/__init__.py b/src/ocr_sprint/api/routes/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/ocr_sprint/api/routes/documents.py b/src/ocr_sprint/api/routes/documents.py
new file mode 100644
index 0000000..26dd6eb
--- /dev/null
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -0,0 +1,58 @@
+"""Documents API — Phase 1 synchronous endpoint.
+
+POST /documents accepts a single PDF or image upload, runs the synchronous
+pipeline inline, and returns the structured result. This is suitable for
+development and low-traffic production; Phase 4 will introduce an async
+queue and a polling-style API at the same path.
+"""
+
+from __future__ import annotations
+
+from uuid import uuid4
+
+from fastapi import APIRouter, File, UploadFile, status
+
+from ocr_sprint.api.errors import UnsupportedDocumentError
+from ocr_sprint.pipeline.orchestrator import run_pipeline
+from ocr_sprint.schemas.document import DocumentResponse
+from ocr_sprint.utils.logging import get_logger
+
+router = APIRouter(prefix="/documents", tags=["documents"])
+_logger = get_logger(__name__)
+
+_MAX_UPLOAD_BYTES = 25 * 1024 * 1024  # 25 MB
+
+
+@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
+async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
+    """OCR one upload inline; empty/oversized/unknown files raise UnsupportedDocumentError."""
+    job_id = uuid4()
+    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")
+    # Cap the read: one byte past the limit is enough to detect an oversized upload.
+    content = await file.read(_MAX_UPLOAD_BYTES + 1)
+    if not content:
+        raise UnsupportedDocumentError("Uploaded file is empty.")
+    if len(content) > _MAX_UPLOAD_BYTES:
+        raise UnsupportedDocumentError(
+            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
+        )
+
+    log.info("documents.received", size=len(content))
+    try:
+        output = run_pipeline(content)
+    except ValueError as exc:
+        raise UnsupportedDocumentError(str(exc)) from exc
+
+    log.info(
+        "documents.completed",
+        status=output.status.value,
+        confidence=round(output.confidence, 3),
+        flags=[f.value for f in output.result.review_flags],
+    )
+    return DocumentResponse(
+        job_id=job_id,
+        status=output.status,
+        confidence=output.confidence,
+        data=output.result,
+        review_flags=[f.value for f in output.result.review_flags],
+    )
diff --git a/src/ocr_sprint/api/routes/health.py b/src/ocr_sprint/api/routes/health.py
new file mode 100644
index 0000000..7a01b81
--- /dev/null
+++ b/src/ocr_sprint/api/routes/health.py
@@ -0,0 +1,15 @@
+"""Liveness / readiness endpoints."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter
+
+from ocr_sprint import __version__
+
+router = APIRouter(tags=["health"])
+
+
+@router.get("/health")
+async def health() -> dict[str, str]:
+    """Liveness probe: returns a fixed status plus the package version, no OCR engine use."""
+    return {"status": "ok", "version": __version__}
diff --git a/src/ocr_sprint/config.py b/src/ocr_sprint/config.py
new file mode 100644
index 0000000..18a955c
--- /dev/null
+++ b/src/ocr_sprint/config.py
@@ -0,0 +1,72 @@
+"""Application settings loaded from environment / .env file."""
+
+from __future__ import annotations
+
+from functools import lru_cache
+from pathlib import Path
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    """Runtime configuration. Override via environment variables or a .env file."""
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore",  # unknown keys in the environment / .env are ignored
+    )
+
+    # App
+    app_env: str = "local"
+    app_host: str = "0.0.0.0"  # listens on all interfaces by default
+    app_port: int = 8000
+    app_log_level: str = "INFO"
+
+    # Storage (Phase 1: local fs)
+    storage_local_dir: Path = Path("./storage")
+
+    # OCR (model dirs left as None are not passed on to PaddleOCR)
+    ocr_lang: str = "latin"
+    ocr_use_gpu: bool = False
+    ocr_det_model_dir: str | None = None
+    ocr_rec_model_dir: str | None = None
+    ocr_cls_model_dir: str | None = None
+    ocr_max_image_side: int = 2200
+
+    # Preprocessing
+    preprocess_target_dpi: int = 300
+    preprocess_denoise: bool = True
+    preprocess_deskew: bool = True
+    preprocess_adaptive_threshold: bool = False
+
+    # Confidence thresholds, each limited to [0, 1]; their relative order is not validated.
+    confidence_auto_approve: float = Field(0.95, ge=0.0, le=1.0)
+    confidence_needs_review: float = Field(0.85, ge=0.0, le=1.0)
+
+    # LLM (Phase 5)
+    llm_enabled: bool = False
+    llm_provider: str = "ollama"
+    llm_model: str = "qwen2.5:1.5b"
+    llm_base_url: str = "http://localhost:11434"
+    llm_timeout_s: int = 60
+
+    # Async pipeline (Phase 4). NOTE(review): URL/key defaults embed credentials; override via env.
+    queue_enabled: bool = False
+    redis_url: str = "redis://localhost:6379/0"
+    database_url: str = "postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint"
+    minio_endpoint: str = "localhost:9000"
+    minio_access_key: str = "minioadmin"
+    minio_secret_key: str = "minioadmin"
+    minio_bucket: str = "ocr-sprint"
+    minio_secure: bool = False
+
+
+@lru_cache(maxsize=1)
+def get_settings() -> Settings:
+    """Return the cached Settings; the first call also creates ``storage_local_dir``."""
+    settings = Settings()
+    settings.storage_local_dir.mkdir(parents=True, exist_ok=True)
+    return settings
diff --git a/src/ocr_sprint/data/__init__.py b/src/ocr_sprint/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/ocr_sprint/data/master_pangkat.py b/src/ocr_sprint/data/master_pangkat.py
new file mode 100644
index 0000000..667b47c
--- /dev/null
+++ b/src/ocr_sprint/data/master_pangkat.py
@@ -0,0 +1,66 @@
+"""Master data for Polri ranks ('pangkat').
+
+Used by the validation layer to:
+1. Confirm that a recognized rank string is a real Polri rank.
+2. Normalize accepted variants ("KBP" → "KOMBES POL", "Brig Pol" → "BRIGADIR") to a canonical form.
+
+Source: Peraturan Kapolri tentang Pangkat (publicly available, 2024).
+Update this file when ranks are reorganized.
+"""
+
+from __future__ import annotations
+
+# Canonical abbreviation → tuple of accepted variants (matched case-insensitively).
+PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
+    # Tamtama
+    "BHARADA": ("BHARADA", "BHRD"),
+    "BHARATU": ("BHARATU", "BHRT"),
+    "BHARAKA": ("BHARAKA", "BHRK"),
+    "ABRIPDA": ("ABRIPDA",),  # Ajun Brigadir Polisi Dua
+    "ABRIPTU": ("ABRIPTU",),  # Ajun Brigadir Polisi Satu
+    "ABRIP": ("ABRIP",),  # Ajun Brigadir Polisi
+    # Bintara
+    "BRIPDA": ("BRIPDA",),
+    "BRIPTU": ("BRIPTU",),
+    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"),
+    "BRIPKA": ("BRIPKA",),
+    "AIPDA": ("AIPDA",),
+    "AIPTU": ("AIPTU",),
+    # Perwira Pertama
+    "IPDA": ("IPDA",),
+    "IPTU": ("IPTU",),
+    "AKP": ("AKP",),
+    # Perwira Menengah
+    "KOMPOL": ("KOMPOL",),
+    "AKBP": ("AKBP",),
+    "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
+    # Perwira Tinggi
+    "BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
+    "IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
+    "KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
+    "JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
+}
+
+# Reverse lookup: any variant (uppercased) → canonical form.
+_VARIANT_TO_CANONICAL: dict[str, str] = {
+    variant.upper(): canonical
+    for canonical, variants in PANGKAT_VARIANTS.items()
+    for variant in variants
+}
+
+
+def normalize_pangkat(raw: str | None) -> str | None:
+    """Map a raw rank string onto its canonical Polri form (None if empty/unknown)."""
+    if not raw:
+        return None
+    collapsed = " ".join(raw.upper().split())
+    exact = _VARIANT_TO_CANONICAL.get(collapsed)
+    if exact is not None:
+        return exact
+    # Retry without trailing punctuation so inputs like "AKP." still resolve.
+    return _VARIANT_TO_CANONICAL.get(collapsed.rstrip(".,;:"))
+
+
+def is_valid_pangkat(raw: str | None) -> bool:
+    """Return True when ``normalize_pangkat`` resolves *raw* to a known Polri rank."""
+    return normalize_pangkat(raw) is not None
diff --git a/src/ocr_sprint/main.py b/src/ocr_sprint/main.py
new file mode 100644
index 0000000..4b5e9b1
--- /dev/null
+++ b/src/ocr_sprint/main.py
@@ -0,0 +1,42 @@
+"""FastAPI entrypoint."""
+
+from __future__ import annotations
+
+from fastapi import FastAPI
+
+from ocr_sprint import __version__
+from ocr_sprint.api.errors import register_error_handlers
+from ocr_sprint.api.routes import documents, health
+from ocr_sprint.config import get_settings
+from ocr_sprint.utils.logging import configure_logging
+
+
+def create_app() -> FastAPI:
+    """Application factory: configure logging, then build the app with handlers and routers."""
+    settings = get_settings()
+    configure_logging(settings.app_log_level)
+
+    app = FastAPI(
+        title="OCR Sprint Service",
+        version=__version__,
+        description="OCR + structured extraction for Indonesian police 'surat sprint' documents.",
+        docs_url="/docs",
+        redoc_url="/redoc",
+        openapi_url="/openapi.json",
+    )
+
+    register_error_handlers(app)
+    app.include_router(health.router, prefix="/api/v1")  # GET /api/v1/health
+    app.include_router(documents.router, prefix="/api/v1")  # POST /api/v1/documents
+    return app
+
+
+app = create_app()
+
+
+def run() -> None:
+    """Console-script entrypoint (`ocr-sprint-api`): serve the app on the configured host/port."""
+    import uvicorn  # imported here rather than at module top
+
+    s = get_settings()
+    uvicorn.run("ocr_sprint.main:app", host=s.app_host, port=s.app_port, reload=False)
diff --git a/src/ocr_sprint/pipeline/__init__.py b/src/ocr_sprint/pipeline/__init__.py
new file mode 100644
index 0000000..e389d04
--- /dev/null
+++ b/src/ocr_sprint/pipeline/__init__.py
@@ -0,0 +1 @@
+"""OCR pipeline: ingest → preprocess → OCR → extract → validate."""
diff --git a/src/ocr_sprint/pipeline/confidence.py b/src/ocr_sprint/pipeline/confidence.py
new file mode 100644
index 0000000..d046a36
--- /dev/null
+++ b/src/ocr_sprint/pipeline/confidence.py
@@ -0,0 +1,51 @@
+"""Confidence scoring + routing decision.
+
+The score is a weighted blend of:
+  - mean OCR confidence across all detected lines
+  - validation pass rate (1.0 if no review flags, decreases per flag)
+
+This is intentionally simple for Phase 1; Phase 5 will add LLM logprob
+contributions and per-field confidences.
+"""
+
+from __future__ import annotations
+
+from ocr_sprint.config import get_settings
+from ocr_sprint.schemas.document import DocumentStatus
+from ocr_sprint.schemas.extraction import ReviewFlag
+
+# Per-flag penalty applied to the validation component of the score.
+_FLAG_PENALTY: dict[ReviewFlag, float] = {
+    ReviewFlag.LOW_OCR_CONFIDENCE: 0.10,
+    ReviewFlag.MISSING_FIELD: 0.20,
+    ReviewFlag.INVALID_NRP: 0.10,
+    ReviewFlag.UNKNOWN_PANGKAT: 0.05,
+    ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
+    ReviewFlag.DATE_PARSE_FAILED: 0.10,
+}
+
+OCR_WEIGHT = 0.6
+VALIDATION_WEIGHT = 0.4
+
+
+def compute_confidence(ocr_confidence: float, flags: list[ReviewFlag]) -> float:
+    """Blend OCR confidence with validation penalties into a single 0-1 score.
+
+    Each flag subtracts its ``_FLAG_PENALTY`` (0.05 if unlisted) from a
+    validation score that starts at 1.0 and is floored at 0.0.
+    """
+    remaining = 1.0
+    for flag in flags:
+        remaining -= _FLAG_PENALTY.get(flag, 0.05)
+    weighted = OCR_WEIGHT * ocr_confidence + VALIDATION_WEIGHT * max(0.0, remaining)
+    return max(0.0, min(1.0, weighted))
+
+
+def route(confidence: float) -> DocumentStatus:
+    """Map a final confidence score onto the job's terminal status.
+
+    Scores at or above ``confidence_auto_approve`` complete automatically; every
+    lower score goes to human review, on either side of ``confidence_needs_review``.
+    """
+    approved = confidence >= get_settings().confidence_auto_approve
+    return DocumentStatus.COMPLETED if approved else DocumentStatus.NEEDS_REVIEW
diff --git a/src/ocr_sprint/pipeline/extract/__init__.py b/src/ocr_sprint/pipeline/extract/__init__.py
new file mode 100644
index 0000000..b19f4f7
--- /dev/null
+++ b/src/ocr_sprint/pipeline/extract/__init__.py
@@ -0,0 +1 @@
+"""Information extraction layer (regex Phase 1, LLM Phase 5)."""
diff --git a/src/ocr_sprint/pipeline/extract/regex_rules.py b/src/ocr_sprint/pipeline/extract/regex_rules.py
new file mode 100644
index 0000000..88e594f
--- /dev/null
+++ b/src/ocr_sprint/pipeline/extract/regex_rules.py
@@ -0,0 +1,169 @@
+"""Regex-based extraction for the deterministic header fields of a surat sprint.
+
+Targets header fields whose layout is highly standardized across Polri units:
+
+  - Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
+  - Tanggal (date the sprint was issued)
+  - Satuan penerbit (issuing unit)
+  - Perihal
+  - Dasar (numbered list of legal/operational basis)
+
+Personnel table extraction is intentionally NOT done here — that needs
+PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
+"""
+
+from __future__ import annotations
+
+import re
+from datetime import date
+
+from ocr_sprint.schemas.extraction import HeaderFields, Signatory
+
+# ---------- regex patterns ----------
+
+# Nomor sprint, tolerant of spacing and OCR noise, e.g. "SPRIN/345/X/2024",
+# "Sprin / 123 / IV / 2025 / Reskrim", "Nomor : Sprin/12/I/2025/Sat Intelkam".
+# The optional unit suffix is greedy but same-line only (at most 3 words): a lazy
+# quantifier at the very end of a pattern would stop after a single character.
+_RE_NOMOR_SPRINT = re.compile(
+    r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}"
+    r"(?:[ \t]*[/-][ \t]*[A-Z][\w.-]*(?:[ \t]+(?!TANGGAL\b|TGL\b)[A-Z][\w.-]*){0,2})?",
+    re.IGNORECASE,
+)
+
+# Indonesian month names.
+_BULAN_MAP: dict[str, int] = {
+    "JANUARI": 1,
+    "FEBRUARI": 2,
+    "MARET": 3,
+    "APRIL": 4,
+    "MEI": 5,
+    "JUNI": 6,
+    "JULI": 7,
+    "AGUSTUS": 8,
+    "SEPTEMBER": 9,
+    "OKTOBER": 10,
+    "NOVEMBER": 11,
+    "DESEMBER": 12,
+}
+
+# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025"
+_RE_TANGGAL_ID = re.compile(
+    r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP.keys()) + r")\s*[-./\s]\s*(\d{4})\b",
+    re.IGNORECASE,
+)
+
+# Satuan penerbit usually appears in the document letterhead, prefixed by
+# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
+_RE_SATUAN = re.compile(
+    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
+    r"[^\n]{0,80}",
+    re.IGNORECASE,
+)
+
+# "Perihal : ...." up to end of line.
+_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
+
+# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
+_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
+
+# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
+_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
+
+
+def find_nomor_sprint(text: str) -> str | None:
+    """Return the first nomor sprint in *text* with whitespace runs collapsed, else None."""
+    found = _RE_NOMOR_SPRINT.search(text)
+    if found is None:
+        return None
+    return " ".join(found.group(0).split())
+
+
+def find_tanggal(text: str) -> date | None:
+    """Return the issuance date, or None when it cannot be determined.
+
+    A surat sprint usually carries several dates: citations of earlier
+    documents in the 'Dasar' section, then the issuance date next to the
+    signatory ('Tempat, DD Month YYYY'). The standard layout puts the issuance
+    date last, so only the **last** match is considered; if that match is not
+    a real calendar date the result is None.
+    """
+    found = _RE_TANGGAL_ID.findall(text)  # one (day, month name, year) tuple per hit
+    if not found:
+        return None
+    day_s, bulan, year_s = found[-1]
+    try:
+        return date(int(year_s), _BULAN_MAP[bulan.upper()], int(day_s))
+    except (KeyError, ValueError):
+        # Unmapped month spelling or an impossible calendar day.
+        return None
+
+
+def find_satuan(text: str) -> str | None:
+    """Return the first letterhead match (issuing unit), whitespace-normalized, else None."""
+    hit = _RE_SATUAN.search(text)
+    if hit is None:
+        return None
+    return " ".join(hit.group(0).split())
+
+
+def find_perihal(text: str) -> str | None:
+    """Return the text after the first 'Perihal' label, or None if there is none.
+
+    Lines are searched one at a time, so the value never spans a line break.
+    """
+    hits = (_RE_PERIHAL.search(line) for line in text.splitlines())
+    return next((hit.group(1).strip() for hit in hits if hit), None)
+
+
+def find_dasar_list(text: str) -> list[str]:
+    """Extract numbered 'Dasar' items from the text.
+
+    Heuristic: after the first line that starts with 'DASAR', collect lines of
+    the form 'N. text' / 'N) text'; other lines extend the previous item. Stops at
+    a section keyword, or at a blank line once at least one item was collected.
+    """
+    lines = text.splitlines()
+    items: list[str] = []
+    in_dasar = False
+    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
+    for raw_line in lines:
+        line = raw_line.strip()
+        if not in_dasar:
+            if re.match(r"^\s*DASAR\b", line, re.IGNORECASE):
+                in_dasar = True
+            continue  # skip lines up to and including the 'DASAR' line itself
+        if not line:
+            if items:
+                break
+            continue
+        upper = line.upper()
+        if any(upper.startswith(term) for term in section_terminators):
+            break
+        m = _RE_DASAR_ITEM.match(line)
+        if m:
+            items.append(m.group(2).strip())
+        elif items:
+            # continuation of the previous dasar item
+            items[-1] = (items[-1] + " " + line).strip()
+    return items
+
+
+def find_signatory(text: str) -> Signatory:
+    """Best-effort signatory extraction: the last NRP/NIP number found in *text*.
+
+    Returns an empty ``Signatory`` when no such number is present.
+    """
+    hits = _RE_NRP.findall(text)  # (label, digits) pairs
+    return Signatory(nrp=hits[-1][1]) if hits else Signatory()
+
+
+def extract_header(text: str) -> HeaderFields:
+    """Run every header-level extractor on *text* and bundle the results as HeaderFields."""
+    return HeaderFields(
+        nomor_sprint=find_nomor_sprint(text),
+        tanggal=find_tanggal(text),
+        satuan_penerbit=find_satuan(text),
+        perihal=find_perihal(text),
+        dasar=find_dasar_list(text),
+    )
diff --git a/src/ocr_sprint/pipeline/extract/validators.py b/src/ocr_sprint/pipeline/extract/validators.py
new file mode 100644
index 0000000..14d15ef
--- /dev/null
+++ b/src/ocr_sprint/pipeline/extract/validators.py
@@ -0,0 +1,64 @@
+"""Cross-field validation, with structured review-flag output."""
+
+from __future__ import annotations
+
+import re
+
+from ocr_sprint.data.master_pangkat import is_valid_pangkat
+from ocr_sprint.schemas.extraction import (
+    ExtractionResult,
+    HeaderFields,
+    ReviewFlag,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+# Polri NRP = 8 digits.
+_RE_NRP_8 = re.compile(r"^\d{8}$")
+
+
+def validate_nrp(nrp: str | None) -> bool:
+    """Return True when the value is a well-formed Polri NRP (8 digits)."""
+    # Surrounding whitespace is ignored; None never validates.
+    candidate = "" if nrp is None else nrp.strip()
+    return _RE_NRP_8.match(candidate) is not None
+
+
+def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
+    """Return the review flags raised by one personnel row (empty when it is clean)."""
+    # Absent/empty fields are not flagged here; only present-but-bad values are.
+    findings = (
+        (entry.nrp and not validate_nrp(entry.nrp), ReviewFlag.INVALID_NRP),
+        (entry.pangkat and not is_valid_pangkat(entry.pangkat), ReviewFlag.UNKNOWN_PANGKAT),
+    )
+    return [flag for failed, flag in findings if failed]
+
+
+def validate_header(header: HeaderFields) -> list[ReviewFlag]:
+    """Flag a missing nomor sprint (MISSING_FIELD) or tanggal (DATE_PARSE_FAILED)."""
+    # Result order is fixed: the nomor sprint flag first, then the tanggal flag.
+    required = (
+        (header.nomor_sprint, ReviewFlag.MISSING_FIELD),
+        (header.tanggal, ReviewFlag.DATE_PARSE_FAILED),
+    )
+    return [flag for value, flag in required if value is None]
+
+
+def validate_extraction(
+    result: ExtractionResult,
+    expected_personnel_count: int | None = None,
+) -> list[ReviewFlag]:
+    """Run all validators across the full extraction and dedupe the flags.
+
+    Header flags come first, then per-row personnel flags, then
+    PERSONNEL_COUNT_MISMATCH when ``expected_personnel_count`` is given and
+    differs from the number of extracted rows. Each flag appears once, at the
+    position of its first occurrence.
+    """
+    collected: list[ReviewFlag] = [*validate_header(result.header)]
+    for row in result.personel:
+        collected += validate_personnel_entry(row)
+    count_known = expected_personnel_count is not None
+    if count_known and expected_personnel_count != len(result.personel):
+        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)
+    # dict keys are unique and keep insertion order, so this is a stable dedupe.
+    return list(dict.fromkeys(collected))
diff --git a/src/ocr_sprint/pipeline/ingest.py b/src/ocr_sprint/pipeline/ingest.py
new file mode 100644
index 0000000..0fd2db3
--- /dev/null
+++ b/src/ocr_sprint/pipeline/ingest.py
@@ -0,0 +1,81 @@
+"""Ingest layer: convert uploaded bytes (PDF/IMG) into a list of numpy images."""
+
+from __future__ import annotations
+
+import io
+from dataclasses import dataclass
+from typing import Any
+
+import fitz  # PyMuPDF
+import numpy as np
+from PIL import Image
+
+from ocr_sprint.schemas.document import SourceKind
+
+# Generic alias used across the pipeline. We don't constrain dtype/shape because
+# OpenCV operations accept multiple dtypes and numpy generics are still rough.
+NDArrayU8 = np.ndarray[Any, Any]
+
+PDF_MAGIC = b"%PDF-"
+PNG_MAGIC = b"\x89PNG\r\n\x1a\n"
+JPEG_MAGIC = b"\xff\xd8\xff"
+TIFF_MAGIC_LE = b"II*\x00"
+TIFF_MAGIC_BE = b"MM\x00*"
+
+
+@dataclass(frozen=True)
+class IngestedPage:
+    """One page worth of image data ready for preprocessing."""
+
+    image: NDArrayU8  # HxWx3 BGR uint8 (OpenCV convention)
+    page_index: int  # 0-based position of the page within the source document
+
+
+def detect_source_kind(content: bytes) -> SourceKind:
+    """Classify an upload as PDF, IMAGE, or UNKNOWN from its leading magic bytes."""
+    image_signatures = (PNG_MAGIC, JPEG_MAGIC, TIFF_MAGIC_LE, TIFF_MAGIC_BE)
+    if content.startswith(PDF_MAGIC):
+        return SourceKind.PDF
+    is_image = content.startswith(image_signatures)
+    return SourceKind.IMAGE if is_image else SourceKind.UNKNOWN
+
+
+def _pil_to_bgr(img: Image.Image) -> NDArrayU8:
+    """Convert a PIL image (any mode) to an OpenCV-style BGR uint8 array."""
+    rgb = img if img.mode == "RGB" else img.convert("RGB")
+    pixels = np.asarray(rgb, dtype=np.uint8)
+    # Reverse the channel axis (RGB -> BGR); copy() turns the reversed view
+    # into a fresh array.
+    return pixels[:, :, ::-1].copy()
+
+
+def ingest_pdf(content: bytes, target_dpi: int = 300) -> list[IngestedPage]:
+    """Rasterize each PDF page into a BGR image at ``target_dpi``.
+
+    Rendering goes through PyMuPDF (no poppler dependency). Its native
+    resolution is 72 DPI, hence the ``target_dpi / 72`` scale on both axes.
+    """
+    scale = target_dpi / 72.0
+    transform = fitz.Matrix(scale, scale)
+    rendered: list[IngestedPage] = []
+    with fitz.open(stream=content, filetype="pdf") as document:
+        for number, page in enumerate(document):
+            pixmap = page.get_pixmap(matrix=transform, alpha=False)
+            rgb = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
+            rendered.append(IngestedPage(image=_pil_to_bgr(rgb), page_index=number))
+    return rendered
+
+
+def ingest_image(content: bytes) -> list[IngestedPage]:
+    """Decode a single image into a one-element page list."""
+    img = Image.open(io.BytesIO(content))  # NOTE(review): EXIF orientation is not applied
+    return [IngestedPage(image=_pil_to_bgr(img), page_index=0)]
+
+
+def ingest(content: bytes, kind: SourceKind, target_dpi: int = 300) -> list[IngestedPage]:
+    """Load *content* with the loader for *kind*; raise ValueError for any other kind."""
+    if kind == SourceKind.IMAGE:
+        return ingest_image(content)
+    if kind != SourceKind.PDF:
+        raise ValueError(f"Unsupported source kind: {kind}")
+    return ingest_pdf(content, target_dpi=target_dpi)
diff --git a/src/ocr_sprint/pipeline/ocr.py b/src/ocr_sprint/pipeline/ocr.py
new file mode 100644
index 0000000..f5874de
--- /dev/null
+++ b/src/ocr_sprint/pipeline/ocr.py
@@ -0,0 +1,106 @@
+"""PaddleOCR wrapper.
+
+PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
+so we keep a process-global instance behind a lazy accessor.
+
+The wrapper exposes a small, stable surface so the rest of the pipeline does
+not depend directly on paddleocr's evolving API.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from threading import Lock
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from ocr_sprint.config import get_settings
+from ocr_sprint.pipeline.ingest import NDArrayU8
+from ocr_sprint.utils.logging import get_logger
+
+if TYPE_CHECKING:
+    from paddleocr import PaddleOCR
+
+_logger = get_logger(__name__)
+_lock = Lock()
+_instance: PaddleOCR | None = None
+
+
+@dataclass(frozen=True)
+class OCRLine:
+    """One recognized line with its bounding polygon and confidence."""
+
+    text: str
+    confidence: float  # recognition score as reported by the OCR engine
+    box: tuple[tuple[float, float], ...]  # 4 (x, y) corner points
+
+
+@dataclass(frozen=True)
+class OCRPage:
+    """OCR output for a single page."""
+
+    lines: list[OCRLine]
+
+    @property
+    def text(self) -> str:
+        """Page text: one recognized line per row, in the engine's output order."""
+        return "\n".join(entry.text for entry in self.lines)
+
+    @property
+    def mean_confidence(self) -> float:
+        """Arithmetic mean of the line confidences; 0.0 for a page without lines."""
+        scores = [entry.confidence for entry in self.lines]
+        return float(np.mean(scores)) if scores else 0.0
+
+
+def _build_paddleocr() -> PaddleOCR:
+    """Construct a PaddleOCR engine from the current settings."""
+    from paddleocr import PaddleOCR
+
+    cfg = get_settings()
+    options: dict[str, object] = {
+        "lang": cfg.ocr_lang,
+        "use_angle_cls": True,
+        "use_gpu": cfg.ocr_use_gpu,
+        "show_log": False,
+    }
+    # Only explicitly configured model directories are passed through.
+    for key in ("det_model_dir", "rec_model_dir", "cls_model_dir"):
+        path = getattr(cfg, f"ocr_{key}")
+        if path:
+            options[key] = path
+    _logger.info("paddleocr.init", lang=cfg.ocr_lang, use_gpu=cfg.ocr_use_gpu)
+    return PaddleOCR(**options)
+
+
+def get_ocr() -> PaddleOCR:
+    """Lazy, thread-safe singleton accessor for the PaddleOCR engine."""
+    global _instance
+    if _instance is None:  # unlocked fast path once the engine exists
+        with _lock:
+            if _instance is None:  # re-check under the lock before building
+                _instance = _build_paddleocr()
+    return _instance
+
+
+def run_ocr(image: NDArrayU8) -> OCRPage:
+    """Run OCR on a single BGR image and return a structured page result.
+
+    Entries whose box or (text, confidence) pair is malformed are skipped.
+    """
+    raw = get_ocr().ocr(image, cls=True)
+    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
+    if not raw or raw[0] is None:
+        return OCRPage(lines=[])
+    lines: list[OCRLine] = []
+    for item in raw[0]:
+        if not item or len(item) < 2:
+            continue
+        try:
+            text, conf = item[1][0], float(item[1][1])
+            box = tuple((float(p[0]), float(p[1])) for p in item[0])
+        except (TypeError, ValueError, IndexError):
+            continue
+        lines.append(OCRLine(text=text, confidence=conf, box=box))
+    return OCRPage(lines=lines)
diff --git a/src/ocr_sprint/pipeline/orchestrator.py b/src/ocr_sprint/pipeline/orchestrator.py
new file mode 100644
index 0000000..547993b
--- /dev/null
+++ b/src/ocr_sprint/pipeline/orchestrator.py
@@ -0,0 +1,103 @@
+"""Synchronous pipeline orchestrator (Phase 1).
+
+Wires the individual stages together:
+
+    bytes → ingest → preprocess → OCR → regex extract → validate → score
+
+Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug
+in PP-Structure for tables and an LLM extractor for variant fields.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from ocr_sprint.config import get_settings
+from ocr_sprint.pipeline.confidence import compute_confidence, route
+from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
+from ocr_sprint.pipeline.extract.validators import validate_extraction
+from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
+from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
+from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
+from ocr_sprint.schemas.document import DocumentStatus, SourceKind
+from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag
+from ocr_sprint.utils.logging import get_logger
+
+_logger = get_logger(__name__)
+
+# Below this OCR confidence we automatically flag for review.
+_OCR_CONFIDENCE_FLAG_THRESHOLD = 0.80
+
+
+@dataclass
+class PipelineOutput:
+    """Bundle returned by the orchestrator (comments describe ``run_pipeline``'s usage)."""
+
+    source_kind: SourceKind  # kind detected from the upload bytes
+    status: DocumentStatus  # routing decision derived from the final confidence
+    confidence: float  # final score; the same value is stored on ``result.confidence``
+    result: ExtractionResult
+
+
+def run_pipeline(content: bytes) -> PipelineOutput:
+    """Run the synchronous pipeline on raw upload bytes and return the scored result.
+
+    Stages, in order: detect kind, ingest, per-page preprocess + OCR, regex
+    extraction, validation, then confidence scoring and routing.
+
+    Raises ``ValueError`` when the source kind is detected as ``SourceKind.UNKNOWN``.
+    """
+    settings = get_settings()
+
+    kind = detect_source_kind(content)
+    if kind == SourceKind.UNKNOWN:
+        raise ValueError("Unsupported file type — only PDF and common image formats are accepted.")
+
+    pages = ingest(content, kind, target_dpi=settings.preprocess_target_dpi)
+    _logger.info("pipeline.ingested", source_kind=kind.value, pages=len(pages))
+
+    pre_cfg = PreprocessConfig(
+        max_side=settings.ocr_max_image_side,
+        denoise=settings.preprocess_denoise,
+        deskew=settings.preprocess_deskew,
+        adaptive_threshold=settings.preprocess_adaptive_threshold,
+    )
+    # Each page is cleaned and then OCR'd before moving on to the next one.
+    ocr_pages: list[OCRPage] = [run_ocr(preprocess(page.image, pre_cfg)) for page in pages]
+
+    full_text = "\n".join(p.text for p in ocr_pages)
+    # Unweighted mean of the per-page means; 0.0 when no page was produced.
+    mean_ocr_conf = 0.0
+    if ocr_pages:
+        mean_ocr_conf = sum(p.mean_confidence for p in ocr_pages) / len(ocr_pages)
+
+    initial_flags: list[ReviewFlag] = []
+    if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
+        initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
+
+    result = ExtractionResult(
+        header=extract_header(full_text),
+        personel=[],  # Phase 3 will populate from PP-Structure
+        untuk=[],
+        ttd=find_signatory(full_text),
+        raw_text=full_text,
+        confidence=mean_ocr_conf,
+        review_flags=list(initial_flags),
+    )
+
+    # Validation flags first; OCR-level flags are appended only when missing.
+    flags = validate_extraction(result)
+    for flag in initial_flags:
+        if flag not in flags:
+            flags.append(flag)
+    result.review_flags = flags
+
+    final_conf = compute_confidence(mean_ocr_conf, flags)
+    result.confidence = final_conf
+
+    return PipelineOutput(
+        source_kind=kind,
+        status=route(final_conf),
+        confidence=final_conf,
+        result=result,
+    )
diff --git a/src/ocr_sprint/pipeline/preprocess.py b/src/ocr_sprint/pipeline/preprocess.py
new file mode 100644
index 0000000..c694702
--- /dev/null
+++ b/src/ocr_sprint/pipeline/preprocess.py
@@ -0,0 +1,108 @@
+"""Image preprocessing for OCR.
+
+Phase 1 implements the "always-on" steps that work for both clean PDF scans
+and reasonable phone photos:
+
+  - resize to a reasonable max side (PaddleOCR runs faster on smaller inputs)
+  - convert to grayscale for analysis (kept as 3-channel BGR for paddle)
+  - denoise (Non-Local Means, gentle)
+  - deskew via Hough line angle estimate
+  - optional adaptive threshold for low-quality phone photos
+
+Phase 2 will add document-corner detection + perspective transform + dewarping
+for tilted phone shots; those live in `document_detect.py` (added later).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cv2
+import numpy as np
+
+from ocr_sprint.pipeline.ingest import NDArrayU8
+
+
+@dataclass(frozen=True)
+class PreprocessConfig:
+    """Tunable knobs for the preprocessing pipeline (frozen once constructed)."""
+
+    max_side: int = 2200  # longest side in pixels; larger images are downscaled
+    denoise: bool = True  # apply colored Non-Local Means denoising
+    deskew: bool = True  # rotate by the Hough-estimated skew angle
+    adaptive_threshold: bool = False  # binarize with a Gaussian adaptive threshold
+
+
+def _resize_max_side(img: NDArrayU8, max_side: int) -> NDArrayU8:
+    """Shrink ``img`` so its longest side is scaled to ``max_side``; never upscales."""
+    height, width = img.shape[:2]
+    if max(height, width) <= max_side:
+        return img
+    ratio = max_side / max(height, width)
+    target = (round(width * ratio), round(height * ratio))  # cv2 takes (width, height)
+    return cv2.resize(img, target, interpolation=cv2.INTER_AREA)
+
+
+def _estimate_skew_angle(gray: NDArrayU8) -> float:
+    """Estimate the page skew in degrees from near-horizontal Hough lines.
+
+    Returns the median angle of the kept lines, strictly inside (-15, 15), or
+    0.0 when no line qualifies.
+    """
+    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+    lines = cv2.HoughLines(edges, 1, np.pi / 360, threshold=200)
+    if lines is None or len(lines) == 0:
+        return 0.0
+    # Each entry is [[rho, theta]]; only theta is needed. Shifting by 90° turns
+    # it into an angle relative to horizontal (degrees). At most 200 lines are used.
+    candidates = [(entry[0][1] * 180.0 / np.pi) - 90.0 for entry in lines[:200]]
+    # only keep nearly-horizontal lines (within ±15°)
+    angles = [angle for angle in candidates if -15.0 < angle < 15.0]
+    if not angles:
+        return 0.0
+    return float(np.median(angles))
+
+
+def _rotate(img: NDArrayU8, angle_deg: float) -> NDArrayU8:
+    """Rotate ``img`` about its centre by ``angle_deg``, keeping the canvas size."""
+    if abs(angle_deg) < 0.1:
+        return img  # below 0.1° the image is returned untouched
+    height, width = img.shape[:2]
+    matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle_deg, 1.0)
+    return cv2.warpAffine(
+        img,
+        matrix,
+        (width, height),
+        flags=cv2.INTER_CUBIC,
+        borderMode=cv2.BORDER_REPLICATE,
+    )
+
+
+def preprocess(img: NDArrayU8, cfg: PreprocessConfig | None = None) -> NDArrayU8:
+    """Resize, deskew, denoise and optionally binarize ``img``; returns BGR uint8 for OCR."""
+    if cfg is None:
+        cfg = PreprocessConfig()
+
+    out = _resize_max_side(img, cfg.max_side)
+
+    if cfg.deskew:
+        # Skew is positive for text sloping down-right; a positive (CCW) rotation levels it.
+        angle = _estimate_skew_angle(cv2.cvtColor(out, cv2.COLOR_BGR2GRAY))
+        out = _rotate(out, angle)
+
+    if cfg.denoise:
+        out = cv2.fastNlMeansDenoisingColored(out, None, 5, 5, 7, 21)
+
+    if cfg.adaptive_threshold:
+        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
+        binarized = cv2.adaptiveThreshold(
+            gray,
+            255,
+            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY,
+            blockSize=31,
+            C=15,
+        )
+        out = cv2.cvtColor(binarized, cv2.COLOR_GRAY2BGR)
+
+    return out
diff --git a/src/ocr_sprint/py.typed b/src/ocr_sprint/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/ocr_sprint/schemas/__init__.py b/src/ocr_sprint/schemas/__init__.py
new file mode 100644
index 0000000..c54dbfa
--- /dev/null
+++ b/src/ocr_sprint/schemas/__init__.py
@@ -0,0 +1,27 @@
+"""Pydantic schemas for input/output of the OCR Sprint service."""
+
+from ocr_sprint.schemas.document import (
+    DocumentJob,
+    DocumentResponse,
+    DocumentStatus,
+    SourceKind,
+)
+from ocr_sprint.schemas.extraction import (
+    ExtractionResult,
+    HeaderFields,
+    ReviewFlag,
+    Signatory,
+)
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+__all__ = [
+    "DocumentJob",
+    "DocumentResponse",
+    "DocumentStatus",
+    "ExtractionResult",
+    "HeaderFields",
+    "PersonnelEntry",
+    "ReviewFlag",
+    "Signatory",
+    "SourceKind",
+]
diff --git a/src/ocr_sprint/schemas/document.py b/src/ocr_sprint/schemas/document.py
new file mode 100644
index 0000000..c59b8b7
--- /dev/null
+++ b/src/ocr_sprint/schemas/document.py
@@ -0,0 +1,57 @@
+"""Job-level schemas (request, response, status)."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import Any
+from uuid import UUID, uuid4
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from ocr_sprint.schemas.extraction import ExtractionResult
+
+
+class SourceKind(str, Enum):
+    """High-level type of the uploaded document."""
+
+    PDF = "pdf"
+    IMAGE = "image"
+    UNKNOWN = "unknown"  # default for a new DocumentJob; run_pipeline rejects it
+
+
+class DocumentStatus(str, Enum):
+    """Lifecycle status of an OCR job."""
+
+    PENDING = "pending"  # initial status of a DocumentJob
+    PROCESSING = "processing"
+    COMPLETED = "completed"  # returned by route() for a high score (tests use 0.97)
+    NEEDS_REVIEW = "needs_review"  # returned by route() for lower scores (0.88, 0.40)
+    FAILED = "failed"
+
+
+class DocumentJob(BaseModel):
+    """Internal representation of a job (Phase 1 holds it in-memory)."""
+
+    model_config = ConfigDict(use_enum_values=False)
+    # NOTE(review): datetime.utcnow() returns naive datetimes and is deprecated in 3.12+.
+    job_id: UUID = Field(default_factory=uuid4)
+    source_kind: SourceKind = SourceKind.UNKNOWN
+    filename: str
+    status: DocumentStatus = DocumentStatus.PENDING
+    created_at: datetime = Field(default_factory=lambda: datetime.utcnow())  # naive UTC
+    updated_at: datetime = Field(default_factory=lambda: datetime.utcnow())  # naive UTC
+    error: str | None = None
+    result: ExtractionResult | None = None
+    debug: dict[str, Any] = Field(default_factory=dict)
+
+
+class DocumentResponse(BaseModel):
+    """Public response payload returned by the documents API."""
+
+    job_id: UUID
+    status: DocumentStatus
+    confidence: float | None = None
+    data: ExtractionResult | None = None  # full extraction payload, when present
+    review_flags: list[str] = Field(default_factory=list)  # presumably ReviewFlag values
+    error: str | None = None
diff --git a/src/ocr_sprint/schemas/extraction.py b/src/ocr_sprint/schemas/extraction.py
new file mode 100644
index 0000000..1311faa
--- /dev/null
+++ b/src/ocr_sprint/schemas/extraction.py
@@ -0,0 +1,55 @@
+"""Top-level extraction result schemas."""
+
+from __future__ import annotations
+
+from datetime import date
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+
+class ReviewFlag(str, Enum):
+    """Reasons a document was routed to human review."""
+
+    LOW_OCR_CONFIDENCE = "low_ocr_confidence"  # mean OCR confidence too low
+    MISSING_FIELD = "missing_field"  # e.g. header without nomor_sprint
+    INVALID_NRP = "invalid_nrp"  # NRP rejected by validate_nrp (not 8 digits)
+    UNKNOWN_PANGKAT = "unknown_pangkat"  # rank not recognised by the pangkat master data
+    PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch"  # rows differ from expected count
+    DATE_PARSE_FAILED = "date_parse_failed"  # header has no parsed tanggal
+
+
+class Signatory(BaseModel):
+    """The official signing the sprint (Penandatangan); every field is optional."""
+
+    nama: str | None = None
+    pangkat: str | None = None
+    nrp: str | None = None  # find_signatory's tests expect the last NRP in the text
+    jabatan: str | None = None
+
+
+class HeaderFields(BaseModel):
+    """Header fields parsed from the top portion of a sprint; all default to empty."""
+
+    nomor_sprint: str | None = Field(None, description="e.g. Sprin/123/IV/2025/Reskrim.")
+    tanggal: date | None = Field(None, description="Date the sprint was issued.")
+    satuan_penerbit: str | None = Field(None, description="Issuing unit, e.g. 'Polres Bandung'.")
+    perihal: str | None = None  # subject text that follows "PERIHAL :" on its line
+    dasar: list[str] = Field(default_factory=list, description="List of legal/operational basis.")
+
+
+class ExtractionResult(BaseModel):
+    """Full structured payload extracted from a single sprint document."""
+
+    header: HeaderFields = Field(default_factory=HeaderFields)
+    personel: list[PersonnelEntry] = Field(default_factory=list)  # left empty in Phase 1
+    untuk: list[str] = Field(
+        default_factory=list,
+        description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.",
+    )
+    ttd: Signatory = Field(default_factory=Signatory)
+    raw_text: str = Field(default="", description="Concatenated OCR text for debugging.")
+    confidence: float = Field(0.0, ge=0.0, le=1.0)  # validated to lie in [0, 1]
+    review_flags: list[ReviewFlag] = Field(default_factory=list)
diff --git a/src/ocr_sprint/schemas/personnel.py b/src/ocr_sprint/schemas/personnel.py
new file mode 100644
index 0000000..9eee085
--- /dev/null
+++ b/src/ocr_sprint/schemas/personnel.py
@@ -0,0 +1,18 @@
+"""Schema for a single personnel row in a surat sprint."""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, Field
+
+
+class PersonnelEntry(BaseModel):
+    """One row from the personnel table; every column is optional."""
+
+    no: int | None = Field(None, description="Row number as printed on the document.")
+    pangkat: str | None = Field(None, description="Rank, normalized when possible.")
+    nrp: str | None = Field(None, description="8-digit Polri NRP, or blank if not detected.")
+    nama: str | None = Field(None, description="Full name.")
+    jabatan_dinas: str | None = Field(None, description="Permanent post (jabatan dalam dinas).")
+    jabatan_sprint: str | None = Field(None, description="Role within this sprint.")
+    keterangan: str | None = None  # presumably the free-text remarks column — confirm
+    confidence: float = Field(0.0, ge=0.0, le=1.0)  # validated to lie in [0, 1]
diff --git a/src/ocr_sprint/utils/__init__.py b/src/ocr_sprint/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/ocr_sprint/utils/logging.py b/src/ocr_sprint/utils/logging.py
new file mode 100644
index 0000000..79bee44
--- /dev/null
+++ b/src/ocr_sprint/utils/logging.py
@@ -0,0 +1,45 @@
+"""Structured logging setup using structlog."""
+
+from __future__ import annotations
+
+import logging
+import sys
+from typing import Any
+
+import structlog
+
+
+def configure_logging(level: str = "INFO") -> None:
+    """Configure stdlib logging and structlog to emit uncolored key=value records to stdout."""
+    log_level = getattr(logging, level.upper(), logging.INFO)  # unknown names fall back to INFO
+    logging.basicConfig(
+        format="%(message)s",
+        stream=sys.stdout,
+        level=log_level,
+    )
+    structlog.configure(
+        processors=[
+            structlog.contextvars.merge_contextvars,
+            structlog.processors.add_log_level,
+            structlog.processors.TimeStamper(fmt="iso", utc=True),
+            structlog.processors.StackInfoRenderer(),
+            structlog.processors.format_exc_info,
+            structlog.dev.ConsoleRenderer(colors=False),  # console text, not JSON
+        ],
+        wrapper_class=structlog.make_filtering_bound_logger(log_level),
+        context_class=dict,
+        logger_factory=structlog.PrintLoggerFactory(),
+        cache_logger_on_first_use=True,
+    )
+
+
+def get_logger(name: str | None = None, **initial_values: Any) -> Any:
+    """Return a structlog logger for ``name``, bound to ``initial_values`` when given.
+
+    The return type is ``Any`` because structlog's BoundLogger generic typing
+    is too restrictive in practice; callers treat it as a duck-typed logger.
+    """
+    base = structlog.get_logger(name)
+    if not initial_values:
+        return base  # nothing to bind
+    return base.bind(**initial_values)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..75f48d8
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,43 @@
+"""Shared pytest fixtures."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+
+@pytest.fixture
+def blank_bgr_image() -> np.ndarray:
+    """An all-white uint8 BGR image of shape (600, 800, 3) for preprocessing smoke tests."""
+    return np.full(shape=(600, 800, 3), fill_value=255, dtype=np.uint8)
+
+
+@pytest.fixture
+def sample_sprint_text() -> str:
+    """Realistic-but-synthetic OCR text for regex extractor tests (3 DASAR items, one NRP)."""
+    return (
+        "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
+        "DAERAH JAWA BARAT\n"
+        "RESOR BANDUNG\n"
+        "\n"
+        "SURAT PERINTAH\n"
+        "Nomor : Sprin/123/IV/2025/Reskrim\n"
+        "\n"
+        "DASAR :\n"
+        "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n"
+        "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n"
+        "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n"
+        "\n"
+        "DIPERINTAHKAN :\n"
+        "Kepada : 1. Nama anggota tersebut di bawah ini.\n"
+        "\n"
+        "Untuk : Melaksanakan penyelidikan tindak pidana.\n"
+        "\n"
+        "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n"
+        "\n"
+        "Bandung, 21 April 2025\n"
+        "KEPALA KEPOLISIAN RESOR BANDUNG\n"
+        "\n"
+        "Drs. BUDI SANTOSO\n"
+        "AKBP NRP 12345678\n"
+    )
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py
new file mode 100644
index 0000000..be8addd
--- /dev/null
+++ b/tests/unit/test_api.py
@@ -0,0 +1,87 @@
+"""API tests with the OCR engine mocked.
+
+These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
+so we can exercise the FastAPI surface without the heavy ML init cost.
+"""
+
+from __future__ import annotations
+
+from datetime import date
+
+import pytest
+from fastapi.testclient import TestClient
+
+from ocr_sprint.main import create_app
+from ocr_sprint.pipeline import orchestrator as orch_module
+from ocr_sprint.pipeline.orchestrator import PipelineOutput
+from ocr_sprint.schemas.document import DocumentStatus, SourceKind
+from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
+
+
+@pytest.fixture
+def client() -> TestClient:
+    return TestClient(create_app())  # a new app instance for every test
+
+
+def test_health_endpoint(client: TestClient) -> None:
+    reply = client.get("/api/v1/health")
+    assert reply.status_code == 200
+    assert reply.json()["status"] == "ok"  # other keys in the body are not checked
+
+
+def test_documents_rejects_empty_upload(client: TestClient) -> None:
+    """A zero-byte upload is refused with HTTP 400."""
+    # Filename and content type look like a PDF; only the body is empty.
+    empty_pdf = ("empty.pdf", b"", "application/pdf")
+    response = client.post("/api/v1/documents", files={"file": empty_pdf})
+    assert response.status_code == 400
+
+
+def test_documents_rejects_unknown_format(client: TestClient) -> None:
+    """Bytes that are neither PDF nor a known image format are refused with HTTP 400."""
+    # Nothing is patched here, so the ``monkeypatch`` fixture is not requested: the
+    # test expects the unmocked endpoint to reject this payload.
+    response = client.post(
+        "/api/v1/documents",
+        files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")},
+    )
+    assert response.status_code == 400
+
+
+def test_documents_returns_pipeline_output(
+    client: TestClient,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    fake_result = ExtractionResult(
+        header=HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="POLRES TEST",
+        ),
+        confidence=0.97,
+    )
+    fake_output = PipelineOutput(
+        source_kind=SourceKind.PDF,
+        status=DocumentStatus.COMPLETED,
+        confidence=0.97,
+        result=fake_result,
+    )
+
+    def _fake_run(_content: bytes) -> PipelineOutput:
+        return fake_output  # canned output; the uploaded bytes are ignored
+
+    # Patch the defining module and the name already bound in the routes module.
+    monkeypatch.setattr(orch_module, "run_pipeline", _fake_run)
+    from ocr_sprint.api.routes import documents as docs_module
+
+    monkeypatch.setattr(docs_module, "run_pipeline", _fake_run)
+
+    response = client.post(
+        "/api/v1/documents",
+        files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
+    )
+    assert response.status_code == 200
+    body = response.json()
+    assert body["status"] == "completed"
+    assert body["confidence"] == 0.97
+    assert body["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"
diff --git a/tests/unit/test_confidence.py b/tests/unit/test_confidence.py
new file mode 100644
index 0000000..e1cc18b
--- /dev/null
+++ b/tests/unit/test_confidence.py
@@ -0,0 +1,46 @@
+"""Tests for confidence scoring + routing."""
+
+from __future__ import annotations
+
+from ocr_sprint.pipeline.confidence import compute_confidence, route
+from ocr_sprint.schemas.document import DocumentStatus
+from ocr_sprint.schemas.extraction import ReviewFlag
+
+
+def test_no_flags_returns_blend_of_ocr_only() -> None:
+    # With no flags the validation term is 1.0: 0.6 * 0.9 (OCR) + 0.4 * 1.0 = 0.94.
+    blended = compute_confidence(0.9, [])
+    assert abs(blended - 0.94) < 1e-6
+
+
+def test_flags_reduce_score() -> None:
+    """One review flag must strictly lower the score at the same OCR confidence."""
+    unflagged = compute_confidence(0.9, [])
+    assert compute_confidence(0.9, [ReviewFlag.MISSING_FIELD]) < unflagged
+
+
+def test_score_is_clamped() -> None:
+    """Zero OCR confidence plus every flag must still score inside [0, 1]."""
+    every_flag = [
+        ReviewFlag.MISSING_FIELD,
+        ReviewFlag.LOW_OCR_CONFIDENCE,
+        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
+        ReviewFlag.INVALID_NRP,
+        ReviewFlag.UNKNOWN_PANGKAT,
+        ReviewFlag.DATE_PARSE_FAILED,
+    ]
+    # Worst case on both inputs: no OCR confidence and all six penalties.
+    worst_case = compute_confidence(0.0, every_flag)
+    assert 0.0 <= worst_case <= 1.0
+
+
+def test_route_high_confidence() -> None:
+    assert route(0.97) == DocumentStatus.COMPLETED  # high score needs no review
+
+
+def test_route_mid_goes_to_review() -> None:
+    assert route(0.88) == DocumentStatus.NEEDS_REVIEW  # mid-range score is reviewed
+
+
+def test_route_low_goes_to_review() -> None:
+    assert route(0.40) == DocumentStatus.NEEDS_REVIEW  # low score is reviewed, not failed
diff --git a/tests/unit/test_ingest.py b/tests/unit/test_ingest.py
new file mode 100644
index 0000000..0a7f0c2
--- /dev/null
+++ b/tests/unit/test_ingest.py
@@ -0,0 +1,50 @@
+"""Tests for source detection + image ingest."""
+
+from __future__ import annotations
+
+import io
+
+import numpy as np
+from PIL import Image
+
+from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
+from ocr_sprint.schemas.document import SourceKind
+
+
+def _png_bytes() -> bytes:
+    """Encode a 100x80 white RGB image as PNG and return the raw bytes."""
+    buffer = io.BytesIO()
+    Image.new("RGB", (100, 80), color="white").save(buffer, format="PNG")
+    return buffer.getvalue()
+
+
+def _jpeg_bytes() -> bytes:
+    """Encode a 100x80 white RGB image as JPEG and return the raw bytes."""
+    buffer = io.BytesIO()
+    Image.new("RGB", (100, 80), color="white").save(buffer, format="JPEG")
+    return buffer.getvalue()
+
+
+def test_detect_pdf() -> None:
+    assert detect_source_kind(b"%PDF-1.7\n...") == SourceKind.PDF  # starts with "%PDF-"
+
+
+def test_detect_png() -> None:
+    assert detect_source_kind(_png_bytes()) == SourceKind.IMAGE  # real PNG from Pillow
+
+
+def test_detect_jpeg() -> None:
+    assert detect_source_kind(_jpeg_bytes()) == SourceKind.IMAGE  # real JPEG from Pillow
+
+
+def test_detect_unknown() -> None:
+    assert detect_source_kind(b"garbage") == SourceKind.UNKNOWN  # arbitrary bytes
+
+
+def test_ingest_image_returns_one_page() -> None:
+    pages = ingest_image(_png_bytes())
+    assert len(pages) == 1
+    page = pages[0]
+    assert page.page_index == 0
+    assert isinstance(page.image, np.ndarray) and page.image.dtype == np.uint8
+    assert page.image.shape == (80, 100, 3)  # (rows, cols, channels) of the 100x80 source
diff --git a/tests/unit/test_preprocess.py b/tests/unit/test_preprocess.py
new file mode 100644
index 0000000..56d5bf4
--- /dev/null
+++ b/tests/unit/test_preprocess.py
@@ -0,0 +1,37 @@
+"""Smoke tests for the preprocessing pipeline."""
+
+from __future__ import annotations
+
+import numpy as np
+
+from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
+
+
+def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
+    result = preprocess(blank_bgr_image)  # default PreprocessConfig
+    assert result.dtype == np.uint8
+    assert result.ndim == 3
+    assert result.shape[2] == 3
+
+
+def test_preprocess_resizes_to_max_side() -> None:
+    """A 4000x3000 input is scaled so that its longest side equals ``max_side``."""
+    resize_only = PreprocessConfig(max_side=1000, denoise=False, deskew=False)
+    out = preprocess(np.full((4000, 3000, 3), 255, dtype=np.uint8), resize_only)
+    assert max(out.shape[:2]) == 1000
+
+
+def test_preprocess_does_not_upscale_small_images() -> None:
+    """An input already below ``max_side`` keeps its original height and width."""
+    resize_only = PreprocessConfig(max_side=2200, denoise=False, deskew=False)
+    out = preprocess(np.full((400, 300, 3), 255, dtype=np.uint8), resize_only)
+    assert out.shape[:2] == (400, 300)
+
+
+def test_adaptive_threshold_produces_binary_image() -> None:
+    # Seeded noise keeps the input identical on every run.
+    img = np.random.default_rng(0).integers(0, 256, (200, 200, 3), dtype=np.uint8)
+    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)
+    out = preprocess(img, cfg)
+    # adaptive threshold should leave only 0s and 255s
+    assert set(np.unique(out).tolist()).issubset({0, 255})
diff --git a/tests/unit/test_regex_rules.py b/tests/unit/test_regex_rules.py
new file mode 100644
index 0000000..3cd7855
--- /dev/null
+++ b/tests/unit/test_regex_rules.py
@@ -0,0 +1,112 @@
+"""Tests for regex-based header extraction."""
+
+from __future__ import annotations
+
+from datetime import date
+
+import pytest
+
+from ocr_sprint.pipeline.extract.regex_rules import (
+    extract_header,
+    find_dasar_list,
+    find_nomor_sprint,
+    find_perihal,
+    find_satuan,
+    find_signatory,
+    find_tanggal,
+)
+
+
+class TestNomorSprint:
+    @pytest.mark.parametrize(
+        ("text", "needle"),
+        [
+            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),  # slash-separated form
+            ("Nomor: SPRIN / 7 / I / 2024", "7"),  # upper case, spaces around slashes
+            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),  # dash-separated, mid-text
+        ],
+    )
+    def test_finds_nomor(self, text: str, needle: str) -> None:
+        result = find_nomor_sprint(text)
+        assert result is not None
+        assert needle in result
+        assert result.upper().startswith("SPRIN")  # match begins at the "Sprin" token
+
+    def test_returns_none_when_absent(self) -> None:
+        assert find_nomor_sprint("no nomor here, just some text") is None
+
+
+class TestTanggal:
+    def test_basic_date(self) -> None:
+        assert find_tanggal("Bandung, 21 April 2025") == date(2025, 4, 21)
+
+    def test_with_dashes(self) -> None:
+        assert find_tanggal("Tanggal 1 - Desember - 2024") == date(2024, 12, 1)  # Desember = 12
+
+    def test_invalid_month(self) -> None:
+        assert find_tanggal("21 Foo 2025") is None  # "Foo" is not a month name
+
+    def test_no_date_present(self) -> None:
+        assert find_tanggal("nothing here") is None
+
+
+class TestSatuan:
+    def test_polres(self) -> None:
+        result = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
+        assert result is not None
+        assert "RESOR BANDUNG" in result.upper()  # comparison ignores the returned casing
+
+    def test_polri_pusat(self) -> None:
+        result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
+        assert result is not None  # only presence is checked, not the exact value
+
+
+class TestPerihal:
+    def test_extracts_perihal_line(self) -> None:
+        text = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
+        assert find_perihal(text) == "Pelaksanaan penyelidikan kasus."  # label and colon dropped
+
+    def test_returns_none_when_absent(self) -> None:
+        assert find_perihal("no perihal field") is None
+
+
+class TestDasar:
+    def test_numbered_list(self) -> None:
+        text = (
+            "DASAR :\n"
+            "1. UU No 2 Tahun 2002.\n"
+            "2. Peraturan Kapolri Nomor 6.\n"
+            "\n"
+            "DIPERINTAHKAN :\n"
+            "Kepada : ...\n"
+        )
+        items = find_dasar_list(text)
+        assert len(items) == 2  # text after DIPERINTAHKAN is not counted as an item
+        assert items[0].startswith("UU No 2")  # the "1." numbering is stripped
+        assert items[1].startswith("Peraturan Kapolri")
+
+    def test_empty_when_section_missing(self) -> None:
+        assert find_dasar_list("no dasar section") == []
+
+
+class TestSignatory:
+    def test_extracts_last_nrp(self) -> None:
+        text = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
+        sig = find_signatory(text)
+        assert sig.nrp == "11223344"  # the final NRP wins over the earlier 8-digit numbers
+
+    def test_no_nrp(self) -> None:
+        assert find_signatory("no NRP here").nrp is None  # a Signatory is still returned
+
+
+class TestExtractHeader:
+    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
+        header = extract_header(sample_sprint_text)
+        assert header.nomor_sprint is not None
+        assert "Sprin" in header.nomor_sprint
+        assert header.tanggal == date(2025, 4, 21)  # signing date, not the 20 April in DASAR
+        assert header.satuan_penerbit is not None
+        assert "KEPOLISIAN" in header.satuan_penerbit.upper()
+        assert header.perihal is not None
+        assert "penyelidikan" in header.perihal.lower()
+        assert len(header.dasar) == 3  # the fixture lists three numbered DASAR items
diff --git a/tests/unit/test_validators.py b/tests/unit/test_validators.py
new file mode 100644
index 0000000..6ff0dcd
--- /dev/null
+++ b/tests/unit/test_validators.py
@@ -0,0 +1,108 @@
+"""Tests for the validation layer."""
+
+from __future__ import annotations
+
+from datetime import date
+
+import pytest
+
+from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
+from ocr_sprint.pipeline.extract.validators import (
+    validate_extraction,
+    validate_header,
+    validate_nrp,
+    validate_personnel_entry,
+)
+from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
+from ocr_sprint.schemas.personnel import PersonnelEntry
+
+
+class TestNRP:
+    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
+    def test_valid_8_digits(self, nrp: str) -> None:
+        assert validate_nrp(nrp) is True  # leading zeros are accepted
+
+    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
+    def test_invalid(self, nrp: str | None) -> None:
+        assert validate_nrp(nrp) is False  # too short / too long / non-digit / empty / None
+
+
+class TestPangkat:
+    @pytest.mark.parametrize(
+        ("input_str", "expected"),
+        [
+            ("AKP", "AKP"),
+            ("akp", "AKP"),  # case-insensitive
+            ("AKP.", "AKP"),  # trailing dot is dropped
+            ("AKBP", "AKBP"),
+            ("Brigjen Pol", "BRIGJEN POL"),
+            ("BRIGJEN", "BRIGJEN POL"),  # short form maps to the full rank
+            ("Kombespol", "KOMBES POL"),  # joined spelling maps to the spaced rank
+            ("BRIPDA", "BRIPDA"),
+        ],
+    )
+    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
+        assert normalize_pangkat(input_str) == expected
+
+    def test_unknown_returns_none(self) -> None:
+        assert normalize_pangkat("Sersan Mayor") is None
+        assert is_valid_pangkat("Sersan Mayor") is False
+
+
+class TestPersonnelValidator:
+    def test_clean_entry_no_flags(self) -> None:
+        entry = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
+        assert validate_personnel_entry(entry) == []
+
+    def test_invalid_nrp_flagged(self) -> None:
+        entry = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")  # NRP too short
+        assert ReviewFlag.INVALID_NRP in validate_personnel_entry(entry)
+
+    def test_unknown_pangkat_flagged(self) -> None:
+        entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
+        assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
+
+
+class TestHeaderValidator:
+    def test_complete_header_no_flags(self) -> None:
+        header = HeaderFields(
+            nomor_sprint="Sprin/1/I/2025",
+            tanggal=date(2025, 1, 1),
+            satuan_penerbit="POLRES BANDUNG",
+        )
+        assert validate_header(header) == []  # perihal and dasar are unset yet not flagged
+
+    def test_missing_nomor_flagged(self) -> None:
+        header = HeaderFields(tanggal=date(2025, 1, 1))
+        assert ReviewFlag.MISSING_FIELD in validate_header(header)
+
+    def test_missing_date_flagged(self) -> None:
+        header = HeaderFields(nomor_sprint="Sprin/1/I/2025")
+        assert ReviewFlag.DATE_PARSE_FAILED in validate_header(header)
+
+
+class TestFullValidation:
+    def test_personnel_count_mismatch(self) -> None:
+        result = ExtractionResult(
+            header=HeaderFields(
+                nomor_sprint="Sprin/1/I/2025",
+                tanggal=date(2025, 1, 1),
+            ),
+            personel=[
+                PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A"),
+            ],
+        )
+        flags = validate_extraction(result, expected_personnel_count=2)  # one row, two expected
+        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in flags
+
+    def test_flags_are_deduped(self) -> None:
+        result = ExtractionResult(
+            header=HeaderFields(),  # missing both nomor and tanggal
+            personel=[
+                PersonnelEntry(nrp="123", pangkat="X"),
+                PersonnelEntry(nrp="456", pangkat="Y"),
+            ],
+        )
+        flags = validate_extraction(result)
+        # each flag type should appear at most once
+        assert len(flags) == len(set(flags))