Phase 1 MVP: synchronous OCR + regex header extraction

Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
Devin AI
2026-04-25 14:58:50 +00:00
commit ca0c0a0428
45 changed files with 2457 additions and 0 deletions

43
.env.example Normal file
View File

@@ -0,0 +1,43 @@
# Template for the service's .env file: copy to `.env` and adjust.
#
# Comments are kept on their own lines on purpose. Not every .env consumer
# strips a trailing `# ...` from an unquoted value, so an inline comment
# (especially after an empty value such as `KEY= # note`) can end up being
# read as the value itself.

# ==== App ====
# One of: local | dev | staging | prod
APP_ENV=local
APP_HOST=0.0.0.0
APP_PORT=8000
APP_LOG_LEVEL=INFO

# ==== Storage (Phase 1: local filesystem) ====
STORAGE_LOCAL_DIR=./storage

# ==== OCR ====
# PaddleOCR lang code; "latin" works well for Bahasa Indonesia.
OCR_LANG=latin
# Set to true if running on a GPU host.
OCR_USE_GPU=false
# Leave the three model dirs empty to use PaddleOCR defaults.
OCR_DET_MODEL_DIR=
OCR_REC_MODEL_DIR=
OCR_CLS_MODEL_DIR=
# Downscale the longest image side to this many pixels before OCR.
OCR_MAX_IMAGE_SIDE=2200

# ==== Preprocessing ====
PREPROCESS_TARGET_DPI=300
PREPROCESS_DENOISE=true
PREPROCESS_DESKEW=true
# Turn on for low-quality phone photos.
PREPROCESS_ADAPTIVE_THRESHOLD=false

# ==== Confidence / routing (Phase 5) ====
CONFIDENCE_AUTO_APPROVE=0.95
CONFIDENCE_NEEDS_REVIEW=0.85

# ==== LLM (Phase 5, optional) ====
LLM_ENABLED=false
LLM_PROVIDER=ollama
# CPU-friendly default model.
LLM_MODEL=qwen2.5:1.5b
LLM_BASE_URL=http://localhost:11434
LLM_TIMEOUT_S=60

# ==== Async pipeline (Phase 4, optional) ====
QUEUE_ENABLED=false
REDIS_URL=redis://localhost:6379/0
DATABASE_URL=postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint
MINIO_ENDPOINT=localhost:9000
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
MINIO_BUCKET=ocr-sprint
MINIO_SECURE=false

70
.gitignore vendored Normal file
View File

@@ -0,0 +1,70 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
dist/
*.egg-info/
*.egg
.pytest_cache/
.mypy_cache/
.ruff_cache/
.coverage
.coverage.*
htmlcov/
coverage.xml
.tox/
.nox/
# Virtual environments
.venv/
venv/
env/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo
.DS_Store
# Environment / secrets
.env
.env.*
!.env.example
# Local data & artifacts
# Sample documents may be real (sensitive) files and must never be committed.
# `**/` makes each rule apply at any depth under samples/, so the recommended
# samples/pdf/ and samples/photo/ subfolders are covered too; a plain
# `samples/*.pdf` would only match files placed directly inside samples/.
samples/**/*.pdf
samples/**/*.PDF
samples/**/*.jpg
samples/**/*.JPG
samples/**/*.jpeg
samples/**/*.JPEG
samples/**/*.png
samples/**/*.PNG
samples/**/*.tif
samples/**/*.TIF
samples/**/*.tiff
samples/**/*.TIFF
!samples/README.md
data/local/
storage/
*.db
*.sqlite
*.sqlite3
# OCR / model caches
.paddleocr/
~/.paddleocr/
models/downloaded/
# Logs
logs/
*.log
# Docker
.docker/
# Misc
*.bak
*.tmp

19
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,19 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-toml
- id: check-added-large-files
args: ["--maxkb=1024"]
- id: check-merge-conflict
- id: detect-private-key
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
hooks:
- id: ruff
args: ["--fix"]
- id: ruff-format

51
Dockerfile Normal file
View File

@@ -0,0 +1,51 @@
# syntax=docker/dockerfile:1.6
# CPU-only image for the OCR Sprint API.
# PaddleOCR + PyMuPDF + OpenCV-headless work on plain Debian without poppler.
FROM python:3.11-slim AS base

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1

# Build-time only: keeps apt non-interactive during the RUN below without
# leaking the setting into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# System deps for OpenCV, libmagic, PaddlePaddle, and image format support.
# curl is kept in the image because the HEALTHCHECK at the bottom uses it.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        libmagic1 \
        libsm6 \
        libxext6 \
        libxrender1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# ----- builder layer (resolves and installs the Python packages) -----
FROM base AS builder
COPY pyproject.toml README.md ./
COPY src/ ./src/
# Install the service with its runtime dependencies only. The "dev" extra
# (pytest, mypy, ruff, pre-commit) is for local development and must not end
# up in the image that gets deployed.
RUN pip install --upgrade pip && pip install .

# ----- runtime layer -----
FROM base AS runtime
# Bring over the installed packages and their console scripts (uvicorn, ...).
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY pyproject.toml README.md ./
COPY src/ ./src/

# Pre-create cache dirs so PaddleOCR can write models on first run, then hand
# them (and the app dir) to the unprivileged user the service runs as.
RUN mkdir -p /home/app/.paddleocr /app/storage \
    && useradd --create-home --uid 1000 app \
    && chown -R app:app /home/app /app

USER app
EXPOSE 8000

HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
    CMD curl -fsS http://localhost:8000/api/v1/health || exit 1

CMD ["uvicorn", "ocr_sprint.main:app", "--host", "0.0.0.0", "--port", "8000"]

52
Makefile Normal file
View File

@@ -0,0 +1,52 @@
# Developer task runner. Every target is phony: none of them produces a file
# named after itself.
.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean

# Print the list of available targets.
help:
	@echo "Targets:"
	@echo "  install       - install runtime + dev deps in current env"
	@echo "  dev           - run FastAPI app with autoreload"
	@echo "  run           - run FastAPI app without autoreload"
	@echo "  fmt           - format code with ruff"
	@echo "  lint          - lint with ruff"
	@echo "  typecheck     - run mypy"
	@echo "  test          - run pytest"
	@echo "  test-cov      - run pytest with coverage"
	@echo "  docker-build  - build api image"
	@echo "  docker-up     - start docker-compose stack"
	@echo "  docker-down   - stop docker-compose stack"
	@echo "  clean         - remove caches and build artifacts"

# Use `python -m pip` for both steps so the package is installed into the
# same interpreter whose pip was just upgraded.
install:
	python -m pip install --upgrade pip
	python -m pip install -e ".[dev]"
	pre-commit install || true

dev:
	uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000

# Same server command as `dev`, minus the file watcher.
run:
	uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000

fmt:
	ruff format src tests
	ruff check --fix src tests

lint:
	ruff check src tests
	ruff format --check src tests

typecheck:
	mypy src

test:
	pytest

test-cov:
	pytest --cov --cov-report=term-missing

docker-build:
	docker compose build

docker-up:
	docker compose up -d

docker-down:
	docker compose down

clean:
	rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov build dist *.egg-info
	find . -type d -name __pycache__ -exec rm -rf {} +

123
README.md Normal file
View File

@@ -0,0 +1,123 @@
# OCR Sprint Service
OCR + structured extraction service for Indonesian police "surat sprint" (surat perintah) documents. Built around **FastAPI + PaddleOCR + hybrid extraction (regex → LLM lokal → validation)** with **on-premise** deployment as a hard requirement.
> **Status:** Phase 1 MVP — synchronous PDF/image OCR with regex header extraction, validation, and confidence scoring. Phases 2–6 (document detection, table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md).
## Why this stack
- **PaddleOCR** is the strongest open-source OCR for mixed-language documents and runs fully on-prem (essential for police data).
- **PP-Structure** (Phase 3) handles personnel tables natively.
- **Regex-first, LLM-fallback extraction** keeps deterministic fields fast and predictable while letting an LLM handle format drift across Polri units.
- **CPU-friendly defaults**: a small (1.5B–4B) local LLM via Ollama is the recommended default; the architecture is also GPU-ready.
See [`docs/architecture.md`](docs/architecture.md) for the full architecture, accuracy expectations, and roadmap.
## Quickstart
### Prerequisites
- Python **3.10–3.12**
- ~3 GB free disk for PaddleOCR model downloads on first run
- Linux/macOS recommended (Windows works but PaddleOCR install can be finicky)
### Install (local dev)
```bash
git clone https://github.com/Adriankf59/ocr-sprint-service.git
cd ocr-sprint-service
python -m venv .venv && source .venv/bin/activate
make install # installs runtime + dev deps + pre-commit
cp .env.example .env # edit if you need GPU / different storage path
```
### Run the API
```bash
make dev
# → http://localhost:8000/docs
```
### Try it out
```bash
curl -F "file=@samples/pdf/example.pdf" http://localhost:8000/api/v1/documents | jq
```
Expected response (truncated):
```json
{
"job_id": "8f2a...",
"status": "completed",
"confidence": 0.93,
"data": {
"header": {
"nomor_sprint": "Sprin/123/IV/2025/Reskrim",
"tanggal": "2025-04-21",
"satuan_penerbit": "KEPOLISIAN RESOR BANDUNG",
"perihal": "Pelaksanaan penyelidikan kasus pencurian",
"dasar": ["Undang-Undang Nomor 2 Tahun 2002 ...", "..."]
},
"personel": [],
"ttd": { "nrp": "12345678" }
},
"review_flags": []
}
```
> **Note:** Phase 1 does not yet populate the `personel[]` table — that requires PP-Structure (Phase 3). Header fields, signatory NRP, confidence, and HITL routing are fully wired.
### Docker
```bash
docker compose build
docker compose up -d
docker compose logs -f api
```
The first request will trigger PaddleOCR to download its detection/recognition/cls models (~200 MB) into the `paddle-models` volume.
## Development
```bash
make fmt # format with ruff
make lint # lint
make typecheck # mypy strict mode
make test # pytest
make test-cov # pytest + coverage
```
Pre-commit hooks run ruff on every commit. Install once with `pre-commit install` (already done by `make install`).
## Project layout
```
src/ocr_sprint/
api/ # FastAPI routes + error handlers
schemas/ # Pydantic v2 models (request/response, extraction, personnel)
pipeline/ # ingest → preprocess → ocr → extract → validate → score
extract/ # regex_rules.py (Phase 1) → llm.py (Phase 5)
data/ # master data (Polri ranks, etc.)
utils/ # logging, helpers
config.py # pydantic-settings
main.py # app factory
tests/unit/ # ~60 unit tests, no PaddleOCR dependency
docs/ # architecture & decision records
```
## Roadmap
| Phase | Scope | Status |
|---|---|---|
| 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **In progress** |
| 2 | DocTR document detection + dewarping for phone photos | Planned |
| 3 | PP-Structure table extraction for personnel rows | Planned |
| 4 | Async pipeline (Celery + Redis), Postgres + MinIO, auth, observability | Planned |
| 5 | LLM hybrid extraction (Ollama + structured output) | Planned |
| 6 | HITL review endpoints + audit trail | Planned |
## License
Proprietary — internal use only.

23
docker-compose.yml Normal file
View File

@@ -0,0 +1,23 @@
# Phase 1 MVP compose: API only.
# Phase 4 will add redis, postgres, minio, and worker services.
services:
api:
build:
context: .
dockerfile: Dockerfile
image: ocr-sprint-service:dev
container_name: ocr-sprint-api
ports:
- "8000:8000"
environment:
APP_ENV: local
APP_LOG_LEVEL: INFO
OCR_USE_GPU: "false"
STORAGE_LOCAL_DIR: /app/storage
volumes:
- ./storage:/app/storage
- paddle-models:/home/app/.paddleocr
restart: unless-stopped
volumes:
paddle-models:

259
docs/architecture.md Normal file
View File

@@ -0,0 +1,259 @@
# Plan & Arsitektur — OCR Service Surat Sprint Kepolisian
## 1. Penilaian Jujur Tech Stack yang Diusulkan
Tech stack Anda (FastAPI + PaddleOCR + OpenCV/Pillow + Regex) **sudah bagus dan layak produksi**, tapi **belum tentu paling optimal akurasinya** untuk kasus surat sprint. Ada beberapa gap yang perlu diisi sebelum bisa disebut "terbaik".
### Yang sudah tepat
| Komponen | Alasan |
|---|---|
| **FastAPI** | Async native, Pydantic validation, OpenAPI docs otomatis, ideal untuk ML serving. |
| **PaddleOCR (PP-OCRv4/v5)** | Salah satu OCR open-source terbaik untuk dokumen campuran teks + tabel, mendukung Latin (cocok untuk Bahasa Indonesia), bisa jalan on-premise (penting untuk dokumen kepolisian yang sensitif — **cloud OCR seperti Google Vision/AWS Textract sebaiknya dihindari** karena masalah kerahasiaan). |
| **OpenCV + Pillow** | Standar industri untuk preprocessing. |
| **Regex/rule-based** | Cocok untuk dokumen terstruktur seperti sprint yang format-nya relatif baku. |
### Yang masih kurang / perlu ditambah
1. **Table extraction belum tertangani**
Daftar personel di surat sprint hampir selalu berbentuk **tabel** (No, Pangkat, NRP, Nama, Jabatan, Keterangan). Regex pada teks linear dari OCR biasa **akan kacau** ketika baris tabel pecah atau kolom bergeser. Solusi: gunakan **PaddleOCR PP-Structure** (modul table recognition bawaan Paddle) atau model khusus seperti **TableTransformer (Microsoft)**.
2. **Document detection & dewarping untuk foto HP belum eksplisit**
Foto HP bermasalah karena: perspektif miring, lipatan, bayangan, lighting tidak rata, fokus tidak merata. OpenCV crop + perspective transform manual saja sering gagal. Tambahkan:
- **Document corner detection**: `DocTR` / `MobileSAM` / model edge-based, atau heuristik kontur OpenCV sebagai fallback.
- **Dewarping**: `DocTr` / `DewarpNet` untuk halaman yang melengkung (lipatan).
- **Shadow removal**: algoritma background division atau model spesialis.
3. **Strategi ekstraksi 100% regex itu rapuh**
Surat sprint dari satuan berbeda (Polda, Polres, Polsek, Mabes) punya **variasi format**: header berbeda, urutan field berbeda, kadang pangkat disingkat (`AKP`, `IPDA`) kadang ditulis penuh. Regex murni akan butuh ratusan rule dan tetap miss kasus baru.
**Rekomendasi pendekatan hybrid**:
- **Layer 1 — Regex/rule** untuk field deterministik (Nomor sprint, tanggal, dasar hukum) yang format-nya baku.
- **Layer 2 — Schema-aware extraction** menggunakan **LLM lokal** (Llama 3.1 8B / Qwen2.5 7B via Ollama atau vLLM) dengan structured output (JSON schema / Pydantic) untuk field yang variatif (jabatan, keterangan tugas).
- **Layer 3 — Validation** terhadap master data (daftar pangkat valid, format NRP 8 digit, dll).
4. **Tidak ada confidence scoring & human-in-the-loop**
Untuk dokumen kepolisian, **akurasi 100% otomatis itu mitos**. Sistem harus:
- Mengeluarkan confidence score per field.
- Otomatis flag dokumen low-confidence untuk review manusia.
- Sediakan UI/endpoint koreksi yang feedback-nya bisa dipakai retraining.
5. **Alternatif end-to-end yang patut dipertimbangkan**
Jika nanti volume dokumen besar dan format relatif stabil, fine-tuning model **Document Understanding** end-to-end bisa lebih akurat:
- **Donut** (OCR-free, langsung image → JSON).
- **LayoutLMv3** (kombinasi teks + layout + visual).
- **Surya OCR** (newer, sangat bagus untuk dokumen).
Untuk MVP, tetap pakai PaddleOCR. Donut/LayoutLM adalah opsi V2 setelah ada labeled dataset cukup (~500–1000 dokumen).
### Verdict
Stack Anda **bisa mencapai ~85–92% akurasi field-level** untuk surat sprint dengan kualitas scan baik, dan **~70–80%** untuk foto HP, **kalau ditambah** komponen di atas. Tanpa table extraction + dewarping + hybrid extraction, akurasinya akan jatuh di kondisi nyata.
---
## 2. Arsitektur yang Direkomendasikan
### 2.1 Diagram Logis
```
┌────────────────────────────────────────────────────────────────────┐
│ Client (Web/Mobile) │
└──────────────────────────────┬─────────────────────────────────────┘
│ HTTPS (multipart upload)
┌────────────────────────────────────────────────────────────────────┐
│ FastAPI Gateway (stateless) │
│ - Auth (JWT/API key) - Rate limit - Request validation │
└──────────────────────────────┬─────────────────────────────────────┘
│ enqueue job
┌────────────────────────────────────────────────────────────────────┐
│ Job Queue (Redis + Celery / RQ / Dramatiq) │
└──────────────────────────────┬─────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────┐
│ OCR Worker Pipeline (GPU/CPU) │
│ ┌────────────┐ ┌──────────────┐ ┌───────────┐ ┌────────────┐ │
│ │ 1. Ingest │→ │ 2. Preproc │→ │ 3. OCR + │→ │ 4. Extract │ │
│ │ & detect │ │ (deskew, │ │ Layout │ │ (regex + │ │
│ │ PDF/IMG │ │ dewarp, │ │ PP-Struct│ │ LLM + │ │
│ │ │ │ denoise) │ │ + Table) │ │ validate) │ │
│ └────────────┘ └──────────────┘ └───────────┘ └─────┬──────┘ │
│ │ │
│ ┌──────────────────────────────┘ │
│ ▼ │
│ ┌─────────────┐ │
│ │ 5. Confidence│ → low conf? flag for review │
│ │ scoring │ │
│ └──────┬───────┘ │
└──────────────────────────┼─────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────┐
│ Storage: PostgreSQL (metadata) + MinIO/S3 (file) │
│ + Vector store opsional (untuk dedup / search) │
└────────────────────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────┐
│ Review UI (optional) — koreksi manual + audit trail │
└────────────────────────────────────────────────────────────────────┘
```
### 2.2 Pipeline Detail per Tahap
**Tahap 1 — Ingest & Document Detection**
- PDF: render setiap halaman jadi image @ 300 DPI (`pdf2image` / `PyMuPDF`).
- Image (foto HP): deteksi sudut dokumen → crop → perspective transform.
- Library: OpenCV `findContours` (cepat) sebagai fallback, **DocTR document detector** (lebih akurat) sebagai utama.
**Tahap 2 — Preprocessing**
- Deskew (rotation correction) — Hough transform atau model.
- Dewarp (untuk foto buku/lipatan) — `DewarpNet` atau model RNN.
- Adaptive thresholding (untuk foto dengan lighting tidak rata).
- Shadow removal (background division).
- Denoise (Non-Local Means).
- Resize ke ukuran optimal OCR (~1500–2500 px sisi panjang).
**Tahap 3 — OCR + Layout Analysis**
- **PaddleOCR PP-Structure** dijalankan sekali → menghasilkan:
- Bounding boxes + teks + confidence per word/line.
- Table region detection + table-to-HTML/JSON.
- Layout type per region (title, paragraph, table, figure).
- Output ditampung sebagai struktur intermediate (mirip hOCR / ALTO XML).
**Tahap 4 — Information Extraction**
- **4a. Header parsing (regex)**: Nomor sprint, tanggal, satuan penerbit, dasar hukum, perihal. Format relatif baku → regex sangat cocok.
- **4b. Personnel table extraction**: ambil dari hasil PP-Structure table → mapping kolom (Pangkat, NRP, Nama, Jabatan, Keterangan).
- **4c. LLM fallback**: untuk field yang regex/table miss, kirim chunk teks + JSON schema ke LLM lokal (Ollama / vLLM) dengan **structured output** (Pydantic via `outlines` / `instructor`).
- **4d. Validation layer**:
- NRP: 8 digit numerik.
- Pangkat: harus ada di daftar master pangkat Polri.
- Tanggal: parse + sanity check.
- Cross-check: jumlah personel di body = jumlah baris tabel.
**Tahap 5 — Confidence Scoring & Routing**
- Aggregate confidence: weighted average dari OCR confidence + validation pass/fail + LLM logprob (kalau pakai).
- Threshold (mis. < 0.85) → status `NEEDS_REVIEW`.
- Threshold tinggi (≥ 0.95) + semua validasi pass → status `AUTO_APPROVED`.
### 2.3 API Endpoint (FastAPI)
```
POST /api/v1/documents # upload, kembalikan job_id
GET /api/v1/documents/{job_id} # poll status + hasil
GET /api/v1/documents/{job_id}/raw # raw OCR output (debug)
PATCH /api/v1/documents/{job_id} # koreksi manual (HITL)
GET /api/v1/health # liveness
GET /api/v1/metrics # Prometheus
```
Response shape (contoh):
```json
{
"job_id": "uuid",
"status": "completed | processing | needs_review | failed",
"confidence": 0.92,
"data": {
"nomor_sprint": "Sprin/123/IV/2025",
"tanggal": "2025-04-21",
"satuan_penerbit": "Polres Bandung",
"dasar": ["...", "..."],
"perihal": "...",
"personel": [
{"no": 1, "pangkat": "AKP", "nrp": "12345678", "nama": "...", "jabatan": "Kasat Reskrim", "confidence": 0.97},
...
],
"ttd": {"pejabat": "...", "pangkat": "...", "nrp": "..."}
},
"review_flags": []
}
```
### 2.4 Tech Stack Final yang Direkomendasikan
| Layer | Pilihan | Catatan |
|---|---|---|
| API | **FastAPI** + Uvicorn/Gunicorn | sesuai usulan |
| Validation | **Pydantic v2** | wajib |
| Queue | **Redis + Celery** atau **Dramatiq** | OCR berat, jangan blocking request |
| OCR | **PaddleOCR PP-OCRv4 + PP-Structure** | tambah PP-Structure untuk tabel |
| Preprocessing | **OpenCV + Pillow** + **DocTR** (detection) | DocTR untuk foto HP |
| Extraction | **Regex + Ollama (Llama 3.1 8B / Qwen2.5 7B)** + **instructor/outlines** | hybrid |
| Storage | **PostgreSQL** (metadata) + **MinIO** (file blob) | self-hosted, sesuai compliance |
| Observability | **Prometheus + Grafana + Loki** | wajib produksi |
| Container | **Docker + docker-compose** (dev) → **Kubernetes** (prod) | |
| GPU | NVIDIA T4/A10 (1× cukup untuk MVP) | PaddleOCR jauh lebih cepat di GPU |
---
## 3. Roadmap Pengembangan (Bertahap)
### Fase 0 — Persiapan (1 minggu)
- Kumpulkan **dataset sampel**: minimal 50 surat sprint (campur PDF scan + foto HP) dari beragam satuan.
- Buat **ground truth labelling** untuk 20 dokumen (untuk evaluasi).
- Definisikan **schema output final** (JSON) bersama stakeholder.
### Fase 1 — MVP Pipeline Sinkron (2 minggu)
- Setup FastAPI skeleton + Pydantic schemas.
- Integrasi PaddleOCR PP-OCRv4 (CPU dulu, GPU menyusul).
- Preprocessing dasar: deskew + denoise + resize.
- Regex extraction untuk field header.
- Endpoint sinkron `POST /documents` (untuk dev/testing saja).
- **Evaluasi akurasi** terhadap 20 ground truth.
### Fase 2 — Robustness untuk Foto HP (2 minggu)
- Integrasi document detection (DocTR atau OpenCV contour).
- Perspective transform + dewarping.
- Shadow removal.
- Re-evaluasi akurasi pada subset foto HP.
### Fase 3 — Table Extraction (1.5 minggu)
- Integrasi PP-Structure untuk personnel table.
- Mapping kolom + validation (NRP, pangkat).
- Master data tabel pangkat Polri.
### Fase 4 — Async + Production Ready (1.5 minggu)
- Pindahkan ke arsitektur async dengan Celery + Redis.
- Storage MinIO + PostgreSQL.
- Auth, rate limit, logging, metrics.
- Docker compose untuk deployment.
### Fase 5 — LLM Hybrid Extraction (2 minggu)
- Setup Ollama / vLLM dengan model lokal.
- Structured output via `instructor`.
- Confidence scoring + routing ke review.
### Fase 6 — HITL Review UI (opsional, 2 minggu)
- Endpoint koreksi.
- Simple web UI (Next.js) untuk reviewer.
- Audit trail & feedback loop.
### Fase 7 — Optimasi Lanjutan (ongoing)
- Fine-tune PaddleOCR detection/recognition pada dataset internal.
- Eksplorasi Donut/LayoutLMv3 jika dataset sudah cukup.
- Batch processing & GPU optimization.
**Total estimasi MVP fungsional (Fase 1–4): ~7 minggu** dengan 1 backend engineer + 1 ML engineer.
---
## 4. Risiko & Mitigasi
| Risiko | Mitigasi |
|---|---|
| Data sensitif (kepolisian) bocor | Wajib on-prem; tidak ada cloud OCR; enkripsi at-rest (LUKS/pgcrypto) + in-transit (mTLS); audit log lengkap. |
| Variasi format antar satuan | Hybrid extraction (regex + LLM); kumpulkan sample dari banyak satuan sejak awal. |
| Foto HP kualitas buruk | Validasi kualitas image di client (resolusi minimal, blur detection) sebelum upload. |
| Akurasi tidak sampai target | HITL review wajib untuk dokumen low-confidence; jangan deploy fully-automatic. |
| Tanggung jawab hukum atas hasil OCR | Selalu simpan original document + flag bahwa hasil ekstraksi adalah "draft, perlu verifikasi manusia". |
---
## 5. Pertanyaan Sebelum Implementasi
Sebelum saya lanjut ke implementasi, mohon konfirmasi:
1. **Volume**: berapa dokumen/hari yang ditargetkan? (mempengaruhi pilihan async vs sync, GPU vs CPU)
2. **Deployment target**: on-prem mutlak, atau private cloud (GovCloud) boleh?
3. **Source dokumen**: apakah ada akses ke 20–50 sample surat sprint untuk dijadikan dataset awal?
4. **Integrasi**: service ini akan dipanggil sistem apa? (mempengaruhi auth & API contract)
5. **HITL**: apakah ada SDM untuk review manual dokumen low-confidence?
6. **Hardware**: sudah ada server GPU, atau perlu sizing rekomendasi?
7. **Format output final**: ada schema yang sudah dipakai sistem downstream?

136
pyproject.toml Normal file
View File

@@ -0,0 +1,136 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "ocr-sprint-service"
version = "0.1.0"
description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
readme = "README.md"
requires-python = ">=3.10,<3.13"
license = { text = "Proprietary" }
authors = [{ name = "Adrian Kuman Firmansah" }]
dependencies = [
# Web framework
"fastapi>=0.115,<0.116",
"uvicorn[standard]>=0.30,<0.34",
"python-multipart>=0.0.9",
"pydantic>=2.7,<3",
"pydantic-settings>=2.4,<3",
# Image / PDF
"pillow>=10.4,<12",
"opencv-python-headless>=4.10,<5",
"numpy>=1.26,<2.2",
"PyMuPDF>=1.24,<2",
"python-magic>=0.4.27",
# OCR (CPU build of paddle; GPU users override via extra index)
"paddlepaddle==2.6.1",
"paddleocr>=2.7.5,<3",
# Logging / observability
"structlog>=24.1",
"prometheus-client>=0.20",
# Misc
"httpx>=0.27",
"tenacity>=8.5",
]
[project.optional-dependencies]
dev = [
"pytest>=8.2",
"pytest-asyncio>=0.23",
"pytest-cov>=5.0",
"ruff>=0.6.9",
"mypy>=1.11",
"types-Pillow",
"pre-commit>=3.7",
]
# Extraction layer (Phase 5) — kept optional so MVP install stays light
llm = [
"ollama>=0.3",
"instructor>=1.4",
]
# Async pipeline (Phase 4)
async-pipeline = [
"celery[redis]>=5.4",
"redis>=5.0",
"minio>=7.2",
"sqlalchemy>=2.0",
"psycopg[binary]>=3.2",
"alembic>=1.13",
]
[project.scripts]
ocr-sprint-api = "ocr_sprint.main:run"
[tool.setuptools.packages.find]
where = ["src"]
[tool.setuptools.package-data]
"ocr_sprint" = ["py.typed"]
# ---------- Tooling ----------
[tool.ruff]
line-length = 100
target-version = "py310"
src = ["src", "tests"]
[tool.ruff.lint]
select = [
"E", "F", "W", # pycodestyle / pyflakes
"I", # isort
"B", # bugbear
"UP", # pyupgrade
"SIM", # simplify
"RUF", # ruff-specific
"C4", # comprehensions
"PIE",
"PT", # pytest style
"TID", # tidy imports
]
ignore = [
"E501", # line length handled by formatter
"B008", # FastAPI Depends() pattern
]
[tool.ruff.format]
quote-style = "double"
[tool.mypy]
python_version = "3.10"
strict = true
warn_unused_ignores = true
warn_redundant_casts = true
disallow_untyped_defs = true
plugins = ["pydantic.mypy"]
mypy_path = "src"
namespace_packages = true
explicit_package_bases = true
[[tool.mypy.overrides]]
module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"]
ignore_missing_imports = true
[tool.pytest.ini_options]
minversion = "8.0"
addopts = "-ra --strict-markers --strict-config"
testpaths = ["tests"]
asyncio_mode = "auto"
filterwarnings = [
"ignore::DeprecationWarning:paddle.*",
"ignore::DeprecationWarning:paddleocr.*",
]
[tool.coverage.run]
source = ["src/ocr_sprint"]
branch = true
[tool.coverage.report]
exclude_lines = [
"pragma: no cover",
"raise NotImplementedError",
"if TYPE_CHECKING:",
]

13
samples/README.md Normal file
View File

@@ -0,0 +1,13 @@
# Samples
Drop sample surat sprint files here for local testing. **Do NOT commit real documents** — `.gitignore` excludes binary file extensions in this folder.
Recommended layout:
```
samples/
pdf/ # PDF scans
photo/ # phone photos
ground_truth/ # JSON ground-truth labels for evaluation
```
For sharing real samples with the team, use the project's secured storage (MinIO/S3 once Phase 4 is live), not git.

View File

@@ -0,0 +1,3 @@
"""OCR Sprint Service — extract structured data from Indonesian police 'surat sprint'."""
__version__ = "0.1.0"

View File

View File

@@ -0,0 +1,43 @@
"""HTTP error handlers."""
from __future__ import annotations
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse
from ocr_sprint.utils.logging import get_logger
_logger = get_logger(__name__)
class OCRServiceError(Exception):
    """Root of the service's own error hierarchy, meant for 4xx responses.

    The handler installed by ``register_error_handlers`` turns any instance
    into a JSON body and uses ``http_status`` as the response status code.
    """

    # Status code sent for this error type; subclasses override it as needed.
    http_status: int = status.HTTP_400_BAD_REQUEST
class UnsupportedDocumentError(OCRServiceError):
    """The upload cannot be processed as a PDF or a recognized image.

    Inherits the default 400 status from ``OCRServiceError``.
    """
class JobNotFoundError(OCRServiceError):
    """Application error reported to the client as 404 Not Found.

    NOTE(review): presumably raised when a job id lookup fails; no raiser is
    visible in this module, so confirm against the callers.
    """

    http_status = status.HTTP_404_NOT_FOUND
def register_error_handlers(app: FastAPI) -> None:
    """Install the JSON exception handlers on ``app``.

    Two handlers are registered:

    * ``OCRServiceError`` (and subclasses): answered with the error's own
      ``http_status`` and a body naming the error class and its message.
    * Any other ``Exception``: logged with its traceback and answered with a
      generic 500 body, so internal details are not echoed to the client.

    Args:
        app: The FastAPI application to attach the handlers to.
    """

    @app.exception_handler(OCRServiceError)
    async def _ocr_error_handler(_: Request, exc: OCRServiceError) -> JSONResponse:
        body = {"error": type(exc).__name__, "message": str(exc)}
        return JSONResponse(status_code=exc.http_status, content=body)

    @app.exception_handler(Exception)
    async def _unexpected_handler(_: Request, exc: Exception) -> JSONResponse:
        # Full details go to the log only; the response stays generic.
        _logger.exception("api.unhandled_exception", error=str(exc))
        body = {"error": "InternalServerError", "message": "Unexpected error"}
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content=body,
        )

View File

View File

@@ -0,0 +1,58 @@
"""Documents API — Phase 1 synchronous endpoint.
POST /documents accepts a single PDF or image upload, runs the synchronous
pipeline inline, and returns the structured result. This is suitable for
development and low-traffic production; Phase 4 will introduce an async
queue and a polling-style API at the same path.
"""
from __future__ import annotations
from uuid import uuid4
from fastapi import APIRouter, File, UploadFile, status
from ocr_sprint.api.errors import UnsupportedDocumentError
from ocr_sprint.pipeline.orchestrator import run_pipeline
from ocr_sprint.schemas.document import DocumentResponse
from ocr_sprint.utils.logging import get_logger
router = APIRouter(prefix="/documents", tags=["documents"])
_logger = get_logger(__name__)
_MAX_UPLOAD_BYTES = 25 * 1024 * 1024 # 25 MB
@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
    """Run OCR + extraction synchronously on a single upload.

    Args:
        file: The uploaded PDF or image (multipart form field ``file``).

    Returns:
        A ``DocumentResponse`` carrying a fresh job id plus the pipeline's
        status, confidence, extraction result and review flags.

    Raises:
        UnsupportedDocumentError: If the upload is empty, larger than
            ``_MAX_UPLOAD_BYTES``, or the pipeline rejects it with a
            ``ValueError``.
    """
    job_id = uuid4()
    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without first pulling the whole body into memory.
    content = await file.read(_MAX_UPLOAD_BYTES + 1)
    if not content:
        raise UnsupportedDocumentError("Uploaded file is empty.")
    if len(content) > _MAX_UPLOAD_BYTES:
        raise UnsupportedDocumentError(
            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
        )
    log.info("documents.received", size=len(content))

    # NOTE(review): run_pipeline is a plain (non-awaited) call inside this
    # coroutine, so the event loop serving it is blocked while OCR runs.
    # Moving it to a worker thread would need the OCR engine's thread-safety
    # confirmed first.
    try:
        output = run_pipeline(content)
    except ValueError as exc:
        raise UnsupportedDocumentError(str(exc)) from exc

    # Serialize the flags once; both the log entry and the response use them.
    review_flags = [f.value for f in output.result.review_flags]
    log.info(
        "documents.completed",
        status=output.status.value,
        confidence=round(output.confidence, 3),
        flags=review_flags,
    )
    return DocumentResponse(
        job_id=job_id,
        status=output.status,
        confidence=output.confidence,
        data=output.result,
        review_flags=review_flags,
    )

View File

@@ -0,0 +1,15 @@
"""Liveness / readiness endpoints."""
from __future__ import annotations
from fastapi import APIRouter
from ocr_sprint import __version__
router = APIRouter(tags=["health"])
@router.get("/health")
async def health() -> dict[str, str]:
    """Liveness probe: report a fixed "ok" status and the package version.

    Deliberately cheap — it does not touch the OCR engine.
    """
    payload: dict[str, str] = {"status": "ok", "version": __version__}
    return payload

72
src/ocr_sprint/config.py Normal file
View File

@@ -0,0 +1,72 @@
"""Application settings loaded from environment / .env file."""
from __future__ import annotations
from functools import lru_cache
from pathlib import Path
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Typed runtime configuration for the service.

    Values are read from environment variables or a ``.env`` file (UTF-8);
    names are matched case-insensitively and unknown entries are ignored.
    Every field below has a default.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # --- App ---
    app_env: str = "local"
    app_host: str = "0.0.0.0"
    app_port: int = 8000
    app_log_level: str = "INFO"

    # --- Storage (Phase 1: local filesystem) ---
    storage_local_dir: Path = Path("./storage")

    # --- OCR ---
    ocr_lang: str = "latin"
    ocr_use_gpu: bool = False
    # Optional model directory overrides; None means none was configured.
    ocr_det_model_dir: str | None = None
    ocr_rec_model_dir: str | None = None
    ocr_cls_model_dir: str | None = None
    ocr_max_image_side: int = 2200

    # --- Preprocessing ---
    preprocess_target_dpi: int = 300
    preprocess_denoise: bool = True
    preprocess_deskew: bool = True
    preprocess_adaptive_threshold: bool = False

    # --- Confidence thresholds (Phase 5 routing); each constrained to [0, 1] ---
    confidence_auto_approve: float = Field(default=0.95, ge=0.0, le=1.0)
    confidence_needs_review: float = Field(default=0.85, ge=0.0, le=1.0)

    # --- LLM (Phase 5) ---
    llm_enabled: bool = False
    llm_provider: str = "ollama"
    llm_model: str = "qwen2.5:1.5b"
    llm_base_url: str = "http://localhost:11434"
    llm_timeout_s: int = 60

    # --- Async pipeline (Phase 4) ---
    queue_enabled: bool = False
    redis_url: str = "redis://localhost:6379/0"
    database_url: str = "postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint"
    minio_endpoint: str = "localhost:9000"
    minio_access_key: str = "minioadmin"
    minio_secret_key: str = "minioadmin"
    minio_bucket: str = "ocr-sprint"
    minio_secure: bool = False
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the process-wide settings object, building it on first use.

    The result is memoized by ``lru_cache``. As a side effect of the first
    call, the local storage directory is created (including parents) when it
    does not exist yet.
    """
    loaded = Settings()
    storage_root = loaded.storage_local_dir
    storage_root.mkdir(parents=True, exist_ok=True)
    return loaded

View File

View File

@@ -0,0 +1,66 @@
"""Master data for Polri ranks ('pangkat').
Used by the validation layer to:
1. Confirm that a recognized rank string is a real Polri rank.
2. Normalize variant spellings ("AKP." → "AKP", "Brig Pol" → "BRIGADIR") to a canonical form.
Source: Peraturan Kapolri tentang Pangkat (publicly available, 2024).
Update this file when ranks are reorganized.
"""
from __future__ import annotations
# Canonical abbreviation → tuple of accepted variants (matched case-insensitively
# by the reverse lookup built from this table). Keys are grouped by rank class,
# lowest class first.
PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
    # Tamtama
    "BHARADA": ("BHARADA", "BHRD"),
    "BHARATU": ("BHARATU", "BHRT"),
    "BHARAKA": ("BHARAKA", "BHRK"),
    # Ajun Brigadir Polisi Dua — previously missing, so a genuine rank was
    # reported as unknown.
    "ABRIPDA": ("ABRIPDA",),
    "ABRIPTU": ("ABRIPTU",),
    "ABRIP": ("ABRIP",),
    # NOTE(review): "ABRIPKA" does not look like a Polri rank; it is kept only
    # so inputs accepted before this change keep validating. Confirm against
    # the rank regulation and remove if it was a typo for "ABRIPDA".
    "ABRIPKA": ("ABRIPKA",),
    # Bintara
    "BRIPDA": ("BRIPDA",),
    "BRIPTU": ("BRIPTU",),
    "BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
    "BRIPKA": ("BRIPKA",),
    "AIPDA": ("AIPDA",),
    "AIPTU": ("AIPTU",),
    # Perwira Pertama
    "IPDA": ("IPDA",),
    "IPTU": ("IPTU",),
    "AKP": ("AKP",),
    # Perwira Menengah
    "KOMPOL": ("KOMPOL",),
    "AKBP": ("AKBP",),
    "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
    # Perwira Tinggi
    "BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
    "IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
    "KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
    "JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
}
# Inverted index over PANGKAT_VARIANTS: each accepted spelling, uppercased,
# points back at the canonical key it belongs to.
_VARIANT_TO_CANONICAL: dict[str, str] = dict(
    (alias.upper(), canonical_name)
    for canonical_name, aliases in PANGKAT_VARIANTS.items()
    for alias in aliases
)
def normalize_pangkat(raw: str | None) -> str | None:
    """Map a raw rank string onto its canonical Polri rank.

    The input is uppercased and its whitespace runs are collapsed to single
    spaces. The lookup is tried on that form first and then once more with
    trailing ``.,;:`` characters removed (e.g. "AKP.").

    Returns:
        The canonical rank, or None when the input is empty or unknown.
    """
    if not raw:
        return None
    collapsed = " ".join(raw.upper().split())
    for candidate in (collapsed, collapsed.rstrip(".,;:")):
        canonical = _VARIANT_TO_CANONICAL.get(candidate)
        if canonical is not None:
            return canonical
    return None
def is_valid_pangkat(raw: str | None) -> bool:
    """Tell whether ``raw`` normalizes to a known Polri rank."""
    canonical = normalize_pangkat(raw)
    return canonical is not None

42
src/ocr_sprint/main.py Normal file
View File

@@ -0,0 +1,42 @@
"""FastAPI entrypoint."""
from __future__ import annotations
from fastapi import FastAPI
from ocr_sprint import __version__
from ocr_sprint.api.errors import register_error_handlers
from ocr_sprint.api.routes import documents, health
from ocr_sprint.config import get_settings
from ocr_sprint.utils.logging import configure_logging
def create_app() -> FastAPI:
    """Build and wire a fresh FastAPI application.

    Configures logging from the settings, registers the error handlers and
    mounts the health and documents routers under ``/api/v1``. Being a factory,
    it lets tests create isolated app instances.
    """
    configure_logging(get_settings().app_log_level)

    application = FastAPI(
        title="OCR Sprint Service",
        version=__version__,
        description="OCR + structured extraction for Indonesian police 'surat sprint' documents.",
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
    )
    register_error_handlers(application)
    for route_module in (health, documents):
        application.include_router(route_module.router, prefix="/api/v1")
    return application


# Module-level ASGI application, addressable as "ocr_sprint.main:app".
app = create_app()
def run() -> None:
    """Console-script entrypoint (`ocr-sprint-api`): serve the app with uvicorn.

    Host and port come from the settings; auto-reload is disabled.
    """
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "ocr_sprint.main:app",
        host=settings.app_host,
        port=settings.app_port,
        reload=False,
    )

View File

@@ -0,0 +1 @@
"""OCR pipeline: ingest → preprocess → OCR → extract → validate."""

View File

@@ -0,0 +1,51 @@
"""Confidence scoring + routing decision.
The score is a weighted blend of:
- mean OCR confidence across all detected lines
- validation pass rate (1.0 if no review flags, decreases per flag)
This is intentionally simple for Phase 1; Phase 5 will add LLM logprob
contributions and per-field confidences.
"""
from __future__ import annotations
from ocr_sprint.config import get_settings
from ocr_sprint.schemas.document import DocumentStatus
from ocr_sprint.schemas.extraction import ReviewFlag
# Per-flag penalty applied to the validation component of the score.
# The validation component starts at 1.0 and loses the listed amount for each
# raised flag; compute_confidence() falls back to 0.05 for a flag missing here.
_FLAG_PENALTY: dict[ReviewFlag, float] = {
    ReviewFlag.LOW_OCR_CONFIDENCE: 0.10,
    ReviewFlag.MISSING_FIELD: 0.20,
    ReviewFlag.INVALID_NRP: 0.10,
    ReviewFlag.UNKNOWN_PANGKAT: 0.05,
    ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
    ReviewFlag.DATE_PARSE_FAILED: 0.10,
}
# Blend weights for the two components of the final score; they sum to 1.0.
OCR_WEIGHT = 0.6
VALIDATION_WEIGHT = 0.4
def compute_confidence(
    ocr_confidence: float,
    flags: list[ReviewFlag],
) -> float:
    """Combine OCR confidence and validation outcome into one score in [0, 1].

    The validation component starts at 1.0, loses the configured penalty for
    every flag (0.05 for a flag without an entry) and is floored at 0.0. The
    two components are then mixed with ``OCR_WEIGHT`` / ``VALIDATION_WEIGHT``
    and the result is clamped to the unit interval.
    """
    remaining = 1.0
    for raised in flags:
        remaining = remaining - _FLAG_PENALTY.get(raised, 0.05)
    remaining = max(0.0, remaining)

    mixed = OCR_WEIGHT * ocr_confidence + VALIDATION_WEIGHT * remaining
    capped = min(1.0, mixed)
    return max(0.0, capped)
def route(confidence: float) -> DocumentStatus:
    """Translate the final confidence score into the job's terminal status.

    Scores at or above the auto-approve threshold complete automatically.
    Everything else goes to human review, whether it sits above or below the
    needs-review threshold.
    """
    thresholds = get_settings()
    auto_approved = confidence >= thresholds.confidence_auto_approve
    return DocumentStatus.COMPLETED if auto_approved else DocumentStatus.NEEDS_REVIEW

View File

@@ -0,0 +1 @@
"""Information extraction layer (regex Phase 1, LLM Phase 5)."""

View File

@@ -0,0 +1,169 @@
"""Regex-based extraction for the deterministic header fields of a surat sprint.
Targets header fields whose layout is highly standardized across Polri units:
- Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
- Tanggal (date the sprint was issued)
- Satuan penerbit (issuing unit)
- Perihal
- Dasar (numbered list of legal/operational basis)
Personnel table extraction is intentionally NOT done here — that needs
PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
"""
from __future__ import annotations
import re
from datetime import date
from ocr_sprint.schemas.extraction import HeaderFields, Signatory
# ---------- regex patterns ----------
# Nomor sprint, tolerant of spacing and OCR noise.
# Examples it should match:
# "Sprin / 123 / IV / 2025 / Reskrim"
# "SPRIN/345/X/2024"
# "Nomor : Sprin/12/I/2025/Sat Intelkam"
_RE_NOMOR_SPRINT = re.compile(
r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}(?:[\s./-]*[\w .-]+?)?",
re.IGNORECASE,
)
# Indonesian month names.
_BULAN_MAP: dict[str, int] = {
"JANUARI": 1,
"FEBRUARI": 2,
"MARET": 3,
"APRIL": 4,
"MEI": 5,
"JUNI": 6,
"JULI": 7,
"AGUSTUS": 8,
"SEPTEMBER": 9,
"OKTOBER": 10,
"NOVEMBER": 11,
"DESEMBER": 12,
}
# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025"
_RE_TANGGAL_ID = re.compile(
r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP.keys()) + r")\s*[-./\s]\s*(\d{4})\b",
re.IGNORECASE,
)
# Satuan penerbit usually appears in the document letterhead, prefixed by
# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
# The match continues for up to 80 further characters on the same line.
# NOTE(review): the trailing "RESORT" alternative is already covered by
# "RESOR(?:T)?" and can never be selected.
_RE_SATUAN = re.compile(
    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
    r"[^\n]{0,80}",
    re.IGNORECASE,
)
# "Perihal : ...." up to end of line; group 1 is the text after the ":" or "-".
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
# Group 1 is the item number, group 2 the item text; "1)" is accepted as well.
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
# Group 1 is the label (NRP or NIP), group 2 the digits. The pattern itself
# accepts any run of 8 to 20 digits; exact-length checks happen elsewhere.
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
def find_nomor_sprint(text: str) -> str | None:
    """Locate the first nomor sprint in ``text``.

    Returns the matched span with every whitespace run collapsed to a single
    space, or None when the pattern does not occur.
    """
    found = _RE_NOMOR_SPRINT.search(text)
    return " ".join(found.group(0).split()) if found else None
def find_tanggal(text: str) -> date | None:
    """Return the issuance date of the sprint, if one can be parsed.

    A surat sprint usually carries several dates: citations inside the 'Dasar'
    section and the issuance date next to the signatory ('Tempat, DD Month
    YYYY'). Because the signatory block comes after the dasar items in the
    standard layout, the **last** Indonesian-format date in the text is used.

    Returns None when no date is found or the last one is not a valid calendar
    date (e.g. "31 Februari 2025").
    """
    final_match = None
    for final_match in _RE_TANGGAL_ID.finditer(text):
        pass  # only the last occurrence matters
    if final_match is None:
        return None

    day_text, month_name, year_text = final_match.groups()
    month_number = _BULAN_MAP.get(month_name.upper())
    if month_number is None:
        return None
    try:
        return date(int(year_text), month_number, int(day_text))
    except ValueError:
        return None
def find_satuan(text: str) -> str | None:
    """Return the first letterhead match (the issuing unit).

    Whitespace runs inside the match are collapsed to single spaces; None is
    returned when no letterhead line is recognised.
    """
    letterhead = _RE_SATUAN.search(text)
    return " ".join(letterhead.group(0).split()) if letterhead else None
def find_perihal(text: str) -> str | None:
    """Return the subject of the first 'Perihal: ...' line, stripped.

    The search runs line by line, so the result never extends past the line on
    which the label was found. Returns None when no such line exists.
    """
    per_line_hits = (_RE_PERIHAL.search(row) for row in text.splitlines())
    first_hit = next((hit for hit in per_line_hits if hit), None)
    return first_hit.group(1).strip() if first_hit else None
def find_dasar_list(text: str) -> list[str]:
    """Extract the numbered 'Dasar' items from the text.

    Heuristic: find the first line that starts with 'DASAR' (e.g. "DASAR :")
    and collect the numbered lines that follow. A numbered item sharing the
    header line ("DASAR : 1. UU ...") is collected too; previously it was
    silently dropped. Unnumbered lines after an item are treated as its
    continuation. Collection stops at the first blank line after at least one
    item, or at a line starting with another section keyword.

    Returns:
        The item texts in document order, without their numbers; an empty list
        when no dasar section is found.
    """
    items: list[str] = []
    in_dasar = False
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not in_dasar:
            header = re.match(r"DASAR\b\s*[:\-]?\s*", line, re.IGNORECASE)
            if header is None:
                continue
            in_dasar = True
            # Only a numbered item is taken from the header line; any other
            # trailing text (e.g. "HUKUM") is ignored, as before.
            inline_item = _RE_DASAR_ITEM.match(line[header.end():])
            if inline_item:
                items.append(inline_item.group(2).strip())
            continue
        if not line:
            if items:
                break
            continue
        upper = line.upper()
        if any(upper.startswith(term) for term in section_terminators):
            break
        m = _RE_DASAR_ITEM.match(line)
        if m:
            items.append(m.group(2).strip())
        elif items:
            # continuation of the previous dasar item
            items[-1] = (items[-1] + " " + line).strip()
    return items
def find_signatory(text: str) -> Signatory:
    """Best-effort signatory extraction based on the last NRP/NIP in the text.

    Only the ``nrp`` field is populated; an empty ``Signatory`` is returned
    when no NRP/NIP is present.
    """
    last_hit = None
    for last_hit in _RE_NRP.finditer(text):
        pass  # keep advancing; the final occurrence wins
    return Signatory() if last_hit is None else Signatory(nrp=last_hit.group(2))
def extract_header(text: str) -> HeaderFields:
    """Run every header-level regex extractor and bundle the results."""
    nomor = find_nomor_sprint(text)
    issued_on = find_tanggal(text)
    unit = find_satuan(text)
    subject = find_perihal(text)
    basis = find_dasar_list(text)
    return HeaderFields(
        nomor_sprint=nomor,
        tanggal=issued_on,
        satuan_penerbit=unit,
        perihal=subject,
        dasar=basis,
    )

View File

@@ -0,0 +1,64 @@
"""Cross-field validation, with structured review-flag output."""
from __future__ import annotations
import re
from ocr_sprint.data.master_pangkat import is_valid_pangkat
from ocr_sprint.schemas.extraction import (
ExtractionResult,
HeaderFields,
ReviewFlag,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
# A Polri NRP consists of exactly eight digits.
_RE_NRP_8 = re.compile(r"^\d{8}$")


def validate_nrp(nrp: str | None) -> bool:
    """Check that ``nrp`` is a well-formed Polri NRP.

    Surrounding whitespace is ignored. None and anything other than exactly
    eight digits is rejected.
    """
    return nrp is not None and _RE_NRP_8.match(nrp.strip()) is not None
def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
    """Return the review flags raised by one personnel row.

    Empty or missing fields are not flagged here; only a present-but-malformed
    NRP or a present-but-unknown rank is.
    """
    checks = (
        (entry.nrp, validate_nrp, ReviewFlag.INVALID_NRP),
        (entry.pangkat, is_valid_pangkat, ReviewFlag.UNKNOWN_PANGKAT),
    )
    return [flag for value, is_ok, flag in checks if value and not is_ok(value)]
def validate_header(header: HeaderFields) -> list[ReviewFlag]:
    """Flag a missing nomor sprint and a missing (unparsed) date."""
    required = (
        (header.nomor_sprint, ReviewFlag.MISSING_FIELD),
        (header.tanggal, ReviewFlag.DATE_PARSE_FAILED),
    )
    return [flag for value, flag in required if value is None]
def validate_extraction(
    result: ExtractionResult,
    expected_personnel_count: int | None = None,
) -> list[ReviewFlag]:
    """Validate a whole extraction and return its distinct review flags.

    Header flags come first, then the flags of each personnel row, then the
    count-mismatch flag when an expected count was given and differs from the
    number of extracted rows. Duplicates are removed, keeping the position of
    the first occurrence.
    """
    collected: list[ReviewFlag] = list(validate_header(result.header))
    for row in result.personel:
        collected += validate_personnel_entry(row)

    count_known = expected_personnel_count is not None
    if count_known and len(result.personel) != expected_personnel_count:
        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)

    # dict keys keep insertion order, which gives an order-preserving dedupe
    return list(dict.fromkeys(collected))

View File

@@ -0,0 +1,81 @@
"""Ingest layer: convert uploaded bytes (PDF/IMG) into a list of numpy images."""
from __future__ import annotations
import io
from dataclasses import dataclass
from typing import Any
import fitz # PyMuPDF
import numpy as np
from PIL import Image
from ocr_sprint.schemas.document import SourceKind
# Generic alias used across the pipeline. We don't constrain dtype/shape because
# OpenCV operations accept multiple dtypes and numpy generics are still rough.
NDArrayU8 = np.ndarray[Any, Any]
# Leading byte signatures ("magic numbers") used by detect_source_kind() to
# classify an upload as PDF or image. TIFF has a little-endian and a
# big-endian variant.
PDF_MAGIC = b"%PDF-"
PNG_MAGIC = b"\x89PNG\r\n\x1a\n"
JPEG_MAGIC = b"\xff\xd8\xff"
TIFF_MAGIC_LE = b"II*\x00"
TIFF_MAGIC_BE = b"MM\x00*"
@dataclass(frozen=True)
class IngestedPage:
    """One page worth of image data ready for preprocessing.

    Instances are immutable (frozen dataclass).
    """

    image: NDArrayU8  # HxWx3 BGR uint8 (OpenCV convention)
    page_index: int  # 0-based position of the page within its source document
def detect_source_kind(content: bytes) -> SourceKind:
    """Classify an upload by its leading magic bytes.

    Recognises PDF, PNG, JPEG and both TIFF byte orders; anything else is
    reported as ``SourceKind.UNKNOWN``.
    """
    if content.startswith(PDF_MAGIC):
        return SourceKind.PDF
    image_signatures = (PNG_MAGIC, JPEG_MAGIC, TIFF_MAGIC_LE, TIFF_MAGIC_BE)
    looks_like_image = any(content.startswith(sig) for sig in image_signatures)
    return SourceKind.IMAGE if looks_like_image else SourceKind.UNKNOWN
def _pil_to_bgr(img: Image.Image) -> NDArrayU8:
"""Convert PIL image to OpenCV BGR numpy array."""
if img.mode != "RGB":
img = img.convert("RGB")
arr = np.asarray(img, dtype=np.uint8)
# RGB to BGR
return arr[:, :, ::-1].copy()
def ingest_pdf(content: bytes, target_dpi: int = 300) -> list[IngestedPage]:
    """Rasterize every page of a PDF into a BGR image at ``target_dpi``.

    Rendering is done with PyMuPDF (no poppler dependency). The resolution is
    applied through a zoom matrix: the renderer's base resolution is 72 DPI,
    so the zoom factor is ``target_dpi / 72``.

    Returns:
        One ``IngestedPage`` per PDF page, in document order, 0-indexed.
    """
    scale = target_dpi / 72.0
    render_matrix = fitz.Matrix(scale, scale)

    def _render(pdf_page: Any) -> NDArrayU8:
        # Render without an alpha channel so the samples are plain RGB triples.
        pixmap = pdf_page.get_pixmap(matrix=render_matrix, alpha=False)
        rgb = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
        return _pil_to_bgr(rgb)

    with fitz.open(stream=content, filetype="pdf") as document:
        return [
            IngestedPage(image=_render(pdf_page), page_index=number)
            for number, pdf_page in enumerate(document)
        ]
def ingest_image(content: bytes) -> list[IngestedPage]:
    """Decode a single image into a one-element page list.

    The EXIF orientation tag is applied before conversion. Phone cameras
    commonly store rotated photos as un-rotated pixels plus an orientation
    tag, and without this step such photos would reach OCR sideways. The
    image handle is closed deterministically once the pixels are copied out.

    Only one page is produced, even for multi-frame files.

    Args:
        content: Raw bytes of an image file in a format PIL can decode.

    Returns:
        A list holding exactly one ``IngestedPage`` with ``page_index`` 0.
    """
    # Local import: ImageOps is only needed here.
    from PIL import ImageOps

    with Image.open(io.BytesIO(content)) as img:
        upright = ImageOps.exif_transpose(img)
        if upright is None:  # defensive: fall back to the decoded image
            upright = img
        return [IngestedPage(image=_pil_to_bgr(upright), page_index=0)]
def ingest(content: bytes, kind: SourceKind, target_dpi: int = 300) -> list[IngestedPage]:
    """Load ``content`` with the loader matching its declared source kind.

    ``target_dpi`` only affects PDFs.

    Raises:
        ValueError: If ``kind`` is neither PDF nor IMAGE.
    """
    if kind == SourceKind.IMAGE:
        return ingest_image(content)
    if kind == SourceKind.PDF:
        return ingest_pdf(content, target_dpi=target_dpi)
    raise ValueError(f"Unsupported source kind: {kind}")

View File

@@ -0,0 +1,106 @@
"""PaddleOCR wrapper.
PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
so we keep a process-global instance behind a lazy accessor.
The wrapper exposes a small, stable surface so the rest of the pipeline does
not depend directly on paddleocr's evolving API.
"""
from __future__ import annotations
from dataclasses import dataclass
from threading import Lock
from typing import TYPE_CHECKING
import numpy as np
from ocr_sprint.config import get_settings
from ocr_sprint.pipeline.ingest import NDArrayU8
from ocr_sprint.utils.logging import get_logger
if TYPE_CHECKING:
from paddleocr import PaddleOCR
_logger = get_logger(__name__)
# Guards creation of the shared engine in get_ocr().
_lock = Lock()
# Process-global PaddleOCR engine; stays None until get_ocr() builds it.
_instance: PaddleOCR | None = None
@dataclass(frozen=True)
class OCRLine:
    """One recognized line with its bounding polygon and confidence.

    Instances are immutable (frozen dataclass).
    """

    text: str  # recognized text of the line
    confidence: float  # recognition confidence reported by the engine
    box: tuple[tuple[float, float], ...]  # 4 (x, y) corner points
@dataclass(frozen=True)
class OCRPage:
    """OCR output for a single page: the recognized lines in engine order."""

    lines: list[OCRLine]

    @property
    def text(self) -> str:
        """Page text: the line texts joined by newlines, in the stored order."""
        texts = [entry.text for entry in self.lines]
        return "\n".join(texts)

    @property
    def mean_confidence(self) -> float:
        """Arithmetic mean of the line confidences; 0.0 for a page with no lines."""
        scores = [entry.confidence for entry in self.lines]
        return float(np.mean(scores)) if scores else 0.0
def _build_paddleocr() -> PaddleOCR:
    """Construct a PaddleOCR engine from the current settings.

    The import is deferred to this function so that importing the module does
    not pull in paddleocr. Model directory options are forwarded only when
    they are set.
    """
    from paddleocr import PaddleOCR

    cfg = get_settings()
    options: dict[str, object] = {
        "lang": cfg.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": cfg.ocr_use_gpu,
        "show_log": False,
    }
    model_dirs = {
        "det_model_dir": cfg.ocr_det_model_dir,
        "rec_model_dir": cfg.ocr_rec_model_dir,
        "cls_model_dir": cfg.ocr_cls_model_dir,
    }
    options.update({key: path for key, path in model_dirs.items() if path})
    _logger.info("paddleocr.init", lang=cfg.ocr_lang, use_gpu=cfg.ocr_use_gpu)
    return PaddleOCR(**options)
def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, creating it on first use.

    An already-built engine is returned without taking the lock. Otherwise the
    lock is acquired and the check repeated, so that concurrent first callers
    build the engine only once.
    """
    global _instance
    if _instance is not None:
        return _instance
    with _lock:
        if _instance is None:
            _instance = _build_paddleocr()
        return _instance
def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result.

    Entries whose box or (text, confidence) pair cannot be parsed are skipped.
    Previously only malformed boxes were skipped, while a malformed
    (text, confidence) pair raised and discarded the whole page.

    Returns:
        An ``OCRPage`` with one ``OCRLine`` per parseable entry, in the
        engine's output order; an empty page when nothing was detected.
    """
    engine = get_ocr()
    raw = engine.ocr(image, cls=True)
    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    if not raw or raw[0] is None:
        return OCRPage(lines=[])
    lines: list[OCRLine] = []
    for item in raw[0]:
        if not item or len(item) < 2:
            continue
        try:
            box_raw, text_conf = item[0], item[1]
            text, conf = text_conf[0], float(text_conf[1])
            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
        except (TypeError, ValueError, IndexError):
            # Malformed entry: drop this line, keep the rest of the page.
            continue
        lines.append(OCRLine(text=text, confidence=conf, box=box))
    return OCRPage(lines=lines)

View File

@@ -0,0 +1,103 @@
"""Synchronous pipeline orchestrator (Phase 1).
Wires the individual stages together:
bytes → ingest → preprocess → OCR → regex extract → validate → score
Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug
in PP-Structure for tables and an LLM extractor for variant fields.
"""
from __future__ import annotations
from dataclasses import dataclass
from ocr_sprint.config import get_settings
from ocr_sprint.pipeline.confidence import compute_confidence, route
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
from ocr_sprint.pipeline.extract.validators import validate_extraction
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag
from ocr_sprint.utils.logging import get_logger
_logger = get_logger(__name__)
# Below this OCR confidence we automatically flag for review.
# Compared against the mean of the per-page mean confidences in run_pipeline().
_OCR_CONFIDENCE_FLAG_THRESHOLD = 0.80
@dataclass
class PipelineOutput:
    """Bundle returned by the orchestrator."""

    source_kind: SourceKind  # detected type of the upload (PDF or image)
    status: DocumentStatus  # terminal status chosen by confidence routing
    confidence: float  # final blended confidence score
    result: ExtractionResult  # structured extraction, including review flags
def run_pipeline(content: bytes) -> PipelineOutput:
    """Run the synchronous OCR + extraction pipeline on raw upload bytes.

    Stages: sniff the file type, ingest pages, preprocess and OCR each page,
    extract header fields and signatory with regexes, validate, then score and
    route the document.

    Raises:
        ValueError: If the upload is neither a PDF nor a recognised image.
    """
    cfg = get_settings()

    source_kind = detect_source_kind(content)
    if source_kind == SourceKind.UNKNOWN:
        raise ValueError("Unsupported file type — only PDF and common image formats are accepted.")

    ingested = ingest(content, source_kind, target_dpi=cfg.preprocess_target_dpi)
    _logger.info("pipeline.ingested", source_kind=source_kind.value, pages=len(ingested))

    preprocess_cfg = PreprocessConfig(
        max_side=cfg.ocr_max_image_side,
        denoise=cfg.preprocess_denoise,
        deskew=cfg.preprocess_deskew,
        adaptive_threshold=cfg.preprocess_adaptive_threshold,
    )
    recognized: list[OCRPage] = [
        run_ocr(preprocess(ingested_page.image, preprocess_cfg)) for ingested_page in ingested
    ]

    document_text = "\n".join(ocr_page.text for ocr_page in recognized)
    # Unweighted mean of the per-page means; 0.0 when there are no pages.
    if recognized:
        ocr_mean = sum(ocr_page.mean_confidence for ocr_page in recognized) / len(recognized)
    else:
        ocr_mean = 0.0

    ocr_flags: list[ReviewFlag] = []
    if ocr_mean < _OCR_CONFIDENCE_FLAG_THRESHOLD:
        ocr_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)

    extraction = ExtractionResult(
        header=extract_header(document_text),
        personel=[],  # Phase 3 will populate from PP-Structure
        untuk=[],
        ttd=find_signatory(document_text),
        raw_text=document_text,
        confidence=ocr_mean,
        review_flags=list(ocr_flags),
    )

    # Validation flags come first; the OCR-confidence flag is appended after
    # them unless validation already reported it.
    merged_flags = validate_extraction(extraction)
    already_present = set(merged_flags)
    for candidate in ocr_flags:
        if candidate in already_present:
            continue
        merged_flags.append(candidate)
        already_present.add(candidate)
    extraction.review_flags = merged_flags

    final_score = compute_confidence(ocr_mean, merged_flags)
    extraction.confidence = final_score
    return PipelineOutput(
        source_kind=source_kind,
        status=route(final_score),
        confidence=final_score,
        result=extraction,
    )

View File

@@ -0,0 +1,108 @@
"""Image preprocessing for OCR.
Phase 1 implements the "always-on" steps that work for both clean PDF scans
and reasonable phone photos:
- resize to a reasonable max side (PaddleOCR runs faster on smaller inputs)
- convert to grayscale for analysis (kept as 3-channel BGR for paddle)
- denoise (Non-Local Means, gentle)
- deskew via Hough line angle estimate
- optional adaptive threshold for low-quality phone photos
Phase 2 will add document-corner detection + perspective transform + dewarping
for tilted phone shots; those live in `document_detect.py` (added later).
"""
from __future__ import annotations
from dataclasses import dataclass
import cv2
import numpy as np
from ocr_sprint.pipeline.ingest import NDArrayU8
@dataclass(frozen=True)
class PreprocessConfig:
    """Tunable knobs for the preprocessing pipeline.

    Instances are immutable (frozen dataclass).
    """

    max_side: int = 2200  # images whose longest side exceeds this are downscaled
    denoise: bool = True  # apply Non-Local Means denoising
    deskew: bool = True  # estimate the skew angle and rotate to correct it
    adaptive_threshold: bool = False  # binarize with a Gaussian adaptive threshold
def _resize_max_side(img: NDArrayU8, max_side: int) -> NDArrayU8:
h, w = img.shape[:2]
longest = max(h, w)
if longest <= max_side:
return img
scale = max_side / longest
new_w, new_h = round(w * scale), round(h * scale)
return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
def _estimate_skew_angle(gray: NDArrayU8) -> float:
    """Estimate the page skew in degrees from Canny edges and Hough lines.

    Each of the first 200 detected lines is converted to an angle relative to
    the horizontal. Only angles strictly within ±15° are kept, and their
    median is returned. Returns 0.0 when no usable line is found.
    """
    edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)
    detected = cv2.HoughLines(edge_map, 1, np.pi / 360, threshold=200)
    if detected is None or len(detected) == 0:
        return 0.0

    # entry[0] is (rho, theta); theta of 90° corresponds to a horizontal line.
    relative_angles = ((entry[0][1] * 180.0 / np.pi) - 90.0 for entry in detected[:200])
    near_horizontal: list[float] = [
        value for value in relative_angles if -15.0 < value < 15.0
    ]
    return float(np.median(near_horizontal)) if near_horizontal else 0.0
def _rotate(img: NDArrayU8, angle_deg: float) -> NDArrayU8:
if abs(angle_deg) < 0.1:
return img
h, w = img.shape[:2]
center = (w / 2, h / 2)
matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
return cv2.warpAffine(
img,
matrix,
(w, h),
flags=cv2.INTER_CUBIC,
borderMode=cv2.BORDER_REPLICATE,
)
def preprocess(img: NDArrayU8, cfg: PreprocessConfig | None = None) -> NDArrayU8:
    """Run preprocessing and return a clean BGR uint8 image suitable for OCR.

    Steps, in order: downscale to ``cfg.max_side``, optional deskew, optional
    Non-Local Means denoising, optional adaptive thresholding (converted back
    to 3-channel BGR).

    Args:
        img: Input image in BGR channel order.
        cfg: Preprocessing options; defaults to ``PreprocessConfig()``.

    Returns:
        The processed image.
    """
    if cfg is None:
        cfg = PreprocessConfig()
    out = _resize_max_side(img, cfg.max_side)
    if cfg.deskew:
        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
        angle = _estimate_skew_angle(gray)
        # Sign matters. The estimate is (Hough theta - 90°): the angle of the
        # text lines measured with the image y-axis pointing DOWN, so a
        # positive value means the lines run down to the right (clockwise
        # tilt on screen). cv2.getRotationMatrix2D treats positive angles as
        # counter-clockwise on screen, so rotating by +angle cancels the tilt.
        # Rotating by -angle, as before, doubled the skew.
        out = _rotate(out, angle)
    if cfg.denoise:
        out = cv2.fastNlMeansDenoisingColored(out, None, 5, 5, 7, 21)
    if cfg.adaptive_threshold:
        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
        binarized = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=31,
            C=15,
        )
        # Back to three channels so the output contract stays BGR.
        out = cv2.cvtColor(binarized, cv2.COLOR_GRAY2BGR)
    return out

0
src/ocr_sprint/py.typed Normal file
View File

View File

@@ -0,0 +1,27 @@
"""Pydantic schemas for input/output of the OCR Sprint service."""
from ocr_sprint.schemas.document import (
DocumentJob,
DocumentResponse,
DocumentStatus,
SourceKind,
)
from ocr_sprint.schemas.extraction import (
ExtractionResult,
HeaderFields,
ReviewFlag,
Signatory,
)
from ocr_sprint.schemas.personnel import PersonnelEntry
__all__ = [
"DocumentJob",
"DocumentResponse",
"DocumentStatus",
"ExtractionResult",
"HeaderFields",
"PersonnelEntry",
"ReviewFlag",
"Signatory",
"SourceKind",
]

View File

@@ -0,0 +1,57 @@
"""Job-level schemas (request, response, status)."""
from __future__ import annotations
from datetime import datetime
from enum import Enum
from typing import Any
from uuid import UUID, uuid4
from pydantic import BaseModel, ConfigDict, Field
from ocr_sprint.schemas.extraction import ExtractionResult
class SourceKind(str, Enum):
    """High-level type of the uploaded document.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    PDF = "pdf"
    IMAGE = "image"
    UNKNOWN = "unknown"  # also the default for a new DocumentJob
class DocumentStatus(str, Enum):
    """Lifecycle status of an OCR job.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    PENDING = "pending"  # default status of a new DocumentJob
    PROCESSING = "processing"
    COMPLETED = "completed"
    NEEDS_REVIEW = "needs_review"
    FAILED = "failed"
class DocumentJob(BaseModel):
    """Internal representation of a job (Phase 1 holds it in-memory)."""

    # Keep enum members as enum objects rather than replacing them by their values.
    model_config = ConfigDict(use_enum_values=False)
    job_id: UUID = Field(default_factory=uuid4)  # fresh random UUID per job
    source_kind: SourceKind = SourceKind.UNKNOWN
    filename: str  # the only field without a default
    status: DocumentStatus = DocumentStatus.PENDING
    # Both timestamps are naive datetimes holding UTC wall-clock time. Each
    # factory runs separately, so the two defaults may differ slightly.
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; moving
    # to timezone-aware values would change serialization — confirm first.
    created_at: datetime = Field(default_factory=lambda: datetime.utcnow())
    updated_at: datetime = Field(default_factory=lambda: datetime.utcnow())
    error: str | None = None
    result: ExtractionResult | None = None
    debug: dict[str, Any] = Field(default_factory=dict)  # free-form diagnostics
class DocumentResponse(BaseModel):
    """Public response payload returned by the documents API.

    Only ``job_id`` and ``status`` are required; every other field defaults to
    None or an empty list.
    """

    job_id: UUID
    status: DocumentStatus
    confidence: float | None = None
    data: ExtractionResult | None = None
    review_flags: list[str] = Field(default_factory=list)  # plain strings, not ReviewFlag members
    error: str | None = None

View File

@@ -0,0 +1,55 @@
"""Top-level extraction result schemas."""
from __future__ import annotations
from datetime import date
from enum import Enum
from pydantic import BaseModel, Field
from ocr_sprint.schemas.personnel import PersonnelEntry
class ReviewFlag(str, Enum):
    """Reasons a document was routed to human review.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    LOW_OCR_CONFIDENCE = "low_ocr_confidence"
    MISSING_FIELD = "missing_field"
    INVALID_NRP = "invalid_nrp"
    UNKNOWN_PANGKAT = "unknown_pangkat"
    PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch"
    DATE_PARSE_FAILED = "date_parse_failed"
class Signatory(BaseModel):
    """The official signing the sprint (Penandatangan).

    Every field is optional and defaults to None.
    """

    nama: str | None = None  # name
    pangkat: str | None = None  # rank
    nrp: str | None = None  # identification number (NRP/NIP) as text
    jabatan: str | None = None  # position
class HeaderFields(BaseModel):
    """Header fields parsed from the top portion of a sprint.

    Scalar fields default to None when not found; ``dasar`` defaults to an
    empty list.
    """

    nomor_sprint: str | None = Field(None, description="e.g. Sprin/123/IV/2025/Reskrim.")
    tanggal: date | None = Field(None, description="Date the sprint was issued.")
    satuan_penerbit: str | None = Field(None, description="Issuing unit, e.g. 'Polres Bandung'.")
    perihal: str | None = None  # subject line
    dasar: list[str] = Field(default_factory=list, description="List of legal/operational basis.")
class ExtractionResult(BaseModel):
    """Full structured payload extracted from a single sprint document.

    Every field has a default, so an empty result can be built with no
    arguments.
    """

    header: HeaderFields = Field(default_factory=HeaderFields)
    personel: list[PersonnelEntry] = Field(default_factory=list)  # personnel rows
    untuk: list[str] = Field(
        default_factory=list,
        description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.",
    )
    ttd: Signatory = Field(default_factory=Signatory)  # signatory block
    raw_text: str = Field(default="", description="Concatenated OCR text for debugging.")
    confidence: float = Field(0.0, ge=0.0, le=1.0)  # constrained to [0.0, 1.0]
    review_flags: list[ReviewFlag] = Field(default_factory=list)

View File

@@ -0,0 +1,18 @@
"""Schema for a single personnel row in a surat sprint."""
from __future__ import annotations
from pydantic import BaseModel, Field
class PersonnelEntry(BaseModel):
    """One row from the personnel table.

    All descriptive fields are optional and default to None; ``confidence``
    defaults to 0.0 and is constrained to [0.0, 1.0].
    """

    no: int | None = Field(None, description="Row number as printed on the document.")
    pangkat: str | None = Field(None, description="Rank, normalized when possible.")
    nrp: str | None = Field(None, description="8-digit Polri NRP, or blank if not detected.")
    nama: str | None = Field(None, description="Full name.")
    jabatan_dinas: str | None = Field(None, description="Permanent post (jabatan dalam dinas).")
    jabatan_sprint: str | None = Field(None, description="Role within this sprint.")
    keterangan: str | None = None  # free-text remarks
    confidence: float = Field(0.0, ge=0.0, le=1.0)

View File

View File

@@ -0,0 +1,45 @@
"""Structured logging setup using structlog."""
from __future__ import annotations
import logging
import sys
from typing import Any
import structlog
def configure_logging(level: str = "INFO") -> None:
    """Configure stdlib logging and structlog to write to stdout.

    Records are rendered by structlog's ``ConsoleRenderer`` without colours,
    with an ISO-8601 UTC timestamp and the log level. An unrecognised
    ``level`` name falls back to INFO.
    """
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(format="%(message)s", stream=sys.stdout, level=numeric_level)

    processor_chain = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.dev.ConsoleRenderer(colors=False),
    ]
    structlog.configure(
        processors=processor_chain,
        wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
def get_logger(name: str | None = None, **initial_values: Any) -> Any:
    """Return a structlog logger, optionally pre-bound with context values.

    The return type is ``Any`` because structlog's BoundLogger generic typing
    is too restrictive in practice; callers treat the result as a duck-typed
    logger.
    """
    base_logger = structlog.get_logger(name)
    return base_logger.bind(**initial_values) if initial_values else base_logger

0
tests/__init__.py Normal file
View File

43
tests/conftest.py Normal file
View File

@@ -0,0 +1,43 @@
"""Shared pytest fixtures."""
from __future__ import annotations
import numpy as np
import pytest
@pytest.fixture
def blank_bgr_image() -> np.ndarray:
    """An all-white uint8 BGR image, 600 rows by 800 columns, for smoke tests."""
    shape = (600, 800, 3)
    return np.full(shape, 255, dtype=np.uint8)
@pytest.fixture
def sample_sprint_text() -> str:
    """Realistic-but-synthetic OCR text for regex extractor tests.

    Contains a three-line letterhead, a nomor sprint, three numbered dasar
    items (the last citing 20 April 2025), a perihal line, the issuance date
    (21 April 2025) and a signatory line with an 8-digit NRP.
    """
    return (
        "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
        "DAERAH JAWA BARAT\n"
        "RESOR BANDUNG\n"
        "\n"
        "SURAT PERINTAH\n"
        "Nomor : Sprin/123/IV/2025/Reskrim\n"
        "\n"
        "DASAR :\n"
        "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n"
        "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n"
        "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n"
        "\n"
        "DIPERINTAHKAN :\n"
        "Kepada : 1. Nama anggota tersebut di bawah ini.\n"
        "\n"
        "Untuk : Melaksanakan penyelidikan tindak pidana.\n"
        "\n"
        "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n"
        "\n"
        "Bandung, 21 April 2025\n"
        "KEPALA KEPOLISIAN RESOR BANDUNG\n"
        "\n"
        "Drs. BUDI SANTOSO\n"
        "AKBP NRP 12345678\n"
    )

0
tests/unit/__init__.py Normal file
View File

87
tests/unit/test_api.py Normal file
View File

@@ -0,0 +1,87 @@
"""API tests with the OCR engine mocked.
These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
so we can exercise the FastAPI surface without the heavy ML init cost.
"""
from __future__ import annotations
from datetime import date
import pytest
from fastapi.testclient import TestClient
from ocr_sprint.main import create_app
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import PipelineOutput
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
@pytest.fixture
def client() -> TestClient:
    """A test client bound to a freshly created application instance."""
    application = create_app()
    return TestClient(application)
def test_health_endpoint(client: TestClient) -> None:
    """The health route answers 200 with status "ok"."""
    reply = client.get("/api/v1/health")
    assert reply.status_code == 200
    payload = reply.json()
    assert payload["status"] == "ok"
def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """Uploading a zero-byte file is rejected with HTTP 400."""
    empty_upload = {"file": ("empty.pdf", b"", "application/pdf")}
    reply = client.post("/api/v1/documents", files=empty_upload)
    assert reply.status_code == 400
def test_documents_rejects_unknown_format(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Bytes with no recognised file signature are rejected with HTTP 400."""
    unknown_upload = {
        "file": ("x.bin", b"random garbage bytes here", "application/octet-stream"),
    }
    reply = client.post("/api/v1/documents", files=unknown_upload)
    assert reply.status_code == 400
def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The upload route serialises whatever the stubbed pipeline returns."""
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    extraction = ExtractionResult(header=header, confidence=0.97)
    canned = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.97,
        result=extraction,
    )

    def _stub_pipeline(_content: bytes) -> PipelineOutput:
        # Ignores the uploaded bytes and always hands back the canned output.
        return canned

    # Replace run_pipeline on the orchestrator module and on the routes
    # module, which holds its own imported reference to the symbol.
    monkeypatch.setattr(orch_module, "run_pipeline", _stub_pipeline)
    from ocr_sprint.api.routes import documents as docs_module

    monkeypatch.setattr(docs_module, "run_pipeline", _stub_pipeline)

    pdf_part = ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")
    reply = client.post("/api/v1/documents", files={"file": pdf_part})
    assert reply.status_code == 200

    body = reply.json()
    assert body["status"] == "completed"
    assert body["confidence"] == 0.97
    assert body["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"

View File

@@ -0,0 +1,46 @@
"""Tests for confidence scoring + routing."""
from __future__ import annotations
from ocr_sprint.pipeline.confidence import compute_confidence, route
from ocr_sprint.schemas.document import DocumentStatus
from ocr_sprint.schemas.extraction import ReviewFlag
def test_no_flags_returns_blend_of_ocr_only() -> None:
    """With no review flags, an OCR confidence of 0.9 scores 0.94."""
    no_flags: list = []
    blended = compute_confidence(0.9, no_flags)
    # OCR weight 0.6 * 0.9 + validation 0.4 * 1.0 = 0.94
    deviation = abs(blended - 0.94)
    assert deviation < 1e-6
def test_flags_reduce_score() -> None:
    """Adding a MISSING_FIELD flag yields a strictly lower score."""
    ocr_confidence = 0.9
    unflagged = compute_confidence(ocr_confidence, [])
    flagged = compute_confidence(ocr_confidence, [ReviewFlag.MISSING_FIELD])
    assert flagged < unflagged
def test_score_is_clamped() -> None:
    """Zero OCR confidence plus six distinct flags still scores within [0, 1]."""
    every_penalty = [
        ReviewFlag.MISSING_FIELD,
        ReviewFlag.LOW_OCR_CONFIDENCE,
        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
        ReviewFlag.INVALID_NRP,
        ReviewFlag.UNKNOWN_PANGKAT,
        ReviewFlag.DATE_PARSE_FAILED,
    ]
    worst_case = compute_confidence(0.0, every_penalty)
    assert 0.0 <= worst_case <= 1.0
def test_route_high_confidence() -> None:
    """A score of 0.97 is routed to COMPLETED."""
    outcome = route(0.97)
    assert outcome == DocumentStatus.COMPLETED
def test_route_mid_goes_to_review() -> None:
    """A score of 0.88 is routed to NEEDS_REVIEW."""
    outcome = route(0.88)
    assert outcome == DocumentStatus.NEEDS_REVIEW
def test_route_low_goes_to_review() -> None:
    """A score of 0.40 is routed to NEEDS_REVIEW as well."""
    outcome = route(0.40)
    assert outcome == DocumentStatus.NEEDS_REVIEW

50
tests/unit/test_ingest.py Normal file
View File

@@ -0,0 +1,50 @@
"""Tests for source detection + image ingest."""
from __future__ import annotations
import io
import numpy as np
from PIL import Image
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
from ocr_sprint.schemas.document import SourceKind
def _png_bytes() -> bytes:
    """Encode a white 100x80 RGB image as PNG and return the raw bytes."""
    canvas = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        canvas.save(sink, format="PNG")
        return sink.getvalue()
def _jpeg_bytes() -> bytes:
    """Encode a white 100x80 RGB image as JPEG and return the raw bytes."""
    canvas = Image.new("RGB", (100, 80), color="white")
    with io.BytesIO() as sink:
        canvas.save(sink, format="JPEG")
        return sink.getvalue()
def test_detect_pdf() -> None:
    """Bytes starting with the ``%PDF-`` marker are classified as PDF."""
    detected = detect_source_kind(b"%PDF-1.7\n...")
    assert detected == SourceKind.PDF
def test_detect_png() -> None:
    """A PNG-encoded payload is classified as IMAGE."""
    payload = _png_bytes()
    detected = detect_source_kind(payload)
    assert detected == SourceKind.IMAGE
def test_detect_jpeg() -> None:
    """A JPEG-encoded payload is classified as IMAGE."""
    payload = _jpeg_bytes()
    detected = detect_source_kind(payload)
    assert detected == SourceKind.IMAGE
def test_detect_unknown() -> None:
    """Bytes matching no known signature are classified as UNKNOWN."""
    detected = detect_source_kind(b"garbage")
    assert detected == SourceKind.UNKNOWN
def test_ingest_image_returns_one_page() -> None:
    """Ingesting a 100x80 PNG yields one page holding an (80, 100, 3) uint8 array."""
    pages = ingest_image(_png_bytes())
    assert len(pages) == 1

    only_page = pages[0]
    assert only_page.page_index == 0

    pixels = only_page.image
    assert isinstance(pixels, np.ndarray)
    assert pixels.dtype == np.uint8
    # Shape is (height, width, channels) for the 100-wide, 80-tall source image.
    assert pixels.shape == (80, 100, 3)

View File

@@ -0,0 +1,37 @@
"""Smoke tests for the preprocessing pipeline."""
from __future__ import annotations
import numpy as np
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
    """Default preprocessing returns a three-channel uint8 array."""
    processed = preprocess(blank_bgr_image)
    assert processed.dtype == np.uint8
    assert processed.ndim == 3
    channel_count = processed.shape[2]
    assert channel_count == 3
def test_preprocess_resizes_to_max_side() -> None:
    """A 4000x3000 image is scaled so its longest side equals ``max_side``."""
    oversized = np.full((4000, 3000, 3), 255, dtype=np.uint8)
    settings = PreprocessConfig(max_side=1000, denoise=False, deskew=False)
    processed = preprocess(oversized, settings)
    longest_side = max(processed.shape[:2])
    assert longest_side == 1000
def test_preprocess_does_not_upscale_small_images() -> None:
    """An image already smaller than ``max_side`` keeps its height and width."""
    undersized = np.full((400, 300, 3), 255, dtype=np.uint8)
    settings = PreprocessConfig(max_side=2200, denoise=False, deskew=False)
    processed = preprocess(undersized, settings)
    height_width = processed.shape[:2]
    assert height_width == (400, 300)
def test_adaptive_threshold_produces_binary_image() -> None:
    """With adaptive thresholding enabled, the output holds only 0 and 255.

    The noise input is drawn from a fixed-seed generator so that a failing
    run can be reproduced with exactly the same pixels.
    """
    rng = np.random.default_rng(0)
    img = rng.integers(0, 256, (200, 200, 3), dtype=np.uint8)
    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)
    out = preprocess(img, cfg)
    # adaptive threshold should leave only 0s and 255s
    unique = np.unique(out)
    assert set(unique.tolist()).issubset({0, 255})

View File

@@ -0,0 +1,112 @@
"""Tests for regex-based header extraction."""
from __future__ import annotations
from datetime import date
import pytest
from ocr_sprint.pipeline.extract.regex_rules import (
extract_header,
find_dasar_list,
find_nomor_sprint,
find_perihal,
find_satuan,
find_signatory,
find_tanggal,
)
class TestNomorSprint:
    """Checks for ``find_nomor_sprint``."""

    @pytest.mark.parametrize(
        ("text", "needle"),
        [
            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
            ("Nomor: SPRIN / 7 / I / 2024", "7"),
            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
        ],
    )
    def test_finds_nomor(self, text: str, needle: str) -> None:
        """The match contains the running number and starts with "SPRIN"."""
        found = find_nomor_sprint(text)
        assert found is not None
        assert needle in found
        uppercased = found.upper()
        assert uppercased.startswith("SPRIN")

    def test_returns_none_when_absent(self) -> None:
        """Text without a sprint number gives ``None``."""
        found = find_nomor_sprint("no nomor here, just some text")
        assert found is None
class TestTanggal:
    """Checks for ``find_tanggal``."""

    def test_basic_date(self) -> None:
        """A "day month-name year" date after a place name is parsed."""
        parsed = find_tanggal("Bandung, 21 April 2025")
        assert parsed == date(2025, 4, 21)

    def test_with_dashes(self) -> None:
        """Dash-separated date parts are parsed as well."""
        parsed = find_tanggal("Tanggal 1 - Desember - 2024")
        assert parsed == date(2024, 12, 1)

    def test_invalid_month(self) -> None:
        """An unrecognised month name gives ``None``."""
        parsed = find_tanggal("21 Foo 2025")
        assert parsed is None

    def test_no_date_present(self) -> None:
        """Text without any date gives ``None``."""
        parsed = find_tanggal("nothing here")
        assert parsed is None
class TestSatuan:
    """Checks for ``find_satuan``."""

    def test_polres(self) -> None:
        """A "KEPOLISIAN RESOR <city>" line is found, ignoring later lines."""
        unit = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
        assert unit is not None
        uppercased = unit.upper()
        assert "RESOR BANDUNG" in uppercased

    def test_polri_pusat(self) -> None:
        """The national-level unit name yields some match."""
        unit = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert unit is not None
class TestPerihal:
    """Checks for ``find_perihal``."""

    def test_extracts_perihal_line(self) -> None:
        """Only the value after "PERIHAL :" on that line is returned."""
        document = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
        subject = find_perihal(document)
        assert subject == "Pelaksanaan penyelidikan kasus."

    def test_returns_none_when_absent(self) -> None:
        """Text with no PERIHAL label gives ``None``."""
        subject = find_perihal("no perihal field")
        assert subject is None
class TestDasar:
    """Checks for ``find_dasar_list``."""

    def test_numbered_list(self) -> None:
        """Two numbered items under DASAR are returned without their numbering."""
        document = (
            "DASAR :\n"
            "1. UU No 2 Tahun 2002.\n"
            "2. Peraturan Kapolri Nomor 6.\n"
            "\n"
            "DIPERINTAHKAN :\n"
            "Kepada : ...\n"
        )
        entries = find_dasar_list(document)
        assert len(entries) == 2
        first_entry = entries[0]
        assert first_entry.startswith("UU No 2")
        second_entry = entries[1]
        assert second_entry.startswith("Peraturan Kapolri")

    def test_empty_when_section_missing(self) -> None:
        """Text without a DASAR section gives an empty list."""
        entries = find_dasar_list("no dasar section")
        assert entries == []
class TestSignatory:
    """Checks for ``find_signatory``."""

    def test_extracts_last_nrp(self) -> None:
        """With several 8-digit numbers present, the final NRP is the one reported."""
        document = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
        signatory = find_signatory(document)
        assert signatory.nrp == "11223344"

    def test_no_nrp(self) -> None:
        """Text with no NRP digits leaves ``nrp`` as ``None``."""
        signatory = find_signatory("no NRP here")
        assert signatory.nrp is None
class TestExtractHeader:
    """End-to-end check of ``extract_header`` on the shared sample text."""

    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
        """Every header field is populated from the synthetic document."""
        extracted = extract_header(sample_sprint_text)

        nomor = extracted.nomor_sprint
        assert nomor is not None
        assert "Sprin" in nomor

        assert extracted.tanggal == date(2025, 4, 21)

        satuan = extracted.satuan_penerbit
        assert satuan is not None
        assert "KEPOLISIAN" in satuan.upper()

        perihal = extracted.perihal
        assert perihal is not None
        assert "penyelidikan" in perihal.lower()

        assert len(extracted.dasar) == 3

View File

@@ -0,0 +1,108 @@
"""Tests for the validation layer."""
from __future__ import annotations
from datetime import date
import pytest
from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
from ocr_sprint.pipeline.extract.validators import (
validate_extraction,
validate_header,
validate_nrp,
validate_personnel_entry,
)
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
from ocr_sprint.schemas.personnel import PersonnelEntry
class TestNRP:
    """Checks for ``validate_nrp``."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Eight-digit numeric strings are accepted."""
        verdict = validate_nrp(nrp)
        assert verdict is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Wrong lengths, non-digits, the empty string and ``None`` are rejected."""
        verdict = validate_nrp(nrp)
        assert verdict is False
class TestPangkat:
    """Checks for ``normalize_pangkat`` and ``is_valid_pangkat``."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Case, trailing-dot and spelling variants map to the expected rank string."""
        canonical = normalize_pangkat(input_str)
        assert canonical == expected

    def test_unknown_returns_none(self) -> None:
        """A rank the tests treat as unknown normalises to ``None`` and is invalid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False
class TestPersonnelValidator:
    """Checks for ``validate_personnel_entry``."""

    def test_clean_entry_no_flags(self) -> None:
        """A known rank with an 8-digit NRP raises no flags."""
        person = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        raised = validate_personnel_entry(person)
        assert raised == []

    def test_invalid_nrp_flagged(self) -> None:
        """A three-digit NRP is flagged as INVALID_NRP."""
        person = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        raised = validate_personnel_entry(person)
        assert ReviewFlag.INVALID_NRP in raised

    def test_unknown_pangkat_flagged(self) -> None:
        """An unrecognised rank is flagged as UNKNOWN_PANGKAT."""
        person = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        raised = validate_personnel_entry(person)
        assert ReviewFlag.UNKNOWN_PANGKAT in raised
class TestHeaderValidator:
    """Checks for ``validate_header``."""

    def test_complete_header_no_flags(self) -> None:
        """A header with number, date and issuing unit raises no flags."""
        complete = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
            satuan_penerbit="POLRES BANDUNG",
        )
        raised = validate_header(complete)
        assert raised == []

    def test_missing_nomor_flagged(self) -> None:
        """A header without ``nomor_sprint`` is flagged MISSING_FIELD."""
        without_nomor = HeaderFields(tanggal=date(2025, 1, 1))
        raised = validate_header(without_nomor)
        assert ReviewFlag.MISSING_FIELD in raised

    def test_missing_date_flagged(self) -> None:
        """A header without ``tanggal`` is flagged DATE_PARSE_FAILED."""
        without_date = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        raised = validate_header(without_date)
        assert ReviewFlag.DATE_PARSE_FAILED in raised
class TestFullValidation:
    """Checks for ``validate_extraction`` over a whole extraction result."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted person against an expected two raises the mismatch flag."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        roster = [PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A")]
        extraction = ExtractionResult(header=header, personel=roster)

        raised = validate_extraction(extraction, expected_personnel_count=2)
        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in raised

    def test_flags_are_deduped(self) -> None:
        """Two entries with short NRPs and ranks "X"/"Y" yield no repeated flags."""
        # The empty header is missing both nomor and tanggal.
        empty_header = HeaderFields()
        roster = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        extraction = ExtractionResult(header=empty_header, personel=roster)

        raised = validate_extraction(extraction)
        # each flag type should appear at most once
        distinct = set(raised)
        assert len(raised) == len(distinct)