Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
43
.env.example
Normal file
43
.env.example
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# ==== App ====
|
||||||
|
APP_ENV=local # local | dev | staging | prod
|
||||||
|
APP_HOST=0.0.0.0
|
||||||
|
APP_PORT=8000
|
||||||
|
APP_LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# ==== Storage (Phase 1: local filesystem) ====
|
||||||
|
STORAGE_LOCAL_DIR=./storage
|
||||||
|
|
||||||
|
# ==== OCR ====
|
||||||
|
OCR_LANG=latin # PaddleOCR lang code; "latin" works well for Bahasa Indonesia
|
||||||
|
OCR_USE_GPU=false # set true if running on a GPU host
|
||||||
|
OCR_DET_MODEL_DIR= # leave empty to use PaddleOCR defaults
|
||||||
|
OCR_REC_MODEL_DIR=
|
||||||
|
OCR_CLS_MODEL_DIR=
|
||||||
|
OCR_MAX_IMAGE_SIDE=2200 # downscale longest side before OCR
|
||||||
|
|
||||||
|
# ==== Preprocessing ====
|
||||||
|
PREPROCESS_TARGET_DPI=300
|
||||||
|
PREPROCESS_DENOISE=true
|
||||||
|
PREPROCESS_DESKEW=true
|
||||||
|
PREPROCESS_ADAPTIVE_THRESHOLD=false # turn on for low-quality phone photos
|
||||||
|
|
||||||
|
# ==== Confidence / routing thresholds (pipeline stage 5; already active in Phase 1) ====
|
||||||
|
CONFIDENCE_AUTO_APPROVE=0.95
|
||||||
|
CONFIDENCE_NEEDS_REVIEW=0.85
|
||||||
|
|
||||||
|
# ==== LLM (Phase 5, optional) ====
|
||||||
|
LLM_ENABLED=false
|
||||||
|
LLM_PROVIDER=ollama
|
||||||
|
LLM_MODEL=qwen2.5:1.5b # CPU-friendly default
|
||||||
|
LLM_BASE_URL=http://localhost:11434
|
||||||
|
LLM_TIMEOUT_S=60
|
||||||
|
|
||||||
|
# ==== Async pipeline (Phase 4, optional) ====
|
||||||
|
QUEUE_ENABLED=false
|
||||||
|
REDIS_URL=redis://localhost:6379/0
|
||||||
|
DATABASE_URL=postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint
|
||||||
|
MINIO_ENDPOINT=localhost:9000
|
||||||
|
MINIO_ACCESS_KEY=minioadmin
|
||||||
|
MINIO_SECRET_KEY=minioadmin
|
||||||
|
MINIO_BUCKET=ocr-sprint
|
||||||
|
MINIO_SECURE=false
|
||||||
70
.gitignore
vendored
Normal file
70
.gitignore
vendored
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
dist/
|
||||||
|
*.egg-info/
|
||||||
|
*.egg
|
||||||
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
htmlcov/
|
||||||
|
coverage.xml
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
# Environment / secrets
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
|
||||||
|
# Local data & artifacts
|
||||||
|
samples/*.pdf
|
||||||
|
samples/*.PDF
|
||||||
|
samples/*.jpg
|
||||||
|
samples/*.JPG
|
||||||
|
samples/*.jpeg
|
||||||
|
samples/*.png
|
||||||
|
samples/*.PNG
|
||||||
|
samples/*.tif
|
||||||
|
samples/*.tiff
|
||||||
|
!samples/README.md
|
||||||
|
data/local/
|
||||||
|
storage/
|
||||||
|
*.db
|
||||||
|
*.sqlite
|
||||||
|
*.sqlite3
|
||||||
|
|
||||||
|
# OCR / model caches
|
||||||
|
.paddleocr/
|
||||||
|
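# NOTE: the per-user cache at ~/.paddleocr lives outside the repo; gitignore does not expand "~", so no pattern is needed for it.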
|
||||||
|
models/downloaded/
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
logs/
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
.docker/
|
||||||
|
|
||||||
|
# Misc
|
||||||
|
*.bak
|
||||||
|
*.tmp
|
||||||
19
.pre-commit-config.yaml
Normal file
19
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
repos:
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: v4.6.0
|
||||||
|
hooks:
|
||||||
|
- id: trailing-whitespace
|
||||||
|
- id: end-of-file-fixer
|
||||||
|
- id: check-yaml
|
||||||
|
- id: check-toml
|
||||||
|
- id: check-added-large-files
|
||||||
|
args: ["--maxkb=1024"]
|
||||||
|
- id: check-merge-conflict
|
||||||
|
- id: detect-private-key
|
||||||
|
|
||||||
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
|
rev: v0.6.9
|
||||||
|
hooks:
|
||||||
|
- id: ruff
|
||||||
|
args: ["--fix"]
|
||||||
|
- id: ruff-format
|
||||||
51
Dockerfile
Normal file
51
Dockerfile
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
# syntax=docker/dockerfile:1.6
|
||||||
|
# CPU-only image for the OCR Sprint API.
|
||||||
|
# PaddleOCR + PyMuPDF + OpenCV-headless work on plain Debian without poppler.
|
||||||
|
FROM python:3.11-slim AS base
|
||||||
|
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||||
|
PIP_NO_CACHE_DIR=1 \
|
||||||
|
DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
# System deps for OpenCV, libmagic, PaddlePaddle, and image format support.
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
libgl1 \
|
||||||
|
libglib2.0-0 \
|
||||||
|
libsm6 \
|
||||||
|
libxext6 \
|
||||||
|
libxrender1 \
|
||||||
|
libgomp1 \
|
||||||
|
libmagic1 \
|
||||||
|
ca-certificates \
|
||||||
|
curl \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# ----- builder layer (installs the package + deps, incl. dev extras; src/ is copied first, so any source edit invalidates this layer) -----
|
||||||
|
FROM base AS builder
|
||||||
|
COPY pyproject.toml README.md ./
|
||||||
|
COPY src/ ./src/
|
||||||
|
RUN pip install --upgrade pip && pip install ".[dev]"
|
||||||
|
|
||||||
|
# ----- runtime layer -----
|
||||||
|
FROM base AS runtime
|
||||||
|
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
|
||||||
|
COPY --from=builder /usr/local/bin /usr/local/bin
|
||||||
|
COPY pyproject.toml README.md ./
|
||||||
|
COPY src/ ./src/
|
||||||
|
|
||||||
|
# Pre-create cache dirs so PaddleOCR can write models on first run.
|
||||||
|
RUN mkdir -p /home/app/.paddleocr /app/storage \
|
||||||
|
&& useradd --create-home --uid 1000 app \
|
||||||
|
&& chown -R app:app /home/app /app
|
||||||
|
|
||||||
|
USER app
|
||||||
|
EXPOSE 8000
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
|
||||||
|
CMD curl -fsS http://localhost:8000/api/v1/health || exit 1
|
||||||
|
|
||||||
|
CMD ["uvicorn", "ocr_sprint.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
52
Makefile
Normal file
52
Makefile
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
|
||||||
|
|
||||||
|
help:
|
||||||
|
@echo "Targets:"
|
||||||
|
@echo " install - install runtime + dev deps in current env"
|
||||||
|
@echo " dev - run FastAPI app with autoreload"
|
||||||
|
@echo " fmt - format code with ruff"
|
||||||
|
@echo " lint - lint with ruff"
|
||||||
|
@echo " typecheck - run mypy"
|
||||||
|
@echo " test - run pytest"
|
||||||
|
@echo " test-cov - run pytest with coverage"
|
||||||
|
@echo " docker-build - build api image"
|
||||||
|
@echo " docker-up - start docker-compose stack"
|
||||||
|
@echo " docker-down - stop docker-compose stack"
|
||||||
|
|
||||||
|
install:
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -e ".[dev]"
|
||||||
|
pre-commit install || true
|
||||||
|
|
||||||
|
dev:
|
||||||
|
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
|
||||||
|
|
||||||
|
fmt:
|
||||||
|
ruff format src tests
|
||||||
|
ruff check --fix src tests
|
||||||
|
|
||||||
|
lint:
|
||||||
|
ruff check src tests
|
||||||
|
ruff format --check src tests
|
||||||
|
|
||||||
|
typecheck:
|
||||||
|
mypy src
|
||||||
|
|
||||||
|
test:
|
||||||
|
pytest
|
||||||
|
|
||||||
|
test-cov:
|
||||||
|
pytest --cov --cov-report=term-missing
|
||||||
|
|
||||||
|
docker-build:
|
||||||
|
docker compose build
|
||||||
|
|
||||||
|
docker-up:
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
docker-down:
|
||||||
|
docker compose down
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov build dist *.egg-info
|
||||||
|
find . -type d -name __pycache__ -exec rm -rf {} +
|
||||||
123
README.md
Normal file
123
README.md
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
# OCR Sprint Service
|
||||||
|
|
||||||
|
OCR + structured extraction service for Indonesian police "surat sprint" (surat perintah) documents. Built around **FastAPI + PaddleOCR + hybrid extraction (regex → local LLM → validation)** with **on-premise** deployment as a hard requirement.
|
||||||
|
|
||||||
|
> **Status:** Phase 1 MVP — synchronous PDF/image OCR with regex header extraction, validation, and confidence scoring. Phases 2–6 (document detection, table extraction, async pipeline, LLM extraction, HITL) are tracked in [`docs/architecture.md`](docs/architecture.md).
|
||||||
|
|
||||||
|
## Why this stack
|
||||||
|
|
||||||
|
- **PaddleOCR** is the strongest open-source OCR for mixed-language documents and runs fully on-prem (essential for police data).
|
||||||
|
- **PP-Structure** (Phase 3) handles personnel tables natively.
|
||||||
|
- **Regex-first, LLM-fallback extraction** keeps deterministic fields fast and predictable while letting an LLM handle format drift across Polri units.
|
||||||
|
- **CPU-friendly defaults**: a small (1.5B–4B) local LLM via Ollama is the recommended default; the architecture is also GPU-ready.
|
||||||
|
|
||||||
|
See [`docs/architecture.md`](docs/architecture.md) for the full architecture, accuracy expectations, and roadmap.
|
||||||
|
|
||||||
|
## Quickstart
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- Python **3.10–3.12**
|
||||||
|
- ~3 GB free disk for the Python dependencies plus the PaddleOCR models (~200 MB) downloaded on first run
|
||||||
|
- Linux/macOS recommended (Windows works but PaddleOCR install can be finicky)
|
||||||
|
|
||||||
|
### Install (local dev)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
|
||||||
|
python -m venv .venv && source .venv/bin/activate
|
||||||
|
make install # installs runtime + dev deps + pre-commit
|
||||||
|
cp .env.example .env # edit if you need GPU / different storage path
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run the API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make dev
|
||||||
|
# → http://localhost:8000/docs
|
||||||
|
```
|
||||||
|
|
||||||
|
### Try it out
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -F "file=@samples/pdf/example.pdf" http://localhost:8000/api/v1/documents | jq
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected response (truncated):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "8f2a...",
|
||||||
|
"status": "completed",
|
||||||
|
"confidence": 0.93,
|
||||||
|
"data": {
|
||||||
|
"header": {
|
||||||
|
"nomor_sprint": "Sprin/123/IV/2025/Reskrim",
|
||||||
|
"tanggal": "2025-04-21",
|
||||||
|
"satuan_penerbit": "KEPOLISIAN RESOR BANDUNG",
|
||||||
|
"perihal": "Pelaksanaan penyelidikan kasus pencurian",
|
||||||
|
"dasar": ["Undang-Undang Nomor 2 Tahun 2002 ...", "..."]
|
||||||
|
},
|
||||||
|
"personel": [],
|
||||||
|
"ttd": { "nrp": "12345678" }
|
||||||
|
},
|
||||||
|
"review_flags": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Note:** Phase 1 does not yet populate the `personel[]` table — that requires PP-Structure (Phase 3). Header fields, signatory NRP, confidence, and HITL routing are fully wired.
|
||||||
|
|
||||||
|
### Docker
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose build
|
||||||
|
docker compose up -d
|
||||||
|
docker compose logs -f api
|
||||||
|
```
|
||||||
|
|
||||||
|
The first request will trigger PaddleOCR to download its detection, recognition, and angle-classification (cls) models (~200 MB) into the `paddle-models` volume.
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make fmt # format with ruff
|
||||||
|
make lint # lint
|
||||||
|
make typecheck # mypy strict mode
|
||||||
|
make test # pytest
|
||||||
|
make test-cov # pytest + coverage
|
||||||
|
```
|
||||||
|
|
||||||
|
Pre-commit hooks run ruff on every commit. Install once with `pre-commit install` (already done by `make install`).
|
||||||
|
|
||||||
|
## Project layout
|
||||||
|
|
||||||
|
```
|
||||||
|
src/ocr_sprint/
|
||||||
|
api/ # FastAPI routes + error handlers
|
||||||
|
schemas/ # Pydantic v2 models (request/response, extraction, personnel)
|
||||||
|
pipeline/ # ingest → preprocess → ocr → extract → validate → score
|
||||||
|
extract/ # regex_rules.py (Phase 1) → llm.py (Phase 5)
|
||||||
|
data/ # master data (Polri ranks, etc.)
|
||||||
|
utils/ # logging, helpers
|
||||||
|
config.py # pydantic-settings
|
||||||
|
main.py # app factory
|
||||||
|
tests/unit/ # ~60 unit tests, no PaddleOCR dependency
|
||||||
|
docs/ # architecture & decision records
|
||||||
|
```
|
||||||
|
|
||||||
|
## Roadmap
|
||||||
|
|
||||||
|
| Phase | Scope | Status |
|
||||||
|
|---|---|---|
|
||||||
|
| 1 | Sync API, PDF/image ingest, basic preprocessing, PaddleOCR, regex header extraction, validation, confidence scoring | **In progress** |
|
||||||
|
| 2 | DocTR document detection + dewarping for phone photos | Planned |
|
||||||
|
| 3 | PP-Structure table extraction for personnel rows | Planned |
|
||||||
|
| 4 | Async pipeline (Celery + Redis), Postgres + MinIO, auth, observability | Planned |
|
||||||
|
| 5 | LLM hybrid extraction (Ollama + structured output) | Planned |
|
||||||
|
| 6 | HITL review endpoints + audit trail | Planned |
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Proprietary — internal use only.
|
||||||
23
docker-compose.yml
Normal file
23
docker-compose.yml
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Phase 1 MVP compose: API only.
|
||||||
|
# Phase 4 will add redis, postgres, minio, and worker services.
|
||||||
|
services:
|
||||||
|
api:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: ocr-sprint-service:dev
|
||||||
|
container_name: ocr-sprint-api
|
||||||
|
ports:
|
||||||
|
- "8000:8000"
|
||||||
|
environment:
|
||||||
|
APP_ENV: local
|
||||||
|
APP_LOG_LEVEL: INFO
|
||||||
|
OCR_USE_GPU: "false"
|
||||||
|
STORAGE_LOCAL_DIR: /app/storage
|
||||||
|
volumes:
|
||||||
|
- ./storage:/app/storage
|
||||||
|
- paddle-models:/home/app/.paddleocr
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
paddle-models:
|
||||||
259
docs/architecture.md
Normal file
259
docs/architecture.md
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
# Plan & Arsitektur — OCR Service Surat Sprint Kepolisian
|
||||||
|
|
||||||
|
## 1. Penilaian Jujur Tech Stack yang Diusulkan
|
||||||
|
|
||||||
|
Tech stack Anda (FastAPI + PaddleOCR + OpenCV/Pillow + Regex) **sudah bagus dan layak produksi**, tapi **belum tentu paling optimal akurasinya** untuk kasus surat sprint. Ada beberapa gap yang perlu diisi sebelum bisa disebut "terbaik".
|
||||||
|
|
||||||
|
### Yang sudah tepat
|
||||||
|
| Komponen | Alasan |
|
||||||
|
|---|---|
|
||||||
|
| **FastAPI** | Async native, Pydantic validation, OpenAPI docs otomatis, ideal untuk ML serving. |
|
||||||
|
| **PaddleOCR (PP-OCRv4/v5)** | Salah satu OCR open-source terbaik untuk dokumen campuran teks + tabel, mendukung Latin (cocok untuk Bahasa Indonesia), bisa jalan on-premise (penting untuk dokumen kepolisian yang sensitif — **cloud OCR seperti Google Vision/AWS Textract sebaiknya dihindari** karena masalah kerahasiaan). |
|
||||||
|
| **OpenCV + Pillow** | Standar industri untuk preprocessing. |
|
||||||
|
| **Regex/rule-based** | Cocok untuk dokumen terstruktur seperti sprint yang format-nya relatif baku. |
|
||||||
|
|
||||||
|
### Yang masih kurang / perlu ditambah
|
||||||
|
|
||||||
|
1. **Table extraction belum tertangani**
|
||||||
|
Daftar personel di surat sprint hampir selalu berbentuk **tabel** (No, Pangkat, NRP, Nama, Jabatan, Keterangan). Regex pada teks linear dari OCR biasa **akan kacau** ketika baris tabel pecah atau kolom bergeser. Solusi: gunakan **PaddleOCR PP-Structure** (modul table recognition bawaan Paddle) atau model khusus seperti **TableTransformer (Microsoft)**.
|
||||||
|
|
||||||
|
2. **Document detection & dewarping untuk foto HP belum eksplisit**
|
||||||
|
Foto HP bermasalah karena: perspektif miring, lipatan, bayangan, lighting tidak rata, fokus tidak merata. OpenCV crop + perspective transform manual saja sering gagal. Tambahkan:
|
||||||
|
- **Document corner detection**: `DocTR` / `MobileSAM` / model edge-based, atau heuristik kontur OpenCV sebagai fallback.
|
||||||
|
- **Dewarping**: `DocTr` / `DewarpNet` untuk halaman yang melengkung (lipatan).
|
||||||
|
- **Shadow removal**: algoritma background division atau model spesialis.
|
||||||
|
|
||||||
|
3. **Strategi ekstraksi 100% regex itu rapuh**
|
||||||
|
Surat sprint dari satuan berbeda (Polda, Polres, Polsek, Mabes) punya **variasi format**: header berbeda, urutan field berbeda, kadang pangkat disingkat (`AKP`, `IPDA`) kadang ditulis penuh. Regex murni akan butuh ratusan rule dan tetap miss kasus baru.
|
||||||
|
**Rekomendasi pendekatan hybrid**:
|
||||||
|
- **Layer 1 — Regex/rule** untuk field deterministik (Nomor sprint, tanggal, dasar hukum) yang format-nya baku.
|
||||||
|
- **Layer 2 — Schema-aware extraction** menggunakan **LLM lokal** (Llama 3.1 8B / Qwen2.5 7B via Ollama atau vLLM) dengan structured output (JSON schema / Pydantic) untuk field yang variatif (jabatan, keterangan tugas).
|
||||||
|
- **Layer 3 — Validation** terhadap master data (daftar pangkat valid, format NRP 8 digit, dll).
|
||||||
|
|
||||||
|
4. **Tidak ada confidence scoring & human-in-the-loop**
|
||||||
|
Untuk dokumen kepolisian, **akurasi 100% otomatis itu mitos**. Sistem harus:
|
||||||
|
- Mengeluarkan confidence score per field.
|
||||||
|
- Otomatis flag dokumen low-confidence untuk review manusia.
|
||||||
|
- Sediakan UI/endpoint koreksi yang feedback-nya bisa dipakai retraining.
|
||||||
|
|
||||||
|
5. **Alternatif end-to-end yang patut dipertimbangkan**
|
||||||
|
Jika nanti volume dokumen besar dan format relatif stabil, fine-tuning model **Document Understanding** end-to-end bisa lebih akurat:
|
||||||
|
- **Donut** (OCR-free, langsung image → JSON).
|
||||||
|
- **LayoutLMv3** (kombinasi teks + layout + visual).
|
||||||
|
- **Surya OCR** (newer, sangat bagus untuk dokumen).
|
||||||
|
Untuk MVP, tetap pakai PaddleOCR. Donut/LayoutLM adalah opsi V2 setelah ada labeled dataset cukup (~500–1000 dokumen).
|
||||||
|
|
||||||
|
### Verdict
|
||||||
|
Stack Anda **bisa mencapai ~85–92% akurasi field-level** untuk surat sprint dengan kualitas scan baik, dan **~70–80%** untuk foto HP, **kalau ditambah** komponen di atas. Tanpa table extraction + dewarping + hybrid extraction, akurasinya akan jatuh di kondisi nyata.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Arsitektur yang Direkomendasikan
|
||||||
|
|
||||||
|
### 2.1 Diagram Logis
|
||||||
|
|
||||||
|
```
|
||||||
|
┌────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Client (Web/Mobile) │
|
||||||
|
└──────────────────────────────┬─────────────────────────────────────┘
|
||||||
|
│ HTTPS (multipart upload)
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ FastAPI Gateway (stateless) │
|
||||||
|
│ - Auth (JWT/API key) - Rate limit - Request validation │
|
||||||
|
└──────────────────────────────┬─────────────────────────────────────┘
|
||||||
|
│ enqueue job
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Job Queue (Redis + Celery / RQ / Dramatiq) │
|
||||||
|
└──────────────────────────────┬─────────────────────────────────────┘
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ OCR Worker Pipeline (GPU/CPU) │
|
||||||
|
│ ┌────────────┐ ┌──────────────┐ ┌───────────┐ ┌────────────┐ │
|
||||||
|
│ │ 1. Ingest │→ │ 2. Preproc │→ │ 3. OCR + │→ │ 4. Extract │ │
|
||||||
|
│ │ & detect │ │ (deskew, │ │ Layout │ │ (regex + │ │
|
||||||
|
│ │ PDF/IMG │ │ dewarp, │ │ PP-Struct│ │ LLM + │ │
|
||||||
|
│ │ │ │ denoise) │ │ + Table) │ │ validate) │ │
|
||||||
|
│ └────────────┘ └──────────────┘ └───────────┘ └─────┬──────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ┌──────────────────────────────┘ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌─────────────┐ │
|
||||||
|
│ │ 5. Confidence│ → low conf? flag for review │
|
||||||
|
│ │ scoring │ │
|
||||||
|
│ └──────┬───────┘ │
|
||||||
|
└──────────────────────────┼─────────────────────────────────────────┘
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Storage: PostgreSQL (metadata) + MinIO/S3 (file) │
|
||||||
|
│ + Vector store opsional (untuk dedup / search) │
|
||||||
|
└────────────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Review UI (optional) — koreksi manual + audit trail │
|
||||||
|
└────────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.2 Pipeline Detail per Tahap
|
||||||
|
|
||||||
|
**Tahap 1 — Ingest & Document Detection**
|
||||||
|
- PDF: render setiap halaman jadi image @ 300 DPI (`pdf2image` / `PyMuPDF`).
|
||||||
|
- Image (foto HP): deteksi sudut dokumen → crop → perspective transform.
|
||||||
|
- Library: OpenCV `findContours` (cepat) sebagai fallback, **DocTR document detector** (lebih akurat) sebagai utama.
|
||||||
|
|
||||||
|
**Tahap 2 — Preprocessing**
|
||||||
|
- Deskew (rotation correction) — Hough transform atau model.
|
||||||
|
- Dewarp (untuk foto buku/lipatan) — `DewarpNet` atau model RNN.
|
||||||
|
- Adaptive thresholding (untuk foto dengan lighting tidak rata).
|
||||||
|
- Shadow removal (background division).
|
||||||
|
- Denoise (Non-Local Means).
|
||||||
|
- Resize ke ukuran optimal OCR (~1500–2500 px sisi panjang).
|
||||||
|
|
||||||
|
**Tahap 3 — OCR + Layout Analysis**
|
||||||
|
- **PaddleOCR PP-Structure** dijalankan sekali → menghasilkan:
|
||||||
|
- Bounding boxes + teks + confidence per word/line.
|
||||||
|
- Table region detection + table-to-HTML/JSON.
|
||||||
|
- Layout type per region (title, paragraph, table, figure).
|
||||||
|
- Output ditampung sebagai struktur intermediate (mirip hOCR / ALTO XML).
|
||||||
|
|
||||||
|
**Tahap 4 — Information Extraction**
|
||||||
|
- **4a. Header parsing (regex)**: Nomor sprint, tanggal, satuan penerbit, dasar hukum, perihal. Format relatif baku → regex sangat cocok.
|
||||||
|
- **4b. Personnel table extraction**: ambil dari hasil PP-Structure table → mapping kolom (Pangkat, NRP, Nama, Jabatan, Keterangan).
|
||||||
|
- **4c. LLM fallback**: untuk field yang regex/table miss, kirim chunk teks + JSON schema ke LLM lokal (Ollama / vLLM) dengan **structured output** (Pydantic via `outlines` / `instructor`).
|
||||||
|
- **4d. Validation layer**:
|
||||||
|
- NRP: 8 digit numerik.
|
||||||
|
- Pangkat: harus ada di daftar master pangkat Polri.
|
||||||
|
- Tanggal: parse + sanity check.
|
||||||
|
- Cross-check: jumlah personel di body = jumlah baris tabel.
|
||||||
|
|
||||||
|
**Tahap 5 — Confidence Scoring & Routing**
|
||||||
|
- Aggregate confidence: weighted average dari OCR confidence + validation pass/fail + LLM logprob (kalau pakai).
|
||||||
|
- Threshold (mis. < 0.85) → status `NEEDS_REVIEW`.
|
||||||
|
- Threshold tinggi (≥ 0.95) + semua validasi pass → status `AUTO_APPROVED`.
|
||||||
|
|
||||||
|
### 2.3 API Endpoint (FastAPI)
|
||||||
|
|
||||||
|
```
|
||||||
|
POST /api/v1/documents # upload, kembalikan job_id
|
||||||
|
GET /api/v1/documents/{job_id} # poll status + hasil
|
||||||
|
GET /api/v1/documents/{job_id}/raw # raw OCR output (debug)
|
||||||
|
PATCH /api/v1/documents/{job_id} # koreksi manual (HITL)
|
||||||
|
GET /api/v1/health # liveness
|
||||||
|
GET /api/v1/metrics # Prometheus
|
||||||
|
```
|
||||||
|
|
||||||
|
Response shape (contoh):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "uuid",
|
||||||
|
"status": "completed | processing | needs_review | failed",
|
||||||
|
"confidence": 0.92,
|
||||||
|
"data": {
|
||||||
|
"nomor_sprint": "Sprin/123/IV/2025",
|
||||||
|
"tanggal": "2025-04-21",
|
||||||
|
"satuan_penerbit": "Polres Bandung",
|
||||||
|
"dasar": ["...", "..."],
|
||||||
|
"perihal": "...",
|
||||||
|
"personel": [
|
||||||
|
{"no": 1, "pangkat": "AKP", "nrp": "12345678", "nama": "...", "jabatan": "Kasat Reskrim", "confidence": 0.97},
|
||||||
|
...
|
||||||
|
],
|
||||||
|
"ttd": {"pejabat": "...", "pangkat": "...", "nrp": "..."}
|
||||||
|
},
|
||||||
|
"review_flags": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.4 Tech Stack Final yang Direkomendasikan
|
||||||
|
|
||||||
|
| Layer | Pilihan | Catatan |
|
||||||
|
|---|---|---|
|
||||||
|
| API | **FastAPI** + Uvicorn/Gunicorn | sesuai usulan |
|
||||||
|
| Validation | **Pydantic v2** | wajib |
|
||||||
|
| Queue | **Redis + Celery** atau **Dramatiq** | OCR berat, jangan blocking request |
|
||||||
|
| OCR | **PaddleOCR PP-OCRv4 + PP-Structure** | tambah PP-Structure untuk tabel |
|
||||||
|
| Preprocessing | **OpenCV + Pillow** + **DocTR** (detection) | DocTR untuk foto HP |
|
||||||
|
| Extraction | **Regex + Ollama (Llama 3.1 8B / Qwen2.5 7B)** + **instructor/outlines** | hybrid |
|
||||||
|
| Storage | **PostgreSQL** (metadata) + **MinIO** (file blob) | self-hosted, sesuai compliance |
|
||||||
|
| Observability | **Prometheus + Grafana + Loki** | wajib produksi |
|
||||||
|
| Container | **Docker + docker-compose** (dev) → **Kubernetes** (prod) | |
|
||||||
|
| GPU | NVIDIA T4/A10 (1× cukup untuk MVP) | PaddleOCR jauh lebih cepat di GPU |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Roadmap Pengembangan (Bertahap)
|
||||||
|
|
||||||
|
### Fase 0 — Persiapan (1 minggu)
|
||||||
|
- Kumpulkan **dataset sampel**: minimal 50 surat sprint (campur PDF scan + foto HP) dari beragam satuan.
|
||||||
|
- Buat **ground truth labelling** untuk 20 dokumen (untuk evaluasi).
|
||||||
|
- Definisikan **schema output final** (JSON) bersama stakeholder.
|
||||||
|
|
||||||
|
### Fase 1 — MVP Pipeline Sinkron (2 minggu)
|
||||||
|
- Setup FastAPI skeleton + Pydantic schemas.
|
||||||
|
- Integrasi PaddleOCR PP-OCRv4 (CPU dulu, GPU menyusul).
|
||||||
|
- Preprocessing dasar: deskew + denoise + resize.
|
||||||
|
- Regex extraction untuk field header.
|
||||||
|
- Endpoint sinkron `POST /api/v1/documents` (untuk dev/testing saja).
|
||||||
|
- **Evaluasi akurasi** terhadap 20 ground truth.
|
||||||
|
|
||||||
|
### Fase 2 — Robustness untuk Foto HP (2 minggu)
|
||||||
|
- Integrasi document detection (DocTR atau OpenCV contour).
|
||||||
|
- Perspective transform + dewarping.
|
||||||
|
- Shadow removal.
|
||||||
|
- Re-evaluasi akurasi pada subset foto HP.
|
||||||
|
|
||||||
|
### Fase 3 — Table Extraction (1.5 minggu)
|
||||||
|
- Integrasi PP-Structure untuk personnel table.
|
||||||
|
- Mapping kolom + validation (NRP, pangkat).
|
||||||
|
- Master data tabel pangkat Polri.
|
||||||
|
|
||||||
|
### Fase 4 — Async + Production Ready (1.5 minggu)
|
||||||
|
- Pindahkan ke arsitektur async dengan Celery + Redis.
|
||||||
|
- Storage MinIO + PostgreSQL.
|
||||||
|
- Auth, rate limit, logging, metrics.
|
||||||
|
- Docker compose untuk deployment.
|
||||||
|
|
||||||
|
### Fase 5 — LLM Hybrid Extraction (2 minggu)
|
||||||
|
- Setup Ollama / vLLM dengan model lokal.
|
||||||
|
- Structured output via `instructor`.
|
||||||
|
- Confidence scoring + routing ke review.
|
||||||
|
|
||||||
|
### Fase 6 — HITL Review UI (opsional, 2 minggu)
|
||||||
|
- Endpoint koreksi.
|
||||||
|
- Simple web UI (Next.js) untuk reviewer.
|
||||||
|
- Audit trail & feedback loop.
|
||||||
|
|
||||||
|
### Fase 7 — Optimasi Lanjutan (ongoing)
|
||||||
|
- Fine-tune PaddleOCR detection/recognition pada dataset internal.
|
||||||
|
- Eksplorasi Donut/LayoutLMv3 jika dataset sudah cukup.
|
||||||
|
- Batch processing & GPU optimization.
|
||||||
|
|
||||||
|
**Total estimasi MVP fungsional (Fase 1–4): ~7 minggu** dengan 1 backend engineer + 1 ML engineer.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Risiko & Mitigasi
|
||||||
|
|
||||||
|
| Risiko | Mitigasi |
|
||||||
|
|---|---|
|
||||||
|
| Data sensitif (kepolisian) bocor | Wajib on-prem; tidak ada cloud OCR; enkripsi at-rest (LUKS/pgcrypto) + in-transit (mTLS); audit log lengkap. |
|
||||||
|
| Variasi format antar satuan | Hybrid extraction (regex + LLM); kumpulkan sample dari banyak satuan sejak awal. |
|
||||||
|
| Foto HP kualitas buruk | Validasi kualitas image di client (resolusi minimal, blur detection) sebelum upload. |
|
||||||
|
| Akurasi tidak sampai target | HITL review wajib untuk dokumen low-confidence; jangan deploy fully-automatic. |
|
||||||
|
| Tanggung jawab hukum atas hasil OCR | Selalu simpan original document + flag bahwa hasil ekstraksi adalah "draft, perlu verifikasi manusia". |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Pertanyaan Sebelum Implementasi
|
||||||
|
|
||||||
|
Sebelum saya lanjut ke implementasi, mohon konfirmasi:
|
||||||
|
|
||||||
|
1. **Volume**: berapa dokumen/hari yang ditargetkan? (mempengaruhi pilihan async vs sync, GPU vs CPU)
|
||||||
|
2. **Deployment target**: on-prem mutlak, atau private cloud (GovCloud) boleh?
|
||||||
|
3. **Source dokumen**: apakah ada akses ke 20–50 sample surat sprint untuk dijadikan dataset awal?
|
||||||
|
4. **Integrasi**: service ini akan dipanggil sistem apa? (mempengaruhi auth & API contract)
|
||||||
|
5. **HITL**: apakah ada SDM untuk review manual dokumen low-confidence?
|
||||||
|
6. **Hardware**: sudah ada server GPU, atau perlu sizing rekomendasi?
|
||||||
|
7. **Format output final**: ada schema yang sudah dipakai sistem downstream?
|
||||||
136
pyproject.toml
Normal file
136
pyproject.toml
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "ocr-sprint-service"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "OCR service for Indonesian police 'surat sprint' documents (FastAPI + PaddleOCR + hybrid extraction)"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.10,<3.13"
|
||||||
|
license = { text = "Proprietary" }
|
||||||
|
authors = [{ name = "Adrian Kuman Firmansah" }]
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
# Web framework
|
||||||
|
"fastapi>=0.115,<0.116",
|
||||||
|
"uvicorn[standard]>=0.30,<0.34",
|
||||||
|
"python-multipart>=0.0.9",
|
||||||
|
"pydantic>=2.7,<3",
|
||||||
|
"pydantic-settings>=2.4,<3",
|
||||||
|
# Image / PDF
|
||||||
|
"pillow>=10.4,<12",
|
||||||
|
"opencv-python-headless>=4.10,<5",
|
||||||
|
"numpy>=1.26,<2.2",
|
||||||
|
"PyMuPDF>=1.24,<2",
|
||||||
|
"python-magic>=0.4.27",
|
||||||
|
# OCR (CPU build of paddle; GPU users override via extra index)
|
||||||
|
"paddlepaddle==2.6.1",
|
||||||
|
"paddleocr>=2.7.5,<3",
|
||||||
|
# Logging / observability
|
||||||
|
"structlog>=24.1",
|
||||||
|
"prometheus-client>=0.20",
|
||||||
|
# Misc
|
||||||
|
"httpx>=0.27",
|
||||||
|
"tenacity>=8.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.2",
|
||||||
|
"pytest-asyncio>=0.23",
|
||||||
|
"pytest-cov>=5.0",
|
||||||
|
"ruff>=0.6.9",
|
||||||
|
"mypy>=1.11",
|
||||||
|
"types-Pillow",
|
||||||
|
"pre-commit>=3.7",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Extraction layer (Phase 5) — kept optional so MVP install stays light
|
||||||
|
llm = [
|
||||||
|
"ollama>=0.3",
|
||||||
|
"instructor>=1.4",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Async pipeline (Phase 4)
|
||||||
|
async-pipeline = [
|
||||||
|
"celery[redis]>=5.4",
|
||||||
|
"redis>=5.0",
|
||||||
|
"minio>=7.2",
|
||||||
|
"sqlalchemy>=2.0",
|
||||||
|
"psycopg[binary]>=3.2",
|
||||||
|
"alembic>=1.13",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
ocr-sprint-api = "ocr_sprint.main:run"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
|
|
||||||
|
[tool.setuptools.package-data]
|
||||||
|
"ocr_sprint" = ["py.typed"]
|
||||||
|
|
||||||
|
# ---------- Tooling ----------
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 100
|
||||||
|
target-version = "py310"
|
||||||
|
src = ["src", "tests"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = [
|
||||||
|
"E", "F", "W", # pycodestyle / pyflakes
|
||||||
|
"I", # isort
|
||||||
|
"B", # bugbear
|
||||||
|
"UP", # pyupgrade
|
||||||
|
"SIM", # simplify
|
||||||
|
"RUF", # ruff-specific
|
||||||
|
"C4", # comprehensions
|
||||||
|
"PIE",
|
||||||
|
"PT", # pytest style
|
||||||
|
"TID", # tidy imports
|
||||||
|
]
|
||||||
|
ignore = [
|
||||||
|
"E501", # line length handled by formatter
|
||||||
|
"B008", # FastAPI Depends() pattern
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
quote-style = "double"
|
||||||
|
|
||||||
|
[tool.mypy]
|
||||||
|
python_version = "3.10"
|
||||||
|
strict = true
|
||||||
|
warn_unused_ignores = true
|
||||||
|
warn_redundant_casts = true
|
||||||
|
disallow_untyped_defs = true
|
||||||
|
plugins = ["pydantic.mypy"]
|
||||||
|
mypy_path = "src"
|
||||||
|
namespace_packages = true
|
||||||
|
explicit_package_bases = true
|
||||||
|
|
||||||
|
[[tool.mypy.overrides]]
|
||||||
|
module = ["paddleocr.*", "paddle.*", "cv2.*", "fitz.*", "magic.*"]
|
||||||
|
ignore_missing_imports = true
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
minversion = "8.0"
|
||||||
|
addopts = "-ra --strict-markers --strict-config"
|
||||||
|
testpaths = ["tests"]
|
||||||
|
asyncio_mode = "auto"
|
||||||
|
filterwarnings = [
|
||||||
|
"ignore::DeprecationWarning:paddle.*",
|
||||||
|
"ignore::DeprecationWarning:paddleocr.*",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.coverage.run]
|
||||||
|
source = ["src/ocr_sprint"]
|
||||||
|
branch = true
|
||||||
|
|
||||||
|
[tool.coverage.report]
|
||||||
|
exclude_lines = [
|
||||||
|
"pragma: no cover",
|
||||||
|
"raise NotImplementedError",
|
||||||
|
"if TYPE_CHECKING:",
|
||||||
|
]
|
||||||
13
samples/README.md
Normal file
13
samples/README.md
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Samples
|
||||||
|
|
||||||
|
Drop sample surat sprint files here for local testing. **Do NOT commit real documents** — `.gitignore` excludes binary file extensions in this folder.
|
||||||
|
|
||||||
|
Recommended layout:
|
||||||
|
```
|
||||||
|
samples/
|
||||||
|
pdf/ # PDF scans
|
||||||
|
photo/ # phone photos
|
||||||
|
ground_truth/ # JSON ground-truth labels for evaluation
|
||||||
|
```
|
||||||
|
|
||||||
|
For sharing real samples with the team, use the project's secured storage (MinIO/S3 once Phase 4 is live), not git.
|
||||||
3
src/ocr_sprint/__init__.py
Normal file
3
src/ocr_sprint/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""OCR Sprint Service — extract structured data from Indonesian police 'surat sprint'."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
0
src/ocr_sprint/api/__init__.py
Normal file
0
src/ocr_sprint/api/__init__.py
Normal file
43
src/ocr_sprint/api/errors.py
Normal file
43
src/ocr_sprint/api/errors.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
"""HTTP error handlers."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import FastAPI, Request, status
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
|
from ocr_sprint.utils.logging import get_logger
|
||||||
|
|
||||||
|
_logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class OCRServiceError(Exception):
    """Base class for application errors that should map to a 4xx response.

    Subclasses override ``http_status`` to select a different status code.
    """

    # Status code for the error response; 400 unless a subclass overrides it.
    http_status: int = status.HTTP_400_BAD_REQUEST
|
||||||
|
|
||||||
|
|
||||||
|
class UnsupportedDocumentError(OCRServiceError):
    """Uploaded file is neither a PDF nor a recognized image format.

    Keeps the ``http_status`` inherited from ``OCRServiceError``.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class JobNotFoundError(OCRServiceError):
    """Requested job could not be found; reported as 404 Not Found."""

    http_status = status.HTTP_404_NOT_FOUND
|
||||||
|
|
||||||
|
|
||||||
|
def register_error_handlers(app: FastAPI) -> None:
    """Wire OCRServiceError + a final fallback for unexpected exceptions.

    Args:
        app: FastAPI application the two exception handlers are attached to.
    """

    @app.exception_handler(OCRServiceError)
    async def _ocr_error_handler(_: Request, exc: OCRServiceError) -> JSONResponse:
        # Domain errors carry their own status code; the body exposes the
        # exception class name and its message to the client.
        return JSONResponse(
            status_code=exc.http_status,
            content={"error": exc.__class__.__name__, "message": str(exc)},
        )

    @app.exception_handler(Exception)
    async def _unexpected_handler(_: Request, exc: Exception) -> JSONResponse:
        # Record the failure server-side, but answer with a generic message so
        # the exception text is not returned to the caller.
        _logger.exception("api.unhandled_exception", error=str(exc))
        return JSONResponse(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            content={"error": "InternalServerError", "message": "Unexpected error"},
        )
|
||||||
0
src/ocr_sprint/api/routes/__init__.py
Normal file
0
src/ocr_sprint/api/routes/__init__.py
Normal file
58
src/ocr_sprint/api/routes/documents.py
Normal file
58
src/ocr_sprint/api/routes/documents.py
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
"""Documents API — Phase 1 synchronous endpoint.
|
||||||
|
|
||||||
|
POST /documents accepts a single PDF or image upload, runs the synchronous
|
||||||
|
pipeline inline, and returns the structured result. This is suitable for
|
||||||
|
development and low-traffic production; Phase 4 will introduce an async
|
||||||
|
queue and a polling-style API at the same path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from fastapi import APIRouter, File, UploadFile, status
|
||||||
|
|
||||||
|
from ocr_sprint.api.errors import UnsupportedDocumentError
|
||||||
|
from ocr_sprint.pipeline.orchestrator import run_pipeline
|
||||||
|
from ocr_sprint.schemas.document import DocumentResponse
|
||||||
|
from ocr_sprint.utils.logging import get_logger
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||||
|
_logger = get_logger(__name__)
|
||||||
|
|
||||||
|
_MAX_UPLOAD_BYTES = 25 * 1024 * 1024 # 25 MB
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("", status_code=status.HTTP_200_OK, response_model=DocumentResponse)
async def create_document(file: UploadFile = File(...)) -> DocumentResponse:
    """Run OCR + extraction synchronously on a single upload.

    Args:
        file: The uploaded PDF or image.

    Returns:
        The job id, routing status, confidence, extracted data and review flags.

    Raises:
        UnsupportedDocumentError: If the upload is empty, exceeds the size
            limit, or the pipeline rejects it with a ``ValueError``.
    """
    job_id = uuid4()
    log = _logger.bind(job_id=str(job_id), filename=file.filename or "")

    # Read at most one byte past the limit: that is enough to detect an
    # oversized upload without buffering an arbitrarily large body in memory.
    content = await file.read(_MAX_UPLOAD_BYTES + 1)
    if not content:
        raise UnsupportedDocumentError("Uploaded file is empty.")
    if len(content) > _MAX_UPLOAD_BYTES:
        raise UnsupportedDocumentError(
            f"Uploaded file exceeds {_MAX_UPLOAD_BYTES // (1024 * 1024)} MB limit."
        )

    log.info("documents.received", size=len(content))
    try:
        # NOTE: run_pipeline is a plain synchronous call, so this coroutine does
        # not yield to the event loop while the pipeline runs.
        output = run_pipeline(content)
    except ValueError as exc:
        raise UnsupportedDocumentError(str(exc)) from exc

    # Computed once; used for both the log record and the response body.
    flag_values = [f.value for f in output.result.review_flags]
    log.info(
        "documents.completed",
        status=output.status.value,
        confidence=round(output.confidence, 3),
        flags=flag_values,
    )
    return DocumentResponse(
        job_id=job_id,
        status=output.status,
        confidence=output.confidence,
        data=output.result,
        review_flags=flag_values,
    )
|
||||||
15
src/ocr_sprint/api/routes/health.py
Normal file
15
src/ocr_sprint/api/routes/health.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""Liveness / readiness endpoints."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from ocr_sprint import __version__
|
||||||
|
|
||||||
|
router = APIRouter(tags=["health"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health")
async def health() -> dict[str, str]:
    """Report liveness and the package version; never touches the OCR engine."""
    payload: dict[str, str] = {"status": "ok"}
    payload["version"] = __version__
    return payload
|
||||||
72
src/ocr_sprint/config.py
Normal file
72
src/ocr_sprint/config.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""Application settings loaded from environment / .env file."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Runtime configuration. Override via environment variables or a .env file.

    Variable names are matched case-insensitively and unknown variables are
    ignored (see ``model_config``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # App
    app_env: str = "local"
    app_host: str = "0.0.0.0"
    app_port: int = 8000
    app_log_level: str = "INFO"

    # Storage (Phase 1: local fs)
    storage_local_dir: Path = Path("./storage")

    # OCR
    ocr_lang: str = "latin"
    ocr_use_gpu: bool = False
    # Model directories default to None; presumably the OCR wrapper then falls
    # back to the engine's bundled models — confirm in pipeline/ocr.py.
    ocr_det_model_dir: str | None = None
    ocr_rec_model_dir: str | None = None
    ocr_cls_model_dir: str | None = None
    ocr_max_image_side: int = 2200

    # Preprocessing
    preprocess_target_dpi: int = 300
    preprocess_denoise: bool = True
    preprocess_deskew: bool = True
    preprocess_adaptive_threshold: bool = False

    # Confidence thresholds (Phase 5 routing)
    # Each value is validated to lie in [0, 1].
    # NOTE(review): nothing in this class enforces
    # confidence_needs_review <= confidence_auto_approve.
    confidence_auto_approve: float = Field(0.95, ge=0.0, le=1.0)
    confidence_needs_review: float = Field(0.85, ge=0.0, le=1.0)

    # LLM (Phase 5)
    llm_enabled: bool = False
    llm_provider: str = "ollama"
    llm_model: str = "qwen2.5:1.5b"
    llm_base_url: str = "http://localhost:11434"
    llm_timeout_s: int = 60

    # Async pipeline (Phase 4)
    # NOTE(review): the database URL and MinIO keys below are development
    # defaults; they should be overridden via the environment outside local use.
    queue_enabled: bool = False
    redis_url: str = "redis://localhost:6379/0"
    database_url: str = "postgresql+psycopg://ocr:ocr@localhost:5432/ocr_sprint"
    minio_endpoint: str = "localhost:9000"
    minio_access_key: str = "minioadmin"
    minio_secret_key: str = "minioadmin"
    minio_bucket: str = "ocr-sprint"
    minio_secure: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Load settings once per process and return the cached instance.

    As a side effect of the first call, the local storage directory is
    created (including parents) if it does not exist yet.
    """
    loaded = Settings()
    storage_root = loaded.storage_local_dir
    storage_root.mkdir(parents=True, exist_ok=True)
    return loaded
|
||||||
0
src/ocr_sprint/data/__init__.py
Normal file
0
src/ocr_sprint/data/__init__.py
Normal file
66
src/ocr_sprint/data/master_pangkat.py
Normal file
66
src/ocr_sprint/data/master_pangkat.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
"""Master data for Polri ranks ('pangkat').
|
||||||
|
|
||||||
|
Used by the validation layer to:
|
||||||
|
1. Confirm that a recognized rank string is a real Polri rank.
|
||||||
|
2. Normalize accepted variants ("BRIG POL" → "BRIGADIR", "KBP" → "KOMBES POL") to a canonical form.
|
||||||
|
|
||||||
|
Source: Peraturan Kapolri tentang Pangkat (publicly available, 2024).
|
||||||
|
Update this file when ranks are reorganized.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# Canonical abbreviation → list of accepted variants (case-insensitive).
|
||||||
|
PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
|
||||||
|
# Tamtama
|
||||||
|
"BHARADA": ("BHARADA", "BHRD"),
|
||||||
|
"BHARATU": ("BHARATU", "BHRT"),
|
||||||
|
"BHARAKA": ("BHARAKA", "BHRK"),
|
||||||
|
"ABRIP": ("ABRIP",),
|
||||||
|
"ABRIPTU": ("ABRIPTU",),
|
||||||
|
"ABRIPKA": ("ABRIPKA",),
|
||||||
|
# Bintara
|
||||||
|
"BRIPDA": ("BRIPDA",),
|
||||||
|
"BRIPTU": ("BRIPTU",),
|
||||||
|
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
|
||||||
|
"BRIPKA": ("BRIPKA",),
|
||||||
|
"AIPDA": ("AIPDA",),
|
||||||
|
"AIPTU": ("AIPTU",),
|
||||||
|
# Perwira Pertama
|
||||||
|
"IPDA": ("IPDA",),
|
||||||
|
"IPTU": ("IPTU",),
|
||||||
|
"AKP": ("AKP",),
|
||||||
|
# Perwira Menengah
|
||||||
|
"KOMPOL": ("KOMPOL",),
|
||||||
|
"AKBP": ("AKBP",),
|
||||||
|
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
|
||||||
|
# Perwira Tinggi
|
||||||
|
"BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
|
||||||
|
"IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
|
||||||
|
"KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
|
||||||
|
"JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Reverse lookup: any variant (uppercased) → canonical form.
|
||||||
|
_VARIANT_TO_CANONICAL: dict[str, str] = {
|
||||||
|
variant.upper(): canonical
|
||||||
|
for canonical, variants in PANGKAT_VARIANTS.items()
|
||||||
|
for variant in variants
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_pangkat(raw: str | None) -> str | None:
    """Map a raw rank string to its canonical Polri rank.

    Returns None when the input is empty/None or does not match any known
    variant. Matching ignores case and surrounding/repeated whitespace.
    """
    if not raw:
        return None
    # Uppercase and collapse whitespace runs so "brig   pol" equals "BRIG POL".
    key = " ".join(raw.strip().upper().split())
    canonical = _VARIANT_TO_CANONICAL.get(key)
    if canonical is not None:
        return canonical
    # Second attempt: tolerate trailing punctuation such as "AKP."
    return _VARIANT_TO_CANONICAL.get(key.rstrip(".,;:"))
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_pangkat(raw: str | None) -> bool:
    """Tell whether the string normalizes to a known Polri rank."""
    canonical = normalize_pangkat(raw)
    return canonical is not None
|
||||||
42
src/ocr_sprint/main.py
Normal file
42
src/ocr_sprint/main.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
"""FastAPI entrypoint."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from ocr_sprint import __version__
|
||||||
|
from ocr_sprint.api.errors import register_error_handlers
|
||||||
|
from ocr_sprint.api.routes import documents, health
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
from ocr_sprint.utils.logging import configure_logging
|
||||||
|
|
||||||
|
|
||||||
|
def create_app() -> FastAPI:
    """Build and return the configured FastAPI application.

    Logging is configured first, then error handlers are registered and the
    health and documents routers are mounted under ``/api/v1``.
    """
    configure_logging(get_settings().app_log_level)

    application = FastAPI(
        title="OCR Sprint Service",
        version=__version__,
        description="OCR + structured extraction for Indonesian police 'surat sprint' documents.",
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
    )

    register_error_handlers(application)
    for route_module in (health, documents):
        application.include_router(route_module.router, prefix="/api/v1")
    return application
|
||||||
|
|
||||||
|
|
||||||
|
app = create_app()
|
||||||
|
|
||||||
|
|
||||||
|
def run() -> None:
    """Start uvicorn for the `ocr-sprint-api` console script."""
    # Imported here so that importing this module does not require uvicorn.
    import uvicorn

    settings = get_settings()
    uvicorn.run(
        "ocr_sprint.main:app",
        host=settings.app_host,
        port=settings.app_port,
        reload=False,
    )
|
||||||
1
src/ocr_sprint/pipeline/__init__.py
Normal file
1
src/ocr_sprint/pipeline/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""OCR pipeline: ingest → preprocess → OCR → extract → validate."""
|
||||||
51
src/ocr_sprint/pipeline/confidence.py
Normal file
51
src/ocr_sprint/pipeline/confidence.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
"""Confidence scoring + routing decision.
|
||||||
|
|
||||||
|
The score is a weighted blend of:
|
||||||
|
- mean OCR confidence across all detected lines
|
||||||
|
- validation pass rate (1.0 if no review flags, decreases per flag)
|
||||||
|
|
||||||
|
This is intentionally simple for Phase 1; Phase 5 will add LLM logprob
|
||||||
|
contributions and per-field confidences.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
from ocr_sprint.schemas.document import DocumentStatus
|
||||||
|
from ocr_sprint.schemas.extraction import ReviewFlag
|
||||||
|
|
||||||
|
# Per-flag penalty applied to the validation component of the score.
|
||||||
|
_FLAG_PENALTY: dict[ReviewFlag, float] = {
|
||||||
|
ReviewFlag.LOW_OCR_CONFIDENCE: 0.10,
|
||||||
|
ReviewFlag.MISSING_FIELD: 0.20,
|
||||||
|
ReviewFlag.INVALID_NRP: 0.10,
|
||||||
|
ReviewFlag.UNKNOWN_PANGKAT: 0.05,
|
||||||
|
ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
|
||||||
|
ReviewFlag.DATE_PARSE_FAILED: 0.10,
|
||||||
|
}
|
||||||
|
|
||||||
|
OCR_WEIGHT = 0.6
|
||||||
|
VALIDATION_WEIGHT = 0.4
|
||||||
|
|
||||||
|
|
||||||
|
def compute_confidence(
    ocr_confidence: float,
    flags: list[ReviewFlag],
) -> float:
    """Blend OCR confidence with validation penalties into a single 0-1 score.

    Args:
        ocr_confidence: OCR confidence for the document, expected in [0, 1].
            Non-finite values (NaN/inf) are treated as 0.0 and out-of-range
            values are clamped before blending.
        flags: Review flags raised by validation; each one lowers the
            validation component by its entry in ``_FLAG_PENALTY`` (0.05 for
            flags without an entry).

    Returns:
        ``OCR_WEIGHT * ocr + VALIDATION_WEIGHT * validation``, clamped to [0, 1].
    """
    import math

    # A NaN here must never look like certainty: min(1.0, nan) evaluates to
    # 1.0, so without this guard a NaN input would score as fully confident.
    if not math.isfinite(ocr_confidence):
        ocr_confidence = 0.0
    ocr_component = max(0.0, min(1.0, ocr_confidence))

    validation_score = 1.0
    for flag in flags:
        validation_score -= _FLAG_PENALTY.get(flag, 0.05)
    validation_score = max(0.0, validation_score)

    blended = OCR_WEIGHT * ocr_component + VALIDATION_WEIGHT * validation_score
    return max(0.0, min(1.0, blended))
|
||||||
|
|
||||||
|
|
||||||
|
def route(confidence: float) -> DocumentStatus:
    """Translate the final confidence score into the job's terminal status.

    Scores at or above ``confidence_auto_approve`` complete automatically.
    Everything else goes to human review, whether it falls above or below
    ``confidence_needs_review``.
    """
    thresholds = get_settings()
    return (
        DocumentStatus.COMPLETED
        if confidence >= thresholds.confidence_auto_approve
        else DocumentStatus.NEEDS_REVIEW
    )
|
||||||
1
src/ocr_sprint/pipeline/extract/__init__.py
Normal file
1
src/ocr_sprint/pipeline/extract/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Information extraction layer (regex Phase 1, LLM Phase 5)."""
|
||||||
169
src/ocr_sprint/pipeline/extract/regex_rules.py
Normal file
169
src/ocr_sprint/pipeline/extract/regex_rules.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
"""Regex-based extraction for the deterministic header fields of a surat sprint.
|
||||||
|
|
||||||
|
Targets header fields whose layout is highly standardized across Polri units:
|
||||||
|
|
||||||
|
- Nomor sprint, e.g. "Sprin / 123 / IV / 2025 / Reskrim"
|
||||||
|
- Tanggal (date the sprint was issued)
|
||||||
|
- Satuan penerbit (issuing unit)
|
||||||
|
- Perihal
|
||||||
|
- Dasar (numbered list of legal/operational basis)
|
||||||
|
|
||||||
|
Personnel table extraction is intentionally NOT done here — that needs
|
||||||
|
PP-Structure + cell-aware logic and lives in `pipeline/table.py` (Phase 3).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
from ocr_sprint.schemas.extraction import HeaderFields, Signatory
|
||||||
|
|
||||||
|
# ---------- regex patterns ----------
|
||||||
|
|
||||||
|
# Nomor sprint, tolerant of spacing and OCR noise.
# Examples it should match in full:
#   "Sprin / 123 / IV / 2025 / Reskrim"
#   "SPRIN/345/X/2024"
#   "Nomor : Sprin/12/I/2025/Sat Intelkam"
# The optional unit suffix must follow a "/" or "-" on the same line and is
# made of space-separated words that each start with an uppercase letter, so
# running text such as "... tanggal 5 Januari 2025" is not swallowed. The
# suffix is matched greedily: a lazy quantifier at the very end of a pattern
# stops after a single character and would truncate the unit name.
_RE_NOMOR_SPRINT = re.compile(
    r"\bSPRIN[\s./-]*\d+[\s./-]*[IVXLCDM]+[\s./-]*\d{2,4}"
    r"(?:[ \t]*[/-][ \t]*(?-i:[A-Z])(?:[\w.-]*\w)?"
    r"(?:[ \t]+(?-i:[A-Z])(?:[\w.-]*\w)?)*)?",
    re.IGNORECASE,
)

# Indonesian month names.
_BULAN_MAP: dict[str, int] = {
    "JANUARI": 1,
    "FEBRUARI": 2,
    "MARET": 3,
    "APRIL": 4,
    "MEI": 5,
    "JUNI": 6,
    "JULI": 7,
    "AGUSTUS": 8,
    "SEPTEMBER": 9,
    "OKTOBER": 10,
    "NOVEMBER": 11,
    "DESEMBER": 12,
}

# Date in Indonesian, e.g. "21 April 2025" or "21 - April - 2025"
_RE_TANGGAL_ID = re.compile(
    r"\b(\d{1,2})\s*[-./\s]\s*(" + "|".join(_BULAN_MAP.keys()) + r")\s*[-./\s]\s*(\d{4})\b",
    re.IGNORECASE,
)

# Satuan penerbit usually appears in the document letterhead, prefixed by
# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
_RE_SATUAN = re.compile(
    r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
    r"[^\n]{0,80}",
    re.IGNORECASE,
)

# "Perihal : ...." up to end of line.
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)

# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")

# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
# The pattern accepts 8 to 20 digits after an "NRP"/"NIP" label.
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)


def find_nomor_sprint(text: str) -> str | None:
    """Return the first nomor sprint found, with whitespace runs collapsed.

    Args:
        text: Full OCR text of the document.

    Returns:
        The matched number including its unit suffix when one is present on
        the same line (e.g. "Sprin/12/I/2025/Sat Intelkam"), or None when the
        text contains no sprint number.
    """
    match = _RE_NOMOR_SPRINT.search(text)
    if match is None:
        return None
    return " ".join(match.group(0).split())
|
||||||
|
|
||||||
|
|
||||||
|
def find_tanggal(text: str) -> date | None:
    """Locate the issuance date of the document.

    A surat sprint usually carries several dates: those cited in the 'Dasar'
    section and the issuance date near the signatory ('Tempat, DD Month
    YYYY'). The issuance date comes after the dasar items in the standard
    layout, so the **last** date in the text is the one used.

    Returns None when no date is found or when the last date found is not a
    valid calendar date.
    """
    found = _RE_TANGGAL_ID.findall(text)
    if not found:
        return None
    # findall yields (day, month name, year) tuples in document order.
    day_text, month_name, year_text = found[-1]
    try:
        return date(int(year_text), _BULAN_MAP[month_name.upper()], int(day_text))
    except (KeyError, ValueError):
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_satuan(text: str) -> str | None:
    """Return the first letterhead match (issuing unit) with whitespace collapsed."""
    hit = _RE_SATUAN.search(text)
    return None if hit is None else " ".join(hit.group(0).split())
|
||||||
|
|
||||||
|
|
||||||
|
def find_perihal(text: str) -> str | None:
    """Return the text after the first 'Perihal:' label, limited to its own line."""
    candidates = (_RE_PERIHAL.search(row) for row in text.splitlines())
    first_hit = next((hit for hit in candidates if hit), None)
    return first_hit.group(1).strip() if first_hit else None
|
||||||
|
|
||||||
|
|
||||||
|
def find_dasar_list(text: str) -> list[str]:
    """Extract the 'Dasar' items from the text.

    Heuristic: locate the first line that starts with 'DASAR' (optionally
    'DASAR HUKUM'), then collect the lines that start with a number. A line
    that does not start with a number is appended to the previous item as a
    continuation. Collection stops at a blank line once at least one item
    exists, or at a line beginning with another section keyword.

    Content on the header line itself is kept as the first item: a numbered
    entry ("Dasar : 1. UU ...") is always taken, and unnumbered text is taken
    only after a colon ("Dasar : Rencana Kerja ...").

    Args:
        text: Full OCR text of the document.

    Returns:
        The item texts without their numbering, in document order; empty when
        no 'Dasar' section is found.
    """
    section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
    items: list[str] = []
    in_dasar = False
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not in_dasar:
            header = re.match(
                r"DASAR\b(?:\s+HUKUM\b)?\s*(?P<sep>[:\-])?\s*(?P<rest>.*)$",
                line,
                re.IGNORECASE,
            )
            if header is None:
                continue
            in_dasar = True
            rest = header.group("rest").strip()
            numbered = _RE_DASAR_ITEM.match(rest)
            if numbered:
                items.append(numbered.group(2).strip())
            elif rest and header.group("sep") == ":":
                # Single unnumbered basis written right after "Dasar :".
                items.append(rest)
            continue
        if not line:
            if items:
                break
            continue
        if line.upper().startswith(section_terminators):
            break
        m = _RE_DASAR_ITEM.match(line)
        if m:
            items.append(m.group(2).strip())
        elif items:
            # continuation of the previous dasar item
            items[-1] = (items[-1] + " " + line).strip()
    return items
|
||||||
|
|
||||||
|
|
||||||
|
def find_signatory(text: str) -> Signatory:
    """Best-effort signatory extraction: use the last NRP/NIP number in the text.

    Returns an empty Signatory when no NRP/NIP label is found.
    """
    numbers = [hit.group(2) for hit in _RE_NRP.finditer(text)]
    return Signatory(nrp=numbers[-1]) if numbers else Signatory()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_header(text: str) -> HeaderFields:
    """Run every header-level regex extractor and assemble the result schema."""
    nomor = find_nomor_sprint(text)
    tanggal = find_tanggal(text)
    satuan = find_satuan(text)
    perihal = find_perihal(text)
    dasar_items = find_dasar_list(text)
    return HeaderFields(
        nomor_sprint=nomor,
        tanggal=tanggal,
        satuan_penerbit=satuan,
        perihal=perihal,
        dasar=dasar_items,
    )
|
||||||
64
src/ocr_sprint/pipeline/extract/validators.py
Normal file
64
src/ocr_sprint/pipeline/extract/validators.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""Cross-field validation, with structured review-flag output."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ocr_sprint.data.master_pangkat import is_valid_pangkat
|
||||||
|
from ocr_sprint.schemas.extraction import (
|
||||||
|
ExtractionResult,
|
||||||
|
HeaderFields,
|
||||||
|
ReviewFlag,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
# Polri NRP = exactly 8 ASCII digits. `[0-9]` is used rather than `\d`, which
# on str patterns also matches non-ASCII Unicode digits (e.g. full-width forms).
_RE_NRP_8 = re.compile(r"[0-9]{8}")


def validate_nrp(nrp: str | None) -> bool:
    """Return True when the value is a well-formed Polri NRP (8 ASCII digits).

    Leading and trailing whitespace is ignored; None is never valid.
    """
    if nrp is None:
        return False
    return _RE_NRP_8.fullmatch(nrp.strip()) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
    """Check one personnel row and list the review flags it raises.

    Empty/missing NRP or pangkat values are not flagged here; only values
    that are present but fail validation are.
    """
    nrp_is_bad = bool(entry.nrp) and not validate_nrp(entry.nrp)
    pangkat_is_bad = bool(entry.pangkat) and not is_valid_pangkat(entry.pangkat)
    checks = (
        (nrp_is_bad, ReviewFlag.INVALID_NRP),
        (pangkat_is_bad, ReviewFlag.UNKNOWN_PANGKAT),
    )
    return [flag for failed, flag in checks if failed]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_header(header: HeaderFields) -> list[ReviewFlag]:
    """List header problems: a missing nomor sprint and/or a missing date."""
    checks = (
        (header.nomor_sprint is None, ReviewFlag.MISSING_FIELD),
        (header.tanggal is None, ReviewFlag.DATE_PARSE_FAILED),
    )
    return [flag for failed, flag in checks if failed]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_extraction(
    result: ExtractionResult,
    expected_personnel_count: int | None = None,
) -> list[ReviewFlag]:
    """Validate the whole extraction and return each raised flag once.

    Header flags come first, then per-row personnel flags, then the count
    mismatch flag (only checked when ``expected_personnel_count`` is given).
    Duplicates are removed while keeping first-occurrence order.
    """
    collected = list(validate_header(result.header))
    for row in result.personel:
        collected += validate_personnel_entry(row)
    if expected_personnel_count is not None and len(result.personel) != expected_personnel_count:
        collected.append(ReviewFlag.PERSONNEL_COUNT_MISMATCH)
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(collected))
|
||||||
81
src/ocr_sprint/pipeline/ingest.py
Normal file
81
src/ocr_sprint/pipeline/ingest.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
"""Ingest layer: convert uploaded bytes (PDF/IMG) into a list of numpy images."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from ocr_sprint.schemas.document import SourceKind
|
||||||
|
|
||||||
|
# Generic alias used across the pipeline. We don't constrain dtype/shape because
|
||||||
|
# OpenCV operations accept multiple dtypes and numpy generics are still rough.
|
||||||
|
NDArrayU8 = np.ndarray[Any, Any]
|
||||||
|
|
||||||
|
PDF_MAGIC = b"%PDF-"
|
||||||
|
PNG_MAGIC = b"\x89PNG\r\n\x1a\n"
|
||||||
|
JPEG_MAGIC = b"\xff\xd8\xff"
|
||||||
|
TIFF_MAGIC_LE = b"II*\x00"
|
||||||
|
TIFF_MAGIC_BE = b"MM\x00*"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IngestedPage:
    """One page worth of image data ready for preprocessing."""

    # Pixel data: HxWx3 BGR uint8 (OpenCV convention).
    image: NDArrayU8
    # Zero-based position of the page within the source document.
    page_index: int
|
||||||
|
|
||||||
|
|
||||||
|
def detect_source_kind(content: bytes) -> SourceKind:
    """Classify an uploaded payload by its leading magic bytes.

    Recognizes PDF, PNG, JPEG and TIFF (both byte orders); anything else is
    reported as UNKNOWN.
    """
    signatures = (
        ((PDF_MAGIC,), SourceKind.PDF),
        ((PNG_MAGIC, JPEG_MAGIC, TIFF_MAGIC_LE, TIFF_MAGIC_BE), SourceKind.IMAGE),
    )
    for prefixes, kind in signatures:
        if content.startswith(prefixes):
            return kind
    return SourceKind.UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def _pil_to_bgr(img: Image.Image) -> NDArrayU8:
    """Return *img* as a uint8 numpy array in BGR channel order (OpenCV convention)."""
    rgb_image = img if img.mode == "RGB" else img.convert("RGB")
    rgb_pixels = np.asarray(rgb_image, dtype=np.uint8)
    # Reverse the channel axis (RGB -> BGR); the copy materializes the reversed
    # view as an independent array.
    return rgb_pixels[:, :, ::-1].copy()
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_pdf(content: bytes, target_dpi: int = 300) -> list[IngestedPage]:
    """Render every page of a PDF to a numpy image at the target DPI.

    Uses PyMuPDF (no poppler dependency). DPI is enforced via a transform matrix:
    fitz's default is 72 DPI, so the zoom factor is target_dpi / 72.

    Args:
        content: Raw bytes of the PDF document.
        target_dpi: Rendering resolution; must be positive.

    Returns:
        One ``IngestedPage`` per PDF page, in document order, with
        ``page_index`` counted from 0.

    Raises:
        ValueError: If ``target_dpi`` is not positive.
    """
    # Fail early with a clear message instead of handing fitz a zero or
    # negative zoom factor.
    if target_dpi <= 0:
        raise ValueError(f"target_dpi must be positive, got {target_dpi}")

    zoom = target_dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)
    pages: list[IngestedPage] = []
    with fitz.open(stream=content, filetype="pdf") as doc:
        for idx, page in enumerate(doc):
            # alpha=False requests a pixmap without an alpha channel, matching
            # the "RGB" mode used to wrap the samples below.
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            pages.append(IngestedPage(image=_pil_to_bgr(img), page_index=idx))
    return pages
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_image(content: bytes) -> list[IngestedPage]:
    """Decode a single image into a one-element page list.

    The EXIF orientation tag, when present, is applied before conversion so
    that the pixel data handed to the rest of the pipeline is upright; without
    this, a photo stored rotated with an orientation tag would be OCR'd as
    stored.

    Args:
        content: Raw bytes of an image file that PIL can decode.

    Returns:
        A list holding exactly one ``IngestedPage`` with ``page_index`` 0.
    """
    # Local import: only this function needs ImageOps.
    from PIL import ImageOps

    # The context manager releases PIL's decoder resources once the pixels
    # have been copied into the numpy array.
    with Image.open(io.BytesIO(content)) as img:
        upright = ImageOps.exif_transpose(img)
        return [IngestedPage(image=_pil_to_bgr(upright), page_index=0)]
|
||||||
|
|
||||||
|
|
||||||
|
def ingest(content: bytes, kind: SourceKind, target_dpi: int = 300) -> list[IngestedPage]:
    """Load *content* with the loader that matches the declared *kind*.

    ``target_dpi`` is only forwarded to the PDF loader.

    Raises:
        ValueError: If *kind* is neither ``SourceKind.PDF`` nor ``SourceKind.IMAGE``.
    """
    if kind == SourceKind.PDF:
        loaded = ingest_pdf(content, target_dpi=target_dpi)
    elif kind == SourceKind.IMAGE:
        loaded = ingest_image(content)
    else:
        raise ValueError(f"Unsupported source kind: {kind}")
    return loaded
|
||||||
106
src/ocr_sprint/pipeline/ocr.py
Normal file
106
src/ocr_sprint/pipeline/ocr.py
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
"""PaddleOCR wrapper.
|
||||||
|
|
||||||
|
PaddleOCR has a heavy initialization cost (~2-5s on CPU as model files load),
|
||||||
|
so we keep a process-global instance behind a lazy accessor.
|
||||||
|
|
||||||
|
The wrapper exposes a small, stable surface so the rest of the pipeline does
|
||||||
|
not depend directly on paddleocr's evolving API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from threading import Lock
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
from ocr_sprint.pipeline.ingest import NDArrayU8
|
||||||
|
from ocr_sprint.utils.logging import get_logger
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
|
_logger = get_logger(__name__)
|
||||||
|
_lock = Lock()
|
||||||
|
_instance: PaddleOCR | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class OCRLine:
    """A single recognized text line: its text, confidence and bounding polygon."""

    # Recognized text of the line.
    text: str
    # Recognition confidence reported by the OCR engine.
    confidence: float
    # 4 (x, y) corner points
    box: tuple[tuple[float, float], ...]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class OCRPage:
    """Recognized lines of one page, plus text and confidence derived from them."""

    lines: list[OCRLine]

    @property
    def text(self) -> str:
        """Page text: the line texts joined by newlines (order = paddle's output order)."""
        texts = [entry.text for entry in self.lines]
        return "\n".join(texts)

    @property
    def mean_confidence(self) -> float:
        """Arithmetic mean of the line confidences; 0.0 for a page without lines."""
        scores = [entry.confidence for entry in self.lines]
        return float(np.mean(scores)) if scores else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _build_paddleocr() -> PaddleOCR:
    """Instantiate a PaddleOCR engine from the application settings.

    paddleocr is imported inside the function, so importing this module does
    not import paddleocr.
    """
    from paddleocr import PaddleOCR

    settings = get_settings()
    options: dict[str, object] = {
        "lang": settings.ocr_lang,
        "use_angle_cls": True,
        "use_gpu": settings.ocr_use_gpu,
        "show_log": False,
    }
    # Forward a model directory only when it is set; an unset value leaves the
    # corresponding PaddleOCR default in place.
    model_dirs = {
        "det_model_dir": settings.ocr_det_model_dir,
        "rec_model_dir": settings.ocr_rec_model_dir,
        "cls_model_dir": settings.ocr_cls_model_dir,
    }
    options.update({key: path for key, path in model_dirs.items() if path})
    _logger.info("paddleocr.init", lang=settings.ocr_lang, use_gpu=settings.ocr_use_gpu)
    return PaddleOCR(**options)
|
||||||
|
|
||||||
|
|
||||||
|
def get_ocr() -> PaddleOCR:
    """Return the process-global PaddleOCR engine, building it on first use.

    The instance is checked once without the lock (fast path) and again while
    holding ``_lock``, so concurrent first callers build the engine only once.
    """
    global _instance
    if _instance is not None:
        return _instance
    with _lock:
        if _instance is None:
            _instance = _build_paddleocr()
        return _instance
|
||||||
|
|
||||||
|
|
||||||
|
def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result.

    Entries of the engine output that do not have the expected
    ``[box, (text, conf)]`` shape are skipped rather than aborting the page.
    Both the box and the text/confidence pair are parsed under the same guard,
    so one malformed entry can no longer raise out of this function while a
    malformed box is silently tolerated.

    Args:
        image: Image to recognize (BGR, as produced by the preprocessing step).

    Returns:
        An ``OCRPage`` with one ``OCRLine`` per well-formed engine entry, in
        the engine's output order; an empty page when nothing was recognized.
    """
    raw = get_ocr().ocr(image, cls=True)
    # PaddleOCR returns [[ [box, (text, conf)], ... ]] — one outer list per image.
    if not raw or raw[0] is None:
        return OCRPage(lines=[])

    lines: list[OCRLine] = []
    skipped = 0
    for item in raw[0]:
        if not item or len(item) < 2:
            continue
        try:
            box_raw, text_conf = item[0], item[1]
            text, conf = text_conf[0], float(text_conf[1])
            box = tuple((float(p[0]), float(p[1])) for p in box_raw)
        except (TypeError, ValueError, IndexError):
            skipped += 1
            continue
        lines.append(OCRLine(text=text, confidence=conf, box=box))

    if skipped:
        # Surface dropped entries so an engine output-format change is visible.
        _logger.warning("ocr.malformed_lines_skipped", skipped=skipped)
    return OCRPage(lines=lines)
|
||||||
103
src/ocr_sprint/pipeline/orchestrator.py
Normal file
103
src/ocr_sprint/pipeline/orchestrator.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
"""Synchronous pipeline orchestrator (Phase 1).
|
||||||
|
|
||||||
|
Wires the individual stages together:
|
||||||
|
|
||||||
|
bytes → ingest → preprocess → OCR → regex extract → validate → score
|
||||||
|
|
||||||
|
Phase 4 will replace this with a Celery task graph; Phase 3/5 will plug
|
||||||
|
in PP-Structure for tables and an LLM extractor for variant fields.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||||
|
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
|
||||||
|
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
||||||
|
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest
|
||||||
|
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
||||||
|
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
|
||||||
|
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
||||||
|
from ocr_sprint.schemas.extraction import ExtractionResult, ReviewFlag
|
||||||
|
from ocr_sprint.utils.logging import get_logger
|
||||||
|
|
||||||
|
_logger = get_logger(__name__)
|
||||||
|
|
||||||
|
# Below this OCR confidence we automatically flag for review.
|
||||||
|
_OCR_CONFIDENCE_FLAG_THRESHOLD = 0.80
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PipelineOutput:
    """Everything the orchestrator hands back for one processed upload."""

    # Detected type of the uploaded payload.
    source_kind: SourceKind
    # Routing decision for the document.
    status: DocumentStatus
    # Final confidence score of the extraction.
    confidence: float
    # Structured extraction payload.
    result: ExtractionResult
|
||||||
|
|
||||||
|
|
||||||
|
def run_pipeline(content: bytes) -> PipelineOutput:
    """Run ingest, preprocessing, OCR, extraction, validation and scoring on an upload.

    Args:
        content: Raw bytes of the uploaded file.

    Returns:
        A ``PipelineOutput`` carrying the detected source kind, the routed
        status, the final confidence and the extraction result.

    Raises:
        ValueError: If the payload is neither a PDF nor a recognized image.
    """
    settings = get_settings()

    source_kind = detect_source_kind(content)
    if source_kind == SourceKind.UNKNOWN:
        raise ValueError("Unsupported file type — only PDF and common image formats are accepted.")

    pages = ingest(content, source_kind, target_dpi=settings.preprocess_target_dpi)
    _logger.info("pipeline.ingested", source_kind=source_kind.value, pages=len(pages))

    pre_cfg = PreprocessConfig(
        max_side=settings.ocr_max_image_side,
        denoise=settings.preprocess_denoise,
        deskew=settings.preprocess_deskew,
        adaptive_threshold=settings.preprocess_adaptive_threshold,
    )
    ocr_pages: list[OCRPage] = [run_ocr(preprocess(page.image, pre_cfg)) for page in pages]

    full_text = "\n".join(ocr_page.text for ocr_page in ocr_pages)
    # Unweighted mean of the per-page mean confidences; 0.0 when there are no pages.
    if ocr_pages:
        mean_ocr_conf = sum(ocr_page.mean_confidence for ocr_page in ocr_pages) / len(ocr_pages)
    else:
        mean_ocr_conf = 0.0

    header = extract_header(full_text)
    ttd = find_signatory(full_text)

    initial_flags: list[ReviewFlag] = (
        [ReviewFlag.LOW_OCR_CONFIDENCE] if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD else []
    )

    result = ExtractionResult(
        header=header,
        personel=[],  # Phase 3 will populate from PP-Structure
        untuk=[],
        ttd=ttd,
        raw_text=full_text,
        confidence=mean_ocr_conf,
        review_flags=list(initial_flags),
    )

    # Validation flags come first; the OCR-confidence flag is appended only if
    # validation did not already report it.
    flags = validate_extraction(result)
    flags.extend(flag for flag in initial_flags if flag not in flags)
    result.review_flags = flags

    final_conf = compute_confidence(mean_ocr_conf, flags)
    result.confidence = final_conf

    return PipelineOutput(
        source_kind=source_kind,
        status=route(final_conf),
        confidence=final_conf,
        result=result,
    )
|
||||||
108
src/ocr_sprint/pipeline/preprocess.py
Normal file
108
src/ocr_sprint/pipeline/preprocess.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
"""Image preprocessing for OCR.
|
||||||
|
|
||||||
|
Phase 1 implements the "always-on" steps that work for both clean PDF scans
|
||||||
|
and reasonable phone photos:
|
||||||
|
|
||||||
|
- resize to a reasonable max side (PaddleOCR runs faster on smaller inputs)
|
||||||
|
- convert to grayscale for analysis (kept as 3-channel BGR for paddle)
|
||||||
|
- denoise (Non-Local Means, gentle)
|
||||||
|
- deskew via Hough line angle estimate
|
||||||
|
- optional adaptive threshold for low-quality phone photos
|
||||||
|
|
||||||
|
Phase 2 will add document-corner detection + perspective transform + dewarping
|
||||||
|
for tilted phone shots; those live in `document_detect.py` (added later).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.ingest import NDArrayU8
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PreprocessConfig:
    """Immutable switches and limits for the preprocessing steps."""

    # Longest image side allowed before downscaling.
    max_side: int = 2200
    # Apply Non-Local Means denoising.
    denoise: bool = True
    # Estimate the skew angle and rotate the page.
    deskew: bool = True
    # Binarize with an adaptive threshold (off by default).
    adaptive_threshold: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
def _resize_max_side(img: NDArrayU8, max_side: int) -> NDArrayU8:
|
||||||
|
h, w = img.shape[:2]
|
||||||
|
longest = max(h, w)
|
||||||
|
if longest <= max_side:
|
||||||
|
return img
|
||||||
|
scale = max_side / longest
|
||||||
|
new_w, new_h = round(w * scale), round(h * scale)
|
||||||
|
return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
|
||||||
|
|
||||||
|
|
||||||
|
def _estimate_skew_angle(gray: NDArrayU8) -> float:
    """Estimate the page skew in degrees from Canny edges and Hough lines.

    Returns the median angle of the detected lines that lie strictly within
    15 degrees of horizontal, or 0.0 when no such line is found.
    """
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    hough = cv2.HoughLines(edges, 1, np.pi / 360, threshold=200)
    if hough is None or len(hough) == 0:
        return 0.0

    near_horizontal: list[float] = []
    # Only the first 200 detected lines are considered.
    for entry in hough[:200]:
        _rho, theta = entry[0]
        # convert to angle relative to horizontal (degrees)
        angle = (theta * 180.0 / np.pi) - 90.0
        # only keep nearly-horizontal lines (within ±15°)
        if -15.0 < angle < 15.0:
            near_horizontal.append(angle)

    return float(np.median(near_horizontal)) if near_horizontal else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _rotate(img: NDArrayU8, angle_deg: float) -> NDArrayU8:
|
||||||
|
if abs(angle_deg) < 0.1:
|
||||||
|
return img
|
||||||
|
h, w = img.shape[:2]
|
||||||
|
center = (w / 2, h / 2)
|
||||||
|
matrix = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
|
||||||
|
return cv2.warpAffine(
|
||||||
|
img,
|
||||||
|
matrix,
|
||||||
|
(w, h),
|
||||||
|
flags=cv2.INTER_CUBIC,
|
||||||
|
borderMode=cv2.BORDER_REPLICATE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess(img: NDArrayU8, cfg: PreprocessConfig | None = None) -> NDArrayU8:
    """Run preprocessing and return a clean BGR uint8 image suitable for OCR.

    Steps, in order: downscale to ``cfg.max_side``, optional deskew, optional
    denoise, optional adaptive threshold (converted back to 3-channel BGR).

    Args:
        img: BGR image to clean up.
        cfg: Preprocessing options; defaults to ``PreprocessConfig()``.

    Returns:
        The processed 3-channel image.
    """
    if cfg is None:
        cfg = PreprocessConfig()

    out = _resize_max_side(img, cfg.max_side)

    if cfg.deskew:
        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
        angle = _estimate_skew_angle(gray)
        # Rotate by +angle, not -angle. HoughLines measures theta in image
        # coordinates (y pointing down), so a positive estimate means the text
        # lines slope down to the right (clockwise skew). getRotationMatrix2D
        # treats positive angles as counter-clockwise, so +angle cancels the
        # skew whereas -angle would double it.
        out = _rotate(out, angle)

    if cfg.denoise:
        out = cv2.fastNlMeansDenoisingColored(out, None, 5, 5, 7, 21)

    if cfg.adaptive_threshold:
        gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
        binarized = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            blockSize=31,
            C=15,
        )
        # Back to 3 channels so the output shape is the same on every path.
        out = cv2.cvtColor(binarized, cv2.COLOR_GRAY2BGR)

    return out
|
||||||
0
src/ocr_sprint/py.typed
Normal file
0
src/ocr_sprint/py.typed
Normal file
27
src/ocr_sprint/schemas/__init__.py
Normal file
27
src/ocr_sprint/schemas/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
"""Pydantic schemas for input/output of the OCR Sprint service."""
|
||||||
|
|
||||||
|
from ocr_sprint.schemas.document import (
|
||||||
|
DocumentJob,
|
||||||
|
DocumentResponse,
|
||||||
|
DocumentStatus,
|
||||||
|
SourceKind,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.extraction import (
|
||||||
|
ExtractionResult,
|
||||||
|
HeaderFields,
|
||||||
|
ReviewFlag,
|
||||||
|
Signatory,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"DocumentJob",
|
||||||
|
"DocumentResponse",
|
||||||
|
"DocumentStatus",
|
||||||
|
"ExtractionResult",
|
||||||
|
"HeaderFields",
|
||||||
|
"PersonnelEntry",
|
||||||
|
"ReviewFlag",
|
||||||
|
"Signatory",
|
||||||
|
"SourceKind",
|
||||||
|
]
|
||||||
57
src/ocr_sprint/schemas/document.py
Normal file
57
src/ocr_sprint/schemas/document.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
"""Job-level schemas (request, response, status)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID, uuid4
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
from ocr_sprint.schemas.extraction import ExtractionResult
|
||||||
|
|
||||||
|
|
||||||
|
class SourceKind(str, Enum):
    """Coarse category of an uploaded payload; members are also plain strings."""

    PDF = "pdf"
    IMAGE = "image"
    # Payload whose type could not be determined.
    UNKNOWN = "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentStatus(str, Enum):
    """Lifecycle state of an OCR job; members are also plain strings."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    NEEDS_REVIEW = "needs_review"
    FAILED = "failed"
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentJob(BaseModel):
    """Internal record of one OCR job (Phase 1 keeps it in memory)."""

    # Enum fields keep their enum members rather than being replaced by raw values.
    model_config = ConfigDict(use_enum_values=False)

    job_id: UUID = Field(default_factory=uuid4)
    source_kind: SourceKind = SourceKind.UNKNOWN
    filename: str
    status: DocumentStatus = DocumentStatus.PENDING
    # Timestamps default to the current UTC time as naive datetimes.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
    error: str | None = None
    result: ExtractionResult | None = None
    debug: dict[str, Any] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentResponse(BaseModel):
    """Payload the documents API returns to clients."""

    job_id: UUID
    status: DocumentStatus
    confidence: float | None = Field(default=None)
    data: ExtractionResult | None = Field(default=None)
    # Review flags as plain strings.
    review_flags: list[str] = Field(default_factory=list)
    error: str | None = Field(default=None)
|
||||||
55
src/ocr_sprint/schemas/extraction.py
Normal file
55
src/ocr_sprint/schemas/extraction.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
"""Top-level extraction result schemas."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
|
||||||
|
class ReviewFlag(str, Enum):
    """Reason codes explaining why a document needs human review."""

    LOW_OCR_CONFIDENCE = "low_ocr_confidence"
    MISSING_FIELD = "missing_field"
    INVALID_NRP = "invalid_nrp"
    UNKNOWN_PANGKAT = "unknown_pangkat"
    PERSONNEL_COUNT_MISMATCH = "personnel_count_mismatch"
    DATE_PARSE_FAILED = "date_parse_failed"
|
||||||
|
|
||||||
|
|
||||||
|
class Signatory(BaseModel):
    """The signing official (Penandatangan) of the sprint; every field is optional."""

    nama: str | None = Field(default=None)
    pangkat: str | None = Field(default=None)
    nrp: str | None = Field(default=None)
    jabatan: str | None = Field(default=None)
|
||||||
|
|
||||||
|
|
||||||
|
class HeaderFields(BaseModel):
    """Fields read from the header portion of a sprint; all default to empty."""

    nomor_sprint: str | None = Field(default=None, description="e.g. Sprin/123/IV/2025/Reskrim.")
    tanggal: date | None = Field(default=None, description="Date the sprint was issued.")
    satuan_penerbit: str | None = Field(
        default=None,
        description="Issuing unit, e.g. 'Polres Bandung'.",
    )
    perihal: str | None = Field(default=None)
    dasar: list[str] = Field(default_factory=list, description="List of legal/operational basis.")
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractionResult(BaseModel):
    """Complete structured payload extracted from one sprint document."""

    header: HeaderFields = Field(default_factory=HeaderFields)
    personel: list[PersonnelEntry] = Field(default_factory=list)
    untuk: list[str] = Field(
        default_factory=list,
        description="Bulleted task descriptions in the 'Untuk' / 'Dikerjakan' section.",
    )
    ttd: Signatory = Field(default_factory=Signatory)
    raw_text: str = Field(default="", description="Concatenated OCR text for debugging.")
    # Score constrained to the closed interval [0, 1].
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    review_flags: list[ReviewFlag] = Field(default_factory=list)
|
||||||
18
src/ocr_sprint/schemas/personnel.py
Normal file
18
src/ocr_sprint/schemas/personnel.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
"""Schema for a single personnel row in a surat sprint."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class PersonnelEntry(BaseModel):
    """A single row of the personnel table; every column is optional."""

    no: int | None = Field(default=None, description="Row number as printed on the document.")
    pangkat: str | None = Field(default=None, description="Rank, normalized when possible.")
    nrp: str | None = Field(
        default=None,
        description="8-digit Polri NRP, or blank if not detected.",
    )
    nama: str | None = Field(default=None, description="Full name.")
    jabatan_dinas: str | None = Field(
        default=None,
        description="Permanent post (jabatan dalam dinas).",
    )
    jabatan_sprint: str | None = Field(default=None, description="Role within this sprint.")
    keterangan: str | None = Field(default=None)
    # Per-row score constrained to the closed interval [0, 1].
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
|
||||||
0
src/ocr_sprint/utils/__init__.py
Normal file
0
src/ocr_sprint/utils/__init__.py
Normal file
45
src/ocr_sprint/utils/logging.py
Normal file
45
src/ocr_sprint/utils/logging.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""Structured logging setup using structlog."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
|
||||||
|
def configure_logging(level: str = "INFO") -> None:
    """Configure stdlib logging and structlog to write console records to stdout.

    Records are rendered by ``structlog.dev.ConsoleRenderer`` (colors off),
    i.e. human-readable ``key=value`` lines rather than JSON.

    Args:
        level: Name of a stdlib logging level, case-insensitive. Anything that
            does not resolve to a numeric level falls back to ``INFO``.
    """
    # getattr alone is not enough: some uppercase attributes of the logging
    # module are not levels (e.g. BASIC_FORMAT is a str), so require an int.
    resolved = getattr(logging, level.upper(), None)
    log_level = resolved if isinstance(resolved, int) else logging.INFO

    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=log_level,
    )
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso", utc=True),
            structlog.processors.StackInfoRenderer(),
            structlog.processors.format_exc_info,
            structlog.dev.ConsoleRenderer(colors=False),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(log_level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_logger(name: str | None = None, **initial_values: Any) -> Any:
    """Return a structlog logger, pre-bound with *initial_values* when given.

    The return type is ``Any`` because structlog's BoundLogger generic typing
    is too restrictive in practice; callers treat it as a duck-typed logger.
    """
    base = structlog.get_logger(name)
    return base.bind(**initial_values) if initial_values else base
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
43
tests/conftest.py
Normal file
43
tests/conftest.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
"""Shared pytest fixtures."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def blank_bgr_image() -> np.ndarray:
    """An all-white uint8 BGR image, 600 rows by 800 columns, for preprocessing smoke tests."""
    shape = (600, 800, 3)
    return np.full(shape, 255, dtype=np.uint8)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_sprint_text() -> str:
    """Synthetic OCR text of a sprint letter, used by the regex extractor tests."""
    segments = (
        "KEPOLISIAN NEGARA REPUBLIK INDONESIA\n",
        "DAERAH JAWA BARAT\n",
        "RESOR BANDUNG\n",
        "\n",
        "SURAT PERINTAH\n",
        "Nomor : Sprin/123/IV/2025/Reskrim\n",
        "\n",
        "DASAR :\n",
        "1. Undang-Undang Nomor 2 Tahun 2002 tentang Kepolisian Negara Republik Indonesia.\n",
        "2. Peraturan Kapolri Nomor 6 Tahun 2017 tentang Susunan Organisasi.\n",
        "3. Laporan Polisi Nomor LP/123/IV/2025/Reskrim tanggal 20 April 2025.\n",
        "\n",
        "DIPERINTAHKAN :\n",
        "Kepada : 1. Nama anggota tersebut di bawah ini.\n",
        "\n",
        "Untuk : Melaksanakan penyelidikan tindak pidana.\n",
        "\n",
        "PERIHAL : Pelaksanaan penyelidikan kasus pencurian.\n",
        "\n",
        "Bandung, 21 April 2025\n",
        "KEPALA KEPOLISIAN RESOR BANDUNG\n",
        "\n",
        "Drs. BUDI SANTOSO\n",
        "AKBP NRP 12345678\n",
    )
    return "".join(segments)
|
||||||
0
tests/unit/__init__.py
Normal file
0
tests/unit/__init__.py
Normal file
87
tests/unit/test_api.py
Normal file
87
tests/unit/test_api.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
"""API tests with the OCR engine mocked.
|
||||||
|
|
||||||
|
These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
|
||||||
|
so we can exercise the FastAPI surface without the heavy ML init cost.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from ocr_sprint.main import create_app
|
||||||
|
from ocr_sprint.pipeline import orchestrator as orch_module
|
||||||
|
from ocr_sprint.pipeline.orchestrator import PipelineOutput
|
||||||
|
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
||||||
|
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client() -> TestClient:
    """Test client bound to a freshly created application instance."""
    app = create_app()
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def test_health_endpoint(client: TestClient) -> None:
    """The health route answers 200 with status "ok"."""
    reply = client.get("/api/v1/health")
    assert reply.status_code == 200
    payload = reply.json()
    assert payload["status"] == "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """A zero-byte upload is rejected with HTTP 400."""
    empty_upload = {"file": ("empty.pdf", b"", "application/pdf")}
    reply = client.post("/api/v1/documents", files=empty_upload)
    assert reply.status_code == 400
|
||||||
|
|
||||||
|
|
||||||
|
def test_documents_rejects_unknown_format(client: TestClient) -> None:
    """A payload with no recognized magic bytes is rejected with HTTP 400.

    Nothing is patched here, so the unused ``monkeypatch`` fixture is no
    longer requested.
    """
    response = client.post(
        "/api/v1/documents",
        files={"file": ("x.bin", b"random garbage bytes here", "application/octet-stream")},
    )
    assert response.status_code == 400
|
||||||
|
|
||||||
|
|
||||||
|
def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The documents route serializes whatever the (stubbed) pipeline returns."""
    stub_header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    stub_output = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.97,
        result=ExtractionResult(header=stub_header, confidence=0.97),
    )

    def _fake_run(_content: bytes) -> PipelineOutput:
        return stub_output

    # Replace the pipeline entry point on the orchestrator module ...
    monkeypatch.setattr(orch_module, "run_pipeline", _fake_run)
    from ocr_sprint.api.routes import documents as docs_module

    # ... and the symbol *imported into* the routes module.
    monkeypatch.setattr(docs_module, "run_pipeline", _fake_run)

    upload = {"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}
    reply = client.post("/api/v1/documents", files=upload)
    assert reply.status_code == 200
    payload = reply.json()
    assert payload["status"] == "completed"
    assert payload["confidence"] == 0.97
    assert payload["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"
|
||||||
46
tests/unit/test_confidence.py
Normal file
46
tests/unit/test_confidence.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""Tests for confidence scoring + routing."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||||
|
from ocr_sprint.schemas.document import DocumentStatus
|
||||||
|
from ocr_sprint.schemas.extraction import ReviewFlag
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_flags_returns_blend_of_ocr_only() -> None:
    """With no flags, 0.9 OCR confidence blends to 0.94."""
    # OCR weight 0.6 * 0.9 + validation 0.4 * 1.0 = 0.94
    expected = 0.94
    blended = compute_confidence(0.9, [])
    assert abs(blended - expected) < 1e-6
|
||||||
|
|
||||||
|
|
||||||
|
def test_flags_reduce_score() -> None:
    """A review flag lowers the score relative to the unflagged baseline."""
    unflagged = compute_confidence(0.9, [])
    flagged = compute_confidence(0.9, [ReviewFlag.MISSING_FIELD])
    assert flagged < unflagged
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_is_clamped() -> None:
    """Zero OCR confidence plus six different flags still yields a score in [0, 1]."""
    every_flag = [
        ReviewFlag.MISSING_FIELD,
        ReviewFlag.LOW_OCR_CONFIDENCE,
        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
        ReviewFlag.INVALID_NRP,
        ReviewFlag.UNKNOWN_PANGKAT,
        ReviewFlag.DATE_PARSE_FAILED,
    ]
    worst_case = compute_confidence(0.0, every_flag)
    assert 0.0 <= worst_case <= 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_route_high_confidence() -> None:
    """A score of 0.97 is routed straight to COMPLETED."""
    status = route(0.97)
    assert status == DocumentStatus.COMPLETED
|
||||||
|
|
||||||
|
|
||||||
|
def test_route_mid_goes_to_review() -> None:
    """A score of 0.88 is routed to human review."""
    status = route(0.88)
    assert status == DocumentStatus.NEEDS_REVIEW
|
||||||
|
|
||||||
|
|
||||||
|
def test_route_low_goes_to_review() -> None:
    """A score of 0.40 is routed to human review."""
    status = route(0.40)
    assert status == DocumentStatus.NEEDS_REVIEW
|
||||||
50
tests/unit/test_ingest.py
Normal file
50
tests/unit/test_ingest.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
"""Tests for source detection + image ingest."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.ingest import detect_source_kind, ingest_image
|
||||||
|
from ocr_sprint.schemas.document import SourceKind
|
||||||
|
|
||||||
|
|
||||||
|
def _png_bytes() -> bytes:
    """Encode a white RGB image, 100 px wide and 80 px high, as PNG bytes."""
    buffer = io.BytesIO()
    Image.new("RGB", (100, 80), color="white").save(buffer, format="PNG")
    return buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def _jpeg_bytes() -> bytes:
    """Encode a white RGB image, 100 px wide and 80 px high, as JPEG bytes."""
    buffer = io.BytesIO()
    Image.new("RGB", (100, 80), color="white").save(buffer, format="JPEG")
    return buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_pdf() -> None:
    """A payload starting with the PDF header is detected as PDF."""
    detected = detect_source_kind(b"%PDF-1.7\n...")
    assert detected == SourceKind.PDF
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_png() -> None:
    """Real PNG bytes are detected as an image."""
    detected = detect_source_kind(_png_bytes())
    assert detected == SourceKind.IMAGE
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_jpeg() -> None:
    """Real JPEG bytes are detected as an image."""
    detected = detect_source_kind(_jpeg_bytes())
    assert detected == SourceKind.IMAGE
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_unknown() -> None:
    """Bytes without a known signature are reported as UNKNOWN."""
    detected = detect_source_kind(b"garbage")
    assert detected == SourceKind.UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_image_returns_one_page() -> None:
    """A PNG ingests to exactly one page holding an 80x100x3 uint8 array."""
    pages = ingest_image(_png_bytes())
    assert len(pages) == 1
    only_page = pages[0]
    assert only_page.page_index == 0
    pixels = only_page.image
    assert isinstance(pixels, np.ndarray)
    assert pixels.dtype == np.uint8
    assert pixels.shape == (80, 100, 3)
|
||||||
37
tests/unit/test_preprocess.py
Normal file
37
tests/unit/test_preprocess.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
"""Smoke tests for the preprocessing pipeline."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.preprocess import PreprocessConfig, preprocess
|
||||||
|
|
||||||
|
|
||||||
|
def test_preprocess_returns_bgr_uint8(blank_bgr_image: np.ndarray) -> None:
    """Default preprocessing returns a three-channel uint8 array."""
    processed = preprocess(blank_bgr_image)
    assert processed.dtype == np.uint8
    assert processed.ndim == 3
    channel_count = processed.shape[2]
    assert channel_count == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_preprocess_resizes_to_max_side() -> None:
    """An oversized image is scaled so its longest side equals ``max_side``."""
    oversized = np.full((4000, 3000, 3), 255, dtype=np.uint8)
    config = PreprocessConfig(max_side=1000, denoise=False, deskew=False)

    resized = preprocess(oversized, config)

    height, width = resized.shape[:2]
    assert max(height, width) == 1000
|
||||||
|
|
||||||
|
|
||||||
|
def test_preprocess_does_not_upscale_small_images() -> None:
    """An image already below ``max_side`` keeps its original height and width."""
    tiny = np.full((400, 300, 3), 255, dtype=np.uint8)
    config = PreprocessConfig(max_side=2200, denoise=False, deskew=False)

    processed = preprocess(tiny, config)

    height_width = processed.shape[:2]
    assert height_width == (400, 300)
|
||||||
|
|
||||||
|
|
||||||
|
def test_adaptive_threshold_produces_binary_image() -> None:
    """Adaptive thresholding must leave only pure black/white pixel values.

    The noise image is drawn from a fixed-seed generator rather than the
    global NumPy random state, so the exact same input is used on every run
    and any failure can be reproduced.
    """
    rng = np.random.default_rng(seed=0)
    img = rng.integers(0, 256, size=(200, 200, 3), dtype=np.uint8)
    cfg = PreprocessConfig(denoise=False, deskew=False, adaptive_threshold=True)
    out = preprocess(img, cfg)
    # adaptive threshold should leave only 0s and 255s
    unique = np.unique(out)
    assert set(unique.tolist()).issubset({0, 255})
|
||||||
112
tests/unit/test_regex_rules.py
Normal file
112
tests/unit/test_regex_rules.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
"""Tests for regex-based header extraction."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.extract.regex_rules import (
|
||||||
|
extract_header,
|
||||||
|
find_dasar_list,
|
||||||
|
find_nomor_sprint,
|
||||||
|
find_perihal,
|
||||||
|
find_satuan,
|
||||||
|
find_signatory,
|
||||||
|
find_tanggal,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestNomorSprint:
    """Checks for ``find_nomor_sprint``."""

    @pytest.mark.parametrize(
        ("text", "needle"),
        [
            ("Nomor : Sprin/123/IV/2025/Reskrim", "123"),
            ("Nomor: SPRIN / 7 / I / 2024", "7"),
            ("...Sprin-345-X-2024-Sat Intelkam...", "345"),
        ],
    )
    def test_finds_nomor(self, text: str, needle: str) -> None:
        """The match contains the sequence number and starts with "SPRIN"."""
        nomor = find_nomor_sprint(text)
        assert nomor is not None
        assert needle in nomor
        upper_nomor = nomor.upper()
        assert upper_nomor.startswith("SPRIN")

    def test_returns_none_when_absent(self) -> None:
        """Text without a sprint number yields ``None``."""
        nomor = find_nomor_sprint("no nomor here, just some text")
        assert nomor is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestTanggal:
    """Checks for ``find_tanggal``."""

    def test_basic_date(self) -> None:
        """A "day month-name year" date is parsed into a ``date``."""
        parsed = find_tanggal("Bandung, 21 April 2025")
        assert parsed == date(2025, 4, 21)

    def test_with_dashes(self) -> None:
        """Dash separators between day, month and year are accepted."""
        parsed = find_tanggal("Tanggal 1 - Desember - 2024")
        assert parsed == date(2024, 12, 1)

    def test_invalid_month(self) -> None:
        """An unrecognised month name yields ``None``."""
        parsed = find_tanggal("21 Foo 2025")
        assert parsed is None

    def test_no_date_present(self) -> None:
        """Text without any date yields ``None``."""
        parsed = find_tanggal("nothing here")
        assert parsed is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestSatuan:
    """Checks for ``find_satuan``."""

    def test_polres(self) -> None:
        """A "KEPOLISIAN RESOR ..." line is picked up as the unit."""
        satuan = find_satuan("KEPOLISIAN RESOR BANDUNG\nLainnya")
        assert satuan is not None
        upper_satuan = satuan.upper()
        assert "RESOR BANDUNG" in upper_satuan

    def test_polri_pusat(self) -> None:
        """The national-police heading is recognised as a unit."""
        satuan = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
        assert satuan is not None
|
||||||
|
|
||||||
|
|
||||||
|
class TestPerihal:
    """Checks for ``find_perihal``."""

    def test_extracts_perihal_line(self) -> None:
        """Only the text after "PERIHAL :" on that line is returned."""
        text = "Other line\nPERIHAL : Pelaksanaan penyelidikan kasus.\nMore"
        perihal = find_perihal(text)
        assert perihal == "Pelaksanaan penyelidikan kasus."

    def test_returns_none_when_absent(self) -> None:
        """Text without a perihal line yields ``None``."""
        perihal = find_perihal("no perihal field")
        assert perihal is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestDasar:
    """Checks for ``find_dasar_list``."""

    def test_numbered_list(self) -> None:
        """Numbered items under "DASAR :" are returned without their numbers."""
        text = (
            "DASAR :\n"
            "1. UU No 2 Tahun 2002.\n"
            "2. Peraturan Kapolri Nomor 6.\n"
            "\n"
            "DIPERINTAHKAN :\n"
            "Kepada : ...\n"
        )
        entries = find_dasar_list(text)
        assert len(entries) == 2
        first, second = entries
        assert first.startswith("UU No 2")
        assert second.startswith("Peraturan Kapolri")

    def test_empty_when_section_missing(self) -> None:
        """Text without a DASAR section yields an empty list."""
        entries = find_dasar_list("no dasar section")
        assert entries == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestSignatory:
    """Checks for ``find_signatory``."""

    def test_extracts_last_nrp(self) -> None:
        """With several 8-digit numbers present, the final NRP is reported."""
        text = "Some 12345678 NRP earlier 87654321\nNRP. 11223344"
        assert find_signatory(text).nrp == "11223344"

    def test_no_nrp(self) -> None:
        """Text with no NRP number leaves ``nrp`` unset."""
        signatory = find_signatory("no NRP here")
        assert signatory.nrp is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractHeader:
    """End-to-end check for ``extract_header`` on the synthetic fixture."""

    def test_full_synthetic_doc(self, sample_sprint_text: str) -> None:
        """All header fields are populated from the sample document."""
        header = extract_header(sample_sprint_text)

        nomor = header.nomor_sprint
        assert nomor is not None
        assert "Sprin" in nomor

        assert header.tanggal == date(2025, 4, 21)

        satuan = header.satuan_penerbit
        assert satuan is not None
        assert "KEPOLISIAN" in satuan.upper()

        perihal = header.perihal
        assert perihal is not None
        assert "penyelidikan" in perihal.lower()

        assert len(header.dasar) == 3
|
||||||
108
tests/unit/test_validators.py
Normal file
108
tests/unit/test_validators.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
"""Tests for the validation layer."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ocr_sprint.data.master_pangkat import is_valid_pangkat, normalize_pangkat
|
||||||
|
from ocr_sprint.pipeline.extract.validators import (
|
||||||
|
validate_extraction,
|
||||||
|
validate_header,
|
||||||
|
validate_nrp,
|
||||||
|
validate_personnel_entry,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields, ReviewFlag
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
|
||||||
|
class TestNRP:
    """Checks for ``validate_nrp``."""

    @pytest.mark.parametrize("nrp", ["12345678", "00000001", "99999999"])
    def test_valid_8_digits(self, nrp: str) -> None:
        """Eight-digit numeric strings are accepted."""
        verdict = validate_nrp(nrp)
        assert verdict is True

    @pytest.mark.parametrize("nrp", ["1234567", "123456789", "abcdefgh", "", None])
    def test_invalid(self, nrp: str | None) -> None:
        """Wrong length, non-digit, empty and ``None`` inputs are rejected."""
        verdict = validate_nrp(nrp)
        assert verdict is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestPangkat:
    """Checks for ``normalize_pangkat`` and ``is_valid_pangkat``."""

    @pytest.mark.parametrize(
        ("input_str", "expected"),
        [
            ("AKP", "AKP"),
            ("akp", "AKP"),
            ("AKP.", "AKP"),
            ("AKBP", "AKBP"),
            ("Brigjen Pol", "BRIGJEN POL"),
            ("BRIGJEN", "BRIGJEN POL"),
            ("Kombespol", "KOMBES POL"),
            ("BRIPDA", "BRIPDA"),
        ],
    )
    def test_normalizes_known_ranks(self, input_str: str, expected: str) -> None:
        """Each spelling variant maps to the expected canonical rank string."""
        canonical = normalize_pangkat(input_str)
        assert canonical == expected

    def test_unknown_returns_none(self) -> None:
        """A rank outside the master list normalises to ``None`` and is invalid."""
        unknown_rank = "Sersan Mayor"
        assert normalize_pangkat(unknown_rank) is None
        assert is_valid_pangkat(unknown_rank) is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestPersonnelValidator:
    """Checks for ``validate_personnel_entry``."""

    def test_clean_entry_no_flags(self) -> None:
        """An entry with a known rank and 8-digit NRP raises no flags."""
        entry = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(entry)
        assert flags == []

    def test_invalid_nrp_flagged(self) -> None:
        """A too-short NRP produces the INVALID_NRP flag."""
        entry = PersonnelEntry(pangkat="AKP", nrp="123", nama="Test")
        flags = validate_personnel_entry(entry)
        assert ReviewFlag.INVALID_NRP in flags

    def test_unknown_pangkat_flagged(self) -> None:
        """An unrecognised rank produces the UNKNOWN_PANGKAT flag."""
        entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
        flags = validate_personnel_entry(entry)
        assert ReviewFlag.UNKNOWN_PANGKAT in flags
|
||||||
|
|
||||||
|
|
||||||
|
class TestHeaderValidator:
    """Checks for ``validate_header``."""

    def test_complete_header_no_flags(self) -> None:
        """A header with number, date and issuing unit raises no flags."""
        issued_on = date(2025, 1, 1)
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=issued_on,
            satuan_penerbit="POLRES BANDUNG",
        )
        flags = validate_header(header)
        assert flags == []

    def test_missing_nomor_flagged(self) -> None:
        """A header without a sprint number gets the MISSING_FIELD flag."""
        header = HeaderFields(tanggal=date(2025, 1, 1))
        flags = validate_header(header)
        assert ReviewFlag.MISSING_FIELD in flags

    def test_missing_date_flagged(self) -> None:
        """A header without a date gets the DATE_PARSE_FAILED flag."""
        header = HeaderFields(nomor_sprint="Sprin/1/I/2025")
        flags = validate_header(header)
        assert ReviewFlag.DATE_PARSE_FAILED in flags
|
||||||
|
|
||||||
|
|
||||||
|
class TestFullValidation:
    """Checks for ``validate_extraction`` over a whole extraction result."""

    def test_personnel_count_mismatch(self) -> None:
        """One extracted person against an expected two is flagged."""
        header = HeaderFields(
            nomor_sprint="Sprin/1/I/2025",
            tanggal=date(2025, 1, 1),
        )
        only_person = PersonnelEntry(pangkat="AKP", nrp="12345678", nama="A")
        result = ExtractionResult(header=header, personel=[only_person])

        flags = validate_extraction(result, expected_personnel_count=2)

        assert ReviewFlag.PERSONNEL_COUNT_MISMATCH in flags

    def test_flags_are_deduped(self) -> None:
        """Repeated problems across entries do not repeat a flag in the output."""
        bad_entries = [
            PersonnelEntry(nrp="123", pangkat="X"),
            PersonnelEntry(nrp="456", pangkat="Y"),
        ]
        result = ExtractionResult(
            header=HeaderFields(),  # missing both nomor and tanggal
            personel=bad_entries,
        )

        flags = validate_extraction(result)

        # each flag type should appear at most once
        distinct_flags = set(flags)
        assert len(flags) == len(distinct_flags)
|
||||||
Reference in New Issue
Block a user