feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing

2026-04-26 17:16:47 +07:00
parent dbcf480130
commit 002821ca07
20 changed files with 3326 additions and 20 deletions
--- a/defaults/inference.pdiparams
+++ b/defaults/inference.pdiparams
--- a/defaults/inference.pdiparams.info
+++ b/defaults/inference.pdiparams.info
--- a/defaults/inference.pdmodel
+++ b/defaults/inference.pdmodel
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -0,0 +1,18 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
+      "Bash(python -c \"import sys; print\\(sys.executable\\)\")",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit -x -q)",
+      "Bash(git stash *)",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_api.py::test_documents_sync_returns_pipeline_output -x -q)",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit --ignore=tests/unit/test_api.py -q)",
+      "Bash(.venv/Scripts/python.exe -c ' *)",
+      "Bash(xargs grep *)",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit -q --ignore=tests/unit/test_api.py --ignore=tests/unit/test_api_hitl.py --ignore=tests/unit/test_blob_storage.py)",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_ocr_layout.py tests/unit/test_personnel_text_fallback.py -q)",
+      "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py tests/unit/test_ocr_layout.py -q)"
+    ]
+  }
+}
--- a/13
+++ b/13
@@ -1,9 +1,10 @@
-.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
+.PHONY: help install dev update fmt lint typecheck test test-cov run docker-build docker-up docker-down clean

 help:
 	@echo "Targets:"
 	@echo "  install       - install runtime + dev deps in current env"
 	@echo "  dev           - run FastAPI app with autoreload"
+	@echo "  update        - git pull + install deps + migrate db + run dev server"
 	@echo "  fmt           - format code with ruff"
 	@echo "  lint          - lint with ruff"
 	@echo "  typecheck     - run mypy"
@@ -21,6 +22,16 @@ install:
 dev:
 	uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000

+update:
+	@echo "[1/4] Pulling latest code..."
+	git pull
+	@echo "[2/4] Installing/updating dependencies..."
+	pip install -e ".[dev]"
+	@echo "[3/4] Running database migrations..."
+	alembic upgrade head
+	@echo "[4/4] Starting dev server..."
+	uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
+
 fmt:
 	ruff format src tests
 	ruff check --fix src tests
--- a/docs/DEPLOYMENT-EXISTING-STACK.md
+++ b/docs/DEPLOYMENT-EXISTING-STACK.md
@@ -0,0 +1,858 @@
+# Deployment OCR Sprint Service (Existing Stack)
+
+Panduan deployment untuk server dengan Python 3.12.3, PostgreSQL 16.13, dan Redis 7.0.15 yang sudah terinstall.
+
+## Informasi Server Anda
+
+- **OS**: Ubuntu 24.04
+- **Python**: 3.12.3 ✅
+- **PostgreSQL**: 16.13 ✅
+- **Redis**: 7.0.15 ✅
+
+Semua versi sudah kompatibel dan optimal untuk OCR Sprint Service!
+
+## Langkah 1: Install System Libraries untuk OpenCV & PaddleOCR
+
+```bash
+# Update package list
+sudo apt update
+
+# Install libraries yang dibutuhkan oleh OpenCV dan PaddleOCR
+sudo apt install -y \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libgomp1 \
+    libmagic1 \
+    python3.12-venv \
+    python3.12-dev \
+    build-essential \
+    git
+```
+
+## Langkah 2: Setup PostgreSQL Database
+
+```bash
+# Login ke PostgreSQL
+sudo -u postgres psql
+```
+
+Jalankan SQL commands berikut:
+
+```sql
+-- Create user dan database
+CREATE USER ocr WITH PASSWORD 'your-password-here';
+CREATE DATABASE ocr_sprint OWNER ocr;
+
+-- Grant privileges
+GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
+
+-- Connect ke database untuk grant schema privileges
+\c ocr_sprint
+
+-- Grant schema privileges (PostgreSQL 15+)
+GRANT ALL ON SCHEMA public TO ocr;
+GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ocr;
+GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ocr;
+
+-- Verify
+\l ocr_sprint
+\du ocr
+
+-- Exit
+\q
+```
+
+**Generate password yang aman (gunakan hasilnya sebagai password pada perintah `CREATE USER` di atas):**
+
+```bash
+# Generate random password
+openssl rand -base64 32
+# Contoh output: <string-base64-acak-44-karakter>
+```
+
+Simpan password ini, akan digunakan di konfigurasi nanti.
+
+## Langkah 3: Verify Redis
+
+```bash
+# Check Redis status
+sudo systemctl status redis-server
+
+# Test connection
+redis-cli ping
+# Expected output: PONG
+
+# Check Redis config (opsional)
+redis-cli CONFIG GET maxmemory
+```
+
+Jika Redis belum running:
+
+```bash
+sudo systemctl enable redis-server
+sudo systemctl start redis-server
+```
+
+## Langkah 4: Create Application User
+
+```bash
+# Create dedicated user untuk aplikasi
+sudo useradd -m -s /bin/bash ocr
+
+# Create application directory
+sudo mkdir -p /opt/ocr-sprint-service
+sudo chown ocr:ocr /opt/ocr-sprint-service
+```
+
+## Langkah 5: Clone dan Install Application
+
+```bash
+# Switch ke user ocr
+sudo su - ocr
+
+# Clone repository
+cd /opt
+git clone https://github.com/Adriankf59/ocr-sprint-service.git
+cd ocr-sprint-service
+
+# Create virtual environment dengan Python 3.12
+python3.12 -m venv .venv
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Verify Python version di venv
+python --version
+# Expected: Python 3.12.3
+
+# Upgrade pip
+pip install --upgrade pip setuptools wheel
+
+# Install application dengan OCR dependencies
+# Ini akan download ~1.5GB PaddlePaddle wheels
+pip install -e ".[ocr]"
+
+# Verify installation
+python -c "import paddleocr; print('PaddleOCR OK')"
+python -c "import cv2; print('OpenCV OK')"
+python -c "import fastapi; print('FastAPI OK')"
+```
+
+## Langkah 6: Konfigurasi Application
+
+```bash
+# Masih sebagai user ocr
+cd /opt/ocr-sprint-service
+
+# Copy environment template
+cp .env.example .env
+
+# Edit konfigurasi
+nano .env
+```
+
+**Konfigurasi `/opt/ocr-sprint-service/.env`:**
+
+```bash
+# ==== App ====
+APP_ENV=prod
+APP_HOST=0.0.0.0
+APP_PORT=8000
+APP_LOG_LEVEL=INFO
+
+# ==== Storage ====
+STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
+BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
+BLOB_MAX_UPLOAD_MB=25
+
+# ==== OCR ====
+OCR_LANG=latin
+OCR_USE_GPU=false
+OCR_MAX_IMAGE_SIDE=2200
+
+# ==== Preprocessing ====
+PREPROCESS_TARGET_DPI=300
+PREPROCESS_DENOISE=true
+PREPROCESS_DESKEW=true
+PREPROCESS_DETECT_DOCUMENT=true
+PREPROCESS_REMOVE_SHADOW=true
+PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
+
+# ==== Table Extraction ====
+TABLES_ENABLED=true
+
+# ==== Confidence ====
+CONFIDENCE_AUTO_APPROVE=0.95
+CONFIDENCE_NEEDS_REVIEW=0.85
+
+# ==== LLM (Phase 5, optional - disable untuk sekarang) ====
+LLM_ENABLED=false
+
+# ==== Async Pipeline ====
+QUEUE_ENABLED=true
+REDIS_URL=redis://localhost:6379/0
+CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
+
+# ==== Database ====
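+# Ganti 'your-password-here' dengan password yang Anda generate di Langkah 2.
+# Karakter khusus pada password (mis. '/', '+', '=', '@') harus di-URL-encode di dalam DATABASE_URL.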
+DATABASE_URL=postgresql+psycopg://ocr:your-password-here@localhost:5432/ocr_sprint
+DATABASE_ECHO=false
+
+# ==== Auth (WAJIB untuk production!) ====
+# Generate dengan: openssl rand -hex 32
+API_KEYS=paste-api-key-1-here,paste-api-key-2-here
+API_KEY_HEADER=X-API-Key
+```
+
+**Generate API keys:**
+
+```bash
+# Generate 2 API keys
+echo "API Key 1: $(openssl rand -hex 32)"
+echo "API Key 2: $(openssl rand -hex 32)"
+```
+
+Copy output dan paste ke `API_KEYS` di file `.env`.
+
+**Create storage directories:**
+
+```bash
+mkdir -p /opt/ocr-sprint-service/storage/blobs
+chmod 755 /opt/ocr-sprint-service/storage
+```
+
+## Langkah 7: Run Database Migrations
+
+```bash
+# Masih sebagai user ocr, dengan venv activated
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+
+# Run migrations
+alembic upgrade head
+
+# Verify - should show current revision
+alembic current
+
+# Expected output: (head) atau revision number
+```
+
+## Langkah 8: Test Manual Run
+
+```bash
+# Masih sebagai user ocr
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+
+# Test API server
+uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
+```
+
+**Di terminal lain (sebagai user ubuntu):**
+
+```bash
+# Test health check
+curl http://localhost:8000/api/v1/health
+
+# Expected: {"status":"ok","version":"0.1.0"}
+
+# Test dengan sample file (jika ada)
+curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
+  -H "X-API-Key: your-api-key-here" \
+  -F "file=@/path/to/test.pdf"
+```
+
+Jika berhasil, stop server dengan `Ctrl+C`.
+
+## Langkah 9: Setup Systemd Services
+
+```bash
+# Exit dari user ocr
+exit
+
+# Kembali sebagai user ubuntu dengan sudo
+```
+
+### Create API Service
+
+```bash
+sudo nano /etc/systemd/system/ocr-sprint-api.service
+```
+
+**Content:**
+
+```ini
+[Unit]
+Description=OCR Sprint API Service
+After=network.target postgresql.service redis-server.service
+Wants=postgresql.service redis-server.service
+
+[Service]
+Type=simple
+User=ocr
+Group=ocr
+WorkingDirectory=/opt/ocr-sprint-service
+
+# Environment
+Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
+EnvironmentFile=/opt/ocr-sprint-service/.env
+
+# Start command - 4 workers untuk production
+ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
+    ocr_sprint.main:app \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --workers 4 \
+    --log-level info
+
+# Restart policy
+Restart=always
+RestartSec=10
+StartLimitInterval=0
+
+# Resource limits
+LimitNOFILE=65536
+
+# Security
+NoNewPrivileges=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target
+```
+
+### Create Celery Worker Service
+
+```bash
+sudo nano /etc/systemd/system/ocr-sprint-worker.service
+```
+
+**Content:**
+
+```ini
+[Unit]
+Description=OCR Sprint Celery Worker
+After=network.target postgresql.service redis-server.service ocr-sprint-api.service
+Wants=postgresql.service redis-server.service
+
+[Service]
+Type=simple
+User=ocr
+Group=ocr
+WorkingDirectory=/opt/ocr-sprint-service
+
+# Environment
+Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
+EnvironmentFile=/opt/ocr-sprint-service/.env
+
+# Start command - concurrency 2 untuk CPU dengan 4 cores
+# Sesuaikan dengan jumlah CPU cores server Anda
+ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
+    -A ocr_sprint.worker.celery_app \
+    worker \
+    --loglevel=info \
+    --concurrency=2 \
+    --max-tasks-per-child=100
+
+# Restart policy
+Restart=always
+RestartSec=10
+StartLimitInterval=0
+
+# Resource limits
+LimitNOFILE=65536
+
+# Security
+NoNewPrivileges=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target
+```
+
+### Enable dan Start Services
+
+```bash
+# Reload systemd
+sudo systemctl daemon-reload
+
+# Enable services (auto-start on boot)
+sudo systemctl enable ocr-sprint-api
+sudo systemctl enable ocr-sprint-worker
+
+# Start services
+sudo systemctl start ocr-sprint-api
+sudo systemctl start ocr-sprint-worker
+
+# Check status
+sudo systemctl status ocr-sprint-api
+sudo systemctl status ocr-sprint-worker
+```
+
+**Expected output:** `active (running)` dengan warna hijau.
+
+### View Logs
+
+```bash
+# API logs (real-time)
+sudo journalctl -u ocr-sprint-api -f
+
+# Worker logs (real-time)
+sudo journalctl -u ocr-sprint-worker -f
+
+# Last 50 lines
+sudo journalctl -u ocr-sprint-api -n 50
+sudo journalctl -u ocr-sprint-worker -n 50
+```
+
+## Langkah 10: Install dan Setup Nginx
+
+```bash
+# Install Nginx dan Certbot
+sudo apt install -y nginx certbot python3-certbot-nginx
+
+# Check Nginx status
+sudo systemctl status nginx
+```
+
+### Create Nginx Configuration
+
+```bash
+sudo nano /etc/nginx/sites-available/ocr-sprint
+```
+
+**Content (ganti `ocr.yourdomain.com` dengan domain Anda):**
+
+```nginx
+# Upstream
+upstream ocr_api {
+    server 127.0.0.1:8000;
+    keepalive 32;
+}
+
+# Rate limiting
+limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
+
+server {
+    listen 80;
+    server_name ocr.yourdomain.com;
+
+    # Max upload size
+    client_max_body_size 30M;
+    client_body_buffer_size 128k;
+
+    # Timeouts
+    proxy_connect_timeout 300s;
+    proxy_send_timeout 300s;
+    proxy_read_timeout 300s;
+    send_timeout 300s;
+
+    # Logging
+    access_log /var/log/nginx/ocr-sprint-access.log;
+    error_log /var/log/nginx/ocr-sprint-error.log;
+
+    # API endpoints
+    location /api/ {
+        limit_req zone=api_limit burst=20 nodelay;
+
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection "";
+        
+        proxy_buffering off;
+    }
+
+    # Health check
+    location /api/v1/health {
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        access_log off;
+    }
+
+    # Metrics (restrict access)
+    location /metrics {
+        allow 127.0.0.1;
+        allow 10.0.0.0/8;
+        deny all;
+
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
+    # API docs
+    location /docs {
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
+    location /redoc {
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+}
+```
+
+### Enable Site
+
+```bash
+# Test konfigurasi
+sudo nginx -t
+
+# Enable site
+sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
+
+# Reload Nginx
+sudo systemctl reload nginx
+```
+
+### Setup SSL (jika punya domain)
+
+```bash
+# Obtain certificate
+sudo certbot --nginx -d ocr.yourdomain.com
+
+# Test auto-renewal
+sudo certbot renew --dry-run
+```
+
+## Langkah 11: Setup Firewall
+
+```bash
+# Check UFW status
+sudo ufw status
+
+# Allow SSH (PENTING!)
+sudo ufw allow 22/tcp
+
+# Allow HTTP dan HTTPS
+sudo ufw allow 80/tcp
+sudo ufw allow 443/tcp
+
+# Enable firewall (jika belum)
+sudo ufw enable
+
+# Verify
+sudo ufw status numbered
+```
+
+## Langkah 12: Verifikasi Final
+
+### Test dari Server
+
+```bash
+# Health check
+curl http://localhost:8000/api/v1/health
+
+# Test async endpoint
+curl -X POST http://localhost:8000/api/v1/documents \
+  -H "X-API-Key: your-api-key-here" \
+  -F "file=@/path/to/test.pdf"
+
+# Expected: {"job_id":"...","status":"pending",...}
+
+# Check job status
+curl -H "X-API-Key: your-api-key-here" \
+  http://localhost:8000/api/v1/documents/JOB_ID_HERE
+```
+
+### Test via Domain (jika sudah setup SSL)
+
+```bash
+curl https://ocr.yourdomain.com/api/v1/health
+```
+
+### Check Services
+
+```bash
+# All services should be active
+sudo systemctl status ocr-sprint-api
+sudo systemctl status ocr-sprint-worker
+sudo systemctl status postgresql
+sudo systemctl status redis-server
+sudo systemctl status nginx
+```
+
+## Monitoring
+
+### View Logs
+
+```bash
+# API logs
+sudo journalctl -u ocr-sprint-api -f
+
+# Worker logs
+sudo journalctl -u ocr-sprint-worker -f
+
+# Nginx access logs
+sudo tail -f /var/log/nginx/ocr-sprint-access.log
+
+# Nginx error logs
+sudo tail -f /var/log/nginx/ocr-sprint-error.log
+```
+
+### Prometheus Metrics
+
+```bash
+# View metrics
+curl http://localhost:8000/metrics
+
+# Key metrics:
+# - ocr_documents_total
+# - ocr_processing_duration_seconds
+# - ocr_confidence_score
+```
+
+## Maintenance
+
+### Restart Services
+
+```bash
+sudo systemctl restart ocr-sprint-api
+sudo systemctl restart ocr-sprint-worker
+```
+
+### Update Application
+
+```bash
+# Switch ke user ocr
+sudo su - ocr
+cd /opt/ocr-sprint-service
+
+# Pull latest code
+git pull
+
+# Activate venv
+source .venv/bin/activate
+
+# Update dependencies
+pip install -e ".[ocr]"
+
+# Run migrations
+alembic upgrade head
+
+# Exit
+exit
+
+# Restart services
+sudo systemctl restart ocr-sprint-api
+sudo systemctl restart ocr-sprint-worker
+
+# Check logs
+sudo journalctl -u ocr-sprint-api -n 50
+```
+
+### Database Backup
+
+```bash
+# Create backup directory
+sudo mkdir -p /opt/ocr-sprint-service/backups
+sudo chown ocr:ocr /opt/ocr-sprint-service/backups
+
+# Manual backup
+sudo -u ocr pg_dump -h localhost -U ocr ocr_sprint | gzip > /opt/ocr-sprint-service/backups/backup_$(date +%Y%m%d_%H%M%S).sql.gz
+```
+
+**Setup automated backup:**
+
+```bash
+# Create backup script
+sudo nano /opt/ocr-sprint-service/backup.sh
+```
+
+```bash
+#!/bin/bash
+BACKUP_DIR="/opt/ocr-sprint-service/backups"
+DATE=$(date +%Y%m%d_%H%M%S)
+
+mkdir -p $BACKUP_DIR
+
+# Backup database
+PGPASSWORD='your-db-password' pg_dump -h localhost -U ocr ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz
+
+# Keep only last 7 days
+find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
+
+echo "Backup completed: $DATE"
+```
+
+```bash
+# Make executable
+sudo chmod +x /opt/ocr-sprint-service/backup.sh
+sudo chown ocr:ocr /opt/ocr-sprint-service/backup.sh
+
+# Setup cron (daily at 2 AM)
+sudo crontab -e -u ocr
+
+# Add line:
+0 2 * * * /opt/ocr-sprint-service/backup.sh >> /opt/ocr-sprint-service/backups/backup.log 2>&1
+```
+
+## Troubleshooting
+
+### Service tidak start
+
+```bash
+# Check detailed logs
+sudo journalctl -u ocr-sprint-api -n 100 --no-pager
+sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
+
+# Check file permissions
+ls -la /opt/ocr-sprint-service
+ls -la /opt/ocr-sprint-service/storage
+
+# Test manual run
+sudo su - ocr
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
+```
+
+### Database connection error
+
+```bash
+# Test connection
+sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
+
+# Check PostgreSQL status
+sudo systemctl status postgresql
+
+# Check PostgreSQL logs
+sudo journalctl -u postgresql -n 50
+```
+
+### Redis connection error
+
+```bash
+# Test Redis
+redis-cli ping
+
+# Check Redis status
+sudo systemctl status redis-server
+
+# Check Redis logs
+sudo journalctl -u redis-server -n 50
+```
+
+### Worker tidak memproses jobs
+
+```bash
+# Check Celery worker status
+sudo su - ocr
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+celery -A ocr_sprint.worker.celery_app inspect active
+celery -A ocr_sprint.worker.celery_app inspect stats
+
+# Check Redis queue
+redis-cli LLEN ocr_sprint
+```
+
+### PaddleOCR error
+
+```bash
+# Re-download models
+sudo su - ocr
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+
+python << EOF
+from paddleocr import PaddleOCR
+ocr = PaddleOCR(use_angle_cls=True, lang='latin')
+print("Models downloaded successfully")
+EOF
+```
+
+## Performance Tuning
+
+### Check CPU cores
+
+```bash
+nproc
+```
+
+### Adjust worker concurrency
+
+```bash
+# Edit worker service
+sudo nano /etc/systemd/system/ocr-sprint-worker.service
+
+# Untuk 4 cores: --concurrency=2
+# Untuk 8 cores: --concurrency=4
+# Untuk 16 cores: --concurrency=8
+
+# Reload dan restart
+sudo systemctl daemon-reload
+sudo systemctl restart ocr-sprint-worker
+```
+
+### PostgreSQL 16 Tuning
+
+```bash
+sudo nano /etc/postgresql/16/main/postgresql.conf
+```
+
+**Recommended settings (sesuaikan dengan RAM server):**
+
+```
+# Untuk 8GB RAM:
+shared_buffers = 2GB
+effective_cache_size = 6GB
+maintenance_work_mem = 512MB
+work_mem = 8MB
+
+# Untuk 16GB RAM:
+shared_buffers = 4GB
+effective_cache_size = 12GB
+maintenance_work_mem = 1GB
+work_mem = 10MB
+
+# General
+checkpoint_completion_target = 0.9
+wal_buffers = 16MB
+default_statistics_target = 100
+random_page_cost = 1.1
+effective_io_concurrency = 200
+max_worker_processes = 4
+max_parallel_workers_per_gather = 2
+max_parallel_workers = 4
+```
+
+```bash
+sudo systemctl restart postgresql
+```
+
+## Security Checklist
+
+- [ ] API keys set dengan nilai random yang kuat
+- [ ] Database password diganti dari default
+- [ ] Firewall enabled (UFW)
+- [ ] SSL/TLS enabled (jika punya domain)
+- [ ] `/metrics` endpoint restricted
+- [ ] PostgreSQL hanya listen di localhost
+- [ ] Redis hanya listen di localhost
+- [ ] Backup automated (cron job)
+- [ ] OS security updates enabled
+
+## Next Steps
+
+1. **Setup monitoring** - Install Prometheus + Grafana (opsional)
+2. **Setup alerting** - Email/Slack notification untuk errors
+3. **Load testing** - Test dengan volume dokumen production
+4. **Backup verification** - Test restore dari backup
+5. **Documentation** - Dokumentasi API keys untuk tim
+
+## Support
+
+Untuk pertanyaan atau issues, hubungi tim development.
--- a/docs/DEPLOYMENT-MANUAL.md
+++ b/docs/DEPLOYMENT-MANUAL.md
@@ -0,0 +1,943 @@
+# Deployment Manual OCR Sprint Service (Tanpa Docker)
+
+Panduan lengkap deployment OCR Sprint Service langsung di server tanpa menggunakan Docker.
+
+## Prasyarat Server
+
+### Spesifikasi Minimum
+- **OS**: Ubuntu 20.04+ / Debian 11+ / RHEL 8+
+- **CPU**: 4 cores (8 cores recommended)
+- **RAM**: 8 GB minimum (16 GB recommended)
+- **Storage**: 50 GB free space
+- **User**: Non-root user dengan sudo access
+
+### Port yang Dibutuhkan
+- `8000`: API server (internal, akan di-proxy oleh Nginx)
+- `80/443`: HTTP/HTTPS (Nginx)
+- `5432`: PostgreSQL (localhost only)
+- `6379`: Redis (localhost only)
+
+## Langkah 1: Install System Dependencies
+
+### Ubuntu/Debian
+
+```bash
+# Update system
+sudo apt update && sudo apt upgrade -y
+
+# Install Python 3.11
+sudo apt install -y software-properties-common
+sudo add-apt-repository ppa:deadsnakes/ppa -y
+sudo apt update
+sudo apt install -y python3.11 python3.11-venv python3.11-dev python3-pip
+
+# Install system libraries untuk OpenCV dan PaddleOCR
+sudo apt install -y \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libgomp1 \
+    libmagic1 \
+    build-essential \
+    git \
+    curl \
+    wget
+
+# Install Redis
+sudo apt install -y redis-server
+sudo systemctl enable redis-server
+sudo systemctl start redis-server
+
+# Install PostgreSQL
+sudo apt install -y postgresql postgresql-contrib
+sudo systemctl enable postgresql
+sudo systemctl start postgresql
+```
+
+### RHEL/CentOS/Rocky Linux
+
+```bash
+# Update system
+sudo dnf update -y
+
+# Install Python 3.11
+sudo dnf install -y python3.11 python3.11-devel python3.11-pip
+
+# Install system libraries
+sudo dnf install -y \
+    mesa-libGL \
+    glib2 \
+    libSM \
+    libXext \
+    libXrender \
+    file-libs \
+    gcc \
+    gcc-c++ \
+    make \
+    git
+
+# Install Redis
+sudo dnf install -y redis
+sudo systemctl enable redis
+sudo systemctl start redis
+
+# Install PostgreSQL
+sudo dnf install -y postgresql-server postgresql-contrib
+sudo postgresql-setup --initdb
+sudo systemctl enable postgresql
+sudo systemctl start postgresql
+```
+
+## Langkah 2: Setup Database PostgreSQL
+
+```bash
+# Masuk sebagai postgres user
+sudo -u postgres psql
+
+# Jalankan SQL commands berikut:
+```
+
+```sql
+-- Create user dan database
+CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat';
+CREATE DATABASE ocr_sprint OWNER ocr;
+
+-- Grant privileges
+GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
+
+-- Connect ke database
+\c ocr_sprint
+
+-- Grant schema privileges (PostgreSQL 15+)
+GRANT ALL ON SCHEMA public TO ocr;
+
+-- Exit
+\q
+```
+
+**Konfigurasi autentikasi PostgreSQL (opsional, tetap hanya listen di localhost):**
+
+```bash
+# Edit postgresql.conf (ganti "14" pada path dengan versi PostgreSQL yang terinstall)
+sudo nano /etc/postgresql/14/main/postgresql.conf
+
+# Uncomment dan ubah:
+listen_addresses = 'localhost'  # Tetap localhost untuk keamanan
+
+# Edit pg_hba.conf
+sudo nano /etc/postgresql/14/main/pg_hba.conf
+
+# Tambahkan line:
+local   ocr_sprint      ocr                                     scram-sha-256
+
+# Restart PostgreSQL
+sudo systemctl restart postgresql
+```
+
+## Langkah 3: Setup Application User
+
+```bash
+# Create dedicated user untuk aplikasi
+sudo useradd -m -s /bin/bash ocr
+sudo usermod -aG sudo ocr  # Opsional, untuk maintenance
+
+# Create application directory
+sudo mkdir -p /opt/ocr-sprint-service
+sudo chown ocr:ocr /opt/ocr-sprint-service
+
+# Switch ke user ocr
+sudo su - ocr
+```
+
+## Langkah 4: Install Application
+
+```bash
+# Clone repository
+cd /opt
+git clone https://github.com/Adriankf59/ocr-sprint-service.git
+cd ocr-sprint-service
+
+# Create virtual environment
+python3.11 -m venv .venv
+
+# Activate virtual environment
+source .venv/bin/activate
+
+# Upgrade pip
+pip install --upgrade pip setuptools wheel
+
+# Install application dengan OCR dependencies
+pip install -e ".[ocr]"
+
+# Verify installation
+python -c "import paddleocr; print('PaddleOCR installed successfully')"
+```
+
+## Langkah 5: Konfigurasi Application
+
+```bash
+# Copy environment template
+cp .env.example .env
+
+# Edit konfigurasi
+nano .env
+```
+
+**Konfigurasi production (`/opt/ocr-sprint-service/.env`):**
+
+```bash
+# ==== App ====
+APP_ENV=prod
+APP_HOST=0.0.0.0
+APP_PORT=8000
+APP_LOG_LEVEL=INFO
+
+# ==== Storage ====
+STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
+BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
+BLOB_MAX_UPLOAD_MB=25
+
+# ==== OCR ====
+OCR_LANG=latin
+OCR_USE_GPU=false
+OCR_MAX_IMAGE_SIDE=2200
+
+# ==== Preprocessing ====
+PREPROCESS_TARGET_DPI=300
+PREPROCESS_DENOISE=true
+PREPROCESS_DESKEW=true
+PREPROCESS_DETECT_DOCUMENT=true
+PREPROCESS_REMOVE_SHADOW=true
+PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
+
+# ==== Table Extraction ====
+TABLES_ENABLED=true
+
+# ==== Confidence ====
+CONFIDENCE_AUTO_APPROVE=0.95
+CONFIDENCE_NEEDS_REVIEW=0.85
+
+# ==== LLM (Phase 5, optional) ====
+LLM_ENABLED=false
+
+# ==== Async Pipeline ====
+QUEUE_ENABLED=true
+REDIS_URL=redis://localhost:6379/0
+CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
+
+# ==== Database ====
+DATABASE_URL=postgresql+psycopg://ocr:ganti-dengan-password-kuat@localhost:5432/ocr_sprint
+DATABASE_ECHO=false
+
+# ==== Auth (WAJIB!) ====
+API_KEYS=key1-ganti-dengan-random-string,key2-ganti-dengan-random-string
+API_KEY_HEADER=X-API-Key
+```
+
+**Generate secure API keys:**
+
+```bash
+# Generate 2 API keys
+openssl rand -hex 32
+openssl rand -hex 32
+```
+
+**Create storage directories:**
+
+```bash
+mkdir -p /opt/ocr-sprint-service/storage/blobs
+chmod 755 /opt/ocr-sprint-service/storage
+```
+
+## Langkah 6: Run Database Migrations
+
+```bash
+# Masih sebagai user ocr, dengan venv activated
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+
+# Run migrations
+alembic upgrade head
+
+# Verify
+alembic current
+```
+
+## Langkah 7: Test Manual Run
+
+```bash
+# Test API server
+uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
+
+# Di terminal lain, test health check
+curl http://localhost:8000/api/v1/health
+
+# Jika berhasil, stop dengan Ctrl+C
+```
+
+## Langkah 8: Setup Systemd Services
+
+### API Service
+
+```bash
+# Exit dari user ocr, kembali ke user dengan sudo
+exit
+
+# Create systemd service file
+sudo nano /etc/systemd/system/ocr-sprint-api.service
+```
+
+**Content `/etc/systemd/system/ocr-sprint-api.service`:**
+
+```ini
+[Unit]
+Description=OCR Sprint API Service
+After=network.target postgresql.service redis.service
+Wants=postgresql.service redis.service
+
+[Service]
+Type=simple
+User=ocr
+Group=ocr
+WorkingDirectory=/opt/ocr-sprint-service
+
+# Environment
+Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
+EnvironmentFile=/opt/ocr-sprint-service/.env
+
+# Start command - 4 workers untuk production
+ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
+    ocr_sprint.main:app \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --workers 4 \
+    --log-level info
+
+# Restart policy
+Restart=always
+RestartSec=10
+StartLimitInterval=0
+
+# Resource limits
+LimitNOFILE=65536
+MemoryLimit=6G
+
+# Security
+NoNewPrivileges=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target
+```
+
+### Celery Worker Service
+
+```bash
+sudo nano /etc/systemd/system/ocr-sprint-worker.service
+```
+
+**Content `/etc/systemd/system/ocr-sprint-worker.service`:**
+
+```ini
+[Unit]
+Description=OCR Sprint Celery Worker
+After=network.target postgresql.service redis.service ocr-sprint-api.service
+Wants=postgresql.service redis.service
+
+[Service]
+Type=simple
+User=ocr
+Group=ocr
+WorkingDirectory=/opt/ocr-sprint-service
+
+# Environment
+Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
+EnvironmentFile=/opt/ocr-sprint-service/.env
+
+# Start command - concurrency 2 untuk 4 core CPU
+ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
+    -A ocr_sprint.worker.celery_app \
+    worker \
+    --loglevel=info \
+    --concurrency=2 \
+    --max-tasks-per-child=100
+
+# Restart policy
+Restart=always
+RestartSec=10
+StartLimitInterval=0
+
+# Resource limits
+LimitNOFILE=65536
+MemoryLimit=4G
+
+# Security
+NoNewPrivileges=true
+PrivateTmp=true
+
+[Install]
+WantedBy=multi-user.target
+```
+
+### Enable dan Start Services
+
+```bash
+# Reload systemd
+sudo systemctl daemon-reload
+
+# Enable services (auto-start on boot)
+sudo systemctl enable ocr-sprint-api
+sudo systemctl enable ocr-sprint-worker
+
+# Start services
+sudo systemctl start ocr-sprint-api
+sudo systemctl start ocr-sprint-worker
+
+# Check status
+sudo systemctl status ocr-sprint-api
+sudo systemctl status ocr-sprint-worker
+
+# View logs
+sudo journalctl -u ocr-sprint-api -f
+sudo journalctl -u ocr-sprint-worker -f
+```
+
+## Langkah 9: Setup Nginx Reverse Proxy
+
+### Install Nginx
+
+```bash
+sudo apt install -y nginx certbot python3-certbot-nginx
+```
+
+### Konfigurasi Nginx
+
+```bash
+sudo nano /etc/nginx/sites-available/ocr-sprint
+```
+
+**Content `/etc/nginx/sites-available/ocr-sprint`:**
+
+```nginx
+# Upstream untuk load balancing (jika scale horizontal)
+upstream ocr_api {
+    server 127.0.0.1:8000;
+    keepalive 32;
+}
+
+# Rate limiting
+limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
+
+server {
+    listen 80;
+    server_name ocr.yourdomain.com;  # Ganti dengan domain Anda
+
+    # Max upload size (sesuaikan dengan BLOB_MAX_UPLOAD_MB)
+    client_max_body_size 30M;
+    client_body_buffer_size 128k;
+
+    # Timeouts untuk dokumen besar
+    proxy_connect_timeout 300s;
+    proxy_send_timeout 300s;
+    proxy_read_timeout 300s;
+    send_timeout 300s;
+
+    # Logging
+    access_log /var/log/nginx/ocr-sprint-access.log;
+    error_log /var/log/nginx/ocr-sprint-error.log;
+
+    # API endpoints
+    location /api/ {
+        # Rate limiting
+        limit_req zone=api_limit burst=20 nodelay;
+
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        
+        # Headers
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection "";
+        
+        # Disable buffering untuk streaming responses
+        proxy_buffering off;
+    }
+
+    # Health check endpoint (no rate limit)
+    location /api/v1/health {
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        access_log off;
+    }
+
+    # Metrics endpoint (restrict access)
+    location /metrics {
+        # Allow only from internal network
+        allow 10.0.0.0/8;
+        allow 172.16.0.0/12;
+        allow 192.168.0.0/16;
+        allow 127.0.0.1;
+        deny all;
+
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
+    # Docs (opsional, bisa di-disable di production)
+    location /docs {
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+
+    location /redoc {
+        proxy_pass http://ocr_api;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+    }
+}
+```
+
+### Enable Site
+
+```bash
+# Enable site
+sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
+
+# Remove default site (opsional)
+sudo rm /etc/nginx/sites-enabled/default
+
+# Test konfigurasi (setelah site di-enable, agar file baru ikut divalidasi)
+sudo nginx -t
+
+# Reload Nginx
+sudo systemctl reload nginx
+```
+
+### Setup SSL dengan Let's Encrypt
+
+```bash
+# Install certbot
+sudo apt install -y certbot python3-certbot-nginx
+
+# Obtain certificate (ganti dengan domain Anda)
+sudo certbot --nginx -d ocr.yourdomain.com
+
+# Test auto-renewal
+sudo certbot renew --dry-run
+```
+
+Certbot akan otomatis mengupdate konfigurasi Nginx untuk HTTPS.
+
+## Langkah 10: Setup Firewall
+
+```bash
+# Install UFW (jika belum ada)
+sudo apt install -y ufw
+
+# Allow SSH (PENTING! Jangan sampai terkunci)
+sudo ufw allow 22/tcp
+
+# Allow HTTP dan HTTPS
+sudo ufw allow 80/tcp
+sudo ufw allow 443/tcp
+
+# Enable firewall
+sudo ufw enable
+
+# Check status
+sudo ufw status
+```
+
+## Langkah 11: Verifikasi Deployment
+
+### Test dari Server
+
+```bash
+# Health check
+curl http://localhost:8000/api/v1/health
+
+# Test dengan API key
+curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
+  -H "X-API-Key: your-api-key-here" \
+  -F "file=@/path/to/test.pdf"
+```
+
+### Test dari Client
+
+```bash
+# Health check via domain
+curl https://ocr.yourdomain.com/api/v1/health
+
+# Upload dokumen
+curl -X POST https://ocr.yourdomain.com/api/v1/documents \
+  -H "X-API-Key: your-api-key-here" \
+  -F "file=@document.pdf"
+```
+
+## Monitoring dan Maintenance
+
+### View Logs
+
+```bash
+# API logs
+sudo journalctl -u ocr-sprint-api -f
+
+# Worker logs
+sudo journalctl -u ocr-sprint-worker -f
+
+# Nginx logs
+sudo tail -f /var/log/nginx/ocr-sprint-access.log
+sudo tail -f /var/log/nginx/ocr-sprint-error.log
+
+# PostgreSQL logs
+sudo tail -f /var/log/postgresql/postgresql-14-main.log
+```
+
+### Service Management
+
+```bash
+# Restart services
+sudo systemctl restart ocr-sprint-api
+sudo systemctl restart ocr-sprint-worker
+
+# Stop services
+sudo systemctl stop ocr-sprint-api
+sudo systemctl stop ocr-sprint-worker
+
+# Check status
+sudo systemctl status ocr-sprint-api
+sudo systemctl status ocr-sprint-worker
+```
+
+### Database Backup
+
+```bash
+# Create backup script
+sudo nano /opt/ocr-sprint-service/backup.sh
+```
+
+**Content `backup.sh`:**
+
+```bash
+#!/bin/bash
+# Daily PostgreSQL backup for OCR Sprint Service.
+#
+# Writes a gzip-compressed dump of the ocr_sprint database into BACKUP_DIR
+# and prunes dumps older than 7 days.
+#
+# pg_dump is given no password here, so for unattended (cron) runs the
+# credentials must come from the invoking user's ~/.pgpass or PGPASSWORD.
+
+# Abort on any error, unset variable, or failure inside a pipeline, so a
+# failed pg_dump is never reported as a completed backup and never reaches
+# the pruning step below.
+set -euo pipefail
+
+BACKUP_DIR="/opt/ocr-sprint-service/backups"
+DATE=$(date +%Y%m%d_%H%M%S)
+DUMP_FILE="$BACKUP_DIR/db_$DATE.sql.gz"
+
+mkdir -p "$BACKUP_DIR"
+
+# Remove the partial dump if the script exits before the final rename.
+trap 'rm -f -- "$DUMP_FILE.tmp"' EXIT
+
+# Backup database. Dump to a temporary name first and rename only on
+# success, so a failed run never leaves a truncated db_*.sql.gz behind.
+pg_dump -U ocr -h localhost ocr_sprint | gzip > "$DUMP_FILE.tmp"
+mv -- "$DUMP_FILE.tmp" "$DUMP_FILE"
+
+# Backup blobs (optional, can be large)
+# tar -czf "$BACKUP_DIR/blobs_$DATE.tar.gz" /opt/ocr-sprint-service/storage/blobs
+
+# Keep only last 7 days
+find "$BACKUP_DIR" -name "db_*.sql.gz" -mtime +7 -delete
+
+echo "Backup completed: $DATE"
+```
+
+```bash
+# Make executable
+chmod +x /opt/ocr-sprint-service/backup.sh
+
+# Setup cron job (daily at 2 AM)
+sudo crontab -e
+
+# Add line:
+0 2 * * * /opt/ocr-sprint-service/backup.sh >> /var/log/ocr-backup.log 2>&1
+```
+
+### Log Rotation
+
+```bash
+sudo nano /etc/logrotate.d/ocr-sprint
+```
+
+**Content:**
+
+```
+/var/log/nginx/ocr-sprint-*.log {
+    daily
+    rotate 14
+    compress
+    delaycompress
+    notifempty
+    create 0640 www-data adm
+    sharedscripts
+    postrotate
+        [ -f /var/run/nginx.pid ] && kill -USR1 `cat /var/run/nginx.pid`
+    endscript
+}
+```
+
+## Update Application
+
+```bash
+# Switch ke user ocr
+sudo su - ocr
+cd /opt/ocr-sprint-service
+
+# Pull latest code
+git pull
+
+# Activate venv
+source .venv/bin/activate
+
+# Update dependencies
+pip install -e ".[ocr]"
+
+# Run migrations
+alembic upgrade head
+
+# Exit user ocr
+exit
+
+# Restart services
+sudo systemctl restart ocr-sprint-api
+sudo systemctl restart ocr-sprint-worker
+
+# Check logs
+sudo journalctl -u ocr-sprint-api -n 50
+```
+
+## Performance Tuning
+
+### Increase Worker Concurrency
+
+```bash
+# Edit worker service
+sudo nano /etc/systemd/system/ocr-sprint-worker.service
+
+# Ubah --concurrency sesuai CPU cores
+# Untuk 8 cores: --concurrency=4
+# Untuk 16 cores: --concurrency=8
+
+# Reload dan restart
+sudo systemctl daemon-reload
+sudo systemctl restart ocr-sprint-worker
+```
+
+### PostgreSQL Tuning
+
+```bash
+sudo nano /etc/postgresql/14/main/postgresql.conf
+```
+
+**Recommended settings untuk 16GB RAM:**
+
+```
+shared_buffers = 4GB
+effective_cache_size = 12GB
+maintenance_work_mem = 1GB
+checkpoint_completion_target = 0.9
+wal_buffers = 16MB
+default_statistics_target = 100
+random_page_cost = 1.1
+effective_io_concurrency = 200
+work_mem = 10MB
+min_wal_size = 1GB
+max_wal_size = 4GB
+max_worker_processes = 4
+max_parallel_workers_per_gather = 2
+max_parallel_workers = 4
+```
+
+```bash
+sudo systemctl restart postgresql
+```
+
+### Redis Tuning
+
+```bash
+sudo nano /etc/redis/redis.conf
+```
+
+**Recommended settings:**
+
+```
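+maxmemory 2gb
+# Catatan: Redis ini juga menjadi broker Celery. Dengan allkeys-lru, job yang
+# masih antre bisa ikut terhapus saat memori penuh; gunakan noeviction jika
+# job tidak boleh hilang.
+maxmemory-policy allkeys-lru
+# Disable RDB snapshots untuk performance (komentar harus di baris sendiri;
+# redis.conf tidak mendukung komentar inline)
+save ""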
+```
+
+```bash
+sudo systemctl restart redis
+```
+
+## Troubleshooting
+
+### Service tidak start
+
+```bash
+# Check logs
+sudo journalctl -u ocr-sprint-api -n 100 --no-pager
+sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
+
+# Check permissions
+ls -la /opt/ocr-sprint-service
+ls -la /opt/ocr-sprint-service/storage
+
+# Test manual run
+sudo su - ocr
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
+```
+
+### Database connection error
+
+```bash
+# Test connection
+sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
+
+# Check PostgreSQL status
+sudo systemctl status postgresql
+
+# Check pg_hba.conf
+sudo cat /etc/postgresql/14/main/pg_hba.conf | grep ocr
+```
+
+### Redis connection error
+
+```bash
+# Test Redis
+redis-cli ping
+
+# Check Redis status
+sudo systemctl status redis
+
+# Check Redis logs
+sudo journalctl -u redis -n 50
+```
+
+### PaddleOCR model download gagal
+
+```bash
+# Download manual
+sudo su - ocr
+cd /opt/ocr-sprint-service
+source .venv/bin/activate
+
+python << EOF
+from paddleocr import PaddleOCR
+ocr = PaddleOCR(use_angle_cls=True, lang='latin')
+print("Models downloaded successfully")
+EOF
+```
+
+### Out of memory
+
+```bash
+# Check memory usage
+free -h
+htop
+
+# Reduce worker concurrency
+sudo nano /etc/systemd/system/ocr-sprint-worker.service
+# Ubah --concurrency=1
+
+# Add swap (jika perlu)
+sudo fallocate -l 4G /swapfile
+sudo chmod 600 /swapfile
+sudo mkswap /swapfile
+sudo swapon /swapfile
+echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
+```
+
+## Security Checklist
+
+- [ ] API keys diganti dengan nilai random yang kuat
+- [ ] Database password diganti dari default
+- [ ] Firewall enabled (UFW) - hanya port 22, 80, 443 terbuka
+- [ ] SSL/TLS enabled via Let's Encrypt
+- [ ] `/metrics` endpoint restricted ke internal network
+- [ ] Nginx rate limiting configured
+- [ ] PostgreSQL hanya listen di localhost
+- [ ] Redis hanya listen di localhost
+- [ ] Regular backup configured (cron job)
+- [ ] Log rotation configured
+- [ ] OS security updates enabled (`unattended-upgrades`)
+- [ ] Fail2ban installed untuk SSH protection
+
+## Monitoring dengan Prometheus (Opsional)
+
+### Install Prometheus
+
+```bash
+# Download Prometheus
+cd /tmp
+wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
+tar xvfz prometheus-*.tar.gz
+sudo mv prometheus-2.45.0.linux-amd64 /opt/prometheus
+
+# Create user
+sudo useradd --no-create-home --shell /bin/false prometheus
+
+# Create directories
+sudo mkdir /etc/prometheus /var/lib/prometheus
+sudo chown prometheus:prometheus /var/lib/prometheus
+```
+
+### Configure Prometheus
+
+```bash
+sudo nano /etc/prometheus/prometheus.yml
+```
+
+**Content:**
+
+```yaml
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  - job_name: 'ocr-sprint'
+    static_configs:
+      - targets: ['localhost:8000']
+    metrics_path: '/metrics'
+```
+
+### Create Systemd Service
+
+```bash
+sudo nano /etc/systemd/system/prometheus.service
+```
+
+**Content:**
+
+```ini
+[Unit]
+Description=Prometheus
+After=network.target
+
+[Service]
+User=prometheus
+Group=prometheus
+Type=simple
+ExecStart=/opt/prometheus/prometheus \
+    --config.file=/etc/prometheus/prometheus.yml \
+    --storage.tsdb.path=/var/lib/prometheus/
+
+[Install]
+WantedBy=multi-user.target
+```
+
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable prometheus
+sudo systemctl start prometheus
+```
+
+Access Prometheus di `http://localhost:9090`
+
+## Support
+
+Untuk pertanyaan atau issues, hubungi tim development.
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -0,0 +1,437 @@
+# Quickstart Deployment OCR Sprint Service
+
+Panduan deployment OCR Sprint Service ke server production untuk pemrosesan dokumen surat sprint Polri.
+
+## Prasyarat Server
+
+### Spesifikasi Minimum
+- **OS**: Linux (Ubuntu 20.04+ / Debian 11+ / RHEL 8+)
+- **CPU**: 4 cores (8 cores recommended untuk throughput tinggi)
+- **RAM**: 8 GB minimum (16 GB recommended)
+- **Storage**: 50 GB free space
+  - ~3 GB untuk model PaddleOCR
+  - ~1.5 GB untuk dependencies Python
+  - Sisanya untuk blob storage dokumen
+- **Network**: Port 8000 terbuka untuk API access
+
+### Software Requirements
+- Docker 24.0+ dan Docker Compose v2
+- Git
+- (Opsional) Nginx/Caddy untuk reverse proxy + SSL
+
+## Deployment dengan Docker Compose (Recommended)
+
+### 1. Clone Repository
+
+```bash
+# Login ke server sebagai user non-root dengan sudo access
+ssh user@your-server.com
+
+# Clone repository
+git clone https://github.com/Adriankf59/ocr-sprint-service.git
+cd ocr-sprint-service
+```
+
+### 2. Konfigurasi Environment
+
+```bash
+# Copy template environment
+cp .env.example .env
+
+# Edit konfigurasi production
+nano .env
+```
+
+**Konfigurasi penting untuk production:**
+
+```bash
+# ==== App ====
+APP_ENV=prod
+APP_LOG_LEVEL=INFO
+
+# ==== Storage ====
+STORAGE_LOCAL_DIR=/app/storage
+BLOB_STORAGE_DIR=/app/storage/blobs
+BLOB_MAX_UPLOAD_MB=25
+
+# ==== OCR ====
+OCR_LANG=latin
+OCR_USE_GPU=false              # set true jika server punya GPU NVIDIA
+OCR_MAX_IMAGE_SIDE=2200
+
+# ==== Preprocessing ====
+PREPROCESS_TARGET_DPI=300
+PREPROCESS_DENOISE=true
+PREPROCESS_DESKEW=true
+PREPROCESS_DETECT_DOCUMENT=true
+PREPROCESS_REMOVE_SHADOW=true
+
+# ==== Table Extraction ====
+TABLES_ENABLED=true
+
+# ==== Async Pipeline ====
+QUEUE_ENABLED=true
+REDIS_URL=redis://redis:6379/0
+CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
+
+# ==== Database ====
+DATABASE_URL=postgresql+psycopg://ocr:ocr@postgres:5432/ocr_sprint
+DATABASE_ECHO=false
+
+# ==== Auth (WAJIB untuk production!) ====
+API_KEYS=your-secret-key-1,your-secret-key-2
+API_KEY_HEADER=X-API-Key
+```
+
+**Generate API keys yang aman:**
+
+```bash
+# Generate random API key
+openssl rand -hex 32
+```
+
+### 3. Build dan Start Services
+
+```bash
+# Build Docker images
+docker compose build
+
+# Start semua services (API, Worker, Redis, Postgres)
+docker compose up -d
+
+# Cek logs untuk memastikan semua berjalan
+docker compose logs -f api worker
+```
+
+**Services yang berjalan:**
+- `api`: FastAPI server di port 8000
+- `worker`: Celery worker untuk async processing
+- `redis`: Message broker untuk job queue
+- `postgres`: Database untuk job state
+
+### 4. Verifikasi Deployment
+
+```bash
+# Health check
+curl http://localhost:8000/api/v1/health
+
+# Expected response:
+# {"status":"ok","version":"0.1.0"}
+
+# Test OCR endpoint (sync mode untuk testing)
+curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
+  -H "X-API-Key: your-secret-key-1" \
+  -F "file=@samples/pdf/example.pdf" \
+  | jq
+```
+
+### 5. Setup Reverse Proxy (Nginx)
+
+**Install Nginx:**
+
+```bash
+sudo apt update
+sudo apt install nginx certbot python3-certbot-nginx
+```
+
+**Konfigurasi Nginx (`/etc/nginx/sites-available/ocr-sprint`):**
+
+```nginx
+upstream ocr_api {
+    server localhost:8000;
+}
+
+server {
+    listen 80;
+    server_name ocr.yourdomain.com;
+
+    client_max_body_size 30M;  # Sesuaikan dengan BLOB_MAX_UPLOAD_MB
+
+    location / {
+        proxy_pass http://ocr_api;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        
+        # Timeout untuk dokumen besar
+        proxy_read_timeout 300s;
+        proxy_connect_timeout 75s;
+    }
+
+    location /metrics {
+        # Restrict metrics endpoint
+        allow 10.0.0.0/8;  # Internal network only
+        deny all;
+        proxy_pass http://ocr_api;
+    }
+}
+```
+
+**Enable site dan setup SSL:**
+
+```bash
+# Enable site
+sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
+sudo nginx -t
+sudo systemctl reload nginx
+
+# Setup SSL dengan Let's Encrypt
+sudo certbot --nginx -d ocr.yourdomain.com
+```
+
+## Deployment Manual (Tanpa Docker)
+
+### 1. Install System Dependencies
+
+```bash
+# Ubuntu/Debian
+sudo apt update
+sudo apt install -y \
+    python3.11 python3.11-venv python3-pip \
+    libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
+    libgomp1 libmagic1 \
+    redis-server postgresql-14
+
+# Start services
+sudo systemctl enable --now redis-server postgresql
+```
+
+### 2. Setup Database
+
+```bash
+# Create database dan user
+sudo -u postgres psql <<'EOF'
+CREATE USER ocr WITH PASSWORD 'your-secure-password';
+CREATE DATABASE ocr_sprint OWNER ocr;
+GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
+EOF
+```
+
+### 3. Install Application
+
+```bash
+# Clone repository
+git clone https://github.com/Adriankf59/ocr-sprint-service.git
+cd ocr-sprint-service
+
+# Create virtual environment
+python3.11 -m venv .venv
+source .venv/bin/activate
+
+# Install dependencies
+pip install --upgrade pip
+pip install -e ".[ocr]"
+
+# Copy dan edit .env
+cp .env.example .env
+nano .env
+```
+
+**Update DATABASE_URL di .env:**
+
+```bash
+DATABASE_URL=postgresql+psycopg://ocr:your-secure-password@localhost:5432/ocr_sprint
+REDIS_URL=redis://localhost:6379/0
+QUEUE_ENABLED=true
+```
+
+### 4. Run Database Migrations
+
+```bash
+alembic upgrade head
+```
+
+### 5. Setup Systemd Services
+
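+> **Catatan:** kedua unit di bawah mengasumsikan aplikasi terpasang di `/opt/ocr-sprint-service` dan dijalankan oleh user `ocr`. Jika repository di-clone ke lokasi lain atau user `ocr` belum dibuat, sesuaikan `User`, `WorkingDirectory`, `Environment`, dan `ExecStart` terlebih dahulu.
+
+**API Service (`/etc/systemd/system/ocr-sprint-api.service`):**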
+
+```ini
+[Unit]
+Description=OCR Sprint API
+After=network.target postgresql.service redis.service
+
+[Service]
+Type=simple
+User=ocr
+WorkingDirectory=/opt/ocr-sprint-service
+Environment="PATH=/opt/ocr-sprint-service/.venv/bin"
+ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 --workers 4
+Restart=always
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+```
+
+**Worker Service (`/etc/systemd/system/ocr-sprint-worker.service`):**
+
+```ini
+[Unit]
+Description=OCR Sprint Celery Worker
+After=network.target postgresql.service redis.service
+
+[Service]
+Type=simple
+User=ocr
+WorkingDirectory=/opt/ocr-sprint-service
+Environment="PATH=/opt/ocr-sprint-service/.venv/bin"
+ExecStart=/opt/ocr-sprint-service/.venv/bin/celery -A ocr_sprint.worker.celery_app worker -l info --concurrency=2
+Restart=always
+RestartSec=10
+
+[Install]
+WantedBy=multi-user.target
+```
+
+**Enable dan start services:**
+
+```bash
+sudo systemctl daemon-reload
+sudo systemctl enable --now ocr-sprint-api ocr-sprint-worker
+sudo systemctl status ocr-sprint-api ocr-sprint-worker
+```
+
+## Monitoring dan Maintenance
+
+### Monitoring Logs
+
+```bash
+# Docker deployment
+docker compose logs -f api worker
+
+# Manual deployment
+sudo journalctl -u ocr-sprint-api -f
+sudo journalctl -u ocr-sprint-worker -f
+```
+
+### Prometheus Metrics
+
+Metrics tersedia di endpoint `/metrics`:
+
+```bash
+curl http://localhost:8000/metrics
+```
+
+**Key metrics:**
+- `ocr_documents_total`: Total dokumen diproses
+- `ocr_processing_duration_seconds`: Durasi processing
+- `ocr_confidence_score`: Distribusi confidence score
+- `celery_task_*`: Celery worker metrics
+
+### Backup Database
+
+```bash
+# Docker deployment
+docker compose exec postgres pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
+
+# Manual deployment
+pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
+```
+
+### Update Service
+
+```bash
+# Docker deployment
+cd ocr-sprint-service
+git pull
+docker compose build
+docker compose up -d
+
+# Manual deployment
+cd ocr-sprint-service
+git pull
+source .venv/bin/activate
+pip install -e ".[ocr]"
+alembic upgrade head
+sudo systemctl restart ocr-sprint-api ocr-sprint-worker
+```
+
+## Troubleshooting
+
+### Service tidak start
+
+```bash
+# Cek logs
+docker compose logs api worker
+
+# Cek health check
+curl http://localhost:8000/api/v1/health
+```
+
+### PaddleOCR model download gagal
+
+```bash
+# Download manual ke volume
+docker compose exec api python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='latin')"
+```
+
+### Worker tidak memproses jobs
+
+```bash
+# Cek Redis connection
+docker compose exec worker redis-cli -h redis ping
+
+# Cek Celery worker status
+docker compose exec worker celery -A ocr_sprint.worker.celery_app inspect active
+```
+
+### Database migration error
+
+```bash
+# Cek current revision
+docker compose exec api alembic current
+
+# Force upgrade
+docker compose exec api alembic upgrade head
+```
+
+### Out of memory
+
+```bash
+# Kurangi worker concurrency di docker-compose.yml
+# Ubah: --concurrency=1 (default) atau tambahkan memory limit
+```
+
+## Security Checklist
+
+- [ ] API_KEYS diset dengan nilai random yang kuat
+- [ ] Firewall configured (hanya port 80/443 terbuka)
+- [ ] SSL/TLS enabled via Nginx + Let's Encrypt
+- [ ] Database password diganti dari default
+- [ ] `/metrics` endpoint restricted ke internal network
+- [ ] Regular backup database dan blob storage
+- [ ] Log rotation configured
+- [ ] OS security updates enabled
+
+## Performance Tuning
+
+### Untuk throughput tinggi:
+
+1. **Increase worker concurrency:**
+   ```yaml
+   # docker-compose.yml
+   command: ["celery", "-A", "ocr_sprint.worker.celery_app", "worker", "-l", "info", "--concurrency=4"]
+   ```
+
+2. **Scale workers horizontally:**
+   ```bash
+   docker compose up -d --scale worker=3
+   ```
+
+3. **Enable GPU (jika tersedia):**
+   ```bash
+   # .env
+   OCR_USE_GPU=true
+   ```
+
+4. **Tune Postgres:**
+   ```sql
+   -- Increase connection pool
+   -- (kedua parameter ini baru berlaku setelah PostgreSQL di-restart)
+   ALTER SYSTEM SET max_connections = 200;
+   ALTER SYSTEM SET shared_buffers = '2GB';
+   ```
+
+## Support
+
+Untuk pertanyaan atau issues, hubungi tim development atau buat issue di repository.
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -86,14 +86,18 @@ def _row_to_response(row: object) -> DocumentResponse:

    assert isinstance(row, JobRow)
    status_enum = DocumentStatus(row.status)
-    result_obj: ExtractionResult | None = None
+    personel_list = None
    if row.result is not None:
        result_obj = ExtractionResult.model_validate(row.result)
+        # Auto-number personnel entries sequentially (1, 2, 3, ...)
+        for idx, entry in enumerate(result_obj.personel, start=1):
+            entry.no = idx
+        personel_list = result_obj.personel
    return DocumentResponse(
        job_id=row.job_id,
        status=status_enum,
        confidence=row.confidence,
-        data=result_obj,
+        data=personel_list,
        review_flags=list(row.review_flags or []),
        error=row.error,
        approved=bool(row.approved),
--- a/src/ocr_sprint/data/master_pangkat.py
+++ b/src/ocr_sprint/data/master_pangkat.py
@@ -33,12 +33,45 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
    # Perwira Menengah
    "KOMPOL": ("KOMPOL",),
    "AKBP": ("AKBP",),
-    "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
+    "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP", "KOMBES"),
    # Perwira Tinggi
    "BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
    "IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
    "KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
    "JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
+    # PNS Polri (Pegawai Negeri Sipil di lingkungan Polri). PNS appear
+    # routinely on sprint panitia / undangan templates alongside Polri
+    # personnel, so we treat them as valid ranks for extraction.
+    # Sources: PP 11/2017 jo PP 17/2020 (Manajemen PNS); golongan I-IV.
+    # Golongan I (Juru)
+    "JURU MUDA": ("JURU MUDA",),
+    "JURU MUDA TK I": ("JURU MUDA TK I", "JURU MUDA TK.I", "JURU MUDA TINGKAT I"),
+    "JURU": ("JURU",),
+    "JURU TK I": ("JURU TK I", "JURU TK.I", "JURU TINGKAT I"),
+    # Golongan II (Pengatur)
+    "PENGATUR MUDA": ("PENGATUR MUDA",),
+    "PENGATUR MUDA TK I": (
+        "PENGATUR MUDA TK I",
+        "PENGATUR MUDA TK.I",
+        "PENGATUR MUDA TINGKAT I",
+    ),
+    "PENGATUR": ("PENGATUR",),
+    "PENGATUR TK I": ("PENGATUR TK I", "PENGATUR TK.I", "PENGATUR TINGKAT I"),
+    # Golongan III (Penata)
+    "PENATA MUDA": ("PENATA MUDA",),
+    "PENATA MUDA TK I": (
+        "PENATA MUDA TK I",
+        "PENATA MUDA TK.I",
+        "PENATA MUDA TINGKAT I",
+    ),
+    "PENATA": ("PENATA",),
+    "PENATA TK I": ("PENATA TK I", "PENATA TK.I", "PENATA TINGKAT I"),
+    # Golongan IV (Pembina)
+    "PEMBINA": ("PEMBINA",),
+    "PEMBINA TK I": ("PEMBINA TK I", "PEMBINA TK.I", "PEMBINA TINGKAT I"),
+    "PEMBINA UTAMA MUDA": ("PEMBINA UTAMA MUDA",),
+    "PEMBINA UTAMA MADYA": ("PEMBINA UTAMA MADYA",),
+    "PEMBINA UTAMA": ("PEMBINA UTAMA",),
 }

 # Reverse lookup: any variant (uppercased) → canonical form.
--- a/src/ocr_sprint/pipeline/extract/personnel.py
+++ b/src/ocr_sprint/pipeline/extract/personnel.py
@@ -64,6 +64,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
    "jabatan dinas": "jabatan_dinas",
    "jabatan dalam dinas": "jabatan_dinas",
    "jbt dinas": "jabatan_dinas",
+    "struktural": "jabatan_dinas",
+    "jabatan struktural": "jabatan_dinas",
    # jabatan dalam sprint (role for this dispatch)
    "jabatan dalam sprint": "jabatan_sprint",
    "jabatan dalam sprin": "jabatan_sprint",
@@ -72,6 +74,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
    "jabatan sprin": "jabatan_sprint",
    "tugas": "jabatan_sprint",
    "penugasan": "jabatan_sprint",
+    "dalam penugasan": "jabatan_sprint",
+    "jabatan dalam penugasan": "jabatan_sprint",
    # remarks
    "keterangan": "keterangan",
    "ket": "keterangan",
--- a/src/ocr_sprint/pipeline/extract/personnel_text.py
+++ b/src/ocr_sprint/pipeline/extract/personnel_text.py
@@ -38,12 +38,18 @@ _RANK_TOKENS: tuple[str, ...] = tuple(
    )
 )
 _RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
-# A line that contains a rank token followed (anywhere on the same line) by
-# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or
-# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match
-# inside e.g. "ABRIPDA-style" text.
+# A rank token followed (within a few characters) by an 8-digit NRP.
+# We allow common separators: '/', '-', '.', ',', ':' or whitespace.
+# The trailing ``\b`` plus proximity to the 8-digit NRP is the
+# specificity signal — we deliberately do *not* require a leading
+# ``\b`` because real Polri sprint OCR routinely mashes the rank into
+# the trailing characters of the previous cell (observed on Polres
+# Banjar: "...CPHR., CBA, CI" runs straight into "AKP" giving
+# "CIAKP 84011113"). Requiring a leading boundary loses that row
+# entirely. The longest-first alternation order ensures multi-token
+# ranks ("KOMBES POL") still win over short overlaps ("KBP").
 _RE_RANK_NRP_LINE = re.compile(
-    rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
+    rf"(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
 )
 # A bare row number marker like "1." or "12)". OCR often puts it on its own
@@ -143,31 +149,248 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:

    Strategy:

+    **Pass 1** — same-line rank+NRP (original strategy):
    1. Iterate every line. Skip lines that don't contain both a known rank
       and an 8-digit NRP (those are the only signal we trust).
    2. For each rank+NRP line, look back for the most recent plausible name
       line, and forward 1-3 lines for jabatan content.
    3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp.

+    **Pass 2** — separate-line rank and NRP (for tabular sprint formats):
+    If pass 1 produces no results, scan for lines containing a standalone
+    rank token, then look up to 2 lines forward for a standalone NRP.
+    This handles sprint formats where OCR renders each column on its own
+    line (e.g. Polres Banjar layout).
+
+    **Pass 3** — rank-only (for sprint formats *without* an NRP column):
+    Some sprint templates (panitia, undangan, etc.) list only nama +
+    pangkat + jabatan, no NRP. If pass 1 and pass 2 both yield nothing,
+    fall back to a rank-only scan: every standalone rank line (or
+    two-line rank like "KOMBES" + "POL" produced by narrow-column OCR)
+    becomes a row, with name assembled from preceding lines and jabatan
+    from following lines. ``nrp`` stays ``None``. False-positive risk
+    is higher (stray rank tokens in body text), so this only fires when
+    nothing else matched.
+
    The fallback is intentionally rate-limited: the first matching rank
    token on a line wins (no greedy multi-match per line), and a name line
    can only be consumed once (so a stray ranked text inside a paragraph
    doesn't turn into multiple bogus entries).
    """
    lines = raw_text.splitlines()
+
+    # ── Pass 1: rank+NRP on the same line ────────────────────────────
+    rows = _extract_same_line(lines)
+    if rows:
+        return rows
+
+    # ── Pass 2: rank and NRP on separate lines ───────────────────────
+    rows = _extract_separate_lines(lines)
+    if rows:
+        return rows
+
+    # ── Pass 3: rank-only (no NRP column) ────────────────────────────
+    return _extract_rank_only(lines)
+
+
+# Regex for a line that is *only* a rank token (possibly with punctuation).
+_RE_RANK_ONLY = re.compile(
+    rf"^\s*(?P<rank>{_RANK_ALT})\s*[/.\-,:]*\s*$",
+    re.IGNORECASE,
+)
+# Regex for a line that contains a standalone 8-digit NRP.
+_RE_NRP_ONLY = re.compile(r"(?<!\d)(?P<nrp>\d{8})(?!\d)")
+
+
+# Strip a leading row number marker like "1 ", "1.", "12)" from a name
+# prefix taken from the same OCR line as a rank+NRP match. Unlike
+# _RE_ROW_NUMBER (which matches a *whole* line), this is a prefix strip
+# for embedded same-line cases like "1 CUCU JUHANA, A.K.S. KOMPOL ...".
+_RE_LEADING_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s+")
+
+
+def _extract_same_line(lines: list[str]) -> list[PersonnelEntry]:
+    """Pass 1: rank+NRP pairs found anywhere in the joined text.
+
+    Uses ``finditer`` over the full ``\\n``-joined OCR text rather than
+    ``re.search`` per line so that multiple rank+NRP pairs on the same
+    OCR line still produce separate rows. This is required for sprint
+    scans where Paddle merges several table rows into one OCR line
+    (observed on Polres Banjar where row 2's "...CBA.AKP 77020049 KASAT
+    RESKRIM" was being swallowed into row 1's jabatan because per-line
+    ``search`` only returns the first match).
+
+    For each match we resolve nama from text *before* the match (the
+    same-line prefix takes precedence; otherwise look back through the
+    preceding lines bounded by the previous match) and jabatan from text
+    *after* the match (same-line suffix plus up to ~3 follow-up lines,
+    bounded by the next match).
+    """
+    if not lines:
+        return []
+    full_text = "\n".join(lines)
+
+    line_starts: list[int] = []
+    pos = 0
+    for line in lines:
+        line_starts.append(pos)
+        pos += len(line) + 1  # +1 for the joining "\n"
+
+    def offset_to_line(offset: int) -> int:
+        lo, hi = 0, len(line_starts)
+        while lo < hi:
+            mid = (lo + hi) // 2
+            if line_starts[mid] <= offset:
+                lo = mid + 1
+            else:
+                hi = mid
+        return max(0, lo - 1)
+
+    matches = list(_RE_RANK_NRP_LINE.finditer(full_text))
+    rows: list[PersonnelEntry] = []
+    consumed_lines: set[int] = set()
+
+    for i, m in enumerate(matches):
+        pangkat = normalize_pangkat(m.group("rank"))
+        if not pangkat or not is_valid_pangkat(pangkat):
+            continue
+        nrp = m.group("nrp")
+        ml = offset_to_line(m.start())
+        prev_ml = (
+            offset_to_line(matches[i - 1].start()) if i > 0 else -1
+        )
+        next_ml = (
+            offset_to_line(matches[i + 1].start())
+            if i + 1 < len(matches)
+            else len(lines)
+        )
+
+        line_text = lines[ml]
+        line_off = line_starts[ml]
+
+        # Same-line prefix: text on this line *before* the rank token.
+        # If the previous match was on this same line, only consider the
+        # text after that previous match's NRP (otherwise we'd reuse the
+        # earlier row's tail as this row's name).
+        prefix_start_local = 0
+        if prev_ml == ml and i > 0:
+            prefix_start_local = max(0, matches[i - 1].end() - line_off)
+        prefix = line_text[prefix_start_local : m.start() - line_off]
+
+        # Same-line suffix: text on this line *after* the NRP, capped at
+        # the next match's start if it's on this same line.
+        suffix_end_local = len(line_text)
+        if next_ml == ml and i + 1 < len(matches):
+            suffix_end_local = matches[i + 1].start() - line_off
+        suffix = line_text[m.end() - line_off : suffix_end_local]
+
+        # ── Resolve nama ────────────────────────────────────────────
+        nama: str | None = None
+        prefix_clean = _RE_LEADING_ROW_NUMBER.sub("", prefix).strip()
+        if prefix_clean and _is_plausible_name(prefix_clean):
+            nama = prefix_clean
+        elif prev_ml < ml:
+            for back in range(ml - 1, prev_ml, -1):
+                if back in consumed_lines or back < 0:
+                    continue
+                candidate = lines[back].strip()
+                if _is_plausible_name(candidate):
+                    nama = candidate
+                    consumed_lines.add(back)
+                    break
+
+        # ── Resolve jabatan ─────────────────────────────────────────
+        jabatan_parts: list[str] = []
+        suffix_clean = suffix.strip()
+        if suffix_clean:
+            jabatan_parts.append(suffix_clean)
+        if next_ml > ml:
+            max_fwd = min(ml + 4, next_ml, len(lines))
+            for fwd in range(ml + 1, max_fwd):
+                candidate = lines[fwd].strip()
+                if not candidate:
+                    if jabatan_parts:
+                        break
+                    continue
+                if _RE_NAME_BLOCKLIST.match(candidate):
+                    break
+                if _RE_ROW_NUMBER.match(candidate):
+                    break
+                jabatan_parts.append(candidate)
+        jabatan = (
+            " ".join(" ".join(jabatan_parts).split())
+            if jabatan_parts
+            else None
+        )
+
+        rows.append(
+            PersonnelEntry(
+                no=None,
+                pangkat=pangkat,
+                nrp=nrp,
+                nama=nama,
+                jabatan_dinas=jabatan,
+                jabatan_sprint=None,
+                keterangan=None,
+            )
+        )
+    return rows
+
+
+def _extract_separate_lines(lines: list[str]) -> list[PersonnelEntry]:
+    """Pass 2: rank and NRP on separate nearby lines.
+
+    Handles tabular sprint formats where OCR outputs each column as its
+    own line, e.g.:
+        1
+        CUCU JUHANA, A.K.S.
+        KOMPOL
+        70100418
+        KABAGOPS
+    """
    consumed_names: set[int] = set()
+    consumed_nrps: set[int] = set()
    rows: list[PersonnelEntry] = []

    for idx, raw_line in enumerate(lines):
        line = raw_line.strip()
-        match = _RE_RANK_NRP_LINE.search(line)
-        if not match:
+        rank_match = _RE_RANK_ONLY.match(line)
+        if not rank_match:
+            # Also try: line starts with a rank token (may have trailing text)
+            for tok in _RANK_TOKENS:
+                if line.upper().startswith(tok) and len(line) - len(tok) < 5:
+                    rank_match = re.match(
+                        rf"^\s*(?P<rank>{re.escape(tok)})\s*[/.\-,:]*",
+                        line,
+                        re.IGNORECASE,
+                    )
+                    if rank_match:
+                        break
+        if not rank_match:
            continue
-        pangkat = normalize_pangkat(match.group("rank"))
+
+        pangkat = normalize_pangkat(rank_match.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue
-        nrp = match.group("nrp")

+        # Look forward up to 2 lines for NRP
+        nrp: str | None = None
+        nrp_idx: int | None = None
+        for fwd in range(idx + 1, min(idx + 3, len(lines))):
+            if fwd in consumed_nrps:
+                continue
+            nrp_match = _RE_NRP_ONLY.search(lines[fwd].strip())
+            if nrp_match:
+                nrp = nrp_match.group("nrp")
+                nrp_idx = fwd
+                break
+
+        if not nrp:
+            continue
+        assert nrp_idx is not None
+        consumed_nrps.add(nrp_idx)
+
+        # Look back for name
        nama: str | None = None
        for back in range(idx - 1, max(idx - 6, -1), -1):
            if back in consumed_names:
@@ -178,7 +401,8 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
                consumed_names.add(back)
                break

-        jabatan = _following_jabatan(lines, idx)
+        # Look forward after NRP for jabatan
+        jabatan = _following_jabatan(lines, nrp_idx)
        rows.append(
            PersonnelEntry(
                no=None,
@@ -193,6 +417,370 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    return rows


+# Row-number-only line in NRP-less sprint tables: 1-3 digits, then an
+# optional ".", ")" or ":" (narrow-column OCR often drops the dot).
+_RE_BARE_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s*$")
+
+
+def _try_match_rank_at(lines: list[str], idx: int) -> tuple[str, int] | None:
+    """Match a standalone rank whose first line is ``lines[idx]``.
+
+    On success the result is ``(rank_text, lines_consumed)``; ``None`` is
+    returned when ``idx`` is past the end or nothing matches.
+
+    Narrow-column OCR can break a multi-token rank over two lines
+    (``"KOMBES"`` / ``"POL"``, ``"PENATA"`` / ``"TK I"``), so the pair
+    ``lines[idx]`` + ``lines[idx + 1]`` is tested before the single line.
+    Giving the pair priority lets "PENATA TK I" beat its prefix "PENATA";
+    otherwise "TK I" would leak into the jabatan column.
+    """
+    if idx >= len(lines):
+        return None
+    first = lines[idx].strip()
+    candidates = [(first, 1)]
+    if idx + 1 < len(lines):
+        pair = f"{first} {lines[idx + 1].strip()}".strip()
+        candidates.insert(0, (pair, 2))
+    for text, span in candidates:
+        hit = _RE_RANK_ONLY.match(text)
+        if hit:
+            return hit.group("rank"), span
+    return None
+
+
+def _extract_rank_only(lines: list[str]) -> list[PersonnelEntry]:
+    """Pass 3: rank-only fallback for sprint formats without an NRP column.
+
+    Every standalone rank (one line, or two lines joined) that normalises
+    to a valid pangkat becomes the pivot of one row. ``nama`` is built
+    from contiguous plausible-name lines among the 5 lines before the
+    rank (OCR splits long names over 2-3 short lines in narrow columns);
+    up to 8 following lines, ending at the next rank, row number,
+    blocklisted line or trailing blank, are joined into ``jabatan_dinas``.
+    ``nrp``, ``no``, ``jabatan_sprint`` and ``keterangan`` are always ``None``.
+    """
+    rows: list[PersonnelEntry] = []
+    consumed_lines: set[int] = set()
+    i = 0
+    while i < len(lines):
+        match = _try_match_rank_at(lines, i)
+        if not match:
+            i += 1
+            continue
+        rank_text, rank_span = match
+        pangkat = normalize_pangkat(rank_text)
+        if not pangkat or not is_valid_pangkat(pangkat):
+            i += 1  # rank pattern matched but it is not a valid pangkat
+            continue
+
+        # ── Look back for name lines (scan at most 5 preceding lines) ─────
+        name_lines: list[str] = []
+        for back in range(i - 1, max(i - 6, -1), -1):
+            if back in consumed_lines:  # already used as an earlier row's name
+                break
+            candidate = lines[back].strip()
+            if not candidate:
+                if name_lines:  # blank above collected fragments ends the name
+                    break
+                continue  # blank between the rank and the name: keep looking
+            if _RE_BARE_ROW_NUMBER.match(candidate):
+                break
+            if _RE_NAME_BLOCKLIST.match(candidate):
+                break
+            if _try_match_rank_at(lines, back) is not None:
+                break
+            if not _is_plausible_name(candidate):
+                break
+            name_lines.insert(0, candidate)  # walking upward, so prepend
+            consumed_lines.add(back)
+        nama = " ".join(" ".join(name_lines).split()) if name_lines else None
+
+        # ── Look forward for jabatan (stop at next rank / row marker) ─────
+        jabatan_parts: list[str] = []
+        fwd = i + rank_span
+        steps = 0
+        while fwd < len(lines) and steps < 8:
+            candidate = lines[fwd].strip()
+            if not candidate:
+                if jabatan_parts:  # blank after collected text ends the jabatan
+                    break
+                fwd += 1
+                steps += 1
+                continue
+            if _RE_BARE_ROW_NUMBER.match(candidate):
+                break
+            if _try_match_rank_at(lines, fwd) is not None:
+                break
+            if _RE_NAME_BLOCKLIST.match(candidate):
+                break
+            jabatan_parts.append(candidate)
+            fwd += 1
+            steps += 1
+        jabatan = " ".join(" ".join(jabatan_parts).split()) if jabatan_parts else None
+
+        rows.append(
+            PersonnelEntry(
+                no=None,
+                pangkat=pangkat,
+                nrp=None,
+                nama=nama,
+                jabatan_dinas=jabatan,
+                jabatan_sprint=None,
+                keterangan=None,
+            )
+        )
+        i += rank_span  # skip the line(s) the rank itself occupied
+    return rows
+
+
+# ── Column-aware Pass 3 (uses OCR bounding boxes) ───────────────────────
+
+
+def _box_x_left(box: tuple[tuple[float, float], ...]) -> float:
+    return min(x for x, _ in box)  # smallest corner x
+
+
+def _box_x_right(box: tuple[tuple[float, float], ...]) -> float:
+    return max(x for x, _ in box)  # largest corner x
+
+
+def _box_x_center(box: tuple[tuple[float, float], ...]) -> float:
+    return (min(x for x, _ in box) + max(x for x, _ in box)) / 2  # midpoint of the x extent
+
+
+def _box_y_top(box: tuple[tuple[float, float], ...]) -> float:
+    return min(y for _, y in box)  # smallest corner y
+
+
+def _box_y_bottom(box: tuple[tuple[float, float], ...]) -> float:
+    return max(y for _, y in box)  # largest corner y
+
+
+def _box_y_center(box: tuple[tuple[float, float], ...]) -> float:
+    return (min(y for _, y in box) + max(y for _, y in box)) / 2  # midpoint of the y extent
+
+
+def _box_height(box: tuple[tuple[float, float], ...]) -> float:
+    return max(y for _, y in box) - min(y for _, y in box)  # y extent of the box
+
+
+def extract_personnel_from_ocr_lines(ocr_lines: list) -> list[PersonnelEntry]:
+    """Column-aware Pass 3 for sprint formats without an NRP column.
+
+    Each ``ocr_line`` must expose ``text`` (str) and ``box`` (a tuple of
+    4 ``(x, y)`` corner points). We use the geometry to:
+
+    1. Detect rank lines (single-line or vertically-stacked two-line).
+    2. Estimate the PANGKAT column X-center from those rank lines.
+    3. For each rank, take **only** lines left of PANGKAT in the row's
+       Y span as name fragments and **only** lines right of it as
+       jabatan (no column-bleed). Spans are half-open, so a line on a
+       row divider belongs to the next row (larger y) only.
+
+    Returns one ``PersonnelEntry`` per rank anchor (``nrp`` is always
+    ``None``), or ``[]`` when no rank line is detected so the caller
+    can fall back to the text-only Pass 3.
+    """
+    if not ocr_lines:
+        return []
+
+    # Sort by (y_top, x_left) for vertical-stacking rank detection.
+    indexed = sorted(
+        range(len(ocr_lines)),
+        key=lambda i: (_box_y_top(ocr_lines[i].box), _box_x_left(ocr_lines[i].box)),
+    )
+
+    # Pass 1: find rank anchors.
+    # An anchor is one or two stacked OCR lines whose combined text matches
+    # _RE_RANK_ONLY and normalises to a known pangkat. Two-line stacks must
+    # X-overlap so we don't accidentally merge cells from different columns.
+    used: set[int] = set()
+    anchors: list[dict] = []
+    for pos, idx in enumerate(indexed):
+        if idx in used:
+            continue
+        ln = ocr_lines[idx]
+        text = ln.text.strip()
+
+        rank_text: str | None = None
+        member_idxs: list[int] = [idx]
+
+        # Try two-line stack first (so PENATA TK I beats PENATA).
+        for j_pos in range(pos + 1, min(pos + 5, len(indexed))):
+            j_idx = indexed[j_pos]
+            if j_idx in used:
+                continue
+            other = ocr_lines[j_idx]
+            x_overlap = (
+                min(_box_x_right(ln.box), _box_x_right(other.box))
+                - max(_box_x_left(ln.box), _box_x_left(other.box))
+            )
+            if x_overlap <= 0:
+                continue
+            y_gap = _box_y_top(other.box) - _box_y_bottom(ln.box)
+            if y_gap > _box_height(ln.box) * 1.5:
+                break
+            combined = (text + " " + other.text.strip()).strip()
+            m2 = _RE_RANK_ONLY.match(combined)
+            if m2:
+                rank_text = m2.group("rank")
+                member_idxs.append(j_idx)
+                break
+
+        if rank_text is None:
+            m1 = _RE_RANK_ONLY.match(text)
+            if m1:
+                rank_text = m1.group("rank")
+
+        if rank_text is None:
+            continue
+        pangkat = normalize_pangkat(rank_text)
+        if not pangkat or not is_valid_pangkat(pangkat):
+            continue
+
+        anchors.append(
+            {
+                "member_idxs": member_idxs,
+                "pangkat": pangkat,
+                "x_center": _box_x_center(ln.box),
+                "y_top": min(_box_y_top(ocr_lines[m].box) for m in member_idxs),
+                "y_bottom": max(_box_y_bottom(ocr_lines[m].box) for m in member_idxs),
+            }
+        )
+        used.update(member_idxs)
+
+    if not anchors:
+        return []
+
+    # Sort anchors by Y so we can compute row spans.
+    anchors.sort(key=lambda a: a["y_top"])
+
+    # Estimate PANGKAT column X-center as the median of rank anchor X-centers.
+    xs_sorted = sorted(a["x_center"] for a in anchors)
+    pangkat_x = xs_sorted[len(xs_sorted) // 2]
+
+    # X tolerance: half the median rank-line width. Lines with x_center
+    # within ±tolerance of pangkat_x are *in* the PANGKAT column and
+    # excluded from both NAMA and JABATAN buckets.
+    rank_widths = [
+        _box_x_right(ocr_lines[a["member_idxs"][0]].box)
+        - _box_x_left(ocr_lines[a["member_idxs"][0]].box)
+        for a in anchors
+    ]
+    rank_widths.sort()
+    median_rank_width = rank_widths[len(rank_widths) // 2]  # one width per anchor; never empty
+    column_margin = max(median_rank_width * 0.5, 5.0)
+
+    # Try to split the JABATAN side into STRUKTURAL (jabatan_dinas) and
+    # DALAM SPRIN (jabatan_sprint) from the X-centers of jabatan-side
+    # lines. This is a largest-gap split, not a clustering pass: sort the
+    # X-centers of all lines right of PANGKAT, find the widest gap
+    # between neighbours, and use its midpoint as the column boundary.
+    # KET is typically the right-most narrow column; we let it bleed
+    # into jabatan_sprint since it's commonly empty.
+    jabatan_xs: list[float] = []
+    for ln in ocr_lines:
+        x = _box_x_center(ln.box)
+        if x > pangkat_x + column_margin and ln.text.strip():
+            jabatan_xs.append(x)
+    jabatan_split_x: float | None = None
+    if len(jabatan_xs) >= 4:
+        jabatan_xs.sort()
+        max_gap = 0.0
+        max_gap_x: float | None = None
+        for k in range(1, len(jabatan_xs)):
+            gap = jabatan_xs[k] - jabatan_xs[k - 1]
+            if gap > max_gap:
+                max_gap = gap
+                max_gap_x = (jabatan_xs[k] + jabatan_xs[k - 1]) / 2
+        # Only use the split if the gap is meaningfully larger than a
+        # within-column gap (heuristic: > 1.5× median rank width).
+        if max_gap_x is not None and max_gap > median_rank_width * 1.5:
+            jabatan_split_x = max_gap_x
+
+    # Pre-compute each anchor's y_center for midpoint row dividers.
+    anchor_y_centers = [(a["y_top"] + a["y_bottom"]) / 2 for a in anchors]
+
+    rows: list[PersonnelEntry] = []
+    for i, anchor in enumerate(anchors):
+        # Row Y span: from the midpoint with the previous anchor up to
+        # (but excluding) the midpoint with the next one. Midpoints,
+        # rather than the previous anchor's y_bottom, keep row N's tail
+        # (e.g. a last name fragment "M.H.") out of row N+1; the
+        # half-open span stops a line on a divider joining both rows.
+        y_lo = (
+            (anchor_y_centers[i - 1] + anchor_y_centers[i]) / 2
+            if i > 0
+            else float("-inf")
+        )
+        y_hi = (
+            (anchor_y_centers[i] + anchor_y_centers[i + 1]) / 2
+            if i + 1 < len(anchors)
+            else float("inf")
+        )
+
+        nama_pieces: list[tuple[float, str]] = []
+        struktural_pieces: list[tuple[float, str]] = []
+        sprint_pieces: list[tuple[float, str]] = []
+        for j, ln in enumerate(ocr_lines):
+            if j in anchor["member_idxs"]:
+                continue
+            text = ln.text.strip()
+            if not text:
+                continue
+            x = _box_x_center(ln.box)
+            y = _box_y_center(ln.box)
+            if not (y_lo <= y < y_hi):
+                continue
+            if x < pangkat_x - column_margin:
+                # NAMA side
+                if _RE_NAME_BLOCKLIST.match(text):
+                    continue
+                if _RE_BARE_ROW_NUMBER.match(text):
+                    continue
+                if not _is_plausible_name(text):
+                    continue
+                nama_pieces.append((y, text))
+            elif x > pangkat_x + column_margin:
+                # JABATAN side — split into STRUKTURAL vs DALAM SPRIN
+                # using the geometric column boundary detected above.
+                if _RE_NAME_BLOCKLIST.match(text):
+                    continue
+                if jabatan_split_x is not None and x > jabatan_split_x:
+                    sprint_pieces.append((y, text))
+                else:
+                    struktural_pieces.append((y, text))
+            # else: in PANGKAT column or column margin — skip
+
+        nama_pieces.sort(key=lambda p: p[0])
+        struktural_pieces.sort(key=lambda p: p[0])
+        sprint_pieces.sort(key=lambda p: p[0])
+
+        # Strip leading row number from the first nama piece (e.g. "1 F. GUNTUR"
+        # collapses to "F. GUNTUR" if the row marker happens to share a box).
+        if nama_pieces:
+            head = _RE_LEADING_ROW_NUMBER.sub("", nama_pieces[0][1]).strip()
+            nama_pieces[0] = (nama_pieces[0][0], head)
+
+        def _join(pieces: list[tuple[float, str]]) -> str | None:
+            text = " ".join(t for _, t in pieces if t).strip()
+            text = " ".join(text.split())
+            return text or None
+
+        rows.append(
+            PersonnelEntry(
+                no=None,
+                pangkat=anchor["pangkat"],
+                nrp=None,
+                nama=_join(nama_pieces),
+                jabatan_dinas=_join(struktural_pieces),
+                jabatan_sprint=_join(sprint_pieces),
+                keterangan=None,
+            )
+        )
+    return rows
+
+
 def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Heuristic: did PP-Structure produce useless rows?

--- a/src/ocr_sprint/pipeline/ocr.py
+++ b/src/ocr_sprint/pipeline/ocr.py
@@ -36,6 +36,73 @@ class OCRLine:
    box: tuple[tuple[float, float], ...]  # 4 (x, y) corner points


+def _line_y_center(line: OCRLine) -> float:
+    return sum(y for _, y in line.box) / len(line.box)  # mean y of the box corners
+
+
+def _line_x_left(line: OCRLine) -> float:
+    return min(x for x, _ in line.box)  # smallest corner x
+
+
+def _line_height(line: OCRLine) -> float:
+    top = min(y for _, y in line.box)
+    return max(y for _, y in line.box) - top  # largest corner y minus smallest
+
+
+def sort_lines_by_layout(lines: list[OCRLine]) -> list[OCRLine]:
+    """Return ``lines`` rearranged into visual reading order.
+
+    PaddleOCR emits boxes in detection order, which on dense tables
+    (e.g. Polda Kalbar Akpol-panitia sprint) interleaves rows and
+    columns — a row's KET cell may precede its NAMA cell — and breaks
+    every downstream extractor that assumes top-to-bottom row order.
+
+    Reading order is rebuilt in three steps:
+
+    1. Order all lines by ``y_center``.
+    2. Walk that order, keeping a line in the current row-band while its
+       ``y_center`` is within half the median line height (at least 1.0)
+       of the band's mean ``y_center``; otherwise open a new band.
+    3. Emit the bands in that order, each sorted by ``x_left``.
+
+    Empty input gives ``[]``; with no positive-height line the order is kept.
+    """
+    if not lines:
+        return []
+
+    positive = [h for h in map(_line_height, lines) if h > 0]
+    if not positive:
+        return list(lines)
+    positive.sort()
+    # Upper median of the positive heights; half of it (floored at 1.0)
+    # is how far a line may sit from a band's mean y and still join it.
+    tolerance = max(1.0, positive[len(positive) // 2] * 0.5)
+
+    bands: list[list[OCRLine]] = []
+    band_ys: list[float] = []  # y-centers of the band being filled
+    band_mean = 0.0
+    for ln in sorted(lines, key=_line_y_center):
+        y = _line_y_center(ln)
+        if bands and abs(y - band_mean) <= tolerance:
+            # Same visual row. Re-average over every member so a
+            # slowly drifting set of cells doesn't split mid-row.
+            bands[-1].append(ln)
+            band_ys.append(y)
+            band_mean = sum(band_ys) / len(band_ys)
+        else:
+            # First line, or too far from the band's mean: new band.
+            bands.append([ln])
+            band_ys = [y]
+            band_mean = y
+
+    # Left-to-right inside each band, bands kept in ascending-y order.
+    return [
+        ln
+        for band in bands
+        for ln in sorted(band, key=_line_x_left)
+    ]
+
+
@dataclass(frozen=True)
 class OCRPage:
    """OCR output for a single page."""
@@ -44,8 +111,8 @@ class OCRPage:

    @property
    def text(self) -> str:
-        """Reconstruct page text by concatenating lines (order = paddle's output order)."""
-        return "\n".join(line.text for line in self.lines)
+        """Reconstruct page text in visual reading order (top-to-bottom, left-to-right)."""
+        return "\n".join(line.text for line in sort_lines_by_layout(self.lines))

    @property
    def mean_confidence(self) -> float:
--- a/src/ocr_sprint/pipeline/orchestrator.py
+++ b/src/ocr_sprint/pipeline/orchestrator.py
@@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route
 from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
 from ocr_sprint.pipeline.extract.personnel import extract_personnel
 from ocr_sprint.pipeline.extract.personnel_text import (
+    extract_personnel_from_ocr_lines,
    extract_personnel_from_text,
    is_low_quality,
 )
@@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput:
    # through the preferred path.
    if is_low_quality(personel):
        fallback_rows = extract_personnel_from_text(full_text)
+        # If text-based fallback produced rows but they all lack NRP
+        # (Pass 3 territory), retry with the column-aware extractor that
+        # uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
+        # Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
+        # nama/jabatan because lines are interleaved within each Y-band;
+        # the columnar variant restricts each field to its visual column.
+        text_only_no_nrp = bool(fallback_rows) and all(
+            r.nrp is None for r in fallback_rows
+        )
+        if (not fallback_rows) or text_only_no_nrp:
+            ocr_lines = [ln for page in ocr_pages for ln in page.lines]
+            columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
+            if columnar_rows and (
+                not fallback_rows or len(columnar_rows) >= len(fallback_rows)
+            ):
+                fallback_rows = columnar_rows
        if fallback_rows:
            personel = fallback_rows
+            # Pass 3 / columnar emit rows with nrp=None for sprint
+            # templates without an NRP column. Surface that with a
+            # distinct flag so operators know to expect missing NRPs by
+            # design rather than by OCR failure.
+            no_nrp = all(r.nrp is None for r in fallback_rows)
+            if no_nrp:
+                table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
+            else:
                table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
            _logger.info(
                "pipeline.personnel_text_fallback",
                fallback_rows=len(fallback_rows),
+                no_nrp=no_nrp,
            )

    untuk_items = find_untuk_list(full_text)
--- a/src/ocr_sprint/pipeline/table.py
+++ b/src/ocr_sprint/pipeline/table.py
@@ -71,11 +71,16 @@ def _build_pp_structure() -> PPStructure:
    from paddleocr import PPStructure

    s = get_settings()
-    _logger.info("pp_structure.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu)
+    # PPStructure layout models only support 'en' and 'ch', not 'latin'.
+    # Use 'en' for layout/table detection — it's language-agnostic (detects
+    # table structure, not text language). OCR within cells still works for
+    # Indonesian text because the recognition model handles Latin scripts.
+    pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang
+    _logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu)
    # layout=True so that PP-Structure also returns figure/text regions; we
    # filter to tables only afterwards. show_log=False to keep stdout clean.
    return PPStructure(
-        lang=s.ocr_lang,
+        lang=pp_lang,
        use_gpu=s.ocr_use_gpu,
        layout=True,
        show_log=False,
--- a/src/ocr_sprint/schemas/document.py
+++ b/src/ocr_sprint/schemas/document.py
@@ -10,6 +10,7 @@ from uuid import UUID, uuid4
 from pydantic import BaseModel, ConfigDict, Field

 from ocr_sprint.schemas.extraction import ExtractionResult
+from ocr_sprint.schemas.personnel import PersonnelEntry


 class SourceKind(str, Enum):
@@ -52,7 +53,7 @@ class DocumentResponse(BaseModel):
    job_id: UUID
    status: DocumentStatus
    confidence: float | None = None
-    data: ExtractionResult | None = None
+    data: list[PersonnelEntry] | None = None
    review_flags: list[str] = Field(default_factory=list)
    error: str | None = None
    # Phase 6 — HITL review state.
--- a/src/ocr_sprint/schemas/extraction.py
+++ b/src/ocr_sprint/schemas/extraction.py
@@ -22,6 +22,7 @@ class ReviewFlag(str, Enum):
    LLM_FALLBACK = "llm_fallback"
    LLM_UNAVAILABLE = "llm_unavailable"
    PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
+    PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp"
    INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"


--- a/tests/unit/test_ocr_layout.py
+++ b/tests/unit/test_ocr_layout.py
@@ -0,0 +1,75 @@
+"""Tests for OCR layout reordering.
+
+PaddleOCR emits text boxes in detection order, not visual reading order.
+On dense table layouts (Polda Kalbar Akpol-panitia regression) this
+interleaves columns within a row and breaks every downstream extractor
+that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
+reading order from the bounding-box geometry.
+"""
+
+from __future__ import annotations
+
+from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
+
+
+def _box(x: float, y: float, w: float = 30, h: float = 15):
+    return tuple((x + dx, y + dy) for dx, dy in ((0, 0), (w, 0), (w, h), (0, h)))
+
+
+def _make(text: str, x: float, y: float) -> OCRLine:
+    return OCRLine(box=_box(x, y), confidence=1.0, text=text)  # default-size box at (x, y)
+
+
+class TestSortLinesByLayout:
+    def test_empty_returns_empty(self) -> None:
+        assert sort_lines_by_layout([]) == []
+
+    def test_already_sorted_is_stable(self) -> None:
+        cells = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
+        assert [ln.text for ln in sort_lines_by_layout(cells)] == ["A", "B", "C"]
+
+    def test_reorders_column_first_detection_to_row_first(self) -> None:
+        # A 2-row x 3-column table whose cells Paddle reported column by
+        # column (B, then A, then C) rather than row by row.
+        column_major = [
+            _make("B1", 50, 10),
+            _make("B2", 50, 30),
+            _make("A1", 10, 10),
+            _make("A2", 10, 30),
+            _make("C1", 90, 10),
+            _make("C2", 90, 30),
+        ]
+        texts = [ln.text for ln in sort_lines_by_layout(column_major)]
+        assert texts == ["A1", "B1", "C1", "A2", "B2", "C2"]
+
+    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
+        # Boxes of one visual row are rarely perfectly y-aligned in real
+        # OCR output; they must still end up in the same band.
+        jittered = [
+            _make("LEFT", 10, 10),
+            _make("MID", 50, 12),  # 2px below LEFT — same row visually
+            _make("RIGHT", 90, 11),
+        ]
+        texts = [ln.text for ln in sort_lines_by_layout(jittered)]
+        assert texts == ["LEFT", "MID", "RIGHT"]
+
+    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
+        # A y offset above ~½ line-height must open a new band instead
+        # of collapsing both rows into one.
+        two_rows = [
+            _make("ROW1A", 10, 10),
+            _make("ROW1B", 50, 10),
+            _make("ROW2A", 10, 30),  # gap of 20 vs height 15 → new band
+            _make("ROW2B", 50, 30),
+        ]
+        texts = [ln.text for ln in sort_lines_by_layout(two_rows)]
+        assert texts == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]
+
+    def test_ocrpage_text_uses_sorted_order(self) -> None:
+        # ``OCRPage.text`` must join lines in layout order, not input order.
+        shuffled = [
+            _make("RIGHT", 90, 10),
+            _make("LEFT", 10, 10),
+            _make("BOTTOM", 10, 30),
+        ]
+        assert OCRPage(lines=shuffled).text == "LEFT\nRIGHT\nBOTTOM"
--- a/tests/unit/test_personnel_text_fallback.py
+++ b/tests/unit/test_personnel_text_fallback.py
@@ -8,11 +8,18 @@ recover at least the rank + NRP for every row.
 from __future__ import annotations

 from ocr_sprint.pipeline.extract.personnel_text import (
+    extract_personnel_from_ocr_lines,
    extract_personnel_from_text,
    is_low_quality,
 )
+from ocr_sprint.pipeline.ocr import OCRLine
 from ocr_sprint.schemas.personnel import PersonnelEntry

+
+def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
+    x2, y2 = x + w, y + h  # corner opposite (x, y) of the axis-aligned box
+    return OCRLine(text=text, confidence=1.0, box=((x, y), (x2, y), (x2, y2), (x, y2)))
+
 _CIMAHI_FIXTURE = """\
 DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
 NO
@@ -115,6 +122,86 @@ class TestExtractPersonnelFromText:
        names = [r.nama for r in rows]
        assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]

+    def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
+        # Polres Banjar regression: PaddleOCR can merge several table rows
+        # into one OCR line; each rank+NRP pair on it must still yield its
+        # own row (per-line ``re.search`` used to stop at the first match).
+        squished = (
+            "DAFTAR NAMA INSTRUKTUR\n"
+            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
+            "INSTRUKTUR LAT PRA OPS "
+            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
+            "KASAT RESKRIM SDA "
+            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
+            "KASATINTELKAM POLRES BANJAR SDA\n"
+        )
+        entries = extract_personnel_from_text(squished)
+        assert len(entries) == 3
+        first, second, third = entries
+        assert [e.pangkat for e in entries] == ["KOMPOL", "AKP", "AKP"]
+        assert [e.nrp for e in entries] == ["70100418", "77020049", "84011113"]
+        assert first.nama == "CUCU JUHANA, A.K.S."
+        assert second.nama is not None and "HERU SAMSUL BAHRI" in second.nama
+        assert third.nama is not None and "YAYAN SOPIANA" in third.nama
+
+    def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
+        # Companion to the squished case: here each table row starts on
+        # its own OCR line, so no line carries more than one rank+NRP
+        # pair. Guards against the finditer-based matching regressing
+        # this ordinary layout.
+        per_row = (
+            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n"
+            "INSTRUKTUR LAT PRA OPS\n"
+            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n"
+            "SDA\n"
+            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n"
+            "POLRES BANJAR SDA\n"
+        )
+        entries = extract_personnel_from_text(per_row)
+        assert [e.pangkat for e in entries] == ["KOMPOL", "AKP", "AKP"]
+        assert [e.nrp for e in entries] == ["70100418", "77020049", "84011113"]
+        assert entries[0].nama == "CUCU JUHANA, A.K.S."
+
+    def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
+        # Polda Kalbar Akpol-panitia regression: templates with no NRP
+        # column (panitia, undangan) must still yield rows through the
+        # rank-only Pass 3. The fixture has names spread over several
+        # narrow-column OCR lines and the multi-token rank "KOMBES POL"
+        # broken across two lines.
+        no_nrp_text = (
+            "DAFTAR NAMA PANITIA\n"
+            "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
+            "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
+            "KOMBES\nPOL\n"
+            "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
+            "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
+            "AKBP\n"
+            "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
+            "3\nPRAYITNO, S.H.,\nM.H.\n"
+            "KOMPOL\n"
+            "KASUBBAG DIAPERS\nANGGOTA\n"
+        )
+        entries = extract_personnel_from_text(no_nrp_text)
+        assert len(entries) == 3
+        assert [e.pangkat for e in entries] == ["KOMBES POL", "AKBP", "KOMPOL"]
+        # Pass 3 never fills in an NRP, by design.
+        assert all(e.nrp is None for e in entries)
+        assert entries[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
+        assert entries[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
+        assert entries[2].nama == "PRAYITNO, S.H., M.H."
+        assert entries[0].jabatan_dinas is not None and "KARO SDM" in entries[0].jabatan_dinas
+
+    def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
+        # When the sprint carries NRPs, Pass 1 succeeds and Pass 3 must
+        # stay out of the way rather than add duplicate/contaminating rows.
+        with_nrp = (
+            "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
+            "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
+        )
+        entries = extract_personnel_from_text(with_nrp)
+        assert len(entries) == 2
+        assert all(e.nrp is not None for e in entries)
+
    def test_still_blocks_bare_column_header_tokens(self) -> None:
        # Word-boundary fix must still reject the actual column-header
        # rows that motivated the blocklist in the first place.
@@ -124,6 +211,94 @@ class TestExtractPersonnelFromText:
        assert rows[0].nama == "REAL NAME"


+class TestExtractPersonnelFromOcrLines:
+    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.
+
+    Verifies that bounding-box geometry preserves column boundaries on
+    dense tables where text-only Pass 3 bleeds adjacent columns into
+    nama/jabatan.
+    """
+
+    def _kalbar_lines(self) -> list[OCRLine]:
+        # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN
+        # X columns: 10, 100, 250, 380, 520. Each row may have multi-line cells.
+        return [
+            # Row 1 — KOMBES POL spans two stacked OCR boxes
+            _ocr_line("1", 10, 100),
+            _ocr_line("F. GUNTUR", 100, 100),
+            _ocr_line("SUNOTO, S.I.K.,", 100, 120),
+            _ocr_line("M.H.", 100, 140),
+            _ocr_line("KOMBES", 250, 100),
+            _ocr_line("POL", 250, 120),
+            _ocr_line("KARO SDM", 380, 100),
+            _ocr_line("POLDA KALBAR", 380, 120),
+            _ocr_line("KETUA", 520, 100),
+            _ocr_line("PELAKSANA", 520, 120),
+            # Row 2
+            _ocr_line("2", 10, 200),
+            _ocr_line("JUDA TRISNO", 100, 200),
+            _ocr_line("TAMPUBOLON,", 100, 220),
+            _ocr_line("S.H., S.I.K., M.H.", 100, 240),
+            _ocr_line("AKBP", 250, 200),
+            _ocr_line("KABAGDALPERS", 380, 200),
+            _ocr_line("RO SDM", 380, 220),
+            _ocr_line("POLDA KALBAR", 380, 240),
+            _ocr_line("SEKRETARIS", 520, 200),
+            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
+            _ocr_line("9", 10, 500),
+            _ocr_line("FITRIANSYAH,", 100, 500),
+            _ocr_line("S.E.", 100, 520),
+            _ocr_line("PENATA", 250, 500),
+            _ocr_line("TK I", 250, 520),
+            _ocr_line("KAURKEU", 380, 500),
+            _ocr_line("RO SDM", 380, 520),
+            _ocr_line("POLDA KALBAR", 380, 540),
+            _ocr_line("BENDAHARA", 520, 500),
+        ]
+
+    def test_extracts_three_rows(self) -> None:
+        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
+        assert len(rows) == 3
+        assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "PENATA TK I"]
+
+    def test_nama_is_assembled_only_from_nama_column(self) -> None:
+        # Each row's nama must contain *all* its multi-line fragments
+        # and *only* its multi-line fragments — no bleed from struktural.
+        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
+        assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
+        assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
+        assert rows[2].nama == "FITRIANSYAH, S.E."
+
+    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
+        # The geometric column boundary must split STRUKTURAL (jabatan_dinas)
+        # from DALAM SPRIN (jabatan_sprint).
+        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
+        assert rows[0].jabatan_dinas == "KARO SDM POLDA KALBAR"
+        assert rows[0].jabatan_sprint == "KETUA PELAKSANA"
+        assert rows[1].jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
+        assert rows[1].jabatan_sprint == "SEKRETARIS"
+
+    def test_returns_empty_when_no_rank_anchors(self) -> None:
+        lines = [
+            _ocr_line("DAFTAR NAMA", 100, 50),
+            _ocr_line("HEADER", 100, 100),
+        ]
+        assert extract_personnel_from_ocr_lines(lines) == []
+
+    def test_returns_empty_for_empty_input(self) -> None:
+        assert extract_personnel_from_ocr_lines([]) == []
+
+    def test_no_row_bleed_between_consecutive_rows(self) -> None:
+        # Row 1's last name fragment ("F. GUNTUR") sits BELOW its rank
+        # line but inside row 1's visual span. It must NOT leak into
+        # row 2's nama, which should start with "JUDA TRISNO".
+        rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
+        assert rows[1].nama is not None
+        assert rows[1].nama.startswith("JUDA TRISNO")
+        assert "GUNTUR" not in rows[1].nama
+        assert "SUNOTO" not in rows[1].nama
+
+
 class TestIsLowQuality:
    def test_empty_list_is_low_quality(self) -> None:
        assert is_low_quality([]) is True
--- a/update.ps1
+++ b/update.ps1
@@ -0,0 +1,60 @@
+#!/usr/bin/env pwsh
+# update.ps1 - One-command update & restart for ocr-sprint-service (local dev)
+# Steps 1-4 end the script with exit code 1 on an unrecoverable failure, before the dev server is started.
+$Port = 8000
+
+# ── [1/5] Git pull ──────────────────────────────────────────────────────────
+Write-Host "`n[1/5] Pulling latest code..." -ForegroundColor Cyan
+git pull
+if ($LASTEXITCODE -ne 0) { Write-Host "  git pull failed. Fix the repository state and re-run." -ForegroundColor Red; exit 1 }
+
+# ── [2/5] Install/update dependencies ───────────────────────────────────────
+Write-Host "`n[2/5] Installing/updating dependencies..." -ForegroundColor Cyan
+pip install -e ".[dev]" -q
+if ($LASTEXITCODE -ne 0) { Write-Host "  Dependency installation failed." -ForegroundColor Red; exit 1 }
+
+# ── [3/5] Database migration ─────────────────────────────────────────────────
+Write-Host "`n[3/5] Running database migrations..." -ForegroundColor Cyan
+alembic upgrade head
+if ($LASTEXITCODE -ne 0) {
+    # NOTE(review): `stamp head` marks migrations as applied without running them - confirm this fallback is intended.
+    Write-Host "  Migration conflict detected, stamping current state as head..." -ForegroundColor Yellow
+    alembic stamp head
+    Write-Host "  Retrying upgrade for any remaining new migrations..." -ForegroundColor Yellow
+    alembic upgrade head
+    if ($LASTEXITCODE -ne 0) {
+        Write-Host "  Migration still failed. Please check alembic manually." -ForegroundColor Red
+        exit 1
+    }
+}
+Write-Host "  Migrations OK." -ForegroundColor Green
+
+# ── [4/5] Free up port ───────────────────────────────────────────────────────
+Write-Host "`n[4/5] Checking port $Port..." -ForegroundColor Cyan
+# Get-NetTCPConnection may list the same process more than once (one entry per listener), so PIDs are de-duplicated.
+$connections = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
+if ($connections) {
+    foreach ($procId in ($connections.OwningProcess | Select-Object -Unique)) {
+        $procName = (Get-Process -Id $procId -ErrorAction SilentlyContinue).Name
+        Write-Host "  Port $Port used by '$procName' (PID $procId), killing..." -ForegroundColor Yellow
+        Stop-Process -Id $procId -Force -ErrorAction SilentlyContinue
+    }
+    # Poll every 500 ms until the port is actually released (max 5 seconds)
+    $waited = 0
+    do {
+        Start-Sleep -Milliseconds 500
+        $waited += 500
+        $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
+    } while ($still -and $waited -lt 5000)
+    if ($still) {
+        Write-Host "  Port $Port still in use after waiting. Try a different port or restart manually." -ForegroundColor Red
+        exit 1
+    }
+    Write-Host "  Port $Port freed." -ForegroundColor Green
+} else {
+    Write-Host "  Port $Port is free." -ForegroundColor Green
+}
+
+# ── [5/5] Start dev server ───────────────────────────────────────────────────
+Write-Host "`n[5/5] Starting dev server on port $Port (Ctrl+C to stop)..." -ForegroundColor Cyan
+uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port $Port