feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing

This commit is contained in:
Adriankf59
2026-04-26 17:16:47 +07:00
parent dbcf480130
commit 002821ca07
20 changed files with 3326 additions and 20 deletions

View File

@@ -0,0 +1,18 @@
{
"permissions": {
"allow": [
"Bash(python -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
"Bash(python -c \"import sys; print\\(sys.executable\\)\")",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit -x -q)",
"Bash(git stash *)",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_api.py::test_documents_sync_returns_pipeline_output -x -q)",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit --ignore=tests/unit/test_api.py -q)",
"Bash(.venv/Scripts/python.exe -c ' *)",
"Bash(xargs grep *)",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit -q --ignore=tests/unit/test_api.py --ignore=tests/unit/test_api_hitl.py --ignore=tests/unit/test_blob_storage.py)",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_ocr_layout.py tests/unit/test_personnel_text_fallback.py -q)",
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py tests/unit/test_ocr_layout.py -q)"
]
}
}

View File

@@ -1,9 +1,10 @@
.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
.PHONY: help install dev update fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
help:
@echo "Targets:"
@echo " install - install runtime + dev deps in current env"
@echo " dev - run FastAPI app with autoreload"
@echo " update - git pull + install deps + migrate db + run dev server"
@echo " fmt - format code with ruff"
@echo " lint - lint with ruff"
@echo " typecheck - run mypy"
@@ -21,6 +22,16 @@ install:
dev:
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
update:
@echo "[1/4] Pulling latest code..."
git pull
@echo "[2/4] Installing/updating dependencies..."
pip install -e ".[dev]"
@echo "[3/4] Running database migrations..."
alembic upgrade head
@echo "[4/4] Starting dev server..."
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
fmt:
ruff format src tests
ruff check --fix src tests

View File

@@ -0,0 +1,858 @@
# Deployment OCR Sprint Service (Existing Stack)
Panduan deployment untuk server dengan Python 3.12.3, PostgreSQL 16.13, dan Redis 7.0.15 yang sudah terinstall.
## Informasi Server Anda
- **OS**: Ubuntu 24.04
- **Python**: 3.12.3 ✅
- **PostgreSQL**: 16.13 ✅
- **Redis**: 7.0.15 ✅
Semua versi sudah kompatibel dan optimal untuk OCR Sprint Service!
## Langkah 1: Install System Libraries untuk OpenCV & PaddleOCR
```bash
# Update package list
sudo apt update
# Install libraries yang dibutuhkan oleh OpenCV dan PaddleOCR
sudo apt install -y \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
libgomp1 \
libmagic1 \
python3.12-venv \
python3.12-dev \
build-essential \
git
```
## Langkah 2: Setup PostgreSQL Database
```bash
# Login ke PostgreSQL
sudo -u postgres psql
```
Jalankan SQL commands berikut:
```sql
-- Create user dan database
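-- Ganti placeholder dengan password acak yang Anda generate sendiri (lihat perintah di bawah); jangan pernah menulis password asli di dokumentasi
CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat';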
CREATE DATABASE ocr_sprint OWNER ocr;
-- Grant privileges
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
-- Connect ke database untuk grant schema privileges
\c ocr_sprint
-- Grant schema privileges (PostgreSQL 15+)
GRANT ALL ON SCHEMA public TO ocr;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ocr;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ocr;
-- Verify
\l ocr_sprint
\du ocr
-- Exit
\q
```
**Generate password yang aman:**
```bash
# Generate random password (hex: aman dipakai langsung di DATABASE_URL tanpa URL-encoding)
openssl rand -hex 32
# Output berupa string acak 64 karakter; jangan commit hasilnya ke repository
```
Simpan password ini, akan digunakan di konfigurasi nanti.
## Langkah 3: Verify Redis
```bash
# Check Redis status
sudo systemctl status redis-server
# Test connection
redis-cli ping
# Expected output: PONG
# Check Redis config (opsional)
redis-cli CONFIG GET maxmemory
```
Jika Redis belum running:
```bash
sudo systemctl enable redis-server
sudo systemctl start redis-server
```
## Langkah 4: Create Application User
```bash
# Create dedicated user untuk aplikasi
sudo useradd -m -s /bin/bash ocr
# Create application directory
sudo mkdir -p /opt/ocr-sprint-service
sudo chown ocr:ocr /opt/ocr-sprint-service
```
## Langkah 5: Clone dan Install Application
```bash
# Switch ke user ocr
sudo su - ocr
# Clone repository
cd /opt
git clone https://github.com/Adriankf59/ocr-sprint-service.git
cd ocr-sprint-service
# Create virtual environment dengan Python 3.12
python3.12 -m venv .venv
# Activate virtual environment
source .venv/bin/activate
# Verify Python version di venv
python --version
# Expected: Python 3.12.3
# Upgrade pip
pip install --upgrade pip setuptools wheel
# Install application dengan OCR dependencies
# Ini akan download ~1.5GB PaddlePaddle wheels
pip install -e ".[ocr]"
# Verify installation
python -c "import paddleocr; print('PaddleOCR OK')"
python -c "import cv2; print('OpenCV OK')"
python -c "import fastapi; print('FastAPI OK')"
```
## Langkah 6: Konfigurasi Application
```bash
# Masih sebagai user ocr
cd /opt/ocr-sprint-service
# Copy environment template
cp .env.example .env
# Edit konfigurasi
nano .env
```
**Konfigurasi `/opt/ocr-sprint-service/.env`:**
```bash
# ==== App ====
APP_ENV=prod
APP_HOST=0.0.0.0
APP_PORT=8000
APP_LOG_LEVEL=INFO
# ==== Storage ====
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
BLOB_MAX_UPLOAD_MB=25
# ==== OCR ====
OCR_LANG=latin
OCR_USE_GPU=false
OCR_MAX_IMAGE_SIDE=2200
# ==== Preprocessing ====
PREPROCESS_TARGET_DPI=300
PREPROCESS_DENOISE=true
PREPROCESS_DESKEW=true
PREPROCESS_DETECT_DOCUMENT=true
PREPROCESS_REMOVE_SHADOW=true
PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
# ==== Table Extraction ====
TABLES_ENABLED=true
# ==== Confidence ====
CONFIDENCE_AUTO_APPROVE=0.95
CONFIDENCE_NEEDS_REVIEW=0.85
# ==== LLM (Phase 5, optional - disable untuk sekarang) ====
LLM_ENABLED=false
# ==== Async Pipeline ====
QUEUE_ENABLED=true
REDIS_URL=redis://localhost:6379/0
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
# ==== Database ====
# Ganti 'your-password-here' dengan password yang Anda generate di Langkah 2
# Karakter khusus di password (mis. @ : / + =) harus di-URL-encode, contoh: @ menjadi %40
DATABASE_URL=postgresql+psycopg://ocr:your-password-here@localhost:5432/ocr_sprint
DATABASE_ECHO=false
# ==== Auth (WAJIB untuk production!) ====
# Generate dengan: openssl rand -hex 32
API_KEYS=paste-api-key-1-here,paste-api-key-2-here
API_KEY_HEADER=X-API-Key
```
**Generate API keys:**
```bash
# Generate 2 API keys
echo "API Key 1: $(openssl rand -hex 32)"
echo "API Key 2: $(openssl rand -hex 32)"
```
Copy output dan paste ke `API_KEYS` di file `.env`.
**Create storage directories:**
```bash
mkdir -p /opt/ocr-sprint-service/storage/blobs
chmod 755 /opt/ocr-sprint-service/storage
```
## Langkah 7: Run Database Migrations
```bash
# Masih sebagai user ocr, dengan venv activated
cd /opt/ocr-sprint-service
source .venv/bin/activate
# Run migrations
alembic upgrade head
# Verify - should show current revision
alembic current
# Expected output: (head) atau revision number
```
## Langkah 8: Test Manual Run
```bash
# Masih sebagai user ocr
cd /opt/ocr-sprint-service
source .venv/bin/activate
# Test API server
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
```
**Di terminal lain (sebagai user ubuntu):**
```bash
# Test health check
curl http://localhost:8000/api/v1/health
# Expected: {"status":"ok","version":"0.1.0"}
# Test dengan sample file (jika ada)
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
-H "X-API-Key: your-api-key-here" \
-F "file=@/path/to/test.pdf"
```
Jika berhasil, stop server dengan `Ctrl+C`.
## Langkah 9: Setup Systemd Services
```bash
# Exit dari user ocr
exit
# Kembali sebagai user ubuntu dengan sudo
```
### Create API Service
```bash
sudo nano /etc/systemd/system/ocr-sprint-api.service
```
**Content:**
```ini
[Unit]
Description=OCR Sprint API Service
After=network.target postgresql.service redis-server.service
Wants=postgresql.service redis-server.service
[Service]
Type=simple
User=ocr
Group=ocr
WorkingDirectory=/opt/ocr-sprint-service
# Environment
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
EnvironmentFile=/opt/ocr-sprint-service/.env
# Start command - 4 workers untuk production
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
ocr_sprint.main:app \
--host 0.0.0.0 \
--port 8000 \
--workers 4 \
--log-level info
# Restart policy
Restart=always
RestartSec=10
StartLimitInterval=0
# Resource limits
LimitNOFILE=65536
# Security
NoNewPrivileges=true
PrivateTmp=true
[Install]
WantedBy=multi-user.target
```
### Create Celery Worker Service
```bash
sudo nano /etc/systemd/system/ocr-sprint-worker.service
```
**Content:**
```ini
[Unit]
Description=OCR Sprint Celery Worker
After=network.target postgresql.service redis-server.service ocr-sprint-api.service
Wants=postgresql.service redis-server.service
[Service]
Type=simple
User=ocr
Group=ocr
WorkingDirectory=/opt/ocr-sprint-service
# Environment
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
EnvironmentFile=/opt/ocr-sprint-service/.env
# Start command - concurrency 2 untuk CPU dengan 4 cores
# Sesuaikan dengan jumlah CPU cores server Anda
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
-A ocr_sprint.worker.celery_app \
worker \
--loglevel=info \
--concurrency=2 \
--max-tasks-per-child=100
# Restart policy
Restart=always
RestartSec=10
StartLimitInterval=0
# Resource limits
LimitNOFILE=65536
# Security
NoNewPrivileges=true
PrivateTmp=true
[Install]
WantedBy=multi-user.target
```
### Enable dan Start Services
```bash
# Reload systemd
sudo systemctl daemon-reload
# Enable services (auto-start on boot)
sudo systemctl enable ocr-sprint-api
sudo systemctl enable ocr-sprint-worker
# Start services
sudo systemctl start ocr-sprint-api
sudo systemctl start ocr-sprint-worker
# Check status
sudo systemctl status ocr-sprint-api
sudo systemctl status ocr-sprint-worker
```
**Expected output:** `active (running)` dengan warna hijau.
### View Logs
```bash
# API logs (real-time)
sudo journalctl -u ocr-sprint-api -f
# Worker logs (real-time)
sudo journalctl -u ocr-sprint-worker -f
# Last 50 lines
sudo journalctl -u ocr-sprint-api -n 50
sudo journalctl -u ocr-sprint-worker -n 50
```
## Langkah 10: Install dan Setup Nginx
```bash
# Install Nginx dan Certbot
sudo apt install -y nginx certbot python3-certbot-nginx
# Check Nginx status
sudo systemctl status nginx
```
### Create Nginx Configuration
```bash
sudo nano /etc/nginx/sites-available/ocr-sprint
```
**Content (ganti `ocr.yourdomain.com` dengan domain Anda):**
```nginx
# Upstream
upstream ocr_api {
server 127.0.0.1:8000;
keepalive 32;
}
# Rate limiting
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
server {
listen 80;
server_name ocr.yourdomain.com;
# Max upload size
client_max_body_size 30M;
client_body_buffer_size 128k;
# Timeouts
proxy_connect_timeout 300s;
proxy_send_timeout 300s;
proxy_read_timeout 300s;
send_timeout 300s;
# Logging
access_log /var/log/nginx/ocr-sprint-access.log;
error_log /var/log/nginx/ocr-sprint-error.log;
# API endpoints
location /api/ {
limit_req zone=api_limit burst=20 nodelay;
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Connection "";
proxy_buffering off;
}
# Health check
location /api/v1/health {
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
access_log off;
}
# Metrics (restrict access)
location /metrics {
allow 127.0.0.1;
allow 10.0.0.0/8;
deny all;
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
# API docs
location /docs {
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location /redoc {
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
}
```
### Enable Site
```bash
# Enable site
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
# Test konfigurasi (setelah site di-enable, supaya file baru ikut divalidasi)
sudo nginx -t
# Reload Nginx
sudo systemctl reload nginx
```
### Setup SSL (jika punya domain)
```bash
# Obtain certificate
sudo certbot --nginx -d ocr.yourdomain.com
# Test auto-renewal
sudo certbot renew --dry-run
```
## Langkah 11: Setup Firewall
```bash
# Check UFW status
sudo ufw status
# Allow SSH (PENTING!)
sudo ufw allow 22/tcp
# Allow HTTP dan HTTPS
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
# Enable firewall (jika belum)
sudo ufw enable
# Verify
sudo ufw status numbered
```
## Langkah 12: Verifikasi Final
### Test dari Server
```bash
# Health check
curl http://localhost:8000/api/v1/health
# Test async endpoint
curl -X POST http://localhost:8000/api/v1/documents \
-H "X-API-Key: your-api-key-here" \
-F "file=@/path/to/test.pdf"
# Expected: {"job_id":"...","status":"pending",...}
# Check job status
curl -H "X-API-Key: your-api-key-here" \
http://localhost:8000/api/v1/documents/JOB_ID_HERE
```
### Test via Domain (jika sudah setup SSL)
```bash
curl https://ocr.yourdomain.com/api/v1/health
```
### Check Services
```bash
# All services should be active
sudo systemctl status ocr-sprint-api
sudo systemctl status ocr-sprint-worker
sudo systemctl status postgresql
sudo systemctl status redis-server
sudo systemctl status nginx
```
## Monitoring
### View Logs
```bash
# API logs
sudo journalctl -u ocr-sprint-api -f
# Worker logs
sudo journalctl -u ocr-sprint-worker -f
# Nginx access logs
sudo tail -f /var/log/nginx/ocr-sprint-access.log
# Nginx error logs
sudo tail -f /var/log/nginx/ocr-sprint-error.log
```
### Prometheus Metrics
```bash
# View metrics
curl http://localhost:8000/metrics
# Key metrics:
# - ocr_documents_total
# - ocr_processing_duration_seconds
# - ocr_confidence_score
```
## Maintenance
### Restart Services
```bash
sudo systemctl restart ocr-sprint-api
sudo systemctl restart ocr-sprint-worker
```
### Update Application
```bash
# Switch ke user ocr
sudo su - ocr
cd /opt/ocr-sprint-service
# Pull latest code
git pull
# Activate venv
source .venv/bin/activate
# Update dependencies
pip install -e ".[ocr]"
# Run migrations
alembic upgrade head
# Exit
exit
# Restart services
sudo systemctl restart ocr-sprint-api
sudo systemctl restart ocr-sprint-worker
# Check logs
sudo journalctl -u ocr-sprint-api -n 50
```
### Database Backup
```bash
# Create backup directory
sudo mkdir -p /opt/ocr-sprint-service/backups
sudo chown ocr:ocr /opt/ocr-sprint-service/backups
# Manual backup
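# Seluruh pipeline dijalankan sebagai user ocr agar redirect (>) juga ditulis oleh ocr, bukan oleh shell pemanggil
sudo -u ocr sh -c 'pg_dump -h localhost -U ocr ocr_sprint | gzip > /opt/ocr-sprint-service/backups/backup_$(date +%Y%m%d_%H%M%S).sql.gz'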
```
**Setup automated backup:**
```bash
# Create backup script
sudo nano /opt/ocr-sprint-service/backup.sh
```
```bash
#!/bin/bash
BACKUP_DIR="/opt/ocr-sprint-service/backups"
DATE=$(date +%Y%m%d_%H%M%S)
mkdir -p $BACKUP_DIR
# Backup database
PGPASSWORD='your-db-password' pg_dump -h localhost -U ocr ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz
# Keep only last 7 days
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
echo "Backup completed: $DATE"
```
```bash
# Make executable; mode 700 karena script berisi password database (PGPASSWORD)
sudo chown ocr:ocr /opt/ocr-sprint-service/backup.sh
sudo chmod 700 /opt/ocr-sprint-service/backup.sh
# Setup cron (daily at 2 AM)
sudo crontab -e -u ocr
# Add line:
0 2 * * * /opt/ocr-sprint-service/backup.sh >> /opt/ocr-sprint-service/backups/backup.log 2>&1
```
## Troubleshooting
### Service tidak start
```bash
# Check detailed logs
sudo journalctl -u ocr-sprint-api -n 100 --no-pager
sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
# Check file permissions
ls -la /opt/ocr-sprint-service
ls -la /opt/ocr-sprint-service/storage
# Test manual run
sudo su - ocr
cd /opt/ocr-sprint-service
source .venv/bin/activate
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
```
### Database connection error
```bash
# Test connection
sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
# Check PostgreSQL status
sudo systemctl status postgresql
# Check PostgreSQL logs
sudo journalctl -u postgresql -n 50
```
### Redis connection error
```bash
# Test Redis
redis-cli ping
# Check Redis status
sudo systemctl status redis-server
# Check Redis logs
sudo journalctl -u redis-server -n 50
```
### Worker tidak memproses jobs
```bash
# Check Celery worker status
sudo su - ocr
cd /opt/ocr-sprint-service
source .venv/bin/activate
celery -A ocr_sprint.worker.celery_app inspect active
celery -A ocr_sprint.worker.celery_app inspect stats
# Check Redis queue
redis-cli LLEN ocr_sprint
```
### PaddleOCR error
```bash
# Re-download models
sudo su - ocr
cd /opt/ocr-sprint-service
source .venv/bin/activate
python << EOF
from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='latin')
print("Models downloaded successfully")
EOF
```
## Performance Tuning
### Check CPU cores
```bash
nproc
```
### Adjust worker concurrency
```bash
# Edit worker service
sudo nano /etc/systemd/system/ocr-sprint-worker.service
# Untuk 4 cores: --concurrency=2
# Untuk 8 cores: --concurrency=4
# Untuk 16 cores: --concurrency=8
# Reload dan restart
sudo systemctl daemon-reload
sudo systemctl restart ocr-sprint-worker
```
### PostgreSQL 16 Tuning
```bash
sudo nano /etc/postgresql/16/main/postgresql.conf
```
**Recommended settings (sesuaikan dengan RAM server):**
```
# Untuk 8GB RAM:
shared_buffers = 2GB
effective_cache_size = 6GB
maintenance_work_mem = 512MB
work_mem = 8MB
# Untuk 16GB RAM:
shared_buffers = 4GB
effective_cache_size = 12GB
maintenance_work_mem = 1GB
work_mem = 10MB
# General
checkpoint_completion_target = 0.9
wal_buffers = 16MB
default_statistics_target = 100
random_page_cost = 1.1
effective_io_concurrency = 200
max_worker_processes = 4
max_parallel_workers_per_gather = 2
max_parallel_workers = 4
```
```bash
sudo systemctl restart postgresql
```
## Security Checklist
- [ ] API keys set dengan nilai random yang kuat
- [ ] Database password diganti dari default
- [ ] Firewall enabled (UFW)
- [ ] SSL/TLS enabled (jika punya domain)
- [ ] `/metrics` endpoint restricted
- [ ] PostgreSQL hanya listen di localhost
- [ ] Redis hanya listen di localhost
- [ ] Backup automated (cron job)
- [ ] OS security updates enabled
## Next Steps
1. **Setup monitoring** - Install Prometheus + Grafana (opsional)
2. **Setup alerting** - Email/Slack notification untuk errors
3. **Load testing** - Test dengan volume dokumen production
4. **Backup verification** - Test restore dari backup
5. **Documentation** - Dokumentasi API keys untuk tim
## Support
Untuk pertanyaan atau issues, hubungi tim development.

943
docs/DEPLOYMENT-MANUAL.md Normal file
View File

@@ -0,0 +1,943 @@
# Deployment Manual OCR Sprint Service (Tanpa Docker)
Panduan lengkap deployment OCR Sprint Service langsung di server tanpa menggunakan Docker.
## Prasyarat Server
### Spesifikasi Minimum
- **OS**: Ubuntu 20.04+ / Debian 11+ / RHEL 8+
- **CPU**: 4 cores (8 cores recommended)
- **RAM**: 8 GB minimum (16 GB recommended)
- **Storage**: 50 GB free space
- **User**: Non-root user dengan sudo access
### Port yang Dibutuhkan
- `8000`: API server (internal, akan di-proxy oleh Nginx)
- `80/443`: HTTP/HTTPS (Nginx)
- `5432`: PostgreSQL (localhost only)
- `6379`: Redis (localhost only)
## Langkah 1: Install System Dependencies
### Ubuntu/Debian
```bash
# Update system
sudo apt update && sudo apt upgrade -y
# Install Python 3.11
sudo apt install -y software-properties-common
sudo add-apt-repository ppa:deadsnakes/ppa -y
sudo apt update
sudo apt install -y python3.11 python3.11-venv python3.11-dev python3-pip
# Install system libraries untuk OpenCV dan PaddleOCR
sudo apt install -y \
libgl1 \
libglib2.0-0 \
libsm6 \
libxext6 \
libxrender1 \
libgomp1 \
libmagic1 \
build-essential \
git \
curl \
wget
# Install Redis
sudo apt install -y redis-server
sudo systemctl enable redis-server
sudo systemctl start redis-server
# Install PostgreSQL
sudo apt install -y postgresql postgresql-contrib
sudo systemctl enable postgresql
sudo systemctl start postgresql
```
### RHEL/CentOS/Rocky Linux
```bash
# Update system
sudo dnf update -y
# Install Python 3.11
sudo dnf install -y python3.11 python3.11-devel python3.11-pip
# Install system libraries
sudo dnf install -y \
mesa-libGL \
glib2 \
libSM \
libXext \
libXrender \
file-libs \
gcc \
gcc-c++ \
make \
git
# Install Redis
sudo dnf install -y redis
sudo systemctl enable redis
sudo systemctl start redis
# Install PostgreSQL
sudo dnf install -y postgresql-server postgresql-contrib
sudo postgresql-setup --initdb
sudo systemctl enable postgresql
sudo systemctl start postgresql
```
## Langkah 2: Setup Database PostgreSQL
```bash
# Masuk sebagai postgres user
sudo -u postgres psql
# Jalankan SQL commands berikut:
```
```sql
-- Create user dan database
CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat';
CREATE DATABASE ocr_sprint OWNER ocr;
-- Grant privileges
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
-- Connect ke database
\c ocr_sprint
-- Grant schema privileges (PostgreSQL 15+)
GRANT ALL ON SCHEMA public TO ocr;
-- Exit
\q
```
**Konfigurasi autentikasi PostgreSQL (opsional; koneksi tetap hanya dari localhost — sesuaikan `14` pada path dengan versi PostgreSQL yang terinstall):**
```bash
# Edit postgresql.conf
sudo nano /etc/postgresql/14/main/postgresql.conf
# Uncomment dan ubah:
listen_addresses = 'localhost' # Tetap localhost untuk keamanan
# Edit pg_hba.conf
sudo nano /etc/postgresql/14/main/pg_hba.conf
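# Tambahkan line (`local` = koneksi Unix socket; `host` = koneksi TCP, yang dipakai DATABASE_URL via localhost:5432):
local ocr_sprint ocr scram-sha-256
host ocr_sprint ocr 127.0.0.1/32 scram-sha-256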
# Restart PostgreSQL
sudo systemctl restart postgresql
```
## Langkah 3: Setup Application User
```bash
# Create dedicated user untuk aplikasi
sudo useradd -m -s /bin/bash ocr
sudo usermod -aG sudo ocr # Opsional dan tidak disarankan di production: akun service sebaiknya tanpa akses sudo
# Create application directory
sudo mkdir -p /opt/ocr-sprint-service
sudo chown ocr:ocr /opt/ocr-sprint-service
# Switch ke user ocr
sudo su - ocr
```
## Langkah 4: Install Application
```bash
# Clone repository
cd /opt
git clone https://github.com/Adriankf59/ocr-sprint-service.git
cd ocr-sprint-service
# Create virtual environment
python3.11 -m venv .venv
# Activate virtual environment
source .venv/bin/activate
# Upgrade pip
pip install --upgrade pip setuptools wheel
# Install application dengan OCR dependencies
pip install -e ".[ocr]"
# Verify installation
python -c "import paddleocr; print('PaddleOCR installed successfully')"
```
## Langkah 5: Konfigurasi Application
```bash
# Copy environment template
cp .env.example .env
# Edit konfigurasi
nano .env
```
**Konfigurasi production (`/opt/ocr-sprint-service/.env`):**
```bash
# ==== App ====
APP_ENV=prod
APP_HOST=0.0.0.0
APP_PORT=8000
APP_LOG_LEVEL=INFO
# ==== Storage ====
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
BLOB_MAX_UPLOAD_MB=25
# ==== OCR ====
OCR_LANG=latin
OCR_USE_GPU=false
OCR_MAX_IMAGE_SIDE=2200
# ==== Preprocessing ====
PREPROCESS_TARGET_DPI=300
PREPROCESS_DENOISE=true
PREPROCESS_DESKEW=true
PREPROCESS_DETECT_DOCUMENT=true
PREPROCESS_REMOVE_SHADOW=true
PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
# ==== Table Extraction ====
TABLES_ENABLED=true
# ==== Confidence ====
CONFIDENCE_AUTO_APPROVE=0.95
CONFIDENCE_NEEDS_REVIEW=0.85
# ==== LLM (Phase 5, optional) ====
LLM_ENABLED=false
# ==== Async Pipeline ====
QUEUE_ENABLED=true
REDIS_URL=redis://localhost:6379/0
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
# ==== Database ====
DATABASE_URL=postgresql+psycopg://ocr:ganti-dengan-password-kuat@localhost:5432/ocr_sprint
DATABASE_ECHO=false
# ==== Auth (WAJIB!) ====
API_KEYS=key1-ganti-dengan-random-string,key2-ganti-dengan-random-string
API_KEY_HEADER=X-API-Key
```
**Generate secure API keys:**
```bash
# Generate 2 API keys
openssl rand -hex 32
openssl rand -hex 32
```
**Create storage directories:**
```bash
mkdir -p /opt/ocr-sprint-service/storage/blobs
chmod 755 /opt/ocr-sprint-service/storage
```
## Langkah 6: Run Database Migrations
```bash
# Masih sebagai user ocr, dengan venv activated
cd /opt/ocr-sprint-service
source .venv/bin/activate
# Run migrations
alembic upgrade head
# Verify
alembic current
```
## Langkah 7: Test Manual Run
```bash
# Test API server
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
# Di terminal lain, test health check
curl http://localhost:8000/api/v1/health
# Jika berhasil, stop dengan Ctrl+C
```
## Langkah 8: Setup Systemd Services
### API Service
```bash
# Exit dari user ocr, kembali ke user dengan sudo
exit
# Create systemd service file
sudo nano /etc/systemd/system/ocr-sprint-api.service
```
**Content `/etc/systemd/system/ocr-sprint-api.service`:**
```ini
[Unit]
Description=OCR Sprint API Service
After=network.target postgresql.service redis.service
Wants=postgresql.service redis.service
[Service]
Type=simple
User=ocr
Group=ocr
WorkingDirectory=/opt/ocr-sprint-service
# Environment
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
EnvironmentFile=/opt/ocr-sprint-service/.env
# Start command - 4 workers untuk production
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
ocr_sprint.main:app \
--host 0.0.0.0 \
--port 8000 \
--workers 4 \
--log-level info
# Restart policy
Restart=always
RestartSec=10
StartLimitInterval=0
# Resource limits
LimitNOFILE=65536
MemoryMax=6G
# Security
NoNewPrivileges=true
PrivateTmp=true
[Install]
WantedBy=multi-user.target
```
### Celery Worker Service
```bash
sudo nano /etc/systemd/system/ocr-sprint-worker.service
```
**Content `/etc/systemd/system/ocr-sprint-worker.service`:**
```ini
[Unit]
Description=OCR Sprint Celery Worker
After=network.target postgresql.service redis.service ocr-sprint-api.service
Wants=postgresql.service redis.service
[Service]
Type=simple
User=ocr
Group=ocr
WorkingDirectory=/opt/ocr-sprint-service
# Environment
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
EnvironmentFile=/opt/ocr-sprint-service/.env
# Start command - concurrency 2 untuk 4 core CPU
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
-A ocr_sprint.worker.celery_app \
worker \
--loglevel=info \
--concurrency=2 \
--max-tasks-per-child=100
# Restart policy
Restart=always
RestartSec=10
StartLimitInterval=0
# Resource limits
LimitNOFILE=65536
MemoryMax=4G
# Security
NoNewPrivileges=true
PrivateTmp=true
[Install]
WantedBy=multi-user.target
```
### Enable dan Start Services
```bash
# Reload systemd
sudo systemctl daemon-reload
# Enable services (auto-start on boot)
sudo systemctl enable ocr-sprint-api
sudo systemctl enable ocr-sprint-worker
# Start services
sudo systemctl start ocr-sprint-api
sudo systemctl start ocr-sprint-worker
# Check status
sudo systemctl status ocr-sprint-api
sudo systemctl status ocr-sprint-worker
# View logs
sudo journalctl -u ocr-sprint-api -f
sudo journalctl -u ocr-sprint-worker -f
```
## Langkah 9: Setup Nginx Reverse Proxy
### Install Nginx
```bash
sudo apt install -y nginx certbot python3-certbot-nginx
```
### Konfigurasi Nginx
```bash
sudo nano /etc/nginx/sites-available/ocr-sprint
```
**Content `/etc/nginx/sites-available/ocr-sprint`:**
```nginx
# Upstream untuk load balancing (jika scale horizontal)
upstream ocr_api {
server 127.0.0.1:8000;
keepalive 32;
}
# Rate limiting
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
server {
listen 80;
server_name ocr.yourdomain.com; # Ganti dengan domain Anda
# Max upload size (sesuaikan dengan BLOB_MAX_UPLOAD_MB)
client_max_body_size 30M;
client_body_buffer_size 128k;
# Timeouts untuk dokumen besar
proxy_connect_timeout 300s;
proxy_send_timeout 300s;
proxy_read_timeout 300s;
send_timeout 300s;
# Logging
access_log /var/log/nginx/ocr-sprint-access.log;
error_log /var/log/nginx/ocr-sprint-error.log;
# API endpoints
location /api/ {
# Rate limiting
limit_req zone=api_limit burst=20 nodelay;
proxy_pass http://ocr_api;
proxy_http_version 1.1;
# Headers
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Connection "";
# Disable buffering untuk streaming responses
proxy_buffering off;
}
# Health check endpoint (no rate limit)
location /api/v1/health {
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
access_log off;
}
# Metrics endpoint (restrict access)
location /metrics {
# Allow only from internal network
allow 10.0.0.0/8;
allow 172.16.0.0/12;
allow 192.168.0.0/16;
allow 127.0.0.1;
deny all;
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
# Docs (opsional, bisa di-disable di production)
location /docs {
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
location /redoc {
proxy_pass http://ocr_api;
proxy_http_version 1.1;
proxy_set_header Host $host;
}
}
```
### Enable Site
```bash
# Enable site
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
# Remove default site (opsional)
sudo rm /etc/nginx/sites-enabled/default
# Test konfigurasi (setelah site di-enable, supaya file baru ikut divalidasi)
sudo nginx -t
# Reload Nginx
sudo systemctl reload nginx
```
### Setup SSL dengan Let's Encrypt
```bash
# Install certbot
sudo apt install -y certbot python3-certbot-nginx
# Obtain certificate (ganti dengan domain Anda)
sudo certbot --nginx -d ocr.yourdomain.com
# Test auto-renewal
sudo certbot renew --dry-run
```
Certbot akan otomatis mengupdate konfigurasi Nginx untuk HTTPS.
## Langkah 10: Setup Firewall
```bash
# Install UFW (jika belum ada)
sudo apt install -y ufw
# Allow SSH (PENTING! Jangan sampai terkunci)
sudo ufw allow 22/tcp
# Allow HTTP dan HTTPS
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
# Enable firewall
sudo ufw enable
# Check status
sudo ufw status
```
## Langkah 11: Verifikasi Deployment
### Test dari Server
```bash
# Health check
curl http://localhost:8000/api/v1/health
# Test dengan API key
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
-H "X-API-Key: your-api-key-here" \
-F "file=@/path/to/test.pdf"
```
### Test dari Client
```bash
# Health check via domain
curl https://ocr.yourdomain.com/api/v1/health
# Upload dokumen
curl -X POST https://ocr.yourdomain.com/api/v1/documents \
-H "X-API-Key: your-api-key-here" \
-F "file=@document.pdf"
```
## Monitoring dan Maintenance
### View Logs
```bash
# API logs
sudo journalctl -u ocr-sprint-api -f
# Worker logs
sudo journalctl -u ocr-sprint-worker -f
# Nginx logs
sudo tail -f /var/log/nginx/ocr-sprint-access.log
sudo tail -f /var/log/nginx/ocr-sprint-error.log
# PostgreSQL logs
sudo tail -f /var/log/postgresql/postgresql-14-main.log
```
### Service Management
```bash
# Restart services
sudo systemctl restart ocr-sprint-api
sudo systemctl restart ocr-sprint-worker
# Stop services
sudo systemctl stop ocr-sprint-api
sudo systemctl stop ocr-sprint-worker
# Check status
sudo systemctl status ocr-sprint-api
sudo systemctl status ocr-sprint-worker
```
### Database Backup
```bash
# Create backup script
sudo nano /opt/ocr-sprint-service/backup.sh
```
**Content `backup.sh`:**
```bash
#!/bin/bash
BACKUP_DIR="/opt/ocr-sprint-service/backups"
DATE=$(date +%Y%m%d_%H%M%S)
mkdir -p $BACKUP_DIR
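# Backup database
# Koneksi TCP (-h localhost) umumnya meminta password; agar bisa jalan dari cron, simpan di ~/.pgpass
# milik user yang menjalankan cron (format: localhost:5432:ocr_sprint:ocr:PASSWORD, lalu chmod 600 ~/.pgpass)
pg_dump -U ocr -h localhost ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz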
# Backup blobs (opsional, bisa besar)
# tar -czf $BACKUP_DIR/blobs_$DATE.tar.gz /opt/ocr-sprint-service/storage/blobs
# Keep only last 7 days
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
echo "Backup completed: $DATE"
```
```bash
# Make executable
sudo chmod +x /opt/ocr-sprint-service/backup.sh
# Setup cron job (daily at 2 AM)
sudo crontab -e
# Add line:
0 2 * * * /opt/ocr-sprint-service/backup.sh >> /var/log/ocr-backup.log 2>&1
```
### Log Rotation
```bash
sudo nano /etc/logrotate.d/ocr-sprint
```
**Content:**
```
/var/log/nginx/ocr-sprint-*.log {
daily
rotate 14
compress
delaycompress
notifempty
create 0640 www-data adm
sharedscripts
postrotate
[ -f /var/run/nginx.pid ] && kill -USR1 `cat /var/run/nginx.pid`
endscript
}
```
## Update Application
```bash
# Switch ke user ocr
sudo su - ocr
cd /opt/ocr-sprint-service
# Pull latest code
git pull
# Activate venv
source .venv/bin/activate
# Update dependencies
pip install -e ".[ocr]"
# Run migrations
alembic upgrade head
# Exit user ocr
exit
# Restart services
sudo systemctl restart ocr-sprint-api
sudo systemctl restart ocr-sprint-worker
# Check logs
sudo journalctl -u ocr-sprint-api -n 50
```
## Performance Tuning
### Increase Worker Concurrency
```bash
# Edit worker service
sudo nano /etc/systemd/system/ocr-sprint-worker.service
# Ubah --concurrency sesuai CPU cores
# Untuk 8 cores: --concurrency=4
# Untuk 16 cores: --concurrency=8
# Reload dan restart
sudo systemctl daemon-reload
sudo systemctl restart ocr-sprint-worker
```
### PostgreSQL Tuning
```bash
sudo nano /etc/postgresql/14/main/postgresql.conf
```
**Recommended settings untuk 16GB RAM:**
```
shared_buffers = 4GB
effective_cache_size = 12GB
maintenance_work_mem = 1GB
checkpoint_completion_target = 0.9
wal_buffers = 16MB
default_statistics_target = 100
random_page_cost = 1.1
effective_io_concurrency = 200
work_mem = 10MB
min_wal_size = 1GB
max_wal_size = 4GB
max_worker_processes = 4
max_parallel_workers_per_gather = 2
max_parallel_workers = 4
```
```bash
sudo systemctl restart postgresql
```
### Redis Tuning
```bash
sudo nano /etc/redis/redis.conf
```
**Recommended settings:**
```
maxmemory 2gb
maxmemory-policy allkeys-lru
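# Disable RDB snapshots untuk performance (komentar harus di baris sendiri; redis.conf tidak mendukung komentar inline)
# Catatan: tanpa persistence, job Celery yang masih di antrean hilang jika Redis restart
save ""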
```
```bash
sudo systemctl restart redis
```
## Troubleshooting
### Service tidak start
```bash
# Check logs
sudo journalctl -u ocr-sprint-api -n 100 --no-pager
sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
# Check permissions
ls -la /opt/ocr-sprint-service
ls -la /opt/ocr-sprint-service/storage
# Test manual run
sudo su - ocr
cd /opt/ocr-sprint-service
source .venv/bin/activate
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
```
### Database connection error
```bash
# Test connection
sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
# Check PostgreSQL status
sudo systemctl status postgresql
# Check pg_hba.conf
sudo cat /etc/postgresql/14/main/pg_hba.conf | grep ocr
```
### Redis connection error
```bash
# Test Redis
redis-cli ping
# Check Redis status
sudo systemctl status redis
# Check Redis logs
sudo journalctl -u redis -n 50
```
### PaddleOCR model download gagal
```bash
# Download manual
sudo su - ocr
cd /opt/ocr-sprint-service
source .venv/bin/activate
python << EOF
from paddleocr import PaddleOCR
ocr = PaddleOCR(use_angle_cls=True, lang='latin')
print("Models downloaded successfully")
EOF
```
### Out of memory
```bash
# Check memory usage
free -h
htop
# Reduce worker concurrency
sudo nano /etc/systemd/system/ocr-sprint-worker.service
# Ubah --concurrency=1
# Add swap (jika perlu)
sudo fallocate -l 4G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
```
## Security Checklist
- [ ] API keys diganti dengan nilai random yang kuat
- [ ] Database password diganti dari default
- [ ] Firewall enabled (UFW) - hanya port 22, 80, 443 terbuka
- [ ] SSL/TLS enabled via Let's Encrypt
- [ ] `/metrics` endpoint restricted ke internal network
- [ ] Nginx rate limiting configured
- [ ] PostgreSQL hanya listen di localhost
- [ ] Redis hanya listen di localhost
- [ ] Regular backup configured (cron job)
- [ ] Log rotation configured
- [ ] OS security updates enabled (`unattended-upgrades`)
- [ ] Fail2ban installed untuk SSH protection
## Monitoring dengan Prometheus (Opsional)
### Install Prometheus
```bash
# Download Prometheus
cd /tmp
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
tar xvfz prometheus-*.tar.gz
sudo mv prometheus-2.45.0.linux-amd64 /opt/prometheus
# Create user
sudo useradd --no-create-home --shell /bin/false prometheus
# Create directories
sudo mkdir /etc/prometheus /var/lib/prometheus
sudo chown prometheus:prometheus /var/lib/prometheus
```
### Configure Prometheus
```bash
sudo nano /etc/prometheus/prometheus.yml
```
**Content:**
```yaml
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'ocr-sprint'
static_configs:
- targets: ['localhost:8000']
metrics_path: '/metrics'
```
### Create Systemd Service
```bash
sudo nano /etc/systemd/system/prometheus.service
```
**Content:**
```ini
[Unit]
Description=Prometheus
After=network.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/opt/prometheus/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/var/lib/prometheus/
[Install]
WantedBy=multi-user.target
```
```bash
sudo systemctl daemon-reload
sudo systemctl enable prometheus
sudo systemctl start prometheus
```
Access Prometheus di `http://localhost:9090`
## Support
Untuk pertanyaan atau issues, hubungi tim development.

437
docs/DEPLOYMENT.md Normal file
View File

@@ -0,0 +1,437 @@
# Quickstart Deployment OCR Sprint Service
Panduan deployment OCR Sprint Service ke server production untuk pemrosesan dokumen surat sprint Polri.
## Prasyarat Server
### Spesifikasi Minimum
- **OS**: Linux (Ubuntu 20.04+ / Debian 11+ / RHEL 8+)
- **CPU**: 4 cores (8 cores recommended untuk throughput tinggi)
- **RAM**: 8 GB minimum (16 GB recommended)
- **Storage**: 50 GB free space
- ~3 GB untuk model PaddleOCR
- ~1.5 GB untuk dependencies Python
- Sisanya untuk blob storage dokumen
- **Network**: Port 8000 terbuka untuk API access
### Software Requirements
- Docker 24.0+ dan Docker Compose v2
- Git
- (Opsional) Nginx/Caddy untuk reverse proxy + SSL
## Deployment dengan Docker Compose (Recommended)
### 1. Clone Repository
```bash
# Login ke server sebagai user non-root dengan sudo access
ssh user@your-server.com
# Clone repository
git clone https://github.com/Adriankf59/ocr-sprint-service.git
cd ocr-sprint-service
```
### 2. Konfigurasi Environment
```bash
# Copy template environment
cp .env.example .env
# Edit konfigurasi production
nano .env
```
**Konfigurasi penting untuk production:**
```bash
# ==== App ====
APP_ENV=prod
APP_LOG_LEVEL=INFO
# ==== Storage ====
STORAGE_LOCAL_DIR=/app/storage
BLOB_STORAGE_DIR=/app/storage/blobs
BLOB_MAX_UPLOAD_MB=25
# ==== OCR ====
OCR_LANG=latin
OCR_USE_GPU=false # set true jika server punya GPU NVIDIA
OCR_MAX_IMAGE_SIDE=2200
# ==== Preprocessing ====
PREPROCESS_TARGET_DPI=300
PREPROCESS_DENOISE=true
PREPROCESS_DESKEW=true
PREPROCESS_DETECT_DOCUMENT=true
PREPROCESS_REMOVE_SHADOW=true
# ==== Table Extraction ====
TABLES_ENABLED=true
# ==== Async Pipeline ====
QUEUE_ENABLED=true
REDIS_URL=redis://redis:6379/0
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
# ==== Database ====
DATABASE_URL=postgresql+psycopg://ocr:ocr@postgres:5432/ocr_sprint
DATABASE_ECHO=false
# ==== Auth (WAJIB untuk production!) ====
API_KEYS=your-secret-key-1,your-secret-key-2
API_KEY_HEADER=X-API-Key
```
**Generate API keys yang aman:**
```bash
# Generate random API key
openssl rand -hex 32
```
### 3. Build dan Start Services
```bash
# Build Docker images
docker compose build
# Start semua services (API, Worker, Redis, Postgres)
docker compose up -d
# Cek logs untuk memastikan semua berjalan
docker compose logs -f api worker
```
**Services yang berjalan:**
- `api`: FastAPI server di port 8000
- `worker`: Celery worker untuk async processing
- `redis`: Message broker untuk job queue
- `postgres`: Database untuk job state
### 4. Verifikasi Deployment
```bash
# Health check
curl http://localhost:8000/api/v1/health
# Expected response:
# {"status":"ok","version":"0.1.0"}
# Test OCR endpoint (sync mode untuk testing)
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
-H "X-API-Key: your-secret-key-1" \
-F "file=@samples/pdf/example.pdf" \
| jq
```
### 5. Setup Reverse Proxy (Nginx)
**Install Nginx:**
```bash
sudo apt update
sudo apt install nginx certbot python3-certbot-nginx
```
**Konfigurasi Nginx (`/etc/nginx/sites-available/ocr-sprint`):**
```nginx
upstream ocr_api {
server localhost:8000;
}
server {
listen 80;
server_name ocr.yourdomain.com;
client_max_body_size 30M; # Sesuaikan dengan BLOB_MAX_UPLOAD_MB
location / {
proxy_pass http://ocr_api;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Timeout untuk dokumen besar
proxy_read_timeout 300s;
proxy_connect_timeout 75s;
}
location /metrics {
# Restrict metrics endpoint
allow 10.0.0.0/8; # Internal network only
deny all;
proxy_pass http://ocr_api;
}
}
```
**Enable site dan setup SSL:**
```bash
# Enable site
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
sudo nginx -t
sudo systemctl reload nginx
# Setup SSL dengan Let's Encrypt
sudo certbot --nginx -d ocr.yourdomain.com
```
## Deployment Manual (Tanpa Docker)
### 1. Install System Dependencies
```bash
# Ubuntu/Debian
sudo apt update
sudo apt install -y \
python3.11 python3.11-venv python3-pip \
libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
libgomp1 libmagic1 \
redis-server postgresql-14
# Start services
sudo systemctl enable --now redis-server postgresql
```
### 2. Setup Database
```bash
# Create database dan user
sudo -u postgres psql << 'EOF'
CREATE USER ocr WITH PASSWORD 'your-secure-password';
CREATE DATABASE ocr_sprint OWNER ocr;
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
EOF
```
### 3. Install Application
```bash
# Clone repository
git clone https://github.com/Adriankf59/ocr-sprint-service.git
cd ocr-sprint-service
# Create virtual environment
python3.11 -m venv .venv
source .venv/bin/activate
# Install dependencies
pip install --upgrade pip
pip install -e ".[ocr]"
# Copy dan edit .env
cp .env.example .env
nano .env
```
**Update DATABASE_URL di .env:**
```bash
DATABASE_URL=postgresql+psycopg://ocr:your-secure-password@localhost:5432/ocr_sprint
REDIS_URL=redis://localhost:6379/0
QUEUE_ENABLED=true
```
### 4. Run Database Migrations
```bash
alembic upgrade head
```
### 5. Setup Systemd Services
**API Service (`/etc/systemd/system/ocr-sprint-api.service`):**
```ini
[Unit]
Description=OCR Sprint API
After=network.target postgresql.service redis.service
[Service]
Type=simple
User=ocr
WorkingDirectory=/opt/ocr-sprint-service
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 --workers 4
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
```
**Worker Service (`/etc/systemd/system/ocr-sprint-worker.service`):**
```ini
[Unit]
Description=OCR Sprint Celery Worker
After=network.target postgresql.service redis.service
[Service]
Type=simple
User=ocr
WorkingDirectory=/opt/ocr-sprint-service
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery -A ocr_sprint.worker.celery_app worker -l info --concurrency=2
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
```
**Enable dan start services:**
```bash
sudo systemctl daemon-reload
sudo systemctl enable --now ocr-sprint-api ocr-sprint-worker
sudo systemctl status ocr-sprint-api ocr-sprint-worker
```
## Monitoring dan Maintenance
### Monitoring Logs
```bash
# Docker deployment
docker compose logs -f api worker
# Manual deployment
sudo journalctl -u ocr-sprint-api -f
sudo journalctl -u ocr-sprint-worker -f
```
### Prometheus Metrics
Metrics tersedia di endpoint `/metrics`:
```bash
curl http://localhost:8000/metrics
```
**Key metrics:**
- `ocr_documents_total`: Total dokumen diproses
- `ocr_processing_duration_seconds`: Durasi processing
- `ocr_confidence_score`: Distribusi confidence score
- `celery_task_*`: Celery worker metrics
### Backup Database
```bash
# Docker deployment
docker compose exec postgres pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
# Manual deployment
pg_dump -h localhost -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
```
### Update Service
```bash
# Docker deployment
cd ocr-sprint-service
git pull
docker compose build
docker compose up -d
# Manual deployment
cd ocr-sprint-service
git pull
source .venv/bin/activate
pip install -e ".[ocr]"
alembic upgrade head
sudo systemctl restart ocr-sprint-api ocr-sprint-worker
```
## Troubleshooting
### Service tidak start
```bash
# Cek logs
docker compose logs api worker
# Cek health check
curl http://localhost:8000/api/v1/health
```
### PaddleOCR model download gagal
```bash
# Download manual ke volume
docker compose exec api python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='latin')"
```
### Worker tidak memproses jobs
```bash
# Cek Redis connection
docker compose exec redis redis-cli ping
# Cek Celery worker status
docker compose exec worker celery -A ocr_sprint.worker.celery_app inspect active
```
### Database migration error
```bash
# Cek current revision
docker compose exec api alembic current
# Force upgrade
docker compose exec api alembic upgrade head
```
### Out of memory
```bash
# Kurangi worker concurrency di docker-compose.yml
# Ubah: --concurrency=1 (default) atau tambahkan memory limit
```
## Security Checklist
- [ ] API_KEYS diset dengan nilai random yang kuat
- [ ] Firewall configured (hanya port 22 untuk SSH, 80, dan 443 terbuka)
- [ ] SSL/TLS enabled via Nginx + Let's Encrypt
- [ ] Database password diganti dari default
- [ ] `/metrics` endpoint restricted ke internal network
- [ ] Regular backup database dan blob storage
- [ ] Log rotation configured
- [ ] OS security updates enabled
## Performance Tuning
### Untuk throughput tinggi:
1. **Increase worker concurrency:**
```yaml
# docker-compose.yml
command: ["celery", "-A", "ocr_sprint.worker.celery_app", "worker", "-l", "info", "--concurrency=4"]
```
2. **Scale workers horizontally:**
```bash
docker compose up -d --scale worker=3
```
3. **Enable GPU (jika tersedia):**
```bash
# .env
OCR_USE_GPU=true
```
4. **Tune Postgres:**
```sql
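-- Increase connection pool
-- Catatan: kedua parameter ini baru berlaku setelah PostgreSQL di-restart
-- (mis. `docker compose restart postgres`); reload saja tidak cukup.
ALTER SYSTEM SET max_connections = 200;
ALTER SYSTEM SET shared_buffers = '2GB';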
```
## Support
Untuk pertanyaan atau issues, hubungi tim development atau buat issue di repository.

View File

@@ -86,14 +86,18 @@ def _row_to_response(row: object) -> DocumentResponse:
assert isinstance(row, JobRow)
status_enum = DocumentStatus(row.status)
result_obj: ExtractionResult | None = None
personel_list = None
if row.result is not None:
result_obj = ExtractionResult.model_validate(row.result)
# Auto-number personnel entries sequentially (1, 2, 3, ...)
for idx, entry in enumerate(result_obj.personel, start=1):
entry.no = idx
personel_list = result_obj.personel
return DocumentResponse(
job_id=row.job_id,
status=status_enum,
confidence=row.confidence,
data=result_obj,
data=personel_list,
review_flags=list(row.review_flags or []),
error=row.error,
approved=bool(row.approved),

View File

@@ -33,12 +33,45 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
# Perwira Menengah
"KOMPOL": ("KOMPOL",),
"AKBP": ("AKBP",),
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP", "KOMBES"),
# Perwira Tinggi
"BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
"IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
"KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
"JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
# PNS Polri (Pegawai Negeri Sipil di lingkungan Polri). PNS appear
# routinely on sprint panitia / undangan templates alongside Polri
# personnel, so we treat them as valid ranks for extraction.
# Sources: PP 11/2017 jo PP 17/2020 (Manajemen PNS); golongan I-IV.
# Golongan I (Juru)
"JURU MUDA": ("JURU MUDA",),
"JURU MUDA TK I": ("JURU MUDA TK I", "JURU MUDA TK.I", "JURU MUDA TINGKAT I"),
"JURU": ("JURU",),
"JURU TK I": ("JURU TK I", "JURU TK.I", "JURU TINGKAT I"),
# Golongan II (Pengatur)
"PENGATUR MUDA": ("PENGATUR MUDA",),
"PENGATUR MUDA TK I": (
"PENGATUR MUDA TK I",
"PENGATUR MUDA TK.I",
"PENGATUR MUDA TINGKAT I",
),
"PENGATUR": ("PENGATUR",),
"PENGATUR TK I": ("PENGATUR TK I", "PENGATUR TK.I", "PENGATUR TINGKAT I"),
# Golongan III (Penata)
"PENATA MUDA": ("PENATA MUDA",),
"PENATA MUDA TK I": (
"PENATA MUDA TK I",
"PENATA MUDA TK.I",
"PENATA MUDA TINGKAT I",
),
"PENATA": ("PENATA",),
"PENATA TK I": ("PENATA TK I", "PENATA TK.I", "PENATA TINGKAT I"),
# Golongan IV (Pembina)
"PEMBINA": ("PEMBINA",),
"PEMBINA TK I": ("PEMBINA TK I", "PEMBINA TK.I", "PEMBINA TINGKAT I"),
"PEMBINA UTAMA MUDA": ("PEMBINA UTAMA MUDA",),
"PEMBINA UTAMA MADYA": ("PEMBINA UTAMA MADYA",),
"PEMBINA UTAMA": ("PEMBINA UTAMA",),
}
# Reverse lookup: any variant (uppercased) → canonical form.

View File

@@ -64,6 +64,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
"jabatan dinas": "jabatan_dinas",
"jabatan dalam dinas": "jabatan_dinas",
"jbt dinas": "jabatan_dinas",
"struktural": "jabatan_dinas",
"jabatan struktural": "jabatan_dinas",
# jabatan dalam sprint (role for this dispatch)
"jabatan dalam sprint": "jabatan_sprint",
"jabatan dalam sprin": "jabatan_sprint",
@@ -72,6 +74,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
"jabatan sprin": "jabatan_sprint",
"tugas": "jabatan_sprint",
"penugasan": "jabatan_sprint",
"dalam penugasan": "jabatan_sprint",
"jabatan dalam penugasan": "jabatan_sprint",
# remarks
"keterangan": "keterangan",
"ket": "keterangan",

View File

@@ -38,12 +38,18 @@ _RANK_TOKENS: tuple[str, ...] = tuple(
)
)
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
# A line that contains a rank token followed (anywhere on the same line) by
# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or
# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match
# inside e.g. "ABRIPDA-style" text.
# A rank token followed (within a few characters) by an 8-digit NRP.
# We allow common separators: '/', '-', '.', ',', ':' or whitespace.
# The trailing ``\b`` plus proximity to the 8-digit NRP is the
# specificity signal — we deliberately do *not* require a leading
# ``\b`` because real Polri sprint OCR routinely mashes the rank into
# the trailing characters of the previous cell (observed on Polres
# Banjar: "...CPHR., CBA, CI" runs straight into "AKP" giving
# "CIAKP 84011113"). Requiring a leading boundary loses that row
# entirely. The longest-first alternation order ensures multi-token
# ranks ("KOMBES POL") still win over short overlaps ("KBP").
_RE_RANK_NRP_LINE = re.compile(
rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
rf"(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
re.IGNORECASE,
)
# A bare row number marker like "1." or "12)". OCR often puts it on its own
@@ -143,31 +149,248 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
Strategy:
**Pass 1** — same-line rank+NRP (original strategy):
1. Iterate every line. Skip lines that don't contain both a known rank
and an 8-digit NRP (those are the only signal we trust).
2. For each rank+NRP line, look back for the most recent plausible name
line, and forward 1-3 lines for jabatan content.
3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp.
**Pass 2** — separate-line rank and NRP (for tabular sprint formats):
If pass 1 produces no results, scan for lines containing a standalone
rank token, then look up to 2 lines forward for a standalone NRP.
This handles sprint formats where OCR renders each column on its own
line (e.g. Polres Banjar layout).
**Pass 3** — rank-only (for sprint formats *without* an NRP column):
Some sprint templates (panitia, undangan, etc.) list only nama +
pangkat + jabatan, no NRP. If pass 1 and pass 2 both yield nothing,
fall back to a rank-only scan: every standalone rank line (or
two-line rank like "KOMBES" + "POL" produced by narrow-column OCR)
becomes a row, with name assembled from preceding lines and jabatan
from following lines. ``nrp`` stays ``None``. False-positive risk
is higher (stray rank tokens in body text), so this only fires when
nothing else matched.
The fallback is intentionally rate-limited: the first matching rank
token on a line wins (no greedy multi-match per line), and a name line
can only be consumed once (so a stray ranked text inside a paragraph
doesn't turn into multiple bogus entries).
"""
lines = raw_text.splitlines()
# ── Pass 1: rank+NRP on the same line ────────────────────────────
rows = _extract_same_line(lines)
if rows:
return rows
# ── Pass 2: rank and NRP on separate lines ───────────────────────
rows = _extract_separate_lines(lines)
if rows:
return rows
# ── Pass 3: rank-only (no NRP column) ────────────────────────────
return _extract_rank_only(lines)
# Regex for a line that is *only* a rank token (possibly with punctuation).
# Anchored at both ends: optional surrounding whitespace, one rank
# alternative from _RANK_ALT, then any run of '/', '.', '-', ',' or ':'.
# Leading punctuation is not accepted. Case-insensitive.
_RE_RANK_ONLY = re.compile(
rf"^\s*(?P<rank>{_RANK_ALT})\s*[/.\-,:]*\s*$",
re.IGNORECASE,
)
# Regex for a line that contains a standalone 8-digit NRP.
# The lookbehind/lookahead reject an 8-digit window that is part of a
# longer digit run, so only exactly-8-digit groups are captured. Unanchored,
# so the digits may sit anywhere in the searched string.
_RE_NRP_ONLY = re.compile(r"(?<!\d)(?P<nrp>\d{8})(?!\d)")
# Strip a leading row number marker like "1 ", "1.", "12)" from a name
# prefix taken from the same OCR line as a rank+NRP match. Unlike
# _RE_ROW_NUMBER (which matches a *whole* line), this is a prefix strip
# for embedded same-line cases like "1 CUCU JUHANA, A.K.S. KOMPOL ...".
# At least one whitespace character must follow the marker, and the marker
# is limited to 1-3 digits.
_RE_LEADING_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s+")
def _extract_same_line(lines: list[str]) -> list[PersonnelEntry]:
    """Pass 1: rank+NRP pairs found anywhere in the joined text.

    Uses ``finditer`` over the full ``\\n``-joined OCR text rather than
    ``re.search`` per line so that multiple rank+NRP pairs on the same
    OCR line still produce separate rows. This is required for sprint
    scans where Paddle merges several table rows into one OCR line
    (observed on Polres Banjar where row 2's "...CBA.AKP 77020049 KASAT
    RESKRIM" was being swallowed into row 1's jabatan because per-line
    ``search`` only returns the first match).

    Because the separator class between rank and NRP includes whitespace,
    a match may also *straddle* lines (rank at the end of one line, NRP
    on a following line). Each match is therefore tracked with two line
    indices: the line holding the rank (used for the name prefix and the
    look-back) and the line holding the NRP (used for the jabatan suffix
    and the look-ahead). For matches that sit on a single line the two
    indices coincide.

    Args:
        lines: OCR text already split into lines (no embedded newlines).

    Returns:
        One ``PersonnelEntry`` per rank+NRP match whose rank normalises
        to a valid pangkat, in document order. ``nama`` and
        ``jabatan_dinas`` may be ``None``; ``jabatan_sprint``,
        ``keterangan`` and ``no`` are always ``None``.
    """
    from bisect import bisect_right

    if not lines:
        return []

    full_text = "\n".join(lines)

    # line_starts[k] is the offset of the first character of lines[k]
    # inside full_text.
    line_starts: list[int] = []
    pos = 0
    for line in lines:
        line_starts.append(pos)
        pos += len(line) + 1  # +1 for the joining "\n"

    def offset_to_line(offset: int) -> int:
        # Index of the last line whose start offset is <= ``offset``.
        return max(0, bisect_right(line_starts, offset) - 1)

    matches = list(_RE_RANK_NRP_LINE.finditer(full_text))
    # (line of the rank token, line of the last NRP digit) for every match.
    spans = [
        (offset_to_line(m.start()), offset_to_line(m.end() - 1))
        for m in matches
    ]

    rows: list[PersonnelEntry] = []
    consumed_lines: set[int] = set()

    for i, m in enumerate(matches):
        pangkat = normalize_pangkat(m.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue
        nrp = m.group("nrp")

        has_prev = i > 0
        has_next = i + 1 < len(matches)
        rank_ln, nrp_ln = spans[i]
        # The previous match is bounded by where it *ends*, the next one
        # by where it *starts*.
        prev_end_ln = spans[i - 1][1] if has_prev else -1
        next_start_ln = spans[i + 1][0] if has_next else len(lines)

        # Prefix: text on the rank's line *before* the rank token. If the
        # previous match ended on this same line, only the text after its
        # NRP counts (otherwise we'd reuse the earlier row's tail as this
        # row's name).
        rank_off = line_starts[rank_ln]
        prefix_from = 0
        if has_prev and prev_end_ln == rank_ln:
            prefix_from = max(0, matches[i - 1].end() - rank_off)
        prefix = lines[rank_ln][prefix_from : m.start() - rank_off]

        # Suffix: text on the NRP's line *after* the NRP, capped at the
        # next match's start if that match begins on the same line.
        nrp_off = line_starts[nrp_ln]
        nrp_line_text = lines[nrp_ln]
        suffix_to = len(nrp_line_text)
        if has_next and next_start_ln == nrp_ln:
            suffix_to = matches[i + 1].start() - nrp_off
        suffix = nrp_line_text[m.end() - nrp_off : suffix_to]

        # ── Resolve nama ────────────────────────────────────────────
        nama: str | None = None
        prefix_clean = _RE_LEADING_ROW_NUMBER.sub("", prefix).strip()
        if prefix_clean and _is_plausible_name(prefix_clean):
            nama = prefix_clean
        elif prev_end_ln < rank_ln:
            # Walk upwards from the rank's line, stopping before any line
            # the previous match occupies. A line may serve as a name only
            # once.
            for back in range(rank_ln - 1, prev_end_ln, -1):
                if back in consumed_lines:
                    continue
                candidate = lines[back].strip()
                if _is_plausible_name(candidate):
                    nama = candidate
                    consumed_lines.add(back)
                    break

        # ── Resolve jabatan ─────────────────────────────────────────
        jabatan_parts: list[str] = []
        suffix_clean = suffix.strip()
        if suffix_clean:
            jabatan_parts.append(suffix_clean)
        if next_start_ln > nrp_ln:
            # Up to 3 follow-up lines *after the NRP's line*, never
            # reaching into the line where the next match starts.
            max_fwd = min(nrp_ln + 4, next_start_ln, len(lines))
            for fwd in range(nrp_ln + 1, max_fwd):
                candidate = lines[fwd].strip()
                if not candidate:
                    # A blank line ends the jabatan once we have content;
                    # leading blanks are skipped.
                    if jabatan_parts:
                        break
                    continue
                if _RE_NAME_BLOCKLIST.match(candidate):
                    break
                if _RE_ROW_NUMBER.match(candidate):
                    break
                jabatan_parts.append(candidate)

        # Collapse internal runs of whitespace to single spaces.
        jabatan = (
            " ".join(" ".join(jabatan_parts).split())
            if jabatan_parts
            else None
        )

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=nrp,
                nama=nama,
                jabatan_dinas=jabatan,
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return rows
def _extract_separate_lines(lines: list[str]) -> list[PersonnelEntry]:
"""Pass 2: rank and NRP on separate nearby lines.
Handles tabular sprint formats where OCR outputs each column as its
own line, e.g.:
1
CUCU JUHANA, A.K.S.
KOMPOL
70100418
KABAGOPS
"""
consumed_names: set[int] = set()
consumed_nrps: set[int] = set()
rows: list[PersonnelEntry] = []
for idx, raw_line in enumerate(lines):
line = raw_line.strip()
match = _RE_RANK_NRP_LINE.search(line)
if not match:
rank_match = _RE_RANK_ONLY.match(line)
if not rank_match:
# Also try: line starts with a rank token (may have trailing text)
for tok in _RANK_TOKENS:
if line.upper().startswith(tok) and len(line) - len(tok) < 5:
rank_match = re.match(
rf"^\s*(?P<rank>{re.escape(tok)})\s*[/.\-,:]*",
line,
re.IGNORECASE,
)
if rank_match:
break
if not rank_match:
continue
pangkat = normalize_pangkat(match.group("rank"))
pangkat = normalize_pangkat(rank_match.group("rank"))
if not pangkat or not is_valid_pangkat(pangkat):
continue
nrp = match.group("nrp")
# Look forward up to 2 lines for NRP
nrp: str | None = None
nrp_idx: int | None = None
for fwd in range(idx + 1, min(idx + 3, len(lines))):
if fwd in consumed_nrps:
continue
nrp_match = _RE_NRP_ONLY.search(lines[fwd].strip())
if nrp_match:
nrp = nrp_match.group("nrp")
nrp_idx = fwd
break
if not nrp:
continue
assert nrp_idx is not None
consumed_nrps.add(nrp_idx)
# Look back for name
nama: str | None = None
for back in range(idx - 1, max(idx - 6, -1), -1):
if back in consumed_names:
@@ -178,7 +401,8 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
consumed_names.add(back)
break
jabatan = _following_jabatan(lines, idx)
# Look forward after NRP for jabatan
jabatan = _following_jabatan(lines, nrp_idx)
rows.append(
PersonnelEntry(
no=None,
@@ -193,6 +417,370 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
return rows
# Bare row-number markers used by sprint formats without NRP (the dot
# is often missing in narrow-column OCR, e.g. just "1" on its own line).
# Matches a whole line made of 1-3 digits plus at most one of '.', ')' or
# ':'; longer digit runs (such as an 8-digit NRP) do not match.
_RE_BARE_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s*$")
def _try_match_rank_at(lines: list[str], idx: int) -> tuple[str, int] | None:
    """Try to match a standalone rank starting at ``lines[idx]``.

    Handles narrow-column OCR that splits a multi-token rank across two
    lines (e.g. ``"KOMBES"`` + ``"POL"`` or ``"PENATA"`` + ``"TK I"``).

    The two-line concatenation is tried *first* so that more-specific
    multi-token ranks ("PENATA TK I") win over their less-specific
    single-line prefix ("PENATA"). Without this preference, "TK I"
    would leak into the jabatan column.

    The two-line form is only attempted when *both* lines carry text.
    Otherwise a blank line next to a rank would concatenate to the bare
    rank and be reported as a two-line match, either anchored on the
    blank line or swallowing the blank line after the rank.

    Args:
        lines: OCR text split into lines.
        idx: Index of the candidate line.

    Returns:
        ``(rank_text, lines_consumed)`` with ``lines_consumed`` of 1 or
        2 on success; ``None`` when ``idx`` is out of range or no rank
        starts there.
    """
    if idx < 0 or idx >= len(lines):
        return None
    line = lines[idx].strip()
    if idx + 1 < len(lines):
        following = lines[idx + 1].strip()
        if line and following:
            m2 = _RE_RANK_ONLY.match(f"{line} {following}")
            if m2:
                return m2.group("rank"), 2
    m = _RE_RANK_ONLY.match(line)
    if m:
        return m.group("rank"), 1
    return None
def _extract_rank_only(lines: list[str]) -> list[PersonnelEntry]:
"""Pass 3: rank-only fallback for sprint formats without an NRP column.
Each standalone rank line (single line or two-line concatenation) is
treated as the pivot of a personnel row. ``nama`` is assembled from
the preceding contiguous plausible-name lines (typical OCR splits a
long name across 2-3 short lines because of narrow columns); jabatan
is collected from following lines until the next rank or row marker.
``nrp`` is always ``None`` for rows produced by this pass.
"""
rows: list[PersonnelEntry] = []
# Indices of lines already used as name fragments by an earlier row.
consumed_lines: set[int] = set()
i = 0
while i < len(lines):
# rank_span (1 or 2) is how many lines the rank occupies.
match = _try_match_rank_at(lines, i)
if not match:
i += 1
continue
rank_text, rank_span = match
pangkat = normalize_pangkat(rank_text)
if not pangkat or not is_valid_pangkat(pangkat):
i += 1
continue
# ── Look back for name lines ──
# The window covers at most the 5 lines directly above the rank
# (i-1 down to i-5); blank lines inside the window count against it.
name_lines: list[str] = []
for back in range(i - 1, max(i - 6, -1), -1):
# A line already claimed by a previous row ends the look-back.
if back in consumed_lines:
break
candidate = lines[back].strip()
if not candidate:
# Blank line: stop once fragments were collected, otherwise skip it.
if name_lines:
break
continue
if _RE_BARE_ROW_NUMBER.match(candidate):
break
if _RE_NAME_BLOCKLIST.match(candidate):
break
if _try_match_rank_at(lines, back) is not None:
break
if not _is_plausible_name(candidate):
break
# Walking upwards, so prepend to keep top-to-bottom order.
name_lines.insert(0, candidate)
consumed_lines.add(back)
# Join fragments and collapse repeated whitespace.
nama = " ".join(" ".join(name_lines).split()) if name_lines else None
# ── Look forward for jabatan (stop at next rank / row marker) ─────
jabatan_parts: list[str] = []
fwd = i + rank_span
# ``steps`` caps the forward scan at 8 lines, blank lines included.
steps = 0
while fwd < len(lines) and steps < 8:
candidate = lines[fwd].strip()
if not candidate:
if jabatan_parts:
break
fwd += 1
steps += 1
continue
if _RE_BARE_ROW_NUMBER.match(candidate):
break
if _try_match_rank_at(lines, fwd) is not None:
break
if _RE_NAME_BLOCKLIST.match(candidate):
break
jabatan_parts.append(candidate)
fwd += 1
steps += 1
jabatan = " ".join(" ".join(jabatan_parts).split()) if jabatan_parts else None
rows.append(
PersonnelEntry(
no=None,
pangkat=pangkat,
nrp=None,
nama=nama,
jabatan_dinas=jabatan,
jabatan_sprint=None,
keterangan=None,
)
)
# Resume after the rank itself; the jabatan lines are re-visited by the
# main loop (they are not marked as consumed).
i += rank_span
return rows
# ── Column-aware Pass 3 (uses OCR bounding boxes) ───────────────────────
def _box_x_left(box: tuple[tuple[float, float], ...]) -> float:
return min(p[0] for p in box)
def _box_x_right(box: tuple[tuple[float, float], ...]) -> float:
return max(p[0] for p in box)
def _box_x_center(box: tuple[tuple[float, float], ...]) -> float:
return (_box_x_left(box) + _box_x_right(box)) / 2
def _box_y_top(box: tuple[tuple[float, float], ...]) -> float:
return min(p[1] for p in box)
def _box_y_bottom(box: tuple[tuple[float, float], ...]) -> float:
return max(p[1] for p in box)
def _box_y_center(box: tuple[tuple[float, float], ...]) -> float:
return (_box_y_top(box) + _box_y_bottom(box)) / 2
def _box_height(box: tuple[tuple[float, float], ...]) -> float:
return _box_y_bottom(box) - _box_y_top(box)
def extract_personnel_from_ocr_lines(ocr_lines: list) -> list[PersonnelEntry]:
"""Column-aware Pass 3 for sprint formats without an NRP column.
Each ``ocr_line`` must expose ``text`` (str) and ``box`` (a tuple of
4 ``(x, y)`` corner points). We use the geometry to:
1. Detect rank lines (single-line or vertically-stacked two-line).
2. Estimate the PANGKAT column X-center from those rank lines.
3. For each rank, gather **only** lines in the NAMA column (X left
of PANGKAT) within the row's Y span as the name fragments, and
**only** lines in the JABATAN column (X right of PANGKAT) for
jabatan. This prevents column-bleed that flat-text Pass 3
suffers from on dense tables.
Returns ``[]`` if no rank lines are detected (caller can fall back
to the text-only Pass 3).
"""
if not ocr_lines:
return []
# Sort by (y_top, x_left) for vertical-stacking rank detection.
# ``indexed`` holds positions into ``ocr_lines``; the input list itself is
# not reordered.
indexed = sorted(
range(len(ocr_lines)),
key=lambda i: (_box_y_top(ocr_lines[i].box), _box_x_left(ocr_lines[i].box)),
)
# Pass 1: find rank anchors.
# An anchor is one or two stacked OCR lines whose combined text matches
# _RE_RANK_ONLY and normalises to a known pangkat. Two-line stacks must
# X-overlap so we don't accidentally merge cells from different columns.
used: set[int] = set()
anchors: list[dict] = []
for pos, idx in enumerate(indexed):
if idx in used:
continue
ln = ocr_lines[idx]
text = ln.text.strip()
rank_text: str | None = None
member_idxs: list[int] = [idx]
# Try two-line stack first (so PENATA TK I beats PENATA).
# Only the next 4 entries in sort order are considered as the lower half.
for j_pos in range(pos + 1, min(pos + 5, len(indexed))):
j_idx = indexed[j_pos]
if j_idx in used:
continue
other = ocr_lines[j_idx]
# Positive when the two boxes share some horizontal extent.
x_overlap = (
min(_box_x_right(ln.box), _box_x_right(other.box))
- max(_box_x_left(ln.box), _box_x_left(other.box))
)
if x_overlap <= 0:
continue
# Candidates are sorted by y_top, so once one overlapping line is more
# than 1.5 line-heights below, the search stops.
y_gap = _box_y_top(other.box) - _box_y_bottom(ln.box)
if y_gap > _box_height(ln.box) * 1.5:
break
combined = (text + " " + other.text.strip()).strip()
m2 = _RE_RANK_ONLY.match(combined)
if m2:
rank_text = m2.group("rank")
member_idxs.append(j_idx)
break
if rank_text is None:
m1 = _RE_RANK_ONLY.match(text)
if m1:
rank_text = m1.group("rank")
if rank_text is None:
continue
pangkat = normalize_pangkat(rank_text)
if not pangkat or not is_valid_pangkat(pangkat):
continue
# x_center comes from the first (upper) member line only; the Y extent
# spans all member lines.
anchors.append(
{
"member_idxs": member_idxs,
"pangkat": pangkat,
"x_center": _box_x_center(ln.box),
"y_top": min(_box_y_top(ocr_lines[m].box) for m in member_idxs),
"y_bottom": max(_box_y_bottom(ocr_lines[m].box) for m in member_idxs),
}
)
used.update(member_idxs)
if not anchors:
return []
# Sort anchors by Y so we can compute row spans.
anchors.sort(key=lambda a: a["y_top"])
# Estimate PANGKAT column X-center as the median of rank anchor X-centers.
# (For an even count this picks the upper of the two middle values.)
xs_sorted = sorted(a["x_center"] for a in anchors)
pangkat_x = xs_sorted[len(xs_sorted) // 2]
# X tolerance: half the median rank-line width. Lines with x_center
# within ±tolerance of pangkat_x are *in* the PANGKAT column and
# excluded from both NAMA and JABATAN buckets.
rank_widths = [
_box_x_right(ocr_lines[a["member_idxs"][0]].box)
- _box_x_left(ocr_lines[a["member_idxs"][0]].box)
for a in anchors
]
rank_widths.sort()
# ``anchors`` is non-empty here, so the 50.0 fallback is purely defensive.
median_rank_width = rank_widths[len(rank_widths) // 2] if rank_widths else 50.0
# The margin never drops below 5.0 coordinate units.
column_margin = max(median_rank_width * 0.5, 5.0)
# Try to split the JABATAN side into STRUKTURAL (jabatan_dinas) and
# DALAM SPRIN (jabatan_sprint) by clustering jabatan-side X-centers.
# This is a 2-cluster k-means-style split: collect all X-centers of
# lines to the right of PANGKAT, find the largest X-gap among them,
# and use that gap as the column boundary. KET is typically the
# right-most narrow column we let bleed into jabatan_sprint since
# it's commonly empty.
# NOTE(review): every non-empty line right of PANGKAT contributes here,
# regardless of its Y position — confirm that text outside the table
# cannot skew the detected gap.
jabatan_xs: list[float] = []
for ln in ocr_lines:
x = _box_x_center(ln.box)
if x > pangkat_x + column_margin and ln.text.strip():
jabatan_xs.append(x)
jabatan_split_x: float | None = None
# Fewer than 4 samples: no split is attempted and everything on the right
# goes to jabatan_dinas.
if len(jabatan_xs) >= 4:
jabatan_xs.sort()
max_gap = 0.0
max_gap_x: float | None = None
for k in range(1, len(jabatan_xs)):
gap = jabatan_xs[k] - jabatan_xs[k - 1]
if gap > max_gap:
max_gap = gap
max_gap_x = (jabatan_xs[k] + jabatan_xs[k - 1]) / 2
# Only use the split if the gap is meaningfully larger than a
# within-column gap (heuristic: > 1.5× median rank width).
if max_gap_x is not None and max_gap > median_rank_width * 1.5:
jabatan_split_x = max_gap_x
# Pre-compute each anchor's y_center for midpoint row dividers.
anchor_y_centers = [(a["y_top"] + a["y_bottom"]) / 2 for a in anchors]
rows: list[PersonnelEntry] = []
for i, anchor in enumerate(anchors):
# Row Y span: midpoint between this anchor and its neighbours.
# Using the midpoint (rather than the previous anchor's
# y_bottom) prevents row N's tail content (e.g. last name
# fragment "M.H.") from leaking into row N+1's nama bucket
# when rank lines don't extend to the full visual row height.
# The first row has no upper bound and the last row no lower bound, so
# any line above the first / below the last anchor falls into that row's
# span and is filtered only by the column and text checks below.
y_lo = (
(anchor_y_centers[i - 1] + anchor_y_centers[i]) / 2
if i > 0
else float("-inf")
)
y_hi = (
(anchor_y_centers[i] + anchor_y_centers[i + 1]) / 2
if i + 1 < len(anchors)
else float("inf")
)
# Each bucket stores (y_center, text) so fragments can be ordered
# top-to-bottom before joining.
nama_pieces: list[tuple[float, str]] = []
struktural_pieces: list[tuple[float, str]] = []
sprint_pieces: list[tuple[float, str]] = []
for j, ln in enumerate(ocr_lines):
# Skip the line(s) that form this row's own rank.
if j in anchor["member_idxs"]:
continue
text = ln.text.strip()
if not text:
continue
x = _box_x_center(ln.box)
y = _box_y_center(ln.box)
# Both bounds are inclusive: a line whose y_center sits exactly on a
# divider is eligible for both adjacent rows.
if not (y_lo <= y <= y_hi):
continue
if x < pangkat_x - column_margin:
# NAMA side
if _RE_NAME_BLOCKLIST.match(text):
continue
if _RE_BARE_ROW_NUMBER.match(text):
continue
if not _is_plausible_name(text):
continue
nama_pieces.append((y, text))
elif x > pangkat_x + column_margin:
# JABATAN side — split into STRUKTURAL vs DALAM SPRIN
# using the geometric column boundary detected above.
if _RE_NAME_BLOCKLIST.match(text):
continue
if jabatan_split_x is not None and x > jabatan_split_x:
sprint_pieces.append((y, text))
else:
struktural_pieces.append((y, text))
# else: in PANGKAT column or column margin — skip
# Order fragments by y_center only; list.sort is stable, so fragments with
# equal y keep their ``ocr_lines`` order.
nama_pieces.sort(key=lambda p: p[0])
struktural_pieces.sort(key=lambda p: p[0])
sprint_pieces.sort(key=lambda p: p[0])
# Strip leading row number from the first nama piece (e.g. "1 F. GUNTUR"
# collapses to "F. GUNTUR" if the row marker happens to share a box).
if nama_pieces:
head = _RE_LEADING_ROW_NUMBER.sub("", nama_pieces[0][1]).strip()
nama_pieces[0] = (nama_pieces[0][0], head)
# Joins fragment texts with single spaces, collapses whitespace, and maps
# an empty result to None.
def _join(pieces: list[tuple[float, str]]) -> str | None:
text = " ".join(t for _, t in pieces if t).strip()
text = " ".join(text.split())
return text or None
rows.append(
PersonnelEntry(
no=None,
pangkat=anchor["pangkat"],
nrp=None,
nama=_join(nama_pieces),
jabatan_dinas=_join(struktural_pieces),
jabatan_sprint=_join(sprint_pieces),
keterangan=None,
)
)
return rows
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
"""Heuristic: did PP-Structure produce useless rows?

View File

@@ -36,6 +36,73 @@ class OCRLine:
box: tuple[tuple[float, float], ...] # 4 (x, y) corner points
def _line_y_center(line: OCRLine) -> float:
return sum(p[1] for p in line.box) / len(line.box)
def _line_x_left(line: OCRLine) -> float:
return min(p[0] for p in line.box)
def _line_height(line: OCRLine) -> float:
ys = [p[1] for p in line.box]
return max(ys) - min(ys)
def sort_lines_by_layout(lines: list[OCRLine]) -> list[OCRLine]:
    """Reorder lines into top-to-bottom, left-to-right reading order.

    PaddleOCR's natural output order reflects detection order, not visual
    layout. On dense tables (e.g. Polda Kalbar Akpol-panitia sprint) this
    interleaves rows and columns — Paddle may emit a row's KET column
    before its NAMA column, breaking every downstream extractor that
    assumes top-to-bottom row order.

    We rebuild reading order by:

    1. Sorting by ``y_center``.
    2. Grouping consecutive lines into row-bands: a line joins the current
       band when its ``y_center`` is within half the median line height
       (inclusive, and never less than 1.0) of the band's running mean
       ``y_center``, so visually same-row cells stay together even when
       their boxes don't perfectly align.
    3. Sorting each band left-to-right by ``x_left``.

    Args:
        lines: OCR lines in any order. The input list is not modified.

    Returns:
        A new list holding the same line objects in reading order. Empty
        input yields ``[]``. When no line has a positive height the lines
        are returned as a shallow copy in their original order.
    """
    if not lines:
        return []

    # Measure every line exactly once; only positive heights take part in
    # the median.
    positive_heights = sorted(
        height for height in map(_line_height, lines) if height > 0
    )
    if not positive_heights:
        return list(lines)
    median_height = positive_heights[len(positive_heights) // 2]
    band_threshold = max(1.0, median_height * 0.5)

    # Pair each line with its y-center up front so the banding loop below
    # never has to recompute box geometry. ``sorted`` is stable, so lines
    # with equal y-centers keep their input order.
    by_y = sorted(
        ((_line_y_center(ln), ln) for ln in lines),
        key=lambda pair: pair[0],
    )

    bands: list[list[OCRLine]] = []
    current_band: list[OCRLine] = []
    current_band_ys: list[float] = []
    current_y: float | None = None
    for y, ln in by_y:
        if current_y is None or abs(y - current_y) <= band_threshold:
            current_band.append(ln)
            current_band_ys.append(y)
            # Track the band's running y-center as the mean of its
            # members so a slowly-drifting set of cells doesn't split
            # mid-row.
            current_y = sum(current_band_ys) / len(current_band_ys)
        else:
            bands.append(current_band)
            current_band = [ln]
            current_band_ys = [y]
            current_y = y
    if current_band:
        bands.append(current_band)

    ordered: list[OCRLine] = []
    for band in bands:
        ordered.extend(sorted(band, key=_line_x_left))
    return ordered
@dataclass(frozen=True)
class OCRPage:
"""OCR output for a single page."""
@@ -44,8 +111,8 @@ class OCRPage:
@property
def text(self) -> str:
"""Reconstruct page text by concatenating lines (order = paddle's output order)."""
return "\n".join(line.text for line in self.lines)
"""Reconstruct page text in visual reading order (top-to-bottom, left-to-right)."""
return "\n".join(line.text for line in sort_lines_by_layout(self.lines))
@property
def mean_confidence(self) -> float:

View File

@@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
from ocr_sprint.pipeline.extract.personnel import extract_personnel
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_ocr_lines,
extract_personnel_from_text,
is_low_quality,
)
@@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput:
# through the preferred path.
if is_low_quality(personel):
fallback_rows = extract_personnel_from_text(full_text)
# If text-based fallback produced rows but they all lack NRP
# (Pass 3 territory), retry with the column-aware extractor that
# uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
# Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
# nama/jabatan because lines are interleaved within each Y-band;
# the columnar variant restricts each field to its visual column.
text_only_no_nrp = bool(fallback_rows) and all(
r.nrp is None for r in fallback_rows
)
if (not fallback_rows) or text_only_no_nrp:
ocr_lines = [ln for page in ocr_pages for ln in page.lines]
columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
if columnar_rows and (
not fallback_rows or len(columnar_rows) >= len(fallback_rows)
):
fallback_rows = columnar_rows
if fallback_rows:
personel = fallback_rows
# Pass 3 / columnar emit rows with nrp=None for sprint
# templates without an NRP column. Surface that with a
# distinct flag so operators know to expect missing NRPs by
# design rather than by OCR failure.
no_nrp = all(r.nrp is None for r in fallback_rows)
if no_nrp:
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
else:
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
_logger.info(
"pipeline.personnel_text_fallback",
fallback_rows=len(fallback_rows),
no_nrp=no_nrp,
)
untuk_items = find_untuk_list(full_text)

View File

@@ -71,11 +71,16 @@ def _build_pp_structure() -> PPStructure:
from paddleocr import PPStructure
s = get_settings()
_logger.info("pp_structure.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu)
# PPStructure layout models only support 'en' and 'ch', not 'latin'.
# Use 'en' for layout/table detection — it's language-agnostic (detects
# table structure, not text language). OCR within cells still works for
# Indonesian text because the recognition model handles Latin scripts.
pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang
_logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu)
# layout=True so that PP-Structure also returns figure/text regions; we
# filter to tables only afterwards. show_log=False to keep stdout clean.
return PPStructure(
lang=s.ocr_lang,
lang=pp_lang,
use_gpu=s.ocr_use_gpu,
layout=True,
show_log=False,

View File

@@ -10,6 +10,7 @@ from uuid import UUID, uuid4
from pydantic import BaseModel, ConfigDict, Field
from ocr_sprint.schemas.extraction import ExtractionResult
from ocr_sprint.schemas.personnel import PersonnelEntry
class SourceKind(str, Enum):
@@ -52,7 +53,7 @@ class DocumentResponse(BaseModel):
job_id: UUID
status: DocumentStatus
confidence: float | None = None
data: ExtractionResult | None = None
data: list[PersonnelEntry] | None = None
review_flags: list[str] = Field(default_factory=list)
error: str | None = None
# Phase 6 — HITL review state.

View File

@@ -22,6 +22,7 @@ class ReviewFlag(str, Enum):
LLM_FALLBACK = "llm_fallback"
LLM_UNAVAILABLE = "llm_unavailable"
PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp"
INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"

View File

@@ -0,0 +1,75 @@
"""Tests for OCR layout reordering.
PaddleOCR emits text boxes in detection order, not visual reading order.
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
interleaves columns within a row and breaks every downstream extractor
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
reading order from the bounding-box geometry.
"""
from __future__ import annotations
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
def _box(x: float, y: float, w: float = 30, h: float = 15):
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
def _make(text: str, x: float, y: float) -> OCRLine:
    """Create an ``OCRLine`` with confidence 1.0 and a default-sized box at ``(x, y)``."""
    geometry = _box(x, y)
    return OCRLine(box=geometry, confidence=1.0, text=text)
class TestSortLinesByLayout:
    """Checks that ``sort_lines_by_layout`` rebuilds reading order from box geometry."""

    def test_empty_returns_empty(self) -> None:
        assert sort_lines_by_layout([]) == []

    def test_already_sorted_is_stable(self) -> None:
        # Input that is already in reading order must come back unchanged.
        in_reading_order = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
        texts = [item.text for item in sort_lines_by_layout(in_reading_order)]
        assert texts == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        # A 2-row, 3-col table whose cells arrive column-by-column
        # (as Paddle may emit them) instead of row-by-row.
        detection_order = [
            ("B1", 50, 10),
            ("B2", 50, 30),
            ("A1", 10, 10),
            ("A2", 10, 30),
            ("C1", 90, 10),
            ("C2", 90, 30),
        ]
        cells = [_make(label, x, y) for label, x, y in detection_order]
        texts = [item.text for item in sort_lines_by_layout(cells)]
        assert texts == ["A1", "B1", "C1", "A2", "B2", "C2"]

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        # Boxes belonging to one visual row are rarely perfectly
        # y-aligned in real OCR output; they must still be grouped.
        left = _make("LEFT", 10, 10)
        mid = _make("MID", 50, 12)  # 2px below LEFT — same row visually
        right = _make("RIGHT", 90, 11)
        texts = [item.text for item in sort_lines_by_layout([left, mid, right])]
        assert texts == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        # A y gap larger than ~½ line-height must start a new band
        # rather than collapsing both rows into one.
        first_row = [_make("ROW1A", 10, 10), _make("ROW1B", 50, 10)]
        # gap of 20 vs height 15 → new band
        second_row = [_make("ROW2A", 10, 30), _make("ROW2B", 50, 30)]
        texts = [item.text for item in sort_lines_by_layout(first_row + second_row)]
        assert texts == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        # ``OCRPage.text`` must expose the layout-sorted order, not the
        # order in which the lines were supplied.
        shuffled = [
            _make("RIGHT", 90, 10),
            _make("LEFT", 10, 10),
            _make("BOTTOM", 10, 30),
        ]
        page = OCRPage(lines=shuffled)
        assert page.text == "LEFT\nRIGHT\nBOTTOM"

View File

@@ -8,11 +8,18 @@ recover at least the rank + NRP for every row.
from __future__ import annotations
from ocr_sprint.pipeline.extract.personnel_text import (
extract_personnel_from_ocr_lines,
extract_personnel_from_text,
is_low_quality,
)
from ocr_sprint.pipeline.ocr import OCRLine
from ocr_sprint.schemas.personnel import PersonnelEntry
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
    """Create an ``OCRLine`` with confidence 1.0 whose box starts at ``(x, y)`` and spans ``w`` by ``h``."""
    x_far, y_far = x + w, y + h
    corners = ((x, y), (x_far, y), (x_far, y_far), (x, y_far))
    return OCRLine(box=corners, confidence=1.0, text=text)
_CIMAHI_FIXTURE = """\
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
NO
@@ -115,6 +122,86 @@ class TestExtractPersonnelFromText:
names = [r.nama for r in rows]
assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]
def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
    # Polres Banjar regression: PaddleOCR sometimes merges several
    # table rows onto a single OCR line. Every rank+NRP pair on that
    # line must still become its own row. Previously per-line
    # ``re.search`` returned only the first match.
    heading = "DAFTAR NAMA INSTRUKTUR\n"
    merged_line = "".join(
        [
            "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS ",
            "INSTRUKTUR LAT PRA OPS ",
            "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 ",
            "KASAT RESKRIM SDA ",
            "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 ",
            "KASATINTELKAM POLRES BANJAR SDA\n",
        ]
    )
    entries = extract_personnel_from_text(heading + merged_line)
    assert len(entries) == 3
    assert [entry.pangkat for entry in entries] == ["KOMPOL", "AKP", "AKP"]
    assert [entry.nrp for entry in entries] == ["70100418", "77020049", "84011113"]
    first, second, third = entries
    assert first.nama == "CUCU JUHANA, A.K.S."
    assert second.nama is not None and "HERU SAMSUL BAHRI" in second.nama
    assert third.nama is not None and "YAYAN SOPIANA" in third.nama
def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
    # Counterpart of the collapsed-line case: here OCR keeps the table
    # rows on separate lines, so no line carries more than one
    # rank+NRP pair. Guards against the finditer-based path
    # regressing this ordinary layout.
    ocr_rows = [
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS",
        "INSTRUKTUR LAT PRA OPS",
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM",
        "SDA",
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM",
        "POLRES BANJAR SDA",
    ]
    entries = extract_personnel_from_text("\n".join(ocr_rows) + "\n")
    ranks = [entry.pangkat for entry in entries]
    nrps = [entry.nrp for entry in entries]
    assert ranks == ["KOMPOL", "AKP", "AKP"]
    assert nrps == ["70100418", "77020049", "84011113"]
    assert entries[0].nama == "CUCU JUHANA, A.K.S."
def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
    # Polda Kalbar Akpol-panitia regression: sprint formats without
    # an NRP column (panitia, undangan templates) must still yield
    # rows through the rank-only Pass 3 path. The names span several
    # OCR lines (narrow column) and the multi-token rank
    # "KOMBES POL" is split across two lines.
    header = (
        "DAFTAR NAMA PANITIA\n"
        "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
    )
    row_one = (
        "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
        "KOMBES\nPOL\n"
        "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
    )
    row_two = (
        "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
        "AKBP\n"
        "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
    )
    row_three = (
        "3\nPRAYITNO, S.H.,\nM.H.\n"
        "KOMPOL\n"
        "KASUBBAG DIAPERS\nANGGOTA\n"
    )
    entries = extract_personnel_from_text(header + row_one + row_two + row_three)
    assert len(entries) == 3
    assert [entry.pangkat for entry in entries] == ["KOMBES POL", "AKBP", "KOMPOL"]
    # Pass 3 rows carry nrp=None by design.
    assert all(entry.nrp is None for entry in entries)
    first, second, third = entries
    assert first.nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
    assert second.nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
    assert third.nama == "PRAYITNO, S.H., M.H."
    assert first.jabatan_dinas is not None and "KARO SDM" in first.jabatan_dinas
def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
    # When the sprint carries NRPs (Pass 1 succeeds), Pass 3 must stay
    # out of the way and not add duplicate/contaminating rows.
    first_person = "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
    second_person = "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
    entries = extract_personnel_from_text(first_person + second_person)
    assert len(entries) == 2
    assert not any(entry.nrp is None for entry in entries)
def test_still_blocks_bare_column_header_tokens(self) -> None:
# Word-boundary fix must still reject the actual column-header
# rows that motivated the blocklist in the first place.
@@ -124,6 +211,94 @@ class TestExtractPersonnelFromText:
assert rows[0].nama == "REAL NAME"
class TestExtractPersonnelFromOcrLines:
    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.

    Verifies that bounding-box geometry preserves column boundaries on
    dense tables where text-only Pass 3 bleeds adjacent columns into
    nama/jabatan.
    """

    def _kalbar_lines(self) -> list[OCRLine]:
        # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN.
        # Each row may contain multi-line cells (stacked boxes, 20px apart).
        no_x, nama_x, pangkat_x, struktural_x, sprin_x = 10, 100, 250, 380, 520
        cells = [
            # Row 1 — KOMBES POL spans two stacked OCR boxes
            ("1", no_x, 100),
            ("F. GUNTUR", nama_x, 100),
            ("SUNOTO, S.I.K.,", nama_x, 120),
            ("M.H.", nama_x, 140),
            ("KOMBES", pangkat_x, 100),
            ("POL", pangkat_x, 120),
            ("KARO SDM", struktural_x, 100),
            ("POLDA KALBAR", struktural_x, 120),
            ("KETUA", sprin_x, 100),
            ("PELAKSANA", sprin_x, 120),
            # Row 2
            ("2", no_x, 200),
            ("JUDA TRISNO", nama_x, 200),
            ("TAMPUBOLON,", nama_x, 220),
            ("S.H., S.I.K., M.H.", nama_x, 240),
            ("AKBP", pangkat_x, 200),
            ("KABAGDALPERS", struktural_x, 200),
            ("RO SDM", struktural_x, 220),
            ("POLDA KALBAR", struktural_x, 240),
            ("SEKRETARIS", sprin_x, 200),
            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
            ("9", no_x, 500),
            ("FITRIANSYAH,", nama_x, 500),
            ("S.E.", nama_x, 520),
            ("PENATA", pangkat_x, 500),
            ("TK I", pangkat_x, 520),
            ("KAURKEU", struktural_x, 500),
            ("RO SDM", struktural_x, 520),
            ("POLDA KALBAR", struktural_x, 540),
            ("BENDAHARA", sprin_x, 500),
        ]
        return [_ocr_line(text, x, y) for text, x, y in cells]

    def test_extracts_three_rows(self) -> None:
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert len(entries) == 3
        ranks = [entry.pangkat for entry in entries]
        assert ranks == ["KOMBES POL", "AKBP", "PENATA TK I"]

    def test_nama_is_assembled_only_from_nama_column(self) -> None:
        # Every row's nama has to contain *all* of its multi-line
        # fragments and *nothing else* — no bleed from struktural.
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        first, second, third = entries[0], entries[1], entries[2]
        assert first.nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert second.nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert third.nama == "FITRIANSYAH, S.E."

    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
        # The geometric column boundary has to separate STRUKTURAL
        # (jabatan_dinas) from DALAM SPRIN (jabatan_sprint).
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        first, second = entries[0], entries[1]
        assert first.jabatan_dinas == "KARO SDM POLDA KALBAR"
        assert first.jabatan_sprint == "KETUA PELAKSANA"
        assert second.jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
        assert second.jabatan_sprint == "SEKRETARIS"

    def test_returns_empty_when_no_rank_anchors(self) -> None:
        # Lines without any rank text provide no row anchors at all.
        headers_only = [
            _ocr_line("DAFTAR NAMA", 100, 50),
            _ocr_line("HEADER", 100, 100),
        ]
        assert extract_personnel_from_ocr_lines(headers_only) == []

    def test_returns_empty_for_empty_input(self) -> None:
        assert extract_personnel_from_ocr_lines([]) == []

    def test_no_row_bleed_between_consecutive_rows(self) -> None:
        # Row 1's name continues below its rank boxes (the "M.H."
        # fragment at y=140) yet still belongs to row 1's visual span.
        # Nothing from row 1 may leak into row 2's nama, which should
        # start with "JUDA TRISNO".
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        second_nama = entries[1].nama
        assert second_nama is not None
        assert second_nama.startswith("JUDA TRISNO")
        assert "GUNTUR" not in second_nama
        assert "SUNOTO" not in second_nama
class TestIsLowQuality:
def test_empty_list_is_low_quality(self) -> None:
assert is_low_quality([]) is True

60
update.ps1 Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/bin/env pwsh
# update.ps1 - One-command update & restart for ocr-sprint-service (local dev)
#
# Runs five steps in order: git pull, pip install, alembic upgrade, free the
# dev port, start uvicorn with autoreload. The script exits with status 1 as
# soon as a step fails, so later steps never run against half-updated code
# or dependencies.

$Port = 8000

# ── [1/5] Git pull ──────────────────────────────────────────────────────────
Write-Host "`n[1/5] Pulling latest code..." -ForegroundColor Cyan
git pull
# Native commands do not throw on failure; their status is only visible
# through $LASTEXITCODE, so it has to be checked explicitly.
if ($LASTEXITCODE -ne 0) {
    Write-Host " git pull failed (exit code $LASTEXITCODE). Resolve the problem and re-run." -ForegroundColor Red
    exit 1
}

# ── [2/5] Install/update dependencies ───────────────────────────────────────
Write-Host "`n[2/5] Installing/updating dependencies..." -ForegroundColor Cyan
pip install -e ".[dev]" -q
if ($LASTEXITCODE -ne 0) {
    Write-Host " Dependency installation failed (exit code $LASTEXITCODE)." -ForegroundColor Red
    exit 1
}

# ── [3/5] Database migration ─────────────────────────────────────────────────
Write-Host "`n[3/5] Running database migrations..." -ForegroundColor Cyan
alembic upgrade head
if ($LASTEXITCODE -ne 0) {
    # NOTE(review): `alembic stamp head` records the newest revision as applied
    # WITHOUT running any migration, so the retry below presumably has nothing
    # left to apply and pending schema changes are skipped. Acceptable only if
    # the local schema is already known to match head — confirm this is the
    # intended recovery path.
    Write-Host " Migration conflict detected, stamping current state as head..." -ForegroundColor Yellow
    alembic stamp head
    Write-Host " Retrying upgrade for any remaining new migrations..." -ForegroundColor Yellow
    alembic upgrade head
    if ($LASTEXITCODE -ne 0) {
        Write-Host " Migration still failed. Please check alembic manually." -ForegroundColor Red
        exit 1
    }
}
Write-Host " Migrations OK." -ForegroundColor Green

# ── [4/5] Free up port ───────────────────────────────────────────────────────
Write-Host "`n[4/5] Checking port $Port..." -ForegroundColor Cyan
# Use Get-NetTCPConnection for reliable port detection on Windows
$connections = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
if ($connections) {
    # Several listener rows can share one owning process; collapse them to
    # unique PIDs so each process is reported and stopped once.
    $procIds = $connections | Select-Object -ExpandProperty OwningProcess -Unique
    foreach ($procId in $procIds) {
        $procName = (Get-Process -Id $procId -ErrorAction SilentlyContinue).Name
        Write-Host " Port $Port used by '$procName' (PID $procId), killing..." -ForegroundColor Yellow
        Stop-Process -Id $procId -Force -ErrorAction SilentlyContinue
    }
    # Wait until port is actually released (max 5 seconds)
    $waited = 0
    do {
        Start-Sleep -Milliseconds 500
        $waited += 500
        $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
    } while ($still -and $waited -lt 5000)
    if ($still) {
        Write-Host " Port $Port still in use after waiting. Try a different port or restart manually." -ForegroundColor Red
        exit 1
    }
    Write-Host " Port $Port freed." -ForegroundColor Green
} else {
    Write-Host " Port $Port is free." -ForegroundColor Green
}

# ── [5/5] Start dev server ───────────────────────────────────────────────────
Write-Host "`n[5/5] Starting dev server on port $Port (Ctrl+C to stop)..." -ForegroundColor Cyan
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port $Port