diff --git a/# leave empty to use PaddleOCR defaults/inference.pdiparams b/# leave empty to use PaddleOCR defaults/inference.pdiparams new file mode 100644 index 0000000..2efedca Binary files /dev/null and b/# leave empty to use PaddleOCR defaults/inference.pdiparams differ diff --git a/# leave empty to use PaddleOCR defaults/inference.pdiparams.info b/# leave empty to use PaddleOCR defaults/inference.pdiparams.info new file mode 100644 index 0000000..622d87b Binary files /dev/null and b/# leave empty to use PaddleOCR defaults/inference.pdiparams.info differ diff --git a/# leave empty to use PaddleOCR defaults/inference.pdmodel b/# leave empty to use PaddleOCR defaults/inference.pdmodel new file mode 100644 index 0000000..0a6bf1e Binary files /dev/null and b/# leave empty to use PaddleOCR defaults/inference.pdmodel differ diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..a920e26 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,18 @@ +{ + "permissions": { + "allow": [ + "Bash(python -m pytest tests/unit/test_personnel_text_fallback.py -x -q)", + "Bash(python -c \"import sys; print\\(sys.executable\\)\")", + "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py -x -q)", + "Bash(.venv/Scripts/python.exe -m pytest tests/unit -x -q)", + "Bash(git stash *)", + "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_api.py::test_documents_sync_returns_pipeline_output -x -q)", + "Bash(.venv/Scripts/python.exe -m pytest tests/unit --ignore=tests/unit/test_api.py -q)", + "Bash(.venv/Scripts/python.exe -c ' *)", + "Bash(xargs grep *)", + "Bash(.venv/Scripts/python.exe -m pytest tests/unit -q --ignore=tests/unit/test_api.py --ignore=tests/unit/test_api_hitl.py --ignore=tests/unit/test_blob_storage.py)", + "Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_ocr_layout.py tests/unit/test_personnel_text_fallback.py -q)", + "Bash(.venv/Scripts/python.exe -m pytest 
tests/unit/test_personnel_text_fallback.py tests/unit/test_ocr_layout.py -q)" + ] + } +} diff --git a/Makefile b/Makefile index 65af363..12cb949 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,10 @@ -.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean +.PHONY: help install dev update fmt lint typecheck test test-cov run docker-build docker-up docker-down clean help: @echo "Targets:" @echo " install - install runtime + dev deps in current env" @echo " dev - run FastAPI app with autoreload" + @echo " update - git pull + install deps + migrate db + run dev server" @echo " fmt - format code with ruff" @echo " lint - lint with ruff" @echo " typecheck - run mypy" @@ -21,6 +22,16 @@ install: dev: uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000 +update: + @echo "[1/4] Pulling latest code..." + git pull + @echo "[2/4] Installing/updating dependencies..." + pip install -e ".[dev]" + @echo "[3/4] Running database migrations..." + alembic upgrade head + @echo "[4/4] Starting dev server..." + uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000 + fmt: ruff format src tests ruff check --fix src tests diff --git a/docs/DEPLOYMENT-EXISTING-STACK.md b/docs/DEPLOYMENT-EXISTING-STACK.md new file mode 100644 index 0000000..6bcb9ec --- /dev/null +++ b/docs/DEPLOYMENT-EXISTING-STACK.md @@ -0,0 +1,858 @@ +# Deployment OCR Sprint Service (Existing Stack) + +Panduan deployment untuk server dengan Python 3.12.3, PostgreSQL 16.13, dan Redis 7.0.15 yang sudah terinstall. + +## Informasi Server Anda + +- **OS**: Ubuntu 24.04 +- **Python**: 3.12.3 ✅ +- **PostgreSQL**: 16.13 ✅ +- **Redis**: 7.0.15 ✅ + +Semua versi sudah kompatibel dan optimal untuk OCR Sprint Service! 
+ +## Langkah 1: Install System Libraries untuk OpenCV & PaddleOCR + +```bash +# Update package list +sudo apt update + +# Install libraries yang dibutuhkan oleh OpenCV dan PaddleOCR +sudo apt install -y \ + libgl1 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + libgomp1 \ + libmagic1 \ + python3.12-venv \ + python3.12-dev \ + build-essential \ + git +``` + +## Langkah 2: Setup PostgreSQL Database + +```bash +# Login ke PostgreSQL +sudo -u postgres psql +``` + +Jalankan SQL commands berikut: + +```sql +-- Create user dan database +CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat'; +CREATE DATABASE ocr_sprint OWNER ocr; + +-- Grant privileges +GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr; + +-- Connect ke database untuk grant schema privileges +\c ocr_sprint + +-- Grant schema privileges (PostgreSQL 15+) +GRANT ALL ON SCHEMA public TO ocr; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ocr; +GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ocr; + +-- Verify +\l ocr_sprint +\du ocr + +-- Exit +\q +``` + +**Generate password yang aman:** + +```bash +# Generate random password +openssl rand -base64 32 +# Output: string acak 44 karakter; jangan gunakan ulang contoh dari dokumentasi +``` + +Gunakan password ini sebagai pengganti `ganti-dengan-password-kuat` pada perintah `CREATE USER` di atas, lalu simpan karena akan digunakan di konfigurasi nanti. Jika password mengandung karakter khusus seperti `/`, URL-encode karakter tersebut saat menuliskannya di `DATABASE_URL`. 
+ +## Langkah 3: Verify Redis + +```bash +# Check Redis status +sudo systemctl status redis-server + +# Test connection +redis-cli ping +# Expected output: PONG + +# Check Redis config (opsional) +redis-cli CONFIG GET maxmemory +``` + +Jika Redis belum running: + +```bash +sudo systemctl enable redis-server +sudo systemctl start redis-server +``` + +## Langkah 4: Create Application User + +```bash +# Create dedicated user untuk aplikasi +sudo useradd -m -s /bin/bash ocr + +# Create application directory +sudo mkdir -p /opt/ocr-sprint-service +sudo chown ocr:ocr /opt/ocr-sprint-service +``` + +## Langkah 5: Clone dan Install Application + +```bash +# Switch ke user ocr +sudo su - ocr + +# Clone repository +cd /opt +git clone https://github.com/Adriankf59/ocr-sprint-service.git +cd ocr-sprint-service + +# Create virtual environment dengan Python 3.12 +python3.12 -m venv .venv + +# Activate virtual environment +source .venv/bin/activate + +# Verify Python version di venv +python --version +# Expected: Python 3.12.3 + +# Upgrade pip +pip install --upgrade pip setuptools wheel + +# Install application dengan OCR dependencies +# Ini akan download ~1.5GB PaddlePaddle wheels +pip install -e ".[ocr]" + +# Verify installation +python -c "import paddleocr; print('PaddleOCR OK')" +python -c "import cv2; print('OpenCV OK')" +python -c "import fastapi; print('FastAPI OK')" +``` + +## Langkah 6: Konfigurasi Application + +```bash +# Masih sebagai user ocr +cd /opt/ocr-sprint-service + +# Copy environment template +cp .env.example .env + +# Edit konfigurasi +nano .env +``` + +**Konfigurasi `/opt/ocr-sprint-service/.env`:** + +```bash +# ==== App ==== +APP_ENV=prod +APP_HOST=0.0.0.0 +APP_PORT=8000 +APP_LOG_LEVEL=INFO + +# ==== Storage ==== +STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage +BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs +BLOB_MAX_UPLOAD_MB=25 + +# ==== OCR ==== +OCR_LANG=latin +OCR_USE_GPU=false +OCR_MAX_IMAGE_SIDE=2200 + +# ==== Preprocessing ==== 
+PREPROCESS_TARGET_DPI=300 +PREPROCESS_DENOISE=true +PREPROCESS_DESKEW=true +PREPROCESS_DETECT_DOCUMENT=true +PREPROCESS_REMOVE_SHADOW=true +PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20 + +# ==== Table Extraction ==== +TABLES_ENABLED=true + +# ==== Confidence ==== +CONFIDENCE_AUTO_APPROVE=0.95 +CONFIDENCE_NEEDS_REVIEW=0.85 + +# ==== LLM (Phase 5, optional - disable untuk sekarang) ==== +LLM_ENABLED=false + +# ==== Async Pipeline ==== +QUEUE_ENABLED=true +REDIS_URL=redis://localhost:6379/0 +CELERY_TASK_DEFAULT_QUEUE=ocr_sprint + +# ==== Database ==== +# Ganti 'your-password-here' dengan password yang Anda generate di Langkah 2 +DATABASE_URL=postgresql+psycopg://ocr:your-password-here@localhost:5432/ocr_sprint +DATABASE_ECHO=false + +# ==== Auth (WAJIB untuk production!) ==== +# Generate dengan: openssl rand -hex 32 +API_KEYS=paste-api-key-1-here,paste-api-key-2-here +API_KEY_HEADER=X-API-Key +``` + +**Generate API keys:** + +```bash +# Generate 2 API keys +echo "API Key 1: $(openssl rand -hex 32)" +echo "API Key 2: $(openssl rand -hex 32)" +``` + +Copy output dan paste ke `API_KEYS` di file `.env`. 
+ +**Create storage directories:** + +```bash +mkdir -p /opt/ocr-sprint-service/storage/blobs +chmod 755 /opt/ocr-sprint-service/storage +``` + +## Langkah 7: Run Database Migrations + +```bash +# Masih sebagai user ocr, dengan venv activated +cd /opt/ocr-sprint-service +source .venv/bin/activate + +# Run migrations +alembic upgrade head + +# Verify - should show current revision +alembic current + +# Expected output: (head) atau revision number +``` + +## Langkah 8: Test Manual Run + +```bash +# Masih sebagai user ocr +cd /opt/ocr-sprint-service +source .venv/bin/activate + +# Test API server +uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 +``` + +**Di terminal lain (sebagai user ubuntu):** + +```bash +# Test health check +curl http://localhost:8000/api/v1/health + +# Expected: {"status":"ok","version":"0.1.0"} + +# Test dengan sample file (jika ada) +curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \ + -H "X-API-Key: your-api-key-here" \ + -F "file=@/path/to/test.pdf" +``` + +Jika berhasil, stop server dengan `Ctrl+C`. 
+ +## Langkah 9: Setup Systemd Services + +```bash +# Exit dari user ocr +exit + +# Kembali sebagai user ubuntu dengan sudo +``` + +### Create API Service + +```bash +sudo nano /etc/systemd/system/ocr-sprint-api.service +``` + +**Content:** + +```ini +[Unit] +Description=OCR Sprint API Service +After=network.target postgresql.service redis-server.service +Wants=postgresql.service redis-server.service + +[Service] +Type=simple +User=ocr +Group=ocr +WorkingDirectory=/opt/ocr-sprint-service + +# Environment +Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/opt/ocr-sprint-service/.env + +# Start command - 4 workers untuk production +ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \ + ocr_sprint.main:app \ + --host 0.0.0.0 \ + --port 8000 \ + --workers 4 \ + --log-level info + +# Restart policy +Restart=always +RestartSec=10 +StartLimitInterval=0 + +# Resource limits +LimitNOFILE=65536 + +# Security +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target +``` + +### Create Celery Worker Service + +```bash +sudo nano /etc/systemd/system/ocr-sprint-worker.service +``` + +**Content:** + +```ini +[Unit] +Description=OCR Sprint Celery Worker +After=network.target postgresql.service redis-server.service ocr-sprint-api.service +Wants=postgresql.service redis-server.service + +[Service] +Type=simple +User=ocr +Group=ocr +WorkingDirectory=/opt/ocr-sprint-service + +# Environment +Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/opt/ocr-sprint-service/.env + +# Start command - concurrency 2 untuk CPU dengan 4 cores +# Sesuaikan dengan jumlah CPU cores server Anda +ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \ + -A ocr_sprint.worker.celery_app \ + worker \ + --loglevel=info \ + --concurrency=2 \ + --max-tasks-per-child=100 + +# Restart policy +Restart=always +RestartSec=10 +StartLimitInterval=0 + +# Resource limits +LimitNOFILE=65536 + +# 
Security +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target +``` + +### Enable dan Start Services + +```bash +# Reload systemd +sudo systemctl daemon-reload + +# Enable services (auto-start on boot) +sudo systemctl enable ocr-sprint-api +sudo systemctl enable ocr-sprint-worker + +# Start services +sudo systemctl start ocr-sprint-api +sudo systemctl start ocr-sprint-worker + +# Check status +sudo systemctl status ocr-sprint-api +sudo systemctl status ocr-sprint-worker +``` + +**Expected output:** `active (running)` dengan warna hijau. + +### View Logs + +```bash +# API logs (real-time) +sudo journalctl -u ocr-sprint-api -f + +# Worker logs (real-time) +sudo journalctl -u ocr-sprint-worker -f + +# Last 50 lines +sudo journalctl -u ocr-sprint-api -n 50 +sudo journalctl -u ocr-sprint-worker -n 50 +``` + +## Langkah 10: Install dan Setup Nginx + +```bash +# Install Nginx dan Certbot +sudo apt install -y nginx certbot python3-certbot-nginx + +# Check Nginx status +sudo systemctl status nginx +``` + +### Create Nginx Configuration + +```bash +sudo nano /etc/nginx/sites-available/ocr-sprint +``` + +**Content (ganti `ocr.yourdomain.com` dengan domain Anda):** + +```nginx +# Upstream +upstream ocr_api { + server 127.0.0.1:8000; + keepalive 32; +} + +# Rate limiting +limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s; + +server { + listen 80; + server_name ocr.yourdomain.com; + + # Max upload size + client_max_body_size 30M; + client_body_buffer_size 128k; + + # Timeouts + proxy_connect_timeout 300s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + send_timeout 300s; + + # Logging + access_log /var/log/nginx/ocr-sprint-access.log; + error_log /var/log/nginx/ocr-sprint-error.log; + + # API endpoints + location /api/ { + limit_req zone=api_limit burst=20 nodelay; + + proxy_pass http://ocr_api; + proxy_http_version 1.1; + + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ""; + + proxy_buffering off; + } + + # Health check + location /api/v1/health { + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + access_log off; + } + + # Metrics (restrict access) + location /metrics { + allow 127.0.0.1; + allow 10.0.0.0/8; + deny all; + + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + } + + # API docs + location /docs { + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + } + + location /redoc { + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + } +} +``` + +### Enable Site + +```bash +# Test konfigurasi +sudo nginx -t + +# Enable site +sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/ + +# Reload Nginx +sudo systemctl reload nginx +``` + +### Setup SSL (jika punya domain) + +```bash +# Obtain certificate +sudo certbot --nginx -d ocr.yourdomain.com + +# Test auto-renewal +sudo certbot renew --dry-run +``` + +## Langkah 11: Setup Firewall + +```bash +# Check UFW status +sudo ufw status + +# Allow SSH (PENTING!) 
+sudo ufw allow 22/tcp + +# Allow HTTP dan HTTPS +sudo ufw allow 80/tcp +sudo ufw allow 443/tcp + +# Enable firewall (jika belum) +sudo ufw enable + +# Verify +sudo ufw status numbered +``` + +## Langkah 12: Verifikasi Final + +### Test dari Server + +```bash +# Health check +curl http://localhost:8000/api/v1/health + +# Test async endpoint +curl -X POST http://localhost:8000/api/v1/documents \ + -H "X-API-Key: your-api-key-here" \ + -F "file=@/path/to/test.pdf" + +# Expected: {"job_id":"...","status":"pending",...} + +# Check job status +curl -H "X-API-Key: your-api-key-here" \ + http://localhost:8000/api/v1/documents/JOB_ID_HERE +``` + +### Test via Domain (jika sudah setup SSL) + +```bash +curl https://ocr.yourdomain.com/api/v1/health +``` + +### Check Services + +```bash +# All services should be active +sudo systemctl status ocr-sprint-api +sudo systemctl status ocr-sprint-worker +sudo systemctl status postgresql +sudo systemctl status redis-server +sudo systemctl status nginx +``` + +## Monitoring + +### View Logs + +```bash +# API logs +sudo journalctl -u ocr-sprint-api -f + +# Worker logs +sudo journalctl -u ocr-sprint-worker -f + +# Nginx access logs +sudo tail -f /var/log/nginx/ocr-sprint-access.log + +# Nginx error logs +sudo tail -f /var/log/nginx/ocr-sprint-error.log +``` + +### Prometheus Metrics + +```bash +# View metrics +curl http://localhost:8000/metrics + +# Key metrics: +# - ocr_documents_total +# - ocr_processing_duration_seconds +# - ocr_confidence_score +``` + +## Maintenance + +### Restart Services + +```bash +sudo systemctl restart ocr-sprint-api +sudo systemctl restart ocr-sprint-worker +``` + +### Update Application + +```bash +# Switch ke user ocr +sudo su - ocr +cd /opt/ocr-sprint-service + +# Pull latest code +git pull + +# Activate venv +source .venv/bin/activate + +# Update dependencies +pip install -e ".[ocr]" + +# Run migrations +alembic upgrade head + +# Exit +exit + +# Restart services +sudo systemctl restart ocr-sprint-api +sudo 
systemctl restart ocr-sprint-worker + +# Check logs +sudo journalctl -u ocr-sprint-api -n 50 +``` + +### Database Backup + +```bash +# Create backup directory +sudo mkdir -p /opt/ocr-sprint-service/backups +sudo chown ocr:ocr /opt/ocr-sprint-service/backups + +# Manual backup (seluruh pipeline dijalankan sebagai user ocr agar redirect ke direktori backups diizinkan) +sudo -u ocr sh -c 'pg_dump -h localhost -U ocr ocr_sprint | gzip > /opt/ocr-sprint-service/backups/backup_$(date +%Y%m%d_%H%M%S).sql.gz' +``` + +**Setup automated backup:** + +```bash +# Create backup script +sudo nano /opt/ocr-sprint-service/backup.sh +``` + +```bash +#!/bin/bash +BACKUP_DIR="/opt/ocr-sprint-service/backups" +DATE=$(date +%Y%m%d_%H%M%S) + +mkdir -p "$BACKUP_DIR" + +# Backup database +PGPASSWORD='your-db-password' pg_dump -h localhost -U ocr ocr_sprint | gzip > "$BACKUP_DIR/db_$DATE.sql.gz" + +# Keep only last 7 days +find "$BACKUP_DIR" -name "db_*.sql.gz" -mtime +7 -delete + +echo "Backup completed: $DATE" +``` + +```bash +# Make executable; mode 700 karena script berisi password database +sudo chmod 700 /opt/ocr-sprint-service/backup.sh +sudo chown ocr:ocr /opt/ocr-sprint-service/backup.sh + +# Setup cron (daily at 2 AM) +sudo crontab -e -u ocr + +# Add line (log ditulis ke direktori milik user ocr; /var/log tidak writable untuk ocr): +0 2 * * * /opt/ocr-sprint-service/backup.sh >> /opt/ocr-sprint-service/backups/backup.log 2>&1 +``` + +## Troubleshooting + +### Service tidak start + +```bash +# Check detailed logs +sudo journalctl -u ocr-sprint-api -n 100 --no-pager +sudo journalctl -u ocr-sprint-worker -n 100 --no-pager + +# Check file permissions +ls -la /opt/ocr-sprint-service +ls -la /opt/ocr-sprint-service/storage + +# Test manual run +sudo su - ocr +cd /opt/ocr-sprint-service +source .venv/bin/activate +uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 +``` + +### Database connection error + +```bash +# Test connection +sudo -u ocr psql -h localhost -U ocr -d ocr_sprint + +# Check PostgreSQL status +sudo systemctl status postgresql + +# Check PostgreSQL logs +sudo journalctl -u postgresql -n 50 +``` + +### Redis connection error + +```bash +# Test Redis +redis-cli ping + +# Check Redis status +sudo systemctl status 
redis-server + +# Check Redis logs +sudo journalctl -u redis-server -n 50 +``` + +### Worker tidak memproses jobs + +```bash +# Check Celery worker status +sudo su - ocr +cd /opt/ocr-sprint-service +source .venv/bin/activate +celery -A ocr_sprint.worker.celery_app inspect active +celery -A ocr_sprint.worker.celery_app inspect stats + +# Check Redis queue +redis-cli LLEN ocr_sprint +``` + +### PaddleOCR error + +```bash +# Re-download models +sudo su - ocr +cd /opt/ocr-sprint-service +source .venv/bin/activate + +python << EOF +from paddleocr import PaddleOCR +ocr = PaddleOCR(use_angle_cls=True, lang='latin') +print("Models downloaded successfully") +EOF +``` + +## Performance Tuning + +### Check CPU cores + +```bash +nproc +``` + +### Adjust worker concurrency + +```bash +# Edit worker service +sudo nano /etc/systemd/system/ocr-sprint-worker.service + +# Untuk 4 cores: --concurrency=2 +# Untuk 8 cores: --concurrency=4 +# Untuk 16 cores: --concurrency=8 + +# Reload dan restart +sudo systemctl daemon-reload +sudo systemctl restart ocr-sprint-worker +``` + +### PostgreSQL 16 Tuning + +```bash +sudo nano /etc/postgresql/16/main/postgresql.conf +``` + +**Recommended settings (sesuaikan dengan RAM server):** + +``` +# Untuk 8GB RAM: +shared_buffers = 2GB +effective_cache_size = 6GB +maintenance_work_mem = 512MB +work_mem = 8MB + +# Untuk 16GB RAM: +shared_buffers = 4GB +effective_cache_size = 12GB +maintenance_work_mem = 1GB +work_mem = 10MB + +# General +checkpoint_completion_target = 0.9 +wal_buffers = 16MB +default_statistics_target = 100 +random_page_cost = 1.1 +effective_io_concurrency = 200 +max_worker_processes = 4 +max_parallel_workers_per_gather = 2 +max_parallel_workers = 4 +``` + +```bash +sudo systemctl restart postgresql +``` + +## Security Checklist + +- [ ] API keys set dengan nilai random yang kuat +- [ ] Database password diganti dari default +- [ ] Firewall enabled (UFW) +- [ ] SSL/TLS enabled (jika punya domain) +- [ ] `/metrics` endpoint restricted +- 
[ ] PostgreSQL hanya listen di localhost +- [ ] Redis hanya listen di localhost +- [ ] Backup automated (cron job) +- [ ] OS security updates enabled + +## Next Steps + +1. **Setup monitoring** - Install Prometheus + Grafana (opsional) +2. **Setup alerting** - Email/Slack notification untuk errors +3. **Load testing** - Test dengan volume dokumen production +4. **Backup verification** - Test restore dari backup +5. **Documentation** - Dokumentasi API keys untuk tim + +## Support + +Untuk pertanyaan atau issues, hubungi tim development. diff --git a/docs/DEPLOYMENT-MANUAL.md b/docs/DEPLOYMENT-MANUAL.md new file mode 100644 index 0000000..f04df85 --- /dev/null +++ b/docs/DEPLOYMENT-MANUAL.md @@ -0,0 +1,943 @@ +# Deployment Manual OCR Sprint Service (Tanpa Docker) + +Panduan lengkap deployment OCR Sprint Service langsung di server tanpa menggunakan Docker. + +## Prasyarat Server + +### Spesifikasi Minimum +- **OS**: Ubuntu 20.04+ / Debian 11+ / RHEL 8+ +- **CPU**: 4 cores (8 cores recommended) +- **RAM**: 8 GB minimum (16 GB recommended) +- **Storage**: 50 GB free space +- **User**: Non-root user dengan sudo access + +### Port yang Dibutuhkan +- `8000`: API server (internal, akan di-proxy oleh Nginx) +- `80/443`: HTTP/HTTPS (Nginx) +- `5432`: PostgreSQL (localhost only) +- `6379`: Redis (localhost only) + +## Langkah 1: Install System Dependencies + +### Ubuntu/Debian + +```bash +# Update system +sudo apt update && sudo apt upgrade -y + +# Install Python 3.11 +sudo apt install -y software-properties-common +sudo add-apt-repository ppa:deadsnakes/ppa -y +sudo apt update +sudo apt install -y python3.11 python3.11-venv python3.11-dev python3-pip + +# Install system libraries untuk OpenCV dan PaddleOCR +sudo apt install -y \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + libgomp1 \ + libmagic1 \ + build-essential \ + git \ + curl \ + wget + +# Install Redis +sudo apt install -y redis-server +sudo systemctl enable redis-server +sudo 
systemctl start redis-server + +# Install PostgreSQL +sudo apt install -y postgresql postgresql-contrib +sudo systemctl enable postgresql +sudo systemctl start postgresql +``` + +### RHEL/CentOS/Rocky Linux + +```bash +# Update system +sudo dnf update -y + +# Install Python 3.11 +sudo dnf install -y python3.11 python3.11-devel python3.11-pip + +# Install system libraries +sudo dnf install -y \ + mesa-libGL \ + glib2 \ + libSM \ + libXext \ + libXrender \ + file-libs \ + gcc \ + gcc-c++ \ + make \ + git + +# Install Redis +sudo dnf install -y redis +sudo systemctl enable redis +sudo systemctl start redis + +# Install PostgreSQL +sudo dnf install -y postgresql-server postgresql-contrib +sudo postgresql-setup --initdb +sudo systemctl enable postgresql +sudo systemctl start postgresql +``` + +## Langkah 2: Setup Database PostgreSQL + +```bash +# Masuk sebagai postgres user +sudo -u postgres psql + +# Jalankan SQL commands berikut: +``` + +```sql +-- Create user dan database +CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat'; +CREATE DATABASE ocr_sprint OWNER ocr; + +-- Grant privileges +GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr; + +-- Connect ke database +\c ocr_sprint + +-- Grant schema privileges (PostgreSQL 15+) +GRANT ALL ON SCHEMA public TO ocr; + +-- Exit +\q +``` + +**Konfigurasi akses PostgreSQL (opsional, tetap hanya localhost):** + +```bash +# Edit postgresql.conf (ganti 14 dengan versi PostgreSQL yang terinstall) +sudo nano /etc/postgresql/14/main/postgresql.conf + +# Uncomment dan ubah: +listen_addresses = 'localhost' # Tetap localhost untuk keamanan + +# Edit pg_hba.conf +sudo nano /etc/postgresql/14/main/pg_hba.conf + +# Tambahkan line: +local ocr_sprint ocr scram-sha-256 + +# Restart PostgreSQL +sudo systemctl restart postgresql +``` + +## Langkah 3: Setup Application User + +```bash +# Create dedicated user untuk aplikasi +sudo useradd -m -s /bin/bash ocr +sudo usermod -aG sudo ocr # Opsional; tidak disarankan di production karena memberi akun service akses root + +# Create application directory +sudo mkdir -p /opt/ocr-sprint-service +sudo chown 
ocr:ocr /opt/ocr-sprint-service + +# Switch ke user ocr +sudo su - ocr +``` + +## Langkah 4: Install Application + +```bash +# Clone repository +cd /opt +git clone https://github.com/Adriankf59/ocr-sprint-service.git +cd ocr-sprint-service + +# Create virtual environment +python3.11 -m venv .venv + +# Activate virtual environment +source .venv/bin/activate + +# Upgrade pip +pip install --upgrade pip setuptools wheel + +# Install application dengan OCR dependencies +pip install -e ".[ocr]" + +# Verify installation +python -c "import paddleocr; print('PaddleOCR installed successfully')" +``` + +## Langkah 5: Konfigurasi Application + +```bash +# Copy environment template +cp .env.example .env + +# Edit konfigurasi +nano .env +``` + +**Konfigurasi production (`/opt/ocr-sprint-service/.env`):** + +```bash +# ==== App ==== +APP_ENV=prod +APP_HOST=0.0.0.0 +APP_PORT=8000 +APP_LOG_LEVEL=INFO + +# ==== Storage ==== +STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage +BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs +BLOB_MAX_UPLOAD_MB=25 + +# ==== OCR ==== +OCR_LANG=latin +OCR_USE_GPU=false +OCR_MAX_IMAGE_SIDE=2200 + +# ==== Preprocessing ==== +PREPROCESS_TARGET_DPI=300 +PREPROCESS_DENOISE=true +PREPROCESS_DESKEW=true +PREPROCESS_DETECT_DOCUMENT=true +PREPROCESS_REMOVE_SHADOW=true +PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20 + +# ==== Table Extraction ==== +TABLES_ENABLED=true + +# ==== Confidence ==== +CONFIDENCE_AUTO_APPROVE=0.95 +CONFIDENCE_NEEDS_REVIEW=0.85 + +# ==== LLM (Phase 5, optional) ==== +LLM_ENABLED=false + +# ==== Async Pipeline ==== +QUEUE_ENABLED=true +REDIS_URL=redis://localhost:6379/0 +CELERY_TASK_DEFAULT_QUEUE=ocr_sprint + +# ==== Database ==== +DATABASE_URL=postgresql+psycopg://ocr:ganti-dengan-password-kuat@localhost:5432/ocr_sprint +DATABASE_ECHO=false + +# ==== Auth (WAJIB!) 
==== +API_KEYS=key1-ganti-dengan-random-string,key2-ganti-dengan-random-string +API_KEY_HEADER=X-API-Key +``` + +**Generate secure API keys:** + +```bash +# Generate 2 API keys +openssl rand -hex 32 +openssl rand -hex 32 +``` + +**Create storage directories:** + +```bash +mkdir -p /opt/ocr-sprint-service/storage/blobs +chmod 755 /opt/ocr-sprint-service/storage +``` + +## Langkah 6: Run Database Migrations + +```bash +# Masih sebagai user ocr, dengan venv activated +cd /opt/ocr-sprint-service +source .venv/bin/activate + +# Run migrations +alembic upgrade head + +# Verify +alembic current +``` + +## Langkah 7: Test Manual Run + +```bash +# Test API server +uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 + +# Di terminal lain, test health check +curl http://localhost:8000/api/v1/health + +# Jika berhasil, stop dengan Ctrl+C +``` + +## Langkah 8: Setup Systemd Services + +### API Service + +```bash +# Exit dari user ocr, kembali ke user dengan sudo +exit + +# Create systemd service file +sudo nano /etc/systemd/system/ocr-sprint-api.service +``` + +**Content `/etc/systemd/system/ocr-sprint-api.service`:** + +```ini +[Unit] +Description=OCR Sprint API Service +After=network.target postgresql.service redis.service +Wants=postgresql.service redis.service + +[Service] +Type=simple +User=ocr +Group=ocr +WorkingDirectory=/opt/ocr-sprint-service + +# Environment +Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/opt/ocr-sprint-service/.env + +# Start command - 4 workers untuk production +ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \ + ocr_sprint.main:app \ + --host 0.0.0.0 \ + --port 8000 \ + --workers 4 \ + --log-level info + +# Restart policy +Restart=always +RestartSec=10 +StartLimitInterval=0 + +# Resource limits +LimitNOFILE=65536 +MemoryLimit=6G + +# Security +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target +``` + +### Celery Worker Service + +```bash +sudo nano 
/etc/systemd/system/ocr-sprint-worker.service +``` + +**Content `/etc/systemd/system/ocr-sprint-worker.service`:** + +```ini +[Unit] +Description=OCR Sprint Celery Worker +After=network.target postgresql.service redis.service ocr-sprint-api.service +Wants=postgresql.service redis.service + +[Service] +Type=simple +User=ocr +Group=ocr +WorkingDirectory=/opt/ocr-sprint-service + +# Environment +Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin" +EnvironmentFile=/opt/ocr-sprint-service/.env + +# Start command - concurrency 2 untuk 4 core CPU +ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \ + -A ocr_sprint.worker.celery_app \ + worker \ + --loglevel=info \ + --concurrency=2 \ + --max-tasks-per-child=100 + +# Restart policy +Restart=always +RestartSec=10 +StartLimitInterval=0 + +# Resource limits +LimitNOFILE=65536 +MemoryLimit=4G + +# Security +NoNewPrivileges=true +PrivateTmp=true + +[Install] +WantedBy=multi-user.target +``` + +### Enable dan Start Services + +```bash +# Reload systemd +sudo systemctl daemon-reload + +# Enable services (auto-start on boot) +sudo systemctl enable ocr-sprint-api +sudo systemctl enable ocr-sprint-worker + +# Start services +sudo systemctl start ocr-sprint-api +sudo systemctl start ocr-sprint-worker + +# Check status +sudo systemctl status ocr-sprint-api +sudo systemctl status ocr-sprint-worker + +# View logs +sudo journalctl -u ocr-sprint-api -f +sudo journalctl -u ocr-sprint-worker -f +``` + +## Langkah 9: Setup Nginx Reverse Proxy + +### Install Nginx + +```bash +sudo apt install -y nginx certbot python3-certbot-nginx +``` + +### Konfigurasi Nginx + +```bash +sudo nano /etc/nginx/sites-available/ocr-sprint +``` + +**Content `/etc/nginx/sites-available/ocr-sprint`:** + +```nginx +# Upstream untuk load balancing (jika scale horizontal) +upstream ocr_api { + server 127.0.0.1:8000; + keepalive 32; +} + +# Rate limiting +limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s; + +server { + 
listen 80; + server_name ocr.yourdomain.com; # Ganti dengan domain Anda + + # Max upload size (sesuaikan dengan BLOB_MAX_UPLOAD_MB) + client_max_body_size 30M; + client_body_buffer_size 128k; + + # Timeouts untuk dokumen besar + proxy_connect_timeout 300s; + proxy_send_timeout 300s; + proxy_read_timeout 300s; + send_timeout 300s; + + # Logging + access_log /var/log/nginx/ocr-sprint-access.log; + error_log /var/log/nginx/ocr-sprint-error.log; + + # API endpoints + location /api/ { + # Rate limiting + limit_req zone=api_limit burst=20 nodelay; + + proxy_pass http://ocr_api; + proxy_http_version 1.1; + + # Headers + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ""; + + # Disable buffering untuk streaming responses + proxy_buffering off; + } + + # Health check endpoint (no rate limit) + location /api/v1/health { + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + access_log off; + } + + # Metrics endpoint (restrict access) + location /metrics { + # Allow only from internal network + allow 10.0.0.0/8; + allow 172.16.0.0/12; + allow 192.168.0.0/16; + allow 127.0.0.1; + deny all; + + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + } + + # Docs (opsional, bisa di-disable di production) + location /docs { + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + } + + location /redoc { + proxy_pass http://ocr_api; + proxy_http_version 1.1; + proxy_set_header Host $host; + } +} +``` + +### Enable Site + +```bash +# Test konfigurasi +sudo nginx -t + +# Enable site +sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/ + +# Remove default site (opsional) +sudo rm /etc/nginx/sites-enabled/default + +# Reload Nginx +sudo systemctl reload nginx +``` + +### Setup SSL dengan Let's Encrypt + 
+```bash +# Install certbot +sudo apt install -y certbot python3-certbot-nginx + +# Obtain certificate (ganti dengan domain Anda) +sudo certbot --nginx -d ocr.yourdomain.com + +# Test auto-renewal +sudo certbot renew --dry-run +``` + +Certbot akan otomatis mengupdate konfigurasi Nginx untuk HTTPS. + +## Langkah 10: Setup Firewall + +```bash +# Install UFW (jika belum ada) +sudo apt install -y ufw + +# Allow SSH (PENTING! Jangan sampai terkunci) +sudo ufw allow 22/tcp + +# Allow HTTP dan HTTPS +sudo ufw allow 80/tcp +sudo ufw allow 443/tcp + +# Enable firewall +sudo ufw enable + +# Check status +sudo ufw status +``` + +## Langkah 11: Verifikasi Deployment + +### Test dari Server + +```bash +# Health check +curl http://localhost:8000/api/v1/health + +# Test dengan API key +curl -X POST http://localhost:8000/api/v1/documents?sync=true \ + -H "X-API-Key: your-api-key-here" \ + -F "file=@/path/to/test.pdf" +``` + +### Test dari Client + +```bash +# Health check via domain +curl https://ocr.yourdomain.com/api/v1/health + +# Upload dokumen +curl -X POST https://ocr.yourdomain.com/api/v1/documents \ + -H "X-API-Key: your-api-key-here" \ + -F "file=@document.pdf" +``` + +## Monitoring dan Maintenance + +### View Logs + +```bash +# API logs +sudo journalctl -u ocr-sprint-api -f + +# Worker logs +sudo journalctl -u ocr-sprint-worker -f + +# Nginx logs +sudo tail -f /var/log/nginx/ocr-sprint-access.log +sudo tail -f /var/log/nginx/ocr-sprint-error.log + +# PostgreSQL logs +sudo tail -f /var/log/postgresql/postgresql-16-main.log +``` + +### Service Management + +```bash +# Restart services +sudo systemctl restart ocr-sprint-api +sudo systemctl restart ocr-sprint-worker + +# Stop services +sudo systemctl stop ocr-sprint-api +sudo systemctl stop ocr-sprint-worker + +# Check status +sudo systemctl status ocr-sprint-api +sudo systemctl status ocr-sprint-worker +``` + +### Database Backup + +```bash +# Create backup script +sudo nano /opt/ocr-sprint-service/backup.sh +``` + 
+**Content `backup.sh`:** + +```bash +#!/bin/bash +BACKUP_DIR="/opt/ocr-sprint-service/backups" +DATE=$(date +%Y%m%d_%H%M%S) + +mkdir -p $BACKUP_DIR + +# Backup database +pg_dump -U ocr -h localhost ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz + +# Backup blobs (opsional, bisa besar) +# tar -czf $BACKUP_DIR/blobs_$DATE.tar.gz /opt/ocr-sprint-service/storage/blobs + +# Keep only last 7 days +find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete + +echo "Backup completed: $DATE" +``` + +```bash +# Make executable +chmod +x /opt/ocr-sprint-service/backup.sh + +# Setup cron job (daily at 2 AM) +sudo crontab -e + +# Add line: +0 2 * * * /opt/ocr-sprint-service/backup.sh >> /var/log/ocr-backup.log 2>&1 +``` + +### Log Rotation + +```bash +sudo nano /etc/logrotate.d/ocr-sprint +``` + +**Content:** + +``` +/var/log/nginx/ocr-sprint-*.log { + daily + rotate 14 + compress + delaycompress + notifempty + create 0640 www-data adm + sharedscripts + postrotate + [ -f /var/run/nginx.pid ] && kill -USR1 `cat /var/run/nginx.pid` + endscript +} +``` + +## Update Application + +```bash +# Switch ke user ocr +sudo su - ocr +cd /opt/ocr-sprint-service + +# Pull latest code +git pull + +# Activate venv +source .venv/bin/activate + +# Update dependencies +pip install -e ".[ocr]" + +# Run migrations +alembic upgrade head + +# Exit user ocr +exit + +# Restart services +sudo systemctl restart ocr-sprint-api +sudo systemctl restart ocr-sprint-worker + +# Check logs +sudo journalctl -u ocr-sprint-api -n 50 +``` + +## Performance Tuning + +### Increase Worker Concurrency + +```bash +# Edit worker service +sudo nano /etc/systemd/system/ocr-sprint-worker.service + +# Ubah --concurrency sesuai CPU cores +# Untuk 8 cores: --concurrency=4 +# Untuk 16 cores: --concurrency=8 + +# Reload dan restart +sudo systemctl daemon-reload +sudo systemctl restart ocr-sprint-worker +``` + +### PostgreSQL Tuning + +```bash +sudo nano /etc/postgresql/16/main/postgresql.conf +``` + +**Recommended settings untuk 
16GB RAM:** + +``` +shared_buffers = 4GB +effective_cache_size = 12GB +maintenance_work_mem = 1GB +checkpoint_completion_target = 0.9 +wal_buffers = 16MB +default_statistics_target = 100 +random_page_cost = 1.1 +effective_io_concurrency = 200 +work_mem = 10MB +min_wal_size = 1GB +max_wal_size = 4GB +max_worker_processes = 4 +max_parallel_workers_per_gather = 2 +max_parallel_workers = 4 +``` + +```bash +sudo systemctl restart postgresql +``` + +### Redis Tuning + +```bash +sudo nano /etc/redis/redis.conf +``` + +**Recommended settings:** + +``` +maxmemory 2gb +maxmemory-policy allkeys-lru +# Disable RDB snapshots untuk performance (komentar harus di baris sendiri; redis.conf tidak mendukung komentar inline) +save "" +``` + +```bash +sudo systemctl restart redis +``` + +## Troubleshooting + +### Service tidak start + +```bash +# Check logs +sudo journalctl -u ocr-sprint-api -n 100 --no-pager +sudo journalctl -u ocr-sprint-worker -n 100 --no-pager + +# Check permissions +ls -la /opt/ocr-sprint-service +ls -la /opt/ocr-sprint-service/storage + +# Test manual run +sudo su - ocr +cd /opt/ocr-sprint-service +source .venv/bin/activate +uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 +``` + +### Database connection error + +```bash +# Test connection +sudo -u ocr psql -h localhost -U ocr -d ocr_sprint + +# Check PostgreSQL status +sudo systemctl status postgresql + +# Check pg_hba.conf +sudo cat /etc/postgresql/16/main/pg_hba.conf | grep ocr +``` + +### Redis connection error + +```bash +# Test Redis +redis-cli ping + +# Check Redis status +sudo systemctl status redis + +# Check Redis logs +sudo journalctl -u redis -n 50 +``` + +### PaddleOCR model download gagal + +```bash +# Download manual +sudo su - ocr +cd /opt/ocr-sprint-service +source .venv/bin/activate + +python << EOF +from paddleocr import PaddleOCR +ocr = PaddleOCR(use_angle_cls=True, lang='latin') +print("Models downloaded successfully") +EOF +``` + +### Out of memory + +```bash +# Check memory usage +free -h +htop + +# Reduce worker concurrency +sudo nano 
/etc/systemd/system/ocr-sprint-worker.service +# Ubah --concurrency=1 + +# Add swap (jika perlu) +sudo fallocate -l 4G /swapfile +sudo chmod 600 /swapfile +sudo mkswap /swapfile +sudo swapon /swapfile +echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab +``` + +## Security Checklist + +- [ ] API keys diganti dengan nilai random yang kuat +- [ ] Database password diganti dari default +- [ ] Firewall enabled (UFW) - hanya port 22, 80, 443 terbuka +- [ ] SSL/TLS enabled via Let's Encrypt +- [ ] `/metrics` endpoint restricted ke internal network +- [ ] Nginx rate limiting configured +- [ ] PostgreSQL hanya listen di localhost +- [ ] Redis hanya listen di localhost +- [ ] Regular backup configured (cron job) +- [ ] Log rotation configured +- [ ] OS security updates enabled (`unattended-upgrades`) +- [ ] Fail2ban installed untuk SSH protection + +## Monitoring dengan Prometheus (Opsional) + +### Install Prometheus + +```bash +# Download Prometheus +cd /tmp +wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz +tar xvfz prometheus-*.tar.gz +sudo mv prometheus-2.45.0.linux-amd64 /opt/prometheus + +# Create user +sudo useradd --no-create-home --shell /bin/false prometheus + +# Create directories +sudo mkdir /etc/prometheus /var/lib/prometheus +sudo chown prometheus:prometheus /var/lib/prometheus +``` + +### Configure Prometheus + +```bash +sudo nano /etc/prometheus/prometheus.yml +``` + +**Content:** + +```yaml +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'ocr-sprint' + static_configs: + - targets: ['localhost:8000'] + metrics_path: '/metrics' +``` + +### Create Systemd Service + +```bash +sudo nano /etc/systemd/system/prometheus.service +``` + +**Content:** + +```ini +[Unit] +Description=Prometheus +After=network.target + +[Service] +User=prometheus +Group=prometheus +Type=simple +ExecStart=/opt/prometheus/prometheus \ + --config.file=/etc/prometheus/prometheus.yml \ + 
--storage.tsdb.path=/var/lib/prometheus/ + +[Install] +WantedBy=multi-user.target +``` + +```bash +sudo systemctl daemon-reload +sudo systemctl enable prometheus +sudo systemctl start prometheus +``` + +Access Prometheus di `http://localhost:9090` + +## Support + +Untuk pertanyaan atau issues, hubungi tim development. diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..2d18641 --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,437 @@ +# Quickstart Deployment OCR Sprint Service + +Panduan deployment OCR Sprint Service ke server production untuk pemrosesan dokumen surat sprint Polri. + +## Prasyarat Server + +### Spesifikasi Minimum +- **OS**: Linux (Ubuntu 20.04+ / Debian 11+ / RHEL 8+) +- **CPU**: 4 cores (8 cores recommended untuk throughput tinggi) +- **RAM**: 8 GB minimum (16 GB recommended) +- **Storage**: 50 GB free space + - ~3 GB untuk model PaddleOCR + - ~1.5 GB untuk dependencies Python + - Sisanya untuk blob storage dokumen +- **Network**: Port 8000 terbuka untuk API access + +### Software Requirements +- Docker 24.0+ dan Docker Compose v2 +- Git +- (Opsional) Nginx/Caddy untuk reverse proxy + SSL + +## Deployment dengan Docker Compose (Recommended) + +### 1. Clone Repository + +```bash +# Login ke server sebagai user non-root dengan sudo access +ssh user@your-server.com + +# Clone repository +git clone https://github.com/Adriankf59/ocr-sprint-service.git +cd ocr-sprint-service +``` + +### 2. 
Konfigurasi Environment + +```bash +# Copy template environment +cp .env.example .env + +# Edit konfigurasi production +nano .env +``` + +**Konfigurasi penting untuk production:** + +```bash +# ==== App ==== +APP_ENV=prod +APP_LOG_LEVEL=INFO + +# ==== Storage ==== +STORAGE_LOCAL_DIR=/app/storage +BLOB_STORAGE_DIR=/app/storage/blobs +BLOB_MAX_UPLOAD_MB=25 + +# ==== OCR ==== +OCR_LANG=latin +OCR_USE_GPU=false # set true jika server punya GPU NVIDIA +OCR_MAX_IMAGE_SIDE=2200 + +# ==== Preprocessing ==== +PREPROCESS_TARGET_DPI=300 +PREPROCESS_DENOISE=true +PREPROCESS_DESKEW=true +PREPROCESS_DETECT_DOCUMENT=true +PREPROCESS_REMOVE_SHADOW=true + +# ==== Table Extraction ==== +TABLES_ENABLED=true + +# ==== Async Pipeline ==== +QUEUE_ENABLED=true +REDIS_URL=redis://redis:6379/0 +CELERY_TASK_DEFAULT_QUEUE=ocr_sprint + +# ==== Database ==== +DATABASE_URL=postgresql+psycopg://ocr:ocr@postgres:5432/ocr_sprint +DATABASE_ECHO=false + +# ==== Auth (WAJIB untuk production!) ==== +API_KEYS=your-secret-key-1,your-secret-key-2 +API_KEY_HEADER=X-API-Key +``` + +**Generate API keys yang aman:** + +```bash +# Generate random API key +openssl rand -hex 32 +``` + +### 3. Build dan Start Services + +```bash +# Build Docker images +docker compose build + +# Start semua services (API, Worker, Redis, Postgres) +docker compose up -d + +# Cek logs untuk memastikan semua berjalan +docker compose logs -f api worker +``` + +**Services yang berjalan:** +- `api`: FastAPI server di port 8000 +- `worker`: Celery worker untuk async processing +- `redis`: Message broker untuk job queue +- `postgres`: Database untuk job state + +### 4. Verifikasi Deployment + +```bash +# Health check +curl http://localhost:8000/api/v1/health + +# Expected response: +# {"status":"ok","version":"0.1.0"} + +# Test OCR endpoint (sync mode untuk testing) +curl -X POST http://localhost:8000/api/v1/documents?sync=true \ + -H "X-API-Key: your-secret-key-1" \ + -F "file=@samples/pdf/example.pdf" \ + | jq +``` + +### 5. 
Setup Reverse Proxy (Nginx) + +**Install Nginx:** + +```bash +sudo apt update +sudo apt install nginx certbot python3-certbot-nginx +``` + +**Konfigurasi Nginx (`/etc/nginx/sites-available/ocr-sprint`):** + +```nginx +upstream ocr_api { + server localhost:8000; +} + +server { + listen 80; + server_name ocr.yourdomain.com; + + client_max_body_size 30M; # Sesuaikan dengan BLOB_MAX_UPLOAD_MB + + location / { + proxy_pass http://ocr_api; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Timeout untuk dokumen besar + proxy_read_timeout 300s; + proxy_connect_timeout 75s; + } + + location /metrics { + # Restrict metrics endpoint + allow 10.0.0.0/8; # Internal network only + deny all; + proxy_pass http://ocr_api; + } +} +``` + +**Enable site dan setup SSL:** + +```bash +# Enable site +sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/ +sudo nginx -t +sudo systemctl reload nginx + +# Setup SSL dengan Let's Encrypt +sudo certbot --nginx -d ocr.yourdomain.com +``` + +## Deployment Manual (Tanpa Docker) + +### 1. Install System Dependencies + +```bash +# Ubuntu/Debian +sudo apt update +sudo apt install -y \ + python3.11 python3.11-venv python3-pip \ + libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \ + libgomp1 libmagic1 \ + redis-server postgresql-14 + +# Start services +sudo systemctl enable --now redis-server postgresql +``` + +### 2. Setup Database + +```bash +# Create database dan user +sudo -u postgres psql << EOF +CREATE USER ocr WITH PASSWORD 'your-secure-password'; +CREATE DATABASE ocr_sprint OWNER ocr; +GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr; +EOF +``` + +### 3. 
Install Application + +```bash +# Clone repository +git clone https://github.com/Adriankf59/ocr-sprint-service.git +cd ocr-sprint-service + +# Create virtual environment +python3.11 -m venv .venv +source .venv/bin/activate + +# Install dependencies +pip install --upgrade pip +pip install -e ".[ocr]" + +# Copy dan edit .env +cp .env.example .env +nano .env +``` + +**Update DATABASE_URL di .env:** + +```bash +DATABASE_URL=postgresql+psycopg://ocr:your-secure-password@localhost:5432/ocr_sprint +REDIS_URL=redis://localhost:6379/0 +QUEUE_ENABLED=true +``` + +### 4. Run Database Migrations + +```bash +alembic upgrade head +``` + +### 5. Setup Systemd Services + +**API Service (`/etc/systemd/system/ocr-sprint-api.service`):** + +```ini +[Unit] +Description=OCR Sprint API +After=network.target postgresql.service redis.service + +[Service] +Type=simple +User=ocr +WorkingDirectory=/opt/ocr-sprint-service +Environment="PATH=/opt/ocr-sprint-service/.venv/bin" +ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 --workers 4 +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target +``` + +**Worker Service (`/etc/systemd/system/ocr-sprint-worker.service`):** + +```ini +[Unit] +Description=OCR Sprint Celery Worker +After=network.target postgresql.service redis.service + +[Service] +Type=simple +User=ocr +WorkingDirectory=/opt/ocr-sprint-service +Environment="PATH=/opt/ocr-sprint-service/.venv/bin" +ExecStart=/opt/ocr-sprint-service/.venv/bin/celery -A ocr_sprint.worker.celery_app worker -l info --concurrency=2 +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target +``` + +**Enable dan start services:** + +```bash +sudo systemctl daemon-reload +sudo systemctl enable --now ocr-sprint-api ocr-sprint-worker +sudo systemctl status ocr-sprint-api ocr-sprint-worker +``` + +## Monitoring dan Maintenance + +### Monitoring Logs + +```bash +# Docker deployment +docker compose logs -f api worker + +# Manual 
deployment +sudo journalctl -u ocr-sprint-api -f +sudo journalctl -u ocr-sprint-worker -f +``` + +### Prometheus Metrics + +Metrics tersedia di endpoint `/metrics`: + +```bash +curl http://localhost:8000/metrics +``` + +**Key metrics:** +- `ocr_documents_total`: Total dokumen diproses +- `ocr_processing_duration_seconds`: Durasi processing +- `ocr_confidence_score`: Distribusi confidence score +- `celery_task_*`: Celery worker metrics + +### Backup Database + +```bash +# Docker deployment +docker compose exec postgres pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql + +# Manual deployment +pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql +``` + +### Update Service + +```bash +# Docker deployment +cd ocr-sprint-service +git pull +docker compose build +docker compose up -d + +# Manual deployment +cd ocr-sprint-service +git pull +source .venv/bin/activate +pip install -e ".[ocr]" +alembic upgrade head +sudo systemctl restart ocr-sprint-api ocr-sprint-worker +``` + +## Troubleshooting + +### Service tidak start + +```bash +# Cek logs +docker compose logs api worker + +# Cek health check +curl http://localhost:8000/api/v1/health +``` + +### PaddleOCR model download gagal + +```bash +# Download manual ke volume +docker compose exec api python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='latin')" +``` + +### Worker tidak memproses jobs + +```bash +# Cek Redis connection +docker compose exec worker redis-cli -h redis ping + +# Cek Celery worker status +docker compose exec worker celery -A ocr_sprint.worker.celery_app inspect active +``` + +### Database migration error + +```bash +# Cek current revision +docker compose exec api alembic current + +# Force upgrade +docker compose exec api alembic upgrade head +``` + +### Out of memory + +```bash +# Kurangi worker concurrency di docker-compose.yml +# Ubah: --concurrency=1 (default) atau tambahkan memory limit +``` + +## Security Checklist + +- [ ] API_KEYS diset dengan nilai random yang 
kuat +- [ ] Firewall configured (hanya port 80/443 terbuka) +- [ ] SSL/TLS enabled via Nginx + Let's Encrypt +- [ ] Database password diganti dari default +- [ ] `/metrics` endpoint restricted ke internal network +- [ ] Regular backup database dan blob storage +- [ ] Log rotation configured +- [ ] OS security updates enabled + +## Performance Tuning + +### Untuk throughput tinggi: + +1. **Increase worker concurrency:** + ```yaml + # docker-compose.yml + command: ["celery", "-A", "ocr_sprint.worker.celery_app", "worker", "-l", "info", "--concurrency=4"] + ``` + +2. **Scale workers horizontally:** + ```bash + docker compose up -d --scale worker=3 + ``` + +3. **Enable GPU (jika tersedia):** + ```bash + # .env + OCR_USE_GPU=true + ``` + +4. **Tune Postgres:** + ```sql + -- Increase connection pool + ALTER SYSTEM SET max_connections = 200; + ALTER SYSTEM SET shared_buffers = '2GB'; + ``` + +## Support + +Untuk pertanyaan atau issues, hubungi tim development atau buat issue di repository. diff --git a/src/ocr_sprint/api/routes/documents.py b/src/ocr_sprint/api/routes/documents.py index 195b4dc..ea5e970 100644 --- a/src/ocr_sprint/api/routes/documents.py +++ b/src/ocr_sprint/api/routes/documents.py @@ -86,14 +86,18 @@ def _row_to_response(row: object) -> DocumentResponse: assert isinstance(row, JobRow) status_enum = DocumentStatus(row.status) - result_obj: ExtractionResult | None = None + personel_list = None if row.result is not None: result_obj = ExtractionResult.model_validate(row.result) + # Auto-number personnel entries sequentially (1, 2, 3, ...) 
+ for idx, entry in enumerate(result_obj.personel, start=1): + entry.no = idx + personel_list = result_obj.personel return DocumentResponse( job_id=row.job_id, status=status_enum, confidence=row.confidence, - data=result_obj, + data=personel_list, review_flags=list(row.review_flags or []), error=row.error, approved=bool(row.approved), diff --git a/src/ocr_sprint/data/master_pangkat.py b/src/ocr_sprint/data/master_pangkat.py index 554f143..8ccf482 100644 --- a/src/ocr_sprint/data/master_pangkat.py +++ b/src/ocr_sprint/data/master_pangkat.py @@ -33,12 +33,45 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = { # Perwira Menengah "KOMPOL": ("KOMPOL",), "AKBP": ("AKBP",), - "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"), + "KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP", "KOMBES"), # Perwira Tinggi "BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"), "IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"), "KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"), "JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"), + # PNS Polri (Pegawai Negeri Sipil di lingkungan Polri). PNS appear + # routinely on sprint panitia / undangan templates alongside Polri + # personnel, so we treat them as valid ranks for extraction. + # Sources: PP 11/2017 jo PP 17/2020 (Manajemen PNS); golongan I-IV. 
+ # Golongan I (Juru) + "JURU MUDA": ("JURU MUDA",), + "JURU MUDA TK I": ("JURU MUDA TK I", "JURU MUDA TK.I", "JURU MUDA TINGKAT I"), + "JURU": ("JURU",), + "JURU TK I": ("JURU TK I", "JURU TK.I", "JURU TINGKAT I"), + # Golongan II (Pengatur) + "PENGATUR MUDA": ("PENGATUR MUDA",), + "PENGATUR MUDA TK I": ( + "PENGATUR MUDA TK I", + "PENGATUR MUDA TK.I", + "PENGATUR MUDA TINGKAT I", + ), + "PENGATUR": ("PENGATUR",), + "PENGATUR TK I": ("PENGATUR TK I", "PENGATUR TK.I", "PENGATUR TINGKAT I"), + # Golongan III (Penata) + "PENATA MUDA": ("PENATA MUDA",), + "PENATA MUDA TK I": ( + "PENATA MUDA TK I", + "PENATA MUDA TK.I", + "PENATA MUDA TINGKAT I", + ), + "PENATA": ("PENATA",), + "PENATA TK I": ("PENATA TK I", "PENATA TK.I", "PENATA TINGKAT I"), + # Golongan IV (Pembina) + "PEMBINA": ("PEMBINA",), + "PEMBINA TK I": ("PEMBINA TK I", "PEMBINA TK.I", "PEMBINA TINGKAT I"), + "PEMBINA UTAMA MUDA": ("PEMBINA UTAMA MUDA",), + "PEMBINA UTAMA MADYA": ("PEMBINA UTAMA MADYA",), + "PEMBINA UTAMA": ("PEMBINA UTAMA",), } # Reverse lookup: any variant (uppercased) → canonical form. 
diff --git a/src/ocr_sprint/pipeline/extract/personnel.py b/src/ocr_sprint/pipeline/extract/personnel.py index 26c0ded..4574a92 100644 --- a/src/ocr_sprint/pipeline/extract/personnel.py +++ b/src/ocr_sprint/pipeline/extract/personnel.py @@ -64,6 +64,8 @@ _HEADER_SYNONYMS: dict[str, str] = { "jabatan dinas": "jabatan_dinas", "jabatan dalam dinas": "jabatan_dinas", "jbt dinas": "jabatan_dinas", + "struktural": "jabatan_dinas", + "jabatan struktural": "jabatan_dinas", # jabatan dalam sprint (role for this dispatch) "jabatan dalam sprint": "jabatan_sprint", "jabatan dalam sprin": "jabatan_sprint", @@ -72,6 +74,8 @@ _HEADER_SYNONYMS: dict[str, str] = { "jabatan sprin": "jabatan_sprint", "tugas": "jabatan_sprint", "penugasan": "jabatan_sprint", + "dalam penugasan": "jabatan_sprint", + "jabatan dalam penugasan": "jabatan_sprint", # remarks "keterangan": "keterangan", "ket": "keterangan", diff --git a/src/ocr_sprint/pipeline/extract/personnel_text.py b/src/ocr_sprint/pipeline/extract/personnel_text.py index 5e37984..52bacf4 100644 --- a/src/ocr_sprint/pipeline/extract/personnel_text.py +++ b/src/ocr_sprint/pipeline/extract/personnel_text.py @@ -38,12 +38,18 @@ _RANK_TOKENS: tuple[str, ...] = tuple( ) ) _RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS) -# A line that contains a rank token followed (anywhere on the same line) by -# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or -# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match -# inside e.g. "ABRIPDA-style" text. +# A rank token followed (within a few characters) by an 8-digit NRP. +# We allow common separators: '/', '-', '.', ',', ':' or whitespace. 
+# The trailing ``\b`` plus proximity to the 8-digit NRP is the +# specificity signal — we deliberately do *not* require a leading +# ``\b`` because real Polri sprint OCR routinely mashes the rank into +# the trailing characters of the previous cell (observed on Polres +# Banjar: "...CPHR., CBA, CI" runs straight into "AKP" giving +# "CIAKP 84011113"). Requiring a leading boundary loses that row +# entirely. The longest-first alternation order ensures multi-token +# ranks ("KOMBES POL") still win over short overlaps ("KBP"). _RE_RANK_NRP_LINE = re.compile( - rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b", + rf"(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b", re.IGNORECASE, ) # A bare row number marker like "1." or "12)". OCR often puts it on its own @@ -143,31 +149,248 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]: Strategy: + **Pass 1** — same-line rank+NRP (original strategy): 1. Iterate every line. Skip lines that don't contain both a known rank and an 8-digit NRP (those are the only signal we trust). 2. For each rank+NRP line, look back for the most recent plausible name line, and forward 1-3 lines for jabatan content. 3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp. + **Pass 2** — separate-line rank and NRP (for tabular sprint formats): + If pass 1 produces no results, scan for lines containing a standalone + rank token, then look up to 2 lines forward for a standalone NRP. + This handles sprint formats where OCR renders each column on its own + line (e.g. Polres Banjar layout). + + **Pass 3** — rank-only (for sprint formats *without* an NRP column): + Some sprint templates (panitia, undangan, etc.) list only nama + + pangkat + jabatan, no NRP. If pass 1 and pass 2 both yield nothing, + fall back to a rank-only scan: every standalone rank line (or + two-line rank like "KOMBES" + "POL" produced by narrow-column OCR) + becomes a row, with name assembled from preceding lines and jabatan + from following lines. 
``nrp`` stays ``None``. False-positive risk + is higher (stray rank tokens in body text), so this only fires when + nothing else matched. + The fallback is intentionally rate-limited: the first matching rank token on a line wins (no greedy multi-match per line), and a name line can only be consumed once (so a stray ranked text inside a paragraph doesn't turn into multiple bogus entries). """ lines = raw_text.splitlines() + + # ── Pass 1: rank+NRP on the same line ──────────────────────────── + rows = _extract_same_line(lines) + if rows: + return rows + + # ── Pass 2: rank and NRP on separate lines ─────────────────────── + rows = _extract_separate_lines(lines) + if rows: + return rows + + # ── Pass 3: rank-only (no NRP column) ──────────────────────────── + return _extract_rank_only(lines) + + +# Regex for a line that is *only* a rank token (possibly with punctuation). +_RE_RANK_ONLY = re.compile( + rf"^\s*(?P<rank>{_RANK_ALT})\s*[/.\-,:]*\s*$", + re.IGNORECASE, +) +# Regex for a line that contains a standalone 8-digit NRP. +_RE_NRP_ONLY = re.compile(r"(?<!\d)(?P<nrp>\d{8})(?!\d)") + + +# Strip a leading row number marker like "1 ", "1.", "12)" from a name +# prefix taken from the same OCR line as a rank+NRP match. Unlike +# _RE_ROW_NUMBER (which matches a *whole* line), this is a prefix strip +# for embedded same-line cases like "1 CUCU JUHANA, A.K.S. KOMPOL ...". +_RE_LEADING_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s+") + + +def _extract_same_line(lines: list[str]) -> list[PersonnelEntry]: + """Pass 1: rank+NRP pairs found anywhere in the joined text. + + Uses ``finditer`` over the full ``\\n``-joined OCR text rather than + ``re.search`` per line so that multiple rank+NRP pairs on the same + OCR line still produce separate rows. 
This is required for sprint + scans where Paddle merges several table rows into one OCR line + (observed on Polres Banjar where row 2's "...CBA.AKP 77020049 KASAT + RESKRIM" was being swallowed into row 1's jabatan because per-line + ``search`` only returns the first match). + + For each match we resolve nama from text *before* the match (the + same-line prefix takes precedence; otherwise look back through the + preceding lines bounded by the previous match) and jabatan from text + *after* the match (same-line suffix plus up to ~3 follow-up lines, + bounded by the next match). + """ + if not lines: + return [] + full_text = "\n".join(lines) + + line_starts: list[int] = [] + pos = 0 + for line in lines: + line_starts.append(pos) + pos += len(line) + 1 # +1 for the joining "\n" + + def offset_to_line(offset: int) -> int: + lo, hi = 0, len(line_starts) + while lo < hi: + mid = (lo + hi) // 2 + if line_starts[mid] <= offset: + lo = mid + 1 + else: + hi = mid + return max(0, lo - 1) + + matches = list(_RE_RANK_NRP_LINE.finditer(full_text)) + rows: list[PersonnelEntry] = [] + consumed_lines: set[int] = set() + + for i, m in enumerate(matches): + pangkat = normalize_pangkat(m.group("rank")) + if not pangkat or not is_valid_pangkat(pangkat): + continue + nrp = m.group("nrp") + ml = offset_to_line(m.start()) + prev_ml = ( + offset_to_line(matches[i - 1].start()) if i > 0 else -1 + ) + next_ml = ( + offset_to_line(matches[i + 1].start()) + if i + 1 < len(matches) + else len(lines) + ) + + line_text = lines[ml] + line_off = line_starts[ml] + + # Same-line prefix: text on this line *before* the rank token. + # If the previous match was on this same line, only consider the + # text after that previous match's NRP (otherwise we'd reuse the + # earlier row's tail as this row's name). 
+ prefix_start_local = 0 + if prev_ml == ml and i > 0: + prefix_start_local = max(0, matches[i - 1].end() - line_off) + prefix = line_text[prefix_start_local : m.start() - line_off] + + # Same-line suffix: text on this line *after* the NRP, capped at + # the next match's start if it's on this same line. + suffix_end_local = len(line_text) + if next_ml == ml and i + 1 < len(matches): + suffix_end_local = matches[i + 1].start() - line_off + suffix = line_text[m.end() - line_off : suffix_end_local] + + # ── Resolve nama ──────────────────────────────────────────── + nama: str | None = None + prefix_clean = _RE_LEADING_ROW_NUMBER.sub("", prefix).strip() + if prefix_clean and _is_plausible_name(prefix_clean): + nama = prefix_clean + elif prev_ml < ml: + for back in range(ml - 1, prev_ml, -1): + if back in consumed_lines or back < 0: + continue + candidate = lines[back].strip() + if _is_plausible_name(candidate): + nama = candidate + consumed_lines.add(back) + break + + # ── Resolve jabatan ───────────────────────────────────────── + jabatan_parts: list[str] = [] + suffix_clean = suffix.strip() + if suffix_clean: + jabatan_parts.append(suffix_clean) + if next_ml > ml: + max_fwd = min(ml + 4, next_ml, len(lines)) + for fwd in range(ml + 1, max_fwd): + candidate = lines[fwd].strip() + if not candidate: + if jabatan_parts: + break + continue + if _RE_NAME_BLOCKLIST.match(candidate): + break + if _RE_ROW_NUMBER.match(candidate): + break + jabatan_parts.append(candidate) + jabatan = ( + " ".join(" ".join(jabatan_parts).split()) + if jabatan_parts + else None + ) + + rows.append( + PersonnelEntry( + no=None, + pangkat=pangkat, + nrp=nrp, + nama=nama, + jabatan_dinas=jabatan, + jabatan_sprint=None, + keterangan=None, + ) + ) + return rows + + +def _extract_separate_lines(lines: list[str]) -> list[PersonnelEntry]: + """Pass 2: rank and NRP on separate nearby lines. + + Handles tabular sprint formats where OCR outputs each column as its + own line, e.g.: + 1 + CUCU JUHANA, A.K.S. 
+ KOMPOL + 70100418 + KABAGOPS + """ consumed_names: set[int] = set() + consumed_nrps: set[int] = set() rows: list[PersonnelEntry] = [] for idx, raw_line in enumerate(lines): line = raw_line.strip() - match = _RE_RANK_NRP_LINE.search(line) - if not match: + rank_match = _RE_RANK_ONLY.match(line) + if not rank_match: + # Also try: line starts with a rank token (may have trailing text) + for tok in _RANK_TOKENS: + if line.upper().startswith(tok) and len(line) - len(tok) < 5: + rank_match = re.match( + rf"^\s*(?P<rank>{re.escape(tok)})\s*[/.\-,:]*", + line, + re.IGNORECASE, + ) + if rank_match: + break + if not rank_match: continue - pangkat = normalize_pangkat(match.group("rank")) + + pangkat = normalize_pangkat(rank_match.group("rank")) if not pangkat or not is_valid_pangkat(pangkat): continue - nrp = match.group("nrp") + # Look forward up to 2 lines for NRP + nrp: str | None = None + nrp_idx: int | None = None + for fwd in range(idx + 1, min(idx + 3, len(lines))): + if fwd in consumed_nrps: + continue + nrp_match = _RE_NRP_ONLY.search(lines[fwd].strip()) + if nrp_match: + nrp = nrp_match.group("nrp") + nrp_idx = fwd + break + + if not nrp: + continue + assert nrp_idx is not None + consumed_nrps.add(nrp_idx) + + # Look back for name nama: str | None = None for back in range(idx - 1, max(idx - 6, -1), -1): if back in consumed_names: @@ -178,7 +401,8 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]: consumed_names.add(back) break - jabatan = _following_jabatan(lines, idx) + # Look forward after NRP for jabatan + jabatan = _following_jabatan(lines, nrp_idx) rows.append( PersonnelEntry( no=None, @@ -193,6 +417,370 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]: return rows + + +# Bare row-number markers used by sprint formats without NRP (the dot +# is often missing in narrow-column OCR, e.g. just "1" on its own line). 
+_RE_BARE_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s*$") + + +def _try_match_rank_at(lines: list[str], idx: int) -> tuple[str, int] | None: + """Try to match a standalone rank starting at ``lines[idx]``. + + Returns ``(rank_text, lines_consumed)`` on success. Handles narrow- + column OCR that splits a multi-token rank across two lines (e.g. + ``"KOMBES"`` + ``"POL"`` or ``"PENATA"`` + ``"TK I"``). + + The two-line concatenation is tried *first* so that more-specific + multi-token ranks ("PENATA TK I") win over their less-specific + single-line prefix ("PENATA"). Without this preference, "TK I" + would leak into the jabatan column. + """ + if idx >= len(lines): + return None + line = lines[idx].strip() + if idx + 1 < len(lines): + combined = (line + " " + lines[idx + 1].strip()).strip() + m2 = _RE_RANK_ONLY.match(combined) + if m2: + return m2.group("rank"), 2 + m = _RE_RANK_ONLY.match(line) + if m: + return m.group("rank"), 1 + return None + + +def _extract_rank_only(lines: list[str]) -> list[PersonnelEntry]: + """Pass 3: rank-only fallback for sprint formats without an NRP column. + + Each standalone rank line (single line or two-line concatenation) is + treated as the pivot of a personnel row. ``nama`` is assembled from + the preceding contiguous plausible-name lines (typical OCR splits a + long name across 2-3 short lines because of narrow columns); jabatan + is collected from following lines until the next rank or row marker. + + ``nrp`` is always ``None`` for rows produced by this pass. 
+ """ + rows: list[PersonnelEntry] = [] + consumed_lines: set[int] = set() + i = 0 + while i < len(lines): + match = _try_match_rank_at(lines, i) + if not match: + i += 1 + continue + rank_text, rank_span = match + pangkat = normalize_pangkat(rank_text) + if not pangkat or not is_valid_pangkat(pangkat): + i += 1 + continue + + # ── Look back for name lines (assemble up to 4 contiguous lines) ── + name_lines: list[str] = [] + for back in range(i - 1, max(i - 6, -1), -1): + if back in consumed_lines: + break + candidate = lines[back].strip() + if not candidate: + if name_lines: + break + continue + if _RE_BARE_ROW_NUMBER.match(candidate): + break + if _RE_NAME_BLOCKLIST.match(candidate): + break + if _try_match_rank_at(lines, back) is not None: + break + if not _is_plausible_name(candidate): + break + name_lines.insert(0, candidate) + consumed_lines.add(back) + nama = " ".join(" ".join(name_lines).split()) if name_lines else None + + # ── Look forward for jabatan (stop at next rank / row marker) ───── + jabatan_parts: list[str] = [] + fwd = i + rank_span + steps = 0 + while fwd < len(lines) and steps < 8: + candidate = lines[fwd].strip() + if not candidate: + if jabatan_parts: + break + fwd += 1 + steps += 1 + continue + if _RE_BARE_ROW_NUMBER.match(candidate): + break + if _try_match_rank_at(lines, fwd) is not None: + break + if _RE_NAME_BLOCKLIST.match(candidate): + break + jabatan_parts.append(candidate) + fwd += 1 + steps += 1 + jabatan = " ".join(" ".join(jabatan_parts).split()) if jabatan_parts else None + + rows.append( + PersonnelEntry( + no=None, + pangkat=pangkat, + nrp=None, + nama=nama, + jabatan_dinas=jabatan, + jabatan_sprint=None, + keterangan=None, + ) + ) + i += rank_span + return rows + + +# ── Column-aware Pass 3 (uses OCR bounding boxes) ─────────────────────── + + +def _box_x_left(box: tuple[tuple[float, float], ...]) -> float: + return min(p[0] for p in box) + + +def _box_x_right(box: tuple[tuple[float, float], ...]) -> float: + return max(p[0] 
for p in box) + + +def _box_x_center(box: tuple[tuple[float, float], ...]) -> float: + return (_box_x_left(box) + _box_x_right(box)) / 2 + + +def _box_y_top(box: tuple[tuple[float, float], ...]) -> float: + return min(p[1] for p in box) + + +def _box_y_bottom(box: tuple[tuple[float, float], ...]) -> float: + return max(p[1] for p in box) + + +def _box_y_center(box: tuple[tuple[float, float], ...]) -> float: + return (_box_y_top(box) + _box_y_bottom(box)) / 2 + + +def _box_height(box: tuple[tuple[float, float], ...]) -> float: + return _box_y_bottom(box) - _box_y_top(box) + + +def extract_personnel_from_ocr_lines(ocr_lines: list) -> list[PersonnelEntry]: + """Column-aware Pass 3 for sprint formats without an NRP column. + + Each ``ocr_line`` must expose ``text`` (str) and ``box`` (a tuple of + 4 ``(x, y)`` corner points). We use the geometry to: + + 1. Detect rank lines (single-line or vertically-stacked two-line). + 2. Estimate the PANGKAT column X-center from those rank lines. + 3. For each rank, gather **only** lines in the NAMA column (X left + of PANGKAT) within the row's Y span as the name fragments, and + **only** lines in the JABATAN column (X right of PANGKAT) for + jabatan. This prevents column-bleed that flat-text Pass 3 + suffers from on dense tables. + + Returns ``[]`` if no rank lines are detected (caller can fall back + to the text-only Pass 3). + """ + if not ocr_lines: + return [] + + # Sort by (y_top, x_left) for vertical-stacking rank detection. + indexed = sorted( + range(len(ocr_lines)), + key=lambda i: (_box_y_top(ocr_lines[i].box), _box_x_left(ocr_lines[i].box)), + ) + + # Pass 1: find rank anchors. + # An anchor is one or two stacked OCR lines whose combined text matches + # _RE_RANK_ONLY and normalises to a known pangkat. Two-line stacks must + # X-overlap so we don't accidentally merge cells from different columns. 
+ used: set[int] = set() + anchors: list[dict] = [] + for pos, idx in enumerate(indexed): + if idx in used: + continue + ln = ocr_lines[idx] + text = ln.text.strip() + + rank_text: str | None = None + member_idxs: list[int] = [idx] + + # Try two-line stack first (so PENATA TK I beats PENATA). + for j_pos in range(pos + 1, min(pos + 5, len(indexed))): + j_idx = indexed[j_pos] + if j_idx in used: + continue + other = ocr_lines[j_idx] + x_overlap = ( + min(_box_x_right(ln.box), _box_x_right(other.box)) + - max(_box_x_left(ln.box), _box_x_left(other.box)) + ) + if x_overlap <= 0: + continue + y_gap = _box_y_top(other.box) - _box_y_bottom(ln.box) + if y_gap > _box_height(ln.box) * 1.5: + break + combined = (text + " " + other.text.strip()).strip() + m2 = _RE_RANK_ONLY.match(combined) + if m2: + rank_text = m2.group("rank") + member_idxs.append(j_idx) + break + + if rank_text is None: + m1 = _RE_RANK_ONLY.match(text) + if m1: + rank_text = m1.group("rank") + + if rank_text is None: + continue + pangkat = normalize_pangkat(rank_text) + if not pangkat or not is_valid_pangkat(pangkat): + continue + + anchors.append( + { + "member_idxs": member_idxs, + "pangkat": pangkat, + "x_center": _box_x_center(ln.box), + "y_top": min(_box_y_top(ocr_lines[m].box) for m in member_idxs), + "y_bottom": max(_box_y_bottom(ocr_lines[m].box) for m in member_idxs), + } + ) + used.update(member_idxs) + + if not anchors: + return [] + + # Sort anchors by Y so we can compute row spans. + anchors.sort(key=lambda a: a["y_top"]) + + # Estimate PANGKAT column X-center as the median of rank anchor X-centers. + xs_sorted = sorted(a["x_center"] for a in anchors) + pangkat_x = xs_sorted[len(xs_sorted) // 2] + + # X tolerance: half the median rank-line width. Lines with x_center + # within ±tolerance of pangkat_x are *in* the PANGKAT column and + # excluded from both NAMA and JABATAN buckets. 
+ rank_widths = [ + _box_x_right(ocr_lines[a["member_idxs"][0]].box) + - _box_x_left(ocr_lines[a["member_idxs"][0]].box) + for a in anchors + ] + rank_widths.sort() + median_rank_width = rank_widths[len(rank_widths) // 2] if rank_widths else 50.0 + column_margin = max(median_rank_width * 0.5, 5.0) + + # Try to split the JABATAN side into STRUKTURAL (jabatan_dinas) and + # DALAM SPRIN (jabatan_sprint) by clustering jabatan-side X-centers. + # This is a 2-cluster k-means-style split: collect all X-centers of + # lines to the right of PANGKAT, find the largest X-gap among them, + # and use that gap as the column boundary. KET is typically the + # right-most narrow column we let bleed into jabatan_sprint since + # it's commonly empty. + jabatan_xs: list[float] = [] + for ln in ocr_lines: + x = _box_x_center(ln.box) + if x > pangkat_x + column_margin and ln.text.strip(): + jabatan_xs.append(x) + jabatan_split_x: float | None = None + if len(jabatan_xs) >= 4: + jabatan_xs.sort() + max_gap = 0.0 + max_gap_x: float | None = None + for k in range(1, len(jabatan_xs)): + gap = jabatan_xs[k] - jabatan_xs[k - 1] + if gap > max_gap: + max_gap = gap + max_gap_x = (jabatan_xs[k] + jabatan_xs[k - 1]) / 2 + # Only use the split if the gap is meaningfully larger than a + # within-column gap (heuristic: > 1.5× median rank width). + if max_gap_x is not None and max_gap > median_rank_width * 1.5: + jabatan_split_x = max_gap_x + + # Pre-compute each anchor's y_center for midpoint row dividers. + anchor_y_centers = [(a["y_top"] + a["y_bottom"]) / 2 for a in anchors] + + rows: list[PersonnelEntry] = [] + for i, anchor in enumerate(anchors): + # Row Y span: midpoint between this anchor and its neighbours. + # Using the midpoint (rather than the previous anchor's + # y_bottom) prevents row N's tail content (e.g. last name + # fragment "M.H.") from leaking into row N+1's nama bucket + # when rank lines don't extend to the full visual row height. 
+ y_lo = ( + (anchor_y_centers[i - 1] + anchor_y_centers[i]) / 2 + if i > 0 + else float("-inf") + ) + y_hi = ( + (anchor_y_centers[i] + anchor_y_centers[i + 1]) / 2 + if i + 1 < len(anchors) + else float("inf") + ) + + nama_pieces: list[tuple[float, str]] = [] + struktural_pieces: list[tuple[float, str]] = [] + sprint_pieces: list[tuple[float, str]] = [] + for j, ln in enumerate(ocr_lines): + if j in anchor["member_idxs"]: + continue + text = ln.text.strip() + if not text: + continue + x = _box_x_center(ln.box) + y = _box_y_center(ln.box) + if not (y_lo <= y <= y_hi): + continue + if x < pangkat_x - column_margin: + # NAMA side + if _RE_NAME_BLOCKLIST.match(text): + continue + if _RE_BARE_ROW_NUMBER.match(text): + continue + if not _is_plausible_name(text): + continue + nama_pieces.append((y, text)) + elif x > pangkat_x + column_margin: + # JABATAN side — split into STRUKTURAL vs DALAM SPRIN + # using the geometric column boundary detected above. + if _RE_NAME_BLOCKLIST.match(text): + continue + if jabatan_split_x is not None and x > jabatan_split_x: + sprint_pieces.append((y, text)) + else: + struktural_pieces.append((y, text)) + # else: in PANGKAT column or column margin — skip + + nama_pieces.sort(key=lambda p: p[0]) + struktural_pieces.sort(key=lambda p: p[0]) + sprint_pieces.sort(key=lambda p: p[0]) + + # Strip leading row number from the first nama piece (e.g. "1 F. GUNTUR" + # collapses to "F. GUNTUR" if the row marker happens to share a box). 
+ if nama_pieces: + head = _RE_LEADING_ROW_NUMBER.sub("", nama_pieces[0][1]).strip() + nama_pieces[0] = (nama_pieces[0][0], head) + + def _join(pieces: list[tuple[float, str]]) -> str | None: + text = " ".join(t for _, t in pieces if t).strip() + text = " ".join(text.split()) + return text or None + + rows.append( + PersonnelEntry( + no=None, + pangkat=anchor["pangkat"], + nrp=None, + nama=_join(nama_pieces), + jabatan_dinas=_join(struktural_pieces), + jabatan_sprint=_join(sprint_pieces), + keterangan=None, + ) + ) + return rows + + def is_low_quality(rows: list[PersonnelEntry]) -> bool: """Heuristic: did PP-Structure produce useless rows? diff --git a/src/ocr_sprint/pipeline/ocr.py b/src/ocr_sprint/pipeline/ocr.py index f5874de..a3d8775 100644 --- a/src/ocr_sprint/pipeline/ocr.py +++ b/src/ocr_sprint/pipeline/ocr.py @@ -36,6 +36,73 @@ class OCRLine: box: tuple[tuple[float, float], ...] # 4 (x, y) corner points +def _line_y_center(line: OCRLine) -> float: + return sum(p[1] for p in line.box) / len(line.box) + + +def _line_x_left(line: OCRLine) -> float: + return min(p[0] for p in line.box) + + +def _line_height(line: OCRLine) -> float: + ys = [p[1] for p in line.box] + return max(ys) - min(ys) + + +def sort_lines_by_layout(lines: list[OCRLine]) -> list[OCRLine]: + """Reorder lines into top-to-bottom, left-to-right reading order. + + PaddleOCR's natural output order reflects detection order, not visual + layout. On dense tables (e.g. Polda Kalbar Akpol-panitia sprint) this + interleaves rows and columns — Paddle may emit a row's KET column + before its NAMA column, breaking every downstream extractor that + assumes top-to-bottom row order. + + We rebuild reading order by: + + 1. Sorting by ``y_center``. + 2. Grouping consecutive lines into row-bands when their ``y_center`` + differs by less than half the median line height (so visually + same-row cells stay together even when their boxes don't perfectly + align). + 3. Sorting each band left-to-right by ``x_left``. 
+ """ + if not lines: + return [] + + heights = [_line_height(ln) for ln in lines if _line_height(ln) > 0] + if not heights: + return list(lines) + median_height = sorted(heights)[len(heights) // 2] + band_threshold = max(1.0, median_height * 0.5) + + by_y = sorted(lines, key=_line_y_center) + bands: list[list[OCRLine]] = [] + current_band: list[OCRLine] = [] + current_y: float | None = None + for ln in by_y: + y = _line_y_center(ln) + if current_y is None or abs(y - current_y) <= band_threshold: + current_band.append(ln) + # Track the band's running y-center as the mean of its + # members so a slowly-drifting set of cells doesn't split + # mid-row. + current_y = ( + sum(_line_y_center(b) for b in current_band) / len(current_band) + ) + else: + bands.append(current_band) + current_band = [ln] + current_y = y + if current_band: + bands.append(current_band) + + ordered: list[OCRLine] = [] + for band in bands: + ordered.extend(sorted(band, key=_line_x_left)) + return ordered + + @dataclass(frozen=True) class OCRPage: """OCR output for a single page.""" @@ -44,8 +111,8 @@ class OCRPage: @property def text(self) -> str: - """Reconstruct page text by concatenating lines (order = paddle's output order).""" - return "\n".join(line.text for line in self.lines) + """Reconstruct page text in visual reading order (top-to-bottom, left-to-right).""" + return "\n".join(line.text for line in sort_lines_by_layout(self.lines)) @property def mean_confidence(self) -> float: diff --git a/src/ocr_sprint/pipeline/orchestrator.py b/src/ocr_sprint/pipeline/orchestrator.py index e0a0625..3309ae8 100644 --- a/src/ocr_sprint/pipeline/orchestrator.py +++ b/src/ocr_sprint/pipeline/orchestrator.py @@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct from ocr_sprint.pipeline.extract.personnel import extract_personnel from ocr_sprint.pipeline.extract.personnel_text import ( + 
extract_personnel_from_ocr_lines, extract_personnel_from_text, is_low_quality, ) @@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput: # through the preferred path. if is_low_quality(personel): fallback_rows = extract_personnel_from_text(full_text) + # If text-based fallback produced rows but they all lack NRP + # (Pass 3 territory), retry with the column-aware extractor that + # uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar + # Akpol-panitia), text-only Pass 3 bleeds adjacent columns into + # nama/jabatan because lines are interleaved within each Y-band; + # the columnar variant restricts each field to its visual column. + text_only_no_nrp = bool(fallback_rows) and all( + r.nrp is None for r in fallback_rows + ) + if (not fallback_rows) or text_only_no_nrp: + ocr_lines = [ln for page in ocr_pages for ln in page.lines] + columnar_rows = extract_personnel_from_ocr_lines(ocr_lines) + if columnar_rows and ( + not fallback_rows or len(columnar_rows) >= len(fallback_rows) + ): + fallback_rows = columnar_rows if fallback_rows: personel = fallback_rows - table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK) + # Pass 3 / columnar emit rows with nrp=None for sprint + # templates without an NRP column. Surface that with a + # distinct flag so operators know to expect missing NRPs by + # design rather than by OCR failure. 
+ no_nrp = all(r.nrp is None for r in fallback_rows) + if no_nrp: + table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP) + else: + table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK) _logger.info( "pipeline.personnel_text_fallback", fallback_rows=len(fallback_rows), + no_nrp=no_nrp, ) untuk_items = find_untuk_list(full_text) diff --git a/src/ocr_sprint/pipeline/table.py b/src/ocr_sprint/pipeline/table.py index b93ccd1..135f0a2 100644 --- a/src/ocr_sprint/pipeline/table.py +++ b/src/ocr_sprint/pipeline/table.py @@ -71,11 +71,16 @@ def _build_pp_structure() -> PPStructure: from paddleocr import PPStructure s = get_settings() - _logger.info("pp_structure.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu) + # PPStructure layout models only support 'en' and 'ch', not 'latin'. + # Use 'en' for layout/table detection — it's language-agnostic (detects + # table structure, not text language). OCR within cells still works for + # Indonesian text because the recognition model handles Latin scripts. + pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang + _logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu) # layout=True so that PP-Structure also returns figure/text regions; we # filter to tables only afterwards. show_log=False to keep stdout clean. 
return PPStructure( - lang=s.ocr_lang, + lang=pp_lang, use_gpu=s.ocr_use_gpu, layout=True, show_log=False, diff --git a/src/ocr_sprint/schemas/document.py b/src/ocr_sprint/schemas/document.py index 3269539..12f7a69 100644 --- a/src/ocr_sprint/schemas/document.py +++ b/src/ocr_sprint/schemas/document.py @@ -10,6 +10,7 @@ from uuid import UUID, uuid4 from pydantic import BaseModel, ConfigDict, Field from ocr_sprint.schemas.extraction import ExtractionResult +from ocr_sprint.schemas.personnel import PersonnelEntry class SourceKind(str, Enum): @@ -52,7 +53,7 @@ class DocumentResponse(BaseModel): job_id: UUID status: DocumentStatus confidence: float | None = None - data: ExtractionResult | None = None + data: list[PersonnelEntry] | None = None review_flags: list[str] = Field(default_factory=list) error: str | None = None # Phase 6 — HITL review state. diff --git a/src/ocr_sprint/schemas/extraction.py b/src/ocr_sprint/schemas/extraction.py index 252d1db..d69fd86 100644 --- a/src/ocr_sprint/schemas/extraction.py +++ b/src/ocr_sprint/schemas/extraction.py @@ -22,6 +22,7 @@ class ReviewFlag(str, Enum): LLM_FALLBACK = "llm_fallback" LLM_UNAVAILABLE = "llm_unavailable" PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback" + PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp" INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row" diff --git a/tests/unit/test_ocr_layout.py b/tests/unit/test_ocr_layout.py new file mode 100644 index 0000000..80768ff --- /dev/null +++ b/tests/unit/test_ocr_layout.py @@ -0,0 +1,75 @@ +"""Tests for OCR layout reordering. + +PaddleOCR emits text boxes in detection order, not visual reading order. +On dense table layouts (Polda Kalbar Akpol-panitia regression) this +interleaves columns within a row and breaks every downstream extractor +that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds +reading order from the bounding-box geometry. 
+""" + +from __future__ import annotations + +from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout + + +def _box(x: float, y: float, w: float = 30, h: float = 15): + return ((x, y), (x + w, y), (x + w, y + h), (x, y + h)) + + +def _make(text: str, x: float, y: float) -> OCRLine: + return OCRLine(text=text, confidence=1.0, box=_box(x, y)) + + +class TestSortLinesByLayout: + def test_empty_returns_empty(self) -> None: + assert sort_lines_by_layout([]) == [] + + def test_already_sorted_is_stable(self) -> None: + lines = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)] + assert [ln.text for ln in sort_lines_by_layout(lines)] == ["A", "B", "C"] + + def test_reorders_column_first_detection_to_row_first(self) -> None: + # Simulate a 2-row, 3-col table where Paddle returned cells + # column-first instead of row-first. + lines = [ + _make("B1", 50, 10), + _make("B2", 50, 30), + _make("A1", 10, 10), + _make("A2", 10, 30), + _make("C1", 90, 10), + _make("C2", 90, 30), + ] + result = [ln.text for ln in sort_lines_by_layout(lines)] + assert result == ["A1", "B1", "C1", "A2", "B2", "C2"] + + def test_groups_slightly_misaligned_cells_into_one_band(self) -> None: + # Real OCR boxes for a single visual row are rarely perfectly + # y-aligned; we still want them grouped. + lines = [ + _make("LEFT", 10, 10), + _make("MID", 50, 12), # 2px below LEFT — same row visually + _make("RIGHT", 90, 11), + ] + result = [ln.text for ln in sort_lines_by_layout(lines)] + assert result == ["LEFT", "MID", "RIGHT"] + + def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None: + # Lines with a y gap larger than ~½ line-height must NOT collapse + # into the same band. 
+ lines = [ + _make("ROW1A", 10, 10), + _make("ROW1B", 50, 10), + _make("ROW2A", 10, 30), # gap of 20 vs height 15 → new band + _make("ROW2B", 50, 30), + ] + result = [ln.text for ln in sort_lines_by_layout(lines)] + assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"] + + def test_ocrpage_text_uses_sorted_order(self) -> None: + lines = [ + _make("RIGHT", 90, 10), + _make("LEFT", 10, 10), + _make("BOTTOM", 10, 30), + ] + page = OCRPage(lines=lines) + assert page.text == "LEFT\nRIGHT\nBOTTOM" diff --git a/tests/unit/test_personnel_text_fallback.py b/tests/unit/test_personnel_text_fallback.py index 884f99c..71546a7 100644 --- a/tests/unit/test_personnel_text_fallback.py +++ b/tests/unit/test_personnel_text_fallback.py @@ -8,11 +8,18 @@ recover at least the rank + NRP for every row. from __future__ import annotations from ocr_sprint.pipeline.extract.personnel_text import ( + extract_personnel_from_ocr_lines, extract_personnel_from_text, is_low_quality, ) +from ocr_sprint.pipeline.ocr import OCRLine from ocr_sprint.schemas.personnel import PersonnelEntry + +def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine: + box = ((x, y), (x + w, y), (x + w, y + h), (x, y + h)) + return OCRLine(text=text, confidence=1.0, box=box) + _CIMAHI_FIXTURE = """\ DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024 NO @@ -115,6 +122,86 @@ class TestExtractPersonnelFromText: names = [r.nama for r in rows] assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"] + def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None: + # Polres Banjar regression: when PaddleOCR merges several table + # rows onto a single OCR line, every rank+NRP pair on that line + # must still produce a separate row. Previously per-line + # ``re.search`` returned only the first match. + text = ( + "DAFTAR NAMA INSTRUKTUR\n" + "1 CUCU JUHANA, A.K.S. 
KOMPOL 70100418 KABAGOPS " + "INSTRUKTUR LAT PRA OPS " + "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 " + "KASAT RESKRIM SDA " + "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 " + "KASATINTELKAM POLRES BANJAR SDA\n" + ) + rows = extract_personnel_from_text(text) + assert len(rows) == 3 + assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"] + assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"] + assert rows[0].nama == "CUCU JUHANA, A.K.S." + assert rows[1].nama is not None and "HERU SAMSUL BAHRI" in rows[1].nama + assert rows[2].nama is not None and "YAYAN SOPIANA" in rows[2].nama + + def test_extracts_multiple_rows_when_split_across_lines(self) -> None: + # Variant of the squished case where OCR produces one line per + # table row. Each row still ends up with multiple rank+NRP pairs + # never being on the same line, but verifies the finditer-based + # path doesn't regress this layout. + text = ( + "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n" + "INSTRUKTUR LAT PRA OPS\n" + "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n" + "SDA\n" + "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n" + "POLRES BANJAR SDA\n" + ) + rows = extract_personnel_from_text(text) + assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"] + assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"] + assert rows[0].nama == "CUCU JUHANA, A.K.S." + + def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None: + # Polda Kalbar Akpol-panitia regression: sprint formats without + # an NRP column (panitia, undangan templates) must still extract + # rows via the rank-only Pass 3 path. Names span multiple OCR + # lines (narrow column), and the multi-token rank "KOMBES POL" + # is split across two lines. + text = ( + "DAFTAR NAMA PANITIA\n" + "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n" + "1\nF. 
GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n" + "KOMBES\nPOL\n" + "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n" + "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n" + "AKBP\n" + "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n" + "3\nPRAYITNO, S.H.,\nM.H.\n" + "KOMPOL\n" + "KASUBBAG DIAPERS\nANGGOTA\n" + ) + rows = extract_personnel_from_text(text) + assert len(rows) == 3 + assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "KOMPOL"] + # All Pass 3 rows have nrp=None by design. + assert all(r.nrp is None for r in rows) + assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H." + assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H." + assert rows[2].nama == "PRAYITNO, S.H., M.H." + assert rows[0].jabatan_dinas is not None and "KARO SDM" in rows[0].jabatan_dinas + + def test_pass3_does_not_run_when_pass1_succeeds(self) -> None: + # If a sprint has NRPs (Pass 1 succeeds), Pass 3 must not fire + # and produce duplicate/contaminating rows. + text = ( + "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n" + "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n" + ) + rows = extract_personnel_from_text(text) + assert len(rows) == 2 + assert all(r.nrp is not None for r in rows) + def test_still_blocks_bare_column_header_tokens(self) -> None: # Word-boundary fix must still reject the actual column-header # rows that motivated the blocklist in the first place. @@ -124,6 +211,94 @@ class TestExtractPersonnelFromText: assert rows[0].nama == "REAL NAME" +class TestExtractPersonnelFromOcrLines: + """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression. + + Verifies that bounding-box geometry preserves column boundaries on + dense tables where text-only Pass 3 bleeds adjacent columns into + nama/jabatan. + """ + + def _kalbar_lines(self) -> list[OCRLine]: + # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN + # X columns: 10, 100, 250, 380, 520. Each row may have multi-line cells. 
+ return [ + # Row 1 — KOMBES POL spans two stacked OCR boxes + _ocr_line("1", 10, 100), + _ocr_line("F. GUNTUR", 100, 100), + _ocr_line("SUNOTO, S.I.K.,", 100, 120), + _ocr_line("M.H.", 100, 140), + _ocr_line("KOMBES", 250, 100), + _ocr_line("POL", 250, 120), + _ocr_line("KARO SDM", 380, 100), + _ocr_line("POLDA KALBAR", 380, 120), + _ocr_line("KETUA", 520, 100), + _ocr_line("PELAKSANA", 520, 120), + # Row 2 + _ocr_line("2", 10, 200), + _ocr_line("JUDA TRISNO", 100, 200), + _ocr_line("TAMPUBOLON,", 100, 220), + _ocr_line("S.H., S.I.K., M.H.", 100, 240), + _ocr_line("AKBP", 250, 200), + _ocr_line("KABAGDALPERS", 380, 200), + _ocr_line("RO SDM", 380, 220), + _ocr_line("POLDA KALBAR", 380, 240), + _ocr_line("SEKRETARIS", 520, 200), + # Row 9 — PNS PENATA TK I (multi-token rank stacked) + _ocr_line("9", 10, 500), + _ocr_line("FITRIANSYAH,", 100, 500), + _ocr_line("S.E.", 100, 520), + _ocr_line("PENATA", 250, 500), + _ocr_line("TK I", 250, 520), + _ocr_line("KAURKEU", 380, 500), + _ocr_line("RO SDM", 380, 520), + _ocr_line("POLDA KALBAR", 380, 540), + _ocr_line("BENDAHARA", 520, 500), + ] + + def test_extracts_three_rows(self) -> None: + rows = extract_personnel_from_ocr_lines(self._kalbar_lines()) + assert len(rows) == 3 + assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "PENATA TK I"] + + def test_nama_is_assembled_only_from_nama_column(self) -> None: + # Each row's nama must contain *all* its multi-line fragments + # and *only* its multi-line fragments — no bleed from struktural. + rows = extract_personnel_from_ocr_lines(self._kalbar_lines()) + assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H." + assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H." + assert rows[2].nama == "FITRIANSYAH, S.E." + + def test_jabatan_split_into_struktural_and_sprint(self) -> None: + # The geometric column boundary must split STRUKTURAL (jabatan_dinas) + # from DALAM SPRIN (jabatan_sprint). 
+ rows = extract_personnel_from_ocr_lines(self._kalbar_lines()) + assert rows[0].jabatan_dinas == "KARO SDM POLDA KALBAR" + assert rows[0].jabatan_sprint == "KETUA PELAKSANA" + assert rows[1].jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR" + assert rows[1].jabatan_sprint == "SEKRETARIS" + + def test_returns_empty_when_no_rank_anchors(self) -> None: + lines = [ + _ocr_line("DAFTAR NAMA", 100, 50), + _ocr_line("HEADER", 100, 100), + ] + assert extract_personnel_from_ocr_lines(lines) == [] + + def test_returns_empty_for_empty_input(self) -> None: + assert extract_personnel_from_ocr_lines([]) == [] + + def test_no_row_bleed_between_consecutive_rows(self) -> None: + # Row 1's last name fragment ("F. GUNTUR") sits BELOW its rank + # line but inside row 1's visual span. It must NOT leak into + # row 2's nama, which should start with "JUDA TRISNO". + rows = extract_personnel_from_ocr_lines(self._kalbar_lines()) + assert rows[1].nama is not None + assert rows[1].nama.startswith("JUDA TRISNO") + assert "GUNTUR" not in rows[1].nama + assert "SUNOTO" not in rows[1].nama + + class TestIsLowQuality: def test_empty_list_is_low_quality(self) -> None: assert is_low_quality([]) is True diff --git a/update.ps1 b/update.ps1 new file mode 100644 index 0000000..7b3b432 --- /dev/null +++ b/update.ps1 @@ -0,0 +1,60 @@ +#!/usr/bin/env pwsh +# update.ps1 - One-command update & restart for ocr-sprint-service (local dev) + +$Port = 8000 + +# ── [1/5] Git pull ────────────────────────────────────────────────────────── +Write-Host "`n[1/5] Pulling latest code..." -ForegroundColor Cyan +git pull + +# ── [2/5] Install/update dependencies ─────────────────────────────────────── +Write-Host "`n[2/5] Installing/updating dependencies..." -ForegroundColor Cyan +pip install -e ".[dev]" -q + +# ── [3/5] Database migration ───────────────────────────────────────────────── +Write-Host "`n[3/5] Running database migrations..." 
-ForegroundColor Cyan +alembic upgrade head +if ($LASTEXITCODE -ne 0) { + Write-Host " Migration conflict detected, stamping current state as head..." -ForegroundColor Yellow + alembic stamp head + Write-Host " Retrying upgrade for any remaining new migrations..." -ForegroundColor Yellow + alembic upgrade head + if ($LASTEXITCODE -ne 0) { + Write-Host " Migration still failed. Please check alembic manually." -ForegroundColor Red + exit 1 + } +} +Write-Host " Migrations OK." -ForegroundColor Green + +# ── [4/5] Free up port ─────────────────────────────────────────────────────── +Write-Host "`n[4/5] Checking port $Port..." -ForegroundColor Cyan + +# Use Get-NetTCPConnection for reliable port detection on Windows +$connections = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue +if ($connections) { + foreach ($conn in $connections) { + $procId = $conn.OwningProcess + $procName = (Get-Process -Id $procId -ErrorAction SilentlyContinue).Name + Write-Host " Port $Port used by '$procName' (PID $procId), killing..." -ForegroundColor Yellow + Stop-Process -Id $procId -Force -ErrorAction SilentlyContinue + } + # Wait until port is actually released (max 5 seconds) + $waited = 0 + do { + Start-Sleep -Milliseconds 500 + $waited += 500 + $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue + } while ($still -and $waited -lt 5000) + + if ($still) { + Write-Host " Port $Port still in use after waiting. Try a different port or restart manually." -ForegroundColor Red + exit 1 + } + Write-Host " Port $Port freed." -ForegroundColor Green +} else { + Write-Host " Port $Port is free." -ForegroundColor Green +} + +# ── [5/5] Start dev server ─────────────────────────────────────────────────── +Write-Host "`n[5/5] Starting dev server on port $Port (Ctrl+C to stop)..." -ForegroundColor Cyan +uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port $Port