Compare commits
10 Commits
5ea45de5ea
...
b8a1198e93
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b8a1198e93 | ||
|
|
6d793758ff | ||
|
|
9d969e61fd | ||
|
|
5d9d9f784a | ||
|
|
002821ca07 | ||
|
|
dbcf480130 | ||
|
|
737f4999dd | ||
|
|
58a2bf2648 | ||
|
|
dce77e80e1 | ||
|
|
0755fbebda |
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams
Normal file
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams
Normal file
Binary file not shown.
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams.info
Normal file
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams.info
Normal file
Binary file not shown.
BIN
# leave empty to use PaddleOCR defaults/inference.pdmodel
Normal file
BIN
# leave empty to use PaddleOCR defaults/inference.pdmodel
Normal file
Binary file not shown.
18
.claude/settings.local.json
Normal file
18
.claude/settings.local.json
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(python -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
|
||||||
|
"Bash(python -c \"import sys; print\\(sys.executable\\)\")",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit -x -q)",
|
||||||
|
"Bash(git stash *)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_api.py::test_documents_sync_returns_pipeline_output -x -q)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit --ignore=tests/unit/test_api.py -q)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -c ' *)",
|
||||||
|
"Bash(xargs grep *)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit -q --ignore=tests/unit/test_api.py --ignore=tests/unit/test_api_hitl.py --ignore=tests/unit/test_blob_storage.py)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_ocr_layout.py tests/unit/test_personnel_text_fallback.py -q)",
|
||||||
|
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py tests/unit/test_ocr_layout.py -q)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -10,7 +10,8 @@ STORAGE_LOCAL_DIR=./storage
|
|||||||
# ==== OCR ====
|
# ==== OCR ====
|
||||||
OCR_LANG=latin # PaddleOCR lang code; "latin" works well for Bahasa Indonesia
|
OCR_LANG=latin # PaddleOCR lang code; "latin" works well for Bahasa Indonesia
|
||||||
OCR_USE_GPU=false # set true if running on a GPU host
|
OCR_USE_GPU=false # set true if running on a GPU host
|
||||||
OCR_DET_MODEL_DIR= # leave empty to use PaddleOCR defaults
|
# Leave empty to use PaddleOCR defaults.
|
||||||
|
OCR_DET_MODEL_DIR=
|
||||||
OCR_REC_MODEL_DIR=
|
OCR_REC_MODEL_DIR=
|
||||||
OCR_CLS_MODEL_DIR=
|
OCR_CLS_MODEL_DIR=
|
||||||
OCR_MAX_IMAGE_SIDE=2200 # downscale longest side before OCR
|
OCR_MAX_IMAGE_SIDE=2200 # downscale longest side before OCR
|
||||||
|
|||||||
13
Makefile
13
Makefile
@@ -1,9 +1,10 @@
|
|||||||
.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
|
.PHONY: help install dev update fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@echo "Targets:"
|
@echo "Targets:"
|
||||||
@echo " install - install runtime + dev deps in current env"
|
@echo " install - install runtime + dev deps in current env"
|
||||||
@echo " dev - run FastAPI app with autoreload"
|
@echo " dev - run FastAPI app with autoreload"
|
||||||
|
@echo " update - git pull + install deps + migrate db + run dev server"
|
||||||
@echo " fmt - format code with ruff"
|
@echo " fmt - format code with ruff"
|
||||||
@echo " lint - lint with ruff"
|
@echo " lint - lint with ruff"
|
||||||
@echo " typecheck - run mypy"
|
@echo " typecheck - run mypy"
|
||||||
@@ -21,6 +22,16 @@ install:
|
|||||||
dev:
|
dev:
|
||||||
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
|
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
|
||||||
|
|
||||||
|
update:
|
||||||
|
@echo "[1/4] Pulling latest code..."
|
||||||
|
git pull
|
||||||
|
@echo "[2/4] Installing/updating dependencies..."
|
||||||
|
pip install -e ".[dev]"
|
||||||
|
@echo "[3/4] Running database migrations..."
|
||||||
|
alembic upgrade head
|
||||||
|
@echo "[4/4] Starting dev server..."
|
||||||
|
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
|
||||||
|
|
||||||
fmt:
|
fmt:
|
||||||
ruff format src tests
|
ruff format src tests
|
||||||
ruff check --fix src tests
|
ruff check --fix src tests
|
||||||
|
|||||||
858
docs/DEPLOYMENT-EXISTING-STACK.md
Normal file
858
docs/DEPLOYMENT-EXISTING-STACK.md
Normal file
@@ -0,0 +1,858 @@
|
|||||||
|
# Deployment OCR Sprint Service (Existing Stack)
|
||||||
|
|
||||||
|
Panduan deployment untuk server dengan Python 3.12.3, PostgreSQL 16.13, dan Redis 7.0.15 yang sudah terinstall.
|
||||||
|
|
||||||
|
## Informasi Server Anda
|
||||||
|
|
||||||
|
- **OS**: Ubuntu 24.04
|
||||||
|
- **Python**: 3.12.3 ✅
|
||||||
|
- **PostgreSQL**: 16.13 ✅
|
||||||
|
- **Redis**: 7.0.15 ✅
|
||||||
|
|
||||||
|
Semua versi sudah kompatibel dan optimal untuk OCR Sprint Service!
|
||||||
|
|
||||||
|
## Langkah 1: Install System Libraries untuk OpenCV & PaddleOCR
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Update package list
|
||||||
|
sudo apt update
|
||||||
|
|
||||||
|
# Install libraries yang dibutuhkan oleh OpenCV dan PaddleOCR
|
||||||
|
sudo apt install -y \
|
||||||
|
libgl1 \
|
||||||
|
libglib2.0-0 \
|
||||||
|
libsm6 \
|
||||||
|
libxext6 \
|
||||||
|
libxrender1 \
|
||||||
|
libgomp1 \
|
||||||
|
libmagic1 \
|
||||||
|
python3.12-venv \
|
||||||
|
python3.12-dev \
|
||||||
|
build-essential \
|
||||||
|
git
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 2: Setup PostgreSQL Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Login ke PostgreSQL
|
||||||
|
sudo -u postgres psql
|
||||||
|
```
|
||||||
|
|
||||||
|
Jalankan SQL commands berikut:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Create user dan database
|
||||||
|
CREATE USER ocr WITH PASSWORD 'your-password-here';
|
||||||
|
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||||
|
|
||||||
|
-- Grant privileges
|
||||||
|
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||||
|
|
||||||
|
-- Connect ke database untuk grant schema privileges
|
||||||
|
\c ocr_sprint
|
||||||
|
|
||||||
|
-- Grant schema privileges (PostgreSQL 15+)
|
||||||
|
GRANT ALL ON SCHEMA public TO ocr;
|
||||||
|
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ocr;
|
||||||
|
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ocr;
|
||||||
|
|
||||||
|
-- Verify
|
||||||
|
\l ocr_sprint
|
||||||
|
\du ocr
|
||||||
|
|
||||||
|
-- Exit
|
||||||
|
\q
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate password yang aman:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate random password
|
||||||
|
openssl rand -base64 32
|
||||||
|
# Contoh output (generate sendiri, jangan gunakan ulang nilai ini): +J33GdYQcWcfqXs169cmgPrQJpLFgybjoedr/tNb0d4=
|
||||||
|
```
|
||||||
|
|
||||||
|
Simpan password ini; gunakan sebagai password user `ocr` di PostgreSQL (perintah `CREATE USER` di atas) dan di `DATABASE_URL` pada Langkah 6.
|
||||||
|
|
||||||
|
## Langkah 3: Verify Redis
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check Redis status
|
||||||
|
sudo systemctl status redis-server
|
||||||
|
|
||||||
|
# Test connection
|
||||||
|
redis-cli ping
|
||||||
|
# Expected output: PONG
|
||||||
|
|
||||||
|
# Check Redis config (opsional)
|
||||||
|
redis-cli CONFIG GET maxmemory
|
||||||
|
```
|
||||||
|
|
||||||
|
Jika Redis belum running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl enable redis-server
|
||||||
|
sudo systemctl start redis-server
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 4: Create Application User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create dedicated user untuk aplikasi
|
||||||
|
sudo useradd -m -s /bin/bash ocr
|
||||||
|
|
||||||
|
# Create application directory
|
||||||
|
sudo mkdir -p /opt/ocr-sprint-service
|
||||||
|
sudo chown ocr:ocr /opt/ocr-sprint-service
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 5: Clone dan Install Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Switch ke user ocr
|
||||||
|
sudo su - ocr
|
||||||
|
|
||||||
|
# Clone repository
|
||||||
|
cd /opt
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
|
||||||
|
# Create virtual environment dengan Python 3.12
|
||||||
|
python3.12 -m venv .venv
|
||||||
|
|
||||||
|
# Activate virtual environment
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Verify Python version di venv
|
||||||
|
python --version
|
||||||
|
# Expected: Python 3.12.3
|
||||||
|
|
||||||
|
# Upgrade pip
|
||||||
|
pip install --upgrade pip setuptools wheel
|
||||||
|
|
||||||
|
# Install application dengan OCR dependencies
|
||||||
|
# Ini akan download ~1.5GB PaddlePaddle wheels
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
|
||||||
|
# Verify installation
|
||||||
|
python -c "import paddleocr; print('PaddleOCR OK')"
|
||||||
|
python -c "import cv2; print('OpenCV OK')"
|
||||||
|
python -c "import fastapi; print('FastAPI OK')"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 6: Konfigurasi Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Masih sebagai user ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Copy environment template
|
||||||
|
cp .env.example .env
|
||||||
|
|
||||||
|
# Edit konfigurasi
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Konfigurasi `/opt/ocr-sprint-service/.env`:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# ==== App ====
|
||||||
|
APP_ENV=prod
|
||||||
|
APP_HOST=0.0.0.0
|
||||||
|
APP_PORT=8000
|
||||||
|
APP_LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# ==== Storage ====
|
||||||
|
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
|
||||||
|
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
|
||||||
|
BLOB_MAX_UPLOAD_MB=25
|
||||||
|
|
||||||
|
# ==== OCR ====
|
||||||
|
OCR_LANG=latin
|
||||||
|
OCR_USE_GPU=false
|
||||||
|
OCR_MAX_IMAGE_SIDE=2200
|
||||||
|
|
||||||
|
# ==== Preprocessing ====
|
||||||
|
PREPROCESS_TARGET_DPI=300
|
||||||
|
PREPROCESS_DENOISE=true
|
||||||
|
PREPROCESS_DESKEW=true
|
||||||
|
PREPROCESS_DETECT_DOCUMENT=true
|
||||||
|
PREPROCESS_REMOVE_SHADOW=true
|
||||||
|
PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
|
||||||
|
|
||||||
|
# ==== Table Extraction ====
|
||||||
|
TABLES_ENABLED=true
|
||||||
|
|
||||||
|
# ==== Confidence ====
|
||||||
|
CONFIDENCE_AUTO_APPROVE=0.95
|
||||||
|
CONFIDENCE_NEEDS_REVIEW=0.85
|
||||||
|
|
||||||
|
# ==== LLM (Phase 5, optional - disable untuk sekarang) ====
|
||||||
|
LLM_ENABLED=false
|
||||||
|
|
||||||
|
# ==== Async Pipeline ====
|
||||||
|
QUEUE_ENABLED=true
|
||||||
|
REDIS_URL=redis://localhost:6379/0
|
||||||
|
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
|
||||||
|
|
||||||
|
# ==== Database ====
|
||||||
|
# Ganti 'your-password-here' dengan password dari Langkah 2; karakter khusus wajib di-URL-encode (mis. @ → %40, / → %2F, + → %2B, = → %3D)
|
||||||
|
DATABASE_URL=postgresql+psycopg://ocr:your-password-here@localhost:5432/ocr_sprint
|
||||||
|
DATABASE_ECHO=false
|
||||||
|
|
||||||
|
# ==== Auth (WAJIB untuk production!) ====
|
||||||
|
# Generate dengan: openssl rand -hex 32
|
||||||
|
API_KEYS=paste-api-key-1-here,paste-api-key-2-here
|
||||||
|
API_KEY_HEADER=X-API-Key
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate API keys:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate 2 API keys
|
||||||
|
echo "API Key 1: $(openssl rand -hex 32)"
|
||||||
|
echo "API Key 2: $(openssl rand -hex 32)"
|
||||||
|
```
|
||||||
|
|
||||||
|
Copy output dan paste ke `API_KEYS` di file `.env`.
|
||||||
|
|
||||||
|
**Create storage directories:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p /opt/ocr-sprint-service/storage/blobs
|
||||||
|
chmod 755 /opt/ocr-sprint-service/storage
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 7: Run Database Migrations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Masih sebagai user ocr, dengan venv activated
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Run migrations
|
||||||
|
alembic upgrade head
|
||||||
|
|
||||||
|
# Verify - should show current revision
|
||||||
|
alembic current
|
||||||
|
|
||||||
|
# Expected output: (head) atau revision number
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 8: Test Manual Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Masih sebagai user ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Test API server
|
||||||
|
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Di terminal lain (sebagai user ubuntu):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test health check
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
|
||||||
|
# Expected: {"status":"ok","version":"0.1.0"}
|
||||||
|
|
||||||
|
# Test dengan sample file (jika ada)
|
||||||
|
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||||
|
-H "X-API-Key: your-api-key-here" \
|
||||||
|
-F "file=@/path/to/test.pdf"
|
||||||
|
```
|
||||||
|
|
||||||
|
Jika berhasil, stop server dengan `Ctrl+C`.
|
||||||
|
|
||||||
|
## Langkah 9: Setup Systemd Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Exit dari user ocr
|
||||||
|
exit
|
||||||
|
|
||||||
|
# Kembali sebagai user ubuntu dengan sudo
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create API Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-api.service
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint API Service
|
||||||
|
After=network.target postgresql.service redis-server.service
|
||||||
|
Wants=postgresql.service redis-server.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
Group=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||||
|
|
||||||
|
# Start command - 4 workers untuk production
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
|
||||||
|
ocr_sprint.main:app \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000 \
|
||||||
|
--workers 4 \
|
||||||
|
--log-level info
|
||||||
|
|
||||||
|
# Restart policy
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
StartLimitInterval=0
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
LimitNOFILE=65536
|
||||||
|
|
||||||
|
# Security
|
||||||
|
NoNewPrivileges=true
|
||||||
|
PrivateTmp=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Celery Worker Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint Celery Worker
|
||||||
|
After=network.target postgresql.service redis-server.service ocr-sprint-api.service
|
||||||
|
Wants=postgresql.service redis-server.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
Group=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||||
|
|
||||||
|
# Start command - concurrency 2 untuk CPU dengan 4 cores
|
||||||
|
# Sesuaikan dengan jumlah CPU cores server Anda
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
|
||||||
|
-A ocr_sprint.worker.celery_app \
|
||||||
|
worker \
|
||||||
|
--loglevel=info \
|
||||||
|
--concurrency=2 \
|
||||||
|
--max-tasks-per-child=100
|
||||||
|
|
||||||
|
# Restart policy
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
StartLimitInterval=0
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
LimitNOFILE=65536
|
||||||
|
|
||||||
|
# Security
|
||||||
|
NoNewPrivileges=true
|
||||||
|
PrivateTmp=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enable dan Start Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Reload systemd
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
|
||||||
|
# Enable services (auto-start on boot)
|
||||||
|
sudo systemctl enable ocr-sprint-api
|
||||||
|
sudo systemctl enable ocr-sprint-worker
|
||||||
|
|
||||||
|
# Start services
|
||||||
|
sudo systemctl start ocr-sprint-api
|
||||||
|
sudo systemctl start ocr-sprint-worker
|
||||||
|
|
||||||
|
# Check status
|
||||||
|
sudo systemctl status ocr-sprint-api
|
||||||
|
sudo systemctl status ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected output:** `active (running)` dengan warna hijau.
|
||||||
|
|
||||||
|
### View Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# API logs (real-time)
|
||||||
|
sudo journalctl -u ocr-sprint-api -f
|
||||||
|
|
||||||
|
# Worker logs (real-time)
|
||||||
|
sudo journalctl -u ocr-sprint-worker -f
|
||||||
|
|
||||||
|
# Last 50 lines
|
||||||
|
sudo journalctl -u ocr-sprint-api -n 50
|
||||||
|
sudo journalctl -u ocr-sprint-worker -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 10: Install dan Setup Nginx
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install Nginx dan Certbot
|
||||||
|
sudo apt install -y nginx certbot python3-certbot-nginx
|
||||||
|
|
||||||
|
# Check Nginx status
|
||||||
|
sudo systemctl status nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Nginx Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/nginx/sites-available/ocr-sprint
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content (ganti `ocr.yourdomain.com` dengan domain Anda):**
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
# Upstream
|
||||||
|
upstream ocr_api {
|
||||||
|
server 127.0.0.1:8000;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Rate limiting
|
||||||
|
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name ocr.yourdomain.com;
|
||||||
|
|
||||||
|
# Max upload size
|
||||||
|
client_max_body_size 30M;
|
||||||
|
client_body_buffer_size 128k;
|
||||||
|
|
||||||
|
# Timeouts
|
||||||
|
proxy_connect_timeout 300s;
|
||||||
|
proxy_send_timeout 300s;
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
send_timeout 300s;
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
access_log /var/log/nginx/ocr-sprint-access.log;
|
||||||
|
error_log /var/log/nginx/ocr-sprint-error.log;
|
||||||
|
|
||||||
|
# API endpoints
|
||||||
|
location /api/ {
|
||||||
|
limit_req zone=api_limit burst=20 nodelay;
|
||||||
|
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
proxy_set_header Connection "";
|
||||||
|
|
||||||
|
proxy_buffering off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Health check
|
||||||
|
location /api/v1/health {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
access_log off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Metrics (restrict access)
|
||||||
|
location /metrics {
|
||||||
|
allow 127.0.0.1;
|
||||||
|
allow 10.0.0.0/8;
|
||||||
|
deny all;
|
||||||
|
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
|
||||||
|
# API docs
|
||||||
|
location /docs {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
|
||||||
|
location /redoc {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enable Site
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test konfigurasi
|
||||||
|
sudo nginx -t
|
||||||
|
|
||||||
|
# Enable site
|
||||||
|
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||||
|
|
||||||
|
# Reload Nginx
|
||||||
|
sudo systemctl reload nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setup SSL (jika punya domain)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Obtain certificate
|
||||||
|
sudo certbot --nginx -d ocr.yourdomain.com
|
||||||
|
|
||||||
|
# Test auto-renewal
|
||||||
|
sudo certbot renew --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 11: Setup Firewall
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check UFW status
|
||||||
|
sudo ufw status
|
||||||
|
|
||||||
|
# Allow SSH (PENTING!)
|
||||||
|
sudo ufw allow 22/tcp
|
||||||
|
|
||||||
|
# Allow HTTP dan HTTPS
|
||||||
|
sudo ufw allow 80/tcp
|
||||||
|
sudo ufw allow 443/tcp
|
||||||
|
|
||||||
|
# Enable firewall (jika belum)
|
||||||
|
sudo ufw enable
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
sudo ufw status numbered
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 12: Verifikasi Final
|
||||||
|
|
||||||
|
### Test dari Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Health check
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
|
||||||
|
# Test async endpoint
|
||||||
|
curl -X POST http://localhost:8000/api/v1/documents \
|
||||||
|
-H "X-API-Key: your-api-key-here" \
|
||||||
|
-F "file=@/path/to/test.pdf"
|
||||||
|
|
||||||
|
# Expected: {"job_id":"...","status":"pending",...}
|
||||||
|
|
||||||
|
# Check job status
|
||||||
|
curl -H "X-API-Key: your-api-key-here" \
|
||||||
|
http://localhost:8000/api/v1/documents/JOB_ID_HERE
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test via Domain (jika sudah setup SSL)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl https://ocr.yourdomain.com/api/v1/health
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# All services should be active
|
||||||
|
sudo systemctl status ocr-sprint-api
|
||||||
|
sudo systemctl status ocr-sprint-worker
|
||||||
|
sudo systemctl status postgresql
|
||||||
|
sudo systemctl status redis-server
|
||||||
|
sudo systemctl status nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
### View Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# API logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -f
|
||||||
|
|
||||||
|
# Worker logs
|
||||||
|
sudo journalctl -u ocr-sprint-worker -f
|
||||||
|
|
||||||
|
# Nginx access logs
|
||||||
|
sudo tail -f /var/log/nginx/ocr-sprint-access.log
|
||||||
|
|
||||||
|
# Nginx error logs
|
||||||
|
sudo tail -f /var/log/nginx/ocr-sprint-error.log
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prometheus Metrics
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# View metrics
|
||||||
|
curl http://localhost:8000/metrics
|
||||||
|
|
||||||
|
# Key metrics:
|
||||||
|
# - ocr_documents_total
|
||||||
|
# - ocr_processing_duration_seconds
|
||||||
|
# - ocr_confidence_score
|
||||||
|
```
|
||||||
|
|
||||||
|
## Maintenance
|
||||||
|
|
||||||
|
### Restart Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl restart ocr-sprint-api
|
||||||
|
sudo systemctl restart ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
### Update Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Switch ke user ocr
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Pull latest code
|
||||||
|
git pull
|
||||||
|
|
||||||
|
# Activate venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Update dependencies
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
|
||||||
|
# Run migrations
|
||||||
|
alembic upgrade head
|
||||||
|
|
||||||
|
# Exit
|
||||||
|
exit
|
||||||
|
|
||||||
|
# Restart services
|
||||||
|
sudo systemctl restart ocr-sprint-api
|
||||||
|
sudo systemctl restart ocr-sprint-worker
|
||||||
|
|
||||||
|
# Check logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Database Backup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create backup directory
|
||||||
|
sudo mkdir -p /opt/ocr-sprint-service/backups
|
||||||
|
sudo chown ocr:ocr /opt/ocr-sprint-service/backups
|
||||||
|
|
||||||
|
# Manual backup
|
||||||
|
sudo -u ocr sh -c 'pg_dump -h localhost -U ocr ocr_sprint | gzip > /opt/ocr-sprint-service/backups/backup_$(date +%Y%m%d_%H%M%S).sql.gz'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Setup automated backup:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create backup script
|
||||||
|
sudo nano /opt/ocr-sprint-service/backup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
BACKUP_DIR="/opt/ocr-sprint-service/backups"
|
||||||
|
DATE=$(date +%Y%m%d_%H%M%S)
|
||||||
|
|
||||||
|
mkdir -p $BACKUP_DIR
|
||||||
|
|
||||||
|
# Backup database
|
||||||
|
PGPASSWORD='your-db-password' pg_dump -h localhost -U ocr ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz
|
||||||
|
|
||||||
|
# Keep only last 7 days
|
||||||
|
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
|
||||||
|
|
||||||
|
echo "Backup completed: $DATE"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Make executable
|
||||||
|
sudo chmod +x /opt/ocr-sprint-service/backup.sh
|
||||||
|
sudo chown ocr:ocr /opt/ocr-sprint-service/backup.sh
|
||||||
|
|
||||||
|
# Setup cron (daily at 2 AM)
|
||||||
|
sudo crontab -e -u ocr
|
||||||
|
|
||||||
|
# Add line:
|
||||||
|
0 2 * * * /opt/ocr-sprint-service/backup.sh >> /opt/ocr-sprint-service/backups/backup.log 2>&1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Service tidak start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check detailed logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -n 100 --no-pager
|
||||||
|
sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
|
||||||
|
|
||||||
|
# Check file permissions
|
||||||
|
ls -la /opt/ocr-sprint-service
|
||||||
|
ls -la /opt/ocr-sprint-service/storage
|
||||||
|
|
||||||
|
# Test manual run
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Database connection error
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test connection
|
||||||
|
sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
|
||||||
|
|
||||||
|
# Check PostgreSQL status
|
||||||
|
sudo systemctl status postgresql
|
||||||
|
|
||||||
|
# Check PostgreSQL logs
|
||||||
|
sudo journalctl -u postgresql -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Redis connection error
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test Redis
|
||||||
|
redis-cli ping
|
||||||
|
|
||||||
|
# Check Redis status
|
||||||
|
sudo systemctl status redis-server
|
||||||
|
|
||||||
|
# Check Redis logs
|
||||||
|
sudo journalctl -u redis-server -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Worker tidak memproses jobs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check Celery worker status
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
celery -A ocr_sprint.worker.celery_app inspect active
|
||||||
|
celery -A ocr_sprint.worker.celery_app inspect stats
|
||||||
|
|
||||||
|
# Check Redis queue
|
||||||
|
redis-cli LLEN ocr_sprint
|
||||||
|
```
|
||||||
|
|
||||||
|
### PaddleOCR error
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Re-download models
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
python << EOF
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
ocr = PaddleOCR(use_angle_cls=True, lang='latin')
|
||||||
|
print("Models downloaded successfully")
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Tuning
|
||||||
|
|
||||||
|
### Check CPU cores
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nproc
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adjust worker concurrency
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Edit worker service
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||||
|
|
||||||
|
# Untuk 4 cores: --concurrency=2
|
||||||
|
# Untuk 8 cores: --concurrency=4
|
||||||
|
# Untuk 16 cores: --concurrency=8
|
||||||
|
|
||||||
|
# Reload dan restart
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl restart ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
### PostgreSQL 16 Tuning
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/postgresql/16/main/postgresql.conf
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recommended settings (sesuaikan dengan RAM server):**
|
||||||
|
|
||||||
|
```
|
||||||
|
# Untuk 8GB RAM:
|
||||||
|
shared_buffers = 2GB
|
||||||
|
effective_cache_size = 6GB
|
||||||
|
maintenance_work_mem = 512MB
|
||||||
|
work_mem = 8MB
|
||||||
|
|
||||||
|
# Untuk 16GB RAM:
|
||||||
|
shared_buffers = 4GB
|
||||||
|
effective_cache_size = 12GB
|
||||||
|
maintenance_work_mem = 1GB
|
||||||
|
work_mem = 10MB
|
||||||
|
|
||||||
|
# General
|
||||||
|
checkpoint_completion_target = 0.9
|
||||||
|
wal_buffers = 16MB
|
||||||
|
default_statistics_target = 100
|
||||||
|
random_page_cost = 1.1
|
||||||
|
effective_io_concurrency = 200
|
||||||
|
max_worker_processes = 4
|
||||||
|
max_parallel_workers_per_gather = 2
|
||||||
|
max_parallel_workers = 4
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl restart postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Checklist
|
||||||
|
|
||||||
|
- [ ] API keys set dengan nilai random yang kuat
|
||||||
|
- [ ] Database password diganti dari default
|
||||||
|
- [ ] Firewall enabled (UFW)
|
||||||
|
- [ ] SSL/TLS enabled (jika punya domain)
|
||||||
|
- [ ] `/metrics` endpoint restricted
|
||||||
|
- [ ] PostgreSQL hanya listen di localhost
|
||||||
|
- [ ] Redis hanya listen di localhost
|
||||||
|
- [ ] Backup automated (cron job)
|
||||||
|
- [ ] OS security updates enabled
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Setup monitoring** - Install Prometheus + Grafana (opsional)
|
||||||
|
2. **Setup alerting** - Email/Slack notification untuk errors
|
||||||
|
3. **Load testing** - Test dengan volume dokumen production
|
||||||
|
4. **Backup verification** - Test restore dari backup
|
||||||
|
5. **Documentation** - Dokumentasi API keys untuk tim
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
Untuk pertanyaan atau issues, hubungi tim development.
|
||||||
571
docs/DEPLOYMENT-GUIDE.md
Normal file
571
docs/DEPLOYMENT-GUIDE.md
Normal file
@@ -0,0 +1,571 @@
|
|||||||
|
# Panduan Deployment OCR Sprint Service
|
||||||
|
|
||||||
|
> Dokumen ini adalah panduan langkah demi langkah untuk deployment **ocr-sprint-service** ke server production. Disusun berdasarkan kondisi kode aktual per April 2026 (Phase 1–4 selesai).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Daftar Isi
|
||||||
|
|
||||||
|
1. [Gambaran Arsitektur](#1-gambaran-arsitektur)
|
||||||
|
2. [Prasyarat Server](#2-prasyarat-server)
|
||||||
|
3. [Opsi A — Docker Compose (Recommended)](#3-opsi-a--docker-compose-recommended)
|
||||||
|
4. [Opsi B — Manual (Tanpa Docker)](#4-opsi-b--manual-tanpa-docker)
|
||||||
|
5. [Konfigurasi Environment Production](#5-konfigurasi-environment-production)
|
||||||
|
6. [Reverse Proxy & SSL (Nginx)](#6-reverse-proxy--ssl-nginx)
|
||||||
|
7. [Firewall](#7-firewall)
|
||||||
|
8. [Verifikasi Deployment](#8-verifikasi-deployment)
|
||||||
|
9. [Monitoring & Maintenance](#9-monitoring--maintenance)
|
||||||
|
10. [Troubleshooting](#10-troubleshooting)
|
||||||
|
11. [Security Checklist](#11-security-checklist)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Gambaran Arsitektur
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────┐ ┌──────────────┐ ┌───────┐
|
||||||
|
│ Client │────▶│ Nginx (SSL) │────▶│ API │──▶ PaddleOCR
|
||||||
|
└──────────┘ └──────────────┘ │ :8000 │ Pipeline
|
||||||
|
└───┬───┘
|
||||||
|
│ async job
|
||||||
|
┌─────▼─────┐
|
||||||
|
│ Redis │
|
||||||
|
│ :6379 │
|
||||||
|
└─────┬─────┘
|
||||||
|
┌─────▼──────┐
|
||||||
|
│ Worker │──▶ PaddleOCR
|
||||||
|
│ (Celery) │ Pipeline
|
||||||
|
└─────┬──────┘
|
||||||
|
┌─────▼──────┐
|
||||||
|
│ PostgreSQL │
|
||||||
|
│ :5432 │
|
||||||
|
└────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**4 services** yang harus berjalan:
|
||||||
|
|
||||||
|
| Service | Fungsi |
|
||||||
|
|---------|--------|
|
||||||
|
| **API** (FastAPI + Uvicorn) | Menerima upload dokumen, serve hasil OCR |
|
||||||
|
| **Worker** (Celery) | Async OCR processing di background |
|
||||||
|
| **Redis** | Message broker untuk job queue |
|
||||||
|
| **PostgreSQL** | Menyimpan job state & hasil ekstraksi |
|
||||||
|
|
||||||
|
Blob storage menggunakan **local filesystem** (belum S3/MinIO).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Prasyarat Server
|
||||||
|
|
||||||
|
### Spesifikasi Minimum
|
||||||
|
|
||||||
|
| Resource | Minimum | Recommended |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| OS | Ubuntu 20.04+ / Debian 11+ | Ubuntu 22.04+ |
|
||||||
|
| CPU | 4 cores | 8 cores |
|
||||||
|
| RAM | 8 GB | 16 GB |
|
||||||
|
| Storage | 50 GB free | 100 GB free |
|
||||||
|
| Python | 3.10–3.12 | 3.11 atau 3.12 |
|
||||||
|
| Network | Port 8000 (internal) | + Port 80/443 (Nginx) |
|
||||||
|
|
||||||
|
### Kebutuhan Disk
|
||||||
|
|
||||||
|
- ~1.5 GB — PaddlePaddle wheels
|
||||||
|
- ~200 MB — PaddleOCR model downloads (otomatis saat pertama jalan)
|
||||||
|
- Sisanya — blob storage dokumen yang diupload
|
||||||
|
|
||||||
|
### Software yang Dibutuhkan
|
||||||
|
|
||||||
|
- **Docker Compose** — untuk Opsi A
|
||||||
|
- **Python 3.10–3.12 + PostgreSQL + Redis** — untuk Opsi B
|
||||||
|
- **Git** — kedua opsi
|
||||||
|
- **Nginx** (opsional) — reverse proxy + SSL
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Opsi A — Docker Compose (Recommended)
|
||||||
|
|
||||||
|
> Cara paling cepat. Semua service (API, Worker, Redis, Postgres) berjalan dalam container.
|
||||||
|
|
||||||
|
### 3.1 Login & Clone
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh user@your-server.com
|
||||||
|
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.2 Konfigurasi .env
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
Lihat [Bagian 5](#5-konfigurasi-environment-production) untuk detail konfigurasi production.
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
> Untuk Docker Compose, **jangan ubah** `DATABASE_URL` dan `REDIS_URL` — sudah dioverride oleh `docker-compose.yml` via environment variables di masing-masing container.
|
||||||
|
|
||||||
|
### 3.3 Build & Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build image (~5–10 menit pertama kali)
|
||||||
|
docker compose build
|
||||||
|
|
||||||
|
# Start semua services
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# Cek logs
|
||||||
|
docker compose logs -f api worker
|
||||||
|
```
|
||||||
|
|
||||||
|
Container `api` akan otomatis menjalankan `alembic upgrade head` sebelum start server (lihat `command` di `docker-compose.yml`).
|
||||||
|
|
||||||
|
### 3.4 First-Run Model Download
|
||||||
|
|
||||||
|
Request pertama akan trigger download model PaddleOCR (~200 MB) ke Docker volume `paddle-models`. Tunggu hingga selesai sebelum test.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Monitor download di logs
|
||||||
|
docker compose logs -f api
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.5 Verifikasi
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
# Expected: {"status":"ok","version":"0.1.0"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3.6 Update Service (Setelah Ada Perubahan Kode)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd ocr-sprint-service
|
||||||
|
git pull
|
||||||
|
docker compose build
|
||||||
|
docker compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Opsi B — Manual (Tanpa Docker)
|
||||||
|
|
||||||
|
> Untuk server yang sudah punya Python, PostgreSQL, dan Redis terinstall.
|
||||||
|
|
||||||
|
### 4.1 Install System Libraries
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt update && sudo apt upgrade -y
|
||||||
|
|
||||||
|
# Libraries untuk OpenCV & PaddleOCR
|
||||||
|
sudo apt install -y \
|
||||||
|
python3.11 python3.11-venv python3.11-dev \
|
||||||
|
libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
|
||||||
|
libgomp1 libmagic1 \
|
||||||
|
build-essential git curl
|
||||||
|
|
||||||
|
# Install Redis & PostgreSQL (jika belum ada)
|
||||||
|
sudo apt install -y redis-server postgresql postgresql-contrib
|
||||||
|
sudo systemctl enable --now redis-server postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Jika server sudah punya Python 3.12, gunakan `python3.12` di semua perintah selanjutnya.
|
||||||
|
|
||||||
|
### 4.2 Setup Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo -u postgres psql
|
||||||
|
```
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE USER ocr WITH PASSWORD 'ganti-password-kuat';
|
||||||
|
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||||
|
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||||
|
\c ocr_sprint
|
||||||
|
GRANT ALL ON SCHEMA public TO ocr;
|
||||||
|
\q
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Create Application User & Directory
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo useradd -m -s /bin/bash ocr
|
||||||
|
sudo mkdir -p /opt/ocr-sprint-service
|
||||||
|
sudo chown ocr:ocr /opt/ocr-sprint-service
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.4 Clone & Install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
|
||||||
|
# Create virtual environment
|
||||||
|
python3.11 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Install dependencies + OCR runtime (~1.5 GB download)
|
||||||
|
pip install --upgrade pip setuptools wheel
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
python -c "import paddleocr; print('PaddleOCR OK')"
|
||||||
|
python -c "import fastapi; print('FastAPI OK')"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.5 Konfigurasi .env
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Wajib diubah untuk manual deployment:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
APP_ENV=prod
|
||||||
|
DATABASE_URL=postgresql+psycopg://ocr:ganti-password-kuat@localhost:5432/ocr_sprint
|
||||||
|
REDIS_URL=redis://localhost:6379/0
|
||||||
|
QUEUE_ENABLED=true
|
||||||
|
API_KEYS=your-generated-api-key
|
||||||
|
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
|
||||||
|
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create storage directories
|
||||||
|
mkdir -p /opt/ocr-sprint-service/storage/blobs
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.6 Run Database Migrations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate
|
||||||
|
alembic upgrade head
|
||||||
|
alembic current # verify
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.7 Test Manual
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||||
|
# Di terminal lain: curl http://localhost:8000/api/v1/health
|
||||||
|
# Ctrl+C untuk stop
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.8 Setup Systemd Services
|
||||||
|
|
||||||
|
**API Service** — `/etc/systemd/system/ocr-sprint-api.service`:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint API Service
|
||||||
|
After=network.target postgresql.service redis-server.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
Group=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
|
||||||
|
ocr_sprint.main:app \
|
||||||
|
--host 0.0.0.0 --port 8000 --workers 4 --log-level info
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
LimitNOFILE=65536
|
||||||
|
NoNewPrivileges=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
**Worker Service** — `/etc/systemd/system/ocr-sprint-worker.service`:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint Celery Worker
|
||||||
|
After=network.target postgresql.service redis-server.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
Group=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
|
||||||
|
-A ocr_sprint.worker.celery_app worker \
|
||||||
|
--loglevel=info --concurrency=2 --max-tasks-per-child=100
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
LimitNOFILE=65536
|
||||||
|
NoNewPrivileges=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
**Enable & Start:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Keluar dari user ocr dulu
|
||||||
|
exit
|
||||||
|
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable --now ocr-sprint-api ocr-sprint-worker
|
||||||
|
sudo systemctl status ocr-sprint-api ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.9 Update Service (Manual)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
git pull
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
alembic upgrade head
|
||||||
|
exit
|
||||||
|
|
||||||
|
sudo systemctl restart ocr-sprint-api ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Konfigurasi Environment Production
|
||||||
|
|
||||||
|
Berikut konfigurasi `.env` yang **wajib diubah** dari default untuk production:
|
||||||
|
|
||||||
|
| Variable | Default | Production | Keterangan |
|
||||||
|
|----------|---------|------------|------------|
|
||||||
|
| `APP_ENV` | `local` | `prod` | Mode environment |
|
||||||
|
| `API_KEYS` | *(kosong)* | `key1,key2` | **WAJIB!** Auth disabled jika kosong |
|
||||||
|
| `QUEUE_ENABLED` | `false` | `true` | Aktifkan async processing |
|
||||||
|
| `DATABASE_URL` | `sqlite:///...` | `postgresql+psycopg://...` | Docker: otomatis di-override |
|
||||||
|
| `REDIS_URL` | `redis://localhost:6379/0` | Sesuaikan | Docker: otomatis di-override |
|
||||||
|
| `OCR_USE_GPU` | `false` | `true` jika ada GPU | Mode GPU butuh NVIDIA driver |
|
||||||
|
| `TABLES_ENABLED` | `true` | `true` | Ekstraksi tabel personel |
|
||||||
|
|
||||||
|
**Generate API Key:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
openssl rand -hex 32
|
||||||
|
```
|
||||||
|
|
||||||
|
> [!WARNING]
|
||||||
|
> Jangan pernah deploy ke production tanpa mengisi `API_KEYS`. Jika kosong, semua endpoint terbuka tanpa autentikasi.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Reverse Proxy & SSL (Nginx)
|
||||||
|
|
||||||
|
### Install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt install -y nginx certbot python3-certbot-nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
### Konfigurasi — `/etc/nginx/sites-available/ocr-sprint`
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
upstream ocr_api {
|
||||||
|
server 127.0.0.1:8000;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name ocr.yourdomain.com;
|
||||||
|
|
||||||
|
client_max_body_size 30M;
|
||||||
|
|
||||||
|
proxy_connect_timeout 300s;
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
}
|
||||||
|
|
||||||
|
location /metrics {
|
||||||
|
allow 127.0.0.1;
|
||||||
|
allow 10.0.0.0/8;
|
||||||
|
deny all;
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enable & SSL
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||||
|
sudo nginx -t
|
||||||
|
sudo systemctl reload nginx
|
||||||
|
|
||||||
|
# SSL
|
||||||
|
sudo certbot --nginx -d ocr.yourdomain.com
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Firewall
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo ufw allow 22/tcp # SSH — PENTING!
|
||||||
|
sudo ufw allow 80/tcp # HTTP
|
||||||
|
sudo ufw allow 443/tcp # HTTPS
|
||||||
|
sudo ufw enable
|
||||||
|
sudo ufw status
|
||||||
|
```
|
||||||
|
|
||||||
|
> [!CAUTION]
|
||||||
|
> Pastikan SSH (port 22) di-allow **sebelum** enable firewall, agar tidak terkunci dari server.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Verifikasi Deployment
|
||||||
|
|
||||||
|
### Health Check
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
# {"status":"ok","version":"0.1.0"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test OCR (Sync)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||||
|
-H "X-API-Key: your-api-key" \
|
||||||
|
-F "file=@/path/to/test.pdf" | jq
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test OCR (Async — Production Flow)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Submit job
|
||||||
|
curl -X POST http://localhost:8000/api/v1/documents \
|
||||||
|
-H "X-API-Key: your-api-key" \
|
||||||
|
-F "file=@document.pdf" | jq
|
||||||
|
# → {"job_id":"8f2a...","status":"pending",...}
|
||||||
|
|
||||||
|
# Poll result
|
||||||
|
curl -H "X-API-Key: your-api-key" \
|
||||||
|
http://localhost:8000/api/v1/documents/8f2a... | jq
|
||||||
|
# → {"status":"completed","confidence":0.93,"data":{...}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cek Semua Service Berjalan
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker
|
||||||
|
docker compose ps
|
||||||
|
|
||||||
|
# Manual
|
||||||
|
sudo systemctl status ocr-sprint-api ocr-sprint-worker postgresql redis-server nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Monitoring & Maintenance
|
||||||
|
|
||||||
|
### Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker
|
||||||
|
docker compose logs -f api worker
|
||||||
|
|
||||||
|
# Manual (systemd)
|
||||||
|
sudo journalctl -u ocr-sprint-api -f
|
||||||
|
sudo journalctl -u ocr-sprint-worker -f
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prometheus Metrics
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/metrics
|
||||||
|
```
|
||||||
|
|
||||||
|
Metrics penting: `ocr_documents_total`, `ocr_processing_duration_seconds`, `ocr_confidence_score`.
|
||||||
|
|
||||||
|
### Backup Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker
|
||||||
|
docker compose exec postgres pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
|
||||||
|
|
||||||
|
# Manual
|
||||||
|
pg_dump -U ocr -h localhost ocr_sprint | gzip > backup_$(date +%Y%m%d).sql.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
### Automated Backup (Cron)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# Simpan sebagai /opt/ocr-sprint-service/backup.sh (shebang harus berada di baris pertama)
|
||||||
|
BACKUP_DIR="/opt/ocr-sprint-service/backups"
|
||||||
|
mkdir -p $BACKUP_DIR
|
||||||
|
pg_dump -U ocr -h localhost ocr_sprint | gzip > $BACKUP_DIR/db_$(date +%Y%m%d_%H%M%S).sql.gz
|
||||||
|
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
chmod +x /opt/ocr-sprint-service/backup.sh
|
||||||
|
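# Cron: setiap hari pukul 02.00. PERHATIAN: `crontab -` menimpa seluruh crontab user ocr yang sudah ada, dan /var/log/ocr-backup.log harus dapat ditulis oleh user ocr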
|
||||||
|
echo "0 2 * * * /opt/ocr-sprint-service/backup.sh >> /var/log/ocr-backup.log 2>&1" | sudo crontab -u ocr -
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Troubleshooting
|
||||||
|
|
||||||
|
| Masalah | Diagnosis | Solusi |
|
||||||
|
|---------|-----------|--------|
|
||||||
|
| Service tidak start | `journalctl -u ocr-sprint-api -n 100` | Cek permissions, `.env`, dan log error |
|
||||||
|
| PaddleOCR model gagal download | Timeout di logs | `python -c "from paddleocr import PaddleOCR; PaddleOCR(lang='latin')"` |
|
||||||
|
| Worker tidak proses jobs | `redis-cli ping` → bukan PONG | Pastikan Redis running, cek `REDIS_URL` |
|
||||||
|
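| Database migration error | `alembic current` dan baca pesan error di log | Perbaiki penyebab error lalu jalankan `alembic upgrade head`; gunakan `alembic stamp head` lalu `alembic upgrade head` hanya jika skema database sudah sesuai dengan migrasi terbaru (`stamp` hanya menandai revisi, tidak menjalankan migrasi) |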
|
||||||
|
| Port 8000 sudah dipakai | `ss -tlnp \| grep 8000` | Kill proses lama atau ganti port di `.env` |
|
||||||
|
| Out of memory | OOM killer di logs | Kurangi `--concurrency` di worker, atau tambah RAM |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Security Checklist
|
||||||
|
|
||||||
|
- [ ] `API_KEYS` diisi dengan random key (`openssl rand -hex 32`)
|
||||||
|
- [ ] Password database diganti dari default
|
||||||
|
- [ ] Firewall aktif (hanya port 22, 80, 443 terbuka)
|
||||||
|
- [ ] SSL/TLS aktif via Nginx + Let's Encrypt
|
||||||
|
- [ ] Endpoint `/metrics` restricted ke internal network
|
||||||
|
- [ ] Backup database otomatis via cron
|
||||||
|
- [ ] OS security updates enabled (`unattended-upgrades`)
|
||||||
|
- [ ] `APP_ENV=prod` (bukan `local`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Reference — Perintah Sehari-hari
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# === Docker ===
|
||||||
|
docker compose up -d # Start
|
||||||
|
docker compose down # Stop
|
||||||
|
docker compose logs -f api # Logs
|
||||||
|
docker compose build && docker compose up -d # Update
|
||||||
|
|
||||||
|
# === Manual ===
|
||||||
|
sudo systemctl restart ocr-sprint-api ocr-sprint-worker # Restart
|
||||||
|
sudo journalctl -u ocr-sprint-api -f # Logs
|
||||||
|
curl http://localhost:8000/api/v1/health # Health check
|
||||||
|
```
|
||||||
943
docs/DEPLOYMENT-MANUAL.md
Normal file
943
docs/DEPLOYMENT-MANUAL.md
Normal file
@@ -0,0 +1,943 @@
|
|||||||
|
# Deployment Manual OCR Sprint Service (Tanpa Docker)
|
||||||
|
|
||||||
|
Panduan lengkap deployment OCR Sprint Service langsung di server tanpa menggunakan Docker.
|
||||||
|
|
||||||
|
## Prasyarat Server
|
||||||
|
|
||||||
|
### Spesifikasi Minimum
|
||||||
|
- **OS**: Ubuntu 20.04+ / Debian 11+ / RHEL 8+
|
||||||
|
- **CPU**: 4 cores (8 cores recommended)
|
||||||
|
- **RAM**: 8 GB minimum (16 GB recommended)
|
||||||
|
- **Storage**: 50 GB free space
|
||||||
|
- **User**: Non-root user dengan sudo access
|
||||||
|
|
||||||
|
### Port yang Dibutuhkan
|
||||||
|
- `8000`: API server (internal, akan di-proxy oleh Nginx)
|
||||||
|
- `80/443`: HTTP/HTTPS (Nginx)
|
||||||
|
- `5432`: PostgreSQL (localhost only)
|
||||||
|
- `6379`: Redis (localhost only)
|
||||||
|
|
||||||
|
## Langkah 1: Install System Dependencies
|
||||||
|
|
||||||
|
### Ubuntu/Debian
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Update system
|
||||||
|
sudo apt update && sudo apt upgrade -y
|
||||||
|
|
||||||
|
# Install Python 3.11
|
||||||
|
sudo apt install -y software-properties-common
|
||||||
|
sudo add-apt-repository ppa:deadsnakes/ppa -y
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install -y python3.11 python3.11-venv python3.11-dev python3-pip
|
||||||
|
|
||||||
|
# Install system libraries untuk OpenCV dan PaddleOCR
|
||||||
|
sudo apt install -y \
|
||||||
|
libgl1-mesa-glx \
|
||||||
|
libglib2.0-0 \
|
||||||
|
libsm6 \
|
||||||
|
libxext6 \
|
||||||
|
libxrender1 \
|
||||||
|
libgomp1 \
|
||||||
|
libmagic1 \
|
||||||
|
build-essential \
|
||||||
|
git \
|
||||||
|
curl \
|
||||||
|
wget
|
||||||
|
|
||||||
|
# Install Redis
|
||||||
|
sudo apt install -y redis-server
|
||||||
|
sudo systemctl enable redis-server
|
||||||
|
sudo systemctl start redis-server
|
||||||
|
|
||||||
|
# Install PostgreSQL
|
||||||
|
sudo apt install -y postgresql postgresql-contrib
|
||||||
|
sudo systemctl enable postgresql
|
||||||
|
sudo systemctl start postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
### RHEL/CentOS/Rocky Linux
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Update system
|
||||||
|
sudo dnf update -y
|
||||||
|
|
||||||
|
# Install Python 3.11
|
||||||
|
sudo dnf install -y python3.11 python3.11-devel python3.11-pip
|
||||||
|
|
||||||
|
# Install system libraries
|
||||||
|
sudo dnf install -y \
|
||||||
|
mesa-libGL \
|
||||||
|
glib2 \
|
||||||
|
libSM \
|
||||||
|
libXext \
|
||||||
|
libXrender \
|
||||||
|
file-libs \
|
||||||
|
gcc \
|
||||||
|
gcc-c++ \
|
||||||
|
make \
|
||||||
|
git
|
||||||
|
|
||||||
|
# Install Redis
|
||||||
|
sudo dnf install -y redis
|
||||||
|
sudo systemctl enable redis
|
||||||
|
sudo systemctl start redis
|
||||||
|
|
||||||
|
# Install PostgreSQL
|
||||||
|
sudo dnf install -y postgresql-server postgresql-contrib
|
||||||
|
sudo postgresql-setup --initdb
|
||||||
|
sudo systemctl enable postgresql
|
||||||
|
sudo systemctl start postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 2: Setup Database PostgreSQL
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Masuk sebagai postgres user
|
||||||
|
sudo -u postgres psql
|
||||||
|
|
||||||
|
# Jalankan SQL commands berikut:
|
||||||
|
```
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Create user dan database
|
||||||
|
CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat';
|
||||||
|
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||||
|
|
||||||
|
-- Grant privileges
|
||||||
|
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||||
|
|
||||||
|
-- Connect ke database
|
||||||
|
\c ocr_sprint
|
||||||
|
|
||||||
|
-- Grant schema privileges (PostgreSQL 15+)
|
||||||
|
GRANT ALL ON SCHEMA public TO ocr;
|
||||||
|
|
||||||
|
-- Exit
|
||||||
|
\q
|
||||||
|
```
|
||||||
|
|
||||||
|
**Konfigurasi autentikasi PostgreSQL (opsional; akses tetap hanya dari localhost, bukan remote access):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
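# Edit postgresql.conf (ganti "14" pada path dengan versi PostgreSQL yang terpasang; lokasi file bisa berbeda di distro lain)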
|
||||||
|
sudo nano /etc/postgresql/14/main/postgresql.conf
|
||||||
|
|
||||||
|
# Uncomment dan ubah:
|
||||||
|
listen_addresses = 'localhost' # Tetap localhost untuk keamanan
|
||||||
|
|
||||||
|
# Edit pg_hba.conf
|
||||||
|
sudo nano /etc/postgresql/14/main/pg_hba.conf
|
||||||
|
|
||||||
|
# Tambahkan line:
|
||||||
|
local ocr_sprint ocr scram-sha-256
|
||||||
|
|
||||||
|
# Restart PostgreSQL
|
||||||
|
sudo systemctl restart postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 3: Setup Application User
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create dedicated user untuk aplikasi
|
||||||
|
sudo useradd -m -s /bin/bash ocr
|
||||||
|
sudo usermod -aG sudo ocr # Opsional, untuk maintenance
|
||||||
|
|
||||||
|
# Create application directory
|
||||||
|
sudo mkdir -p /opt/ocr-sprint-service
|
||||||
|
sudo chown ocr:ocr /opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Switch ke user ocr
|
||||||
|
sudo su - ocr
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 4: Install Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone repository
|
||||||
|
cd /opt
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
|
||||||
|
# Create virtual environment
|
||||||
|
python3.11 -m venv .venv
|
||||||
|
|
||||||
|
# Activate virtual environment
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Upgrade pip
|
||||||
|
pip install --upgrade pip setuptools wheel
|
||||||
|
|
||||||
|
# Install application dengan OCR dependencies
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
|
||||||
|
# Verify installation
|
||||||
|
python -c "import paddleocr; print('PaddleOCR installed successfully')"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 5: Konfigurasi Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Copy environment template
|
||||||
|
cp .env.example .env
|
||||||
|
|
||||||
|
# Edit konfigurasi
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Konfigurasi production (`/opt/ocr-sprint-service/.env`):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# ==== App ====
|
||||||
|
APP_ENV=prod
|
||||||
|
APP_HOST=0.0.0.0
|
||||||
|
APP_PORT=8000
|
||||||
|
APP_LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# ==== Storage ====
|
||||||
|
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
|
||||||
|
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
|
||||||
|
BLOB_MAX_UPLOAD_MB=25
|
||||||
|
|
||||||
|
# ==== OCR ====
|
||||||
|
OCR_LANG=latin
|
||||||
|
OCR_USE_GPU=false
|
||||||
|
OCR_MAX_IMAGE_SIDE=2200
|
||||||
|
|
||||||
|
# ==== Preprocessing ====
|
||||||
|
PREPROCESS_TARGET_DPI=300
|
||||||
|
PREPROCESS_DENOISE=true
|
||||||
|
PREPROCESS_DESKEW=true
|
||||||
|
PREPROCESS_DETECT_DOCUMENT=true
|
||||||
|
PREPROCESS_REMOVE_SHADOW=true
|
||||||
|
PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
|
||||||
|
|
||||||
|
# ==== Table Extraction ====
|
||||||
|
TABLES_ENABLED=true
|
||||||
|
|
||||||
|
# ==== Confidence ====
|
||||||
|
CONFIDENCE_AUTO_APPROVE=0.95
|
||||||
|
CONFIDENCE_NEEDS_REVIEW=0.85
|
||||||
|
|
||||||
|
# ==== LLM (Phase 5, optional) ====
|
||||||
|
LLM_ENABLED=false
|
||||||
|
|
||||||
|
# ==== Async Pipeline ====
|
||||||
|
QUEUE_ENABLED=true
|
||||||
|
REDIS_URL=redis://localhost:6379/0
|
||||||
|
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
|
||||||
|
|
||||||
|
# ==== Database ====
|
||||||
|
DATABASE_URL=postgresql+psycopg://ocr:ganti-dengan-password-kuat@localhost:5432/ocr_sprint
|
||||||
|
DATABASE_ECHO=false
|
||||||
|
|
||||||
|
# ==== Auth (WAJIB!) ====
|
||||||
|
API_KEYS=key1-ganti-dengan-random-string,key2-ganti-dengan-random-string
|
||||||
|
API_KEY_HEADER=X-API-Key
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate secure API keys:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate 2 API keys
|
||||||
|
openssl rand -hex 32
|
||||||
|
openssl rand -hex 32
|
||||||
|
```
|
||||||
|
|
||||||
|
**Create storage directories:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p /opt/ocr-sprint-service/storage/blobs
|
||||||
|
chmod 755 /opt/ocr-sprint-service/storage
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 6: Run Database Migrations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Masih sebagai user ocr, dengan venv activated
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Run migrations
|
||||||
|
alembic upgrade head
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
alembic current
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 7: Test Manual Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test API server
|
||||||
|
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||||
|
|
||||||
|
# Di terminal lain, test health check
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
|
||||||
|
# Jika berhasil, stop dengan Ctrl+C
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 8: Setup Systemd Services
|
||||||
|
|
||||||
|
### API Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Exit dari user ocr, kembali ke user dengan sudo
|
||||||
|
exit
|
||||||
|
|
||||||
|
# Create systemd service file
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-api.service
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content `/etc/systemd/system/ocr-sprint-api.service`:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint API Service
|
||||||
|
After=network.target postgresql.service redis.service
|
||||||
|
Wants=postgresql.service redis.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
Group=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||||
|
|
||||||
|
# Start command - 4 workers untuk production
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
|
||||||
|
ocr_sprint.main:app \
|
||||||
|
--host 0.0.0.0 \
|
||||||
|
--port 8000 \
|
||||||
|
--workers 4 \
|
||||||
|
--log-level info
|
||||||
|
|
||||||
|
# Restart policy
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
StartLimitInterval=0
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
LimitNOFILE=65536
|
||||||
|
MemoryLimit=6G
|
||||||
|
|
||||||
|
# Security
|
||||||
|
NoNewPrivileges=true
|
||||||
|
PrivateTmp=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
### Celery Worker Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content `/etc/systemd/system/ocr-sprint-worker.service`:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint Celery Worker
|
||||||
|
After=network.target postgresql.service redis.service ocr-sprint-api.service
|
||||||
|
Wants=postgresql.service redis.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
Group=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||||
|
|
||||||
|
# Start command - concurrency 2 untuk 4 core CPU
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
|
||||||
|
-A ocr_sprint.worker.celery_app \
|
||||||
|
worker \
|
||||||
|
--loglevel=info \
|
||||||
|
--concurrency=2 \
|
||||||
|
--max-tasks-per-child=100
|
||||||
|
|
||||||
|
# Restart policy
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
StartLimitInterval=0
|
||||||
|
|
||||||
|
# Resource limits
|
||||||
|
LimitNOFILE=65536
|
||||||
|
MemoryLimit=4G
|
||||||
|
|
||||||
|
# Security
|
||||||
|
NoNewPrivileges=true
|
||||||
|
PrivateTmp=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enable dan Start Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Reload systemd
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
|
||||||
|
# Enable services (auto-start on boot)
|
||||||
|
sudo systemctl enable ocr-sprint-api
|
||||||
|
sudo systemctl enable ocr-sprint-worker
|
||||||
|
|
||||||
|
# Start services
|
||||||
|
sudo systemctl start ocr-sprint-api
|
||||||
|
sudo systemctl start ocr-sprint-worker
|
||||||
|
|
||||||
|
# Check status
|
||||||
|
sudo systemctl status ocr-sprint-api
|
||||||
|
sudo systemctl status ocr-sprint-worker
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -f
|
||||||
|
sudo journalctl -u ocr-sprint-worker -f
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 9: Setup Nginx Reverse Proxy
|
||||||
|
|
||||||
|
### Install Nginx
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt install -y nginx certbot python3-certbot-nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
### Konfigurasi Nginx
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/nginx/sites-available/ocr-sprint
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content `/etc/nginx/sites-available/ocr-sprint`:**
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
# Upstream untuk load balancing (jika scale horizontal)
|
||||||
|
upstream ocr_api {
|
||||||
|
server 127.0.0.1:8000;
|
||||||
|
keepalive 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Rate limiting
|
||||||
|
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name ocr.yourdomain.com; # Ganti dengan domain Anda
|
||||||
|
|
||||||
|
# Max upload size (sesuaikan dengan BLOB_MAX_UPLOAD_MB)
|
||||||
|
client_max_body_size 30M;
|
||||||
|
client_body_buffer_size 128k;
|
||||||
|
|
||||||
|
# Timeouts untuk dokumen besar
|
||||||
|
proxy_connect_timeout 75s;
|
||||||
|
proxy_send_timeout 300s;
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
send_timeout 300s;
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
access_log /var/log/nginx/ocr-sprint-access.log;
|
||||||
|
error_log /var/log/nginx/ocr-sprint-error.log;
|
||||||
|
|
||||||
|
# API endpoints
|
||||||
|
location /api/ {
|
||||||
|
# Rate limiting
|
||||||
|
limit_req zone=api_limit burst=20 nodelay;
|
||||||
|
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
|
||||||
|
# Headers
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
proxy_set_header Connection "";
|
||||||
|
|
||||||
|
# Disable buffering untuk streaming responses
|
||||||
|
proxy_buffering off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Health check endpoint (no rate limit)
|
||||||
|
location /api/v1/health {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
access_log off;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Metrics endpoint (restrict access)
|
||||||
|
location /metrics {
|
||||||
|
# Allow only from internal network
|
||||||
|
allow 10.0.0.0/8;
|
||||||
|
allow 172.16.0.0/12;
|
||||||
|
allow 192.168.0.0/16;
|
||||||
|
allow 127.0.0.1;
|
||||||
|
deny all;
|
||||||
|
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Docs (opsional, bisa di-disable di production)
|
||||||
|
location /docs {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
|
||||||
|
location /redoc {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enable Site
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test konfigurasi
|
||||||
|
sudo nginx -t
|
||||||
|
|
||||||
|
# Enable site
|
||||||
|
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||||
|
|
||||||
|
# Remove default site (opsional)
|
||||||
|
sudo rm /etc/nginx/sites-enabled/default
|
||||||
|
|
||||||
|
# Reload Nginx
|
||||||
|
sudo nginx -t && sudo systemctl reload nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setup SSL dengan Let's Encrypt
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install certbot
|
||||||
|
sudo apt install -y certbot python3-certbot-nginx
|
||||||
|
|
||||||
|
# Obtain certificate (ganti dengan domain Anda)
|
||||||
|
sudo certbot --nginx -d ocr.yourdomain.com
|
||||||
|
|
||||||
|
# Test auto-renewal
|
||||||
|
sudo certbot renew --dry-run
|
||||||
|
```
|
||||||
|
|
||||||
|
Certbot akan otomatis mengupdate konfigurasi Nginx untuk HTTPS.
|
||||||
|
|
||||||
|
## Langkah 10: Setup Firewall
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install UFW (jika belum ada)
|
||||||
|
sudo apt install -y ufw
|
||||||
|
|
||||||
|
# Allow SSH (PENTING! Jangan sampai terkunci)
|
||||||
|
sudo ufw allow 22/tcp
|
||||||
|
|
||||||
|
# Allow HTTP dan HTTPS
|
||||||
|
sudo ufw allow 80/tcp
|
||||||
|
sudo ufw allow 443/tcp
|
||||||
|
|
||||||
|
# Enable firewall
|
||||||
|
sudo ufw enable
|
||||||
|
|
||||||
|
# Check status
|
||||||
|
sudo ufw status
|
||||||
|
```
|
||||||
|
|
||||||
|
## Langkah 11: Verifikasi Deployment
|
||||||
|
|
||||||
|
### Test dari Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Health check
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
|
||||||
|
# Test dengan API key
|
||||||
|
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||||
|
-H "X-API-Key: your-api-key-here" \
|
||||||
|
-F "file=@/path/to/test.pdf"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test dari Client
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Health check via domain
|
||||||
|
curl https://ocr.yourdomain.com/api/v1/health
|
||||||
|
|
||||||
|
# Upload dokumen
|
||||||
|
curl -X POST https://ocr.yourdomain.com/api/v1/documents \
|
||||||
|
-H "X-API-Key: your-api-key-here" \
|
||||||
|
-F "file=@document.pdf"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring dan Maintenance
|
||||||
|
|
||||||
|
### View Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# API logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -f
|
||||||
|
|
||||||
|
# Worker logs
|
||||||
|
sudo journalctl -u ocr-sprint-worker -f
|
||||||
|
|
||||||
|
# Nginx logs
|
||||||
|
sudo tail -f /var/log/nginx/ocr-sprint-access.log
|
||||||
|
sudo tail -f /var/log/nginx/ocr-sprint-error.log
|
||||||
|
|
||||||
|
# PostgreSQL logs
|
||||||
|
sudo tail -f /var/log/postgresql/postgresql-14-main.log
|
||||||
|
```
|
||||||
|
|
||||||
|
### Service Management
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restart services
|
||||||
|
sudo systemctl restart ocr-sprint-api
|
||||||
|
sudo systemctl restart ocr-sprint-worker
|
||||||
|
|
||||||
|
# Stop services
|
||||||
|
sudo systemctl stop ocr-sprint-api
|
||||||
|
sudo systemctl stop ocr-sprint-worker
|
||||||
|
|
||||||
|
# Check status
|
||||||
|
sudo systemctl status ocr-sprint-api
|
||||||
|
sudo systemctl status ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
### Database Backup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create backup script
|
||||||
|
sudo nano /opt/ocr-sprint-service/backup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content `backup.sh`:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
set -euo pipefail
|
||||||
|
BACKUP_DIR="/opt/ocr-sprint-service/backups"
|
||||||
|
DATE=$(date +%Y%m%d_%H%M%S)
|
||||||
|
|
||||||
|
mkdir -p "$BACKUP_DIR"
|
||||||
|
|
||||||
|
# Backup database (koneksi via -h localhost butuh password; siapkan ~/.pgpass untuk user yang menjalankan cron)
|
||||||
|
pg_dump -U ocr -h localhost ocr_sprint | gzip > "$BACKUP_DIR/db_$DATE.sql.gz"
|
||||||
|
|
||||||
|
# Backup blobs (opsional, bisa besar)
|
||||||
|
# tar -czf $BACKUP_DIR/blobs_$DATE.tar.gz /opt/ocr-sprint-service/storage/blobs
|
||||||
|
|
||||||
|
# Keep only last 7 days
|
||||||
|
find "$BACKUP_DIR" -name "db_*.sql.gz" -mtime +7 -delete
|
||||||
|
|
||||||
|
echo "Backup completed: $DATE"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Make executable
|
||||||
|
chmod +x /opt/ocr-sprint-service/backup.sh
|
||||||
|
|
||||||
|
# Setup cron job (daily at 2 AM)
|
||||||
|
sudo crontab -e
|
||||||
|
|
||||||
|
# Add line:
|
||||||
|
0 2 * * * /opt/ocr-sprint-service/backup.sh >> /var/log/ocr-backup.log 2>&1
|
||||||
|
```
|
||||||
|
|
||||||
|
### Log Rotation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/logrotate.d/ocr-sprint
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content:**
|
||||||
|
|
||||||
|
```
|
||||||
|
/var/log/nginx/ocr-sprint-*.log {
|
||||||
|
daily
|
||||||
|
rotate 14
|
||||||
|
compress
|
||||||
|
delaycompress
|
||||||
|
notifempty
|
||||||
|
create 0640 www-data adm
|
||||||
|
sharedscripts
|
||||||
|
postrotate
|
||||||
|
[ -f /var/run/nginx.pid ] && kill -USR1 `cat /var/run/nginx.pid`
|
||||||
|
endscript
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Update Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Switch ke user ocr
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
|
||||||
|
# Pull latest code
|
||||||
|
git pull
|
||||||
|
|
||||||
|
# Activate venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Update dependencies
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
|
||||||
|
# Run migrations
|
||||||
|
alembic upgrade head
|
||||||
|
|
||||||
|
# Exit user ocr
|
||||||
|
exit
|
||||||
|
|
||||||
|
# Restart services
|
||||||
|
sudo systemctl restart ocr-sprint-api
|
||||||
|
sudo systemctl restart ocr-sprint-worker
|
||||||
|
|
||||||
|
# Check logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Tuning
|
||||||
|
|
||||||
|
### Increase Worker Concurrency
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Edit worker service
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||||
|
|
||||||
|
# Ubah --concurrency sesuai CPU cores
|
||||||
|
# Untuk 8 cores: --concurrency=4
|
||||||
|
# Untuk 16 cores: --concurrency=8
|
||||||
|
|
||||||
|
# Reload dan restart
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl restart ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
### PostgreSQL Tuning
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/postgresql/14/main/postgresql.conf
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recommended settings untuk 16GB RAM:**
|
||||||
|
|
||||||
|
```
|
||||||
|
shared_buffers = 4GB
|
||||||
|
effective_cache_size = 12GB
|
||||||
|
maintenance_work_mem = 1GB
|
||||||
|
checkpoint_completion_target = 0.9
|
||||||
|
wal_buffers = 16MB
|
||||||
|
default_statistics_target = 100
|
||||||
|
random_page_cost = 1.1
|
||||||
|
effective_io_concurrency = 200
|
||||||
|
work_mem = 10MB
|
||||||
|
min_wal_size = 1GB
|
||||||
|
max_wal_size = 4GB
|
||||||
|
max_worker_processes = 4
|
||||||
|
max_parallel_workers_per_gather = 2
|
||||||
|
max_parallel_workers = 4
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl restart postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
### Redis Tuning
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/redis/redis.conf
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recommended settings:**
|
||||||
|
|
||||||
|
```
|
||||||
|
maxmemory 2gb
|
||||||
|
maxmemory-policy allkeys-lru
|
||||||
|
# Disable RDB snapshots untuk performance
save ""
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl restart redis
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Service tidak start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check logs
|
||||||
|
sudo journalctl -u ocr-sprint-api -n 100 --no-pager
|
||||||
|
sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
|
||||||
|
|
||||||
|
# Check permissions
|
||||||
|
ls -la /opt/ocr-sprint-service
|
||||||
|
ls -la /opt/ocr-sprint-service/storage
|
||||||
|
|
||||||
|
# Test manual run
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Database connection error
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test connection
|
||||||
|
sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
|
||||||
|
|
||||||
|
# Check PostgreSQL status
|
||||||
|
sudo systemctl status postgresql
|
||||||
|
|
||||||
|
# Check pg_hba.conf
|
||||||
|
sudo cat /etc/postgresql/14/main/pg_hba.conf | grep ocr
|
||||||
|
```
|
||||||
|
|
||||||
|
### Redis connection error
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test Redis
|
||||||
|
redis-cli ping
|
||||||
|
|
||||||
|
# Check Redis status
|
||||||
|
sudo systemctl status redis
|
||||||
|
|
||||||
|
# Check Redis logs
|
||||||
|
sudo journalctl -u redis -n 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### PaddleOCR model download gagal
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download manual
|
||||||
|
sudo su - ocr
|
||||||
|
cd /opt/ocr-sprint-service
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
python << EOF
|
||||||
|
from paddleocr import PaddleOCR
|
||||||
|
ocr = PaddleOCR(use_angle_cls=True, lang='latin')
|
||||||
|
print("Models downloaded successfully")
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
### Out of memory
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check memory usage
|
||||||
|
free -h
|
||||||
|
htop
|
||||||
|
|
||||||
|
# Reduce worker concurrency
|
||||||
|
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||||
|
# Ubah --concurrency=1
|
||||||
|
|
||||||
|
# Add swap (jika perlu)
|
||||||
|
sudo fallocate -l 4G /swapfile
|
||||||
|
sudo chmod 600 /swapfile
|
||||||
|
sudo mkswap /swapfile
|
||||||
|
sudo swapon /swapfile
|
||||||
|
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Checklist
|
||||||
|
|
||||||
|
- [ ] API keys diganti dengan nilai random yang kuat
|
||||||
|
- [ ] Database password diganti dari default
|
||||||
|
- [ ] Firewall enabled (UFW) - hanya port 22, 80, 443 terbuka
|
||||||
|
- [ ] SSL/TLS enabled via Let's Encrypt
|
||||||
|
- [ ] `/metrics` endpoint restricted ke internal network
|
||||||
|
- [ ] Nginx rate limiting configured
|
||||||
|
- [ ] PostgreSQL hanya listen di localhost
|
||||||
|
- [ ] Redis hanya listen di localhost
|
||||||
|
- [ ] Regular backup configured (cron job)
|
||||||
|
- [ ] Log rotation configured
|
||||||
|
- [ ] OS security updates enabled (`unattended-upgrades`)
|
||||||
|
- [ ] Fail2ban installed untuk SSH protection
|
||||||
|
|
||||||
|
## Monitoring dengan Prometheus (Opsional)
|
||||||
|
|
||||||
|
### Install Prometheus
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download Prometheus
|
||||||
|
cd /tmp
|
||||||
|
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
|
||||||
|
tar xvfz prometheus-*.tar.gz
|
||||||
|
sudo mv prometheus-2.45.0.linux-amd64 /opt/prometheus
|
||||||
|
|
||||||
|
# Create user
|
||||||
|
sudo useradd --no-create-home --shell /bin/false prometheus
|
||||||
|
|
||||||
|
# Create directories
|
||||||
|
sudo mkdir /etc/prometheus /var/lib/prometheus
|
||||||
|
sudo chown prometheus:prometheus /var/lib/prometheus
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configure Prometheus
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/prometheus/prometheus.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content:**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: 'ocr-sprint'
|
||||||
|
static_configs:
|
||||||
|
- targets: ['localhost:8000']
|
||||||
|
metrics_path: '/metrics'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Systemd Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo nano /etc/systemd/system/prometheus.service
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content:**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=Prometheus
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
User=prometheus
|
||||||
|
Group=prometheus
|
||||||
|
Type=simple
|
||||||
|
ExecStart=/opt/prometheus/prometheus \
|
||||||
|
--config.file=/etc/prometheus/prometheus.yml \
|
||||||
|
--storage.tsdb.path=/var/lib/prometheus/
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable prometheus
|
||||||
|
sudo systemctl start prometheus
|
||||||
|
```
|
||||||
|
|
||||||
|
Access Prometheus di `http://localhost:9090`
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
Untuk pertanyaan atau issues, hubungi tim development.
|
||||||
437
docs/DEPLOYMENT.md
Normal file
437
docs/DEPLOYMENT.md
Normal file
@@ -0,0 +1,437 @@
|
|||||||
|
# Quickstart Deployment OCR Sprint Service
|
||||||
|
|
||||||
|
Panduan deployment OCR Sprint Service ke server production untuk pemrosesan dokumen surat sprint Polri.
|
||||||
|
|
||||||
|
## Prasyarat Server
|
||||||
|
|
||||||
|
### Spesifikasi Minimum
|
||||||
|
- **OS**: Linux (Ubuntu 20.04+ / Debian 11+ / RHEL 8+)
|
||||||
|
- **CPU**: 4 cores (8 cores recommended untuk throughput tinggi)
|
||||||
|
- **RAM**: 8 GB minimum (16 GB recommended)
|
||||||
|
- **Storage**: 50 GB free space
|
||||||
|
- ~3 GB untuk model PaddleOCR
|
||||||
|
- ~1.5 GB untuk dependencies Python
|
||||||
|
- Sisanya untuk blob storage dokumen
|
||||||
|
- **Network**: Port 8000 terbuka untuk API access
|
||||||
|
|
||||||
|
### Software Requirements
|
||||||
|
- Docker 24.0+ dan Docker Compose v2
|
||||||
|
- Git
|
||||||
|
- (Opsional) Nginx/Caddy untuk reverse proxy + SSL
|
||||||
|
|
||||||
|
## Deployment dengan Docker Compose (Recommended)
|
||||||
|
|
||||||
|
### 1. Clone Repository
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Login ke server sebagai user non-root dengan sudo access
|
||||||
|
ssh user@your-server.com
|
||||||
|
|
||||||
|
# Clone repository
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Konfigurasi Environment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Copy template environment
|
||||||
|
cp .env.example .env
|
||||||
|
|
||||||
|
# Edit konfigurasi production
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Konfigurasi penting untuk production:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# ==== App ====
|
||||||
|
APP_ENV=prod
|
||||||
|
APP_LOG_LEVEL=INFO
|
||||||
|
|
||||||
|
# ==== Storage ====
|
||||||
|
STORAGE_LOCAL_DIR=/app/storage
|
||||||
|
BLOB_STORAGE_DIR=/app/storage/blobs
|
||||||
|
BLOB_MAX_UPLOAD_MB=25
|
||||||
|
|
||||||
|
# ==== OCR ====
|
||||||
|
OCR_LANG=latin
|
||||||
|
# Set true jika server punya GPU NVIDIA
OCR_USE_GPU=false
|
||||||
|
OCR_MAX_IMAGE_SIDE=2200
|
||||||
|
|
||||||
|
# ==== Preprocessing ====
|
||||||
|
PREPROCESS_TARGET_DPI=300
|
||||||
|
PREPROCESS_DENOISE=true
|
||||||
|
PREPROCESS_DESKEW=true
|
||||||
|
PREPROCESS_DETECT_DOCUMENT=true
|
||||||
|
PREPROCESS_REMOVE_SHADOW=true
|
||||||
|
|
||||||
|
# ==== Table Extraction ====
|
||||||
|
TABLES_ENABLED=true
|
||||||
|
|
||||||
|
# ==== Async Pipeline ====
|
||||||
|
QUEUE_ENABLED=true
|
||||||
|
REDIS_URL=redis://redis:6379/0
|
||||||
|
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
|
||||||
|
|
||||||
|
# ==== Database ====
|
||||||
|
DATABASE_URL=postgresql+psycopg://ocr:ocr@postgres:5432/ocr_sprint
|
||||||
|
DATABASE_ECHO=false
|
||||||
|
|
||||||
|
# ==== Auth (WAJIB untuk production!) ====
|
||||||
|
API_KEYS=your-secret-key-1,your-secret-key-2
|
||||||
|
API_KEY_HEADER=X-API-Key
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate API keys yang aman:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Generate random API key
|
||||||
|
openssl rand -hex 32
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Build dan Start Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build Docker images
|
||||||
|
docker compose build
|
||||||
|
|
||||||
|
# Start semua services (API, Worker, Redis, Postgres)
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# Cek logs untuk memastikan semua berjalan
|
||||||
|
docker compose logs -f api worker
|
||||||
|
```
|
||||||
|
|
||||||
|
**Services yang berjalan:**
|
||||||
|
- `api`: FastAPI server di port 8000
|
||||||
|
- `worker`: Celery worker untuk async processing
|
||||||
|
- `redis`: Message broker untuk job queue
|
||||||
|
- `postgres`: Database untuk job state
|
||||||
|
|
||||||
|
### 4. Verifikasi Deployment
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Health check
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
|
||||||
|
# Expected response:
|
||||||
|
# {"status":"ok","version":"0.1.0"}
|
||||||
|
|
||||||
|
# Test OCR endpoint (sync mode untuk testing)
|
||||||
|
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||||
|
-H "X-API-Key: your-secret-key-1" \
|
||||||
|
-F "file=@samples/pdf/example.pdf" \
|
||||||
|
| jq
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Setup Reverse Proxy (Nginx)
|
||||||
|
|
||||||
|
**Install Nginx:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install nginx certbot python3-certbot-nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
**Konfigurasi Nginx (`/etc/nginx/sites-available/ocr-sprint`):**
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
upstream ocr_api {
|
||||||
|
server localhost:8000;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name ocr.yourdomain.com;
|
||||||
|
|
||||||
|
client_max_body_size 30M; # Sesuaikan dengan BLOB_MAX_UPLOAD_MB
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# Timeout untuk dokumen besar
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
proxy_connect_timeout 75s;
|
||||||
|
}
|
||||||
|
|
||||||
|
location /metrics {
|
||||||
|
# Restrict metrics endpoint
|
||||||
|
allow 10.0.0.0/8; # Internal network only
|
||||||
|
deny all;
|
||||||
|
proxy_pass http://ocr_api;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Enable site dan setup SSL:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Enable site
|
||||||
|
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||||
|
sudo nginx -t
|
||||||
|
sudo systemctl reload nginx
|
||||||
|
|
||||||
|
# Setup SSL dengan Let's Encrypt
|
||||||
|
sudo certbot --nginx -d ocr.yourdomain.com
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deployment Manual (Tanpa Docker)
|
||||||
|
|
||||||
|
### 1. Install System Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ubuntu/Debian
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install -y \
|
||||||
|
python3.11 python3.11-venv python3-pip \
|
||||||
|
libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
|
||||||
|
libgomp1 libmagic1 \
|
||||||
|
redis-server postgresql-14
|
||||||
|
|
||||||
|
# Start services
|
||||||
|
sudo systemctl enable --now redis-server postgresql
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Setup Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create database dan user
|
||||||
|
sudo -u postgres psql << 'EOF'
|
||||||
|
CREATE USER ocr WITH PASSWORD 'your-secure-password';
|
||||||
|
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||||
|
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Install Application
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone repository
|
||||||
|
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||||
|
cd ocr-sprint-service
|
||||||
|
|
||||||
|
# Create virtual environment
|
||||||
|
python3.11 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
|
||||||
|
# Copy dan edit .env
|
||||||
|
cp .env.example .env
|
||||||
|
nano .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Update DATABASE_URL di .env:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DATABASE_URL=postgresql+psycopg://ocr:your-secure-password@localhost:5432/ocr_sprint
|
||||||
|
REDIS_URL=redis://localhost:6379/0
|
||||||
|
QUEUE_ENABLED=true
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Run Database Migrations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
alembic upgrade head
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Setup Systemd Services
|
||||||
|
|
||||||
|
**API Service (`/etc/systemd/system/ocr-sprint-api.service`):**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint API
|
||||||
|
After=network.target postgresql.service redis.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 --workers 4
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
**Worker Service (`/etc/systemd/system/ocr-sprint-worker.service`):**
|
||||||
|
|
||||||
|
```ini
|
||||||
|
[Unit]
|
||||||
|
Description=OCR Sprint Celery Worker
|
||||||
|
After=network.target postgresql.service redis.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=ocr
|
||||||
|
WorkingDirectory=/opt/ocr-sprint-service
|
||||||
|
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||||
|
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery -A ocr_sprint.worker.celery_app worker -l info --concurrency=2
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
```
|
||||||
|
|
||||||
|
**Enable dan start services:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable --now ocr-sprint-api ocr-sprint-worker
|
||||||
|
sudo systemctl status ocr-sprint-api ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring dan Maintenance
|
||||||
|
|
||||||
|
### Monitoring Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker deployment
|
||||||
|
docker compose logs -f api worker
|
||||||
|
|
||||||
|
# Manual deployment
|
||||||
|
sudo journalctl -u ocr-sprint-api -f
|
||||||
|
sudo journalctl -u ocr-sprint-worker -f
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prometheus Metrics
|
||||||
|
|
||||||
|
Metrics tersedia di endpoint `/metrics`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:8000/metrics
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key metrics:**
|
||||||
|
- `ocr_documents_total`: Total dokumen diproses
|
||||||
|
- `ocr_processing_duration_seconds`: Durasi processing
|
||||||
|
- `ocr_confidence_score`: Distribusi confidence score
|
||||||
|
- `celery_task_*`: Celery worker metrics
|
||||||
|
|
||||||
|
### Backup Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker deployment
|
||||||
|
docker compose exec -T postgres pg_dump -U ocr ocr_sprint > "backup_$(date +%Y%m%d).sql"
|
||||||
|
|
||||||
|
# Manual deployment
|
||||||
|
pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
|
||||||
|
```
|
||||||
|
|
||||||
|
### Update Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker deployment
|
||||||
|
cd ocr-sprint-service
|
||||||
|
git pull
|
||||||
|
docker compose build
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# Manual deployment
|
||||||
|
cd ocr-sprint-service
|
||||||
|
git pull
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -e ".[ocr]"
|
||||||
|
alembic upgrade head
|
||||||
|
sudo systemctl restart ocr-sprint-api ocr-sprint-worker
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Service tidak start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Cek logs
|
||||||
|
docker compose logs api worker
|
||||||
|
|
||||||
|
# Cek health check
|
||||||
|
curl http://localhost:8000/api/v1/health
|
||||||
|
```
|
||||||
|
|
||||||
|
### PaddleOCR model download gagal
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download manual ke volume
|
||||||
|
docker compose exec api python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='latin')"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Worker tidak memproses jobs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Cek Redis connection
|
||||||
|
docker compose exec worker redis-cli -h redis ping
|
||||||
|
|
||||||
|
# Cek Celery worker status
|
||||||
|
docker compose exec worker celery -A ocr_sprint.worker.celery_app inspect active
|
||||||
|
```
|
||||||
|
|
||||||
|
### Database migration error
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Cek current revision
|
||||||
|
docker compose exec api alembic current
|
||||||
|
|
||||||
|
# Force upgrade
|
||||||
|
docker compose exec api alembic upgrade head
|
||||||
|
```
|
||||||
|
|
||||||
|
### Out of memory
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Kurangi worker concurrency di docker-compose.yml
|
||||||
|
# Ubah: --concurrency=1 (default) atau tambahkan memory limit
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Checklist
|
||||||
|
|
||||||
|
- [ ] API_KEYS diset dengan nilai random yang kuat
|
||||||
|
- [ ] Firewall configured (hanya port 22/SSH, 80, dan 443 terbuka)
|
||||||
|
- [ ] SSL/TLS enabled via Nginx + Let's Encrypt
|
||||||
|
- [ ] Database password diganti dari default
|
||||||
|
- [ ] `/metrics` endpoint restricted ke internal network
|
||||||
|
- [ ] Regular backup database dan blob storage
|
||||||
|
- [ ] Log rotation configured
|
||||||
|
- [ ] OS security updates enabled
|
||||||
|
|
||||||
|
## Performance Tuning
|
||||||
|
|
||||||
|
### Untuk throughput tinggi:
|
||||||
|
|
||||||
|
1. **Increase worker concurrency:**
|
||||||
|
```yaml
|
||||||
|
# docker-compose.yml
|
||||||
|
command: ["celery", "-A", "ocr_sprint.worker.celery_app", "worker", "-l", "info", "--concurrency=4"]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Scale workers horizontally:**
|
||||||
|
```bash
|
||||||
|
docker compose up -d --scale worker=3
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Enable GPU (jika tersedia):**
|
||||||
|
```bash
|
||||||
|
# .env
|
||||||
|
OCR_USE_GPU=true
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Tune Postgres:**
|
||||||
|
```sql
|
||||||
|
-- Increase connection pool
|
||||||
|
ALTER SYSTEM SET max_connections = 200;
|
||||||
|
ALTER SYSTEM SET shared_buffers = '2GB';
|
||||||
|
```
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
Untuk pertanyaan atau issues, hubungi tim development atau buat issue di repository.
|
||||||
537
docs/FRONTEND-INTEGRATION.md
Normal file
537
docs/FRONTEND-INTEGRATION.md
Normal file
@@ -0,0 +1,537 @@
|
|||||||
|
# Frontend Integration Guide
|
||||||
|
|
||||||
|
Dokumen ini menjelaskan kontrak API yang perlu dipakai frontend untuk upload dokumen sprint, menampilkan hasil OCR, menjalankan review manual, dan approve hasil final.
|
||||||
|
|
||||||
|
## Base URL
|
||||||
|
|
||||||
|
Default local API:
|
||||||
|
|
||||||
|
```text
|
||||||
|
http://localhost:8000/api/v1
|
||||||
|
```
|
||||||
|
|
||||||
|
Untuk frontend, simpan URL di environment variable:
|
||||||
|
|
||||||
|
```env
|
||||||
|
VITE_OCR_API_BASE_URL=http://localhost:8000/api/v1
|
||||||
|
```
|
||||||
|
|
||||||
|
Jika `API_KEYS` di backend diisi, semua endpoint protected membutuhkan header:
|
||||||
|
|
||||||
|
```http
|
||||||
|
X-API-Key: <api-key>
|
||||||
|
```
|
||||||
|
|
||||||
|
Catatan: jangan expose API key production di frontend publik. Untuk deployment internal, gunakan reverse proxy atau session backend-for-frontend jika aksesnya tidak sepenuhnya trusted.
|
||||||
|
|
||||||
|
## Health Check
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /health
|
||||||
|
GET /health/ready
|
||||||
|
```
|
||||||
|
|
||||||
|
Contoh response `/health`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "ok",
|
||||||
|
"version": "0.1.0"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Contoh response `/health/ready`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "ready",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"models": {
|
||||||
|
"paddleocr": "ready",
|
||||||
|
"pp_structure": "disabled"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Gunakan `/health/ready` untuk disable upload button sampai model OCR siap.
|
||||||
|
|
||||||
|
## Upload Dokumen
|
||||||
|
|
||||||
|
Endpoint:
|
||||||
|
|
||||||
|
```http
|
||||||
|
POST /documents
|
||||||
|
POST /documents?sync=true
|
||||||
|
```
|
||||||
|
|
||||||
|
Body harus `multipart/form-data` dengan field `file`.
|
||||||
|
|
||||||
|
Backend menerima PDF dan format image umum. Default max upload mengikuti backend config `BLOB_MAX_UPLOAD_MB`, saat ini 25 MB.
|
||||||
|
|
||||||
|
### Recommended Flow
|
||||||
|
|
||||||
|
Untuk frontend production, gunakan async flow:
|
||||||
|
|
||||||
|
1. `POST /documents`
|
||||||
|
2. Jika status HTTP `202`, ambil `job_id`
|
||||||
|
3. Poll `GET /documents/{job_id}` setiap 1-3 detik
|
||||||
|
4. Stop polling saat status `completed`, `needs_review`, atau `failed`
|
||||||
|
|
||||||
|
Untuk local dev sederhana, `POST /documents?sync=true` boleh dipakai, tetapi request bisa lama karena OCR berjalan inline.
|
||||||
|
|
||||||
|
### Upload Example
|
||||||
|
|
||||||
|
```ts
|
||||||
|
const API_BASE = import.meta.env.VITE_OCR_API_BASE_URL;
|
||||||
|
const API_KEY = import.meta.env.VITE_OCR_API_KEY;
|
||||||
|
|
||||||
|
async function uploadDocument(file: File) {
|
||||||
|
const form = new FormData();
|
||||||
|
form.append("file", file);
|
||||||
|
|
||||||
|
const res = await fetch(`${API_BASE}/documents`, {
|
||||||
|
method: "POST",
|
||||||
|
headers: API_KEY ? { "X-API-Key": API_KEY } : undefined,
|
||||||
|
body: form,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!res.ok) {
|
||||||
|
throw await readApiError(res);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (await res.json()) as DocumentResponse;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Polling Job
|
||||||
|
|
||||||
|
Endpoint:
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /documents/{job_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
```ts
|
||||||
|
const TERMINAL_STATUSES = new Set(["completed", "needs_review", "failed"]);
|
||||||
|
|
||||||
|
async function getDocument(jobId: string) {
|
||||||
|
const res = await fetch(`${API_BASE}/documents/${jobId}`, {
|
||||||
|
headers: API_KEY ? { "X-API-Key": API_KEY } : undefined,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!res.ok) {
|
||||||
|
throw await readApiError(res);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (await res.json()) as DocumentResponse;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function pollDocument(jobId: string, onUpdate: (doc: DocumentResponse) => void) {
|
||||||
|
while (true) {
|
||||||
|
const doc = await getDocument(jobId);
|
||||||
|
onUpdate(doc);
|
||||||
|
|
||||||
|
if (TERMINAL_STATUSES.has(doc.status)) {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Response Schema
|
||||||
|
|
||||||
|
### DocumentResponse
|
||||||
|
|
||||||
|
```ts
|
||||||
|
type DocumentStatus =
|
||||||
|
| "pending"
|
||||||
|
| "processing"
|
||||||
|
| "completed"
|
||||||
|
| "needs_review"
|
||||||
|
| "failed";
|
||||||
|
|
||||||
|
type DocumentResponse = {
|
||||||
|
job_id: string;
|
||||||
|
status: DocumentStatus;
|
||||||
|
confidence: number | null;
|
||||||
|
data: ExtractionResult | null;
|
||||||
|
review_flags: ReviewFlag[];
|
||||||
|
error: string | null;
|
||||||
|
approved: boolean;
|
||||||
|
reviewed_by: string | null;
|
||||||
|
reviewed_at: string | null;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### ExtractionResult
|
||||||
|
|
||||||
|
```ts
|
||||||
|
type ExtractionResult = {
|
||||||
|
header: HeaderFields;
|
||||||
|
personel: PersonnelEntry[];
|
||||||
|
untuk: string[];
|
||||||
|
ttd: Signatory;
|
||||||
|
raw_text: string;
|
||||||
|
confidence: number;
|
||||||
|
review_flags: ReviewFlag[];
|
||||||
|
};
|
||||||
|
|
||||||
|
type HeaderFields = {
|
||||||
|
nomor_sprint: string | null;
|
||||||
|
tanggal: string | null; // YYYY-MM-DD
|
||||||
|
satuan_penerbit: string | null;
|
||||||
|
perihal: string | null;
|
||||||
|
dasar: string[];
|
||||||
|
};
|
||||||
|
|
||||||
|
type PersonnelEntry = {
|
||||||
|
no: number | null;
|
||||||
|
pangkat: string | null;
|
||||||
|
nrp: string | null;
|
||||||
|
nama: string | null;
|
||||||
|
jabatan_dinas: string | null;
|
||||||
|
jabatan_sprint: string | null;
|
||||||
|
keterangan: string | null;
|
||||||
|
confidence: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
type Signatory = {
|
||||||
|
nama: string | null;
|
||||||
|
pangkat: string | null;
|
||||||
|
nrp: string | null;
|
||||||
|
jabatan: string | null;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
### Review Flags
|
||||||
|
|
||||||
|
```ts
|
||||||
|
type ReviewFlag =
|
||||||
|
| "low_ocr_confidence"
|
||||||
|
| "missing_field"
|
||||||
|
| "invalid_nrp"
|
||||||
|
| "unknown_pangkat"
|
||||||
|
| "personnel_count_mismatch"
|
||||||
|
| "date_parse_failed"
|
||||||
|
| "llm_fallback"
|
||||||
|
| "llm_unavailable"
|
||||||
|
| "personnel_text_fallback"
|
||||||
|
| "personnel_text_fallback_no_nrp"
|
||||||
|
| "incomplete_personnel_row";
|
||||||
|
```
|
||||||
|
|
||||||
|
Recommended UI labels:
|
||||||
|
|
||||||
|
| Flag | Label |
|
||||||
|
|---|---|
|
||||||
|
| `low_ocr_confidence` | Confidence OCR rendah |
|
||||||
|
| `missing_field` | Field wajib belum lengkap |
|
||||||
|
| `invalid_nrp` | NRP tidak valid |
|
||||||
|
| `unknown_pangkat` | Pangkat tidak dikenali |
|
||||||
|
| `personnel_count_mismatch` | Jumlah personel perlu dicek |
|
||||||
|
| `date_parse_failed` | Tanggal gagal dibaca |
|
||||||
|
| `llm_fallback` | Sebagian field diisi fallback LLM |
|
||||||
|
| `llm_unavailable` | LLM tidak tersedia |
|
||||||
|
| `personnel_text_fallback` | Personel dibaca dari fallback teks |
|
||||||
|
| `personnel_text_fallback_no_nrp` | Personel dibaca tanpa NRP |
|
||||||
|
| `incomplete_personnel_row` | Baris personel belum lengkap |
|
||||||
|
|
||||||
|
## Example Final Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "e21e83ed-a42c-4672-baec-914e5c60cc5a",
|
||||||
|
"status": "needs_review",
|
||||||
|
"confidence": 0.82,
|
||||||
|
"data": {
|
||||||
|
"header": {
|
||||||
|
"nomor_sprint": "Sprin/123/IV/2026",
|
||||||
|
"tanggal": "2026-04-21",
|
||||||
|
"satuan_penerbit": "POLRES BANJAR",
|
||||||
|
"perihal": "Instruktur Ops Pekat I Lodaya 2026",
|
||||||
|
"dasar": []
|
||||||
|
},
|
||||||
|
"personel": [
|
||||||
|
{
|
||||||
|
"no": 1,
|
||||||
|
"pangkat": "IPDA",
|
||||||
|
"nrp": "12345678",
|
||||||
|
"nama": "BUDI SANTOSO",
|
||||||
|
"jabatan_dinas": "KANIT",
|
||||||
|
"jabatan_sprint": "INSTRUKTUR",
|
||||||
|
"keterangan": null,
|
||||||
|
"confidence": 0.91
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"untuk": ["Melaksanakan kegiatan sesuai surat perintah."],
|
||||||
|
"ttd": {
|
||||||
|
"nama": "AGUS",
|
||||||
|
"pangkat": "AKBP",
|
||||||
|
"nrp": "87654321",
|
||||||
|
"jabatan": "KAPOLRES"
|
||||||
|
},
|
||||||
|
"raw_text": "full OCR text...",
|
||||||
|
"confidence": 0.82,
|
||||||
|
"review_flags": ["low_ocr_confidence"]
|
||||||
|
},
|
||||||
|
"review_flags": ["low_ocr_confidence"],
|
||||||
|
"error": null,
|
||||||
|
"approved": false,
|
||||||
|
"reviewed_by": null,
|
||||||
|
"reviewed_at": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`raw_text` bisa panjang. Tampilkan di collapsible/debug panel, bukan di layar utama.
|
||||||
|
|
||||||
|
## Review dan Koreksi HITL
|
||||||
|
|
||||||
|
Frontend review screen sebaiknya mengizinkan editor untuk:
|
||||||
|
|
||||||
|
- Header: nomor sprint, tanggal, satuan penerbit, perihal, dasar
|
||||||
|
- Personel: pangkat, NRP, nama, jabatan dinas, jabatan sprint, keterangan
|
||||||
|
- Untuk: daftar tugas
|
||||||
|
- TTD: nama, pangkat, NRP, jabatan
|
||||||
|
|
||||||
|
### Patch Corrections
|
||||||
|
|
||||||
|
Endpoint:
|
||||||
|
|
||||||
|
```http
|
||||||
|
PATCH /documents/{job_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
Body:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"corrections": [
|
||||||
|
{
|
||||||
|
"path": "header.perihal",
|
||||||
|
"value": "Pelaksanaan Operasi Pekat I Lodaya 2026",
|
||||||
|
"reason": "OCR membaca perihal tidak lengkap"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "personel[0].nama",
|
||||||
|
"value": "BUDI SANTOSO",
|
||||||
|
"reason": "Perbaikan nama"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Header opsional untuk audit trail:
|
||||||
|
|
||||||
|
```http
|
||||||
|
X-User-Id: reviewer-a
|
||||||
|
```
|
||||||
|
|
||||||
|
Path yang umum dipakai:
|
||||||
|
|
||||||
|
```text
|
||||||
|
header.nomor_sprint
|
||||||
|
header.tanggal
|
||||||
|
header.satuan_penerbit
|
||||||
|
header.perihal
|
||||||
|
header.dasar
|
||||||
|
ttd.nama
|
||||||
|
ttd.pangkat
|
||||||
|
ttd.nrp
|
||||||
|
ttd.jabatan
|
||||||
|
personel[0].pangkat
|
||||||
|
personel[0].nrp
|
||||||
|
personel[0].nama
|
||||||
|
personel[0].jabatan_dinas
|
||||||
|
personel[0].jabatan_sprint
|
||||||
|
personel[0].keterangan
|
||||||
|
untuk
|
||||||
|
```
|
||||||
|
|
||||||
|
Semua correction dalam satu request bersifat atomic. Jika satu path invalid, seluruh batch ditolak dan tidak ada perubahan disimpan.
|
||||||
|
|
||||||
|
### Patch Example
|
||||||
|
|
||||||
|
```ts
|
||||||
|
async function patchDocument(jobId: string, corrections: FieldCorrection[], userId?: string) {
|
||||||
|
const headers: Record<string, string> = { "Content-Type": "application/json" };
|
||||||
|
if (API_KEY) headers["X-API-Key"] = API_KEY;
|
||||||
|
if (userId) headers["X-User-Id"] = userId;
|
||||||
|
|
||||||
|
const res = await fetch(`${API_BASE}/documents/${jobId}`, {
|
||||||
|
method: "PATCH",
|
||||||
|
headers,
|
||||||
|
body: JSON.stringify({ corrections }),
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!res.ok) {
|
||||||
|
throw await readApiError(res);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (await res.json()) as DocumentResponse;
|
||||||
|
}
|
||||||
|
|
||||||
|
type FieldCorrection = {
|
||||||
|
path: string;
|
||||||
|
value: unknown;
|
||||||
|
reason?: string | null;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
## Correction History
|
||||||
|
|
||||||
|
Endpoint:
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /documents/{job_id}/history
|
||||||
|
```
|
||||||
|
|
||||||
|
Response:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
type CorrectionEventResponse = {
|
||||||
|
id: number;
|
||||||
|
job_id: string;
|
||||||
|
field_path: string;
|
||||||
|
old_value: unknown | null;
|
||||||
|
new_value: unknown | null;
|
||||||
|
corrected_by: string | null;
|
||||||
|
reason: string | null;
|
||||||
|
corrected_at: string;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
Gunakan endpoint ini untuk audit panel di halaman review.
|
||||||
|
|
||||||
|
## Approve Final Result
|
||||||
|
|
||||||
|
Endpoint:
|
||||||
|
|
||||||
|
```http
|
||||||
|
POST /documents/{job_id}/approve
|
||||||
|
```
|
||||||
|
|
||||||
|
Header opsional:
|
||||||
|
|
||||||
|
```http
|
||||||
|
X-User-Id: reviewer-a
|
||||||
|
```
|
||||||
|
|
||||||
|
Response:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "e21e83ed-a42c-4672-baec-914e5c60cc5a",
|
||||||
|
"approved": true,
|
||||||
|
"reviewed_by": "reviewer-a",
|
||||||
|
"reviewed_at": "2026-04-26T16:30:00"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Setelah approved, `PATCH /documents/{job_id}` akan ditolak dengan `409`.
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
Application errors:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"error": "UnsupportedDocumentError",
|
||||||
|
"message": "Uploaded file is empty."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
FastAPI validation errors memakai shape standar:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"detail": [
|
||||||
|
{
|
||||||
|
"type": "missing",
|
||||||
|
"loc": ["body", "file"],
|
||||||
|
"msg": "Field required"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Helper error:
|
||||||
|
|
||||||
|
```ts
|
||||||
|
async function readApiError(res: Response) {
|
||||||
|
let payload: unknown = null;
|
||||||
|
try {
|
||||||
|
payload = await res.clone().json();
|
||||||
|
} catch {
|
||||||
|
payload = await res.text();
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
status: res.status,
|
||||||
|
payload,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Recommended UI handling:
|
||||||
|
|
||||||
|
| HTTP Status | UI Handling |
|
||||||
|
|---|---|
|
||||||
|
| `400` | Tampilkan pesan validasi/upload |
|
||||||
|
| `401` | Session/API key tidak valid |
|
||||||
|
| `404` | Job tidak ditemukan |
|
||||||
|
| `409` | Job belum selesai atau sudah approved |
|
||||||
|
| `422` | Form correction tidak valid |
|
||||||
|
| `500` | Tampilkan error umum dan minta operator cek log backend |
|
||||||
|
|
||||||
|
## Ground Truth Admin
|
||||||
|
|
||||||
|
Endpoint ini opsional untuk dashboard admin/training data.
|
||||||
|
|
||||||
|
```http
|
||||||
|
GET /ground-truth/stats?top_n=10
|
||||||
|
GET /ground-truth/export?approved_only=true&has_corrections=true&limit=1000
|
||||||
|
```
|
||||||
|
|
||||||
|
`/ground-truth/export` mengembalikan `application/x-ndjson`, satu JSON per baris. Frontend biasanya cukup menyediakan tombol download, bukan parse seluruh stream di browser.
|
||||||
|
|
||||||
|
## Recommended Screens
|
||||||
|
|
||||||
|
1. Upload screen
|
||||||
|
- Dropzone file PDF/image
|
||||||
|
- Health readiness badge
|
||||||
|
- Upload progress
|
||||||
|
- Processing state setelah `job_id` diterima
|
||||||
|
|
||||||
|
2. Result screen
|
||||||
|
- Status badge
|
||||||
|
- Confidence score
|
||||||
|
- Review flags
|
||||||
|
- Header summary
|
||||||
|
- Personnel table
|
||||||
|
- Untuk list
|
||||||
|
- TTD section
|
||||||
|
- Raw OCR collapsible
|
||||||
|
|
||||||
|
3. Review screen
|
||||||
|
- Editable fields
|
||||||
|
- Dirty-state tracking
|
||||||
|
- Correction reason input
|
||||||
|
- Save corrections via `PATCH`
|
||||||
|
- History panel
|
||||||
|
- Approve button
|
||||||
|
|
||||||
|
4. Admin screen
|
||||||
|
- Health/ready status
|
||||||
|
- Ground-truth stats
|
||||||
|
- Export approved samples
|
||||||
|
|
||||||
|
## UX Rules
|
||||||
|
|
||||||
|
- Jangan tunggu `POST /documents?sync=true` untuk production UI; gunakan async + polling.
|
||||||
|
- Disable approve kalau status masih `pending` atau `processing`.
|
||||||
|
- Tampilkan `needs_review` sebagai hasil yang berhasil diproses tetapi perlu validasi manusia.
|
||||||
|
- Jangan render `raw_text` sebagai konten utama.
|
||||||
|
- Pada `failed`, tampilkan `error` dari response jika ada.
|
||||||
|
- Pada confidence rendah, arahkan user ke review fields yang punya flag terkait.
|
||||||
49
docs/OCR-RUNTIME-MODES.md
Normal file
49
docs/OCR-RUNTIME-MODES.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# OCR Runtime Modes
|
||||||
|
|
||||||
|
Backend OCR bisa dijalankan dalam mode CPU atau GPU lewat konfigurasi `OCR_USE_GPU`.
|
||||||
|
|
||||||
|
## Cara Pakai
|
||||||
|
|
||||||
|
Mode CPU:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
.\update.ps1 -OcrMode cpu
|
||||||
|
```
|
||||||
|
|
||||||
|
Mode GPU:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
.\update.ps1 -OcrMode gpu
|
||||||
|
```
|
||||||
|
|
||||||
|
Jika parameter tidak diberikan, `update.ps1` memakai nilai yang sudah ada di `.env`.
|
||||||
|
|
||||||
|
```env
|
||||||
|
OCR_USE_GPU=false
|
||||||
|
```
|
||||||
|
|
||||||
|
atau:
|
||||||
|
|
||||||
|
```env
|
||||||
|
OCR_USE_GPU=true
|
||||||
|
```
|
||||||
|
|
||||||
|
## Perilaku Script
|
||||||
|
|
||||||
|
- `-OcrMode cpu` menyimpan `OCR_USE_GPU=false` ke `.env`.
|
||||||
|
- `-OcrMode gpu` menyimpan `OCR_USE_GPU=true` ke `.env`.
|
||||||
|
- Script tidak menghapus package Paddle/CUDA yang sudah terpasang.
|
||||||
|
- Dalam mode GPU, script akan memasang `paddlepaddle-gpu` dan runtime cuDNN/cuBLAS jika belum ada.
|
||||||
|
- Dalam mode CPU, script hanya memasang `paddlepaddle` CPU jika belum ada runtime Paddle sama sekali.
|
||||||
|
|
||||||
|
## Catatan
|
||||||
|
|
||||||
|
Mode CPU tidak membutuhkan CUDA, cuDNN, atau driver NVIDIA.
|
||||||
|
|
||||||
|
Mode GPU membutuhkan NVIDIA driver dan runtime CUDA/cuDNN yang cocok. Pada Windows, backend juga menambahkan folder DLL NVIDIA dari `.venv` secara otomatis sebelum PaddleOCR diinisialisasi.
|
||||||
|
|
||||||
|
`TABLES_ENABLED` adalah konfigurasi terpisah dari mode CPU/GPU. Jika PP-Structure belum stabil di environment lokal, biarkan:
|
||||||
|
|
||||||
|
```env
|
||||||
|
TABLES_ENABLED=false
|
||||||
|
```
|
||||||
@@ -10,7 +10,10 @@ flow on top:
|
|||||||
* `POST /documents?sync=true` — runs the pipeline inline (the original
|
* `POST /documents?sync=true` — runs the pipeline inline (the original
|
||||||
Phase 1 behaviour). Useful for tests and
|
Phase 1 behaviour). Useful for tests and
|
||||||
small-volume single-tenant deploys without
|
small-volume single-tenant deploys without
|
||||||
a Celery worker.
|
a Celery worker. The heavy OCR work is
|
||||||
|
offloaded to a thread-pool executor so the
|
||||||
|
uvicorn event loop stays responsive during
|
||||||
|
processing (~30-120s on CPU).
|
||||||
* `GET /documents/{job_id}` — returns the current job state. Async
|
* `GET /documents/{job_id}` — returns the current job state. Async
|
||||||
clients poll this until `status` is in a
|
clients poll this until `status` is in a
|
||||||
terminal state (completed / needs_review /
|
terminal state (completed / needs_review /
|
||||||
@@ -19,6 +22,9 @@ flow on top:
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from functools import partial
|
||||||
from typing import Annotated
|
from typing import Annotated
|
||||||
from uuid import UUID, uuid4
|
from uuid import UUID, uuid4
|
||||||
|
|
||||||
@@ -60,6 +66,13 @@ from ocr_sprint.schemas.review import (
|
|||||||
from ocr_sprint.storage.blob import get_blob_storage
|
from ocr_sprint.storage.blob import get_blob_storage
|
||||||
from ocr_sprint.utils.logging import get_logger
|
from ocr_sprint.utils.logging import get_logger
|
||||||
|
|
||||||
|
# Thread pool dedicated to blocking OCR work. Using a *separate* pool
|
||||||
|
# (rather than the default loop executor) lets us cap the number of
|
||||||
|
# concurrent heavy OCR jobs independently of other thread-pool users.
|
||||||
|
# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
|
||||||
|
# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
|
||||||
|
_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
|
||||||
|
|
||||||
router = APIRouter(
|
router = APIRouter(
|
||||||
prefix="/documents",
|
prefix="/documents",
|
||||||
tags=["documents"],
|
tags=["documents"],
|
||||||
@@ -86,9 +99,12 @@ def _row_to_response(row: object) -> DocumentResponse:
|
|||||||
|
|
||||||
assert isinstance(row, JobRow)
|
assert isinstance(row, JobRow)
|
||||||
status_enum = DocumentStatus(row.status)
|
status_enum = DocumentStatus(row.status)
|
||||||
result_obj: ExtractionResult | None = None
|
result_obj = None
|
||||||
if row.result is not None:
|
if row.result is not None:
|
||||||
result_obj = ExtractionResult.model_validate(row.result)
|
result_obj = ExtractionResult.model_validate(row.result)
|
||||||
|
# Auto-number personnel entries sequentially (1, 2, 3, ...)
|
||||||
|
for idx, entry in enumerate(result_obj.personel, start=1):
|
||||||
|
entry.no = idx
|
||||||
return DocumentResponse(
|
return DocumentResponse(
|
||||||
job_id=row.job_id,
|
job_id=row.job_id,
|
||||||
status=status_enum,
|
status=status_enum,
|
||||||
@@ -161,11 +177,13 @@ async def create_document(
|
|||||||
|
|
||||||
|
|
||||||
async def _run_inline(job_id: UUID, content: bytes) -> DocumentResponse:
|
async def _run_inline(job_id: UUID, content: bytes) -> DocumentResponse:
|
||||||
"""Synchronous pipeline execution.
|
"""Run the OCR pipeline without blocking the uvicorn event loop.
|
||||||
|
|
||||||
Each state transition opens its own short session so the request-scoped
|
``run_pipeline`` is CPU-bound and can take 30-120 s on a 2 vCPU server.
|
||||||
session's rollback-on-exception behaviour cannot wipe out the
|
Calling it directly in the async handler would freeze the entire event
|
||||||
``mark_failed`` write or strand the blob on disk.
|
loop (and therefore block health-checks, metrics, and every other request)
|
||||||
|
for the full duration. We push the work onto a dedicated single-thread
|
||||||
|
executor so the loop stays free while the OCR runs in the background.
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@@ -173,8 +191,13 @@ async def _run_inline(job_id: UUID, content: bytes) -> DocumentResponse:
|
|||||||
JobRepository(s).mark_processing(job_id)
|
JobRepository(s).mark_processing(job_id)
|
||||||
|
|
||||||
started = time.perf_counter()
|
started = time.perf_counter()
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
try:
|
try:
|
||||||
output = run_pipeline(content)
|
# run_pipeline is synchronous; wrap it so asyncio can await it.
|
||||||
|
output = await loop.run_in_executor(
|
||||||
|
_OCR_EXECUTOR,
|
||||||
|
partial(run_pipeline, content),
|
||||||
|
)
|
||||||
except ValueError as exc:
|
except ValueError as exc:
|
||||||
with session_scope() as s:
|
with session_scope() as s:
|
||||||
JobRepository(s).mark_failed(job_id, error=str(exc))
|
JobRepository(s).mark_failed(job_id, error=str(exc))
|
||||||
|
|||||||
@@ -3,8 +3,12 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from fastapi import APIRouter
|
from fastapi import APIRouter
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
from ocr_sprint import __version__
|
from ocr_sprint import __version__
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
from ocr_sprint.pipeline import ocr as _ocr
|
||||||
|
from ocr_sprint.pipeline import table as _table
|
||||||
|
|
||||||
router = APIRouter(tags=["health"])
|
router = APIRouter(tags=["health"])
|
||||||
|
|
||||||
@@ -13,3 +17,23 @@ router = APIRouter(tags=["health"])
|
|||||||
async def health() -> dict[str, str]:
|
async def health() -> dict[str, str]:
|
||||||
"""Lightweight liveness check — does NOT touch the OCR engine."""
|
"""Lightweight liveness check — does NOT touch the OCR engine."""
|
||||||
return {"status": "ok", "version": __version__}
|
return {"status": "ok", "version": __version__}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health/ready")
|
||||||
|
async def readiness() -> JSONResponse:
|
||||||
|
"""Readiness check — returns 200 when OCR models are loaded, 503 if still warming up."""
|
||||||
|
settings = get_settings()
|
||||||
|
ocr_ready = _ocr._instance is not None
|
||||||
|
table_ready = (not settings.tables_enabled) or _table._instance is not None
|
||||||
|
ready = ocr_ready and table_ready
|
||||||
|
payload = {
|
||||||
|
"status": "ready" if ready else "warming_up",
|
||||||
|
"version": __version__,
|
||||||
|
"models": {
|
||||||
|
"paddleocr": "ready" if ocr_ready else "loading",
|
||||||
|
"pp_structure": (
|
||||||
|
"disabled" if not settings.tables_enabled else "ready" if table_ready else "loading"
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
return JSONResponse(content=payload, status_code=200 if ready else 503)
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ class Settings(BaseSettings):
|
|||||||
app_host: str = "0.0.0.0"
|
app_host: str = "0.0.0.0"
|
||||||
app_port: int = 8000
|
app_port: int = 8000
|
||||||
app_log_level: str = "INFO"
|
app_log_level: str = "INFO"
|
||||||
|
root_path: str = "" # For reverse proxy with path prefix (e.g., "/ocr")
|
||||||
|
|
||||||
# Storage (Phase 1: local fs)
|
# Storage (Phase 1: local fs)
|
||||||
storage_local_dir: Path = Path("./storage")
|
storage_local_dir: Path = Path("./storage")
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
|
|||||||
# Bintara
|
# Bintara
|
||||||
"BRIPDA": ("BRIPDA",),
|
"BRIPDA": ("BRIPDA",),
|
||||||
"BRIPTU": ("BRIPTU",),
|
"BRIPTU": ("BRIPTU",),
|
||||||
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL"),
|
"BRIGADIR": ("BRIGADIR", "BRIG", "BRIG POL", "BRIGPOL"),
|
||||||
"BRIPKA": ("BRIPKA",),
|
"BRIPKA": ("BRIPKA",),
|
||||||
"AIPDA": ("AIPDA",),
|
"AIPDA": ("AIPDA",),
|
||||||
"AIPTU": ("AIPTU",),
|
"AIPTU": ("AIPTU",),
|
||||||
@@ -33,12 +33,45 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
|
|||||||
# Perwira Menengah
|
# Perwira Menengah
|
||||||
"KOMPOL": ("KOMPOL",),
|
"KOMPOL": ("KOMPOL",),
|
||||||
"AKBP": ("AKBP",),
|
"AKBP": ("AKBP",),
|
||||||
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
|
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP", "KOMBES"),
|
||||||
# Perwira Tinggi
|
# Perwira Tinggi
|
||||||
"BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
|
"BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
|
||||||
"IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
|
"IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
|
||||||
"KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
|
"KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
|
||||||
"JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
|
"JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
|
||||||
|
# PNS Polri (Pegawai Negeri Sipil di lingkungan Polri). PNS appear
|
||||||
|
# routinely on sprint panitia / undangan templates alongside Polri
|
||||||
|
# personnel, so we treat them as valid ranks for extraction.
|
||||||
|
# Sources: PP 11/2017 jo PP 17/2020 (Manajemen PNS); golongan I-IV.
|
||||||
|
# Golongan I (Juru)
|
||||||
|
"JURU MUDA": ("JURU MUDA",),
|
||||||
|
"JURU MUDA TK I": ("JURU MUDA TK I", "JURU MUDA TK.I", "JURU MUDA TINGKAT I"),
|
||||||
|
"JURU": ("JURU",),
|
||||||
|
"JURU TK I": ("JURU TK I", "JURU TK.I", "JURU TINGKAT I"),
|
||||||
|
# Golongan II (Pengatur)
|
||||||
|
"PENGATUR MUDA": ("PENGATUR MUDA",),
|
||||||
|
"PENGATUR MUDA TK I": (
|
||||||
|
"PENGATUR MUDA TK I",
|
||||||
|
"PENGATUR MUDA TK.I",
|
||||||
|
"PENGATUR MUDA TINGKAT I",
|
||||||
|
),
|
||||||
|
"PENGATUR": ("PENGATUR",),
|
||||||
|
"PENGATUR TK I": ("PENGATUR TK I", "PENGATUR TK.I", "PENGATUR TINGKAT I"),
|
||||||
|
# Golongan III (Penata)
|
||||||
|
"PENATA MUDA": ("PENATA MUDA",),
|
||||||
|
"PENATA MUDA TK I": (
|
||||||
|
"PENATA MUDA TK I",
|
||||||
|
"PENATA MUDA TK.I",
|
||||||
|
"PENATA MUDA TINGKAT I",
|
||||||
|
),
|
||||||
|
"PENATA": ("PENATA",),
|
||||||
|
"PENATA TK I": ("PENATA TK I", "PENATA TK.I", "PENATA TINGKAT I"),
|
||||||
|
# Golongan IV (Pembina)
|
||||||
|
"PEMBINA": ("PEMBINA",),
|
||||||
|
"PEMBINA TK I": ("PEMBINA TK I", "PEMBINA TK.I", "PEMBINA TINGKAT I"),
|
||||||
|
"PEMBINA UTAMA MUDA": ("PEMBINA UTAMA MUDA",),
|
||||||
|
"PEMBINA UTAMA MADYA": ("PEMBINA UTAMA MADYA",),
|
||||||
|
"PEMBINA UTAMA": ("PEMBINA UTAMA",),
|
||||||
}
|
}
|
||||||
|
|
||||||
# Reverse lookup: any variant (uppercased) → canonical form.
|
# Reverse lookup: any variant (uppercased) → canonical form.
|
||||||
|
|||||||
@@ -171,7 +171,10 @@ def iter_ground_truth_samples(
|
|||||||
reviewed_at=job_row.reviewed_at,
|
reviewed_at=job_row.reviewed_at,
|
||||||
created_at=job_row.created_at,
|
created_at=job_row.created_at,
|
||||||
initial_result=initial,
|
initial_result=initial,
|
||||||
final_result=copy.deepcopy(job_row.result) if job_row.result else None,
|
# Use an ``is None`` check to stay consistent with
|
||||||
|
# ``build_initial_result``; otherwise an empty-dict result
|
||||||
|
# would produce ``initial_result={}`` but ``final_result=None``.
|
||||||
|
final_result=(copy.deepcopy(job_row.result) if job_row.result is not None else None),
|
||||||
corrections=[
|
corrections=[
|
||||||
GroundTruthCorrection(
|
GroundTruthCorrection(
|
||||||
field_path=c.field_path,
|
field_path=c.field_path,
|
||||||
|
|||||||
@@ -2,6 +2,10 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import threading
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
|
||||||
from ocr_sprint import __version__
|
from ocr_sprint import __version__
|
||||||
@@ -11,7 +15,10 @@ from ocr_sprint.api.routes import documents, ground_truth, health
|
|||||||
from ocr_sprint.config import get_settings
|
from ocr_sprint.config import get_settings
|
||||||
from ocr_sprint.db import models as _models # noqa: F401 (register ORM tables)
|
from ocr_sprint.db import models as _models # noqa: F401 (register ORM tables)
|
||||||
from ocr_sprint.db.base import Base, get_engine
|
from ocr_sprint.db.base import Base, get_engine
|
||||||
from ocr_sprint.utils.logging import configure_logging
|
from ocr_sprint.utils.logging import configure_logging, get_logger
|
||||||
|
|
||||||
|
|
||||||
|
_startup_logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def _ensure_schema() -> None:
|
def _ensure_schema() -> None:
|
||||||
@@ -24,22 +31,74 @@ def _ensure_schema() -> None:
|
|||||||
Base.metadata.create_all(bind=get_engine())
|
Base.metadata.create_all(bind=get_engine())
|
||||||
|
|
||||||
|
|
||||||
|
def _warmup_models_background() -> None:
|
||||||
|
"""Load PaddleOCR and PP-Structure models in a background thread.
|
||||||
|
|
||||||
|
Running in a thread keeps the lifespan non-blocking so uvicorn can
|
||||||
|
start accepting health-check requests immediately while the heavy models
|
||||||
|
load (~5-15s on CPU). Requests that arrive before warmup completes will
|
||||||
|
wait on the existing _lock in each module rather than racing to load.
|
||||||
|
"""
|
||||||
|
from ocr_sprint.config import get_settings as _gs
|
||||||
|
from ocr_sprint.pipeline import ocr as _ocr
|
||||||
|
from ocr_sprint.pipeline import table as _table
|
||||||
|
|
||||||
|
s = _gs()
|
||||||
|
try:
|
||||||
|
_ocr.warmup()
|
||||||
|
except Exception as exc:
|
||||||
|
_startup_logger.warning("paddleocr.warmup.failed", error=str(exc))
|
||||||
|
|
||||||
|
if s.tables_enabled:
|
||||||
|
try:
|
||||||
|
_table.warmup()
|
||||||
|
except Exception as exc:
|
||||||
|
_startup_logger.warning("pp_structure.warmup.failed", error=str(exc))
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
||||||
|
"""FastAPI lifespan: warm OCR models on startup in a background thread."""
|
||||||
|
_startup_logger.info("startup.warmup.begin")
|
||||||
|
t = threading.Thread(target=_warmup_models_background, name="ocr-warmup", daemon=True)
|
||||||
|
t.start()
|
||||||
|
yield
|
||||||
|
# Shutdown: nothing to clean up (models are process-global singletons).
|
||||||
|
_startup_logger.info("shutdown.complete")
|
||||||
|
|
||||||
|
|
||||||
def create_app() -> FastAPI:
|
def create_app() -> FastAPI:
|
||||||
"""Application factory — keeps top-level state easy to test."""
|
"""Application factory — keeps top-level state easy to test."""
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
configure_logging(settings.app_log_level)
|
configure_logging(settings.app_log_level)
|
||||||
_ensure_schema()
|
_ensure_schema()
|
||||||
|
|
||||||
|
# Support for reverse proxy with path prefix (e.g., /ocr)
|
||||||
|
root_path = getattr(settings, "root_path", "")
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
|
lifespan=lifespan,
|
||||||
title="OCR Sprint Service",
|
title="OCR Sprint Service",
|
||||||
version=__version__,
|
version=__version__,
|
||||||
description="OCR + structured extraction for Indonesian police 'surat sprint' documents.",
|
description="OCR + structured extraction for Indonesian police 'surat sprint' documents.",
|
||||||
docs_url="/docs",
|
docs_url="/docs",
|
||||||
redoc_url="/redoc",
|
redoc_url="/redoc",
|
||||||
openapi_url="/openapi.json",
|
openapi_url="/openapi.json",
|
||||||
|
root_path=root_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
register_error_handlers(app)
|
register_error_handlers(app)
|
||||||
|
|
||||||
|
# CORS — allow frontend dev servers and production origins
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
app.add_middleware(
|
||||||
|
CORSMiddleware,
|
||||||
|
allow_origins=["*"],
|
||||||
|
allow_credentials=True,
|
||||||
|
allow_methods=["*"],
|
||||||
|
allow_headers=["*"],
|
||||||
|
)
|
||||||
|
|
||||||
app.add_middleware(MetricsMiddleware)
|
app.add_middleware(MetricsMiddleware)
|
||||||
app.include_router(health.router, prefix="/api/v1")
|
app.include_router(health.router, prefix="/api/v1")
|
||||||
app.include_router(documents.router, prefix="/api/v1")
|
app.include_router(documents.router, prefix="/api/v1")
|
||||||
|
|||||||
@@ -22,6 +22,14 @@ _FLAG_PENALTY: dict[ReviewFlag, float] = {
|
|||||||
ReviewFlag.UNKNOWN_PANGKAT: 0.05,
|
ReviewFlag.UNKNOWN_PANGKAT: 0.05,
|
||||||
ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
|
ReviewFlag.PERSONNEL_COUNT_MISMATCH: 0.15,
|
||||||
ReviewFlag.DATE_PARSE_FAILED: 0.10,
|
ReviewFlag.DATE_PARSE_FAILED: 0.10,
|
||||||
|
# Text-based personnel fallback is a recoverable degradation: rank/NRP
|
||||||
|
# were extracted via regex from raw OCR rather than from a parsed table
|
||||||
|
# grid. Worth flagging for review but not catastrophic.
|
||||||
|
ReviewFlag.PERSONNEL_TEXT_FALLBACK: 0.05,
|
||||||
|
# An incomplete personnel row (no pangkat AND no nrp) is a strong
|
||||||
|
# signal something went wrong. Penalise heavily so the document
|
||||||
|
# routes to needs_review even if the rest of the extraction is fine.
|
||||||
|
ReviewFlag.INCOMPLETE_PERSONNEL_ROW: 0.15,
|
||||||
}
|
}
|
||||||
|
|
||||||
OCR_WEIGHT = 0.6
|
OCR_WEIGHT = 0.6
|
||||||
|
|||||||
@@ -64,6 +64,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
|
|||||||
"jabatan dinas": "jabatan_dinas",
|
"jabatan dinas": "jabatan_dinas",
|
||||||
"jabatan dalam dinas": "jabatan_dinas",
|
"jabatan dalam dinas": "jabatan_dinas",
|
||||||
"jbt dinas": "jabatan_dinas",
|
"jbt dinas": "jabatan_dinas",
|
||||||
|
"struktural": "jabatan_dinas",
|
||||||
|
"jabatan struktural": "jabatan_dinas",
|
||||||
# jabatan dalam sprint (role for this dispatch)
|
# jabatan dalam sprint (role for this dispatch)
|
||||||
"jabatan dalam sprint": "jabatan_sprint",
|
"jabatan dalam sprint": "jabatan_sprint",
|
||||||
"jabatan dalam sprin": "jabatan_sprint",
|
"jabatan dalam sprin": "jabatan_sprint",
|
||||||
@@ -72,6 +74,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
|
|||||||
"jabatan sprin": "jabatan_sprint",
|
"jabatan sprin": "jabatan_sprint",
|
||||||
"tugas": "jabatan_sprint",
|
"tugas": "jabatan_sprint",
|
||||||
"penugasan": "jabatan_sprint",
|
"penugasan": "jabatan_sprint",
|
||||||
|
"dalam penugasan": "jabatan_sprint",
|
||||||
|
"jabatan dalam penugasan": "jabatan_sprint",
|
||||||
# remarks
|
# remarks
|
||||||
"keterangan": "keterangan",
|
"keterangan": "keterangan",
|
||||||
"ket": "keterangan",
|
"ket": "keterangan",
|
||||||
|
|||||||
797
src/ocr_sprint/pipeline/extract/personnel_text.py
Normal file
797
src/ocr_sprint/pipeline/extract/personnel_text.py
Normal file
@@ -0,0 +1,797 @@
|
|||||||
|
"""Text-based fallback personnel extractor.
|
||||||
|
|
||||||
|
PP-Structure (Phase 3) is the primary path for personnel rows because it
|
||||||
|
preserves the table grid. But PP-Structure can fail in two ways on real
|
||||||
|
sprint scans:
|
||||||
|
|
||||||
|
1. The table is not detected at all (low-quality scan, watermark, atypical
|
||||||
|
layout) — `extract_personnel` returns an empty list.
|
||||||
|
2. The table IS detected but the column mapping is too sparse, so each row
|
||||||
|
collapses to a single ``nama`` cell with all other fields ``None``. This
|
||||||
|
is what was observed on a real Polres Cimahi sprint where the OCR
|
||||||
|
produced 24 rows with only ``nama`` populated.
|
||||||
|
|
||||||
|
This module provides a regex/heuristic fallback that operates directly on
the flat OCR text. It is deliberately conservative: the first two passes
only emit a row when it has BOTH a recognizable Polri rank AND an 8-digit
NRP, and the last-resort rank-only pass still requires a valid rank, so we
never generate the kind of "name-only" rows that motivated the fallback in
the first place.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from ocr_sprint.data.master_pangkat import (
|
||||||
|
PANGKAT_VARIANTS,
|
||||||
|
is_valid_pangkat,
|
||||||
|
normalize_pangkat,
|
||||||
|
)
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
# Build a single alternation of all known rank tokens (longest first so multi-
# word ranks like "KOMBES POL" win over the single-word "KOMBES").
_RANK_TOKENS: tuple[str, ...] = tuple(
    sorted(
        {variant for variants in PANGKAT_VARIANTS.values() for variant in variants},
        key=lambda v: -len(v),
    )
)
# Regex alternation of every rank token (each one escaped), in the
# longest-first order established above.
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
# A rank token followed (within a few characters) by an 8-digit NRP.
# We allow common separators: '/', '-', '.', ',', ':' or whitespace.
# The trailing ``\b`` plus proximity to the 8-digit NRP is the
# specificity signal — we deliberately do *not* require a leading
# ``\b`` because real Polri sprint OCR routinely mashes the rank into
# the trailing characters of the previous cell (observed on Polres
# Banjar: "...CPHR., CBA, CI" runs straight into "AKP" giving
# "CIAKP 84011113"). Requiring a leading boundary loses that row
# entirely. The longest-first alternation order ensures multi-token
# ranks ("KOMBES POL") still win over short overlaps ("KBP").
#
# NOTE: the separator class contains ``\s``, which also matches "\n".
# When this pattern is run over newline-joined text, a match can
# therefore start on one line (the rank) and end on a later line (the
# NRP).
_RE_RANK_NRP_LINE = re.compile(
    rf"(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
    re.IGNORECASE,
)
|
||||||
|
# A bare row number marker like "1." or "12)". OCR often puts it on its own
# line in tabular layouts. The whole line must be 1-3 digits followed by a
# "." or ")" (the punctuation is required); the digits are captured in
# group 1.
_RE_ROW_NUMBER = re.compile(r"^\s*(\d{1,3})\s*[.)]\s*$")
|
||||||
|
# Lines that should never be interpreted as a personnel name. These are
# section headers, OCR garbage anchors, and column header tokens. We match
# them with a *word-boundary* regex (built from this list) rather than a
# bare ``startswith`` check, because short tokens like ``"NO"`` and
# ``"KET"`` would otherwise reject perfectly valid Indonesian names
# (e.g. ``"NOVA SARI"``, ``"NOOR HIDAYAT"``, ``"KETUT WARDANA"`` — the
# latter being an extremely common Balinese birth-order name).
_NAME_BLOCKLIST_TOKENS: tuple[str, ...] = (
    "PADA TANGGAL",  # multi-word entries first so they win the alternation
    "SURAT PERINTAH",
    "DASAR",
    "PERIHAL",
    "PERTIMBANGAN",
    "DIPERINTAHKAN",
    "KEPADA",
    "UNTUK",
    "TEMBUSAN",
    "DIKELUARKAN",
    "SELESAI",
    "DAFTAR",
    "LAMPIRAN",
    "NOMOR",
    "TANGGAL",
    "KEPOLISIAN",
    "DAERAH",
    "RESOR",
    "SEKTOR",
    "MABES",
    "NRP",
    "NIP",
    "PANGKAT",
    "JABATAN",
    "NAMA",
    "KETERANGAN",
    "KET",
    "NO",
)
# Case-insensitive pattern anchored at the start of the string: it matches
# when the text *begins* with one of the tokens above and that token ends
# at a word boundary. Callers use ``.match`` on already-stripped lines.
_RE_NAME_BLOCKLIST = re.compile(
    r"^(?:" + "|".join(re.escape(tok) for tok in _NAME_BLOCKLIST_TOKENS) + r")\b",
    re.IGNORECASE,
)
|
||||||
|
# Minimal sanity check for a name candidate: the text must contain at least
# one ASCII letter. This regex does not check anything else (it does not
# validate punctuation or the proportion of letters); its only effect is
# that pure-numeric or pure-symbol lines are rejected.
_RE_NAME_OK = re.compile(r"[A-Za-z]")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_plausible_name(line: str) -> bool:
    """Decide whether ``line`` is acceptable as a personnel-name candidate.

    The line is stripped first. It is rejected when it is empty, contains
    no ASCII letter, starts with a blocklisted header token, is a bare row
    number, or contains a rank+NRP pair.
    """
    text = line.strip()
    if not text:
        return False
    if _RE_NAME_OK.search(text) is None:
        return False
    disqualified = (
        _RE_NAME_BLOCKLIST.match(text)
        or _RE_ROW_NUMBER.match(text)
        or _RE_RANK_NRP_LINE.search(text)
        # Lines made only of digits / row-number punctuation ("1 .", "2)")
        # that the bare-number regex above might miss.
        or re.fullmatch(r"[\s\d.)(\-]+", text)
    )
    return not disqualified
|
||||||
|
|
||||||
|
|
||||||
|
def _following_jabatan(lines: list[str], idx: int) -> str | None:
    """Gather the jabatan text from the (at most three) lines after ``idx``.

    Leading blank lines are skipped; a blank line after some text has been
    collected ends the scan. The scan also ends at a line containing a
    rank+NRP pair, a bare row-number line, or a line starting with a
    blocklisted header token. Returns the collected text with whitespace
    collapsed, or ``None`` when nothing was collected.
    """
    collected: list[str] = []
    stop = idx + 4 if idx + 4 < len(lines) else len(lines)
    for pos in range(idx + 1, stop):
        text = lines[pos].strip()
        if text == "":
            if collected:
                break
            continue
        at_boundary = (
            _RE_RANK_NRP_LINE.search(text)
            or _RE_ROW_NUMBER.match(text)
            or _RE_NAME_BLOCKLIST.match(text)
        )
        if at_boundary:
            break
        collected.append(text)
    # An empty ``collected`` normalises to "" and therefore yields None.
    return " ".join(" ".join(collected).split()) or None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
    """Best-effort personnel extraction from a flat OCR text stream.

    The text is split into lines and handed to three extraction passes in
    order; the result of the first pass that yields any rows is returned.

    **Pass 1** (``_extract_same_line``) — rank token directly followed by
    an 8-digit NRP. Name comes from text before the pair, jabatan from
    text after it.

    **Pass 2** (``_extract_separate_lines``) — a line holding a rank token
    with an 8-digit NRP found on one of the next two lines. Intended for
    tabular sprint formats where OCR renders each column on its own line.

    **Pass 3** (``_extract_rank_only``) — rank-only rows for templates that
    have no NRP column; ``nrp`` stays ``None``. The false-positive risk is
    higher (stray rank tokens in body text), which is why it only runs
    when the first two passes found nothing.

    Args:
        raw_text: The OCR output as a single newline-separated string.

    Returns:
        The rows from the first pass that produced any; an empty list if
        none did.
    """
    lines = raw_text.splitlines()
    return (
        _extract_same_line(lines)
        or _extract_separate_lines(lines)
        or _extract_rank_only(lines)
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Regex for a line that is *only* a rank token (possibly with punctuation):
# optional leading whitespace, one rank token, then any run of
# '/', '.', '-', ',', ':' and whitespace up to the end of the string.
_RE_RANK_ONLY = re.compile(
    rf"^\s*(?P<rank>{_RANK_ALT})\s*[/.\-,:]*\s*$",
    re.IGNORECASE,
)
# Regex for a standalone 8-digit NRP: exactly eight digits that are not
# adjacent to any other digit. It is unanchored, so with ``search`` the
# NRP may sit anywhere in the line.
_RE_NRP_ONLY = re.compile(r"(?<!\d)(?P<nrp>\d{8})(?!\d)")


# Strip a leading row number marker like "1 ", "1.", "12)" from a name
# prefix taken from the same OCR line as a rank+NRP match. Unlike
# _RE_ROW_NUMBER (which matches a *whole* line), this is a prefix strip
# for embedded same-line cases like "1 CUCU JUHANA, A.K.S. KOMPOL ...".
# At least one whitespace character must follow the marker, so a name
# that merely starts with digits glued to letters is left untouched.
_RE_LEADING_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s+")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_same_line(lines: list[str]) -> list[PersonnelEntry]:
    """Pass 1: rank+NRP pairs found anywhere in the joined text.

    Uses ``finditer`` over the full ``\\n``-joined OCR text rather than
    ``re.search`` per line so that multiple rank+NRP pairs on the same
    OCR line still produce separate rows. This is required for sprint
    scans where Paddle merges several table rows into one OCR line
    (observed on Polres Banjar where row 2's "...CBA.AKP 77020049 KASAT
    RESKRIM" was being swallowed into row 1's jabatan because per-line
    ``search`` only returns the first match).

    Because the separator class of ``_RE_RANK_NRP_LINE`` includes ``\\s``,
    a match may also start on the rank's line and end on a following line
    that holds the NRP. Each match is therefore tracked by two line
    indices: the line where the rank starts and the line where the NRP
    ends. For a match that sits on a single line both are equal.

    For each match:

    * ``nama`` comes from the text before the rank on the rank's line
      (after stripping a leading row number); if that is not a plausible
      name, from the nearest plausible, not-yet-consumed line above it,
      looking back no further than the line where the previous match
      ended.
    * ``jabatan_dinas`` comes from the text after the NRP on the NRP's
      line plus up to three following lines, stopping at the next match's
      line, a blocklisted header, a bare row number, or a blank line once
      something has been collected.

    Args:
        lines: The OCR text split into lines.

    Returns:
        One ``PersonnelEntry`` per rank+NRP match whose rank normalises to
        a valid pangkat, in document order.
    """
    from bisect import bisect_right

    if not lines:
        return []
    full_text = "\n".join(lines)

    # Offset of the first character of every line inside ``full_text``.
    line_starts: list[int] = []
    pos = 0
    for line in lines:
        line_starts.append(pos)
        pos += len(line) + 1  # +1 for the joining "\n"

    def offset_to_line(offset: int) -> int:
        # Index of the last line whose start offset is <= ``offset``.
        return max(0, bisect_right(line_starts, offset) - 1)

    matches = list(_RE_RANK_NRP_LINE.finditer(full_text))
    rows: list[PersonnelEntry] = []
    consumed_lines: set[int] = set()

    for i, m in enumerate(matches):
        pangkat = normalize_pangkat(m.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue
        nrp = m.group("nrp")

        # Line holding the rank token, and line holding the last NRP digit.
        # ``m.end() - 1`` is always a digit, never the joining "\n".
        start_ml = offset_to_line(m.start())
        end_ml = offset_to_line(m.end() - 1)
        prev_end_ml = offset_to_line(matches[i - 1].end() - 1) if i > 0 else -1
        next_ml = (
            offset_to_line(matches[i + 1].start())
            if i + 1 < len(matches)
            else len(lines)
        )

        start_text = lines[start_ml]
        start_off = line_starts[start_ml]
        end_text = lines[end_ml]
        end_off = line_starts[end_ml]

        # Prefix: text on the rank's line *before* the rank token. If the
        # previous match ended on this same line, only consider the text
        # after that match's NRP (otherwise we'd reuse the earlier row's
        # tail as this row's name).
        prefix_from = 0
        if i > 0 and prev_end_ml == start_ml:
            prefix_from = max(0, matches[i - 1].end() - start_off)
        prefix = start_text[prefix_from : m.start() - start_off]

        # Suffix: text on the NRP's line *after* the NRP, capped at the
        # next match's start if that match begins on the same line.
        suffix_to = len(end_text)
        if i + 1 < len(matches) and next_ml == end_ml:
            suffix_to = matches[i + 1].start() - end_off
        suffix = end_text[m.end() - end_off : suffix_to]

        # ── Resolve nama ────────────────────────────────────────────
        nama: str | None = None
        prefix_clean = _RE_LEADING_ROW_NUMBER.sub("", prefix).strip()
        if prefix_clean and _is_plausible_name(prefix_clean):
            nama = prefix_clean
        elif prev_end_ml < start_ml:
            for back in range(start_ml - 1, prev_end_ml, -1):
                if back in consumed_lines:
                    continue
                candidate = lines[back].strip()
                if _is_plausible_name(candidate):
                    nama = candidate
                    consumed_lines.add(back)
                    break

        # ── Resolve jabatan ─────────────────────────────────────────
        jabatan_parts: list[str] = []
        suffix_clean = suffix.strip()
        if suffix_clean:
            jabatan_parts.append(suffix_clean)
        if next_ml > end_ml:
            # Start after the NRP's line so that, for a match spanning two
            # lines, the NRP line itself is not collected as jabatan.
            max_fwd = min(end_ml + 4, next_ml, len(lines))
            for fwd in range(end_ml + 1, max_fwd):
                candidate = lines[fwd].strip()
                if not candidate:
                    if jabatan_parts:
                        break
                    continue
                if _RE_NAME_BLOCKLIST.match(candidate):
                    break
                if _RE_ROW_NUMBER.match(candidate):
                    break
                jabatan_parts.append(candidate)
        jabatan = (
            " ".join(" ".join(jabatan_parts).split())
            if jabatan_parts
            else None
        )

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=nrp,
                nama=nama,
                jabatan_dinas=jabatan,
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_separate_lines(lines: list[str]) -> list[PersonnelEntry]:
    """Pass 2: rank and NRP on separate nearby lines.

    Meant for tabular sprint formats where OCR outputs each column as its
    own line, e.g.:
        1
        CUCU JUHANA, A.K.S.
        KOMPOL
        70100418
        KABAGOPS

    A line counts as a rank line when it is only a rank token, or when it
    starts with a rank token and has fewer than five extra characters.
    The NRP is searched on the next two lines, the name on up to five
    preceding lines, and the jabatan on the lines after the NRP line. A
    line is used at most once as a name and at most once as an NRP.
    """
    entries: list[PersonnelEntry] = []
    used_name_lines: set[int] = set()
    used_nrp_lines: set[int] = set()
    total = len(lines)

    for idx in range(total):
        text = lines[idx].strip()

        hit = _RE_RANK_ONLY.match(text)
        if hit is None:
            # Fallback: the line begins with a rank token and carries only
            # a short tail after it.
            upper_text = text.upper()
            for tok in _RANK_TOKENS:
                if not upper_text.startswith(tok):
                    continue
                if len(text) - len(tok) >= 5:
                    continue
                hit = re.match(
                    rf"^\s*(?P<rank>{re.escape(tok)})\s*[/.\-,:]*",
                    text,
                    re.IGNORECASE,
                )
                if hit:
                    break
        if not hit:
            continue

        pangkat = normalize_pangkat(hit.group("rank"))
        if not pangkat or not is_valid_pangkat(pangkat):
            continue

        # NRP: first unused line among the next two that holds 8 digits.
        nrp: str | None = None
        nrp_idx: int | None = None
        for ahead in range(idx + 1, min(idx + 3, total)):
            if ahead in used_nrp_lines:
                continue
            found = _RE_NRP_ONLY.search(lines[ahead].strip())
            if found:
                nrp, nrp_idx = found.group("nrp"), ahead
                break
        if nrp_idx is None:
            continue
        used_nrp_lines.add(nrp_idx)

        # Name: nearest unused plausible line above the rank line.
        nama: str | None = None
        for behind in range(idx - 1, max(idx - 6, -1), -1):
            if behind in used_name_lines:
                continue
            maybe_name = lines[behind].strip()
            if _is_plausible_name(maybe_name):
                nama = maybe_name
                used_name_lines.add(behind)
                break

        entries.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=nrp,
                nama=nama,
                jabatan_dinas=_following_jabatan(lines, nrp_idx),
                jabatan_sprint=None,
                keterangan=None,
            )
        )
    return entries
|
||||||
|
|
||||||
|
|
||||||
|
# Bare row-number markers used by sprint formats without NRP (the dot
# is often missing in narrow-column OCR, e.g. just "1" on its own line).
# Matches a whole line of 1-3 digits with an *optional* trailing ".", ")"
# or ":" — unlike _RE_ROW_NUMBER, the punctuation is not required.
_RE_BARE_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s*$")
|
||||||
|
|
||||||
|
|
||||||
|
def _try_match_rank_at(lines: list[str], idx: int) -> tuple[str, int] | None:
    """Try to read a standalone rank beginning at ``lines[idx]``.

    Two candidates are tested against ``_RE_RANK_ONLY``, in this order:

    1. ``lines[idx]`` joined with ``lines[idx + 1]`` by a single space
       (only when a next line exists) — covers narrow-column OCR that
       splits a multi-token rank over two lines (e.g. ``"KOMBES"`` +
       ``"POL"`` or ``"PENATA"`` + ``"TK I"``).
    2. ``lines[idx]`` alone.

    The two-line form goes first so that a more specific multi-token rank
    ("PENATA TK I") wins over its single-line prefix ("PENATA"); otherwise
    "TK I" would leak into the jabatan column.

    Returns:
        ``(rank_text, lines_consumed)`` for the first candidate that
        matches, or ``None`` when ``idx`` is past the end of ``lines`` or
        neither candidate matches.
    """
    total = len(lines)
    if not idx < total:
        return None

    current = lines[idx].strip()
    attempts: list[tuple[str, int]] = []
    if idx + 1 < total:
        attempts.append(((current + " " + lines[idx + 1].strip()).strip(), 2))
    attempts.append((current, 1))

    for text, span in attempts:
        hit = _RE_RANK_ONLY.match(text)
        if hit:
            return hit.group("rank"), span
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_rank_only(lines: list[str]) -> list[PersonnelEntry]:
    """Pass 3: rank-only fallback for sprint formats without an NRP column.

    Every standalone rank (one line, or two lines that together form a
    rank) is the pivot of one row:

    * ``nama`` is assembled from the contiguous plausible-name lines
      directly above the rank (OCR tends to split a long name over
      several short lines in narrow columns). The look-back window is the
      five lines above the rank and stops at an already-consumed line, a
      bare row number, a blocklisted header, another rank, or a line that
      is not a plausible name.
    * ``jabatan_dinas`` is collected from the lines below the rank — at
      most eight lines are inspected — until a bare row number, the next
      rank, or a blocklisted header.

    ``nrp`` is always ``None`` for rows produced by this pass.
    """
    entries: list[PersonnelEntry] = []
    taken: set[int] = set()
    total = len(lines)
    cursor = 0

    while cursor < total:
        found = _try_match_rank_at(lines, cursor)
        if not found:
            cursor += 1
            continue
        rank_text, rank_span = found
        pangkat = normalize_pangkat(rank_text)
        if not pangkat or not is_valid_pangkat(pangkat):
            cursor += 1
            continue

        # ── Name: walk upwards, nearest line first ──────────────────────
        name_rev: list[str] = []
        for above in range(cursor - 1, max(cursor - 6, -1), -1):
            if above in taken:
                break
            text = lines[above].strip()
            if not text:
                if name_rev:
                    break
                continue
            if (
                _RE_BARE_ROW_NUMBER.match(text)
                or _RE_NAME_BLOCKLIST.match(text)
                or _try_match_rank_at(lines, above) is not None
                or not _is_plausible_name(text)
            ):
                break
            name_rev.append(text)
            taken.add(above)
        name_rev.reverse()  # collected bottom-up; restore reading order
        nama = " ".join(" ".join(name_rev).split()) if name_rev else None

        # ── Jabatan: walk downwards from the line after the rank ────────
        jabatan_parts: list[str] = []
        first = cursor + rank_span
        for below in range(first, min(total, first + 8)):
            text = lines[below].strip()
            if not text:
                if jabatan_parts:
                    break
                continue
            if _RE_BARE_ROW_NUMBER.match(text):
                break
            if _try_match_rank_at(lines, below) is not None:
                break
            if _RE_NAME_BLOCKLIST.match(text):
                break
            jabatan_parts.append(text)
        jabatan = " ".join(" ".join(jabatan_parts).split()) if jabatan_parts else None

        entries.append(
            PersonnelEntry(
                no=None,
                pangkat=pangkat,
                nrp=None,
                nama=nama,
                jabatan_dinas=jabatan,
                jabatan_sprint=None,
                keterangan=None,
            )
        )
        cursor += rank_span
    return entries
|
||||||
|
|
||||||
|
|
||||||
|
# ── Column-aware Pass 3 (uses OCR bounding boxes) ───────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _box_x_left(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return min(p[0] for p in box)
|
||||||
|
|
||||||
|
|
||||||
|
def _box_x_right(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return max(p[0] for p in box)
|
||||||
|
|
||||||
|
|
||||||
|
def _box_x_center(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return (_box_x_left(box) + _box_x_right(box)) / 2
|
||||||
|
|
||||||
|
|
||||||
|
def _box_y_top(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return min(p[1] for p in box)
|
||||||
|
|
||||||
|
|
||||||
|
def _box_y_bottom(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return max(p[1] for p in box)
|
||||||
|
|
||||||
|
|
||||||
|
def _box_y_center(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return (_box_y_top(box) + _box_y_bottom(box)) / 2
|
||||||
|
|
||||||
|
|
||||||
|
def _box_height(box: tuple[tuple[float, float], ...]) -> float:
|
||||||
|
return _box_y_bottom(box) - _box_y_top(box)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_personnel_from_ocr_lines(ocr_lines: list) -> list[PersonnelEntry]:
    """Column-aware Pass 3 for sprint formats without an NRP column.

    Each ``ocr_line`` must expose ``text`` (str) and ``box`` (a tuple of
    4 ``(x, y)`` corner points). We use the geometry to:

    1. Detect rank lines (single-line or vertically-stacked two-line).
    2. Estimate the PANGKAT column X-center from those rank lines.
    3. For each rank, gather **only** lines in the NAMA column (X left
       of PANGKAT) within the row's Y span as the name fragments, and
       **only** lines in the JABATAN column (X right of PANGKAT) for
       jabatan. This prevents column-bleed that flat-text Pass 3
       suffers from on dense tables.

    The JABATAN side is additionally split into ``jabatan_dinas`` and
    ``jabatan_sprint`` when a sufficiently large horizontal gap is found
    between the X-centers of the lines right of the PANGKAT column.

    Args:
        ocr_lines: OCR line objects with ``text`` and ``box`` attributes.

    Returns:
        One ``PersonnelEntry`` per detected rank anchor, ordered top to
        bottom, always with ``nrp=None``. Returns ``[]`` if no rank lines
        are detected (caller can fall back to the text-only Pass 3).
    """
    if not ocr_lines:
        return []

    # Sort by (y_top, x_left) for vertical-stacking rank detection.
    # ``indexed`` holds indices into ``ocr_lines`` in that reading order.
    indexed = sorted(
        range(len(ocr_lines)),
        key=lambda i: (_box_y_top(ocr_lines[i].box), _box_x_left(ocr_lines[i].box)),
    )

    # Pass 1: find rank anchors.
    # An anchor is one or two stacked OCR lines whose combined text matches
    # _RE_RANK_ONLY and normalises to a known pangkat. Two-line stacks must
    # X-overlap so we don't accidentally merge cells from different columns.
    used: set[int] = set()
    anchors: list[dict] = []
    for pos, idx in enumerate(indexed):
        if idx in used:
            continue
        ln = ocr_lines[idx]
        text = ln.text.strip()

        rank_text: str | None = None
        member_idxs: list[int] = [idx]

        # Try two-line stack first (so PENATA TK I beats PENATA).
        # Only the next four lines in reading order are considered as the
        # second half of the stack.
        for j_pos in range(pos + 1, min(pos + 5, len(indexed))):
            j_idx = indexed[j_pos]
            if j_idx in used:
                continue
            other = ocr_lines[j_idx]
            x_overlap = (
                min(_box_x_right(ln.box), _box_x_right(other.box))
                - max(_box_x_left(ln.box), _box_x_left(other.box))
            )
            if x_overlap <= 0:
                continue
            # Stop scanning once the candidate sits more than 1.5 line
            # heights below this line.
            y_gap = _box_y_top(other.box) - _box_y_bottom(ln.box)
            if y_gap > _box_height(ln.box) * 1.5:
                break
            combined = (text + " " + other.text.strip()).strip()
            m2 = _RE_RANK_ONLY.match(combined)
            if m2:
                rank_text = m2.group("rank")
                member_idxs.append(j_idx)
                break

        # No two-line rank: fall back to this line on its own.
        if rank_text is None:
            m1 = _RE_RANK_ONLY.match(text)
            if m1:
                rank_text = m1.group("rank")

        if rank_text is None:
            continue
        pangkat = normalize_pangkat(rank_text)
        if not pangkat or not is_valid_pangkat(pangkat):
            continue

        anchors.append(
            {
                "member_idxs": member_idxs,
                "pangkat": pangkat,
                # X-center is taken from the first member line only.
                "x_center": _box_x_center(ln.box),
                "y_top": min(_box_y_top(ocr_lines[m].box) for m in member_idxs),
                "y_bottom": max(_box_y_bottom(ocr_lines[m].box) for m in member_idxs),
            }
        )
        used.update(member_idxs)

    if not anchors:
        return []

    # Sort anchors by Y so we can compute row spans.
    anchors.sort(key=lambda a: a["y_top"])

    # Estimate PANGKAT column X-center as the median of rank anchor X-centers.
    xs_sorted = sorted(a["x_center"] for a in anchors)
    pangkat_x = xs_sorted[len(xs_sorted) // 2]

    # X tolerance: half the median rank-line width. Lines with x_center
    # within ±tolerance of pangkat_x are *in* the PANGKAT column and
    # excluded from both NAMA and JABATAN buckets.
    rank_widths = [
        _box_x_right(ocr_lines[a["member_idxs"][0]].box)
        - _box_x_left(ocr_lines[a["member_idxs"][0]].box)
        for a in anchors
    ]
    rank_widths.sort()
    median_rank_width = rank_widths[len(rank_widths) // 2] if rank_widths else 50.0
    # The margin never drops below 5.0 coordinate units.
    column_margin = max(median_rank_width * 0.5, 5.0)

    # Try to split the JABATAN side into STRUKTURAL (jabatan_dinas) and
    # DALAM SPRIN (jabatan_sprint) by clustering jabatan-side X-centers.
    # This is a 2-cluster k-means-style split: collect all X-centers of
    # lines to the right of PANGKAT, find the largest X-gap among them,
    # and use that gap as the column boundary. KET is typically the
    # right-most narrow column we let bleed into jabatan_sprint since
    # it's commonly empty.
    jabatan_xs: list[float] = []
    for ln in ocr_lines:
        x = _box_x_center(ln.box)
        if x > pangkat_x + column_margin and ln.text.strip():
            jabatan_xs.append(x)
    jabatan_split_x: float | None = None
    # Fewer than four jabatan-side lines: no split is attempted.
    if len(jabatan_xs) >= 4:
        jabatan_xs.sort()
        max_gap = 0.0
        max_gap_x: float | None = None
        for k in range(1, len(jabatan_xs)):
            gap = jabatan_xs[k] - jabatan_xs[k - 1]
            if gap > max_gap:
                max_gap = gap
                max_gap_x = (jabatan_xs[k] + jabatan_xs[k - 1]) / 2
        # Only use the split if the gap is meaningfully larger than a
        # within-column gap (heuristic: > 1.5× median rank width).
        if max_gap_x is not None and max_gap > median_rank_width * 1.5:
            jabatan_split_x = max_gap_x

    # Pre-compute each anchor's y_center for midpoint row dividers.
    anchor_y_centers = [(a["y_top"] + a["y_bottom"]) / 2 for a in anchors]

    rows: list[PersonnelEntry] = []
    for i, anchor in enumerate(anchors):
        # Row Y span: midpoint between this anchor and its neighbours.
        # Using the midpoint (rather than the previous anchor's
        # y_bottom) prevents row N's tail content (e.g. last name
        # fragment "M.H.") from leaking into row N+1's nama bucket
        # when rank lines don't extend to the full visual row height.
        # The first row is unbounded above and the last unbounded below.
        y_lo = (
            (anchor_y_centers[i - 1] + anchor_y_centers[i]) / 2
            if i > 0
            else float("-inf")
        )
        y_hi = (
            (anchor_y_centers[i] + anchor_y_centers[i + 1]) / 2
            if i + 1 < len(anchors)
            else float("inf")
        )

        # Each piece is (y_center, text) so it can be ordered top to bottom.
        nama_pieces: list[tuple[float, str]] = []
        struktural_pieces: list[tuple[float, str]] = []
        sprint_pieces: list[tuple[float, str]] = []
        for j, ln in enumerate(ocr_lines):
            if j in anchor["member_idxs"]:
                continue
            text = ln.text.strip()
            if not text:
                continue
            x = _box_x_center(ln.box)
            y = _box_y_center(ln.box)
            # Both bounds are inclusive.
            if not (y_lo <= y <= y_hi):
                continue
            if x < pangkat_x - column_margin:
                # NAMA side
                if _RE_NAME_BLOCKLIST.match(text):
                    continue
                if _RE_BARE_ROW_NUMBER.match(text):
                    continue
                if not _is_plausible_name(text):
                    continue
                nama_pieces.append((y, text))
            elif x > pangkat_x + column_margin:
                # JABATAN side — split into STRUKTURAL vs DALAM SPRIN
                # using the geometric column boundary detected above.
                if _RE_NAME_BLOCKLIST.match(text):
                    continue
                if jabatan_split_x is not None and x > jabatan_split_x:
                    sprint_pieces.append((y, text))
                else:
                    struktural_pieces.append((y, text))
            # else: in PANGKAT column or column margin — skip

        nama_pieces.sort(key=lambda p: p[0])
        struktural_pieces.sort(key=lambda p: p[0])
        sprint_pieces.sort(key=lambda p: p[0])

        # Strip leading row number from the first nama piece (e.g. "1 F. GUNTUR"
        # collapses to "F. GUNTUR" if the row marker happens to share a box).
        if nama_pieces:
            head = _RE_LEADING_ROW_NUMBER.sub("", nama_pieces[0][1]).strip()
            nama_pieces[0] = (nama_pieces[0][0], head)

        def _join(pieces: list[tuple[float, str]]) -> str | None:
            # Join the non-empty texts with single spaces, collapse runs of
            # whitespace, and map an empty result to None.
            text = " ".join(t for _, t in pieces if t).strip()
            text = " ".join(text.split())
            return text or None

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=anchor["pangkat"],
                nrp=None,
                nama=_join(nama_pieces),
                jabatan_dinas=_join(struktural_pieces),
                jabatan_sprint=_join(sprint_pieces),
                keterangan=None,
            )
        )
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
    """Decide whether extracted personnel rows are too poor to keep.

    A row counts as informative when its ``pangkat`` or its ``nrp`` is
    truthy. The result is ``True`` (low quality) when ``rows`` is empty or
    when fewer than 30% of the rows are informative; exactly 30% is still
    considered acceptable.

    Args:
        rows: Personnel entries produced by the table extraction step.

    Returns:
        ``True`` when the caller should retry with the text-based fallback.
    """
    if not rows:
        return True

    informative = 0
    for row in rows:
        if row.pangkat or row.nrp:
            informative += 1

    # Below a 30% share of rows with rank/NRP signal, assume the column
    # mapper collapsed everything into ``nama`` and ask for a fresh attempt.
    share = informative / max(1, len(rows))
    return share < 0.3
|
||||||
@@ -53,19 +53,52 @@ _RE_TANGGAL_ID = re.compile(
|
|||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Satuan penerbit usually appears in the document letterhead, prefixed by
|
# Polri letterhead pieces. The full letterhead spans multiple lines that are
|
||||||
# KEPOLISIAN <NEGARA|DAERAH|RESORT|SEKTOR>.
|
# often broken across separate OCR rows like:
|
||||||
_RE_SATUAN = re.compile(
|
#
|
||||||
r"KEPOLISIAN\s+(?:NEGARA\s+REPUBLIK\s+INDONESIA|DAERAH|RESOR(?:T)?|SEKTOR|RESORT)"
|
# KEPOLISIAN NEGARA REPUBLIK INDONESIA
|
||||||
r"[^\n]{0,80}",
|
# DAERAH JAWA BARAT
|
||||||
|
# RESOR CIMAHI
|
||||||
|
#
|
||||||
|
# We capture each individual level so we can reconstruct the most-specific
|
||||||
|
# unit (RESOR / SEKTOR > DAERAH > NEGARA) — a downstream consumer cares
|
||||||
|
# about *which* unit issued the sprint, not just that some Polri unit did.
|
||||||
|
_RE_LEVEL_NEGARA = re.compile(
|
||||||
|
r"KEPOLISIAN\s+NEGARA\s+REPUBLIK\s+INDONESIA",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
_RE_LEVEL_DAERAH = re.compile(
|
||||||
|
r"(?:KEPOLISIAN\s+)?DAERAH\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
|
||||||
|
re.IGNORECASE | re.MULTILINE,
|
||||||
|
)
|
||||||
|
_RE_LEVEL_RESOR = re.compile(
|
||||||
|
r"(?:KEPOLISIAN\s+)?RESORT?\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
|
||||||
|
re.IGNORECASE | re.MULTILINE,
|
||||||
|
)
|
||||||
|
_RE_LEVEL_SEKTOR = re.compile(
|
||||||
|
r"(?:KEPOLISIAN\s+)?SEKTOR\s+([A-Z][A-Z .'/-]{1,60}?)(?:$|\s*\n)",
|
||||||
|
re.IGNORECASE | re.MULTILINE,
|
||||||
|
)
|
||||||
|
_RE_LEVEL_MABES = re.compile(r"MABES\s+POLRI\b", re.IGNORECASE)
|
||||||
|
|
||||||
# "Perihal : ...." up to end of line.
|
# "Perihal : ...." up to end of line.
|
||||||
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
|
_RE_PERIHAL = re.compile(r"PERIHAL\s*[:\-]\s*(.+)", re.IGNORECASE)
|
||||||
|
# Many sprint docs (especially Polres-level) use 'Pertimbangan' as the
|
||||||
|
# single-paragraph rationale block instead of (or alongside) 'Perihal'.
|
||||||
|
# When `perihal` is missing we fall back to the first non-empty line under
|
||||||
|
# 'Pertimbangan :' so the LLM doesn't have to guess and so a downstream
|
||||||
|
# audit trail still has *something* in the perihal slot.
|
||||||
|
_RE_PERTIMBANGAN_LABEL = re.compile(r"^\s*PERTIMBANGAN\b", re.IGNORECASE)
|
||||||
|
|
||||||
# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
|
# A dasar entry typically begins with a number and dot, e.g. "1. UU No. 2 Tahun 2002 ..."
|
||||||
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
|
_RE_DASAR_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
|
||||||
|
# OCR sometimes splits the number from its content across two lines:
|
||||||
|
# 1.
|
||||||
|
# Undang-Undang Nomor 2 Tahun 2002 ...
|
||||||
|
# We detect a bare-number line and merge with the next non-empty line.
|
||||||
|
_RE_DASAR_BARE_NUMBER = re.compile(r"^\s*(\d+)\s*[.)]\s*$")
|
||||||
|
# Generic 'untuk' bullet — same shape as a dasar item.
|
||||||
|
_RE_UNTUK_ITEM = re.compile(r"^\s*(\d+)\s*[.)]\s*(.+)$")
|
||||||
|
|
||||||
# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
|
# Signatory NRP — Polri NRPs are 8 digits, civil servant NIPs are 18 digits.
|
||||||
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
|
_RE_NRP = re.compile(r"\b(NRP|NIP)\s*[.:]?\s*(\d{8,20})\b", re.IGNORECASE)
|
||||||
@@ -99,54 +132,159 @@ def find_tanggal(text: str) -> date | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_unit_tail(tail: str) -> str:
|
||||||
|
"""Strip trailing punctuation/noise from the captured place name."""
|
||||||
|
return " ".join(tail.split()).strip(" .,;:'\"")
|
||||||
|
|
||||||
|
|
||||||
def find_satuan(text: str) -> str | None:
|
def find_satuan(text: str) -> str | None:
|
||||||
"""Return the first letterhead match (issuing unit), normalized."""
|
"""Return the issuing unit, preferring the most-specific letterhead level.
|
||||||
match = _RE_SATUAN.search(text)
|
|
||||||
if not match:
|
Polri letterheads are hierarchical (Negara > Daerah > Resor/Sektor). The
|
||||||
return None
|
actual *issuing* unit is the deepest level present in the letterhead, not
|
||||||
return " ".join(match.group(0).split())
|
the topmost generic 'KEPOLISIAN NEGARA REPUBLIK INDONESIA' line. We scan
|
||||||
|
for each level independently and pick the most specific one available;
|
||||||
|
if only the generic Negara line is present we return that.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
>>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA\\n"
|
||||||
|
... "DAERAH JAWA BARAT\\nRESOR CIMAHI")
|
||||||
|
'KEPOLISIAN RESOR CIMAHI'
|
||||||
|
>>> find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
|
||||||
|
'KEPOLISIAN NEGARA REPUBLIK INDONESIA'
|
||||||
|
"""
|
||||||
|
# We only look at the document head — letterheads always sit at the
|
||||||
|
# very top, and constraining the search prevents false positives from
|
||||||
|
# body text like '... Polres Cimahi ...' deep in a paragraph.
|
||||||
|
head = "\n".join(text.splitlines()[:25])
|
||||||
|
|
||||||
|
sektor = _RE_LEVEL_SEKTOR.search(head)
|
||||||
|
if sektor:
|
||||||
|
return f"KEPOLISIAN SEKTOR {_clean_unit_tail(sektor.group(1))}"
|
||||||
|
resor = _RE_LEVEL_RESOR.search(head)
|
||||||
|
if resor:
|
||||||
|
return f"KEPOLISIAN RESOR {_clean_unit_tail(resor.group(1))}"
|
||||||
|
daerah = _RE_LEVEL_DAERAH.search(head)
|
||||||
|
if daerah:
|
||||||
|
return f"KEPOLISIAN DAERAH {_clean_unit_tail(daerah.group(1))}"
|
||||||
|
if _RE_LEVEL_MABES.search(head):
|
||||||
|
return "MABES POLRI"
|
||||||
|
if _RE_LEVEL_NEGARA.search(head):
|
||||||
|
return "KEPOLISIAN NEGARA REPUBLIK INDONESIA"
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def find_perihal(text: str) -> str | None:
|
def find_perihal(text: str) -> str | None:
|
||||||
"""Return the first 'Perihal: ...' line, trimmed to that line only."""
|
"""Return the first 'Perihal: ...' line, trimmed to that line only.
|
||||||
|
|
||||||
|
Falls back to the first non-empty line under a 'Pertimbangan' label
|
||||||
|
(a common variant in Polres-level surat sprint that doesn't have a
|
||||||
|
distinct 'Perihal' field). We deliberately keep this in regex-land
|
||||||
|
rather than deferring to the LLM because the LLM tends to hallucinate
|
||||||
|
perihal content from arbitrary paragraphs.
|
||||||
|
"""
|
||||||
for line in text.splitlines():
|
for line in text.splitlines():
|
||||||
m = _RE_PERIHAL.search(line)
|
m = _RE_PERIHAL.search(line)
|
||||||
if m:
|
if m:
|
||||||
return m.group(1).strip()
|
return m.group(1).strip()
|
||||||
|
|
||||||
|
lines = text.splitlines()
|
||||||
|
for idx, line in enumerate(lines):
|
||||||
|
if _RE_PERTIMBANGAN_LABEL.match(line):
|
||||||
|
for follow in lines[idx + 1 : idx + 5]:
|
||||||
|
stripped = follow.strip(" :\t")
|
||||||
|
if stripped and stripped != ":":
|
||||||
|
return stripped
|
||||||
|
break
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_numbered_section(
    lines: list[str],
    start_idx: int,
    terminators: tuple[str, ...],
) -> list[str]:
    """Gather the numbered list items that follow a section header.

    Scanning begins at ``lines[start_idx]`` and handles these line shapes:

    * ``"1. body"`` - an inline item; ``body`` becomes a new entry.
    * ``"1."`` alone - a bare marker; the next non-empty line that is not
      itself an inline item becomes the entry body.
    * any other non-empty line after at least one entry - treated as a
      wrapped continuation and appended to the previous entry.

    Scanning stops at the first line whose stripped, upper-cased text starts
    with one of ``terminators``, or after two consecutive blank lines once at
    least one entry exists and no bare marker is waiting for its body. A
    single blank line is tolerated.

    Args:
        lines: All lines of the document text.
        start_idx: Index of the first line to inspect.
        terminators: Upper-case prefixes that end the section.

    Returns:
        The collected item bodies, in document order.
    """
    stop_prefixes = tuple(terminators)
    collected: list[str] = []
    awaiting_body = False  # a bare "N." marker was seen, body still to come
    blanks_seen = 0

    for raw in lines[start_idx:]:
        content = raw.strip()
        if content.upper().startswith(stop_prefixes):
            break

        if content == "":
            blanks_seen += 1
            # Two blank lines in a row close the section, unless a bare
            # marker is still waiting for its body.
            if blanks_seen >= 2 and collected and not awaiting_body:
                break
            continue
        blanks_seen = 0

        if _RE_DASAR_BARE_NUMBER.match(content):
            awaiting_body = True
            continue

        inline = _RE_DASAR_ITEM.match(content)
        if inline is not None:
            collected.append(inline.group(2).strip())
            awaiting_body = False
        elif awaiting_body:
            collected.append(content)
            awaiting_body = False
        elif collected:
            # Wrapped tail of the previous item.
            collected[-1] = (collected[-1] + " " + content).strip()

    return collected
|
||||||
|
|
||||||
|
|
||||||
def find_dasar_list(text: str) -> list[str]:
|
def find_dasar_list(text: str) -> list[str]:
|
||||||
"""Extract numbered 'Dasar' items from the text.
|
"""Extract numbered 'Dasar' items from the text.
|
||||||
|
|
||||||
Heuristic: locate a line containing 'DASAR' (Indonesian: "DASAR :") and
|
Heuristic: locate a line containing 'DASAR' (Indonesian: "DASAR :") and
|
||||||
collect subsequent lines that start with a number. Stops at a blank line
|
delegate to ``_collect_numbered_section`` which handles three OCR
|
||||||
or a line beginning with another section header keyword.
|
artefacts:
|
||||||
|
|
||||||
|
1. Inline numbered items: ``"1. Undang-Undang ..."``.
|
||||||
|
2. Bare-number lines (the OCR engine puts the number alone on a line):
|
||||||
|
``"1.\\n Undang-Undang ..."``.
|
||||||
|
3. Continuation lines (a line that is the wrapped tail of the previous
|
||||||
|
item gets appended back onto it).
|
||||||
"""
|
"""
|
||||||
lines = text.splitlines()
|
lines = text.splitlines()
|
||||||
items: list[str] = []
|
|
||||||
in_dasar = False
|
|
||||||
section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
|
section_terminators = ("DIPERINTAHKAN", "UNTUK", "DASAR HUKUM", "PERIHAL")
|
||||||
for raw_line in lines:
|
for idx, raw_line in enumerate(lines):
|
||||||
line = raw_line.strip()
|
if re.match(r"^\s*DASAR\b", raw_line.strip(), re.IGNORECASE):
|
||||||
if not in_dasar:
|
return _collect_numbered_section(lines, idx + 1, section_terminators)
|
||||||
if re.match(r"^\s*DASAR\b", line, re.IGNORECASE):
|
return []
|
||||||
in_dasar = True
|
|
||||||
continue
|
|
||||||
if not line:
|
def find_untuk_list(text: str) -> list[str]:
|
||||||
if items:
|
"""Extract numbered 'Untuk' / 'DIPERINTAHKAN' bullets from the text.
|
||||||
break
|
|
||||||
continue
|
The 'Untuk' section follows 'DIPERINTAHKAN' / 'Kepada' and lists the
|
||||||
upper = line.upper()
|
tasks assigned to the personnel. Same OCR shape as Dasar, so we reuse
|
||||||
if any(upper.startswith(term) for term in section_terminators):
|
the collector but with different terminators.
|
||||||
break
|
"""
|
||||||
m = _RE_DASAR_ITEM.match(line)
|
lines = text.splitlines()
|
||||||
if m:
|
# Stop conditions: 'Selesai' (boilerplate), 'Dikeluarkan di' (signature
|
||||||
items.append(m.group(2).strip())
|
# block), 'Tembusan' (carbon-copy section).
|
||||||
elif items:
|
terminators = ("SELESAI", "DIKELUARKAN", "TEMBUSAN", "PADA TANGGAL")
|
||||||
# continuation of the previous dasar item
|
for idx, raw_line in enumerate(lines):
|
||||||
items[-1] = (items[-1] + " " + line).strip()
|
if re.match(r"^\s*UNTUK\b", raw_line.strip(), re.IGNORECASE):
|
||||||
return items
|
return _collect_numbered_section(lines, idx + 1, terminators)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
def find_signatory(text: str) -> Signatory:
|
def find_signatory(text: str) -> Signatory:
|
||||||
|
|||||||
@@ -30,6 +30,13 @@ def validate_personnel_entry(entry: PersonnelEntry) -> list[ReviewFlag]:
|
|||||||
flags.append(ReviewFlag.INVALID_NRP)
|
flags.append(ReviewFlag.INVALID_NRP)
|
||||||
if entry.pangkat and not is_valid_pangkat(entry.pangkat):
|
if entry.pangkat and not is_valid_pangkat(entry.pangkat):
|
||||||
flags.append(ReviewFlag.UNKNOWN_PANGKAT)
|
flags.append(ReviewFlag.UNKNOWN_PANGKAT)
|
||||||
|
# Identification of a personnel row requires at least pangkat OR nrp.
|
||||||
|
# A row carrying only a name is structurally incomplete - likely a
|
||||||
|
# mis-aligned table cell or a leaked tembusan/dasar fragment - and must
|
||||||
|
# be flagged for human review even though pangkat/nrp validation
|
||||||
|
# individually pass (because they're empty).
|
||||||
|
if not entry.pangkat and not entry.nrp:
|
||||||
|
flags.append(ReviewFlag.INCOMPLETE_PERSONNEL_ROW)
|
||||||
return flags
|
return flags
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,73 @@ class OCRLine:
|
|||||||
box: tuple[tuple[float, float], ...] # 4 (x, y) corner points
|
box: tuple[tuple[float, float], ...] # 4 (x, y) corner points
|
||||||
|
|
||||||
|
|
||||||
|
def _line_y_center(line: OCRLine) -> float:
|
||||||
|
return sum(p[1] for p in line.box) / len(line.box)
|
||||||
|
|
||||||
|
|
||||||
|
def _line_x_left(line: OCRLine) -> float:
|
||||||
|
return min(p[0] for p in line.box)
|
||||||
|
|
||||||
|
|
||||||
|
def _line_height(line: OCRLine) -> float:
|
||||||
|
ys = [p[1] for p in line.box]
|
||||||
|
return max(ys) - min(ys)
|
||||||
|
|
||||||
|
|
||||||
|
def sort_lines_by_layout(lines: list[OCRLine]) -> list[OCRLine]:
    """Return ``lines`` rearranged into top-to-bottom, left-to-right order.

    The OCR engine's emission order need not follow the visual layout, so
    the reading order is rebuilt in three steps:

    1. Order all lines by their y-center.
    2. Walk that order and grow a row band while the next line's y-center
       is within ``band_threshold`` of the band's mean y-center. The
       threshold is half the median positive line height, but never below
       1.0. Comparing against the band mean keeps gently drifting cells of
       one visual row together.
    3. Order every band by the left edge of its boxes and concatenate.

    Args:
        lines: OCR lines of one page, in any order.

    Returns:
        A new list. It is empty for empty input, and an unchanged copy of
        ``lines`` when no line has a positive height.
    """
    if not lines:
        return []

    positive_heights = sorted(h for h in map(_line_height, lines) if h > 0)
    if not positive_heights:
        return list(lines)
    median_height = positive_heights[len(positive_heights) // 2]
    band_threshold = max(1.0, median_height * 0.5)

    top_down = sorted(lines, key=_line_y_center)
    bands: list[list[OCRLine]] = [[top_down[0]]]
    band_center = _line_y_center(top_down[0])
    for candidate in top_down[1:]:
        y = _line_y_center(candidate)
        if abs(y - band_center) <= band_threshold:
            members = bands[-1]
            members.append(candidate)
            # Re-derive the band center as the mean of all its members.
            band_center = sum(_line_y_center(m) for m in members) / len(members)
        else:
            bands.append([candidate])
            band_center = y

    return [ln for band in bands for ln in sorted(band, key=_line_x_left)]
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class OCRPage:
|
class OCRPage:
|
||||||
"""OCR output for a single page."""
|
"""OCR output for a single page."""
|
||||||
@@ -44,8 +111,8 @@ class OCRPage:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def text(self) -> str:
|
def text(self) -> str:
|
||||||
"""Reconstruct page text by concatenating lines (order = paddle's output order)."""
|
"""Reconstruct page text in visual reading order (top-to-bottom, left-to-right)."""
|
||||||
return "\n".join(line.text for line in self.lines)
|
return "\n".join(line.text for line in sort_lines_by_layout(self.lines))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def mean_confidence(self) -> float:
|
def mean_confidence(self) -> float:
|
||||||
@@ -55,9 +122,14 @@ class OCRPage:
|
|||||||
|
|
||||||
|
|
||||||
def _build_paddleocr() -> PaddleOCR:
|
def _build_paddleocr() -> PaddleOCR:
|
||||||
|
s = get_settings()
|
||||||
|
if s.ocr_use_gpu:
|
||||||
|
from ocr_sprint.utils.gpu import configure_nvidia_dll_path
|
||||||
|
|
||||||
|
configure_nvidia_dll_path()
|
||||||
|
|
||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
s = get_settings()
|
|
||||||
kwargs: dict[str, object] = {
|
kwargs: dict[str, object] = {
|
||||||
"lang": s.ocr_lang,
|
"lang": s.ocr_lang,
|
||||||
"use_angle_cls": True,
|
"use_angle_cls": True,
|
||||||
@@ -84,6 +156,19 @@ def get_ocr() -> PaddleOCR:
|
|||||||
return _instance
|
return _instance
|
||||||
|
|
||||||
|
|
||||||
|
def warmup() -> None:
    """Force creation of the shared PaddleOCR engine ahead of first use.

    Calls ``get_ocr()`` and logs an event before and after. It is meant to
    run during application startup so that the first real request does not
    pay the model-loading cost.

    NOTE(review): the original author also stated that loading early keeps
    the process out of disk-sleep under memory pressure. That cannot be
    confirmed from this function.
    """
    _logger.info("paddleocr.warmup.start")
    get_ocr()  # the return value is deliberately discarded
    _logger.info("paddleocr.warmup.done")
|
||||||
|
|
||||||
|
|
||||||
def run_ocr(image: NDArrayU8) -> OCRPage:
|
def run_ocr(image: NDArrayU8) -> OCRPage:
|
||||||
"""Run OCR on a single BGR image and return a structured page result."""
|
"""Run OCR on a single BGR image and return a structured page result."""
|
||||||
engine = get_ocr()
|
engine = get_ocr()
|
||||||
|
|||||||
@@ -19,7 +19,16 @@ from ocr_sprint.llm.extractor import llm_fill_header
|
|||||||
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||||
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
||||||
from ocr_sprint.pipeline.extract.personnel import extract_personnel
|
from ocr_sprint.pipeline.extract.personnel import extract_personnel
|
||||||
from ocr_sprint.pipeline.extract.regex_rules import extract_header, find_signatory
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||||
|
extract_personnel_from_ocr_lines,
|
||||||
|
extract_personnel_from_text,
|
||||||
|
is_low_quality,
|
||||||
|
)
|
||||||
|
from ocr_sprint.pipeline.extract.regex_rules import (
|
||||||
|
extract_header,
|
||||||
|
find_signatory,
|
||||||
|
find_untuk_list,
|
||||||
|
)
|
||||||
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
from ocr_sprint.pipeline.extract.validators import validate_extraction
|
||||||
from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
|
from ocr_sprint.pipeline.ingest import NDArrayU8, detect_source_kind, ingest
|
||||||
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
from ocr_sprint.pipeline.ocr import OCRPage, run_ocr
|
||||||
@@ -112,6 +121,7 @@ def run_pipeline(content: bytes) -> PipelineOutput:
|
|||||||
header = merged
|
header = merged
|
||||||
|
|
||||||
personel: list[PersonnelEntry] = []
|
personel: list[PersonnelEntry] = []
|
||||||
|
table_flags: list[ReviewFlag] = []
|
||||||
if s.tables_enabled and cleaned_pages:
|
if s.tables_enabled and cleaned_pages:
|
||||||
all_tables: list[DetectedTable] = []
|
all_tables: list[DetectedTable] = []
|
||||||
for img in cleaned_pages:
|
for img in cleaned_pages:
|
||||||
@@ -126,14 +136,58 @@ def run_pipeline(content: bytes) -> PipelineOutput:
|
|||||||
personel_rows=len(personel),
|
personel_rows=len(personel),
|
||||||
)
|
)
|
||||||
|
|
||||||
initial_flags: list[ReviewFlag] = list(llm_flags)
|
# Text-based fallback: PP-Structure can succeed structurally but emit
|
||||||
|
# rows with only ``nama`` populated (column mapper degraded), or fail to
|
||||||
|
# detect the table at all. In both cases the regex fallback that scans
|
||||||
|
# raw OCR for rank+NRP pairs produces a much more useful result. We
|
||||||
|
# always run it when the structured path is empty or low-quality, and
|
||||||
|
# raise a review flag so the operator knows the document didn't go
|
||||||
|
# through the preferred path.
|
||||||
|
if is_low_quality(personel):
|
||||||
|
fallback_rows = extract_personnel_from_text(full_text)
|
||||||
|
# If text-based fallback produced rows but they all lack NRP
|
||||||
|
# (Pass 3 territory), retry with the column-aware extractor that
|
||||||
|
# uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
|
||||||
|
# Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
|
||||||
|
# nama/jabatan because lines are interleaved within each Y-band;
|
||||||
|
# the columnar variant restricts each field to its visual column.
|
||||||
|
text_only_no_nrp = bool(fallback_rows) and all(
|
||||||
|
r.nrp is None for r in fallback_rows
|
||||||
|
)
|
||||||
|
if (not fallback_rows) or text_only_no_nrp:
|
||||||
|
ocr_lines = [ln for page in ocr_pages for ln in page.lines]
|
||||||
|
columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
|
||||||
|
if columnar_rows and (
|
||||||
|
not fallback_rows or len(columnar_rows) >= len(fallback_rows)
|
||||||
|
):
|
||||||
|
fallback_rows = columnar_rows
|
||||||
|
if fallback_rows:
|
||||||
|
personel = fallback_rows
|
||||||
|
# Pass 3 / columnar emit rows with nrp=None for sprint
|
||||||
|
# templates without an NRP column. Surface that with a
|
||||||
|
# distinct flag so operators know to expect missing NRPs by
|
||||||
|
# design rather than by OCR failure.
|
||||||
|
no_nrp = all(r.nrp is None for r in fallback_rows)
|
||||||
|
if no_nrp:
|
||||||
|
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
|
||||||
|
else:
|
||||||
|
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
|
||||||
|
_logger.info(
|
||||||
|
"pipeline.personnel_text_fallback",
|
||||||
|
fallback_rows=len(fallback_rows),
|
||||||
|
no_nrp=no_nrp,
|
||||||
|
)
|
||||||
|
|
||||||
|
untuk_items = find_untuk_list(full_text)
|
||||||
|
|
||||||
|
initial_flags: list[ReviewFlag] = list(llm_flags) + list(table_flags)
|
||||||
if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
|
if mean_ocr_conf < _OCR_CONFIDENCE_FLAG_THRESHOLD:
|
||||||
initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
|
initial_flags.append(ReviewFlag.LOW_OCR_CONFIDENCE)
|
||||||
|
|
||||||
result = ExtractionResult(
|
result = ExtractionResult(
|
||||||
header=header,
|
header=header,
|
||||||
personel=personel,
|
personel=personel,
|
||||||
untuk=[],
|
untuk=untuk_items,
|
||||||
ttd=ttd,
|
ttd=ttd,
|
||||||
raw_text=full_text,
|
raw_text=full_text,
|
||||||
confidence=mean_ocr_conf,
|
confidence=mean_ocr_conf,
|
||||||
|
|||||||
@@ -67,21 +67,43 @@ class DetectedTable:
|
|||||||
# ---------- PP-Structure singleton ----------
|
# ---------- PP-Structure singleton ----------
|
||||||
|
|
||||||
|
|
||||||
def _build_pp_structure() -> PPStructure:
|
def _create_pp_structure(
|
||||||
from paddleocr import PPStructure
|
pp_structure_cls: type[PPStructure], pp_lang: str, use_gpu: bool
|
||||||
|
) -> PPStructure:
|
||||||
s = get_settings()
|
|
||||||
_logger.info("pp_structure.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu)
|
|
||||||
# layout=True so that PP-Structure also returns figure/text regions; we
|
# layout=True so that PP-Structure also returns figure/text regions; we
|
||||||
# filter to tables only afterwards. show_log=False to keep stdout clean.
|
# filter to tables only afterwards. show_log=False to keep stdout clean.
|
||||||
return PPStructure(
|
return pp_structure_cls(
|
||||||
lang=s.ocr_lang,
|
lang=pp_lang,
|
||||||
use_gpu=s.ocr_use_gpu,
|
use_gpu=use_gpu,
|
||||||
layout=True,
|
layout=True,
|
||||||
show_log=False,
|
show_log=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pp_structure() -> PPStructure:
    """Construct the PP-Structure engine from the current settings.

    When GPU use is requested, the NVIDIA DLL directories are registered
    before ``paddleocr`` is imported. If construction with the GPU then
    fails, a warning is logged and construction is retried on CPU. When GPU
    use was not requested, the original exception propagates.

    Returns:
        The engine instance created by ``_create_pp_structure``.
    """
    settings = get_settings()
    if settings.ocr_use_gpu:
        from ocr_sprint.utils.gpu import configure_nvidia_dll_path

        configure_nvidia_dll_path()

    from paddleocr import PPStructure

    # Per the original author's note, the PPStructure layout models only
    # accept 'en' and 'ch'. Any other configured OCR language (e.g. 'latin')
    # is therefore mapped to 'en' for layout/table detection.
    pp_lang = settings.ocr_lang if settings.ocr_lang in ("en", "ch") else "en"
    _logger.info("pp_structure.init", lang=pp_lang, use_gpu=settings.ocr_use_gpu)

    try:
        return _create_pp_structure(PPStructure, pp_lang, settings.ocr_use_gpu)
    except Exception as exc:
        if settings.ocr_use_gpu:
            _logger.warning(
                "pp_structure.gpu_init_failed_falling_back_cpu", error=str(exc)
            )
            return _create_pp_structure(PPStructure, pp_lang, False)
        raise
|
||||||
|
|
||||||
|
|
||||||
def get_pp_structure() -> PPStructure:
|
def get_pp_structure() -> PPStructure:
|
||||||
"""Lazy, thread-safe singleton accessor for PP-Structure."""
|
"""Lazy, thread-safe singleton accessor for PP-Structure."""
|
||||||
global _instance
|
global _instance
|
||||||
@@ -92,6 +114,18 @@ def get_pp_structure() -> PPStructure:
|
|||||||
return _instance
|
return _instance
|
||||||
|
|
||||||
|
|
||||||
|
def warmup() -> None:
    """Force creation of the shared PP-Structure engine ahead of first use.

    Calls ``get_pp_structure()`` and logs an event before and after. It is
    meant to run during application startup so that the first real request
    does not pay the model-loading cost.
    """
    _logger.info("pp_structure.warmup.start")
    get_pp_structure()  # the return value is deliberately discarded
    _logger.info("pp_structure.warmup.done")
|
||||||
|
|
||||||
|
|
||||||
# ---------- table parsing ----------
|
# ---------- table parsing ----------
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -21,6 +21,9 @@ class ReviewFlag(str, Enum):
|
|||||||
DATE_PARSE_FAILED = "date_parse_failed"
|
DATE_PARSE_FAILED = "date_parse_failed"
|
||||||
LLM_FALLBACK = "llm_fallback"
|
LLM_FALLBACK = "llm_fallback"
|
||||||
LLM_UNAVAILABLE = "llm_unavailable"
|
LLM_UNAVAILABLE = "llm_unavailable"
|
||||||
|
PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
|
||||||
|
PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp"
|
||||||
|
INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"
|
||||||
|
|
||||||
|
|
||||||
class Signatory(BaseModel):
|
class Signatory(BaseModel):
|
||||||
|
|||||||
57
src/ocr_sprint/utils/gpu.py
Normal file
57
src/ocr_sprint/utils/gpu.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
"""GPU runtime helpers."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_DLL_HANDLES: list[object] = []
|
||||||
|
_CONFIGURED = False
|
||||||
|
|
||||||
|
|
||||||
|
def configure_nvidia_dll_path() -> None:
    """Make the ``bin`` folders of NVIDIA wheel packages loadable on Windows.

    The function returns immediately on non-Windows platforms or once it has
    already completed. Otherwise it tries to import ``nvidia.cudnn``,
    ``nvidia.cublas`` and ``nvidia.cuda_nvrtc``. Packages that fail to import
    or expose no ``__file__`` are skipped. For each package that has a
    ``bin`` directory next to its module file, it:

    * registers the directory through ``os.add_dll_directory`` when that
      function exists, keeping the returned handle in ``_DLL_HANDLES``;
    * prepends the directory to ``PATH`` unless an entry that is equal,
      ignoring case, is already present.

    Per the original author's note, the motivation is that Paddle's Windows
    GPU wheels load CUDA/cuDNN DLLs by name, and wheel-provided DLL folders
    are not on ``PATH`` by default.
    """
    global _CONFIGURED
    if _CONFIGURED or os.name != "nt":
        return

    bin_dirs: list[Path] = []
    for pkg in ("nvidia.cudnn", "nvidia.cublas", "nvidia.cuda_nvrtc"):
        try:
            module = __import__(pkg, fromlist=["__file__"])
        except Exception:
            # Package not installed or broken: nothing to expose for it.
            continue
        origin = getattr(module, "__file__", None)
        if not origin:
            continue
        candidate = Path(origin).resolve().parent / "bin"
        if candidate.is_dir():
            bin_dirs.append(candidate)

    if bin_dirs:
        path_entries = os.environ.get("PATH", "").split(os.pathsep)
        known = {entry.casefold() for entry in path_entries if entry}
        register = getattr(os, "add_dll_directory", None)

        missing: list[str] = []
        for bin_dir in bin_dirs:
            as_text = str(bin_dir)
            if as_text.casefold() not in known:
                missing.append(as_text)
            if register is not None:
                # Store the handle in the module-level list so it stays referenced.
                _DLL_HANDLES.append(register(as_text))

        if missing:
            os.environ["PATH"] = os.pathsep.join(
                [*missing, os.environ.get("PATH", "")]
            )

    _CONFIGURED = True
|
||||||
@@ -15,8 +15,12 @@ from __future__ import annotations
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from celery import Celery
|
from celery import Celery
|
||||||
|
from celery.signals import worker_ready
|
||||||
|
|
||||||
from ocr_sprint.config import get_settings
|
from ocr_sprint.config import get_settings
|
||||||
|
from ocr_sprint.utils.logging import get_logger
|
||||||
|
|
||||||
|
_logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def build_celery_app() -> Celery:
|
def build_celery_app() -> Celery:
|
||||||
@@ -47,3 +51,32 @@ def build_celery_app() -> Celery:
|
|||||||
|
|
||||||
|
|
||||||
celery_app = build_celery_app()
|
celery_app = build_celery_app()
|
||||||
|
|
||||||
|
|
||||||
|
@worker_ready.connect
def preload_ocr_models(sender: object, **kwargs: object) -> None:
    """Load the OCR engines as soon as the Celery worker reports ready.

    PaddleOCR is always warmed up. PP-Structure is warmed up only when
    ``tables_enabled`` is set in the settings. A failure in either warm-up
    is logged as a warning and does not propagate out of this handler.

    NOTE(review): the original author's rationale is that with
    ``--pool=solo`` tasks run in the process that receives this signal, so
    the loaded models are reused by every task. Confirm against the
    deployment's worker pool configuration.

    Args:
        sender: Signal sender supplied by Celery (unused).
        **kwargs: Extra signal arguments supplied by Celery (unused).
    """
    # Imported here rather than at module level.
    from ocr_sprint.config import get_settings as _load_settings
    from ocr_sprint.pipeline import ocr as _ocr_module
    from ocr_sprint.pipeline import table as _table_module

    _logger.info("celery.worker.warmup.start")
    settings = _load_settings()

    try:
        _ocr_module.warmup()
    except Exception as ocr_error:
        _logger.warning(
            "celery.worker.paddleocr.warmup.failed", error=str(ocr_error)
        )

    if settings.tables_enabled:
        try:
            _table_module.warmup()
        except Exception as table_error:
            _logger.warning(
                "celery.worker.pp_structure.warmup.failed", error=str(table_error)
            )

    _logger.info("celery.worker.warmup.done")
|
||||||
|
|||||||
@@ -40,11 +40,19 @@ def _seed_approved_job_with_corrections(
|
|||||||
jid,
|
jid,
|
||||||
status=DocumentStatus.NEEDS_REVIEW,
|
status=DocumentStatus.NEEDS_REVIEW,
|
||||||
confidence=0.8,
|
confidence=0.8,
|
||||||
result=final_result
|
# ``is None`` (not truthiness) so callers can pass ``{}`` to
|
||||||
or {
|
# exercise the empty-dict edge case.
|
||||||
"header": {"nomor_sprint": "SPR/1/2025", "satuan_penerbit": "POLRES X"},
|
result=(
|
||||||
"personel": [{"pangkat": "AIPDA", "nrp": "77060000", "nama": "BUDI"}],
|
final_result
|
||||||
},
|
if final_result is not None
|
||||||
|
else {
|
||||||
|
"header": {
|
||||||
|
"nomor_sprint": "SPR/1/2025",
|
||||||
|
"satuan_penerbit": "POLRES X",
|
||||||
|
},
|
||||||
|
"personel": [{"pangkat": "AIPDA", "nrp": "77060000", "nama": "BUDI"}],
|
||||||
|
}
|
||||||
|
),
|
||||||
review_flags=[],
|
review_flags=[],
|
||||||
)
|
)
|
||||||
if corrections:
|
if corrections:
|
||||||
@@ -197,6 +205,19 @@ def test_stats_counts_rollup_and_top_fields(db_ready: None) -> None:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_dict_result_stays_consistent(db_ready: None) -> None:
|
||||||
|
"""An empty-dict result (``{}``) is logically a valid snapshot — it
|
||||||
|
must round-trip as ``{}`` on *both* ``initial_result`` and
|
||||||
|
``final_result``, not ``{}`` on one and ``None`` on the other.
|
||||||
|
"""
|
||||||
|
_seed_approved_job_with_corrections(final_result={})
|
||||||
|
with session_scope() as session:
|
||||||
|
samples = list(iter_ground_truth_samples(session, GroundTruthFilters()))
|
||||||
|
assert len(samples) == 1
|
||||||
|
assert samples[0].initial_result == {}
|
||||||
|
assert samples[0].final_result == {}
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_is_valid_jsonl(db_ready: None) -> None:
|
def test_serialize_is_valid_jsonl(db_ready: None) -> None:
|
||||||
_seed_approved_job_with_corrections(corrections=[("header.perihal", "X", None)])
|
_seed_approved_job_with_corrections(corrections=[("header.perihal", "X", None)])
|
||||||
with session_scope() as session:
|
with session_scope() as session:
|
||||||
|
|||||||
75
tests/unit/test_ocr_layout.py
Normal file
75
tests/unit/test_ocr_layout.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
"""Tests for OCR layout reordering.
|
||||||
|
|
||||||
|
PaddleOCR emits text boxes in detection order, not visual reading order.
|
||||||
|
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
|
||||||
|
interleaves columns within a row and breaks every downstream extractor
|
||||||
|
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
|
||||||
|
reading order from the bounding-box geometry.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
|
||||||
|
|
||||||
|
|
||||||
|
def _box(x: float, y: float, w: float = 30, h: float = 15):
|
||||||
|
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
|
||||||
|
|
||||||
|
|
||||||
|
def _make(text: str, x: float, y: float) -> OCRLine:
|
||||||
|
return OCRLine(text=text, confidence=1.0, box=_box(x, y))
|
||||||
|
|
||||||
|
|
||||||
|
class TestSortLinesByLayout:
|
||||||
|
def test_empty_returns_empty(self) -> None:
|
||||||
|
assert sort_lines_by_layout([]) == []
|
||||||
|
|
||||||
|
def test_already_sorted_is_stable(self) -> None:
|
||||||
|
lines = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
|
||||||
|
assert [ln.text for ln in sort_lines_by_layout(lines)] == ["A", "B", "C"]
|
||||||
|
|
||||||
|
def test_reorders_column_first_detection_to_row_first(self) -> None:
|
||||||
|
# Simulate a 2-row, 3-col table where Paddle returned cells
|
||||||
|
# column-first instead of row-first.
|
||||||
|
lines = [
|
||||||
|
_make("B1", 50, 10),
|
||||||
|
_make("B2", 50, 30),
|
||||||
|
_make("A1", 10, 10),
|
||||||
|
_make("A2", 10, 30),
|
||||||
|
_make("C1", 90, 10),
|
||||||
|
_make("C2", 90, 30),
|
||||||
|
]
|
||||||
|
result = [ln.text for ln in sort_lines_by_layout(lines)]
|
||||||
|
assert result == ["A1", "B1", "C1", "A2", "B2", "C2"]
|
||||||
|
|
||||||
|
def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
|
||||||
|
# Real OCR boxes for a single visual row are rarely perfectly
|
||||||
|
# y-aligned; we still want them grouped.
|
||||||
|
lines = [
|
||||||
|
_make("LEFT", 10, 10),
|
||||||
|
_make("MID", 50, 12), # 2px below LEFT — same row visually
|
||||||
|
_make("RIGHT", 90, 11),
|
||||||
|
]
|
||||||
|
result = [ln.text for ln in sort_lines_by_layout(lines)]
|
||||||
|
assert result == ["LEFT", "MID", "RIGHT"]
|
||||||
|
|
||||||
|
def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
|
||||||
|
# Lines with a y gap larger than ~½ line-height must NOT collapse
|
||||||
|
# into the same band.
|
||||||
|
lines = [
|
||||||
|
_make("ROW1A", 10, 10),
|
||||||
|
_make("ROW1B", 50, 10),
|
||||||
|
_make("ROW2A", 10, 30), # gap of 20 vs height 15 → new band
|
||||||
|
_make("ROW2B", 50, 30),
|
||||||
|
]
|
||||||
|
result = [ln.text for ln in sort_lines_by_layout(lines)]
|
||||||
|
assert result == ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]
|
||||||
|
|
||||||
|
def test_ocrpage_text_uses_sorted_order(self) -> None:
|
||||||
|
lines = [
|
||||||
|
_make("RIGHT", 90, 10),
|
||||||
|
_make("LEFT", 10, 10),
|
||||||
|
_make("BOTTOM", 10, 30),
|
||||||
|
]
|
||||||
|
page = OCRPage(lines=lines)
|
||||||
|
assert page.text == "LEFT\nRIGHT\nBOTTOM"
|
||||||
@@ -169,3 +169,92 @@ def test_orchestrator_marks_unavailable_when_llm_returns_none(
|
|||||||
out = run_pipeline(b"%PDF-1.4\n%fake")
|
out = run_pipeline(b"%PDF-1.4\n%fake")
|
||||||
assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
|
assert ReviewFlag.LLM_UNAVAILABLE in out.result.review_flags
|
||||||
assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
|
assert ReviewFlag.LLM_FALLBACK not in out.result.review_flags
|
||||||
|
|
||||||
|
|
||||||
|
def test_orchestrator_uses_text_fallback_when_pp_structure_yields_only_names(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
) -> None:
|
||||||
|
"""When PP-Structure produces low-quality rows (e.g. only ``nama`` filled),
|
||||||
|
the orchestrator must run the text fallback against the raw OCR text and
|
||||||
|
raise the ``personnel_text_fallback`` flag.
|
||||||
|
"""
|
||||||
|
monkeypatch.setenv("LLM_ENABLED", "false")
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
|
||||||
|
get_settings.cache_clear()
|
||||||
|
|
||||||
|
raw_text = (
|
||||||
|
"DAFTAR PERSONIL\n"
|
||||||
|
"1.\n"
|
||||||
|
"SRI WAHYUNI\n"
|
||||||
|
"AIPTU / 75070328\n"
|
||||||
|
"INTELKAM POLRES CIMAHI\n"
|
||||||
|
"2.\n"
|
||||||
|
"AGUNG LUKMAN\n"
|
||||||
|
"BRIPTU / 99030245\n"
|
||||||
|
"SAT INTELKAM\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# PP-Structure 'succeeded' but emitted name-only rows (the bug we saw on
|
||||||
|
# the real Polres Cimahi document).
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
pp_structure_low_quality = [
|
||||||
|
PersonnelEntry(nama="SRI WAHYUNI"),
|
||||||
|
PersonnelEntry(nama="AGUNG LUKMAN"),
|
||||||
|
]
|
||||||
|
_stub_pipeline_stages(
|
||||||
|
monkeypatch,
|
||||||
|
raw_text=raw_text,
|
||||||
|
regex_header=HeaderFields(
|
||||||
|
nomor_sprint="Sprin/1/I/2025",
|
||||||
|
tanggal=date(2025, 1, 1),
|
||||||
|
satuan_penerbit="Polres Cimahi",
|
||||||
|
perihal="ok",
|
||||||
|
dasar=["UU 2/2002"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
# Override extract_personnel to return the broken PP-Structure rows.
|
||||||
|
monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: pp_structure_low_quality)
|
||||||
|
|
||||||
|
out = run_pipeline(b"%PDF-1.4\n%fake")
|
||||||
|
assert ReviewFlag.PERSONNEL_TEXT_FALLBACK in out.result.review_flags
|
||||||
|
# Fallback rows must carry pangkat + nrp (the whole point of the path).
|
||||||
|
assert all(r.pangkat and r.nrp for r in out.result.personel)
|
||||||
|
assert {r.pangkat for r in out.result.personel} == {"AIPTU", "BRIPTU"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_orchestrator_keeps_pp_structure_rows_when_quality_is_high(
|
||||||
|
monkeypatch: pytest.MonkeyPatch,
|
||||||
|
) -> None:
|
||||||
|
"""Healthy PP-Structure output (rank+nrp present on most rows) must NOT
|
||||||
|
be replaced by the text fallback.
|
||||||
|
"""
|
||||||
|
monkeypatch.setenv("LLM_ENABLED", "false")
|
||||||
|
from ocr_sprint.config import get_settings
|
||||||
|
|
||||||
|
get_settings.cache_clear()
|
||||||
|
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
healthy = [
|
||||||
|
PersonnelEntry(pangkat="AIPTU", nrp="11111111", nama="A"),
|
||||||
|
PersonnelEntry(pangkat="BRIPTU", nrp="22222222", nama="B"),
|
||||||
|
PersonnelEntry(pangkat="BRIPDA", nrp="33333333", nama="C"),
|
||||||
|
]
|
||||||
|
_stub_pipeline_stages(
|
||||||
|
monkeypatch,
|
||||||
|
raw_text="ignored — should not be parsed",
|
||||||
|
regex_header=HeaderFields(
|
||||||
|
nomor_sprint="Sprin/1/I/2025",
|
||||||
|
tanggal=date(2025, 1, 1),
|
||||||
|
satuan_penerbit="Polres X",
|
||||||
|
perihal="ok",
|
||||||
|
dasar=["UU 2/2002"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(orch_module, "extract_personnel", lambda _t: healthy)
|
||||||
|
|
||||||
|
out = run_pipeline(b"%PDF-1.4\n%fake")
|
||||||
|
assert ReviewFlag.PERSONNEL_TEXT_FALLBACK not in out.result.review_flags
|
||||||
|
assert [r.nrp for r in out.result.personel] == ["11111111", "22222222", "33333333"]
|
||||||
|
|||||||
324
tests/unit/test_personnel_text_fallback.py
Normal file
324
tests/unit/test_personnel_text_fallback.py
Normal file
@@ -0,0 +1,324 @@
|
|||||||
|
"""Tests for the text-based personnel fallback extractor.
|
||||||
|
|
||||||
|
Driven by the real Polres Cimahi sprint document where PP-Structure
|
||||||
|
produced 24 rows with only ``nama`` populated. The fallback should
|
||||||
|
recover at least the rank + NRP for every row.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||||
|
extract_personnel_from_ocr_lines,
|
||||||
|
extract_personnel_from_text,
|
||||||
|
is_low_quality,
|
||||||
|
)
|
||||||
|
from ocr_sprint.pipeline.ocr import OCRLine
|
||||||
|
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
|
||||||
|
box = ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
|
||||||
|
return OCRLine(text=text, confidence=1.0, box=box)
|
||||||
|
|
||||||
|
_CIMAHI_FIXTURE = """\
|
||||||
|
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
|
||||||
|
NO
|
||||||
|
NAMA
|
||||||
|
PANGKAT / NRP
|
||||||
|
JABATAN
|
||||||
|
KET
|
||||||
|
BAUR SKCK SAT
|
||||||
|
1.
|
||||||
|
SRI WAHYUNI
|
||||||
|
AIPTU / 75070328
|
||||||
|
INTELKAM POLRES
|
||||||
|
CIMAHI
|
||||||
|
BA PELAKSANA SKCK
|
||||||
|
2.
|
||||||
|
CITRA DWI PUTRI R
|
||||||
|
BRIPTU / 95070659
|
||||||
|
SAT INTELKAM
|
||||||
|
POLRES CIMAHI
|
||||||
|
BA PELAKSANA SKCK
|
||||||
|
3.
|
||||||
|
AGUNG LUKMAN AL
|
||||||
|
BRIPTU / 99030245
|
||||||
|
SAT INTELKAM
|
||||||
|
POLRES CIMAHI
|
||||||
|
BA POLSEK
|
||||||
|
8.
|
||||||
|
ARIEF SYAHRUL ZAMAN
|
||||||
|
BRIGPOL /96030446
|
||||||
|
MARGAASIH
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractPersonnelFromText:
|
||||||
|
def test_extracts_rank_nrp_and_name(self) -> None:
|
||||||
|
rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
|
||||||
|
assert len(rows) == 4
|
||||||
|
first = rows[0]
|
||||||
|
assert first.pangkat == "AIPTU"
|
||||||
|
assert first.nrp == "75070328"
|
||||||
|
assert first.nama == "SRI WAHYUNI"
|
||||||
|
|
||||||
|
def test_normalizes_brigpol_to_brigadir(self) -> None:
|
||||||
|
rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
|
||||||
|
last = rows[-1]
|
||||||
|
# 'BRIGPOL' (no space) must canonicalize to 'BRIGADIR'.
|
||||||
|
assert last.pangkat == "BRIGADIR"
|
||||||
|
assert last.nrp == "96030446"
|
||||||
|
assert last.nama == "ARIEF SYAHRUL ZAMAN"
|
||||||
|
|
||||||
|
def test_skips_header_lines_as_names(self) -> None:
|
||||||
|
# No row should ever have a column-header word as nama.
|
||||||
|
rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
|
||||||
|
names = [r.nama for r in rows]
|
||||||
|
for blocked in {"NAMA", "PANGKAT", "JABATAN", "KET", "DAFTAR"}:
|
||||||
|
assert blocked not in names
|
||||||
|
|
||||||
|
def test_jabatan_collected_from_following_lines(self) -> None:
|
||||||
|
rows = extract_personnel_from_text(_CIMAHI_FIXTURE)
|
||||||
|
assert rows[0].jabatan_dinas is not None
|
||||||
|
assert "INTELKAM" in rows[0].jabatan_dinas
|
||||||
|
|
||||||
|
def test_empty_text_returns_empty(self) -> None:
|
||||||
|
assert extract_personnel_from_text("") == []
|
||||||
|
|
||||||
|
def test_text_without_rank_nrp_pattern_returns_empty(self) -> None:
|
||||||
|
text = "Just a paragraph with no rank or NRP at all.\nAnother line."
|
||||||
|
assert extract_personnel_from_text(text) == []
|
||||||
|
|
||||||
|
def test_ignores_isolated_8digit_number_without_rank(self) -> None:
|
||||||
|
# NRP without a recognised rank token must not produce a row.
|
||||||
|
text = "Some line\n12345678\nanother line"
|
||||||
|
assert extract_personnel_from_text(text) == []
|
||||||
|
|
||||||
|
def test_rejects_unknown_rank_with_8digit_number(self) -> None:
|
||||||
|
# A "rank-shaped" word that isn't in the master list must not yield a row.
|
||||||
|
text = "Some line\nFAKERANK / 12345678\nanother line"
|
||||||
|
assert extract_personnel_from_text(text) == []
|
||||||
|
|
||||||
|
def test_does_not_drop_indonesian_names_starting_with_no_or_ket(self) -> None:
|
||||||
|
# Regression: 'NO' / 'KET' are legitimate column header tokens but
|
||||||
|
# also prefix common Indonesian names (KETUT, NOVA, NOOR). The
|
||||||
|
# blocklist must use word boundaries, not a raw startswith check.
|
||||||
|
text = (
|
||||||
|
"DAFTAR PERSONIL\n"
|
||||||
|
"1.\n"
|
||||||
|
"KETUT WARDANA\n"
|
||||||
|
"AIPTU / 11111111\n"
|
||||||
|
"JABATAN A\n"
|
||||||
|
"2.\n"
|
||||||
|
"NOVA SARI\n"
|
||||||
|
"BRIPTU / 22222222\n"
|
||||||
|
"JABATAN B\n"
|
||||||
|
"3.\n"
|
||||||
|
"NOOR HIDAYAT\n"
|
||||||
|
"BRIPDA / 33333333\n"
|
||||||
|
"JABATAN C\n"
|
||||||
|
)
|
||||||
|
rows = extract_personnel_from_text(text)
|
||||||
|
names = [r.nama for r in rows]
|
||||||
|
assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]
|
||||||
|
|
||||||
|
def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
|
||||||
|
# Polres Banjar regression: when PaddleOCR merges several table
|
||||||
|
# rows onto a single OCR line, every rank+NRP pair on that line
|
||||||
|
# must still produce a separate row. Previously per-line
|
||||||
|
# ``re.search`` returned only the first match.
|
||||||
|
text = (
|
||||||
|
"DAFTAR NAMA INSTRUKTUR\n"
|
||||||
|
"1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
|
||||||
|
"INSTRUKTUR LAT PRA OPS "
|
||||||
|
"HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
|
||||||
|
"KASAT RESKRIM SDA "
|
||||||
|
"YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
|
||||||
|
"KASATINTELKAM POLRES BANJAR SDA\n"
|
||||||
|
)
|
||||||
|
rows = extract_personnel_from_text(text)
|
||||||
|
assert len(rows) == 3
|
||||||
|
assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
|
||||||
|
assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]
|
||||||
|
assert rows[0].nama == "CUCU JUHANA, A.K.S."
|
||||||
|
assert rows[1].nama is not None and "HERU SAMSUL BAHRI" in rows[1].nama
|
||||||
|
assert rows[2].nama is not None and "YAYAN SOPIANA" in rows[2].nama
|
||||||
|
|
||||||
|
def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
|
||||||
|
# Variant of the squished case where OCR produces one line per
|
||||||
|
# table row, so no two rank+NRP pairs ever end up on the same
|
||||||
|
# line. Verifies that the finditer-based path doesn't regress
|
||||||
|
# this layout.
|
||||||
|
text = (
|
||||||
|
"1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n"
|
||||||
|
"INSTRUKTUR LAT PRA OPS\n"
|
||||||
|
"HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n"
|
||||||
|
"SDA\n"
|
||||||
|
"YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n"
|
||||||
|
"POLRES BANJAR SDA\n"
|
||||||
|
)
|
||||||
|
rows = extract_personnel_from_text(text)
|
||||||
|
assert [r.pangkat for r in rows] == ["KOMPOL", "AKP", "AKP"]
|
||||||
|
assert [r.nrp for r in rows] == ["70100418", "77020049", "84011113"]
|
||||||
|
assert rows[0].nama == "CUCU JUHANA, A.K.S."
|
||||||
|
|
||||||
|
def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
|
||||||
|
# Polda Kalbar Akpol-panitia regression: sprint formats without
|
||||||
|
# an NRP column (panitia, undangan templates) must still extract
|
||||||
|
# rows via the rank-only Pass 3 path. Names span multiple OCR
|
||||||
|
# lines (narrow column), and the multi-token rank "KOMBES POL"
|
||||||
|
# is split across two lines.
|
||||||
|
text = (
|
||||||
|
"DAFTAR NAMA PANITIA\n"
|
||||||
|
"NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
|
||||||
|
"1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
|
||||||
|
"KOMBES\nPOL\n"
|
||||||
|
"KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
|
||||||
|
"2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
|
||||||
|
"AKBP\n"
|
||||||
|
"KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
|
||||||
|
"3\nPRAYITNO, S.H.,\nM.H.\n"
|
||||||
|
"KOMPOL\n"
|
||||||
|
"KASUBBAG DIAPERS\nANGGOTA\n"
|
||||||
|
)
|
||||||
|
rows = extract_personnel_from_text(text)
|
||||||
|
assert len(rows) == 3
|
||||||
|
assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "KOMPOL"]
|
||||||
|
# All Pass 3 rows have nrp=None by design.
|
||||||
|
assert all(r.nrp is None for r in rows)
|
||||||
|
assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
|
||||||
|
assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
|
||||||
|
assert rows[2].nama == "PRAYITNO, S.H., M.H."
|
||||||
|
assert rows[0].jabatan_dinas is not None and "KARO SDM" in rows[0].jabatan_dinas
|
||||||
|
|
||||||
|
def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
|
||||||
|
# If a sprint has NRPs (Pass 1 succeeds), Pass 3 must not fire
|
||||||
|
# and produce duplicate/contaminating rows.
|
||||||
|
text = (
|
||||||
|
"1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
|
||||||
|
"2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
|
||||||
|
)
|
||||||
|
rows = extract_personnel_from_text(text)
|
||||||
|
assert len(rows) == 2
|
||||||
|
assert all(r.nrp is not None for r in rows)
|
||||||
|
|
||||||
|
def test_still_blocks_bare_column_header_tokens(self) -> None:
|
||||||
|
# Word-boundary fix must still reject the actual column-header
|
||||||
|
# rows that motivated the blocklist in the first place.
|
||||||
|
text = "NO\nNAMA\nPANGKAT / NRP\nJABATAN\nKET\n1.\nREAL NAME\nAIPTU / 12345678\n"
|
||||||
|
rows = extract_personnel_from_text(text)
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0].nama == "REAL NAME"
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractPersonnelFromOcrLines:
|
||||||
|
"""Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.
|
||||||
|
|
||||||
|
Verifies that bounding-box geometry preserves column boundaries on
|
||||||
|
dense tables where text-only Pass 3 bleeds adjacent columns into
|
||||||
|
nama/jabatan.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _kalbar_lines(self) -> list[OCRLine]:
|
||||||
|
# Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN
|
||||||
|
# X columns: 10, 100, 250, 380, 520. Each row may have multi-line cells.
|
||||||
|
return [
|
||||||
|
# Row 1 — KOMBES POL spans two stacked OCR boxes
|
||||||
|
_ocr_line("1", 10, 100),
|
||||||
|
_ocr_line("F. GUNTUR", 100, 100),
|
||||||
|
_ocr_line("SUNOTO, S.I.K.,", 100, 120),
|
||||||
|
_ocr_line("M.H.", 100, 140),
|
||||||
|
_ocr_line("KOMBES", 250, 100),
|
||||||
|
_ocr_line("POL", 250, 120),
|
||||||
|
_ocr_line("KARO SDM", 380, 100),
|
||||||
|
_ocr_line("POLDA KALBAR", 380, 120),
|
||||||
|
_ocr_line("KETUA", 520, 100),
|
||||||
|
_ocr_line("PELAKSANA", 520, 120),
|
||||||
|
# Row 2
|
||||||
|
_ocr_line("2", 10, 200),
|
||||||
|
_ocr_line("JUDA TRISNO", 100, 200),
|
||||||
|
_ocr_line("TAMPUBOLON,", 100, 220),
|
||||||
|
_ocr_line("S.H., S.I.K., M.H.", 100, 240),
|
||||||
|
_ocr_line("AKBP", 250, 200),
|
||||||
|
_ocr_line("KABAGDALPERS", 380, 200),
|
||||||
|
_ocr_line("RO SDM", 380, 220),
|
||||||
|
_ocr_line("POLDA KALBAR", 380, 240),
|
||||||
|
_ocr_line("SEKRETARIS", 520, 200),
|
||||||
|
# Row 9 — PNS PENATA TK I (multi-token rank stacked)
|
||||||
|
_ocr_line("9", 10, 500),
|
||||||
|
_ocr_line("FITRIANSYAH,", 100, 500),
|
||||||
|
_ocr_line("S.E.", 100, 520),
|
||||||
|
_ocr_line("PENATA", 250, 500),
|
||||||
|
_ocr_line("TK I", 250, 520),
|
||||||
|
_ocr_line("KAURKEU", 380, 500),
|
||||||
|
_ocr_line("RO SDM", 380, 520),
|
||||||
|
_ocr_line("POLDA KALBAR", 380, 540),
|
||||||
|
_ocr_line("BENDAHARA", 520, 500),
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_extracts_three_rows(self) -> None:
|
||||||
|
rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
|
||||||
|
assert len(rows) == 3
|
||||||
|
assert [r.pangkat for r in rows] == ["KOMBES POL", "AKBP", "PENATA TK I"]
|
||||||
|
|
||||||
|
def test_nama_is_assembled_only_from_nama_column(self) -> None:
|
||||||
|
# Each row's nama must contain *all* its multi-line fragments
|
||||||
|
# and *only* its multi-line fragments — no bleed from struktural.
|
||||||
|
rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
|
||||||
|
assert rows[0].nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
|
||||||
|
assert rows[1].nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
|
||||||
|
assert rows[2].nama == "FITRIANSYAH, S.E."
|
||||||
|
|
||||||
|
def test_jabatan_split_into_struktural_and_sprint(self) -> None:
|
||||||
|
# The geometric column boundary must split STRUKTURAL (jabatan_dinas)
|
||||||
|
# from DALAM SPRIN (jabatan_sprint).
|
||||||
|
rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
|
||||||
|
assert rows[0].jabatan_dinas == "KARO SDM POLDA KALBAR"
|
||||||
|
assert rows[0].jabatan_sprint == "KETUA PELAKSANA"
|
||||||
|
assert rows[1].jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
|
||||||
|
assert rows[1].jabatan_sprint == "SEKRETARIS"
|
||||||
|
|
||||||
|
def test_returns_empty_when_no_rank_anchors(self) -> None:
|
||||||
|
lines = [
|
||||||
|
_ocr_line("DAFTAR NAMA", 100, 50),
|
||||||
|
_ocr_line("HEADER", 100, 100),
|
||||||
|
]
|
||||||
|
assert extract_personnel_from_ocr_lines(lines) == []
|
||||||
|
|
||||||
|
def test_returns_empty_for_empty_input(self) -> None:
|
||||||
|
assert extract_personnel_from_ocr_lines([]) == []
|
||||||
|
|
||||||
|
def test_no_row_bleed_between_consecutive_rows(self) -> None:
|
||||||
|
# Row 1's trailing name fragments ("SUNOTO, S.I.K.,", "M.H.") sit
|
||||||
|
# BELOW the line where its rank starts but inside row 1's visual
|
||||||
|
# span. They must NOT leak into row 2's nama, which should start with "JUDA TRISNO".
|
||||||
|
rows = extract_personnel_from_ocr_lines(self._kalbar_lines())
|
||||||
|
assert rows[1].nama is not None
|
||||||
|
assert rows[1].nama.startswith("JUDA TRISNO")
|
||||||
|
assert "GUNTUR" not in rows[1].nama
|
||||||
|
assert "SUNOTO" not in rows[1].nama
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsLowQuality:
|
||||||
|
def test_empty_list_is_low_quality(self) -> None:
|
||||||
|
assert is_low_quality([]) is True
|
||||||
|
|
||||||
|
def test_all_rows_with_only_name_is_low_quality(self) -> None:
|
||||||
|
rows = [PersonnelEntry(nama=f"NAMA {i}") for i in range(10)]
|
||||||
|
assert is_low_quality(rows) is True
|
||||||
|
|
||||||
|
def test_majority_with_rank_nrp_is_high_quality(self) -> None:
|
||||||
|
rows = [
|
||||||
|
PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
|
||||||
|
for i in range(10)
|
||||||
|
]
|
||||||
|
assert is_low_quality(rows) is False
|
||||||
|
|
||||||
|
def test_borderline_30_percent_threshold(self) -> None:
|
||||||
|
# 3 useful out of 10 = exactly 0.3, treated as not-low-quality.
|
||||||
|
useful = [
|
||||||
|
PersonnelEntry(nama=f"NAMA {i}", pangkat="AIPTU", nrp=f"{10000000 + i:08d}")
|
||||||
|
for i in range(3)
|
||||||
|
]
|
||||||
|
useless = [PersonnelEntry(nama=f"NAMA {i + 3}") for i in range(7)]
|
||||||
|
assert is_low_quality(useful + useless) is False
|
||||||
@@ -14,6 +14,7 @@ from ocr_sprint.pipeline.extract.regex_rules import (
|
|||||||
find_satuan,
|
find_satuan,
|
||||||
find_signatory,
|
find_signatory,
|
||||||
find_tanggal,
|
find_tanggal,
|
||||||
|
find_untuk_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -60,6 +61,36 @@ class TestSatuan:
|
|||||||
result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
|
result = find_satuan("KEPOLISIAN NEGARA REPUBLIK INDONESIA")
|
||||||
assert result is not None
|
assert result is not None
|
||||||
|
|
||||||
|
def test_prefers_resor_over_negara_when_both_present(self) -> None:
|
||||||
|
# The Polri letterhead lists units hierarchically; the issuing unit
|
||||||
|
# is the deepest level, not the topmost generic "NEGARA" line.
|
||||||
|
text = (
|
||||||
|
"KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
|
||||||
|
"DAERAH JAWA BARAT\n"
|
||||||
|
"RESOR CIMAHI\n"
|
||||||
|
"SURAT PERINTAH\n"
|
||||||
|
)
|
||||||
|
result = find_satuan(text)
|
||||||
|
assert result == "KEPOLISIAN RESOR CIMAHI"
|
||||||
|
|
||||||
|
def test_prefers_sektor_over_resor(self) -> None:
|
||||||
|
text = (
|
||||||
|
"KEPOLISIAN NEGARA REPUBLIK INDONESIA\n"
|
||||||
|
"DAERAH JAWA BARAT\n"
|
||||||
|
"RESOR CIMAHI\n"
|
||||||
|
"SEKTOR PADALARANG\n"
|
||||||
|
)
|
||||||
|
result = find_satuan(text)
|
||||||
|
assert result == "KEPOLISIAN SEKTOR PADALARANG"
|
||||||
|
|
||||||
|
def test_handles_daerah_only(self) -> None:
|
||||||
|
text = "KEPOLISIAN NEGARA REPUBLIK INDONESIA\nDAERAH JAWA BARAT\n"
|
||||||
|
result = find_satuan(text)
|
||||||
|
assert result == "KEPOLISIAN DAERAH JAWA BARAT"
|
||||||
|
|
||||||
|
def test_returns_none_when_no_letterhead(self) -> None:
|
||||||
|
assert find_satuan("no police letterhead here") is None
|
||||||
|
|
||||||
|
|
||||||
class TestPerihal:
|
class TestPerihal:
|
||||||
def test_extracts_perihal_line(self) -> None:
|
def test_extracts_perihal_line(self) -> None:
|
||||||
@@ -69,6 +100,25 @@ class TestPerihal:
|
|||||||
def test_returns_none_when_absent(self) -> None:
|
def test_returns_none_when_absent(self) -> None:
|
||||||
assert find_perihal("no perihal field") is None
|
assert find_perihal("no perihal field") is None
|
||||||
|
|
||||||
|
def test_falls_back_to_pertimbangan_block(self) -> None:
|
||||||
|
# Many Polres-level sprints use "Pertimbangan" instead of "Perihal".
|
||||||
|
# The fallback should pick up the first non-empty line under it.
|
||||||
|
text = (
|
||||||
|
"Pertimbangan\n"
|
||||||
|
"Bahwa dalam rangka mendukung kepentingan Dinas Polres Cimahi.\n"
|
||||||
|
"DASAR :\n"
|
||||||
|
"1. ...\n"
|
||||||
|
)
|
||||||
|
result = find_perihal(text)
|
||||||
|
assert result is not None
|
||||||
|
assert result.startswith("Bahwa dalam rangka mendukung")
|
||||||
|
|
||||||
|
def test_perihal_wins_over_pertimbangan_when_both_present(self) -> None:
|
||||||
|
# If the document has both a Perihal label AND a Pertimbangan
|
||||||
|
# paragraph, the explicit Perihal wins.
|
||||||
|
text = "Pertimbangan\nSome pertimbangan content.\nPERIHAL : The actual perihal.\n"
|
||||||
|
assert find_perihal(text) == "The actual perihal."
|
||||||
|
|
||||||
|
|
||||||
class TestDasar:
|
class TestDasar:
|
||||||
def test_numbered_list(self) -> None:
|
def test_numbered_list(self) -> None:
|
||||||
@@ -88,6 +138,57 @@ class TestDasar:
|
|||||||
def test_empty_when_section_missing(self) -> None:
|
def test_empty_when_section_missing(self) -> None:
|
||||||
assert find_dasar_list("no dasar section") == []
|
assert find_dasar_list("no dasar section") == []
|
||||||
|
|
||||||
|
def test_handles_bare_number_lines_split_by_ocr(self) -> None:
|
||||||
|
# OCR sometimes places the number marker on its own line and the
|
||||||
|
# body on the next non-empty line. The collector must merge them
|
||||||
|
# rather than dropping the body or appending it to the previous
|
||||||
|
# item (which the old implementation did).
|
||||||
|
text = (
|
||||||
|
"Dasar\n"
|
||||||
|
":\n"
|
||||||
|
"1.\n"
|
||||||
|
" Undang - Undang Nomor 2 tahun 2002 tentang Kepolisian;\n"
|
||||||
|
"2. Peraturan Pemerintah Republik Indonesia No. 76 tahun 2020;\n"
|
||||||
|
"3.\n"
|
||||||
|
"Keterangan Catatan Kepolisian (SKCK);\n"
|
||||||
|
"4.\n"
|
||||||
|
"Pelayanan dilingkungan Badan Intelijen Keamanan Polri.\n"
|
||||||
|
"5. DIPA Petikan Satker Polres Cimahi.\n"
|
||||||
|
"DIPERINTAHKAN\n"
|
||||||
|
)
|
||||||
|
items = find_dasar_list(text)
|
||||||
|
assert len(items) == 5
|
||||||
|
assert items[0].startswith("Undang - Undang")
|
||||||
|
assert items[2].startswith("Keterangan Catatan")
|
||||||
|
assert items[3].startswith("Pelayanan dilingkungan")
|
||||||
|
assert items[4].startswith("DIPA")
|
||||||
|
|
||||||
|
|
||||||
|
class TestUntuk:
|
||||||
|
def test_extracts_numbered_untuk_bullets(self) -> None:
|
||||||
|
text = (
|
||||||
|
"DIPERINTAHKAN\n"
|
||||||
|
"Kepada\n"
|
||||||
|
"Untuk\n"
|
||||||
|
"1.\n"
|
||||||
|
"melaksanakan tugas A;\n"
|
||||||
|
"2.\n"
|
||||||
|
"melaksanakan tugas B;\n"
|
||||||
|
"Selesai.\n"
|
||||||
|
)
|
||||||
|
items = find_untuk_list(text)
|
||||||
|
assert len(items) == 2
|
||||||
|
assert items[0] == "melaksanakan tugas A;"
|
||||||
|
assert items[1] == "melaksanakan tugas B;"
|
||||||
|
|
||||||
|
def test_returns_empty_when_section_missing(self) -> None:
|
||||||
|
assert find_untuk_list("no untuk section") == []
|
||||||
|
|
||||||
|
def test_stops_at_dikeluarkan(self) -> None:
|
||||||
|
text = "Untuk\n1. tugas A;\nDikeluarkan di Cimahi\n2. should not be captured\n"
|
||||||
|
items = find_untuk_list(text)
|
||||||
|
assert items == ["tugas A;"]
|
||||||
|
|
||||||
|
|
||||||
class TestSignatory:
|
class TestSignatory:
|
||||||
def test_extracts_last_nrp(self) -> None:
|
def test_extracts_last_nrp(self) -> None:
|
||||||
|
|||||||
@@ -2,8 +2,12 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from types import ModuleType, SimpleNamespace
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from ocr_sprint.pipeline import table as table_module
|
||||||
from ocr_sprint.pipeline.table import (
|
from ocr_sprint.pipeline.table import (
|
||||||
DetectedTable,
|
DetectedTable,
|
||||||
extract_tables_from_pp_result,
|
extract_tables_from_pp_result,
|
||||||
@@ -82,6 +86,34 @@ class TestDetectedTable:
|
|||||||
assert table.n_cols == 0
|
assert table.n_cols == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestPpStructureInit:
    """Construction of the PP-Structure engine when GPU start-up fails."""

    def test_gpu_init_falls_back_to_cpu(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """A failing GPU constructor call is followed by a CPU one."""
        seen_kwargs: list[dict[str, object]] = []

        class FakePPStructure:
            # Records each constructor call and refuses to start in GPU mode.
            def __init__(self, **kwargs: object) -> None:
                seen_kwargs.append(kwargs)
                if kwargs["use_gpu"]:
                    raise RuntimeError("gpu init failed")

        def fake_settings() -> SimpleNamespace:
            # Settings stand-in that asks for GPU OCR.
            return SimpleNamespace(ocr_lang="latin", ocr_use_gpu=True)

        # Replace the real paddleocr module so importing it yields the fake class.
        stub_module = ModuleType("paddleocr")
        setattr(stub_module, "PPStructure", FakePPStructure)
        monkeypatch.setitem(sys.modules, "paddleocr", stub_module)
        monkeypatch.setattr(table_module, "get_settings", fake_settings)

        built = table_module._build_pp_structure()

        assert isinstance(built, FakePPStructure)
        # Both attempts share every argument except the GPU switch.
        shared = {"lang": "en", "layout": True, "show_log": False}
        assert seen_kwargs == [
            {**shared, "use_gpu": True},
            {**shared, "use_gpu": False},
        ]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def sample_personnel_table() -> DetectedTable:
|
def sample_personnel_table() -> DetectedTable:
|
||||||
"""Header + three personnel rows in a typical Polres-level format."""
|
"""Header + three personnel rows in a typical Polres-level format."""
|
||||||
|
|||||||
@@ -62,6 +62,20 @@ class TestPersonnelValidator:
|
|||||||
entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
|
entry = PersonnelEntry(pangkat="Sersan Mayor", nrp="12345678", nama="Test")
|
||||||
assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
|
assert ReviewFlag.UNKNOWN_PANGKAT in validate_personnel_entry(entry)
|
||||||
|
|
||||||
|
def test_row_with_only_name_is_flagged_incomplete(self) -> None:
    """An entry holding only ``nama`` gets INCOMPLETE_PERSONNEL_ROW."""
    # With neither pangkat nor nrp, a lone name is the signature of
    # mis-aligned table extraction; the flag lets the operator route the
    # document to needs_review.
    name_only = PersonnelEntry(nama="LEAKED FROM SOMEWHERE")
    assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW in validate_personnel_entry(name_only)
|
||||||
|
|
||||||
|
def test_row_with_only_pangkat_is_not_flagged_incomplete(self) -> None:
    """An entry with pangkat but no nrp is not flagged as incomplete."""
    # A missing NRP is suboptimal, but the rank still identifies the row,
    # so the structural-incompleteness flag must stay off.
    ranked = PersonnelEntry(pangkat="AKP", nama="Test")
    raised = validate_personnel_entry(ranked)
    assert ReviewFlag.INCOMPLETE_PERSONNEL_ROW not in raised
|
||||||
|
|
||||||
|
|
||||||
class TestHeaderValidator:
|
class TestHeaderValidator:
|
||||||
def test_complete_header_no_flags(self) -> None:
|
def test_complete_header_no_flags(self) -> None:
|
||||||
|
|||||||
214
update.ps1
Normal file
214
update.ps1
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
#!/usr/bin/env pwsh
|
||||||
|
# update.ps1 - One-command update & restart for ocr-sprint-service (local dev)
|
||||||
|
|
||||||
|
# Parameters:
#   -OcrMode  Optional; "cpu" or "gpu". Only acted on when explicitly passed.
#   -Port     Optional; TCP port that is freed and then used for the dev
#             server. Defaults to 8000, the value that used to be hard-coded.
param(
    [ValidateSet("cpu", "gpu")]
    [string] $OcrMode,

    [ValidateRange(1, 65535)]
    [int] $Port = 8000
)

# Turn PowerShell errors into terminating errors so the script stops early.
$ErrorActionPreference = "Stop"

# All paths are derived from the folder that contains this script.
$ProjectRoot = $PSScriptRoot
$VenvDir = Join-Path $ProjectRoot ".venv"
$Python = Join-Path $VenvDir "Scripts\python.exe"
|
||||||
|
|
||||||
|
function Invoke-Step {
    # Runs one step of the update and aborts the script if it fails.
    #
    # Parameters:
    #   Command         Script block to execute (normally one native command).
    #   FailureMessage  Text printed in red when the step ends non-zero.
    #
    # On failure the script exits with the failing exit code.
    param(
        [Parameter(Mandatory = $true)]
        [scriptblock] $Command,
        [Parameter(Mandatory = $true)]
        [string] $FailureMessage
    )

    # $LASTEXITCODE is only refreshed by native commands. Clear it first so a
    # non-zero value left behind by an earlier command cannot fail a step
    # whose script block never launches a process.
    $global:LASTEXITCODE = 0

    & $Command
    if ($LASTEXITCODE -ne 0) {
        Write-Host " $FailureMessage" -ForegroundColor Red
        exit $LASTEXITCODE
    }
}
|
||||||
|
|
||||||
|
function Get-DotEnvValue {
    # Looks up a setting, preferring the project's .env file over the
    # process environment.
    #
    # Parameters:
    #   Name  Variable name to look up.
    #
    # Returns the value of the last matching "Name=..." line in
    # $ProjectRoot\.env. A value wrapped in single or double quotes is
    # returned without the quotes; otherwise a trailing " # comment" is
    # removed and the rest is trimmed. When the file or the key is missing,
    # the process environment variable is returned (possibly $null).
    param(
        [Parameter(Mandatory = $true)]
        [string] $Name
    )

    $envFile = Join-Path $ProjectRoot ".env"
    if (Test-Path $envFile) {
        # Escape the name so regex metacharacters in it are matched literally.
        $assignment = "^\s*$([regex]::Escape($Name))\s*="
        $line = Get-Content $envFile | Where-Object { $_ -match $assignment } | Select-Object -Last 1
        if ($line) {
            $raw = ($line -split "=", 2)[1]
            $trimmed = $raw.Trim()
            # KEY="value" / KEY='value': hand back the inner text, otherwise
            # checks such as the "true"/"1" comparison see the quote marks.
            if ($trimmed -match '^"([^"]*)"' -or $trimmed -match "^'([^']*)'") {
                return $Matches[1]
            }
            return ($raw -split "\s+#", 2)[0].Trim()
        }
    }
    return [Environment]::GetEnvironmentVariable($Name)
}
|
||||||
|
|
||||||
|
function Set-DotEnvValue {
    # Writes "Name=Value" into $ProjectRoot\.env, creating the file if needed.
    #
    # Parameters:
    #   Name   Variable name to set.
    #   Value  New value, written verbatim.
    #
    # Every existing "Name=..." line is rewritten in place and keeps its
    # trailing " # comment". If no such line exists, one is appended.
    param(
        [Parameter(Mandatory = $true)]
        [string] $Name,
        [Parameter(Mandatory = $true)]
        [string] $Value
    )

    $dotEnvPath = Join-Path $ProjectRoot ".env"
    if (-not (Test-Path $dotEnvPath)) {
        New-Item -Path $dotEnvPath -ItemType File | Out-Null
    }

    $assignment = "^\s*$Name\s*="
    $found = $false
    $rewritten = @(
        foreach ($entry in @(Get-Content $dotEnvPath)) {
            if ($entry -notmatch $assignment) {
                # Unrelated line: pass through untouched.
                $entry
                continue
            }
            $found = $true
            $trailing = if ($entry -match "(\s+#.*)$") { $Matches[1] } else { "" }
            "$Name=$Value$trailing"
        }
    )
    if (-not $found) {
        $rewritten += "$Name=$Value"
    }
    Set-Content -Path $dotEnvPath -Value $rewritten
}
|
||||||
|
|
||||||
|
function Test-PythonPackage {
    # Reports whether a package is installed in the project virtualenv.
    #
    # Parameters:
    #   Name  Distribution name handed to "pip show".
    #
    # Returns $true when "pip show" exits with code 0, otherwise $false.
    param(
        [Parameter(Mandatory = $true)]
        [string] $Name
    )

    # "pip show" reports a missing package on stderr. Under Windows
    # PowerShell, redirected native stderr combined with the script-wide
    # "Stop" preference can surface as a terminating error, which would abort
    # the script instead of returning $false. Relax the preference for this
    # function only; the assignment is local and ends with the function.
    $ErrorActionPreference = "Continue"

    & $Python -m pip show $Name *> $null
    return $LASTEXITCODE -eq 0
}
|
||||||
|
|
||||||
|
function Add-NvidiaDllPaths {
    # Prepends the NVIDIA DLL folders found inside the virtualenv to PATH for
    # the current process. Folders that do not exist, or that are already
    # listed in PATH, are left alone.
    $relativeDirs = @(
        "Lib\site-packages\nvidia\cudnn\bin",
        "Lib\site-packages\nvidia\cublas\bin",
        "Lib\site-packages\nvidia\cuda_nvrtc\bin"
    )

    foreach ($relative in $relativeDirs) {
        $candidate = Join-Path $VenvDir $relative
        if (-not (Test-Path $candidate)) {
            continue
        }
        if (($env:PATH -split ";") -contains $candidate) {
            continue
        }
        $env:PATH = "$candidate;$env:PATH"
    }
}
|
||||||
|
|
||||||
|
Set-Location $ProjectRoot

# Create the virtualenv on first run: try the "py" launcher with the
# supported versions (newest first), then fall back to "python" on PATH.
if (-not (Test-Path $Python)) {
    Write-Host "Virtualenv not found at $VenvDir. Creating one..." -ForegroundColor Yellow
    $venvCreated = $false
    $pythonLauncher = Get-Command py -ErrorAction SilentlyContinue
    if ($pythonLauncher) {
        # The launcher complains on stderr when a requested version is not
        # installed. Under Windows PowerShell, redirected native stderr
        # combined with the "Stop" preference can surface as a terminating
        # error, which would end the script before the next version is tried.
        # Relax the preference only while probing.
        $savedPreference = $ErrorActionPreference
        $ErrorActionPreference = "Continue"
        try {
            foreach ($version in @("3.12", "3.11", "3.10")) {
                & py "-$version" -m venv $VenvDir 2>$null
                if ($LASTEXITCODE -eq 0) {
                    $venvCreated = $true
                    break
                }
            }
        } finally {
            $ErrorActionPreference = $savedPreference
        }
    }
    if (-not $venvCreated) {
        $systemPython = Get-Command python -ErrorAction SilentlyContinue
        if (-not $systemPython) {
            Write-Host " Python was not found. Install Python 3.10-3.12, then rerun this script." -ForegroundColor Red
            exit 1
        }
        & python -m venv $VenvDir
        $venvCreated = ($LASTEXITCODE -eq 0)
    }
    if (-not $venvCreated) {
        Write-Host " Failed to create virtualenv." -ForegroundColor Red
        exit $LASTEXITCODE
    }
}

# "Activate" the virtualenv for this process and the children it starts.
$env:VIRTUAL_ENV = $VenvDir
$env:PATH = "$(Join-Path $VenvDir 'Scripts');$env:PATH"
|
||||||
|
|
||||||
|
# An explicit -OcrMode is stored in .env and exported to this process; when
# the switch is omitted, the existing configuration is left untouched.
if ($PSBoundParameters.ContainsKey("OcrMode")) {
    $ocrUseGpuValue = "false"
    if ($OcrMode -eq "gpu") {
        $ocrUseGpuValue = "true"
    }
    Set-DotEnvValue -Name "OCR_USE_GPU" -Value $ocrUseGpuValue
    $env:OCR_USE_GPU = $ocrUseGpuValue
    Write-Host "OCR mode set to $($OcrMode.ToUpperInvariant()) and saved to .env." -ForegroundColor Green
}
|
||||||
|
|
||||||
|
# ── [1/5] Git pull ──────────────────────────────────────────────────────────
Write-Host "`n[1/5] Pulling latest code..." -ForegroundColor Cyan
Invoke-Step -Command { git pull } -FailureMessage "Git pull failed."

# ── [2/5] Install/update dependencies ───────────────────────────────────────
Write-Host "`n[2/5] Installing/updating dependencies..." -ForegroundColor Cyan
Invoke-Step -Command { & $Python -m pip install -e ".[dev]" -q } -FailureMessage "Dependency install failed."

# Decide which Paddle runtime is needed from the OCR_USE_GPU setting.
$truthyValues = "1", "true", "yes", "on"
$ocrUseGpu = Get-DotEnvValue -Name "OCR_USE_GPU"
$gpuRequested = [bool] $ocrUseGpu -and ($truthyValues -contains $ocrUseGpu.ToLowerInvariant())

if ($gpuRequested) {
    Write-Host " GPU mode enabled; checking Paddle CUDA runtime..." -ForegroundColor Cyan

    $hasPaddleGpu = Test-PythonPackage -Name "paddlepaddle-gpu"
    if (-not $hasPaddleGpu) {
        Invoke-Step -FailureMessage "Paddle GPU install failed." -Command {
            & $Python -m pip install paddlepaddle-gpu==2.6.2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ -q
        }
    }

    $hasCudnn = Test-PythonPackage -Name "nvidia-cudnn-cu11"
    if (-not $hasCudnn) {
        Invoke-Step -Command { & $Python -m pip install nvidia-cudnn-cu11==8.9.5.29 -q } -FailureMessage "NVIDIA cuDNN install failed."
    }

    Add-NvidiaDllPaths
} else {
    Write-Host " CPU mode enabled; checking Paddle CPU runtime..." -ForegroundColor Cyan

    # Either Paddle build is accepted here; the CPU wheel is installed only
    # when neither one is present.
    $hasAnyPaddle = (Test-PythonPackage -Name "paddlepaddle") -or (Test-PythonPackage -Name "paddlepaddle-gpu")
    if (-not $hasAnyPaddle) {
        Invoke-Step -Command { & $Python -m pip install paddlepaddle==2.6.2 -q } -FailureMessage "Paddle CPU install failed."
    }
}
|
||||||
|
|
||||||
|
# ── [3/5] Database migration ─────────────────────────────────────────────────
Write-Host "`n[3/5] Running database migrations..." -ForegroundColor Cyan

$alembic = @("-m", "alembic")
& $Python @alembic upgrade head
$firstAttemptFailed = $LASTEXITCODE -ne 0

if ($firstAttemptFailed) {
    # NOTE(review): "stamp head" records the newest revision as applied
    # without running it, so the retry below presumably has nothing left to
    # do and a migration that really failed is reported as OK. Confirm that
    # this is the intended recovery for local development.
    Write-Host " Migration conflict detected, stamping current state as head..." -ForegroundColor Yellow
    Invoke-Step -Command { & $Python @alembic stamp head } -FailureMessage "Alembic stamp failed."

    Write-Host " Retrying upgrade for any remaining new migrations..." -ForegroundColor Yellow
    & $Python @alembic upgrade head
    $retryFailed = $LASTEXITCODE -ne 0
    if ($retryFailed) {
        Write-Host " Migration still failed. Please check alembic manually." -ForegroundColor Red
        exit 1
    }
}
Write-Host " Migrations OK." -ForegroundColor Green
|
||||||
|
|
||||||
|
# ── [4/5] Free up port ───────────────────────────────────────────────────────
Write-Host "`n[4/5] Checking port $Port..." -ForegroundColor Cyan

# Use Get-NetTCPConnection for reliable port detection on Windows
$connections = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
if ($connections) {
    # One process can own several listener rows (for example one per address
    # family). Collapse them so each process is reported and stopped once,
    # instead of a second pass printing an empty name for a dead PID.
    $ownerIds = @($connections | Select-Object -ExpandProperty OwningProcess -Unique)
    foreach ($procId in $ownerIds) {
        $procName = (Get-Process -Id $procId -ErrorAction SilentlyContinue).Name
        Write-Host " Port $Port used by '$procName' (PID $procId), killing..." -ForegroundColor Yellow
        Stop-Process -Id $procId -Force -ErrorAction SilentlyContinue
    }

    # Wait until port is actually released (max 5 seconds)
    $waited = 0
    do {
        Start-Sleep -Milliseconds 500
        $waited += 500
        $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
    } while ($still -and $waited -lt 5000)

    if ($still) {
        Write-Host " Port $Port still in use after waiting. Try a different port or restart manually." -ForegroundColor Red
        exit 1
    }
    Write-Host " Port $Port freed." -ForegroundColor Green
} else {
    Write-Host " Port $Port is free." -ForegroundColor Green
}
|
||||||
|
|
||||||
|
# ── [5/5] Start dev server ───────────────────────────────────────────────────
Write-Host "`n[5/5] Starting dev server on port $Port (Ctrl+C to stop)..." -ForegroundColor Cyan

# Runs in the foreground with auto-reload, listening on all interfaces.
$uvicornArgs = @(
    "-m", "uvicorn", "ocr_sprint.main:app",
    "--reload",
    "--host", "0.0.0.0",
    "--port", $Port
)
& $Python @uvicornArgs
|
||||||
Reference in New Issue
Block a user