feat: implement robust personnel data extraction pipeline with text-based fallback and coordinate-aware processing
This commit is contained in:
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams
Normal file
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams
Normal file
Binary file not shown.
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams.info
Normal file
BIN
# leave empty to use PaddleOCR defaults/inference.pdiparams.info
Normal file
Binary file not shown.
BIN
# leave empty to use PaddleOCR defaults/inference.pdmodel
Normal file
BIN
# leave empty to use PaddleOCR defaults/inference.pdmodel
Normal file
Binary file not shown.
18
.claude/settings.local.json
Normal file
18
.claude/settings.local.json
Normal file
@@ -0,0 +1,18 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(python -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
|
||||
"Bash(python -c \"import sys; print\\(sys.executable\\)\")",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py -x -q)",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit -x -q)",
|
||||
"Bash(git stash *)",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_api.py::test_documents_sync_returns_pipeline_output -x -q)",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit --ignore=tests/unit/test_api.py -q)",
|
||||
"Bash(.venv/Scripts/python.exe -c ' *)",
|
||||
"Bash(xargs grep *)",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit -q --ignore=tests/unit/test_api.py --ignore=tests/unit/test_api_hitl.py --ignore=tests/unit/test_blob_storage.py)",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_ocr_layout.py tests/unit/test_personnel_text_fallback.py -q)",
|
||||
"Bash(.venv/Scripts/python.exe -m pytest tests/unit/test_personnel_text_fallback.py tests/unit/test_ocr_layout.py -q)"
|
||||
]
|
||||
}
|
||||
}
|
||||
13
Makefile
13
Makefile
@@ -1,9 +1,10 @@
|
||||
.PHONY: help install dev fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
|
||||
.PHONY: help install dev update fmt lint typecheck test test-cov run docker-build docker-up docker-down clean
|
||||
|
||||
help:
|
||||
@echo "Targets:"
|
||||
@echo " install - install runtime + dev deps in current env"
|
||||
@echo " dev - run FastAPI app with autoreload"
|
||||
@echo " update - git pull + install deps + migrate db + run dev server"
|
||||
@echo " fmt - format code with ruff"
|
||||
@echo " lint - lint with ruff"
|
||||
@echo " typecheck - run mypy"
|
||||
@@ -21,6 +22,16 @@ install:
|
||||
dev:
|
||||
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
|
||||
|
||||
update:
|
||||
@echo "[1/4] Pulling latest code..."
|
||||
git pull
|
||||
@echo "[2/4] Installing/updating dependencies..."
|
||||
pip install -e ".[dev]"
|
||||
@echo "[3/4] Running database migrations..."
|
||||
alembic upgrade head
|
||||
@echo "[4/4] Starting dev server..."
|
||||
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port 8000
|
||||
|
||||
fmt:
|
||||
ruff format src tests
|
||||
ruff check --fix src tests
|
||||
|
||||
858
docs/DEPLOYMENT-EXISTING-STACK.md
Normal file
858
docs/DEPLOYMENT-EXISTING-STACK.md
Normal file
@@ -0,0 +1,858 @@
|
||||
# Deployment OCR Sprint Service (Existing Stack)
|
||||
|
||||
Panduan deployment untuk server dengan Python 3.12.3, PostgreSQL 16.13, dan Redis 7.0.15 yang sudah terinstall.
|
||||
|
||||
## Informasi Server Anda
|
||||
|
||||
- **OS**: Ubuntu 24.04
|
||||
- **Python**: 3.12.3 ✅
|
||||
- **PostgreSQL**: 16.13 ✅
|
||||
- **Redis**: 7.0.15 ✅
|
||||
|
||||
Semua versi sudah kompatibel dan optimal untuk OCR Sprint Service!
|
||||
|
||||
## Langkah 1: Install System Libraries untuk OpenCV & PaddleOCR
|
||||
|
||||
```bash
|
||||
# Update package list
|
||||
sudo apt update
|
||||
|
||||
# Install libraries yang dibutuhkan oleh OpenCV dan PaddleOCR
|
||||
sudo apt install -y \
|
||||
libgl1 \
|
||||
libglib2.0-0 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libxrender1 \
|
||||
libgomp1 \
|
||||
libmagic1 \
|
||||
python3.12-venv \
|
||||
python3.12-dev \
|
||||
build-essential \
|
||||
git
|
||||
```
|
||||
|
||||
## Langkah 2: Setup PostgreSQL Database
|
||||
|
||||
```bash
|
||||
# Login ke PostgreSQL
|
||||
sudo -u postgres psql
|
||||
```
|
||||
|
||||
Jalankan SQL commands berikut:
|
||||
|
||||
```sql
|
||||
-- Create user dan database
|
||||
CREATE USER ocr WITH PASSWORD 'your-password-here';
|
||||
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||
|
||||
-- Grant privileges
|
||||
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||
|
||||
-- Connect ke database untuk grant schema privileges
|
||||
\c ocr_sprint
|
||||
|
||||
-- Grant schema privileges (PostgreSQL 15+)
|
||||
GRANT ALL ON SCHEMA public TO ocr;
|
||||
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ocr;
|
||||
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ocr;
|
||||
|
||||
-- Verify
|
||||
\l ocr_sprint
|
||||
\du ocr
|
||||
|
||||
-- Exit
|
||||
\q
|
||||
```
|
||||
|
||||
**Generate password yang aman:**
|
||||
|
||||
```bash
|
||||
# Generate random password
|
||||
openssl rand -hex 32
|
||||
# Contoh output: string hex 64 karakter (aman dipakai di DATABASE_URL tanpa URL-encoding; jangan tulis password asli di dokumentasi)
|
||||
```
|
||||
|
||||
Simpan password ini, akan digunakan di konfigurasi nanti.
|
||||
|
||||
## Langkah 3: Verify Redis
|
||||
|
||||
```bash
|
||||
# Check Redis status
|
||||
sudo systemctl status redis-server
|
||||
|
||||
# Test connection
|
||||
redis-cli ping
|
||||
# Expected output: PONG
|
||||
|
||||
# Check Redis config (opsional)
|
||||
redis-cli CONFIG GET maxmemory
|
||||
```
|
||||
|
||||
Jika Redis belum running:
|
||||
|
||||
```bash
|
||||
sudo systemctl enable redis-server
|
||||
sudo systemctl start redis-server
|
||||
```
|
||||
|
||||
## Langkah 4: Create Application User
|
||||
|
||||
```bash
|
||||
# Create dedicated user untuk aplikasi
|
||||
sudo useradd -m -s /bin/bash ocr
|
||||
|
||||
# Create application directory
|
||||
sudo mkdir -p /opt/ocr-sprint-service
|
||||
sudo chown ocr:ocr /opt/ocr-sprint-service
|
||||
```
|
||||
|
||||
## Langkah 5: Clone dan Install Application
|
||||
|
||||
```bash
|
||||
# Switch ke user ocr
|
||||
sudo su - ocr
|
||||
|
||||
# Clone repository
|
||||
cd /opt
|
||||
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||
cd ocr-sprint-service
|
||||
|
||||
# Create virtual environment dengan Python 3.12
|
||||
python3.12 -m venv .venv
|
||||
|
||||
# Activate virtual environment
|
||||
source .venv/bin/activate
|
||||
|
||||
# Verify Python version di venv
|
||||
python --version
|
||||
# Expected: Python 3.12.3
|
||||
|
||||
# Upgrade pip
|
||||
pip install --upgrade pip setuptools wheel
|
||||
|
||||
# Install application dengan OCR dependencies
|
||||
# Ini akan download ~1.5GB PaddlePaddle wheels
|
||||
pip install -e ".[ocr]"
|
||||
|
||||
# Verify installation
|
||||
python -c "import paddleocr; print('PaddleOCR OK')"
|
||||
python -c "import cv2; print('OpenCV OK')"
|
||||
python -c "import fastapi; print('FastAPI OK')"
|
||||
```
|
||||
|
||||
## Langkah 6: Konfigurasi Application
|
||||
|
||||
```bash
|
||||
# Masih sebagai user ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
|
||||
# Copy environment template
|
||||
cp .env.example .env
|
||||
|
||||
# Edit konfigurasi
|
||||
nano .env
|
||||
```
|
||||
|
||||
**Konfigurasi `/opt/ocr-sprint-service/.env`:**
|
||||
|
||||
```bash
|
||||
# ==== App ====
|
||||
APP_ENV=prod
|
||||
APP_HOST=0.0.0.0
|
||||
APP_PORT=8000
|
||||
APP_LOG_LEVEL=INFO
|
||||
|
||||
# ==== Storage ====
|
||||
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
|
||||
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
|
||||
BLOB_MAX_UPLOAD_MB=25
|
||||
|
||||
# ==== OCR ====
|
||||
OCR_LANG=latin
|
||||
OCR_USE_GPU=false
|
||||
OCR_MAX_IMAGE_SIDE=2200
|
||||
|
||||
# ==== Preprocessing ====
|
||||
PREPROCESS_TARGET_DPI=300
|
||||
PREPROCESS_DENOISE=true
|
||||
PREPROCESS_DESKEW=true
|
||||
PREPROCESS_DETECT_DOCUMENT=true
|
||||
PREPROCESS_REMOVE_SHADOW=true
|
||||
PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
|
||||
|
||||
# ==== Table Extraction ====
|
||||
TABLES_ENABLED=true
|
||||
|
||||
# ==== Confidence ====
|
||||
CONFIDENCE_AUTO_APPROVE=0.95
|
||||
CONFIDENCE_NEEDS_REVIEW=0.85
|
||||
|
||||
# ==== LLM (Phase 5, optional - disable untuk sekarang) ====
|
||||
LLM_ENABLED=false
|
||||
|
||||
# ==== Async Pipeline ====
|
||||
QUEUE_ENABLED=true
|
||||
REDIS_URL=redis://localhost:6379/0
|
||||
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
|
||||
|
||||
# ==== Database ====
|
||||
# Ganti 'your-password-here' dengan password yang Anda generate di Langkah 2
|
||||
DATABASE_URL=postgresql+psycopg://ocr:your-password-here@localhost:5432/ocr_sprint
|
||||
DATABASE_ECHO=false
|
||||
|
||||
# ==== Auth (WAJIB untuk production!) ====
|
||||
# Generate dengan: openssl rand -hex 32
|
||||
API_KEYS=paste-api-key-1-here,paste-api-key-2-here
|
||||
API_KEY_HEADER=X-API-Key
|
||||
```
|
||||
|
||||
**Generate API keys:**
|
||||
|
||||
```bash
|
||||
# Generate 2 API keys
|
||||
echo "API Key 1: $(openssl rand -hex 32)"
|
||||
echo "API Key 2: $(openssl rand -hex 32)"
|
||||
```
|
||||
|
||||
Copy output dan paste ke `API_KEYS` di file `.env`.
|
||||
|
||||
**Create storage directories:**
|
||||
|
||||
```bash
|
||||
mkdir -p /opt/ocr-sprint-service/storage/blobs
|
||||
chmod 755 /opt/ocr-sprint-service/storage
|
||||
```
|
||||
|
||||
## Langkah 7: Run Database Migrations
|
||||
|
||||
```bash
|
||||
# Masih sebagai user ocr, dengan venv activated
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
|
||||
# Run migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Verify - should show current revision
|
||||
alembic current
|
||||
|
||||
# Expected output: (head) atau revision number
|
||||
```
|
||||
|
||||
## Langkah 8: Test Manual Run
|
||||
|
||||
```bash
|
||||
# Masih sebagai user ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
|
||||
# Test API server
|
||||
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
**Di terminal lain (sebagai user ubuntu):**
|
||||
|
||||
```bash
|
||||
# Test health check
|
||||
curl http://localhost:8000/api/v1/health
|
||||
|
||||
# Expected: {"status":"ok","version":"0.1.0"}
|
||||
|
||||
# Test dengan sample file (jika ada)
|
||||
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||
-H "X-API-Key: your-api-key-here" \
|
||||
-F "file=@/path/to/test.pdf"
|
||||
```
|
||||
|
||||
Jika berhasil, stop server dengan `Ctrl+C`.
|
||||
|
||||
## Langkah 9: Setup Systemd Services
|
||||
|
||||
```bash
|
||||
# Exit dari user ocr
|
||||
exit
|
||||
|
||||
# Kembali sebagai user ubuntu dengan sudo
|
||||
```
|
||||
|
||||
### Create API Service
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/ocr-sprint-api.service
|
||||
```
|
||||
|
||||
**Content:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=OCR Sprint API Service
|
||||
After=network.target postgresql.service redis-server.service
|
||||
Wants=postgresql.service redis-server.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ocr
|
||||
Group=ocr
|
||||
WorkingDirectory=/opt/ocr-sprint-service
|
||||
|
||||
# Environment
|
||||
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||
|
||||
# Start command - 4 workers untuk production
|
||||
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
|
||||
ocr_sprint.main:app \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--workers 4 \
|
||||
--log-level info
|
||||
|
||||
# Restart policy
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
StartLimitInterval=0
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
|
||||
# Security
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
### Create Celery Worker Service
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||
```
|
||||
|
||||
**Content:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=OCR Sprint Celery Worker
|
||||
After=network.target postgresql.service redis-server.service ocr-sprint-api.service
|
||||
Wants=postgresql.service redis-server.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ocr
|
||||
Group=ocr
|
||||
WorkingDirectory=/opt/ocr-sprint-service
|
||||
|
||||
# Environment
|
||||
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||
|
||||
# Start command - concurrency 2 untuk CPU dengan 4 cores
|
||||
# Sesuaikan dengan jumlah CPU cores server Anda
|
||||
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
|
||||
-A ocr_sprint.worker.celery_app \
|
||||
worker \
|
||||
--loglevel=info \
|
||||
--concurrency=2 \
|
||||
--max-tasks-per-child=100
|
||||
|
||||
# Restart policy
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
StartLimitInterval=0
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
|
||||
# Security
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
### Enable dan Start Services
|
||||
|
||||
```bash
|
||||
# Reload systemd
|
||||
sudo systemctl daemon-reload
|
||||
|
||||
# Enable services (auto-start on boot)
|
||||
sudo systemctl enable ocr-sprint-api
|
||||
sudo systemctl enable ocr-sprint-worker
|
||||
|
||||
# Start services
|
||||
sudo systemctl start ocr-sprint-api
|
||||
sudo systemctl start ocr-sprint-worker
|
||||
|
||||
# Check status
|
||||
sudo systemctl status ocr-sprint-api
|
||||
sudo systemctl status ocr-sprint-worker
|
||||
```
|
||||
|
||||
**Expected output:** `active (running)` dengan warna hijau.
|
||||
|
||||
### View Logs
|
||||
|
||||
```bash
|
||||
# API logs (real-time)
|
||||
sudo journalctl -u ocr-sprint-api -f
|
||||
|
||||
# Worker logs (real-time)
|
||||
sudo journalctl -u ocr-sprint-worker -f
|
||||
|
||||
# Last 50 lines
|
||||
sudo journalctl -u ocr-sprint-api -n 50
|
||||
sudo journalctl -u ocr-sprint-worker -n 50
|
||||
```
|
||||
|
||||
## Langkah 10: Install dan Setup Nginx
|
||||
|
||||
```bash
|
||||
# Install Nginx dan Certbot
|
||||
sudo apt install -y nginx certbot python3-certbot-nginx
|
||||
|
||||
# Check Nginx status
|
||||
sudo systemctl status nginx
|
||||
```
|
||||
|
||||
### Create Nginx Configuration
|
||||
|
||||
```bash
|
||||
sudo nano /etc/nginx/sites-available/ocr-sprint
|
||||
```
|
||||
|
||||
**Content (ganti `ocr.yourdomain.com` dengan domain Anda):**
|
||||
|
||||
```nginx
|
||||
# Upstream
|
||||
upstream ocr_api {
|
||||
server 127.0.0.1:8000;
|
||||
keepalive 32;
|
||||
}
|
||||
|
||||
# Rate limiting
|
||||
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name ocr.yourdomain.com;
|
||||
|
||||
# Max upload size
|
||||
client_max_body_size 30M;
|
||||
client_body_buffer_size 128k;
|
||||
|
||||
# Timeouts
|
||||
proxy_connect_timeout 300s;
|
||||
proxy_send_timeout 300s;
|
||||
proxy_read_timeout 300s;
|
||||
send_timeout 300s;
|
||||
|
||||
# Logging
|
||||
access_log /var/log/nginx/ocr-sprint-access.log;
|
||||
error_log /var/log/nginx/ocr-sprint-error.log;
|
||||
|
||||
# API endpoints
|
||||
location /api/ {
|
||||
limit_req zone=api_limit burst=20 nodelay;
|
||||
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_set_header Connection "";
|
||||
|
||||
proxy_buffering off;
|
||||
}
|
||||
|
||||
# Health check
|
||||
location /api/v1/health {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
access_log off;
|
||||
}
|
||||
|
||||
# Metrics (restrict access)
|
||||
location /metrics {
|
||||
allow 127.0.0.1;
|
||||
allow 10.0.0.0/8;
|
||||
deny all;
|
||||
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
}
|
||||
|
||||
# API docs
|
||||
location /docs {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
}
|
||||
|
||||
location /redoc {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Enable Site
|
||||
|
||||
```bash
|
||||
# Test konfigurasi
|
||||
sudo nginx -t
|
||||
|
||||
# Enable site
|
||||
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||
|
||||
# Reload Nginx
|
||||
sudo systemctl reload nginx
|
||||
```
|
||||
|
||||
### Setup SSL (jika punya domain)
|
||||
|
||||
```bash
|
||||
# Obtain certificate
|
||||
sudo certbot --nginx -d ocr.yourdomain.com
|
||||
|
||||
# Test auto-renewal
|
||||
sudo certbot renew --dry-run
|
||||
```
|
||||
|
||||
## Langkah 11: Setup Firewall
|
||||
|
||||
```bash
|
||||
# Check UFW status
|
||||
sudo ufw status
|
||||
|
||||
# Allow SSH (PENTING!)
|
||||
sudo ufw allow 22/tcp
|
||||
|
||||
# Allow HTTP dan HTTPS
|
||||
sudo ufw allow 80/tcp
|
||||
sudo ufw allow 443/tcp
|
||||
|
||||
# Enable firewall (jika belum)
|
||||
sudo ufw enable
|
||||
|
||||
# Verify
|
||||
sudo ufw status numbered
|
||||
```
|
||||
|
||||
## Langkah 12: Verifikasi Final
|
||||
|
||||
### Test dari Server
|
||||
|
||||
```bash
|
||||
# Health check
|
||||
curl http://localhost:8000/api/v1/health
|
||||
|
||||
# Test async endpoint
|
||||
curl -X POST http://localhost:8000/api/v1/documents \
|
||||
-H "X-API-Key: your-api-key-here" \
|
||||
-F "file=@/path/to/test.pdf"
|
||||
|
||||
# Expected: {"job_id":"...","status":"pending",...}
|
||||
|
||||
# Check job status
|
||||
curl -H "X-API-Key: your-api-key-here" \
|
||||
http://localhost:8000/api/v1/documents/JOB_ID_HERE
|
||||
```
|
||||
|
||||
### Test via Domain (jika sudah setup SSL)
|
||||
|
||||
```bash
|
||||
curl https://ocr.yourdomain.com/api/v1/health
|
||||
```
|
||||
|
||||
### Check Services
|
||||
|
||||
```bash
|
||||
# All services should be active
|
||||
sudo systemctl status ocr-sprint-api
|
||||
sudo systemctl status ocr-sprint-worker
|
||||
sudo systemctl status postgresql
|
||||
sudo systemctl status redis-server
|
||||
sudo systemctl status nginx
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### View Logs
|
||||
|
||||
```bash
|
||||
# API logs
|
||||
sudo journalctl -u ocr-sprint-api -f
|
||||
|
||||
# Worker logs
|
||||
sudo journalctl -u ocr-sprint-worker -f
|
||||
|
||||
# Nginx access logs
|
||||
sudo tail -f /var/log/nginx/ocr-sprint-access.log
|
||||
|
||||
# Nginx error logs
|
||||
sudo tail -f /var/log/nginx/ocr-sprint-error.log
|
||||
```
|
||||
|
||||
### Prometheus Metrics
|
||||
|
||||
```bash
|
||||
# View metrics
|
||||
curl http://localhost:8000/metrics
|
||||
|
||||
# Key metrics:
|
||||
# - ocr_documents_total
|
||||
# - ocr_processing_duration_seconds
|
||||
# - ocr_confidence_score
|
||||
```
|
||||
|
||||
## Maintenance
|
||||
|
||||
### Restart Services
|
||||
|
||||
```bash
|
||||
sudo systemctl restart ocr-sprint-api
|
||||
sudo systemctl restart ocr-sprint-worker
|
||||
```
|
||||
|
||||
### Update Application
|
||||
|
||||
```bash
|
||||
# Switch ke user ocr
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
|
||||
# Pull latest code
|
||||
git pull
|
||||
|
||||
# Activate venv
|
||||
source .venv/bin/activate
|
||||
|
||||
# Update dependencies
|
||||
pip install -e ".[ocr]"
|
||||
|
||||
# Run migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Exit
|
||||
exit
|
||||
|
||||
# Restart services
|
||||
sudo systemctl restart ocr-sprint-api
|
||||
sudo systemctl restart ocr-sprint-worker
|
||||
|
||||
# Check logs
|
||||
sudo journalctl -u ocr-sprint-api -n 50
|
||||
```
|
||||
|
||||
### Database Backup
|
||||
|
||||
```bash
|
||||
# Create backup directory
|
||||
sudo mkdir -p /opt/ocr-sprint-service/backups
|
||||
sudo chown ocr:ocr /opt/ocr-sprint-service/backups
|
||||
|
||||
# Manual backup
|
||||
sudo -u ocr sh -c 'pg_dump -h localhost -U ocr ocr_sprint | gzip > "/opt/ocr-sprint-service/backups/backup_$(date +%Y%m%d_%H%M%S).sql.gz"'
|
||||
```
|
||||
|
||||
**Setup automated backup:**
|
||||
|
||||
```bash
|
||||
# Create backup script
|
||||
sudo nano /opt/ocr-sprint-service/backup.sh
|
||||
```
|
||||
|
||||
```bash
|
||||
#!/bin/bash
set -euo pipefail
|
||||
BACKUP_DIR="/opt/ocr-sprint-service/backups"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
# Backup database
|
||||
PGPASSWORD='your-db-password' pg_dump -h localhost -U ocr ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz
|
||||
|
||||
# Keep only last 7 days
|
||||
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
|
||||
|
||||
echo "Backup completed: $DATE"
|
||||
```
|
||||
|
||||
```bash
|
||||
# Make executable
|
||||
sudo chmod +x /opt/ocr-sprint-service/backup.sh
|
||||
sudo chown ocr:ocr /opt/ocr-sprint-service/backup.sh
|
||||
|
||||
# Setup cron (daily at 2 AM)
|
||||
sudo crontab -e -u ocr
|
||||
|
||||
# Add line:
|
||||
0 2 * * * /opt/ocr-sprint-service/backup.sh >> /opt/ocr-sprint-service/backups/backup.log 2>&1
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Service tidak start
|
||||
|
||||
```bash
|
||||
# Check detailed logs
|
||||
sudo journalctl -u ocr-sprint-api -n 100 --no-pager
|
||||
sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
|
||||
|
||||
# Check file permissions
|
||||
ls -la /opt/ocr-sprint-service
|
||||
ls -la /opt/ocr-sprint-service/storage
|
||||
|
||||
# Test manual run
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### Database connection error
|
||||
|
||||
```bash
|
||||
# Test connection
|
||||
sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
|
||||
|
||||
# Check PostgreSQL status
|
||||
sudo systemctl status postgresql
|
||||
|
||||
# Check PostgreSQL logs
|
||||
sudo journalctl -u postgresql -n 50
|
||||
```
|
||||
|
||||
### Redis connection error
|
||||
|
||||
```bash
|
||||
# Test Redis
|
||||
redis-cli ping
|
||||
|
||||
# Check Redis status
|
||||
sudo systemctl status redis-server
|
||||
|
||||
# Check Redis logs
|
||||
sudo journalctl -u redis-server -n 50
|
||||
```
|
||||
|
||||
### Worker tidak memproses jobs
|
||||
|
||||
```bash
|
||||
# Check Celery worker status
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
celery -A ocr_sprint.worker.celery_app inspect active
|
||||
celery -A ocr_sprint.worker.celery_app inspect stats
|
||||
|
||||
# Check Redis queue
|
||||
redis-cli LLEN ocr_sprint
|
||||
```
|
||||
|
||||
### PaddleOCR error
|
||||
|
||||
```bash
|
||||
# Re-download models
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
|
||||
python << EOF
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(use_angle_cls=True, lang='latin')
|
||||
print("Models downloaded successfully")
|
||||
EOF
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Check CPU cores
|
||||
|
||||
```bash
|
||||
nproc
|
||||
```
|
||||
|
||||
### Adjust worker concurrency
|
||||
|
||||
```bash
|
||||
# Edit worker service
|
||||
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||
|
||||
# Untuk 4 cores: --concurrency=2
|
||||
# Untuk 8 cores: --concurrency=4
|
||||
# Untuk 16 cores: --concurrency=8
|
||||
|
||||
# Reload dan restart
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl restart ocr-sprint-worker
|
||||
```
|
||||
|
||||
### PostgreSQL 16 Tuning
|
||||
|
||||
```bash
|
||||
sudo nano /etc/postgresql/16/main/postgresql.conf
|
||||
```
|
||||
|
||||
**Recommended settings (sesuaikan dengan RAM server):**
|
||||
|
||||
```
|
||||
# Untuk 8GB RAM:
|
||||
shared_buffers = 2GB
|
||||
effective_cache_size = 6GB
|
||||
maintenance_work_mem = 512MB
|
||||
work_mem = 8MB
|
||||
|
||||
# Untuk 16GB RAM:
|
||||
shared_buffers = 4GB
|
||||
effective_cache_size = 12GB
|
||||
maintenance_work_mem = 1GB
|
||||
work_mem = 10MB
|
||||
|
||||
# General
|
||||
checkpoint_completion_target = 0.9
|
||||
wal_buffers = 16MB
|
||||
default_statistics_target = 100
|
||||
random_page_cost = 1.1
|
||||
effective_io_concurrency = 200
|
||||
max_worker_processes = 4
|
||||
max_parallel_workers_per_gather = 2
|
||||
max_parallel_workers = 4
|
||||
```
|
||||
|
||||
```bash
|
||||
sudo systemctl restart postgresql
|
||||
```
|
||||
|
||||
## Security Checklist
|
||||
|
||||
- [ ] API keys set dengan nilai random yang kuat
|
||||
- [ ] Database password diganti dari default
|
||||
- [ ] Firewall enabled (UFW)
|
||||
- [ ] SSL/TLS enabled (jika punya domain)
|
||||
- [ ] `/metrics` endpoint restricted
|
||||
- [ ] PostgreSQL hanya listen di localhost
|
||||
- [ ] Redis hanya listen di localhost
|
||||
- [ ] Backup automated (cron job)
|
||||
- [ ] OS security updates enabled
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Setup monitoring** - Install Prometheus + Grafana (opsional)
|
||||
2. **Setup alerting** - Email/Slack notification untuk errors
|
||||
3. **Load testing** - Test dengan volume dokumen production
|
||||
4. **Backup verification** - Test restore dari backup
|
||||
5. **Documentation** - Dokumentasi API keys untuk tim
|
||||
|
||||
## Support
|
||||
|
||||
Untuk pertanyaan atau issues, hubungi tim development.
|
||||
943
docs/DEPLOYMENT-MANUAL.md
Normal file
943
docs/DEPLOYMENT-MANUAL.md
Normal file
@@ -0,0 +1,943 @@
|
||||
# Deployment Manual OCR Sprint Service (Tanpa Docker)
|
||||
|
||||
Panduan lengkap deployment OCR Sprint Service langsung di server tanpa menggunakan Docker.
|
||||
|
||||
## Prasyarat Server
|
||||
|
||||
### Spesifikasi Minimum
|
||||
- **OS**: Ubuntu 20.04+ / Debian 11+ / RHEL 8+
|
||||
- **CPU**: 4 cores (8 cores recommended)
|
||||
- **RAM**: 8 GB minimum (16 GB recommended)
|
||||
- **Storage**: 50 GB free space
|
||||
- **User**: Non-root user dengan sudo access
|
||||
|
||||
### Port yang Dibutuhkan
|
||||
- `8000`: API server (internal, akan di-proxy oleh Nginx)
|
||||
- `80/443`: HTTP/HTTPS (Nginx)
|
||||
- `5432`: PostgreSQL (localhost only)
|
||||
- `6379`: Redis (localhost only)
|
||||
|
||||
## Langkah 1: Install System Dependencies
|
||||
|
||||
### Ubuntu/Debian
|
||||
|
||||
```bash
|
||||
# Update system
|
||||
sudo apt update && sudo apt upgrade -y
|
||||
|
||||
# Install Python 3.11
|
||||
sudo apt install -y software-properties-common
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa -y
|
||||
sudo apt update
|
||||
sudo apt install -y python3.11 python3.11-venv python3.11-dev python3-pip
|
||||
|
||||
# Install system libraries untuk OpenCV dan PaddleOCR
|
||||
sudo apt install -y \
|
||||
libgl1 \
|
||||
libglib2.0-0 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
libxrender1 \
|
||||
libgomp1 \
|
||||
libmagic1 \
|
||||
build-essential \
|
||||
git \
|
||||
curl \
|
||||
wget
|
||||
|
||||
# Install Redis
|
||||
sudo apt install -y redis-server
|
||||
sudo systemctl enable redis-server
|
||||
sudo systemctl start redis-server
|
||||
|
||||
# Install PostgreSQL
|
||||
sudo apt install -y postgresql postgresql-contrib
|
||||
sudo systemctl enable postgresql
|
||||
sudo systemctl start postgresql
|
||||
```
|
||||
|
||||
### RHEL/CentOS/Rocky Linux
|
||||
|
||||
```bash
|
||||
# Update system
|
||||
sudo dnf update -y
|
||||
|
||||
# Install Python 3.11
|
||||
sudo dnf install -y python3.11 python3.11-devel python3.11-pip
|
||||
|
||||
# Install system libraries
|
||||
sudo dnf install -y \
|
||||
mesa-libGL \
|
||||
glib2 \
|
||||
libSM \
|
||||
libXext \
|
||||
libXrender \
|
||||
file-libs \
|
||||
gcc \
|
||||
gcc-c++ \
|
||||
make \
|
||||
git
|
||||
|
||||
# Install Redis
|
||||
sudo dnf install -y redis
|
||||
sudo systemctl enable redis
|
||||
sudo systemctl start redis
|
||||
|
||||
# Install PostgreSQL
|
||||
sudo dnf install -y postgresql-server postgresql-contrib
|
||||
sudo postgresql-setup --initdb
|
||||
sudo systemctl enable postgresql
|
||||
sudo systemctl start postgresql
|
||||
```
|
||||
|
||||
## Langkah 2: Setup Database PostgreSQL
|
||||
|
||||
```bash
|
||||
# Masuk sebagai postgres user
|
||||
sudo -u postgres psql
|
||||
|
||||
# Jalankan SQL commands berikut:
|
||||
```
|
||||
|
||||
```sql
|
||||
-- Create user dan database
|
||||
CREATE USER ocr WITH PASSWORD 'ganti-dengan-password-kuat';
|
||||
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||
|
||||
-- Grant privileges
|
||||
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||
|
||||
-- Connect ke database
|
||||
\c ocr_sprint
|
||||
|
||||
-- Grant schema privileges (PostgreSQL 15+)
|
||||
GRANT ALL ON SCHEMA public TO ocr;
|
||||
|
||||
-- Exit
|
||||
\q
|
||||
```
|
||||
|
||||
**Konfigurasi akses PostgreSQL (opsional, tetap hanya localhost):**
|
||||
|
||||
```bash
|
||||
# Edit postgresql.conf
|
||||
sudo nano /etc/postgresql/14/main/postgresql.conf
|
||||
|
||||
# Uncomment dan ubah:
|
||||
listen_addresses = 'localhost' # Tetap localhost untuk keamanan
|
||||
|
||||
# Edit pg_hba.conf
|
||||
sudo nano /etc/postgresql/14/main/pg_hba.conf
|
||||
|
||||
# Tambahkan line:
|
||||
local ocr_sprint ocr scram-sha-256
|
||||
|
||||
# Restart PostgreSQL
|
||||
sudo systemctl restart postgresql
|
||||
```
|
||||
|
||||
## Langkah 3: Setup Application User
|
||||
|
||||
```bash
|
||||
# Create dedicated user untuk aplikasi
|
||||
sudo useradd -m -s /bin/bash ocr
|
||||
sudo usermod -aG sudo ocr # Opsional, untuk maintenance
|
||||
|
||||
# Create application directory
|
||||
sudo mkdir -p /opt/ocr-sprint-service
|
||||
sudo chown ocr:ocr /opt/ocr-sprint-service
|
||||
|
||||
# Switch ke user ocr
|
||||
sudo su - ocr
|
||||
```
|
||||
|
||||
## Langkah 4: Install Application
|
||||
|
||||
```bash
|
||||
# Clone repository
|
||||
cd /opt
|
||||
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||
cd ocr-sprint-service
|
||||
|
||||
# Create virtual environment
|
||||
python3.11 -m venv .venv
|
||||
|
||||
# Activate virtual environment
|
||||
source .venv/bin/activate
|
||||
|
||||
# Upgrade pip
|
||||
pip install --upgrade pip setuptools wheel
|
||||
|
||||
# Install application dengan OCR dependencies
|
||||
pip install -e ".[ocr]"
|
||||
|
||||
# Verify installation
|
||||
python -c "import paddleocr; print('PaddleOCR installed successfully')"
|
||||
```
|
||||
|
||||
## Langkah 5: Konfigurasi Application
|
||||
|
||||
```bash
|
||||
# Copy environment template
|
||||
cp .env.example .env
|
||||
|
||||
# Edit konfigurasi
|
||||
nano .env
|
||||
```
|
||||
|
||||
**Konfigurasi production (`/opt/ocr-sprint-service/.env`):**
|
||||
|
||||
```bash
|
||||
# ==== App ====
|
||||
APP_ENV=prod
|
||||
APP_HOST=0.0.0.0
|
||||
APP_PORT=8000
|
||||
APP_LOG_LEVEL=INFO
|
||||
|
||||
# ==== Storage ====
|
||||
STORAGE_LOCAL_DIR=/opt/ocr-sprint-service/storage
|
||||
BLOB_STORAGE_DIR=/opt/ocr-sprint-service/storage/blobs
|
||||
BLOB_MAX_UPLOAD_MB=25
|
||||
|
||||
# ==== OCR ====
|
||||
OCR_LANG=latin
|
||||
OCR_USE_GPU=false
|
||||
OCR_MAX_IMAGE_SIDE=2200
|
||||
|
||||
# ==== Preprocessing ====
|
||||
PREPROCESS_TARGET_DPI=300
|
||||
PREPROCESS_DENOISE=true
|
||||
PREPROCESS_DESKEW=true
|
||||
PREPROCESS_DETECT_DOCUMENT=true
|
||||
PREPROCESS_REMOVE_SHADOW=true
|
||||
PREPROCESS_MIN_QUAD_AREA_FRACTION=0.20
|
||||
|
||||
# ==== Table Extraction ====
|
||||
TABLES_ENABLED=true
|
||||
|
||||
# ==== Confidence ====
|
||||
CONFIDENCE_AUTO_APPROVE=0.95
|
||||
CONFIDENCE_NEEDS_REVIEW=0.85
|
||||
|
||||
# ==== LLM (Phase 5, optional) ====
|
||||
LLM_ENABLED=false
|
||||
|
||||
# ==== Async Pipeline ====
|
||||
QUEUE_ENABLED=true
|
||||
REDIS_URL=redis://localhost:6379/0
|
||||
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
|
||||
|
||||
# ==== Database ====
|
||||
DATABASE_URL=postgresql+psycopg://ocr:ganti-dengan-password-kuat@localhost:5432/ocr_sprint
|
||||
DATABASE_ECHO=false
|
||||
|
||||
# ==== Auth (WAJIB!) ====
|
||||
API_KEYS=key1-ganti-dengan-random-string,key2-ganti-dengan-random-string
|
||||
API_KEY_HEADER=X-API-Key
|
||||
```
|
||||
|
||||
**Generate secure API keys:**
|
||||
|
||||
```bash
|
||||
# Generate 2 API keys
|
||||
openssl rand -hex 32
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
**Create storage directories:**
|
||||
|
||||
```bash
|
||||
mkdir -p /opt/ocr-sprint-service/storage/blobs
|
||||
chmod 755 /opt/ocr-sprint-service/storage
|
||||
```
|
||||
|
||||
## Langkah 6: Run Database Migrations
|
||||
|
||||
```bash
|
||||
# Masih sebagai user ocr, dengan venv activated
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
|
||||
# Run migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Verify
|
||||
alembic current
|
||||
```
|
||||
|
||||
## Langkah 7: Test Manual Run
|
||||
|
||||
```bash
|
||||
# Test API server
|
||||
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||
|
||||
# Di terminal lain, test health check
|
||||
curl http://localhost:8000/api/v1/health
|
||||
|
||||
# Jika berhasil, stop dengan Ctrl+C
|
||||
```
|
||||
|
||||
## Langkah 8: Setup Systemd Services
|
||||
|
||||
### API Service
|
||||
|
||||
```bash
|
||||
# Exit dari user ocr, kembali ke user dengan sudo
|
||||
exit
|
||||
|
||||
# Create systemd service file
|
||||
sudo nano /etc/systemd/system/ocr-sprint-api.service
|
||||
```
|
||||
|
||||
**Content `/etc/systemd/system/ocr-sprint-api.service`:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=OCR Sprint API Service
|
||||
After=network.target postgresql.service redis.service
|
||||
Wants=postgresql.service redis.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ocr
|
||||
Group=ocr
|
||||
WorkingDirectory=/opt/ocr-sprint-service
|
||||
|
||||
# Environment
|
||||
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||
|
||||
# Start command - 4 workers untuk production
|
||||
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn \
|
||||
ocr_sprint.main:app \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--workers 4 \
|
||||
--log-level info
|
||||
|
||||
# Restart policy
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
StartLimitInterval=0
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
MemoryMax=6G
|
||||
|
||||
# Security
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
### Celery Worker Service
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||
```
|
||||
|
||||
**Content `/etc/systemd/system/ocr-sprint-worker.service`:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=OCR Sprint Celery Worker
|
||||
After=network.target postgresql.service redis.service ocr-sprint-api.service
|
||||
Wants=postgresql.service redis.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ocr
|
||||
Group=ocr
|
||||
WorkingDirectory=/opt/ocr-sprint-service
|
||||
|
||||
# Environment
|
||||
Environment="PATH=/opt/ocr-sprint-service/.venv/bin:/usr/local/bin:/usr/bin:/bin"
|
||||
EnvironmentFile=/opt/ocr-sprint-service/.env
|
||||
|
||||
# Start command - concurrency 2 untuk 4 core CPU
|
||||
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery \
|
||||
-A ocr_sprint.worker.celery_app \
|
||||
worker \
|
||||
--loglevel=info \
|
||||
--concurrency=2 \
|
||||
--max-tasks-per-child=100
|
||||
|
||||
# Restart policy
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
StartLimitInterval=0
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
MemoryMax=4G
|
||||
|
||||
# Security
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
### Enable dan Start Services
|
||||
|
||||
```bash
|
||||
# Reload systemd
|
||||
sudo systemctl daemon-reload
|
||||
|
||||
# Enable services (auto-start on boot)
|
||||
sudo systemctl enable ocr-sprint-api
|
||||
sudo systemctl enable ocr-sprint-worker
|
||||
|
||||
# Start services
|
||||
sudo systemctl start ocr-sprint-api
|
||||
sudo systemctl start ocr-sprint-worker
|
||||
|
||||
# Check status
|
||||
sudo systemctl status ocr-sprint-api
|
||||
sudo systemctl status ocr-sprint-worker
|
||||
|
||||
# View logs
|
||||
sudo journalctl -u ocr-sprint-api -f
|
||||
sudo journalctl -u ocr-sprint-worker -f
|
||||
```
|
||||
|
||||
## Langkah 9: Setup Nginx Reverse Proxy
|
||||
|
||||
### Install Nginx
|
||||
|
||||
```bash
|
||||
sudo apt install -y nginx certbot python3-certbot-nginx
|
||||
```
|
||||
|
||||
### Konfigurasi Nginx
|
||||
|
||||
```bash
|
||||
sudo nano /etc/nginx/sites-available/ocr-sprint
|
||||
```
|
||||
|
||||
**Content `/etc/nginx/sites-available/ocr-sprint`:**
|
||||
|
||||
```nginx
|
||||
# Upstream untuk load balancing (jika scale horizontal)
|
||||
upstream ocr_api {
|
||||
server 127.0.0.1:8000;
|
||||
keepalive 32;
|
||||
}
|
||||
|
||||
# Rate limiting
|
||||
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name ocr.yourdomain.com; # Ganti dengan domain Anda
|
||||
|
||||
# Max upload size (sesuaikan dengan BLOB_MAX_UPLOAD_MB)
|
||||
client_max_body_size 30M;
|
||||
client_body_buffer_size 128k;
|
||||
|
||||
# Timeouts untuk dokumen besar
|
||||
proxy_connect_timeout 300s;
|
||||
proxy_send_timeout 300s;
|
||||
proxy_read_timeout 300s;
|
||||
send_timeout 300s;
|
||||
|
||||
# Logging
|
||||
access_log /var/log/nginx/ocr-sprint-access.log;
|
||||
error_log /var/log/nginx/ocr-sprint-error.log;
|
||||
|
||||
# API endpoints
|
||||
location /api/ {
|
||||
# Rate limiting
|
||||
limit_req zone=api_limit burst=20 nodelay;
|
||||
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
|
||||
# Headers
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
proxy_set_header Connection "";
|
||||
|
||||
# Disable buffering untuk streaming responses
|
||||
proxy_buffering off;
|
||||
}
|
||||
|
||||
# Health check endpoint (no rate limit)
|
||||
location /api/v1/health {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
access_log off;
|
||||
}
|
||||
|
||||
# Metrics endpoint (restrict access)
|
||||
location /metrics {
|
||||
# Allow only from internal network
|
||||
allow 10.0.0.0/8;
|
||||
allow 172.16.0.0/12;
|
||||
allow 192.168.0.0/16;
|
||||
allow 127.0.0.1;
|
||||
deny all;
|
||||
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
}
|
||||
|
||||
# Docs (opsional, bisa di-disable di production)
|
||||
location /docs {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
}
|
||||
|
||||
location /redoc {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Host $host;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Enable Site
|
||||
|
||||
```bash
|
||||
# Test konfigurasi
|
||||
sudo nginx -t
|
||||
|
||||
# Enable site
|
||||
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||
|
||||
# Remove default site (opsional)
|
||||
sudo rm /etc/nginx/sites-enabled/default
|
||||
|
||||
# Reload Nginx
|
||||
sudo nginx -t && sudo systemctl reload nginx
|
||||
```
|
||||
|
||||
### Setup SSL dengan Let's Encrypt
|
||||
|
||||
```bash
|
||||
# Install certbot
|
||||
sudo apt install -y certbot python3-certbot-nginx
|
||||
|
||||
# Obtain certificate (ganti dengan domain Anda)
|
||||
sudo certbot --nginx -d ocr.yourdomain.com
|
||||
|
||||
# Test auto-renewal
|
||||
sudo certbot renew --dry-run
|
||||
```
|
||||
|
||||
Certbot akan otomatis mengupdate konfigurasi Nginx untuk HTTPS.
|
||||
|
||||
## Langkah 10: Setup Firewall
|
||||
|
||||
```bash
|
||||
# Install UFW (jika belum ada)
|
||||
sudo apt install -y ufw
|
||||
|
||||
# Allow SSH (PENTING! Jangan sampai terkunci)
|
||||
sudo ufw allow 22/tcp
|
||||
|
||||
# Allow HTTP dan HTTPS
|
||||
sudo ufw allow 80/tcp
|
||||
sudo ufw allow 443/tcp
|
||||
|
||||
# Enable firewall
|
||||
sudo ufw enable
|
||||
|
||||
# Check status
|
||||
sudo ufw status
|
||||
```
|
||||
|
||||
## Langkah 11: Verifikasi Deployment
|
||||
|
||||
### Test dari Server
|
||||
|
||||
```bash
|
||||
# Health check
|
||||
curl http://localhost:8000/api/v1/health
|
||||
|
||||
# Test dengan API key
|
||||
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||
-H "X-API-Key: your-api-key-here" \
|
||||
-F "file=@/path/to/test.pdf"
|
||||
```
|
||||
|
||||
### Test dari Client
|
||||
|
||||
```bash
|
||||
# Health check via domain
|
||||
curl https://ocr.yourdomain.com/api/v1/health
|
||||
|
||||
# Upload dokumen
|
||||
curl -X POST https://ocr.yourdomain.com/api/v1/documents \
|
||||
-H "X-API-Key: your-api-key-here" \
|
||||
-F "file=@document.pdf"
|
||||
```
|
||||
|
||||
## Monitoring dan Maintenance
|
||||
|
||||
### View Logs
|
||||
|
||||
```bash
|
||||
# API logs
|
||||
sudo journalctl -u ocr-sprint-api -f
|
||||
|
||||
# Worker logs
|
||||
sudo journalctl -u ocr-sprint-worker -f
|
||||
|
||||
# Nginx logs
|
||||
sudo tail -f /var/log/nginx/ocr-sprint-access.log
|
||||
sudo tail -f /var/log/nginx/ocr-sprint-error.log
|
||||
|
||||
# PostgreSQL logs
|
||||
sudo tail -f /var/log/postgresql/postgresql-14-main.log
|
||||
```
|
||||
|
||||
### Service Management
|
||||
|
||||
```bash
|
||||
# Restart services
|
||||
sudo systemctl restart ocr-sprint-api
|
||||
sudo systemctl restart ocr-sprint-worker
|
||||
|
||||
# Stop services
|
||||
sudo systemctl stop ocr-sprint-api
|
||||
sudo systemctl stop ocr-sprint-worker
|
||||
|
||||
# Check status
|
||||
sudo systemctl status ocr-sprint-api
|
||||
sudo systemctl status ocr-sprint-worker
|
||||
```
|
||||
|
||||
### Database Backup
|
||||
|
||||
```bash
|
||||
# Create backup script
|
||||
sudo nano /opt/ocr-sprint-service/backup.sh
|
||||
```
|
||||
|
||||
**Content `backup.sh`:**
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
BACKUP_DIR="/opt/ocr-sprint-service/backups"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
mkdir -p $BACKUP_DIR
|
||||
|
||||
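# Backup database (pg_dump via cron tidak bisa meminta password; simpan kredensial di ~/.pgpass milik user yang menjalankan cron)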
|
||||
pg_dump -U ocr -h localhost ocr_sprint | gzip > $BACKUP_DIR/db_$DATE.sql.gz
|
||||
|
||||
# Backup blobs (opsional, bisa besar)
|
||||
# tar -czf $BACKUP_DIR/blobs_$DATE.tar.gz /opt/ocr-sprint-service/storage/blobs
|
||||
|
||||
# Keep only last 7 days
|
||||
find $BACKUP_DIR -name "db_*.sql.gz" -mtime +7 -delete
|
||||
|
||||
echo "Backup completed: $DATE"
|
||||
```
|
||||
|
||||
```bash
|
||||
# Make executable
|
||||
chmod +x /opt/ocr-sprint-service/backup.sh
|
||||
|
||||
# Setup cron job (daily at 2 AM)
|
||||
sudo crontab -e
|
||||
|
||||
# Add line:
|
||||
0 2 * * * /opt/ocr-sprint-service/backup.sh >> /var/log/ocr-backup.log 2>&1
|
||||
```
|
||||
|
||||
### Log Rotation
|
||||
|
||||
```bash
|
||||
sudo nano /etc/logrotate.d/ocr-sprint
|
||||
```
|
||||
|
||||
**Content:**
|
||||
|
||||
```
|
||||
/var/log/nginx/ocr-sprint-*.log {
|
||||
daily
|
||||
rotate 14
|
||||
compress
|
||||
delaycompress
|
||||
notifempty
|
||||
create 0640 www-data adm
|
||||
sharedscripts
|
||||
postrotate
|
||||
[ -f /var/run/nginx.pid ] && kill -USR1 `cat /var/run/nginx.pid`
|
||||
endscript
|
||||
}
|
||||
```
|
||||
|
||||
## Update Application
|
||||
|
||||
```bash
|
||||
# Switch ke user ocr
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
|
||||
# Pull latest code
|
||||
git pull
|
||||
|
||||
# Activate venv
|
||||
source .venv/bin/activate
|
||||
|
||||
# Update dependencies
|
||||
pip install -e ".[ocr]"
|
||||
|
||||
# Run migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Exit user ocr
|
||||
exit
|
||||
|
||||
# Restart services
|
||||
sudo systemctl restart ocr-sprint-api
|
||||
sudo systemctl restart ocr-sprint-worker
|
||||
|
||||
# Check logs
|
||||
sudo journalctl -u ocr-sprint-api -n 50
|
||||
```
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Increase Worker Concurrency
|
||||
|
||||
```bash
|
||||
# Edit worker service
|
||||
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||
|
||||
# Ubah --concurrency sesuai CPU cores
|
||||
# Untuk 8 cores: --concurrency=4
|
||||
# Untuk 16 cores: --concurrency=8
|
||||
|
||||
# Reload dan restart
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl restart ocr-sprint-worker
|
||||
```
|
||||
|
||||
### PostgreSQL Tuning
|
||||
|
||||
```bash
|
||||
sudo nano /etc/postgresql/14/main/postgresql.conf
|
||||
```
|
||||
|
||||
**Recommended settings untuk 16GB RAM:**
|
||||
|
||||
```
|
||||
shared_buffers = 4GB
|
||||
effective_cache_size = 12GB
|
||||
maintenance_work_mem = 1GB
|
||||
checkpoint_completion_target = 0.9
|
||||
wal_buffers = 16MB
|
||||
default_statistics_target = 100
|
||||
random_page_cost = 1.1
|
||||
effective_io_concurrency = 200
|
||||
work_mem = 10MB
|
||||
min_wal_size = 1GB
|
||||
max_wal_size = 4GB
|
||||
max_worker_processes = 4
|
||||
max_parallel_workers_per_gather = 2
|
||||
max_parallel_workers = 4
|
||||
```
|
||||
|
||||
```bash
|
||||
sudo systemctl restart postgresql
|
||||
```
|
||||
|
||||
### Redis Tuning
|
||||
|
||||
```bash
|
||||
sudo nano /etc/redis/redis.conf
|
||||
```
|
||||
|
||||
**Recommended settings:**
|
||||
|
||||
```
|
||||
maxmemory 2gb
|
||||
maxmemory-policy allkeys-lru
|
||||
# Disable RDB snapshots untuk performance (redis.conf tidak mendukung komentar inline)
|
||||
save ""
|
||||
```
|
||||
|
||||
```bash
|
||||
sudo systemctl restart redis
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Service tidak start
|
||||
|
||||
```bash
|
||||
# Check logs
|
||||
sudo journalctl -u ocr-sprint-api -n 100 --no-pager
|
||||
sudo journalctl -u ocr-sprint-worker -n 100 --no-pager
|
||||
|
||||
# Check permissions
|
||||
ls -la /opt/ocr-sprint-service
|
||||
ls -la /opt/ocr-sprint-service/storage
|
||||
|
||||
# Test manual run
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000
|
||||
```
|
||||
|
||||
### Database connection error
|
||||
|
||||
```bash
|
||||
# Test connection
|
||||
sudo -u ocr psql -h localhost -U ocr -d ocr_sprint
|
||||
|
||||
# Check PostgreSQL status
|
||||
sudo systemctl status postgresql
|
||||
|
||||
# Check pg_hba.conf
|
||||
sudo cat /etc/postgresql/14/main/pg_hba.conf | grep ocr
|
||||
```
|
||||
|
||||
### Redis connection error
|
||||
|
||||
```bash
|
||||
# Test Redis
|
||||
redis-cli ping
|
||||
|
||||
# Check Redis status
|
||||
sudo systemctl status redis
|
||||
|
||||
# Check Redis logs
|
||||
sudo journalctl -u redis -n 50
|
||||
```
|
||||
|
||||
### PaddleOCR model download gagal
|
||||
|
||||
```bash
|
||||
# Download manual
|
||||
sudo su - ocr
|
||||
cd /opt/ocr-sprint-service
|
||||
source .venv/bin/activate
|
||||
|
||||
python << EOF
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(use_angle_cls=True, lang='latin')
|
||||
print("Models downloaded successfully")
|
||||
EOF
|
||||
```
|
||||
|
||||
### Out of memory
|
||||
|
||||
```bash
|
||||
# Check memory usage
|
||||
free -h
|
||||
htop
|
||||
|
||||
# Reduce worker concurrency
|
||||
sudo nano /etc/systemd/system/ocr-sprint-worker.service
|
||||
# Ubah --concurrency=1
|
||||
|
||||
# Add swap (jika perlu)
|
||||
sudo fallocate -l 4G /swapfile
|
||||
sudo chmod 600 /swapfile
|
||||
sudo mkswap /swapfile
|
||||
sudo swapon /swapfile
|
||||
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
|
||||
```
|
||||
|
||||
## Security Checklist
|
||||
|
||||
- [ ] API keys diganti dengan nilai random yang kuat
|
||||
- [ ] Database password diganti dari default
|
||||
- [ ] Firewall enabled (UFW) - hanya port 22, 80, 443 terbuka
|
||||
- [ ] SSL/TLS enabled via Let's Encrypt
|
||||
- [ ] `/metrics` endpoint restricted ke internal network
|
||||
- [ ] Nginx rate limiting configured
|
||||
- [ ] PostgreSQL hanya listen di localhost
|
||||
- [ ] Redis hanya listen di localhost
|
||||
- [ ] Regular backup configured (cron job)
|
||||
- [ ] Log rotation configured
|
||||
- [ ] OS security updates enabled (`unattended-upgrades`)
|
||||
- [ ] Fail2ban installed untuk SSH protection
|
||||
|
||||
## Monitoring dengan Prometheus (Opsional)
|
||||
|
||||
### Install Prometheus
|
||||
|
||||
```bash
|
||||
# Download Prometheus
|
||||
cd /tmp
|
||||
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
|
||||
tar xvfz prometheus-*.tar.gz
|
||||
sudo mv prometheus-2.45.0.linux-amd64 /opt/prometheus
|
||||
|
||||
# Create user
|
||||
sudo useradd --no-create-home --shell /bin/false prometheus
|
||||
|
||||
# Create directories
|
||||
sudo mkdir /etc/prometheus /var/lib/prometheus
|
||||
sudo chown prometheus:prometheus /var/lib/prometheus
|
||||
```
|
||||
|
||||
### Configure Prometheus
|
||||
|
||||
```bash
|
||||
sudo nano /etc/prometheus/prometheus.yml
|
||||
```
|
||||
|
||||
**Content:**
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'ocr-sprint'
|
||||
static_configs:
|
||||
- targets: ['localhost:8000']
|
||||
metrics_path: '/metrics'
|
||||
```
|
||||
|
||||
### Create Systemd Service
|
||||
|
||||
```bash
|
||||
sudo nano /etc/systemd/system/prometheus.service
|
||||
```
|
||||
|
||||
**Content:**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=Prometheus
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
User=prometheus
|
||||
Group=prometheus
|
||||
Type=simple
|
||||
ExecStart=/opt/prometheus/prometheus \
|
||||
--config.file=/etc/prometheus/prometheus.yml \
|
||||
--storage.tsdb.path=/var/lib/prometheus/
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
```bash
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable prometheus
|
||||
sudo systemctl start prometheus
|
||||
```
|
||||
|
||||
Access Prometheus di `http://localhost:9090`
|
||||
|
||||
## Support
|
||||
|
||||
Untuk pertanyaan atau issues, hubungi tim development.
|
||||
437
docs/DEPLOYMENT.md
Normal file
437
docs/DEPLOYMENT.md
Normal file
@@ -0,0 +1,437 @@
|
||||
# Quickstart Deployment OCR Sprint Service
|
||||
|
||||
Panduan deployment OCR Sprint Service ke server production untuk pemrosesan dokumen surat sprint Polri.
|
||||
|
||||
## Prasyarat Server
|
||||
|
||||
### Spesifikasi Minimum
|
||||
- **OS**: Linux (Ubuntu 20.04+ / Debian 11+ / RHEL 8+)
|
||||
- **CPU**: 4 cores (8 cores recommended untuk throughput tinggi)
|
||||
- **RAM**: 8 GB minimum (16 GB recommended)
|
||||
- **Storage**: 50 GB free space
|
||||
- ~3 GB untuk model PaddleOCR
|
||||
- ~1.5 GB untuk dependencies Python
|
||||
- Sisanya untuk blob storage dokumen
|
||||
- **Network**: Port 80/443 terbuka untuk API access via reverse proxy (port 8000 cukup diakses dari localhost)
|
||||
|
||||
### Software Requirements
|
||||
- Docker 24.0+ dan Docker Compose v2
|
||||
- Git
|
||||
- (Opsional) Nginx/Caddy untuk reverse proxy + SSL
|
||||
|
||||
## Deployment dengan Docker Compose (Recommended)
|
||||
|
||||
### 1. Clone Repository
|
||||
|
||||
```bash
|
||||
# Login ke server sebagai user non-root dengan sudo access
|
||||
ssh user@your-server.com
|
||||
|
||||
# Clone repository
|
||||
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||
cd ocr-sprint-service
|
||||
```
|
||||
|
||||
### 2. Konfigurasi Environment
|
||||
|
||||
```bash
|
||||
# Copy template environment
|
||||
cp .env.example .env
|
||||
|
||||
# Edit konfigurasi production
|
||||
nano .env
|
||||
```
|
||||
|
||||
**Konfigurasi penting untuk production:**
|
||||
|
||||
```bash
|
||||
# ==== App ====
|
||||
APP_ENV=prod
|
||||
APP_LOG_LEVEL=INFO
|
||||
|
||||
# ==== Storage ====
|
||||
STORAGE_LOCAL_DIR=/app/storage
|
||||
BLOB_STORAGE_DIR=/app/storage/blobs
|
||||
BLOB_MAX_UPLOAD_MB=25
|
||||
|
||||
# ==== OCR ====
|
||||
OCR_LANG=latin
|
||||
OCR_USE_GPU=false # set true jika server punya GPU NVIDIA
|
||||
OCR_MAX_IMAGE_SIDE=2200
|
||||
|
||||
# ==== Preprocessing ====
|
||||
PREPROCESS_TARGET_DPI=300
|
||||
PREPROCESS_DENOISE=true
|
||||
PREPROCESS_DESKEW=true
|
||||
PREPROCESS_DETECT_DOCUMENT=true
|
||||
PREPROCESS_REMOVE_SHADOW=true
|
||||
|
||||
# ==== Table Extraction ====
|
||||
TABLES_ENABLED=true
|
||||
|
||||
# ==== Async Pipeline ====
|
||||
QUEUE_ENABLED=true
|
||||
REDIS_URL=redis://redis:6379/0
|
||||
CELERY_TASK_DEFAULT_QUEUE=ocr_sprint
|
||||
|
||||
# ==== Database ====
|
||||
DATABASE_URL=postgresql+psycopg://ocr:ocr@postgres:5432/ocr_sprint
|
||||
DATABASE_ECHO=false
|
||||
|
||||
# ==== Auth (WAJIB untuk production!) ====
|
||||
API_KEYS=your-secret-key-1,your-secret-key-2
|
||||
API_KEY_HEADER=X-API-Key
|
||||
```
|
||||
|
||||
**Generate API keys yang aman:**
|
||||
|
||||
```bash
|
||||
# Generate random API key
|
||||
openssl rand -hex 32
|
||||
```
|
||||
|
||||
### 3. Build dan Start Services
|
||||
|
||||
```bash
|
||||
# Build Docker images
|
||||
docker compose build
|
||||
|
||||
# Start semua services (API, Worker, Redis, Postgres)
|
||||
docker compose up -d
|
||||
|
||||
# Cek logs untuk memastikan semua berjalan
|
||||
docker compose logs -f api worker
|
||||
```
|
||||
|
||||
**Services yang berjalan:**
|
||||
- `api`: FastAPI server di port 8000
|
||||
- `worker`: Celery worker untuk async processing
|
||||
- `redis`: Message broker untuk job queue
|
||||
- `postgres`: Database untuk job state
|
||||
|
||||
### 4. Verifikasi Deployment
|
||||
|
||||
```bash
|
||||
# Health check
|
||||
curl http://localhost:8000/api/v1/health
|
||||
|
||||
# Expected response:
|
||||
# {"status":"ok","version":"0.1.0"}
|
||||
|
||||
# Test OCR endpoint (sync mode untuk testing)
|
||||
curl -X POST "http://localhost:8000/api/v1/documents?sync=true" \
|
||||
-H "X-API-Key: your-secret-key-1" \
|
||||
-F "file=@samples/pdf/example.pdf" \
|
||||
| jq
|
||||
```
|
||||
|
||||
### 5. Setup Reverse Proxy (Nginx)
|
||||
|
||||
**Install Nginx:**
|
||||
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install nginx certbot python3-certbot-nginx
|
||||
```
|
||||
|
||||
**Konfigurasi Nginx (`/etc/nginx/sites-available/ocr-sprint`):**
|
||||
|
||||
```nginx
|
||||
upstream ocr_api {
|
||||
server localhost:8000;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name ocr.yourdomain.com;
|
||||
|
||||
client_max_body_size 30M; # Sesuaikan dengan BLOB_MAX_UPLOAD_MB
|
||||
|
||||
location / {
|
||||
proxy_pass http://ocr_api;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# Timeout untuk dokumen besar
|
||||
proxy_read_timeout 300s;
|
||||
proxy_connect_timeout 75s;
|
||||
}
|
||||
|
||||
location /metrics {
|
||||
# Restrict metrics endpoint
|
||||
allow 10.0.0.0/8; # Internal network only
|
||||
deny all;
|
||||
proxy_pass http://ocr_api;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Enable site dan setup SSL:**
|
||||
|
||||
```bash
|
||||
# Enable site
|
||||
sudo ln -s /etc/nginx/sites-available/ocr-sprint /etc/nginx/sites-enabled/
|
||||
sudo nginx -t
|
||||
sudo systemctl reload nginx
|
||||
|
||||
# Setup SSL dengan Let's Encrypt
|
||||
sudo certbot --nginx -d ocr.yourdomain.com
|
||||
```
|
||||
|
||||
## Deployment Manual (Tanpa Docker)
|
||||
|
||||
### 1. Install System Dependencies
|
||||
|
||||
```bash
|
||||
# Ubuntu/Debian
|
||||
sudo apt update
|
||||
sudo apt install -y \
|
||||
python3.11 python3.11-venv python3-pip \
|
||||
libgl1 libglib2.0-0 libsm6 libxext6 libxrender1 \
|
||||
libgomp1 libmagic1 \
|
||||
redis-server postgresql-14
|
||||
|
||||
# Start services
|
||||
sudo systemctl enable --now redis-server postgresql
|
||||
```
|
||||
|
||||
### 2. Setup Database
|
||||
|
||||
```bash
|
||||
# Create database dan user
|
||||
sudo -u postgres psql << EOF
|
||||
CREATE USER ocr WITH PASSWORD 'your-secure-password';
|
||||
CREATE DATABASE ocr_sprint OWNER ocr;
|
||||
GRANT ALL PRIVILEGES ON DATABASE ocr_sprint TO ocr;
|
||||
EOF
|
||||
```
|
||||
|
||||
### 3. Install Application
|
||||
|
||||
```bash
|
||||
# Clone repository
|
||||
git clone https://github.com/Adriankf59/ocr-sprint-service.git
|
||||
cd ocr-sprint-service
|
||||
|
||||
# Create virtual environment
|
||||
python3.11 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
|
||||
# Install dependencies
|
||||
pip install --upgrade pip
|
||||
pip install -e ".[ocr]"
|
||||
|
||||
# Copy dan edit .env
|
||||
cp .env.example .env
|
||||
nano .env
|
||||
```
|
||||
|
||||
**Update DATABASE_URL di .env:**
|
||||
|
||||
```bash
|
||||
DATABASE_URL=postgresql+psycopg://ocr:your-secure-password@localhost:5432/ocr_sprint
|
||||
REDIS_URL=redis://localhost:6379/0
|
||||
QUEUE_ENABLED=true
|
||||
```
|
||||
|
||||
### 4. Run Database Migrations
|
||||
|
||||
```bash
|
||||
alembic upgrade head
|
||||
```
|
||||
|
||||
### 5. Setup Systemd Services
|
||||
|
||||
**API Service (`/etc/systemd/system/ocr-sprint-api.service`):**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=OCR Sprint API
|
||||
After=network.target postgresql.service redis.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ocr
|
||||
WorkingDirectory=/opt/ocr-sprint-service
|
||||
Environment="PATH=/opt/ocr-sprint-service/.venv/bin"
|
||||
ExecStart=/opt/ocr-sprint-service/.venv/bin/uvicorn ocr_sprint.main:app --host 0.0.0.0 --port 8000 --workers 4
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
**Worker Service (`/etc/systemd/system/ocr-sprint-worker.service`):**
|
||||
|
||||
```ini
|
||||
[Unit]
|
||||
Description=OCR Sprint Celery Worker
|
||||
After=network.target postgresql.service redis.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ocr
|
||||
WorkingDirectory=/opt/ocr-sprint-service
|
||||
Environment="PATH=/opt/ocr-sprint-service/.venv/bin"
|
||||
ExecStart=/opt/ocr-sprint-service/.venv/bin/celery -A ocr_sprint.worker.celery_app worker -l info --concurrency=2
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
**Enable dan start services:**
|
||||
|
||||
```bash
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now ocr-sprint-api ocr-sprint-worker
|
||||
sudo systemctl status ocr-sprint-api ocr-sprint-worker
|
||||
```
|
||||
|
||||
## Monitoring dan Maintenance
|
||||
|
||||
### Monitoring Logs
|
||||
|
||||
```bash
|
||||
# Docker deployment
|
||||
docker compose logs -f api worker
|
||||
|
||||
# Manual deployment
|
||||
sudo journalctl -u ocr-sprint-api -f
|
||||
sudo journalctl -u ocr-sprint-worker -f
|
||||
```
|
||||
|
||||
### Prometheus Metrics
|
||||
|
||||
Metrics tersedia di endpoint `/metrics`:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/metrics
|
||||
```
|
||||
|
||||
**Key metrics:**
|
||||
- `ocr_documents_total`: Total dokumen diproses
|
||||
- `ocr_processing_duration_seconds`: Durasi processing
|
||||
- `ocr_confidence_score`: Distribusi confidence score
|
||||
- `celery_task_*`: Celery worker metrics
|
||||
|
||||
### Backup Database
|
||||
|
||||
```bash
|
||||
# Docker deployment
|
||||
docker compose exec -T postgres pg_dump -U ocr ocr_sprint > "backup_$(date +%Y%m%d).sql"
|
||||
|
||||
# Manual deployment
|
||||
pg_dump -U ocr ocr_sprint > backup_$(date +%Y%m%d).sql
|
||||
```
|
||||
|
||||
### Update Service
|
||||
|
||||
```bash
|
||||
# Docker deployment
|
||||
cd ocr-sprint-service
|
||||
git pull
|
||||
docker compose build
|
||||
docker compose up -d
|
||||
|
||||
# Manual deployment
|
||||
cd ocr-sprint-service
|
||||
git pull
|
||||
source .venv/bin/activate
|
||||
pip install -e ".[ocr]"
|
||||
alembic upgrade head
|
||||
sudo systemctl restart ocr-sprint-api ocr-sprint-worker
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Service tidak start
|
||||
|
||||
```bash
|
||||
# Cek logs
|
||||
docker compose logs api worker
|
||||
|
||||
# Cek health check
|
||||
curl http://localhost:8000/api/v1/health
|
||||
```
|
||||
|
||||
### PaddleOCR model download gagal
|
||||
|
||||
```bash
|
||||
# Download manual ke volume
|
||||
docker compose exec api python -c "from paddleocr import PaddleOCR; PaddleOCR(use_angle_cls=True, lang='latin')"
|
||||
```
|
||||
|
||||
### Worker tidak memproses jobs
|
||||
|
||||
```bash
|
||||
# Cek Redis connection
|
||||
docker compose exec worker redis-cli -h redis ping
|
||||
|
||||
# Cek Celery worker status
|
||||
docker compose exec worker celery -A ocr_sprint.worker.celery_app inspect active
|
||||
```
|
||||
|
||||
### Database migration error
|
||||
|
||||
```bash
|
||||
# Cek current revision
|
||||
docker compose exec api alembic current
|
||||
|
||||
# Force upgrade
|
||||
docker compose exec api alembic upgrade head
|
||||
```
|
||||
|
||||
### Out of memory
|
||||
|
||||
```bash
|
||||
# Kurangi worker concurrency di docker-compose.yml
|
||||
# Ubah: --concurrency=1 (default) atau tambahkan memory limit
|
||||
```
|
||||
|
||||
## Security Checklist
|
||||
|
||||
- [ ] API_KEYS diset dengan nilai random yang kuat
|
||||
- [ ] Firewall configured (hanya port 80/443 terbuka)
|
||||
- [ ] SSL/TLS enabled via Nginx + Let's Encrypt
|
||||
- [ ] Database password diganti dari default
|
||||
- [ ] `/metrics` endpoint restricted ke internal network
|
||||
- [ ] Regular backup database dan blob storage
|
||||
- [ ] Log rotation configured
|
||||
- [ ] OS security updates enabled
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Untuk throughput tinggi:
|
||||
|
||||
1. **Increase worker concurrency:**
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
command: ["celery", "-A", "ocr_sprint.worker.celery_app", "worker", "-l", "info", "--concurrency=4"]
|
||||
```
|
||||
|
||||
2. **Scale workers horizontally:**
|
||||
```bash
|
||||
docker compose up -d --scale worker=3
|
||||
```
|
||||
|
||||
3. **Enable GPU (jika tersedia):**
|
||||
```bash
|
||||
# .env
|
||||
OCR_USE_GPU=true
|
||||
```
|
||||
|
||||
4. **Tune Postgres:**
|
||||
```sql
|
||||
-- Increase connection pool
|
||||
ALTER SYSTEM SET max_connections = 200;
|
||||
ALTER SYSTEM SET shared_buffers = '2GB';
|
||||
```
|
||||
|
||||
## Support
|
||||
|
||||
Untuk pertanyaan atau issues, hubungi tim development atau buat issue di repository.
|
||||
@@ -86,14 +86,18 @@ def _row_to_response(row: object) -> DocumentResponse:
|
||||
|
||||
assert isinstance(row, JobRow)
|
||||
status_enum = DocumentStatus(row.status)
|
||||
result_obj: ExtractionResult | None = None
|
||||
personel_list = None
|
||||
if row.result is not None:
|
||||
result_obj = ExtractionResult.model_validate(row.result)
|
||||
# Auto-number personnel entries sequentially (1, 2, 3, ...)
|
||||
for idx, entry in enumerate(result_obj.personel, start=1):
|
||||
entry.no = idx
|
||||
personel_list = result_obj.personel
|
||||
return DocumentResponse(
|
||||
job_id=row.job_id,
|
||||
status=status_enum,
|
||||
confidence=row.confidence,
|
||||
data=result_obj,
|
||||
data=personel_list,
|
||||
review_flags=list(row.review_flags or []),
|
||||
error=row.error,
|
||||
approved=bool(row.approved),
|
||||
|
||||
@@ -33,12 +33,45 @@ PANGKAT_VARIANTS: dict[str, tuple[str, ...]] = {
|
||||
# Perwira Menengah
|
||||
"KOMPOL": ("KOMPOL",),
|
||||
"AKBP": ("AKBP",),
|
||||
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP"),
|
||||
"KOMBES POL": ("KOMBES POL", "KOMBESPOL", "KBP", "KOMBES"),
|
||||
# Perwira Tinggi
|
||||
"BRIGJEN POL": ("BRIGJEN POL", "BRIGJENPOL", "BRIGJEN"),
|
||||
"IRJEN POL": ("IRJEN POL", "IRJENPOL", "IRJEN"),
|
||||
"KOMJEN POL": ("KOMJEN POL", "KOMJENPOL", "KOMJEN"),
|
||||
"JENDERAL POL": ("JENDERAL POL", "JENDERALPOL", "JENDERAL"),
|
||||
# PNS Polri (Pegawai Negeri Sipil di lingkungan Polri). PNS appear
|
||||
# routinely on sprint panitia / undangan templates alongside Polri
|
||||
# personnel, so we treat them as valid ranks for extraction.
|
||||
# Sources: PP 11/2017 jo PP 17/2020 (Manajemen PNS); golongan I-IV.
|
||||
# Golongan I (Juru)
|
||||
"JURU MUDA": ("JURU MUDA",),
|
||||
"JURU MUDA TK I": ("JURU MUDA TK I", "JURU MUDA TK.I", "JURU MUDA TINGKAT I"),
|
||||
"JURU": ("JURU",),
|
||||
"JURU TK I": ("JURU TK I", "JURU TK.I", "JURU TINGKAT I"),
|
||||
# Golongan II (Pengatur)
|
||||
"PENGATUR MUDA": ("PENGATUR MUDA",),
|
||||
"PENGATUR MUDA TK I": (
|
||||
"PENGATUR MUDA TK I",
|
||||
"PENGATUR MUDA TK.I",
|
||||
"PENGATUR MUDA TINGKAT I",
|
||||
),
|
||||
"PENGATUR": ("PENGATUR",),
|
||||
"PENGATUR TK I": ("PENGATUR TK I", "PENGATUR TK.I", "PENGATUR TINGKAT I"),
|
||||
# Golongan III (Penata)
|
||||
"PENATA MUDA": ("PENATA MUDA",),
|
||||
"PENATA MUDA TK I": (
|
||||
"PENATA MUDA TK I",
|
||||
"PENATA MUDA TK.I",
|
||||
"PENATA MUDA TINGKAT I",
|
||||
),
|
||||
"PENATA": ("PENATA",),
|
||||
"PENATA TK I": ("PENATA TK I", "PENATA TK.I", "PENATA TINGKAT I"),
|
||||
# Golongan IV (Pembina)
|
||||
"PEMBINA": ("PEMBINA",),
|
||||
"PEMBINA TK I": ("PEMBINA TK I", "PEMBINA TK.I", "PEMBINA TINGKAT I"),
|
||||
"PEMBINA UTAMA MUDA": ("PEMBINA UTAMA MUDA",),
|
||||
"PEMBINA UTAMA MADYA": ("PEMBINA UTAMA MADYA",),
|
||||
"PEMBINA UTAMA": ("PEMBINA UTAMA",),
|
||||
}
|
||||
|
||||
# Reverse lookup: any variant (uppercased) → canonical form.
|
||||
|
||||
@@ -64,6 +64,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
|
||||
"jabatan dinas": "jabatan_dinas",
|
||||
"jabatan dalam dinas": "jabatan_dinas",
|
||||
"jbt dinas": "jabatan_dinas",
|
||||
"struktural": "jabatan_dinas",
|
||||
"jabatan struktural": "jabatan_dinas",
|
||||
# jabatan dalam sprint (role for this dispatch)
|
||||
"jabatan dalam sprint": "jabatan_sprint",
|
||||
"jabatan dalam sprin": "jabatan_sprint",
|
||||
@@ -72,6 +74,8 @@ _HEADER_SYNONYMS: dict[str, str] = {
|
||||
"jabatan sprin": "jabatan_sprint",
|
||||
"tugas": "jabatan_sprint",
|
||||
"penugasan": "jabatan_sprint",
|
||||
"dalam penugasan": "jabatan_sprint",
|
||||
"jabatan dalam penugasan": "jabatan_sprint",
|
||||
# remarks
|
||||
"keterangan": "keterangan",
|
||||
"ket": "keterangan",
|
||||
|
||||
@@ -38,12 +38,18 @@ _RANK_TOKENS: tuple[str, ...] = tuple(
|
||||
)
|
||||
)
|
||||
_RANK_ALT = "|".join(re.escape(tok) for tok in _RANK_TOKENS)
|
||||
# A line that contains a rank token followed (anywhere on the same line) by
|
||||
# an 8-digit NRP. We allow common separators: '/', '-', '.', ',', ':' or
|
||||
# whitespace. Rank token must be word-bounded so "BRIPDA" doesn't match
|
||||
# inside e.g. "ABRIPDA-style" text.
|
||||
# A rank token followed (within a few characters) by an 8-digit NRP.
|
||||
# We allow common separators: '/', '-', '.', ',', ':' or whitespace.
|
||||
# The trailing ``\b`` plus proximity to the 8-digit NRP is the
|
||||
# specificity signal — we deliberately do *not* require a leading
|
||||
# ``\b`` because real Polri sprint OCR routinely mashes the rank into
|
||||
# the trailing characters of the previous cell (observed on Polres
|
||||
# Banjar: "...CPHR., CBA, CI" runs straight into "AKP" giving
|
||||
# "CIAKP 84011113"). Requiring a leading boundary loses that row
|
||||
# entirely. The longest-first alternation order ensures multi-token
|
||||
# ranks ("KOMBES POL") still win over short overlaps ("KBP").
|
||||
_RE_RANK_NRP_LINE = re.compile(
|
||||
rf"\b(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
|
||||
rf"(?P<rank>{_RANK_ALT})\b[\s/.\-,:]*?(?P<nrp>\d{{8}})\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# A bare row number marker like "1." or "12)". OCR often puts it on its own
|
||||
@@ -143,31 +149,248 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
|
||||
|
||||
Strategy:
|
||||
|
||||
**Pass 1** — same-line rank+NRP (original strategy):
|
||||
1. Iterate every line. Skip lines that don't contain both a known rank
|
||||
and an 8-digit NRP (those are the only signal we trust).
|
||||
2. For each rank+NRP line, look back for the most recent plausible name
|
||||
line, and forward 1-3 lines for jabatan content.
|
||||
3. Emit a ``PersonnelEntry`` only when we have at least pangkat + nrp.
|
||||
|
||||
**Pass 2** — separate-line rank and NRP (for tabular sprint formats):
|
||||
If pass 1 produces no results, scan for lines containing a standalone
|
||||
rank token, then look up to 2 lines forward for a standalone NRP.
|
||||
This handles sprint formats where OCR renders each column on its own
|
||||
line (e.g. Polres Banjar layout).
|
||||
|
||||
**Pass 3** — rank-only (for sprint formats *without* an NRP column):
|
||||
Some sprint templates (panitia, undangan, etc.) list only nama +
|
||||
pangkat + jabatan, no NRP. If pass 1 and pass 2 both yield nothing,
|
||||
fall back to a rank-only scan: every standalone rank line (or
|
||||
two-line rank like "KOMBES" + "POL" produced by narrow-column OCR)
|
||||
becomes a row, with name assembled from preceding lines and jabatan
|
||||
from following lines. ``nrp`` stays ``None``. False-positive risk
|
||||
is higher (stray rank tokens in body text), so this only fires when
|
||||
nothing else matched.
|
||||
|
||||
The fallback is intentionally rate-limited: the first matching rank
|
||||
token on a line wins (no greedy multi-match per line), and a name line
|
||||
can only be consumed once (so a stray ranked text inside a paragraph
|
||||
doesn't turn into multiple bogus entries).
|
||||
"""
|
||||
lines = raw_text.splitlines()
|
||||
|
||||
# ── Pass 1: rank+NRP on the same line ────────────────────────────
|
||||
rows = _extract_same_line(lines)
|
||||
if rows:
|
||||
return rows
|
||||
|
||||
# ── Pass 2: rank and NRP on separate lines ───────────────────────
|
||||
rows = _extract_separate_lines(lines)
|
||||
if rows:
|
||||
return rows
|
||||
|
||||
# ── Pass 3: rank-only (no NRP column) ────────────────────────────
|
||||
return _extract_rank_only(lines)
|
||||
|
||||
|
||||
# Matches a line whose entire content is one known rank token, optionally
# trailed by separator punctuation ("/", ".", "-", ",", ":") and whitespace.
_RE_RANK_ONLY = re.compile(
    rf"^\s*(?P<rank>{_RANK_ALT})\s*[/.\-,:]*\s*$",
    flags=re.IGNORECASE,
)
|
||||
# Regex for a line that contains a standalone 8-digit NRP.
|
||||
_RE_NRP_ONLY = re.compile(r"(?<!\d)(?P<nrp>\d{8})(?!\d)")
|
||||
|
||||
|
||||
# Strip a leading row number marker like "1 ", "1.", "12)" from a name
|
||||
# prefix taken from the same OCR line as a rank+NRP match. Unlike
|
||||
# _RE_ROW_NUMBER (which matches a *whole* line), this is a prefix strip
|
||||
# for embedded same-line cases like "1 CUCU JUHANA, A.K.S. KOMPOL ...".
|
||||
_RE_LEADING_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s+")
|
||||
|
||||
|
||||
def _extract_same_line(lines: list[str]) -> list[PersonnelEntry]:
|
||||
"""Pass 1: rank+NRP pairs found anywhere in the joined text.
|
||||
|
||||
Uses ``finditer`` over the full ``\\n``-joined OCR text rather than
|
||||
``re.search`` per line so that multiple rank+NRP pairs on the same
|
||||
OCR line still produce separate rows. This is required for sprint
|
||||
scans where Paddle merges several table rows into one OCR line
|
||||
(observed on Polres Banjar where row 2's "...CBA.AKP 77020049 KASAT
|
||||
RESKRIM" was being swallowed into row 1's jabatan because per-line
|
||||
``search`` only returns the first match).
|
||||
|
||||
For each match we resolve nama from text *before* the match (the
|
||||
same-line prefix takes precedence; otherwise look back through the
|
||||
preceding lines bounded by the previous match) and jabatan from text
|
||||
*after* the match (same-line suffix plus up to ~3 follow-up lines,
|
||||
bounded by the next match).
|
||||
"""
|
||||
if not lines:
|
||||
return []
|
||||
full_text = "\n".join(lines)
|
||||
|
||||
line_starts: list[int] = []
|
||||
pos = 0
|
||||
for line in lines:
|
||||
line_starts.append(pos)
|
||||
pos += len(line) + 1 # +1 for the joining "\n"
|
||||
|
||||
def offset_to_line(offset: int) -> int:
|
||||
lo, hi = 0, len(line_starts)
|
||||
while lo < hi:
|
||||
mid = (lo + hi) // 2
|
||||
if line_starts[mid] <= offset:
|
||||
lo = mid + 1
|
||||
else:
|
||||
hi = mid
|
||||
return max(0, lo - 1)
|
||||
|
||||
matches = list(_RE_RANK_NRP_LINE.finditer(full_text))
|
||||
rows: list[PersonnelEntry] = []
|
||||
consumed_lines: set[int] = set()
|
||||
|
||||
for i, m in enumerate(matches):
|
||||
pangkat = normalize_pangkat(m.group("rank"))
|
||||
if not pangkat or not is_valid_pangkat(pangkat):
|
||||
continue
|
||||
nrp = m.group("nrp")
|
||||
ml = offset_to_line(m.start())
|
||||
prev_ml = (
|
||||
offset_to_line(matches[i - 1].start()) if i > 0 else -1
|
||||
)
|
||||
next_ml = (
|
||||
offset_to_line(matches[i + 1].start())
|
||||
if i + 1 < len(matches)
|
||||
else len(lines)
|
||||
)
|
||||
|
||||
line_text = lines[ml]
|
||||
line_off = line_starts[ml]
|
||||
|
||||
# Same-line prefix: text on this line *before* the rank token.
|
||||
# If the previous match was on this same line, only consider the
|
||||
# text after that previous match's NRP (otherwise we'd reuse the
|
||||
# earlier row's tail as this row's name).
|
||||
prefix_start_local = 0
|
||||
if prev_ml == ml and i > 0:
|
||||
prefix_start_local = max(0, matches[i - 1].end() - line_off)
|
||||
prefix = line_text[prefix_start_local : m.start() - line_off]
|
||||
|
||||
# Same-line suffix: text on this line *after* the NRP, capped at
|
||||
# the next match's start if it's on this same line.
|
||||
suffix_end_local = len(line_text)
|
||||
if next_ml == ml and i + 1 < len(matches):
|
||||
suffix_end_local = matches[i + 1].start() - line_off
|
||||
suffix = line_text[m.end() - line_off : suffix_end_local]
|
||||
|
||||
# ── Resolve nama ────────────────────────────────────────────
|
||||
nama: str | None = None
|
||||
prefix_clean = _RE_LEADING_ROW_NUMBER.sub("", prefix).strip()
|
||||
if prefix_clean and _is_plausible_name(prefix_clean):
|
||||
nama = prefix_clean
|
||||
elif prev_ml < ml:
|
||||
for back in range(ml - 1, prev_ml, -1):
|
||||
if back in consumed_lines or back < 0:
|
||||
continue
|
||||
candidate = lines[back].strip()
|
||||
if _is_plausible_name(candidate):
|
||||
nama = candidate
|
||||
consumed_lines.add(back)
|
||||
break
|
||||
|
||||
# ── Resolve jabatan ─────────────────────────────────────────
|
||||
jabatan_parts: list[str] = []
|
||||
suffix_clean = suffix.strip()
|
||||
if suffix_clean:
|
||||
jabatan_parts.append(suffix_clean)
|
||||
if next_ml > ml:
|
||||
max_fwd = min(ml + 4, next_ml, len(lines))
|
||||
for fwd in range(ml + 1, max_fwd):
|
||||
candidate = lines[fwd].strip()
|
||||
if not candidate:
|
||||
if jabatan_parts:
|
||||
break
|
||||
continue
|
||||
if _RE_NAME_BLOCKLIST.match(candidate):
|
||||
break
|
||||
if _RE_ROW_NUMBER.match(candidate):
|
||||
break
|
||||
jabatan_parts.append(candidate)
|
||||
jabatan = (
|
||||
" ".join(" ".join(jabatan_parts).split())
|
||||
if jabatan_parts
|
||||
else None
|
||||
)
|
||||
|
||||
rows.append(
|
||||
PersonnelEntry(
|
||||
no=None,
|
||||
pangkat=pangkat,
|
||||
nrp=nrp,
|
||||
nama=nama,
|
||||
jabatan_dinas=jabatan,
|
||||
jabatan_sprint=None,
|
||||
keterangan=None,
|
||||
)
|
||||
)
|
||||
return rows
|
||||
|
||||
|
||||
def _extract_separate_lines(lines: list[str]) -> list[PersonnelEntry]:
|
||||
"""Pass 2: rank and NRP on separate nearby lines.
|
||||
|
||||
Handles tabular sprint formats where OCR outputs each column as its
|
||||
own line, e.g.:
|
||||
1
|
||||
CUCU JUHANA, A.K.S.
|
||||
KOMPOL
|
||||
70100418
|
||||
KABAGOPS
|
||||
"""
|
||||
consumed_names: set[int] = set()
|
||||
consumed_nrps: set[int] = set()
|
||||
rows: list[PersonnelEntry] = []
|
||||
|
||||
for idx, raw_line in enumerate(lines):
|
||||
line = raw_line.strip()
|
||||
match = _RE_RANK_NRP_LINE.search(line)
|
||||
if not match:
|
||||
rank_match = _RE_RANK_ONLY.match(line)
|
||||
if not rank_match:
|
||||
# Also try: line starts with a rank token (may have trailing text)
|
||||
for tok in _RANK_TOKENS:
|
||||
if line.upper().startswith(tok) and len(line) - len(tok) < 5:
|
||||
rank_match = re.match(
|
||||
rf"^\s*(?P<rank>{re.escape(tok)})\s*[/.\-,:]*",
|
||||
line,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if rank_match:
|
||||
break
|
||||
if not rank_match:
|
||||
continue
|
||||
pangkat = normalize_pangkat(match.group("rank"))
|
||||
|
||||
pangkat = normalize_pangkat(rank_match.group("rank"))
|
||||
if not pangkat or not is_valid_pangkat(pangkat):
|
||||
continue
|
||||
nrp = match.group("nrp")
|
||||
|
||||
# Look forward up to 2 lines for NRP
|
||||
nrp: str | None = None
|
||||
nrp_idx: int | None = None
|
||||
for fwd in range(idx + 1, min(idx + 3, len(lines))):
|
||||
if fwd in consumed_nrps:
|
||||
continue
|
||||
nrp_match = _RE_NRP_ONLY.search(lines[fwd].strip())
|
||||
if nrp_match:
|
||||
nrp = nrp_match.group("nrp")
|
||||
nrp_idx = fwd
|
||||
break
|
||||
|
||||
if not nrp:
|
||||
continue
|
||||
assert nrp_idx is not None
|
||||
consumed_nrps.add(nrp_idx)
|
||||
|
||||
# Look back for name
|
||||
nama: str | None = None
|
||||
for back in range(idx - 1, max(idx - 6, -1), -1):
|
||||
if back in consumed_names:
|
||||
@@ -178,7 +401,8 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
|
||||
consumed_names.add(back)
|
||||
break
|
||||
|
||||
jabatan = _following_jabatan(lines, idx)
|
||||
# Look forward after NRP for jabatan
|
||||
jabatan = _following_jabatan(lines, nrp_idx)
|
||||
rows.append(
|
||||
PersonnelEntry(
|
||||
no=None,
|
||||
@@ -193,6 +417,370 @@ def extract_personnel_from_text(raw_text: str) -> list[PersonnelEntry]:
|
||||
return rows
|
||||
|
||||
|
||||
# Bare row-number markers used by sprint formats without NRP (the dot
|
||||
# is often missing in narrow-column OCR, e.g. just "1" on its own line).
|
||||
_RE_BARE_ROW_NUMBER = re.compile(r"^\s*\d{1,3}\s*[.):]?\s*$")
|
||||
|
||||
|
||||
def _try_match_rank_at(lines: list[str], idx: int) -> tuple[str, int] | None:
|
||||
"""Try to match a standalone rank starting at ``lines[idx]``.
|
||||
|
||||
Returns ``(rank_text, lines_consumed)`` on success. Handles narrow-
|
||||
column OCR that splits a multi-token rank across two lines (e.g.
|
||||
``"KOMBES"`` + ``"POL"`` or ``"PENATA"`` + ``"TK I"``).
|
||||
|
||||
The two-line concatenation is tried *first* so that more-specific
|
||||
multi-token ranks ("PENATA TK I") win over their less-specific
|
||||
single-line prefix ("PENATA"). Without this preference, "TK I"
|
||||
would leak into the jabatan column.
|
||||
"""
|
||||
if idx >= len(lines):
|
||||
return None
|
||||
line = lines[idx].strip()
|
||||
if idx + 1 < len(lines):
|
||||
combined = (line + " " + lines[idx + 1].strip()).strip()
|
||||
m2 = _RE_RANK_ONLY.match(combined)
|
||||
if m2:
|
||||
return m2.group("rank"), 2
|
||||
m = _RE_RANK_ONLY.match(line)
|
||||
if m:
|
||||
return m.group("rank"), 1
|
||||
return None
|
||||
|
||||
|
||||
def _extract_rank_only(lines: list[str]) -> list[PersonnelEntry]:
|
||||
"""Pass 3: rank-only fallback for sprint formats without an NRP column.
|
||||
|
||||
Each standalone rank line (single line or two-line concatenation) is
|
||||
treated as the pivot of a personnel row. ``nama`` is assembled from
|
||||
the preceding contiguous plausible-name lines (typical OCR splits a
|
||||
long name across 2-3 short lines because of narrow columns); jabatan
|
||||
is collected from following lines until the next rank or row marker.
|
||||
|
||||
``nrp`` is always ``None`` for rows produced by this pass.
|
||||
"""
|
||||
rows: list[PersonnelEntry] = []
|
||||
consumed_lines: set[int] = set()
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
match = _try_match_rank_at(lines, i)
|
||||
if not match:
|
||||
i += 1
|
||||
continue
|
||||
rank_text, rank_span = match
|
||||
pangkat = normalize_pangkat(rank_text)
|
||||
if not pangkat or not is_valid_pangkat(pangkat):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# ── Look back for name lines (assemble up to 4 contiguous lines) ──
|
||||
name_lines: list[str] = []
|
||||
for back in range(i - 1, max(i - 6, -1), -1):
|
||||
if back in consumed_lines:
|
||||
break
|
||||
candidate = lines[back].strip()
|
||||
if not candidate:
|
||||
if name_lines:
|
||||
break
|
||||
continue
|
||||
if _RE_BARE_ROW_NUMBER.match(candidate):
|
||||
break
|
||||
if _RE_NAME_BLOCKLIST.match(candidate):
|
||||
break
|
||||
if _try_match_rank_at(lines, back) is not None:
|
||||
break
|
||||
if not _is_plausible_name(candidate):
|
||||
break
|
||||
name_lines.insert(0, candidate)
|
||||
consumed_lines.add(back)
|
||||
nama = " ".join(" ".join(name_lines).split()) if name_lines else None
|
||||
|
||||
# ── Look forward for jabatan (stop at next rank / row marker) ─────
|
||||
jabatan_parts: list[str] = []
|
||||
fwd = i + rank_span
|
||||
steps = 0
|
||||
while fwd < len(lines) and steps < 8:
|
||||
candidate = lines[fwd].strip()
|
||||
if not candidate:
|
||||
if jabatan_parts:
|
||||
break
|
||||
fwd += 1
|
||||
steps += 1
|
||||
continue
|
||||
if _RE_BARE_ROW_NUMBER.match(candidate):
|
||||
break
|
||||
if _try_match_rank_at(lines, fwd) is not None:
|
||||
break
|
||||
if _RE_NAME_BLOCKLIST.match(candidate):
|
||||
break
|
||||
jabatan_parts.append(candidate)
|
||||
fwd += 1
|
||||
steps += 1
|
||||
jabatan = " ".join(" ".join(jabatan_parts).split()) if jabatan_parts else None
|
||||
|
||||
rows.append(
|
||||
PersonnelEntry(
|
||||
no=None,
|
||||
pangkat=pangkat,
|
||||
nrp=None,
|
||||
nama=nama,
|
||||
jabatan_dinas=jabatan,
|
||||
jabatan_sprint=None,
|
||||
keterangan=None,
|
||||
)
|
||||
)
|
||||
i += rank_span
|
||||
return rows
|
||||
|
||||
|
||||
# ── Column-aware Pass 3 (uses OCR bounding boxes) ───────────────────────
|
||||
|
||||
|
||||
def _box_x_left(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return min(p[0] for p in box)
|
||||
|
||||
|
||||
def _box_x_right(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return max(p[0] for p in box)
|
||||
|
||||
|
||||
def _box_x_center(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return (_box_x_left(box) + _box_x_right(box)) / 2
|
||||
|
||||
|
||||
def _box_y_top(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return min(p[1] for p in box)
|
||||
|
||||
|
||||
def _box_y_bottom(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return max(p[1] for p in box)
|
||||
|
||||
|
||||
def _box_y_center(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return (_box_y_top(box) + _box_y_bottom(box)) / 2
|
||||
|
||||
|
||||
def _box_height(box: tuple[tuple[float, float], ...]) -> float:
|
||||
return _box_y_bottom(box) - _box_y_top(box)
|
||||
|
||||
|
||||
def extract_personnel_from_ocr_lines(ocr_lines: list) -> list[PersonnelEntry]:
    """Column-aware Pass 3 for sprint formats without an NRP column.

    Each ``ocr_line`` must expose ``text`` (str) and ``box`` (a tuple of
    4 ``(x, y)`` corner points). We use the geometry to:

    1. Detect rank lines (single-line or vertically-stacked two-line).
    2. Estimate the PANGKAT column X-center from those rank lines.
    3. For each rank, gather **only** lines in the NAMA column (X left
       of PANGKAT) within the row's Y span as the name fragments, and
       **only** lines in the JABATAN column (X right of PANGKAT) for
       jabatan. This prevents column-bleed that flat-text Pass 3
       suffers from on dense tables.

    Lines whose text is empty after stripping are ignored throughout: they
    can neither become (or join) a rank anchor nor contribute to a field.

    Returns ``[]`` if no rank lines are detected (caller can fall back
    to the text-only Pass 3). Every returned row has ``nrp=None``.

    NOTE(review): the first row's Y span is open upwards and the last
    row's is open downwards, so text above/below the table that passes
    the name/blocklist filters can attach to those rows — confirm this is
    acceptable for the templates in use.
    """
    if not ocr_lines:
        return []

    def _join(pieces: list[tuple[float, str]]) -> str | None:
        # Collapse (y, text) fragments into one whitespace-normalised
        # string; ``None`` when nothing is left.
        text = " ".join(t for _, t in pieces if t).strip()
        text = " ".join(text.split())
        return text or None

    # Sort by (y_top, x_left) for vertical-stacking rank detection.
    indexed = sorted(
        range(len(ocr_lines)),
        key=lambda i: (_box_y_top(ocr_lines[i].box), _box_x_left(ocr_lines[i].box)),
    )

    # Step 1: find rank anchors.
    # An anchor is one or two stacked OCR lines whose combined text matches
    # _RE_RANK_ONLY and normalises to a known pangkat. Two-line stacks must
    # X-overlap so we don't accidentally merge cells from different columns.
    used: set[int] = set()
    anchors: list[dict] = []
    for pos, idx in enumerate(indexed):
        if idx in used:
            continue
        ln = ocr_lines[idx]
        text = ln.text.strip()
        if not text:
            # An empty line must not anchor a row: combined with the line
            # below it would otherwise "match" that line's rank and take
            # its geometry from the empty box.
            continue

        rank_text: str | None = None
        member_idxs: list[int] = [idx]

        # Try two-line stack first (so PENATA TK I beats PENATA).
        for j_pos in range(pos + 1, min(pos + 5, len(indexed))):
            j_idx = indexed[j_pos]
            if j_idx in used:
                continue
            other = ocr_lines[j_idx]
            x_overlap = (
                min(_box_x_right(ln.box), _box_x_right(other.box))
                - max(_box_x_left(ln.box), _box_x_left(other.box))
            )
            if x_overlap <= 0:
                continue
            y_gap = _box_y_top(other.box) - _box_y_bottom(ln.box)
            if y_gap > _box_height(ln.box) * 1.5:
                # Too far below to be the second half of the same cell.
                break
            other_text = other.text.strip()
            if not other_text:
                # Empty neighbour adds nothing; don't fold it into the anchor.
                continue
            combined = (text + " " + other_text).strip()
            m2 = _RE_RANK_ONLY.match(combined)
            if m2:
                rank_text = m2.group("rank")
                member_idxs.append(j_idx)
                break

        if rank_text is None:
            m1 = _RE_RANK_ONLY.match(text)
            if m1:
                rank_text = m1.group("rank")

        if rank_text is None:
            continue
        pangkat = normalize_pangkat(rank_text)
        if not pangkat or not is_valid_pangkat(pangkat):
            continue

        anchors.append(
            {
                "member_idxs": member_idxs,
                "pangkat": pangkat,
                "x_center": _box_x_center(ln.box),
                "y_top": min(_box_y_top(ocr_lines[m].box) for m in member_idxs),
                "y_bottom": max(_box_y_bottom(ocr_lines[m].box) for m in member_idxs),
            }
        )
        used.update(member_idxs)

    if not anchors:
        return []

    # Sort anchors by Y so we can compute row spans.
    anchors.sort(key=lambda a: a["y_top"])

    # Estimate PANGKAT column X-center as the (upper) median of the rank
    # anchors' X-centers.
    xs_sorted = sorted(a["x_center"] for a in anchors)
    pangkat_x = xs_sorted[len(xs_sorted) // 2]

    # X tolerance: half the median rank-line width. Lines with x_center
    # within ±tolerance of pangkat_x are *in* the PANGKAT column and
    # excluded from both NAMA and JABATAN buckets.
    rank_widths = sorted(
        _box_x_right(ocr_lines[a["member_idxs"][0]].box)
        - _box_x_left(ocr_lines[a["member_idxs"][0]].box)
        for a in anchors
    )
    median_rank_width = rank_widths[len(rank_widths) // 2]
    column_margin = max(median_rank_width * 0.5, 5.0)

    # Geometry of every non-empty line, computed once and reused for the
    # column split and for each row: (line index, text, x_center, y_center).
    cells: list[tuple[int, str, float, float]] = []
    for j, ln in enumerate(ocr_lines):
        stripped = ln.text.strip()
        if stripped:
            cells.append((j, stripped, _box_x_center(ln.box), _box_y_center(ln.box)))

    # Try to split the JABATAN side into STRUKTURAL (jabatan_dinas) and
    # DALAM SPRIN (jabatan_sprint): collect the X-centers of all lines to
    # the right of PANGKAT, find the largest X-gap among them, and use the
    # middle of that gap as the column boundary. KET is typically the
    # right-most narrow column; we let it bleed into jabatan_sprint since
    # it's commonly empty.
    jabatan_xs = sorted(x for _, _, x, _ in cells if x > pangkat_x + column_margin)
    jabatan_split_x: float | None = None
    if len(jabatan_xs) >= 4:
        max_gap = 0.0
        max_gap_x: float | None = None
        for k in range(1, len(jabatan_xs)):
            gap = jabatan_xs[k] - jabatan_xs[k - 1]
            if gap > max_gap:
                max_gap = gap
                max_gap_x = (jabatan_xs[k] + jabatan_xs[k - 1]) / 2
        # Only use the split if the gap is meaningfully larger than a
        # within-column gap (heuristic: > 1.5× median rank width).
        if max_gap_x is not None and max_gap > median_rank_width * 1.5:
            jabatan_split_x = max_gap_x

    # Pre-compute each anchor's y_center for midpoint row dividers.
    anchor_y_centers = [(a["y_top"] + a["y_bottom"]) / 2 for a in anchors]

    rows: list[PersonnelEntry] = []
    for i, anchor in enumerate(anchors):
        # Row Y span: midpoint between this anchor and its neighbours.
        # Using the midpoint (rather than the previous anchor's
        # y_bottom) prevents row N's tail content (e.g. last name
        # fragment "M.H.") from leaking into row N+1's nama bucket
        # when rank lines don't extend to the full visual row height.
        y_lo = (
            (anchor_y_centers[i - 1] + anchor_y_centers[i]) / 2
            if i > 0
            else float("-inf")
        )
        y_hi = (
            (anchor_y_centers[i] + anchor_y_centers[i + 1]) / 2
            if i + 1 < len(anchors)
            else float("inf")
        )

        nama_pieces: list[tuple[float, str]] = []
        struktural_pieces: list[tuple[float, str]] = []
        sprint_pieces: list[tuple[float, str]] = []
        for j, text, x, y in cells:
            if j in anchor["member_idxs"]:
                continue
            if not (y_lo <= y <= y_hi):
                continue
            if x < pangkat_x - column_margin:
                # NAMA side
                if _RE_NAME_BLOCKLIST.match(text):
                    continue
                if _RE_BARE_ROW_NUMBER.match(text):
                    continue
                if not _is_plausible_name(text):
                    continue
                nama_pieces.append((y, text))
            elif x > pangkat_x + column_margin:
                # JABATAN side — split into STRUKTURAL vs DALAM SPRIN
                # using the geometric column boundary detected above.
                if _RE_NAME_BLOCKLIST.match(text):
                    continue
                if jabatan_split_x is not None and x > jabatan_split_x:
                    sprint_pieces.append((y, text))
                else:
                    struktural_pieces.append((y, text))
            # else: in PANGKAT column or column margin — skip

        # Fragments are joined top-to-bottom within each field.
        nama_pieces.sort(key=lambda p: p[0])
        struktural_pieces.sort(key=lambda p: p[0])
        sprint_pieces.sort(key=lambda p: p[0])

        # Strip leading row number from the first nama piece (e.g. "1 F. GUNTUR"
        # collapses to "F. GUNTUR" if the row marker happens to share a box).
        if nama_pieces:
            head = _RE_LEADING_ROW_NUMBER.sub("", nama_pieces[0][1]).strip()
            nama_pieces[0] = (nama_pieces[0][0], head)

        rows.append(
            PersonnelEntry(
                no=None,
                pangkat=anchor["pangkat"],
                nrp=None,
                nama=_join(nama_pieces),
                jabatan_dinas=_join(struktural_pieces),
                jabatan_sprint=_join(sprint_pieces),
                keterangan=None,
            )
        )
    return rows
|
||||
|
||||
|
||||
def is_low_quality(rows: list[PersonnelEntry]) -> bool:
|
||||
"""Heuristic: did PP-Structure produce useless rows?
|
||||
|
||||
|
||||
@@ -36,6 +36,73 @@ class OCRLine:
|
||||
box: tuple[tuple[float, float], ...] # 4 (x, y) corner points
|
||||
|
||||
|
||||
def _line_y_center(line: OCRLine) -> float:
|
||||
return sum(p[1] for p in line.box) / len(line.box)
|
||||
|
||||
|
||||
def _line_x_left(line: OCRLine) -> float:
|
||||
return min(p[0] for p in line.box)
|
||||
|
||||
|
||||
def _line_height(line: OCRLine) -> float:
|
||||
ys = [p[1] for p in line.box]
|
||||
return max(ys) - min(ys)
|
||||
|
||||
|
||||
def sort_lines_by_layout(lines: list[OCRLine]) -> list[OCRLine]:
|
||||
"""Reorder lines into top-to-bottom, left-to-right reading order.
|
||||
|
||||
PaddleOCR's natural output order reflects detection order, not visual
|
||||
layout. On dense tables (e.g. Polda Kalbar Akpol-panitia sprint) this
|
||||
interleaves rows and columns — Paddle may emit a row's KET column
|
||||
before its NAMA column, breaking every downstream extractor that
|
||||
assumes top-to-bottom row order.
|
||||
|
||||
We rebuild reading order by:
|
||||
|
||||
1. Sorting by ``y_center``.
|
||||
2. Grouping consecutive lines into row-bands when their ``y_center``
|
||||
differs by less than half the median line height (so visually
|
||||
same-row cells stay together even when their boxes don't perfectly
|
||||
align).
|
||||
3. Sorting each band left-to-right by ``x_left``.
|
||||
"""
|
||||
if not lines:
|
||||
return []
|
||||
|
||||
heights = [_line_height(ln) for ln in lines if _line_height(ln) > 0]
|
||||
if not heights:
|
||||
return list(lines)
|
||||
median_height = sorted(heights)[len(heights) // 2]
|
||||
band_threshold = max(1.0, median_height * 0.5)
|
||||
|
||||
by_y = sorted(lines, key=_line_y_center)
|
||||
bands: list[list[OCRLine]] = []
|
||||
current_band: list[OCRLine] = []
|
||||
current_y: float | None = None
|
||||
for ln in by_y:
|
||||
y = _line_y_center(ln)
|
||||
if current_y is None or abs(y - current_y) <= band_threshold:
|
||||
current_band.append(ln)
|
||||
# Track the band's running y-center as the mean of its
|
||||
# members so a slowly-drifting set of cells doesn't split
|
||||
# mid-row.
|
||||
current_y = (
|
||||
sum(_line_y_center(b) for b in current_band) / len(current_band)
|
||||
)
|
||||
else:
|
||||
bands.append(current_band)
|
||||
current_band = [ln]
|
||||
current_y = y
|
||||
if current_band:
|
||||
bands.append(current_band)
|
||||
|
||||
ordered: list[OCRLine] = []
|
||||
for band in bands:
|
||||
ordered.extend(sorted(band, key=_line_x_left))
|
||||
return ordered
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class OCRPage:
|
||||
"""OCR output for a single page."""
|
||||
@@ -44,8 +111,8 @@ class OCRPage:
|
||||
|
||||
@property
|
||||
def text(self) -> str:
|
||||
"""Reconstruct page text by concatenating lines (order = paddle's output order)."""
|
||||
return "\n".join(line.text for line in self.lines)
|
||||
"""Reconstruct page text in visual reading order (top-to-bottom, left-to-right)."""
|
||||
return "\n".join(line.text for line in sort_lines_by_layout(self.lines))
|
||||
|
||||
@property
|
||||
def mean_confidence(self) -> float:
|
||||
|
||||
@@ -20,6 +20,7 @@ from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||
from ocr_sprint.pipeline.document_detect import DocumentDetectConfig, detect_and_correct
|
||||
from ocr_sprint.pipeline.extract.personnel import extract_personnel
|
||||
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||
extract_personnel_from_ocr_lines,
|
||||
extract_personnel_from_text,
|
||||
is_low_quality,
|
||||
)
|
||||
@@ -144,12 +145,37 @@ def run_pipeline(content: bytes) -> PipelineOutput:
|
||||
# through the preferred path.
|
||||
if is_low_quality(personel):
|
||||
fallback_rows = extract_personnel_from_text(full_text)
|
||||
# If text-based fallback produced rows but they all lack NRP
|
||||
# (Pass 3 territory), retry with the column-aware extractor that
|
||||
# uses OCR bounding boxes. On dense tables (e.g. Polda Kalbar
|
||||
# Akpol-panitia), text-only Pass 3 bleeds adjacent columns into
|
||||
# nama/jabatan because lines are interleaved within each Y-band;
|
||||
# the columnar variant restricts each field to its visual column.
|
||||
text_only_no_nrp = bool(fallback_rows) and all(
|
||||
r.nrp is None for r in fallback_rows
|
||||
)
|
||||
if (not fallback_rows) or text_only_no_nrp:
|
||||
ocr_lines = [ln for page in ocr_pages for ln in page.lines]
|
||||
columnar_rows = extract_personnel_from_ocr_lines(ocr_lines)
|
||||
if columnar_rows and (
|
||||
not fallback_rows or len(columnar_rows) >= len(fallback_rows)
|
||||
):
|
||||
fallback_rows = columnar_rows
|
||||
if fallback_rows:
|
||||
personel = fallback_rows
|
||||
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
|
||||
# Pass 3 / columnar emit rows with nrp=None for sprint
|
||||
# templates without an NRP column. Surface that with a
|
||||
# distinct flag so operators know to expect missing NRPs by
|
||||
# design rather than by OCR failure.
|
||||
no_nrp = all(r.nrp is None for r in fallback_rows)
|
||||
if no_nrp:
|
||||
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK_NO_NRP)
|
||||
else:
|
||||
table_flags.append(ReviewFlag.PERSONNEL_TEXT_FALLBACK)
|
||||
_logger.info(
|
||||
"pipeline.personnel_text_fallback",
|
||||
fallback_rows=len(fallback_rows),
|
||||
no_nrp=no_nrp,
|
||||
)
|
||||
|
||||
untuk_items = find_untuk_list(full_text)
|
||||
|
||||
@@ -71,11 +71,16 @@ def _build_pp_structure() -> PPStructure:
|
||||
from paddleocr import PPStructure
|
||||
|
||||
s = get_settings()
|
||||
_logger.info("pp_structure.init", lang=s.ocr_lang, use_gpu=s.ocr_use_gpu)
|
||||
# PPStructure layout models only support 'en' and 'ch', not 'latin'.
|
||||
# Use 'en' for layout/table detection — it's language-agnostic (detects
|
||||
# table structure, not text language). OCR within cells still works for
|
||||
# Indonesian text because the recognition model handles Latin scripts.
|
||||
pp_lang = "en" if s.ocr_lang not in ("en", "ch") else s.ocr_lang
|
||||
_logger.info("pp_structure.init", lang=pp_lang, use_gpu=s.ocr_use_gpu)
|
||||
# layout=True so that PP-Structure also returns figure/text regions; we
|
||||
# filter to tables only afterwards. show_log=False to keep stdout clean.
|
||||
return PPStructure(
|
||||
lang=s.ocr_lang,
|
||||
lang=pp_lang,
|
||||
use_gpu=s.ocr_use_gpu,
|
||||
layout=True,
|
||||
show_log=False,
|
||||
|
||||
@@ -10,6 +10,7 @@ from uuid import UUID, uuid4
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
from ocr_sprint.schemas.extraction import ExtractionResult
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
class SourceKind(str, Enum):
|
||||
@@ -52,7 +53,7 @@ class DocumentResponse(BaseModel):
|
||||
job_id: UUID
|
||||
status: DocumentStatus
|
||||
confidence: float | None = None
|
||||
data: ExtractionResult | None = None
|
||||
data: list[PersonnelEntry] | None = None
|
||||
review_flags: list[str] = Field(default_factory=list)
|
||||
error: str | None = None
|
||||
# Phase 6 — HITL review state.
|
||||
|
||||
@@ -22,6 +22,7 @@ class ReviewFlag(str, Enum):
|
||||
LLM_FALLBACK = "llm_fallback"
|
||||
LLM_UNAVAILABLE = "llm_unavailable"
|
||||
PERSONNEL_TEXT_FALLBACK = "personnel_text_fallback"
|
||||
PERSONNEL_TEXT_FALLBACK_NO_NRP = "personnel_text_fallback_no_nrp"
|
||||
INCOMPLETE_PERSONNEL_ROW = "incomplete_personnel_row"
|
||||
|
||||
|
||||
|
||||
75
tests/unit/test_ocr_layout.py
Normal file
75
tests/unit/test_ocr_layout.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""Tests for OCR layout reordering.
|
||||
|
||||
PaddleOCR emits text boxes in detection order, not visual reading order.
|
||||
On dense table layouts (Polda Kalbar Akpol-panitia regression) this
|
||||
interleaves columns within a row and breaks every downstream extractor
|
||||
that assumes top-to-bottom row order. ``sort_lines_by_layout`` rebuilds
|
||||
reading order from the bounding-box geometry.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.ocr import OCRLine, OCRPage, sort_lines_by_layout
|
||||
|
||||
|
||||
def _box(x: float, y: float, w: float = 30, h: float = 15):
|
||||
return ((x, y), (x + w, y), (x + w, y + h), (x, y + h))
|
||||
|
||||
|
||||
def _make(text: str, x: float, y: float) -> OCRLine:
    """Build an ``OCRLine`` with confidence 1.0 whose default-sized box starts at ``(x, y)``."""
    geometry = _box(x, y)
    return OCRLine(text=text, confidence=1.0, box=geometry)
|
||||
|
||||
|
||||
class TestSortLinesByLayout:
    """``sort_lines_by_layout`` must rebuild row-first reading order from box geometry."""

    @staticmethod
    def _reading_order(cells):
        """Return the texts of ``cells`` in the order produced by the layout sort."""
        return [cell.text for cell in sort_lines_by_layout(cells)]

    def test_empty_returns_empty(self) -> None:
        assert sort_lines_by_layout([]) == []

    def test_already_sorted_is_stable(self) -> None:
        cells = [_make("A", 10, 10), _make("B", 50, 10), _make("C", 10, 30)]
        assert self._reading_order(cells) == ["A", "B", "C"]

    def test_reorders_column_first_detection_to_row_first(self) -> None:
        # A 2-row, 3-col table whose cells were emitted column-first
        # instead of row-first.
        cells = [
            _make("B1", 50, 10),
            _make("B2", 50, 30),
            _make("A1", 10, 10),
            _make("A2", 10, 30),
            _make("C1", 90, 10),
            _make("C2", 90, 30),
        ]
        expected = ["A1", "B1", "C1", "A2", "B2", "C2"]
        assert self._reading_order(cells) == expected

    def test_groups_slightly_misaligned_cells_into_one_band(self) -> None:
        # Boxes belonging to one visual row are rarely perfectly
        # y-aligned in real OCR output; they must still share a band.
        cells = [
            _make("LEFT", 10, 10),
            _make("MID", 50, 12),  # 2px below LEFT — same row visually
            _make("RIGHT", 90, 11),
        ]
        assert self._reading_order(cells) == ["LEFT", "MID", "RIGHT"]

    def test_separates_rows_when_y_gap_exceeds_threshold(self) -> None:
        # A y gap larger than ~½ line-height must NOT collapse two rows
        # into the same band.
        cells = [
            _make("ROW1A", 10, 10),
            _make("ROW1B", 50, 10),
            _make("ROW2A", 10, 30),  # gap of 20 vs height 15 → new band
            _make("ROW2B", 50, 30),
        ]
        expected = ["ROW1A", "ROW1B", "ROW2A", "ROW2B"]
        assert self._reading_order(cells) == expected

    def test_ocrpage_text_uses_sorted_order(self) -> None:
        shuffled = [
            _make("RIGHT", 90, 10),
            _make("LEFT", 10, 10),
            _make("BOTTOM", 10, 30),
        ]
        assert OCRPage(lines=shuffled).text == "LEFT\nRIGHT\nBOTTOM"
|
||||
@@ -8,11 +8,18 @@ recover at least the rank + NRP for every row.
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.extract.personnel_text import (
|
||||
extract_personnel_from_ocr_lines,
|
||||
extract_personnel_from_text,
|
||||
is_low_quality,
|
||||
)
|
||||
from ocr_sprint.pipeline.ocr import OCRLine
|
||||
from ocr_sprint.schemas.personnel import PersonnelEntry
|
||||
|
||||
|
||||
def _ocr_line(text: str, x: float, y: float, w: float = 80, h: float = 15) -> OCRLine:
    """Build an ``OCRLine`` with confidence 1.0 and a ``w`` x ``h`` box anchored at ``(x, y)``."""
    right, bottom = x + w, y + h
    corners = ((x, y), (right, y), (right, bottom), (x, bottom))
    return OCRLine(text=text, confidence=1.0, box=corners)
|
||||
|
||||
_CIMAHI_FIXTURE = """\
|
||||
DAFTAR PERSONIL SKCK POLRES DAN POLSEK JAJARAN POLRES CIMAHI TA 2024
|
||||
NO
|
||||
@@ -115,6 +122,86 @@ class TestExtractPersonnelFromText:
|
||||
names = [r.nama for r in rows]
|
||||
assert names == ["KETUT WARDANA", "NOVA SARI", "NOOR HIDAYAT"]
|
||||
|
||||
def test_extracts_multiple_rows_when_collapsed_to_one_line(self) -> None:
    """Polres Banjar regression: several table rows merged onto one OCR line.

    Every rank+NRP pair on the merged line must still yield its own row;
    the earlier per-line ``re.search`` returned only the first match.
    """
    merged = (
        "DAFTAR NAMA INSTRUKTUR\n"
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS "
        "INSTRUKTUR LAT PRA OPS "
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 "
        "KASAT RESKRIM SDA "
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 "
        "KASATINTELKAM POLRES BANJAR SDA\n"
    )
    entries = extract_personnel_from_text(merged)

    assert len(entries) == 3
    ranks = [entry.pangkat for entry in entries]
    numbers = [entry.nrp for entry in entries]
    assert ranks == ["KOMPOL", "AKP", "AKP"]
    assert numbers == ["70100418", "77020049", "84011113"]

    first, second, third = entries[0], entries[1], entries[2]
    assert first.nama == "CUCU JUHANA, A.K.S."
    assert second.nama is not None and "HERU SAMSUL BAHRI" in second.nama
    assert third.nama is not None and "YAYAN SOPIANA" in third.nama
|
||||
|
||||
def test_extracts_multiple_rows_when_split_across_lines(self) -> None:
    """Variant of the squished case where OCR emits one line per table row.

    No line carries more than one rank+NRP pair here; the test guards
    against the finditer-based path regressing this ordinary layout.
    """
    per_row = (
        "1 CUCU JUHANA, A.K.S. KOMPOL 70100418 KABAGOPS\n"
        "INSTRUKTUR LAT PRA OPS\n"
        "HERU SAMSUL BAHRI, S.E., M.M., CPHR., CBA.AKP 77020049 KASAT RESKRIM\n"
        "SDA\n"
        "YAYAN SOPIANA, S.A.P., M.A.P., CPHR., CBA, CIAKP 84011113 KASATINTELKAM\n"
        "POLRES BANJAR SDA\n"
    )
    entries = extract_personnel_from_text(per_row)

    ranks = [entry.pangkat for entry in entries]
    numbers = [entry.nrp for entry in entries]
    assert ranks == ["KOMPOL", "AKP", "AKP"]
    assert numbers == ["70100418", "77020049", "84011113"]
    assert entries[0].nama == "CUCU JUHANA, A.K.S."
|
||||
|
||||
def test_extracts_rows_when_sprint_has_no_nrp_column(self) -> None:
    """Polda Kalbar Akpol-panitia regression: sprint template without an NRP column.

    Such formats (panitia, undangan templates) must still produce rows via
    the rank-only Pass 3 path. Names span several OCR lines (narrow
    column) and the multi-token rank "KOMBES POL" is split over two lines.
    """
    panitia = (
        "DAFTAR NAMA PANITIA\n"
        "NO\nNAMA\nPANGKAT\nJABATAN\nSTRUKTURAL\nDALAM SPRIN\nKET\n"
        "1\nF. GUNTUR\nSUNOTO, S.I.K.,\nM.H.\n"
        "KOMBES\nPOL\n"
        "KARO SDM\nPOLDA KALBAR\nKETUA\nPELAKSANA\n"
        "2\nJUDA TRISNO\nTAMPUBOLON,\nS.H., S.I.K., M.H.\n"
        "AKBP\n"
        "KABAGDALPERS\nRO SDM\nPOLDA KALBAR\nSEKRETARIS\n"
        "3\nPRAYITNO, S.H.,\nM.H.\n"
        "KOMPOL\n"
        "KASUBBAG DIAPERS\nANGGOTA\n"
    )
    entries = extract_personnel_from_text(panitia)

    assert len(entries) == 3
    assert [entry.pangkat for entry in entries] == ["KOMBES POL", "AKBP", "KOMPOL"]
    # Pass 3 rows carry nrp=None by design.
    assert all(entry.nrp is None for entry in entries)

    first, second, third = entries[0], entries[1], entries[2]
    assert first.nama == "F. GUNTUR SUNOTO, S.I.K., M.H."
    assert second.nama == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
    assert third.nama == "PRAYITNO, S.H., M.H."
    assert first.jabatan_dinas is not None and "KARO SDM" in first.jabatan_dinas
|
||||
|
||||
def test_pass3_does_not_run_when_pass1_succeeds(self) -> None:
    """When the sprint has NRPs (Pass 1 succeeds), Pass 3 must stay silent.

    Otherwise it would add duplicate or contaminating rows.
    """
    with_nrp = (
        "1\nSRI WAHYUNI\nAIPTU / 75070328\nBAUR SKCK\n"
        "2\nCITRA DWI PUTRI\nBRIPTU / 95070659\nBA PELAKSANA\n"
    )
    entries = extract_personnel_from_text(with_nrp)

    assert len(entries) == 2
    assert all(entry.nrp is not None for entry in entries)
|
||||
|
||||
def test_still_blocks_bare_column_header_tokens(self) -> None:
|
||||
# Word-boundary fix must still reject the actual column-header
|
||||
# rows that motivated the blocklist in the first place.
|
||||
@@ -124,6 +211,94 @@ class TestExtractPersonnelFromText:
|
||||
assert rows[0].nama == "REAL NAME"
|
||||
|
||||
|
||||
class TestExtractPersonnelFromOcrLines:
    """Column-aware Pass 3 — Polda Kalbar Akpol-panitia regression.

    Checks that bounding-box geometry keeps column boundaries intact on
    dense tables where text-only Pass 3 bleeds adjacent columns into
    nama/jabatan.
    """

    def _kalbar_lines(self) -> list[OCRLine]:
        # Stylised Polda Kalbar layout: NO | NAMA | PANGKAT | STRUKTURAL | SPRIN
        # at x = 10, 100, 250, 380, 520. Cells may span several stacked boxes.
        no, nama, pangkat, struktural, sprin = 10, 100, 250, 380, 520
        cells = [
            # Row 1 — KOMBES POL spans two stacked OCR boxes
            ("1", no, 100),
            ("F. GUNTUR", nama, 100),
            ("SUNOTO, S.I.K.,", nama, 120),
            ("M.H.", nama, 140),
            ("KOMBES", pangkat, 100),
            ("POL", pangkat, 120),
            ("KARO SDM", struktural, 100),
            ("POLDA KALBAR", struktural, 120),
            ("KETUA", sprin, 100),
            ("PELAKSANA", sprin, 120),
            # Row 2
            ("2", no, 200),
            ("JUDA TRISNO", nama, 200),
            ("TAMPUBOLON,", nama, 220),
            ("S.H., S.I.K., M.H.", nama, 240),
            ("AKBP", pangkat, 200),
            ("KABAGDALPERS", struktural, 200),
            ("RO SDM", struktural, 220),
            ("POLDA KALBAR", struktural, 240),
            ("SEKRETARIS", sprin, 200),
            # Row 9 — PNS PENATA TK I (multi-token rank stacked)
            ("9", no, 500),
            ("FITRIANSYAH,", nama, 500),
            ("S.E.", nama, 520),
            ("PENATA", pangkat, 500),
            ("TK I", pangkat, 520),
            ("KAURKEU", struktural, 500),
            ("RO SDM", struktural, 520),
            ("POLDA KALBAR", struktural, 540),
            ("BENDAHARA", sprin, 500),
        ]
        return [_ocr_line(text, x, y) for text, x, y in cells]

    def test_extracts_three_rows(self) -> None:
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        assert len(entries) == 3
        ranks = [entry.pangkat for entry in entries]
        assert ranks == ["KOMBES POL", "AKBP", "PENATA TK I"]

    def test_nama_is_assembled_only_from_nama_column(self) -> None:
        # A row's nama must hold *all* of its stacked fragments and
        # *nothing else* — no bleed from the struktural column.
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        names = [entries[0].nama, entries[1].nama, entries[2].nama]
        assert names[0] == "F. GUNTUR SUNOTO, S.I.K., M.H."
        assert names[1] == "JUDA TRISNO TAMPUBOLON, S.H., S.I.K., M.H."
        assert names[2] == "FITRIANSYAH, S.E."

    def test_jabatan_split_into_struktural_and_sprint(self) -> None:
        # The geometric column boundary must separate STRUKTURAL
        # (jabatan_dinas) from DALAM SPRIN (jabatan_sprint).
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        first, second = entries[0], entries[1]
        assert first.jabatan_dinas == "KARO SDM POLDA KALBAR"
        assert first.jabatan_sprint == "KETUA PELAKSANA"
        assert second.jabatan_dinas == "KABAGDALPERS RO SDM POLDA KALBAR"
        assert second.jabatan_sprint == "SEKRETARIS"

    def test_returns_empty_when_no_rank_anchors(self) -> None:
        headers_only = [
            _ocr_line("DAFTAR NAMA", 100, 50),
            _ocr_line("HEADER", 100, 100),
        ]
        assert extract_personnel_from_ocr_lines(headers_only) == []

    def test_returns_empty_for_empty_input(self) -> None:
        assert extract_personnel_from_ocr_lines([]) == []

    def test_no_row_bleed_between_consecutive_rows(self) -> None:
        # Row 1's trailing name fragments ("SUNOTO, S.I.K.," and "M.H.")
        # sit BELOW the row's first rank box but still inside row 1's
        # visual span. Nothing from row 1's name may leak into row 2's
        # nama, which should start with "JUDA TRISNO".
        entries = extract_personnel_from_ocr_lines(self._kalbar_lines())
        second_name = entries[1].nama
        assert second_name is not None
        assert second_name.startswith("JUDA TRISNO")
        assert "GUNTUR" not in second_name
        assert "SUNOTO" not in second_name
|
||||
|
||||
|
||||
class TestIsLowQuality:
|
||||
def test_empty_list_is_low_quality(self) -> None:
|
||||
assert is_low_quality([]) is True
|
||||
|
||||
60
update.ps1
Normal file
60
update.ps1
Normal file
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env pwsh
# update.ps1 - One-command update & restart for ocr-sprint-service (local dev)
#
# Runs five steps in order: git pull, pip install, alembic migrations, free
# the dev port, start uvicorn. A failing step aborts with exit code 1 so the
# server is never started on top of a half-applied update.

$Port = 8000

# ── [1/5] Git pull ──────────────────────────────────────────────────────────
Write-Host "`n[1/5] Pulling latest code..." -ForegroundColor Cyan
git pull
if ($LASTEXITCODE -ne 0) {
    # Continuing after a failed pull would migrate and serve stale or
    # conflicted code, so stop here.
    Write-Host " git pull failed. Resolve the problem and re-run." -ForegroundColor Red
    exit 1
}

# ── [2/5] Install/update dependencies ───────────────────────────────────────
Write-Host "`n[2/5] Installing/updating dependencies..." -ForegroundColor Cyan
pip install -e ".[dev]" -q
if ($LASTEXITCODE -ne 0) {
    Write-Host " Dependency installation failed. Fix the pip error and re-run." -ForegroundColor Red
    exit 1
}

# ── [3/5] Database migration ─────────────────────────────────────────────────
Write-Host "`n[3/5] Running database migrations..." -ForegroundColor Cyan
alembic upgrade head
if ($LASTEXITCODE -ne 0) {
    # NOTE(review): `alembic stamp head` records head as the current revision
    # WITHOUT running any migration, so the retry below cannot apply
    # migrations that were skipped. Confirm this is only ever used against
    # disposable local-dev databases.
    Write-Host " Migration conflict detected, stamping current state as head..." -ForegroundColor Yellow
    alembic stamp head
    Write-Host " Retrying upgrade for any remaining new migrations..." -ForegroundColor Yellow
    alembic upgrade head
    if ($LASTEXITCODE -ne 0) {
        Write-Host " Migration still failed. Please check alembic manually." -ForegroundColor Red
        exit 1
    }
}
Write-Host " Migrations OK." -ForegroundColor Green

# ── [4/5] Free up port ───────────────────────────────────────────────────────
Write-Host "`n[4/5] Checking port $Port..." -ForegroundColor Cyan

# Use Get-NetTCPConnection for reliable port detection on Windows
$connections = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
if ($connections) {
    # One process can own several listening entries (e.g. IPv4 and IPv6);
    # de-duplicate so each process is reported and stopped only once.
    $procIds = $connections | Select-Object -ExpandProperty OwningProcess -Unique
    foreach ($procId in $procIds) {
        $procName = (Get-Process -Id $procId -ErrorAction SilentlyContinue).Name
        Write-Host " Port $Port used by '$procName' (PID $procId), killing..." -ForegroundColor Yellow
        Stop-Process -Id $procId -Force -ErrorAction SilentlyContinue
    }

    # Wait until port is actually released (max 5 seconds)
    $waited = 0
    do {
        Start-Sleep -Milliseconds 500
        $waited += 500
        $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
    } while ($still -and $waited -lt 5000)

    if ($still) {
        Write-Host " Port $Port still in use after waiting. Try a different port or restart manually." -ForegroundColor Red
        exit 1
    }
    Write-Host " Port $Port freed." -ForegroundColor Green
} else {
    Write-Host " Port $Port is free." -ForegroundColor Green
}

# ── [5/5] Start dev server ───────────────────────────────────────────────────
Write-Host "`n[5/5] Starting dev server on port $Port (Ctrl+C to stop)..." -ForegroundColor Cyan
uvicorn ocr_sprint.main:app --reload --host 0.0.0.0 --port $Port
|
||||
Reference in New Issue
Block a user