update

2026-04-26 22:08:41 +08:00
parent 5d9d9f784a
commit 9d969e61fd
6 changed files with 149 additions and 7 deletions
--- a/src/ocr_sprint/api/routes/documents.py
+++ b/src/ocr_sprint/api/routes/documents.py
@@ -10,7 +10,10 @@ flow on top:
 * `POST /documents?sync=true` — runs the pipeline inline (the original
                                 Phase 1 behaviour). Useful for tests and
                                 small-volume single-tenant deploys without
-                                 a Celery worker.
+                                 a Celery worker. The heavy OCR work is
+                                 offloaded to a thread-pool executor so the
+                                 uvicorn event loop stays responsive during
+                                 processing (~30-120s on CPU).
 * `GET  /documents/{job_id}`  — returns the current job state. Async
                                 clients poll this until `status` is in a
                                 terminal state (completed / needs_review /
@@ -19,9 +22,19 @@ flow on top:

 from __future__ import annotations

+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Annotated
 from uuid import UUID, uuid4

+# Thread pool dedicated to blocking OCR work. Using a *separate* pool
+# (rather than the default loop executor) lets us cap the number of
+# concurrent heavy OCR jobs independently of other thread-pool users.
+# With 1 Celery worker + 1 sync slot we never exceed 2 parallel OCR
+# runs; keep the pool at 1 so RAM stays bounded on the 7.4 GB server.
+_OCR_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="ocr-inline")
+
 from fastapi import (
    APIRouter,
    Depends,
@@ -165,11 +178,13 @@ async def create_document(


 async def _run_inline(job_id: UUID, content: bytes) -> DocumentResponse:
-    """Synchronous pipeline execution.
+    """Run the OCR pipeline without blocking the uvicorn event loop.

-    Each state transition opens its own short session so the request-scoped
-    session's rollback-on-exception behaviour cannot wipe out the
-    ``mark_failed`` write or strand the blob on disk.
+    ``run_pipeline`` is CPU-bound and can take 30-120 s on a 2 vCPU server.
+    Awaiting it directly on the async handler would freeze the entire event
+    loop (and therefore block health-checks, metrics, and every other request)
+    for the full duration. We push the work onto a dedicated single-thread
+    executor so the loop stays free while the OCR runs in the background.
    """
    import time

@@ -177,8 +192,13 @@ async def _run_inline(job_id: UUID, content: bytes) -> DocumentResponse:
        JobRepository(s).mark_processing(job_id)

    started = time.perf_counter()
+    loop = asyncio.get_event_loop()
    try:
-        output = run_pipeline(content)
+        # run_pipeline is synchronous; wrap it so asyncio can await it.
+        output = await loop.run_in_executor(
+            _OCR_EXECUTOR,
+            partial(run_pipeline, content),
+        )
    except ValueError as exc:
        with session_scope() as s:
            JobRepository(s).mark_failed(job_id, error=str(exc))
--- a/src/ocr_sprint/api/routes/health.py
+++ b/src/ocr_sprint/api/routes/health.py
@@ -3,8 +3,11 @@
 from __future__ import annotations

 from fastapi import APIRouter
+from fastapi.responses import JSONResponse

 from ocr_sprint import __version__
+from ocr_sprint.pipeline import ocr as _ocr
+from ocr_sprint.pipeline import table as _table

 router = APIRouter(tags=["health"])

@@ -13,3 +16,20 @@ router = APIRouter(tags=["health"])
 async def health() -> dict[str, str]:
    """Lightweight liveness check — does NOT touch the OCR engine."""
    return {"status": "ok", "version": __version__}
+
+
+@router.get("/health/ready")
+async def readiness() -> JSONResponse:
+    """Readiness check — returns 200 when OCR models are loaded, 503 if still warming up."""
+    ocr_ready = _ocr._instance is not None
+    table_ready = _table._instance is not None
+    ready = ocr_ready and table_ready
+    payload = {
+        "status": "ready" if ready else "warming_up",
+        "version": __version__,
+        "models": {
+            "paddleocr": "ready" if ocr_ready else "loading",
+            "pp_structure": "ready" if table_ready else "loading",
+        },
+    }
+    return JSONResponse(content=payload, status_code=200 if ready else 503)
--- a/src/ocr_sprint/main.py
+++ b/src/ocr_sprint/main.py
@@ -2,6 +2,10 @@

 from __future__ import annotations

+import threading
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
 from fastapi import FastAPI

 from ocr_sprint import __version__
@@ -11,7 +15,10 @@ from ocr_sprint.api.routes import documents, ground_truth, health
 from ocr_sprint.config import get_settings
 from ocr_sprint.db import models as _models  # noqa: F401  (register ORM tables)
 from ocr_sprint.db.base import Base, get_engine
-from ocr_sprint.utils.logging import configure_logging
+from ocr_sprint.utils.logging import configure_logging, get_logger
+
+
+_startup_logger = get_logger(__name__)


 def _ensure_schema() -> None:
@@ -24,6 +31,42 @@ def _ensure_schema() -> None:
    Base.metadata.create_all(bind=get_engine())


+def _warmup_models_background() -> None:
+    """Load PaddleOCR and PP-Structure models in a background thread.
+
+    Running in a thread keeps the lifespan non-blocking so uvicorn can
+    start accepting health-check requests immediately while the heavy models
+    load (~5-15s on CPU). Requests that arrive before warmup completes will
+    wait on the existing _lock in each module rather than racing to load.
+    """
+    from ocr_sprint.config import get_settings as _gs
+    from ocr_sprint.pipeline import ocr as _ocr
+    from ocr_sprint.pipeline import table as _table
+
+    s = _gs()
+    try:
+        _ocr.warmup()
+    except Exception as exc:
+        _startup_logger.warning("paddleocr.warmup.failed", error=str(exc))
+
+    if s.tables_enabled:
+        try:
+            _table.warmup()
+        except Exception as exc:
+            _startup_logger.warning("pp_structure.warmup.failed", error=str(exc))
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
+    """FastAPI lifespan: warm OCR models on startup in a background thread."""
+    _startup_logger.info("startup.warmup.begin")
+    t = threading.Thread(target=_warmup_models_background, name="ocr-warmup", daemon=True)
+    t.start()
+    yield
+    # Shutdown: nothing to clean up (models are process-global singletons).
+    _startup_logger.info("shutdown.complete")
+
+
 def create_app() -> FastAPI:
    """Application factory — keeps top-level state easy to test."""
    settings = get_settings()
@@ -34,6 +77,7 @@ def create_app() -> FastAPI:
    root_path = getattr(settings, "root_path", "")

    app = FastAPI(
+        lifespan=lifespan,
        title="OCR Sprint Service",
        version=__version__,
        description="OCR + structured extraction for Indonesian police 'surat sprint' documents.",
--- a/src/ocr_sprint/pipeline/ocr.py
+++ b/src/ocr_sprint/pipeline/ocr.py
@@ -151,6 +151,19 @@ def get_ocr() -> PaddleOCR:
    return _instance


+def warmup() -> None:
+    """Eagerly initialize the PaddleOCR engine.
+
+    Call this during application startup so the first real request does not
+    pay the model-loading cost (~2-5s on CPU). Also prevents the process from
+    entering Disk-Sleep state (state D) mid-request when memory is tight,
+    because the OS has already paged in all model weights during startup.
+    """
+    _logger.info("paddleocr.warmup.start")
+    get_ocr()
+    _logger.info("paddleocr.warmup.done")
+
+
 def run_ocr(image: NDArrayU8) -> OCRPage:
    """Run OCR on a single BGR image and return a structured page result."""
    engine = get_ocr()
--- a/src/ocr_sprint/pipeline/table.py
+++ b/src/ocr_sprint/pipeline/table.py
@@ -97,6 +97,18 @@ def get_pp_structure() -> PPStructure:
    return _instance


+def warmup() -> None:
+    """Eagerly initialize the PP-Structure engine.
+
+    Call this during application startup so the first real request does not
+    pay the model-loading cost (~3-6s on CPU). Mirrors ocr.warmup() so the
+    lifespan handler can warm both engines in one place.
+    """
+    _logger.info("pp_structure.warmup.start")
+    get_pp_structure()
+    _logger.info("pp_structure.warmup.done")
+
+
 # ---------- table parsing ----------


--- a/src/ocr_sprint/worker/celery_app.py
+++ b/src/ocr_sprint/worker/celery_app.py
@@ -15,8 +15,12 @@ from __future__ import annotations
 import os

 from celery import Celery
+from celery.signals import worker_ready

 from ocr_sprint.config import get_settings
+from ocr_sprint.utils.logging import get_logger
+
+_logger = get_logger(__name__)


 def build_celery_app() -> Celery:
@@ -47,3 +51,32 @@ def build_celery_app() -> Celery:


 celery_app = build_celery_app()
+
+
+@worker_ready.connect
+def preload_ocr_models(sender: object, **kwargs: object) -> None:
+    """Warm up PaddleOCR and PP-Structure when the worker process is ready.
+
+    With ``--pool=solo`` the worker runs tasks in the *same* process that
+    receives this signal, so models loaded here are reused for every
+    subsequent task — no fork overhead, no duplicate model loading, and
+    RAM usage stays bounded (~1.5 GB instead of 1.5 GB × n_forks).
+    """
+    from ocr_sprint.config import get_settings as _gs
+    from ocr_sprint.pipeline import ocr as _ocr
+    from ocr_sprint.pipeline import table as _table
+
+    _logger.info("celery.worker.warmup.start")
+    s = _gs()
+    try:
+        _ocr.warmup()
+    except Exception as exc:
+        _logger.warning("celery.worker.paddleocr.warmup.failed", error=str(exc))
+
+    if s.tables_enabled:
+        try:
+            _table.warmup()
+        except Exception as exc:
+            _logger.warning("celery.worker.pp_structure.warmup.failed", error=str(exc))
+
+    _logger.info("celery.worker.warmup.done")