Files
OCR-SPRIN-SERVICE/tests/unit/test_api.py
Devin AI ca0c0a0428 Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew,
  denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based
  header extraction (nomor sprint, tanggal, satuan, perihal, dasar),
  signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess,
  ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit,
  Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and
  6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
2026-04-25 14:58:50 +00:00

88 lines
2.6 KiB
Python

"""API tests with the OCR engine mocked.
These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
so we can exercise the FastAPI surface without the heavy ML init cost.
"""
from __future__ import annotations
from datetime import date
import pytest
from fastapi.testclient import TestClient
from ocr_sprint.main import create_app
from ocr_sprint.pipeline import orchestrator as orch_module
from ocr_sprint.pipeline.orchestrator import PipelineOutput
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
@pytest.fixture
def client() -> TestClient:
    """Provide a ``TestClient`` wrapping an app built by ``create_app``."""
    app = create_app()
    return TestClient(app)
def test_health_endpoint(client: TestClient) -> None:
    """GET /api/v1/health answers 200 with a JSON ``status`` of ``"ok"``."""
    resp = client.get("/api/v1/health")
    assert resp.status_code == 200
    # Only decode the body once the status code has been confirmed.
    health = resp.json()
    assert health["status"] == "ok"
def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """Uploading a zero-byte PDF to the documents route must return 400."""
    # (filename, content, content type) — the content is deliberately empty.
    empty_pdf = ("empty.pdf", b"", "application/pdf")
    resp = client.post("/api/v1/documents", files={"file": empty_pdf})
    assert resp.status_code == 400
def test_documents_rejects_unknown_format(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Uploading arbitrary bytes as ``x.bin`` must return 400."""
    # NOTE(review): ``monkeypatch`` is requested but never used in this test,
    # so nothing is stubbed before the request is sent.
    unknown_blob = (
        "x.bin",
        b"random garbage bytes here",
        "application/octet-stream",
    )
    resp = client.post("/api/v1/documents", files={"file": unknown_blob})
    assert resp.status_code == 400
def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A stubbed pipeline result is reflected in the upload response.

    ``run_pipeline`` is replaced by a stub that returns a fixed
    ``PipelineOutput``. The test then posts a small PDF-looking payload and
    checks the response's status code, ``status``, ``confidence`` and the
    header's ``nomor_sprint``.
    """
    sprint_number = "Sprin/1/I/2025"

    header = HeaderFields(
        nomor_sprint=sprint_number,
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    extraction = ExtractionResult(header=header, confidence=0.97)
    canned_output = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.97,
        result=extraction,
    )

    def _stub_pipeline(_content: bytes) -> PipelineOutput:
        # Ignore the uploaded bytes and always hand back the canned output.
        return canned_output

    # Replace the function on the orchestrator module itself ...
    monkeypatch.setattr(orch_module, "run_pipeline", _stub_pipeline)

    # ... and also the symbol *imported into* the routes module.
    from ocr_sprint.api.routes import documents as docs_module

    monkeypatch.setattr(docs_module, "run_pipeline", _stub_pipeline)

    pdf_upload = ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")
    resp = client.post("/api/v1/documents", files={"file": pdf_upload})

    assert resp.status_code == 200
    payload = resp.json()
    assert payload["status"] == "completed"
    assert payload["confidence"] == 0.97
    assert payload["data"]["header"]["nomor_sprint"] == sprint_number