Implements the foundation of the OCR Sprint service: - FastAPI app with /api/v1/health and /api/v1/documents (sync upload) - Pydantic v2 schemas for documents, extraction result, personnel - Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing. - Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked). - Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile. - Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap). Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
88 lines
2.6 KiB
Python
88 lines
2.6 KiB
Python
"""API tests with the OCR engine mocked.
|
|
|
|
These tests do NOT load PaddleOCR — instead they monkeypatch the orchestrator
so we can exercise the FastAPI surface without the heavy ML init cost.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
from fastapi.testclient import TestClient
|
|
|
|
from ocr_sprint.main import create_app
|
|
from ocr_sprint.pipeline import orchestrator as orch_module
|
|
from ocr_sprint.pipeline.orchestrator import PipelineOutput
|
|
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
|
from ocr_sprint.schemas.extraction import ExtractionResult, HeaderFields
|
|
|
|
|
|
@pytest.fixture
def client() -> TestClient:
    """Provide a ``TestClient`` wrapping an application built by ``create_app``."""
    app = create_app()
    return TestClient(app)
|
|
|
|
|
|
def test_health_endpoint(client: TestClient) -> None:
    """GET on the health route answers 200 with a JSON ``status`` of ``"ok"``."""
    reply = client.get("/api/v1/health")
    # Check the status code first so a non-JSON error body fails on the code.
    assert reply.status_code == 200
    payload = reply.json()
    assert payload["status"] == "ok"
|
|
|
|
|
|
def test_documents_rejects_empty_upload(client: TestClient) -> None:
    """POSTing a zero-byte ``empty.pdf`` to the documents route yields HTTP 400."""
    empty_upload = {"file": ("empty.pdf", b"", "application/pdf")}
    reply = client.post("/api/v1/documents", files=empty_upload)
    assert reply.status_code == 400
|
|
|
|
|
|
def test_documents_rejects_unknown_format(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """POSTing arbitrary bytes as ``x.bin`` to the documents route yields HTTP 400.

    The ``monkeypatch`` fixture is requested but not used by this test.
    """
    junk_bytes = b"random garbage bytes here"
    upload = {"file": ("x.bin", junk_bytes, "application/octet-stream")}
    reply = client.post("/api/v1/documents", files=upload)
    assert reply.status_code == 400
|
|
|
|
|
|
def test_documents_returns_pipeline_output(
    client: TestClient,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A PDF upload answers 200 and echoes the stubbed pipeline output as JSON.

    ``run_pipeline`` is replaced with a stub on the orchestrator module and on
    the documents routes module, then the response body is checked against the
    canned values.
    """
    header = HeaderFields(
        nomor_sprint="Sprin/1/I/2025",
        tanggal=date(2025, 1, 1),
        satuan_penerbit="POLRES TEST",
    )
    extraction = ExtractionResult(header=header, confidence=0.97)
    canned_output = PipelineOutput(
        source_kind=SourceKind.PDF,
        status=DocumentStatus.COMPLETED,
        confidence=0.97,
        result=extraction,
    )

    def _stub_pipeline(_content: bytes) -> PipelineOutput:
        # Ignore the uploaded bytes and hand back the canned output.
        return canned_output

    # Replace the attribute on the orchestrator module itself ...
    monkeypatch.setattr(orch_module, "run_pipeline", _stub_pipeline)
    # ... and the symbol *imported into* the routes module.
    from ocr_sprint.api.routes import documents as routes_documents

    monkeypatch.setattr(routes_documents, "run_pipeline", _stub_pipeline)

    pdf_upload = {"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")}
    reply = client.post("/api/v1/documents", files=pdf_upload)

    assert reply.status_code == 200
    payload = reply.json()
    assert payload["status"] == "completed"
    assert payload["confidence"] == 0.97
    assert payload["data"]["header"]["nomor_sprint"] == "Sprin/1/I/2025"
|