- New job_corrections table (append-only audit log) + migration
- Add approved / reviewed_by / reviewed_at columns to jobs
- PATCH /documents/{id} apply field-level corrections
- GET /documents/{id}/history return chronological audit trail
- POST /documents/{id}/approve lock final version (idempotent)
- Dotted field-path applier with root allow-list + list-index support
- Auto-clear `missing_field` review flag when required header keys filled
- Atomic batch apply: malformed path in batch rolls back all changes
- 22 new tests (11 repository-level, 11 API-level); 184 total passing
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
249 lines
7.7 KiB
Python
249 lines
7.7 KiB
Python
"""End-to-end HTTP tests for the HITL endpoints.
|
|
|
|
We re-use the ``fake_pipeline`` style from ``test_api.py`` so we don't pay
|
|
the PaddleOCR init cost; the orchestrator is monkey-patched to return a
|
|
synthetic ``ExtractionResult``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
|
|
import pytest
|
|
from fastapi.testclient import TestClient
|
|
|
|
from ocr_sprint.main import create_app
|
|
from ocr_sprint.pipeline import orchestrator as orch_module
|
|
from ocr_sprint.pipeline.orchestrator import PipelineOutput
|
|
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
|
from ocr_sprint.schemas.extraction import (
|
|
ExtractionResult,
|
|
HeaderFields,
|
|
PersonnelEntry,
|
|
ReviewFlag,
|
|
)
|
|
|
|
|
|
@pytest.fixture
|
|
def client() -> TestClient:
|
|
return TestClient(create_app())
|
|
|
|
|
|
@pytest.fixture
|
|
def fake_pipeline(monkeypatch: pytest.MonkeyPatch) -> PipelineOutput:
|
|
result = ExtractionResult(
|
|
header=HeaderFields(
|
|
nomor_sprint="Sprin/1/I/2025",
|
|
tanggal=date(2025, 1, 1),
|
|
satuan_penerbit="POLRES TEST",
|
|
perihal=None, # intentional gap so a PATCH can fill it
|
|
),
|
|
personel=[
|
|
PersonnelEntry(pangkat="AIPDA", nrp="77060000", nama="BUDI", jabatan="ANGGOTA"),
|
|
],
|
|
review_flags=[ReviewFlag.MISSING_FIELD],
|
|
confidence=0.7,
|
|
)
|
|
output = PipelineOutput(
|
|
source_kind=SourceKind.PDF,
|
|
status=DocumentStatus.NEEDS_REVIEW,
|
|
confidence=0.7,
|
|
result=result,
|
|
)
|
|
|
|
def _fake_run(_content: bytes) -> PipelineOutput:
|
|
return output
|
|
|
|
monkeypatch.setattr(orch_module, "run_pipeline", _fake_run)
|
|
from ocr_sprint.api.routes import documents as docs_module
|
|
|
|
monkeypatch.setattr(docs_module, "run_pipeline", _fake_run)
|
|
from ocr_sprint.worker import tasks as tasks_module
|
|
|
|
monkeypatch.setattr(tasks_module, "run_pipeline", _fake_run)
|
|
return output
|
|
|
|
|
|
def _create_job(client: TestClient) -> str:
|
|
post = client.post(
|
|
"/api/v1/documents?sync=true",
|
|
files={"file": ("x.pdf", b"%PDF-1.4\n%fake", "application/pdf")},
|
|
)
|
|
assert post.status_code == 200, post.text
|
|
body = post.json()
|
|
assert body["status"] == "needs_review"
|
|
return str(body["job_id"])
|
|
|
|
|
|
def test_patch_applies_correction_and_clears_missing_field(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
patched = client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={
|
|
"corrections": [
|
|
{
|
|
"path": "header.perihal",
|
|
"value": "Penyelidikan kasus X",
|
|
"reason": "LLM missed it",
|
|
}
|
|
]
|
|
},
|
|
headers={"X-User-Id": "reviewer-a"},
|
|
)
|
|
assert patched.status_code == 200, patched.text
|
|
body = patched.json()
|
|
assert body["data"]["header"]["perihal"] == "Penyelidikan kasus X"
|
|
# The fake pipeline has both required header fields filled, so the
|
|
# ``missing_field`` flag is auto-cleared as soon as any correction
|
|
# lands (the policy re-evaluates required-field coverage on every
|
|
# edit).
|
|
assert "missing_field" not in body["review_flags"]
|
|
|
|
|
|
def test_patch_returns_400_for_unknown_path(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
resp = client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={"corrections": [{"path": "bogus.field", "value": "x"}]},
|
|
)
|
|
assert resp.status_code == 400
|
|
|
|
|
|
def test_patch_is_atomic_on_partial_failure(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
resp = client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={
|
|
"corrections": [
|
|
{"path": "header.perihal", "value": "OK"},
|
|
{"path": "bogus.root", "value": "X"},
|
|
]
|
|
},
|
|
)
|
|
assert resp.status_code == 400
|
|
|
|
# The first correction must not have persisted.
|
|
got = client.get(f"/api/v1/documents/{job_id}")
|
|
assert got.json()["data"]["header"]["perihal"] is None
|
|
|
|
|
|
def test_history_returns_corrections_in_order(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={"corrections": [{"path": "header.perihal", "value": "first"}]},
|
|
headers={"X-User-Id": "reviewer-a"},
|
|
)
|
|
client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={"corrections": [{"path": "header.perihal", "value": "second"}]},
|
|
headers={"X-User-Id": "reviewer-b"},
|
|
)
|
|
|
|
history = client.get(f"/api/v1/documents/{job_id}/history")
|
|
assert history.status_code == 200
|
|
events = history.json()
|
|
assert [e["new_value"] for e in events] == ["first", "second"]
|
|
assert [e["corrected_by"] for e in events] == ["reviewer-a", "reviewer-b"]
|
|
# old_value of the second event should reflect the first edit.
|
|
assert events[1]["old_value"] == "first"
|
|
|
|
|
|
def test_history_returns_empty_list_for_untouched_job(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
history = client.get(f"/api/v1/documents/{job_id}/history")
|
|
assert history.status_code == 200
|
|
assert history.json() == []
|
|
|
|
|
|
def test_history_returns_404_for_unknown_job(client: TestClient) -> None:
|
|
resp = client.get("/api/v1/documents/00000000-0000-0000-0000-000000000000/history")
|
|
assert resp.status_code == 404
|
|
|
|
|
|
def test_approve_locks_subsequent_patches(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
approved = client.post(
|
|
f"/api/v1/documents/{job_id}/approve",
|
|
headers={"X-User-Id": "reviewer-a"},
|
|
)
|
|
assert approved.status_code == 200, approved.text
|
|
body = approved.json()
|
|
assert body["approved"] is True
|
|
assert body["reviewed_by"] == "reviewer-a"
|
|
assert body["reviewed_at"] # non-empty timestamp
|
|
|
|
# GET reflects the approval state.
|
|
got = client.get(f"/api/v1/documents/{job_id}").json()
|
|
assert got["approved"] is True
|
|
|
|
# PATCH after approve must be rejected with 409.
|
|
patched = client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={"corrections": [{"path": "header.perihal", "value": "X"}]},
|
|
)
|
|
assert patched.status_code == 409
|
|
|
|
|
|
def test_approve_is_idempotent(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
first = client.post(
|
|
f"/api/v1/documents/{job_id}/approve",
|
|
headers={"X-User-Id": "reviewer-a"},
|
|
)
|
|
second = client.post(
|
|
f"/api/v1/documents/{job_id}/approve",
|
|
headers={"X-User-Id": "reviewer-b"},
|
|
)
|
|
assert first.status_code == 200
|
|
assert second.status_code == 200
|
|
# Second approve must NOT change the attribution. (SQLite drops tzinfo
|
|
# on roundtrip, which changes Pydantic's serialization between the two
|
|
# calls; compare the naive components.)
|
|
assert second.json()["reviewed_by"] == "reviewer-a"
|
|
assert (
|
|
second.json()["reviewed_at"].rstrip("Z").split("+")[0]
|
|
== (first.json()["reviewed_at"].rstrip("Z").split("+")[0])
|
|
)
|
|
|
|
|
|
def test_patch_requires_at_least_one_correction(
|
|
client: TestClient,
|
|
fake_pipeline: PipelineOutput,
|
|
) -> None:
|
|
job_id = _create_job(client)
|
|
resp = client.patch(
|
|
f"/api/v1/documents/{job_id}",
|
|
json={"corrections": []},
|
|
)
|
|
assert resp.status_code == 422 # Pydantic min_length=1 violation
|
|
|
|
|
|
def test_patch_missing_job_returns_404(client: TestClient) -> None:
|
|
resp = client.patch(
|
|
"/api/v1/documents/00000000-0000-0000-0000-000000000000",
|
|
json={"corrections": [{"path": "header.perihal", "value": "X"}]},
|
|
)
|
|
assert resp.status_code == 404
|