- New job_corrections table (append-only audit log) + migration
- Add approved / reviewed_by / reviewed_at columns to jobs
- PATCH /documents/{id} apply field-level corrections
- GET /documents/{id}/history return chronological audit trail
- POST /documents/{id}/approve lock final version (idempotent)
- Dotted field-path applier with root allow-list + list-index support
- Auto-clear `missing_field` review flag when required header keys filled
- Atomic batch apply: malformed path in batch rolls back all changes
- 22 new tests (11 repository-level, 11 API-level); 184 total passing
Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
239 lines
8.1 KiB
Python
239 lines
8.1 KiB
Python
"""Repository tests for Phase 6 HITL helpers."""
|
|
|
|
from __future__ import annotations

from uuid import UUID, uuid4

import pytest

from ocr_sprint.db.base import Base, get_engine, session_scope
from ocr_sprint.db.repositories import (
    InvalidFieldPathError,
    JobAlreadyApprovedError,
    JobNotCompletedError,
    JobNotFoundError,
    JobRepository,
)
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
|
|
|
|
|
@pytest.fixture
def db_ready() -> None:
    """Ensure the schema exists before a test touches the database.

    Calls ``Base.metadata.create_all`` against the engine returned by
    ``get_engine()``.
    """
    engine = get_engine()
    Base.metadata.create_all(bind=engine)
|
|
|
|
|
|
def _seed_completed_job(
    *,
    result: dict[str, object] | None = None,
    flags: list[str] | None = None,
) -> UUID:
    """Insert a job and mark it completed with status ``NEEDS_REVIEW``.

    The job is created in one ``session_scope`` and completed in a second
    one, so each step runs in its own scope.

    Args:
        result: Extraction payload to store on the job. When ``None``, a
            small default payload is used (a header whose ``perihal`` is
            ``None``, one ``personel`` entry and one ``untuk`` line). An
            explicitly passed empty dict is stored as-is rather than being
            replaced by the default.
        flags: Review flags to attach to the job; ``None`` means no flags.

    Returns:
        The id of the newly seeded job.
    """
    # Compare against None (not truthiness) so that a caller can seed a job
    # with an intentionally empty result.
    if result is None:
        result = {
            "header": {
                "nomor_sprint": "Sprin/1/I/2025",
                "satuan_penerbit": "POLRES X",
                "perihal": None,
            },
            "personel": [
                {"pangkat": "AIPDA", "nrp": "77060000", "nama": "BUDI"},
            ],
            "untuk": ["Melaksanakan tugas"],
        }
    review_flags = [] if flags is None else list(flags)

    jid = uuid4()
    with session_scope() as session:
        JobRepository(session).create(
            job_id=jid,
            filename="x.pdf",
            source_kind=SourceKind.PDF,
            blob_key="k",
        )
    with session_scope() as session:
        JobRepository(session).mark_completed(
            jid,
            status=DocumentStatus.NEEDS_REVIEW,
            confidence=0.7,
            result=result,
            review_flags=review_flags,
        )
    return jid
|
|
|
|
|
|
def test_apply_corrections_updates_nested_header_field(db_ready: None) -> None:
    """A correction addressed at ``header.perihal`` replaces the stored value."""
    jid = _seed_completed_job()
    edits = [("header.perihal", "Penyelidikan kasus X", "regex miss")]
    with session_scope() as session:
        jobs = JobRepository(session)
        jobs.apply_corrections(jid, corrections=edits, corrected_by="reviewer-a")
        # Read back through the same repository, inside the same scope.
        stored = jobs.get_or_raise(jid)
        assert stored.result is not None
        header = stored.result["header"]
        assert header["perihal"] == "Penyelidikan kasus X"
|
|
|
|
|
|
def test_apply_corrections_writes_audit_row(db_ready: None) -> None:
    """One applied correction yields exactly one audit event with its details."""
    jid = _seed_completed_job()
    with session_scope() as session:
        repo = JobRepository(session)
        repo.apply_corrections(
            jid,
            corrections=[("header.perihal", "Penyelidikan", None)],
            corrected_by="reviewer-a",
        )

    # Read the audit trail back in a fresh scope.
    with session_scope() as session:
        events = JobRepository(session).list_corrections(jid)
        assert len(events) == 1
        event = events[0]
        assert event.field_path == "header.perihal"
        # The seeded default payload has ``perihal`` set to None.
        assert event.old_value is None
        assert event.new_value == "Penyelidikan"
        assert event.corrected_by == "reviewer-a"
|
|
|
|
|
|
def test_apply_corrections_supports_list_index(db_ready: None) -> None:
    """A path with a list index (``personel[0].nrp``) updates that element."""
    jid = _seed_completed_job()
    indexed_edit = ("personel[0].nrp", "77060001", None)
    with session_scope() as session:
        JobRepository(session).apply_corrections(
            jid, corrections=[indexed_edit], corrected_by=None
        )
        stored = JobRepository(session).get_or_raise(jid)
        assert stored.result is not None
        first_person = stored.result["personel"][0]
        assert first_person["nrp"] == "77060001"
|
|
|
|
|
|
def test_apply_corrections_is_atomic_on_invalid_path(db_ready: None) -> None:
    """A failing second correction must leave the first one unapplied."""
    jid = _seed_completed_job()
    batch = [
        ("header.perihal", "OK", None),
        ("bogus.root", "X", None),
    ]
    # ``pytest.raises`` is the inner context manager, so the error is caught
    # before it can propagate to ``session_scope``; the scope exits without
    # seeing an exception.
    with session_scope() as session:
        with pytest.raises(InvalidFieldPathError):
            JobRepository(session).apply_corrections(
                jid, corrections=batch, corrected_by=None
            )

    # The first correction must not have persisted.
    with session_scope() as session:
        stored = JobRepository(session).get_or_raise(jid)
        assert stored.result is not None
        header = stored.result["header"]
        assert header.get("perihal") is None
|
|
|
|
|
|
def test_apply_corrections_rejects_out_of_range_index(db_ready: None) -> None:
    """A list index past the end of ``personel`` raises ``InvalidFieldPathError``."""
    jid = _seed_completed_job()
    # The default seeded payload has a single ``personel`` entry.
    out_of_range = [("personel[99].nrp", "77060001", None)]
    with session_scope() as session:
        with pytest.raises(InvalidFieldPathError):
            JobRepository(session).apply_corrections(
                jid, corrections=out_of_range, corrected_by=None
            )
|
|
|
|
|
|
def test_apply_corrections_rejects_after_approve(db_ready: None) -> None:
    """Correcting an already-approved job raises ``JobAlreadyApprovedError``."""
    jid = _seed_completed_job()

    # Approve first, in its own scope.
    with session_scope() as session:
        JobRepository(session).approve(jid, reviewed_by="reviewer-a")

    late_edit = [("header.perihal", "X", None)]
    with session_scope() as session:
        with pytest.raises(JobAlreadyApprovedError):
            JobRepository(session).apply_corrections(
                jid, corrections=late_edit, corrected_by="reviewer-a"
            )
|
|
|
|
|
|
def test_apply_corrections_rejects_missing_job(db_ready: None) -> None:
    """Correcting a job id that was never created raises ``JobNotFoundError``."""
    edit = [("header.perihal", "X", None)]
    with session_scope() as session:
        with pytest.raises(JobNotFoundError):
            # A fresh random id; no job is seeded in this test.
            JobRepository(session).apply_corrections(
                uuid4(), corrections=edit, corrected_by=None
            )
|
|
|
|
|
|
def test_apply_corrections_rejects_pending_job(db_ready: None) -> None:
    """Correcting a created-but-not-completed job raises ``JobNotCompletedError``."""
    jid = uuid4()

    # Create the job only; ``mark_completed`` is deliberately not called.
    with session_scope() as session:
        repo = JobRepository(session)
        repo.create(
            job_id=jid,
            filename="x",
            source_kind=SourceKind.PDF,
            blob_key="k",
        )

    edit = [("header.perihal", "X", None)]
    with session_scope() as session:
        with pytest.raises(JobNotCompletedError):
            JobRepository(session).apply_corrections(
                jid, corrections=edit, corrected_by=None
            )
|
|
|
|
|
|
def test_missing_field_flag_cleared_when_header_gap_filled(db_ready: None) -> None:
    """Filling the empty ``nomor_sprint`` drops ``missing_field`` from the flags."""
    incomplete_result = {
        "header": {
            "nomor_sprint": None,
            "satuan_penerbit": "POLRES X",
        }
    }
    jid = _seed_completed_job(
        result=incomplete_result,
        flags=["missing_field", "low_ocr_confidence"],
    )

    fill_gap = [("header.nomor_sprint", "Sprin/2/I/2025", None)]
    with session_scope() as session:
        JobRepository(session).apply_corrections(
            jid, corrections=fill_gap, corrected_by="reviewer-a"
        )
        stored = JobRepository(session).get_or_raise(jid)
        # ``low_ocr_confidence`` stays (correction doesn't resolve that signal),
        # but ``missing_field`` is gone because every required header key is
        # now non-empty.
        remaining_flags = list(stored.review_flags)
        assert remaining_flags == ["low_ocr_confidence"]
|
|
|
|
|
|
def test_approve_sets_timestamps_and_is_idempotent(db_ready: None) -> None:
    """A second ``approve`` call keeps the first reviewer and review time."""
    jid = _seed_completed_job()

    with session_scope() as session:
        approved = JobRepository(session).approve(jid, reviewed_by="reviewer-a")
        first_at = approved.reviewed_at
        assert first_at is not None

    with session_scope() as session:
        again = JobRepository(session).approve(jid, reviewed_by="reviewer-b")
        # Second call must NOT overwrite reviewed_by or reviewed_at.
        assert again.approved is True
        assert again.reviewed_by == "reviewer-a"
        assert again.reviewed_at is not None
        # SQLite drops tzinfo on roundtrip, so compare the naive components.
        second_naive = again.reviewed_at.replace(tzinfo=None)
        first_naive = first_at.replace(tzinfo=None)
        assert second_naive == first_naive
|
|
|
|
|
|
def test_approve_rejects_pending_job(db_ready: None) -> None:
    """Approving a created-but-not-completed job raises ``JobNotCompletedError``."""
    jid = uuid4()

    # Create the job only; ``mark_completed`` is deliberately not called.
    with session_scope() as session:
        repo = JobRepository(session)
        repo.create(
            job_id=jid,
            filename="x",
            source_kind=SourceKind.PDF,
            blob_key="k",
        )

    with session_scope() as session:
        with pytest.raises(JobNotCompletedError):
            JobRepository(session).approve(jid, reviewed_by="rev")
|
|
|
|
|
|
def test_history_returns_events_in_order(db_ready: None) -> None:
    """``list_corrections`` returns events in the order they were applied."""
    jid = _seed_completed_job()

    first_batch = [("header.perihal", "one", None)]
    second_batch = [
        ("header.perihal", "two", None),
        ("personel[0].nama", "ANDI", None),
    ]

    # Each batch is applied in its own scope, by a different reviewer.
    with session_scope() as session:
        JobRepository(session).apply_corrections(
            jid, corrections=first_batch, corrected_by="r1"
        )
    with session_scope() as session:
        JobRepository(session).apply_corrections(
            jid, corrections=second_batch, corrected_by="r2"
        )

    with session_scope() as session:
        events = JobRepository(session).list_corrections(jid)
        new_values = [event.new_value for event in events]
        reviewers = [event.corrected_by for event in events]
        assert new_values == ["one", "two", "ANDI"]
        assert reviewers == ["r1", "r2", "r2"]
|