Devin Review caught that `--out -` discarded the sample count, so the stderr summary always said 'wrote 0 sample(s)' even when bytes were streamed. Capture the return value like the file-output branch does, and add a regression test that exercises the stdout path. Co-Authored-By: adrian kuman firmansah <adriancuman@gmail.com>
97 lines
3.2 KiB
Python
97 lines
3.2 KiB
Python
"""Smoke tests for the ``ocr_sprint.tools.export_ground_truth`` CLI."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from uuid import uuid4
|
|
|
|
import pytest
|
|
|
|
from ocr_sprint.db.base import Base, get_engine, session_scope
|
|
from ocr_sprint.db.repositories import JobRepository
|
|
from ocr_sprint.schemas.document import DocumentStatus, SourceKind
|
|
from ocr_sprint.tools.export_ground_truth import main as export_main
|
|
|
|
|
|
@pytest.fixture
def db_ready() -> None:
    """Create all tables registered on ``Base.metadata`` before a test runs.

    The tables are created on the engine returned by ``get_engine()``.
    """
    engine = get_engine()
    Base.metadata.create_all(bind=engine)
|
|
|
|
|
|
def _seed_two_approved_jobs() -> None:
    """Insert two jobs and take each through create, complete and approve.

    Every step runs inside its own ``session_scope()`` block, so each job
    goes through three separate session scopes.
    """
    for _ in range(2):
        job_id = uuid4()

        # Step 1: register the job.
        with session_scope() as create_session:
            JobRepository(create_session).create(
                job_id=job_id,
                filename="x.pdf",
                source_kind=SourceKind.PDF,
                blob_key="k",
            )

        # Step 2: record a completed extraction with no review flags.
        with session_scope() as complete_session:
            JobRepository(complete_session).mark_completed(
                job_id,
                status=DocumentStatus.COMPLETED,
                confidence=0.9,
                result={"header": {"nomor_sprint": "SPR/1/2025"}},
                review_flags=[],
            )

        # Step 3: approve the job on behalf of reviewer "rev".
        with session_scope() as approve_session:
            JobRepository(approve_session).approve(job_id, reviewed_by="rev")
|
|
|
|
|
|
def test_cli_writes_expected_number_of_lines(
    db_ready: None, tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """File output holds one approved JSONL record per seeded job.

    Also checks that the stderr summary reports the same count.
    """
    _seed_two_approved_jobs()
    corpus_path = tmp_path / "corpus.jsonl"

    assert export_main(["--out", str(corpus_path)]) == 0

    # Blank lines are ignored; only lines with content count as records.
    payload_lines = list(filter(str.strip, corpus_path.read_text().splitlines()))
    assert len(payload_lines) == 2
    assert all(json.loads(raw)["approved"] is True for raw in payload_lines)

    assert "wrote 2 sample(s)" in capsys.readouterr().err
|
|
|
|
|
|
def test_cli_respects_limit(db_ready: None, tmp_path: Path) -> None:
    """``--limit 1`` writes one record even though two jobs are seeded."""
    _seed_two_approved_jobs()
    corpus_path = tmp_path / "corpus.jsonl"

    assert export_main(["--out", str(corpus_path), "--limit", "1"]) == 0

    # Count only lines that carry content.
    non_blank = sum(
        1 for raw in corpus_path.read_text().splitlines() if raw.strip()
    )
    assert non_blank == 1
|
|
|
|
|
|
def test_cli_stdout_reports_correct_count(
    db_ready: None, capsys: pytest.CaptureFixture[str]
) -> None:
    """``--out -`` streams JSONL to stdout and reports the real count.

    The "wrote N" summary on stderr must match the number of records that
    were streamed rather than reporting 0.
    """
    _seed_two_approved_jobs()

    assert export_main(["--out", "-"]) == 0

    streams = capsys.readouterr()
    records = list(filter(str.strip, streams.out.splitlines()))
    assert len(records) == 2
    assert all(json.loads(raw)["approved"] is True for raw in records)
    assert "wrote 2 sample(s)" in streams.err
|
|
|
|
|
|
def test_cli_print_stats_emits_json_to_stderr(
    db_ready: None, tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """``--print-stats`` emits a JSON stats object on stderr."""
    _seed_two_approved_jobs()
    corpus_path = tmp_path / "corpus.jsonl"

    assert export_main(["--out", str(corpus_path), "--print-stats"]) == 0

    err_text = capsys.readouterr().err
    # The stats JSON comes after the "wrote N" line, so parse everything
    # from the first opening brace to the end of stderr.
    stats = json.loads(err_text[err_text.index("{"):])

    assert stats["total_jobs"] == 2
    assert stats["approved_jobs"] == 2
|