Phase 1 MVP: synchronous OCR + regex header extraction
Implements the foundation of the OCR Sprint service:
- FastAPI app with /api/v1/health and /api/v1/documents (sync upload)
- Pydantic v2 schemas for documents, extraction result, personnel
- Pipeline: PDF/image ingest (PyMuPDF), preprocessing (resize, deskew, denoise, optional adaptive threshold), PaddleOCR wrapper, regex-based header extraction (nomor sprint, tanggal, satuan, perihal, dasar), signatory NRP, master-pangkat validation, confidence scoring + routing.
- Tests: 61 unit tests covering regex rules, validators, preprocess, ingest, confidence, and API contract (PaddleOCR mocked).
- Tooling: pyproject (setuptools), ruff, mypy strict, pytest, pre-commit, Dockerfile, docker-compose, Makefile.
- Docs: README + docs/architecture.md (full hybrid stack rationale and 6-phase roadmap).

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
46
tests/unit/test_confidence.py
Normal file
46
tests/unit/test_confidence.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""Tests for confidence scoring + routing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ocr_sprint.pipeline.confidence import compute_confidence, route
|
||||
from ocr_sprint.schemas.document import DocumentStatus
|
||||
from ocr_sprint.schemas.extraction import ReviewFlag
|
||||
|
||||
|
||||
def test_no_flags_returns_blend_of_ocr_only() -> None:
    """With an empty flag list, an OCR confidence of 0.9 scores 0.94."""
    # OCR weight 0.6 * 0.9 + validation 0.4 * 1.0 = 0.94
    expected = 0.94
    observed = compute_confidence(0.9, [])
    # Tolerance-based comparison avoids float rounding noise in the blend.
    assert abs(observed - expected) < 1e-6
|
||||
|
||||
|
||||
def test_flags_reduce_score() -> None:
    """A single review flag yields a strictly lower score than no flags."""
    ocr_confidence = 0.9
    unflagged = compute_confidence(ocr_confidence, [])
    flagged = compute_confidence(ocr_confidence, [ReviewFlag.MISSING_FIELD])
    assert flagged < unflagged
|
||||
|
||||
|
||||
def test_score_is_clamped() -> None:
    """Zero OCR confidence plus six stacked flags still scores within [0, 1]."""
    stacked_flags = [
        ReviewFlag.MISSING_FIELD,
        ReviewFlag.LOW_OCR_CONFIDENCE,
        ReviewFlag.PERSONNEL_COUNT_MISMATCH,
        ReviewFlag.INVALID_NRP,
        ReviewFlag.UNKNOWN_PANGKAT,
        ReviewFlag.DATE_PARSE_FAILED,
    ]
    worst_case = compute_confidence(0.0, stacked_flags)
    assert 0.0 <= worst_case <= 1.0
|
||||
|
||||
|
||||
def test_route_high_confidence() -> None:
    """A score of 0.97 is routed to COMPLETED."""
    status = route(0.97)
    assert status == DocumentStatus.COMPLETED
|
||||
|
||||
|
||||
def test_route_mid_goes_to_review() -> None:
    """A score of 0.88 is routed to NEEDS_REVIEW."""
    status = route(0.88)
    assert status == DocumentStatus.NEEDS_REVIEW
|
||||
|
||||
|
||||
def test_route_low_goes_to_review() -> None:
    """A score of 0.40 is routed to NEEDS_REVIEW."""
    status = route(0.40)
    assert status == DocumentStatus.NEEDS_REVIEW
|
||||
Reference in New Issue
Block a user