Adds OpenCV-based phone-photo handling that runs before the standard preprocessing pipeline for IMAGE source kinds (PDF renders are flat by construction and skip this stage).

Pipeline additions in src/ocr_sprint/pipeline/document_detect.py:
- _find_document_quad: Canny + dilate + contour search; picks the largest convex 4-point polygon above a configurable area threshold, and fails gracefully by returning None when no usable quad is found.
- _four_point_warp: orders corners (TL/TR/BR/BL via the sum/diff trick) and runs cv2.getPerspectiveTransform + warpPerspective.
- _remove_shadow: per-channel background division (dilate + median blur + 255 - absdiff + normalize) for uneven phone-shot lighting.
- detect_and_correct: top-level entry point with graceful fallback to the original image when detection fails.

Wired into the synchronous orchestrator: only enabled for IMAGE sources, skipped for PDF.

New settings:
- preprocess_detect_document (default: true)
- preprocess_remove_shadow (default: true)
- preprocess_min_quad_area_fraction (default: 0.20)

Tests: 9 new unit tests covering corner ordering, quad detection on synthetic skewed documents, perspective warp output sanity, shadow removal shape preservation, full-pipeline behavior, and graceful fallback when detection fails. 70 tests total, all green.

ML-based dewarping (DewarpNet) and the DocTR detector are deferred to a future phase per the roadmap; the existing API is structured so they can be added as alternative backends behind DocumentDetectConfig.

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
129 lines
4.6 KiB
Python
129 lines
4.6 KiB
Python
"""Tests for Phase 2 document detection + perspective correction."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import cv2
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from ocr_sprint.pipeline.document_detect import (
|
|
DocumentDetectConfig,
|
|
_find_document_quad,
|
|
_four_point_warp,
|
|
_order_corners,
|
|
_remove_shadow,
|
|
detect_and_correct,
|
|
)
|
|
|
|
|
|
def _synthetic_skewed_doc(
    canvas_size: tuple[int, int] = (900, 700),
    page_size: tuple[int, int] = (500, 380),
    skew: tuple[int, int, int, int] = (40, -20, -30, 25),
) -> np.ndarray:
    """Build a dark canvas with a light, skewed quadrilateral 'page' on it.

    Lets the detection tests run against a known shape instead of real photos.

    Args:
        canvas_size: Canvas dimensions as ``(height, width)``.
        page_size: Un-skewed page dimensions as ``(width, height)`` -- note
            this is the opposite order from ``canvas_size``.
        skew: One integer offset per corner, in TL, TR, BR, BL order. Each
            offset is added to that corner's x coordinate; it is added to the
            y coordinate for TL and BR and subtracted from it for TR and BL.

    Returns:
        A 3-channel uint8 image (BGR by OpenCV convention): background value
        30, page value 245, with a dark horizontal stroke through the centre.
    """
    canvas_h, canvas_w = canvas_size
    doc_w, doc_h = page_size
    off_tl, off_tr, off_br, off_bl = skew

    # The page is centred on the canvas before the per-corner offsets apply.
    mid_x = canvas_w // 2
    mid_y = canvas_h // 2
    half_w = doc_w // 2
    half_h = doc_h // 2
    left, right = mid_x - half_w, mid_x + half_w
    top, bottom = mid_y - half_h, mid_y + half_h

    corners = np.array(
        [
            (left + off_tl, top + off_tl),  # TL
            (right + off_tr, top - off_tr),  # TR
            (right + off_br, bottom + off_br),  # BR
            (left + off_bl, bottom - off_bl),  # BL
        ],
        dtype=np.int32,
    )

    canvas = np.full((canvas_h, canvas_w, 3), 30, dtype=np.uint8)  # dark grey
    cv2.fillPoly(canvas, [corners], color=(245, 245, 245))

    # A fake text line keeps the page from being a featureless white blob.
    stroke_start = (mid_x - 100, mid_y)
    stroke_end = (mid_x + 100, mid_y)
    cv2.line(canvas, stroke_start, stroke_end, color=(20, 20, 20), thickness=4)
    return canvas
|
|
|
|
|
|
class TestOrderCorners:
    """Checks the corner ordering produced by ``_order_corners``."""

    def test_canonical_order(self) -> None:
        """Shuffled corners come back in TL, TR, BR, BL order.

        All four slots are pinned; checking only TL and BR would let a
        swapped TR/BL pair slip through unnoticed.
        """
        # Deliberately shuffled input: BR, TL, TR, BL.
        pts = np.array([[100, 100], [10, 10], [110, 8], [9, 95]], dtype=np.float32)
        ordered = _order_corners(pts)
        # TL has the smallest x + y sum.
        assert tuple(ordered[0]) == pytest.approx((10.0, 10.0))
        # TR is the remaining point with large x and small y.
        assert tuple(ordered[1]) == pytest.approx((110.0, 8.0))
        # BR has the largest x + y sum.
        assert tuple(ordered[2]) == pytest.approx((100.0, 100.0))
        # BL is the remaining point with small x and large y.
        assert tuple(ordered[3]) == pytest.approx((9.0, 95.0))
|
|
|
|
|
|
class TestFindDocumentQuad:
    """Exercises ``_find_document_quad`` on one positive and two negative inputs."""

    def test_finds_quad_in_synthetic_image(self) -> None:
        """The synthetic skewed page yields a non-None ``(4, 2)`` result."""
        corners = _find_document_quad(_synthetic_skewed_doc(), min_area_fraction=0.10)
        assert corners is not None
        assert corners.shape == (4, 2)

    def test_returns_none_for_flat_image(self) -> None:
        """A uniform 400x400 image is expected to produce ``None``."""
        uniform = np.full(shape=(400, 400, 3), fill_value=200, dtype=np.uint8)
        result = _find_document_quad(uniform)
        assert result is None

    def test_returns_none_for_tiny_image(self) -> None:
        """A uniform 20x20 image is expected to produce ``None``."""
        speck = np.full(shape=(20, 20, 3), fill_value=200, dtype=np.uint8)
        result = _find_document_quad(speck)
        assert result is None
|
|
|
|
|
|
class TestFourPointWarp:
    """Sanity checks on the output of ``_four_point_warp``."""

    def test_warp_produces_axis_aligned_rectangle(self) -> None:
        """The warped page is smaller than the canvas and mostly bright."""
        source = _synthetic_skewed_doc()
        corners = _find_document_quad(source, min_area_fraction=0.10)
        assert corners is not None

        flattened = _four_point_warp(source, corners)

        # Only the page should survive the warp, so both dimensions shrink.
        src_rows, src_cols = source.shape[0], source.shape[1]
        assert flattened.shape[0] < src_rows
        assert flattened.shape[1] < src_cols

        # Most of the result should be the near-white page, not background.
        luminance = cv2.cvtColor(flattened, cv2.COLOR_BGR2GRAY)
        bright_fraction = float(np.mean(luminance > 200))
        assert bright_fraction > 0.7
|
|
|
|
|
|
class TestRemoveShadow:
    """Shape and dtype checks for ``_remove_shadow``."""

    def test_output_shape_matches_input(self) -> None:
        """Output keeps the input's shape and stays uint8."""
        # Seeded generator: the noise image is identical on every run, so a
        # failure here can always be reproduced.
        rng = np.random.default_rng(seed=0)
        img = rng.integers(0, 256, size=(200, 300, 3), dtype=np.uint8)
        out = _remove_shadow(img)
        assert out.shape == img.shape
        assert out.dtype == np.uint8
|
|
|
|
|
|
class TestDetectAndCorrect:
    """End-to-end checks for the ``detect_and_correct`` entry point."""

    def test_full_pipeline_on_skewed_doc(self) -> None:
        """With detection and shadow removal on, the output is a smaller crop."""
        img = _synthetic_skewed_doc()
        config = DocumentDetectConfig(
            detect_document=True,
            remove_shadow=True,
            min_area_fraction=0.1,
        )
        out = detect_and_correct(img, config)
        # The output is the cropped + flattened page, so it must be smaller
        # than the canvas in both dimensions (same check as the warp test).
        assert out.shape[0] < img.shape[0]
        assert out.shape[1] < img.shape[1]

    def test_disabled_returns_input_unchanged(self) -> None:
        """With both flags off, the output equals the input element-wise."""
        img = _synthetic_skewed_doc()
        config = DocumentDetectConfig(detect_document=False, remove_shadow=False)
        out = detect_and_correct(img, config)
        assert out.shape == img.shape
        # Both flags disabled -> bytes-identical.
        assert np.array_equal(out, img)

    def test_failed_detection_falls_back_to_input(self) -> None:
        """When no quad is found and shadow removal is off, the input comes back."""
        # Flat grey image with no contour-detectable document.
        flat = np.full((300, 300, 3), 128, dtype=np.uint8)
        # Pristine copy, so an in-place modification of `flat` is caught too.
        expected = flat.copy()
        config = DocumentDetectConfig(
            detect_document=True,
            remove_shadow=False,
            min_area_fraction=0.5,
        )
        out = detect_and_correct(flat, config)
        # Detection failed, so the original shape is preserved...
        assert out.shape == flat.shape
        # ...and so are the pixels; a shape-only check would accept any
        # same-sized image as a "fallback".
        assert np.array_equal(out, expected)
|