Phase 2: document detection + perspective correction + shadow removal
Adds OpenCV-based phone-photo handling that runs before the standard preprocessing pipeline for IMAGE source kinds (PDF renders are flat by construction and skip this stage).

Pipeline additions in src/ocr_sprint/pipeline/document_detect.py:
- _find_document_quad: Canny + dilate + contour search; picks the largest convex 4-point polygon above a configurable area threshold; fails gracefully and returns None when no usable quad is found.
- _four_point_warp: orders corners (TL/TR/BR/BL via the sum/diff trick) and runs cv2.getPerspectiveTransform + warpPerspective.
- _remove_shadow: per-channel background division (dilate + median blur + 255 - absdiff + normalize) for uneven phone-shot lighting.
- detect_and_correct: top-level entrypoint with graceful fallback to the original image when detection fails.

Wired into the synchronous orchestrator: only enabled for IMAGE sources, skipped for PDF.

New settings:
- preprocess_detect_document (default: true)
- preprocess_remove_shadow (default: true)
- preprocess_min_quad_area_fraction (default: 0.20)

Tests: 9 new unit tests covering corner ordering, quad detection on synthetic skewed documents, perspective warp output sanity, shadow removal shape preservation, full-pipeline behavior, and graceful fallback when detection fails. 70 tests total, all green.

ML-based dewarping (DewarpNet) and the DocTR detector are deferred to a future phase per the roadmap; the existing API is structured so they can be added as alternative backends behind DocumentDetectConfig.

Co-authored-by: adrian kuman firmansah <adriancuman@gmail.com>
This commit is contained in:
128
tests/unit/test_document_detect.py
Normal file
128
tests/unit/test_document_detect.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Tests for Phase 2 document detection + perspective correction."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from ocr_sprint.pipeline.document_detect import (
|
||||
DocumentDetectConfig,
|
||||
_find_document_quad,
|
||||
_four_point_warp,
|
||||
_order_corners,
|
||||
_remove_shadow,
|
||||
detect_and_correct,
|
||||
)
|
||||
|
||||
|
||||
def _synthetic_skewed_doc(
    canvas_size: tuple[int, int] = (900, 700),
    page_size: tuple[int, int] = (500, 380),
    skew: tuple[int, int, int, int] = (40, -20, -30, 25),
) -> np.ndarray:
    """Render a synthetic phone photo: a near-white quadrilateral on a dark canvas.

    Lets the detection tests run without shipping real photographs.

    Args:
        canvas_size: ``(height, width)`` of the returned image.
        page_size: ``(width, height)`` of the page before its corners are
            displaced. Note the axis order differs from ``canvas_size``.
        skew: One offset per corner, in TL, TR, BR, BL order. Each offset is
            added to the corner's x coordinate; it is added to y for TL/BR
            and subtracted from y for TR/BL.

    Returns:
        A ``uint8`` array of shape ``(height, width, 3)``, treated as BGR by
        the tests.
    """
    canvas_h, canvas_w = canvas_size
    page_w, page_h = page_size
    mid_x, mid_y = canvas_w // 2, canvas_h // 2

    # Edges of the undistorted, centred page.
    left, right = mid_x - page_w // 2, mid_x + page_w // 2
    top, bottom = mid_y - page_h // 2, mid_y + page_h // 2

    # Displace every corner independently to fake perspective distortion.
    d_tl, d_tr, d_br, d_bl = skew
    corners = np.array(
        [
            [left + d_tl, top + d_tl],  # TL
            [right + d_tr, top - d_tr],  # TR
            [right + d_br, bottom + d_br],  # BR
            [left + d_bl, bottom - d_bl],  # BL
        ],
        dtype=np.int32,
    )

    # Dark grey background with the near-white page drawn on top.
    image = np.full((canvas_h, canvas_w, 3), 30, dtype=np.uint8)
    cv2.fillPoly(image, [corners], color=(245, 245, 245))

    # A dark stroke through the centre stands in for a line of text, so the
    # page is not a featureless white blob.
    stroke_start = (mid_x - 100, mid_y)
    stroke_end = (mid_x + 100, mid_y)
    cv2.line(image, stroke_start, stroke_end, color=(20, 20, 20), thickness=4)
    return image
|
||||
|
||||
|
||||
class TestOrderCorners:
    """Checks that ``_order_corners`` returns corners as TL, TR, BR, BL."""

    def test_canonical_order(self) -> None:
        """Shuffled corners come back in TL, TR, BR, BL order.

        All four positions are asserted. Checking only TL and BR would let
        an implementation that swaps TR and BL pass unnoticed.
        """
        # input pts unordered: BR, TL, TR, BL
        pts = np.array([[100, 100], [10, 10], [110, 8], [9, 95]], dtype=np.float32)
        ordered = _order_corners(pts)
        # TL has the smallest x + y sum
        assert tuple(ordered[0]) == pytest.approx((10.0, 10.0))
        # TR is the remaining corner on the top edge (large x, small y)
        assert tuple(ordered[1]) == pytest.approx((110.0, 8.0))
        # BR has the largest x + y sum
        assert tuple(ordered[2]) == pytest.approx((100.0, 100.0))
        # BL is the remaining corner on the bottom edge (small x, large y)
        assert tuple(ordered[3]) == pytest.approx((9.0, 95.0))
|
||||
|
||||
|
||||
class TestFindDocumentQuad:
    """Checks for ``_find_document_quad`` on synthetic and degenerate images."""

    def test_finds_quad_in_synthetic_image(self) -> None:
        """A skewed page on a dark canvas yields a 4-point quad."""
        photo = _synthetic_skewed_doc()
        detected = _find_document_quad(photo, min_area_fraction=0.10)
        assert detected is not None
        # Four corners, each an (x, y) pair.
        assert detected.shape == (4, 2)

    def test_returns_none_for_flat_image(self) -> None:
        """A uniformly coloured image has no page to find."""
        uniform = np.full((400, 400, 3), 200, dtype=np.uint8)
        result = _find_document_quad(uniform)
        assert result is None

    def test_returns_none_for_tiny_image(self) -> None:
        """A very small uniform image also produces no quad."""
        thumbnail = np.full((20, 20, 3), 200, dtype=np.uint8)
        result = _find_document_quad(thumbnail)
        assert result is None
|
||||
|
||||
|
||||
class TestFourPointWarp:
    """Sanity checks on the output of ``_four_point_warp``."""

    def test_warp_produces_axis_aligned_rectangle(self) -> None:
        """Warping the detected quad yields a smaller, mostly white image."""
        photo = _synthetic_skewed_doc()
        corners = _find_document_quad(photo, min_area_fraction=0.10)
        assert corners is not None

        flattened = _four_point_warp(photo, corners)

        # Only the page survives the warp, so both dimensions must shrink.
        out_h, out_w = flattened.shape[:2]
        src_h, src_w = photo.shape[:2]
        assert out_h < src_h
        assert out_w < src_w

        # The page is near-white, so bright pixels should dominate the result.
        luminance = cv2.cvtColor(flattened, cv2.COLOR_BGR2GRAY)
        bright_mask = luminance > 200
        bright_fraction = float(bright_mask.mean())
        assert bright_fraction > 0.7
|
||||
|
||||
|
||||
class TestRemoveShadow:
    """Shape and dtype checks for ``_remove_shadow``."""

    def test_output_shape_matches_input(self) -> None:
        """Shadow removal keeps the input's shape and ``uint8`` dtype."""
        # Seeded generator: the noise image is identical on every run, so a
        # failure here can be reproduced instead of depending on global
        # random state.
        rng = np.random.default_rng(0)
        img = rng.integers(0, 256, size=(200, 300, 3), dtype=np.uint8)
        out = _remove_shadow(img)
        assert out.shape == img.shape
        assert out.dtype == np.uint8
|
||||
|
||||
|
||||
class TestDetectAndCorrect:
    """End-to-end checks for the ``detect_and_correct`` entrypoint."""

    def test_full_pipeline_on_skewed_doc(self) -> None:
        """With detection and shadow removal on, the output is shorter than the canvas."""
        photo = _synthetic_skewed_doc()
        config = DocumentDetectConfig(
            detect_document=True,
            remove_shadow=True,
            min_area_fraction=0.1,
        )
        result = detect_and_correct(photo, config)
        # The result is the cropped and flattened page, so its height shrinks.
        assert result.shape[0] < photo.shape[0]

    def test_disabled_returns_input_unchanged(self) -> None:
        """With both stages switched off, the image passes through untouched."""
        photo = _synthetic_skewed_doc()
        config = DocumentDetectConfig(detect_document=False, remove_shadow=False)
        result = detect_and_correct(photo, config)
        assert result.shape == photo.shape
        # Both flags disabled, so every pixel must be identical.
        assert np.array_equal(result, photo)

    def test_failed_detection_falls_back_to_input(self) -> None:
        """When no quad is found, the output keeps the input's shape."""
        # Uniform grey image: no contour-detectable document in it.
        uniform = np.full((300, 300, 3), 128, dtype=np.uint8)
        config = DocumentDetectConfig(
            detect_document=True,
            remove_shadow=False,
            min_area_fraction=0.5,
        )
        result = detect_and_correct(uniform, config)
        # Detection failed, so no crop happened and the shape is preserved.
        assert result.shape == uniform.shape
|
||||
Reference in New Issue
Block a user