diff --git a/libs/extractor-api-lib/tests/modality_contract_test.py b/libs/extractor-api-lib/tests/modality_contract_test.py
new file mode 100644
index 00000000..134c5cd4
--- /dev/null
+++ b/libs/extractor-api-lib/tests/modality_contract_test.py
@@ -0,0 +1,106 @@
+"""Contract tests for modality metadata consistency across extraction paths."""
+
+from __future__ import annotations
+
+import json
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+
+CONTRACT_DIR = Path(__file__).parent / "test_data" / "modality_contract"
+ALLOWED_MODALITIES = {"TEXT", "TABLE", "IMAGE"}
+# Globbed once at import time so every test iterates the same fixture set.
+FIXTURE_PATHS = sorted(CONTRACT_DIR.glob("*.json"))
+
+
+def _load_fixture(path: Path) -> dict[str, Any]:
+    """Parse one UTF-8 encoded JSON fixture file."""
+    with path.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def _validate_piece(piece: dict[str, Any]) -> list[str]:
+    """Check one piece against the modality contract.
+
+    Returns human-readable violations; an empty list means the piece passed.
+    Malformed input is reported as a violation instead of raising, so invalid
+    fixtures fail for the intended reason rather than with a crash.
+    """
+    if not isinstance(piece, dict):
+        return ["piece must be an object"]
+
+    errors: list[str] = []
+
+    modality = piece.get("type")
+    metadata = piece.get("metadata")
+    page_content = piece.get("page_content")
+
+    # The isinstance guard keeps unhashable JSON values (lists, objects) from
+    # raising TypeError in the set membership test below.
+    if not isinstance(modality, str) or modality not in ALLOWED_MODALITIES:
+        errors.append("type must be one of TEXT/TABLE/IMAGE")
+
+    if not isinstance(page_content, str):
+        errors.append("page_content must be a string")
+
+    if not isinstance(metadata, dict):
+        errors.append("metadata must be an object")
+        # Every remaining check reads metadata keys, so stop here.
+        return errors
+
+    required_common = ("document", "page", "id", "related")
+    for key in required_common:
+        if key not in metadata:
+            errors.append(f'metadata missing required key: "{key}"')
+
+    if "related" in metadata and not isinstance(metadata.get("related"), list):
+        errors.append('metadata["related"] must be a list')
+
+    if modality == "IMAGE":
+        # An IMAGE piece needs a payload: base64 data or a URL/ref pointer.
+        has_legacy = bool(metadata.get("base64_image"))
+        has_image_url = bool(metadata.get("image_url"))
+        has_image_ref = bool(metadata.get("image_ref"))
+        has_reference = has_image_url or has_image_ref
+
+        if not (has_legacy or has_reference):
+            errors.append("IMAGE metadata must contain base64_image or image_url/image_ref")
+
+        if has_reference and not metadata.get("image_mime"):
+            errors.append("IMAGE metadata with image_url/image_ref must include image_mime")
+
+    return errors
+
+
+@pytest.mark.parametrize(
+    "fixture_path",
+    FIXTURE_PATHS,
+    ids=lambda p: p.stem,
+)
+def test_modality_contract_fixture(fixture_path: Path):
+    """Validate every contract fixture against the shared modality schema."""
+    fixture = _load_fixture(fixture_path)
+    piece = fixture["piece"]
+    valid = fixture["valid"]
+    # bool("false") is True, so insist on a real JSON boolean.
+    assert isinstance(valid, bool), f'Fixture {fixture_path.name}: "valid" must be a boolean'
+
+    errors = _validate_piece(piece)
+
+    if valid:
+        assert errors == [], f"Fixture {fixture_path.name} failed contract checks: {errors}"
+    else:
+        assert errors, f"Fixture {fixture_path.name} was expected to fail but passed."
+
+
+def test_modality_contract_fixture_names_are_unique():
+    """Ensure fixture names stay unique for clear test diagnostics."""
+    names = [_load_fixture(path).get("name") for path in FIXTURE_PATHS]
+    # An empty glob yields zero parametrized cases, so fail loudly here instead.
+    assert names, f"No contract fixtures found in {CONTRACT_DIR}"
+
+    duplicates = [name for name, count in Counter(names).items() if count > 1]
+    assert not duplicates, f"Duplicate fixture names: {duplicates}"
diff --git a/libs/extractor-api-lib/tests/test_data/modality_contract/image_invalid_missing_payload.json b/libs/extractor-api-lib/tests/test_data/modality_contract/image_invalid_missing_payload.json
new file mode 100644
index 00000000..5ce6f42a
--- /dev/null
+++ b/libs/extractor-api-lib/tests/test_data/modality_contract/image_invalid_missing_payload.json
@@ -0,0 +1,14 @@
+{
+  "name": "image_invalid_missing_payload",
+  "valid": false,
+  "piece": {
+    "type": "IMAGE",
+    "page_content": "caption only",
+    "metadata": {
+      "document": "file:image.png",
+      "page": 1,
+      "id": "img-bad-1",
+      "related": []
+    }
+  }
+}
diff --git a/libs/extractor-api-lib/tests/test_data/modality_contract/image_legacy_base64_valid.json b/libs/extractor-api-lib/tests/test_data/modality_contract/image_legacy_base64_valid.json
new file mode 100644
index 00000000..5b0a5fe3
--- /dev/null
+++ b/libs/extractor-api-lib/tests/test_data/modality_contract/image_legacy_base64_valid.json
@@ -0,0 +1,15 @@
+{
+  "name": "image_legacy_base64_valid",
+  "valid": true,
+  "piece": {
+    "type": "IMAGE",
+    "page_content": "diagram describing architecture",
+    "metadata": {
+      "document": "file:diagram.png",
+      "page": 1,
+      "id": "img-legacy-1",
+      "related": [],
+      "base64_image": "iVBORw0KGgoAAAANSUhEUgAA..."
+    }
+  }
+}
diff --git a/libs/extractor-api-lib/tests/test_data/modality_contract/image_reference_valid.json b/libs/extractor-api-lib/tests/test_data/modality_contract/image_reference_valid.json
new file mode 100644
index 00000000..d3524b7f
--- /dev/null
+++ b/libs/extractor-api-lib/tests/test_data/modality_contract/image_reference_valid.json
@@ -0,0 +1,16 @@
+{
+  "name": "image_reference_valid",
+  "valid": true,
+  "piece": {
+    "type": "IMAGE",
+    "page_content": "OCR fallback text",
+    "metadata": {
+      "document": "file:screenshot.jpg",
+      "page": 1,
+      "id": "img-ref-1",
+      "related": [],
+      "image_ref": "s3://documents/screenshot.jpg",
+      "image_mime": "image/jpeg"
+    }
+  }
+}
diff --git a/libs/extractor-api-lib/tests/test_data/modality_contract/table_valid.json b/libs/extractor-api-lib/tests/test_data/modality_contract/table_valid.json
new file mode 100644
index 00000000..f86d0134
--- /dev/null
+++ b/libs/extractor-api-lib/tests/test_data/modality_contract/table_valid.json
@@ -0,0 +1,14 @@
+{
+  "name": "table_valid",
+  "valid": true,
+  "piece": {
+    "type": "TABLE",
+    "page_content": "| A | B |\n| --- | --- |\n| 1 | 2 |",
+    "metadata": {
+      "document": "file:sample.csv",
+      "page": 1,
+      "id": "tbl-123",
+      "related": ["chunk-1"]
+    }
+  }
+}
diff --git a/libs/extractor-api-lib/tests/test_data/modality_contract/text_valid.json b/libs/extractor-api-lib/tests/test_data/modality_contract/text_valid.json
new file mode 100644
index 00000000..510a80b5
--- /dev/null
+++ b/libs/extractor-api-lib/tests/test_data/modality_contract/text_valid.json
@@ -0,0 +1,14 @@
+{
+  "name": "text_valid",
+  "valid": true,
+  "piece": {
+    "type": "TEXT",
+    "page_content": "example body text",
+    "metadata": {
+      "document": "file:sample.txt",
+      "page": 1,
+      "id": "abc123",
+      "related": []
+    }
+  }
+}