-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_inputs.py
More file actions
78 lines (61 loc) · 2.56 KB
/
Copy pathtest_inputs.py
File metadata and controls
78 lines (61 loc) · 2.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from __future__ import annotations
import json
from pathlib import Path
import compute_metrics
import run_retab_splits
import streamlit_viewer
# Directory containing this test module; the tests below build their
# "inputs" and "results" fixture paths relative to it.
ROOT = Path(__file__).resolve().parent
def test_inputs_are_split_one_file_per_document() -> None:
    """Check that the inputs are stored as one JSON file per document.

    Expects exactly 30 uniquely named ``*.json`` files under ``ROOT / "inputs"``,
    no combined ``inputs.json`` next to this module, and a minimal set of
    populated fields in every per-document file.
    """
    inputs_dir = ROOT / "inputs"
    document_files = sorted(inputs_dir.glob("*.json"))

    # A single combined file must not sit alongside the split inputs.
    combined_file = ROOT / "inputs.json"
    assert not combined_file.exists()

    assert len(document_files) == 30
    unique_stems = {document_file.stem for document_file in document_files}
    assert len(unique_stems) == 30

    for document_file in document_files:
        raw_text = document_file.read_text(encoding="utf-8")
        record = json.loads(raw_text)

        # The file name (minus extension) must match the PDF it describes.
        expected_document = f"{document_file.stem}.pdf"
        assert record["document"] == expected_document
        assert record["pdf_url"].startswith("https://")
        assert record["page_count"] > 0
        assert record["ground_truth"]
def test_runner_loads_split_inputs_with_shared_config() -> None:
    """Check the shape of what ``run_retab_splits.load_inputs()`` returns.

    The loaded mapping must expose 30 documents, 40 subdocuments, and the
    three models in a fixed order, identified by their ``sdk_model`` value.
    """
    loaded = run_retab_splits.load_inputs()

    document_count = len(loaded["documents"])
    assert document_count == 30

    subdocument_count = len(loaded["subdocuments"])
    assert subdocument_count == 40

    # Order matters: the comparison is against a list, not a set.
    expected_sdk_models = [
        "retab-micro",
        "retab-small",
        "retab-large",
    ]
    actual_sdk_models = [entry["sdk_model"] for entry in loaded["models"]]
    assert actual_sdk_models == expected_sdk_models
def test_snapshot_results_have_one_json_per_document_model_pair() -> None:
    """Check the layout of ``results/article_snapshot``.

    Expects 90 JSON files exactly one directory level below the snapshot
    root, each with a distinct (parent directory name, file stem) pair, and
    no file whose name carries an extra dotted suffix before ``.json``.
    """
    snapshot_dir = ROOT / "results" / "article_snapshot"

    json_files = sorted(snapshot_dir.glob("*/*.json"))
    # Names such as "foo.something.json" would indicate more than one result
    # file for the same stem; none are allowed.
    dotted_files = sorted(snapshot_dir.glob("*/*.*.json"))

    # presumably the parent directory is the model and the stem is the
    # document -- verify against the snapshot layout
    unique_pairs = set()
    for json_file in json_files:
        unique_pairs.add((json_file.parent.name, json_file.stem))

    assert len(json_files) == 90
    assert len(unique_pairs) == 90
    assert dotted_files == []
def test_result_loaders_accept_single_live_json_files(tmp_path) -> None:
    """Check that both result loaders find a lone JSON file in a live run.

    Writes one result file at ``results/live/run-1/retab-small/sample.json``
    under ``tmp_path``, points ``compute_metrics.RESULTS_DIR`` and
    ``streamlit_viewer.RESULTS_DIR`` at that temporary tree, and verifies what
    each module's loader returns for it.
    """
    results_root = tmp_path / "results"
    model_dir = results_root / "live" / "run-1" / "retab-small"
    model_dir.mkdir(parents=True)

    sample_file = model_dir / "sample.json"
    payload = {
        "file": {"filename": "sample.pdf"},
        "model": "retab-small",
        "output": [{"name": "Form 1040", "pages": [1]}],
    }
    sample_file.write_text(json.dumps(payload), encoding="utf-8")

    # Remember the real locations so they can be put back even if an
    # assertion below fails.
    patched_modules = (compute_metrics, streamlit_viewer)
    saved_dirs = [module.RESULTS_DIR for module in patched_modules]
    for module in patched_modules:
        module.RESULTS_DIR = results_root

    try:
        found_paths = compute_metrics.result_json_paths()
        assert found_paths == [
            ("live/run-1", "retab-small", model_dir / "sample.json")
        ]

        result_index = streamlit_viewer.load_result_index("live/run-1")
        assert result_index == {
            ("sample.pdf", "retab-small"): model_dir / "sample.json"
        }
    finally:
        for module, saved_dir in zip(patched_modules, saved_dirs):
            module.RESULTS_DIR = saved_dir