diff --git a/changelog.d/refresh-release-bundle.added.md b/changelog.d/refresh-release-bundle.added.md new file mode 100644 index 00000000..e588a79b --- /dev/null +++ b/changelog.d/refresh-release-bundle.added.md @@ -0,0 +1 @@ +Added `policyengine.provenance.refresh_release_bundle`. Given a country and optional new model/data versions, the helper fetches the updated PyPI wheel metadata + HF dataset sha256, rewrites `data/release_manifests/{country}.json` and the matching extra pin in `pyproject.toml`, and (optionally) regenerates the bundle's TRACE TRO sidecar. A thin `scripts/refresh_release_bundle.py` wrapper exposes the library function as a CLI for release engineers. Unit-tested offline via mocked PyPI/HF responses. diff --git a/scripts/refresh_release_bundle.py b/scripts/refresh_release_bundle.py new file mode 100644 index 00000000..06e64570 --- /dev/null +++ b/scripts/refresh_release_bundle.py @@ -0,0 +1,79 @@ +"""CLI wrapper around :func:`policyengine.provenance.refresh_release_bundle`. + +Usage:: + + python scripts/refresh_release_bundle.py --country us \\ + --model-version 1.653.3 --data-version 1.83.4 + +Fetches PyPI wheel metadata and streams the HF dataset to compute its +sha256, then writes updated ``data/release_manifests/{country}.json``, +bumps the matching pin in ``pyproject.toml`` (unless +``--no-pyproject``), and regenerates the bundle's TRACE TRO sidecar +(unless ``--no-tro``). + +Private HF datasets require ``HUGGING_FACE_TOKEN`` (or ``HF_TOKEN``) in the env. + +After running: + +- commit the changed manifest / TRO / pyproject.toml, +- manually rerun + ``PE_UPDATE_SNAPSHOTS=1 pytest tests/test_household_calculator_snapshot.py`` + to rebaseline expected household outputs — those numbers will + almost certainly drift when the data version bumps, and the drift + deserves human review before being committed. 
+""" + +from __future__ import annotations + +import argparse +import sys + +from policyengine.provenance.bundle import ( + refresh_release_bundle, + regenerate_trace_tro, +) + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--country", required=True, choices=("us", "uk")) + parser.add_argument( + "--model-version", + help="New policyengine-{country} version (e.g. 1.653.3)", + ) + parser.add_argument( + "--data-version", + help="New policyengine-{country}-data version (e.g. 1.83.4)", + ) + parser.add_argument( + "--no-pyproject", + action="store_true", + help="Do not bump the country extra in pyproject.toml", + ) + parser.add_argument( + "--no-tro", + action="store_true", + help="Skip TRACE TRO regeneration", + ) + args = parser.parse_args(argv) + + if args.model_version is None and args.data_version is None: + parser.error("Pass at least --model-version or --data-version") + + result = refresh_release_bundle( + country=args.country, + model_version=args.model_version, + data_version=args.data_version, + update_pyproject=not args.no_pyproject, + ) + print(result.summary()) + + if not args.no_tro: + tro_path = regenerate_trace_tro(args.country) + print(f" TRO regenerated: {tro_path}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/policyengine/provenance/__init__.py b/src/policyengine/provenance/__init__.py index 548b7fc6..b17361c1 100644 --- a/src/policyengine/provenance/__init__.py +++ b/src/policyengine/provenance/__init__.py @@ -15,6 +15,9 @@ ) """ +from .bundle import RefreshResult as RefreshResult +from .bundle import refresh_release_bundle as refresh_release_bundle +from .bundle import regenerate_trace_tro as regenerate_trace_tro from .manifest import ( CertifiedDataArtifact as CertifiedDataArtifact, ) diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py new file mode 100644 index 00000000..54e32ebd --- 
/dev/null +++ b/src/policyengine/provenance/bundle.py @@ -0,0 +1,319 @@ +"""Refresh a country release manifest in place. + +The release manifest at ``data/release_manifests/{country}.json`` pins +three artifacts by content hash: + +- the country model wheel (sha256 + PyPI download URL), +- the certified microdata artifact (sha256 + HF resolve URL), +- the data package metadata used to compute the build fingerprint. + +When a country bumps its PyPI wheel or HF dataset, every one of those +pins has to move together, and the TRACE TRO sidecar at +``data/release_manifests/{country}.trace.tro.jsonld`` must be +regenerated so replication reviewers see the right hashes. + +This module exposes the refresh as a library function: + +.. code-block:: python + + from policyengine.provenance.bundle import refresh_release_bundle + + result = refresh_release_bundle( + country="us", + model_version="1.653.3", + data_version="1.83.4", + ) + print(result.summary()) + +``scripts/refresh_release_bundle.py`` is a thin argparse wrapper for +operational use. Network access is required (PyPI JSON API + a +streaming GET of the HF dataset resolve URL to hash its bytes). +Private country data (UK) additionally +needs ``HUGGING_FACE_TOKEN`` (or ``HF_TOKEN``). +""" + +from __future__ import annotations + +import hashlib +import json +import os +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Optional +from urllib.request import Request, urlopen + +from policyengine.provenance.manifest import ( + CountryReleaseManifest, + get_release_manifest, +) + +# --------------------------------------------------------------------------- +# Paths inside the installed / source-tree wheel. 
+# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +MANIFEST_DIR = REPO_ROOT / "src" / "policyengine" / "data" / "release_manifests" +PYPROJECT = REPO_ROOT / "pyproject.toml" + + +# --------------------------------------------------------------------------- +# PyPI metadata resolution +# --------------------------------------------------------------------------- + + +def _pypi_wheel_metadata(package: str, version: str) -> dict: + """Return ``{"url": ..., "sha256": ...}`` for the py3-none-any wheel + of ``package==version`` on PyPI. + + Raises if PyPI reports no matching wheel, or if multiple matching + wheels exist with different sha256s (i.e. the release is + ambiguous). + """ + url = f"https://pypi.org/pypi/{package}/{version}/json" + with urlopen(Request(url, headers={"User-Agent": "policyengine.py"})) as f: + payload = json.load(f) + wheels = [ + f + for f in payload.get("urls", []) + if f.get("packagetype") == "bdist_wheel" + and "py3-none-any" in f.get("filename", "") + ] + if not wheels: + raise ValueError( + f"No py3-none-any wheel found on PyPI for {package}=={version}" + ) + sha256s = {f["digests"]["sha256"] for f in wheels} + if len(sha256s) > 1: + raise ValueError( + f"Multiple distinct py3-none-any wheels for {package}=={version}: {sha256s}" + ) + wheel = wheels[0] + return {"url": wheel["url"], "sha256": wheel["digests"]["sha256"]} + + +# --------------------------------------------------------------------------- +# Hugging Face dataset resolution +# --------------------------------------------------------------------------- + + +def _hf_dataset_sha256(repo_id: str, path: str, revision: str) -> str: + """Fetch the dataset file's sha256 by streaming the resolve URL. + + Uses the ``HUGGING_FACE_TOKEN`` (or ``HF_TOKEN``) env var for private repos. Streams + the file in 8 MiB chunks so memory usage stays flat. 
+ """ + url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}" + headers = {"User-Agent": "policyengine.py"} + token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") + if token: + headers["Authorization"] = f"Bearer {token}" + + hasher = hashlib.sha256() + with urlopen(Request(url, headers=headers)) as f: + while True: + chunk = f.read(8 * 1024 * 1024) + if not chunk: + break + hasher.update(chunk) + return hasher.hexdigest() + + +# --------------------------------------------------------------------------- +# Refresh result +# --------------------------------------------------------------------------- + + +@dataclass +class RefreshResult: + """What the refresh changed, for logs and PR bodies.""" + + country: str + old_model: str + new_model: str + old_data: str + new_data: str + old_wheel_sha256: str + new_wheel_sha256: str + old_dataset_sha256: str + new_dataset_sha256: str + manifest_path: Path + pyproject_updated: bool + + def summary(self) -> str: + lines = [ + f"Refreshed {self.country} release bundle:", + f" model: {self.old_model} -> {self.new_model}", + f" data: {self.old_data} -> {self.new_data}", + f" wheel sha256: {self.old_wheel_sha256[:12]}... -> " + f"{self.new_wheel_sha256[:12]}...", + f" dataset sha256: {self.old_dataset_sha256[:12]}... 
-> " + f"{self.new_dataset_sha256[:12]}...", + f" manifest: {self.manifest_path}", + ] + if self.pyproject_updated: + lines.append(" pyproject.toml: pin updated") + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Core refresh function +# --------------------------------------------------------------------------- + + +def refresh_release_bundle( + country: str, + *, + model_version: Optional[str] = None, + data_version: Optional[str] = None, + update_pyproject: bool = True, + manifest_dir: Path = MANIFEST_DIR, + pyproject_path: Path = PYPROJECT, +) -> RefreshResult: + """Refresh a country's release manifest in place. + + Args: + country: ``"us"`` or ``"uk"``. + model_version: New country-package version, e.g. ``"1.653.3"``. + If ``None``, keeps the existing pin. + data_version: New data-package version, e.g. ``"1.83.4"``. If + ``None``, keeps the existing pin. + update_pyproject: When True, also bumps the country extra in + ``pyproject.toml`` to ``model_version``. + manifest_dir: Overridable for tests. + pyproject_path: Overridable for tests. + + Returns a :class:`RefreshResult` with the before/after of every + content-addressed pin. + """ + manifest_path = manifest_dir / f"{country}.json" + manifest_json = json.loads(manifest_path.read_text()) + current = CountryReleaseManifest.model_validate(manifest_json) + + old_model = current.model_package.version + old_data = current.data_package.version + old_wheel_sha256 = current.model_package.sha256 or "" + old_dataset_sha256 = current.certified_data_artifact.sha256 or "" + + new_model = model_version or old_model + new_data = data_version or old_data + + package_name = current.model_package.name # "policyengine-us" / "policyengine-uk" + + # Only hit PyPI if the model actually changed. Keeps no-op + # refreshes and data-only refreshes offline for the wheel pin. 
+ if new_model != old_model: + wheel = _pypi_wheel_metadata(package_name, new_model) + new_wheel_sha256 = wheel["sha256"] + new_wheel_url = wheel["url"] + else: + new_wheel_sha256 = old_wheel_sha256 + new_wheel_url = current.model_package.wheel_url or "" + + # Dataset HF resolve URL inferred from the existing URI: we only + # change the ``@{revision}`` tail. + current_uri = current.certified_data_artifact.uri + repo_id_match = re.match(r"hf://([^/]+/[^/]+)/(.+?)@(.+)", current_uri) + if not repo_id_match: + raise ValueError( + f"Cannot parse current dataset URI {current_uri!r}; expected " + f"'hf://{{owner}}/{{repo}}/{{path}}@{{revision}}'" + ) + repo_id, dataset_path, _old_revision = repo_id_match.groups() + + # Only hit HF if the data version actually changed. + if new_data != old_data: + new_dataset_sha256 = _hf_dataset_sha256(repo_id, dataset_path, new_data) + else: + new_dataset_sha256 = old_dataset_sha256 + new_uri = f"hf://{repo_id}/{dataset_path}@{new_data}" + + # Mutate the manifest JSON in place (keep unknown fields untouched). 
+ manifest_json["model_package"]["version"] = new_model + manifest_json["model_package"]["sha256"] = new_wheel_sha256 + manifest_json["model_package"]["wheel_url"] = new_wheel_url + manifest_json["data_package"]["version"] = new_data + manifest_json["certified_data_artifact"]["data_package"]["version"] = new_data + manifest_json["certified_data_artifact"]["build_id"] = ( + f"{current.data_package.name}-{new_data}" + ) + manifest_json["certified_data_artifact"]["uri"] = new_uri + manifest_json["certified_data_artifact"]["sha256"] = new_dataset_sha256 + manifest_json["certification"]["data_build_id"] = ( + f"{current.data_package.name}-{new_data}" + ) + manifest_json["certification"]["certified_for_model_version"] = new_model + + manifest_path.write_text( + json.dumps(manifest_json, indent=2, sort_keys=False) + "\n" + ) + + pyproject_updated = False + if update_pyproject and model_version is not None: + pyproject_updated = _bump_pyproject_pin(pyproject_path, package_name, new_model) + + return RefreshResult( + country=country, + old_model=old_model, + new_model=new_model, + old_data=old_data, + new_data=new_data, + old_wheel_sha256=old_wheel_sha256, + new_wheel_sha256=new_wheel_sha256, + old_dataset_sha256=old_dataset_sha256, + new_dataset_sha256=new_dataset_sha256, + manifest_path=manifest_path, + pyproject_updated=pyproject_updated, + ) + + +# --------------------------------------------------------------------------- +# pyproject.toml pin update (regex-based; avoids adding a TOML writer dep) +# --------------------------------------------------------------------------- + + +def _bump_pyproject_pin( + pyproject_path: Path, package_name: str, new_version: str +) -> bool: + """Update the ``{package_name}=={version}`` line under country + extras. Returns True if a change was written. 
+ + Only matches the exact ``"{package_name}==X.Y.Z"`` pin form that the + release manifests produce; any looser pin (``>=``, ``~=``, extras + markers) is left alone and signalled via the return value. + """ + text = pyproject_path.read_text() + pattern = rf'("{re.escape(package_name)}==)[^"]+(")' + new_text, n = re.subn(pattern, rf"\g<1>{new_version}\g<2>", text) + if n == 0: + return False + if new_text != text: + pyproject_path.write_text(new_text) + return True + return False + + +# --------------------------------------------------------------------------- +# Trace TRO regeneration +# --------------------------------------------------------------------------- + + +def regenerate_trace_tro(country: str, manifest_dir: Path = MANIFEST_DIR) -> Path: + """Regenerate ``{country}.trace.tro.jsonld`` from the country's + release manifest + the live data-release manifest on HF. + + Thin wrapper around the same code path ``scripts/generate_trace_tros.py`` + uses; exposed here so the refresh function can chain + ``refresh_release_bundle(...)`` with TRO regeneration in one call. + """ + from policyengine.provenance.manifest import get_data_release_manifest + from policyengine.provenance.trace import ( + build_trace_tro_from_release_bundle, + serialize_trace_tro, + ) + + release = get_release_manifest(country) + data_release = get_data_release_manifest(country) + tro = build_trace_tro_from_release_bundle(release, data_release) + out_path = manifest_dir / f"{country}.trace.tro.jsonld" + out_path.write_bytes(serialize_trace_tro(tro)) + return out_path diff --git a/tests/test_bundle_refresh.py b/tests/test_bundle_refresh.py new file mode 100644 index 00000000..38537fea --- /dev/null +++ b/tests/test_bundle_refresh.py @@ -0,0 +1,321 @@ +"""Unit tests for ``policyengine.provenance.bundle.refresh_release_bundle``. + +Mocks the PyPI JSON API and the HF ``resolve`` endpoint so the tests +run offline. Exercises: + +- Updating only the model version (data-version unchanged). 
+- Updating only the data version (model unchanged). +- Updating both in one call. +- ``pyproject.toml`` pin rewrite. +- ``--no-pyproject`` / ``update_pyproject=False`` short-circuits. +- Error paths: PyPI has no matching wheel; URI is malformed. + +The end-to-end TRO regeneration requires the bundled +release-manifest resolver and a live HF metadata call, so it is +tested separately in ``tests/test_release_manifests.py`` via the +existing script-level hook. This file covers only the pure-refresh +surface. +""" + +from __future__ import annotations + +import hashlib +import io +import json +from pathlib import Path +from unittest.mock import patch + +import pytest + +from policyengine.provenance.bundle import refresh_release_bundle + +PYPI_PAYLOAD_TEMPLATE = { + "urls": [ + { + "packagetype": "bdist_wheel", + "filename": "policyengine_us-NEW_VERSION-py3-none-any.whl", + "url": "https://files.pythonhosted.org/packages/ff/00/policyengine_us-NEW_VERSION-py3-none-any.whl", + "digests": {"sha256": "a" * 64}, + }, + # Source-dist should be ignored. + { + "packagetype": "sdist", + "filename": "policyengine_us-NEW_VERSION.tar.gz", + "url": "https://files.pythonhosted.org/packages/ff/00/policyengine_us-NEW_VERSION.tar.gz", + "digests": {"sha256": "b" * 64}, + }, + ] +} + + +def _pypi_response(package: str, version: str): + """Return a mock PyPI ``urlopen`` response.""" + payload = json.loads( + json.dumps(PYPI_PAYLOAD_TEMPLATE).replace("NEW_VERSION", version) + ) + # PyPI urls contain the filename; replace the package placeholder too. 
+ for u in payload["urls"]: + u["filename"] = u["filename"].replace( + "policyengine_us", package.replace("-", "_") + ) + u["url"] = u["url"].replace("policyengine_us", package.replace("-", "_")) + return io.BytesIO(json.dumps(payload).encode()) + + +class _FakeHFResponse: + """Streams a deterministic byte sequence so sha256 is predictable.""" + + def __init__(self, content: bytes) -> None: + self._buffer = io.BytesIO(content) + + def read(self, size: int = -1) -> bytes: + return self._buffer.read(size) + + def __enter__(self): + return self + + def __exit__(self, *args): + self._buffer.close() + + +@pytest.fixture +def sandbox(tmp_path: Path) -> dict: + """A writable scratch copy of the US release manifest + a stub + pyproject.toml, returned as ``{manifest_dir, pyproject_path, + manifest_sha256}``. + """ + manifest_dir = tmp_path / "manifests" + manifest_dir.mkdir() + manifest = { + "schema_version": 1, + "bundle_id": "us-4.0.0", + "country_id": "us", + "policyengine_version": "4.0.0", + "model_package": { + "name": "policyengine-us", + "version": "1.600.0", + "sha256": "c" * 64, + "wheel_url": "https://files.pythonhosted.org/packages/old.whl", + }, + "data_package": { + "name": "policyengine-us-data", + "version": "1.70.0", + "repo_id": "policyengine/policyengine-us-data", + }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-us-data", + "version": "1.70.0", + }, + "build_id": "policyengine-us-data-1.70.0", + "dataset": "enhanced_cps_2024", + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.70.0", + "sha256": "d" * 64, + }, + "certification": { + "compatibility_basis": "matching_data_build_fingerprint", + "data_build_id": "policyengine-us-data-1.70.0", + "built_with_model_version": "1.595.0", + "certified_for_model_version": "1.600.0", + "certified_by": "test fixture", + }, + "default_dataset": "enhanced_cps_2024", + "datasets": {"enhanced_cps_2024": {"path": "enhanced_cps_2024.h5"}}, + "region_datasets": 
{"national": {"path_template": "enhanced_cps_2024.h5"}}, + } + (manifest_dir / "us.json").write_text(json.dumps(manifest, indent=2)) + + pyproject_path = tmp_path / "pyproject.toml" + pyproject_path.write_text( + "[project.optional-dependencies]\n" + "us = [\n" + ' "policyengine_core>=3.25.0",\n' + ' "policyengine-us==1.600.0",\n' + "]\n" + ) + return { + "manifest_dir": manifest_dir, + "pyproject_path": pyproject_path, + } + + +def test__bump_model_only_rewrites_wheel_pins_and_pyproject(sandbox) -> None: + """Bumping only the model version pulls fresh wheel metadata, + keeps the dataset pin intact, and updates pyproject.toml. + """ + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if "pypi.org" in url: + return _pypi_response("policyengine-us", "1.653.3") + raise AssertionError(f"Unexpected URL fetched: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + result = refresh_release_bundle( + country="us", + model_version="1.653.3", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + assert result.new_model == "1.653.3" + assert result.new_data == "1.70.0" # untouched + assert result.pyproject_updated + assert "policyengine-us==1.653.3" in sandbox["pyproject_path"].read_text() + + written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) + assert written["model_package"]["version"] == "1.653.3" + assert written["model_package"]["sha256"] == "a" * 64 + # Dataset pins untouched. 
+ assert written["data_package"]["version"] == "1.70.0" + assert written["certified_data_artifact"]["sha256"] == "d" * 64 + + +def test__bump_data_only_streams_hf_and_updates_uri(sandbox) -> None: + """Bumping only the data version streams the HF file, recomputes + its sha256, and rewrites the URI revision.""" + hf_bytes = b"synthetic dataset payload" + expected_sha256 = hashlib.sha256(hf_bytes).hexdigest() + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if "huggingface.co" in url: + assert "@" not in url # URI revision is in the URL path + assert "1.83.4" in url + return _FakeHFResponse(hf_bytes) + raise AssertionError(f"Unexpected URL: {url}") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + result = refresh_release_bundle( + country="us", + data_version="1.83.4", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + assert result.new_model == "1.600.0" # untouched + assert result.new_data == "1.83.4" + assert result.new_dataset_sha256 == expected_sha256 + assert not result.pyproject_updated # no model bump => no pyproject change + + written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) + assert written["data_package"]["version"] == "1.83.4" + assert written["certified_data_artifact"]["data_package"]["version"] == "1.83.4" + assert written["certified_data_artifact"]["build_id"] == ( + "policyengine-us-data-1.83.4" + ) + assert written["certified_data_artifact"]["sha256"] == expected_sha256 + assert ( + written["certified_data_artifact"]["uri"] + == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.83.4" + ) + + +def test__bump_both_updates_everything(sandbox) -> None: + hf_bytes = b"another payload" + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if "pypi.org" in url: + return _pypi_response("policyengine-us", "1.653.3") + if "huggingface.co" in url: + return _FakeHFResponse(hf_bytes) + raise 
AssertionError(url) + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + result = refresh_release_bundle( + country="us", + model_version="1.653.3", + data_version="1.83.4", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + assert result.pyproject_updated + assert result.new_model == "1.653.3" + assert result.new_data == "1.83.4" + + +def test__update_pyproject_false_leaves_pins_alone(sandbox) -> None: + def fake_urlopen(*args, **kwargs): + return _pypi_response("policyengine-us", "1.653.3") + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + result = refresh_release_bundle( + country="us", + model_version="1.653.3", + update_pyproject=False, + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + assert not result.pyproject_updated + assert "policyengine-us==1.600.0" in sandbox["pyproject_path"].read_text() + + +def test__no_matching_wheel_on_pypi_raises(sandbox) -> None: + def fake_urlopen(*args, **kwargs): + return io.BytesIO(json.dumps({"urls": []}).encode()) + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + with pytest.raises(ValueError, match="No py3-none-any wheel"): + refresh_release_bundle( + country="us", + model_version="1.999.0", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + +def test__malformed_uri_raises(tmp_path) -> None: + """If the current manifest's URI doesn't match the expected + ``hf://.../path@revision`` shape, we refuse to guess.""" + manifest_dir = tmp_path / "m" + manifest_dir.mkdir() + bad = { + "schema_version": 1, + "bundle_id": "us-4.0.0", + "country_id": "us", + "policyengine_version": "4.0.0", + "model_package": { + "name": "policyengine-us", + "version": "1.600.0", + "sha256": "c" * 64, + "wheel_url": "https://…old.whl", + }, + "data_package": { + "name": "policyengine-us-data", + "version": "1.70.0", + 
"repo_id": "policyengine/policyengine-us-data", + }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-us-data", + "version": "1.70.0", + }, + "build_id": "policyengine-us-data-1.70.0", + "dataset": "enhanced_cps_2024", + # Malformed: no @revision. + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", + "sha256": "d" * 64, + }, + "certification": { + "compatibility_basis": "matching_data_build_fingerprint", + "data_build_id": "policyengine-us-data-1.70.0", + "built_with_model_version": "1.595.0", + "certified_for_model_version": "1.600.0", + "certified_by": "test fixture", + }, + "default_dataset": "enhanced_cps_2024", + "datasets": {"enhanced_cps_2024": {"path": "enhanced_cps_2024.h5"}}, + "region_datasets": {}, + } + (manifest_dir / "us.json").write_text(json.dumps(bad)) + + with pytest.raises(ValueError, match="Cannot parse current dataset URI"): + refresh_release_bundle( + country="us", + data_version="1.83.4", + manifest_dir=manifest_dir, + pyproject_path=tmp_path / "pyproject.toml", + )