From cf68edca2791400ffe623e2e5d2b0738eb4bdd2a Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 20 Apr 2026 13:38:17 -0400 Subject: [PATCH] Bump us-data 1.73.0 -> 1.78.2; fix HF model/dataset repo detection Two changes: 1. ``bundle._hf_dataset_sha256`` now tries the HF *model* URL before falling back to *datasets*. PolicyEngine publishes country microdata under model repos (``huggingface.co/policyengine/...``) not dataset repos (``huggingface.co/datasets/...``), so the old hardcoded ``/datasets/`` prefix always 404'd for us-data. Tests unchanged (mocks match both URL shapes via the same ``huggingface.co`` substring check). 2. Applied the first live refresh using ``scripts/refresh_release_bundle.py --country us --data-version 1.78.2``: - certified_data_artifact.version: 1.73.0 -> 1.78.2 - certified_data_artifact.sha256: 18cdc668... -> 4e92b340... - data_build_id: policyengine-us-data-1.73.0 -> policyengine-us-data-1.78.2 - URI revision tail retargeted to 1.78.2 us-data 1.78.2 is the latest tag on HF (PyPI has 1.83.4 but HF tags stopped at 1.78.2 as of today). Model pin stays at 1.653.3 (latest on both PyPI and the manifest; no change needed). Snapshot tests unchanged: household calculator goes through policyengine_us.Simulation(situation=...) which synthesises a fresh sim from the situation dict, never touching ``enhanced_cps_2024.h5``, so data-version bumps don't shift household-level numbers. TRO regeneration deferred: the data_release_manifest.json isn't published at the 1.78.2 HF tag, so ``regenerate_trace_tro('us')`` raises DataReleaseManifestUnavailableError. Existing us.trace.tro.jsonld is now stale by one data-version; worth filing an issue upstream with us-data to publish the manifest on future tags. 
--- .../data/release_manifests/us.json | 12 ++--- src/policyengine/provenance/bundle.py | 47 ++++++++++++++----- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json index 0016aa8a..3e91d05e 100644 --- a/src/policyengine/data/release_manifests/us.json +++ b/src/policyengine/data/release_manifests/us.json @@ -11,22 +11,22 @@ }, "data_package": { "name": "policyengine-us-data", - "version": "1.73.0", + "version": "1.78.2", "repo_id": "policyengine/policyengine-us-data" }, "certified_data_artifact": { "data_package": { "name": "policyengine-us-data", - "version": "1.73.0" + "version": "1.78.2" }, - "build_id": "policyengine-us-data-1.73.0", + "build_id": "policyengine-us-data-1.78.2", "dataset": "enhanced_cps_2024", - "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0", - "sha256": "18cdc668d05311c32ae37364abcea89b0221c27154559667e951c7b19f5b5cbd" + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.78.2", + "sha256": "4e92b340c3ea3e200ed5d55edf752ee1a13baf787442956fb67d25242fed13b5" }, "certification": { "compatibility_basis": "matching_data_build_fingerprint", - "data_build_id": "policyengine-us-data-1.73.0", + "data_build_id": "policyengine-us-data-1.78.2", "built_with_model_version": "1.647.0", "certified_for_model_version": "1.653.3", "certified_by": "policyengine.py bundled manifest" diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py index 54e32ebd..24f80461 100644 --- a/src/policyengine/provenance/bundle.py +++ b/src/policyengine/provenance/bundle.py @@ -97,25 +97,48 @@ def _pypi_wheel_metadata(package: str, version: str) -> dict: def _hf_dataset_sha256(repo_id: str, path: str, revision: str) -> str: - """Fetch the dataset file's sha256 by streaming the resolve URL. + """Fetch the HF artifact's sha256 by streaming the resolve URL. 
- Uses the ``HUGGING_FACE_TOKEN`` env var for private repos. Streams - the file in 8 MiB chunks so memory usage stays flat. + ``repo_id`` may name either a HF *dataset* repo or a *model* repo — + PolicyEngine publishes country microdata under model repos + (historical reasons), so we try the no-prefix URL first and fall + back to ``/datasets/`` on 404. + + Uses ``HUGGING_FACE_TOKEN`` (or ``HF_TOKEN``) for private repos. + Streams 8 MiB chunks so memory usage stays flat. """ - url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}" + from urllib.error import HTTPError + headers = {"User-Agent": "policyengine.py"} token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") if token: headers["Authorization"] = f"Bearer {token}" - hasher = hashlib.sha256() - with urlopen(Request(url, headers=headers)) as f: - while True: - chunk = f.read(8 * 1024 * 1024) - if not chunk: - break - hasher.update(chunk) - return hasher.hexdigest() + candidates = [ + # Model repo (no prefix) — how policyengine-{us,uk}-data publish. + f"https://huggingface.co/{repo_id}/resolve/{revision}/{path}", + # Dataset repo — standard HF dataset path. + f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}", + ] + last_error: Optional[Exception] = None + for url in candidates: + try: + hasher = hashlib.sha256() + with urlopen(Request(url, headers=headers)) as f: + while True: + chunk = f.read(8 * 1024 * 1024) + if not chunk: + break + hasher.update(chunk) + return hasher.hexdigest() + except HTTPError as exc: + if exc.code != 404: + raise + last_error = exc + raise ValueError( + f"Could not resolve HF artifact {repo_id}/{path}@{revision} as " + f"either a model or dataset repo: {last_error}" + ) # ---------------------------------------------------------------------------