From cf68edca2791400ffe623e2e5d2b0738eb4bdd2a Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 20 Apr 2026 13:38:17 -0400 Subject: [PATCH] Bump us-data 1.73.0 -> 1.78.2; fix HF model/dataset repo detection Two changes: 1. ``bundle._hf_dataset_sha256`` now tries the HF *model* URL before falling back to *datasets*. PolicyEngine publishes country microdata under model repos (``huggingface.co/policyengine/...``) not dataset repos (``huggingface.co/datasets/...``), so the old hardcoded ``/datasets/`` prefix always 404'd for us-data. Tests unchanged (mocks match both URL shapes via the same ``huggingface.co`` substring check). 2. Applied the first live refresh using ``scripts/refresh_release_bundle.py --country us --data-version 1.78.2``: - certified_data_artifact.version: 1.73.0 -> 1.78.2 - certified_data_artifact.sha256: 18cdc668... -> 4e92b340... - data_build_id: policyengine-us-data-1.73.0 -> policyengine-us-data-1.78.2 - URI revision tail retargeted to 1.78.2 us-data 1.78.2 is the latest tag on HF (PyPI has 1.83.4 but HF tags stopped at 1.78.2 as of today). Model pin stays at 1.653.3 (latest on both PyPI and the manifest; no change needed). Snapshot tests unchanged: household calculator goes through policyengine_us.Simulation(situation=...) which synthesises a fresh sim from the situation dict, never touching ``enhanced_cps_2024.h5``, so data-version bumps don't shift household-level numbers. TRO regeneration deferred: the data_release_manifest.json isn't published at the 1.78.2 HF tag, so ``regenerate_trace_tro('us')`` raises DataReleaseManifestUnavailableError. Existing us.trace.tro.jsonld is now stale by one data-version; worth filing an issue upstream with us-data to publish the manifest on future tags. 
--- .../data/release_manifests/us.json | 12 ++--- src/policyengine/provenance/bundle.py | 47 ++++++++++++++----- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json index 0016aa8a..3e91d05e 100644 --- a/src/policyengine/data/release_manifests/us.json +++ b/src/policyengine/data/release_manifests/us.json @@ -11,22 +11,22 @@ }, "data_package": { "name": "policyengine-us-data", - "version": "1.73.0", + "version": "1.78.2", "repo_id": "policyengine/policyengine-us-data" }, "certified_data_artifact": { "data_package": { "name": "policyengine-us-data", - "version": "1.73.0" + "version": "1.78.2" }, - "build_id": "policyengine-us-data-1.73.0", + "build_id": "policyengine-us-data-1.78.2", "dataset": "enhanced_cps_2024", - "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0", - "sha256": "18cdc668d05311c32ae37364abcea89b0221c27154559667e951c7b19f5b5cbd" + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.78.2", + "sha256": "4e92b340c3ea3e200ed5d55edf752ee1a13baf787442956fb67d25242fed13b5" }, "certification": { "compatibility_basis": "matching_data_build_fingerprint", - "data_build_id": "policyengine-us-data-1.73.0", + "data_build_id": "policyengine-us-data-1.78.2", "built_with_model_version": "1.647.0", "certified_for_model_version": "1.653.3", "certified_by": "policyengine.py bundled manifest" diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py index 54e32ebd..24f80461 100644 --- a/src/policyengine/provenance/bundle.py +++ b/src/policyengine/provenance/bundle.py @@ -97,25 +97,48 @@ def _pypi_wheel_metadata(package: str, version: str) -> dict: def _hf_dataset_sha256(repo_id: str, path: str, revision: str) -> str: - """Fetch the dataset file's sha256 by streaming the resolve URL. + """Fetch the HF artifact's sha256 by streaming the resolve URL. 
- Uses the ``HUGGING_FACE_TOKEN`` env var for private repos. Streams - the file in 8 MiB chunks so memory usage stays flat. + ``repo_id`` may name either a HF *dataset* repo or a *model* repo — + PolicyEngine publishes country microdata under model repos + (historical reasons), so we try the no-prefix URL first and fall + back to ``/datasets/`` on 404. + + Uses ``HUGGING_FACE_TOKEN`` (or ``HF_TOKEN``) for private repos. + Streams 8 MiB chunks so memory usage stays flat. """ - url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}" + from urllib.error import HTTPError + headers = {"User-Agent": "policyengine.py"} token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") if token: headers["Authorization"] = f"Bearer {token}" - hasher = hashlib.sha256() - with urlopen(Request(url, headers=headers)) as f: - while True: - chunk = f.read(8 * 1024 * 1024) - if not chunk: - break - hasher.update(chunk) - return hasher.hexdigest() + candidates = [ + # Model repo (no prefix) — how policyengine-{us,uk}-data publish. + f"https://huggingface.co/{repo_id}/resolve/{revision}/{path}", + # Dataset repo — standard HF dataset path. + f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}", + ] + last_error: Optional[Exception] = None + for url in candidates: + try: + hasher = hashlib.sha256() + with urlopen(Request(url, headers=headers)) as f: + while True: + chunk = f.read(8 * 1024 * 1024) + if not chunk: + break + hasher.update(chunk) + return hasher.hexdigest() + except HTTPError as exc: + if exc.code != 404: + raise + last_error = exc + raise ValueError( + f"Could not resolve HF artifact {repo_id}/{path}@{revision} as " + f"either a model or dataset repo: {last_error}" + ) # ---------------------------------------------------------------------------