From a0c55133c41fbc74e5716ccc22d2ce41ec6dce16 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 28 Jun 2026 08:14:24 -0400 Subject: [PATCH] Use Populace geoslices in policyengine.py --- changelog.d/populace-geoslices.changed.md | 1 + docs/bundles.md | 17 +- docs/countries.md | 5 +- docs/data-publishing-design.md | 6 +- docs/engineering/skills/data-certification.md | 25 +- docs/getting-started.md | 7 +- docs/impact-analysis.md | 7 +- docs/microsim.md | 19 +- docs/regions.md | 27 +- docs/release-bundles.md | 16 +- scripts/generate_trace_tros.py | 1 + src/policyengine/core/region.py | 6 +- src/policyengine/core/scoping_strategy.py | 16 +- src/policyengine/core/simulation.py | 7 +- .../countries/us/data/__init__.py | 3 +- src/policyengine/countries/us/data/places.py | 4 +- src/policyengine/countries/us/data/states.py | 55 ++++ src/policyengine/countries/us/regions.py | 31 +- src/policyengine/data/bundle/manifest.json | 311 ------------------ .../data/bundle/uk.trace.tro.jsonld | 9 +- .../data/bundle/us.trace.tro.jsonld | 9 +- src/policyengine/provenance/certification.py | 16 + src/policyengine/provenance/manifest.py | 18 + src/policyengine/provenance/trace.py | 5 +- .../tax_benefit_models/us/datasets.py | 20 +- src/policyengine/utils/entity_utils.py | 29 +- tests/fixtures/filtering_fixtures.py | 7 +- tests/test_certify_data_release.py | 56 ++-- tests/test_entity_utils.py | 63 ++++ tests/test_release_manifests.py | 90 ++++- tests/test_us_regions.py | 78 +++-- 31 files changed, 452 insertions(+), 512 deletions(-) create mode 100644 changelog.d/populace-geoslices.changed.md diff --git a/changelog.d/populace-geoslices.changed.md b/changelog.d/populace-geoslices.changed.md new file mode 100644 index 00000000..421937e3 --- /dev/null +++ b/changelog.d/populace-geoslices.changed.md @@ -0,0 +1 @@ +Use the certified national Populace US dataset for state and congressional-district regions via row filters, and stop vendoring derived Populace area H5 slices into the PolicyEngine bundle manifest. diff --git a/docs/bundles.md b/docs/bundles.md index df28a2a0..e3099709 100644 --- a/docs/bundles.md +++ b/docs/bundles.md @@ -88,26 +88,21 @@ python scripts/bundle.py certify-data \ --manifest-uri hf://dataset/policyengine/populace-uk-private@/releases//release_manifest.json ``` -For US Populace releases, include the inherited state datasets from -`policyengine-us-data`: +For US Populace releases, certify the Populace release manifest directly: ```bash python scripts/bundle.py certify-data \ --country us \ --data-producer populace \ --manifest-uri hf://dataset/policyengine/populace-us@/releases//release_manifest.json \ - --regional-manifest-uri hf://model/policyengine/policyengine-us-data@/releases//release_manifest.json \ --model-version ``` -The regional manifest must include all 51 `states/{STATE}.h5` artifacts with -their original repo, revision, and sha256 pins. The resulting bundle manifest -certifies Populace as the US national default dataset and -`policyengine-us-data` as the state dataset source. -The regional manifest URI is recorded for traceability; the bundle does not -currently record the regional manifest's own sha256. The citable pins are the -artifact-level repo, revision, and sha256 values copied into -`data_releases.us.datasets`. +US state and congressional-district regions scope the certified national +Populace dataset with row filters. If a Populace release also publishes derived +`states/*.h5` or `districts/*.h5` area slices, the bundle certification omits +those slices from `data_releases.us.datasets`; they are not runtime dataset +dependencies. Use `python scripts/bundle.py generate` to regenerate derived bundle metadata, and `python scripts/bundle.py generate --include-tros` when TRACE TRO sidecars diff --git a/docs/countries.md b/docs/countries.md index 29864bea..b8d63bc2 100644 --- a/docs/countries.md +++ b/docs/countries.md @@ -32,12 +32,13 @@ Override in any output with `income_variable=`. | | Dataset | |---|---| -| US | Enhanced CPS 2024 (`enhanced_cps_2024.h5`) | +| US | Populace US 2024 (`populace_us_2024.h5`) | | UK | Populace UK 2023 (`populace_uk_2023.h5`) | ## State / regional breakdown -US: `state_code` and `congressional_district` on every household. +US: Populace row scoping uses `state_fips` and `congressional_district_geoid`. +`state_code` remains the human-readable input for custom households. UK: constituency code and local authority code on every household where available. diff --git a/docs/data-publishing-design.md b/docs/data-publishing-design.md index 26ad03a5..9bdb33fb 100644 --- a/docs/data-publishing-design.md +++ b/docs/data-publishing-design.md @@ -168,7 +168,7 @@ Minimal. The existing `pe.us.ensure_datasets` takes a URI today: ```python pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], + datasets=["hf://policyengine/populace-us/populace_us_2024.h5@"], years=[2026], ) ``` @@ -178,13 +178,13 @@ Under the substrate, the URI scheme gains a new prefix: ```python # The release manifest pins a specific artifact: pe.us.ensure_datasets( - datasets=["pe-data://us/enhanced_cps_2024@sha256:4e92b340…"], + datasets=["pe-data://us/populace_us_2024@sha256:4e92b340…"], years=[2026], ) # A developer asking for operational newest: pe.us.ensure_datasets( - datasets=["pe-data://us/enhanced_cps_2024@latest"], # resolves via channel + datasets=["pe-data://us/populace_us_2024@latest"], # resolves via channel years=[2026], ) ``` diff --git a/docs/engineering/skills/data-certification.md b/docs/engineering/skills/data-certification.md index 890a0fd4..4e3f24b5 100644 --- a/docs/engineering/skills/data-certification.md +++ b/docs/engineering/skills/data-certification.md @@ -26,37 +26,26 @@ python scripts/bundle.py certify-data --country uk --data-producer populace \ --manifest-uri "hf://dataset/policyengine/populace-uk-private@/releases//release_manifest.json" ``` -For US Populace certification, include the inherited state datasets from the -certified `policyengine-us-data` release manifest: +For US Populace certification, certify the Populace release manifest directly: ```bash python scripts/bundle.py certify-data --country us --data-producer populace \ --manifest-uri "hf://dataset/policyengine/populace-us@/releases//release_manifest.json" \ - --regional-manifest-uri "hf://model/policyengine/policyengine-us-data@/releases//release_manifest.json" \ --model-version "" ``` -The regional manifest is required for US while the stack still serves -state-level datasets from `policyengine-us-data`. It must contain all 51 -`states/{STATE}.h5` artifacts, including DC, and each state artifact must carry -its original `repo_id`, `revision`, and `sha256`. Certification preserves those -per-artifact pins in `data_releases.us.datasets` and writes: +US state and congressional-district regions are row filters over the certified +national Populace dataset. Certification writes: ```json "region_datasets": { - "national": {"path_template": "populace_us_2024.h5"}, - "state": {"path_template": "states/{state_code}.h5"} + "national": {"path_template": "populace_us_2024.h5"} } ``` -Do not move or rewrite state artifacts into the Populace repo. The certified -bundle is intentionally hybrid: Populace owns the national default dataset, and -`policyengine-us-data` owns the inherited state datasets until that path is -migrated. -The regional manifest URI is recorded for traceability, but the bundle does not -currently record the regional manifest's own sha256. Treat the copied -artifact-level repo, revision, and sha256 pins in `data_releases.us.datasets` -as the citable state dataset certification. +If the Populace release publishes derived `states/*.h5` or `districts/*.h5` +files for compatibility checks, certification omits them from the runtime +bundle. The national H5 is the canonical `.py` dataset. The script fetches and validates the manifest (every artifact must carry a revision pin; the certified dataset must be reachable), writes the canonical diff --git a/docs/getting-started.md b/docs/getting-started.md index bbaa3cee..6dd489b1 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -76,11 +76,8 @@ For population estimates — budget cost, distributional impact, poverty — mov ```python from policyengine.core import Simulation -datasets = pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], - years=[2026], -) -dataset = datasets["enhanced_cps_2024_2026"] +datasets = pe.us.ensure_datasets(years=[2026]) +dataset = next(iter(datasets.values())) baseline = Simulation( dataset=dataset, diff --git a/docs/impact-analysis.md b/docs/impact-analysis.md index 25db85ae..275dc14c 100644 --- a/docs/impact-analysis.md +++ b/docs/impact-analysis.md @@ -10,11 +10,8 @@ title: "Impact analysis" import policyengine as pe from policyengine.core import Simulation -datasets = pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], - years=[2026], -) -dataset = datasets["enhanced_cps_2024_2026"] +datasets = pe.us.ensure_datasets(years=[2026]) +dataset = next(iter(datasets.values())) baseline = Simulation(dataset=dataset, tax_benefit_model_version=pe.us.model) reformed = Simulation( diff --git a/docs/microsim.md b/docs/microsim.md index 5804431c..f6927d93 100644 --- a/docs/microsim.md +++ b/docs/microsim.md @@ -11,11 +11,8 @@ import policyengine as pe from policyengine.core import Simulation from policyengine.outputs import Aggregate, AggregateType -datasets = pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], - years=[2026], -) -dataset = datasets["enhanced_cps_2024_2026"] +datasets = pe.us.ensure_datasets(years=[2026]) +dataset = next(iter(datasets.values())) baseline = Simulation(dataset=dataset, tax_benefit_model_version=pe.us.model) baseline.ensure() @@ -37,15 +34,13 @@ Microdata is stored as HDF5 on Hugging Face. `ensure_datasets` downloads, caches ```python datasets = pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], years=[2024, 2026], data_folder="./data", # local cache directory ) -# Keys are "_": -dataset = datasets["enhanced_cps_2024_2026"] +dataset = datasets["populace_us_2024_2026"] ``` -The default US dataset is **Enhanced CPS 2024** — CPS ASEC fused with IRS SOI tax-return records and calibrated to IRS, CMS, SNAP, and other administrative totals. The UK default is **Populace UK 2023** — a Populace-built Family Resources Survey dataset calibrated to UK administrative targets. +The default US dataset is **Populace US 2024** — a Populace-built dataset calibrated to IRS, CMS, SNAP, Census, and other administrative totals. The UK default is **Populace UK 2023** — a Populace-built Family Resources Survey dataset calibrated to UK administrative targets. List datasets already known to the country: @@ -158,7 +153,7 @@ See [Outputs](outputs.md) for the full catalog. ## Memory and performance -A full Enhanced CPS microsimulation uses roughly 4 GB of memory and takes 15–30 seconds on a laptop. For parameter sweeps, reuse the baseline: +A full Populace US microsimulation uses roughly 4 GB of memory and takes 15-30 seconds on a laptop. For parameter sweeps, reuse the baseline: ```python baseline = Simulation(dataset=dataset, tax_benefit_model_version=pe.us.model) @@ -171,11 +166,11 @@ for amount in [0, 1_000, 2_000, 3_000]: # each iteration runs only the reform ``` -Downsampled datasets are available for testing: +Smaller custom H5 datasets can be passed explicitly for testing: ```python datasets = pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/cps_small_2024.h5"], + datasets=["/path/to/smoke_test_populace_us_2024.h5"], years=[2026], ) ``` diff --git a/docs/regions.md b/docs/regions.md index f6fda56e..f493dcf9 100644 --- a/docs/regions.md +++ b/docs/regions.md @@ -6,7 +6,9 @@ Sub-national breakdowns: state / district filters on any output, plus dedicated ## US states -`state_code` is an Enum variable on every household (values `"CA"`, `"TX"`, ...). Pass it as a filter on any `Aggregate` or `ChangeAggregate`: +For custom households, `state_code` remains the public input (values `"CA"`, +`"TX"`, ...). Pass it as a filter on any `Aggregate` or `ChangeAggregate` when +working with simulated outputs that expose that variable: ```python from policyengine.outputs import Aggregate, AggregateType @@ -21,15 +23,18 @@ ca_snap = Aggregate( ca_snap.run() ``` -Each state is a region in the US registry, with its own dataset: +Each state is a region in the US registry. State regions scope the certified +national Populace dataset by `state_fips`; they do not require separate state +H5 files: ```python states = pe.us.model.region_registry.get_by_type("state") for region in states: - print(region.code, region.label, region.dataset_path) + print(region.code, region.label, region.scoping_strategy) ``` -For state-specific datasets (rather than filtering a national one), pass `scoping_strategy=region.scoping_strategy` or resolve the dataset path directly. +For state-specific simulations, pass `scoping_strategy=region.scoping_strategy` +with the certified national dataset. ## US congressional districts @@ -44,7 +49,7 @@ for row in impacts.district_results: print(row["district_geoid"], row["avg_change"], row["winner_percentage"]) ``` -`district_geoid` is the SSDD integer (state FIPS × 100 + district number). Requires a dataset with `congressional_district_geoid` populated — the default enhanced CPS does. +`district_geoid` is the SSDD integer (state FIPS × 100 + district number; at-large districts use `00`). Congressional district regions scope the certified national Populace dataset by `congressional_district_geoid`. ## UK parliamentary constituencies @@ -136,21 +141,19 @@ baseline = Simulation( dataset=dataset, tax_benefit_model_version=pe.us.model, scoping_strategy=RowFilterStrategy( - variable_name="state_code", - variable_value="CA", + variable_name="state_fips", + variable_value=6, ), ) ``` -Regions that filter (US places, UK countries, and any region with `region.requires_filter == True`) carry their own `scoping_strategy`. Pull it off the region object rather than reconstructing it: +Regions that filter (US states and congressional districts, UK countries, and any region with `region.requires_filter == True`) carry their own `scoping_strategy`. Pull it off the region object rather than reconstructing it. US place regions are present as hierarchy metadata, but current Populace datasets do not carry `place_fips`, so they do not expose runtime scoping yet: ```python -nyc = pe.us.model.region_registry.get("place/NY-51000") +ca = pe.us.model.region_registry.get("state/ca") baseline = Simulation( dataset=dataset, tax_benefit_model_version=pe.us.model, - scoping_strategy=nyc.scoping_strategy, + scoping_strategy=ca.scoping_strategy, ) ``` - -US states and congressional districts don't use a scoping strategy — they point to dedicated state- or district-specific datasets via `region.dataset_path`. Pass that dataset to `Simulation` instead. diff --git a/docs/release-bundles.md b/docs/release-bundles.md index c16a938f..328f8e2c 100644 --- a/docs/release-bundles.md +++ b/docs/release-bundles.md @@ -96,7 +96,7 @@ It does not own final runtime bundle certification. ### Country data package -Examples: `policyengine-uk-data`, `policyengine-us-data` +Examples: `populace-data`, `policyengine-uk-data` The country data package owns: @@ -128,24 +128,18 @@ python scripts/bundle.py certify-data --country us \ --manifest-uri "hf://dataset/policyengine/populace-us@/releases//release_manifest.json" ``` -US Populace certification currently also needs the inherited state-level -datasets from the certified `policyengine-us-data` release manifest: +US Populace certification uses the Populace release manifest directly: ```bash python scripts/bundle.py certify-data --country us --data-producer populace \ --manifest-uri "hf://dataset/policyengine/populace-us@/releases//release_manifest.json" \ - --regional-manifest-uri "hf://model/policyengine/policyengine-us-data@/releases//release_manifest.json" \ --model-version "" ``` That produces one US bundle manifest entry containing the Populace national -default dataset plus all 51 `states/{STATE}.h5` artifacts pinned to -`policyengine-us-data`. The resulting `region_datasets.state` template lets -runtime code resolve a state region to the exact certified state artifact. -The regional manifest URI is retained for traceability, but the bundle does not -currently store the regional manifest's own sha256. For inherited state data, -the citable pins are the copied artifact-level repo, revision, and sha256 -values in `data_releases.us.datasets`. +default dataset. State and congressional-district regions are runtime row +filters over that national dataset, so derived `states/*.h5` or +`districts/*.h5` files are not vendored into `data_releases.us.datasets`. Earlier releases (policyengine 4.15.x–4.16.x) were certified through the `PolicyEngine/policyengine-bundles` archive flow; those bundles remain the diff --git a/scripts/generate_trace_tros.py b/scripts/generate_trace_tros.py index 0b252be7..bc7d5d26 100644 --- a/scripts/generate_trace_tros.py +++ b/scripts/generate_trace_tros.py @@ -61,6 +61,7 @@ def generated_tros() -> list[tuple[Path, bytes]]: certification=country_manifest.certification, model_wheel_sha256=country_manifest.model_package.sha256, model_wheel_url=country_manifest.model_package.wheel_url, + emission_context={"pe:emittedIn": "repository-bundle"}, ) payloads.append((tro_path, serialize_trace_tro(tro))) return payloads diff --git a/src/policyengine/core/region.py b/src/policyengine/core/region.py index 6c5faf2a..d5d177a6 100644 --- a/src/policyengine/core/region.py +++ b/src/policyengine/core/region.py @@ -2,9 +2,9 @@ This module provides the Region and RegionRegistry classes for defining geographic regions that a tax-benefit model supports. Regions can have: -1. A dedicated dataset (e.g., US states, congressional districts) +1. A dedicated dataset, usually for the national default. 2. A scoping strategy that derives the region from a parent dataset - (row filter or weight replacement) + (row filter or weight replacement). """ from typing import Literal, Optional, Union @@ -56,7 +56,7 @@ class Region(BaseModel): # Dataset configuration dataset_path: Optional[str] = Field( default=None, - description="GCS path to dedicated dataset (e.g., 'gs://policyengine-us-data/states/CA.h5')", + description="URI to a dedicated dataset when the region has one.", ) # Scoping strategy for regions that derive from a parent dataset diff --git a/src/policyengine/core/scoping_strategy.py b/src/policyengine/core/scoping_strategy.py index 2cbc8490..abd117f3 100644 --- a/src/policyengine/core/scoping_strategy.py +++ b/src/policyengine/core/scoping_strategy.py @@ -3,7 +3,8 @@ Provides two concrete strategies for scoping datasets to sub-national regions: 1. RowFilterStrategy: Filters dataset rows where a household variable matches - a specific value (e.g., UK countries by 'country' field, US places by 'place_fips'). + a specific value (e.g., US states by 'state_fips', US congressional districts + by 'congressional_district_geoid'). 2. WeightReplacementStrategy: Legacy strategy that replaces household weights from a pre-computed weight matrix resolved locally or from GCS. @@ -16,7 +17,7 @@ import numpy as np import pandas as pd from microdf import MicroDataFrame -from pydantic import BaseModel, Discriminator +from pydantic import BaseModel, Discriminator, Field from policyengine.utils.entity_utils import ( filter_dataset_by_household_variable, @@ -62,12 +63,13 @@ class RowFilterStrategy(RegionScopingStrategy): """Scoping strategy that filters dataset rows by a household variable. Used for regions where we want to keep only households matching a - specific variable value (e.g., UK countries, US places/cities). + specific variable value (e.g., US states or congressional districts). """ strategy_type: Literal["row_filter"] = "row_filter" variable_name: str variable_value: Union[str, int, float] + additional_filters: dict[str, Union[str, int, float]] = Field(default_factory=dict) def apply( self, @@ -80,11 +82,17 @@ def apply( group_entities=group_entities, variable_name=self.variable_name, variable_value=self.variable_value, + additional_filters=self.additional_filters, ) @property def cache_key(self) -> str: - return f"row_filter:{self.variable_name}={self.variable_value}" + filters = [ + (self.variable_name, self.variable_value), + *self.additional_filters.items(), + ] + filter_key = ",".join(f"{name}={value}" for name, value in sorted(filters)) + return f"row_filter:{filter_key}" class WeightReplacementStrategy(RegionScopingStrategy): diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py index e9cabc91..d9afd5ba 100644 --- a/src/policyengine/core/simulation.py +++ b/src/policyengine/core/simulation.py @@ -27,11 +27,8 @@ class Simulation(BaseModel): import policyengine as pe from policyengine.core import Simulation - datasets = pe.us.ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], - years=[2026], data_folder="./data", - ) - dataset = datasets["enhanced_cps_2024_2026"] + datasets = pe.us.ensure_datasets(years=[2026], data_folder="./data") + dataset = next(iter(datasets.values())) # Baseline baseline = Simulation(dataset=dataset, tax_benefit_model_version=pe.us.model) diff --git a/src/policyengine/countries/us/data/__init__.py b/src/policyengine/countries/us/data/__init__.py index fb833b64..5824f257 100644 --- a/src/policyengine/countries/us/data/__init__.py +++ b/src/policyengine/countries/us/data/__init__.py @@ -8,10 +8,11 @@ from .districts import AT_LARGE_STATES, DISTRICT_COUNTS from .places import US_PLACES -from .states import US_STATES +from .states import US_STATE_FIPS, US_STATES __all__ = [ "US_STATES", + "US_STATE_FIPS", "DISTRICT_COUNTS", "AT_LARGE_STATES", "US_PLACES", diff --git a/src/policyengine/countries/us/data/places.py b/src/policyengine/countries/us/data/places.py index a5fe632f..5669e26b 100644 --- a/src/policyengine/countries/us/data/places.py +++ b/src/policyengine/countries/us/data/places.py @@ -4,8 +4,8 @@ Synced with policyengine-app-v2 main branch. """ -# US cities/places with population over 100K (from Census data) -# These filter from their parent state's dataset using place_fips +# US cities/places with population over 100K (from Census data). +# These are registry metadata until Populace emits place-level row keys. # Total: 333 places US_PLACES: list[dict[str, str]] = [ { diff --git a/src/policyengine/countries/us/data/states.py b/src/policyengine/countries/us/data/states.py index 1309201b..3a5c6259 100644 --- a/src/policyengine/countries/us/data/states.py +++ b/src/policyengine/countries/us/data/states.py @@ -57,3 +57,58 @@ "WI": "Wisconsin", "WY": "Wyoming", } + + +US_STATE_FIPS: dict[str, int] = { + "AL": 1, + "AK": 2, + "AZ": 4, + "AR": 5, + "CA": 6, + "CO": 8, + "CT": 9, + "DE": 10, + "DC": 11, + "FL": 12, + "GA": 13, + "HI": 15, + "ID": 16, + "IL": 17, + "IN": 18, + "IA": 19, + "KS": 20, + "KY": 21, + "LA": 22, + "ME": 23, + "MD": 24, + "MA": 25, + "MI": 26, + "MN": 27, + "MS": 28, + "MO": 29, + "MT": 30, + "NE": 31, + "NV": 32, + "NH": 33, + "NJ": 34, + "NM": 35, + "NY": 36, + "NC": 37, + "ND": 38, + "OH": 39, + "OK": 40, + "OR": 41, + "PA": 42, + "RI": 44, + "SC": 45, + "SD": 46, + "TN": 47, + "TX": 48, + "UT": 49, + "VT": 50, + "VA": 51, + "WA": 53, + "WV": 54, + "WI": 55, + "WY": 56, +} diff --git a/src/policyengine/countries/us/regions.py b/src/policyengine/countries/us/regions.py index ca2f6b4f..266fb5c4 100644 --- a/src/policyengine/countries/us/regions.py +++ b/src/policyengine/countries/us/regions.py @@ -11,9 +11,7 @@ from policyengine.core.scoping_strategy import RowFilterStrategy from policyengine.provenance.manifest import resolve_region_dataset_path -from .data import AT_LARGE_STATES, DISTRICT_COUNTS, US_PLACES, US_STATES - -US_DATA_BUCKET = "gs://policyengine-us-data" +from .data import AT_LARGE_STATES, DISTRICT_COUNTS, US_PLACES, US_STATE_FIPS, US_STATES def _ordinal(n: int) -> str: @@ -45,7 +43,7 @@ def build_us_region_registry() -> RegionRegistry: ) ) - # 2. State regions (each has dedicated dataset) + # 2. State regions (filtered from the certified national dataset) for abbrev, name in US_STATES.items(): regions.append( Region( @@ -53,21 +51,23 @@ def build_us_region_registry() -> RegionRegistry: label=name, region_type="state", parent_code="us", - dataset_path=resolve_region_dataset_path( - "us", - "state", - state_code=abbrev, + scoping_strategy=RowFilterStrategy( + variable_name="state_fips", + variable_value=US_STATE_FIPS[abbrev], ), state_code=abbrev, state_name=name, ) ) - # 3. Congressional district regions (each has dedicated dataset) + # 3. Congressional district regions (filtered from the national dataset) for state_abbrev, count in DISTRICT_COUNTS.items(): state_name = US_STATES[state_abbrev] + state_fips = US_STATE_FIPS[state_abbrev] for i in range(1, count + 1): district_code = f"{state_abbrev}-{i:02d}" + district_number = 0 if state_abbrev in AT_LARGE_STATES else i + district_geoid = state_fips * 100 + district_number # Create appropriate label if state_abbrev in AT_LARGE_STATES: @@ -81,17 +81,16 @@ def build_us_region_registry() -> RegionRegistry: label=label, region_type="congressional_district", parent_code=f"state/{state_abbrev.lower()}", - dataset_path=resolve_region_dataset_path( - "us", - "congressional_district", - district_code=district_code, + scoping_strategy=RowFilterStrategy( + variable_name="congressional_district_geoid", + variable_value=district_geoid, ), state_code=state_abbrev, state_name=state_name, ) ) - # 4. Place/city regions (filter from state datasets) + # 4. Place/city regions (hierarchy metadata only until Populace emits place_fips) for place in US_PLACES: state_abbrev = place["state"] fips = place["fips"] @@ -103,10 +102,6 @@ def build_us_region_registry() -> RegionRegistry: parent_code=f"state/{state_abbrev.lower()}", state_code=state_abbrev, state_name=place["state_name"], - scoping_strategy=RowFilterStrategy( - variable_name="place_fips", - variable_value=fips, - ), ) ) diff --git a/src/policyengine/data/bundle/manifest.json b/src/policyengine/data/bundle/manifest.json index 6fe3c9d2..25a188f8 100644 --- a/src/policyengine/data/bundle/manifest.json +++ b/src/policyengine/data/bundle/manifest.json @@ -157,312 +157,6 @@ "revision": "populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z", "sha256": "f6360c3668f38dd9c3bfe600170fdaf1a9a631a0c2accc5ecab03adb7ddfd8d6" }, - "states/AK": { - "path": "states/AK.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "758b84f75d167ebf35b529c7344f6b6154a86252a68780624d4542c436bf3903" - }, - "states/AL": { - "path": "states/AL.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "7d5555a154e0f4f4bd7c9677cb6473fec69f260c9d1ddae2e2cd2ee2febbcd8a" - }, - "states/AR": { - "path": "states/AR.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "a18d787140d33df51f8f31fe6893892e7228ad2679b1b16011f06d46eb34aedd" - }, - "states/AZ": { - "path": "states/AZ.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "dc047de094fda3f6c61b9ab0b0f73f1e4d481b2b6d5a773abd513653514e0dfd" - }, - "states/CA": { - "path": "states/CA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "9d7f271cc1a3c84222e284b14cdd749f5364d16a598e1e3693053ce6c580e954" - }, - "states/CO": { - "path": "states/CO.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "7eae1034e34eacd91dcc439a951d8777606fb13b97093a5bf1553bf805445dc0" - }, - "states/CT": { - "path": "states/CT.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "5e5ab0aa48b4dba8fe4879829717461ea6e668637faee9eaae5d3114f45ba2f3" - }, - "states/DC": { - "path": "states/DC.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "51b8cdeecab13d45588206139ae4d106cc972b83d846a1a0f5becb6876707d93" - }, - "states/DE": { - "path": "states/DE.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "186e9b90a11413ae0459de26a799029eb74d4d7ea2bd7625031fd3e7a1f0bf98" - }, - "states/FL": { - "path": "states/FL.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "84bfcecffc59a7c892b4929bf5b4a150e122470a09ebc7643d374726d17057e9" - }, - "states/GA": { - "path": "states/GA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "f2a3edbb813a43ec17d189e86bb8b087b51f44be2de66cdcbbd933f91a21eed0" - }, - "states/HI": { - "path": "states/HI.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "5ae626eac48714bbdf18ded2485b767e37a9d842bc631035400b455e015dd218" - }, - "states/IA": { - "path": "states/IA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "6eda342bff2a146af371d1e05f9b47bbc083100a2987a20ef5effcd282017cd2" - }, - "states/ID": { - "path": "states/ID.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "bd43c3dce2c602abf71ab064dce76db0d99846ca4db20d78a78373377aab4201" - }, - "states/IL": { - "path": "states/IL.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "f3840cc2833f3c8f7975631d6bda07a9c27a81dd21c986abf6c831066d6880eb" - }, - "states/IN": { - "path": "states/IN.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "8319b0232e8883e3dc486e98888339a4eff84d22f7b12cae62e54079bc4857d3" - }, - "states/KS": { - "path": "states/KS.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "e9256e79bc0daaa6cb2965ef65768ec336fd9d8c09b449242516832a5bf245df" - }, - "states/KY": { - "path": "states/KY.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "ab6c15f006c0c2f9f66dd9925fc887bfe261a0cff690d40cb09665f4983e89ee" - }, - "states/LA": { - "path": "states/LA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "d159f6c358019a0ddeeec16072efb7720c2c5efd0aca381f8e0d48ac6aa8ecd9" - }, - "states/MA": { - "path": "states/MA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "e5b6c0fff3c638185d1c02adbca8aab74359bb92c9969a98af8046c926faf91d" - }, - "states/MD": { - "path": "states/MD.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "80b3a88c36f441dd9d5af24ce1649448eb74f11fa9c68be14da2169742babd7f" - }, - "states/ME": { - "path": "states/ME.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "683c320f238b7e99cecf660194f09c60486beb5b1bc919405b7fffd2a7d19314" - }, - "states/MI": { - "path": "states/MI.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "f1e8220bf6420402b1ae0efe0f745c5b997a3c7a009f4a4f47cd49caa3fa1208" - }, - "states/MN": { - "path": "states/MN.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "481db29ecc2128b59f4190b302dc284afdcbca95d221a559153d565cf2919a11" - }, - "states/MO": { - "path": "states/MO.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "6b61f2dc508fbf9c4b22d0d3054a263c37f31b797b482fd25122ce46a97286d5" - }, - "states/MS": { - "path": "states/MS.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "52c940f80de66ab143df2d9259c36140eb71ce93e4fede8139596be5ad6cf5ff" - }, - "states/MT": { - "path": "states/MT.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "14a208de0e3d97ad95f0ff979ff7fd594b4495f12e6cbbe432a16bb3a1e0cede" - }, - "states/NC": { - "path": "states/NC.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "c4418f5396fc2201f0100d1253f321ebad2d808c174d96b01c36230b43e31d54" - }, - "states/ND": { - "path": "states/ND.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "c5761dc56e30460b50176f088364d95d221eb305d7e9128f028e8f17eb36ca83" - }, - "states/NE": { - "path": "states/NE.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "e187ca6652931ba0efa402bd0570bd714bd789774b7d16e7215fe27c16246132" - }, - "states/NH": { - "path": "states/NH.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "695323c9c07b4cff5f49c7a76e40cb05476e3f464bef0e700f79f46cb6334326" - }, - "states/NJ": { - "path": "states/NJ.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "d1818cac60caed75ce7515715ec3b52cff886ecbd13123bfaeb909241c37ac16" - }, - "states/NM": { - "path": "states/NM.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "c4f561151751f4bf11189c1cabe7988e22bc15e5eb1a0dea059de5685e817989" - }, - "states/NV": { - "path": "states/NV.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "4db82bda4fb9c0a37304c9e0424e30341389f8addb7bbbdb01316066b8332cf9" - }, - "states/NY": { - "path": "states/NY.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "4a1476e298c552a673b88a29c11f12210511ba188291dc15001c5a71d83f437c" - }, - "states/OH": { - "path": "states/OH.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "f6984d9295d00e0e9a7b84c72a0ccb1a231598a5e9f5ff744e8326e4119cca77" - }, - "states/OK": { - "path": "states/OK.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "a92e61e7d445e9d757a0c052afcb31868621882c6b60b3cbd4ef35354bdcf04c" - }, - "states/OR": { - "path": "states/OR.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "5758866cdf930f8312f51c656b7e6ce88cd2877f81e73a27348828dc152948ce" - }, - "states/PA": { - "path": "states/PA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "c1a7056b6f424cc4c9e847c1cf20395a0ec202a2e0f6d17c46c3b42bb6b5a6d2" - }, - "states/RI": { - "path": "states/RI.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "d1e0bde70b9a760e1963d3481397920a8b73114b22b6b1493afbfc04cb9a7c09" - }, - "states/SC": { - "path": "states/SC.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "8a7bb8d513d73cbddbfab5325d02ca94e43157d0b9b14758c74e0efe58253a17" - }, - "states/SD": { - "path": "states/SD.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "ccd5c65a96ed73e1ddd840556bf6fdf10713d796a4e99f09e228490917ffebad" - }, - "states/TN": { - "path": "states/TN.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "84addf0af5364750b22734246e838fffecdf9a6ed08b1bdbd1d6ca4a76e3be3d" - }, - "states/TX": { - "path": "states/TX.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "27578258f5998b9f3dceefea04d21c5fde9fd5b8e05d80b7b5198fb5b9db924e" - }, - "states/UT": { - "path": "states/UT.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "e165266d283105549d5395409ad6bdf02c600821ce464f95bc4f0ef8d5365b38" - }, - "states/VA": { - "path": "states/VA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "fb1c3098639521ef744c926327c1ed2ed9f6dd17a3bee637667fb9fb6d5d9b53" - }, - "states/VT": { - "path": "states/VT.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "16c80a27f8a3bf2e832219e9471731a6de1f4da7bb08e770dab07def180dd8bb" - }, - "states/WA": { - "path": "states/WA.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "75723f11aebf83c9867312b422097a273e3f180e033ba2ec23cc4102bf3dc1a6" - }, - "states/WI": { - "path": "states/WI.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "fd1171e43cfcd510b4dcd675849e6d4a6afeb0057956123087a083db83425e5d" - }, - "states/WV": { - "path": "states/WV.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "8b0e718d4a91a3acceedc5a630ea6fd845dad1896b5dfea6baf26786f7b531fe" - }, - "states/WY": { - "path": "states/WY.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.115.5", - "sha256": "731d83ae37863ff994df2f953740ddb10b36910f43af01b38d36ffb55a88d4b5" - }, "us_source_coverage": { "path": "releases/populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z/us_source_coverage.json", "repo_id": "policyengine/populace-us", @@ -482,13 +176,8 @@ "region_datasets": { "national": { "path_template": "populace_us_2024.h5" - }, - "state": { - "path_template": "states/{state_code}.h5" } }, - "regional_release_manifest_uri": "https://huggingface.co/policyengine/policyengine-us-data/resolve/1.115.5/releases/1.115.5/release_manifest.json", - "regional_source_manifest_uri": "hf://model/policyengine/policyengine-us-data@1.115.5/releases/1.115.5/release_manifest.json", "release_manifest_uri": "https://huggingface.co/datasets/policyengine/populace-us/resolve/populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z/releases/populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z/release_manifest.json", "schema_version": 1, "source_manifest_uri": "hf://dataset/policyengine/populace-us@populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z/releases/populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z/release_manifest.json", diff --git a/src/policyengine/data/bundle/uk.trace.tro.jsonld b/src/policyengine/data/bundle/uk.trace.tro.jsonld index 69c85353..02ed041d 100644 --- a/src/policyengine/data/bundle/uk.trace.tro.jsonld +++ b/src/policyengine/data/bundle/uk.trace.tro.jsonld @@ -75,7 +75,7 @@ "@type": "trov:ResearchArtifact", "schema:name": "policyengine.py bundle manifest for uk", "trov:mimeType": "application/json", - "trov:sha256": "48f8a3946d96a8da9493709b3c681196aada942e313ddd4cb44840416cd13978" + "trov:sha256": "86bfa279db3a6f416e85f5e093db41de2ef7e5aa30d1d1ef76c30c9361ef0c1e" }, { "@id": "composition/1/artifact/data_release_manifest", @@ -102,7 +102,7 @@ "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "260f416ed5c24c1969eb9b532a9f9070afcbd2f69bf861cad7e9b642ca9eb2f9" + "trov:sha256": "dfaa03f0d36a88fa5c7b969c6eca30664e843391cec93a288fff1ca949241f95" } }, "trov:hasPerformance": { @@ -111,12 +111,9 @@ "pe:builtWithModelVersion": "2.89.2", "pe:certifiedBy": "policyengine.py bundle certification", "pe:certifiedForModelVersion": "2.89.2", - "pe:ciGitRef": "refs/heads/main", - "pe:ciGitSha": "44a7cac06f506d34f7565b203ca9c948f4585a43", - "pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/28281042856", "pe:compatibilityBasis": "built_with_model_package", "pe:dataBuildId": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "pe:emittedIn": "github-actions", + "pe:emittedIn": "repository-bundle", "rdfs:comment": "Certification of build populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z for policyengine-uk 2.89.2.", "trov:accessedArrangement": { "@id": "arrangement/1" diff --git a/src/policyengine/data/bundle/us.trace.tro.jsonld b/src/policyengine/data/bundle/us.trace.tro.jsonld index 422e82e5..1b2e6605 100644 --- a/src/policyengine/data/bundle/us.trace.tro.jsonld +++ b/src/policyengine/data/bundle/us.trace.tro.jsonld @@ -75,7 +75,7 @@ "@type": "trov:ResearchArtifact", "schema:name": "policyengine.py bundle manifest for us", "trov:mimeType": "application/json", - "trov:sha256": "48f8a3946d96a8da9493709b3c681196aada942e313ddd4cb44840416cd13978" + "trov:sha256": "86bfa279db3a6f416e85f5e093db41de2ef7e5aa30d1d1ef76c30c9361ef0c1e" }, { "@id": "composition/1/artifact/data_release_manifest", @@ -102,7 +102,7 @@ "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "a516bc0d555b6e648bb027a85d837def92c76b7e3c4dd2e5256bbc01ac900534" + "trov:sha256": "16263eeda537c28d24fde6f8e39b4ebe21ef72728e8021c14c728632c0e1cd3d" } }, "trov:hasPerformance": { @@ -111,12 +111,9 @@ "pe:builtWithModelVersion": "1.745.0", "pe:certifiedBy": "policyengine.py bundle certification", "pe:certifiedForModelVersion": "1.745.0", - "pe:ciGitRef": "refs/heads/main", - "pe:ciGitSha": "44a7cac06f506d34f7565b203ca9c948f4585a43", - "pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/28281042856", "pe:compatibilityBasis": "built_with_model_package", "pe:dataBuildId": "populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z", - "pe:emittedIn": "github-actions", + "pe:emittedIn": "repository-bundle", "rdfs:comment": "Certification of build populace-us-2024-cd-concept-budget-dbbdcec-512e-b2500-r2-20260627T022640Z for policyengine-us 1.745.0.", "trov:accessedArrangement": { "@id": "arrangement/1" diff --git a/src/policyengine/provenance/certification.py b/src/policyengine/provenance/certification.py index a0ade420..44bd9ef5 100644 --- a/src/policyengine/provenance/certification.py +++ b/src/policyengine/provenance/certification.py @@ -484,6 +484,16 @@ def build_country_manifest_payload( datasets: dict[str, dict] = {} for name, artifact in manifest.artifacts.items(): + if ( + country == "us" + and manifest.data_package.name == "populace-data" + and artifact.path.endswith(".h5") + and ( + artifact.path.startswith("states/") + or artifact.path.startswith("districts/") + ) + ): + continue payload: dict = { "path": artifact_path_for_country_manifest(artifact, uri_parts), "revision": artifact.revision, @@ -498,6 +508,12 @@ def build_country_manifest_payload( raw_regions = manifest.metadata.get("region_datasets") if isinstance(raw_regions, dict): for region, template in sorted(raw_regions.items()): + if ( + country == "us" + and manifest.data_package.name == "populace-data" + and region in {"state", "congressional_district"} + ): + continue if isinstance(template, dict) and "path_template" in template: region_datasets[region] = {"path_template": template["path_template"]} diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index cd3dd7bd..b9a7d616 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -473,6 +473,10 @@ def resolve_dataset_reference(country_id: str, dataset: str) -> str: or _artifact_revision(manifest.data_package), ) + local_path = Path(dataset).expanduser() + if local_path.exists(): + return str(local_path) + data_release_manifest = get_data_release_manifest(country_id) artifact = data_release_manifest.artifacts.get(dataset) if artifact is None: @@ -506,6 +510,20 @@ def resolve_managed_dataset_reference( if dataset is None: return manifest.default_dataset_uri + if dataset in manifest.datasets: + return resolve_dataset_reference(country_id, dataset) + + local_path = Path(dataset).expanduser() + if local_path.exists(): + if allow_unmanaged: + return str(local_path) + raise ValueError( + "Explicit local dataset paths bypass the policyengine.py release " + "bundle. Pass a manifest dataset name or omit `dataset` to use the " + "certified default dataset. Set `allow_unmanaged=True` only if you " + "intend to bypass bundle enforcement." + ) + if "://" in dataset: if dataset == manifest.default_dataset_uri: return dataset diff --git a/src/policyengine/provenance/trace.py b/src/policyengine/provenance/trace.py index 777b233d..be314265 100644 --- a/src/policyengine/provenance/trace.py +++ b/src/policyengine/provenance/trace.py @@ -297,6 +297,7 @@ def build_trace_tro_from_release_bundle( model_wheel_url: Optional[str] = None, fetch_pypi: Any = fetch_pypi_wheel_metadata, self_url: Optional[str] = None, + emission_context: Optional[Mapping[str, str]] = None, ) -> dict: """Build a TRACE TRO for a certified runtime bundle. @@ -449,6 +450,7 @@ def build_trace_tro_from_release_bundle( f"{country_manifest.data_package.version}" ), certification=effective_certification, + emission_context=emission_context, started_at=( data_release_manifest.build.built_at if ( @@ -500,6 +502,7 @@ def _build_bundle_performance( *, certified_data_build_id: str, certification: Optional[DataCertification], + emission_context: Optional[Mapping[str, str]], started_at: Optional[str], ended_at: Optional[str], ) -> dict[str, Any]: @@ -539,7 +542,7 @@ def _build_bundle_performance( performance["pe:dataBuildId"] = certification.data_build_id if certification.certified_by is not None: performance["pe:certifiedBy"] = certification.certified_by - performance.update(_emission_context()) + performance.update(dict(emission_context or _emission_context())) return performance diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py index 71bbb395..00250fa2 100644 --- a/src/policyengine/tax_benefit_models/us/datasets.py +++ b/src/policyengine/tax_benefit_models/us/datasets.py @@ -264,9 +264,7 @@ def _load_policyengine_core_h5(path: Path, year: int) -> USYearData: def create_datasets( - datasets: list[str] = [ - "enhanced_cps_2024", - ], + datasets: Optional[list[str]] = None, years: list[int] = [2024, 2025, 2026, 2027, 2028], data_folder: str = "./data", ) -> dict[str, PolicyEngineUSDataset]: @@ -278,10 +276,11 @@ def create_datasets( data_folder: Directory to save the dataset files Returns: - Dictionary mapping dataset keys (e.g., "enhanced_cps_2024") to PolicyEngineUSDataset objects + Dictionary mapping dataset keys (e.g., "populace_us_2024") to PolicyEngineUSDataset objects """ from policyengine_us import Microsimulation + datasets = datasets or [get_release_manifest("us").default_dataset] result = {} for dataset in datasets: resolved_dataset = resolve_dataset_reference("us", dataset) @@ -451,9 +450,7 @@ def create_datasets( def load_datasets( - datasets: list[str] = [ - "enhanced_cps_2024", - ], + datasets: Optional[list[str]] = None, years: list[int] = [2024, 2025, 2026, 2027, 2028], data_folder: str = "./data", ) -> dict[str, PolicyEngineUSDataset]: @@ -465,8 +462,9 @@ def load_datasets( data_folder: Directory containing the dataset files Returns: - Dictionary mapping dataset keys (e.g., "enhanced_cps_2024") to PolicyEngineUSDataset objects + Dictionary mapping dataset keys (e.g., "populace_us_2024") to PolicyEngineUSDataset objects """ + datasets = datasets or [get_release_manifest("us").default_dataset] result = {} for dataset in datasets: resolved_dataset = resolve_dataset_reference("us", dataset) @@ -1140,9 +1138,7 @@ def load_managed_long_term_datasets( def ensure_datasets( - datasets: list[str] = [ - "enhanced_cps_2024", - ], + datasets: Optional[list[str]] = None, years: list[int] = [2024, 2025, 2026, 2027, 2028], data_folder: str = "./data", ) -> dict[str, PolicyEngineUSDataset]: @@ -1156,6 +1152,8 @@ def ensure_datasets( Returns: Dictionary mapping dataset keys to PolicyEngineUSDataset objects """ + datasets = datasets or [get_release_manifest("us").default_dataset] + # Check if all dataset files exist all_exist = True for dataset in datasets: diff --git a/src/policyengine/utils/entity_utils.py b/src/policyengine/utils/entity_utils.py index f06b5d59..481ff62a 100644 --- a/src/policyengine/utils/entity_utils.py +++ b/src/policyengine/utils/entity_utils.py @@ -1,6 +1,7 @@ """Shared utilities for entity relationship building and dataset filtering.""" import logging +from typing import Optional, Union import pandas as pd from microdf import MicroDataFrame @@ -55,9 +56,10 @@ def filter_dataset_by_household_variable( entity_data: dict[str, MicroDataFrame], group_entities: list[str], variable_name: str, - variable_value: str, + variable_value: Union[str, int, float], + additional_filters: Optional[dict[str, Union[str, int, float]]] = None, ) -> dict[str, MicroDataFrame]: - """Filter dataset entities to only include households where a variable matches. + """Filter dataset entities to only include households matching variables. Uses an entity relationship approach: builds an explicit map of all entity relationships, filters at the household level, and keeps all @@ -69,6 +71,8 @@ def filter_dataset_by_household_variable( group_entities: List of group entity names for this country. variable_name: The household-level variable to filter on. variable_value: The value to match. Handles both str and bytes encoding. + additional_filters: Optional household-level filters that must also + match, keyed by variable name. Returns: A dict mapping entity names to filtered MicroDataFrames. @@ -84,18 +88,23 @@ def filter_dataset_by_household_variable( f"Variable '{variable_name}' not found in household data. " f"Available columns: {list(household_data.columns)}" ) + additional_filters = additional_filters or {} + for extra_variable in additional_filters: + if extra_variable not in household_data.columns: + raise ValueError( + f"Variable '{extra_variable}' not found in household data. " + f"Available columns: {list(household_data.columns)}" + ) # Build entity relationships entity_rel = build_entity_relationships(person_data, group_entities) # Find matching household IDs - hh_values = household_data[variable_name].values hh_ids = household_data["household_id"].values - if isinstance(variable_value, str): - hh_mask = (hh_values == variable_value) | (hh_values == variable_value.encode()) - else: - hh_mask = hh_values == variable_value + hh_mask = _values_match(household_data[variable_name].values, variable_value) + for extra_variable, extra_value in additional_filters.items(): + hh_mask &= _values_match(household_data[extra_variable].values, extra_value) matching_hh_ids = set(hh_ids[hh_mask]) @@ -138,3 +147,9 @@ def filter_dataset_by_household_variable( ) return result + + +def _values_match(values, expected: Union[str, int, float]): + if isinstance(expected, str): + return (values == expected) | (values == expected.encode()) + return values == expected diff --git a/tests/fixtures/filtering_fixtures.py b/tests/fixtures/filtering_fixtures.py index 4534ad97..7776b798 100644 --- a/tests/fixtures/filtering_fixtures.py +++ b/tests/fixtures/filtering_fixtures.py @@ -20,7 +20,7 @@ def create_us_test_dataset() -> PolicyEngineUSDataset: Creates a dataset with 6 persons across 3 households: - Household 1 (place_fips="44000"): 2 persons - Household 2 (place_fips="44000"): 2 persons - - Household 3 (place_fips="57000"): 2 persons + - Household 3 (place_fips="44000", state_fips=34): 2 persons """ # Person data - 6 persons across 3 households person_data = pd.DataFrame( @@ -36,13 +36,14 @@ def create_us_test_dataset() -> PolicyEngineUSDataset: } ) - # Household data - 3 households, 2 in place 44000, 1 in place 57000 + # Household data - place_fips is only unique within state. household_data = pd.DataFrame( { "household_id": [1, 2, 3], "household_weight": [1000.0, 1000.0, 1000.0], - "place_fips": ["44000", "44000", "57000"], + "place_fips": ["44000", "44000", "44000"], "state_fips": [6, 6, 34], # CA, CA, NJ + "congressional_district_geoid": [601, 602, 3401], } ) diff --git a/tests/test_certify_data_release.py b/tests/test_certify_data_release.py index c8d9119f..62a1467f 100644 --- a/tests/test_certify_data_release.py +++ b/tests/test_certify_data_release.py @@ -103,6 +103,14 @@ def _release_manifest_payload() -> dict: "sha256": "b" * 64, "size_bytes": 1, }, + "districts/CA-01": { + "kind": "microdata", + "path": "districts/CA-01.h5", + "repo_id": "policyengine/populace-us", + "revision": TAG, + "sha256": "1" * 64, + "size_bytes": 1, + }, }, } @@ -111,6 +119,7 @@ def _populace_manifest_payload_without_regions() -> dict: payload = _release_manifest_payload() payload["metadata"] = {} payload["artifacts"].pop("states/AK") + payload["artifacts"].pop("districts/CA-01") return payload @@ -179,6 +188,8 @@ def _uk_release_manifest_payload() -> dict: artifact["repo_id"] = "policyengine/populace-uk-private" artifact["revision"] = UK_TAG payload["artifacts"].pop("us_source_coverage") + payload["artifacts"].pop("states/AK") + payload["artifacts"].pop("districts/CA-01") return payload @@ -290,15 +301,11 @@ def test__given_manifest__then_pins_data_package_and_default(self): assert payload["model_package"]["sha256"] == "d" * 64 assert payload["model_package"]["wheel_url"] == "https://example/wheel" - def test__given_inherited_artifact__then_keeps_its_repo_pin(self): + def test__given_populace_area_h5_artifact__then_omits_it_from_runtime_bundle(self): payload = self._payload() - assert payload["datasets"]["states/AK"] == { - "path": "states/AK.h5", - "revision": "1.115.5", - "sha256": "b" * 64, - "repo_id": "policyengine/policyengine-us-data", - } + assert "states/AK" not in payload["datasets"] + assert "districts/CA-01" not in payload["datasets"] def test__given_release_scoped_diagnostics__then_rewrites_paths(self): payload = self._payload() @@ -315,11 +322,13 @@ def test__given_release_scoped_diagnostics__then_rewrites_paths(self): f"releases/{TAG}/us_source_coverage.json" ) - def test__given_region_templates__then_carried_through(self): + def test__given_populace_region_templates__then_only_national_is_carried_through( + self, + ): payload = self._payload() - assert payload["region_datasets"]["state"] == { - "path_template": "states/{state_code}.h5" + assert payload["region_datasets"] == { + "national": {"path_template": "populace_us_2024.h5"} } def test__given_build_provenance__then_certification_carries_it(self): @@ -338,7 +347,7 @@ def test__given_build_provenance__then_certification_carries_it(self): class TestMergeUSStateReleaseManifest: - def test__given_state_manifest__then_adds_state_region_artifacts(self): + def test__given_state_manifest__then_does_not_vendor_state_region_artifacts(self): primary = DataReleaseManifest.model_validate( _populace_manifest_payload_without_regions() ) @@ -355,15 +364,9 @@ def test__given_state_manifest__then_adds_state_region_artifacts(self): model_wheel={}, ) - assert payload["datasets"]["states/CA"] == { - "path": "states/CA.h5", - "revision": US_DATA_VERSION, - "sha256": f"{US_STATE_CODES.index('CA') + 1:064x}", - "repo_id": "policyengine/policyengine-us-data", - } + assert "states/CA" not in payload["datasets"] assert payload["region_datasets"] == { "national": {"path_template": "populace_us_2024.h5"}, - "state": {"path_template": "states/{state_code}.h5"}, } def test__given_missing_state_artifact__then_raises(self): @@ -473,11 +476,11 @@ def test__given_fetched_populace_manifest__then_updates_bundle_manifest( assert release["source_manifest_uri"] == UK_MANIFEST_URI assert written["packages"]["policyengine-uk"]["version"] == "2.89.2" assert result.data_producer == "populace" - assert result.dataset_count == 4 + assert result.dataset_count == 3 assert result.build_id == UK_TAG assert result.bundle_path == bundle_path - def test__given_us_regional_manifest__then_certifies_state_artifacts( + def test__given_us_regional_manifest__then_validates_but_does_not_vendor_state_artifacts( self, tmp_path ): bundle_path = tmp_path / "manifest.json" @@ -532,14 +535,11 @@ def test__given_us_regional_manifest__then_certifies_state_artifacts( release = written["data_releases"]["us"] assert release["source_manifest_uri"] == MANIFEST_URI assert release["regional_source_manifest_uri"] == US_DATA_MANIFEST_URI - assert release["region_datasets"]["state"] == { - "path_template": "states/{state_code}.h5" + assert release["region_datasets"] == { + "national": {"path_template": "populace_us_2024.h5"} } - assert release["datasets"]["states/CA"]["repo_id"] == ( - "policyengine/policyengine-us-data" - ) - assert release["datasets"]["states/CA"]["revision"] == US_DATA_VERSION - assert result.dataset_count == 4 + len(US_STATE_CODES) + assert "states/CA" not in release["datasets"] + assert result.dataset_count == 4 def test__given_us_without_data_producer__then_legacy_update_is_explicitly_unsupported( self, tmp_path @@ -720,3 +720,5 @@ def test__given_vendored_bundle_manifest__then_tro_sidecar_binds_it(self): ) assert bundle_manifest["trov:sha256"] == expected + performance = tro["@graph"][0]["trov:hasPerformance"] + assert performance["pe:emittedIn"] == "repository-bundle" diff --git a/tests/test_entity_utils.py b/tests/test_entity_utils.py index f8846457..cc51c6b1 100644 --- a/tests/test_entity_utils.py +++ b/tests/test_entity_utils.py @@ -158,6 +158,69 @@ def test__given_matching_value__then_returns_filtered_entities(self): assert len(pd.DataFrame(result["person"])) == 2 assert len(pd.DataFrame(result["household"])) == 1 + def test__given_us_numeric_geography__then_filters_state_and_district( + self, us_test_dataset + ): + """Given: US data with Populace geography columns + When: Filtering by state FIPS and congressional district GEOID + Then: Related entities are preserved for matching households only + """ + state_result = filter_dataset_by_household_variable( + entity_data=us_test_dataset.data.entity_data, + group_entities=[ + "household", + "tax_unit", + "spm_unit", + "family", + "marital_unit", + ], + variable_name="state_fips", + variable_value=6, + ) + district_result = filter_dataset_by_household_variable( + entity_data=us_test_dataset.data.entity_data, + group_entities=[ + "household", + "tax_unit", + "spm_unit", + "family", + "marital_unit", + ], + variable_name="congressional_district_geoid", + variable_value=601, + ) + + assert len(pd.DataFrame(state_result["household"])) == 2 + assert len(pd.DataFrame(state_result["person"])) == 4 + assert len(pd.DataFrame(district_result["household"])) == 1 + assert len(pd.DataFrame(district_result["person"])) == 2 + + def test__given_place_fips_collision__then_additional_state_filter_disambiguates( + self, us_test_dataset + ): + """Given: Two states with the same place FIPS code + When: Filtering by place FIPS plus state FIPS + Then: Only households from the requested state are included + """ + result = filter_dataset_by_household_variable( + entity_data=us_test_dataset.data.entity_data, + group_entities=[ + "household", + "tax_unit", + "spm_unit", + "family", + "marital_unit", + ], + variable_name="place_fips", + variable_value="44000", + additional_filters={"state_fips": 6}, + ) + + households = pd.DataFrame(result["household"]) + assert len(households) == 2 + assert set(households["state_fips"]) == {6} + assert len(pd.DataFrame(result["person"])) == 4 + def test__given_no_match__then_raises_value_error(self): """Given: Dataset with no matching households When: Filtering diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index df8c9fa5..627e8478 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -26,8 +26,10 @@ get_release_manifest, https_release_manifest_uri, resolve_dataset_reference, + resolve_default_datasets, resolve_local_managed_dataset_source, resolve_managed_dataset_reference, + resolve_region_dataset_path, ) PYPROJECT = Path(__file__).resolve().parents[1] / "pyproject.toml" @@ -236,6 +238,12 @@ def test__given_explicit_url__then_resolution_is_noop(self): assert resolve_dataset_reference("us", url) == url + def test__given_existing_local_path__then_resolution_is_noop(self, tmp_path): + dataset = tmp_path / "smoke_test_populace_us_2024.h5" + dataset.write_bytes(b"") + + assert resolve_dataset_reference("us", str(dataset)) == str(dataset) + def test__given_default_dataset__then_prefers_certified_data_artifact_uri(self): manifest = get_release_manifest("us") @@ -248,6 +256,49 @@ def test__given_no_dataset__then_managed_resolution_uses_certified_default(self) == get_release_manifest("us").default_dataset_uri ) + def test__given_us_manifest__then_has_no_inherited_area_artifacts(self): + manifest = get_release_manifest("us") + + assert "state" not in manifest.region_datasets + assert "congressional_district" not in manifest.region_datasets + assert resolve_region_dataset_path("us", "state", state_code="CA") is None + assert ( + resolve_region_dataset_path( + "us", + "congressional_district", + district_code="CA-01", + ) + is None + ) + assert not any( + key.startswith(("states/", "districts/")) + for key in resolve_default_datasets("us") + ) + + def test__given_us_ensure_datasets_without_dataset__then_uses_certified_default( + self, + ): + us_datasets = importlib.import_module( + "policyengine.tax_benefit_models.us.datasets" + ) + + with ( + patch.object(us_datasets.Path, "exists", return_value=False), + patch.object( + us_datasets, + "create_datasets", + return_value={"populace_us_2024_2026": object()}, + ) as create_datasets, + ): + result = us_datasets.ensure_datasets(years=[2026]) + + assert list(result) == ["populace_us_2024_2026"] + create_datasets.assert_called_once_with( + datasets=["populace_us_2024"], + years=[2026], + data_folder="./data", + ) + def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self): dataset = "hf://policyengine/policyengine-us-data/cps_2023.h5@1.73.0" @@ -267,6 +318,41 @@ def test__given_explicit_uri__then_managed_resolution_requires_opt_in(self): == dataset ) + def test__given_local_path__then_managed_resolution_requires_opt_in(self, tmp_path): + dataset = tmp_path / "smoke_test_populace_us_2024.h5" + dataset.write_bytes(b"") + + try: + resolve_managed_dataset_reference("us", str(dataset)) + except ValueError as error: + assert "bypass the policyengine.py release bundle" in str(error) + else: + raise AssertionError("Expected explicit local path to be rejected") + + assert resolve_managed_dataset_reference( + "us", + str(dataset), + allow_unmanaged=True, + ) == str(dataset) + + def test__given_local_file_named_like_logical_dataset__then_manifest_wins( + self, + tmp_path, + monkeypatch, + ): + dataset = "populace_us_2024" + (tmp_path / dataset).write_bytes(b"") + monkeypatch.chdir(tmp_path) + + assert ( + resolve_dataset_reference("us", dataset) + == get_release_manifest("us").default_dataset_uri + ) + assert ( + resolve_managed_dataset_reference("us", dataset) + == get_release_manifest("us").default_dataset_uri + ) + def test__given_versioned_dataset_url__then_logical_name_drops_version(self): dataset = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" @@ -773,7 +859,7 @@ def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bu with patch.object( us_model, "materialize_dataset_source", - return_value="/tmp/enhanced_cps_2024.h5", + return_value="/tmp/populace_us_2024.h5", ): microsim = us_model.managed_microsimulation() @@ -788,7 +874,7 @@ def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bu == us_model.us_latest.default_dataset_uri ) dataset_source = microsim.policyengine_bundle["runtime_dataset_source"] - assert dataset_source == "/tmp/enhanced_cps_2024.h5" + assert dataset_source == "/tmp/populace_us_2024.h5" def test__given_us_unmanaged_dataset_uri__then_source_is_not_rewritten(self): dataset = "hf://policyengine/policyengine-us-data/cps_2023.h5@1.73.0" diff --git a/tests/test_us_regions.py b/tests/test_us_regions.py index 37cd9a0a..95be4cf4 100644 --- a/tests/test_us_regions.py +++ b/tests/test_us_regions.py @@ -1,6 +1,6 @@ """Tests for US region definitions.""" -from policyengine.countries.us.data import DISTRICT_COUNTS, US_STATES +from policyengine.countries.us.data import DISTRICT_COUNTS, US_STATE_FIPS, US_STATES from policyengine.countries.us.regions import ( build_us_region_registry, us_region_registry, @@ -135,12 +135,13 @@ def test__given_california_region__then_has_correct_format(self): assert ca.label == "California" assert ca.region_type == "state" assert ca.parent_code == "us" - assert ca.dataset_path == ( - "hf://policyengine/policyengine-us-data/states/CA.h5@1.115.5" - ) + assert ca.dataset_path is None + assert ca.requires_filter + assert ca.scoping_strategy is not None + assert ca.scoping_strategy.variable_name == "state_fips" + assert ca.scoping_strategy.variable_value == US_STATE_FIPS["CA"] assert ca.state_code == "CA" assert ca.state_name == "California" - assert not ca.requires_filter def test__given_us_registry__then_has_436_congressional_districts(self): """Given: US region registry @@ -168,8 +169,25 @@ def test__given_ca_first_district__then_has_correct_format(self): assert ca01.region_type == "congressional_district" assert ca01.parent_code == "state/ca" assert ca01.dataset_path is None + assert ca01.requires_filter + assert ca01.scoping_strategy is not None + assert ca01.scoping_strategy.variable_name == "congressional_district_geoid" + assert ca01.scoping_strategy.variable_value == US_STATE_FIPS["CA"] * 100 + 1 assert ca01.state_code == "CA" - assert not ca01.requires_filter + + def test__given_at_large_district__then_filter_uses_zero_district_geoid(self): + """Given: an at-large congressional district + When: Checking its row filter + Then: It uses the Populace/Census SS00 district GEOID convention + """ + # When + ak_al = us_region_registry.get("congressional_district/AK-01") + + # Then + assert ak_al is not None + assert ak_al.scoping_strategy is not None + assert ak_al.scoping_strategy.variable_name == "congressional_district_geoid" + assert ak_al.scoping_strategy.variable_value == US_STATE_FIPS["AK"] * 100 def test__given_dc_district__then_is_at_large(self): """Given: DC's congressional district @@ -183,6 +201,8 @@ def test__given_dc_district__then_is_at_large(self): assert dc_al is not None assert dc_al.label == "District of Columbia's at-large congressional district" assert dc_al.parent_code == "state/dc" + assert dc_al.scoping_strategy is not None + assert dc_al.scoping_strategy.variable_value == US_STATE_FIPS["DC"] * 100 def test__given_us_registry__then_has_places(self): """Given: US region registry @@ -198,7 +218,7 @@ def test__given_us_registry__then_has_places(self): def test__given_los_angeles_region__then_has_correct_format(self): """Given: Los Angeles place region When: Checking its properties - Then: Requires filter with place_fips field + Then: Exists as hierarchy metadata but does not claim runtime scoping """ # When la = us_region_registry.get("place/CA-44000") @@ -208,10 +228,8 @@ def test__given_los_angeles_region__then_has_correct_format(self): assert "Los Angeles" in la.label assert la.region_type == "place" assert la.parent_code == "state/ca" - assert la.requires_filter - assert la.scoping_strategy is not None - assert la.scoping_strategy.variable_name == "place_fips" - assert la.scoping_strategy.variable_value == "44000" + assert not la.requires_filter + assert la.scoping_strategy is None assert la.state_code == "CA" assert la.dataset_path is None # No dedicated dataset @@ -233,27 +251,24 @@ def test__given_california__then_children_include_districts_and_places( assert len(district_children) == DISTRICT_COUNTS["CA"] assert len(place_children) >= 10 # CA has many large cities - def test__given_us_registry__then_dataset_regions_are_national_and_states(self): + def test__given_us_registry__then_dataset_regions_are_national_only(self): """Given: US region registry When: Getting regions with datasets - Then: Current certified bundle has national and state datasets + Then: Only the national canonical Populace dataset is dedicated """ # When dataset_regions = us_region_registry.get_dataset_regions() # Then - assert len(dataset_regions) == 52 - assert {region.region_type for region in dataset_regions} == { - "national", - "state", - } + assert len(dataset_regions) == 1 + assert dataset_regions[0].region_type == "national" - def test__given_certified_state_template__then_states_have_dataset_paths( + def test__given_certified_state_template__then_state_filters_national_dataset( self, monkeypatch ): """Given: US bundle manifest with a certified state template When: Building the region registry - Then: State regions resolve to pinned state dataset artifacts + Then: State regions still filter the national certified dataset """ manifest = CountryReleaseManifest.model_validate( { @@ -298,20 +313,31 @@ def test__given_certified_state_template__then_states_have_dataset_paths( ca = registry.get("state/ca") assert ca is not None - assert ca.dataset_path == ( - "hf://policyengine/policyengine-us-data/states/CA.h5@1.115.5" - ) + assert ca.dataset_path is None + assert ca.requires_filter + assert ca.scoping_strategy is not None + assert ca.scoping_strategy.variable_name == "state_fips" + assert ca.scoping_strategy.variable_value == US_STATE_FIPS["CA"] - def test__given_us_registry__then_filter_regions_are_all_places(self): + def test__given_us_registry__then_filter_regions_include_states_and_districts(self): """Given: US region registry When: Getting regions requiring filter - Then: All are place regions + Then: State and congressional district regions filter the national data """ # When filter_regions = us_region_registry.get_filter_regions() # Then - assert all(r.region_type == "place" for r in filter_regions) + region_types = {r.region_type for r in filter_regions} + assert {"state", "congressional_district"} <= region_types + assert "place" not in region_types + assert len([r for r in filter_regions if r.region_type == "state"]) == 51 + assert ( + len( + [r for r in filter_regions if r.region_type == "congressional_district"] + ) + == 436 + ) def test__given_us_registry__then_total_exceeds_588(self): """Given: US region registry