From 9d61da5ee7a217b62511502c61ea357c3aebcfc7 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:23:51 -0400 Subject: [PATCH 1/4] Support Python 3.9-3.12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lower `requires-python` to `>=3.9`. Previously only 3.13/3.14 were supported. Changes fall into four buckets: 1. PEP 604 `X | Y` union annotations (evaluated at runtime by pydantic at class-creation time) rewritten as `Optional[X]` / `Union[X, Y]` across ~34 files (~200 sites). A rewriter script handled most mechanically; two edge cases with non-`None` unions (`RowFilterStrategy | WeightReplacementStrategy` in `scoping_strategy.py`, `USRegionType | UKRegionType` in `region.py`) were fixed by hand. 2. `from enum import StrEnum` (3.11+) in four output modules (`aggregate`, `change_aggregate`, `inequality`, `poverty`) replaced with `class Foo(str, Enum)`; two further sites used `SomeEnum | str` unions, converted to `Union[...]`. 3. PEP 695 generic class syntax (3.12+) in `core/cache.py` (`class LRUCache[T]`) and `core/output.py` (`class OutputCollection[T: "Output"]`) rewritten using `typing.TypeVar` + `typing.Generic[T]`. 4. `from datetime import UTC` (3.11+) in `tax_benefit_model_version.py` replaced with `timezone.utc`. Tooling: - `[tool.ruff] target-version = "py39"` (was py313) - `[tool.ruff.lint] ignore` now explicitly excludes UP006, UP007, UP035, UP045 (ruff 0.15 still suggests `X | Y`/`X | None`/`list[T]` rewrites even with py39 target; these would all be wrong on 3.9) - `[tool.mypy] python_version = "3.9"` (was 3.13) CI: new `Python-Compat` matrix job installs the package (no extras) and smoke-imports across 3.9-3.14. Existing `Test` job remains on 3.13/3.14 because the `[dev]` extras pin `policyengine-us==1.602.0` and `policyengine-uk==2.74.0`, which themselves require >=3.11 — those version pins should be bumped in a follow-up once us/uk ship their 3.9 support. 
Dependency chain: full 3.9 support requires PolicyEngine/policyengine-core#454, PolicyEngine/policyengine-us#8035, and PolicyEngine/policyengine-uk#1625 to land first. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pr_code_changes.yaml | 20 ++++++++ changelog.d/support-py39.added.md | 1 + pyproject.toml | 21 +++++++-- src/policyengine/core/cache.py | 7 ++- src/policyengine/core/dataset.py | 13 ++--- src/policyengine/core/dynamic.py | 5 +- src/policyengine/core/output.py | 6 +-- src/policyengine/core/parameter.py | 12 ++--- src/policyengine/core/parameter_node.py | 6 +-- src/policyengine/core/parameter_value.py | 8 ++-- src/policyengine/core/policy.py | 5 +- src/policyengine/core/region.py | 22 ++++----- src/policyengine/core/release_manifest.py | 47 ++++++++++--------- src/policyengine/core/scoping_strategy.py | 6 +-- src/policyengine/core/simulation.py | 15 +++--- src/policyengine/core/tax_benefit_model.py | 4 +- .../core/tax_benefit_model_version.py | 26 +++++----- src/policyengine/core/trace_tro.py | 9 ++-- src/policyengine/core/variable.py | 14 +++--- src/policyengine/outputs/aggregate.py | 28 ++++++----- src/policyengine/outputs/change_aggregate.py | 40 ++++++++-------- .../outputs/congressional_district_impact.py | 4 +- .../outputs/constituency_impact.py | 4 +- src/policyengine/outputs/decile_impact.py | 36 +++++++------- src/policyengine/outputs/inequality.py | 44 ++++++++--------- .../outputs/intra_decile_impact.py | 16 ++++--- .../outputs/local_authority_impact.py | 4 +- src/policyengine/outputs/poverty.py | 42 ++++++++--------- .../tax_benefit_models/uk/analysis.py | 4 +- .../tax_benefit_models/uk/datasets.py | 3 +- .../tax_benefit_models/uk/model.py | 10 ++-- .../tax_benefit_models/uk/outputs.py | 16 ++++--- .../tax_benefit_models/us/analysis.py | 6 +-- .../tax_benefit_models/us/datasets.py | 3 +- .../tax_benefit_models/us/model.py | 10 ++-- .../tax_benefit_models/us/outputs.py | 16 ++++--- src/policyengine/utils/parametric_reforms.py 
| 14 +++--- src/policyengine/utils/plotting.py | 12 +++-- tests/fixtures/parameter_labels_fixtures.py | 20 ++++---- tests/fixtures/parametric_reforms_fixtures.py | 3 +- tests/fixtures/variable_label_fixtures.py | 5 +- tests/test_intra_decile_impact.py | 3 +- 42 files changed, 328 insertions(+), 262 deletions(-) create mode 100644 changelog.d/support-py39.added.md diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index bdffb04a..8850174e 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -44,6 +44,26 @@ jobs: run: uv pip install -e .[dev] --system - name: Run mypy (informational) run: mypy src/policyengine || echo "::warning::mypy found errors (non-blocking until codebase is clean)" + Python-Compat: + name: Install + smoke-import (py${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + allow-prereleases: true + - name: Install uv + uses: astral-sh/setup-uv@v5 + - name: Install package (no extras — skip country models to isolate the wrapper) + run: uv pip install --system . + - name: Smoke-import core modules + run: python -c "import policyengine; from policyengine.core import Dataset, Policy, Simulation; from policyengine.outputs import aggregate, poverty, inequality; print('import OK')" Test: runs-on: macos-latest strategy: diff --git a/changelog.d/support-py39.added.md b/changelog.d/support-py39.added.md new file mode 100644 index 00000000..edb247fc --- /dev/null +++ b/changelog.d/support-py39.added.md @@ -0,0 +1 @@ +Support Python 3.9–3.12 (in addition to 3.13–3.14). 
PEP 604 `X | Y` annotations (evaluated at runtime by pydantic) are rewritten as `Optional[X]` / `Union[X, Y]`; `StrEnum` (3.11+) is replaced with `class Foo(str, Enum)`; PEP 695 generic class syntax in `core/cache.py` and `core/output.py` is rewritten using `typing.TypeVar` + `typing.Generic`. Ruff and mypy target versions dropped to py39. Requires `policyengine-us==1.602.0+` and `policyengine-uk==2.74.0+` from the `[us]`/`[uk]`/`[dev]` extras to also support 3.9/3.10. diff --git a/pyproject.toml b/pyproject.toml index 94b31711..bc5034b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,12 @@ authors = [ {name = "PolicyEngine", email = "hello@policyengine.org"}, ] license = {file = "LICENSE"} -requires-python = ">=3.13" +requires-python = ">=3.9" classifiers = [ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", ] @@ -73,7 +77,7 @@ filterwarnings = [ [tool.ruff] line-length = 88 -target-version = "py313" +target-version = "py39" extend-exclude = ["*.ipynb"] [tool.ruff.lint] @@ -84,7 +88,16 @@ select = [ "W", # pycodestyle warnings "UP", # pyupgrade ] -ignore = ["E501"] # Ignore line length errors +ignore = [ + "E501", # Ignore line length errors + # The following pyupgrade rules would require Python 3.10+, but we + # support 3.9+. Re-enable these once the 3.9 floor is dropped. 
+ "UP006", # prefer `list` over `List` — OK on 3.9 at runtime, but for + # pydantic models we use typing.List for consistency + "UP007", # prefer `X | Y` over `Union[X, Y]` — needs 3.10+ + "UP035", # `typing.List` is deprecated — same as UP006 + "UP045", # prefer `X | None` over `Optional[X]` — needs 3.10+ +] [tool.ruff.format] quote-style = "double" @@ -93,7 +106,7 @@ skip-magic-trailing-comma = false line-ending = "auto" [tool.mypy] -python_version = "3.13" +python_version = "3.9" warn_return_any = true warn_unused_configs = true ignore_missing_imports = true diff --git a/src/policyengine/core/cache.py b/src/policyengine/core/cache.py index 44de06e3..410301e4 100644 --- a/src/policyengine/core/cache.py +++ b/src/policyengine/core/cache.py @@ -1,5 +1,6 @@ import logging from collections import OrderedDict +from typing import Generic, Optional, TypeVar import psutil @@ -8,15 +9,17 @@ _MEMORY_THRESHOLDS_GB = [8, 16, 32] _warned_thresholds: set[int] = set() +T = TypeVar("T") -class LRUCache[T]: + +class LRUCache(Generic[T]): """Least-recently-used cache with configurable size limit and memory monitoring.""" def __init__(self, max_size: int = 100): self._max_size = max_size self._cache: OrderedDict[str, T] = OrderedDict() - def get(self, key: str) -> T | None: + def get(self, key: str) -> Optional[T]: """Get item from cache, marking it as recently used.""" if key not in self._cache: return None diff --git a/src/policyengine/core/dataset.py b/src/policyengine/core/dataset.py index f10a5d22..27f51d16 100644 --- a/src/policyengine/core/dataset.py +++ b/src/policyengine/core/dataset.py @@ -1,3 +1,4 @@ +from typing import Optional from uuid import uuid4 import numpy as np @@ -76,7 +77,7 @@ class YearData(BaseModel): household: pd.DataFrame class MyDataset(Dataset): - data: YearData | None = None + data: Optional[YearData] = None """ model_config = ConfigDict(arbitrary_types_allowed=True) @@ -84,13 +85,13 @@ class MyDataset(Dataset): id: str = Field(default_factory=lambda: 
str(uuid4())) name: str description: str - dataset_version: DatasetVersion | None = None + dataset_version: Optional[DatasetVersion] = None filepath: str is_output_dataset: bool = False - tax_benefit_model: TaxBenefitModel | None = None + tax_benefit_model: Optional[TaxBenefitModel] = None year: int - data: BaseModel | None = None + data: Optional[BaseModel] = None def map_to_entity( @@ -98,8 +99,8 @@ def map_to_entity( source_entity: str, target_entity: str, person_entity: str = "person", - columns: list[str] | None = None, - values: np.ndarray | None = None, + columns: Optional[list[str]] = None, + values: Optional[np.ndarray] = None, how: str = "sum", ) -> MicroDataFrame: """Map data from source entity to target entity using join keys. diff --git a/src/policyengine/core/dynamic.py b/src/policyengine/core/dynamic.py index 81ef62b7..d707b9b2 100644 --- a/src/policyengine/core/dynamic.py +++ b/src/policyengine/core/dynamic.py @@ -1,5 +1,6 @@ from collections.abc import Callable from datetime import datetime +from typing import Optional from uuid import uuid4 from pydantic import BaseModel, Field @@ -10,9 +11,9 @@ class Dynamic(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str - description: str | None = None + description: Optional[str] = None parameter_values: list[ParameterValue] = [] - simulation_modifier: Callable | None = None + simulation_modifier: Optional[Callable] = None created_at: datetime = Field(default_factory=datetime.now) updated_at: datetime = Field(default_factory=datetime.now) diff --git a/src/policyengine/core/output.py b/src/policyengine/core/output.py index a4bf969a..e71634ab 100644 --- a/src/policyengine/core/output.py +++ b/src/policyengine/core/output.py @@ -1,4 +1,4 @@ -from typing import TypeVar +from typing import Generic, List, TypeVar import pandas as pd from pydantic import BaseModel, ConfigDict @@ -17,10 +17,10 @@ def run(self): raise NotImplementedError("Subclasses must implement run()") -class 
OutputCollection[T: "Output"](BaseModel): +class OutputCollection(BaseModel, Generic[T]): """Container for a collection of outputs with their DataFrame representation.""" model_config = ConfigDict(arbitrary_types_allowed=True) - outputs: list[T] + outputs: List[T] dataframe: pd.DataFrame diff --git a/src/policyengine/core/parameter.py b/src/policyengine/core/parameter.py index cd5a2c88..49f2b282 100644 --- a/src/policyengine/core/parameter.py +++ b/src/policyengine/core/parameter.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional from uuid import uuid4 from pydantic import BaseModel, Field, PrivateAttr @@ -15,15 +15,15 @@ class Parameter(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str - label: str | None = None - description: str | None = None - data_type: type | None = None + label: Optional[str] = None + description: Optional[str] = None + data_type: Optional[type] = None tax_benefit_model_version: TaxBenefitModelVersion - unit: str | None = None + unit: Optional[str] = None # Lazy loading: store core param ref, build values on demand _core_param: Any = PrivateAttr(default=None) - _parameter_values: list["ParameterValue"] | None = PrivateAttr(default=None) + _parameter_values: Optional[list["ParameterValue"]] = PrivateAttr(default=None) def __init__(self, _core_param: Any = None, **data): super().__init__(**data) diff --git a/src/policyengine/core/parameter_node.py b/src/policyengine/core/parameter_node.py index 9a3e25a0..54d384a5 100644 --- a/src/policyengine/core/parameter_node.py +++ b/src/policyengine/core/parameter_node.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional from uuid import uuid4 from pydantic import BaseModel, Field @@ -22,8 +22,8 @@ class ParameterNode(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str = Field(description="Full path of the node (e.g., 'gov.hmrc')") - label: str | None = 
Field( + label: Optional[str] = Field( default=None, description="Human-readable label (e.g., 'HMRC')" ) - description: str | None = Field(default=None, description="Node description") + description: Optional[str] = Field(default=None, description="Node description") tax_benefit_model_version: "TaxBenefitModelVersion" diff --git a/src/policyengine/core/parameter_value.py b/src/policyengine/core/parameter_value.py index 073cd74b..a51ffeb0 100644 --- a/src/policyengine/core/parameter_value.py +++ b/src/policyengine/core/parameter_value.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional, Union from uuid import uuid4 from pydantic import BaseModel, Field @@ -10,7 +10,7 @@ class ParameterValue(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) - parameter: "Parameter | None" = None - value: float | int | str | bool | list | None = None + parameter: "Optional[Parameter]" = None + value: Optional[Union[float, int, str, bool, list]] = None start_date: datetime - end_date: datetime | None = None + end_date: Optional[datetime] = None diff --git a/src/policyengine/core/policy.py b/src/policyengine/core/policy.py index bfb4ca9e..3860a817 100644 --- a/src/policyengine/core/policy.py +++ b/src/policyengine/core/policy.py @@ -1,5 +1,6 @@ from collections.abc import Callable from datetime import datetime +from typing import Optional from uuid import uuid4 from pydantic import BaseModel, Field @@ -10,9 +11,9 @@ class Policy(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str - description: str | None = None + description: Optional[str] = None parameter_values: list[ParameterValue] = [] - simulation_modifier: Callable | None = None + simulation_modifier: Optional[Callable] = None created_at: datetime = Field(default_factory=datetime.now) updated_at: datetime = Field(default_factory=datetime.now) diff --git a/src/policyengine/core/region.py 
b/src/policyengine/core/region.py index ebf1f93a..7ff55a64 100644 --- a/src/policyengine/core/region.py +++ b/src/policyengine/core/region.py @@ -6,7 +6,7 @@ 2. Filter from a parent region's dataset (e.g., US places/cities, UK countries) """ -from typing import Literal +from typing import Literal, Optional, Union from pydantic import BaseModel, Field, PrivateAttr @@ -15,7 +15,7 @@ # Region type literals for US and UK USRegionType = Literal["national", "state", "congressional_district", "place"] UKRegionType = Literal["national", "country", "constituency", "local_authority"] -RegionType = USRegionType | UKRegionType +RegionType = Union[USRegionType, UKRegionType] class Region(BaseModel): @@ -46,19 +46,19 @@ class Region(BaseModel): ) # Hierarchy - parent_code: str | None = Field( + parent_code: Optional[str] = Field( default=None, description="Code of parent region (e.g., 'us' for states, 'state/nj' for places in New Jersey)", ) # Dataset configuration - dataset_path: str | None = Field( + dataset_path: Optional[str] = Field( default=None, description="GCS path to dedicated dataset (e.g., 'gs://policyengine-us-data/states/CA.h5')", ) # Scoping strategy (preferred over legacy filter fields) - scoping_strategy: ScopingStrategy | None = Field( + scoping_strategy: Optional[ScopingStrategy] = Field( default=None, description="Strategy for scoping dataset to this region (row filtering or weight replacement)", ) @@ -68,20 +68,20 @@ class Region(BaseModel): default=False, description="True if this region filters from a parent dataset rather than having its own", ) - filter_field: str | None = Field( + filter_field: Optional[str] = Field( default=None, description="Dataset field to filter on (e.g., 'place_fips', 'country')", ) - filter_value: str | None = Field( + filter_value: Optional[str] = Field( default=None, description="Value to match when filtering (defaults to code suffix if not set)", ) # Metadata (primarily for US congressional districts) - state_code: str | None 
= Field( + state_code: Optional[str] = Field( default=None, description="Two-letter state code (e.g., 'CA', 'NJ')" ) - state_name: str | None = Field( + state_name: Optional[str] = Field( default=None, description="Full state name (e.g., 'California', 'New Jersey')", ) @@ -137,7 +137,7 @@ def add_region(self, region: Region) -> None: self._by_type[region.region_type] = [] self._by_type[region.region_type].append(region) - def get(self, code: str) -> Region | None: + def get(self, code: str) -> Optional[Region]: """Get a region by its code. Args: @@ -159,7 +159,7 @@ def get_by_type(self, region_type: str) -> list[Region]: """ return self._by_type.get(region_type, []) - def get_national(self) -> Region | None: + def get_national(self) -> Optional[Region]: """Get the national-level region. Returns: diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/core/release_manifest.py index 3106998e..90a09f32 100644 --- a/src/policyengine/core/release_manifest.py +++ b/src/policyengine/core/release_manifest.py @@ -3,6 +3,7 @@ from importlib import import_module from importlib.resources import files from pathlib import Path +from typing import Optional import requests from pydantic import BaseModel, Field @@ -35,14 +36,14 @@ class CompatibleModelPackage(BaseModel): class BuiltWithModelPackage(PackageVersion): - git_sha: str | None = None - data_build_fingerprint: str | None = None + git_sha: Optional[str] = None + data_build_fingerprint: Optional[str] = None class DataBuildInfo(BaseModel): - build_id: str | None = None - built_at: str | None = None - built_with_model_package: BuiltWithModelPackage | None = None + build_id: Optional[str] = None + built_at: Optional[str] = None + built_with_model_package: Optional[BuiltWithModelPackage] = None class ArtifactPathReference(BaseModel): @@ -61,8 +62,8 @@ class DataReleaseArtifact(BaseModel): path: str repo_id: str revision: str - sha256: str | None = None - size_bytes: int | None = None + sha256: Optional[str] = 
None + size_bytes: Optional[int] = None @property def uri(self) -> str: @@ -80,32 +81,32 @@ class DataReleaseManifest(BaseModel): default_factory=list ) default_datasets: dict[str, str] = Field(default_factory=dict) - build: DataBuildInfo | None = None + build: Optional[DataBuildInfo] = None artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict) class DataCertification(BaseModel): compatibility_basis: str certified_for_model_version: str - data_build_id: str | None = None - built_with_model_version: str | None = None - built_with_model_git_sha: str | None = None - data_build_fingerprint: str | None = None - certified_by: str | None = None + data_build_id: Optional[str] = None + built_with_model_version: Optional[str] = None + built_with_model_git_sha: Optional[str] = None + data_build_fingerprint: Optional[str] = None + certified_by: Optional[str] = None class CertifiedDataArtifact(BaseModel): - data_package: PackageVersion | None = None + data_package: Optional[PackageVersion] = None dataset: str uri: str - sha256: str | None = None - build_id: str | None = None + sha256: Optional[str] = None + build_id: Optional[str] = None class CountryReleaseManifest(BaseModel): schema_version: int = 1 - bundle_id: str | None = None - published_at: str | None = None + bundle_id: Optional[str] = None + published_at: Optional[str] = None country_id: str policyengine_version: str model_package: PackageVersion @@ -113,8 +114,8 @@ class CountryReleaseManifest(BaseModel): default_dataset: str datasets: dict[str, ArtifactPathReference] = Field(default_factory=dict) region_datasets: dict[str, ArtifactPathTemplate] = Field(default_factory=dict) - certified_data_artifact: CertifiedDataArtifact | None = None - certification: DataCertification | None = None + certified_data_artifact: Optional[CertifiedDataArtifact] = None + certification: Optional[DataCertification] = None @property def default_dataset_uri(self) -> str: @@ -186,7 +187,7 @@ def _specifier_matches(version: 
str, specifier: str) -> bool: def certify_data_release_compatibility( country_id: str, runtime_model_version: str, - runtime_data_build_fingerprint: str | None = None, + runtime_data_build_fingerprint: Optional[str] = None, ) -> DataCertification: country_manifest = get_release_manifest(country_id) try: @@ -322,7 +323,7 @@ def resolve_dataset_reference(country_id: str, dataset: str) -> str: def resolve_managed_dataset_reference( country_id: str, - dataset: str | None = None, + dataset: Optional[str] = None, *, allow_unmanaged: bool = False, ) -> str: @@ -414,7 +415,7 @@ def resolve_region_dataset_path( country_id: str, region_type: str, **kwargs: str, -) -> str | None: +) -> Optional[str]: manifest = get_release_manifest(country_id) template = manifest.region_datasets.get(region_type) if template is None: diff --git a/src/policyengine/core/scoping_strategy.py b/src/policyengine/core/scoping_strategy.py index 75449a6d..7d9b5126 100644 --- a/src/policyengine/core/scoping_strategy.py +++ b/src/policyengine/core/scoping_strategy.py @@ -12,7 +12,7 @@ import logging from abc import abstractmethod from pathlib import Path -from typing import Annotated, Literal +from typing import Annotated, Literal, Optional, Union import h5py import numpy as np @@ -201,7 +201,7 @@ def _find_region_index(lookup_df: pd.DataFrame, region_code: str) -> int: ) @staticmethod - def _find_household_id_column(df: pd.DataFrame, entity_name: str) -> str | None: + def _find_household_id_column(df: pd.DataFrame, entity_name: str) -> Optional[str]: """Find the column linking an entity to its household.""" candidates = [ "person_household_id", @@ -219,6 +219,6 @@ def cache_key(self) -> str: ScopingStrategy = Annotated[ - RowFilterStrategy | WeightReplacementStrategy, + Union[RowFilterStrategy, WeightReplacementStrategy], Discriminator("strategy_type"), ] diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py index b9af105d..6456e5bc 100644 --- 
a/src/policyengine/core/simulation.py +++ b/src/policyengine/core/simulation.py @@ -1,5 +1,6 @@ import logging from datetime import datetime +from typing import Optional from uuid import uuid4 from pydantic import BaseModel, Field, model_validator @@ -21,22 +22,22 @@ class Simulation(BaseModel): created_at: datetime = Field(default_factory=datetime.now) updated_at: datetime = Field(default_factory=datetime.now) - policy: Policy | None = None - dynamic: Dynamic | None = None + policy: Optional[Policy] = None + dynamic: Optional[Dynamic] = None dataset: Dataset = None # Scoping strategy (preferred over legacy filter fields) - scoping_strategy: ScopingStrategy | None = Field( + scoping_strategy: Optional[ScopingStrategy] = Field( default=None, description="Strategy for scoping dataset to a sub-national region", ) # Legacy regional filtering parameters (kept for backward compatibility) - filter_field: str | None = Field( + filter_field: Optional[str] = Field( default=None, description="Household-level variable to filter dataset by (e.g., 'place_fips', 'country')", ) - filter_value: str | None = Field( + filter_value: Optional[str] = Field( default=None, description="Value to match when filtering (e.g., '44000', 'ENGLAND')", ) @@ -61,7 +62,7 @@ def _auto_construct_strategy(self) -> "Simulation": ) return self - output_dataset: Dataset | None = None + output_dataset: Optional[Dataset] = None def run(self): self.tax_benefit_model_version.run(self) @@ -96,7 +97,7 @@ def load(self): self.tax_benefit_model_version.load(self) @property - def release_bundle(self) -> dict[str, str | None]: + def release_bundle(self) -> dict[str, Optional[str]]: bundle = ( self.tax_benefit_model_version.release_bundle if self.tax_benefit_model_version is not None diff --git a/src/policyengine/core/tax_benefit_model.py b/src/policyengine/core/tax_benefit_model.py index 02cb94ef..c2d4e26d 100644 --- a/src/policyengine/core/tax_benefit_model.py +++ b/src/policyengine/core/tax_benefit_model.py @@ 
-1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional from pydantic import BaseModel @@ -8,4 +8,4 @@ class TaxBenefitModel(BaseModel): id: str - description: str | None = None + description: Optional[str] = None diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index f253fc5c..7fb03334 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -1,5 +1,5 @@ -from datetime import UTC, datetime -from typing import TYPE_CHECKING +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Optional from uuid import uuid4 from pydantic import BaseModel, Field @@ -22,25 +22,27 @@ class TaxBenefitModelVersion(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) model: TaxBenefitModel version: str - description: str | None = None - created_at: datetime | None = Field(default_factory=lambda: datetime.now(UTC)) + description: Optional[str] = None + created_at: Optional[datetime] = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) variables: list["Variable"] = Field(default_factory=list) parameters: list["Parameter"] = Field(default_factory=list) parameter_nodes: list["ParameterNode"] = Field(default_factory=list) # Region registry for geographic simulations - region_registry: "RegionRegistry | None" = Field( + region_registry: "Optional[RegionRegistry]" = Field( default=None, description="Registry of supported geographic regions" ) - release_manifest: CountryReleaseManifest | None = Field( + release_manifest: Optional[CountryReleaseManifest] = Field( default=None, exclude=True, ) - model_package: PackageVersion | None = Field(default=None) - data_package: PackageVersion | None = Field(default=None) - default_dataset_uri: str | None = Field(default=None) - data_certification: DataCertification | None = Field(default=None) + model_package: Optional[PackageVersion] = 
Field(default=None) + data_package: Optional[PackageVersion] = Field(default=None) + default_dataset_uri: Optional[str] = Field(default=None) + data_certification: Optional[DataCertification] = Field(default=None) @property def parameter_values(self) -> list["ParameterValue"]: @@ -112,7 +114,7 @@ def get_parameter_node(self, name: str) -> "ParameterNode": f"ParameterNode '{name}' not found in {self.model.id} version {self.version}" ) - def get_region(self, code: str) -> "Region | None": + def get_region(self, code: str) -> "Optional[Region]": """Get a region by its code. Args: @@ -126,7 +128,7 @@ def get_region(self, code: str) -> "Region | None": return self.region_registry.get(code) @property - def release_bundle(self) -> dict[str, str | None]: + def release_bundle(self) -> dict[str, Optional[str]]: manifest_certification = ( self.release_manifest.certification if self.release_manifest is not None diff --git a/src/policyengine/core/trace_tro.py b/src/policyengine/core/trace_tro.py index 52ae7b15..ae31a29e 100644 --- a/src/policyengine/core/trace_tro.py +++ b/src/policyengine/core/trace_tro.py @@ -3,6 +3,7 @@ import hashlib import json from collections.abc import Iterable, Mapping +from typing import Optional from .release_manifest import ( CountryReleaseManifest, @@ -28,7 +29,7 @@ def _hash_object(value: str) -> dict[str, str]: } -def _artifact_mime_type(path_or_uri: str) -> str | None: +def _artifact_mime_type(path_or_uri: str) -> Optional[str]: suffix = path_or_uri.rsplit(".", 1)[-1].lower() if "." 
in path_or_uri else "" return { "h5": "application/x-hdf5", @@ -53,9 +54,9 @@ def build_trace_tro_from_release_bundle( country_manifest: CountryReleaseManifest, data_release_manifest: DataReleaseManifest, *, - certification: DataCertification | None = None, - bundle_manifest_path: str | None = None, - data_release_manifest_path: str | None = None, + certification: Optional[DataCertification] = None, + bundle_manifest_path: Optional[str] = None, + data_release_manifest_path: Optional[str] = None, ) -> dict: certified_artifact = country_manifest.certified_data_artifact if certified_artifact is None: diff --git a/src/policyengine/core/variable.py b/src/policyengine/core/variable.py index 60aea9c5..03e53495 100644 --- a/src/policyengine/core/variable.py +++ b/src/policyengine/core/variable.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional from pydantic import BaseModel @@ -8,13 +8,13 @@ class Variable(BaseModel): id: str name: str - label: str | None = None + label: Optional[str] = None tax_benefit_model_version: TaxBenefitModelVersion entity: str - description: str | None = None + description: Optional[str] = None data_type: type = None - possible_values: list[Any] | None = None + possible_values: Optional[list[Any]] = None default_value: Any = None - value_type: type | None = None - adds: list[str] | None = None - subtracts: list[str] | None = None + value_type: Optional[type] = None + adds: Optional[list[str]] = None + subtracts: Optional[list[str]] = None diff --git a/src/policyengine/outputs/aggregate.py b/src/policyengine/outputs/aggregate.py index 9406a4d7..d014b06c 100644 --- a/src/policyengine/outputs/aggregate.py +++ b/src/policyengine/outputs/aggregate.py @@ -1,10 +1,10 @@ -from enum import StrEnum -from typing import Any +from enum import Enum +from typing import Any, Optional from policyengine.core import Output, Simulation -class AggregateType(StrEnum): +class AggregateType(str, Enum): SUM = "sum" MEAN = "mean" COUNT = "count" 
@@ -14,23 +14,25 @@ class Aggregate(Output): simulation: Simulation variable: str aggregate_type: AggregateType - entity: str | None = None + entity: Optional[str] = None - filter_variable: str | None = None - filter_variable_eq: Any | None = None - filter_variable_leq: Any | None = None - filter_variable_geq: Any | None = None + filter_variable: Optional[str] = None + filter_variable_eq: Optional[Any] = None + filter_variable_leq: Optional[Any] = None + filter_variable_geq: Optional[Any] = None filter_variable_describes_quantiles: bool = False # Convenient quantile specification (alternative to describes_quantiles) - quantile: int | None = ( + quantile: Optional[int] = ( None # Number of quantiles (e.g., 10 for deciles, 5 for quintiles) ) - quantile_eq: int | None = None # Exact quantile (e.g., 3 for 3rd decile) - quantile_leq: int | None = None # Maximum quantile (e.g., 5 for bottom 5 deciles) - quantile_geq: int | None = None # Minimum quantile (e.g., 9 for top 2 deciles) + quantile_eq: Optional[int] = None # Exact quantile (e.g., 3 for 3rd decile) + quantile_leq: Optional[int] = ( + None # Maximum quantile (e.g., 5 for bottom 5 deciles) + ) + quantile_geq: Optional[int] = None # Minimum quantile (e.g., 9 for top 2 deciles) - result: Any | None = None + result: Optional[Any] = None def run(self): # Convert quantile specification to describes_quantiles format diff --git a/src/policyengine/outputs/change_aggregate.py b/src/policyengine/outputs/change_aggregate.py index e1cd3985..87d2e0d9 100644 --- a/src/policyengine/outputs/change_aggregate.py +++ b/src/policyengine/outputs/change_aggregate.py @@ -1,10 +1,10 @@ -from enum import StrEnum -from typing import Any +from enum import Enum +from typing import Any, Optional from policyengine.core import Output, Simulation -class ChangeAggregateType(StrEnum): +class ChangeAggregateType(str, Enum): COUNT = "count" SUM = "sum" MEAN = "mean" @@ -15,34 +15,36 @@ class ChangeAggregate(Output): reform_simulation: Simulation 
variable: str aggregate_type: ChangeAggregateType - entity: str | None = None + entity: Optional[str] = None # Filter by absolute change - change_geq: float | None = None # Change >= value (e.g., gain >= 500) - change_leq: float | None = None # Change <= value (e.g., loss <= -500) - change_eq: float | None = None # Change == value + change_geq: Optional[float] = None # Change >= value (e.g., gain >= 500) + change_leq: Optional[float] = None # Change <= value (e.g., loss <= -500) + change_eq: Optional[float] = None # Change == value # Filter by relative change (as decimal, e.g., 0.05 = 5%) - relative_change_geq: float | None = None # Relative change >= value - relative_change_leq: float | None = None # Relative change <= value - relative_change_eq: float | None = None # Relative change == value + relative_change_geq: Optional[float] = None # Relative change >= value + relative_change_leq: Optional[float] = None # Relative change <= value + relative_change_eq: Optional[float] = None # Relative change == value # Filter by another variable (e.g., only count people with age >= 30) - filter_variable: str | None = None - filter_variable_eq: Any | None = None - filter_variable_leq: Any | None = None - filter_variable_geq: Any | None = None + filter_variable: Optional[str] = None + filter_variable_eq: Optional[Any] = None + filter_variable_leq: Optional[Any] = None + filter_variable_geq: Optional[Any] = None filter_variable_describes_quantiles: bool = False # Convenient quantile specification (alternative to describes_quantiles) - quantile: int | None = ( + quantile: Optional[int] = ( None # Number of quantiles (e.g., 10 for deciles, 5 for quintiles) ) - quantile_eq: int | None = None # Exact quantile (e.g., 3 for 3rd decile) - quantile_leq: int | None = None # Maximum quantile (e.g., 5 for bottom 5 deciles) - quantile_geq: int | None = None # Minimum quantile (e.g., 9 for top 2 deciles) + quantile_eq: Optional[int] = None # Exact quantile (e.g., 3 for 3rd decile) + 
quantile_leq: Optional[int] = ( + None # Maximum quantile (e.g., 5 for bottom 5 deciles) + ) + quantile_geq: Optional[int] = None # Minimum quantile (e.g., 9 for top 2 deciles) - result: Any | None = None + result: Optional[Any] = None def run(self): # Convert quantile specification to describes_quantiles format diff --git a/src/policyengine/outputs/congressional_district_impact.py b/src/policyengine/outputs/congressional_district_impact.py index d8162a6d..4a1d0d90 100644 --- a/src/policyengine/outputs/congressional_district_impact.py +++ b/src/policyengine/outputs/congressional_district_impact.py @@ -1,6 +1,6 @@ """Congressional district impact output class for US policy reforms.""" -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import numpy as np from pydantic import ConfigDict @@ -26,7 +26,7 @@ class CongressionalDistrictImpact(Output): reform_simulation: "Simulation" # Results populated by run() - district_results: list[dict] | None = None + district_results: Optional[list[dict]] = None def run(self) -> None: """Group households by geoid and compute per-district metrics.""" diff --git a/src/policyengine/outputs/constituency_impact.py b/src/policyengine/outputs/constituency_impact.py index 5cee7f4d..60f76e0b 100644 --- a/src/policyengine/outputs/constituency_impact.py +++ b/src/policyengine/outputs/constituency_impact.py @@ -5,7 +5,7 @@ that reweights all households to represent that constituency's demographics. 
""" -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import h5py import numpy as np @@ -35,7 +35,7 @@ class ConstituencyImpact(Output): year: str = "2025" # Results populated by run() - constituency_results: list[dict] | None = None + constituency_results: Optional[list[dict]] = None def run(self) -> None: """Load weight matrix and compute per-constituency metrics.""" diff --git a/src/policyengine/outputs/decile_impact.py b/src/policyengine/outputs/decile_impact.py index d3339003..b0f2306e 100644 --- a/src/policyengine/outputs/decile_impact.py +++ b/src/policyengine/outputs/decile_impact.py @@ -1,3 +1,5 @@ +from typing import Optional + import pandas as pd from pydantic import ConfigDict @@ -16,19 +18,19 @@ class DecileImpact(Output): baseline_simulation: Simulation reform_simulation: Simulation income_variable: str = "equiv_hbai_household_net_income" - decile_variable: str | None = None # If set, use pre-computed grouping variable - entity: str | None = None + decile_variable: Optional[str] = None # If set, use pre-computed grouping variable + entity: Optional[str] = None decile: int quantiles: int = 10 # Results populated by run() - baseline_mean: float | None = None - reform_mean: float | None = None - absolute_change: float | None = None - relative_change: float | None = None - count_better_off: float | None = None - count_worse_off: float | None = None - count_no_change: float | None = None + baseline_mean: Optional[float] = None + reform_mean: Optional[float] = None + absolute_change: Optional[float] = None + relative_change: Optional[float] = None + count_better_off: Optional[float] = None + count_worse_off: Optional[float] = None + count_no_change: Optional[float] = None def run(self): """Calculate impact for this specific decile.""" @@ -97,16 +99,16 @@ def run(self): def calculate_decile_impacts( - dataset: Dataset | None = None, - tax_benefit_model_version: TaxBenefitModelVersion | None = None, - baseline_policy: Policy | None 
= None, - reform_policy: Policy | None = None, - dynamic: Dynamic | None = None, + dataset: Optional[Dataset] = None, + tax_benefit_model_version: Optional[TaxBenefitModelVersion] = None, + baseline_policy: Optional[Policy] = None, + reform_policy: Optional[Policy] = None, + dynamic: Optional[Dynamic] = None, income_variable: str = "equiv_hbai_household_net_income", - entity: str | None = None, + entity: Optional[str] = None, quantiles: int = 10, - baseline_simulation: Simulation | None = None, - reform_simulation: Simulation | None = None, + baseline_simulation: Optional[Simulation] = None, + reform_simulation: Optional[Simulation] = None, ) -> OutputCollection[DecileImpact]: """Calculate decile-by-decile impact of a reform. diff --git a/src/policyengine/outputs/inequality.py b/src/policyengine/outputs/inequality.py index 8656dc65..4b16f7a9 100644 --- a/src/policyengine/outputs/inequality.py +++ b/src/policyengine/outputs/inequality.py @@ -1,7 +1,7 @@ """Inequality analysis output types.""" -from enum import StrEnum -from typing import Any +from enum import Enum +from typing import Any, Optional, Union import numpy as np import pandas as pd @@ -10,7 +10,7 @@ from policyengine.core import Output, Simulation -class USInequalityPreset(StrEnum): +class USInequalityPreset(str, Enum): """Preset configurations for US inequality analysis.""" STANDARD = "standard" @@ -86,21 +86,21 @@ class Inequality(Output): simulation: Simulation income_variable: str entity: str = "household" - weight_multiplier_variable: str | None = None - equivalization_variable: str | None = None + weight_multiplier_variable: Optional[str] = None + equivalization_variable: Optional[str] = None equivalization_power: float = 0.0 # Optional demographic filters - filter_variable: str | None = None - filter_variable_eq: Any | None = None - filter_variable_leq: Any | None = None - filter_variable_geq: Any | None = None + filter_variable: Optional[str] = None + filter_variable_eq: Optional[Any] = None + 
filter_variable_leq: Optional[Any] = None + filter_variable_geq: Optional[Any] = None # Results populated by run() - gini: float | None = None - top_10_share: float | None = None - top_1_share: float | None = None - bottom_50_share: float | None = None + gini: Optional[float] = None + top_10_share: Optional[float] = None + top_1_share: Optional[float] = None + bottom_50_share: Optional[float] = None def run(self): """Calculate inequality metrics.""" @@ -235,10 +235,10 @@ def run(self): def calculate_uk_inequality( simulation: Simulation, income_variable: str = UK_INEQUALITY_INCOME_VARIABLE, - filter_variable: str | None = None, - filter_variable_eq: Any | None = None, - filter_variable_leq: Any | None = None, - filter_variable_geq: Any | None = None, + filter_variable: Optional[str] = None, + filter_variable_eq: Optional[Any] = None, + filter_variable_leq: Optional[Any] = None, + filter_variable_geq: Optional[Any] = None, ) -> Inequality: """Calculate inequality metrics for a UK simulation. @@ -269,11 +269,11 @@ def calculate_uk_inequality( def calculate_us_inequality( simulation: Simulation, income_variable: str = US_INEQUALITY_INCOME_VARIABLE, - preset: USInequalityPreset | str = USInequalityPreset.STANDARD, - filter_variable: str | None = None, - filter_variable_eq: Any | None = None, - filter_variable_leq: Any | None = None, - filter_variable_geq: Any | None = None, + preset: Union[USInequalityPreset, str] = USInequalityPreset.STANDARD, + filter_variable: Optional[str] = None, + filter_variable_eq: Optional[Any] = None, + filter_variable_leq: Optional[Any] = None, + filter_variable_geq: Optional[Any] = None, ) -> Inequality: """Calculate inequality metrics for a US simulation. 
diff --git a/src/policyengine/outputs/intra_decile_impact.py b/src/policyengine/outputs/intra_decile_impact.py index e2b01243..b91a04e2 100644 --- a/src/policyengine/outputs/intra_decile_impact.py +++ b/src/policyengine/outputs/intra_decile_impact.py @@ -15,6 +15,8 @@ household_weight) so they reflect the share of people, not households. """ +from typing import Optional + import numpy as np import pandas as pd from pydantic import ConfigDict @@ -41,17 +43,17 @@ class IntraDecileImpact(Output): baseline_simulation: Simulation reform_simulation: Simulation income_variable: str = "household_net_income" - decile_variable: str | None = None # If set, use pre-computed grouping + decile_variable: Optional[str] = None # If set, use pre-computed grouping entity: str = "household" decile: int # 1-10 for individual deciles quantiles: int = 10 # Results populated by run() - lose_more_than_5pct: float | None = None - lose_less_than_5pct: float | None = None - no_change: float | None = None - gain_less_than_5pct: float | None = None - gain_more_than_5pct: float | None = None + lose_more_than_5pct: Optional[float] = None + lose_less_than_5pct: Optional[float] = None + no_change: Optional[float] = None + gain_less_than_5pct: Optional[float] = None + gain_more_than_5pct: Optional[float] = None def run(self): """Calculate intra-decile proportions for this specific decile.""" @@ -117,7 +119,7 @@ def compute_intra_decile_impacts( baseline_simulation: Simulation, reform_simulation: Simulation, income_variable: str = "household_net_income", - decile_variable: str | None = None, + decile_variable: Optional[str] = None, entity: str = "household", quantiles: int = 10, ) -> OutputCollection[IntraDecileImpact]: diff --git a/src/policyengine/outputs/local_authority_impact.py b/src/policyengine/outputs/local_authority_impact.py index fc91f3ec..20b17efe 100644 --- a/src/policyengine/outputs/local_authority_impact.py +++ b/src/policyengine/outputs/local_authority_impact.py @@ -5,7 +5,7 @@ that 
reweights all households to represent that local authority's demographics. """ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import h5py import numpy as np @@ -35,7 +35,7 @@ class LocalAuthorityImpact(Output): year: str = "2025" # Results populated by run() - local_authority_results: list[dict] | None = None + local_authority_results: Optional[list[dict]] = None def run(self) -> None: """Load weight matrix and compute per-local-authority metrics.""" diff --git a/src/policyengine/outputs/poverty.py b/src/policyengine/outputs/poverty.py index 10db4682..6fc59705 100644 --- a/src/policyengine/outputs/poverty.py +++ b/src/policyengine/outputs/poverty.py @@ -1,7 +1,7 @@ """Poverty analysis output types.""" -from enum import StrEnum -from typing import Any +from enum import Enum +from typing import Any, Optional import pandas as pd from pydantic import ConfigDict @@ -9,7 +9,7 @@ from policyengine.core import Output, OutputCollection, Simulation -class UKPovertyType(StrEnum): +class UKPovertyType(str, Enum): """UK poverty measure types.""" ABSOLUTE_BHC = "absolute_bhc" @@ -18,7 +18,7 @@ class UKPovertyType(StrEnum): RELATIVE_AHC = "relative_ahc" -class USPovertyType(StrEnum): +class USPovertyType(str, Enum): """US poverty measure types.""" SPM = "spm" @@ -51,22 +51,22 @@ class Poverty(Output): simulation: Simulation poverty_variable: str - poverty_type: str | None = None + poverty_type: Optional[str] = None entity: str = "person" # Optional demographic filters - filter_variable: str | None = None - filter_variable_eq: Any | None = None - filter_variable_leq: Any | None = None - filter_variable_geq: Any | None = None + filter_variable: Optional[str] = None + filter_variable_eq: Optional[Any] = None + filter_variable_leq: Optional[Any] = None + filter_variable_geq: Optional[Any] = None # Convenience group label (set by by_age/by_gender/by_race wrappers) - filter_group: str | None = None + filter_group: Optional[str] = None # Results populated 
by run() - headcount: float | None = None - total_population: float | None = None - rate: float | None = None + headcount: Optional[float] = None + total_population: Optional[float] = None + rate: Optional[float] = None def run(self): """Calculate poverty headcount and rate.""" @@ -128,10 +128,10 @@ def run(self): def calculate_uk_poverty_rates( simulation: Simulation, - filter_variable: str | None = None, - filter_variable_eq: Any | None = None, - filter_variable_leq: Any | None = None, - filter_variable_geq: Any | None = None, + filter_variable: Optional[str] = None, + filter_variable_eq: Optional[Any] = None, + filter_variable_leq: Optional[Any] = None, + filter_variable_geq: Optional[Any] = None, ) -> OutputCollection[Poverty]: """Calculate all UK poverty rates for a simulation. @@ -184,10 +184,10 @@ def calculate_uk_poverty_rates( def calculate_us_poverty_rates( simulation: Simulation, - filter_variable: str | None = None, - filter_variable_eq: Any | None = None, - filter_variable_leq: Any | None = None, - filter_variable_geq: Any | None = None, + filter_variable: Optional[str] = None, + filter_variable_eq: Optional[Any] = None, + filter_variable_leq: Optional[Any] = None, + filter_variable_geq: Optional[Any] = None, ) -> OutputCollection[Poverty]: """Calculate all US poverty rates for a simulation. 
diff --git a/src/policyengine/tax_benefit_models/uk/analysis.py b/src/policyengine/tax_benefit_models/uk/analysis.py index f7f5af5b..0a545b52 100644 --- a/src/policyengine/tax_benefit_models/uk/analysis.py +++ b/src/policyengine/tax_benefit_models/uk/analysis.py @@ -2,7 +2,7 @@ import tempfile from pathlib import Path -from typing import Any +from typing import Any, Optional import pandas as pd from microdf import MicroDataFrame @@ -65,7 +65,7 @@ class UKHouseholdInput(BaseModel): def calculate_household_impact( household_input: UKHouseholdInput, - policy: Policy | None = None, + policy: Optional[Policy] = None, ) -> UKHouseholdOutput: """Calculate tax and benefit impacts for a single UK household.""" n_people = len(household_input.people) diff --git a/src/policyengine/tax_benefit_models/uk/datasets.py b/src/policyengine/tax_benefit_models/uk/datasets.py index ec0f579b..47f78403 100644 --- a/src/policyengine/tax_benefit_models/uk/datasets.py +++ b/src/policyengine/tax_benefit_models/uk/datasets.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Optional import pandas as pd from microdf import MicroDataFrame @@ -33,7 +34,7 @@ def entity_data(self) -> dict[str, MicroDataFrame]: class PolicyEngineUKDataset(Dataset): """UK dataset with multi-year entity-level data.""" - data: UKYearData | None = None + data: Optional[UKYearData] = None def model_post_init(self, __context): """Called after Pydantic initialization.""" diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index ff65be1b..edd5c069 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -1,7 +1,7 @@ import datetime from importlib import metadata from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import pandas as pd from microdf import MicroDataFrame @@ -45,7 +45,7 @@ class PolicyEngineUK(TaxBenefitModel): uk_model = 
PolicyEngineUK() -def _get_runtime_data_build_metadata() -> dict[str, str | None]: +def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: try: from policyengine_uk.build_metadata import get_data_build_metadata except ModuleNotFoundError as exc: @@ -430,8 +430,8 @@ def load(self, simulation: "Simulation"): def _managed_release_bundle( dataset_uri: str, - dataset_source: str | None = None, -) -> dict[str, str | None]: + dataset_source: Optional[str] = None, +) -> dict[str, Optional[str]]: bundle = dict(uk_latest.release_bundle) bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) bundle["runtime_dataset_uri"] = dataset_uri @@ -443,7 +443,7 @@ def _managed_release_bundle( def managed_microsimulation( *, - dataset: str | None = None, + dataset: Optional[str] = None, allow_unmanaged: bool = False, **kwargs, ): diff --git a/src/policyengine/tax_benefit_models/uk/outputs.py b/src/policyengine/tax_benefit_models/uk/outputs.py index 273a27c6..97032a9c 100644 --- a/src/policyengine/tax_benefit_models/uk/outputs.py +++ b/src/policyengine/tax_benefit_models/uk/outputs.py @@ -1,5 +1,7 @@ """UK-specific output templates.""" +from typing import Optional + from pydantic import ConfigDict from policyengine.core import Output, Simulation @@ -22,13 +24,13 @@ class ProgrammeStatistics(Output): is_tax: bool = False # Results populated by run() - baseline_total: float | None = None - reform_total: float | None = None - change: float | None = None - baseline_count: float | None = None - reform_count: float | None = None - winners: float | None = None - losers: float | None = None + baseline_total: Optional[float] = None + reform_total: Optional[float] = None + change: Optional[float] = None + baseline_count: Optional[float] = None + reform_count: Optional[float] = None + winners: Optional[float] = None + losers: Optional[float] = None def run(self): """Calculate statistics for this programme.""" diff --git a/src/policyengine/tax_benefit_models/us/analysis.py 
b/src/policyengine/tax_benefit_models/us/analysis.py index 375a4e5f..122ae2af 100644 --- a/src/policyengine/tax_benefit_models/us/analysis.py +++ b/src/policyengine/tax_benefit_models/us/analysis.py @@ -2,7 +2,7 @@ import tempfile from pathlib import Path -from typing import Any +from typing import Any, Optional, Union import pandas as pd from microdf import MicroDataFrame @@ -54,7 +54,7 @@ class USHouseholdInput(BaseModel): def calculate_household_impact( household_input: USHouseholdInput, - policy: Policy | None = None, + policy: Optional[Policy] = None, ) -> USHouseholdOutput: """Calculate tax and benefit impacts for a single US household.""" n_people = len(household_input.people) @@ -201,7 +201,7 @@ class PolicyReformAnalysis(BaseModel): def economic_impact_analysis( baseline_simulation: Simulation, reform_simulation: Simulation, - inequality_preset: USInequalityPreset | str = USInequalityPreset.STANDARD, + inequality_preset: Union[USInequalityPreset, str] = USInequalityPreset.STANDARD, ) -> PolicyReformAnalysis: """Perform comprehensive analysis of a policy reform. 
diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py index 7ea12f8e..da10733b 100644 --- a/src/policyengine/tax_benefit_models/us/datasets.py +++ b/src/policyengine/tax_benefit_models/us/datasets.py @@ -1,5 +1,6 @@ import warnings from pathlib import Path +from typing import Optional import pandas as pd from microdf import MicroDataFrame @@ -40,7 +41,7 @@ def entity_data(self) -> dict[str, MicroDataFrame]: class PolicyEngineUSDataset(Dataset): """US dataset with multi-year entity-level data.""" - data: USYearData | None = None + data: Optional[USYearData] = None def model_post_init(self, __context) -> None: """Called after Pydantic initialization.""" diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 2c560e3a..a896f5c4 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -1,7 +1,7 @@ import datetime from importlib import metadata from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import pandas as pd from microdf import MicroDataFrame @@ -51,7 +51,7 @@ class PolicyEngineUS(TaxBenefitModel): us_model = PolicyEngineUS() -def _get_runtime_data_build_metadata() -> dict[str, str | None]: +def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: try: from policyengine_us.build_metadata import get_data_build_metadata except ModuleNotFoundError as exc: @@ -595,8 +595,8 @@ def _build_simulation_from_dataset(self, microsim, dataset, system): def _managed_release_bundle( dataset_uri: str, - dataset_source: str | None = None, -) -> dict[str, str | None]: + dataset_source: Optional[str] = None, +) -> dict[str, Optional[str]]: bundle = dict(us_latest.release_bundle) bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) bundle["runtime_dataset_uri"] = dataset_uri @@ -608,7 +608,7 @@ def _managed_release_bundle( def 
managed_microsimulation( *, - dataset: str | None = None, + dataset: Optional[str] = None, allow_unmanaged: bool = False, **kwargs, ): diff --git a/src/policyengine/tax_benefit_models/us/outputs.py b/src/policyengine/tax_benefit_models/us/outputs.py index 63fd1a36..1dd6f001 100644 --- a/src/policyengine/tax_benefit_models/us/outputs.py +++ b/src/policyengine/tax_benefit_models/us/outputs.py @@ -1,5 +1,7 @@ """US-specific output templates.""" +from typing import Optional + from pydantic import ConfigDict from policyengine.core import Output, Simulation @@ -22,13 +24,13 @@ class ProgramStatistics(Output): is_tax: bool = False # Results populated by run() - baseline_total: float | None = None - reform_total: float | None = None - change: float | None = None - baseline_count: float | None = None - reform_count: float | None = None - winners: float | None = None - losers: float | None = None + baseline_total: Optional[float] = None + reform_total: Optional[float] = None + change: Optional[float] = None + baseline_count: Optional[float] = None + reform_count: Optional[float] = None + winners: Optional[float] = None + losers: Optional[float] = None def run(self): """Calculate statistics for this program.""" diff --git a/src/policyengine/utils/parametric_reforms.py b/src/policyengine/utils/parametric_reforms.py index 71476afa..025df22e 100644 --- a/src/policyengine/utils/parametric_reforms.py +++ b/src/policyengine/utils/parametric_reforms.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Callable -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional, Union from policyengine_core.periods import period @@ -13,8 +13,8 @@ def reform_dict_from_parameter_values( - parameter_values: list[ParameterValue] | None, -) -> dict | None: + parameter_values: Optional[list[ParameterValue]], +) -> Optional[dict]: """ Convert a list of ParameterValue objects to a reform dict format. 
@@ -83,8 +83,8 @@ def modifier(simulation): def build_reform_dict( - policy_or_dynamic: Policy | Dynamic | None, -) -> dict | None: + policy_or_dynamic: Optional[Union[Policy, Dynamic]], +) -> Optional[dict]: """Extract a reform dict from a Policy or Dynamic object. If the object has parameter_values, converts them to reform dict format. @@ -103,7 +103,9 @@ def build_reform_dict( return None -def merge_reform_dicts(base: dict | None, override: dict | None) -> dict | None: +def merge_reform_dicts( + base: Optional[dict], override: Optional[dict] +) -> Optional[dict]: """Merge two reform dicts, with override values taking precedence. Either or both dicts can be None. When both have entries for the same diff --git a/src/policyengine/utils/plotting.py b/src/policyengine/utils/plotting.py index 4478a02d..2ca8e48c 100644 --- a/src/policyengine/utils/plotting.py +++ b/src/policyengine/utils/plotting.py @@ -1,5 +1,7 @@ """Plotting utilities for PolicyEngine visualisations.""" +from typing import Optional + import plotly.graph_objects as go # PolicyEngine brand colours @@ -26,12 +28,12 @@ def format_fig( fig: go.Figure, - title: str | None = None, - xaxis_title: str | None = None, - yaxis_title: str | None = None, + title: Optional[str] = None, + xaxis_title: Optional[str] = None, + yaxis_title: Optional[str] = None, show_legend: bool = True, - height: int | None = None, - width: int | None = None, + height: Optional[int] = None, + width: Optional[int] = None, ) -> go.Figure: """Apply PolicyEngine visual style to a plotly figure. 
diff --git a/tests/fixtures/parameter_labels_fixtures.py b/tests/fixtures/parameter_labels_fixtures.py index 0e22424c..d86c900f 100644 --- a/tests/fixtures/parameter_labels_fixtures.py +++ b/tests/fixtures/parameter_labels_fixtures.py @@ -1,7 +1,7 @@ """Fixtures for parameter_labels utility tests.""" from enum import Enum -from typing import Any +from typing import Any, Optional from unittest.mock import MagicMock @@ -24,7 +24,7 @@ class MockStateCode(Enum): def create_mock_parameter( name: str, - label: str | None = None, + label: Optional[str] = None, parent: Any = None, ) -> MagicMock: """Create a mock CoreParameter object.""" @@ -37,9 +37,9 @@ def create_mock_parameter( def create_mock_parent_node( name: str, - label: str | None = None, - breakdown: list[str] | None = None, - breakdown_labels: list[str] | None = None, + label: Optional[str] = None, + breakdown: Optional[list[str]] = None, + breakdown_labels: Optional[list[str]] = None, parent: Any = None, ) -> MagicMock: """Create a mock parent ParameterNode with optional breakdown metadata.""" @@ -58,8 +58,8 @@ def create_mock_parent_node( def create_mock_scale( name: str, - label: str | None = None, - scale_type: str | None = None, + label: Optional[str] = None, + scale_type: Optional[str] = None, ) -> MagicMock: """Create a mock ParameterScale object.""" scale = MagicMock() @@ -74,7 +74,7 @@ def create_mock_scale( def create_mock_variable( name: str, - possible_values: type[Enum] | None = None, + possible_values: Optional[type[Enum]] = None, ) -> MagicMock: """Create a mock Variable object with optional enum values.""" var = MagicMock() @@ -87,8 +87,8 @@ def create_mock_variable( def create_mock_system( - variables: dict[str, MagicMock] | None = None, - scales: list[MagicMock] | None = None, + variables: Optional[dict[str, MagicMock]] = None, + scales: Optional[list[MagicMock]] = None, ) -> MagicMock: """Create a mock tax-benefit system.""" system = MagicMock() diff --git 
a/tests/fixtures/parametric_reforms_fixtures.py b/tests/fixtures/parametric_reforms_fixtures.py index 98bc7aa2..6fcbd991 100644 --- a/tests/fixtures/parametric_reforms_fixtures.py +++ b/tests/fixtures/parametric_reforms_fixtures.py @@ -1,6 +1,7 @@ """Fixtures for parametric reforms tests.""" from datetime import date +from typing import Optional from unittest.mock import MagicMock import pytest @@ -23,7 +24,7 @@ def create_parameter_value( parameter: Parameter, value: float, start_date: date, - end_date: date | None = None, + end_date: Optional[date] = None, ) -> ParameterValue: """Create a ParameterValue for testing.""" return ParameterValue( diff --git a/tests/fixtures/variable_label_fixtures.py b/tests/fixtures/variable_label_fixtures.py index 1ce3572e..a4c01177 100644 --- a/tests/fixtures/variable_label_fixtures.py +++ b/tests/fixtures/variable_label_fixtures.py @@ -1,13 +1,14 @@ """Fixtures for variable label tests.""" +from typing import Optional from unittest.mock import MagicMock def create_mock_openfisca_variable( name: str, - label: str | None = None, + label: Optional[str] = None, entity_key: str = "person", - documentation: str | None = None, + documentation: Optional[str] = None, value_type: type = float, default_value=0, ) -> MagicMock: diff --git a/tests/test_intra_decile_impact.py b/tests/test_intra_decile_impact.py index dc4e6a96..04ae5412 100644 --- a/tests/test_intra_decile_impact.py +++ b/tests/test_intra_decile_impact.py @@ -1,5 +1,6 @@ """Unit tests for IntraDecileImpact and DecileImpact with decile_variable.""" +from typing import Optional from unittest.mock import MagicMock import numpy as np @@ -40,7 +41,7 @@ def _make_version(variable_name: str, entity: str) -> TaxBenefitModelVersion: return version -def _make_sim(household_data: dict, variables: list | None = None) -> MagicMock: +def _make_sim(household_data: dict, variables: Optional[list] = None) -> MagicMock: """Create a mock Simulation with household-level data.""" hh_df = 
MicroDataFrame( pd.DataFrame(household_data), From ed8dbcfcf7e4bd423653da891c3f1b0696d33b5d Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:28:05 -0400 Subject: [PATCH 2/4] Install h5py alongside bare policyengine in Python-Compat job `scoping_strategy.py` imports h5py unconditionally at module level, but h5py is only pulled in transitively via the [us]/[uk] extras (through policyengine-core). The smoke-import job installs without extras to decouple from the old pinned country-model versions, so it needs h5py installed directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/pr_code_changes.yaml | 9 +- build/lib/policyengine/__init__.py | 0 build/lib/policyengine/core/__init__.py | 50 + build/lib/policyengine/core/cache.py | 59 + build/lib/policyengine/core/dataset.py | 396 ++++ .../lib/policyengine/core/dataset_version.py | 16 + build/lib/policyengine/core/dynamic.py | 47 + build/lib/policyengine/core/output.py | 26 + build/lib/policyengine/core/parameter.py | 61 + build/lib/policyengine/core/parameter_node.py | 29 + .../lib/policyengine/core/parameter_value.py | 16 + build/lib/policyengine/core/policy.py | 47 + build/lib/policyengine/core/region.py | 212 ++ .../lib/policyengine/core/release_manifest.py | 432 ++++ .../lib/policyengine/core/scoping_strategy.py | 224 ++ build/lib/policyengine/core/simulation.py | 111 + .../policyengine/core/tax_benefit_model.py | 11 + .../core/tax_benefit_model_version.py | 208 ++ build/lib/policyengine/core/trace_tro.py | 260 +++ build/lib/policyengine/core/variable.py | 20 + build/lib/policyengine/countries/__init__.py | 9 + .../lib/policyengine/countries/uk/__init__.py | 5 + .../lib/policyengine/countries/uk/regions.py | 207 ++ .../lib/policyengine/countries/us/__init__.py | 5 + .../countries/us/data/__init__.py | 18 + .../countries/us/data/districts.py | 64 + .../policyengine/countries/us/data/places.py | 1815 +++++++++++++++++ .../policyengine/countries/us/data/states.py | 59 + 
.../lib/policyengine/countries/us/regions.py | 120 ++ .../data/release_manifests/uk.json | 45 + .../data/release_manifests/us.json | 48 + build/lib/policyengine/outputs/__init__.py | 91 + build/lib/policyengine/outputs/aggregate.py | 112 + .../policyengine/outputs/change_aggregate.py | 170 ++ .../outputs/congressional_district_impact.py | 131 ++ .../outputs/constituency_impact.py | 126 ++ .../lib/policyengine/outputs/decile_impact.py | 178 ++ build/lib/policyengine/outputs/inequality.py | 313 +++ .../outputs/intra_decile_impact.py | 180 ++ .../outputs/local_authority_impact.py | 125 ++ build/lib/policyengine/outputs/poverty.py | 462 +++++ .../lib/policyengine/tax_benefit_models/uk.py | 40 + .../tax_benefit_models/uk/__init__.py | 55 + .../tax_benefit_models/uk/analysis.py | 283 +++ .../tax_benefit_models/uk/datasets.py | 245 +++ .../tax_benefit_models/uk/model.py | 496 +++++ .../tax_benefit_models/uk/outputs.py | 105 + .../lib/policyengine/tax_benefit_models/us.py | 40 + .../tax_benefit_models/us/__init__.py | 55 + .../tax_benefit_models/us/analysis.py | 311 +++ .../tax_benefit_models/us/datasets.py | 359 ++++ .../tax_benefit_models/us/model.py | 650 ++++++ .../tax_benefit_models/us/outputs.py | 105 + build/lib/policyengine/utils/__init__.py | 7 + build/lib/policyengine/utils/dates.py | 43 + build/lib/policyengine/utils/entity_utils.py | 140 ++ .../policyengine/utils/parameter_labels.py | 216 ++ .../policyengine/utils/parametric_reforms.py | 131 ++ build/lib/policyengine/utils/plotting.py | 178 ++ 59 files changed, 9974 insertions(+), 2 deletions(-) create mode 100644 build/lib/policyengine/__init__.py create mode 100644 build/lib/policyengine/core/__init__.py create mode 100644 build/lib/policyengine/core/cache.py create mode 100644 build/lib/policyengine/core/dataset.py create mode 100644 build/lib/policyengine/core/dataset_version.py create mode 100644 build/lib/policyengine/core/dynamic.py create mode 100644 build/lib/policyengine/core/output.py create mode 
100644 build/lib/policyengine/core/parameter.py create mode 100644 build/lib/policyengine/core/parameter_node.py create mode 100644 build/lib/policyengine/core/parameter_value.py create mode 100644 build/lib/policyengine/core/policy.py create mode 100644 build/lib/policyengine/core/region.py create mode 100644 build/lib/policyengine/core/release_manifest.py create mode 100644 build/lib/policyengine/core/scoping_strategy.py create mode 100644 build/lib/policyengine/core/simulation.py create mode 100644 build/lib/policyengine/core/tax_benefit_model.py create mode 100644 build/lib/policyengine/core/tax_benefit_model_version.py create mode 100644 build/lib/policyengine/core/trace_tro.py create mode 100644 build/lib/policyengine/core/variable.py create mode 100644 build/lib/policyengine/countries/__init__.py create mode 100644 build/lib/policyengine/countries/uk/__init__.py create mode 100644 build/lib/policyengine/countries/uk/regions.py create mode 100644 build/lib/policyengine/countries/us/__init__.py create mode 100644 build/lib/policyengine/countries/us/data/__init__.py create mode 100644 build/lib/policyengine/countries/us/data/districts.py create mode 100644 build/lib/policyengine/countries/us/data/places.py create mode 100644 build/lib/policyengine/countries/us/data/states.py create mode 100644 build/lib/policyengine/countries/us/regions.py create mode 100644 build/lib/policyengine/data/release_manifests/uk.json create mode 100644 build/lib/policyengine/data/release_manifests/us.json create mode 100644 build/lib/policyengine/outputs/__init__.py create mode 100644 build/lib/policyengine/outputs/aggregate.py create mode 100644 build/lib/policyengine/outputs/change_aggregate.py create mode 100644 build/lib/policyengine/outputs/congressional_district_impact.py create mode 100644 build/lib/policyengine/outputs/constituency_impact.py create mode 100644 build/lib/policyengine/outputs/decile_impact.py create mode 100644 build/lib/policyengine/outputs/inequality.py 
create mode 100644 build/lib/policyengine/outputs/intra_decile_impact.py create mode 100644 build/lib/policyengine/outputs/local_authority_impact.py create mode 100644 build/lib/policyengine/outputs/poverty.py create mode 100644 build/lib/policyengine/tax_benefit_models/uk.py create mode 100644 build/lib/policyengine/tax_benefit_models/uk/__init__.py create mode 100644 build/lib/policyengine/tax_benefit_models/uk/analysis.py create mode 100644 build/lib/policyengine/tax_benefit_models/uk/datasets.py create mode 100644 build/lib/policyengine/tax_benefit_models/uk/model.py create mode 100644 build/lib/policyengine/tax_benefit_models/uk/outputs.py create mode 100644 build/lib/policyengine/tax_benefit_models/us.py create mode 100644 build/lib/policyengine/tax_benefit_models/us/__init__.py create mode 100644 build/lib/policyengine/tax_benefit_models/us/analysis.py create mode 100644 build/lib/policyengine/tax_benefit_models/us/datasets.py create mode 100644 build/lib/policyengine/tax_benefit_models/us/model.py create mode 100644 build/lib/policyengine/tax_benefit_models/us/outputs.py create mode 100644 build/lib/policyengine/utils/__init__.py create mode 100644 build/lib/policyengine/utils/dates.py create mode 100644 build/lib/policyengine/utils/entity_utils.py create mode 100644 build/lib/policyengine/utils/parameter_labels.py create mode 100644 build/lib/policyengine/utils/parametric_reforms.py create mode 100644 build/lib/policyengine/utils/plotting.py diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index 8850174e..960bb42c 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -60,8 +60,13 @@ jobs: allow-prereleases: true - name: Install uv uses: astral-sh/setup-uv@v5 - - name: Install package (no extras — skip country models to isolate the wrapper) - run: uv pip install --system . 
+ - name: Install package (no country-model extras) + # `h5py` is used transitively by policyengine.core.scoping_strategy + # but is normally supplied via the [us]/[uk] extras (through + # policyengine-core). Install it directly so the smoke import can + # exercise the wrapper without the country models, which pin + # versions that don't support 3.9/3.10 yet. + run: uv pip install --system . h5py - name: Smoke-import core modules run: python -c "import policyengine; from policyengine.core import Dataset, Policy, Simulation; from policyengine.outputs import aggregate, poverty, inequality; print('import OK')" Test: diff --git a/build/lib/policyengine/__init__.py b/build/lib/policyengine/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/lib/policyengine/core/__init__.py b/build/lib/policyengine/core/__init__.py new file mode 100644 index 00000000..bb0e80d5 --- /dev/null +++ b/build/lib/policyengine/core/__init__.py @@ -0,0 +1,50 @@ +from .dataset import Dataset +from .dataset import YearData as YearData +from .dataset import map_to_entity as map_to_entity +from .dataset_version import DatasetVersion as DatasetVersion +from .dynamic import Dynamic as Dynamic +from .output import Output as Output +from .output import OutputCollection as OutputCollection +from .parameter import Parameter as Parameter +from .parameter_node import ParameterNode as ParameterNode +from .parameter_value import ParameterValue as ParameterValue +from .policy import Policy as Policy +from .region import Region as Region +from .region import RegionRegistry as RegionRegistry +from .region import RegionType as RegionType +from .release_manifest import CertifiedDataArtifact as CertifiedDataArtifact +from .release_manifest import CountryReleaseManifest as CountryReleaseManifest +from .release_manifest import DataBuildInfo as DataBuildInfo +from .release_manifest import DataCertification as DataCertification +from .release_manifest import DataPackageVersion as 
DataPackageVersion +from .release_manifest import DataReleaseArtifact as DataReleaseArtifact +from .release_manifest import DataReleaseManifest as DataReleaseManifest +from .release_manifest import PackageVersion as PackageVersion +from .release_manifest import ( + certify_data_release_compatibility as certify_data_release_compatibility, +) +from .release_manifest import get_data_release_manifest as get_data_release_manifest +from .release_manifest import get_release_manifest as get_release_manifest +from .release_manifest import ( + resolve_managed_dataset_reference as resolve_managed_dataset_reference, +) +from .scoping_strategy import RegionScopingStrategy as RegionScopingStrategy +from .scoping_strategy import RowFilterStrategy as RowFilterStrategy +from .scoping_strategy import ScopingStrategy as ScopingStrategy +from .scoping_strategy import ( + WeightReplacementStrategy as WeightReplacementStrategy, +) +from .simulation import Simulation as Simulation +from .tax_benefit_model import TaxBenefitModel as TaxBenefitModel +from .tax_benefit_model_version import ( + TaxBenefitModelVersion as TaxBenefitModelVersion, +) +from .variable import Variable as Variable + +# Rebuild models to resolve forward references +Dataset.model_rebuild() +TaxBenefitModelVersion.model_rebuild() +Variable.model_rebuild() +Parameter.model_rebuild() +ParameterNode.model_rebuild() +ParameterValue.model_rebuild() diff --git a/build/lib/policyengine/core/cache.py b/build/lib/policyengine/core/cache.py new file mode 100644 index 00000000..410301e4 --- /dev/null +++ b/build/lib/policyengine/core/cache.py @@ -0,0 +1,59 @@ +import logging +from collections import OrderedDict +from typing import Generic, Optional, TypeVar + +import psutil + +logger = logging.getLogger(__name__) + +_MEMORY_THRESHOLDS_GB = [8, 16, 32] +_warned_thresholds: set[int] = set() + +T = TypeVar("T") + + +class LRUCache(Generic[T]): + """Least-recently-used cache with configurable size limit and memory monitoring.""" + + 
def __init__(self, max_size: int = 100): + self._max_size = max_size + self._cache: OrderedDict[str, T] = OrderedDict() + + def get(self, key: str) -> Optional[T]: + """Get item from cache, marking it as recently used.""" + if key not in self._cache: + return None + self._cache.move_to_end(key) + return self._cache[key] + + def add(self, key: str, value: T) -> None: + """Add item to cache with LRU eviction when full.""" + if key in self._cache: + self._cache.move_to_end(key) + else: + self._cache[key] = value + if len(self._cache) > self._max_size: + self._cache.popitem(last=False) + + self._check_memory_usage() + + def clear(self) -> None: + """Clear all items from cache.""" + self._cache.clear() + _warned_thresholds.clear() + + def __len__(self) -> int: + return len(self._cache) + + def _check_memory_usage(self) -> None: + """Check memory usage and warn at threshold crossings.""" + process = psutil.Process() + memory_gb = process.memory_info().rss / (1024**3) + + for threshold in _MEMORY_THRESHOLDS_GB: + if memory_gb >= threshold and threshold not in _warned_thresholds: + logger.warning( + f"Memory usage has reached {memory_gb:.2f}GB (threshold: {threshold}GB). " + f"Cache contains {len(self._cache)} items." 
+ ) + _warned_thresholds.add(threshold) diff --git a/build/lib/policyengine/core/dataset.py b/build/lib/policyengine/core/dataset.py new file mode 100644 index 00000000..27f51d16 --- /dev/null +++ b/build/lib/policyengine/core/dataset.py @@ -0,0 +1,396 @@ +from typing import Optional +from uuid import uuid4 + +import numpy as np +import pandas as pd +from microdf import MicroDataFrame +from pydantic import BaseModel, ConfigDict, Field + +from .dataset_version import DatasetVersion +from .tax_benefit_model import TaxBenefitModel + + +class YearData(BaseModel): + """Base class for entity-level data for a single year.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @property + def entity_data(self) -> dict[str, MicroDataFrame]: + """Return a dictionary of entity names to their data. + + This should be implemented by subclasses to return the appropriate entities. + """ + raise NotImplementedError("Subclasses must implement entity_data property") + + @property + def person_entity(self) -> str: + """Return the name of the person-level entity. + + Defaults to 'person' but can be overridden by subclasses. + """ + return "person" + + def map_to_entity( + self, + source_entity: str, + target_entity: str, + columns: list[str] = None, + values: list = None, + how: str = "sum", + ) -> MicroDataFrame: + """Map data from source entity to target entity using join keys. + + Args: + source_entity (str): The source entity name. + target_entity (str): The target entity name. + columns (list[str], optional): List of column names to map. If None, maps all columns. + values (list, optional): List of values to use instead of column data. + how (str): Aggregation method ('sum' or 'first') when mapping to higher-level entities (default 'sum'). + + Returns: + MicroDataFrame: The mapped data at the target entity level. + + Raises: + ValueError: If source or target entity is invalid. 
+ """ + return map_to_entity( + entity_data=self.entity_data, + source_entity=source_entity, + target_entity=target_entity, + person_entity=self.person_entity, + columns=columns, + values=values, + how=how, + ) + + +class Dataset(BaseModel): + """Base class for datasets. + + The data field contains entity-level data as a BaseModel with DataFrame fields. + + Example: + class YearData(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + person: pd.DataFrame + household: pd.DataFrame + + class MyDataset(Dataset): + data: Optional[YearData] = None + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + id: str = Field(default_factory=lambda: str(uuid4())) + name: str + description: str + dataset_version: Optional[DatasetVersion] = None + filepath: str + is_output_dataset: bool = False + tax_benefit_model: Optional[TaxBenefitModel] = None + year: int + + data: Optional[BaseModel] = None + + +def map_to_entity( + entity_data: dict[str, MicroDataFrame], + source_entity: str, + target_entity: str, + person_entity: str = "person", + columns: Optional[list[str]] = None, + values: Optional[np.ndarray] = None, + how: str = "sum", +) -> MicroDataFrame: + """Map data from source entity to target entity using join keys. + + This is a generic entity mapping utility that handles: + - Same entity mapping (returns as is) + - Person to group entity mapping (aggregates values) + - Group to person entity mapping (expands values) + - Group to group entity mapping (aggregates through person entity) + + Args: + entity_data: Dictionary mapping entity names to their MicroDataFrame data + source_entity: The source entity name + target_entity: The target entity name + person_entity: The name of the person entity (default "person") + columns: List of column names to map. If None, maps all columns + values: List of values to use instead of column data. 
If provided, creates a single unnamed column + how: Aggregation method (default 'sum') + - For person → group: 'sum' (aggregate), 'first' (take first value) + - For group → person: 'project' (broadcast), 'divide' (split equally) + - For group → group: 'sum', 'first', 'project', 'divide' + + Returns: + MicroDataFrame: The mapped data at the target entity level + + Raises: + ValueError: If source or target entity is invalid or unsupported aggregation method + """ + valid_entities = set(entity_data.keys()) + + if source_entity not in valid_entities: + raise ValueError( + f"Invalid source entity '{source_entity}'. Must be one of {valid_entities}" + ) + if target_entity not in valid_entities: + raise ValueError( + f"Invalid target entity '{target_entity}'. Must be one of {valid_entities}" + ) + + # Get source data (convert to plain DataFrame to avoid weighted operations during mapping) + source_df = pd.DataFrame(entity_data[source_entity]) + + # Track if we should return a MicroSeries (values is a numpy array, not a list) + return_series = values is not None + + # Handle values parameter - create a temporary column with the provided values + if values is not None: + if len(values) != len(source_df): + raise ValueError( + f"Length of values ({len(values)}) must match source entity length ({len(source_df)})" + ) + # Create a temporary DataFrame with just ID columns and the values column + id_cols = {col for col in source_df.columns if col.endswith("_id")} + source_df = source_df[[col for col in id_cols]] + source_df["__mapped_value"] = values + columns = ["__mapped_value"] + + if columns: + # Select only requested columns (keep all ID columns for joins) + id_cols = {col for col in source_df.columns if col.endswith("_id")} + cols_to_keep = list(set(columns) | id_cols) + source_df = source_df[cols_to_keep] + + # Determine weight column for target entity + target_weight = f"{target_entity}_weight" + + # Same entity - return as is + if source_entity == target_entity: + result 
= MicroDataFrame(source_df, weights=target_weight) + if return_series: + return result["__mapped_value"] + return result + + # Get target data and key + target_df = entity_data[target_entity] + target_key = f"{target_entity}_id" + + # Person to group entity: aggregate person-level data to group level + if source_entity == person_entity and target_entity != person_entity: + # Check for both naming patterns: "entity_id" and "person_entity_id" + person_target_key = f"{person_entity}_{target_entity}_id" + join_key = ( + person_target_key if person_target_key in source_df.columns else target_key + ) + + if join_key in source_df.columns: + # Get columns to aggregate (exclude ID and weight columns) + id_cols = {col for col in source_df.columns if col.endswith("_id")} + weight_cols = {col for col in source_df.columns if col.endswith("_weight")} + agg_cols = [ + c + for c in source_df.columns + if c not in id_cols and c not in weight_cols + ] + + # Group by join key and aggregate + if how == "sum": + aggregated = source_df.groupby(join_key, as_index=False)[agg_cols].sum() + elif how == "first": + aggregated = source_df.groupby(join_key, as_index=False)[ + agg_cols + ].first() + else: + raise ValueError(f"Unsupported aggregation method: {how}") + + # Rename join key to target key if needed + if join_key != target_key: + aggregated = aggregated.rename(columns={join_key: target_key}) + + # Merge with target, preserving original order + target_pd = pd.DataFrame(target_df)[[target_key, target_weight]] + target_pd = target_pd.reset_index(drop=False) + result = target_pd.merge(aggregated, on=target_key, how="left") + + # Sort back to original order + result = ( + result.sort_values("index").drop("index", axis=1).reset_index(drop=True) + ) + + # Fill NaN with 0 for groups with no members in source entity + result[agg_cols] = result[agg_cols].fillna(0) + + result_df = MicroDataFrame(result, weights=target_weight) + if return_series: + return result_df["__mapped_value"] + return 
result_df + + # Group entity to person: expand group-level data to person level + if source_entity != person_entity and target_entity == person_entity: + # Default to 'project' (broadcast) for group -> person if 'sum' was provided + if how == "sum": + how = "project" + + source_key = f"{source_entity}_id" + # Check for both naming patterns + person_source_key = f"{person_entity}_{source_entity}_id" + + target_pd = pd.DataFrame(target_df) + join_key = ( + person_source_key if person_source_key in target_pd.columns else source_key + ) + + if join_key in target_pd.columns: + # Rename source key to match join key if needed + if join_key != source_key and source_key in source_df.columns: + source_df = source_df.rename(columns={source_key: join_key}) + + result = target_pd.merge(source_df, on=join_key, how="left") + + # Handle divide operation + if how == "divide": + # Get columns to divide (exclude ID and weight columns) + id_cols = {col for col in result.columns if col.endswith("_id")} + weight_cols = {col for col in result.columns if col.endswith("_weight")} + value_cols = [ + c + for c in result.columns + if c not in id_cols and c not in weight_cols + ] + + # Count members in each group + group_counts = ( + target_pd.groupby(join_key, as_index=False) + .size() + .rename(columns={"size": "__group_count"}) + ) + result = result.merge(group_counts, on=join_key, how="left") + + # Divide values by group count + for col in value_cols: + result[col] = result[col] / result["__group_count"] + + result = result.drop(columns=["__group_count"]) + elif how not in ["project"]: + raise ValueError( + f"Unsupported aggregation method for group->person: {how}. Use 'project' or 'divide'." 
+ ) + + result_df = MicroDataFrame(result, weights=target_weight) + if return_series: + return result_df["__mapped_value"] + return result_df + + # Group to group: go through person table + if source_entity != person_entity and target_entity != person_entity: + # Get person link table with both entity IDs + person_df = pd.DataFrame(entity_data[person_entity]) + source_key = f"{source_entity}_id" + + # Check for both naming patterns for person-level links + person_source_key = f"{person_entity}_{source_entity}_id" + person_target_key = f"{person_entity}_{target_entity}_id" + + # Determine which keys exist in person table + source_link_key = ( + person_source_key if person_source_key in person_df.columns else source_key + ) + target_link_key = ( + person_target_key if person_target_key in person_df.columns else target_key + ) + + # Link source -> person -> target + if ( + source_link_key in person_df.columns + and target_link_key in person_df.columns + ): + person_link = person_df[ + [source_link_key, target_link_key] + ].drop_duplicates() + + # Rename source key to match link key if needed + source_df_copy = source_df.copy() + if source_link_key != source_key and source_key in source_df_copy.columns: + source_df_copy = source_df_copy.rename( + columns={source_key: source_link_key} + ) + + # Join source data with target key + source_with_target = source_df_copy.merge( + person_link, on=source_link_key, how="left" + ) + + # Aggregate to target level + id_cols = {col for col in source_with_target.columns if col.endswith("_id")} + weight_cols = { + col for col in source_with_target.columns if col.endswith("_weight") + } + agg_cols = [ + c + for c in source_with_target.columns + if c not in id_cols and c not in weight_cols + ] + + if how == "sum": + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].sum() + elif how == "first": + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].first() + elif 
how == "project": + # Just take first value (broadcast to target groups) + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].first() + elif how == "divide": + # Count persons in each source group + source_group_counts = ( + person_df.groupby(source_link_key, as_index=False) + .size() + .rename(columns={"size": "__source_count"}) + ) + source_with_target = source_with_target.merge( + source_group_counts, on=source_link_key, how="left" + ) + + # Divide values by source group count (per-person share) + for col in agg_cols: + source_with_target[col] = ( + source_with_target[col] / source_with_target["__source_count"] + ) + + # Now aggregate (sum of per-person shares) to target level + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].sum() + else: + raise ValueError(f"Unsupported aggregation method: {how}") + + # Rename target link key to target key if needed + if target_link_key != target_key: + aggregated = aggregated.rename(columns={target_link_key: target_key}) + + # Merge with target, preserving original order + target_pd = pd.DataFrame(target_df)[[target_key, target_weight]] + target_pd = target_pd.reset_index(drop=False) + result = target_pd.merge(aggregated, on=target_key, how="left") + + # Sort back to original order + result = ( + result.sort_values("index").drop("index", axis=1).reset_index(drop=True) + ) + + # Fill NaN with 0 + result[agg_cols] = result[agg_cols].fillna(0) + + result_df = MicroDataFrame(result, weights=target_weight) + if return_series: + return result_df["__mapped_value"] + return result_df + + raise ValueError(f"Unsupported mapping from {source_entity} to {target_entity}") diff --git a/build/lib/policyengine/core/dataset_version.py b/build/lib/policyengine/core/dataset_version.py new file mode 100644 index 00000000..711cd7d7 --- /dev/null +++ b/build/lib/policyengine/core/dataset_version.py @@ -0,0 +1,16 @@ +from typing import TYPE_CHECKING +from uuid import 
uuid4 + +from pydantic import BaseModel, Field + +from .tax_benefit_model import TaxBenefitModel + +if TYPE_CHECKING: + from .dataset import Dataset + + +class DatasetVersion(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + dataset: "Dataset" + description: str + tax_benefit_model: TaxBenefitModel = None diff --git a/build/lib/policyengine/core/dynamic.py b/build/lib/policyengine/core/dynamic.py new file mode 100644 index 00000000..d707b9b2 --- /dev/null +++ b/build/lib/policyengine/core/dynamic.py @@ -0,0 +1,47 @@ +from collections.abc import Callable +from datetime import datetime +from typing import Optional +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from .parameter_value import ParameterValue + + +class Dynamic(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + name: str + description: Optional[str] = None + parameter_values: list[ParameterValue] = [] + simulation_modifier: Optional[Callable] = None + created_at: datetime = Field(default_factory=datetime.now) + updated_at: datetime = Field(default_factory=datetime.now) + + def __add__(self, other: "Dynamic") -> "Dynamic": + """Combine two dynamics by appending parameter values and chaining simulation modifiers.""" + if not isinstance(other, Dynamic): + return NotImplemented + + # Combine simulation modifiers + combined_modifier = None + if ( + self.simulation_modifier is not None + and other.simulation_modifier is not None + ): + + def combined_modifier(sim): + sim = self.simulation_modifier(sim) + sim = other.simulation_modifier(sim) + return sim + + elif self.simulation_modifier is not None: + combined_modifier = self.simulation_modifier + elif other.simulation_modifier is not None: + combined_modifier = other.simulation_modifier + + return Dynamic( + name=f"{self.name} + {other.name}", + description=f"Combined dynamic: {self.name} and {other.name}", + parameter_values=self.parameter_values + other.parameter_values, + 
simulation_modifier=combined_modifier, + ) diff --git a/build/lib/policyengine/core/output.py b/build/lib/policyengine/core/output.py new file mode 100644 index 00000000..e71634ab --- /dev/null +++ b/build/lib/policyengine/core/output.py @@ -0,0 +1,26 @@ +from typing import Generic, List, TypeVar + +import pandas as pd +from pydantic import BaseModel, ConfigDict + +T = TypeVar("T", bound="Output") + + +class Output(BaseModel): + """Base class for all output templates.""" + + def run(self): + """Calculate and populate the output fields. + + Must be implemented by subclasses. + """ + raise NotImplementedError("Subclasses must implement run()") + + +class OutputCollection(BaseModel, Generic[T]): + """Container for a collection of outputs with their DataFrame representation.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + outputs: List[T] + dataframe: pd.DataFrame diff --git a/build/lib/policyengine/core/parameter.py b/build/lib/policyengine/core/parameter.py new file mode 100644 index 00000000..49f2b282 --- /dev/null +++ b/build/lib/policyengine/core/parameter.py @@ -0,0 +1,61 @@ +from typing import TYPE_CHECKING, Any, Optional +from uuid import uuid4 + +from pydantic import BaseModel, Field, PrivateAttr + +from .parameter_value import ParameterValue +from .tax_benefit_model_version import TaxBenefitModelVersion + +if TYPE_CHECKING: + from .parameter_value import ParameterValue + + +class Parameter(BaseModel): + model_config = {"arbitrary_types_allowed": True} + + id: str = Field(default_factory=lambda: str(uuid4())) + name: str + label: Optional[str] = None + description: Optional[str] = None + data_type: Optional[type] = None + tax_benefit_model_version: TaxBenefitModelVersion + unit: Optional[str] = None + + # Lazy loading: store core param ref, build values on demand + _core_param: Any = PrivateAttr(default=None) + _parameter_values: Optional[list["ParameterValue"]] = PrivateAttr(default=None) + + def __init__(self, _core_param: Any = None, 
**data): + super().__init__(**data) + self._core_param = _core_param + self._parameter_values = None + + @property + def parameter_values(self) -> list["ParameterValue"]: + """Lazily build parameter values on first access.""" + if self._parameter_values is None: + self._parameter_values = [] + if self._core_param is not None: + from policyengine.utils import parse_safe_date + + for i in range(len(self._core_param.values_list)): + param_at_instant = self._core_param.values_list[i] + if i + 1 < len(self._core_param.values_list): + next_instant = self._core_param.values_list[i + 1] + else: + next_instant = None + pv = ParameterValue( + parameter=self, + start_date=parse_safe_date(param_at_instant.instant_str), + end_date=parse_safe_date(next_instant.instant_str) + if next_instant + else None, + value=param_at_instant.value, + ) + self._parameter_values.append(pv) + return self._parameter_values + + @parameter_values.setter + def parameter_values(self, value: list["ParameterValue"]) -> None: + """Allow direct setting of parameter values.""" + self._parameter_values = value diff --git a/build/lib/policyengine/core/parameter_node.py b/build/lib/policyengine/core/parameter_node.py new file mode 100644 index 00000000..54d384a5 --- /dev/null +++ b/build/lib/policyengine/core/parameter_node.py @@ -0,0 +1,29 @@ +from typing import TYPE_CHECKING, Optional +from uuid import uuid4 + +from pydantic import BaseModel, Field + +if TYPE_CHECKING: + from .tax_benefit_model_version import TaxBenefitModelVersion + + +class ParameterNode(BaseModel): + """Represents a folder/category node in the parameter hierarchy. + + Parameter nodes are intermediate nodes in the parameter tree (e.g., "gov", + "gov.hmrc", "gov.hmrc.income_tax"). They provide structure and human-readable + labels for navigating the parameter tree, but don't have values themselves. + + Unlike Parameter objects (which are leaf nodes with actual values), + ParameterNode objects are purely organizational. 
+ """ + + model_config = {"arbitrary_types_allowed": True} + + id: str = Field(default_factory=lambda: str(uuid4())) + name: str = Field(description="Full path of the node (e.g., 'gov.hmrc')") + label: Optional[str] = Field( + default=None, description="Human-readable label (e.g., 'HMRC')" + ) + description: Optional[str] = Field(default=None, description="Node description") + tax_benefit_model_version: "TaxBenefitModelVersion" diff --git a/build/lib/policyengine/core/parameter_value.py b/build/lib/policyengine/core/parameter_value.py new file mode 100644 index 00000000..a51ffeb0 --- /dev/null +++ b/build/lib/policyengine/core/parameter_value.py @@ -0,0 +1,16 @@ +from datetime import datetime +from typing import TYPE_CHECKING, Optional, Union +from uuid import uuid4 + +from pydantic import BaseModel, Field + +if TYPE_CHECKING: + from .parameter import Parameter + + +class ParameterValue(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + parameter: "Optional[Parameter]" = None + value: Optional[Union[float, int, str, bool, list]] = None + start_date: datetime + end_date: Optional[datetime] = None diff --git a/build/lib/policyengine/core/policy.py b/build/lib/policyengine/core/policy.py new file mode 100644 index 00000000..3860a817 --- /dev/null +++ b/build/lib/policyengine/core/policy.py @@ -0,0 +1,47 @@ +from collections.abc import Callable +from datetime import datetime +from typing import Optional +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from .parameter_value import ParameterValue + + +class Policy(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + name: str + description: Optional[str] = None + parameter_values: list[ParameterValue] = [] + simulation_modifier: Optional[Callable] = None + created_at: datetime = Field(default_factory=datetime.now) + updated_at: datetime = Field(default_factory=datetime.now) + + def __add__(self, other: "Policy") -> "Policy": + """Combine two policies by appending 
parameter values and chaining simulation modifiers.""" + if not isinstance(other, Policy): + return NotImplemented + + # Combine simulation modifiers + combined_modifier = None + if ( + self.simulation_modifier is not None + and other.simulation_modifier is not None + ): + + def combined_modifier(sim): + sim = self.simulation_modifier(sim) + sim = other.simulation_modifier(sim) + return sim + + elif self.simulation_modifier is not None: + combined_modifier = self.simulation_modifier + elif other.simulation_modifier is not None: + combined_modifier = other.simulation_modifier + + return Policy( + name=f"{self.name} + {other.name}", + description=f"Combined policy: {self.name} and {other.name}", + parameter_values=self.parameter_values + other.parameter_values, + simulation_modifier=combined_modifier, + ) diff --git a/build/lib/policyengine/core/region.py b/build/lib/policyengine/core/region.py new file mode 100644 index 00000000..7ff55a64 --- /dev/null +++ b/build/lib/policyengine/core/region.py @@ -0,0 +1,212 @@ +"""Region definitions for geographic simulations. + +This module provides the Region and RegionRegistry classes for defining +geographic regions that a tax-benefit model supports. Regions can have: +1. A dedicated dataset (e.g., US states, congressional districts) +2. Filter from a parent region's dataset (e.g., US places/cities, UK countries) +""" + +from typing import Literal, Optional, Union + +from pydantic import BaseModel, Field, PrivateAttr + +from .scoping_strategy import ScopingStrategy + +# Region type literals for US and UK +USRegionType = Literal["national", "state", "congressional_district", "place"] +UKRegionType = Literal["national", "country", "constituency", "local_authority"] +RegionType = Union[USRegionType, UKRegionType] + + +class Region(BaseModel): + """Geographic region for tax-benefit simulations. + + Regions can either have: + 1. A dedicated dataset (dataset_path is set, requires_filter is False) + 2. 
Filter from a parent region's dataset (requires_filter is True) + + The unique identifier is the code field, which uses a prefixed format: + - National: "us", "uk" + - State: "state/ca", "state/ny" + - Congressional District: "congressional_district/CA-01" + - Place: "place/NJ-57000" + - UK Country: "country/england" + - Constituency: "constituency/Sheffield Central" + - Local Authority: "local_authority/E09000001" + """ + + # Core identification + code: str = Field( + ..., + description="Unique region code with type prefix (e.g., 'state/ca', 'place/NJ-57000')", + ) + label: str = Field(..., description="Human-readable label (e.g., 'California')") + region_type: RegionType = Field( + ..., description="Type of region (e.g., 'state', 'place')" + ) + + # Hierarchy + parent_code: Optional[str] = Field( + default=None, + description="Code of parent region (e.g., 'us' for states, 'state/nj' for places in New Jersey)", + ) + + # Dataset configuration + dataset_path: Optional[str] = Field( + default=None, + description="GCS path to dedicated dataset (e.g., 'gs://policyengine-us-data/states/CA.h5')", + ) + + # Scoping strategy (preferred over legacy filter fields) + scoping_strategy: Optional[ScopingStrategy] = Field( + default=None, + description="Strategy for scoping dataset to this region (row filtering or weight replacement)", + ) + + # Legacy filtering configuration (kept for backward compatibility) + requires_filter: bool = Field( + default=False, + description="True if this region filters from a parent dataset rather than having its own", + ) + filter_field: Optional[str] = Field( + default=None, + description="Dataset field to filter on (e.g., 'place_fips', 'country')", + ) + filter_value: Optional[str] = Field( + default=None, + description="Value to match when filtering (defaults to code suffix if not set)", + ) + + # Metadata (primarily for US congressional districts) + state_code: Optional[str] = Field( + default=None, description="Two-letter state code (e.g., 
'CA', 'NJ')" + ) + state_name: Optional[str] = Field( + default=None, + description="Full state name (e.g., 'California', 'New Jersey')", + ) + + def __hash__(self) -> int: + """Hash by code for use in sets and dict keys.""" + return hash(self.code) + + def __eq__(self, other: object) -> bool: + """Equality by code.""" + if not isinstance(other, Region): + return False + return self.code == other.code + + +class RegionRegistry(BaseModel): + """Registry of all regions for a country model. + + Provides indexed lookups for regions by code and type. + Indices are rebuilt automatically after initialization. + """ + + country_id: str = Field(..., description="Country identifier (e.g., 'us', 'uk')") + regions: list[Region] = Field(default_factory=list) + + # Private indexed lookups (excluded from serialization) + _by_code: dict[str, Region] = PrivateAttr(default_factory=dict) + _by_type: dict[str, list[Region]] = PrivateAttr(default_factory=dict) + + def model_post_init(self, __context: object) -> None: + """Build lookup indices after initialization.""" + self._rebuild_indices() + + def _rebuild_indices(self) -> None: + """Rebuild all lookup indices from the regions list.""" + self._by_code = {} + self._by_type = {} + + for region in self.regions: + # Index by code + self._by_code[region.code] = region + + # Index by type + if region.region_type not in self._by_type: + self._by_type[region.region_type] = [] + self._by_type[region.region_type].append(region) + + def add_region(self, region: Region) -> None: + """Add a region to the registry and update indices.""" + self.regions.append(region) + self._by_code[region.code] = region + if region.region_type not in self._by_type: + self._by_type[region.region_type] = [] + self._by_type[region.region_type].append(region) + + def get(self, code: str) -> Optional[Region]: + """Get a region by its code. 
+ + Args: + code: Region code (e.g., 'state/ca', 'place/NJ-57000') + + Returns: + The Region if found, None otherwise + """ + return self._by_code.get(code) + + def get_by_type(self, region_type: str) -> list[Region]: + """Get all regions of a given type. + + Args: + region_type: Type to filter by (e.g., 'state', 'place') + + Returns: + List of regions with the given type + """ + return self._by_type.get(region_type, []) + + def get_national(self) -> Optional[Region]: + """Get the national-level region. + + Returns: + The national Region if found, None otherwise + """ + national = self.get_by_type("national") + return national[0] if national else None + + def get_children(self, parent_code: str) -> list[Region]: + """Get all regions with a given parent code. + + Args: + parent_code: Parent region code to filter by + + Returns: + List of regions with the given parent + """ + return [r for r in self.regions if r.parent_code == parent_code] + + def get_dataset_regions(self) -> list[Region]: + """Get all regions that have dedicated datasets. + + Returns: + List of regions with dataset_path set and requires_filter False + """ + return [ + r + for r in self.regions + if r.dataset_path is not None and not r.requires_filter + ] + + def get_filter_regions(self) -> list[Region]: + """Get all regions that require filtering from parent datasets. 
+ + Returns: + List of regions with requires_filter True + """ + return [r for r in self.regions if r.requires_filter] + + def __len__(self) -> int: + """Return the number of regions in the registry.""" + return len(self.regions) + + def __iter__(self): + """Iterate over regions.""" + return iter(self.regions) + + def __contains__(self, code: str) -> bool: + """Check if a region code exists in the registry.""" + return code in self._by_code diff --git a/build/lib/policyengine/core/release_manifest.py b/build/lib/policyengine/core/release_manifest.py new file mode 100644 index 00000000..90a09f32 --- /dev/null +++ b/build/lib/policyengine/core/release_manifest.py @@ -0,0 +1,432 @@ +import os +from functools import lru_cache +from importlib import import_module +from importlib.resources import files +from pathlib import Path +from typing import Optional + +import requests +from pydantic import BaseModel, Field + +HF_REQUEST_TIMEOUT_SECONDS = 30 +LOCAL_DATA_REPO_HINTS = { + "us": ("policyengine_us", "policyengine-us-data", "policyengine_us_data"), + "uk": ("policyengine_uk", "policyengine-uk-data", "policyengine_uk_data"), +} + + +class DataReleaseManifestUnavailableError(ValueError): + """Raised when a data release manifest cannot be fetched or is absent.""" + + +class PackageVersion(BaseModel): + name: str + version: str + + +class DataPackageVersion(PackageVersion): + repo_id: str + repo_type: str = "model" + release_manifest_path: str = "release_manifest.json" + + +class CompatibleModelPackage(BaseModel): + name: str + specifier: str + + +class BuiltWithModelPackage(PackageVersion): + git_sha: Optional[str] = None + data_build_fingerprint: Optional[str] = None + + +class DataBuildInfo(BaseModel): + build_id: Optional[str] = None + built_at: Optional[str] = None + built_with_model_package: Optional[BuiltWithModelPackage] = None + + +class ArtifactPathReference(BaseModel): + path: str + + +class ArtifactPathTemplate(BaseModel): + path_template: str + + def 
resolve(self, **kwargs: str) -> str: + return self.path_template.format(**kwargs) + + +class DataReleaseArtifact(BaseModel): + kind: str + path: str + repo_id: str + revision: str + sha256: Optional[str] = None + size_bytes: Optional[int] = None + + @property + def uri(self) -> str: + return build_hf_uri( + repo_id=self.repo_id, + path_in_repo=self.path, + revision=self.revision, + ) + + +class DataReleaseManifest(BaseModel): + schema_version: int + data_package: PackageVersion + compatible_model_packages: list[CompatibleModelPackage] = Field( + default_factory=list + ) + default_datasets: dict[str, str] = Field(default_factory=dict) + build: Optional[DataBuildInfo] = None + artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict) + + +class DataCertification(BaseModel): + compatibility_basis: str + certified_for_model_version: str + data_build_id: Optional[str] = None + built_with_model_version: Optional[str] = None + built_with_model_git_sha: Optional[str] = None + data_build_fingerprint: Optional[str] = None + certified_by: Optional[str] = None + + +class CertifiedDataArtifact(BaseModel): + data_package: Optional[PackageVersion] = None + dataset: str + uri: str + sha256: Optional[str] = None + build_id: Optional[str] = None + + +class CountryReleaseManifest(BaseModel): + schema_version: int = 1 + bundle_id: Optional[str] = None + published_at: Optional[str] = None + country_id: str + policyengine_version: str + model_package: PackageVersion + data_package: DataPackageVersion + default_dataset: str + datasets: dict[str, ArtifactPathReference] = Field(default_factory=dict) + region_datasets: dict[str, ArtifactPathTemplate] = Field(default_factory=dict) + certified_data_artifact: Optional[CertifiedDataArtifact] = None + certification: Optional[DataCertification] = None + + @property + def default_dataset_uri(self) -> str: + if ( + self.certified_data_artifact is not None + and self.certified_data_artifact.dataset == self.default_dataset + ): + return 
self.certified_data_artifact.uri + return resolve_dataset_reference(self.country_id, self.default_dataset) + + +def build_hf_uri(repo_id: str, path_in_repo: str, revision: str) -> str: + return f"hf://{repo_id}/{path_in_repo}@{revision}" + + +@lru_cache +def get_release_manifest(country_id: str) -> CountryReleaseManifest: + manifest_path = files("policyengine").joinpath( + "data", "release_manifests", f"{country_id}.json" + ) + if not manifest_path.is_file(): + raise ValueError(f"No bundled release manifest for country '{country_id}'") + + return CountryReleaseManifest.model_validate_json(manifest_path.read_text()) + + +def _data_release_manifest_url(data_package: DataPackageVersion) -> str: + return ( + "https://huggingface.co/" + f"{data_package.repo_id}/resolve/{data_package.version}/" + f"{data_package.release_manifest_path}" + ) + + +@lru_cache +def get_data_release_manifest(country_id: str) -> DataReleaseManifest: + country_manifest = get_release_manifest(country_id) + data_package = country_manifest.data_package + + headers = {} + token = os.environ.get("HUGGING_FACE_TOKEN") + if token: + headers["Authorization"] = f"Bearer {token}" + + response = requests.get( + _data_release_manifest_url(data_package), + headers=headers, + timeout=HF_REQUEST_TIMEOUT_SECONDS, + ) + if response.status_code in (401, 403): + raise DataReleaseManifestUnavailableError( + "Could not fetch the data release manifest from Hugging Face. " + "If this country uses a private data repo, set HUGGING_FACE_TOKEN." + ) + if response.status_code == 404: + raise DataReleaseManifestUnavailableError( + "No data release manifest was published for this data package." 
+ ) + response.raise_for_status() + return DataReleaseManifest.model_validate_json(response.text) + + +def _specifier_matches(version: str, specifier: str) -> bool: + if specifier.startswith("=="): + return version == specifier[2:] + return False + + +def certify_data_release_compatibility( + country_id: str, + runtime_model_version: str, + runtime_data_build_fingerprint: Optional[str] = None, +) -> DataCertification: + country_manifest = get_release_manifest(country_id) + try: + data_release_manifest = get_data_release_manifest(country_id) + except DataReleaseManifestUnavailableError as exc: + bundled_certification = country_manifest.certification + if ( + bundled_certification is not None + and bundled_certification.certified_for_model_version + == runtime_model_version + ): + if ( + runtime_data_build_fingerprint is not None + and bundled_certification.data_build_fingerprint is not None + and runtime_data_build_fingerprint + != bundled_certification.data_build_fingerprint + ): + raise ValueError( + "Runtime data build fingerprint does not match the bundled " + "data certification." + ) + return bundled_certification + raise exc + built_with_model = ( + data_release_manifest.build.built_with_model_package + if data_release_manifest.build is not None + else None + ) + + if ( + built_with_model is not None + and built_with_model.name != country_manifest.model_package.name + ): + raise ValueError( + "Data release manifest was built with a different model package: " + f"expected {country_manifest.model_package.name}, " + f"got {built_with_model.name}." 
+ ) + + if ( + built_with_model is not None + and built_with_model.version == runtime_model_version + ): + return DataCertification( + compatibility_basis="exact_build_model_version", + certified_for_model_version=runtime_model_version, + data_build_id=( + data_release_manifest.build.build_id + if data_release_manifest.build is not None + else None + ), + built_with_model_version=built_with_model.version, + built_with_model_git_sha=built_with_model.git_sha, + data_build_fingerprint=built_with_model.data_build_fingerprint, + ) + + if ( + built_with_model is not None + and built_with_model.data_build_fingerprint is not None + and runtime_data_build_fingerprint is not None + and built_with_model.data_build_fingerprint == runtime_data_build_fingerprint + ): + return DataCertification( + compatibility_basis="matching_data_build_fingerprint", + certified_for_model_version=runtime_model_version, + data_build_id=( + data_release_manifest.build.build_id + if data_release_manifest.build is not None + else None + ), + built_with_model_version=built_with_model.version, + built_with_model_git_sha=built_with_model.git_sha, + data_build_fingerprint=built_with_model.data_build_fingerprint, + ) + + for compatible_model_package in data_release_manifest.compatible_model_packages: + if compatible_model_package.name != country_manifest.model_package.name: + continue + if _specifier_matches( + version=runtime_model_version, + specifier=compatible_model_package.specifier, + ): + return DataCertification( + compatibility_basis="legacy_compatible_model_package", + certified_for_model_version=runtime_model_version, + data_build_id=( + data_release_manifest.build.build_id + if data_release_manifest.build is not None + else None + ), + built_with_model_version=( + built_with_model.version if built_with_model is not None else None + ), + built_with_model_git_sha=( + built_with_model.git_sha if built_with_model is not None else None + ), + data_build_fingerprint=( + 
built_with_model.data_build_fingerprint + if built_with_model is not None + else None + ), + ) + + raise ValueError( + "Data release manifest is not certified for the runtime model version " + f"{runtime_model_version} in country '{country_id}'." + ) + + +def resolve_dataset_reference(country_id: str, dataset: str) -> str: + if "://" in dataset: + return dataset + + manifest = get_release_manifest(country_id) + path_reference = manifest.datasets.get(dataset) + if path_reference is not None: + return build_hf_uri( + repo_id=manifest.data_package.repo_id, + path_in_repo=path_reference.path, + revision=manifest.data_package.version, + ) + + data_release_manifest = get_data_release_manifest(country_id) + artifact = data_release_manifest.artifacts.get(dataset) + if artifact is None: + raise ValueError( + f"Unknown dataset '{dataset}' for country '{country_id}'. " + f"Known datasets: {sorted(manifest.datasets)}" + ) + + return artifact.uri + + +def resolve_managed_dataset_reference( + country_id: str, + dataset: Optional[str] = None, + *, + allow_unmanaged: bool = False, +) -> str: + """Resolve a dataset reference under policyengine.py bundle enforcement. + + Managed mode pins dataset selection to the bundled `policyengine.py` + release manifest. Callers can: + + - omit `dataset` to use the certified default dataset for the bundle + - pass a logical dataset name present in the bundled/data-release manifests + + Direct URLs or raw Hugging Face references are treated as unmanaged unless + `allow_unmanaged=True` is set explicitly. + """ + + manifest = get_release_manifest(country_id) + if dataset is None: + return manifest.default_dataset_uri + + if "://" in dataset: + if dataset == manifest.default_dataset_uri: + return dataset + if allow_unmanaged: + return dataset + raise ValueError( + "Explicit dataset URIs bypass the policyengine.py release bundle. " + "Pass a manifest dataset name or omit `dataset` to use the certified " + "default dataset. 
Set `allow_unmanaged=True` only if you intend to " + "bypass bundle enforcement." + ) + + return resolve_dataset_reference(country_id, dataset) + + +def resolve_local_managed_dataset_source( + country_id: str, + dataset_uri: str, + *, + allow_local_mirror: bool = True, +) -> str: + """Resolve a local mirror of a managed dataset when available. + + This preserves the bundled dataset URI for provenance while allowing local + development environments with sibling data-repo checkouts to load the + exact certified artifact from disk rather than re-downloading it. + """ + + if not allow_local_mirror or not dataset_uri.startswith("hf://"): + return dataset_uri + + local_hint = LOCAL_DATA_REPO_HINTS.get(country_id) + if local_hint is None: + return dataset_uri + + path_without_revision = dataset_uri[5:].rsplit("@", 1)[0] + parts = path_without_revision.split("/", 2) + if len(parts) != 3: + return dataset_uri + _, _, path_in_repo = parts + + model_module_name, data_repo_name, data_package_name = local_hint + try: + model_module = import_module(model_module_name) + except ImportError: + return dataset_uri + + repo_root = Path(model_module.__file__).resolve().parents[1] + local_path = ( + repo_root.with_name(data_repo_name) + / data_package_name + / "storage" + / path_in_repo + ) + if local_path.exists(): + return str(local_path) + return dataset_uri + + +def dataset_logical_name(dataset: str) -> str: + return Path(dataset.rsplit("@", 1)[0]).stem + + +def resolve_default_datasets(country_id: str) -> list[str]: + manifest = get_release_manifest(country_id) + return list(manifest.datasets.keys()) + + +def resolve_region_dataset_path( + country_id: str, + region_type: str, + **kwargs: str, +) -> Optional[str]: + manifest = get_release_manifest(country_id) + template = manifest.region_datasets.get(region_type) + if template is None: + return None + + resolved_path = template.resolve(**kwargs) + if "://" in resolved_path: + return resolved_path + + return build_hf_uri( + 
repo_id=manifest.data_package.repo_id, + path_in_repo=resolved_path, + revision=manifest.data_package.version, + ) diff --git a/build/lib/policyengine/core/scoping_strategy.py b/build/lib/policyengine/core/scoping_strategy.py new file mode 100644 index 00000000..7d9b5126 --- /dev/null +++ b/build/lib/policyengine/core/scoping_strategy.py @@ -0,0 +1,224 @@ +"""Region scoping strategies for geographic simulations. + +Provides two concrete strategies for scoping datasets to sub-national regions: + +1. RowFilterStrategy: Filters dataset rows where a household variable matches + a specific value (e.g., UK countries by 'country' field, US places by 'place_fips'). + +2. WeightReplacementStrategy: Replaces household weights from a pre-computed weight + matrix stored in GCS (e.g., UK constituencies and local authorities). +""" + +import logging +from abc import abstractmethod +from pathlib import Path +from typing import Annotated, Literal, Optional, Union + +import h5py +import numpy as np +import pandas as pd +from microdf import MicroDataFrame +from pydantic import BaseModel, Discriminator + +from policyengine.utils.entity_utils import ( + filter_dataset_by_household_variable, +) + +logger = logging.getLogger(__name__) + + +class RegionScopingStrategy(BaseModel): + """Base class for region scoping strategies. + + Subclasses implement apply() to scope a dataset's entity data + to a specific sub-national region. + """ + + strategy_type: str + + @abstractmethod + def apply( + self, + entity_data: dict[str, MicroDataFrame], + group_entities: list[str], + year: int, + ) -> dict[str, MicroDataFrame]: + """Apply the scoping strategy to entity data. + + Args: + entity_data: Dict mapping entity names to their MicroDataFrames. + group_entities: List of group entity names for this country. + year: The simulation year (used for time-indexed weight matrices). + + Returns: + A dict mapping entity names to scoped MicroDataFrames. 
+ """ + + @property + def cache_key(self) -> str: + """Return a string key for deterministic simulation ID hashing.""" + return f"{self.strategy_type}:{self.model_dump_json()}" + + +class RowFilterStrategy(RegionScopingStrategy): + """Scoping strategy that filters dataset rows by a household variable. + + Used for regions where we want to keep only households matching a + specific variable value (e.g., UK countries, US places/cities). + """ + + strategy_type: Literal["row_filter"] = "row_filter" + variable_name: str + variable_value: str + + def apply( + self, + entity_data: dict[str, MicroDataFrame], + group_entities: list[str], + year: int, + ) -> dict[str, MicroDataFrame]: + return filter_dataset_by_household_variable( + entity_data=entity_data, + group_entities=group_entities, + variable_name=self.variable_name, + variable_value=self.variable_value, + ) + + @property + def cache_key(self) -> str: + return f"row_filter:{self.variable_name}={self.variable_value}" + + +class WeightReplacementStrategy(RegionScopingStrategy): + """Scoping strategy that replaces household weights from a pre-computed matrix. + + Used for UK constituencies and local authorities. Instead of removing + households, this strategy keeps all households but replaces their weights + with region-specific values from a weight matrix stored in GCS. + + The weight matrix is an HDF5 file with shape (N_regions x N_households), + where each row contains household weights for a specific region. + A companion CSV maps region codes/names to row indices. 
+ """ + + strategy_type: Literal["weight_replacement"] = "weight_replacement" + weight_matrix_bucket: str + weight_matrix_key: str + lookup_csv_bucket: str + lookup_csv_key: str + region_code: str + + def apply( + self, + entity_data: dict[str, MicroDataFrame], + group_entities: list[str], + year: int, + ) -> dict[str, MicroDataFrame]: + from policyengine_core.tools.google_cloud import download_gcs_file + + # Download lookup CSV and find region index + lookup_path = Path( + download_gcs_file( + bucket=self.lookup_csv_bucket, + file_path=self.lookup_csv_key, + ) + ) + lookup_df = pd.read_csv(lookup_path) + + region_id = self._find_region_index(lookup_df, self.region_code) + + # Download weight matrix and extract weights for this region + weights_path = download_gcs_file( + bucket=self.weight_matrix_bucket, + file_path=self.weight_matrix_key, + ) + with h5py.File(weights_path, "r") as f: + weights = f[str(year)][...] + + region_weights = weights[region_id] + + # Validate weight row length matches household count + household_df = pd.DataFrame(entity_data["household"]) + if len(region_weights) != len(household_df): + raise ValueError( + f"Weight matrix row length ({len(region_weights)}) does not match " + f"household count ({len(household_df)}) for region '{self.region_code}'. " + f"The weight matrix may be out of date." + ) + + # Replace household weights + result = {} + for entity_name, mdf in entity_data.items(): + df = pd.DataFrame(mdf) + if entity_name == "household": + df["household_weight"] = region_weights + result[entity_name] = MicroDataFrame(df, weights="household_weight") + else: + weight_col = f"{entity_name}_weight" + if weight_col in df.columns: + # Map new household weights to sub-entities via their + # household membership. Build a mapping from household_id + # to new weight. 
+ hh_ids = household_df["household_id"].values + weight_map = dict(zip(hh_ids, region_weights)) + + # Find the entity's household ID column + person_hh_col = self._find_household_id_column(df, entity_name) + if person_hh_col: + new_weights = np.array( + [ + weight_map.get(hh_id, 0.0) + for hh_id in df[person_hh_col].values + ] + ) + df[weight_col] = new_weights + + result[entity_name] = MicroDataFrame( + df, + weights=( + f"{entity_name}_weight" + if f"{entity_name}_weight" in df.columns + else None + ), + ) + + return result + + @staticmethod + def _find_region_index(lookup_df: pd.DataFrame, region_code: str) -> int: + """Find the row index for a region in the lookup CSV. + + Searches by 'code' column first, then 'name' column. + """ + if "code" in lookup_df.columns and region_code in lookup_df["code"].values: + return lookup_df[lookup_df["code"] == region_code].index[0] + if "name" in lookup_df.columns and region_code in lookup_df["name"].values: + return lookup_df[lookup_df["name"] == region_code].index[0] + raise ValueError( + f"Region '{region_code}' not found in lookup CSV. " + f"Available columns: {list(lookup_df.columns)}. " + f"Searched 'code' and 'name' columns." 
+ ) + + @staticmethod + def _find_household_id_column(df: pd.DataFrame, entity_name: str) -> Optional[str]: + """Find the column linking an entity to its household.""" + candidates = [ + "person_household_id", + f"{entity_name}_household_id", + "household_id", + ] + for col in candidates: + if col in df.columns: + return col + return None + + @property + def cache_key(self) -> str: + return f"weight_replacement:{self.weight_matrix_key}:{self.region_code}" + + +ScopingStrategy = Annotated[ + Union[RowFilterStrategy, WeightReplacementStrategy], + Discriminator("strategy_type"), +] diff --git a/build/lib/policyengine/core/simulation.py b/build/lib/policyengine/core/simulation.py new file mode 100644 index 00000000..6456e5bc --- /dev/null +++ b/build/lib/policyengine/core/simulation.py @@ -0,0 +1,111 @@ +import logging +from datetime import datetime +from typing import Optional +from uuid import uuid4 + +from pydantic import BaseModel, Field, model_validator + +from .cache import LRUCache +from .dataset import Dataset +from .dynamic import Dynamic +from .policy import Policy +from .scoping_strategy import RowFilterStrategy, ScopingStrategy +from .tax_benefit_model_version import TaxBenefitModelVersion + +logger = logging.getLogger(__name__) + +_cache: LRUCache["Simulation"] = LRUCache(max_size=100) + + +class Simulation(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + created_at: datetime = Field(default_factory=datetime.now) + updated_at: datetime = Field(default_factory=datetime.now) + + policy: Optional[Policy] = None + dynamic: Optional[Dynamic] = None + dataset: Dataset = None + + # Scoping strategy (preferred over legacy filter fields) + scoping_strategy: Optional[ScopingStrategy] = Field( + default=None, + description="Strategy for scoping dataset to a sub-national region", + ) + + # Legacy regional filtering parameters (kept for backward compatibility) + filter_field: Optional[str] = Field( + default=None, + description="Household-level 
variable to filter dataset by (e.g., 'place_fips', 'country')", + ) + filter_value: Optional[str] = Field( + default=None, + description="Value to match when filtering (e.g., '44000', 'ENGLAND')", + ) + + tax_benefit_model_version: TaxBenefitModelVersion = None + + @model_validator(mode="after") + def _auto_construct_strategy(self) -> "Simulation": + """Auto-construct a RowFilterStrategy from legacy filter fields. + + If filter_field and filter_value are set but scoping_strategy is not, + create a RowFilterStrategy for backward compatibility. + """ + if ( + self.scoping_strategy is None + and self.filter_field is not None + and self.filter_value is not None + ): + self.scoping_strategy = RowFilterStrategy( + variable_name=self.filter_field, + variable_value=self.filter_value, + ) + return self + + output_dataset: Optional[Dataset] = None + + def run(self): + self.tax_benefit_model_version.run(self) + + def ensure(self): + cached_result = _cache.get(self.id) + if cached_result: + self.output_dataset = cached_result.output_dataset + return + try: + self.tax_benefit_model_version.load(self) + except FileNotFoundError: + self.run() + self.save() + except Exception: + logger.warning( + "Unexpected error loading simulation %s; falling back to run()", + self.id, + exc_info=True, + ) + self.run() + self.save() + + _cache.add(self.id, self) + + def save(self): + """Save the simulation's output dataset.""" + self.tax_benefit_model_version.save(self) + + def load(self): + """Load the simulation's output dataset.""" + self.tax_benefit_model_version.load(self) + + @property + def release_bundle(self) -> dict[str, Optional[str]]: + bundle = ( + self.tax_benefit_model_version.release_bundle + if self.tax_benefit_model_version is not None + else {} + ) + return { + **bundle, + "dataset_filepath": self.dataset.filepath + if self.dataset is not None + else None, + } diff --git a/build/lib/policyengine/core/tax_benefit_model.py b/build/lib/policyengine/core/tax_benefit_model.py new 
file mode 100644 index 00000000..c2d4e26d --- /dev/null +++ b/build/lib/policyengine/core/tax_benefit_model.py @@ -0,0 +1,11 @@ +from typing import TYPE_CHECKING, Optional + +from pydantic import BaseModel + +if TYPE_CHECKING: + pass + + +class TaxBenefitModel(BaseModel): + id: str + description: Optional[str] = None diff --git a/build/lib/policyengine/core/tax_benefit_model_version.py b/build/lib/policyengine/core/tax_benefit_model_version.py new file mode 100644 index 00000000..7fb03334 --- /dev/null +++ b/build/lib/policyengine/core/tax_benefit_model_version.py @@ -0,0 +1,208 @@ +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Optional +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from .release_manifest import CountryReleaseManifest, DataCertification, PackageVersion +from .tax_benefit_model import TaxBenefitModel + +if TYPE_CHECKING: + from .parameter import Parameter + from .parameter_node import ParameterNode + from .parameter_value import ParameterValue + from .region import Region, RegionRegistry + from .simulation import Simulation + from .variable import Variable + + +class TaxBenefitModelVersion(BaseModel): + model_config = {"arbitrary_types_allowed": True} + + id: str = Field(default_factory=lambda: str(uuid4())) + model: TaxBenefitModel + version: str + description: Optional[str] = None + created_at: Optional[datetime] = Field( + default_factory=lambda: datetime.now(timezone.utc) + ) + + variables: list["Variable"] = Field(default_factory=list) + parameters: list["Parameter"] = Field(default_factory=list) + parameter_nodes: list["ParameterNode"] = Field(default_factory=list) + + # Region registry for geographic simulations + region_registry: "Optional[RegionRegistry]" = Field( + default=None, description="Registry of supported geographic regions" + ) + release_manifest: Optional[CountryReleaseManifest] = Field( + default=None, + exclude=True, + ) + model_package: Optional[PackageVersion] = 
Field(default=None) + data_package: Optional[PackageVersion] = Field(default=None) + default_dataset_uri: Optional[str] = Field(default=None) + data_certification: Optional[DataCertification] = Field(default=None) + + @property + def parameter_values(self) -> list["ParameterValue"]: + """Aggregate all parameter values from all parameters.""" + return [ + pv for parameter in self.parameters for pv in parameter.parameter_values + ] + + # Lookup dicts for O(1) access (excluded from serialization) + variables_by_name: dict[str, "Variable"] = Field(default_factory=dict, exclude=True) + parameters_by_name: dict[str, "Parameter"] = Field( + default_factory=dict, exclude=True + ) + parameter_nodes_by_name: dict[str, "ParameterNode"] = Field( + default_factory=dict, exclude=True + ) + + def run(self, simulation: "Simulation") -> "Simulation": + raise NotImplementedError( + "The TaxBenefitModel class must define a method to execute simulations." + ) + + def save(self, simulation: "Simulation"): + raise NotImplementedError( + "The TaxBenefitModel class must define a method to save simulations." + ) + + def load(self, simulation: "Simulation"): + raise NotImplementedError( + "The TaxBenefitModel class must define a method to load simulations." 
+ ) + + def add_parameter(self, param: "Parameter") -> None: + """Add a parameter and index it for fast lookup.""" + self.parameters.append(param) + self.parameters_by_name[param.name] = param + + def add_variable(self, var: "Variable") -> None: + """Add a variable and index it for fast lookup.""" + self.variables.append(var) + self.variables_by_name[var.name] = var + + def add_parameter_node(self, node: "ParameterNode") -> None: + """Add a parameter node and index it for fast lookup.""" + self.parameter_nodes.append(node) + self.parameter_nodes_by_name[node.name] = node + + def get_parameter(self, name: str) -> "Parameter": + """Get a parameter by name (O(1) lookup).""" + if name in self.parameters_by_name: + return self.parameters_by_name[name] + raise ValueError( + f"Parameter '{name}' not found in {self.model.id} version {self.version}" + ) + + def get_variable(self, name: str) -> "Variable": + """Get a variable by name (O(1) lookup).""" + if name in self.variables_by_name: + return self.variables_by_name[name] + raise ValueError( + f"Variable '{name}' not found in {self.model.id} version {self.version}" + ) + + def get_parameter_node(self, name: str) -> "ParameterNode": + """Get a parameter node by name (O(1) lookup).""" + if name in self.parameter_nodes_by_name: + return self.parameter_nodes_by_name[name] + raise ValueError( + f"ParameterNode '{name}' not found in {self.model.id} version {self.version}" + ) + + def get_region(self, code: str) -> "Optional[Region]": + """Get a region by its code. 
+ + Args: + code: Region code (e.g., 'state/ca', 'place/NJ-57000') + + Returns: + The Region if found, None if not found or no region registry + """ + if self.region_registry is None: + return None + return self.region_registry.get(code) + + @property + def release_bundle(self) -> dict[str, Optional[str]]: + manifest_certification = ( + self.release_manifest.certification + if self.release_manifest is not None + else None + ) + certification = self.data_certification or manifest_certification + certified_data_artifact = ( + self.release_manifest.certified_data_artifact + if self.release_manifest is not None + else None + ) + return { + "bundle_id": self.release_manifest.bundle_id + if self.release_manifest is not None + else None, + "country_id": self.release_manifest.country_id + if self.release_manifest is not None + else None, + "policyengine_version": self.release_manifest.policyengine_version + if self.release_manifest is not None + else None, + "model_package": self.model_package.name + if self.model_package is not None + else None, + "model_version": self.model_package.version + if self.model_package is not None + else None, + "data_package": self.data_package.name + if self.data_package is not None + else None, + "data_version": self.data_package.version + if self.data_package is not None + else None, + "default_dataset": self.release_manifest.default_dataset + if self.release_manifest is not None + else None, + "default_dataset_uri": self.default_dataset_uri, + "certified_data_build_id": ( + certification.data_build_id + if certification is not None + else ( + certified_data_artifact.build_id + if certified_data_artifact is not None + else None + ) + ), + "certified_data_artifact_sha256": ( + certified_data_artifact.sha256 + if certified_data_artifact is not None + else None + ), + "data_build_model_version": ( + certification.built_with_model_version + if certification is not None + else None + ), + "data_build_model_git_sha": ( + 
certification.built_with_model_git_sha + if certification is not None + else None + ), + "data_build_fingerprint": ( + certification.data_build_fingerprint + if certification is not None + else None + ), + "compatibility_basis": ( + certification.compatibility_basis if certification is not None else None + ), + "certified_by": ( + certification.certified_by if certification is not None else None + ), + } + + def __repr__(self) -> str: + # Give the id and version, and the number of variables, parameters, parameter nodes, parameter values + return f"<TaxBenefitModelVersion {self.model.id} {self.version}: {len(self.variables)} variables, {len(self.parameters)} parameters, {len(self.parameter_nodes)} parameter nodes, {len(self.parameter_values)} parameter values>" diff --git a/build/lib/policyengine/core/trace_tro.py b/build/lib/policyengine/core/trace_tro.py new file mode 100644 index 00000000..ae31a29e --- /dev/null +++ b/build/lib/policyengine/core/trace_tro.py @@ -0,0 +1,260 @@ +from __future__ import annotations + +import hashlib +import json +from collections.abc import Iterable, Mapping +from typing import Optional + +from .release_manifest import ( + CountryReleaseManifest, + DataCertification, + DataReleaseManifest, +) + +TRACE_TROV_VERSION = "0.1" +TRACE_CONTEXT = [ + { + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "trov": "https://w3id.org/trace/trov/0.1#", + "schema": "https://schema.org/", + } +] + + +def _hash_object(value: str) -> dict[str, str]: + return { + "trov:hashAlgorithm": "sha256", + "trov:hashValue": value, + } + + +def _artifact_mime_type(path_or_uri: str) -> Optional[str]: + suffix = path_or_uri.rsplit(".", 1)[-1].lower() if "."
in path_or_uri else "" + return { + "h5": "application/x-hdf5", + "json": "application/json", + "jsonld": "application/ld+json", + }.get(suffix) + + +def _canonical_json_bytes(value: Mapping) -> bytes: + return (json.dumps(value, indent=2, sort_keys=True) + "\n").encode("utf-8") + + +def compute_trace_composition_fingerprint( + artifact_hashes: Iterable[str], +) -> str: + digest = hashlib.sha256() + digest.update("".join(sorted(artifact_hashes)).encode("utf-8")) + return digest.hexdigest() + + +def build_trace_tro_from_release_bundle( + country_manifest: CountryReleaseManifest, + data_release_manifest: DataReleaseManifest, + *, + certification: Optional[DataCertification] = None, + bundle_manifest_path: Optional[str] = None, + data_release_manifest_path: Optional[str] = None, +) -> dict: + certified_artifact = country_manifest.certified_data_artifact + if certified_artifact is None: + raise ValueError( + "Country release manifest does not define a certified artifact." + ) + + dataset_artifact = data_release_manifest.artifacts.get(certified_artifact.dataset) + if dataset_artifact is None: + raise ValueError( + "Data release manifest does not include the certified dataset " + f"'{certified_artifact.dataset}'." + ) + if dataset_artifact.sha256 is None: + raise ValueError( + "Data release manifest does not include a SHA256 for the certified dataset " + f"'{certified_artifact.dataset}'." 
+ ) + + effective_certification = certification or country_manifest.certification + bundle_manifest_location = ( + bundle_manifest_path + or f"data/release_manifests/{country_manifest.country_id}.json" + ) + data_manifest_location = data_release_manifest_path or ( + "https://huggingface.co/" + f"{country_manifest.data_package.repo_id}/resolve/" + f"{country_manifest.data_package.version}/" + f"{country_manifest.data_package.release_manifest_path}" + ) + + bundle_manifest_payload = country_manifest.model_dump(mode="json") + data_release_payload = data_release_manifest.model_dump(mode="json") + bundle_manifest_hash = hashlib.sha256( + _canonical_json_bytes(bundle_manifest_payload) + ).hexdigest() + data_release_manifest_hash = hashlib.sha256( + _canonical_json_bytes(data_release_payload) + ).hexdigest() + + artifact_specs = [ + { + "hash": bundle_manifest_hash, + "location": bundle_manifest_location, + "mime_type": "application/json", + }, + { + "hash": data_release_manifest_hash, + "location": data_manifest_location, + "mime_type": "application/json", + }, + { + "hash": dataset_artifact.sha256, + "location": certified_artifact.uri, + "mime_type": _artifact_mime_type(certified_artifact.uri), + }, + ] + + composition_artifacts = [] + arrangement_locations = [] + artifact_hashes = [] + + for index, artifact in enumerate(artifact_specs): + artifact_id = f"composition/1/artifact/{index}" + artifact_hashes.append(artifact["hash"]) + artifact_entry = { + "@id": artifact_id, + "@type": "trov:ResearchArtifact", + "trov:hash": _hash_object(artifact["hash"]), + } + if artifact["mime_type"] is not None: + artifact_entry["trov:mimeType"] = artifact["mime_type"] + composition_artifacts.append(artifact_entry) + arrangement_locations.append( + { + "@id": f"arrangement/0/location/{index}", + "@type": "trov:ArtifactLocation", + "trov:artifact": {"@id": artifact_id}, + "trov:path": artifact["location"], + } + ) + + certification_description = "" + if effective_certification is not 
None: + certification_description = ( + f" Certified for runtime model version " + f"{effective_certification.certified_for_model_version} via " + f"{effective_certification.compatibility_basis}." + ) + if effective_certification.built_with_model_version is not None: + certification_description += ( + f" Built with {country_manifest.model_package.name} " + f"{effective_certification.built_with_model_version}." + ) + if effective_certification.data_build_fingerprint is not None: + certification_description += ( + f" Data-build fingerprint: " + f"{effective_certification.data_build_fingerprint}." + ) + + created_at = country_manifest.published_at or ( + data_release_manifest.build.built_at + if data_release_manifest.build is not None + else None + ) + build_id = ( + effective_certification.data_build_id + if effective_certification is not None + else ( + certified_artifact.build_id + or f"{country_manifest.data_package.name}-{country_manifest.data_package.version}" + ) + ) + + return { + "@context": TRACE_CONTEXT, + "@graph": [ + { + "@id": "tro", + "@type": ["trov:TransparentResearchObject", "schema:CreativeWork"], + "trov:vocabularyVersion": TRACE_TROV_VERSION, + "schema:creator": country_manifest.policyengine_version, + "schema:name": ( + f"policyengine {country_manifest.country_id} certified bundle TRO" + ), + "schema:description": ( + f"TRACE TRO for certified runtime bundle " + f"{country_manifest.bundle_id or country_manifest.country_id} " + f"covering the bundled country release manifest, the country data " + f"release manifest, and the certified dataset artifact." 
+ f"{certification_description}" + ), + "schema:dateCreated": created_at, + "trov:wasAssembledBy": { + "@id": "trs", + "@type": ["trov:TrustedResearchSystem", "schema:Organization"], + "schema:name": "PolicyEngine certified release bundle pipeline", + "schema:description": ( + "PolicyEngine certification workflow for runtime bundles that " + "pin a country model version, a country data release, and a " + "specific dataset artifact." + ), + }, + "trov:createdWith": { + "@type": "schema:SoftwareApplication", + "schema:name": "policyengine", + "schema:softwareVersion": country_manifest.policyengine_version, + }, + "trov:hasComposition": { + "@id": "composition/1", + "@type": "trov:ArtifactComposition", + "trov:hasFingerprint": { + "@id": "fingerprint", + "@type": "trov:CompositionFingerprint", + "trov:hash": _hash_object( + compute_trace_composition_fingerprint(artifact_hashes) + ), + }, + "trov:hasArtifact": composition_artifacts, + }, + "trov:hasArrangement": [ + { + "@id": "arrangement/0", + "@type": "trov:ArtifactArrangement", + "rdfs:comment": ( + f"Certified arrangement for bundle " + f"{country_manifest.bundle_id or country_manifest.country_id}." + ), + "trov:hasArtifactLocation": arrangement_locations, + } + ], + "trov:hasPerformance": [ + { + "@id": "trp/0", + "@type": "trov:TrustedResearchPerformance", + "rdfs:comment": ( + f"Certification of build {build_id} for " + f"{country_manifest.model_package.name} " + f"{country_manifest.model_package.version}." 
+ ), + "trov:wasConductedBy": {"@id": "trs"}, + "trov:startedAtTime": ( + data_release_manifest.build.built_at + if data_release_manifest.build is not None + else created_at + ), + "trov:endedAtTime": created_at, + "trov:contributedToArrangement": { + "@id": "trp/0/binding/0", + "@type": "trov:ArrangementBinding", + "trov:arrangement": {"@id": "arrangement/0"}, + }, + } + ], + } + ], + } + + +def serialize_trace_tro(tro: Mapping) -> bytes: + return (json.dumps(tro, indent=2, sort_keys=True) + "\n").encode("utf-8") diff --git a/build/lib/policyengine/core/variable.py b/build/lib/policyengine/core/variable.py new file mode 100644 index 00000000..03e53495 --- /dev/null +++ b/build/lib/policyengine/core/variable.py @@ -0,0 +1,20 @@ +from typing import Any, Optional + +from pydantic import BaseModel + +from .tax_benefit_model_version import TaxBenefitModelVersion + + +class Variable(BaseModel): + id: str + name: str + label: Optional[str] = None + tax_benefit_model_version: TaxBenefitModelVersion + entity: str + description: Optional[str] = None + data_type: type = None + possible_values: Optional[list[Any]] = None + default_value: Any = None + value_type: Optional[type] = None + adds: Optional[list[str]] = None + subtracts: Optional[list[str]] = None diff --git a/build/lib/policyengine/countries/__init__.py b/build/lib/policyengine/countries/__init__.py new file mode 100644 index 00000000..3f647fd9 --- /dev/null +++ b/build/lib/policyengine/countries/__init__.py @@ -0,0 +1,9 @@ +"""Country-specific region definitions. + +This package contains region registries for each supported country. 
+""" + +from .uk.regions import uk_region_registry +from .us.regions import us_region_registry + +__all__ = ["us_region_registry", "uk_region_registry"] diff --git a/build/lib/policyengine/countries/uk/__init__.py b/build/lib/policyengine/countries/uk/__init__.py new file mode 100644 index 00000000..b2c255d3 --- /dev/null +++ b/build/lib/policyengine/countries/uk/__init__.py @@ -0,0 +1,5 @@ +"""UK country-specific region definitions.""" + +from .regions import uk_region_registry + +__all__ = ["uk_region_registry"] diff --git a/build/lib/policyengine/countries/uk/regions.py b/build/lib/policyengine/countries/uk/regions.py new file mode 100644 index 00000000..2f100524 --- /dev/null +++ b/build/lib/policyengine/countries/uk/regions.py @@ -0,0 +1,207 @@ +"""UK region definitions. + +This module defines all UK geographic regions: +- National (1) +- Countries (4: England, Scotland, Wales, Northern Ireland) +- Constituencies (loaded from CSV at runtime) +- Local Authorities (loaded from CSV at runtime) + +Note: Constituencies and local authorities use weight adjustment rather than +data filtering. They modify household_weight based on pre-computed weights +from H5 files stored in GCS. +""" + +import logging +from typing import TYPE_CHECKING + +from policyengine.core.region import Region, RegionRegistry +from policyengine.core.release_manifest import resolve_region_dataset_path +from policyengine.core.scoping_strategy import ( + RowFilterStrategy, + WeightReplacementStrategy, +) + +if TYPE_CHECKING: + pass + +logger = logging.getLogger(__name__) + +UK_DATA_BUCKET = "gs://policyengine-uk-data-private" + +# UK countries +UK_COUNTRIES = { + "england": "England", + "scotland": "Scotland", + "wales": "Wales", + "northern_ireland": "Northern Ireland", +} + + +def _load_constituencies_from_csv() -> list[dict]: + """Load UK constituency data from CSV. 
+ + Constituencies are loaded from: + gs://policyengine-uk-data-private/constituencies_2024.csv + + Returns: + List of dicts with 'code' and 'name' keys + """ + try: + from policyengine_core.tools.google_cloud import download + except ImportError: + # If policyengine_core is not available, return empty list + return [] + + try: + csv_path = download( + gcs_bucket="policyengine-uk-data-private", + gcs_key="constituencies_2024.csv", + ) + import pandas as pd + + df = pd.read_csv(csv_path) + return [{"code": row["code"], "name": row["name"]} for _, row in df.iterrows()] + except (OSError, KeyError, ValueError) as exc: + logger.warning("Failed to load constituencies CSV: %s", exc) + return [] + except Exception: + logger.error("Unexpected error loading constituencies CSV", exc_info=True) + return [] + + +def _load_local_authorities_from_csv() -> list[dict]: + """Load UK local authority data from CSV. + + Local authorities are loaded from: + gs://policyengine-uk-data-private/local_authorities_2021.csv + + Returns: + List of dicts with 'code' and 'name' keys + """ + try: + from policyengine_core.tools.google_cloud import download + except ImportError: + # If policyengine_core is not available, return empty list + return [] + + try: + csv_path = download( + gcs_bucket="policyengine-uk-data-private", + gcs_key="local_authorities_2021.csv", + ) + import pandas as pd + + df = pd.read_csv(csv_path) + return [{"code": row["code"], "name": row["name"]} for _, row in df.iterrows()] + except (OSError, KeyError, ValueError) as exc: + logger.warning("Failed to load local authorities CSV: %s", exc) + return [] + except Exception: + logger.error("Unexpected error loading local authorities CSV", exc_info=True) + return [] + + +def build_uk_region_registry( + include_constituencies: bool = False, + include_local_authorities: bool = False, +) -> RegionRegistry: + """Build the UK region registry. + + Args: + include_constituencies: If True, load and include constituencies from CSV. 
+ Defaults to False to avoid GCS dependency at import time. + include_local_authorities: If True, load and include local authorities from CSV. + Defaults to False to avoid GCS dependency at import time. + + Returns: + RegionRegistry containing: + - 1 national region + - 4 country regions + - Optionally: constituencies (if include_constituencies=True) + - Optionally: local authorities (if include_local_authorities=True) + """ + regions: list[Region] = [] + + # 1. National region (has dedicated dataset) + regions.append( + Region( + code="uk", + label="United Kingdom", + region_type="national", + dataset_path=resolve_region_dataset_path("uk", "national"), + ) + ) + + # 2. Country regions (filter from national by 'country' variable) + for code, name in UK_COUNTRIES.items(): + regions.append( + Region( + code=f"country/{code}", + label=name, + region_type="country", + parent_code="uk", + requires_filter=True, + filter_field="country", + filter_value=code.upper(), + scoping_strategy=RowFilterStrategy( + variable_name="country", + variable_value=code.upper(), + ), + ) + ) + + # 3. Constituencies (optional, loaded from CSV) + # Note: These use weight replacement, not data filtering + if include_constituencies: + constituencies = _load_constituencies_from_csv() + for const in constituencies: + regions.append( + Region( + code=f"constituency/{const['code']}", + label=const["name"], + region_type="constituency", + parent_code="uk", + requires_filter=True, + filter_field="household_weight", + filter_value=const["code"], + scoping_strategy=WeightReplacementStrategy( + weight_matrix_bucket="policyengine-uk-data-private", + weight_matrix_key="parliamentary_constituency_weights.h5", + lookup_csv_bucket="policyengine-uk-data-private", + lookup_csv_key="constituencies_2024.csv", + region_code=const["code"], + ), + ) + ) + + # 4. 
Local Authorities (optional, loaded from CSV) + # Note: These use weight replacement, not data filtering + if include_local_authorities: + local_authorities = _load_local_authorities_from_csv() + for la in local_authorities: + regions.append( + Region( + code=f"local_authority/{la['code']}", + label=la["name"], + region_type="local_authority", + parent_code="uk", + requires_filter=True, + filter_field="household_weight", + filter_value=la["code"], + scoping_strategy=WeightReplacementStrategy( + weight_matrix_bucket="policyengine-uk-data-private", + weight_matrix_key="local_authority_weights.h5", + lookup_csv_bucket="policyengine-uk-data-private", + lookup_csv_key="local_authorities_2021.csv", + region_code=la["code"], + ), + ) + ) + + return RegionRegistry(country_id="uk", regions=regions) + + +# Default registry with just core regions (national + countries) +# To get full registry with constituencies/LAs, call: +# build_uk_region_registry(include_constituencies=True, include_local_authorities=True) +uk_region_registry = build_uk_region_registry() diff --git a/build/lib/policyengine/countries/us/__init__.py b/build/lib/policyengine/countries/us/__init__.py new file mode 100644 index 00000000..68592459 --- /dev/null +++ b/build/lib/policyengine/countries/us/__init__.py @@ -0,0 +1,5 @@ +"""US country-specific region definitions.""" + +from .regions import us_region_registry + +__all__ = ["us_region_registry"] diff --git a/build/lib/policyengine/countries/us/data/__init__.py b/build/lib/policyengine/countries/us/data/__init__.py new file mode 100644 index 00000000..fb833b64 --- /dev/null +++ b/build/lib/policyengine/countries/us/data/__init__.py @@ -0,0 +1,18 @@ +"""US geographic data definitions. 
+ +This module provides static data for US geographic regions: +- states.py: State abbreviations and full names +- districts.py: Congressional district counts by state +- places.py: US Census places (cities/towns over 100K population) +""" + +from .districts import AT_LARGE_STATES, DISTRICT_COUNTS +from .places import US_PLACES +from .states import US_STATES + +__all__ = [ + "US_STATES", + "DISTRICT_COUNTS", + "AT_LARGE_STATES", + "US_PLACES", +] diff --git a/build/lib/policyengine/countries/us/data/districts.py b/build/lib/policyengine/countries/us/data/districts.py new file mode 100644 index 00000000..e77d5e62 --- /dev/null +++ b/build/lib/policyengine/countries/us/data/districts.py @@ -0,0 +1,64 @@ +"""US congressional district definitions. + +Based on 2020 Census apportionment. +Total: 435 voting representatives + 1 DC non-voting delegate = 436 +""" + +# Congressional district counts by state (2020 Census apportionment) +# States with 1 district are "at-large" +DISTRICT_COUNTS: dict[str, int] = { + "AL": 7, + "AK": 1, + "AZ": 9, + "AR": 4, + "CA": 52, + "CO": 8, + "CT": 5, + "DE": 1, + "DC": 1, # Non-voting delegate + "FL": 28, + "GA": 14, + "HI": 2, + "ID": 2, + "IL": 17, + "IN": 9, + "IA": 4, + "KS": 4, + "KY": 6, + "LA": 6, + "ME": 2, + "MD": 8, + "MA": 9, + "MI": 13, + "MN": 8, + "MS": 4, + "MO": 8, + "MT": 2, + "NE": 3, + "NV": 4, + "NH": 2, + "NJ": 12, + "NM": 3, + "NY": 26, + "NC": 14, + "ND": 1, + "OH": 15, + "OK": 5, + "OR": 6, + "PA": 17, + "RI": 2, + "SC": 7, + "SD": 1, + "TN": 9, + "TX": 38, + "UT": 4, + "VT": 1, + "VA": 11, + "WA": 10, + "WV": 2, + "WI": 8, + "WY": 1, +} + +# States with at-large congressional districts (single representative) +AT_LARGE_STATES: set[str] = {"AK", "DE", "DC", "ND", "SD", "VT", "WY"} diff --git a/build/lib/policyengine/countries/us/data/places.py b/build/lib/policyengine/countries/us/data/places.py new file mode 100644 index 00000000..a5fe632f --- /dev/null +++ b/build/lib/policyengine/countries/us/data/places.py @@ 
-0,0 +1,1815 @@ +"""US Census places with population over 100,000. + +Source: US Census Bureau Population Estimates 2023 +Synced with policyengine-app-v2 main branch. +""" + +# US cities/places with population over 100K (from Census data) +# These filter from their parent state's dataset using place_fips +# Total: 333 places +US_PLACES: list[dict[str, str]] = [ + { + "fips": "03000", + "name": "Anchorage", + "state": "AK", + "state_name": "Alaska", + }, + { + "fips": "07000", + "name": "Birmingham", + "state": "AL", + "state_name": "Alabama", + }, + { + "fips": "37000", + "name": "Huntsville", + "state": "AL", + "state_name": "Alabama", + }, + { + "fips": "50000", + "name": "Mobile", + "state": "AL", + "state_name": "Alabama", + }, + { + "fips": "51000", + "name": "Montgomery", + "state": "AL", + "state_name": "Alabama", + }, + { + "fips": "77256", + "name": "Tuscaloosa", + "state": "AL", + "state_name": "Alabama", + }, + { + "fips": "23290", + "name": "Fayetteville", + "state": "AR", + "state_name": "Arkansas", + }, + { + "fips": "41000", + "name": "Little Rock", + "state": "AR", + "state_name": "Arkansas", + }, + { + "fips": "07940", + "name": "Buckeye", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "12000", + "name": "Chandler", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "27400", + "name": "Gilbert", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "27820", + "name": "Glendale", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "28380", + "name": "Goodyear", + "state": "AZ", + "state_name": "Arizona", + }, + {"fips": "46000", "name": "Mesa", "state": "AZ", "state_name": "Arizona"}, + { + "fips": "54050", + "name": "Peoria", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "55000", + "name": "Phoenix", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "65000", + "name": "Scottsdale", + "state": "AZ", + "state_name": "Arizona", + }, + { + "fips": "71510", + "name": 
"Surprise", + "state": "AZ", + "state_name": "Arizona", + }, + {"fips": "73000", "name": "Tempe", "state": "AZ", "state_name": "Arizona"}, + { + "fips": "77000", + "name": "Tucson", + "state": "AZ", + "state_name": "Arizona", + }, + {"fips": "85540", "name": "Yuma", "state": "AZ", "state_name": "Arizona"}, + { + "fips": "02000", + "name": "Anaheim", + "state": "CA", + "state_name": "California", + }, + { + "fips": "02252", + "name": "Antioch", + "state": "CA", + "state_name": "California", + }, + { + "fips": "03526", + "name": "Bakersfield", + "state": "CA", + "state_name": "California", + }, + { + "fips": "06000", + "name": "Berkeley", + "state": "CA", + "state_name": "California", + }, + { + "fips": "08954", + "name": "Burbank", + "state": "CA", + "state_name": "California", + }, + { + "fips": "11194", + "name": "Carlsbad", + "state": "CA", + "state_name": "California", + }, + { + "fips": "13014", + "name": "Chico", + "state": "CA", + "state_name": "California", + }, + { + "fips": "13392", + "name": "Chula Vista", + "state": "CA", + "state_name": "California", + }, + { + "fips": "14218", + "name": "Clovis", + "state": "CA", + "state_name": "California", + }, + { + "fips": "16000", + "name": "Concord", + "state": "CA", + "state_name": "California", + }, + { + "fips": "16350", + "name": "Corona", + "state": "CA", + "state_name": "California", + }, + { + "fips": "16532", + "name": "Costa Mesa", + "state": "CA", + "state_name": "California", + }, + { + "fips": "19766", + "name": "Downey", + "state": "CA", + "state_name": "California", + }, + { + "fips": "21712", + "name": "El Cajon", + "state": "CA", + "state_name": "California", + }, + { + "fips": "22230", + "name": "El Monte", + "state": "CA", + "state_name": "California", + }, + { + "fips": "22020", + "name": "Elk Grove", + "state": "CA", + "state_name": "California", + }, + { + "fips": "22804", + "name": "Escondido", + "state": "CA", + "state_name": "California", + }, + { + "fips": "23182", + "name": "Fairfield", 
+ "state": "CA", + "state_name": "California", + }, + { + "fips": "24680", + "name": "Fontana", + "state": "CA", + "state_name": "California", + }, + { + "fips": "26000", + "name": "Fremont", + "state": "CA", + "state_name": "California", + }, + { + "fips": "27000", + "name": "Fresno", + "state": "CA", + "state_name": "California", + }, + { + "fips": "28000", + "name": "Fullerton", + "state": "CA", + "state_name": "California", + }, + { + "fips": "29000", + "name": "Garden Grove", + "state": "CA", + "state_name": "California", + }, + { + "fips": "30000", + "name": "Glendale", + "state": "CA", + "state_name": "California", + }, + { + "fips": "33000", + "name": "Hayward", + "state": "CA", + "state_name": "California", + }, + { + "fips": "33434", + "name": "Hesperia", + "state": "CA", + "state_name": "California", + }, + { + "fips": "36000", + "name": "Huntington Beach", + "state": "CA", + "state_name": "California", + }, + { + "fips": "36546", + "name": "Inglewood", + "state": "CA", + "state_name": "California", + }, + { + "fips": "36770", + "name": "Irvine", + "state": "CA", + "state_name": "California", + }, + { + "fips": "37692", + "name": "Jurupa Valley", + "state": "CA", + "state_name": "California", + }, + { + "fips": "40130", + "name": "Lancaster", + "state": "CA", + "state_name": "California", + }, + { + "fips": "43000", + "name": "Long Beach", + "state": "CA", + "state_name": "California", + }, + { + "fips": "44000", + "name": "Los Angeles", + "state": "CA", + "state_name": "California", + }, + { + "fips": "46842", + "name": "Menifee", + "state": "CA", + "state_name": "California", + }, + { + "fips": "48354", + "name": "Modesto", + "state": "CA", + "state_name": "California", + }, + { + "fips": "49270", + "name": "Moreno Valley", + "state": "CA", + "state_name": "California", + }, + { + "fips": "50076", + "name": "Murrieta", + "state": "CA", + "state_name": "California", + }, + { + "fips": "53000", + "name": "Oakland", + "state": "CA", + "state_name": 
"California", + }, + { + "fips": "53322", + "name": "Oceanside", + "state": "CA", + "state_name": "California", + }, + { + "fips": "53896", + "name": "Ontario", + "state": "CA", + "state_name": "California", + }, + { + "fips": "53980", + "name": "Orange", + "state": "CA", + "state_name": "California", + }, + { + "fips": "54652", + "name": "Oxnard", + "state": "CA", + "state_name": "California", + }, + { + "fips": "55156", + "name": "Palmdale", + "state": "CA", + "state_name": "California", + }, + { + "fips": "56000", + "name": "Pasadena", + "state": "CA", + "state_name": "California", + }, + { + "fips": "58072", + "name": "Pomona", + "state": "CA", + "state_name": "California", + }, + { + "fips": "59451", + "name": "Rancho Cucamonga", + "state": "CA", + "state_name": "California", + }, + { + "fips": "60466", + "name": "Rialto", + "state": "CA", + "state_name": "California", + }, + { + "fips": "60620", + "name": "Richmond", + "state": "CA", + "state_name": "California", + }, + { + "fips": "62000", + "name": "Riverside", + "state": "CA", + "state_name": "California", + }, + { + "fips": "62938", + "name": "Roseville", + "state": "CA", + "state_name": "California", + }, + { + "fips": "64000", + "name": "Sacramento", + "state": "CA", + "state_name": "California", + }, + { + "fips": "64224", + "name": "Salinas", + "state": "CA", + "state_name": "California", + }, + { + "fips": "65000", + "name": "San Bernardino", + "state": "CA", + "state_name": "California", + }, + { + "fips": "66000", + "name": "San Diego", + "state": "CA", + "state_name": "California", + }, + { + "fips": "67000", + "name": "San Francisco", + "state": "CA", + "state_name": "California", + }, + { + "fips": "68000", + "name": "San Jose", + "state": "CA", + "state_name": "California", + }, + { + "fips": "68252", + "name": "San Mateo", + "state": "CA", + "state_name": "California", + }, + { + "fips": "69000", + "name": "Santa Ana", + "state": "CA", + "state_name": "California", + }, + { + "fips": "69084", 
+ "name": "Santa Clara", + "state": "CA", + "state_name": "California", + }, + { + "fips": "69088", + "name": "Santa Clarita", + "state": "CA", + "state_name": "California", + }, + { + "fips": "69196", + "name": "Santa Maria", + "state": "CA", + "state_name": "California", + }, + { + "fips": "70098", + "name": "Santa Rosa", + "state": "CA", + "state_name": "California", + }, + { + "fips": "72016", + "name": "Simi Valley", + "state": "CA", + "state_name": "California", + }, + { + "fips": "75000", + "name": "Stockton", + "state": "CA", + "state_name": "California", + }, + { + "fips": "77000", + "name": "Sunnyvale", + "state": "CA", + "state_name": "California", + }, + { + "fips": "78120", + "name": "Temecula", + "state": "CA", + "state_name": "California", + }, + { + "fips": "78582", + "name": "Thousand Oaks", + "state": "CA", + "state_name": "California", + }, + { + "fips": "80000", + "name": "Torrance", + "state": "CA", + "state_name": "California", + }, + { + "fips": "81554", + "name": "Vacaville", + "state": "CA", + "state_name": "California", + }, + { + "fips": "81666", + "name": "Vallejo", + "state": "CA", + "state_name": "California", + }, + { + "fips": "65042", + "name": "Ventura", + "state": "CA", + "state_name": "California", + }, + { + "fips": "82590", + "name": "Victorville", + "state": "CA", + "state_name": "California", + }, + { + "fips": "82954", + "name": "Visalia", + "state": "CA", + "state_name": "California", + }, + { + "fips": "84200", + "name": "West Covina", + "state": "CA", + "state_name": "California", + }, + { + "fips": "03455", + "name": "Arvada", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "04000", + "name": "Aurora", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "07850", + "name": "Boulder", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "12815", + "name": "Centennial", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "16000", + "name": "Colorado Springs", + "state": 
"CO", + "state_name": "Colorado", + }, + { + "fips": "20000", + "name": "Denver", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "27425", + "name": "Fort Collins", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "32155", + "name": "Greeley", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "43000", + "name": "Lakewood", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "62000", + "name": "Pueblo", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "77290", + "name": "Thornton", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "83835", + "name": "Westminster", + "state": "CO", + "state_name": "Colorado", + }, + { + "fips": "08000", + "name": "Bridgeport", + "state": "CT", + "state_name": "Connecticut", + }, + { + "fips": "37000", + "name": "Hartford", + "state": "CT", + "state_name": "Connecticut", + }, + { + "fips": "52000", + "name": "New Haven", + "state": "CT", + "state_name": "Connecticut", + }, + { + "fips": "73000", + "name": "Stamford", + "state": "CT", + "state_name": "Connecticut", + }, + { + "fips": "80000", + "name": "Waterbury", + "state": "CT", + "state_name": "Connecticut", + }, + { + "fips": "50000", + "name": "Washington", + "state": "DC", + "state_name": "District of Columbia", + }, + { + "fips": "10275", + "name": "Cape Coral", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "12875", + "name": "Clearwater", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "14400", + "name": "Coral Springs", + "state": "FL", + "state_name": "Florida", + }, + {"fips": "16475", "name": "Davie", "state": "FL", "state_name": "Florida"}, + { + "fips": "24000", + "name": "Fort Lauderdale", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "25175", + "name": "Gainesville", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "30000", + "name": "Hialeah", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "32000", + 
"name": "Hollywood", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "35000", + "name": "Jacksonville", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "38250", + "name": "Lakeland", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "45060", + "name": "Miami Gardens", + "state": "FL", + "state_name": "Florida", + }, + {"fips": "45000", "name": "Miami", "state": "FL", "state_name": "Florida"}, + { + "fips": "45975", + "name": "Miramar", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "53000", + "name": "Orlando", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "54000", + "name": "Palm Bay", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "54200", + "name": "Palm Coast", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "55775", + "name": "Pembroke Pines", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "58050", + "name": "Pompano Beach", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "58715", + "name": "Port St. Lucie", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "63000", + "name": "St. 
Petersburg", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "70600", + "name": "Tallahassee", + "state": "FL", + "state_name": "Florida", + }, + {"fips": "71000", "name": "Tampa", "state": "FL", "state_name": "Florida"}, + { + "fips": "76600", + "name": "West Palm Beach", + "state": "FL", + "state_name": "Florida", + }, + { + "fips": "03440", + "name": "Athens-Clarke County", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "04000", + "name": "Atlanta", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "04204", + "name": "Augusta-Richmond County", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "19000", + "name": "Columbus", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "49008", + "name": "Macon-Bibb County", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "68516", + "name": "Sandy Springs", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "69000", + "name": "Savannah", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "72122", + "name": "South Fulton", + "state": "GA", + "state_name": "Georgia", + }, + { + "fips": "71550", + "name": "Urban Honolulu", + "state": "HI", + "state_name": "Hawaii", + }, + { + "fips": "12000", + "name": "Cedar Rapids", + "state": "IA", + "state_name": "Iowa", + }, + { + "fips": "19000", + "name": "Davenport", + "state": "IA", + "state_name": "Iowa", + }, + { + "fips": "21000", + "name": "Des Moines", + "state": "IA", + "state_name": "Iowa", + }, + { + "fips": "08830", + "name": "Boise City", + "state": "ID", + "state_name": "Idaho", + }, + { + "fips": "52120", + "name": "Meridian", + "state": "ID", + "state_name": "Idaho", + }, + {"fips": "56260", "name": "Nampa", "state": "ID", "state_name": "Idaho"}, + { + "fips": "03012", + "name": "Aurora", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "14000", + "name": "Chicago", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "23074", + "name": 
"Elgin", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "38570", + "name": "Joliet", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "51622", + "name": "Naperville", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "59000", + "name": "Peoria", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "65000", + "name": "Rockford", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "72000", + "name": "Springfield", + "state": "IL", + "state_name": "Illinois", + }, + { + "fips": "10342", + "name": "Carmel", + "state": "IN", + "state_name": "Indiana", + }, + { + "fips": "22000", + "name": "Evansville", + "state": "IN", + "state_name": "Indiana", + }, + { + "fips": "23278", + "name": "Fishers", + "state": "IN", + "state_name": "Indiana", + }, + { + "fips": "25000", + "name": "Fort Wayne", + "state": "IN", + "state_name": "Indiana", + }, + { + "fips": "36003", + "name": "Indianapolis", + "state": "IN", + "state_name": "Indiana", + }, + { + "fips": "71000", + "name": "South Bend", + "state": "IN", + "state_name": "Indiana", + }, + { + "fips": "36000", + "name": "Kansas City", + "state": "KS", + "state_name": "Kansas", + }, + {"fips": "52575", "name": "Olathe", "state": "KS", "state_name": "Kansas"}, + { + "fips": "53775", + "name": "Overland Park", + "state": "KS", + "state_name": "Kansas", + }, + {"fips": "71000", "name": "Topeka", "state": "KS", "state_name": "Kansas"}, + { + "fips": "79000", + "name": "Wichita", + "state": "KS", + "state_name": "Kansas", + }, + { + "fips": "46027", + "name": "Lexington-Fayette", + "state": "KY", + "state_name": "Kentucky", + }, + { + "fips": "48006", + "name": "Louisville/Jefferson County", + "state": "KY", + "state_name": "Kentucky", + }, + { + "fips": "05000", + "name": "Baton Rouge", + "state": "LA", + "state_name": "Louisiana", + }, + { + "fips": "40735", + "name": "Lafayette", + "state": "LA", + "state_name": "Louisiana", + }, + { + "fips": "55000", + "name": "New 
Orleans", + "state": "LA", + "state_name": "Louisiana", + }, + { + "fips": "70000", + "name": "Shreveport", + "state": "LA", + "state_name": "Louisiana", + }, + { + "fips": "07000", + "name": "Boston", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "09000", + "name": "Brockton", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "11000", + "name": "Cambridge", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "37000", + "name": "Lowell", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "37490", + "name": "Lynn", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "45000", + "name": "New Bedford", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "55745", + "name": "Quincy", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "67000", + "name": "Springfield", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "82000", + "name": "Worcester", + "state": "MA", + "state_name": "Massachusetts", + }, + { + "fips": "04000", + "name": "Baltimore", + "state": "MD", + "state_name": "Maryland", + }, + { + "fips": "03000", + "name": "Ann Arbor", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "21000", + "name": "Dearborn", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "22000", + "name": "Detroit", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "34000", + "name": "Grand Rapids", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "46000", + "name": "Lansing", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "76460", + "name": "Sterling Heights", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "84000", + "name": "Warren", + "state": "MI", + "state_name": "Michigan", + }, + { + "fips": "43000", + "name": "Minneapolis", + "state": "MN", + "state_name": "Minnesota", + }, + { + "fips": "54880", + "name": "Rochester", + "state": "MN", + 
"state_name": "Minnesota", + }, + { + "fips": "58000", + "name": "St. Paul", + "state": "MN", + "state_name": "Minnesota", + }, + { + "fips": "15670", + "name": "Columbia", + "state": "MO", + "state_name": "Missouri", + }, + { + "fips": "35000", + "name": "Independence", + "state": "MO", + "state_name": "Missouri", + }, + { + "fips": "38000", + "name": "Kansas City", + "state": "MO", + "state_name": "Missouri", + }, + { + "fips": "41348", + "name": "Lee's Summit", + "state": "MO", + "state_name": "Missouri", + }, + { + "fips": "70000", + "name": "Springfield", + "state": "MO", + "state_name": "Missouri", + }, + { + "fips": "65000", + "name": "St. Louis", + "state": "MO", + "state_name": "Missouri", + }, + { + "fips": "36000", + "name": "Jackson", + "state": "MS", + "state_name": "Mississippi", + }, + { + "fips": "06550", + "name": "Billings", + "state": "MT", + "state_name": "Montana", + }, + { + "fips": "10740", + "name": "Cary", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "12000", + "name": "Charlotte", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "14100", + "name": "Concord", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "19000", + "name": "Durham", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "22920", + "name": "Fayetteville", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "28000", + "name": "Greensboro", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "31400", + "name": "High Point", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "55000", + "name": "Raleigh", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "74440", + "name": "Wilmington", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "75000", + "name": "Winston-Salem", + "state": "NC", + "state_name": "North Carolina", + }, + { + "fips": "25700", + "name": "Fargo", + "state": "ND", + "state_name": 
"North Dakota", + }, + { + "fips": "28000", + "name": "Lincoln", + "state": "NE", + "state_name": "Nebraska", + }, + { + "fips": "37000", + "name": "Omaha", + "state": "NE", + "state_name": "Nebraska", + }, + { + "fips": "45140", + "name": "Manchester", + "state": "NH", + "state_name": "New Hampshire", + }, + { + "fips": "21000", + "name": "Elizabeth", + "state": "NJ", + "state_name": "New Jersey", + }, + { + "fips": "36000", + "name": "Jersey City", + "state": "NJ", + "state_name": "New Jersey", + }, + { + "fips": "51000", + "name": "Newark", + "state": "NJ", + "state_name": "New Jersey", + }, + { + "fips": "57000", + "name": "Paterson", + "state": "NJ", + "state_name": "New Jersey", + }, + { + "fips": "02000", + "name": "Albuquerque", + "state": "NM", + "state_name": "New Mexico", + }, + { + "fips": "39380", + "name": "Las Cruces", + "state": "NM", + "state_name": "New Mexico", + }, + { + "fips": "63460", + "name": "Rio Rancho", + "state": "NM", + "state_name": "New Mexico", + }, + { + "fips": "31900", + "name": "Henderson", + "state": "NV", + "state_name": "Nevada", + }, + { + "fips": "40000", + "name": "Las Vegas", + "state": "NV", + "state_name": "Nevada", + }, + { + "fips": "51800", + "name": "North Las Vegas", + "state": "NV", + "state_name": "Nevada", + }, + {"fips": "60600", "name": "Reno", "state": "NV", "state_name": "Nevada"}, + {"fips": "68400", "name": "Sparks", "state": "NV", "state_name": "Nevada"}, + { + "fips": "01000", + "name": "Albany", + "state": "NY", + "state_name": "New York", + }, + { + "fips": "11000", + "name": "Buffalo", + "state": "NY", + "state_name": "New York", + }, + { + "fips": "51000", + "name": "New York City", + "state": "NY", + "state_name": "New York", + }, + { + "fips": "63000", + "name": "Rochester", + "state": "NY", + "state_name": "New York", + }, + { + "fips": "73000", + "name": "Syracuse", + "state": "NY", + "state_name": "New York", + }, + { + "fips": "84000", + "name": "Yonkers", + "state": "NY", + "state_name": "New 
York", + }, + {"fips": "01000", "name": "Akron", "state": "OH", "state_name": "Ohio"}, + { + "fips": "15000", + "name": "Cincinnati", + "state": "OH", + "state_name": "Ohio", + }, + { + "fips": "16000", + "name": "Cleveland", + "state": "OH", + "state_name": "Ohio", + }, + {"fips": "18000", "name": "Columbus", "state": "OH", "state_name": "Ohio"}, + {"fips": "21000", "name": "Dayton", "state": "OH", "state_name": "Ohio"}, + {"fips": "77000", "name": "Toledo", "state": "OH", "state_name": "Ohio"}, + { + "fips": "09050", + "name": "Broken Arrow", + "state": "OK", + "state_name": "Oklahoma", + }, + { + "fips": "52500", + "name": "Norman", + "state": "OK", + "state_name": "Oklahoma", + }, + { + "fips": "55000", + "name": "Oklahoma City", + "state": "OK", + "state_name": "Oklahoma", + }, + { + "fips": "75000", + "name": "Tulsa", + "state": "OK", + "state_name": "Oklahoma", + }, + {"fips": "05800", "name": "Bend", "state": "OR", "state_name": "Oregon"}, + {"fips": "23850", "name": "Eugene", "state": "OR", "state_name": "Oregon"}, + { + "fips": "31250", + "name": "Gresham", + "state": "OR", + "state_name": "Oregon", + }, + { + "fips": "34100", + "name": "Hillsboro", + "state": "OR", + "state_name": "Oregon", + }, + { + "fips": "59000", + "name": "Portland", + "state": "OR", + "state_name": "Oregon", + }, + {"fips": "64900", "name": "Salem", "state": "OR", "state_name": "Oregon"}, + { + "fips": "02000", + "name": "Allentown", + "state": "PA", + "state_name": "Pennsylvania", + }, + { + "fips": "60000", + "name": "Philadelphia", + "state": "PA", + "state_name": "Pennsylvania", + }, + { + "fips": "61000", + "name": "Pittsburgh", + "state": "PA", + "state_name": "Pennsylvania", + }, + { + "fips": "59000", + "name": "Providence", + "state": "RI", + "state_name": "Rhode Island", + }, + { + "fips": "13330", + "name": "Charleston", + "state": "SC", + "state_name": "South Carolina", + }, + { + "fips": "16000", + "name": "Columbia", + "state": "SC", + "state_name": "South Carolina", 
+ }, + { + "fips": "50875", + "name": "North Charleston", + "state": "SC", + "state_name": "South Carolina", + }, + { + "fips": "59020", + "name": "Sioux Falls", + "state": "SD", + "state_name": "South Dakota", + }, + { + "fips": "14000", + "name": "Chattanooga", + "state": "TN", + "state_name": "Tennessee", + }, + { + "fips": "15160", + "name": "Clarksville", + "state": "TN", + "state_name": "Tennessee", + }, + { + "fips": "40000", + "name": "Knoxville", + "state": "TN", + "state_name": "Tennessee", + }, + { + "fips": "48000", + "name": "Memphis", + "state": "TN", + "state_name": "Tennessee", + }, + { + "fips": "51560", + "name": "Murfreesboro", + "state": "TN", + "state_name": "Tennessee", + }, + # Extracted 332 places + { + "fips": "52006", + "name": "Nashville-Davidson", + "state": "TN", + "state_name": "Tennessee", + }, + {"fips": "01000", "name": "Abilene", "state": "TX", "state_name": "Texas"}, + {"fips": "01924", "name": "Allen", "state": "TX", "state_name": "Texas"}, + { + "fips": "03000", + "name": "Amarillo", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "04000", + "name": "Arlington", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "05000", "name": "Austin", "state": "TX", "state_name": "Texas"}, + { + "fips": "07000", + "name": "Beaumont", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "10768", + "name": "Brownsville", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "13024", + "name": "Carrollton", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "15976", + "name": "College Station", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "16432", "name": "Conroe", "state": "TX", "state_name": "Texas"}, + { + "fips": "17000", + "name": "Corpus Christi", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "19000", "name": "Dallas", "state": "TX", "state_name": "Texas"}, + {"fips": "19972", "name": "Denton", "state": "TX", "state_name": "Texas"}, + { + "fips": "22660", + "name": 
"Edinburg", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "24000", "name": "El Paso", "state": "TX", "state_name": "Texas"}, + { + "fips": "27000", + "name": "Fort Worth", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "27684", "name": "Frisco", "state": "TX", "state_name": "Texas"}, + {"fips": "29000", "name": "Garland", "state": "TX", "state_name": "Texas"}, + { + "fips": "30464", + "name": "Grand Prairie", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "35000", "name": "Houston", "state": "TX", "state_name": "Texas"}, + {"fips": "37000", "name": "Irving", "state": "TX", "state_name": "Texas"}, + {"fips": "39148", "name": "Killeen", "state": "TX", "state_name": "Texas"}, + {"fips": "41464", "name": "Laredo", "state": "TX", "state_name": "Texas"}, + { + "fips": "41980", + "name": "League City", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "42508", + "name": "Lewisville", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "45000", "name": "Lubbock", "state": "TX", "state_name": "Texas"}, + {"fips": "45384", "name": "McAllen", "state": "TX", "state_name": "Texas"}, + { + "fips": "45744", + "name": "McKinney", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "47892", + "name": "Mesquite", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "48072", "name": "Midland", "state": "TX", "state_name": "Texas"}, + { + "fips": "50820", + "name": "New Braunfels", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "53388", "name": "Odessa", "state": "TX", "state_name": "Texas"}, + { + "fips": "56000", + "name": "Pasadena", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "56348", + "name": "Pearland", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "58016", "name": "Plano", "state": "TX", "state_name": "Texas"}, + { + "fips": "61796", + "name": "Richardson", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "63500", + "name": "Round Rock", + "state": "TX", + 
"state_name": "Texas", + }, + { + "fips": "65000", + "name": "San Antonio", + "state": "TX", + "state_name": "Texas", + }, + { + "fips": "70808", + "name": "Sugar Land", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "74144", "name": "Tyler", "state": "TX", "state_name": "Texas"}, + {"fips": "76000", "name": "Waco", "state": "TX", "state_name": "Texas"}, + { + "fips": "79000", + "name": "Wichita Falls", + "state": "TX", + "state_name": "Texas", + }, + {"fips": "62470", "name": "Provo", "state": "UT", "state_name": "Utah"}, + { + "fips": "67000", + "name": "Salt Lake City", + "state": "UT", + "state_name": "Utah", + }, + { + "fips": "65330", + "name": "St. George", + "state": "UT", + "state_name": "Utah", + }, + { + "fips": "82950", + "name": "West Jordan", + "state": "UT", + "state_name": "Utah", + }, + { + "fips": "83470", + "name": "West Valley City", + "state": "UT", + "state_name": "Utah", + }, + { + "fips": "01000", + "name": "Alexandria", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "16000", + "name": "Chesapeake", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "35000", + "name": "Hampton", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "56000", + "name": "Newport News", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "57000", + "name": "Norfolk", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "67000", + "name": "Richmond", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "76432", + "name": "Suffolk", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "82000", + "name": "Virginia Beach", + "state": "VA", + "state_name": "Virginia", + }, + { + "fips": "05210", + "name": "Bellevue", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "22640", + "name": "Everett", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "35415", + "name": "Kent", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": 
"57745", + "name": "Renton", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "63000", + "name": "Seattle", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "67167", + "name": "Spokane Valley", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "67000", + "name": "Spokane", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "70000", + "name": "Tacoma", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "74060", + "name": "Vancouver", + "state": "WA", + "state_name": "Washington", + }, + { + "fips": "31000", + "name": "Green Bay", + "state": "WI", + "state_name": "Wisconsin", + }, + { + "fips": "48000", + "name": "Madison", + "state": "WI", + "state_name": "Wisconsin", + }, + { + "fips": "53000", + "name": "Milwaukee", + "state": "WI", + "state_name": "Wisconsin", + }, +] diff --git a/build/lib/policyengine/countries/us/data/states.py b/build/lib/policyengine/countries/us/data/states.py new file mode 100644 index 00000000..1309201b --- /dev/null +++ b/build/lib/policyengine/countries/us/data/states.py @@ -0,0 +1,59 @@ +"""US state definitions. + +All 50 states plus District of Columbia. 
+"""
+
+# All 50 US states plus the District of Columbia, mapped to their full names
+US_STATES: dict[str, str] = {
+    "AL": "Alabama",
+    "AK": "Alaska",
+    "AZ": "Arizona",
+    "AR": "Arkansas",
+    "CA": "California",
+    "CO": "Colorado",
+    "CT": "Connecticut",
+    "DE": "Delaware",
+    "DC": "District of Columbia",
+    "FL": "Florida",
+    "GA": "Georgia",
+    "HI": "Hawaii",
+    "ID": "Idaho",
+    "IL": "Illinois",
+    "IN": "Indiana",
+    "IA": "Iowa",
+    "KS": "Kansas",
+    "KY": "Kentucky",
+    "LA": "Louisiana",
+    "ME": "Maine",
+    "MD": "Maryland",
+    "MA": "Massachusetts",
+    "MI": "Michigan",
+    "MN": "Minnesota",
+    "MS": "Mississippi",
+    "MO": "Missouri",
+    "MT": "Montana",
+    "NE": "Nebraska",
+    "NV": "Nevada",
+    "NH": "New Hampshire",
+    "NJ": "New Jersey",
+    "NM": "New Mexico",
+    "NY": "New York",
+    "NC": "North Carolina",
+    "ND": "North Dakota",
+    "OH": "Ohio",
+    "OK": "Oklahoma",
+    "OR": "Oregon",
+    "PA": "Pennsylvania",
+    "RI": "Rhode Island",
+    "SC": "South Carolina",
+    "SD": "South Dakota",
+    "TN": "Tennessee",
+    "TX": "Texas",
+    "UT": "Utah",
+    "VT": "Vermont",
+    "VA": "Virginia",
+    "WA": "Washington",
+    "WV": "West Virginia",
+    "WI": "Wisconsin",
+    "WY": "Wyoming",
+}
diff --git a/build/lib/policyengine/countries/us/regions.py b/build/lib/policyengine/countries/us/regions.py
new file mode 100644
index 00000000..f335805f
--- /dev/null
+++ b/build/lib/policyengine/countries/us/regions.py
@@ -0,0 +1,120 @@
+"""US region registry builder.
+
+This module builds the complete US region registry from the data definitions
+in the data/ subdirectory:
+- data/states.py: State definitions
+- data/districts.py: Congressional district counts
+- data/places.py: Census places over 100K population
+"""
+
+from policyengine.core.region import Region, RegionRegistry
+from policyengine.core.release_manifest import resolve_region_dataset_path
+from policyengine.core.scoping_strategy import RowFilterStrategy
+
+from .data import AT_LARGE_STATES, DISTRICT_COUNTS, US_PLACES, US_STATES
+
+US_DATA_BUCKET = "gs://policyengine-us-data"
+
+
+def _ordinal(n: int) -> str:
+    """Return the full ordinal form of a number ("1st", "2nd", "3rd", etc.)."""
+    if 11 <= n % 100 <= 13:
+        return f"{n}th"
+    return f"{n}" + {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
+
+
+def build_us_region_registry() -> RegionRegistry:
+    """Build the complete US region registry.
+
+    Returns:
+        RegionRegistry containing:
+        - 1 national region
+        - 51 state regions (50 states + DC)
+        - 436 congressional district regions (435 + DC delegate)
+        - 333 place/city regions (Census places over 100K population)
+    """
+    regions: list[Region] = []
+
+    # 1. National region (has dedicated dataset)
+    regions.append(
+        Region(
+            code="us",
+            label="United States",
+            region_type="national",
+            dataset_path=resolve_region_dataset_path("us", "national"),
+        )
+    )
+
+    # 2. State regions (each has dedicated dataset)
+    for abbrev, name in US_STATES.items():
+        regions.append(
+            Region(
+                code=f"state/{abbrev.lower()}",
+                label=name,
+                region_type="state",
+                parent_code="us",
+                dataset_path=resolve_region_dataset_path(
+                    "us",
+                    "state",
+                    state_code=abbrev,
+                ),
+                state_code=abbrev,
+                state_name=name,
+            )
+        )
+
+    # 3. 
Congressional district regions (each has dedicated dataset) + for state_abbrev, count in DISTRICT_COUNTS.items(): + state_name = US_STATES[state_abbrev] + for i in range(1, count + 1): + district_code = f"{state_abbrev}-{i:02d}" + + # Create appropriate label + if state_abbrev in AT_LARGE_STATES: + label = f"{state_name}'s at-large congressional district" + else: + label = f"{state_name}'s {_ordinal(i)} congressional district" + + regions.append( + Region( + code=f"congressional_district/{district_code}", + label=label, + region_type="congressional_district", + parent_code=f"state/{state_abbrev.lower()}", + dataset_path=resolve_region_dataset_path( + "us", + "congressional_district", + district_code=district_code, + ), + state_code=state_abbrev, + state_name=state_name, + ) + ) + + # 4. Place/city regions (filter from state datasets) + for place in US_PLACES: + state_abbrev = place["state"] + fips = place["fips"] + regions.append( + Region( + code=f"place/{state_abbrev}-{fips}", + label=place["name"], + region_type="place", + parent_code=f"state/{state_abbrev.lower()}", + requires_filter=True, + filter_field="place_fips", + filter_value=fips, + state_code=state_abbrev, + state_name=place["state_name"], + scoping_strategy=RowFilterStrategy( + variable_name="place_fips", + variable_value=fips, + ), + ) + ) + + return RegionRegistry(country_id="us", regions=regions) + + +# Singleton instance for import +us_region_registry = build_us_region_registry() diff --git a/build/lib/policyengine/data/release_manifests/uk.json b/build/lib/policyengine/data/release_manifests/uk.json new file mode 100644 index 00000000..90cc1cc1 --- /dev/null +++ b/build/lib/policyengine/data/release_manifests/uk.json @@ -0,0 +1,45 @@ +{ + "schema_version": 1, + "bundle_id": "uk-3.4.0", + "country_id": "uk", + "policyengine_version": "3.4.0", + "model_package": { + "name": "policyengine-uk", + "version": "2.74.0" + }, + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4", + 
"repo_id": "policyengine/policyengine-uk-data-private" + }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-uk-data", + "version": "1.40.4" + }, + "build_id": "policyengine-uk-data-1.40.4", + "dataset": "enhanced_frs_2023_24", + "uri": "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + }, + "certification": { + "compatibility_basis": "exact_build_model_version", + "data_build_id": "policyengine-uk-data-1.40.4", + "built_with_model_version": "2.74.0", + "certified_for_model_version": "2.74.0", + "certified_by": "policyengine.py bundled manifest" + }, + "default_dataset": "enhanced_frs_2023_24", + "datasets": { + "frs_2023_24": { + "path": "frs_2023_24.h5" + }, + "enhanced_frs_2023_24": { + "path": "enhanced_frs_2023_24.h5" + } + }, + "region_datasets": { + "national": { + "path_template": "enhanced_frs_2023_24.h5" + } + } +} diff --git a/build/lib/policyengine/data/release_manifests/us.json b/build/lib/policyengine/data/release_manifests/us.json new file mode 100644 index 00000000..20526da9 --- /dev/null +++ b/build/lib/policyengine/data/release_manifests/us.json @@ -0,0 +1,48 @@ +{ + "schema_version": 1, + "bundle_id": "us-3.4.0", + "country_id": "us", + "policyengine_version": "3.4.0", + "model_package": { + "name": "policyengine-us", + "version": "1.602.0" + }, + "data_package": { + "name": "policyengine-us-data", + "version": "1.73.0", + "repo_id": "policyengine/policyengine-us-data" + }, + "certified_data_artifact": { + "data_package": { + "name": "policyengine-us-data", + "version": "1.73.0" + }, + "build_id": "policyengine-us-data-1.73.0", + "dataset": "enhanced_cps_2024", + "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" + }, + "certification": { + "compatibility_basis": "exact_build_model_version", + "data_build_id": "policyengine-us-data-1.73.0", + "built_with_model_version": "1.602.0", + "certified_for_model_version": "1.602.0", + "certified_by": "policyengine.py bundled 
manifest" + }, + "default_dataset": "enhanced_cps_2024", + "datasets": { + "enhanced_cps_2024": { + "path": "enhanced_cps_2024.h5" + } + }, + "region_datasets": { + "national": { + "path_template": "enhanced_cps_2024.h5" + }, + "state": { + "path_template": "states/{state_code}.h5" + }, + "congressional_district": { + "path_template": "districts/{district_code}.h5" + } + } +} diff --git a/build/lib/policyengine/outputs/__init__.py b/build/lib/policyengine/outputs/__init__.py new file mode 100644 index 00000000..61311f46 --- /dev/null +++ b/build/lib/policyengine/outputs/__init__.py @@ -0,0 +1,91 @@ +from policyengine.core import Output, OutputCollection +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) +from policyengine.outputs.congressional_district_impact import ( + CongressionalDistrictImpact, + compute_us_congressional_district_impacts, +) +from policyengine.outputs.constituency_impact import ( + ConstituencyImpact, + compute_uk_constituency_impacts, +) +from policyengine.outputs.decile_impact import ( + DecileImpact, + calculate_decile_impacts, +) +from policyengine.outputs.inequality import ( + UK_INEQUALITY_INCOME_VARIABLE, + US_INEQUALITY_INCOME_VARIABLE, + Inequality, + USInequalityPreset, + calculate_uk_inequality, + calculate_us_inequality, +) +from policyengine.outputs.intra_decile_impact import ( + IntraDecileImpact, + compute_intra_decile_impacts, +) +from policyengine.outputs.local_authority_impact import ( + LocalAuthorityImpact, + compute_uk_local_authority_impacts, +) +from policyengine.outputs.poverty import ( + AGE_GROUPS, + GENDER_GROUPS, + RACE_GROUPS, + UK_POVERTY_VARIABLES, + US_POVERTY_VARIABLES, + Poverty, + UKPovertyType, + USPovertyType, + calculate_uk_poverty_by_age, + calculate_uk_poverty_by_gender, + calculate_uk_poverty_rates, + calculate_us_poverty_by_age, + calculate_us_poverty_by_gender, + 
from enum import Enum
from typing import Any, Optional

from policyengine.core import Output, Simulation


class AggregateType(str, Enum):
    """Supported aggregation operations."""

    SUM = "sum"
    MEAN = "mean"
    COUNT = "count"


class Aggregate(Output):
    """Weighted aggregate of one variable in a single simulation.

    Optionally filters rows by a second variable, either by raw value
    (eq/leq/geq) or by quantile thresholds of that variable's distribution.
    """

    simulation: Simulation
    variable: str
    aggregate_type: AggregateType
    entity: Optional[str] = None

    # Raw-value (or quantile-fraction) filters on a secondary variable.
    filter_variable: Optional[str] = None
    filter_variable_eq: Optional[Any] = None
    filter_variable_leq: Optional[Any] = None
    filter_variable_geq: Optional[Any] = None
    filter_variable_describes_quantiles: bool = False

    # Convenient quantile specification (alternative to describes_quantiles)
    quantile: Optional[int] = (
        None  # Number of quantiles (e.g., 10 for deciles, 5 for quintiles)
    )
    quantile_eq: Optional[int] = None  # Exact quantile (e.g., 3 for 3rd decile)
    quantile_leq: Optional[int] = (
        None  # Maximum quantile (e.g., 5 for bottom 5 deciles)
    )
    quantile_geq: Optional[int] = None  # Minimum quantile (e.g., 9 for top 2 deciles)

    result: Optional[Any] = None

    def run(self):
        """Compute the aggregate and store it in ``self.result``."""
        # Translate the convenient quantile spec into the
        # describes_quantiles filter representation.
        if self.quantile is not None:
            self.filter_variable_describes_quantiles = True
            if self.quantile_eq is not None:
                # A single quantile q is the fraction band [(q-1)/n, q/n].
                self.filter_variable_geq = (self.quantile_eq - 1) / self.quantile
                self.filter_variable_leq = self.quantile_eq / self.quantile
            elif self.quantile_leq is not None:
                self.filter_variable_leq = self.quantile_leq / self.quantile
            elif self.quantile_geq is not None:
                self.filter_variable_geq = (self.quantile_geq - 1) / self.quantile

        # Resolve the variable's metadata to find its native entity.
        variable_obj = next(
            v
            for v in self.simulation.tax_benefit_model_version.variables
            if v.name == self.variable
        )

        target_entity = self.entity or variable_obj.entity
        entity_frame = getattr(self.simulation.output_dataset.data, target_entity)

        # Project the variable onto the target entity when it lives elsewhere.
        if variable_obj.entity != target_entity:
            mapped_frame = self.simulation.output_dataset.data.map_to_entity(
                variable_obj.entity, target_entity, columns=[self.variable]
            )
            values = mapped_frame[self.variable]
        else:
            values = entity_frame[self.variable]

        if self.filter_variable is not None:
            filter_obj = next(
                v
                for v in self.simulation.tax_benefit_model_version.variables
                if v.name == self.filter_variable
            )

            if filter_obj.entity != target_entity:
                filter_frame = self.simulation.output_dataset.data.map_to_entity(
                    filter_obj.entity,
                    target_entity,
                    columns=[self.filter_variable],
                )
                filter_values = filter_frame[self.filter_variable]
            else:
                filter_values = entity_frame[self.filter_variable]

            if self.filter_variable_describes_quantiles:
                # Filter bounds are fractions of the filter variable's
                # own distribution.
                if self.filter_variable_eq is not None:
                    threshold = filter_values.quantile(self.filter_variable_eq)
                    values = values[filter_values <= threshold]
                if self.filter_variable_leq is not None:
                    threshold = filter_values.quantile(self.filter_variable_leq)
                    values = values[filter_values <= threshold]
                if self.filter_variable_geq is not None:
                    threshold = filter_values.quantile(self.filter_variable_geq)
                    values = values[filter_values >= threshold]
            else:
                if self.filter_variable_eq is not None:
                    values = values[filter_values == self.filter_variable_eq]
                if self.filter_variable_leq is not None:
                    values = values[filter_values <= self.filter_variable_leq]
                if self.filter_variable_geq is not None:
                    values = values[filter_values >= self.filter_variable_geq]

        # MicroSeries applies survey weights inside these reductions.
        if self.aggregate_type == AggregateType.SUM:
            self.result = values.sum()
        elif self.aggregate_type == AggregateType.MEAN:
            self.result = values.mean()
        elif self.aggregate_type == AggregateType.COUNT:
            self.result = values.count()
from enum import Enum
from typing import Any, Optional

from policyengine.core import Output, Simulation


class ChangeAggregateType(str, Enum):
    """Supported aggregations over a baseline-vs-reform change."""

    COUNT = "count"
    SUM = "sum"
    MEAN = "mean"


class ChangeAggregate(Output):
    """Aggregate of the per-row change in a variable between two simulations.

    The change is reform minus baseline, optionally filtered by absolute
    change, relative change, or the (baseline) value of another variable —
    either raw thresholds or quantile fractions of its distribution.
    """

    baseline_simulation: Simulation
    reform_simulation: Simulation
    variable: str
    aggregate_type: ChangeAggregateType
    entity: Optional[str] = None

    # Filter by absolute change
    change_geq: Optional[float] = None  # Change >= value (e.g., gain >= 500)
    change_leq: Optional[float] = None  # Change <= value (e.g., loss <= -500)
    change_eq: Optional[float] = None  # Change == value

    # Filter by relative change (as decimal, e.g., 0.05 = 5%)
    relative_change_geq: Optional[float] = None  # Relative change >= value
    relative_change_leq: Optional[float] = None  # Relative change <= value
    relative_change_eq: Optional[float] = None  # Relative change == value

    # Filter by another variable (e.g., only count people with age >= 30)
    filter_variable: Optional[str] = None
    filter_variable_eq: Optional[Any] = None
    filter_variable_leq: Optional[Any] = None
    filter_variable_geq: Optional[Any] = None
    filter_variable_describes_quantiles: bool = False

    # Convenient quantile specification (alternative to describes_quantiles)
    quantile: Optional[int] = (
        None  # Number of quantiles (e.g., 10 for deciles, 5 for quintiles)
    )
    quantile_eq: Optional[int] = None  # Exact quantile (e.g., 3 for 3rd decile)
    quantile_leq: Optional[int] = (
        None  # Maximum quantile (e.g., 5 for bottom 5 deciles)
    )
    quantile_geq: Optional[int] = None  # Minimum quantile (e.g., 9 for top 2 deciles)

    result: Optional[Any] = None

    def run(self):
        """Compute the filtered change aggregate into ``self.result``."""
        import numpy as np

        # Translate the convenient quantile spec into the
        # describes_quantiles filter representation.
        if self.quantile is not None:
            self.filter_variable_describes_quantiles = True
            if self.quantile_eq is not None:
                # A single quantile q is the fraction band [(q-1)/n, q/n].
                self.filter_variable_geq = (self.quantile_eq - 1) / self.quantile
                self.filter_variable_leq = self.quantile_eq / self.quantile
            elif self.quantile_leq is not None:
                self.filter_variable_leq = self.quantile_leq / self.quantile
            elif self.quantile_geq is not None:
                self.filter_variable_geq = (self.quantile_geq - 1) / self.quantile

        # Resolve the variable's metadata to find its native entity.
        variable_obj = next(
            v
            for v in self.baseline_simulation.tax_benefit_model_version.variables
            if v.name == self.variable
        )

        target_entity = self.entity or variable_obj.entity
        baseline_data = getattr(
            self.baseline_simulation.output_dataset.data, target_entity
        )
        reform_data = getattr(
            self.reform_simulation.output_dataset.data, target_entity
        )

        # Project the variable onto the target entity when it lives elsewhere.
        if variable_obj.entity != target_entity:
            baseline_series = (
                self.baseline_simulation.output_dataset.data.map_to_entity(
                    variable_obj.entity, target_entity
                )[self.variable]
            )
            reform_series = (
                self.reform_simulation.output_dataset.data.map_to_entity(
                    variable_obj.entity, target_entity
                )[self.variable]
            )
        else:
            baseline_series = baseline_data[self.variable]
            reform_series = reform_data[self.variable]

        # Per-row change, reform minus baseline.
        change_series = reform_series - baseline_series

        # Relative change is undefined where baseline is zero; mask the
        # resulting infinities to NaN so relative filters never match them.
        with np.errstate(divide="ignore", invalid="ignore"):
            relative_change_series = change_series / baseline_series
        relative_change_series = relative_change_series.replace(
            [np.inf, -np.inf], np.nan
        )

        # Start from all rows with a defined baseline value.
        mask = baseline_series.notna()

        # Absolute change filters.
        if self.change_eq is not None:
            mask &= change_series == self.change_eq
        if self.change_leq is not None:
            mask &= change_series <= self.change_leq
        if self.change_geq is not None:
            mask &= change_series >= self.change_geq

        # Relative change filters.
        if self.relative_change_eq is not None:
            mask &= relative_change_series == self.relative_change_eq
        if self.relative_change_leq is not None:
            mask &= relative_change_series <= self.relative_change_leq
        if self.relative_change_geq is not None:
            mask &= relative_change_series >= self.relative_change_geq

        # Filters on a secondary variable (taken from the baseline run).
        if self.filter_variable is not None:
            filter_obj = next(
                v
                for v in self.baseline_simulation.tax_benefit_model_version.variables
                if v.name == self.filter_variable
            )

            if filter_obj.entity != target_entity:
                filter_series = (
                    self.baseline_simulation.output_dataset.data.map_to_entity(
                        filter_obj.entity, target_entity
                    )[self.filter_variable]
                )
            else:
                filter_series = baseline_data[self.filter_variable]

            if self.filter_variable_describes_quantiles:
                if self.filter_variable_eq is not None:
                    threshold = filter_series.quantile(self.filter_variable_eq)
                    mask &= filter_series <= threshold
                if self.filter_variable_leq is not None:
                    threshold = filter_series.quantile(self.filter_variable_leq)
                    mask &= filter_series <= threshold
                if self.filter_variable_geq is not None:
                    threshold = filter_series.quantile(self.filter_variable_geq)
                    mask &= filter_series >= threshold
            else:
                if self.filter_variable_eq is not None:
                    mask &= filter_series == self.filter_variable_eq
                if self.filter_variable_leq is not None:
                    mask &= filter_series <= self.filter_variable_leq
                if self.filter_variable_geq is not None:
                    mask &= filter_series >= self.filter_variable_geq

        filtered_change = change_series[mask]

        if self.aggregate_type == ChangeAggregateType.COUNT:
            self.result = filtered_change.count()
        elif self.aggregate_type == ChangeAggregateType.SUM:
            self.result = filtered_change.sum()
        elif self.aggregate_type == ChangeAggregateType.MEAN:
            self.result = filtered_change.mean()
class CongressionalDistrictImpact(Output):
    """Per-congressional-district income change from a policy reform.

    Groups households by congressional_district_geoid (integer SSDD format
    where SS = state FIPS, DD = district number) and computes weighted
    average and relative household income changes per district, plus the
    district-level shares of people who are winners, losers, or unchanged.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    baseline_simulation: "Simulation"
    reform_simulation: "Simulation"

    # Results populated by run()
    district_results: Optional[list[dict]] = None

    def run(self) -> None:
        """Group households by geoid and compute per-district metrics."""
        baseline_hh = self.baseline_simulation.output_dataset.data.household
        reform_hh = self.reform_simulation.output_dataset.data.household

        geoids = baseline_hh["congressional_district_geoid"].values
        baseline_income = baseline_hh["household_net_income"].values
        reform_income = reform_hh["household_net_income"].values
        weights = baseline_hh["household_weight"].values
        # Fall back to one person per household when the count column is absent.
        household_count_people = (
            baseline_hh["household_count_people"].values
            if "household_count_people" in baseline_hh.columns
            else np.ones_like(weights)
        )

        # Only include valid geoids (positive integers)
        valid_geoids = np.unique(geoids[geoids > 0])

        district_rows: list[dict] = []
        for geoid in valid_geoids:
            in_district = geoids == geoid
            district_weights = weights[in_district]
            total_weight = float(district_weights.sum())
            if total_weight == 0:
                continue

            district_baseline = baseline_income[in_district]
            district_reform = reform_income[in_district]
            people_weights = household_count_people[in_district] * district_weights

            weighted_baseline = float((district_baseline * district_weights).sum())
            weighted_reform = float((district_reform * district_weights).sum())

            avg_change = (weighted_reform - weighted_baseline) / total_weight
            rel_change = (
                (weighted_reform / weighted_baseline - 1.0)
                if weighted_baseline != 0
                else 0.0
            )
            # Floor the denominator at 1 so near-zero baselines do not
            # produce exploding per-household ratios.
            capped_baseline = np.maximum(district_baseline, 1.0)
            income_change = (district_reform - district_baseline) / capped_baseline
            people_total = float(people_weights.sum())

            if people_total == 0:
                winner_percentage = 0.0
                loser_percentage = 0.0
                no_change_percentage = 1.0
            else:
                # Partition people by a +/-0.1% change threshold.
                winner_percentage = float(
                    people_weights[income_change > 1e-3].sum() / people_total
                )
                loser_percentage = float(
                    people_weights[income_change <= -1e-3].sum() / people_total
                )
                no_change_percentage = float(
                    people_weights[
                        (income_change > -1e-3) & (income_change <= 1e-3)
                    ].sum()
                    / people_total
                )

            geoid_int = int(geoid)

            district_rows.append(
                {
                    "district_geoid": geoid_int,
                    "state_fips": geoid_int // 100,
                    "district_number": geoid_int % 100,
                    "average_household_income_change": float(avg_change),
                    "relative_household_income_change": float(rel_change),
                    "winner_percentage": winner_percentage,
                    "loser_percentage": loser_percentage,
                    "no_change_percentage": no_change_percentage,
                    "population": total_weight,
                }
            )

        self.district_results = district_rows


def compute_us_congressional_district_impacts(
    baseline_simulation: "Simulation",
    reform_simulation: "Simulation",
) -> CongressionalDistrictImpact:
    """Compute per-congressional-district income changes.

    Args:
        baseline_simulation: Completed baseline simulation.
        reform_simulation: Completed reform simulation.

    Returns:
        CongressionalDistrictImpact with district_results populated.
    """
    # model_construct skips pydantic validation of the Simulation fields.
    impact = CongressionalDistrictImpact.model_construct(
        baseline_simulation=baseline_simulation,
        reform_simulation=reform_simulation,
    )
    impact.run()
    return impact
class ConstituencyImpact(Output):
    """Per-parliamentary-constituency income change from a UK policy reform.

    Uses pre-computed weight matrices from GCS to reweight households
    for each of 650 constituencies, then computes weighted average and
    relative household income changes.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    baseline_simulation: "Simulation"
    reform_simulation: "Simulation"
    weight_matrix_path: str
    constituency_csv_path: str
    year: str = "2025"

    # Results populated by run()
    constituency_results: Optional[list[dict]] = None

    def run(self) -> None:
        """Load the weight matrix and compute per-constituency metrics."""
        # Constituency metadata: code, name, and map grid position (x, y).
        metadata = pd.read_csv(self.constituency_csv_path)

        # Weight matrix rows are constituencies, columns are households.
        with h5py.File(self.weight_matrix_path, "r") as weights_file:
            weight_matrix = weights_file[self.year][...]

        baseline_hh = self.baseline_simulation.output_dataset.data.household
        reform_hh = self.reform_simulation.output_dataset.data.household

        baseline_income = baseline_hh["household_net_income"].values
        reform_income = reform_hh["household_net_income"].values

        rows: list[dict] = []
        for index in range(len(metadata)):
            record = metadata.iloc[index]
            code = str(record["code"])
            name = str(record["name"])
            x = int(record["x"])
            y = int(record["y"])
            constituency_weights = weight_matrix[index]

            total_weight = float(np.sum(constituency_weights))
            if total_weight == 0:
                continue

            weighted_baseline = float(
                np.sum(baseline_income * constituency_weights)
            )
            weighted_reform = float(np.sum(reform_income * constituency_weights))

            # Skip constituencies with no positively-weighted households
            # (possible if the calibration produced non-positive weights).
            if float(np.sum(constituency_weights > 0)) == 0:
                continue

            avg_change = (weighted_reform - weighted_baseline) / total_weight
            rel_change = (
                (weighted_reform / weighted_baseline - 1.0)
                if weighted_baseline != 0
                else 0.0
            )

            rows.append(
                {
                    "constituency_code": code,
                    "constituency_name": name,
                    "x": x,
                    "y": y,
                    "average_household_income_change": float(avg_change),
                    "relative_household_income_change": float(rel_change),
                    "population": total_weight,
                }
            )

        self.constituency_results = rows


def compute_uk_constituency_impacts(
    baseline_simulation: "Simulation",
    reform_simulation: "Simulation",
    weight_matrix_path: str,
    constituency_csv_path: str,
    year: str = "2025",
) -> ConstituencyImpact:
    """Compute per-constituency income changes for UK.

    Args:
        baseline_simulation: Completed baseline simulation.
        reform_simulation: Completed reform simulation.
        weight_matrix_path: Path to parliamentary_constituency_weights.h5.
        constituency_csv_path: Path to constituencies_2024.csv.
        year: Year key in the H5 file (default "2025").

    Returns:
        ConstituencyImpact with constituency_results populated.
    """
    # model_construct skips pydantic validation of the Simulation fields.
    impact = ConstituencyImpact.model_construct(
        baseline_simulation=baseline_simulation,
        reform_simulation=reform_simulation,
        weight_matrix_path=weight_matrix_path,
        constituency_csv_path=constituency_csv_path,
        year=year,
    )
    impact.run()
    return impact
class DecileImpact(Output):
    """Single decile's impact from a policy reform - represents one database row."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    baseline_simulation: Simulation
    reform_simulation: Simulation
    income_variable: str = "equiv_hbai_household_net_income"
    decile_variable: Optional[str] = None  # If set, use pre-computed grouping variable
    entity: Optional[str] = None
    decile: int
    quantiles: int = 10

    # Results populated by run()
    baseline_mean: Optional[float] = None
    reform_mean: Optional[float] = None
    absolute_change: Optional[float] = None
    relative_change: Optional[float] = None
    count_better_off: Optional[float] = None
    count_worse_off: Optional[float] = None
    count_no_change: Optional[float] = None

    def run(self):
        """Calculate impact for this specific decile."""
        # Get variable object to determine entity
        var_obj = next(
            v
            for v in self.baseline_simulation.tax_benefit_model_version.variables
            if v.name == self.income_variable
        )

        # Get target entity
        target_entity = self.entity or var_obj.entity

        # Get data from both simulations
        baseline_data = getattr(
            self.baseline_simulation.output_dataset.data, target_entity
        )
        reform_data = getattr(
            self.reform_simulation.output_dataset.data, target_entity
        )

        # Map income variable to target entity if needed
        if var_obj.entity != target_entity:
            baseline_mapped = (
                self.baseline_simulation.output_dataset.data.map_to_entity(
                    var_obj.entity, target_entity
                )
            )
            baseline_income = baseline_mapped[self.income_variable]

            reform_mapped = self.reform_simulation.output_dataset.data.map_to_entity(
                var_obj.entity, target_entity
            )
            reform_income = reform_mapped[self.income_variable]
        else:
            baseline_income = baseline_data[self.income_variable]
            reform_income = reform_data[self.income_variable]

        # Calculate deciles: use pre-computed variable or qcut
        if self.decile_variable:
            decile_series = baseline_data[self.decile_variable]
        else:
            decile_series = (
                pd.qcut(
                    baseline_income,
                    self.quantiles,
                    labels=False,
                    duplicates="drop",
                )
                + 1
            )

        # Calculate changes. Rows with zero baseline income would produce
        # +/-inf relative changes and corrupt the decile mean, so mask them
        # to NaN (pandas mean() skips NaN) — consistent with the inf -> NaN
        # handling in ChangeAggregate.
        absolute_change = reform_income - baseline_income
        relative_change = (absolute_change / baseline_income) * 100
        relative_change = relative_change.replace(
            [float("inf"), float("-inf")], float("nan")
        )

        # Filter to this decile
        mask = decile_series == self.decile

        # Populate results
        self.baseline_mean = float(baseline_income[mask].mean())
        self.reform_mean = float(reform_income[mask].mean())
        self.absolute_change = float(absolute_change[mask].mean())
        self.relative_change = float(relative_change[mask].mean())
        self.count_better_off = float((absolute_change[mask] > 0).sum())
        self.count_worse_off = float((absolute_change[mask] < 0).sum())
        self.count_no_change = float((absolute_change[mask] == 0).sum())


def calculate_decile_impacts(
    dataset: Optional[Dataset] = None,
    tax_benefit_model_version: Optional[TaxBenefitModelVersion] = None,
    baseline_policy: Optional[Policy] = None,
    reform_policy: Optional[Policy] = None,
    dynamic: Optional[Dynamic] = None,
    income_variable: str = "equiv_hbai_household_net_income",
    entity: Optional[str] = None,
    quantiles: int = 10,
    baseline_simulation: Optional[Simulation] = None,
    reform_simulation: Optional[Simulation] = None,
) -> OutputCollection[DecileImpact]:
    """Calculate decile-by-decile impact of a reform.

    Either pass pre-built (and matching) baseline/reform simulations, or the
    dataset/model/policy pieces from which both simulations are constructed.

    Returns:
        OutputCollection containing list of DecileImpact objects and DataFrame

    Raises:
        ValueError: If only one of the two simulations is provided, or if
            neither is provided and dataset/model version are missing.
    """
    if (baseline_simulation is None) != (reform_simulation is None):
        raise ValueError(
            "baseline_simulation and reform_simulation must be provided together"
        )

    if baseline_simulation is None:
        if dataset is None or tax_benefit_model_version is None:
            raise ValueError(
                "dataset and tax_benefit_model_version are required when simulations are not provided"
            )

        baseline_simulation = Simulation(
            dataset=dataset,
            tax_benefit_model_version=tax_benefit_model_version,
            policy=baseline_policy,
            dynamic=dynamic,
        )
        reform_simulation = Simulation(
            dataset=dataset,
            tax_benefit_model_version=tax_benefit_model_version,
            policy=reform_policy,
            dynamic=dynamic,
        )

    # Make sure both simulations have been run before reading outputs.
    baseline_simulation.ensure()
    reform_simulation.ensure()

    results = []
    for decile in range(1, quantiles + 1):
        impact = DecileImpact(
            baseline_simulation=baseline_simulation,
            reform_simulation=reform_simulation,
            income_variable=income_variable,
            entity=entity,
            decile=decile,
            quantiles=quantiles,
        )
        impact.run()
        results.append(impact)

    # Flatten the per-decile results into one row per decile.
    df = pd.DataFrame(
        [
            {
                "baseline_simulation_id": r.baseline_simulation.id,
                "reform_simulation_id": r.reform_simulation.id,
                "income_variable": r.income_variable,
                "decile": r.decile,
                "baseline_mean": r.baseline_mean,
                "reform_mean": r.reform_mean,
                "absolute_change": r.absolute_change,
                "relative_change": r.relative_change,
                "count_better_off": r.count_better_off,
                "count_worse_off": r.count_worse_off,
                "count_no_change": r.count_no_change,
            }
            for r in results
        ]
    )

    return OutputCollection(outputs=results, dataframe=df)
class USInequalityPreset(str, Enum):
    """Preset configurations for US inequality analysis."""

    STANDARD = "standard"
    CBO_COMPARABLE = "cbo_comparable"


def _gini(values: np.ndarray, weights: np.ndarray) -> float:
    """Calculate weighted Gini coefficient.

    Args:
        values: Array of income values
        weights: Array of weights

    Returns:
        Gini coefficient between 0 (perfect equality) and 1 (perfect inequality)
    """
    # Degenerate inputs: empty data or zero total weight.
    if len(values) == 0 or weights.sum() == 0:
        return 0.0

    # Order observations by income.
    order = np.argsort(values)
    ordered_values = values[order]
    ordered_weights = weights[order]

    cumulative_weight = np.cumsum(ordered_weights)
    total_weight = cumulative_weight[-1]
    cumulative_income = np.cumsum(ordered_values * ordered_weights)
    total_income = cumulative_income[-1]

    if total_income == 0:
        return 0.0

    # Gini = 1 - 2 * (area under the Lorenz curve), with the area taken
    # by the trapezoidal rule over the weight fractions.
    lorenz = cumulative_income / total_income
    weight_fractions = ordered_weights / total_weight
    area_under_lorenz = np.sum(weight_fractions * (lorenz - weight_fractions / 2))

    return float(1 - 2 * area_under_lorenz)
def _series_for_entity(
    simulation: Simulation, variable_name: str, target_entity: str, data: pd.DataFrame
) -> pd.Series:
    """Return a variable series aligned to the requested entity."""
    variable = simulation.tax_benefit_model_version.get_variable(variable_name)

    if variable.entity == target_entity:
        return data[variable_name]

    # Project the variable from its native entity onto the requested one.
    mapped = simulation.output_dataset.data.map_to_entity(
        variable.entity,
        target_entity,
        columns=[variable_name],
    )
    return mapped[variable_name]


class Inequality(Output):
    """Single inequality measure result - represents one database row.

    This is a single-simulation output type that calculates inequality
    metrics for a given income variable, optionally filtered by
    demographic variables.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    simulation: Simulation
    income_variable: str
    entity: str = "household"
    weight_multiplier_variable: Optional[str] = None
    equivalization_variable: Optional[str] = None
    equivalization_power: float = 0.0

    # Optional demographic filters
    filter_variable: Optional[str] = None
    filter_variable_eq: Optional[Any] = None
    filter_variable_leq: Optional[Any] = None
    filter_variable_geq: Optional[Any] = None

    # Results populated by run()
    gini: Optional[float] = None
    top_10_share: Optional[float] = None
    top_1_share: Optional[float] = None
    bottom_50_share: Optional[float] = None

    def run(self):
        """Calculate inequality metrics."""
        target_entity = self.entity
        data = getattr(self.simulation.output_dataset.data, target_entity)

        income_series = _series_for_entity(
            self.simulation, self.income_variable, target_entity, data
        )

        # Entity weights, defaulting to 1 when no weight column exists.
        weight_col = f"{target_entity}_weight"
        if weight_col in data.columns:
            weights = data[weight_col]
        else:
            weights = pd.Series(
                np.ones(len(income_series)), index=income_series.index
            )

        if self.weight_multiplier_variable is not None:
            weight_multiplier = _series_for_entity(
                self.simulation,
                self.weight_multiplier_variable,
                target_entity,
                data,
            )
            weights = weights * weight_multiplier

        # Apply demographic filter if specified
        if self.filter_variable is not None:
            filter_series = _series_for_entity(
                self.simulation, self.filter_variable, target_entity, data
            )

            # Build filter mask
            mask = filter_series.notna()
            if self.filter_variable_eq is not None:
                mask &= filter_series == self.filter_variable_eq
            if self.filter_variable_leq is not None:
                mask &= filter_series <= self.filter_variable_leq
            if self.filter_variable_geq is not None:
                mask &= filter_series >= self.filter_variable_geq

            income_series = income_series[mask]
            weights = weights[mask]

        # Optional equivalization: divide income by (e.g. household size)^power.
        equivalization_arr = None
        if self.equivalization_variable is not None and self.equivalization_power != 0:
            equivalization_series = _series_for_entity(
                self.simulation,
                self.equivalization_variable,
                target_entity,
                data,
            )
            if self.filter_variable is not None:
                equivalization_series = equivalization_series[mask]
            equivalization_arr = pd.to_numeric(
                equivalization_series, errors="coerce"
            ).to_numpy(dtype=float)

        # Convert to plain float arrays for the numeric routines below.
        values = pd.to_numeric(income_series, errors="coerce").to_numpy(dtype=float)
        weights_arr = pd.to_numeric(weights, errors="coerce").to_numpy(dtype=float)

        # Drop NaNs (and non-positive equivalization factors, which would
        # make the power divide meaningless).
        valid_mask = ~np.isnan(values) & ~np.isnan(weights_arr)
        if equivalization_arr is not None:
            valid_mask &= ~np.isnan(equivalization_arr) & (equivalization_arr > 0)

        values = values[valid_mask]
        weights_arr = weights_arr[valid_mask]
        if equivalization_arr is not None:
            values = values / np.power(
                equivalization_arr[valid_mask], self.equivalization_power
            )

        # Gini coefficient.
        self.gini = _gini(values, weights_arr)

        # Income shares.
        if len(values) > 0 and weights_arr.sum() > 0:
            total_income = np.sum(values * weights_arr)

            if total_income > 0:
                order = np.argsort(values)
                sorted_values = values[order]
                sorted_weights = weights_arr[order]

                # Cumulative population fractions by income rank.
                cumulative_weights = np.cumsum(sorted_weights)
                total_weight = cumulative_weights[-1]
                weight_fractions = cumulative_weights / total_weight

                # Top 10% share
                top_10_mask = weight_fractions > 0.9
                self.top_10_share = float(
                    np.sum(sorted_values[top_10_mask] * sorted_weights[top_10_mask])
                    / total_income
                )

                # Top 1% share
                top_1_mask = weight_fractions > 0.99
                self.top_1_share = float(
                    np.sum(sorted_values[top_1_mask] * sorted_weights[top_1_mask])
                    / total_income
                )

                # Bottom 50% share
                bottom_50_mask = weight_fractions <= 0.5
                self.bottom_50_share = float(
                    np.sum(
                        sorted_values[bottom_50_mask]
                        * sorted_weights[bottom_50_mask]
                    )
                    / total_income
                )
            else:
                self.top_10_share = 0.0
                self.top_1_share = 0.0
                self.bottom_50_share = 0.0
        else:
            self.top_10_share = 0.0
            self.top_1_share = 0.0
            self.bottom_50_share = 0.0


# Default income variables for each country
UK_INEQUALITY_INCOME_VARIABLE = "equiv_hbai_household_net_income"
US_INEQUALITY_INCOME_VARIABLE = "household_net_income"
def calculate_uk_inequality(
    simulation: Simulation,
    income_variable: str = UK_INEQUALITY_INCOME_VARIABLE,
    filter_variable: Optional[str] = None,
    filter_variable_eq: Optional[Any] = None,
    filter_variable_leq: Optional[Any] = None,
    filter_variable_geq: Optional[Any] = None,
) -> Inequality:
    """Calculate inequality metrics for a UK simulation.

    Args:
        simulation: The simulation to analyse
        income_variable: Income variable to use (default: equiv_hbai_household_net_income)
        filter_variable: Optional variable to filter by
        filter_variable_eq: Filter for exact match
        filter_variable_leq: Filter for less than or equal
        filter_variable_geq: Filter for greater than or equal

    Returns:
        Inequality object with Gini and income share metrics
    """
    result = Inequality(
        simulation=simulation,
        income_variable=income_variable,
        entity="household",
        filter_variable=filter_variable,
        filter_variable_eq=filter_variable_eq,
        filter_variable_leq=filter_variable_leq,
        filter_variable_geq=filter_variable_geq,
    )
    result.run()
    return result


def calculate_us_inequality(
    simulation: Simulation,
    income_variable: str = US_INEQUALITY_INCOME_VARIABLE,
    preset: Union[USInequalityPreset, str] = USInequalityPreset.STANDARD,
    filter_variable: Optional[str] = None,
    filter_variable_eq: Optional[Any] = None,
    filter_variable_leq: Optional[Any] = None,
    filter_variable_geq: Optional[Any] = None,
) -> Inequality:
    """Calculate inequality metrics for a US simulation.

    Args:
        simulation: The simulation to analyse
        income_variable: Income variable to use (default: household_net_income)
        preset: Optional preset for weighting/equivalization
        filter_variable: Optional variable to filter by
        filter_variable_eq: Filter for exact match
        filter_variable_leq: Filter for less than or equal
        filter_variable_geq: Filter for greater than or equal

    Returns:
        Inequality object with Gini and income share metrics

    Raises:
        ValueError: If ``preset`` is not a valid USInequalityPreset value.
    """
    # Coerce strings to the enum; an unknown preset raises ValueError here.
    preset = USInequalityPreset(preset)

    # CBO-comparable results weight by people and square-root equivalize.
    preset_kwargs = {}
    if preset == USInequalityPreset.CBO_COMPARABLE:
        preset_kwargs = {
            "weight_multiplier_variable": "household_count_people",
            "equivalization_variable": "household_count_people",
            "equivalization_power": 0.5,
        }

    result = Inequality(
        simulation=simulation,
        income_variable=income_variable,
        entity="household",
        **preset_kwargs,
        filter_variable=filter_variable,
        filter_variable_eq=filter_variable_eq,
        filter_variable_leq=filter_variable_leq,
        filter_variable_geq=filter_variable_geq,
    )
    result.run()
    return result
+""" + +from typing import Optional + +import numpy as np +import pandas as pd +from pydantic import ConfigDict + +from policyengine.core import Output, OutputCollection, Simulation + +# The 5-category thresholds +BOUNDS = [-np.inf, -0.05, -1e-3, 1e-3, 0.05, np.inf] +CATEGORY_NAMES = [ + "lose_more_than_5pct", + "lose_less_than_5pct", + "no_change", + "gain_less_than_5pct", + "gain_more_than_5pct", +] + + +class IntraDecileImpact(Output): + """Single decile's intra-decile impact — proportion of people in each + income change category.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + baseline_simulation: Simulation + reform_simulation: Simulation + income_variable: str = "household_net_income" + decile_variable: Optional[str] = None # If set, use pre-computed grouping + entity: str = "household" + decile: int # 1-10 for individual deciles + quantiles: int = 10 + + # Results populated by run() + lose_more_than_5pct: Optional[float] = None + lose_less_than_5pct: Optional[float] = None + no_change: Optional[float] = None + gain_less_than_5pct: Optional[float] = None + gain_more_than_5pct: Optional[float] = None + + def run(self): + """Calculate intra-decile proportions for this specific decile.""" + baseline_data = getattr( + self.baseline_simulation.output_dataset.data, self.entity + ) + reform_data = getattr(self.reform_simulation.output_dataset.data, self.entity) + + baseline_income = baseline_data[self.income_variable].values + reform_income = reform_data[self.income_variable].values + + # Determine decile grouping + if self.decile_variable: + decile_series = baseline_data[self.decile_variable].values + else: + decile_series = ( + pd.qcut( + baseline_income, + self.quantiles, + labels=False, + duplicates="drop", + ) + + 1 + ) + + # People-weighted counts + weights = baseline_data[f"{self.entity}_weight"].values + if self.entity == "household": + people_count = baseline_data["household_count_people"].values + people = people_count * weights + else: 
+ people = weights + + # Compute percentage income change + capped_baseline = np.maximum(baseline_income, 1.0) + income_change = (reform_income - baseline_income) / capped_baseline + + in_decile = decile_series == self.decile + people_in_decile = float(np.sum(people[in_decile])) + + if people_in_decile == 0: + self.lose_more_than_5pct = 0.0 + self.lose_less_than_5pct = 0.0 + self.no_change = 1.0 + self.gain_less_than_5pct = 0.0 + self.gain_more_than_5pct = 0.0 + return + + proportions = [] + for lower, upper in zip(BOUNDS[:-1], BOUNDS[1:]): + in_category = (income_change > lower) & (income_change <= upper) + in_both = in_decile & in_category + proportions.append(float(np.sum(people[in_both]) / people_in_decile)) + + self.lose_more_than_5pct = proportions[0] + self.lose_less_than_5pct = proportions[1] + self.no_change = proportions[2] + self.gain_less_than_5pct = proportions[3] + self.gain_more_than_5pct = proportions[4] + + +def compute_intra_decile_impacts( + baseline_simulation: Simulation, + reform_simulation: Simulation, + income_variable: str = "household_net_income", + decile_variable: Optional[str] = None, + entity: str = "household", + quantiles: int = 10, +) -> OutputCollection[IntraDecileImpact]: + """Compute intra-decile proportions for all deciles + overall average. + + Returns: + OutputCollection containing list of IntraDecileImpact objects + (deciles 1-N plus overall average at decile=0) and DataFrame. 
+ """ + results = [] + for decile in range(1, quantiles + 1): + impact = IntraDecileImpact.model_construct( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + income_variable=income_variable, + decile_variable=decile_variable, + entity=entity, + decile=decile, + quantiles=quantiles, + ) + impact.run() + results.append(impact) + + # Overall average (decile=0): arithmetic mean of decile proportions + overall = IntraDecileImpact.model_construct( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + income_variable=income_variable, + decile_variable=decile_variable, + entity=entity, + decile=0, + quantiles=quantiles, + lose_more_than_5pct=sum(r.lose_more_than_5pct for r in results) / quantiles, + lose_less_than_5pct=sum(r.lose_less_than_5pct for r in results) / quantiles, + no_change=sum(r.no_change for r in results) / quantiles, + gain_less_than_5pct=sum(r.gain_less_than_5pct for r in results) / quantiles, + gain_more_than_5pct=sum(r.gain_more_than_5pct for r in results) / quantiles, + ) + results.append(overall) + + # Create DataFrame + df = pd.DataFrame( + [ + { + "baseline_simulation_id": r.baseline_simulation.id, + "reform_simulation_id": r.reform_simulation.id, + "decile": r.decile, + "lose_more_than_5pct": r.lose_more_than_5pct, + "lose_less_than_5pct": r.lose_less_than_5pct, + "no_change": r.no_change, + "gain_less_than_5pct": r.gain_less_than_5pct, + "gain_more_than_5pct": r.gain_more_than_5pct, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) diff --git a/build/lib/policyengine/outputs/local_authority_impact.py b/build/lib/policyengine/outputs/local_authority_impact.py new file mode 100644 index 00000000..20b17efe --- /dev/null +++ b/build/lib/policyengine/outputs/local_authority_impact.py @@ -0,0 +1,125 @@ +"""UK local authority impact output class. + +Computes per-local-authority income changes using pre-computed weight matrices. 
Each local authority has a row in the weight matrix (shape: 360 x N_households)
that reweights all households to represent that local authority's demographics.
"""

from typing import TYPE_CHECKING, Optional

import h5py
import numpy as np
import pandas as pd
from pydantic import ConfigDict

from policyengine.core import Output

if TYPE_CHECKING:
    from policyengine.core.simulation import Simulation


class LocalAuthorityImpact(Output):
    """Per-local-authority income change from a UK policy reform.

    Uses pre-computed weight matrices from GCS to reweight households
    for each of 360 local authorities, then computes weighted average and
    relative household income changes.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    baseline_simulation: "Simulation"
    reform_simulation: "Simulation"
    weight_matrix_path: str
    local_authority_csv_path: str
    year: str = "2025"

    # Results populated by run()
    local_authority_results: Optional[list[dict]] = None

    def run(self) -> None:
        """Load weight matrix and compute per-local-authority metrics."""
        # Load local authority metadata (code, x, y, name)
        la_df = pd.read_csv(self.local_authority_csv_path)

        # Load weight matrix: shape (N_local_authorities, N_households)
        with h5py.File(self.weight_matrix_path, "r") as f:
            weight_matrix = f[self.year][...]

        # Get household income arrays from output datasets
        baseline_hh = self.baseline_simulation.output_dataset.data.household
        reform_hh = self.reform_simulation.output_dataset.data.household

        baseline_income = baseline_hh["household_net_income"].values
        reform_income = reform_hh["household_net_income"].values

        results: list[dict] = []
        for i in range(len(la_df)):
            row = la_df.iloc[i]
            code = str(row["code"])
            name = str(row["name"])
            x = int(row["x"])
            y = int(row["y"])
            w = weight_matrix[i]

            # Skip authorities with no representation in the weight matrix.
            total_weight = float(np.sum(w))
            if total_weight == 0:
                continue

            weighted_baseline = float(np.sum(baseline_income * w))
            weighted_reform = float(np.sum(reform_income * w))

            # NOTE(review): given total_weight != 0 above, this guard can only
            # fire if the matrix contains negative weights — confirm whether
            # that can happen; otherwise it is dead code.
            count = float(np.sum(w > 0))
            if count == 0:
                continue

            avg_change = (weighted_reform - weighted_baseline) / total_weight
            rel_change = (
                (weighted_reform / weighted_baseline - 1.0)
                if weighted_baseline != 0
                else 0.0
            )

            results.append(
                {
                    "local_authority_code": code,
                    "local_authority_name": name,
                    "x": x,
                    "y": y,
                    "average_household_income_change": float(avg_change),
                    "relative_household_income_change": float(rel_change),
                    "population": total_weight,
                }
            )

        self.local_authority_results = results


def compute_uk_local_authority_impacts(
    baseline_simulation: "Simulation",
    reform_simulation: "Simulation",
    weight_matrix_path: str,
    local_authority_csv_path: str,
    year: str = "2025",
) -> LocalAuthorityImpact:
    """Compute per-local-authority income changes for UK.

    Args:
        baseline_simulation: Completed baseline simulation.
        reform_simulation: Completed reform simulation.
        weight_matrix_path: Path to local_authority_weights.h5.
        local_authority_csv_path: Path to local_authorities_2021.csv.
        year: Year key in the H5 file (default "2025").

    Returns:
        LocalAuthorityImpact with local_authority_results populated.
    """
    # model_construct bypasses pydantic validation for the project objects.
    impact = LocalAuthorityImpact.model_construct(
        baseline_simulation=baseline_simulation,
        reform_simulation=reform_simulation,
        weight_matrix_path=weight_matrix_path,
        local_authority_csv_path=local_authority_csv_path,
        year=year,
    )
    impact.run()
    return impact
diff --git a/build/lib/policyengine/outputs/poverty.py b/build/lib/policyengine/outputs/poverty.py
new file mode 100644
index 00000000..6fc59705
--- /dev/null
+++ b/build/lib/policyengine/outputs/poverty.py
@@ -0,0 +1,462 @@
"""Poverty analysis output types."""

from enum import Enum
from typing import Any, Optional

import pandas as pd
from pydantic import ConfigDict

from policyengine.core import Output, OutputCollection, Simulation


class UKPovertyType(str, Enum):
    """UK poverty measure types.

    NOTE(review): unlike 3.11's StrEnum, str(member) of a (str, Enum) mixin
    is "UKPovertyType.ABSOLUTE_BHC"; use member.value for the plain string.
    """

    ABSOLUTE_BHC = "absolute_bhc"
    ABSOLUTE_AHC = "absolute_ahc"
    RELATIVE_BHC = "relative_bhc"
    RELATIVE_AHC = "relative_ahc"


class USPovertyType(str, Enum):
    """US poverty measure types."""

    SPM = "spm"
    SPM_DEEP = "spm_deep"


# Mapping from poverty type to variable name
UK_POVERTY_VARIABLES = {
    UKPovertyType.ABSOLUTE_BHC: "in_poverty_bhc",
    UKPovertyType.ABSOLUTE_AHC: "in_poverty_ahc",
    UKPovertyType.RELATIVE_BHC: "in_relative_poverty_bhc",
    UKPovertyType.RELATIVE_AHC: "in_relative_poverty_ahc",
}

US_POVERTY_VARIABLES = {
    USPovertyType.SPM: "spm_unit_is_in_spm_poverty",
    USPovertyType.SPM_DEEP: "spm_unit_is_in_deep_spm_poverty",
}


class Poverty(Output):
    """Single poverty measure result - represents one database row.

    This is a single-simulation output type that calculates poverty
    headcount and rate for a given poverty measure, optionally filtered
    by demographic variables.
+ """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + simulation: Simulation + poverty_variable: str + poverty_type: Optional[str] = None + entity: str = "person" + + # Optional demographic filters + filter_variable: Optional[str] = None + filter_variable_eq: Optional[Any] = None + filter_variable_leq: Optional[Any] = None + filter_variable_geq: Optional[Any] = None + + # Convenience group label (set by by_age/by_gender/by_race wrappers) + filter_group: Optional[str] = None + + # Results populated by run() + headcount: Optional[float] = None + total_population: Optional[float] = None + rate: Optional[float] = None + + def run(self): + """Calculate poverty headcount and rate.""" + # Get poverty variable info + poverty_var_obj = self.simulation.tax_benefit_model_version.get_variable( + self.poverty_variable + ) + + # Get target entity data + target_entity = self.entity + data = getattr(self.simulation.output_dataset.data, target_entity) + + # Map poverty variable to target entity if needed + if poverty_var_obj.entity != target_entity: + mapped = self.simulation.output_dataset.data.map_to_entity( + poverty_var_obj.entity, + target_entity, + columns=[self.poverty_variable], + ) + poverty_series = mapped[self.poverty_variable] + else: + poverty_series = data[self.poverty_variable] + + # Apply demographic filter if specified + if self.filter_variable is not None: + filter_var_obj = self.simulation.tax_benefit_model_version.get_variable( + self.filter_variable + ) + + if filter_var_obj.entity != target_entity: + filter_mapped = self.simulation.output_dataset.data.map_to_entity( + filter_var_obj.entity, + target_entity, + columns=[self.filter_variable], + ) + filter_series = filter_mapped[self.filter_variable] + else: + filter_series = data[self.filter_variable] + + # Build filter mask + mask = filter_series.notna() + if self.filter_variable_eq is not None: + mask &= filter_series == self.filter_variable_eq + if self.filter_variable_leq is not None: + mask 
&= filter_series <= self.filter_variable_leq + if self.filter_variable_geq is not None: + mask &= filter_series >= self.filter_variable_geq + + # Apply mask + poverty_series = poverty_series[mask] + + # Calculate results using weighted counts + self.headcount = float((poverty_series == True).sum()) # noqa: E712 + self.total_population = float(poverty_series.count()) + self.rate = ( + self.headcount / self.total_population if self.total_population > 0 else 0.0 + ) + + +def calculate_uk_poverty_rates( + simulation: Simulation, + filter_variable: Optional[str] = None, + filter_variable_eq: Optional[Any] = None, + filter_variable_leq: Optional[Any] = None, + filter_variable_geq: Optional[Any] = None, +) -> OutputCollection[Poverty]: + """Calculate all UK poverty rates for a simulation. + + Args: + simulation: The simulation to analyse + filter_variable: Optional variable to filter by (e.g., "is_child") + filter_variable_eq: Filter for exact match + filter_variable_leq: Filter for less than or equal + filter_variable_geq: Filter for greater than or equal + + Returns: + OutputCollection containing Poverty objects for each UK poverty type + """ + results = [] + + for poverty_type, poverty_variable in UK_POVERTY_VARIABLES.items(): + poverty = Poverty( + simulation=simulation, + poverty_variable=poverty_variable, + poverty_type=str(poverty_type), + entity="person", + filter_variable=filter_variable, + filter_variable_eq=filter_variable_eq, + filter_variable_leq=filter_variable_leq, + filter_variable_geq=filter_variable_geq, + ) + poverty.run() + results.append(poverty) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_variable_eq": r.filter_variable_eq, + "filter_variable_leq": r.filter_variable_leq, + "filter_variable_geq": r.filter_variable_geq, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, 
+ } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) + + +def calculate_us_poverty_rates( + simulation: Simulation, + filter_variable: Optional[str] = None, + filter_variable_eq: Optional[Any] = None, + filter_variable_leq: Optional[Any] = None, + filter_variable_geq: Optional[Any] = None, +) -> OutputCollection[Poverty]: + """Calculate all US poverty rates for a simulation. + + Args: + simulation: The simulation to analyse + filter_variable: Optional variable to filter by (e.g., "is_child") + filter_variable_eq: Filter for exact match + filter_variable_leq: Filter for less than or equal + filter_variable_geq: Filter for greater than or equal + + Returns: + OutputCollection containing Poverty objects for each US poverty type + """ + results = [] + + for poverty_type, poverty_variable in US_POVERTY_VARIABLES.items(): + poverty = Poverty( + simulation=simulation, + poverty_variable=poverty_variable, + poverty_type=str(poverty_type), + entity="person", + filter_variable=filter_variable, + filter_variable_eq=filter_variable_eq, + filter_variable_leq=filter_variable_leq, + filter_variable_geq=filter_variable_geq, + ) + poverty.run() + results.append(poverty) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_variable_eq": r.filter_variable_eq, + "filter_variable_leq": r.filter_variable_leq, + "filter_variable_geq": r.filter_variable_geq, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) + + +# Race group definitions (US only — race Enum stored as string names) +RACE_GROUPS = { + "white": {"filter_variable": "race", "filter_variable_eq": "WHITE"}, + "black": {"filter_variable": "race", "filter_variable_eq": "BLACK"}, + "hispanic": {"filter_variable": "race", 
"filter_variable_eq": "HISPANIC"}, + "other": {"filter_variable": "race", "filter_variable_eq": "OTHER"}, +} + +# Gender group definitions (same for UK and US — both use is_male boolean) +GENDER_GROUPS = { + "male": {"filter_variable": "is_male", "filter_variable_eq": True}, + "female": {"filter_variable": "is_male", "filter_variable_eq": False}, +} + +# Age group definitions (same for UK and US) +AGE_GROUPS = { + "child": {"filter_variable": "age", "filter_variable_leq": 17}, + "adult": { + "filter_variable": "age", + "filter_variable_geq": 18, + "filter_variable_leq": 64, + }, + "senior": {"filter_variable": "age", "filter_variable_geq": 65}, +} + + +def calculate_uk_poverty_by_age( + simulation: Simulation, +) -> OutputCollection[Poverty]: + """Calculate UK poverty rates broken down by age group. + + Computes poverty rates for child (< 18), adult (18-64), and + senior (65+) groups across all UK poverty types. + + Returns: + OutputCollection containing Poverty objects for each + age group x poverty type combination (3 x 4 = 12 records). + """ + results = [] + + for group_name, filters in AGE_GROUPS.items(): + group_results = calculate_uk_poverty_rates(simulation, **filters) + for pov in group_results.outputs: + pov.filter_group = group_name + results.append(pov) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_group": r.filter_group, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) + + +def calculate_us_poverty_by_age( + simulation: Simulation, +) -> OutputCollection[Poverty]: + """Calculate US poverty rates broken down by age group. + + Computes poverty rates for child (< 18), adult (18-64), and + senior (65+) groups across all US poverty types. 
+ + Returns: + OutputCollection containing Poverty objects for each + age group x poverty type combination (3 x 2 = 6 records). + """ + results = [] + + for group_name, filters in AGE_GROUPS.items(): + group_results = calculate_us_poverty_rates(simulation, **filters) + for pov in group_results.outputs: + pov.filter_group = group_name + results.append(pov) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_group": r.filter_group, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) + + +def calculate_uk_poverty_by_gender( + simulation: Simulation, +) -> OutputCollection[Poverty]: + """Calculate UK poverty rates broken down by gender. + + Computes poverty rates for male and female groups across + all UK poverty types using the is_male boolean variable. + + Returns: + OutputCollection containing Poverty objects for each + gender x poverty type combination (2 x 4 = 8 records). + """ + results = [] + + for group_name, filters in GENDER_GROUPS.items(): + group_results = calculate_uk_poverty_rates(simulation, **filters) + for pov in group_results.outputs: + pov.filter_group = group_name + results.append(pov) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_group": r.filter_group, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) + + +def calculate_us_poverty_by_gender( + simulation: Simulation, +) -> OutputCollection[Poverty]: + """Calculate US poverty rates broken down by gender. 
+ + Computes poverty rates for male and female groups across + all US poverty types using the is_male boolean variable. + + Returns: + OutputCollection containing Poverty objects for each + gender x poverty type combination (2 x 2 = 4 records). + """ + results = [] + + for group_name, filters in GENDER_GROUPS.items(): + group_results = calculate_us_poverty_rates(simulation, **filters) + for pov in group_results.outputs: + pov.filter_group = group_name + results.append(pov) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_group": r.filter_group, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) + + +def calculate_us_poverty_by_race( + simulation: Simulation, +) -> OutputCollection[Poverty]: + """Calculate US poverty rates broken down by race. + + Computes poverty rates for white, black, hispanic, and other + racial groups across all US poverty types using the race Enum + variable (stored as string names in the output dataset). + + US-only — the UK does not have a race variable. + + Returns: + OutputCollection containing Poverty objects for each + race x poverty type combination (4 x 2 = 8 records). 
+ """ + results = [] + + for group_name, filters in RACE_GROUPS.items(): + group_results = calculate_us_poverty_rates(simulation, **filters) + for pov in group_results.outputs: + pov.filter_group = group_name + results.append(pov) + + df = pd.DataFrame( + [ + { + "simulation_id": r.simulation.id, + "poverty_type": r.poverty_type, + "poverty_variable": r.poverty_variable, + "filter_variable": r.filter_variable, + "filter_group": r.filter_group, + "headcount": r.headcount, + "total_population": r.total_population, + "rate": r.rate, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) diff --git a/build/lib/policyengine/tax_benefit_models/uk.py b/build/lib/policyengine/tax_benefit_models/uk.py new file mode 100644 index 00000000..52abcb18 --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/uk.py @@ -0,0 +1,40 @@ +"""PolicyEngine UK tax-benefit model - imports from uk/ module.""" + +from importlib.util import find_spec + +if find_spec("policyengine_uk") is not None: + from .uk import ( + PolicyEngineUK, + PolicyEngineUKDataset, + PolicyEngineUKLatest, + ProgrammeStatistics, + UKYearData, + create_datasets, + ensure_datasets, + general_policy_reform_analysis, + load_datasets, + managed_microsimulation, + uk_latest, + uk_model, + ) + + __all__ = [ + "UKYearData", + "PolicyEngineUKDataset", + "create_datasets", + "load_datasets", + "ensure_datasets", + "PolicyEngineUK", + "PolicyEngineUKLatest", + "managed_microsimulation", + "uk_model", + "uk_latest", + "general_policy_reform_analysis", + "ProgrammeStatistics", + ] + + # Rebuild models to resolve forward references + PolicyEngineUKDataset.model_rebuild() + PolicyEngineUKLatest.model_rebuild() +else: + __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/uk/__init__.py b/build/lib/policyengine/tax_benefit_models/uk/__init__.py new file mode 100644 index 00000000..93533245 --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/uk/__init__.py @@ -0,0 +1,55 
@@ +"""PolicyEngine UK tax-benefit model.""" + +from importlib.util import find_spec + +if find_spec("policyengine_uk") is not None: + from policyengine.core import Dataset + + from .analysis import ( + UKHouseholdInput, + UKHouseholdOutput, + calculate_household_impact, + economic_impact_analysis, + ) + from .datasets import ( + PolicyEngineUKDataset, + UKYearData, + create_datasets, + ensure_datasets, + load_datasets, + ) + from .model import ( + PolicyEngineUK, + PolicyEngineUKLatest, + managed_microsimulation, + uk_latest, + uk_model, + ) + from .outputs import ProgrammeStatistics + + # Rebuild Pydantic models to resolve forward references + Dataset.model_rebuild() + UKYearData.model_rebuild() + PolicyEngineUKDataset.model_rebuild() + PolicyEngineUKLatest.model_rebuild() + ProgrammeStatistics.model_rebuild() + + __all__ = [ + "UKYearData", + "PolicyEngineUKDataset", + "create_datasets", + "load_datasets", + "ensure_datasets", + "PolicyEngineUK", + "PolicyEngineUKLatest", + "managed_microsimulation", + "uk_model", + "uk_latest", + "economic_impact_analysis", + "calculate_household_impact", + "UKHouseholdInput", + "UKHouseholdOutput", + "ProgrammeStatistics", + ] +else: + __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/uk/analysis.py b/build/lib/policyengine/tax_benefit_models/uk/analysis.py new file mode 100644 index 00000000..0a545b52 --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/uk/analysis.py @@ -0,0 +1,283 @@ +"""General utility functions for UK policy reform analysis.""" + +import tempfile +from pathlib import Path +from typing import Any, Optional + +import pandas as pd +from microdf import MicroDataFrame +from pydantic import BaseModel, Field, create_model + +from policyengine.core import OutputCollection, Simulation +from policyengine.core.policy import Policy +from policyengine.outputs.decile_impact import ( + DecileImpact, + calculate_decile_impacts, +) +from policyengine.outputs.inequality import ( + Inequality, + 
calculate_uk_inequality, +) +from policyengine.outputs.poverty import ( + Poverty, + calculate_uk_poverty_rates, +) + +from .datasets import PolicyEngineUKDataset, UKYearData +from .model import uk_latest +from .outputs import ProgrammeStatistics + + +def _create_entity_output_model(entity: str, variables: list[str]) -> type[BaseModel]: + """Create a dynamic Pydantic model for entity output variables.""" + fields = {var: (float, ...) for var in variables} + return create_model(f"{entity.title()}Output", **fields) + + +# Create output models dynamically from uk_latest.entity_variables +PersonOutput = _create_entity_output_model( + "person", uk_latest.entity_variables["person"] +) +BenunitOutput = _create_entity_output_model( + "benunit", uk_latest.entity_variables["benunit"] +) +HouseholdEntityOutput = _create_entity_output_model( + "household", uk_latest.entity_variables["household"] +) + + +class UKHouseholdOutput(BaseModel): + """Output from a UK household calculation with all entity data.""" + + person: list[dict[str, Any]] + benunit: list[dict[str, Any]] + household: dict[str, Any] + + +class UKHouseholdInput(BaseModel): + """Input for a UK household calculation.""" + + people: list[dict[str, Any]] + benunit: dict[str, Any] = Field(default_factory=dict) + household: dict[str, Any] = Field(default_factory=dict) + year: int = 2026 + + +def calculate_household_impact( + household_input: UKHouseholdInput, + policy: Optional[Policy] = None, +) -> UKHouseholdOutput: + """Calculate tax and benefit impacts for a single UK household.""" + n_people = len(household_input.people) + + # Build person data with defaults + person_data = { + "person_id": list(range(n_people)), + "person_benunit_id": [0] * n_people, + "person_household_id": [0] * n_people, + "person_weight": [1.0] * n_people, + } + # Add user-provided person fields + for i, person in enumerate(household_input.people): + for key, value in person.items(): + if key not in person_data: + person_data[key] = [0.0] * 
n_people # Default to 0 for numeric fields + person_data[key][i] = value + + # Build benunit data with defaults + benunit_data = { + "benunit_id": [0], + "benunit_weight": [1.0], + } + for key, value in household_input.benunit.items(): + benunit_data[key] = [value] + + # Build household data with defaults (required for uprating) + household_data = { + "household_id": [0], + "household_weight": [1.0], + "region": ["LONDON"], + "tenure_type": ["RENT_PRIVATELY"], + "council_tax": [0.0], + "rent": [0.0], + } + for key, value in household_input.household.items(): + household_data[key] = [value] + + # Create MicroDataFrames + person_df = MicroDataFrame(pd.DataFrame(person_data), weights="person_weight") + benunit_df = MicroDataFrame(pd.DataFrame(benunit_data), weights="benunit_weight") + household_df = MicroDataFrame( + pd.DataFrame(household_data), weights="household_weight" + ) + + # Create temporary dataset + tmpdir = tempfile.mkdtemp() + filepath = str(Path(tmpdir) / "household_impact.h5") + + dataset = PolicyEngineUKDataset( + name="Household impact calculation", + description="Single household for impact calculation", + filepath=filepath, + year=household_input.year, + data=UKYearData( + person=person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + # Run simulation + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + policy=policy, + ) + simulation.run() + + # Extract all output variables defined in entity_variables + output_data = simulation.output_dataset.data + + def safe_convert(value): + """Convert value to float if numeric, otherwise return as string.""" + try: + return float(value) + except (ValueError, TypeError): + return str(value) + + person_outputs = [] + for i in range(n_people): + person_dict = {} + for var in uk_latest.entity_variables["person"]: + person_dict[var] = safe_convert(output_data.person[var].iloc[i]) + person_outputs.append(person_dict) + + benunit_outputs = [] + for i in 
range(len(output_data.benunit)): + benunit_dict = {} + for var in uk_latest.entity_variables["benunit"]: + benunit_dict[var] = safe_convert(output_data.benunit[var].iloc[i]) + benunit_outputs.append(benunit_dict) + + household_dict = {} + for var in uk_latest.entity_variables["household"]: + household_dict[var] = safe_convert(output_data.household[var].iloc[0]) + + return UKHouseholdOutput( + person=person_outputs, + benunit=benunit_outputs, + household=household_dict, + ) + + +class PolicyReformAnalysis(BaseModel): + """Complete policy reform analysis result.""" + + decile_impacts: OutputCollection[DecileImpact] + programme_statistics: OutputCollection[ProgrammeStatistics] + baseline_poverty: OutputCollection[Poverty] + reform_poverty: OutputCollection[Poverty] + baseline_inequality: Inequality + reform_inequality: Inequality + + +def economic_impact_analysis( + baseline_simulation: Simulation, + reform_simulation: Simulation, +) -> PolicyReformAnalysis: + """Perform comprehensive analysis of a policy reform. 

    Returns:
        PolicyReformAnalysis containing decile impacts and programme statistics
    """
    baseline_simulation.ensure()
    reform_simulation.ensure()

    # NOTE(review): assert is stripped under `python -O`; consider raising
    # ValueError for these input-size validations.
    assert len(baseline_simulation.dataset.data.household) > 100, (
        "Baseline simulation must have more than 100 households"
    )
    assert len(reform_simulation.dataset.data.household) > 100, (
        "Reform simulation must have more than 100 households"
    )

    # Decile impact
    decile_impacts = calculate_decile_impacts(
        baseline_simulation=baseline_simulation,
        reform_simulation=reform_simulation,
    )

    # Major programmes to analyse
    programmes = {
        # Tax
        "income_tax": {"is_tax": True},
        "national_insurance": {"is_tax": True},
        "vat": {"is_tax": True},
        "council_tax": {"is_tax": True},
        # Benefits
        "universal_credit": {"is_tax": False},
        "child_benefit": {"is_tax": False},
        "pension_credit": {"is_tax": False},
        "income_support": {"is_tax": False},
        "working_tax_credit": {"is_tax": False},
        "child_tax_credit": {"is_tax": False},
    }

    programme_statistics = []

    for programme_name, programme_info in programmes.items():
        # Each programme variable knows which entity it belongs to.
        entity = baseline_simulation.tax_benefit_model_version.get_variable(
            programme_name
        ).entity
        is_tax = programme_info["is_tax"]

        stats = ProgrammeStatistics(
            baseline_simulation=baseline_simulation,
            reform_simulation=reform_simulation,
            programme_name=programme_name,
            entity=entity,
            is_tax=is_tax,
        )
        stats.run()
        programme_statistics.append(stats)

    # Create DataFrame
    programme_df = pd.DataFrame(
        [
            {
                "baseline_simulation_id": p.baseline_simulation.id,
                "reform_simulation_id": p.reform_simulation.id,
                "programme_name": p.programme_name,
                "entity": p.entity,
                "is_tax": p.is_tax,
                "baseline_total": p.baseline_total,
                "reform_total": p.reform_total,
                "change": p.change,
                "baseline_count": p.baseline_count,
                "reform_count": p.reform_count,
                "winners": p.winners,
                "losers": p.losers,
            }
            for p in programme_statistics
        ]
    )

    programme_collection = OutputCollection(
        outputs=programme_statistics, dataframe=programme_df
    )

    # Calculate poverty rates for both simulations
    baseline_poverty = calculate_uk_poverty_rates(baseline_simulation)
    reform_poverty = calculate_uk_poverty_rates(reform_simulation)

    # Calculate inequality for both simulations
    baseline_inequality = calculate_uk_inequality(baseline_simulation)
    reform_inequality = calculate_uk_inequality(reform_simulation)

    return PolicyReformAnalysis(
        decile_impacts=decile_impacts,
        programme_statistics=programme_collection,
        baseline_poverty=baseline_poverty,
        reform_poverty=reform_poverty,
        baseline_inequality=baseline_inequality,
        reform_inequality=reform_inequality,
    )
diff --git a/build/lib/policyengine/tax_benefit_models/uk/datasets.py b/build/lib/policyengine/tax_benefit_models/uk/datasets.py
new file mode 100644
index 00000000..47f78403
--- /dev/null
+++ b/build/lib/policyengine/tax_benefit_models/uk/datasets.py
@@ -0,0 +1,245 @@
from pathlib import Path
from typing import Optional

import pandas as pd
from microdf import MicroDataFrame
from pydantic import ConfigDict

from policyengine.core import Dataset, YearData
from policyengine.core.release_manifest import (
    dataset_logical_name,
    resolve_dataset_reference,
)


class UKYearData(YearData):
    """Entity-level data for a single year."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    person: MicroDataFrame
    benunit: MicroDataFrame
    household: MicroDataFrame

    @property
    def entity_data(self) -> dict[str, MicroDataFrame]:
        """Return a dictionary of entity names to their data."""
        return {
            "person": self.person,
            "benunit": self.benunit,
            "household": self.household,
        }


class PolicyEngineUKDataset(Dataset):
    """UK dataset with multi-year entity-level data."""

    data: Optional[UKYearData] = None

    def model_post_init(self, __context):
        """Called after Pydantic initialization."""
        # Make sure we are
class PolicyEngineUKDataset(Dataset):
    """UK dataset with multi-year entity-level data."""

    data: Optional[UKYearData] = None

    def model_post_init(self, __context):
        """Called after Pydantic initialization.

        Make sure we are synchronised between in-memory and storage, at
        least on initialisation: in-memory data is flushed to disk, or a
        filepath-only instance is hydrated from disk.
        """
        if self.data is not None:
            self.save()
        elif self.filepath and not self.data:
            self.load()

    @staticmethod
    def _object_to_categorical(df: pd.DataFrame) -> pd.DataFrame:
        """Convert object columns to categorical dtype, in place.

        HDF5 'table' format stores categoricals natively; object columns
        would fall back to slow pickle serialization.
        """
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].astype("category")
        return df

    def save(self) -> None:
        """Save dataset to HDF5 file.

        Converts object columns to categorical dtype to avoid slow pickle
        serialization.
        """
        filepath = Path(self.filepath)
        if not filepath.parent.exists():
            filepath.parent.mkdir(parents=True, exist_ok=True)

        # Convert entity frames and optimize object columns to categorical.
        person_df = self._object_to_categorical(pd.DataFrame(self.data.person))
        benunit_df = self._object_to_categorical(pd.DataFrame(self.data.benunit))
        household_df = self._object_to_categorical(
            pd.DataFrame(self.data.household)
        )

        with pd.HDFStore(filepath, mode="w") as store:
            # Use format='table' to support categorical dtypes
            store.put("person", person_df, format="table")
            store.put("benunit", benunit_df, format="table")
            store.put("household", household_df, format="table")

    def load(self) -> None:
        """Load dataset from HDF5 file into this instance."""
        filepath = self.filepath
        with pd.HDFStore(filepath, mode="r") as store:
            self.data = UKYearData(
                person=MicroDataFrame(store["person"], weights="person_weight"),
                benunit=MicroDataFrame(store["benunit"], weights="benunit_weight"),
                household=MicroDataFrame(
                    store["household"], weights="household_weight"
                ),
            )

    def __repr__(self) -> str:
        # NOTE(review): the original repr literals were lost to angle-bracket
        # stripping during extraction (both branches read `return f""`).
        # Reconstructed below — confirm exact wording against upstream.
        if self.data is None:
            return f"<PolicyEngineUKDataset {self.name} (not loaded)>"
        else:
            n_people = len(self.data.person)
            n_benunits = len(self.data.benunit)
            n_households = len(self.data.household)
            return (
                f"<PolicyEngineUKDataset {self.name}: {n_people} people, "
                f"{n_benunits} benunits, {n_households} households>"
            )
def create_datasets(
    datasets: Optional[list[str]] = None,
    years: Optional[list[int]] = None,
    data_folder: str = "./data",
) -> dict[str, PolicyEngineUKDataset]:
    """Build and save per-year UK datasets from policyengine-uk source data.

    Args:
        datasets: Dataset references to resolve; defaults to
            ["frs_2023_24", "enhanced_frs_2023_24"].
        years: Years to extract; defaults to [2026, 2027, 2028, 2029, 2030].
        data_folder: Directory to write the generated ``.h5`` files into.

    Returns:
        Dictionary mapping "<dataset_stem>_<year>" keys to saved datasets.
    """
    # None sentinels instead of mutable list defaults (shared across calls).
    if datasets is None:
        datasets = ["frs_2023_24", "enhanced_frs_2023_24"]
    if years is None:
        years = [2026, 2027, 2028, 2029, 2030]

    # Imported lazily (and hoisted out of the loop) so that importing this
    # module does not require policyengine-uk.
    from policyengine_uk import Microsimulation

    result = {}
    for dataset in datasets:
        resolved_dataset = resolve_dataset_reference("uk", dataset)
        dataset_stem = dataset_logical_name(resolved_dataset)
        sim = Microsimulation(dataset=resolved_dataset)
        for year in years:
            year_dataset = sim.dataset[year]

            # Convert to pandas DataFrames and add weight columns
            person_df = pd.DataFrame(year_dataset.person)
            benunit_df = pd.DataFrame(year_dataset.benunit)
            household_df = pd.DataFrame(year_dataset.household)

            # Map household weights to person and benunit levels
            person_df = person_df.merge(
                household_df[["household_id", "household_weight"]],
                left_on="person_household_id",
                right_on="household_id",
                how="left",
            )
            person_df = person_df.rename(
                columns={"household_weight": "person_weight"}
            )
            person_df = person_df.drop(columns=["household_id"])

            # Get household_id for each benunit from person table
            benunit_household_map = person_df[
                ["person_benunit_id", "person_household_id"]
            ].drop_duplicates()
            benunit_df = benunit_df.merge(
                benunit_household_map,
                left_on="benunit_id",
                right_on="person_benunit_id",
                how="left",
            )
            benunit_df = benunit_df.merge(
                household_df[["household_id", "household_weight"]],
                left_on="person_household_id",
                right_on="household_id",
                how="left",
            )
            benunit_df = benunit_df.rename(
                columns={"household_weight": "benunit_weight"}
            )
            benunit_df = benunit_df.drop(
                columns=[
                    "person_benunit_id",
                    "person_household_id",
                    "household_id",
                ],
                errors="ignore",
            )

            uk_dataset = PolicyEngineUKDataset(
                id=f"{dataset_stem}_year_{year}",
                name=f"{dataset_stem}-year-{year}",
                description=f"UK Dataset for year {year} based on {dataset_stem}",
                filepath=f"{data_folder}/{dataset_stem}_year_{year}.h5",
                year=int(year),
                data=UKYearData(
                    person=MicroDataFrame(person_df, weights="person_weight"),
                    benunit=MicroDataFrame(benunit_df, weights="benunit_weight"),
                    household=MicroDataFrame(
                        household_df, weights="household_weight"
                    ),
                ),
            )
            uk_dataset.save()

            dataset_key = f"{dataset_stem}_{year}"
            result[dataset_key] = uk_dataset

    return result


def load_datasets(
    datasets: Optional[list[str]] = None,
    years: Optional[list[int]] = None,
    data_folder: str = "./data",
) -> dict[str, PolicyEngineUKDataset]:
    """Load previously saved per-year UK datasets from ``data_folder``.

    Args/returns mirror :func:`create_datasets`; raises if a file is missing.
    """
    if datasets is None:
        datasets = ["frs_2023_24", "enhanced_frs_2023_24"]
    if years is None:
        years = [2026, 2027, 2028, 2029, 2030]

    result = {}
    for dataset in datasets:
        resolved_dataset = resolve_dataset_reference("uk", dataset)
        dataset_stem = dataset_logical_name(resolved_dataset)
        for year in years:
            filepath = f"{data_folder}/{dataset_stem}_year_{year}.h5"
            uk_dataset = PolicyEngineUKDataset(
                name=f"{dataset_stem}-year-{year}",
                description=f"UK Dataset for year {year} based on {dataset_stem}",
                filepath=filepath,
                year=int(year),
            )
            uk_dataset.load()

            dataset_key = f"{dataset_stem}_{year}"
            result[dataset_key] = uk_dataset

    return result


def ensure_datasets(
    datasets: Optional[list[str]] = None,
    years: Optional[list[int]] = None,
    data_folder: str = "./data",
) -> dict[str, PolicyEngineUKDataset]:
    """Ensure datasets exist, loading if available or creating if not.

    Args:
        datasets: List of HuggingFace dataset paths
        years: List of years to load/create data for
        data_folder: Directory containing or to save the dataset files

    Returns:
        Dictionary mapping dataset keys to PolicyEngineUKDataset objects
    """
    if datasets is None:
        datasets = ["frs_2023_24", "enhanced_frs_2023_24"]
    if years is None:
        years = [2026, 2027, 2028, 2029, 2030]

    def _expected_files():
        # Yields every file the (datasets x years) grid requires.
        for dataset in datasets:
            resolved_dataset = resolve_dataset_reference("uk", dataset)
            dataset_stem = dataset_logical_name(resolved_dataset)
            for year in years:
                yield Path(f"{data_folder}/{dataset_stem}_year_{year}.h5")

    # all() short-circuits on the first missing file, like the original
    # nested-break scan.
    if all(path.exists() for path in _expected_files()):
        return load_datasets(datasets=datasets, years=years, data_folder=data_folder)
    return create_datasets(datasets=datasets, years=years, data_folder=data_folder)
# Non-person (group) entities in the UK model; used when mapping person rows
# to their containing units.
UK_GROUP_ENTITIES = ["benunit", "household"]


class PolicyEngineUK(TaxBenefitModel):
    """Identity record for the UK tax-benefit model."""

    id: str = "policyengine-uk"
    description: str = "The UK's open-source dynamic tax and benefit microsimulation model maintained by PolicyEngine."


# Module-level singleton shared by every UK model version.
uk_model = PolicyEngineUK()


def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]:
    """Return policyengine-uk's data build metadata, or {} when unavailable.

    Only the absence of the ``build_metadata`` submodule (older
    policyengine-uk releases) is tolerated; a missing ``policyengine_uk``
    package entirely re-raises, because ``exc.name`` is then the package,
    not the submodule.
    """
    try:
        from policyengine_uk.build_metadata import get_data_build_metadata
    except ModuleNotFoundError as exc:
        if exc.name != "policyengine_uk.build_metadata":
            raise
        return {}

    # The upstream hook may return None; normalise to an empty dict.
    return get_data_build_metadata() or {}
"household_tax", + "vat", + # Housing + "rent", + "council_tax", + "tenure_type", + # Poverty measures + "in_poverty_bhc", + "in_poverty_ahc", + "in_relative_poverty_bhc", + "in_relative_poverty_ahc", + ], + } + + def __init__(self, **kwargs: dict): + manifest = get_release_manifest("uk") + if "version" not in kwargs or kwargs.get("version") is None: + kwargs["version"] = manifest.model_package.version + + installed_model_version = metadata.version("policyengine-uk") + if installed_model_version != manifest.model_package.version: + raise ValueError( + "Installed policyengine-uk version does not match the " + f"bundled policyengine.py manifest. Expected " + f"{manifest.model_package.version}, got {installed_model_version}." + ) + + model_build_metadata = _get_runtime_data_build_metadata() + data_certification = certify_data_release_compatibility( + "uk", + runtime_model_version=installed_model_version, + runtime_data_build_fingerprint=model_build_metadata.get( + "data_build_fingerprint" + ), + ) + + super().__init__(**kwargs) + self.release_manifest = manifest + self.model_package = manifest.model_package + self.data_package = manifest.data_package + self.default_dataset_uri = manifest.default_dataset_uri + self.data_certification = data_certification + from policyengine_core.enums import Enum + from policyengine_uk.system import system + + # Attach region registry + from policyengine.countries.uk.regions import uk_region_registry + + self.region_registry = uk_region_registry + + self.id = f"{self.model.id}@{self.version}" + + for var_obj in system.variables.values(): + # Serialize default_value for JSON compatibility + default_val = var_obj.default_value + if var_obj.value_type is Enum: + default_val = default_val.name + elif var_obj.value_type is datetime.date: + default_val = default_val.isoformat() + + variable = Variable( + id=self.id + "-" + var_obj.name, + name=var_obj.name, + label=getattr(var_obj, "label", None), + tax_benefit_model_version=self, + 
entity=var_obj.entity.key, + description=var_obj.documentation, + data_type=var_obj.value_type if var_obj.value_type is not Enum else str, + default_value=default_val, + value_type=var_obj.value_type, + ) + if ( + hasattr(var_obj, "possible_values") + and var_obj.possible_values is not None + ): + variable.possible_values = list( + map( + lambda x: x.name, + var_obj.possible_values._value2member_map_.values(), + ) + ) + # Extract and resolve adds/subtracts. + # Core stores these as either list[str] or a parameter path string. + # Resolve parameter paths to lists so consumers always get list[str]. + if hasattr(var_obj, "adds") and var_obj.adds is not None: + if isinstance(var_obj.adds, str): + try: + from policyengine_core.parameters.operations.get_parameter import ( + get_parameter, + ) + + param = get_parameter(system.parameters, var_obj.adds) + variable.adds = list(param("2025-01-01")) + except (ValueError, Exception): + variable.adds = None + else: + variable.adds = var_obj.adds + if hasattr(var_obj, "subtracts") and var_obj.subtracts is not None: + if isinstance(var_obj.subtracts, str): + try: + from policyengine_core.parameters.operations.get_parameter import ( + get_parameter, + ) + + param = get_parameter(system.parameters, var_obj.subtracts) + variable.subtracts = list(param("2025-01-01")) + except (ValueError, Exception): + variable.subtracts = None + else: + variable.subtracts = var_obj.subtracts + self.add_variable(variable) + + from policyengine_core.parameters import Parameter as CoreParameter + from policyengine_core.parameters import ParameterNode as CoreParameterNode + + scale_lookup = build_scale_lookup(system) + + for param_node in system.parameters.get_descendants(): + if isinstance(param_node, CoreParameter): + parameter = Parameter( + id=self.id + "-" + param_node.name, + name=param_node.name, + label=generate_label_for_parameter( + param_node, system, scale_lookup + ), + tax_benefit_model_version=self, + description=param_node.description, + 
data_type=type(param_node(2025)), + unit=param_node.metadata.get("unit"), + _core_param=param_node, + ) + self.add_parameter(parameter) + elif isinstance(param_node, CoreParameterNode): + node = ParameterNode( + id=self.id + "-" + param_node.name, + name=param_node.name, + label=param_node.metadata.get("label"), + description=param_node.description, + tax_benefit_model_version=self, + ) + self.add_parameter_node(node) + + def _build_entity_relationships( + self, dataset: PolicyEngineUKDataset + ) -> pd.DataFrame: + """Build a DataFrame mapping each person to their containing entities.""" + person_data = pd.DataFrame(dataset.data.person) + return build_entity_relationships(person_data, UK_GROUP_ENTITIES) + + def _filter_dataset_by_household_variable( + self, + dataset: PolicyEngineUKDataset, + variable_name: str, + variable_value: str, + ) -> PolicyEngineUKDataset: + """Filter a dataset to only include households where a variable matches.""" + filtered = filter_dataset_by_household_variable( + entity_data=dataset.data.entity_data, + group_entities=UK_GROUP_ENTITIES, + variable_name=variable_name, + variable_value=variable_value, + ) + return PolicyEngineUKDataset( + id=dataset.id + f"_filtered_{variable_name}_{variable_value}", + name=dataset.name, + description=f"{dataset.description} (filtered: {variable_name}={variable_value})", + filepath=dataset.filepath, + year=dataset.year, + is_output_dataset=dataset.is_output_dataset, + data=UKYearData( + person=filtered["person"], + benunit=filtered["benunit"], + household=filtered["household"], + ), + ) + + def run(self, simulation: "Simulation") -> "Simulation": + from policyengine_uk import Microsimulation + from policyengine_uk.data import UKSingleYearDataset + + from policyengine.utils.parametric_reforms import ( + simulation_modifier_from_parameter_values, + ) + + assert isinstance(simulation.dataset, PolicyEngineUKDataset) + + dataset = simulation.dataset + dataset.load() + + # Apply regional scoping if specified + 
if simulation.scoping_strategy: + scoped_data = simulation.scoping_strategy.apply( + entity_data=dataset.data.entity_data, + group_entities=UK_GROUP_ENTITIES, + year=dataset.year, + ) + dataset = PolicyEngineUKDataset( + id=dataset.id + "_scoped", + name=dataset.name, + description=dataset.description, + filepath=dataset.filepath, + year=dataset.year, + is_output_dataset=dataset.is_output_dataset, + data=UKYearData( + person=scoped_data["person"], + benunit=scoped_data["benunit"], + household=scoped_data["household"], + ), + ) + elif simulation.filter_field and simulation.filter_value: + dataset = self._filter_dataset_by_household_variable( + dataset, simulation.filter_field, simulation.filter_value + ) + + input_data = UKSingleYearDataset( + person=dataset.data.person, + benunit=dataset.data.benunit, + household=dataset.data.household, + fiscal_year=dataset.year, + ) + microsim = Microsimulation(dataset=input_data) + + if simulation.policy and simulation.policy.simulation_modifier is not None: + simulation.policy.simulation_modifier(microsim) + elif simulation.policy: + modifier = simulation_modifier_from_parameter_values( + simulation.policy.parameter_values + ) + modifier(microsim) + + if simulation.dynamic and simulation.dynamic.simulation_modifier is not None: + simulation.dynamic.simulation_modifier(microsim) + elif simulation.dynamic: + modifier = simulation_modifier_from_parameter_values( + simulation.dynamic.parameter_values + ) + modifier(microsim) + + data = { + "person": pd.DataFrame(), + "benunit": pd.DataFrame(), + "household": pd.DataFrame(), + } + + for entity, variables in self.entity_variables.items(): + for var in variables: + data[entity][var] = microsim.calculate( + var, period=simulation.dataset.year, map_to=entity + ).values + + data["person"] = MicroDataFrame(data["person"], weights="person_weight") + data["benunit"] = MicroDataFrame(data["benunit"], weights="benunit_weight") + data["household"] = MicroDataFrame( + data["household"], 
weights="household_weight" + ) + + simulation.output_dataset = PolicyEngineUKDataset( + id=simulation.id, + name=dataset.name, + description=dataset.description, + filepath=str( + Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") + ), + year=simulation.dataset.year, + is_output_dataset=True, + data=UKYearData( + person=data["person"], + benunit=data["benunit"], + household=data["household"], + ), + ) + + def save(self, simulation: "Simulation"): + """Save the simulation's output dataset.""" + simulation.output_dataset.save() + + def load(self, simulation: "Simulation"): + """Load the simulation's output dataset.""" + import os + + filepath = str( + Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") + ) + + simulation.output_dataset = PolicyEngineUKDataset( + id=simulation.id, + name=simulation.dataset.name, + description=simulation.dataset.description, + filepath=filepath, + year=simulation.dataset.year, + is_output_dataset=True, + ) + + # Load timestamps from file system metadata + if os.path.exists(filepath): + simulation.created_at = datetime.datetime.fromtimestamp( + os.path.getctime(filepath) + ) + simulation.updated_at = datetime.datetime.fromtimestamp( + os.path.getmtime(filepath) + ) + + +def _managed_release_bundle( + dataset_uri: str, + dataset_source: Optional[str] = None, +) -> dict[str, Optional[str]]: + bundle = dict(uk_latest.release_bundle) + bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) + bundle["runtime_dataset_uri"] = dataset_uri + if dataset_source: + bundle["runtime_dataset_source"] = dataset_source + bundle["managed_by"] = "policyengine.py" + return bundle + + +def managed_microsimulation( + *, + dataset: Optional[str] = None, + allow_unmanaged: bool = False, + **kwargs, +): + """Construct a country-package Microsimulation pinned to this bundle. + + By default this enforces the dataset selection from the bundled + `policyengine.py` release manifest. 
Arbitrary dataset URIs require + `allow_unmanaged=True`. + """ + + from policyengine_uk import Microsimulation + + if "dataset" in kwargs: + raise ValueError( + "Pass `dataset=` directly to managed_microsimulation, not through " + "**kwargs, so policyengine.py can enforce the release bundle." + ) + + dataset_uri = resolve_managed_dataset_reference( + "uk", + dataset, + allow_unmanaged=allow_unmanaged, + ) + dataset_source = resolve_local_managed_dataset_source( + "uk", + dataset_uri, + allow_local_mirror=not ( + allow_unmanaged and dataset is not None and "://" in dataset + ), + ) + runtime_dataset = dataset_source + if isinstance(dataset_source, str) and "hf://" not in dataset_source: + from policyengine_uk.data.dataset_schema import ( + UKMultiYearDataset, + UKSingleYearDataset, + ) + + if UKMultiYearDataset.validate_file_path(dataset_source, False): + runtime_dataset = UKMultiYearDataset(dataset_source) + elif UKSingleYearDataset.validate_file_path(dataset_source, False): + runtime_dataset = UKSingleYearDataset(dataset_source) + microsim = Microsimulation(dataset=runtime_dataset, **kwargs) + microsim.policyengine_bundle = _managed_release_bundle( + dataset_uri, + dataset_source, + ) + return microsim + + +uk_latest = PolicyEngineUKLatest() diff --git a/build/lib/policyengine/tax_benefit_models/uk/outputs.py b/build/lib/policyengine/tax_benefit_models/uk/outputs.py new file mode 100644 index 00000000..97032a9c --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/uk/outputs.py @@ -0,0 +1,105 @@ +"""UK-specific output templates.""" + +from typing import Optional + +from pydantic import ConfigDict + +from policyengine.core import Output, Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) + + +class ProgrammeStatistics(Output): + """Single programme's statistics from a policy reform - represents one database row.""" + + model_config 
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # The simulation pair the statistics are computed over.
    baseline_simulation: Simulation
    reform_simulation: Simulation
    programme_name: str
    # Entity level the programme variable is defined at (e.g. "benunit").
    entity: str
    # Taxes flip the sign convention for winners/losers below.
    is_tax: bool = False

    # Results populated by run()
    baseline_total: Optional[float] = None
    reform_total: Optional[float] = None
    change: Optional[float] = None
    baseline_count: Optional[float] = None
    reform_count: Optional[float] = None
    winners: Optional[float] = None
    losers: Optional[float] = None

    def run(self):
        """Calculate statistics for this programme."""
        # Baseline totals
        baseline_total = Aggregate(
            simulation=self.baseline_simulation,
            variable=self.programme_name,
            aggregate_type=AggregateType.SUM,
            entity=self.entity,
        )
        baseline_total.run()

        # Reform totals
        reform_total = Aggregate(
            simulation=self.reform_simulation,
            variable=self.programme_name,
            aggregate_type=AggregateType.SUM,
            entity=self.entity,
        )
        reform_total.run()

        # Count of recipients/payers (baseline): units with at least 1p
        # of the programme variable.
        baseline_count = Aggregate(
            simulation=self.baseline_simulation,
            variable=self.programme_name,
            aggregate_type=AggregateType.COUNT,
            entity=self.entity,
            filter_variable=self.programme_name,
            filter_variable_geq=0.01,
        )
        baseline_count.run()

        # Count of recipients/payers (reform)
        reform_count = Aggregate(
            simulation=self.reform_simulation,
            variable=self.programme_name,
            aggregate_type=AggregateType.COUNT,
            entity=self.entity,
            filter_variable=self.programme_name,
            filter_variable_geq=0.01,
        )
        reform_count.run()

        # Winners and losers.
        # NOTE(review): for taxes (is_tax=True) these thresholds become
        # change >= -0.01 (winners) and change <= 0.01 (losers), which both
        # include zero-change units and overlap each other. Presumably the
        # intent was change <= -0.01 / change >= 0.01 for taxes — confirm
        # ChangeAggregate's threshold semantics upstream.
        winners = ChangeAggregate(
            baseline_simulation=self.baseline_simulation,
            reform_simulation=self.reform_simulation,
            variable=self.programme_name,
            aggregate_type=ChangeAggregateType.COUNT,
            entity=self.entity,
            change_geq=0.01 if not self.is_tax else -0.01,
        )
        winners.run()

        losers = ChangeAggregate(
            baseline_simulation=self.baseline_simulation,
            reform_simulation=self.reform_simulation,
            variable=self.programme_name,
            aggregate_type=ChangeAggregateType.COUNT,
            entity=self.entity,
            change_leq=-0.01 if not self.is_tax else 0.01,
        )
        losers.run()

        # Populate results
        self.baseline_total = float(baseline_total.result)
        self.reform_total = float(reform_total.result)
        self.change = float(reform_total.result - baseline_total.result)
        self.baseline_count = float(baseline_count.result)
        self.reform_count = float(reform_count.result)
        self.winners = float(winners.result)
        self.losers = float(losers.result)
None: + from policyengine.core import Dataset + + from .analysis import ( + USHouseholdInput, + USHouseholdOutput, + calculate_household_impact, + economic_impact_analysis, + ) + from .datasets import ( + PolicyEngineUSDataset, + USYearData, + create_datasets, + ensure_datasets, + load_datasets, + ) + from .model import ( + PolicyEngineUS, + PolicyEngineUSLatest, + managed_microsimulation, + us_latest, + us_model, + ) + from .outputs import ProgramStatistics + + # Rebuild Pydantic models to resolve forward references + Dataset.model_rebuild() + USYearData.model_rebuild() + PolicyEngineUSDataset.model_rebuild() + PolicyEngineUSLatest.model_rebuild() + ProgramStatistics.model_rebuild() + + __all__ = [ + "USYearData", + "PolicyEngineUSDataset", + "create_datasets", + "load_datasets", + "ensure_datasets", + "PolicyEngineUS", + "PolicyEngineUSLatest", + "managed_microsimulation", + "us_model", + "us_latest", + "economic_impact_analysis", + "calculate_household_impact", + "USHouseholdInput", + "USHouseholdOutput", + "ProgramStatistics", + ] +else: + __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/us/analysis.py b/build/lib/policyengine/tax_benefit_models/us/analysis.py new file mode 100644 index 00000000..122ae2af --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/us/analysis.py @@ -0,0 +1,311 @@ +"""General utility functions for US policy reform analysis.""" + +import tempfile +from pathlib import Path +from typing import Any, Optional, Union + +import pandas as pd +from microdf import MicroDataFrame +from pydantic import BaseModel, Field + +from policyengine.core import OutputCollection, Simulation +from policyengine.core.policy import Policy +from policyengine.outputs.decile_impact import ( + DecileImpact, + calculate_decile_impacts, +) +from policyengine.outputs.inequality import ( + Inequality, + USInequalityPreset, + calculate_us_inequality, +) +from policyengine.outputs.poverty import ( + Poverty, + calculate_us_poverty_rates, +) + 
from .datasets import PolicyEngineUSDataset, USYearData
from .model import us_latest
from .outputs import ProgramStatistics


class USHouseholdOutput(BaseModel):
    """Output from a US household calculation with all entity data."""

    person: list[dict[str, Any]]
    marital_unit: list[dict[str, Any]]
    family: list[dict[str, Any]]
    spm_unit: list[dict[str, Any]]
    tax_unit: list[dict[str, Any]]
    household: dict[str, Any]


class USHouseholdInput(BaseModel):
    """Input for a US household calculation."""

    people: list[dict[str, Any]]
    marital_unit: dict[str, Any] = Field(default_factory=dict)
    family: dict[str, Any] = Field(default_factory=dict)
    spm_unit: dict[str, Any] = Field(default_factory=dict)
    tax_unit: dict[str, Any] = Field(default_factory=dict)
    household: dict[str, Any] = Field(default_factory=dict)
    year: int = 2024


def _group_unit_frame(entity: str, overrides: dict) -> MicroDataFrame:
    """Build a one-row weighted frame for a group entity.

    Seeds ``<entity>_id`` = 0 and ``<entity>_weight`` = 1.0, then applies
    user-provided overrides (which may replace the defaults).
    """
    data = {f"{entity}_id": [0], f"{entity}_weight": [1.0]}
    for key, value in overrides.items():
        data[key] = [value]
    return MicroDataFrame(pd.DataFrame(data), weights=f"{entity}_weight")


def calculate_household_impact(
    household_input: USHouseholdInput,
    policy: Optional[Policy] = None,
) -> USHouseholdOutput:
    """Calculate tax and benefit impacts for a single US household.

    Args:
        household_input: The household's people and entity-level inputs.
        policy: Optional reform policy; baseline law when None.

    Returns:
        USHouseholdOutput with every entity variable the US model emits.
    """
    n_people = len(household_input.people)

    # Build person data with defaults: everyone in one household and one
    # of each group entity, unit weights.
    person_data = {
        "person_id": list(range(n_people)),
        "person_household_id": [0] * n_people,
        "person_marital_unit_id": [0] * n_people,
        "person_family_id": [0] * n_people,
        "person_spm_unit_id": [0] * n_people,
        "person_tax_unit_id": [0] * n_people,
        "person_weight": [1.0] * n_people,
    }
    # Add user-provided person fields; unseen fields default to 0.0 for the
    # other people in the household.
    for i, person in enumerate(household_input.people):
        for key, value in person.items():
            if key not in person_data:
                person_data[key] = [0.0] * n_people
            person_data[key][i] = value

    person_df = MicroDataFrame(pd.DataFrame(person_data), weights="person_weight")
    # One-row frames for each group entity share the same construction.
    household_df = _group_unit_frame("household", household_input.household)
    marital_unit_df = _group_unit_frame("marital_unit", household_input.marital_unit)
    family_df = _group_unit_frame("family", household_input.family)
    spm_unit_df = _group_unit_frame("spm_unit", household_input.spm_unit)
    tax_unit_df = _group_unit_frame("tax_unit", household_input.tax_unit)

    # Create temporary dataset (the Dataset model syncs itself to disk).
    tmpdir = tempfile.mkdtemp()
    filepath = str(Path(tmpdir) / "household_impact.h5")

    dataset = PolicyEngineUSDataset(
        name="Household impact calculation",
        description="Single household for impact calculation",
        filepath=filepath,
        year=household_input.year,
        data=USYearData(
            person=person_df,
            household=household_df,
            marital_unit=marital_unit_df,
            family=family_df,
            spm_unit=spm_unit_df,
            tax_unit=tax_unit_df,
        ),
    )

    # Run simulation
    simulation = Simulation(
        dataset=dataset,
        tax_benefit_model_version=us_latest,
        policy=policy,
    )
    simulation.run()

    # Extract all output variables defined in entity_variables
    output_data = simulation.output_dataset.data

    def safe_convert(value):
        """Convert value to float if numeric, otherwise return as string."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return str(value)

    def extract_entity_outputs(
        entity_name: str, entity_data, n_rows: int
    ) -> list[dict[str, Any]]:
        # One dict per entity row, keyed by the model's output variables.
        outputs = []
        for i in range(n_rows):
            row_dict = {}
            for var in us_latest.entity_variables[entity_name]:
                row_dict[var] = safe_convert(entity_data[var].iloc[i])
            outputs.append(row_dict)
        return outputs

    return USHouseholdOutput(
        person=extract_entity_outputs("person", output_data.person, n_people),
        marital_unit=extract_entity_outputs(
            "marital_unit", output_data.marital_unit, 1
        ),
        family=extract_entity_outputs("family", output_data.family, 1),
        spm_unit=extract_entity_outputs("spm_unit", output_data.spm_unit, 1),
        tax_unit=extract_entity_outputs("tax_unit", output_data.tax_unit, 1),
        household={
            var: safe_convert(output_data.household[var].iloc[0])
            for var in us_latest.entity_variables["household"]
        },
    )


class PolicyReformAnalysis(BaseModel):
    """Complete policy reform analysis result."""

    decile_impacts: OutputCollection[DecileImpact]
    program_statistics: OutputCollection[ProgramStatistics]
    baseline_poverty: OutputCollection[Poverty]
    reform_poverty: OutputCollection[Poverty]
    baseline_inequality: Inequality
    reform_inequality: Inequality
+ + Args: + baseline_simulation: Baseline simulation + reform_simulation: Reform simulation + inequality_preset: Optional preset for the inequality outputs + + Returns: + PolicyReformAnalysis containing decile impacts and program statistics + """ + baseline_simulation.ensure() + reform_simulation.ensure() + + assert len(baseline_simulation.dataset.data.household) > 100, ( + "Baseline simulation must have more than 100 households" + ) + assert len(reform_simulation.dataset.data.household) > 100, ( + "Reform simulation must have more than 100 households" + ) + + # Decile impact (using household_net_income for US) + decile_impacts = calculate_decile_impacts( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + income_variable="household_net_income", + ) + + # Major programs to analyse + programs = { + # Federal taxes + "income_tax": {"entity": "tax_unit", "is_tax": True}, + "payroll_tax": {"entity": "person", "is_tax": True}, + # State and local taxes + "state_income_tax": {"entity": "tax_unit", "is_tax": True}, + # Benefits + "snap": {"entity": "spm_unit", "is_tax": False}, + "tanf": {"entity": "spm_unit", "is_tax": False}, + "ssi": {"entity": "person", "is_tax": False}, + "social_security": {"entity": "person", "is_tax": False}, + "medicare": {"entity": "person", "is_tax": False}, + "medicaid": {"entity": "person", "is_tax": False}, + "eitc": {"entity": "tax_unit", "is_tax": False}, + "ctc": {"entity": "tax_unit", "is_tax": False}, + } + + program_statistics = [] + + for program_name, program_info in programs.items(): + entity = program_info["entity"] + is_tax = program_info["is_tax"] + + stats = ProgramStatistics( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + program_name=program_name, + entity=entity, + is_tax=is_tax, + ) + stats.run() + program_statistics.append(stats) + + # Create DataFrame + program_df = pd.DataFrame( + [ + { + "baseline_simulation_id": p.baseline_simulation.id, + 
"reform_simulation_id": p.reform_simulation.id, + "program_name": p.program_name, + "entity": p.entity, + "is_tax": p.is_tax, + "baseline_total": p.baseline_total, + "reform_total": p.reform_total, + "change": p.change, + "baseline_count": p.baseline_count, + "reform_count": p.reform_count, + "winners": p.winners, + "losers": p.losers, + } + for p in program_statistics + ] + ) + + program_collection = OutputCollection( + outputs=program_statistics, dataframe=program_df + ) + + # Calculate poverty rates for both simulations + baseline_poverty = calculate_us_poverty_rates(baseline_simulation) + reform_poverty = calculate_us_poverty_rates(reform_simulation) + + # Calculate inequality for both simulations + baseline_inequality = calculate_us_inequality( + baseline_simulation, preset=inequality_preset + ) + reform_inequality = calculate_us_inequality( + reform_simulation, preset=inequality_preset + ) + + return PolicyReformAnalysis( + decile_impacts=decile_impacts, + program_statistics=program_collection, + baseline_poverty=baseline_poverty, + reform_poverty=reform_poverty, + baseline_inequality=baseline_inequality, + reform_inequality=reform_inequality, + ) diff --git a/build/lib/policyengine/tax_benefit_models/us/datasets.py b/build/lib/policyengine/tax_benefit_models/us/datasets.py new file mode 100644 index 00000000..da10733b --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/us/datasets.py @@ -0,0 +1,359 @@ +import warnings +from pathlib import Path +from typing import Optional + +import pandas as pd +from microdf import MicroDataFrame +from pydantic import ConfigDict + +from policyengine.core import Dataset, YearData +from policyengine.core.release_manifest import ( + dataset_logical_name, + resolve_dataset_reference, +) + + +class USYearData(YearData): + """Entity-level data for a single year.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + person: MicroDataFrame + marital_unit: MicroDataFrame + family: MicroDataFrame + spm_unit: 
MicroDataFrame + tax_unit: MicroDataFrame + household: MicroDataFrame + + @property + def entity_data(self) -> dict[str, MicroDataFrame]: + """Return a dictionary of entity names to their data.""" + return { + "person": self.person, + "marital_unit": self.marital_unit, + "family": self.family, + "spm_unit": self.spm_unit, + "tax_unit": self.tax_unit, + "household": self.household, + } + + +class PolicyEngineUSDataset(Dataset): + """US dataset with multi-year entity-level data.""" + + data: Optional[USYearData] = None + + def model_post_init(self, __context) -> None: + """Called after Pydantic initialization.""" + # Make sure we are synchronised between in-memory and storage, at least on initialisation + if self.data is not None: + self.save() + elif self.filepath and not self.data: + self.load() + + def save(self) -> None: + """Save dataset to HDF5 file.""" + filepath = Path(self.filepath) + if not filepath.parent.exists(): + filepath.parent.mkdir(parents=True, exist_ok=True) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=pd.errors.PerformanceWarning, + message=".*PyTables will pickle object types.*", + ) + with pd.HDFStore(filepath, mode="w") as store: + store["person"] = pd.DataFrame(self.data.person) + store["marital_unit"] = pd.DataFrame(self.data.marital_unit) + store["family"] = pd.DataFrame(self.data.family) + store["spm_unit"] = pd.DataFrame(self.data.spm_unit) + store["tax_unit"] = pd.DataFrame(self.data.tax_unit) + store["household"] = pd.DataFrame(self.data.household) + + def load(self) -> None: + """Load dataset from HDF5 file into this instance.""" + filepath = self.filepath + with pd.HDFStore(filepath, mode="r") as store: + self.data = USYearData( + person=MicroDataFrame(store["person"], weights="person_weight"), + marital_unit=MicroDataFrame( + store["marital_unit"], weights="marital_unit_weight" + ), + family=MicroDataFrame(store["family"], weights="family_weight"), + spm_unit=MicroDataFrame(store["spm_unit"], 
weights="spm_unit_weight"), + tax_unit=MicroDataFrame(store["tax_unit"], weights="tax_unit_weight"), + household=MicroDataFrame( + store["household"], weights="household_weight" + ), + ) + + def __repr__(self) -> str: + if self.data is None: + return f"" + else: + n_people = len(self.data.person) + n_marital_units = len(self.data.marital_unit) + n_families = len(self.data.family) + n_spm_units = len(self.data.spm_unit) + n_tax_units = len(self.data.tax_unit) + n_households = len(self.data.household) + return f"" + + +def create_datasets( + datasets: list[str] = [ + "enhanced_cps_2024", + ], + years: list[int] = [2024, 2025, 2026, 2027, 2028], + data_folder: str = "./data", +) -> dict[str, PolicyEngineUSDataset]: + """Create PolicyEngineUSDataset instances from logical dataset names or URLs. + + Args: + datasets: List of logical dataset names or HuggingFace dataset URLs + years: List of years to extract data for + data_folder: Directory to save the dataset files + + Returns: + Dictionary mapping dataset keys (e.g., "enhanced_cps_2024") to PolicyEngineUSDataset objects + """ + from policyengine_us import Microsimulation + + result = {} + for dataset in datasets: + resolved_dataset = resolve_dataset_reference("us", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) + sim = Microsimulation(dataset=resolved_dataset) + + for year in years: + # Get all input variables from the simulation + # We'll calculate each input variable for the specified year + entity_data = { + "person": {}, + "household": {}, + "marital_unit": {}, + "family": {}, + "spm_unit": {}, + "tax_unit": {}, + } + + # First, get ID columns which are structural (not input variables) + # These define entity membership and relationships + # For person-level links to group entities, use person_X_id naming + id_variables = { + "person": [ + "person_id", + "person_household_id", + "person_marital_unit_id", + "person_family_id", + "person_spm_unit_id", + "person_tax_unit_id", + ], + "household": 
["household_id"], + "marital_unit": ["marital_unit_id"], + "family": ["family_id"], + "spm_unit": ["spm_unit_id"], + "tax_unit": ["tax_unit_id"], + } + + for entity_key, var_names in id_variables.items(): + for id_var in var_names: + if id_var in sim.tax_benefit_system.variables: + values = sim.calculate(id_var, period=year).values + entity_data[entity_key][id_var] = values + + # Get input variables and calculate them for this year + for variable_name in sim.input_variables: + variable = sim.tax_benefit_system.variables[variable_name] + entity_key = variable.entity.key + + # Calculate the variable for the given year + values = sim.calculate(variable_name, period=year).values + + # Store in the appropriate entity dictionary + entity_data[entity_key][variable_name] = values + + # Build entity DataFrames + person_df = pd.DataFrame(entity_data["person"]) + household_df = pd.DataFrame(entity_data["household"]) + marital_unit_df = pd.DataFrame(entity_data["marital_unit"]) + family_df = pd.DataFrame(entity_data["family"]) + spm_unit_df = pd.DataFrame(entity_data["spm_unit"]) + tax_unit_df = pd.DataFrame(entity_data["tax_unit"]) + + # Add weight columns - household weights are primary, map to all entities + # Person weights = household weights (mapped via person_household_id) + if "household_weight" in household_df.columns: + # Only add person_weight if it doesn't already exist + if "person_weight" not in person_df.columns: + person_df = person_df.merge( + household_df[["household_id", "household_weight"]], + left_on="person_household_id", + right_on="household_id", + how="left", + ) + person_df = person_df.rename( + columns={"household_weight": "person_weight"} + ) + person_df = person_df.drop( + columns=["household_id"], errors="ignore" + ) + + # Map household weights to other group entities via person table + for entity_name, entity_df, person_id_col, entity_id_col in [ + ( + "marital_unit", + marital_unit_df, + "person_marital_unit_id", + "marital_unit_id", + ), + 
("family", family_df, "person_family_id", "family_id"), + ( + "spm_unit", + spm_unit_df, + "person_spm_unit_id", + "spm_unit_id", + ), + ( + "tax_unit", + tax_unit_df, + "person_tax_unit_id", + "tax_unit_id", + ), + ]: + # Only add entity weight if it doesn't already exist + if f"{entity_name}_weight" not in entity_df.columns: + # Get household_id for each entity from person table + entity_household_map = person_df[ + [person_id_col, "person_household_id"] + ].drop_duplicates() + entity_df = entity_df.merge( + entity_household_map, + left_on=entity_id_col, + right_on=person_id_col, + how="left", + ) + entity_df = entity_df.merge( + household_df[["household_id", "household_weight"]], + left_on="person_household_id", + right_on="household_id", + how="left", + ) + entity_df = entity_df.rename( + columns={"household_weight": f"{entity_name}_weight"} + ) + entity_df = entity_df.drop( + columns=[ + "household_id", + "person_household_id", + person_id_col, + ], + errors="ignore", + ) + + # Update the entity_data + if entity_name == "marital_unit": + marital_unit_df = entity_df + elif entity_name == "family": + family_df = entity_df + elif entity_name == "spm_unit": + spm_unit_df = entity_df + elif entity_name == "tax_unit": + tax_unit_df = entity_df + + us_dataset = PolicyEngineUSDataset( + id=f"{dataset_stem}_year_{year}", + name=f"{dataset_stem}-year-{year}", + description=f"US Dataset for year {year} based on {dataset_stem}", + filepath=f"{data_folder}/{dataset_stem}_year_{year}.h5", + year=int(year), + data=USYearData( + person=MicroDataFrame(person_df, weights="person_weight"), + household=MicroDataFrame(household_df, weights="household_weight"), + marital_unit=MicroDataFrame( + marital_unit_df, weights="marital_unit_weight" + ), + family=MicroDataFrame(family_df, weights="family_weight"), + spm_unit=MicroDataFrame(spm_unit_df, weights="spm_unit_weight"), + tax_unit=MicroDataFrame(tax_unit_df, weights="tax_unit_weight"), + ), + ) + us_dataset.save() + + dataset_key = 
f"{dataset_stem}_{year}" + result[dataset_key] = us_dataset + + return result + + +def load_datasets( + datasets: list[str] = [ + "enhanced_cps_2024", + ], + years: list[int] = [2024, 2025, 2026, 2027, 2028], + data_folder: str = "./data", +) -> dict[str, PolicyEngineUSDataset]: + """Load PolicyEngineUSDataset instances from saved HDF5 files. + + Args: + datasets: List of HuggingFace dataset paths (used to derive file names) + years: List of years to load data for + data_folder: Directory containing the dataset files + + Returns: + Dictionary mapping dataset keys (e.g., "enhanced_cps_2024") to PolicyEngineUSDataset objects + """ + result = {} + for dataset in datasets: + resolved_dataset = resolve_dataset_reference("us", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) + for year in years: + filepath = f"{data_folder}/{dataset_stem}_year_{year}.h5" + us_dataset = PolicyEngineUSDataset( + name=f"{dataset_stem}-year-{year}", + description=f"US Dataset for year {year} based on {dataset_stem}", + filepath=filepath, + year=year, + ) + us_dataset.load() + + dataset_key = f"{dataset_stem}_{year}" + result[dataset_key] = us_dataset + + return result + + +def ensure_datasets( + datasets: list[str] = [ + "enhanced_cps_2024", + ], + years: list[int] = [2024, 2025, 2026, 2027, 2028], + data_folder: str = "./data", +) -> dict[str, PolicyEngineUSDataset]: + """Ensure datasets exist, loading if available or creating if not. 
+ + Args: + datasets: List of HuggingFace dataset paths + years: List of years to load/create data for + data_folder: Directory containing or to save the dataset files + + Returns: + Dictionary mapping dataset keys to PolicyEngineUSDataset objects + """ + # Check if all dataset files exist + all_exist = True + for dataset in datasets: + resolved_dataset = resolve_dataset_reference("us", dataset) + dataset_stem = dataset_logical_name(resolved_dataset) + for year in years: + filepath = Path(f"{data_folder}/{dataset_stem}_year_{year}.h5") + if not filepath.exists(): + all_exist = False + break + if not all_exist: + break + + if all_exist: + return load_datasets(datasets=datasets, years=years, data_folder=data_folder) + else: + return create_datasets(datasets=datasets, years=years, data_folder=data_folder) diff --git a/build/lib/policyengine/tax_benefit_models/us/model.py b/build/lib/policyengine/tax_benefit_models/us/model.py new file mode 100644 index 00000000..a896f5c4 --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/us/model.py @@ -0,0 +1,650 @@ +import datetime +from importlib import metadata +from pathlib import Path +from typing import TYPE_CHECKING, Optional + +import pandas as pd +from microdf import MicroDataFrame + +from policyengine.core import ( + Parameter, + ParameterNode, + TaxBenefitModel, + TaxBenefitModelVersion, + Variable, +) +from policyengine.core.release_manifest import ( + certify_data_release_compatibility, + dataset_logical_name, + get_release_manifest, + resolve_local_managed_dataset_source, + resolve_managed_dataset_reference, +) +from policyengine.utils.entity_utils import ( + build_entity_relationships, + filter_dataset_by_household_variable, +) +from policyengine.utils.parameter_labels import ( + build_scale_lookup, + generate_label_for_parameter, +) + +from .datasets import PolicyEngineUSDataset, USYearData + +if TYPE_CHECKING: + from policyengine.core.simulation import Simulation + +US_GROUP_ENTITIES = [ + "household", + 
"tax_unit", + "spm_unit", + "family", + "marital_unit", +] + + +class PolicyEngineUS(TaxBenefitModel): + id: str = "policyengine-us" + description: str = "The US's open-source dynamic tax and benefit microsimulation model maintained by PolicyEngine." + + +us_model = PolicyEngineUS() + + +def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: + try: + from policyengine_us.build_metadata import get_data_build_metadata + except ModuleNotFoundError as exc: + if exc.name != "policyengine_us.build_metadata": + raise + return {} + + return get_data_build_metadata() or {} + + +class PolicyEngineUSLatest(TaxBenefitModelVersion): + model: TaxBenefitModel = us_model + version: str = None + created_at: datetime.datetime = None + + entity_variables: dict[str, list[str]] = { + "person": [ + # IDs and weights + "person_id", + "marital_unit_id", + "family_id", + "spm_unit_id", + "tax_unit_id", + "household_id", + "person_weight", + # Demographics + "age", + "is_male", + "race", + "is_child", + "is_adult", + # Income + "employment_income", + # Benefits + "ssi", + "social_security", + "medicaid", + "unemployment_compensation", + ], + "marital_unit": [ + "marital_unit_id", + "marital_unit_weight", + ], + "family": [ + "family_id", + "family_weight", + ], + "spm_unit": [ + "spm_unit_id", + "spm_unit_weight", + "snap", + "tanf", + "spm_unit_net_income", + # Poverty measures + "spm_unit_is_in_spm_poverty", + "spm_unit_is_in_deep_spm_poverty", + ], + "tax_unit": [ + "tax_unit_id", + "tax_unit_weight", + "income_tax", + "employee_payroll_tax", + "household_state_income_tax", + "eitc", + "ctc", + ], + "household": [ + "household_id", + "household_weight", + "household_count_people", + "household_net_income", + "household_income_decile", + "household_benefits", + "household_tax", + "household_market_income", + "congressional_district_geoid", + ], + } + + def __init__(self, **kwargs: dict): + manifest = get_release_manifest("us") + if "version" not in kwargs or 
kwargs.get("version") is None: + kwargs["version"] = manifest.model_package.version + + installed_model_version = metadata.version("policyengine-us") + if installed_model_version != manifest.model_package.version: + raise ValueError( + "Installed policyengine-us version does not match the " + f"bundled policyengine.py manifest. Expected " + f"{manifest.model_package.version}, got {installed_model_version}." + ) + + model_build_metadata = _get_runtime_data_build_metadata() + data_certification = certify_data_release_compatibility( + "us", + runtime_model_version=installed_model_version, + runtime_data_build_fingerprint=model_build_metadata.get( + "data_build_fingerprint" + ), + ) + + super().__init__(**kwargs) + self.release_manifest = manifest + self.model_package = manifest.model_package + self.data_package = manifest.data_package + self.default_dataset_uri = manifest.default_dataset_uri + self.data_certification = data_certification + from policyengine_core.enums import Enum + from policyengine_us.system import system + + # Attach region registry + from policyengine.countries.us.regions import us_region_registry + + self.region_registry = us_region_registry + + self.id = f"{self.model.id}@{self.version}" + + for var_obj in system.variables.values(): + # Serialize default_value for JSON compatibility + default_val = var_obj.default_value + if var_obj.value_type is Enum: + default_val = default_val.name + elif var_obj.value_type is datetime.date: + default_val = default_val.isoformat() + + variable = Variable( + id=self.id + "-" + var_obj.name, + name=var_obj.name, + label=getattr(var_obj, "label", None), + tax_benefit_model_version=self, + entity=var_obj.entity.key, + description=var_obj.documentation, + data_type=var_obj.value_type if var_obj.value_type is not Enum else str, + default_value=default_val, + value_type=var_obj.value_type, + ) + if ( + hasattr(var_obj, "possible_values") + and var_obj.possible_values is not None + ): + variable.possible_values = 
list( + map( + lambda x: x.name, + var_obj.possible_values._value2member_map_.values(), + ) + ) + # Extract and resolve adds/subtracts. + # Core stores these as either list[str] or a parameter path string. + # Resolve parameter paths to lists so consumers always get list[str]. + if hasattr(var_obj, "adds") and var_obj.adds is not None: + if isinstance(var_obj.adds, str): + try: + from policyengine_core.parameters.operations.get_parameter import ( + get_parameter, + ) + + param = get_parameter(system.parameters, var_obj.adds) + variable.adds = list(param("2025-01-01")) + except (ValueError, Exception): + variable.adds = None + else: + variable.adds = var_obj.adds + if hasattr(var_obj, "subtracts") and var_obj.subtracts is not None: + if isinstance(var_obj.subtracts, str): + try: + from policyengine_core.parameters.operations.get_parameter import ( + get_parameter, + ) + + param = get_parameter(system.parameters, var_obj.subtracts) + variable.subtracts = list(param("2025-01-01")) + except (ValueError, Exception): + variable.subtracts = None + else: + variable.subtracts = var_obj.subtracts + self.add_variable(variable) + + from policyengine_core.parameters import Parameter as CoreParameter + from policyengine_core.parameters import ParameterNode as CoreParameterNode + + scale_lookup = build_scale_lookup(system) + + for param_node in system.parameters.get_descendants(): + if isinstance(param_node, CoreParameter): + parameter = Parameter( + id=self.id + "-" + param_node.name, + name=param_node.name, + label=generate_label_for_parameter( + param_node, system, scale_lookup + ), + tax_benefit_model_version=self, + description=param_node.description, + data_type=type(param_node(2025)), + unit=param_node.metadata.get("unit"), + _core_param=param_node, + ) + self.add_parameter(parameter) + elif isinstance(param_node, CoreParameterNode): + node = ParameterNode( + id=self.id + "-" + param_node.name, + name=param_node.name, + label=param_node.metadata.get("label"), + 
description=param_node.description, + tax_benefit_model_version=self, + ) + self.add_parameter_node(node) + + def _build_entity_relationships( + self, dataset: PolicyEngineUSDataset + ) -> pd.DataFrame: + """Build a DataFrame mapping each person to their containing entities.""" + person_data = pd.DataFrame(dataset.data.person) + return build_entity_relationships(person_data, US_GROUP_ENTITIES) + + def _filter_dataset_by_household_variable( + self, + dataset: PolicyEngineUSDataset, + variable_name: str, + variable_value: str, + ) -> PolicyEngineUSDataset: + """Filter a dataset to only include households where a variable matches.""" + filtered = filter_dataset_by_household_variable( + entity_data=dataset.data.entity_data, + group_entities=US_GROUP_ENTITIES, + variable_name=variable_name, + variable_value=variable_value, + ) + return PolicyEngineUSDataset( + id=dataset.id + f"_filtered_{variable_name}_{variable_value}", + name=dataset.name, + description=f"{dataset.description} (filtered: {variable_name}={variable_value})", + filepath=dataset.filepath, + year=dataset.year, + is_output_dataset=dataset.is_output_dataset, + data=USYearData( + person=filtered["person"], + marital_unit=filtered["marital_unit"], + family=filtered["family"], + spm_unit=filtered["spm_unit"], + tax_unit=filtered["tax_unit"], + household=filtered["household"], + ), + ) + + def run(self, simulation: "Simulation") -> "Simulation": + from policyengine_us import Microsimulation + from policyengine_us.system import system + + from policyengine.utils.parametric_reforms import ( + build_reform_dict, + merge_reform_dicts, + ) + + assert isinstance(simulation.dataset, PolicyEngineUSDataset) + + dataset = simulation.dataset + dataset.load() + + # Apply regional scoping if specified + if simulation.scoping_strategy: + scoped_data = simulation.scoping_strategy.apply( + entity_data=dataset.data.entity_data, + group_entities=US_GROUP_ENTITIES, + year=dataset.year, + ) + dataset = PolicyEngineUSDataset( + 
id=dataset.id + "_scoped", + name=dataset.name, + description=dataset.description, + filepath=dataset.filepath, + year=dataset.year, + is_output_dataset=dataset.is_output_dataset, + data=USYearData( + person=scoped_data["person"], + marital_unit=scoped_data["marital_unit"], + family=scoped_data["family"], + spm_unit=scoped_data["spm_unit"], + tax_unit=scoped_data["tax_unit"], + household=scoped_data["household"], + ), + ) + elif simulation.filter_field and simulation.filter_value: + dataset = self._filter_dataset_by_household_variable( + dataset, simulation.filter_field, simulation.filter_value + ) + + # Build reform dict from policy and dynamic parameter values. + # US requires reforms at Microsimulation construction time + # (unlike UK which supports p.update() after construction). + policy_reform = build_reform_dict(simulation.policy) + dynamic_reform = build_reform_dict(simulation.dynamic) + reform_dict = merge_reform_dicts(policy_reform, dynamic_reform) + + # Create Microsimulation with reform at construction time + microsim = Microsimulation(reform=reform_dict) + self._build_simulation_from_dataset(microsim, dataset, system) + + data = { + "person": pd.DataFrame(), + "marital_unit": pd.DataFrame(), + "family": pd.DataFrame(), + "spm_unit": pd.DataFrame(), + "tax_unit": pd.DataFrame(), + "household": pd.DataFrame(), + } + + # ID columns should be preserved from input dataset, not calculated + id_columns = { + "person_id", + "household_id", + "marital_unit_id", + "family_id", + "spm_unit_id", + "tax_unit_id", + } + weight_columns = { + "person_weight", + "household_weight", + "marital_unit_weight", + "family_weight", + "spm_unit_weight", + "tax_unit_weight", + } + + # First, copy ID and weight columns from input dataset + for entity in data.keys(): + input_df = pd.DataFrame(getattr(dataset.data, entity)) + entity_id_col = f"{entity}_id" + entity_weight_col = f"{entity}_weight" + + if entity_id_col in input_df.columns: + data[entity][entity_id_col] = 
input_df[entity_id_col].values + if entity_weight_col in input_df.columns: + data[entity][entity_weight_col] = input_df[entity_weight_col].values + + # For person entity, also copy person-level group ID columns + person_input_df = pd.DataFrame(dataset.data.person) + for col in person_input_df.columns: + if col.startswith("person_") and col.endswith("_id"): + # Map person_household_id -> household_id, etc. + target_col = col.replace("person_", "") + if target_col in id_columns: + data["person"][target_col] = person_input_df[col].values + + # Then calculate non-ID, non-weight variables from simulation + for entity, variables in self.entity_variables.items(): + for var in variables: + if var not in id_columns and var not in weight_columns: + data[entity][var] = microsim.calculate( + var, period=simulation.dataset.year, map_to=entity + ).values + + data["person"] = MicroDataFrame(data["person"], weights="person_weight") + data["marital_unit"] = MicroDataFrame( + data["marital_unit"], weights="marital_unit_weight" + ) + data["family"] = MicroDataFrame(data["family"], weights="family_weight") + data["spm_unit"] = MicroDataFrame(data["spm_unit"], weights="spm_unit_weight") + data["tax_unit"] = MicroDataFrame(data["tax_unit"], weights="tax_unit_weight") + data["household"] = MicroDataFrame( + data["household"], weights="household_weight" + ) + + simulation.output_dataset = PolicyEngineUSDataset( + id=simulation.id, + name=dataset.name, + description=dataset.description, + filepath=str( + Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") + ), + year=simulation.dataset.year, + is_output_dataset=True, + data=USYearData( + person=data["person"], + marital_unit=data["marital_unit"], + family=data["family"], + spm_unit=data["spm_unit"], + tax_unit=data["tax_unit"], + household=data["household"], + ), + ) + + def save(self, simulation: "Simulation"): + """Save the simulation's output dataset.""" + simulation.output_dataset.save() + + def load(self, simulation: 
"Simulation"): + """Load the simulation's output dataset.""" + import os + + filepath = str( + Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") + ) + + simulation.output_dataset = PolicyEngineUSDataset( + id=simulation.id, + name=simulation.dataset.name, + description=simulation.dataset.description, + filepath=filepath, + year=simulation.dataset.year, + is_output_dataset=True, + ) + + # Load timestamps from file system metadata + if os.path.exists(filepath): + simulation.created_at = datetime.datetime.fromtimestamp( + os.path.getctime(filepath) + ) + simulation.updated_at = datetime.datetime.fromtimestamp( + os.path.getmtime(filepath) + ) + + def _build_simulation_from_dataset(self, microsim, dataset, system): + """Build a PolicyEngine Core simulation from dataset entity IDs. + + This follows the same pattern as policyengine-uk, initializing + entities from IDs first, then using set_input() for variables. + + Args: + microsim: The Microsimulation object to populate + dataset: The dataset containing entity data + system: The tax-benefit system + """ + import numpy as np + from policyengine_core.simulations.simulation_builder import ( + SimulationBuilder, + ) + + # Create builder and instantiate entities + builder = SimulationBuilder() + builder.populations = system.instantiate_entities() + + # Extract entity IDs from dataset + person_data = pd.DataFrame(dataset.data.person) + + # Determine column naming convention + # Support both person_X_id (from create_datasets) and X_id (from custom datasets) + household_id_col = ( + "person_household_id" + if "person_household_id" in person_data.columns + else "household_id" + ) + marital_unit_id_col = ( + "person_marital_unit_id" + if "person_marital_unit_id" in person_data.columns + else "marital_unit_id" + ) + family_id_col = ( + "person_family_id" + if "person_family_id" in person_data.columns + else "family_id" + ) + spm_unit_id_col = ( + "person_spm_unit_id" + if "person_spm_unit_id" in 
person_data.columns + else "spm_unit_id" + ) + tax_unit_id_col = ( + "person_tax_unit_id" + if "person_tax_unit_id" in person_data.columns + else "tax_unit_id" + ) + + # Declare entities + builder.declare_person_entity("person", person_data["person_id"].values) + builder.declare_entity( + "household", np.unique(person_data[household_id_col].values) + ) + builder.declare_entity( + "spm_unit", np.unique(person_data[spm_unit_id_col].values) + ) + builder.declare_entity("family", np.unique(person_data[family_id_col].values)) + builder.declare_entity( + "tax_unit", np.unique(person_data[tax_unit_id_col].values) + ) + builder.declare_entity( + "marital_unit", np.unique(person_data[marital_unit_id_col].values) + ) + + # Join persons to group entities + builder.join_with_persons( + builder.populations["household"], + person_data[household_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["spm_unit"], + person_data[spm_unit_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["family"], + person_data[family_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["tax_unit"], + person_data[tax_unit_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["marital_unit"], + person_data[marital_unit_id_col].values, + np.array(["member"] * len(person_data)), + ) + + # Build simulation from populations + microsim.build_from_populations(builder.populations) + + # Set input variables for each entity + # Skip ID columns as they're structural and already used in entity building + # Support both naming conventions + id_columns = { + "person_id", + "household_id", + "person_household_id", + "spm_unit_id", + "person_spm_unit_id", + "family_id", + "person_family_id", + "tax_unit_id", + "person_tax_unit_id", + "marital_unit_id", + 
"person_marital_unit_id", + } + + for entity_name, entity_df in [ + ("person", dataset.data.person), + ("household", dataset.data.household), + ("spm_unit", dataset.data.spm_unit), + ("family", dataset.data.family), + ("tax_unit", dataset.data.tax_unit), + ("marital_unit", dataset.data.marital_unit), + ]: + df = pd.DataFrame(entity_df) + for column in df.columns: + # Skip ID columns and check if variable exists in system + if column not in id_columns and column in system.variables: + microsim.set_input(column, dataset.year, df[column].values) + + +def _managed_release_bundle( + dataset_uri: str, + dataset_source: Optional[str] = None, +) -> dict[str, Optional[str]]: + bundle = dict(us_latest.release_bundle) + bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) + bundle["runtime_dataset_uri"] = dataset_uri + if dataset_source: + bundle["runtime_dataset_source"] = dataset_source + bundle["managed_by"] = "policyengine.py" + return bundle + + +def managed_microsimulation( + *, + dataset: Optional[str] = None, + allow_unmanaged: bool = False, + **kwargs, +): + """Construct a country-package Microsimulation pinned to this bundle. + + By default this enforces the dataset selection from the bundled + `policyengine.py` release manifest. Arbitrary dataset URIs require + `allow_unmanaged=True`. + """ + + from policyengine_us import Microsimulation + + if "dataset" in kwargs: + raise ValueError( + "Pass `dataset=` directly to managed_microsimulation, not through " + "**kwargs, so policyengine.py can enforce the release bundle." 
+ ) + + dataset_uri = resolve_managed_dataset_reference( + "us", + dataset, + allow_unmanaged=allow_unmanaged, + ) + dataset_source = resolve_local_managed_dataset_source( + "us", + dataset_uri, + allow_local_mirror=not ( + allow_unmanaged and dataset is not None and "://" in dataset + ), + ) + microsim = Microsimulation(dataset=dataset_source, **kwargs) + microsim.policyengine_bundle = _managed_release_bundle( + dataset_uri, + dataset_source, + ) + return microsim + + +us_latest = PolicyEngineUSLatest() diff --git a/build/lib/policyengine/tax_benefit_models/us/outputs.py b/build/lib/policyengine/tax_benefit_models/us/outputs.py new file mode 100644 index 00000000..1dd6f001 --- /dev/null +++ b/build/lib/policyengine/tax_benefit_models/us/outputs.py @@ -0,0 +1,105 @@ +"""US-specific output templates.""" + +from typing import Optional + +from pydantic import ConfigDict + +from policyengine.core import Output, Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) + + +class ProgramStatistics(Output): + """Single program's statistics from a policy reform - represents one database row.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + baseline_simulation: Simulation + reform_simulation: Simulation + program_name: str + entity: str + is_tax: bool = False + + # Results populated by run() + baseline_total: Optional[float] = None + reform_total: Optional[float] = None + change: Optional[float] = None + baseline_count: Optional[float] = None + reform_count: Optional[float] = None + winners: Optional[float] = None + losers: Optional[float] = None + + def run(self): + """Calculate statistics for this program.""" + # Baseline totals + baseline_total = Aggregate( + simulation=self.baseline_simulation, + variable=self.program_name, + aggregate_type=AggregateType.SUM, + entity=self.entity, + ) + baseline_total.run() + + # Reform totals + 
reform_total = Aggregate( + simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=AggregateType.SUM, + entity=self.entity, + ) + reform_total.run() + + # Count of recipients/payers (baseline) + baseline_count = Aggregate( + simulation=self.baseline_simulation, + variable=self.program_name, + aggregate_type=AggregateType.COUNT, + entity=self.entity, + filter_variable=self.program_name, + filter_variable_geq=0.01, + ) + baseline_count.run() + + # Count of recipients/payers (reform) + reform_count = Aggregate( + simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=AggregateType.COUNT, + entity=self.entity, + filter_variable=self.program_name, + filter_variable_geq=0.01, + ) + reform_count.run() + + # Winners and losers + winners = ChangeAggregate( + baseline_simulation=self.baseline_simulation, + reform_simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=ChangeAggregateType.COUNT, + entity=self.entity, + change_geq=0.01 if not self.is_tax else -0.01, + ) + winners.run() + + losers = ChangeAggregate( + baseline_simulation=self.baseline_simulation, + reform_simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=ChangeAggregateType.COUNT, + entity=self.entity, + change_leq=-0.01 if not self.is_tax else 0.01, + ) + losers.run() + + # Populate results + self.baseline_total = float(baseline_total.result) + self.reform_total = float(reform_total.result) + self.change = float(reform_total.result - baseline_total.result) + self.baseline_count = float(baseline_count.result) + self.reform_count = float(reform_count.result) + self.winners = float(winners.result) + self.losers = float(losers.result) diff --git a/build/lib/policyengine/utils/__init__.py b/build/lib/policyengine/utils/__init__.py new file mode 100644 index 00000000..bf3cc681 --- /dev/null +++ b/build/lib/policyengine/utils/__init__.py @@ -0,0 +1,7 @@ +from .dates import parse_safe_date as parse_safe_date 
+from .parameter_labels import build_scale_lookup as build_scale_lookup +from .parameter_labels import ( + generate_label_for_parameter as generate_label_for_parameter, +) +from .plotting import COLORS as COLORS +from .plotting import format_fig as format_fig diff --git a/build/lib/policyengine/utils/dates.py b/build/lib/policyengine/utils/dates.py new file mode 100644 index 00000000..46cec198 --- /dev/null +++ b/build/lib/policyengine/utils/dates.py @@ -0,0 +1,43 @@ +import calendar +from datetime import datetime + + +def parse_safe_date(date_string: str) -> datetime: + """ + Parse a YYYY-MM-DD date string and ensure the year is at least 1. + Handles invalid day values by capping to the last valid day of the month. + + Args: + date_string: Date string in YYYY-MM-DD format + + Returns: + Safe datetime object with year >= 1 + """ + try: + date_string = date_string.replace("0000-", "0001-") + date_obj = datetime.strptime(date_string, "%Y-%m-%d") + if date_obj.year < 1: + # Replace year 0 or negative years with year 1 + return date_obj.replace(year=1) + return date_obj + except ValueError as e: + # Try to handle invalid day values (e.g., 2021-06-31) + # Python <3.14: "day is out of range for month" + # Python 3.14+: "day N must be in range 1..M for month ..." + error_msg = str(e) + if "day is out of range for month" in error_msg or ( + "must be in range" in error_msg and "for month" in error_msg + ): + parts = date_string.split("-") + if len(parts) == 3: + year = int(parts[0]) + month = int(parts[1]) + # Get the last valid day of the month + last_day = calendar.monthrange(year, month)[1] + # Use the last valid day instead + corrected_date = f"{year:04d}-{month:02d}-{last_day:02d}" + date_obj = datetime.strptime(corrected_date, "%Y-%m-%d") + if date_obj.year < 1: + return date_obj.replace(year=1) + return date_obj + raise ValueError(f"Invalid date format: {date_string}. 
Expected YYYY-MM-DD") diff --git a/build/lib/policyengine/utils/entity_utils.py b/build/lib/policyengine/utils/entity_utils.py new file mode 100644 index 00000000..f06b5d59 --- /dev/null +++ b/build/lib/policyengine/utils/entity_utils.py @@ -0,0 +1,140 @@ +"""Shared utilities for entity relationship building and dataset filtering.""" + +import logging + +import pandas as pd +from microdf import MicroDataFrame + +logger = logging.getLogger(__name__) + + +def _resolve_id_column(person_data: pd.DataFrame, entity_name: str) -> str: + """Resolve the ID column name for a group entity in person data. + + Tries `person_{entity}_id` first (standard convention), falls back + to `{entity}_id` (custom datasets). + """ + prefixed = f"person_{entity_name}_id" + bare = f"{entity_name}_id" + if prefixed in person_data.columns: + return prefixed + if bare in person_data.columns: + return bare + raise ValueError( + f"No ID column found for entity '{entity_name}'. " + f"Tried '{prefixed}' and '{bare}'. " + f"Available columns: {list(person_data.columns)}" + ) + + +def build_entity_relationships( + person_data: pd.DataFrame, + group_entities: list[str], +) -> pd.DataFrame: + """Build a DataFrame mapping each person to their containing entities. + + Creates an explicit relationship map between persons and all specified + group entity types. This enables filtering at any entity level while + preserving the integrity of all related entities. + + Args: + person_data: DataFrame of person-level data with ID columns. + group_entities: List of group entity names (e.g., ["household", "tax_unit"]). + + Returns: + A DataFrame with person_id and one {entity}_id column per group entity. 
+ """ + columns = {"person_id": person_data["person_id"].values} + for entity in group_entities: + id_col = _resolve_id_column(person_data, entity) + columns[f"{entity}_id"] = person_data[id_col].values + return pd.DataFrame(columns) + + +def filter_dataset_by_household_variable( + entity_data: dict[str, MicroDataFrame], + group_entities: list[str], + variable_name: str, + variable_value: str, +) -> dict[str, MicroDataFrame]: + """Filter dataset entities to only include households where a variable matches. + + Uses an entity relationship approach: builds an explicit map of all + entity relationships, filters at the household level, and keeps all + persons in matching households to preserve entity integrity. + + Args: + entity_data: Dict mapping entity names to their MicroDataFrames + (from YearData.entity_data). + group_entities: List of group entity names for this country. + variable_name: The household-level variable to filter on. + variable_value: The value to match. Handles both str and bytes encoding. + + Returns: + A dict mapping entity names to filtered MicroDataFrames. + + Raises: + ValueError: If variable_name is not found or no households match. + """ + person_data = pd.DataFrame(entity_data["person"]) + household_data = pd.DataFrame(entity_data["household"]) + + if variable_name not in household_data.columns: + raise ValueError( + f"Variable '{variable_name}' not found in household data. 
" + f"Available columns: {list(household_data.columns)}" + ) + + # Build entity relationships + entity_rel = build_entity_relationships(person_data, group_entities) + + # Find matching household IDs + hh_values = household_data[variable_name].values + hh_ids = household_data["household_id"].values + + if isinstance(variable_value, str): + hh_mask = (hh_values == variable_value) | (hh_values == variable_value.encode()) + else: + hh_mask = hh_values == variable_value + + matching_hh_ids = set(hh_ids[hh_mask]) + + if len(matching_hh_ids) == 0: + raise ValueError( + f"No households found matching {variable_name}={variable_value}" + ) + + # Filter persons to those in matching households + person_mask = entity_rel["household_id"].isin(matching_hh_ids) + filtered_rel = entity_rel[person_mask] + + # Collect filtered IDs for each entity + filtered_ids = {"person": set(filtered_rel["person_id"])} + for entity in group_entities: + filtered_ids[entity] = set(filtered_rel[f"{entity}_id"]) + + # Filter each entity DataFrame + result = {} + for entity_name, mdf in entity_data.items(): + df = pd.DataFrame(mdf) + id_col = f"{entity_name}_id" + if entity_name in filtered_ids and id_col in df.columns: + filtered_df = df[df[id_col].isin(filtered_ids[entity_name])] + else: + if entity_name != "person": + logger.warning( + "Entity '%s' not in filtered_ids or missing '%s' column; " + "passing through unfiltered.", + entity_name, + id_col, + ) + filtered_df = df + + weight_col = f"{entity_name}_weight" + weights = weight_col if weight_col in filtered_df.columns else None + result[entity_name] = MicroDataFrame( + filtered_df.reset_index(drop=True), + weights=weights, + ) + + return result diff --git a/build/lib/policyengine/utils/parameter_labels.py b/build/lib/policyengine/utils/parameter_labels.py new file mode 100644 index 00000000..6a574be8 --- /dev/null +++ b/build/lib/policyengine/utils/parameter_labels.py @@ -0,0 +1,216 @@ +"""Utilities for generating human-readable labels for 
tax-benefit parameters.""" + +import re + + +def generate_label_for_parameter(param_node, system, scale_lookup): + """ + Generate a label for a parameter that doesn't have one. + + For breakdown parameters: Uses parent label + enum value + For bracket parameters: Uses scale label + bracket info + + Args: + param_node: The CoreParameter object + system: The tax-benefit system (has variables and parameters) + scale_lookup: Dict mapping scale names to ParameterScale objects + + Returns: + str or None: Generated label, or None if cannot generate + """ + if param_node.metadata.get("label"): + return param_node.metadata.get("label") + + param_name = param_node.name + + if "[" in param_name: + return _generate_bracket_label(param_name, scale_lookup) + + # Check for breakdown - either direct child or nested + breakdown_parent = _find_breakdown_parent(param_node) + if breakdown_parent: + return _generate_breakdown_label(param_node, system, breakdown_parent) + + return None + + +def _find_breakdown_parent(param_node): + """ + Walk up the tree to find the nearest ancestor with breakdown metadata. + + Args: + param_node: The CoreParameter object + + Returns: + The breakdown parent node, or None if not found + """ + current = param_node.parent + while current: + if current.metadata.get("breakdown"): + return current + current = getattr(current, "parent", None) + return None + + +def _generate_breakdown_label(param_node, system, breakdown_parent=None): + """ + Generate label for a breakdown parameter using enum values. + + Handles both single-level and nested breakdowns by walking up to the + breakdown parent and collecting all dimension values. 
+ + Args: + param_node: The CoreParameter object + system: The tax-benefit system + breakdown_parent: The ancestor node with breakdown metadata (optional) + + Returns: + str or None: Generated label, or None if cannot generate + """ + # Find breakdown parent if not provided + if breakdown_parent is None: + breakdown_parent = _find_breakdown_parent(param_node) + if not breakdown_parent: + return None + + parent_label = breakdown_parent.metadata.get("label") + if not parent_label: + return None + + breakdown_vars = breakdown_parent.metadata.get("breakdown", []) + breakdown_labels = breakdown_parent.metadata.get("breakdown_labels", []) + + # Collect dimension values from breakdown parent to param_node + dimension_values = _collect_dimension_values(param_node, breakdown_parent) + + if not dimension_values: + return None + + # Generate labels for each dimension + formatted_parts = [] + for i, (dim_key, dim_value) in enumerate(dimension_values): + var_name = breakdown_vars[i] if i < len(breakdown_vars) else None + dim_label = breakdown_labels[i] if i < len(breakdown_labels) else None + + formatted_value = _format_dimension_value( + dim_value, var_name, dim_label, system + ) + formatted_parts.append(formatted_value) + + return f"{parent_label} ({', '.join(formatted_parts)})" + + +def _collect_dimension_values(param_node, breakdown_parent): + """ + Collect dimension keys and values from breakdown parent to param_node. 
+ + Args: + param_node: The CoreParameter object + breakdown_parent: The ancestor node with breakdown metadata + + Returns: + list of (dimension_key, value) tuples, ordered from parent to child + """ + # Build path from param_node up to breakdown_parent + path = [] + current = param_node + while current and current != breakdown_parent: + path.append(current) + current = getattr(current, "parent", None) + + # Reverse to get parent-to-child order + path.reverse() + + # Extract dimension values + dimension_values = [] + for i, node in enumerate(path): + key = node.name.split(".")[-1] + dimension_values.append((i, key)) + + return dimension_values + + +def _format_dimension_value(value, var_name, dim_label, system): + """ + Format a single dimension value with semantic label if available. + + Args: + value: The raw dimension value (e.g., "SINGLE", "1", "CA") + var_name: The breakdown variable name (e.g., "filing_status", "range(1, 9)") + dim_label: The human-readable label for this dimension (e.g., "Household size") + system: The tax-benefit system + + Returns: + str: Formatted dimension value + """ + # First, try to get enum display value + if ( + var_name + and isinstance(var_name, str) + and not var_name.startswith("range(") + and not var_name.startswith("list(") + ): + var = system.variables.get(var_name) + if var and hasattr(var, "possible_values") and var.possible_values: + try: + enum_value = var.possible_values[value].value + return str(enum_value) + except (KeyError, AttributeError): + pass + + # For range() dimensions or when no enum found, use breakdown_label if available + if dim_label: + return f"{dim_label} {value}" + + return value + + +def _generate_bracket_label(param_name, scale_lookup): + """Generate label for a bracket parameter.""" + match = re.match(r"^(.+)\[(\d+)\]\.(\w+)$", param_name) + if not match: + return None + + scale_name = match.group(1) + bracket_index = int(match.group(2)) + field_name = match.group(3) + + scale = 
scale_lookup.get(scale_name) + if not scale: + return None + + scale_label = scale.metadata.get("label") + scale_type = scale.metadata.get("type", "") + + if not scale_label: + return None + + bracket_num = bracket_index + 1 + + if scale_type in ("marginal_rate", "marginal_amount"): + bracket_desc = f"bracket {bracket_num}" + elif scale_type == "single_amount": + bracket_desc = f"tier {bracket_num}" + else: + bracket_desc = f"bracket {bracket_num}" + + return f"{scale_label} ({bracket_desc} {field_name})" + + +def build_scale_lookup(system): + """ + Build a lookup dict mapping scale names to ParameterScale objects. + + Args: + system: The tax-benefit system + + Returns: + dict: Mapping of scale name -> ParameterScale object + """ + from policyengine_core.parameters import ParameterScale + + return { + p.name: p + for p in system.parameters.get_descendants() + if isinstance(p, ParameterScale) + } diff --git a/build/lib/policyengine/utils/parametric_reforms.py b/build/lib/policyengine/utils/parametric_reforms.py new file mode 100644 index 00000000..025df22e --- /dev/null +++ b/build/lib/policyengine/utils/parametric_reforms.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +from collections.abc import Callable +from typing import TYPE_CHECKING, Optional, Union + +from policyengine_core.periods import period + +from policyengine.core import ParameterValue + +if TYPE_CHECKING: + from policyengine.core.dynamic import Dynamic + from policyengine.core.policy import Policy + + +def reform_dict_from_parameter_values( + parameter_values: Optional[list[ParameterValue]], +) -> Optional[dict]: + """ + Convert a list of ParameterValue objects to a reform dict format. + + This format is accepted by policyengine_us.Microsimulation(reform=...) and + policyengine_uk.Microsimulation(reform=...) at construction time. + + Args: + parameter_values: List of ParameterValue objects to convert. 
+ + Returns: + A dict mapping parameter names to period-value dicts, e.g.: + { + "gov.irs.deductions.standard.amount.SINGLE": { + "2024-01-01": 29200 + } + } + """ + if not parameter_values: + return None + + reform_dict = {} + for pv in parameter_values: + param_name = pv.parameter.name + if param_name not in reform_dict: + reform_dict[param_name] = {} + + # Format the period string + period_str = pv.start_date.strftime("%Y-%m-%d") + if pv.end_date: + # Use period range format: "start.end" + period_str = f"{period_str}.{pv.end_date.strftime('%Y-%m-%d')}" + + reform_dict[param_name][period_str] = pv.value + + return reform_dict + + +def simulation_modifier_from_parameter_values( + parameter_values: list[ParameterValue], +) -> Callable: + """ + Create a simulation modifier function that applies the given parameter values to a simulation. + + Args: + parameter_values (list[ParameterValue]): List of ParameterValue objects to apply. + + Returns: + Callable: A function that takes a Simulation object and applies the parameter values. + """ + + def modifier(simulation): + for pv in parameter_values: + p = simulation.tax_benefit_system.parameters.get_child(pv.parameter.name) + start_period = period(pv.start_date.strftime("%Y-%m-%d")) + stop_period = ( + period(pv.end_date.strftime("%Y-%m-%d")) if pv.end_date else None + ) + p.update( + value=pv.value, + start=start_period, + stop=stop_period, + ) + return simulation + + return modifier + + +def build_reform_dict( + policy_or_dynamic: Optional[Union[Policy, Dynamic]], +) -> Optional[dict]: + """Extract a reform dict from a Policy or Dynamic object. + + If the object has parameter_values, converts them to reform dict format. + Returns None if the object is None or has no parameter values. + + Args: + policy_or_dynamic: A Policy or Dynamic object, or None. + + Returns: + A reform dict suitable for Microsimulation(reform=...), or None. 
+ """ + if policy_or_dynamic is None: + return None + if policy_or_dynamic.parameter_values: + return reform_dict_from_parameter_values(policy_or_dynamic.parameter_values) + return None + + +def merge_reform_dicts( + base: Optional[dict], override: Optional[dict] +) -> Optional[dict]: + """Merge two reform dicts, with override values taking precedence. + + Either or both dicts can be None. When both have entries for the same + parameter, period-level values from override replace those in base. + + Args: + base: The base reform dict (e.g., from policy). + override: The override reform dict (e.g., from dynamic). + + Returns: + The merged reform dict, or None if both inputs are None. + """ + if base is None: + return override + if override is None: + return base + + merged = {k: dict(v) for k, v in base.items()} + for param_name, period_values in override.items(): + if param_name not in merged: + merged[param_name] = {} + merged[param_name].update(period_values) + return merged diff --git a/build/lib/policyengine/utils/plotting.py b/build/lib/policyengine/utils/plotting.py new file mode 100644 index 00000000..2ca8e48c --- /dev/null +++ b/build/lib/policyengine/utils/plotting.py @@ -0,0 +1,178 @@ +"""Plotting utilities for PolicyEngine visualisations.""" + +from typing import Optional + +import plotly.graph_objects as go + +# PolicyEngine brand colours +COLORS = { + "primary": "#319795", # Teal + "primary_light": "#E6FFFA", + "primary_dark": "#1D4044", + "success": "#22C55E", # Green (positive changes) + "warning": "#FEC601", # Yellow (cautions) + "error": "#EF4444", # Red (negative changes) + "info": "#1890FF", # Blue (neutral info) + "gray_light": "#F2F4F7", + "gray": "#667085", + "gray_dark": "#101828", + "blue_secondary": "#026AA2", +} + +# Typography +FONT_FAMILY = "Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif" +FONT_SIZE_LABEL = 12 +FONT_SIZE_DEFAULT = 14 +FONT_SIZE_TITLE = 16 + + +def format_fig( + fig: go.Figure, + title: 
Optional[str] = None, + xaxis_title: Optional[str] = None, + yaxis_title: Optional[str] = None, + show_legend: bool = True, + height: Optional[int] = None, + width: Optional[int] = None, +) -> go.Figure: + """Apply PolicyEngine visual style to a plotly figure. + + Applies professional, clean styling following PolicyEngine design principles: + - Data-driven clarity prioritising immediate understanding + - Professional brand colours (teal primary, semantic colours) + - Clean typography with Inter font family + - Minimal visual clutter + - Appropriate spacing and margins + + Args: + fig: Plotly figure to format + title: Optional title to set/override + xaxis_title: Optional x-axis title to set/override + yaxis_title: Optional y-axis title to set/override + show_legend: Whether to show the legend (default: True) + height: Optional height in pixels + width: Optional width in pixels + + Returns: + Formatted plotly figure (same object, modified in place) + + Example: + >>> import plotly.graph_objects as go + >>> from policyengine.utils import format_fig + >>> fig = go.Figure(data=go.Scatter(x=[1, 2, 3], y=[4, 5, 6])) + >>> format_fig(fig, title="Example chart", xaxis_title="X", yaxis_title="Y") + """ + # Build layout updates + layout_updates = { + "font": { + "family": FONT_FAMILY, + "size": FONT_SIZE_DEFAULT, + "color": COLORS["gray_dark"], + }, + "plot_bgcolor": "#FAFAFA", + "paper_bgcolor": "white", + "margin": {"l": 100, "r": 60, "t": 100, "b": 80}, + "showlegend": show_legend, + "xaxis": { + "title": { + "font": { + "size": FONT_SIZE_DEFAULT, + "family": FONT_FAMILY, + "color": COLORS["gray_dark"], + }, + "standoff": 20, + }, + "tickfont": { + "size": FONT_SIZE_LABEL, + "family": FONT_FAMILY, + "color": COLORS["gray"], + }, + "showgrid": False, + "showline": True, + "linewidth": 2, + "linecolor": COLORS["gray_light"], + "zeroline": False, + "ticks": "outside", + "tickwidth": 1, + "tickcolor": COLORS["gray_light"], + }, + "yaxis": { + "title": { + "font": { + "size": 
FONT_SIZE_DEFAULT, + "family": FONT_FAMILY, + "color": COLORS["gray_dark"], + }, + "standoff": 20, + }, + "tickfont": { + "size": FONT_SIZE_LABEL, + "family": FONT_FAMILY, + "color": COLORS["gray"], + }, + "showgrid": True, + "gridwidth": 1, + "gridcolor": "#E5E7EB", + "showline": False, + "zeroline": False, + }, + "legend": { + "bgcolor": "white", + "bordercolor": COLORS["gray_light"], + "borderwidth": 1, + "font": {"size": FONT_SIZE_LABEL, "family": FONT_FAMILY}, + "orientation": "v", + "yanchor": "top", + "y": 0.99, + "xanchor": "right", + "x": 0.99, + }, + } + + # Add optional parameters + if title is not None: + layout_updates["title"] = { + "text": title, + "font": { + "size": 18, + "family": FONT_FAMILY, + "color": COLORS["gray_dark"], + "weight": 600, + }, + "x": 0, + "xanchor": "left", + "y": 0.98, + "yanchor": "top", + } + + if xaxis_title is not None: + layout_updates["xaxis"]["title"]["text"] = xaxis_title + + if yaxis_title is not None: + layout_updates["yaxis"]["title"]["text"] = yaxis_title + + if height is not None: + layout_updates["height"] = height + + if width is not None: + layout_updates["width"] = width + + # Apply layout + fig.update_layout(**layout_updates) + + # Update all traces to have cleaner styling + fig.update_traces( + marker=dict(size=8, line=dict(width=0)), + line=dict(width=3), + selector=dict(mode="markers+lines"), + ) + fig.update_traces( + marker=dict(size=8, line=dict(width=0)), + selector=dict(mode="markers"), + ) + fig.update_traces( + line=dict(width=3), + selector=dict(mode="lines"), + ) + + return fig From ee02e5f0ce3f67f141e124d798f3c770ab3d6209 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:28:19 -0400 Subject: [PATCH 3/4] Remove accidentally-committed build/ directory and gitignore it The previous commit picked up `build/lib/...` artifacts from a local setuptools build. These aren't source and shouldn't be tracked. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 2 +- build/lib/policyengine/__init__.py | 0 build/lib/policyengine/core/__init__.py | 50 - build/lib/policyengine/core/cache.py | 59 - build/lib/policyengine/core/dataset.py | 396 ---- .../lib/policyengine/core/dataset_version.py | 16 - build/lib/policyengine/core/dynamic.py | 47 - build/lib/policyengine/core/output.py | 26 - build/lib/policyengine/core/parameter.py | 61 - build/lib/policyengine/core/parameter_node.py | 29 - .../lib/policyengine/core/parameter_value.py | 16 - build/lib/policyengine/core/policy.py | 47 - build/lib/policyengine/core/region.py | 212 -- .../lib/policyengine/core/release_manifest.py | 432 ---- .../lib/policyengine/core/scoping_strategy.py | 224 -- build/lib/policyengine/core/simulation.py | 111 - .../policyengine/core/tax_benefit_model.py | 11 - .../core/tax_benefit_model_version.py | 208 -- build/lib/policyengine/core/trace_tro.py | 260 --- build/lib/policyengine/core/variable.py | 20 - build/lib/policyengine/countries/__init__.py | 9 - .../lib/policyengine/countries/uk/__init__.py | 5 - .../lib/policyengine/countries/uk/regions.py | 207 -- .../lib/policyengine/countries/us/__init__.py | 5 - .../countries/us/data/__init__.py | 18 - .../countries/us/data/districts.py | 64 - .../policyengine/countries/us/data/places.py | 1815 ----------------- .../policyengine/countries/us/data/states.py | 59 - .../lib/policyengine/countries/us/regions.py | 120 -- .../data/release_manifests/uk.json | 45 - .../data/release_manifests/us.json | 48 - build/lib/policyengine/outputs/__init__.py | 91 - build/lib/policyengine/outputs/aggregate.py | 112 - .../policyengine/outputs/change_aggregate.py | 170 -- .../outputs/congressional_district_impact.py | 131 -- .../outputs/constituency_impact.py | 126 -- .../lib/policyengine/outputs/decile_impact.py | 178 -- build/lib/policyengine/outputs/inequality.py | 313 --- .../outputs/intra_decile_impact.py | 180 -- .../outputs/local_authority_impact.py | 125 -- 
build/lib/policyengine/outputs/poverty.py | 462 ----- .../lib/policyengine/tax_benefit_models/uk.py | 40 - .../tax_benefit_models/uk/__init__.py | 55 - .../tax_benefit_models/uk/analysis.py | 283 --- .../tax_benefit_models/uk/datasets.py | 245 --- .../tax_benefit_models/uk/model.py | 496 ----- .../tax_benefit_models/uk/outputs.py | 105 - .../lib/policyengine/tax_benefit_models/us.py | 40 - .../tax_benefit_models/us/__init__.py | 55 - .../tax_benefit_models/us/analysis.py | 311 --- .../tax_benefit_models/us/datasets.py | 359 ---- .../tax_benefit_models/us/model.py | 650 ------ .../tax_benefit_models/us/outputs.py | 105 - build/lib/policyengine/utils/__init__.py | 7 - build/lib/policyengine/utils/dates.py | 43 - build/lib/policyengine/utils/entity_utils.py | 140 -- .../policyengine/utils/parameter_labels.py | 216 -- .../policyengine/utils/parametric_reforms.py | 131 -- build/lib/policyengine/utils/plotting.py | 178 -- 59 files changed, 1 insertion(+), 9968 deletions(-) delete mode 100644 build/lib/policyengine/__init__.py delete mode 100644 build/lib/policyengine/core/__init__.py delete mode 100644 build/lib/policyengine/core/cache.py delete mode 100644 build/lib/policyengine/core/dataset.py delete mode 100644 build/lib/policyengine/core/dataset_version.py delete mode 100644 build/lib/policyengine/core/dynamic.py delete mode 100644 build/lib/policyengine/core/output.py delete mode 100644 build/lib/policyengine/core/parameter.py delete mode 100644 build/lib/policyengine/core/parameter_node.py delete mode 100644 build/lib/policyengine/core/parameter_value.py delete mode 100644 build/lib/policyengine/core/policy.py delete mode 100644 build/lib/policyengine/core/region.py delete mode 100644 build/lib/policyengine/core/release_manifest.py delete mode 100644 build/lib/policyengine/core/scoping_strategy.py delete mode 100644 build/lib/policyengine/core/simulation.py delete mode 100644 build/lib/policyengine/core/tax_benefit_model.py delete mode 100644 
build/lib/policyengine/core/tax_benefit_model_version.py delete mode 100644 build/lib/policyengine/core/trace_tro.py delete mode 100644 build/lib/policyengine/core/variable.py delete mode 100644 build/lib/policyengine/countries/__init__.py delete mode 100644 build/lib/policyengine/countries/uk/__init__.py delete mode 100644 build/lib/policyengine/countries/uk/regions.py delete mode 100644 build/lib/policyengine/countries/us/__init__.py delete mode 100644 build/lib/policyengine/countries/us/data/__init__.py delete mode 100644 build/lib/policyengine/countries/us/data/districts.py delete mode 100644 build/lib/policyengine/countries/us/data/places.py delete mode 100644 build/lib/policyengine/countries/us/data/states.py delete mode 100644 build/lib/policyengine/countries/us/regions.py delete mode 100644 build/lib/policyengine/data/release_manifests/uk.json delete mode 100644 build/lib/policyengine/data/release_manifests/us.json delete mode 100644 build/lib/policyengine/outputs/__init__.py delete mode 100644 build/lib/policyengine/outputs/aggregate.py delete mode 100644 build/lib/policyengine/outputs/change_aggregate.py delete mode 100644 build/lib/policyengine/outputs/congressional_district_impact.py delete mode 100644 build/lib/policyengine/outputs/constituency_impact.py delete mode 100644 build/lib/policyengine/outputs/decile_impact.py delete mode 100644 build/lib/policyengine/outputs/inequality.py delete mode 100644 build/lib/policyengine/outputs/intra_decile_impact.py delete mode 100644 build/lib/policyengine/outputs/local_authority_impact.py delete mode 100644 build/lib/policyengine/outputs/poverty.py delete mode 100644 build/lib/policyengine/tax_benefit_models/uk.py delete mode 100644 build/lib/policyengine/tax_benefit_models/uk/__init__.py delete mode 100644 build/lib/policyengine/tax_benefit_models/uk/analysis.py delete mode 100644 build/lib/policyengine/tax_benefit_models/uk/datasets.py delete mode 100644 build/lib/policyengine/tax_benefit_models/uk/model.py 
delete mode 100644 build/lib/policyengine/tax_benefit_models/uk/outputs.py delete mode 100644 build/lib/policyengine/tax_benefit_models/us.py delete mode 100644 build/lib/policyengine/tax_benefit_models/us/__init__.py delete mode 100644 build/lib/policyengine/tax_benefit_models/us/analysis.py delete mode 100644 build/lib/policyengine/tax_benefit_models/us/datasets.py delete mode 100644 build/lib/policyengine/tax_benefit_models/us/model.py delete mode 100644 build/lib/policyengine/tax_benefit_models/us/outputs.py delete mode 100644 build/lib/policyengine/utils/__init__.py delete mode 100644 build/lib/policyengine/utils/dates.py delete mode 100644 build/lib/policyengine/utils/entity_utils.py delete mode 100644 build/lib/policyengine/utils/parameter_labels.py delete mode 100644 build/lib/policyengine/utils/parametric_reforms.py delete mode 100644 build/lib/policyengine/utils/plotting.py diff --git a/.gitignore b/.gitignore index 57a0fc21..b7c5f008 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ *.ipynb _build/ .env -**/.DS_Store \ No newline at end of file +**/.DS_Store +build/ diff --git a/build/lib/policyengine/__init__.py b/build/lib/policyengine/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/build/lib/policyengine/core/__init__.py b/build/lib/policyengine/core/__init__.py deleted file mode 100644 index bb0e80d5..00000000 --- a/build/lib/policyengine/core/__init__.py +++ /dev/null @@ -1,50 +0,0 @@ -from .dataset import Dataset -from .dataset import YearData as YearData -from .dataset import map_to_entity as map_to_entity -from .dataset_version import DatasetVersion as DatasetVersion -from .dynamic import Dynamic as Dynamic -from .output import Output as Output -from .output import OutputCollection as OutputCollection -from .parameter import Parameter as Parameter -from .parameter_node import ParameterNode as ParameterNode -from .parameter_value import ParameterValue as ParameterValue -from .policy import Policy as Policy -from 
.region import Region as Region -from .region import RegionRegistry as RegionRegistry -from .region import RegionType as RegionType -from .release_manifest import CertifiedDataArtifact as CertifiedDataArtifact -from .release_manifest import CountryReleaseManifest as CountryReleaseManifest -from .release_manifest import DataBuildInfo as DataBuildInfo -from .release_manifest import DataCertification as DataCertification -from .release_manifest import DataPackageVersion as DataPackageVersion -from .release_manifest import DataReleaseArtifact as DataReleaseArtifact -from .release_manifest import DataReleaseManifest as DataReleaseManifest -from .release_manifest import PackageVersion as PackageVersion -from .release_manifest import ( - certify_data_release_compatibility as certify_data_release_compatibility, -) -from .release_manifest import get_data_release_manifest as get_data_release_manifest -from .release_manifest import get_release_manifest as get_release_manifest -from .release_manifest import ( - resolve_managed_dataset_reference as resolve_managed_dataset_reference, -) -from .scoping_strategy import RegionScopingStrategy as RegionScopingStrategy -from .scoping_strategy import RowFilterStrategy as RowFilterStrategy -from .scoping_strategy import ScopingStrategy as ScopingStrategy -from .scoping_strategy import ( - WeightReplacementStrategy as WeightReplacementStrategy, -) -from .simulation import Simulation as Simulation -from .tax_benefit_model import TaxBenefitModel as TaxBenefitModel -from .tax_benefit_model_version import ( - TaxBenefitModelVersion as TaxBenefitModelVersion, -) -from .variable import Variable as Variable - -# Rebuild models to resolve forward references -Dataset.model_rebuild() -TaxBenefitModelVersion.model_rebuild() -Variable.model_rebuild() -Parameter.model_rebuild() -ParameterNode.model_rebuild() -ParameterValue.model_rebuild() diff --git a/build/lib/policyengine/core/cache.py b/build/lib/policyengine/core/cache.py deleted file mode 
100644 index 410301e4..00000000 --- a/build/lib/policyengine/core/cache.py +++ /dev/null @@ -1,59 +0,0 @@ -import logging -from collections import OrderedDict -from typing import Generic, Optional, TypeVar - -import psutil - -logger = logging.getLogger(__name__) - -_MEMORY_THRESHOLDS_GB = [8, 16, 32] -_warned_thresholds: set[int] = set() - -T = TypeVar("T") - - -class LRUCache(Generic[T]): - """Least-recently-used cache with configurable size limit and memory monitoring.""" - - def __init__(self, max_size: int = 100): - self._max_size = max_size - self._cache: OrderedDict[str, T] = OrderedDict() - - def get(self, key: str) -> Optional[T]: - """Get item from cache, marking it as recently used.""" - if key not in self._cache: - return None - self._cache.move_to_end(key) - return self._cache[key] - - def add(self, key: str, value: T) -> None: - """Add item to cache with LRU eviction when full.""" - if key in self._cache: - self._cache.move_to_end(key) - else: - self._cache[key] = value - if len(self._cache) > self._max_size: - self._cache.popitem(last=False) - - self._check_memory_usage() - - def clear(self) -> None: - """Clear all items from cache.""" - self._cache.clear() - _warned_thresholds.clear() - - def __len__(self) -> int: - return len(self._cache) - - def _check_memory_usage(self) -> None: - """Check memory usage and warn at threshold crossings.""" - process = psutil.Process() - memory_gb = process.memory_info().rss / (1024**3) - - for threshold in _MEMORY_THRESHOLDS_GB: - if memory_gb >= threshold and threshold not in _warned_thresholds: - logger.warning( - f"Memory usage has reached {memory_gb:.2f}GB (threshold: {threshold}GB). " - f"Cache contains {len(self._cache)} items." 
- ) - _warned_thresholds.add(threshold) diff --git a/build/lib/policyengine/core/dataset.py b/build/lib/policyengine/core/dataset.py deleted file mode 100644 index 27f51d16..00000000 --- a/build/lib/policyengine/core/dataset.py +++ /dev/null @@ -1,396 +0,0 @@ -from typing import Optional -from uuid import uuid4 - -import numpy as np -import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel, ConfigDict, Field - -from .dataset_version import DatasetVersion -from .tax_benefit_model import TaxBenefitModel - - -class YearData(BaseModel): - """Base class for entity-level data for a single year.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - @property - def entity_data(self) -> dict[str, MicroDataFrame]: - """Return a dictionary of entity names to their data. - - This should be implemented by subclasses to return the appropriate entities. - """ - raise NotImplementedError("Subclasses must implement entity_data property") - - @property - def person_entity(self) -> str: - """Return the name of the person-level entity. - - Defaults to 'person' but can be overridden by subclasses. - """ - return "person" - - def map_to_entity( - self, - source_entity: str, - target_entity: str, - columns: list[str] = None, - values: list = None, - how: str = "sum", - ) -> MicroDataFrame: - """Map data from source entity to target entity using join keys. - - Args: - source_entity (str): The source entity name. - target_entity (str): The target entity name. - columns (list[str], optional): List of column names to map. If None, maps all columns. - values (list, optional): List of values to use instead of column data. - how (str): Aggregation method ('sum' or 'first') when mapping to higher-level entities (default 'sum'). - - Returns: - MicroDataFrame: The mapped data at the target entity level. - - Raises: - ValueError: If source or target entity is invalid. 
- """ - return map_to_entity( - entity_data=self.entity_data, - source_entity=source_entity, - target_entity=target_entity, - person_entity=self.person_entity, - columns=columns, - values=values, - how=how, - ) - - -class Dataset(BaseModel): - """Base class for datasets. - - The data field contains entity-level data as a BaseModel with DataFrame fields. - - Example: - class YearData(BaseModel): - model_config = ConfigDict(arbitrary_types_allowed=True) - person: pd.DataFrame - household: pd.DataFrame - - class MyDataset(Dataset): - data: Optional[YearData] = None - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - id: str = Field(default_factory=lambda: str(uuid4())) - name: str - description: str - dataset_version: Optional[DatasetVersion] = None - filepath: str - is_output_dataset: bool = False - tax_benefit_model: Optional[TaxBenefitModel] = None - year: int - - data: Optional[BaseModel] = None - - -def map_to_entity( - entity_data: dict[str, MicroDataFrame], - source_entity: str, - target_entity: str, - person_entity: str = "person", - columns: Optional[list[str]] = None, - values: Optional[np.ndarray] = None, - how: str = "sum", -) -> MicroDataFrame: - """Map data from source entity to target entity using join keys. - - This is a generic entity mapping utility that handles: - - Same entity mapping (returns as is) - - Person to group entity mapping (aggregates values) - - Group to person entity mapping (expands values) - - Group to group entity mapping (aggregates through person entity) - - Args: - entity_data: Dictionary mapping entity names to their MicroDataFrame data - source_entity: The source entity name - target_entity: The target entity name - person_entity: The name of the person entity (default "person") - columns: List of column names to map. If None, maps all columns - values: List of values to use instead of column data. 
If provided, creates a single unnamed column - how: Aggregation method (default 'sum') - - For person → group: 'sum' (aggregate), 'first' (take first value) - - For group → person: 'project' (broadcast), 'divide' (split equally) - - For group → group: 'sum', 'first', 'project', 'divide' - - Returns: - MicroDataFrame: The mapped data at the target entity level - - Raises: - ValueError: If source or target entity is invalid or unsupported aggregation method - """ - valid_entities = set(entity_data.keys()) - - if source_entity not in valid_entities: - raise ValueError( - f"Invalid source entity '{source_entity}'. Must be one of {valid_entities}" - ) - if target_entity not in valid_entities: - raise ValueError( - f"Invalid target entity '{target_entity}'. Must be one of {valid_entities}" - ) - - # Get source data (convert to plain DataFrame to avoid weighted operations during mapping) - source_df = pd.DataFrame(entity_data[source_entity]) - - # Track if we should return a MicroSeries (values is a numpy array, not a list) - return_series = values is not None - - # Handle values parameter - create a temporary column with the provided values - if values is not None: - if len(values) != len(source_df): - raise ValueError( - f"Length of values ({len(values)}) must match source entity length ({len(source_df)})" - ) - # Create a temporary DataFrame with just ID columns and the values column - id_cols = {col for col in source_df.columns if col.endswith("_id")} - source_df = source_df[[col for col in id_cols]] - source_df["__mapped_value"] = values - columns = ["__mapped_value"] - - if columns: - # Select only requested columns (keep all ID columns for joins) - id_cols = {col for col in source_df.columns if col.endswith("_id")} - cols_to_keep = list(set(columns) | id_cols) - source_df = source_df[cols_to_keep] - - # Determine weight column for target entity - target_weight = f"{target_entity}_weight" - - # Same entity - return as is - if source_entity == target_entity: - result 
= MicroDataFrame(source_df, weights=target_weight) - if return_series: - return result["__mapped_value"] - return result - - # Get target data and key - target_df = entity_data[target_entity] - target_key = f"{target_entity}_id" - - # Person to group entity: aggregate person-level data to group level - if source_entity == person_entity and target_entity != person_entity: - # Check for both naming patterns: "entity_id" and "person_entity_id" - person_target_key = f"{person_entity}_{target_entity}_id" - join_key = ( - person_target_key if person_target_key in source_df.columns else target_key - ) - - if join_key in source_df.columns: - # Get columns to aggregate (exclude ID and weight columns) - id_cols = {col for col in source_df.columns if col.endswith("_id")} - weight_cols = {col for col in source_df.columns if col.endswith("_weight")} - agg_cols = [ - c - for c in source_df.columns - if c not in id_cols and c not in weight_cols - ] - - # Group by join key and aggregate - if how == "sum": - aggregated = source_df.groupby(join_key, as_index=False)[agg_cols].sum() - elif how == "first": - aggregated = source_df.groupby(join_key, as_index=False)[ - agg_cols - ].first() - else: - raise ValueError(f"Unsupported aggregation method: {how}") - - # Rename join key to target key if needed - if join_key != target_key: - aggregated = aggregated.rename(columns={join_key: target_key}) - - # Merge with target, preserving original order - target_pd = pd.DataFrame(target_df)[[target_key, target_weight]] - target_pd = target_pd.reset_index(drop=False) - result = target_pd.merge(aggregated, on=target_key, how="left") - - # Sort back to original order - result = ( - result.sort_values("index").drop("index", axis=1).reset_index(drop=True) - ) - - # Fill NaN with 0 for groups with no members in source entity - result[agg_cols] = result[agg_cols].fillna(0) - - result_df = MicroDataFrame(result, weights=target_weight) - if return_series: - return result_df["__mapped_value"] - return 
result_df - - # Group entity to person: expand group-level data to person level - if source_entity != person_entity and target_entity == person_entity: - # Default to 'project' (broadcast) for group -> person if 'sum' was provided - if how == "sum": - how = "project" - - source_key = f"{source_entity}_id" - # Check for both naming patterns - person_source_key = f"{person_entity}_{source_entity}_id" - - target_pd = pd.DataFrame(target_df) - join_key = ( - person_source_key if person_source_key in target_pd.columns else source_key - ) - - if join_key in target_pd.columns: - # Rename source key to match join key if needed - if join_key != source_key and source_key in source_df.columns: - source_df = source_df.rename(columns={source_key: join_key}) - - result = target_pd.merge(source_df, on=join_key, how="left") - - # Handle divide operation - if how == "divide": - # Get columns to divide (exclude ID and weight columns) - id_cols = {col for col in result.columns if col.endswith("_id")} - weight_cols = {col for col in result.columns if col.endswith("_weight")} - value_cols = [ - c - for c in result.columns - if c not in id_cols and c not in weight_cols - ] - - # Count members in each group - group_counts = ( - target_pd.groupby(join_key, as_index=False) - .size() - .rename(columns={"size": "__group_count"}) - ) - result = result.merge(group_counts, on=join_key, how="left") - - # Divide values by group count - for col in value_cols: - result[col] = result[col] / result["__group_count"] - - result = result.drop(columns=["__group_count"]) - elif how not in ["project"]: - raise ValueError( - f"Unsupported aggregation method for group->person: {how}. Use 'project' or 'divide'." 
- ) - - result_df = MicroDataFrame(result, weights=target_weight) - if return_series: - return result_df["__mapped_value"] - return result_df - - # Group to group: go through person table - if source_entity != person_entity and target_entity != person_entity: - # Get person link table with both entity IDs - person_df = pd.DataFrame(entity_data[person_entity]) - source_key = f"{source_entity}_id" - - # Check for both naming patterns for person-level links - person_source_key = f"{person_entity}_{source_entity}_id" - person_target_key = f"{person_entity}_{target_entity}_id" - - # Determine which keys exist in person table - source_link_key = ( - person_source_key if person_source_key in person_df.columns else source_key - ) - target_link_key = ( - person_target_key if person_target_key in person_df.columns else target_key - ) - - # Link source -> person -> target - if ( - source_link_key in person_df.columns - and target_link_key in person_df.columns - ): - person_link = person_df[ - [source_link_key, target_link_key] - ].drop_duplicates() - - # Rename source key to match link key if needed - source_df_copy = source_df.copy() - if source_link_key != source_key and source_key in source_df_copy.columns: - source_df_copy = source_df_copy.rename( - columns={source_key: source_link_key} - ) - - # Join source data with target key - source_with_target = source_df_copy.merge( - person_link, on=source_link_key, how="left" - ) - - # Aggregate to target level - id_cols = {col for col in source_with_target.columns if col.endswith("_id")} - weight_cols = { - col for col in source_with_target.columns if col.endswith("_weight") - } - agg_cols = [ - c - for c in source_with_target.columns - if c not in id_cols and c not in weight_cols - ] - - if how == "sum": - aggregated = source_with_target.groupby( - target_link_key, as_index=False - )[agg_cols].sum() - elif how == "first": - aggregated = source_with_target.groupby( - target_link_key, as_index=False - )[agg_cols].first() - elif 
how == "project": - # Just take first value (broadcast to target groups) - aggregated = source_with_target.groupby( - target_link_key, as_index=False - )[agg_cols].first() - elif how == "divide": - # Count persons in each source group - source_group_counts = ( - person_df.groupby(source_link_key, as_index=False) - .size() - .rename(columns={"size": "__source_count"}) - ) - source_with_target = source_with_target.merge( - source_group_counts, on=source_link_key, how="left" - ) - - # Divide values by source group count (per-person share) - for col in agg_cols: - source_with_target[col] = ( - source_with_target[col] / source_with_target["__source_count"] - ) - - # Now aggregate (sum of per-person shares) to target level - aggregated = source_with_target.groupby( - target_link_key, as_index=False - )[agg_cols].sum() - else: - raise ValueError(f"Unsupported aggregation method: {how}") - - # Rename target link key to target key if needed - if target_link_key != target_key: - aggregated = aggregated.rename(columns={target_link_key: target_key}) - - # Merge with target, preserving original order - target_pd = pd.DataFrame(target_df)[[target_key, target_weight]] - target_pd = target_pd.reset_index(drop=False) - result = target_pd.merge(aggregated, on=target_key, how="left") - - # Sort back to original order - result = ( - result.sort_values("index").drop("index", axis=1).reset_index(drop=True) - ) - - # Fill NaN with 0 - result[agg_cols] = result[agg_cols].fillna(0) - - result_df = MicroDataFrame(result, weights=target_weight) - if return_series: - return result_df["__mapped_value"] - return result_df - - raise ValueError(f"Unsupported mapping from {source_entity} to {target_entity}") diff --git a/build/lib/policyengine/core/dataset_version.py b/build/lib/policyengine/core/dataset_version.py deleted file mode 100644 index 711cd7d7..00000000 --- a/build/lib/policyengine/core/dataset_version.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import TYPE_CHECKING -from uuid import 
uuid4 - -from pydantic import BaseModel, Field - -from .tax_benefit_model import TaxBenefitModel - -if TYPE_CHECKING: - from .dataset import Dataset - - -class DatasetVersion(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - dataset: "Dataset" - description: str - tax_benefit_model: TaxBenefitModel = None diff --git a/build/lib/policyengine/core/dynamic.py b/build/lib/policyengine/core/dynamic.py deleted file mode 100644 index d707b9b2..00000000 --- a/build/lib/policyengine/core/dynamic.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections.abc import Callable -from datetime import datetime -from typing import Optional -from uuid import uuid4 - -from pydantic import BaseModel, Field - -from .parameter_value import ParameterValue - - -class Dynamic(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - name: str - description: Optional[str] = None - parameter_values: list[ParameterValue] = [] - simulation_modifier: Optional[Callable] = None - created_at: datetime = Field(default_factory=datetime.now) - updated_at: datetime = Field(default_factory=datetime.now) - - def __add__(self, other: "Dynamic") -> "Dynamic": - """Combine two dynamics by appending parameter values and chaining simulation modifiers.""" - if not isinstance(other, Dynamic): - return NotImplemented - - # Combine simulation modifiers - combined_modifier = None - if ( - self.simulation_modifier is not None - and other.simulation_modifier is not None - ): - - def combined_modifier(sim): - sim = self.simulation_modifier(sim) - sim = other.simulation_modifier(sim) - return sim - - elif self.simulation_modifier is not None: - combined_modifier = self.simulation_modifier - elif other.simulation_modifier is not None: - combined_modifier = other.simulation_modifier - - return Dynamic( - name=f"{self.name} + {other.name}", - description=f"Combined dynamic: {self.name} and {other.name}", - parameter_values=self.parameter_values + other.parameter_values, - 
simulation_modifier=combined_modifier, - ) diff --git a/build/lib/policyengine/core/output.py b/build/lib/policyengine/core/output.py deleted file mode 100644 index e71634ab..00000000 --- a/build/lib/policyengine/core/output.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import Generic, List, TypeVar - -import pandas as pd -from pydantic import BaseModel, ConfigDict - -T = TypeVar("T", bound="Output") - - -class Output(BaseModel): - """Base class for all output templates.""" - - def run(self): - """Calculate and populate the output fields. - - Must be implemented by subclasses. - """ - raise NotImplementedError("Subclasses must implement run()") - - -class OutputCollection(BaseModel, Generic[T]): - """Container for a collection of outputs with their DataFrame representation.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - outputs: List[T] - dataframe: pd.DataFrame diff --git a/build/lib/policyengine/core/parameter.py b/build/lib/policyengine/core/parameter.py deleted file mode 100644 index 49f2b282..00000000 --- a/build/lib/policyengine/core/parameter.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import TYPE_CHECKING, Any, Optional -from uuid import uuid4 - -from pydantic import BaseModel, Field, PrivateAttr - -from .parameter_value import ParameterValue -from .tax_benefit_model_version import TaxBenefitModelVersion - -if TYPE_CHECKING: - from .parameter_value import ParameterValue - - -class Parameter(BaseModel): - model_config = {"arbitrary_types_allowed": True} - - id: str = Field(default_factory=lambda: str(uuid4())) - name: str - label: Optional[str] = None - description: Optional[str] = None - data_type: Optional[type] = None - tax_benefit_model_version: TaxBenefitModelVersion - unit: Optional[str] = None - - # Lazy loading: store core param ref, build values on demand - _core_param: Any = PrivateAttr(default=None) - _parameter_values: Optional[list["ParameterValue"]] = PrivateAttr(default=None) - - def __init__(self, _core_param: Any = 
None, **data): - super().__init__(**data) - self._core_param = _core_param - self._parameter_values = None - - @property - def parameter_values(self) -> list["ParameterValue"]: - """Lazily build parameter values on first access.""" - if self._parameter_values is None: - self._parameter_values = [] - if self._core_param is not None: - from policyengine.utils import parse_safe_date - - for i in range(len(self._core_param.values_list)): - param_at_instant = self._core_param.values_list[i] - if i + 1 < len(self._core_param.values_list): - next_instant = self._core_param.values_list[i + 1] - else: - next_instant = None - pv = ParameterValue( - parameter=self, - start_date=parse_safe_date(param_at_instant.instant_str), - end_date=parse_safe_date(next_instant.instant_str) - if next_instant - else None, - value=param_at_instant.value, - ) - self._parameter_values.append(pv) - return self._parameter_values - - @parameter_values.setter - def parameter_values(self, value: list["ParameterValue"]) -> None: - """Allow direct setting of parameter values.""" - self._parameter_values = value diff --git a/build/lib/policyengine/core/parameter_node.py b/build/lib/policyengine/core/parameter_node.py deleted file mode 100644 index 54d384a5..00000000 --- a/build/lib/policyengine/core/parameter_node.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import TYPE_CHECKING, Optional -from uuid import uuid4 - -from pydantic import BaseModel, Field - -if TYPE_CHECKING: - from .tax_benefit_model_version import TaxBenefitModelVersion - - -class ParameterNode(BaseModel): - """Represents a folder/category node in the parameter hierarchy. - - Parameter nodes are intermediate nodes in the parameter tree (e.g., "gov", - "gov.hmrc", "gov.hmrc.income_tax"). They provide structure and human-readable - labels for navigating the parameter tree, but don't have values themselves. - - Unlike Parameter objects (which are leaf nodes with actual values), - ParameterNode objects are purely organizational. 
- """ - - model_config = {"arbitrary_types_allowed": True} - - id: str = Field(default_factory=lambda: str(uuid4())) - name: str = Field(description="Full path of the node (e.g., 'gov.hmrc')") - label: Optional[str] = Field( - default=None, description="Human-readable label (e.g., 'HMRC')" - ) - description: Optional[str] = Field(default=None, description="Node description") - tax_benefit_model_version: "TaxBenefitModelVersion" diff --git a/build/lib/policyengine/core/parameter_value.py b/build/lib/policyengine/core/parameter_value.py deleted file mode 100644 index a51ffeb0..00000000 --- a/build/lib/policyengine/core/parameter_value.py +++ /dev/null @@ -1,16 +0,0 @@ -from datetime import datetime -from typing import TYPE_CHECKING, Optional, Union -from uuid import uuid4 - -from pydantic import BaseModel, Field - -if TYPE_CHECKING: - from .parameter import Parameter - - -class ParameterValue(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - parameter: "Optional[Parameter]" = None - value: Optional[Union[float, int, str, bool, list]] = None - start_date: datetime - end_date: Optional[datetime] = None diff --git a/build/lib/policyengine/core/policy.py b/build/lib/policyengine/core/policy.py deleted file mode 100644 index 3860a817..00000000 --- a/build/lib/policyengine/core/policy.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections.abc import Callable -from datetime import datetime -from typing import Optional -from uuid import uuid4 - -from pydantic import BaseModel, Field - -from .parameter_value import ParameterValue - - -class Policy(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - name: str - description: Optional[str] = None - parameter_values: list[ParameterValue] = [] - simulation_modifier: Optional[Callable] = None - created_at: datetime = Field(default_factory=datetime.now) - updated_at: datetime = Field(default_factory=datetime.now) - - def __add__(self, other: "Policy") -> "Policy": - """Combine two policies by 
appending parameter values and chaining simulation modifiers.""" - if not isinstance(other, Policy): - return NotImplemented - - # Combine simulation modifiers - combined_modifier = None - if ( - self.simulation_modifier is not None - and other.simulation_modifier is not None - ): - - def combined_modifier(sim): - sim = self.simulation_modifier(sim) - sim = other.simulation_modifier(sim) - return sim - - elif self.simulation_modifier is not None: - combined_modifier = self.simulation_modifier - elif other.simulation_modifier is not None: - combined_modifier = other.simulation_modifier - - return Policy( - name=f"{self.name} + {other.name}", - description=f"Combined policy: {self.name} and {other.name}", - parameter_values=self.parameter_values + other.parameter_values, - simulation_modifier=combined_modifier, - ) diff --git a/build/lib/policyengine/core/region.py b/build/lib/policyengine/core/region.py deleted file mode 100644 index 7ff55a64..00000000 --- a/build/lib/policyengine/core/region.py +++ /dev/null @@ -1,212 +0,0 @@ -"""Region definitions for geographic simulations. - -This module provides the Region and RegionRegistry classes for defining -geographic regions that a tax-benefit model supports. Regions can have: -1. A dedicated dataset (e.g., US states, congressional districts) -2. Filter from a parent region's dataset (e.g., US places/cities, UK countries) -""" - -from typing import Literal, Optional, Union - -from pydantic import BaseModel, Field, PrivateAttr - -from .scoping_strategy import ScopingStrategy - -# Region type literals for US and UK -USRegionType = Literal["national", "state", "congressional_district", "place"] -UKRegionType = Literal["national", "country", "constituency", "local_authority"] -RegionType = Union[USRegionType, UKRegionType] - - -class Region(BaseModel): - """Geographic region for tax-benefit simulations. - - Regions can either have: - 1. A dedicated dataset (dataset_path is set, requires_filter is False) - 2. 
Filter from a parent region's dataset (requires_filter is True) - - The unique identifier is the code field, which uses a prefixed format: - - National: "us", "uk" - - State: "state/ca", "state/ny" - - Congressional District: "congressional_district/CA-01" - - Place: "place/NJ-57000" - - UK Country: "country/england" - - Constituency: "constituency/Sheffield Central" - - Local Authority: "local_authority/E09000001" - """ - - # Core identification - code: str = Field( - ..., - description="Unique region code with type prefix (e.g., 'state/ca', 'place/NJ-57000')", - ) - label: str = Field(..., description="Human-readable label (e.g., 'California')") - region_type: RegionType = Field( - ..., description="Type of region (e.g., 'state', 'place')" - ) - - # Hierarchy - parent_code: Optional[str] = Field( - default=None, - description="Code of parent region (e.g., 'us' for states, 'state/nj' for places in New Jersey)", - ) - - # Dataset configuration - dataset_path: Optional[str] = Field( - default=None, - description="GCS path to dedicated dataset (e.g., 'gs://policyengine-us-data/states/CA.h5')", - ) - - # Scoping strategy (preferred over legacy filter fields) - scoping_strategy: Optional[ScopingStrategy] = Field( - default=None, - description="Strategy for scoping dataset to this region (row filtering or weight replacement)", - ) - - # Legacy filtering configuration (kept for backward compatibility) - requires_filter: bool = Field( - default=False, - description="True if this region filters from a parent dataset rather than having its own", - ) - filter_field: Optional[str] = Field( - default=None, - description="Dataset field to filter on (e.g., 'place_fips', 'country')", - ) - filter_value: Optional[str] = Field( - default=None, - description="Value to match when filtering (defaults to code suffix if not set)", - ) - - # Metadata (primarily for US congressional districts) - state_code: Optional[str] = Field( - default=None, description="Two-letter state code (e.g., 
'CA', 'NJ')" - ) - state_name: Optional[str] = Field( - default=None, - description="Full state name (e.g., 'California', 'New Jersey')", - ) - - def __hash__(self) -> int: - """Hash by code for use in sets and dict keys.""" - return hash(self.code) - - def __eq__(self, other: object) -> bool: - """Equality by code.""" - if not isinstance(other, Region): - return False - return self.code == other.code - - -class RegionRegistry(BaseModel): - """Registry of all regions for a country model. - - Provides indexed lookups for regions by code and type. - Indices are rebuilt automatically after initialization. - """ - - country_id: str = Field(..., description="Country identifier (e.g., 'us', 'uk')") - regions: list[Region] = Field(default_factory=list) - - # Private indexed lookups (excluded from serialization) - _by_code: dict[str, Region] = PrivateAttr(default_factory=dict) - _by_type: dict[str, list[Region]] = PrivateAttr(default_factory=dict) - - def model_post_init(self, __context: object) -> None: - """Build lookup indices after initialization.""" - self._rebuild_indices() - - def _rebuild_indices(self) -> None: - """Rebuild all lookup indices from the regions list.""" - self._by_code = {} - self._by_type = {} - - for region in self.regions: - # Index by code - self._by_code[region.code] = region - - # Index by type - if region.region_type not in self._by_type: - self._by_type[region.region_type] = [] - self._by_type[region.region_type].append(region) - - def add_region(self, region: Region) -> None: - """Add a region to the registry and update indices.""" - self.regions.append(region) - self._by_code[region.code] = region - if region.region_type not in self._by_type: - self._by_type[region.region_type] = [] - self._by_type[region.region_type].append(region) - - def get(self, code: str) -> Optional[Region]: - """Get a region by its code. 
- - Args: - code: Region code (e.g., 'state/ca', 'place/NJ-57000') - - Returns: - The Region if found, None otherwise - """ - return self._by_code.get(code) - - def get_by_type(self, region_type: str) -> list[Region]: - """Get all regions of a given type. - - Args: - region_type: Type to filter by (e.g., 'state', 'place') - - Returns: - List of regions with the given type - """ - return self._by_type.get(region_type, []) - - def get_national(self) -> Optional[Region]: - """Get the national-level region. - - Returns: - The national Region if found, None otherwise - """ - national = self.get_by_type("national") - return national[0] if national else None - - def get_children(self, parent_code: str) -> list[Region]: - """Get all regions with a given parent code. - - Args: - parent_code: Parent region code to filter by - - Returns: - List of regions with the given parent - """ - return [r for r in self.regions if r.parent_code == parent_code] - - def get_dataset_regions(self) -> list[Region]: - """Get all regions that have dedicated datasets. - - Returns: - List of regions with dataset_path set and requires_filter False - """ - return [ - r - for r in self.regions - if r.dataset_path is not None and not r.requires_filter - ] - - def get_filter_regions(self) -> list[Region]: - """Get all regions that require filtering from parent datasets. 
- - Returns: - List of regions with requires_filter True - """ - return [r for r in self.regions if r.requires_filter] - - def __len__(self) -> int: - """Return the number of regions in the registry.""" - return len(self.regions) - - def __iter__(self): - """Iterate over regions.""" - return iter(self.regions) - - def __contains__(self, code: str) -> bool: - """Check if a region code exists in the registry.""" - return code in self._by_code diff --git a/build/lib/policyengine/core/release_manifest.py b/build/lib/policyengine/core/release_manifest.py deleted file mode 100644 index 90a09f32..00000000 --- a/build/lib/policyengine/core/release_manifest.py +++ /dev/null @@ -1,432 +0,0 @@ -import os -from functools import lru_cache -from importlib import import_module -from importlib.resources import files -from pathlib import Path -from typing import Optional - -import requests -from pydantic import BaseModel, Field - -HF_REQUEST_TIMEOUT_SECONDS = 30 -LOCAL_DATA_REPO_HINTS = { - "us": ("policyengine_us", "policyengine-us-data", "policyengine_us_data"), - "uk": ("policyengine_uk", "policyengine-uk-data", "policyengine_uk_data"), -} - - -class DataReleaseManifestUnavailableError(ValueError): - """Raised when a data release manifest cannot be fetched or is absent.""" - - -class PackageVersion(BaseModel): - name: str - version: str - - -class DataPackageVersion(PackageVersion): - repo_id: str - repo_type: str = "model" - release_manifest_path: str = "release_manifest.json" - - -class CompatibleModelPackage(BaseModel): - name: str - specifier: str - - -class BuiltWithModelPackage(PackageVersion): - git_sha: Optional[str] = None - data_build_fingerprint: Optional[str] = None - - -class DataBuildInfo(BaseModel): - build_id: Optional[str] = None - built_at: Optional[str] = None - built_with_model_package: Optional[BuiltWithModelPackage] = None - - -class ArtifactPathReference(BaseModel): - path: str - - -class ArtifactPathTemplate(BaseModel): - path_template: str - - def 
resolve(self, **kwargs: str) -> str: - return self.path_template.format(**kwargs) - - -class DataReleaseArtifact(BaseModel): - kind: str - path: str - repo_id: str - revision: str - sha256: Optional[str] = None - size_bytes: Optional[int] = None - - @property - def uri(self) -> str: - return build_hf_uri( - repo_id=self.repo_id, - path_in_repo=self.path, - revision=self.revision, - ) - - -class DataReleaseManifest(BaseModel): - schema_version: int - data_package: PackageVersion - compatible_model_packages: list[CompatibleModelPackage] = Field( - default_factory=list - ) - default_datasets: dict[str, str] = Field(default_factory=dict) - build: Optional[DataBuildInfo] = None - artifacts: dict[str, DataReleaseArtifact] = Field(default_factory=dict) - - -class DataCertification(BaseModel): - compatibility_basis: str - certified_for_model_version: str - data_build_id: Optional[str] = None - built_with_model_version: Optional[str] = None - built_with_model_git_sha: Optional[str] = None - data_build_fingerprint: Optional[str] = None - certified_by: Optional[str] = None - - -class CertifiedDataArtifact(BaseModel): - data_package: Optional[PackageVersion] = None - dataset: str - uri: str - sha256: Optional[str] = None - build_id: Optional[str] = None - - -class CountryReleaseManifest(BaseModel): - schema_version: int = 1 - bundle_id: Optional[str] = None - published_at: Optional[str] = None - country_id: str - policyengine_version: str - model_package: PackageVersion - data_package: DataPackageVersion - default_dataset: str - datasets: dict[str, ArtifactPathReference] = Field(default_factory=dict) - region_datasets: dict[str, ArtifactPathTemplate] = Field(default_factory=dict) - certified_data_artifact: Optional[CertifiedDataArtifact] = None - certification: Optional[DataCertification] = None - - @property - def default_dataset_uri(self) -> str: - if ( - self.certified_data_artifact is not None - and self.certified_data_artifact.dataset == self.default_dataset - ): - return 
self.certified_data_artifact.uri - return resolve_dataset_reference(self.country_id, self.default_dataset) - - -def build_hf_uri(repo_id: str, path_in_repo: str, revision: str) -> str: - return f"hf://{repo_id}/{path_in_repo}@{revision}" - - -@lru_cache -def get_release_manifest(country_id: str) -> CountryReleaseManifest: - manifest_path = files("policyengine").joinpath( - "data", "release_manifests", f"{country_id}.json" - ) - if not manifest_path.is_file(): - raise ValueError(f"No bundled release manifest for country '{country_id}'") - - return CountryReleaseManifest.model_validate_json(manifest_path.read_text()) - - -def _data_release_manifest_url(data_package: DataPackageVersion) -> str: - return ( - "https://huggingface.co/" - f"{data_package.repo_id}/resolve/{data_package.version}/" - f"{data_package.release_manifest_path}" - ) - - -@lru_cache -def get_data_release_manifest(country_id: str) -> DataReleaseManifest: - country_manifest = get_release_manifest(country_id) - data_package = country_manifest.data_package - - headers = {} - token = os.environ.get("HUGGING_FACE_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - - response = requests.get( - _data_release_manifest_url(data_package), - headers=headers, - timeout=HF_REQUEST_TIMEOUT_SECONDS, - ) - if response.status_code in (401, 403): - raise DataReleaseManifestUnavailableError( - "Could not fetch the data release manifest from Hugging Face. " - "If this country uses a private data repo, set HUGGING_FACE_TOKEN." - ) - if response.status_code == 404: - raise DataReleaseManifestUnavailableError( - "No data release manifest was published for this data package." 
- ) - response.raise_for_status() - return DataReleaseManifest.model_validate_json(response.text) - - -def _specifier_matches(version: str, specifier: str) -> bool: - if specifier.startswith("=="): - return version == specifier[2:] - return False - - -def certify_data_release_compatibility( - country_id: str, - runtime_model_version: str, - runtime_data_build_fingerprint: Optional[str] = None, -) -> DataCertification: - country_manifest = get_release_manifest(country_id) - try: - data_release_manifest = get_data_release_manifest(country_id) - except DataReleaseManifestUnavailableError as exc: - bundled_certification = country_manifest.certification - if ( - bundled_certification is not None - and bundled_certification.certified_for_model_version - == runtime_model_version - ): - if ( - runtime_data_build_fingerprint is not None - and bundled_certification.data_build_fingerprint is not None - and runtime_data_build_fingerprint - != bundled_certification.data_build_fingerprint - ): - raise ValueError( - "Runtime data build fingerprint does not match the bundled " - "data certification." - ) - return bundled_certification - raise exc - built_with_model = ( - data_release_manifest.build.built_with_model_package - if data_release_manifest.build is not None - else None - ) - - if ( - built_with_model is not None - and built_with_model.name != country_manifest.model_package.name - ): - raise ValueError( - "Data release manifest was built with a different model package: " - f"expected {country_manifest.model_package.name}, " - f"got {built_with_model.name}." 
- ) - - if ( - built_with_model is not None - and built_with_model.version == runtime_model_version - ): - return DataCertification( - compatibility_basis="exact_build_model_version", - certified_for_model_version=runtime_model_version, - data_build_id=( - data_release_manifest.build.build_id - if data_release_manifest.build is not None - else None - ), - built_with_model_version=built_with_model.version, - built_with_model_git_sha=built_with_model.git_sha, - data_build_fingerprint=built_with_model.data_build_fingerprint, - ) - - if ( - built_with_model is not None - and built_with_model.data_build_fingerprint is not None - and runtime_data_build_fingerprint is not None - and built_with_model.data_build_fingerprint == runtime_data_build_fingerprint - ): - return DataCertification( - compatibility_basis="matching_data_build_fingerprint", - certified_for_model_version=runtime_model_version, - data_build_id=( - data_release_manifest.build.build_id - if data_release_manifest.build is not None - else None - ), - built_with_model_version=built_with_model.version, - built_with_model_git_sha=built_with_model.git_sha, - data_build_fingerprint=built_with_model.data_build_fingerprint, - ) - - for compatible_model_package in data_release_manifest.compatible_model_packages: - if compatible_model_package.name != country_manifest.model_package.name: - continue - if _specifier_matches( - version=runtime_model_version, - specifier=compatible_model_package.specifier, - ): - return DataCertification( - compatibility_basis="legacy_compatible_model_package", - certified_for_model_version=runtime_model_version, - data_build_id=( - data_release_manifest.build.build_id - if data_release_manifest.build is not None - else None - ), - built_with_model_version=( - built_with_model.version if built_with_model is not None else None - ), - built_with_model_git_sha=( - built_with_model.git_sha if built_with_model is not None else None - ), - data_build_fingerprint=( - 
built_with_model.data_build_fingerprint - if built_with_model is not None - else None - ), - ) - - raise ValueError( - "Data release manifest is not certified for the runtime model version " - f"{runtime_model_version} in country '{country_id}'." - ) - - -def resolve_dataset_reference(country_id: str, dataset: str) -> str: - if "://" in dataset: - return dataset - - manifest = get_release_manifest(country_id) - path_reference = manifest.datasets.get(dataset) - if path_reference is not None: - return build_hf_uri( - repo_id=manifest.data_package.repo_id, - path_in_repo=path_reference.path, - revision=manifest.data_package.version, - ) - - data_release_manifest = get_data_release_manifest(country_id) - artifact = data_release_manifest.artifacts.get(dataset) - if artifact is None: - raise ValueError( - f"Unknown dataset '{dataset}' for country '{country_id}'. " - f"Known datasets: {sorted(manifest.datasets)}" - ) - - return artifact.uri - - -def resolve_managed_dataset_reference( - country_id: str, - dataset: Optional[str] = None, - *, - allow_unmanaged: bool = False, -) -> str: - """Resolve a dataset reference under policyengine.py bundle enforcement. - - Managed mode pins dataset selection to the bundled `policyengine.py` - release manifest. Callers can: - - - omit `dataset` to use the certified default dataset for the bundle - - pass a logical dataset name present in the bundled/data-release manifests - - Direct URLs or raw Hugging Face references are treated as unmanaged unless - `allow_unmanaged=True` is set explicitly. - """ - - manifest = get_release_manifest(country_id) - if dataset is None: - return manifest.default_dataset_uri - - if "://" in dataset: - if dataset == manifest.default_dataset_uri: - return dataset - if allow_unmanaged: - return dataset - raise ValueError( - "Explicit dataset URIs bypass the policyengine.py release bundle. " - "Pass a manifest dataset name or omit `dataset` to use the certified " - "default dataset. 
Set `allow_unmanaged=True` only if you intend to " - "bypass bundle enforcement." - ) - - return resolve_dataset_reference(country_id, dataset) - - -def resolve_local_managed_dataset_source( - country_id: str, - dataset_uri: str, - *, - allow_local_mirror: bool = True, -) -> str: - """Resolve a local mirror of a managed dataset when available. - - This preserves the bundled dataset URI for provenance while allowing local - development environments with sibling data-repo checkouts to load the - exact certified artifact from disk rather than re-downloading it. - """ - - if not allow_local_mirror or not dataset_uri.startswith("hf://"): - return dataset_uri - - local_hint = LOCAL_DATA_REPO_HINTS.get(country_id) - if local_hint is None: - return dataset_uri - - path_without_revision = dataset_uri[5:].rsplit("@", 1)[0] - parts = path_without_revision.split("/", 2) - if len(parts) != 3: - return dataset_uri - _, _, path_in_repo = parts - - model_module_name, data_repo_name, data_package_name = local_hint - try: - model_module = import_module(model_module_name) - except ImportError: - return dataset_uri - - repo_root = Path(model_module.__file__).resolve().parents[1] - local_path = ( - repo_root.with_name(data_repo_name) - / data_package_name - / "storage" - / path_in_repo - ) - if local_path.exists(): - return str(local_path) - return dataset_uri - - -def dataset_logical_name(dataset: str) -> str: - return Path(dataset.rsplit("@", 1)[0]).stem - - -def resolve_default_datasets(country_id: str) -> list[str]: - manifest = get_release_manifest(country_id) - return list(manifest.datasets.keys()) - - -def resolve_region_dataset_path( - country_id: str, - region_type: str, - **kwargs: str, -) -> Optional[str]: - manifest = get_release_manifest(country_id) - template = manifest.region_datasets.get(region_type) - if template is None: - return None - - resolved_path = template.resolve(**kwargs) - if "://" in resolved_path: - return resolved_path - - return build_hf_uri( - 
repo_id=manifest.data_package.repo_id, - path_in_repo=resolved_path, - revision=manifest.data_package.version, - ) diff --git a/build/lib/policyengine/core/scoping_strategy.py b/build/lib/policyengine/core/scoping_strategy.py deleted file mode 100644 index 7d9b5126..00000000 --- a/build/lib/policyengine/core/scoping_strategy.py +++ /dev/null @@ -1,224 +0,0 @@ -"""Region scoping strategies for geographic simulations. - -Provides two concrete strategies for scoping datasets to sub-national regions: - -1. RowFilterStrategy: Filters dataset rows where a household variable matches - a specific value (e.g., UK countries by 'country' field, US places by 'place_fips'). - -2. WeightReplacementStrategy: Replaces household weights from a pre-computed weight - matrix stored in GCS (e.g., UK constituencies and local authorities). -""" - -import logging -from abc import abstractmethod -from pathlib import Path -from typing import Annotated, Literal, Optional, Union - -import h5py -import numpy as np -import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel, Discriminator - -from policyengine.utils.entity_utils import ( - filter_dataset_by_household_variable, -) - -logger = logging.getLogger(__name__) - - -class RegionScopingStrategy(BaseModel): - """Base class for region scoping strategies. - - Subclasses implement apply() to scope a dataset's entity data - to a specific sub-national region. - """ - - strategy_type: str - - @abstractmethod - def apply( - self, - entity_data: dict[str, MicroDataFrame], - group_entities: list[str], - year: int, - ) -> dict[str, MicroDataFrame]: - """Apply the scoping strategy to entity data. - - Args: - entity_data: Dict mapping entity names to their MicroDataFrames. - group_entities: List of group entity names for this country. - year: The simulation year (used for time-indexed weight matrices). - - Returns: - A dict mapping entity names to scoped MicroDataFrames. 
- """ - - @property - def cache_key(self) -> str: - """Return a string key for deterministic simulation ID hashing.""" - return f"{self.strategy_type}:{self.model_dump_json()}" - - -class RowFilterStrategy(RegionScopingStrategy): - """Scoping strategy that filters dataset rows by a household variable. - - Used for regions where we want to keep only households matching a - specific variable value (e.g., UK countries, US places/cities). - """ - - strategy_type: Literal["row_filter"] = "row_filter" - variable_name: str - variable_value: str - - def apply( - self, - entity_data: dict[str, MicroDataFrame], - group_entities: list[str], - year: int, - ) -> dict[str, MicroDataFrame]: - return filter_dataset_by_household_variable( - entity_data=entity_data, - group_entities=group_entities, - variable_name=self.variable_name, - variable_value=self.variable_value, - ) - - @property - def cache_key(self) -> str: - return f"row_filter:{self.variable_name}={self.variable_value}" - - -class WeightReplacementStrategy(RegionScopingStrategy): - """Scoping strategy that replaces household weights from a pre-computed matrix. - - Used for UK constituencies and local authorities. Instead of removing - households, this strategy keeps all households but replaces their weights - with region-specific values from a weight matrix stored in GCS. - - The weight matrix is an HDF5 file with shape (N_regions x N_households), - where each row contains household weights for a specific region. - A companion CSV maps region codes/names to row indices. 
- """ - - strategy_type: Literal["weight_replacement"] = "weight_replacement" - weight_matrix_bucket: str - weight_matrix_key: str - lookup_csv_bucket: str - lookup_csv_key: str - region_code: str - - def apply( - self, - entity_data: dict[str, MicroDataFrame], - group_entities: list[str], - year: int, - ) -> dict[str, MicroDataFrame]: - from policyengine_core.tools.google_cloud import download_gcs_file - - # Download lookup CSV and find region index - lookup_path = Path( - download_gcs_file( - bucket=self.lookup_csv_bucket, - file_path=self.lookup_csv_key, - ) - ) - lookup_df = pd.read_csv(lookup_path) - - region_id = self._find_region_index(lookup_df, self.region_code) - - # Download weight matrix and extract weights for this region - weights_path = download_gcs_file( - bucket=self.weight_matrix_bucket, - file_path=self.weight_matrix_key, - ) - with h5py.File(weights_path, "r") as f: - weights = f[str(year)][...] - - region_weights = weights[region_id] - - # Validate weight row length matches household count - household_df = pd.DataFrame(entity_data["household"]) - if len(region_weights) != len(household_df): - raise ValueError( - f"Weight matrix row length ({len(region_weights)}) does not match " - f"household count ({len(household_df)}) for region '{self.region_code}'. " - f"The weight matrix may be out of date." - ) - - # Replace household weights - result = {} - for entity_name, mdf in entity_data.items(): - df = pd.DataFrame(mdf) - if entity_name == "household": - df["household_weight"] = region_weights - result[entity_name] = MicroDataFrame(df, weights="household_weight") - else: - weight_col = f"{entity_name}_weight" - if weight_col in df.columns: - # Map new household weights to sub-entities via their - # household membership. Build a mapping from household_id - # to new weight. 
- hh_ids = household_df["household_id"].values - weight_map = dict(zip(hh_ids, region_weights)) - - # Find the entity's household ID column - person_hh_col = self._find_household_id_column(df, entity_name) - if person_hh_col: - new_weights = np.array( - [ - weight_map.get(hh_id, 0.0) - for hh_id in df[person_hh_col].values - ] - ) - df[weight_col] = new_weights - - result[entity_name] = MicroDataFrame( - df, - weights=( - f"{entity_name}_weight" - if f"{entity_name}_weight" in df.columns - else None - ), - ) - - return result - - @staticmethod - def _find_region_index(lookup_df: pd.DataFrame, region_code: str) -> int: - """Find the row index for a region in the lookup CSV. - - Searches by 'code' column first, then 'name' column. - """ - if "code" in lookup_df.columns and region_code in lookup_df["code"].values: - return lookup_df[lookup_df["code"] == region_code].index[0] - if "name" in lookup_df.columns and region_code in lookup_df["name"].values: - return lookup_df[lookup_df["name"] == region_code].index[0] - raise ValueError( - f"Region '{region_code}' not found in lookup CSV. " - f"Available columns: {list(lookup_df.columns)}. " - f"Searched 'code' and 'name' columns." 
- ) - - @staticmethod - def _find_household_id_column(df: pd.DataFrame, entity_name: str) -> Optional[str]: - """Find the column linking an entity to its household.""" - candidates = [ - "person_household_id", - f"{entity_name}_household_id", - "household_id", - ] - for col in candidates: - if col in df.columns: - return col - return None - - @property - def cache_key(self) -> str: - return f"weight_replacement:{self.weight_matrix_key}:{self.region_code}" - - -ScopingStrategy = Annotated[ - Union[RowFilterStrategy, WeightReplacementStrategy], - Discriminator("strategy_type"), -] diff --git a/build/lib/policyengine/core/simulation.py b/build/lib/policyengine/core/simulation.py deleted file mode 100644 index 6456e5bc..00000000 --- a/build/lib/policyengine/core/simulation.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging -from datetime import datetime -from typing import Optional -from uuid import uuid4 - -from pydantic import BaseModel, Field, model_validator - -from .cache import LRUCache -from .dataset import Dataset -from .dynamic import Dynamic -from .policy import Policy -from .scoping_strategy import RowFilterStrategy, ScopingStrategy -from .tax_benefit_model_version import TaxBenefitModelVersion - -logger = logging.getLogger(__name__) - -_cache: LRUCache["Simulation"] = LRUCache(max_size=100) - - -class Simulation(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - created_at: datetime = Field(default_factory=datetime.now) - updated_at: datetime = Field(default_factory=datetime.now) - - policy: Optional[Policy] = None - dynamic: Optional[Dynamic] = None - dataset: Dataset = None - - # Scoping strategy (preferred over legacy filter fields) - scoping_strategy: Optional[ScopingStrategy] = Field( - default=None, - description="Strategy for scoping dataset to a sub-national region", - ) - - # Legacy regional filtering parameters (kept for backward compatibility) - filter_field: Optional[str] = Field( - default=None, - description="Household-level 
variable to filter dataset by (e.g., 'place_fips', 'country')", - ) - filter_value: Optional[str] = Field( - default=None, - description="Value to match when filtering (e.g., '44000', 'ENGLAND')", - ) - - tax_benefit_model_version: TaxBenefitModelVersion = None - - @model_validator(mode="after") - def _auto_construct_strategy(self) -> "Simulation": - """Auto-construct a RowFilterStrategy from legacy filter fields. - - If filter_field and filter_value are set but scoping_strategy is not, - create a RowFilterStrategy for backward compatibility. - """ - if ( - self.scoping_strategy is None - and self.filter_field is not None - and self.filter_value is not None - ): - self.scoping_strategy = RowFilterStrategy( - variable_name=self.filter_field, - variable_value=self.filter_value, - ) - return self - - output_dataset: Optional[Dataset] = None - - def run(self): - self.tax_benefit_model_version.run(self) - - def ensure(self): - cached_result = _cache.get(self.id) - if cached_result: - self.output_dataset = cached_result.output_dataset - return - try: - self.tax_benefit_model_version.load(self) - except FileNotFoundError: - self.run() - self.save() - except Exception: - logger.warning( - "Unexpected error loading simulation %s; falling back to run()", - self.id, - exc_info=True, - ) - self.run() - self.save() - - _cache.add(self.id, self) - - def save(self): - """Save the simulation's output dataset.""" - self.tax_benefit_model_version.save(self) - - def load(self): - """Load the simulation's output dataset.""" - self.tax_benefit_model_version.load(self) - - @property - def release_bundle(self) -> dict[str, Optional[str]]: - bundle = ( - self.tax_benefit_model_version.release_bundle - if self.tax_benefit_model_version is not None - else {} - ) - return { - **bundle, - "dataset_filepath": self.dataset.filepath - if self.dataset is not None - else None, - } diff --git a/build/lib/policyengine/core/tax_benefit_model.py b/build/lib/policyengine/core/tax_benefit_model.py 
deleted file mode 100644 index c2d4e26d..00000000 --- a/build/lib/policyengine/core/tax_benefit_model.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import TYPE_CHECKING, Optional - -from pydantic import BaseModel - -if TYPE_CHECKING: - pass - - -class TaxBenefitModel(BaseModel): - id: str - description: Optional[str] = None diff --git a/build/lib/policyengine/core/tax_benefit_model_version.py b/build/lib/policyengine/core/tax_benefit_model_version.py deleted file mode 100644 index 7fb03334..00000000 --- a/build/lib/policyengine/core/tax_benefit_model_version.py +++ /dev/null @@ -1,208 +0,0 @@ -from datetime import datetime, timezone -from typing import TYPE_CHECKING, Optional -from uuid import uuid4 - -from pydantic import BaseModel, Field - -from .release_manifest import CountryReleaseManifest, DataCertification, PackageVersion -from .tax_benefit_model import TaxBenefitModel - -if TYPE_CHECKING: - from .parameter import Parameter - from .parameter_node import ParameterNode - from .parameter_value import ParameterValue - from .region import Region, RegionRegistry - from .simulation import Simulation - from .variable import Variable - - -class TaxBenefitModelVersion(BaseModel): - model_config = {"arbitrary_types_allowed": True} - - id: str = Field(default_factory=lambda: str(uuid4())) - model: TaxBenefitModel - version: str - description: Optional[str] = None - created_at: Optional[datetime] = Field( - default_factory=lambda: datetime.now(timezone.utc) - ) - - variables: list["Variable"] = Field(default_factory=list) - parameters: list["Parameter"] = Field(default_factory=list) - parameter_nodes: list["ParameterNode"] = Field(default_factory=list) - - # Region registry for geographic simulations - region_registry: "Optional[RegionRegistry]" = Field( - default=None, description="Registry of supported geographic regions" - ) - release_manifest: Optional[CountryReleaseManifest] = Field( - default=None, - exclude=True, - ) - model_package: Optional[PackageVersion] = 
Field(default=None) - data_package: Optional[PackageVersion] = Field(default=None) - default_dataset_uri: Optional[str] = Field(default=None) - data_certification: Optional[DataCertification] = Field(default=None) - - @property - def parameter_values(self) -> list["ParameterValue"]: - """Aggregate all parameter values from all parameters.""" - return [ - pv for parameter in self.parameters for pv in parameter.parameter_values - ] - - # Lookup dicts for O(1) access (excluded from serialization) - variables_by_name: dict[str, "Variable"] = Field(default_factory=dict, exclude=True) - parameters_by_name: dict[str, "Parameter"] = Field( - default_factory=dict, exclude=True - ) - parameter_nodes_by_name: dict[str, "ParameterNode"] = Field( - default_factory=dict, exclude=True - ) - - def run(self, simulation: "Simulation") -> "Simulation": - raise NotImplementedError( - "The TaxBenefitModel class must define a method to execute simulations." - ) - - def save(self, simulation: "Simulation"): - raise NotImplementedError( - "The TaxBenefitModel class must define a method to save simulations." - ) - - def load(self, simulation: "Simulation"): - raise NotImplementedError( - "The TaxBenefitModel class must define a method to load simulations." 
- ) - - def add_parameter(self, param: "Parameter") -> None: - """Add a parameter and index it for fast lookup.""" - self.parameters.append(param) - self.parameters_by_name[param.name] = param - - def add_variable(self, var: "Variable") -> None: - """Add a variable and index it for fast lookup.""" - self.variables.append(var) - self.variables_by_name[var.name] = var - - def add_parameter_node(self, node: "ParameterNode") -> None: - """Add a parameter node and index it for fast lookup.""" - self.parameter_nodes.append(node) - self.parameter_nodes_by_name[node.name] = node - - def get_parameter(self, name: str) -> "Parameter": - """Get a parameter by name (O(1) lookup).""" - if name in self.parameters_by_name: - return self.parameters_by_name[name] - raise ValueError( - f"Parameter '{name}' not found in {self.model.id} version {self.version}" - ) - - def get_variable(self, name: str) -> "Variable": - """Get a variable by name (O(1) lookup).""" - if name in self.variables_by_name: - return self.variables_by_name[name] - raise ValueError( - f"Variable '{name}' not found in {self.model.id} version {self.version}" - ) - - def get_parameter_node(self, name: str) -> "ParameterNode": - """Get a parameter node by name (O(1) lookup).""" - if name in self.parameter_nodes_by_name: - return self.parameter_nodes_by_name[name] - raise ValueError( - f"ParameterNode '{name}' not found in {self.model.id} version {self.version}" - ) - - def get_region(self, code: str) -> "Optional[Region]": - """Get a region by its code. 
- - Args: - code: Region code (e.g., 'state/ca', 'place/NJ-57000') - - Returns: - The Region if found, None if not found or no region registry - """ - if self.region_registry is None: - return None - return self.region_registry.get(code) - - @property - def release_bundle(self) -> dict[str, Optional[str]]: - manifest_certification = ( - self.release_manifest.certification - if self.release_manifest is not None - else None - ) - certification = self.data_certification or manifest_certification - certified_data_artifact = ( - self.release_manifest.certified_data_artifact - if self.release_manifest is not None - else None - ) - return { - "bundle_id": self.release_manifest.bundle_id - if self.release_manifest is not None - else None, - "country_id": self.release_manifest.country_id - if self.release_manifest is not None - else None, - "policyengine_version": self.release_manifest.policyengine_version - if self.release_manifest is not None - else None, - "model_package": self.model_package.name - if self.model_package is not None - else None, - "model_version": self.model_package.version - if self.model_package is not None - else None, - "data_package": self.data_package.name - if self.data_package is not None - else None, - "data_version": self.data_package.version - if self.data_package is not None - else None, - "default_dataset": self.release_manifest.default_dataset - if self.release_manifest is not None - else None, - "default_dataset_uri": self.default_dataset_uri, - "certified_data_build_id": ( - certification.data_build_id - if certification is not None - else ( - certified_data_artifact.build_id - if certified_data_artifact is not None - else None - ) - ), - "certified_data_artifact_sha256": ( - certified_data_artifact.sha256 - if certified_data_artifact is not None - else None - ), - "data_build_model_version": ( - certification.built_with_model_version - if certification is not None - else None - ), - "data_build_model_git_sha": ( - 
certification.built_with_model_git_sha - if certification is not None - else None - ), - "data_build_fingerprint": ( - certification.data_build_fingerprint - if certification is not None - else None - ), - "compatibility_basis": ( - certification.compatibility_basis if certification is not None else None - ), - "certified_by": ( - certification.certified_by if certification is not None else None - ), - } - - def __repr__(self) -> str: - # Give the id and version, and the number of variables, parameters, parameter nodes, parameter values - return f"" diff --git a/build/lib/policyengine/core/trace_tro.py b/build/lib/policyengine/core/trace_tro.py deleted file mode 100644 index ae31a29e..00000000 --- a/build/lib/policyengine/core/trace_tro.py +++ /dev/null @@ -1,260 +0,0 @@ -from __future__ import annotations - -import hashlib -import json -from collections.abc import Iterable, Mapping -from typing import Optional - -from .release_manifest import ( - CountryReleaseManifest, - DataCertification, - DataReleaseManifest, -) - -TRACE_TROV_VERSION = "0.1" -TRACE_CONTEXT = [ - { - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "trov": "https://w3id.org/trace/trov/0.1#", - "schema": "https://schema.org/", - } -] - - -def _hash_object(value: str) -> dict[str, str]: - return { - "trov:hashAlgorithm": "sha256", - "trov:hashValue": value, - } - - -def _artifact_mime_type(path_or_uri: str) -> Optional[str]: - suffix = path_or_uri.rsplit(".", 1)[-1].lower() if "." 
in path_or_uri else "" - return { - "h5": "application/x-hdf5", - "json": "application/json", - "jsonld": "application/ld+json", - }.get(suffix) - - -def _canonical_json_bytes(value: Mapping) -> bytes: - return (json.dumps(value, indent=2, sort_keys=True) + "\n").encode("utf-8") - - -def compute_trace_composition_fingerprint( - artifact_hashes: Iterable[str], -) -> str: - digest = hashlib.sha256() - digest.update("".join(sorted(artifact_hashes)).encode("utf-8")) - return digest.hexdigest() - - -def build_trace_tro_from_release_bundle( - country_manifest: CountryReleaseManifest, - data_release_manifest: DataReleaseManifest, - *, - certification: Optional[DataCertification] = None, - bundle_manifest_path: Optional[str] = None, - data_release_manifest_path: Optional[str] = None, -) -> dict: - certified_artifact = country_manifest.certified_data_artifact - if certified_artifact is None: - raise ValueError( - "Country release manifest does not define a certified artifact." - ) - - dataset_artifact = data_release_manifest.artifacts.get(certified_artifact.dataset) - if dataset_artifact is None: - raise ValueError( - "Data release manifest does not include the certified dataset " - f"'{certified_artifact.dataset}'." - ) - if dataset_artifact.sha256 is None: - raise ValueError( - "Data release manifest does not include a SHA256 for the certified dataset " - f"'{certified_artifact.dataset}'." 
- ) - - effective_certification = certification or country_manifest.certification - bundle_manifest_location = ( - bundle_manifest_path - or f"data/release_manifests/{country_manifest.country_id}.json" - ) - data_manifest_location = data_release_manifest_path or ( - "https://huggingface.co/" - f"{country_manifest.data_package.repo_id}/resolve/" - f"{country_manifest.data_package.version}/" - f"{country_manifest.data_package.release_manifest_path}" - ) - - bundle_manifest_payload = country_manifest.model_dump(mode="json") - data_release_payload = data_release_manifest.model_dump(mode="json") - bundle_manifest_hash = hashlib.sha256( - _canonical_json_bytes(bundle_manifest_payload) - ).hexdigest() - data_release_manifest_hash = hashlib.sha256( - _canonical_json_bytes(data_release_payload) - ).hexdigest() - - artifact_specs = [ - { - "hash": bundle_manifest_hash, - "location": bundle_manifest_location, - "mime_type": "application/json", - }, - { - "hash": data_release_manifest_hash, - "location": data_manifest_location, - "mime_type": "application/json", - }, - { - "hash": dataset_artifact.sha256, - "location": certified_artifact.uri, - "mime_type": _artifact_mime_type(certified_artifact.uri), - }, - ] - - composition_artifacts = [] - arrangement_locations = [] - artifact_hashes = [] - - for index, artifact in enumerate(artifact_specs): - artifact_id = f"composition/1/artifact/{index}" - artifact_hashes.append(artifact["hash"]) - artifact_entry = { - "@id": artifact_id, - "@type": "trov:ResearchArtifact", - "trov:hash": _hash_object(artifact["hash"]), - } - if artifact["mime_type"] is not None: - artifact_entry["trov:mimeType"] = artifact["mime_type"] - composition_artifacts.append(artifact_entry) - arrangement_locations.append( - { - "@id": f"arrangement/0/location/{index}", - "@type": "trov:ArtifactLocation", - "trov:artifact": {"@id": artifact_id}, - "trov:path": artifact["location"], - } - ) - - certification_description = "" - if effective_certification is not 
None: - certification_description = ( - f" Certified for runtime model version " - f"{effective_certification.certified_for_model_version} via " - f"{effective_certification.compatibility_basis}." - ) - if effective_certification.built_with_model_version is not None: - certification_description += ( - f" Built with {country_manifest.model_package.name} " - f"{effective_certification.built_with_model_version}." - ) - if effective_certification.data_build_fingerprint is not None: - certification_description += ( - f" Data-build fingerprint: " - f"{effective_certification.data_build_fingerprint}." - ) - - created_at = country_manifest.published_at or ( - data_release_manifest.build.built_at - if data_release_manifest.build is not None - else None - ) - build_id = ( - effective_certification.data_build_id - if effective_certification is not None - else ( - certified_artifact.build_id - or f"{country_manifest.data_package.name}-{country_manifest.data_package.version}" - ) - ) - - return { - "@context": TRACE_CONTEXT, - "@graph": [ - { - "@id": "tro", - "@type": ["trov:TransparentResearchObject", "schema:CreativeWork"], - "trov:vocabularyVersion": TRACE_TROV_VERSION, - "schema:creator": country_manifest.policyengine_version, - "schema:name": ( - f"policyengine {country_manifest.country_id} certified bundle TRO" - ), - "schema:description": ( - f"TRACE TRO for certified runtime bundle " - f"{country_manifest.bundle_id or country_manifest.country_id} " - f"covering the bundled country release manifest, the country data " - f"release manifest, and the certified dataset artifact." 
- f"{certification_description}" - ), - "schema:dateCreated": created_at, - "trov:wasAssembledBy": { - "@id": "trs", - "@type": ["trov:TrustedResearchSystem", "schema:Organization"], - "schema:name": "PolicyEngine certified release bundle pipeline", - "schema:description": ( - "PolicyEngine certification workflow for runtime bundles that " - "pin a country model version, a country data release, and a " - "specific dataset artifact." - ), - }, - "trov:createdWith": { - "@type": "schema:SoftwareApplication", - "schema:name": "policyengine", - "schema:softwareVersion": country_manifest.policyengine_version, - }, - "trov:hasComposition": { - "@id": "composition/1", - "@type": "trov:ArtifactComposition", - "trov:hasFingerprint": { - "@id": "fingerprint", - "@type": "trov:CompositionFingerprint", - "trov:hash": _hash_object( - compute_trace_composition_fingerprint(artifact_hashes) - ), - }, - "trov:hasArtifact": composition_artifacts, - }, - "trov:hasArrangement": [ - { - "@id": "arrangement/0", - "@type": "trov:ArtifactArrangement", - "rdfs:comment": ( - f"Certified arrangement for bundle " - f"{country_manifest.bundle_id or country_manifest.country_id}." - ), - "trov:hasArtifactLocation": arrangement_locations, - } - ], - "trov:hasPerformance": [ - { - "@id": "trp/0", - "@type": "trov:TrustedResearchPerformance", - "rdfs:comment": ( - f"Certification of build {build_id} for " - f"{country_manifest.model_package.name} " - f"{country_manifest.model_package.version}." 
- ), - "trov:wasConductedBy": {"@id": "trs"}, - "trov:startedAtTime": ( - data_release_manifest.build.built_at - if data_release_manifest.build is not None - else created_at - ), - "trov:endedAtTime": created_at, - "trov:contributedToArrangement": { - "@id": "trp/0/binding/0", - "@type": "trov:ArrangementBinding", - "trov:arrangement": {"@id": "arrangement/0"}, - }, - } - ], - } - ], - } - - -def serialize_trace_tro(tro: Mapping) -> bytes: - return (json.dumps(tro, indent=2, sort_keys=True) + "\n").encode("utf-8") diff --git a/build/lib/policyengine/core/variable.py b/build/lib/policyengine/core/variable.py deleted file mode 100644 index 03e53495..00000000 --- a/build/lib/policyengine/core/variable.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Any, Optional - -from pydantic import BaseModel - -from .tax_benefit_model_version import TaxBenefitModelVersion - - -class Variable(BaseModel): - id: str - name: str - label: Optional[str] = None - tax_benefit_model_version: TaxBenefitModelVersion - entity: str - description: Optional[str] = None - data_type: type = None - possible_values: Optional[list[Any]] = None - default_value: Any = None - value_type: Optional[type] = None - adds: Optional[list[str]] = None - subtracts: Optional[list[str]] = None diff --git a/build/lib/policyengine/countries/__init__.py b/build/lib/policyengine/countries/__init__.py deleted file mode 100644 index 3f647fd9..00000000 --- a/build/lib/policyengine/countries/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Country-specific region definitions. - -This package contains region registries for each supported country. 
-""" - -from .uk.regions import uk_region_registry -from .us.regions import us_region_registry - -__all__ = ["us_region_registry", "uk_region_registry"] diff --git a/build/lib/policyengine/countries/uk/__init__.py b/build/lib/policyengine/countries/uk/__init__.py deleted file mode 100644 index b2c255d3..00000000 --- a/build/lib/policyengine/countries/uk/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""UK country-specific region definitions.""" - -from .regions import uk_region_registry - -__all__ = ["uk_region_registry"] diff --git a/build/lib/policyengine/countries/uk/regions.py b/build/lib/policyengine/countries/uk/regions.py deleted file mode 100644 index 2f100524..00000000 --- a/build/lib/policyengine/countries/uk/regions.py +++ /dev/null @@ -1,207 +0,0 @@ -"""UK region definitions. - -This module defines all UK geographic regions: -- National (1) -- Countries (4: England, Scotland, Wales, Northern Ireland) -- Constituencies (loaded from CSV at runtime) -- Local Authorities (loaded from CSV at runtime) - -Note: Constituencies and local authorities use weight adjustment rather than -data filtering. They modify household_weight based on pre-computed weights -from H5 files stored in GCS. -""" - -import logging -from typing import TYPE_CHECKING - -from policyengine.core.region import Region, RegionRegistry -from policyengine.core.release_manifest import resolve_region_dataset_path -from policyengine.core.scoping_strategy import ( - RowFilterStrategy, - WeightReplacementStrategy, -) - -if TYPE_CHECKING: - pass - -logger = logging.getLogger(__name__) - -UK_DATA_BUCKET = "gs://policyengine-uk-data-private" - -# UK countries -UK_COUNTRIES = { - "england": "England", - "scotland": "Scotland", - "wales": "Wales", - "northern_ireland": "Northern Ireland", -} - - -def _load_constituencies_from_csv() -> list[dict]: - """Load UK constituency data from CSV. 
- - Constituencies are loaded from: - gs://policyengine-uk-data-private/constituencies_2024.csv - - Returns: - List of dicts with 'code' and 'name' keys - """ - try: - from policyengine_core.tools.google_cloud import download - except ImportError: - # If policyengine_core is not available, return empty list - return [] - - try: - csv_path = download( - gcs_bucket="policyengine-uk-data-private", - gcs_key="constituencies_2024.csv", - ) - import pandas as pd - - df = pd.read_csv(csv_path) - return [{"code": row["code"], "name": row["name"]} for _, row in df.iterrows()] - except (OSError, KeyError, ValueError) as exc: - logger.warning("Failed to load constituencies CSV: %s", exc) - return [] - except Exception: - logger.error("Unexpected error loading constituencies CSV", exc_info=True) - return [] - - -def _load_local_authorities_from_csv() -> list[dict]: - """Load UK local authority data from CSV. - - Local authorities are loaded from: - gs://policyengine-uk-data-private/local_authorities_2021.csv - - Returns: - List of dicts with 'code' and 'name' keys - """ - try: - from policyengine_core.tools.google_cloud import download - except ImportError: - # If policyengine_core is not available, return empty list - return [] - - try: - csv_path = download( - gcs_bucket="policyengine-uk-data-private", - gcs_key="local_authorities_2021.csv", - ) - import pandas as pd - - df = pd.read_csv(csv_path) - return [{"code": row["code"], "name": row["name"]} for _, row in df.iterrows()] - except (OSError, KeyError, ValueError) as exc: - logger.warning("Failed to load local authorities CSV: %s", exc) - return [] - except Exception: - logger.error("Unexpected error loading local authorities CSV", exc_info=True) - return [] - - -def build_uk_region_registry( - include_constituencies: bool = False, - include_local_authorities: bool = False, -) -> RegionRegistry: - """Build the UK region registry. - - Args: - include_constituencies: If True, load and include constituencies from CSV. 
- Defaults to False to avoid GCS dependency at import time. - include_local_authorities: If True, load and include local authorities from CSV. - Defaults to False to avoid GCS dependency at import time. - - Returns: - RegionRegistry containing: - - 1 national region - - 4 country regions - - Optionally: constituencies (if include_constituencies=True) - - Optionally: local authorities (if include_local_authorities=True) - """ - regions: list[Region] = [] - - # 1. National region (has dedicated dataset) - regions.append( - Region( - code="uk", - label="United Kingdom", - region_type="national", - dataset_path=resolve_region_dataset_path("uk", "national"), - ) - ) - - # 2. Country regions (filter from national by 'country' variable) - for code, name in UK_COUNTRIES.items(): - regions.append( - Region( - code=f"country/{code}", - label=name, - region_type="country", - parent_code="uk", - requires_filter=True, - filter_field="country", - filter_value=code.upper(), - scoping_strategy=RowFilterStrategy( - variable_name="country", - variable_value=code.upper(), - ), - ) - ) - - # 3. Constituencies (optional, loaded from CSV) - # Note: These use weight replacement, not data filtering - if include_constituencies: - constituencies = _load_constituencies_from_csv() - for const in constituencies: - regions.append( - Region( - code=f"constituency/{const['code']}", - label=const["name"], - region_type="constituency", - parent_code="uk", - requires_filter=True, - filter_field="household_weight", - filter_value=const["code"], - scoping_strategy=WeightReplacementStrategy( - weight_matrix_bucket="policyengine-uk-data-private", - weight_matrix_key="parliamentary_constituency_weights.h5", - lookup_csv_bucket="policyengine-uk-data-private", - lookup_csv_key="constituencies_2024.csv", - region_code=const["code"], - ), - ) - ) - - # 4. 
Local Authorities (optional, loaded from CSV) - # Note: These use weight replacement, not data filtering - if include_local_authorities: - local_authorities = _load_local_authorities_from_csv() - for la in local_authorities: - regions.append( - Region( - code=f"local_authority/{la['code']}", - label=la["name"], - region_type="local_authority", - parent_code="uk", - requires_filter=True, - filter_field="household_weight", - filter_value=la["code"], - scoping_strategy=WeightReplacementStrategy( - weight_matrix_bucket="policyengine-uk-data-private", - weight_matrix_key="local_authority_weights.h5", - lookup_csv_bucket="policyengine-uk-data-private", - lookup_csv_key="local_authorities_2021.csv", - region_code=la["code"], - ), - ) - ) - - return RegionRegistry(country_id="uk", regions=regions) - - -# Default registry with just core regions (national + countries) -# To get full registry with constituencies/LAs, call: -# build_uk_region_registry(include_constituencies=True, include_local_authorities=True) -uk_region_registry = build_uk_region_registry() diff --git a/build/lib/policyengine/countries/us/__init__.py b/build/lib/policyengine/countries/us/__init__.py deleted file mode 100644 index 68592459..00000000 --- a/build/lib/policyengine/countries/us/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""US country-specific region definitions.""" - -from .regions import us_region_registry - -__all__ = ["us_region_registry"] diff --git a/build/lib/policyengine/countries/us/data/__init__.py b/build/lib/policyengine/countries/us/data/__init__.py deleted file mode 100644 index fb833b64..00000000 --- a/build/lib/policyengine/countries/us/data/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -"""US geographic data definitions. 
- -This module provides static data for US geographic regions: -- states.py: State abbreviations and full names -- districts.py: Congressional district counts by state -- places.py: US Census places (cities/towns over 100K population) -""" - -from .districts import AT_LARGE_STATES, DISTRICT_COUNTS -from .places import US_PLACES -from .states import US_STATES - -__all__ = [ - "US_STATES", - "DISTRICT_COUNTS", - "AT_LARGE_STATES", - "US_PLACES", -] diff --git a/build/lib/policyengine/countries/us/data/districts.py b/build/lib/policyengine/countries/us/data/districts.py deleted file mode 100644 index e77d5e62..00000000 --- a/build/lib/policyengine/countries/us/data/districts.py +++ /dev/null @@ -1,64 +0,0 @@ -"""US congressional district definitions. - -Based on 2020 Census apportionment. -Total: 435 voting representatives + 1 DC non-voting delegate = 436 -""" - -# Congressional district counts by state (2020 Census apportionment) -# States with 1 district are "at-large" -DISTRICT_COUNTS: dict[str, int] = { - "AL": 7, - "AK": 1, - "AZ": 9, - "AR": 4, - "CA": 52, - "CO": 8, - "CT": 5, - "DE": 1, - "DC": 1, # Non-voting delegate - "FL": 28, - "GA": 14, - "HI": 2, - "ID": 2, - "IL": 17, - "IN": 9, - "IA": 4, - "KS": 4, - "KY": 6, - "LA": 6, - "ME": 2, - "MD": 8, - "MA": 9, - "MI": 13, - "MN": 8, - "MS": 4, - "MO": 8, - "MT": 2, - "NE": 3, - "NV": 4, - "NH": 2, - "NJ": 12, - "NM": 3, - "NY": 26, - "NC": 14, - "ND": 1, - "OH": 15, - "OK": 5, - "OR": 6, - "PA": 17, - "RI": 2, - "SC": 7, - "SD": 1, - "TN": 9, - "TX": 38, - "UT": 4, - "VT": 1, - "VA": 11, - "WA": 10, - "WV": 2, - "WI": 8, - "WY": 1, -} - -# States with at-large congressional districts (single representative) -AT_LARGE_STATES: set[str] = {"AK", "DE", "DC", "ND", "SD", "VT", "WY"} diff --git a/build/lib/policyengine/countries/us/data/places.py b/build/lib/policyengine/countries/us/data/places.py deleted file mode 100644 index a5fe632f..00000000 --- a/build/lib/policyengine/countries/us/data/places.py +++ 
/dev/null @@ -1,1815 +0,0 @@ -"""US Census places with population over 100,000. - -Source: US Census Bureau Population Estimates 2023 -Synced with policyengine-app-v2 main branch. -""" - -# US cities/places with population over 100K (from Census data) -# These filter from their parent state's dataset using place_fips -# Total: 333 places -US_PLACES: list[dict[str, str]] = [ - { - "fips": "03000", - "name": "Anchorage", - "state": "AK", - "state_name": "Alaska", - }, - { - "fips": "07000", - "name": "Birmingham", - "state": "AL", - "state_name": "Alabama", - }, - { - "fips": "37000", - "name": "Huntsville", - "state": "AL", - "state_name": "Alabama", - }, - { - "fips": "50000", - "name": "Mobile", - "state": "AL", - "state_name": "Alabama", - }, - { - "fips": "51000", - "name": "Montgomery", - "state": "AL", - "state_name": "Alabama", - }, - { - "fips": "77256", - "name": "Tuscaloosa", - "state": "AL", - "state_name": "Alabama", - }, - { - "fips": "23290", - "name": "Fayetteville", - "state": "AR", - "state_name": "Arkansas", - }, - { - "fips": "41000", - "name": "Little Rock", - "state": "AR", - "state_name": "Arkansas", - }, - { - "fips": "07940", - "name": "Buckeye", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "12000", - "name": "Chandler", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "27400", - "name": "Gilbert", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "27820", - "name": "Glendale", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "28380", - "name": "Goodyear", - "state": "AZ", - "state_name": "Arizona", - }, - {"fips": "46000", "name": "Mesa", "state": "AZ", "state_name": "Arizona"}, - { - "fips": "54050", - "name": "Peoria", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "55000", - "name": "Phoenix", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "65000", - "name": "Scottsdale", - "state": "AZ", - "state_name": "Arizona", - }, - { - "fips": "71510", - 
"name": "Surprise", - "state": "AZ", - "state_name": "Arizona", - }, - {"fips": "73000", "name": "Tempe", "state": "AZ", "state_name": "Arizona"}, - { - "fips": "77000", - "name": "Tucson", - "state": "AZ", - "state_name": "Arizona", - }, - {"fips": "85540", "name": "Yuma", "state": "AZ", "state_name": "Arizona"}, - { - "fips": "02000", - "name": "Anaheim", - "state": "CA", - "state_name": "California", - }, - { - "fips": "02252", - "name": "Antioch", - "state": "CA", - "state_name": "California", - }, - { - "fips": "03526", - "name": "Bakersfield", - "state": "CA", - "state_name": "California", - }, - { - "fips": "06000", - "name": "Berkeley", - "state": "CA", - "state_name": "California", - }, - { - "fips": "08954", - "name": "Burbank", - "state": "CA", - "state_name": "California", - }, - { - "fips": "11194", - "name": "Carlsbad", - "state": "CA", - "state_name": "California", - }, - { - "fips": "13014", - "name": "Chico", - "state": "CA", - "state_name": "California", - }, - { - "fips": "13392", - "name": "Chula Vista", - "state": "CA", - "state_name": "California", - }, - { - "fips": "14218", - "name": "Clovis", - "state": "CA", - "state_name": "California", - }, - { - "fips": "16000", - "name": "Concord", - "state": "CA", - "state_name": "California", - }, - { - "fips": "16350", - "name": "Corona", - "state": "CA", - "state_name": "California", - }, - { - "fips": "16532", - "name": "Costa Mesa", - "state": "CA", - "state_name": "California", - }, - { - "fips": "19766", - "name": "Downey", - "state": "CA", - "state_name": "California", - }, - { - "fips": "21712", - "name": "El Cajon", - "state": "CA", - "state_name": "California", - }, - { - "fips": "22230", - "name": "El Monte", - "state": "CA", - "state_name": "California", - }, - { - "fips": "22020", - "name": "Elk Grove", - "state": "CA", - "state_name": "California", - }, - { - "fips": "22804", - "name": "Escondido", - "state": "CA", - "state_name": "California", - }, - { - "fips": "23182", - "name": 
"Fairfield", - "state": "CA", - "state_name": "California", - }, - { - "fips": "24680", - "name": "Fontana", - "state": "CA", - "state_name": "California", - }, - { - "fips": "26000", - "name": "Fremont", - "state": "CA", - "state_name": "California", - }, - { - "fips": "27000", - "name": "Fresno", - "state": "CA", - "state_name": "California", - }, - { - "fips": "28000", - "name": "Fullerton", - "state": "CA", - "state_name": "California", - }, - { - "fips": "29000", - "name": "Garden Grove", - "state": "CA", - "state_name": "California", - }, - { - "fips": "30000", - "name": "Glendale", - "state": "CA", - "state_name": "California", - }, - { - "fips": "33000", - "name": "Hayward", - "state": "CA", - "state_name": "California", - }, - { - "fips": "33434", - "name": "Hesperia", - "state": "CA", - "state_name": "California", - }, - { - "fips": "36000", - "name": "Huntington Beach", - "state": "CA", - "state_name": "California", - }, - { - "fips": "36546", - "name": "Inglewood", - "state": "CA", - "state_name": "California", - }, - { - "fips": "36770", - "name": "Irvine", - "state": "CA", - "state_name": "California", - }, - { - "fips": "37692", - "name": "Jurupa Valley", - "state": "CA", - "state_name": "California", - }, - { - "fips": "40130", - "name": "Lancaster", - "state": "CA", - "state_name": "California", - }, - { - "fips": "43000", - "name": "Long Beach", - "state": "CA", - "state_name": "California", - }, - { - "fips": "44000", - "name": "Los Angeles", - "state": "CA", - "state_name": "California", - }, - { - "fips": "46842", - "name": "Menifee", - "state": "CA", - "state_name": "California", - }, - { - "fips": "48354", - "name": "Modesto", - "state": "CA", - "state_name": "California", - }, - { - "fips": "49270", - "name": "Moreno Valley", - "state": "CA", - "state_name": "California", - }, - { - "fips": "50076", - "name": "Murrieta", - "state": "CA", - "state_name": "California", - }, - { - "fips": "53000", - "name": "Oakland", - "state": "CA", - 
"state_name": "California", - }, - { - "fips": "53322", - "name": "Oceanside", - "state": "CA", - "state_name": "California", - }, - { - "fips": "53896", - "name": "Ontario", - "state": "CA", - "state_name": "California", - }, - { - "fips": "53980", - "name": "Orange", - "state": "CA", - "state_name": "California", - }, - { - "fips": "54652", - "name": "Oxnard", - "state": "CA", - "state_name": "California", - }, - { - "fips": "55156", - "name": "Palmdale", - "state": "CA", - "state_name": "California", - }, - { - "fips": "56000", - "name": "Pasadena", - "state": "CA", - "state_name": "California", - }, - { - "fips": "58072", - "name": "Pomona", - "state": "CA", - "state_name": "California", - }, - { - "fips": "59451", - "name": "Rancho Cucamonga", - "state": "CA", - "state_name": "California", - }, - { - "fips": "60466", - "name": "Rialto", - "state": "CA", - "state_name": "California", - }, - { - "fips": "60620", - "name": "Richmond", - "state": "CA", - "state_name": "California", - }, - { - "fips": "62000", - "name": "Riverside", - "state": "CA", - "state_name": "California", - }, - { - "fips": "62938", - "name": "Roseville", - "state": "CA", - "state_name": "California", - }, - { - "fips": "64000", - "name": "Sacramento", - "state": "CA", - "state_name": "California", - }, - { - "fips": "64224", - "name": "Salinas", - "state": "CA", - "state_name": "California", - }, - { - "fips": "65000", - "name": "San Bernardino", - "state": "CA", - "state_name": "California", - }, - { - "fips": "66000", - "name": "San Diego", - "state": "CA", - "state_name": "California", - }, - { - "fips": "67000", - "name": "San Francisco", - "state": "CA", - "state_name": "California", - }, - { - "fips": "68000", - "name": "San Jose", - "state": "CA", - "state_name": "California", - }, - { - "fips": "68252", - "name": "San Mateo", - "state": "CA", - "state_name": "California", - }, - { - "fips": "69000", - "name": "Santa Ana", - "state": "CA", - "state_name": "California", - }, - { - 
"fips": "69084", - "name": "Santa Clara", - "state": "CA", - "state_name": "California", - }, - { - "fips": "69088", - "name": "Santa Clarita", - "state": "CA", - "state_name": "California", - }, - { - "fips": "69196", - "name": "Santa Maria", - "state": "CA", - "state_name": "California", - }, - { - "fips": "70098", - "name": "Santa Rosa", - "state": "CA", - "state_name": "California", - }, - { - "fips": "72016", - "name": "Simi Valley", - "state": "CA", - "state_name": "California", - }, - { - "fips": "75000", - "name": "Stockton", - "state": "CA", - "state_name": "California", - }, - { - "fips": "77000", - "name": "Sunnyvale", - "state": "CA", - "state_name": "California", - }, - { - "fips": "78120", - "name": "Temecula", - "state": "CA", - "state_name": "California", - }, - { - "fips": "78582", - "name": "Thousand Oaks", - "state": "CA", - "state_name": "California", - }, - { - "fips": "80000", - "name": "Torrance", - "state": "CA", - "state_name": "California", - }, - { - "fips": "81554", - "name": "Vacaville", - "state": "CA", - "state_name": "California", - }, - { - "fips": "81666", - "name": "Vallejo", - "state": "CA", - "state_name": "California", - }, - { - "fips": "65042", - "name": "Ventura", - "state": "CA", - "state_name": "California", - }, - { - "fips": "82590", - "name": "Victorville", - "state": "CA", - "state_name": "California", - }, - { - "fips": "82954", - "name": "Visalia", - "state": "CA", - "state_name": "California", - }, - { - "fips": "84200", - "name": "West Covina", - "state": "CA", - "state_name": "California", - }, - { - "fips": "03455", - "name": "Arvada", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "04000", - "name": "Aurora", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "07850", - "name": "Boulder", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "12815", - "name": "Centennial", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "16000", - "name": "Colorado 
Springs", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "20000", - "name": "Denver", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "27425", - "name": "Fort Collins", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "32155", - "name": "Greeley", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "43000", - "name": "Lakewood", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "62000", - "name": "Pueblo", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "77290", - "name": "Thornton", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "83835", - "name": "Westminster", - "state": "CO", - "state_name": "Colorado", - }, - { - "fips": "08000", - "name": "Bridgeport", - "state": "CT", - "state_name": "Connecticut", - }, - { - "fips": "37000", - "name": "Hartford", - "state": "CT", - "state_name": "Connecticut", - }, - { - "fips": "52000", - "name": "New Haven", - "state": "CT", - "state_name": "Connecticut", - }, - { - "fips": "73000", - "name": "Stamford", - "state": "CT", - "state_name": "Connecticut", - }, - { - "fips": "80000", - "name": "Waterbury", - "state": "CT", - "state_name": "Connecticut", - }, - { - "fips": "50000", - "name": "Washington", - "state": "DC", - "state_name": "District of Columbia", - }, - { - "fips": "10275", - "name": "Cape Coral", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "12875", - "name": "Clearwater", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "14400", - "name": "Coral Springs", - "state": "FL", - "state_name": "Florida", - }, - {"fips": "16475", "name": "Davie", "state": "FL", "state_name": "Florida"}, - { - "fips": "24000", - "name": "Fort Lauderdale", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "25175", - "name": "Gainesville", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "30000", - "name": "Hialeah", - "state": "FL", - "state_name": "Florida", - }, - { - 
"fips": "32000", - "name": "Hollywood", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "35000", - "name": "Jacksonville", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "38250", - "name": "Lakeland", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "45060", - "name": "Miami Gardens", - "state": "FL", - "state_name": "Florida", - }, - {"fips": "45000", "name": "Miami", "state": "FL", "state_name": "Florida"}, - { - "fips": "45975", - "name": "Miramar", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "53000", - "name": "Orlando", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "54000", - "name": "Palm Bay", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "54200", - "name": "Palm Coast", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "55775", - "name": "Pembroke Pines", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "58050", - "name": "Pompano Beach", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "58715", - "name": "Port St. Lucie", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "63000", - "name": "St. 
Petersburg", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "70600", - "name": "Tallahassee", - "state": "FL", - "state_name": "Florida", - }, - {"fips": "71000", "name": "Tampa", "state": "FL", "state_name": "Florida"}, - { - "fips": "76600", - "name": "West Palm Beach", - "state": "FL", - "state_name": "Florida", - }, - { - "fips": "03440", - "name": "Athens-Clarke County", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "04000", - "name": "Atlanta", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "04204", - "name": "Augusta-Richmond County", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "19000", - "name": "Columbus", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "49008", - "name": "Macon-Bibb County", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "68516", - "name": "Sandy Springs", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "69000", - "name": "Savannah", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "72122", - "name": "South Fulton", - "state": "GA", - "state_name": "Georgia", - }, - { - "fips": "71550", - "name": "Urban Honolulu", - "state": "HI", - "state_name": "Hawaii", - }, - { - "fips": "12000", - "name": "Cedar Rapids", - "state": "IA", - "state_name": "Iowa", - }, - { - "fips": "19000", - "name": "Davenport", - "state": "IA", - "state_name": "Iowa", - }, - { - "fips": "21000", - "name": "Des Moines", - "state": "IA", - "state_name": "Iowa", - }, - { - "fips": "08830", - "name": "Boise City", - "state": "ID", - "state_name": "Idaho", - }, - { - "fips": "52120", - "name": "Meridian", - "state": "ID", - "state_name": "Idaho", - }, - {"fips": "56260", "name": "Nampa", "state": "ID", "state_name": "Idaho"}, - { - "fips": "03012", - "name": "Aurora", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "14000", - "name": "Chicago", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "23074", - "name": 
"Elgin", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "38570", - "name": "Joliet", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "51622", - "name": "Naperville", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "59000", - "name": "Peoria", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "65000", - "name": "Rockford", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "72000", - "name": "Springfield", - "state": "IL", - "state_name": "Illinois", - }, - { - "fips": "10342", - "name": "Carmel", - "state": "IN", - "state_name": "Indiana", - }, - { - "fips": "22000", - "name": "Evansville", - "state": "IN", - "state_name": "Indiana", - }, - { - "fips": "23278", - "name": "Fishers", - "state": "IN", - "state_name": "Indiana", - }, - { - "fips": "25000", - "name": "Fort Wayne", - "state": "IN", - "state_name": "Indiana", - }, - { - "fips": "36003", - "name": "Indianapolis", - "state": "IN", - "state_name": "Indiana", - }, - { - "fips": "71000", - "name": "South Bend", - "state": "IN", - "state_name": "Indiana", - }, - { - "fips": "36000", - "name": "Kansas City", - "state": "KS", - "state_name": "Kansas", - }, - {"fips": "52575", "name": "Olathe", "state": "KS", "state_name": "Kansas"}, - { - "fips": "53775", - "name": "Overland Park", - "state": "KS", - "state_name": "Kansas", - }, - {"fips": "71000", "name": "Topeka", "state": "KS", "state_name": "Kansas"}, - { - "fips": "79000", - "name": "Wichita", - "state": "KS", - "state_name": "Kansas", - }, - { - "fips": "46027", - "name": "Lexington-Fayette", - "state": "KY", - "state_name": "Kentucky", - }, - { - "fips": "48006", - "name": "Louisville/Jefferson County", - "state": "KY", - "state_name": "Kentucky", - }, - { - "fips": "05000", - "name": "Baton Rouge", - "state": "LA", - "state_name": "Louisiana", - }, - { - "fips": "40735", - "name": "Lafayette", - "state": "LA", - "state_name": "Louisiana", - }, - { - "fips": "55000", - "name": "New 
Orleans", - "state": "LA", - "state_name": "Louisiana", - }, - { - "fips": "70000", - "name": "Shreveport", - "state": "LA", - "state_name": "Louisiana", - }, - { - "fips": "07000", - "name": "Boston", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "09000", - "name": "Brockton", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "11000", - "name": "Cambridge", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "37000", - "name": "Lowell", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "37490", - "name": "Lynn", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "45000", - "name": "New Bedford", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "55745", - "name": "Quincy", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "67000", - "name": "Springfield", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "82000", - "name": "Worcester", - "state": "MA", - "state_name": "Massachusetts", - }, - { - "fips": "04000", - "name": "Baltimore", - "state": "MD", - "state_name": "Maryland", - }, - { - "fips": "03000", - "name": "Ann Arbor", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "21000", - "name": "Dearborn", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "22000", - "name": "Detroit", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "34000", - "name": "Grand Rapids", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "46000", - "name": "Lansing", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "76460", - "name": "Sterling Heights", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "84000", - "name": "Warren", - "state": "MI", - "state_name": "Michigan", - }, - { - "fips": "43000", - "name": "Minneapolis", - "state": "MN", - "state_name": "Minnesota", - }, - { - "fips": "54880", - "name": "Rochester", - "state": "MN", - 
"state_name": "Minnesota", - }, - { - "fips": "58000", - "name": "St. Paul", - "state": "MN", - "state_name": "Minnesota", - }, - { - "fips": "15670", - "name": "Columbia", - "state": "MO", - "state_name": "Missouri", - }, - { - "fips": "35000", - "name": "Independence", - "state": "MO", - "state_name": "Missouri", - }, - { - "fips": "38000", - "name": "Kansas City", - "state": "MO", - "state_name": "Missouri", - }, - { - "fips": "41348", - "name": "Lee's Summit", - "state": "MO", - "state_name": "Missouri", - }, - { - "fips": "70000", - "name": "Springfield", - "state": "MO", - "state_name": "Missouri", - }, - { - "fips": "65000", - "name": "St. Louis", - "state": "MO", - "state_name": "Missouri", - }, - { - "fips": "36000", - "name": "Jackson", - "state": "MS", - "state_name": "Mississippi", - }, - { - "fips": "06550", - "name": "Billings", - "state": "MT", - "state_name": "Montana", - }, - { - "fips": "10740", - "name": "Cary", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "12000", - "name": "Charlotte", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "14100", - "name": "Concord", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "19000", - "name": "Durham", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "22920", - "name": "Fayetteville", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "28000", - "name": "Greensboro", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "31400", - "name": "High Point", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "55000", - "name": "Raleigh", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "74440", - "name": "Wilmington", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "75000", - "name": "Winston-Salem", - "state": "NC", - "state_name": "North Carolina", - }, - { - "fips": "25700", - "name": "Fargo", - "state": "ND", - "state_name": 
"North Dakota", - }, - { - "fips": "28000", - "name": "Lincoln", - "state": "NE", - "state_name": "Nebraska", - }, - { - "fips": "37000", - "name": "Omaha", - "state": "NE", - "state_name": "Nebraska", - }, - { - "fips": "45140", - "name": "Manchester", - "state": "NH", - "state_name": "New Hampshire", - }, - { - "fips": "21000", - "name": "Elizabeth", - "state": "NJ", - "state_name": "New Jersey", - }, - { - "fips": "36000", - "name": "Jersey City", - "state": "NJ", - "state_name": "New Jersey", - }, - { - "fips": "51000", - "name": "Newark", - "state": "NJ", - "state_name": "New Jersey", - }, - { - "fips": "57000", - "name": "Paterson", - "state": "NJ", - "state_name": "New Jersey", - }, - { - "fips": "02000", - "name": "Albuquerque", - "state": "NM", - "state_name": "New Mexico", - }, - { - "fips": "39380", - "name": "Las Cruces", - "state": "NM", - "state_name": "New Mexico", - }, - { - "fips": "63460", - "name": "Rio Rancho", - "state": "NM", - "state_name": "New Mexico", - }, - { - "fips": "31900", - "name": "Henderson", - "state": "NV", - "state_name": "Nevada", - }, - { - "fips": "40000", - "name": "Las Vegas", - "state": "NV", - "state_name": "Nevada", - }, - { - "fips": "51800", - "name": "North Las Vegas", - "state": "NV", - "state_name": "Nevada", - }, - {"fips": "60600", "name": "Reno", "state": "NV", "state_name": "Nevada"}, - {"fips": "68400", "name": "Sparks", "state": "NV", "state_name": "Nevada"}, - { - "fips": "01000", - "name": "Albany", - "state": "NY", - "state_name": "New York", - }, - { - "fips": "11000", - "name": "Buffalo", - "state": "NY", - "state_name": "New York", - }, - { - "fips": "51000", - "name": "New York City", - "state": "NY", - "state_name": "New York", - }, - { - "fips": "63000", - "name": "Rochester", - "state": "NY", - "state_name": "New York", - }, - { - "fips": "73000", - "name": "Syracuse", - "state": "NY", - "state_name": "New York", - }, - { - "fips": "84000", - "name": "Yonkers", - "state": "NY", - "state_name": "New 
York", - }, - {"fips": "01000", "name": "Akron", "state": "OH", "state_name": "Ohio"}, - { - "fips": "15000", - "name": "Cincinnati", - "state": "OH", - "state_name": "Ohio", - }, - { - "fips": "16000", - "name": "Cleveland", - "state": "OH", - "state_name": "Ohio", - }, - {"fips": "18000", "name": "Columbus", "state": "OH", "state_name": "Ohio"}, - {"fips": "21000", "name": "Dayton", "state": "OH", "state_name": "Ohio"}, - {"fips": "77000", "name": "Toledo", "state": "OH", "state_name": "Ohio"}, - { - "fips": "09050", - "name": "Broken Arrow", - "state": "OK", - "state_name": "Oklahoma", - }, - { - "fips": "52500", - "name": "Norman", - "state": "OK", - "state_name": "Oklahoma", - }, - { - "fips": "55000", - "name": "Oklahoma City", - "state": "OK", - "state_name": "Oklahoma", - }, - { - "fips": "75000", - "name": "Tulsa", - "state": "OK", - "state_name": "Oklahoma", - }, - {"fips": "05800", "name": "Bend", "state": "OR", "state_name": "Oregon"}, - {"fips": "23850", "name": "Eugene", "state": "OR", "state_name": "Oregon"}, - { - "fips": "31250", - "name": "Gresham", - "state": "OR", - "state_name": "Oregon", - }, - { - "fips": "34100", - "name": "Hillsboro", - "state": "OR", - "state_name": "Oregon", - }, - { - "fips": "59000", - "name": "Portland", - "state": "OR", - "state_name": "Oregon", - }, - {"fips": "64900", "name": "Salem", "state": "OR", "state_name": "Oregon"}, - { - "fips": "02000", - "name": "Allentown", - "state": "PA", - "state_name": "Pennsylvania", - }, - { - "fips": "60000", - "name": "Philadelphia", - "state": "PA", - "state_name": "Pennsylvania", - }, - { - "fips": "61000", - "name": "Pittsburgh", - "state": "PA", - "state_name": "Pennsylvania", - }, - { - "fips": "59000", - "name": "Providence", - "state": "RI", - "state_name": "Rhode Island", - }, - { - "fips": "13330", - "name": "Charleston", - "state": "SC", - "state_name": "South Carolina", - }, - { - "fips": "16000", - "name": "Columbia", - "state": "SC", - "state_name": "South Carolina", 
- }, - { - "fips": "50875", - "name": "North Charleston", - "state": "SC", - "state_name": "South Carolina", - }, - { - "fips": "59020", - "name": "Sioux Falls", - "state": "SD", - "state_name": "South Dakota", - }, - { - "fips": "14000", - "name": "Chattanooga", - "state": "TN", - "state_name": "Tennessee", - }, - { - "fips": "15160", - "name": "Clarksville", - "state": "TN", - "state_name": "Tennessee", - }, - { - "fips": "40000", - "name": "Knoxville", - "state": "TN", - "state_name": "Tennessee", - }, - { - "fips": "48000", - "name": "Memphis", - "state": "TN", - "state_name": "Tennessee", - }, - { - "fips": "51560", - "name": "Murfreesboro", - "state": "TN", - "state_name": "Tennessee", - }, - # Extracted 332 places - { - "fips": "52006", - "name": "Nashville-Davidson", - "state": "TN", - "state_name": "Tennessee", - }, - {"fips": "01000", "name": "Abilene", "state": "TX", "state_name": "Texas"}, - {"fips": "01924", "name": "Allen", "state": "TX", "state_name": "Texas"}, - { - "fips": "03000", - "name": "Amarillo", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "04000", - "name": "Arlington", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "05000", "name": "Austin", "state": "TX", "state_name": "Texas"}, - { - "fips": "07000", - "name": "Beaumont", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "10768", - "name": "Brownsville", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "13024", - "name": "Carrollton", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "15976", - "name": "College Station", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "16432", "name": "Conroe", "state": "TX", "state_name": "Texas"}, - { - "fips": "17000", - "name": "Corpus Christi", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "19000", "name": "Dallas", "state": "TX", "state_name": "Texas"}, - {"fips": "19972", "name": "Denton", "state": "TX", "state_name": "Texas"}, - { - "fips": "22660", - "name": 
"Edinburg", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "24000", "name": "El Paso", "state": "TX", "state_name": "Texas"}, - { - "fips": "27000", - "name": "Fort Worth", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "27684", "name": "Frisco", "state": "TX", "state_name": "Texas"}, - {"fips": "29000", "name": "Garland", "state": "TX", "state_name": "Texas"}, - { - "fips": "30464", - "name": "Grand Prairie", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "35000", "name": "Houston", "state": "TX", "state_name": "Texas"}, - {"fips": "37000", "name": "Irving", "state": "TX", "state_name": "Texas"}, - {"fips": "39148", "name": "Killeen", "state": "TX", "state_name": "Texas"}, - {"fips": "41464", "name": "Laredo", "state": "TX", "state_name": "Texas"}, - { - "fips": "41980", - "name": "League City", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "42508", - "name": "Lewisville", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "45000", "name": "Lubbock", "state": "TX", "state_name": "Texas"}, - {"fips": "45384", "name": "McAllen", "state": "TX", "state_name": "Texas"}, - { - "fips": "45744", - "name": "McKinney", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "47892", - "name": "Mesquite", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "48072", "name": "Midland", "state": "TX", "state_name": "Texas"}, - { - "fips": "50820", - "name": "New Braunfels", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "53388", "name": "Odessa", "state": "TX", "state_name": "Texas"}, - { - "fips": "56000", - "name": "Pasadena", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "56348", - "name": "Pearland", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "58016", "name": "Plano", "state": "TX", "state_name": "Texas"}, - { - "fips": "61796", - "name": "Richardson", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "63500", - "name": "Round Rock", - "state": "TX", - 
"state_name": "Texas", - }, - { - "fips": "65000", - "name": "San Antonio", - "state": "TX", - "state_name": "Texas", - }, - { - "fips": "70808", - "name": "Sugar Land", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "74144", "name": "Tyler", "state": "TX", "state_name": "Texas"}, - {"fips": "76000", "name": "Waco", "state": "TX", "state_name": "Texas"}, - { - "fips": "79000", - "name": "Wichita Falls", - "state": "TX", - "state_name": "Texas", - }, - {"fips": "62470", "name": "Provo", "state": "UT", "state_name": "Utah"}, - { - "fips": "67000", - "name": "Salt Lake City", - "state": "UT", - "state_name": "Utah", - }, - { - "fips": "65330", - "name": "St. George", - "state": "UT", - "state_name": "Utah", - }, - { - "fips": "82950", - "name": "West Jordan", - "state": "UT", - "state_name": "Utah", - }, - { - "fips": "83470", - "name": "West Valley City", - "state": "UT", - "state_name": "Utah", - }, - { - "fips": "01000", - "name": "Alexandria", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "16000", - "name": "Chesapeake", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "35000", - "name": "Hampton", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "56000", - "name": "Newport News", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "57000", - "name": "Norfolk", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "67000", - "name": "Richmond", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "76432", - "name": "Suffolk", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "82000", - "name": "Virginia Beach", - "state": "VA", - "state_name": "Virginia", - }, - { - "fips": "05210", - "name": "Bellevue", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "22640", - "name": "Everett", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "35415", - "name": "Kent", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": 
"57745", - "name": "Renton", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "63000", - "name": "Seattle", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "67167", - "name": "Spokane Valley", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "67000", - "name": "Spokane", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "70000", - "name": "Tacoma", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "74060", - "name": "Vancouver", - "state": "WA", - "state_name": "Washington", - }, - { - "fips": "31000", - "name": "Green Bay", - "state": "WI", - "state_name": "Wisconsin", - }, - { - "fips": "48000", - "name": "Madison", - "state": "WI", - "state_name": "Wisconsin", - }, - { - "fips": "53000", - "name": "Milwaukee", - "state": "WI", - "state_name": "Wisconsin", - }, -] diff --git a/build/lib/policyengine/countries/us/data/states.py b/build/lib/policyengine/countries/us/data/states.py deleted file mode 100644 index 1309201b..00000000 --- a/build/lib/policyengine/countries/us/data/states.py +++ /dev/null @@ -1,59 +0,0 @@ -"""US state definitions. - -All 50 states plus District of Columbia. 
-""" - -# All US states and territories with their full names -US_STATES: dict[str, str] = { - "AL": "Alabama", - "AK": "Alaska", - "AZ": "Arizona", - "AR": "Arkansas", - "CA": "California", - "CO": "Colorado", - "CT": "Connecticut", - "DE": "Delaware", - "DC": "District of Columbia", - "FL": "Florida", - "GA": "Georgia", - "HI": "Hawaii", - "ID": "Idaho", - "IL": "Illinois", - "IN": "Indiana", - "IA": "Iowa", - "KS": "Kansas", - "KY": "Kentucky", - "LA": "Louisiana", - "ME": "Maine", - "MD": "Maryland", - "MA": "Massachusetts", - "MI": "Michigan", - "MN": "Minnesota", - "MS": "Mississippi", - "MO": "Missouri", - "MT": "Montana", - "NE": "Nebraska", - "NV": "Nevada", - "NH": "New Hampshire", - "NJ": "New Jersey", - "NM": "New Mexico", - "NY": "New York", - "NC": "North Carolina", - "ND": "North Dakota", - "OH": "Ohio", - "OK": "Oklahoma", - "OR": "Oregon", - "PA": "Pennsylvania", - "RI": "Rhode Island", - "SC": "South Carolina", - "SD": "South Dakota", - "TN": "Tennessee", - "TX": "Texas", - "UT": "Utah", - "VT": "Vermont", - "VA": "Virginia", - "WA": "Washington", - "WV": "West Virginia", - "WI": "Wisconsin", - "WY": "Wyoming", -} diff --git a/build/lib/policyengine/countries/us/regions.py b/build/lib/policyengine/countries/us/regions.py deleted file mode 100644 index f335805f..00000000 --- a/build/lib/policyengine/countries/us/regions.py +++ /dev/null @@ -1,120 +0,0 @@ -"""US region registry builder. 
- -This module builds the complete US region registry from the data definitions -in the data/ subdirectory: -- data/states.py: State definitions -- data/districts.py: Congressional district counts -- data/places.py: Census places over 100K population -""" - -from policyengine.core.region import Region, RegionRegistry -from policyengine.core.release_manifest import resolve_region_dataset_path -from policyengine.core.scoping_strategy import RowFilterStrategy - -from .data import AT_LARGE_STATES, DISTRICT_COUNTS, US_PLACES, US_STATES - -US_DATA_BUCKET = "gs://policyengine-us-data" - - -def _ordinal(n: int) -> str: - """Return ordinal suffix for a number (1st, 2nd, 3rd, etc.).""" - if 11 <= n % 100 <= 13: - return f"{n}th" - return f"{n}" + {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th") - - -def build_us_region_registry() -> RegionRegistry: - """Build the complete US region registry. - - Returns: - RegionRegistry containing: - - 1 national region - - 51 state regions (50 states + DC) - - 436 congressional district regions (435 + DC delegate) - - 333 place/city regions (Census places over 100K population) - """ - regions: list[Region] = [] - - # 1. National region (has dedicated dataset) - regions.append( - Region( - code="us", - label="United States", - region_type="national", - dataset_path=resolve_region_dataset_path("us", "national"), - ) - ) - - # 2. State regions (each has dedicated dataset) - for abbrev, name in US_STATES.items(): - regions.append( - Region( - code=f"state/{abbrev.lower()}", - label=name, - region_type="state", - parent_code="us", - dataset_path=resolve_region_dataset_path( - "us", - "state", - state_code=abbrev, - ), - state_code=abbrev, - state_name=name, - ) - ) - - # 3. 
Congressional district regions (each has dedicated dataset) - for state_abbrev, count in DISTRICT_COUNTS.items(): - state_name = US_STATES[state_abbrev] - for i in range(1, count + 1): - district_code = f"{state_abbrev}-{i:02d}" - - # Create appropriate label - if state_abbrev in AT_LARGE_STATES: - label = f"{state_name}'s at-large congressional district" - else: - label = f"{state_name}'s {_ordinal(i)} congressional district" - - regions.append( - Region( - code=f"congressional_district/{district_code}", - label=label, - region_type="congressional_district", - parent_code=f"state/{state_abbrev.lower()}", - dataset_path=resolve_region_dataset_path( - "us", - "congressional_district", - district_code=district_code, - ), - state_code=state_abbrev, - state_name=state_name, - ) - ) - - # 4. Place/city regions (filter from state datasets) - for place in US_PLACES: - state_abbrev = place["state"] - fips = place["fips"] - regions.append( - Region( - code=f"place/{state_abbrev}-{fips}", - label=place["name"], - region_type="place", - parent_code=f"state/{state_abbrev.lower()}", - requires_filter=True, - filter_field="place_fips", - filter_value=fips, - state_code=state_abbrev, - state_name=place["state_name"], - scoping_strategy=RowFilterStrategy( - variable_name="place_fips", - variable_value=fips, - ), - ) - ) - - return RegionRegistry(country_id="us", regions=regions) - - -# Singleton instance for import -us_region_registry = build_us_region_registry() diff --git a/build/lib/policyengine/data/release_manifests/uk.json b/build/lib/policyengine/data/release_manifests/uk.json deleted file mode 100644 index 90cc1cc1..00000000 --- a/build/lib/policyengine/data/release_manifests/uk.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "schema_version": 1, - "bundle_id": "uk-3.4.0", - "country_id": "uk", - "policyengine_version": "3.4.0", - "model_package": { - "name": "policyengine-uk", - "version": "2.74.0" - }, - "data_package": { - "name": "policyengine-uk-data", - "version": "1.40.4", 
- "repo_id": "policyengine/policyengine-uk-data-private" - }, - "certified_data_artifact": { - "data_package": { - "name": "policyengine-uk-data", - "version": "1.40.4" - }, - "build_id": "policyengine-uk-data-1.40.4", - "dataset": "enhanced_frs_2023_24", - "uri": "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" - }, - "certification": { - "compatibility_basis": "exact_build_model_version", - "data_build_id": "policyengine-uk-data-1.40.4", - "built_with_model_version": "2.74.0", - "certified_for_model_version": "2.74.0", - "certified_by": "policyengine.py bundled manifest" - }, - "default_dataset": "enhanced_frs_2023_24", - "datasets": { - "frs_2023_24": { - "path": "frs_2023_24.h5" - }, - "enhanced_frs_2023_24": { - "path": "enhanced_frs_2023_24.h5" - } - }, - "region_datasets": { - "national": { - "path_template": "enhanced_frs_2023_24.h5" - } - } -} diff --git a/build/lib/policyengine/data/release_manifests/us.json b/build/lib/policyengine/data/release_manifests/us.json deleted file mode 100644 index 20526da9..00000000 --- a/build/lib/policyengine/data/release_manifests/us.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "schema_version": 1, - "bundle_id": "us-3.4.0", - "country_id": "us", - "policyengine_version": "3.4.0", - "model_package": { - "name": "policyengine-us", - "version": "1.602.0" - }, - "data_package": { - "name": "policyengine-us-data", - "version": "1.73.0", - "repo_id": "policyengine/policyengine-us-data" - }, - "certified_data_artifact": { - "data_package": { - "name": "policyengine-us-data", - "version": "1.73.0" - }, - "build_id": "policyengine-us-data-1.73.0", - "dataset": "enhanced_cps_2024", - "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.73.0" - }, - "certification": { - "compatibility_basis": "exact_build_model_version", - "data_build_id": "policyengine-us-data-1.73.0", - "built_with_model_version": "1.602.0", - "certified_for_model_version": "1.602.0", - "certified_by": "policyengine.py 
bundled manifest" - }, - "default_dataset": "enhanced_cps_2024", - "datasets": { - "enhanced_cps_2024": { - "path": "enhanced_cps_2024.h5" - } - }, - "region_datasets": { - "national": { - "path_template": "enhanced_cps_2024.h5" - }, - "state": { - "path_template": "states/{state_code}.h5" - }, - "congressional_district": { - "path_template": "districts/{district_code}.h5" - } - } -} diff --git a/build/lib/policyengine/outputs/__init__.py b/build/lib/policyengine/outputs/__init__.py deleted file mode 100644 index 61311f46..00000000 --- a/build/lib/policyengine/outputs/__init__.py +++ /dev/null @@ -1,91 +0,0 @@ -from policyengine.core import Output, OutputCollection -from policyengine.outputs.aggregate import Aggregate, AggregateType -from policyengine.outputs.change_aggregate import ( - ChangeAggregate, - ChangeAggregateType, -) -from policyengine.outputs.congressional_district_impact import ( - CongressionalDistrictImpact, - compute_us_congressional_district_impacts, -) -from policyengine.outputs.constituency_impact import ( - ConstituencyImpact, - compute_uk_constituency_impacts, -) -from policyengine.outputs.decile_impact import ( - DecileImpact, - calculate_decile_impacts, -) -from policyengine.outputs.inequality import ( - UK_INEQUALITY_INCOME_VARIABLE, - US_INEQUALITY_INCOME_VARIABLE, - Inequality, - USInequalityPreset, - calculate_uk_inequality, - calculate_us_inequality, -) -from policyengine.outputs.intra_decile_impact import ( - IntraDecileImpact, - compute_intra_decile_impacts, -) -from policyengine.outputs.local_authority_impact import ( - LocalAuthorityImpact, - compute_uk_local_authority_impacts, -) -from policyengine.outputs.poverty import ( - AGE_GROUPS, - GENDER_GROUPS, - RACE_GROUPS, - UK_POVERTY_VARIABLES, - US_POVERTY_VARIABLES, - Poverty, - UKPovertyType, - USPovertyType, - calculate_uk_poverty_by_age, - calculate_uk_poverty_by_gender, - calculate_uk_poverty_rates, - calculate_us_poverty_by_age, - calculate_us_poverty_by_gender, - 
calculate_us_poverty_by_race, - calculate_us_poverty_rates, -) - -__all__ = [ - "Output", - "OutputCollection", - "Aggregate", - "AggregateType", - "ChangeAggregate", - "ChangeAggregateType", - "DecileImpact", - "calculate_decile_impacts", - "IntraDecileImpact", - "compute_intra_decile_impacts", - "Poverty", - "UKPovertyType", - "USPovertyType", - "UK_POVERTY_VARIABLES", - "US_POVERTY_VARIABLES", - "calculate_uk_poverty_rates", - "calculate_us_poverty_rates", - "calculate_uk_poverty_by_age", - "calculate_us_poverty_by_age", - "calculate_uk_poverty_by_gender", - "calculate_us_poverty_by_gender", - "calculate_us_poverty_by_race", - "AGE_GROUPS", - "GENDER_GROUPS", - "RACE_GROUPS", - "Inequality", - "USInequalityPreset", - "UK_INEQUALITY_INCOME_VARIABLE", - "US_INEQUALITY_INCOME_VARIABLE", - "calculate_uk_inequality", - "calculate_us_inequality", - "CongressionalDistrictImpact", - "compute_us_congressional_district_impacts", - "ConstituencyImpact", - "compute_uk_constituency_impacts", - "LocalAuthorityImpact", - "compute_uk_local_authority_impacts", -] diff --git a/build/lib/policyengine/outputs/aggregate.py b/build/lib/policyengine/outputs/aggregate.py deleted file mode 100644 index d014b06c..00000000 --- a/build/lib/policyengine/outputs/aggregate.py +++ /dev/null @@ -1,112 +0,0 @@ -from enum import Enum -from typing import Any, Optional - -from policyengine.core import Output, Simulation - - -class AggregateType(str, Enum): - SUM = "sum" - MEAN = "mean" - COUNT = "count" - - -class Aggregate(Output): - simulation: Simulation - variable: str - aggregate_type: AggregateType - entity: Optional[str] = None - - filter_variable: Optional[str] = None - filter_variable_eq: Optional[Any] = None - filter_variable_leq: Optional[Any] = None - filter_variable_geq: Optional[Any] = None - filter_variable_describes_quantiles: bool = False - - # Convenient quantile specification (alternative to describes_quantiles) - quantile: Optional[int] = ( - None # Number of quantiles (e.g., 10 
for deciles, 5 for quintiles) - ) - quantile_eq: Optional[int] = None # Exact quantile (e.g., 3 for 3rd decile) - quantile_leq: Optional[int] = ( - None # Maximum quantile (e.g., 5 for bottom 5 deciles) - ) - quantile_geq: Optional[int] = None # Minimum quantile (e.g., 9 for top 2 deciles) - - result: Optional[Any] = None - - def run(self): - # Convert quantile specification to describes_quantiles format - if self.quantile is not None: - self.filter_variable_describes_quantiles = True - if self.quantile_eq is not None: - # For a specific quantile, filter between (quantile-1)/n and quantile/n - self.filter_variable_geq = (self.quantile_eq - 1) / self.quantile - self.filter_variable_leq = self.quantile_eq / self.quantile - elif self.quantile_leq is not None: - self.filter_variable_leq = self.quantile_leq / self.quantile - elif self.quantile_geq is not None: - self.filter_variable_geq = (self.quantile_geq - 1) / self.quantile - - # Get variable object - var_obj = next( - v - for v in self.simulation.tax_benefit_model_version.variables - if v.name == self.variable - ) - - # Get the target entity data - target_entity = self.entity or var_obj.entity - data = getattr(self.simulation.output_dataset.data, target_entity) - - # Map variable to target entity if needed - if var_obj.entity != target_entity: - mapped = self.simulation.output_dataset.data.map_to_entity( - var_obj.entity, target_entity, columns=[self.variable] - ) - series = mapped[self.variable] - else: - series = data[self.variable] - - # Apply filters - if self.filter_variable is not None: - filter_var_obj = next( - v - for v in self.simulation.tax_benefit_model_version.variables - if v.name == self.filter_variable - ) - - if filter_var_obj.entity != target_entity: - filter_mapped = self.simulation.output_dataset.data.map_to_entity( - filter_var_obj.entity, - target_entity, - columns=[self.filter_variable], - ) - filter_series = filter_mapped[self.filter_variable] - else: - filter_series = 
data[self.filter_variable] - - if self.filter_variable_describes_quantiles: - if self.filter_variable_eq is not None: - threshold = filter_series.quantile(self.filter_variable_eq) - series = series[filter_series <= threshold] - if self.filter_variable_leq is not None: - threshold = filter_series.quantile(self.filter_variable_leq) - series = series[filter_series <= threshold] - if self.filter_variable_geq is not None: - threshold = filter_series.quantile(self.filter_variable_geq) - series = series[filter_series >= threshold] - else: - if self.filter_variable_eq is not None: - series = series[filter_series == self.filter_variable_eq] - if self.filter_variable_leq is not None: - series = series[filter_series <= self.filter_variable_leq] - if self.filter_variable_geq is not None: - series = series[filter_series >= self.filter_variable_geq] - - # Aggregate - MicroSeries will automatically apply weights - if self.aggregate_type == AggregateType.SUM: - self.result = series.sum() - elif self.aggregate_type == AggregateType.MEAN: - self.result = series.mean() - elif self.aggregate_type == AggregateType.COUNT: - self.result = series.count() diff --git a/build/lib/policyengine/outputs/change_aggregate.py b/build/lib/policyengine/outputs/change_aggregate.py deleted file mode 100644 index 87d2e0d9..00000000 --- a/build/lib/policyengine/outputs/change_aggregate.py +++ /dev/null @@ -1,170 +0,0 @@ -from enum import Enum -from typing import Any, Optional - -from policyengine.core import Output, Simulation - - -class ChangeAggregateType(str, Enum): - COUNT = "count" - SUM = "sum" - MEAN = "mean" - - -class ChangeAggregate(Output): - baseline_simulation: Simulation - reform_simulation: Simulation - variable: str - aggregate_type: ChangeAggregateType - entity: Optional[str] = None - - # Filter by absolute change - change_geq: Optional[float] = None # Change >= value (e.g., gain >= 500) - change_leq: Optional[float] = None # Change <= value (e.g., loss <= -500) - change_eq: 
Optional[float] = None # Change == value - - # Filter by relative change (as decimal, e.g., 0.05 = 5%) - relative_change_geq: Optional[float] = None # Relative change >= value - relative_change_leq: Optional[float] = None # Relative change <= value - relative_change_eq: Optional[float] = None # Relative change == value - - # Filter by another variable (e.g., only count people with age >= 30) - filter_variable: Optional[str] = None - filter_variable_eq: Optional[Any] = None - filter_variable_leq: Optional[Any] = None - filter_variable_geq: Optional[Any] = None - filter_variable_describes_quantiles: bool = False - - # Convenient quantile specification (alternative to describes_quantiles) - quantile: Optional[int] = ( - None # Number of quantiles (e.g., 10 for deciles, 5 for quintiles) - ) - quantile_eq: Optional[int] = None # Exact quantile (e.g., 3 for 3rd decile) - quantile_leq: Optional[int] = ( - None # Maximum quantile (e.g., 5 for bottom 5 deciles) - ) - quantile_geq: Optional[int] = None # Minimum quantile (e.g., 9 for top 2 deciles) - - result: Optional[Any] = None - - def run(self): - # Convert quantile specification to describes_quantiles format - if self.quantile is not None: - self.filter_variable_describes_quantiles = True - if self.quantile_eq is not None: - # For a specific quantile, filter between (quantile-1)/n and quantile/n - self.filter_variable_geq = (self.quantile_eq - 1) / self.quantile - self.filter_variable_leq = self.quantile_eq / self.quantile - elif self.quantile_leq is not None: - self.filter_variable_leq = self.quantile_leq / self.quantile - elif self.quantile_geq is not None: - self.filter_variable_geq = (self.quantile_geq - 1) / self.quantile - - # Get variable object - var_obj = next( - v - for v in self.baseline_simulation.tax_benefit_model_version.variables - if v.name == self.variable - ) - - # Get the target entity data - target_entity = self.entity or var_obj.entity - baseline_data = getattr( - 
self.baseline_simulation.output_dataset.data, target_entity - ) - reform_data = getattr(self.reform_simulation.output_dataset.data, target_entity) - - # Map variable to target entity if needed - if var_obj.entity != target_entity: - baseline_mapped = ( - self.baseline_simulation.output_dataset.data.map_to_entity( - var_obj.entity, target_entity - ) - ) - baseline_series = baseline_mapped[self.variable] - - reform_mapped = self.reform_simulation.output_dataset.data.map_to_entity( - var_obj.entity, target_entity - ) - reform_series = reform_mapped[self.variable] - else: - baseline_series = baseline_data[self.variable] - reform_series = reform_data[self.variable] - - # Calculate change (reform - baseline) - change_series = reform_series - baseline_series - - # Calculate relative change (handling division by zero) - # Where baseline is 0, relative change is undefined; we'll mask these out if relative filters are used - import numpy as np - - with np.errstate(divide="ignore", invalid="ignore"): - relative_change_series = change_series / baseline_series - relative_change_series = relative_change_series.replace( - [np.inf, -np.inf], np.nan - ) - - # Start with all rows - mask = baseline_series.notna() - - # Apply absolute change filters - if self.change_eq is not None: - mask &= change_series == self.change_eq - if self.change_leq is not None: - mask &= change_series <= self.change_leq - if self.change_geq is not None: - mask &= change_series >= self.change_geq - - # Apply relative change filters - if self.relative_change_eq is not None: - mask &= relative_change_series == self.relative_change_eq - if self.relative_change_leq is not None: - mask &= relative_change_series <= self.relative_change_leq - if self.relative_change_geq is not None: - mask &= relative_change_series >= self.relative_change_geq - - # Apply filter_variable filters - if self.filter_variable is not None: - filter_var_obj = next( - v - for v in 
self.baseline_simulation.tax_benefit_model_version.variables - if v.name == self.filter_variable - ) - - if filter_var_obj.entity != target_entity: - filter_mapped = ( - self.baseline_simulation.output_dataset.data.map_to_entity( - filter_var_obj.entity, target_entity - ) - ) - filter_series = filter_mapped[self.filter_variable] - else: - filter_series = baseline_data[self.filter_variable] - - if self.filter_variable_describes_quantiles: - if self.filter_variable_eq is not None: - threshold = filter_series.quantile(self.filter_variable_eq) - mask &= filter_series <= threshold - if self.filter_variable_leq is not None: - threshold = filter_series.quantile(self.filter_variable_leq) - mask &= filter_series <= threshold - if self.filter_variable_geq is not None: - threshold = filter_series.quantile(self.filter_variable_geq) - mask &= filter_series >= threshold - else: - if self.filter_variable_eq is not None: - mask &= filter_series == self.filter_variable_eq - if self.filter_variable_leq is not None: - mask &= filter_series <= self.filter_variable_leq - if self.filter_variable_geq is not None: - mask &= filter_series >= self.filter_variable_geq - - # Apply mask to get filtered data - filtered_change = change_series[mask] - - # Aggregate - if self.aggregate_type == ChangeAggregateType.COUNT: - self.result = filtered_change.count() - elif self.aggregate_type == ChangeAggregateType.SUM: - self.result = filtered_change.sum() - elif self.aggregate_type == ChangeAggregateType.MEAN: - self.result = filtered_change.mean() diff --git a/build/lib/policyengine/outputs/congressional_district_impact.py b/build/lib/policyengine/outputs/congressional_district_impact.py deleted file mode 100644 index 4a1d0d90..00000000 --- a/build/lib/policyengine/outputs/congressional_district_impact.py +++ /dev/null @@ -1,131 +0,0 @@ -"""Congressional district impact output class for US policy reforms.""" - -from typing import TYPE_CHECKING, Optional - -import numpy as np -from pydantic import 
ConfigDict - -from policyengine.core import Output - -if TYPE_CHECKING: - from policyengine.core.simulation import Simulation - - -class CongressionalDistrictImpact(Output): - """Per-congressional-district income change from a policy reform. - - Groups households by congressional_district_geoid (integer SSDD format - where SS = state FIPS, DD = district number) and computes weighted - average and relative household income changes per district, plus the - district-level shares of people who are winners, losers, or unchanged. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: "Simulation" - reform_simulation: "Simulation" - - # Results populated by run() - district_results: Optional[list[dict]] = None - - def run(self) -> None: - """Group households by geoid and compute per-district metrics.""" - baseline_hh = self.baseline_simulation.output_dataset.data.household - reform_hh = self.reform_simulation.output_dataset.data.household - - geoids = baseline_hh["congressional_district_geoid"].values - baseline_income = baseline_hh["household_net_income"].values - reform_income = reform_hh["household_net_income"].values - weights = baseline_hh["household_weight"].values - household_count_people = ( - baseline_hh["household_count_people"].values - if "household_count_people" in baseline_hh.columns - else np.ones_like(weights) - ) - - # Only include valid geoids (positive integers) - unique_geoids = np.unique(geoids[geoids > 0]) - - results: list[dict] = [] - for geoid in unique_geoids: - mask = geoids == geoid - w = weights[mask] - total_weight = float(w.sum()) - if total_weight == 0: - continue - - b_inc = baseline_income[mask] - r_inc = reform_income[mask] - people_weights = household_count_people[mask] * w - - weighted_baseline = float((b_inc * w).sum()) - weighted_reform = float((r_inc * w).sum()) - - avg_change = (weighted_reform - weighted_baseline) / total_weight - rel_change = ( - (weighted_reform / weighted_baseline - 1.0) - if 
weighted_baseline != 0 - else 0.0 - ) - capped_baseline = np.maximum(b_inc, 1.0) - income_change = (r_inc - b_inc) / capped_baseline - people_total = float(people_weights.sum()) - - if people_total == 0: - winner_percentage = 0.0 - loser_percentage = 0.0 - no_change_percentage = 1.0 - else: - winner_percentage = float( - people_weights[income_change > 1e-3].sum() / people_total - ) - loser_percentage = float( - people_weights[income_change <= -1e-3].sum() / people_total - ) - no_change_percentage = float( - people_weights[ - (income_change > -1e-3) & (income_change <= 1e-3) - ].sum() - / people_total - ) - - geoid_int = int(geoid) - state_fips = geoid_int // 100 - district_number = geoid_int % 100 - - results.append( - { - "district_geoid": geoid_int, - "state_fips": state_fips, - "district_number": district_number, - "average_household_income_change": float(avg_change), - "relative_household_income_change": float(rel_change), - "winner_percentage": winner_percentage, - "loser_percentage": loser_percentage, - "no_change_percentage": no_change_percentage, - "population": total_weight, - } - ) - - self.district_results = results - - -def compute_us_congressional_district_impacts( - baseline_simulation: "Simulation", - reform_simulation: "Simulation", -) -> CongressionalDistrictImpact: - """Compute per-congressional-district income changes. - - Args: - baseline_simulation: Completed baseline simulation. - reform_simulation: Completed reform simulation. - - Returns: - CongressionalDistrictImpact with district_results populated. 
- """ - impact = CongressionalDistrictImpact.model_construct( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - ) - impact.run() - return impact diff --git a/build/lib/policyengine/outputs/constituency_impact.py b/build/lib/policyengine/outputs/constituency_impact.py deleted file mode 100644 index 60f76e0b..00000000 --- a/build/lib/policyengine/outputs/constituency_impact.py +++ /dev/null @@ -1,126 +0,0 @@ -"""UK parliamentary constituency impact output class. - -Computes per-constituency income changes using pre-computed weight matrices. -Each constituency has a row in the weight matrix (shape: 650 x N_households) -that reweights all households to represent that constituency's demographics. -""" - -from typing import TYPE_CHECKING, Optional - -import h5py -import numpy as np -import pandas as pd -from pydantic import ConfigDict - -from policyengine.core import Output - -if TYPE_CHECKING: - from policyengine.core.simulation import Simulation - - -class ConstituencyImpact(Output): - """Per-parliamentary-constituency income change from a UK policy reform. - - Uses pre-computed weight matrices from GCS to reweight households - for each of 650 constituencies, then computes weighted average and - relative household income changes. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: "Simulation" - reform_simulation: "Simulation" - weight_matrix_path: str - constituency_csv_path: str - year: str = "2025" - - # Results populated by run() - constituency_results: Optional[list[dict]] = None - - def run(self) -> None: - """Load weight matrix and compute per-constituency metrics.""" - # Load constituency metadata (code, name, x, y) - constituency_df = pd.read_csv(self.constituency_csv_path) - - # Load weight matrix: shape (N_constituencies, N_households) - with h5py.File(self.weight_matrix_path, "r") as f: - weight_matrix = f[self.year][...] 
- - # Get household income arrays from output datasets - baseline_hh = self.baseline_simulation.output_dataset.data.household - reform_hh = self.reform_simulation.output_dataset.data.household - - baseline_income = baseline_hh["household_net_income"].values - reform_income = reform_hh["household_net_income"].values - - results: list[dict] = [] - for i in range(len(constituency_df)): - row = constituency_df.iloc[i] - code = str(row["code"]) - name = str(row["name"]) - x = int(row["x"]) - y = int(row["y"]) - w = weight_matrix[i] - - total_weight = float(np.sum(w)) - if total_weight == 0: - continue - - weighted_baseline = float(np.sum(baseline_income * w)) - weighted_reform = float(np.sum(reform_income * w)) - - # Count of weighted households - count = float(np.sum(w > 0)) - if count == 0: - continue - - avg_change = (weighted_reform - weighted_baseline) / total_weight - rel_change = ( - (weighted_reform / weighted_baseline - 1.0) - if weighted_baseline != 0 - else 0.0 - ) - - results.append( - { - "constituency_code": code, - "constituency_name": name, - "x": x, - "y": y, - "average_household_income_change": float(avg_change), - "relative_household_income_change": float(rel_change), - "population": total_weight, - } - ) - - self.constituency_results = results - - -def compute_uk_constituency_impacts( - baseline_simulation: "Simulation", - reform_simulation: "Simulation", - weight_matrix_path: str, - constituency_csv_path: str, - year: str = "2025", -) -> ConstituencyImpact: - """Compute per-constituency income changes for UK. - - Args: - baseline_simulation: Completed baseline simulation. - reform_simulation: Completed reform simulation. - weight_matrix_path: Path to parliamentary_constituency_weights.h5. - constituency_csv_path: Path to constituencies_2024.csv. - year: Year key in the H5 file (default "2025"). - - Returns: - ConstituencyImpact with constituency_results populated. 
- """ - impact = ConstituencyImpact.model_construct( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - weight_matrix_path=weight_matrix_path, - constituency_csv_path=constituency_csv_path, - year=year, - ) - impact.run() - return impact diff --git a/build/lib/policyengine/outputs/decile_impact.py b/build/lib/policyengine/outputs/decile_impact.py deleted file mode 100644 index b0f2306e..00000000 --- a/build/lib/policyengine/outputs/decile_impact.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import Optional - -import pandas as pd -from pydantic import ConfigDict - -from policyengine.core import Output, OutputCollection, Simulation -from policyengine.core.dataset import Dataset -from policyengine.core.dynamic import Dynamic -from policyengine.core.policy import Policy -from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion - - -class DecileImpact(Output): - """Single decile's impact from a policy reform - represents one database row.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: Simulation - reform_simulation: Simulation - income_variable: str = "equiv_hbai_household_net_income" - decile_variable: Optional[str] = None # If set, use pre-computed grouping variable - entity: Optional[str] = None - decile: int - quantiles: int = 10 - - # Results populated by run() - baseline_mean: Optional[float] = None - reform_mean: Optional[float] = None - absolute_change: Optional[float] = None - relative_change: Optional[float] = None - count_better_off: Optional[float] = None - count_worse_off: Optional[float] = None - count_no_change: Optional[float] = None - - def run(self): - """Calculate impact for this specific decile.""" - # Get variable object to determine entity - var_obj = next( - v - for v in self.baseline_simulation.tax_benefit_model_version.variables - if v.name == self.income_variable - ) - - # Get target entity - target_entity = self.entity or var_obj.entity - - # Get 
data from both simulations - baseline_data = getattr( - self.baseline_simulation.output_dataset.data, target_entity - ) - reform_data = getattr(self.reform_simulation.output_dataset.data, target_entity) - - # Map income variable to target entity if needed - if var_obj.entity != target_entity: - baseline_mapped = ( - self.baseline_simulation.output_dataset.data.map_to_entity( - var_obj.entity, target_entity - ) - ) - baseline_income = baseline_mapped[self.income_variable] - - reform_mapped = self.reform_simulation.output_dataset.data.map_to_entity( - var_obj.entity, target_entity - ) - reform_income = reform_mapped[self.income_variable] - else: - baseline_income = baseline_data[self.income_variable] - reform_income = reform_data[self.income_variable] - - # Calculate deciles: use pre-computed variable or qcut - if self.decile_variable: - decile_series = baseline_data[self.decile_variable] - else: - decile_series = ( - pd.qcut( - baseline_income, - self.quantiles, - labels=False, - duplicates="drop", - ) - + 1 - ) - - # Calculate changes - absolute_change = reform_income - baseline_income - relative_change = (absolute_change / baseline_income) * 100 - - # Filter to this decile - mask = decile_series == self.decile - - # Populate results - self.baseline_mean = float(baseline_income[mask].mean()) - self.reform_mean = float(reform_income[mask].mean()) - self.absolute_change = float(absolute_change[mask].mean()) - self.relative_change = float(relative_change[mask].mean()) - self.count_better_off = float((absolute_change[mask] > 0).sum()) - self.count_worse_off = float((absolute_change[mask] < 0).sum()) - self.count_no_change = float((absolute_change[mask] == 0).sum()) - - -def calculate_decile_impacts( - dataset: Optional[Dataset] = None, - tax_benefit_model_version: Optional[TaxBenefitModelVersion] = None, - baseline_policy: Optional[Policy] = None, - reform_policy: Optional[Policy] = None, - dynamic: Optional[Dynamic] = None, - income_variable: str = 
"equiv_hbai_household_net_income", - entity: Optional[str] = None, - quantiles: int = 10, - baseline_simulation: Optional[Simulation] = None, - reform_simulation: Optional[Simulation] = None, -) -> OutputCollection[DecileImpact]: - """Calculate decile-by-decile impact of a reform. - - Returns: - OutputCollection containing list of DecileImpact objects and DataFrame - """ - if (baseline_simulation is None) != (reform_simulation is None): - raise ValueError( - "baseline_simulation and reform_simulation must be provided together" - ) - - if baseline_simulation is None: - if dataset is None or tax_benefit_model_version is None: - raise ValueError( - "dataset and tax_benefit_model_version are required when simulations are not provided" - ) - - baseline_simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=tax_benefit_model_version, - policy=baseline_policy, - dynamic=dynamic, - ) - reform_simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=tax_benefit_model_version, - policy=reform_policy, - dynamic=dynamic, - ) - - baseline_simulation.ensure() - reform_simulation.ensure() - - results = [] - for decile in range(1, quantiles + 1): - impact = DecileImpact( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - income_variable=income_variable, - entity=entity, - decile=decile, - quantiles=quantiles, - ) - impact.run() - results.append(impact) - - # Create DataFrame - df = pd.DataFrame( - [ - { - "baseline_simulation_id": r.baseline_simulation.id, - "reform_simulation_id": r.reform_simulation.id, - "income_variable": r.income_variable, - "decile": r.decile, - "baseline_mean": r.baseline_mean, - "reform_mean": r.reform_mean, - "absolute_change": r.absolute_change, - "relative_change": r.relative_change, - "count_better_off": r.count_better_off, - "count_worse_off": r.count_worse_off, - "count_no_change": r.count_no_change, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, 
dataframe=df) diff --git a/build/lib/policyengine/outputs/inequality.py b/build/lib/policyengine/outputs/inequality.py deleted file mode 100644 index 4b16f7a9..00000000 --- a/build/lib/policyengine/outputs/inequality.py +++ /dev/null @@ -1,313 +0,0 @@ -"""Inequality analysis output types.""" - -from enum import Enum -from typing import Any, Optional, Union - -import numpy as np -import pandas as pd -from pydantic import ConfigDict - -from policyengine.core import Output, Simulation - - -class USInequalityPreset(str, Enum): - """Preset configurations for US inequality analysis.""" - - STANDARD = "standard" - CBO_COMPARABLE = "cbo_comparable" - - -def _gini(values: np.ndarray, weights: np.ndarray) -> float: - """Calculate weighted Gini coefficient. - - Args: - values: Array of income values - weights: Array of weights - - Returns: - Gini coefficient between 0 (perfect equality) and 1 (perfect inequality) - """ - # Handle edge cases - if len(values) == 0 or weights.sum() == 0: - return 0.0 - - # Sort by values - sorted_indices = np.argsort(values) - sorted_values = values[sorted_indices] - sorted_weights = weights[sorted_indices] - - # Cumulative weights and weighted values - cumulative_weights = np.cumsum(sorted_weights) - total_weight = cumulative_weights[-1] - cumulative_weighted_values = np.cumsum(sorted_values * sorted_weights) - total_weighted_value = cumulative_weighted_values[-1] - - if total_weighted_value == 0: - return 0.0 - - # Calculate Gini using the area formula - # Gini = 1 - 2 * (area under Lorenz curve) - lorenz_curve = cumulative_weighted_values / total_weighted_value - weight_fractions = sorted_weights / total_weight - - # Area under Lorenz curve using trapezoidal rule - area = np.sum(weight_fractions * (lorenz_curve - weight_fractions / 2)) - - return float(1 - 2 * area) - - -def _series_for_entity( - simulation: Simulation, variable_name: str, target_entity: str, data: pd.DataFrame -) -> pd.Series: - """Return a variable series aligned to the 
requested entity.""" - variable = simulation.tax_benefit_model_version.get_variable(variable_name) - - if variable.entity != target_entity: - mapped = simulation.output_dataset.data.map_to_entity( - variable.entity, - target_entity, - columns=[variable_name], - ) - return mapped[variable_name] - - return data[variable_name] - - -class Inequality(Output): - """Single inequality measure result - represents one database row. - - This is a single-simulation output type that calculates inequality - metrics for a given income variable, optionally filtered by - demographic variables. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - simulation: Simulation - income_variable: str - entity: str = "household" - weight_multiplier_variable: Optional[str] = None - equivalization_variable: Optional[str] = None - equivalization_power: float = 0.0 - - # Optional demographic filters - filter_variable: Optional[str] = None - filter_variable_eq: Optional[Any] = None - filter_variable_leq: Optional[Any] = None - filter_variable_geq: Optional[Any] = None - - # Results populated by run() - gini: Optional[float] = None - top_10_share: Optional[float] = None - top_1_share: Optional[float] = None - bottom_50_share: Optional[float] = None - - def run(self): - """Calculate inequality metrics.""" - # Get target entity data - target_entity = self.entity - data = getattr(self.simulation.output_dataset.data, target_entity) - - income_series = _series_for_entity( - self.simulation, self.income_variable, target_entity, data - ) - - # Get weights - weight_col = f"{target_entity}_weight" - if weight_col in data.columns: - weights = data[weight_col] - else: - weights = pd.Series(np.ones(len(income_series)), index=income_series.index) - - if self.weight_multiplier_variable is not None: - weight_multiplier = _series_for_entity( - self.simulation, - self.weight_multiplier_variable, - target_entity, - data, - ) - weights = weights * weight_multiplier - - # Apply demographic filter if 
specified - if self.filter_variable is not None: - filter_series = _series_for_entity( - self.simulation, self.filter_variable, target_entity, data - ) - - # Build filter mask - mask = filter_series.notna() - if self.filter_variable_eq is not None: - mask &= filter_series == self.filter_variable_eq - if self.filter_variable_leq is not None: - mask &= filter_series <= self.filter_variable_leq - if self.filter_variable_geq is not None: - mask &= filter_series >= self.filter_variable_geq - - # Apply mask - income_series = income_series[mask] - weights = weights[mask] - - equivalization_arr = None - if self.equivalization_variable is not None and self.equivalization_power != 0: - equivalization_series = _series_for_entity( - self.simulation, - self.equivalization_variable, - target_entity, - data, - ) - if self.filter_variable is not None: - equivalization_series = equivalization_series[mask] - equivalization_arr = pd.to_numeric( - equivalization_series, errors="coerce" - ).to_numpy(dtype=float) - - # Convert to numpy arrays - values = pd.to_numeric(income_series, errors="coerce").to_numpy(dtype=float) - weights_arr = pd.to_numeric(weights, errors="coerce").to_numpy(dtype=float) - - # Remove invalid values - valid_mask = ~np.isnan(values) & ~np.isnan(weights_arr) - if equivalization_arr is not None: - valid_mask &= ~np.isnan(equivalization_arr) & (equivalization_arr > 0) - - values = values[valid_mask] - weights_arr = weights_arr[valid_mask] - if equivalization_arr is not None: - values = values / np.power( - equivalization_arr[valid_mask], self.equivalization_power - ) - - # Calculate Gini coefficient - self.gini = _gini(values, weights_arr) - - # Calculate income shares - if len(values) > 0 and weights_arr.sum() > 0: - total_income = np.sum(values * weights_arr) - - if total_income > 0: - # Sort by income - sorted_indices = np.argsort(values) - sorted_values = values[sorted_indices] - sorted_weights = weights_arr[sorted_indices] - - # Cumulative weight fractions - 
cumulative_weights = np.cumsum(sorted_weights) - total_weight = cumulative_weights[-1] - weight_fractions = cumulative_weights / total_weight - - # Top 10% share - top_10_mask = weight_fractions > 0.9 - self.top_10_share = float( - np.sum(sorted_values[top_10_mask] * sorted_weights[top_10_mask]) - / total_income - ) - - # Top 1% share - top_1_mask = weight_fractions > 0.99 - self.top_1_share = float( - np.sum(sorted_values[top_1_mask] * sorted_weights[top_1_mask]) - / total_income - ) - - # Bottom 50% share - bottom_50_mask = weight_fractions <= 0.5 - self.bottom_50_share = float( - np.sum( - sorted_values[bottom_50_mask] * sorted_weights[bottom_50_mask] - ) - / total_income - ) - else: - self.top_10_share = 0.0 - self.top_1_share = 0.0 - self.bottom_50_share = 0.0 - else: - self.top_10_share = 0.0 - self.top_1_share = 0.0 - self.bottom_50_share = 0.0 - - -# Default income variables for each country -UK_INEQUALITY_INCOME_VARIABLE = "equiv_hbai_household_net_income" -US_INEQUALITY_INCOME_VARIABLE = "household_net_income" - - -def calculate_uk_inequality( - simulation: Simulation, - income_variable: str = UK_INEQUALITY_INCOME_VARIABLE, - filter_variable: Optional[str] = None, - filter_variable_eq: Optional[Any] = None, - filter_variable_leq: Optional[Any] = None, - filter_variable_geq: Optional[Any] = None, -) -> Inequality: - """Calculate inequality metrics for a UK simulation. 
- - Args: - simulation: The simulation to analyse - income_variable: Income variable to use (default: equiv_hbai_household_net_income) - filter_variable: Optional variable to filter by - filter_variable_eq: Filter for exact match - filter_variable_leq: Filter for less than or equal - filter_variable_geq: Filter for greater than or equal - - Returns: - Inequality object with Gini and income share metrics - """ - inequality = Inequality( - simulation=simulation, - income_variable=income_variable, - entity="household", - filter_variable=filter_variable, - filter_variable_eq=filter_variable_eq, - filter_variable_leq=filter_variable_leq, - filter_variable_geq=filter_variable_geq, - ) - inequality.run() - return inequality - - -def calculate_us_inequality( - simulation: Simulation, - income_variable: str = US_INEQUALITY_INCOME_VARIABLE, - preset: Union[USInequalityPreset, str] = USInequalityPreset.STANDARD, - filter_variable: Optional[str] = None, - filter_variable_eq: Optional[Any] = None, - filter_variable_leq: Optional[Any] = None, - filter_variable_geq: Optional[Any] = None, -) -> Inequality: - """Calculate inequality metrics for a US simulation. 
- - Args: - simulation: The simulation to analyse - income_variable: Income variable to use (default: household_net_income) - preset: Optional preset for weighting/equivalization - filter_variable: Optional variable to filter by - filter_variable_eq: Filter for exact match - filter_variable_leq: Filter for less than or equal - filter_variable_geq: Filter for greater than or equal - - Returns: - Inequality object with Gini and income share metrics - """ - preset = USInequalityPreset(preset) - inequality_kwargs = {} - - if preset == USInequalityPreset.CBO_COMPARABLE: - inequality_kwargs = { - "weight_multiplier_variable": "household_count_people", - "equivalization_variable": "household_count_people", - "equivalization_power": 0.5, - } - - inequality = Inequality( - simulation=simulation, - income_variable=income_variable, - entity="household", - **inequality_kwargs, - filter_variable=filter_variable, - filter_variable_eq=filter_variable_eq, - filter_variable_leq=filter_variable_leq, - filter_variable_geq=filter_variable_geq, - ) - inequality.run() - return inequality diff --git a/build/lib/policyengine/outputs/intra_decile_impact.py b/build/lib/policyengine/outputs/intra_decile_impact.py deleted file mode 100644 index b91a04e2..00000000 --- a/build/lib/policyengine/outputs/intra_decile_impact.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Intra-decile impact output. - -Computes the distribution of income change categories within each decile. -Each row represents one decile (1-10) or the overall average (decile=0), -with five proportion columns summing to ~1.0. 
- -The five categories classify households by their percentage income change: - - lose_more_than_5pct: change <= -5% - - lose_less_than_5pct: -5% < change <= -0.1% - - no_change: -0.1% < change <= 0.1% - - gain_less_than_5pct: 0.1% < change <= 5% - - gain_more_than_5pct: change > 5% - -Proportions are people-weighted (using household_count_people * -household_weight) so they reflect the share of people, not households. -""" - -from typing import Optional - -import numpy as np -import pandas as pd -from pydantic import ConfigDict - -from policyengine.core import Output, OutputCollection, Simulation - -# The 5-category thresholds -BOUNDS = [-np.inf, -0.05, -1e-3, 1e-3, 0.05, np.inf] -CATEGORY_NAMES = [ - "lose_more_than_5pct", - "lose_less_than_5pct", - "no_change", - "gain_less_than_5pct", - "gain_more_than_5pct", -] - - -class IntraDecileImpact(Output): - """Single decile's intra-decile impact — proportion of people in each - income change category.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: Simulation - reform_simulation: Simulation - income_variable: str = "household_net_income" - decile_variable: Optional[str] = None # If set, use pre-computed grouping - entity: str = "household" - decile: int # 1-10 for individual deciles - quantiles: int = 10 - - # Results populated by run() - lose_more_than_5pct: Optional[float] = None - lose_less_than_5pct: Optional[float] = None - no_change: Optional[float] = None - gain_less_than_5pct: Optional[float] = None - gain_more_than_5pct: Optional[float] = None - - def run(self): - """Calculate intra-decile proportions for this specific decile.""" - baseline_data = getattr( - self.baseline_simulation.output_dataset.data, self.entity - ) - reform_data = getattr(self.reform_simulation.output_dataset.data, self.entity) - - baseline_income = baseline_data[self.income_variable].values - reform_income = reform_data[self.income_variable].values - - # Determine decile grouping - if 
self.decile_variable: - decile_series = baseline_data[self.decile_variable].values - else: - decile_series = ( - pd.qcut( - baseline_income, - self.quantiles, - labels=False, - duplicates="drop", - ) - + 1 - ) - - # People-weighted counts - weights = baseline_data[f"{self.entity}_weight"].values - if self.entity == "household": - people_count = baseline_data["household_count_people"].values - people = people_count * weights - else: - people = weights - - # Compute percentage income change - capped_baseline = np.maximum(baseline_income, 1.0) - income_change = (reform_income - baseline_income) / capped_baseline - - in_decile = decile_series == self.decile - people_in_decile = float(np.sum(people[in_decile])) - - if people_in_decile == 0: - self.lose_more_than_5pct = 0.0 - self.lose_less_than_5pct = 0.0 - self.no_change = 1.0 - self.gain_less_than_5pct = 0.0 - self.gain_more_than_5pct = 0.0 - return - - proportions = [] - for lower, upper in zip(BOUNDS[:-1], BOUNDS[1:]): - in_category = (income_change > lower) & (income_change <= upper) - in_both = in_decile & in_category - proportions.append(float(np.sum(people[in_both]) / people_in_decile)) - - self.lose_more_than_5pct = proportions[0] - self.lose_less_than_5pct = proportions[1] - self.no_change = proportions[2] - self.gain_less_than_5pct = proportions[3] - self.gain_more_than_5pct = proportions[4] - - -def compute_intra_decile_impacts( - baseline_simulation: Simulation, - reform_simulation: Simulation, - income_variable: str = "household_net_income", - decile_variable: Optional[str] = None, - entity: str = "household", - quantiles: int = 10, -) -> OutputCollection[IntraDecileImpact]: - """Compute intra-decile proportions for all deciles + overall average. - - Returns: - OutputCollection containing list of IntraDecileImpact objects - (deciles 1-N plus overall average at decile=0) and DataFrame. 
- """ - results = [] - for decile in range(1, quantiles + 1): - impact = IntraDecileImpact.model_construct( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - income_variable=income_variable, - decile_variable=decile_variable, - entity=entity, - decile=decile, - quantiles=quantiles, - ) - impact.run() - results.append(impact) - - # Overall average (decile=0): arithmetic mean of decile proportions - overall = IntraDecileImpact.model_construct( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - income_variable=income_variable, - decile_variable=decile_variable, - entity=entity, - decile=0, - quantiles=quantiles, - lose_more_than_5pct=sum(r.lose_more_than_5pct for r in results) / quantiles, - lose_less_than_5pct=sum(r.lose_less_than_5pct for r in results) / quantiles, - no_change=sum(r.no_change for r in results) / quantiles, - gain_less_than_5pct=sum(r.gain_less_than_5pct for r in results) / quantiles, - gain_more_than_5pct=sum(r.gain_more_than_5pct for r in results) / quantiles, - ) - results.append(overall) - - # Create DataFrame - df = pd.DataFrame( - [ - { - "baseline_simulation_id": r.baseline_simulation.id, - "reform_simulation_id": r.reform_simulation.id, - "decile": r.decile, - "lose_more_than_5pct": r.lose_more_than_5pct, - "lose_less_than_5pct": r.lose_less_than_5pct, - "no_change": r.no_change, - "gain_less_than_5pct": r.gain_less_than_5pct, - "gain_more_than_5pct": r.gain_more_than_5pct, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) diff --git a/build/lib/policyengine/outputs/local_authority_impact.py b/build/lib/policyengine/outputs/local_authority_impact.py deleted file mode 100644 index 20b17efe..00000000 --- a/build/lib/policyengine/outputs/local_authority_impact.py +++ /dev/null @@ -1,125 +0,0 @@ -"""UK local authority impact output class. - -Computes per-local-authority income changes using pre-computed weight matrices. 
-Each local authority has a row in the weight matrix (shape: 360 x N_households) -that reweights all households to represent that local authority's demographics. -""" - -from typing import TYPE_CHECKING, Optional - -import h5py -import numpy as np -import pandas as pd -from pydantic import ConfigDict - -from policyengine.core import Output - -if TYPE_CHECKING: - from policyengine.core.simulation import Simulation - - -class LocalAuthorityImpact(Output): - """Per-local-authority income change from a UK policy reform. - - Uses pre-computed weight matrices from GCS to reweight households - for each of 360 local authorities, then computes weighted average and - relative household income changes. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: "Simulation" - reform_simulation: "Simulation" - weight_matrix_path: str - local_authority_csv_path: str - year: str = "2025" - - # Results populated by run() - local_authority_results: Optional[list[dict]] = None - - def run(self) -> None: - """Load weight matrix and compute per-local-authority metrics.""" - # Load local authority metadata (code, x, y, name) - la_df = pd.read_csv(self.local_authority_csv_path) - - # Load weight matrix: shape (N_local_authorities, N_households) - with h5py.File(self.weight_matrix_path, "r") as f: - weight_matrix = f[self.year][...] 
- - # Get household income arrays from output datasets - baseline_hh = self.baseline_simulation.output_dataset.data.household - reform_hh = self.reform_simulation.output_dataset.data.household - - baseline_income = baseline_hh["household_net_income"].values - reform_income = reform_hh["household_net_income"].values - - results: list[dict] = [] - for i in range(len(la_df)): - row = la_df.iloc[i] - code = str(row["code"]) - name = str(row["name"]) - x = int(row["x"]) - y = int(row["y"]) - w = weight_matrix[i] - - total_weight = float(np.sum(w)) - if total_weight == 0: - continue - - weighted_baseline = float(np.sum(baseline_income * w)) - weighted_reform = float(np.sum(reform_income * w)) - - count = float(np.sum(w > 0)) - if count == 0: - continue - - avg_change = (weighted_reform - weighted_baseline) / total_weight - rel_change = ( - (weighted_reform / weighted_baseline - 1.0) - if weighted_baseline != 0 - else 0.0 - ) - - results.append( - { - "local_authority_code": code, - "local_authority_name": name, - "x": x, - "y": y, - "average_household_income_change": float(avg_change), - "relative_household_income_change": float(rel_change), - "population": total_weight, - } - ) - - self.local_authority_results = results - - -def compute_uk_local_authority_impacts( - baseline_simulation: "Simulation", - reform_simulation: "Simulation", - weight_matrix_path: str, - local_authority_csv_path: str, - year: str = "2025", -) -> LocalAuthorityImpact: - """Compute per-local-authority income changes for UK. - - Args: - baseline_simulation: Completed baseline simulation. - reform_simulation: Completed reform simulation. - weight_matrix_path: Path to local_authority_weights.h5. - local_authority_csv_path: Path to local_authorities_2021.csv. - year: Year key in the H5 file (default "2025"). - - Returns: - LocalAuthorityImpact with local_authority_results populated. 
- """ - impact = LocalAuthorityImpact.model_construct( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - weight_matrix_path=weight_matrix_path, - local_authority_csv_path=local_authority_csv_path, - year=year, - ) - impact.run() - return impact diff --git a/build/lib/policyengine/outputs/poverty.py b/build/lib/policyengine/outputs/poverty.py deleted file mode 100644 index 6fc59705..00000000 --- a/build/lib/policyengine/outputs/poverty.py +++ /dev/null @@ -1,462 +0,0 @@ -"""Poverty analysis output types.""" - -from enum import Enum -from typing import Any, Optional - -import pandas as pd -from pydantic import ConfigDict - -from policyengine.core import Output, OutputCollection, Simulation - - -class UKPovertyType(str, Enum): - """UK poverty measure types.""" - - ABSOLUTE_BHC = "absolute_bhc" - ABSOLUTE_AHC = "absolute_ahc" - RELATIVE_BHC = "relative_bhc" - RELATIVE_AHC = "relative_ahc" - - -class USPovertyType(str, Enum): - """US poverty measure types.""" - - SPM = "spm" - SPM_DEEP = "spm_deep" - - -# Mapping from poverty type to variable name -UK_POVERTY_VARIABLES = { - UKPovertyType.ABSOLUTE_BHC: "in_poverty_bhc", - UKPovertyType.ABSOLUTE_AHC: "in_poverty_ahc", - UKPovertyType.RELATIVE_BHC: "in_relative_poverty_bhc", - UKPovertyType.RELATIVE_AHC: "in_relative_poverty_ahc", -} - -US_POVERTY_VARIABLES = { - USPovertyType.SPM: "spm_unit_is_in_spm_poverty", - USPovertyType.SPM_DEEP: "spm_unit_is_in_deep_spm_poverty", -} - - -class Poverty(Output): - """Single poverty measure result - represents one database row. - - This is a single-simulation output type that calculates poverty - headcount and rate for a given poverty measure, optionally filtered - by demographic variables. 
- """ - - model_config = ConfigDict(arbitrary_types_allowed=True) - - simulation: Simulation - poverty_variable: str - poverty_type: Optional[str] = None - entity: str = "person" - - # Optional demographic filters - filter_variable: Optional[str] = None - filter_variable_eq: Optional[Any] = None - filter_variable_leq: Optional[Any] = None - filter_variable_geq: Optional[Any] = None - - # Convenience group label (set by by_age/by_gender/by_race wrappers) - filter_group: Optional[str] = None - - # Results populated by run() - headcount: Optional[float] = None - total_population: Optional[float] = None - rate: Optional[float] = None - - def run(self): - """Calculate poverty headcount and rate.""" - # Get poverty variable info - poverty_var_obj = self.simulation.tax_benefit_model_version.get_variable( - self.poverty_variable - ) - - # Get target entity data - target_entity = self.entity - data = getattr(self.simulation.output_dataset.data, target_entity) - - # Map poverty variable to target entity if needed - if poverty_var_obj.entity != target_entity: - mapped = self.simulation.output_dataset.data.map_to_entity( - poverty_var_obj.entity, - target_entity, - columns=[self.poverty_variable], - ) - poverty_series = mapped[self.poverty_variable] - else: - poverty_series = data[self.poverty_variable] - - # Apply demographic filter if specified - if self.filter_variable is not None: - filter_var_obj = self.simulation.tax_benefit_model_version.get_variable( - self.filter_variable - ) - - if filter_var_obj.entity != target_entity: - filter_mapped = self.simulation.output_dataset.data.map_to_entity( - filter_var_obj.entity, - target_entity, - columns=[self.filter_variable], - ) - filter_series = filter_mapped[self.filter_variable] - else: - filter_series = data[self.filter_variable] - - # Build filter mask - mask = filter_series.notna() - if self.filter_variable_eq is not None: - mask &= filter_series == self.filter_variable_eq - if self.filter_variable_leq is not None: - mask 
&= filter_series <= self.filter_variable_leq - if self.filter_variable_geq is not None: - mask &= filter_series >= self.filter_variable_geq - - # Apply mask - poverty_series = poverty_series[mask] - - # Calculate results using weighted counts - self.headcount = float((poverty_series == True).sum()) # noqa: E712 - self.total_population = float(poverty_series.count()) - self.rate = ( - self.headcount / self.total_population if self.total_population > 0 else 0.0 - ) - - -def calculate_uk_poverty_rates( - simulation: Simulation, - filter_variable: Optional[str] = None, - filter_variable_eq: Optional[Any] = None, - filter_variable_leq: Optional[Any] = None, - filter_variable_geq: Optional[Any] = None, -) -> OutputCollection[Poverty]: - """Calculate all UK poverty rates for a simulation. - - Args: - simulation: The simulation to analyse - filter_variable: Optional variable to filter by (e.g., "is_child") - filter_variable_eq: Filter for exact match - filter_variable_leq: Filter for less than or equal - filter_variable_geq: Filter for greater than or equal - - Returns: - OutputCollection containing Poverty objects for each UK poverty type - """ - results = [] - - for poverty_type, poverty_variable in UK_POVERTY_VARIABLES.items(): - poverty = Poverty( - simulation=simulation, - poverty_variable=poverty_variable, - poverty_type=str(poverty_type), - entity="person", - filter_variable=filter_variable, - filter_variable_eq=filter_variable_eq, - filter_variable_leq=filter_variable_leq, - filter_variable_geq=filter_variable_geq, - ) - poverty.run() - results.append(poverty) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_variable_eq": r.filter_variable_eq, - "filter_variable_leq": r.filter_variable_leq, - "filter_variable_geq": r.filter_variable_geq, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, 
- } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) - - -def calculate_us_poverty_rates( - simulation: Simulation, - filter_variable: Optional[str] = None, - filter_variable_eq: Optional[Any] = None, - filter_variable_leq: Optional[Any] = None, - filter_variable_geq: Optional[Any] = None, -) -> OutputCollection[Poverty]: - """Calculate all US poverty rates for a simulation. - - Args: - simulation: The simulation to analyse - filter_variable: Optional variable to filter by (e.g., "is_child") - filter_variable_eq: Filter for exact match - filter_variable_leq: Filter for less than or equal - filter_variable_geq: Filter for greater than or equal - - Returns: - OutputCollection containing Poverty objects for each US poverty type - """ - results = [] - - for poverty_type, poverty_variable in US_POVERTY_VARIABLES.items(): - poverty = Poverty( - simulation=simulation, - poverty_variable=poverty_variable, - poverty_type=str(poverty_type), - entity="person", - filter_variable=filter_variable, - filter_variable_eq=filter_variable_eq, - filter_variable_leq=filter_variable_leq, - filter_variable_geq=filter_variable_geq, - ) - poverty.run() - results.append(poverty) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_variable_eq": r.filter_variable_eq, - "filter_variable_leq": r.filter_variable_leq, - "filter_variable_geq": r.filter_variable_geq, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) - - -# Race group definitions (US only — race Enum stored as string names) -RACE_GROUPS = { - "white": {"filter_variable": "race", "filter_variable_eq": "WHITE"}, - "black": {"filter_variable": "race", "filter_variable_eq": "BLACK"}, - "hispanic": {"filter_variable": "race", 
"filter_variable_eq": "HISPANIC"}, - "other": {"filter_variable": "race", "filter_variable_eq": "OTHER"}, -} - -# Gender group definitions (same for UK and US — both use is_male boolean) -GENDER_GROUPS = { - "male": {"filter_variable": "is_male", "filter_variable_eq": True}, - "female": {"filter_variable": "is_male", "filter_variable_eq": False}, -} - -# Age group definitions (same for UK and US) -AGE_GROUPS = { - "child": {"filter_variable": "age", "filter_variable_leq": 17}, - "adult": { - "filter_variable": "age", - "filter_variable_geq": 18, - "filter_variable_leq": 64, - }, - "senior": {"filter_variable": "age", "filter_variable_geq": 65}, -} - - -def calculate_uk_poverty_by_age( - simulation: Simulation, -) -> OutputCollection[Poverty]: - """Calculate UK poverty rates broken down by age group. - - Computes poverty rates for child (< 18), adult (18-64), and - senior (65+) groups across all UK poverty types. - - Returns: - OutputCollection containing Poverty objects for each - age group x poverty type combination (3 x 4 = 12 records). - """ - results = [] - - for group_name, filters in AGE_GROUPS.items(): - group_results = calculate_uk_poverty_rates(simulation, **filters) - for pov in group_results.outputs: - pov.filter_group = group_name - results.append(pov) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_group": r.filter_group, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) - - -def calculate_us_poverty_by_age( - simulation: Simulation, -) -> OutputCollection[Poverty]: - """Calculate US poverty rates broken down by age group. - - Computes poverty rates for child (< 18), adult (18-64), and - senior (65+) groups across all US poverty types. 
- - Returns: - OutputCollection containing Poverty objects for each - age group x poverty type combination (3 x 2 = 6 records). - """ - results = [] - - for group_name, filters in AGE_GROUPS.items(): - group_results = calculate_us_poverty_rates(simulation, **filters) - for pov in group_results.outputs: - pov.filter_group = group_name - results.append(pov) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_group": r.filter_group, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) - - -def calculate_uk_poverty_by_gender( - simulation: Simulation, -) -> OutputCollection[Poverty]: - """Calculate UK poverty rates broken down by gender. - - Computes poverty rates for male and female groups across - all UK poverty types using the is_male boolean variable. - - Returns: - OutputCollection containing Poverty objects for each - gender x poverty type combination (2 x 4 = 8 records). - """ - results = [] - - for group_name, filters in GENDER_GROUPS.items(): - group_results = calculate_uk_poverty_rates(simulation, **filters) - for pov in group_results.outputs: - pov.filter_group = group_name - results.append(pov) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_group": r.filter_group, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) - - -def calculate_us_poverty_by_gender( - simulation: Simulation, -) -> OutputCollection[Poverty]: - """Calculate US poverty rates broken down by gender. 
- - Computes poverty rates for male and female groups across - all US poverty types using the is_male boolean variable. - - Returns: - OutputCollection containing Poverty objects for each - gender x poverty type combination (2 x 2 = 4 records). - """ - results = [] - - for group_name, filters in GENDER_GROUPS.items(): - group_results = calculate_us_poverty_rates(simulation, **filters) - for pov in group_results.outputs: - pov.filter_group = group_name - results.append(pov) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_group": r.filter_group, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) - - -def calculate_us_poverty_by_race( - simulation: Simulation, -) -> OutputCollection[Poverty]: - """Calculate US poverty rates broken down by race. - - Computes poverty rates for white, black, hispanic, and other - racial groups across all US poverty types using the race Enum - variable (stored as string names in the output dataset). - - US-only — the UK does not have a race variable. - - Returns: - OutputCollection containing Poverty objects for each - race x poverty type combination (4 x 2 = 8 records). 
- """ - results = [] - - for group_name, filters in RACE_GROUPS.items(): - group_results = calculate_us_poverty_rates(simulation, **filters) - for pov in group_results.outputs: - pov.filter_group = group_name - results.append(pov) - - df = pd.DataFrame( - [ - { - "simulation_id": r.simulation.id, - "poverty_type": r.poverty_type, - "poverty_variable": r.poverty_variable, - "filter_variable": r.filter_variable, - "filter_group": r.filter_group, - "headcount": r.headcount, - "total_population": r.total_population, - "rate": r.rate, - } - for r in results - ] - ) - - return OutputCollection(outputs=results, dataframe=df) diff --git a/build/lib/policyengine/tax_benefit_models/uk.py b/build/lib/policyengine/tax_benefit_models/uk.py deleted file mode 100644 index 52abcb18..00000000 --- a/build/lib/policyengine/tax_benefit_models/uk.py +++ /dev/null @@ -1,40 +0,0 @@ -"""PolicyEngine UK tax-benefit model - imports from uk/ module.""" - -from importlib.util import find_spec - -if find_spec("policyengine_uk") is not None: - from .uk import ( - PolicyEngineUK, - PolicyEngineUKDataset, - PolicyEngineUKLatest, - ProgrammeStatistics, - UKYearData, - create_datasets, - ensure_datasets, - general_policy_reform_analysis, - load_datasets, - managed_microsimulation, - uk_latest, - uk_model, - ) - - __all__ = [ - "UKYearData", - "PolicyEngineUKDataset", - "create_datasets", - "load_datasets", - "ensure_datasets", - "PolicyEngineUK", - "PolicyEngineUKLatest", - "managed_microsimulation", - "uk_model", - "uk_latest", - "general_policy_reform_analysis", - "ProgrammeStatistics", - ] - - # Rebuild models to resolve forward references - PolicyEngineUKDataset.model_rebuild() - PolicyEngineUKLatest.model_rebuild() -else: - __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/uk/__init__.py b/build/lib/policyengine/tax_benefit_models/uk/__init__.py deleted file mode 100644 index 93533245..00000000 --- a/build/lib/policyengine/tax_benefit_models/uk/__init__.py +++ /dev/null @@ 
-1,55 +0,0 @@ -"""PolicyEngine UK tax-benefit model.""" - -from importlib.util import find_spec - -if find_spec("policyengine_uk") is not None: - from policyengine.core import Dataset - - from .analysis import ( - UKHouseholdInput, - UKHouseholdOutput, - calculate_household_impact, - economic_impact_analysis, - ) - from .datasets import ( - PolicyEngineUKDataset, - UKYearData, - create_datasets, - ensure_datasets, - load_datasets, - ) - from .model import ( - PolicyEngineUK, - PolicyEngineUKLatest, - managed_microsimulation, - uk_latest, - uk_model, - ) - from .outputs import ProgrammeStatistics - - # Rebuild Pydantic models to resolve forward references - Dataset.model_rebuild() - UKYearData.model_rebuild() - PolicyEngineUKDataset.model_rebuild() - PolicyEngineUKLatest.model_rebuild() - ProgrammeStatistics.model_rebuild() - - __all__ = [ - "UKYearData", - "PolicyEngineUKDataset", - "create_datasets", - "load_datasets", - "ensure_datasets", - "PolicyEngineUK", - "PolicyEngineUKLatest", - "managed_microsimulation", - "uk_model", - "uk_latest", - "economic_impact_analysis", - "calculate_household_impact", - "UKHouseholdInput", - "UKHouseholdOutput", - "ProgrammeStatistics", - ] -else: - __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/uk/analysis.py b/build/lib/policyengine/tax_benefit_models/uk/analysis.py deleted file mode 100644 index 0a545b52..00000000 --- a/build/lib/policyengine/tax_benefit_models/uk/analysis.py +++ /dev/null @@ -1,283 +0,0 @@ -"""General utility functions for UK policy reform analysis.""" - -import tempfile -from pathlib import Path -from typing import Any, Optional - -import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel, Field, create_model - -from policyengine.core import OutputCollection, Simulation -from policyengine.core.policy import Policy -from policyengine.outputs.decile_impact import ( - DecileImpact, - calculate_decile_impacts, -) -from policyengine.outputs.inequality import ( - 
Inequality, - calculate_uk_inequality, -) -from policyengine.outputs.poverty import ( - Poverty, - calculate_uk_poverty_rates, -) - -from .datasets import PolicyEngineUKDataset, UKYearData -from .model import uk_latest -from .outputs import ProgrammeStatistics - - -def _create_entity_output_model(entity: str, variables: list[str]) -> type[BaseModel]: - """Create a dynamic Pydantic model for entity output variables.""" - fields = {var: (float, ...) for var in variables} - return create_model(f"{entity.title()}Output", **fields) - - -# Create output models dynamically from uk_latest.entity_variables -PersonOutput = _create_entity_output_model( - "person", uk_latest.entity_variables["person"] -) -BenunitOutput = _create_entity_output_model( - "benunit", uk_latest.entity_variables["benunit"] -) -HouseholdEntityOutput = _create_entity_output_model( - "household", uk_latest.entity_variables["household"] -) - - -class UKHouseholdOutput(BaseModel): - """Output from a UK household calculation with all entity data.""" - - person: list[dict[str, Any]] - benunit: list[dict[str, Any]] - household: dict[str, Any] - - -class UKHouseholdInput(BaseModel): - """Input for a UK household calculation.""" - - people: list[dict[str, Any]] - benunit: dict[str, Any] = Field(default_factory=dict) - household: dict[str, Any] = Field(default_factory=dict) - year: int = 2026 - - -def calculate_household_impact( - household_input: UKHouseholdInput, - policy: Optional[Policy] = None, -) -> UKHouseholdOutput: - """Calculate tax and benefit impacts for a single UK household.""" - n_people = len(household_input.people) - - # Build person data with defaults - person_data = { - "person_id": list(range(n_people)), - "person_benunit_id": [0] * n_people, - "person_household_id": [0] * n_people, - "person_weight": [1.0] * n_people, - } - # Add user-provided person fields - for i, person in enumerate(household_input.people): - for key, value in person.items(): - if key not in person_data: - 
person_data[key] = [0.0] * n_people # Default to 0 for numeric fields - person_data[key][i] = value - - # Build benunit data with defaults - benunit_data = { - "benunit_id": [0], - "benunit_weight": [1.0], - } - for key, value in household_input.benunit.items(): - benunit_data[key] = [value] - - # Build household data with defaults (required for uprating) - household_data = { - "household_id": [0], - "household_weight": [1.0], - "region": ["LONDON"], - "tenure_type": ["RENT_PRIVATELY"], - "council_tax": [0.0], - "rent": [0.0], - } - for key, value in household_input.household.items(): - household_data[key] = [value] - - # Create MicroDataFrames - person_df = MicroDataFrame(pd.DataFrame(person_data), weights="person_weight") - benunit_df = MicroDataFrame(pd.DataFrame(benunit_data), weights="benunit_weight") - household_df = MicroDataFrame( - pd.DataFrame(household_data), weights="household_weight" - ) - - # Create temporary dataset - tmpdir = tempfile.mkdtemp() - filepath = str(Path(tmpdir) / "household_impact.h5") - - dataset = PolicyEngineUKDataset( - name="Household impact calculation", - description="Single household for impact calculation", - filepath=filepath, - year=household_input.year, - data=UKYearData( - person=person_df, - benunit=benunit_df, - household=household_df, - ), - ) - - # Run simulation - simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - policy=policy, - ) - simulation.run() - - # Extract all output variables defined in entity_variables - output_data = simulation.output_dataset.data - - def safe_convert(value): - """Convert value to float if numeric, otherwise return as string.""" - try: - return float(value) - except (ValueError, TypeError): - return str(value) - - person_outputs = [] - for i in range(n_people): - person_dict = {} - for var in uk_latest.entity_variables["person"]: - person_dict[var] = safe_convert(output_data.person[var].iloc[i]) - person_outputs.append(person_dict) - - benunit_outputs = [] 
- for i in range(len(output_data.benunit)): - benunit_dict = {} - for var in uk_latest.entity_variables["benunit"]: - benunit_dict[var] = safe_convert(output_data.benunit[var].iloc[i]) - benunit_outputs.append(benunit_dict) - - household_dict = {} - for var in uk_latest.entity_variables["household"]: - household_dict[var] = safe_convert(output_data.household[var].iloc[0]) - - return UKHouseholdOutput( - person=person_outputs, - benunit=benunit_outputs, - household=household_dict, - ) - - -class PolicyReformAnalysis(BaseModel): - """Complete policy reform analysis result.""" - - decile_impacts: OutputCollection[DecileImpact] - programme_statistics: OutputCollection[ProgrammeStatistics] - baseline_poverty: OutputCollection[Poverty] - reform_poverty: OutputCollection[Poverty] - baseline_inequality: Inequality - reform_inequality: Inequality - - -def economic_impact_analysis( - baseline_simulation: Simulation, - reform_simulation: Simulation, -) -> PolicyReformAnalysis: - """Perform comprehensive analysis of a policy reform. 
- - Returns: - PolicyReformAnalysis containing decile impacts and programme statistics - """ - baseline_simulation.ensure() - reform_simulation.ensure() - - assert len(baseline_simulation.dataset.data.household) > 100, ( - "Baseline simulation must have more than 100 households" - ) - assert len(reform_simulation.dataset.data.household) > 100, ( - "Reform simulation must have more than 100 households" - ) - - # Decile impact - decile_impacts = calculate_decile_impacts( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - ) - - # Major programmes to analyse - programmes = { - # Tax - "income_tax": {"is_tax": True}, - "national_insurance": {"is_tax": True}, - "vat": {"is_tax": True}, - "council_tax": {"is_tax": True}, - # Benefits - "universal_credit": {"is_tax": False}, - "child_benefit": {"is_tax": False}, - "pension_credit": {"is_tax": False}, - "income_support": {"is_tax": False}, - "working_tax_credit": {"is_tax": False}, - "child_tax_credit": {"is_tax": False}, - } - - programme_statistics = [] - - for programme_name, programme_info in programmes.items(): - entity = baseline_simulation.tax_benefit_model_version.get_variable( - programme_name - ).entity - is_tax = programme_info["is_tax"] - - stats = ProgrammeStatistics( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - programme_name=programme_name, - entity=entity, - is_tax=is_tax, - ) - stats.run() - programme_statistics.append(stats) - - # Create DataFrame - programme_df = pd.DataFrame( - [ - { - "baseline_simulation_id": p.baseline_simulation.id, - "reform_simulation_id": p.reform_simulation.id, - "programme_name": p.programme_name, - "entity": p.entity, - "is_tax": p.is_tax, - "baseline_total": p.baseline_total, - "reform_total": p.reform_total, - "change": p.change, - "baseline_count": p.baseline_count, - "reform_count": p.reform_count, - "winners": p.winners, - "losers": p.losers, - } - for p in programme_statistics - ] - ) - - 
programme_collection = OutputCollection( - outputs=programme_statistics, dataframe=programme_df - ) - - # Calculate poverty rates for both simulations - baseline_poverty = calculate_uk_poverty_rates(baseline_simulation) - reform_poverty = calculate_uk_poverty_rates(reform_simulation) - - # Calculate inequality for both simulations - baseline_inequality = calculate_uk_inequality(baseline_simulation) - reform_inequality = calculate_uk_inequality(reform_simulation) - - return PolicyReformAnalysis( - decile_impacts=decile_impacts, - programme_statistics=programme_collection, - baseline_poverty=baseline_poverty, - reform_poverty=reform_poverty, - baseline_inequality=baseline_inequality, - reform_inequality=reform_inequality, - ) diff --git a/build/lib/policyengine/tax_benefit_models/uk/datasets.py b/build/lib/policyengine/tax_benefit_models/uk/datasets.py deleted file mode 100644 index 47f78403..00000000 --- a/build/lib/policyengine/tax_benefit_models/uk/datasets.py +++ /dev/null @@ -1,245 +0,0 @@ -from pathlib import Path -from typing import Optional - -import pandas as pd -from microdf import MicroDataFrame -from pydantic import ConfigDict - -from policyengine.core import Dataset, YearData -from policyengine.core.release_manifest import ( - dataset_logical_name, - resolve_dataset_reference, -) - - -class UKYearData(YearData): - """Entity-level data for a single year.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - person: MicroDataFrame - benunit: MicroDataFrame - household: MicroDataFrame - - @property - def entity_data(self) -> dict[str, MicroDataFrame]: - """Return a dictionary of entity names to their data.""" - return { - "person": self.person, - "benunit": self.benunit, - "household": self.household, - } - - -class PolicyEngineUKDataset(Dataset): - """UK dataset with multi-year entity-level data.""" - - data: Optional[UKYearData] = None - - def model_post_init(self, __context): - """Called after Pydantic initialization.""" - # Make sure we are 
synchronised between in-memory and storage, at least on initialisation - if self.data is not None: - self.save() - elif self.filepath and not self.data: - self.load() - - def save(self) -> None: - """Save dataset to HDF5 file. - - Converts object columns to categorical dtype to avoid slow pickle serialization. - """ - filepath = Path(self.filepath) - if not filepath.parent.exists(): - filepath.parent.mkdir(parents=True, exist_ok=True) - - # Convert DataFrames and optimize object columns to categorical - person_df = pd.DataFrame(self.data.person) - benunit_df = pd.DataFrame(self.data.benunit) - household_df = pd.DataFrame(self.data.household) - - # Convert object columns to categorical to avoid pickle serialization - for col in person_df.columns: - if person_df[col].dtype == "object": - person_df[col] = person_df[col].astype("category") - - for col in benunit_df.columns: - if benunit_df[col].dtype == "object": - benunit_df[col] = benunit_df[col].astype("category") - - for col in household_df.columns: - if household_df[col].dtype == "object": - household_df[col] = household_df[col].astype("category") - - with pd.HDFStore(filepath, mode="w") as store: - # Use format='table' to support categorical dtypes - store.put("person", person_df, format="table") - store.put("benunit", benunit_df, format="table") - store.put("household", household_df, format="table") - - def load(self) -> None: - """Load dataset from HDF5 file into this instance.""" - filepath = self.filepath - with pd.HDFStore(filepath, mode="r") as store: - self.data = UKYearData( - person=MicroDataFrame(store["person"], weights="person_weight"), - benunit=MicroDataFrame(store["benunit"], weights="benunit_weight"), - household=MicroDataFrame( - store["household"], weights="household_weight" - ), - ) - - def __repr__(self) -> str: - if self.data is None: - return f"" - else: - n_people = len(self.data.person) - n_benunits = len(self.data.benunit) - n_households = len(self.data.household) - return f"" - - -def 
create_datasets( - datasets: list[str] = [ - "frs_2023_24", - "enhanced_frs_2023_24", - ], - years: list[int] = [2026, 2027, 2028, 2029, 2030], - data_folder: str = "./data", -) -> dict[str, PolicyEngineUKDataset]: - result = {} - for dataset in datasets: - resolved_dataset = resolve_dataset_reference("uk", dataset) - dataset_stem = dataset_logical_name(resolved_dataset) - from policyengine_uk import Microsimulation - - sim = Microsimulation(dataset=resolved_dataset) - for year in years: - year_dataset = sim.dataset[year] - - # Convert to pandas DataFrames and add weight columns - person_df = pd.DataFrame(year_dataset.person) - benunit_df = pd.DataFrame(year_dataset.benunit) - household_df = pd.DataFrame(year_dataset.household) - - # Map household weights to person and benunit levels - person_df = person_df.merge( - household_df[["household_id", "household_weight"]], - left_on="person_household_id", - right_on="household_id", - how="left", - ) - person_df = person_df.rename(columns={"household_weight": "person_weight"}) - person_df = person_df.drop(columns=["household_id"]) - - # Get household_id for each benunit from person table - benunit_household_map = person_df[ - ["person_benunit_id", "person_household_id"] - ].drop_duplicates() - benunit_df = benunit_df.merge( - benunit_household_map, - left_on="benunit_id", - right_on="person_benunit_id", - how="left", - ) - benunit_df = benunit_df.merge( - household_df[["household_id", "household_weight"]], - left_on="person_household_id", - right_on="household_id", - how="left", - ) - benunit_df = benunit_df.rename( - columns={"household_weight": "benunit_weight"} - ) - benunit_df = benunit_df.drop( - columns=[ - "person_benunit_id", - "person_household_id", - "household_id", - ], - errors="ignore", - ) - - uk_dataset = PolicyEngineUKDataset( - id=f"{dataset_stem}_year_{year}", - name=f"{dataset_stem}-year-{year}", - description=f"UK Dataset for year {year} based on {dataset_stem}", - 
filepath=f"{data_folder}/{dataset_stem}_year_{year}.h5", - year=int(year), - data=UKYearData( - person=MicroDataFrame(person_df, weights="person_weight"), - benunit=MicroDataFrame(benunit_df, weights="benunit_weight"), - household=MicroDataFrame(household_df, weights="household_weight"), - ), - ) - uk_dataset.save() - - dataset_key = f"{dataset_stem}_{year}" - result[dataset_key] = uk_dataset - - return result - - -def load_datasets( - datasets: list[str] = [ - "frs_2023_24", - "enhanced_frs_2023_24", - ], - years: list[int] = [2026, 2027, 2028, 2029, 2030], - data_folder: str = "./data", -) -> dict[str, PolicyEngineUKDataset]: - result = {} - for dataset in datasets: - resolved_dataset = resolve_dataset_reference("uk", dataset) - dataset_stem = dataset_logical_name(resolved_dataset) - for year in years: - filepath = f"{data_folder}/{dataset_stem}_year_{year}.h5" - uk_dataset = PolicyEngineUKDataset( - name=f"{dataset_stem}-year-{year}", - description=f"UK Dataset for year {year} based on {dataset_stem}", - filepath=filepath, - year=int(year), - ) - uk_dataset.load() - - dataset_key = f"{dataset_stem}_{year}" - result[dataset_key] = uk_dataset - - return result - - -def ensure_datasets( - datasets: list[str] = [ - "frs_2023_24", - "enhanced_frs_2023_24", - ], - years: list[int] = [2026, 2027, 2028, 2029, 2030], - data_folder: str = "./data", -) -> dict[str, PolicyEngineUKDataset]: - """Ensure datasets exist, loading if available or creating if not. 
- - Args: - datasets: List of HuggingFace dataset paths - years: List of years to load/create data for - data_folder: Directory containing or to save the dataset files - - Returns: - Dictionary mapping dataset keys to PolicyEngineUKDataset objects - """ - # Check if all dataset files exist - all_exist = True - for dataset in datasets: - resolved_dataset = resolve_dataset_reference("uk", dataset) - dataset_stem = dataset_logical_name(resolved_dataset) - for year in years: - filepath = Path(f"{data_folder}/{dataset_stem}_year_{year}.h5") - if not filepath.exists(): - all_exist = False - break - if not all_exist: - break - - if all_exist: - return load_datasets(datasets=datasets, years=years, data_folder=data_folder) - else: - return create_datasets(datasets=datasets, years=years, data_folder=data_folder) diff --git a/build/lib/policyengine/tax_benefit_models/uk/model.py b/build/lib/policyengine/tax_benefit_models/uk/model.py deleted file mode 100644 index edd5c069..00000000 --- a/build/lib/policyengine/tax_benefit_models/uk/model.py +++ /dev/null @@ -1,496 +0,0 @@ -import datetime -from importlib import metadata -from pathlib import Path -from typing import TYPE_CHECKING, Optional - -import pandas as pd -from microdf import MicroDataFrame - -from policyengine.core import ( - Parameter, - ParameterNode, - TaxBenefitModel, - TaxBenefitModelVersion, - Variable, -) -from policyengine.core.release_manifest import ( - certify_data_release_compatibility, - dataset_logical_name, - get_release_manifest, - resolve_local_managed_dataset_source, - resolve_managed_dataset_reference, -) -from policyengine.utils.entity_utils import ( - build_entity_relationships, - filter_dataset_by_household_variable, -) -from policyengine.utils.parameter_labels import ( - build_scale_lookup, - generate_label_for_parameter, -) - -from .datasets import PolicyEngineUKDataset, UKYearData - -if TYPE_CHECKING: - from policyengine.core.simulation import Simulation - -UK_GROUP_ENTITIES = ["benunit", 
"household"] - - -class PolicyEngineUK(TaxBenefitModel): - id: str = "policyengine-uk" - description: str = "The UK's open-source dynamic tax and benefit microsimulation model maintained by PolicyEngine." - - -uk_model = PolicyEngineUK() - - -def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: - try: - from policyengine_uk.build_metadata import get_data_build_metadata - except ModuleNotFoundError as exc: - if exc.name != "policyengine_uk.build_metadata": - raise - return {} - - return get_data_build_metadata() or {} - - -class PolicyEngineUKLatest(TaxBenefitModelVersion): - model: TaxBenefitModel = uk_model - version: str = None - created_at: datetime.datetime = None - - entity_variables: dict[str, list[str]] = { - "person": [ - # IDs and weights - "person_id", - "benunit_id", - "household_id", - "person_weight", - # Demographics - "age", - "gender", - "is_male", - "is_adult", - "is_SP_age", - "is_child", - # Income - "employment_income", - "self_employment_income", - "pension_income", - "private_pension_income", - "savings_interest_income", - "dividend_income", - "property_income", - "total_income", - "earned_income", - # Benefits - "universal_credit", - "child_benefit", - "pension_credit", - "income_support", - "working_tax_credit", - "child_tax_credit", - # Tax - "income_tax", - "national_insurance", - ], - "benunit": [ - # IDs and weights - "benunit_id", - "benunit_weight", - # Structure - "family_type", - # Income and benefits - "universal_credit", - "child_benefit", - "pension_credit", - "income_support", - "working_tax_credit", - "child_tax_credit", - ], - "household": [ - # IDs and weights - "household_id", - "household_weight", - "household_count_people", - # Income measures - "household_net_income", - "household_income_decile", - "household_wealth_decile", - "hbai_household_net_income", - "equiv_hbai_household_net_income", - "household_market_income", - "household_gross_income", - # Benefits and tax - "household_benefits", - 
"household_tax", - "vat", - # Housing - "rent", - "council_tax", - "tenure_type", - # Poverty measures - "in_poverty_bhc", - "in_poverty_ahc", - "in_relative_poverty_bhc", - "in_relative_poverty_ahc", - ], - } - - def __init__(self, **kwargs: dict): - manifest = get_release_manifest("uk") - if "version" not in kwargs or kwargs.get("version") is None: - kwargs["version"] = manifest.model_package.version - - installed_model_version = metadata.version("policyengine-uk") - if installed_model_version != manifest.model_package.version: - raise ValueError( - "Installed policyengine-uk version does not match the " - f"bundled policyengine.py manifest. Expected " - f"{manifest.model_package.version}, got {installed_model_version}." - ) - - model_build_metadata = _get_runtime_data_build_metadata() - data_certification = certify_data_release_compatibility( - "uk", - runtime_model_version=installed_model_version, - runtime_data_build_fingerprint=model_build_metadata.get( - "data_build_fingerprint" - ), - ) - - super().__init__(**kwargs) - self.release_manifest = manifest - self.model_package = manifest.model_package - self.data_package = manifest.data_package - self.default_dataset_uri = manifest.default_dataset_uri - self.data_certification = data_certification - from policyengine_core.enums import Enum - from policyengine_uk.system import system - - # Attach region registry - from policyengine.countries.uk.regions import uk_region_registry - - self.region_registry = uk_region_registry - - self.id = f"{self.model.id}@{self.version}" - - for var_obj in system.variables.values(): - # Serialize default_value for JSON compatibility - default_val = var_obj.default_value - if var_obj.value_type is Enum: - default_val = default_val.name - elif var_obj.value_type is datetime.date: - default_val = default_val.isoformat() - - variable = Variable( - id=self.id + "-" + var_obj.name, - name=var_obj.name, - label=getattr(var_obj, "label", None), - tax_benefit_model_version=self, - 
entity=var_obj.entity.key, - description=var_obj.documentation, - data_type=var_obj.value_type if var_obj.value_type is not Enum else str, - default_value=default_val, - value_type=var_obj.value_type, - ) - if ( - hasattr(var_obj, "possible_values") - and var_obj.possible_values is not None - ): - variable.possible_values = list( - map( - lambda x: x.name, - var_obj.possible_values._value2member_map_.values(), - ) - ) - # Extract and resolve adds/subtracts. - # Core stores these as either list[str] or a parameter path string. - # Resolve parameter paths to lists so consumers always get list[str]. - if hasattr(var_obj, "adds") and var_obj.adds is not None: - if isinstance(var_obj.adds, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.adds) - variable.adds = list(param("2025-01-01")) - except (ValueError, Exception): - variable.adds = None - else: - variable.adds = var_obj.adds - if hasattr(var_obj, "subtracts") and var_obj.subtracts is not None: - if isinstance(var_obj.subtracts, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.subtracts) - variable.subtracts = list(param("2025-01-01")) - except (ValueError, Exception): - variable.subtracts = None - else: - variable.subtracts = var_obj.subtracts - self.add_variable(variable) - - from policyengine_core.parameters import Parameter as CoreParameter - from policyengine_core.parameters import ParameterNode as CoreParameterNode - - scale_lookup = build_scale_lookup(system) - - for param_node in system.parameters.get_descendants(): - if isinstance(param_node, CoreParameter): - parameter = Parameter( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=generate_label_for_parameter( - param_node, system, scale_lookup - ), - tax_benefit_model_version=self, - description=param_node.description, - 
data_type=type(param_node(2025)), - unit=param_node.metadata.get("unit"), - _core_param=param_node, - ) - self.add_parameter(parameter) - elif isinstance(param_node, CoreParameterNode): - node = ParameterNode( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=param_node.metadata.get("label"), - description=param_node.description, - tax_benefit_model_version=self, - ) - self.add_parameter_node(node) - - def _build_entity_relationships( - self, dataset: PolicyEngineUKDataset - ) -> pd.DataFrame: - """Build a DataFrame mapping each person to their containing entities.""" - person_data = pd.DataFrame(dataset.data.person) - return build_entity_relationships(person_data, UK_GROUP_ENTITIES) - - def _filter_dataset_by_household_variable( - self, - dataset: PolicyEngineUKDataset, - variable_name: str, - variable_value: str, - ) -> PolicyEngineUKDataset: - """Filter a dataset to only include households where a variable matches.""" - filtered = filter_dataset_by_household_variable( - entity_data=dataset.data.entity_data, - group_entities=UK_GROUP_ENTITIES, - variable_name=variable_name, - variable_value=variable_value, - ) - return PolicyEngineUKDataset( - id=dataset.id + f"_filtered_{variable_name}_{variable_value}", - name=dataset.name, - description=f"{dataset.description} (filtered: {variable_name}={variable_value})", - filepath=dataset.filepath, - year=dataset.year, - is_output_dataset=dataset.is_output_dataset, - data=UKYearData( - person=filtered["person"], - benunit=filtered["benunit"], - household=filtered["household"], - ), - ) - - def run(self, simulation: "Simulation") -> "Simulation": - from policyengine_uk import Microsimulation - from policyengine_uk.data import UKSingleYearDataset - - from policyengine.utils.parametric_reforms import ( - simulation_modifier_from_parameter_values, - ) - - assert isinstance(simulation.dataset, PolicyEngineUKDataset) - - dataset = simulation.dataset - dataset.load() - - # Apply regional scoping if specified - 
if simulation.scoping_strategy: - scoped_data = simulation.scoping_strategy.apply( - entity_data=dataset.data.entity_data, - group_entities=UK_GROUP_ENTITIES, - year=dataset.year, - ) - dataset = PolicyEngineUKDataset( - id=dataset.id + "_scoped", - name=dataset.name, - description=dataset.description, - filepath=dataset.filepath, - year=dataset.year, - is_output_dataset=dataset.is_output_dataset, - data=UKYearData( - person=scoped_data["person"], - benunit=scoped_data["benunit"], - household=scoped_data["household"], - ), - ) - elif simulation.filter_field and simulation.filter_value: - dataset = self._filter_dataset_by_household_variable( - dataset, simulation.filter_field, simulation.filter_value - ) - - input_data = UKSingleYearDataset( - person=dataset.data.person, - benunit=dataset.data.benunit, - household=dataset.data.household, - fiscal_year=dataset.year, - ) - microsim = Microsimulation(dataset=input_data) - - if simulation.policy and simulation.policy.simulation_modifier is not None: - simulation.policy.simulation_modifier(microsim) - elif simulation.policy: - modifier = simulation_modifier_from_parameter_values( - simulation.policy.parameter_values - ) - modifier(microsim) - - if simulation.dynamic and simulation.dynamic.simulation_modifier is not None: - simulation.dynamic.simulation_modifier(microsim) - elif simulation.dynamic: - modifier = simulation_modifier_from_parameter_values( - simulation.dynamic.parameter_values - ) - modifier(microsim) - - data = { - "person": pd.DataFrame(), - "benunit": pd.DataFrame(), - "household": pd.DataFrame(), - } - - for entity, variables in self.entity_variables.items(): - for var in variables: - data[entity][var] = microsim.calculate( - var, period=simulation.dataset.year, map_to=entity - ).values - - data["person"] = MicroDataFrame(data["person"], weights="person_weight") - data["benunit"] = MicroDataFrame(data["benunit"], weights="benunit_weight") - data["household"] = MicroDataFrame( - data["household"], 
weights="household_weight" - ) - - simulation.output_dataset = PolicyEngineUKDataset( - id=simulation.id, - name=dataset.name, - description=dataset.description, - filepath=str( - Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") - ), - year=simulation.dataset.year, - is_output_dataset=True, - data=UKYearData( - person=data["person"], - benunit=data["benunit"], - household=data["household"], - ), - ) - - def save(self, simulation: "Simulation"): - """Save the simulation's output dataset.""" - simulation.output_dataset.save() - - def load(self, simulation: "Simulation"): - """Load the simulation's output dataset.""" - import os - - filepath = str( - Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") - ) - - simulation.output_dataset = PolicyEngineUKDataset( - id=simulation.id, - name=simulation.dataset.name, - description=simulation.dataset.description, - filepath=filepath, - year=simulation.dataset.year, - is_output_dataset=True, - ) - - # Load timestamps from file system metadata - if os.path.exists(filepath): - simulation.created_at = datetime.datetime.fromtimestamp( - os.path.getctime(filepath) - ) - simulation.updated_at = datetime.datetime.fromtimestamp( - os.path.getmtime(filepath) - ) - - -def _managed_release_bundle( - dataset_uri: str, - dataset_source: Optional[str] = None, -) -> dict[str, Optional[str]]: - bundle = dict(uk_latest.release_bundle) - bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) - bundle["runtime_dataset_uri"] = dataset_uri - if dataset_source: - bundle["runtime_dataset_source"] = dataset_source - bundle["managed_by"] = "policyengine.py" - return bundle - - -def managed_microsimulation( - *, - dataset: Optional[str] = None, - allow_unmanaged: bool = False, - **kwargs, -): - """Construct a country-package Microsimulation pinned to this bundle. - - By default this enforces the dataset selection from the bundled - `policyengine.py` release manifest. 
Arbitrary dataset URIs require - `allow_unmanaged=True`. - """ - - from policyengine_uk import Microsimulation - - if "dataset" in kwargs: - raise ValueError( - "Pass `dataset=` directly to managed_microsimulation, not through " - "**kwargs, so policyengine.py can enforce the release bundle." - ) - - dataset_uri = resolve_managed_dataset_reference( - "uk", - dataset, - allow_unmanaged=allow_unmanaged, - ) - dataset_source = resolve_local_managed_dataset_source( - "uk", - dataset_uri, - allow_local_mirror=not ( - allow_unmanaged and dataset is not None and "://" in dataset - ), - ) - runtime_dataset = dataset_source - if isinstance(dataset_source, str) and "hf://" not in dataset_source: - from policyengine_uk.data.dataset_schema import ( - UKMultiYearDataset, - UKSingleYearDataset, - ) - - if UKMultiYearDataset.validate_file_path(dataset_source, False): - runtime_dataset = UKMultiYearDataset(dataset_source) - elif UKSingleYearDataset.validate_file_path(dataset_source, False): - runtime_dataset = UKSingleYearDataset(dataset_source) - microsim = Microsimulation(dataset=runtime_dataset, **kwargs) - microsim.policyengine_bundle = _managed_release_bundle( - dataset_uri, - dataset_source, - ) - return microsim - - -uk_latest = PolicyEngineUKLatest() diff --git a/build/lib/policyengine/tax_benefit_models/uk/outputs.py b/build/lib/policyengine/tax_benefit_models/uk/outputs.py deleted file mode 100644 index 97032a9c..00000000 --- a/build/lib/policyengine/tax_benefit_models/uk/outputs.py +++ /dev/null @@ -1,105 +0,0 @@ -"""UK-specific output templates.""" - -from typing import Optional - -from pydantic import ConfigDict - -from policyengine.core import Output, Simulation -from policyengine.outputs.aggregate import Aggregate, AggregateType -from policyengine.outputs.change_aggregate import ( - ChangeAggregate, - ChangeAggregateType, -) - - -class ProgrammeStatistics(Output): - """Single programme's statistics from a policy reform - represents one database row.""" - - 
model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: Simulation - reform_simulation: Simulation - programme_name: str - entity: str - is_tax: bool = False - - # Results populated by run() - baseline_total: Optional[float] = None - reform_total: Optional[float] = None - change: Optional[float] = None - baseline_count: Optional[float] = None - reform_count: Optional[float] = None - winners: Optional[float] = None - losers: Optional[float] = None - - def run(self): - """Calculate statistics for this programme.""" - # Baseline totals - baseline_total = Aggregate( - simulation=self.baseline_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.SUM, - entity=self.entity, - ) - baseline_total.run() - - # Reform totals - reform_total = Aggregate( - simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.SUM, - entity=self.entity, - ) - reform_total.run() - - # Count of recipients/payers (baseline) - baseline_count = Aggregate( - simulation=self.baseline_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.COUNT, - entity=self.entity, - filter_variable=self.programme_name, - filter_variable_geq=0.01, - ) - baseline_count.run() - - # Count of recipients/payers (reform) - reform_count = Aggregate( - simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.COUNT, - entity=self.entity, - filter_variable=self.programme_name, - filter_variable_geq=0.01, - ) - reform_count.run() - - # Winners and losers - winners = ChangeAggregate( - baseline_simulation=self.baseline_simulation, - reform_simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=ChangeAggregateType.COUNT, - entity=self.entity, - change_geq=0.01 if not self.is_tax else -0.01, - ) - winners.run() - - losers = ChangeAggregate( - baseline_simulation=self.baseline_simulation, - reform_simulation=self.reform_simulation, - 
variable=self.programme_name, - aggregate_type=ChangeAggregateType.COUNT, - entity=self.entity, - change_leq=-0.01 if not self.is_tax else 0.01, - ) - losers.run() - - # Populate results - self.baseline_total = float(baseline_total.result) - self.reform_total = float(reform_total.result) - self.change = float(reform_total.result - baseline_total.result) - self.baseline_count = float(baseline_count.result) - self.reform_count = float(reform_count.result) - self.winners = float(winners.result) - self.losers = float(losers.result) diff --git a/build/lib/policyengine/tax_benefit_models/us.py b/build/lib/policyengine/tax_benefit_models/us.py deleted file mode 100644 index bbc29486..00000000 --- a/build/lib/policyengine/tax_benefit_models/us.py +++ /dev/null @@ -1,40 +0,0 @@ -"""PolicyEngine US tax-benefit model - imports from us/ module.""" - -from importlib.util import find_spec - -if find_spec("policyengine_us") is not None: - from .us import ( - PolicyEngineUS, - PolicyEngineUSDataset, - PolicyEngineUSLatest, - ProgramStatistics, - USYearData, - create_datasets, - ensure_datasets, - general_policy_reform_analysis, - load_datasets, - managed_microsimulation, - us_latest, - us_model, - ) - - __all__ = [ - "USYearData", - "PolicyEngineUSDataset", - "create_datasets", - "load_datasets", - "ensure_datasets", - "PolicyEngineUS", - "PolicyEngineUSLatest", - "managed_microsimulation", - "us_model", - "us_latest", - "general_policy_reform_analysis", - "ProgramStatistics", - ] - - # Rebuild models to resolve forward references - PolicyEngineUSDataset.model_rebuild() - PolicyEngineUSLatest.model_rebuild() -else: - __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/us/__init__.py b/build/lib/policyengine/tax_benefit_models/us/__init__.py deleted file mode 100644 index 75d2aa79..00000000 --- a/build/lib/policyengine/tax_benefit_models/us/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -"""PolicyEngine US tax-benefit model.""" - -from importlib.util import find_spec - 
-if find_spec("policyengine_us") is not None: - from policyengine.core import Dataset - - from .analysis import ( - USHouseholdInput, - USHouseholdOutput, - calculate_household_impact, - economic_impact_analysis, - ) - from .datasets import ( - PolicyEngineUSDataset, - USYearData, - create_datasets, - ensure_datasets, - load_datasets, - ) - from .model import ( - PolicyEngineUS, - PolicyEngineUSLatest, - managed_microsimulation, - us_latest, - us_model, - ) - from .outputs import ProgramStatistics - - # Rebuild Pydantic models to resolve forward references - Dataset.model_rebuild() - USYearData.model_rebuild() - PolicyEngineUSDataset.model_rebuild() - PolicyEngineUSLatest.model_rebuild() - ProgramStatistics.model_rebuild() - - __all__ = [ - "USYearData", - "PolicyEngineUSDataset", - "create_datasets", - "load_datasets", - "ensure_datasets", - "PolicyEngineUS", - "PolicyEngineUSLatest", - "managed_microsimulation", - "us_model", - "us_latest", - "economic_impact_analysis", - "calculate_household_impact", - "USHouseholdInput", - "USHouseholdOutput", - "ProgramStatistics", - ] -else: - __all__ = [] diff --git a/build/lib/policyengine/tax_benefit_models/us/analysis.py b/build/lib/policyengine/tax_benefit_models/us/analysis.py deleted file mode 100644 index 122ae2af..00000000 --- a/build/lib/policyengine/tax_benefit_models/us/analysis.py +++ /dev/null @@ -1,311 +0,0 @@ -"""General utility functions for US policy reform analysis.""" - -import tempfile -from pathlib import Path -from typing import Any, Optional, Union - -import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel, Field - -from policyengine.core import OutputCollection, Simulation -from policyengine.core.policy import Policy -from policyengine.outputs.decile_impact import ( - DecileImpact, - calculate_decile_impacts, -) -from policyengine.outputs.inequality import ( - Inequality, - USInequalityPreset, - calculate_us_inequality, -) -from policyengine.outputs.poverty import ( - 
Poverty, - calculate_us_poverty_rates, -) - -from .datasets import PolicyEngineUSDataset, USYearData -from .model import us_latest -from .outputs import ProgramStatistics - - -class USHouseholdOutput(BaseModel): - """Output from a US household calculation with all entity data.""" - - person: list[dict[str, Any]] - marital_unit: list[dict[str, Any]] - family: list[dict[str, Any]] - spm_unit: list[dict[str, Any]] - tax_unit: list[dict[str, Any]] - household: dict[str, Any] - - -class USHouseholdInput(BaseModel): - """Input for a US household calculation.""" - - people: list[dict[str, Any]] - marital_unit: dict[str, Any] = Field(default_factory=dict) - family: dict[str, Any] = Field(default_factory=dict) - spm_unit: dict[str, Any] = Field(default_factory=dict) - tax_unit: dict[str, Any] = Field(default_factory=dict) - household: dict[str, Any] = Field(default_factory=dict) - year: int = 2024 - - -def calculate_household_impact( - household_input: USHouseholdInput, - policy: Optional[Policy] = None, -) -> USHouseholdOutput: - """Calculate tax and benefit impacts for a single US household.""" - n_people = len(household_input.people) - - # Build person data with defaults - person_data = { - "person_id": list(range(n_people)), - "person_household_id": [0] * n_people, - "person_marital_unit_id": [0] * n_people, - "person_family_id": [0] * n_people, - "person_spm_unit_id": [0] * n_people, - "person_tax_unit_id": [0] * n_people, - "person_weight": [1.0] * n_people, - } - # Add user-provided person fields - for i, person in enumerate(household_input.people): - for key, value in person.items(): - if key not in person_data: - person_data[key] = [0.0] * n_people # Default to 0 for numeric fields - person_data[key][i] = value - - # Build entity data with defaults - household_data = { - "household_id": [0], - "household_weight": [1.0], - } - for key, value in household_input.household.items(): - household_data[key] = [value] - - marital_unit_data = { - "marital_unit_id": [0], - 
"marital_unit_weight": [1.0], - } - for key, value in household_input.marital_unit.items(): - marital_unit_data[key] = [value] - - family_data = { - "family_id": [0], - "family_weight": [1.0], - } - for key, value in household_input.family.items(): - family_data[key] = [value] - - spm_unit_data = { - "spm_unit_id": [0], - "spm_unit_weight": [1.0], - } - for key, value in household_input.spm_unit.items(): - spm_unit_data[key] = [value] - - tax_unit_data = { - "tax_unit_id": [0], - "tax_unit_weight": [1.0], - } - for key, value in household_input.tax_unit.items(): - tax_unit_data[key] = [value] - - # Create MicroDataFrames - person_df = MicroDataFrame(pd.DataFrame(person_data), weights="person_weight") - household_df = MicroDataFrame( - pd.DataFrame(household_data), weights="household_weight" - ) - marital_unit_df = MicroDataFrame( - pd.DataFrame(marital_unit_data), weights="marital_unit_weight" - ) - family_df = MicroDataFrame(pd.DataFrame(family_data), weights="family_weight") - spm_unit_df = MicroDataFrame(pd.DataFrame(spm_unit_data), weights="spm_unit_weight") - tax_unit_df = MicroDataFrame(pd.DataFrame(tax_unit_data), weights="tax_unit_weight") - - # Create temporary dataset - tmpdir = tempfile.mkdtemp() - filepath = str(Path(tmpdir) / "household_impact.h5") - - dataset = PolicyEngineUSDataset( - name="Household impact calculation", - description="Single household for impact calculation", - filepath=filepath, - year=household_input.year, - data=USYearData( - person=person_df, - household=household_df, - marital_unit=marital_unit_df, - family=family_df, - spm_unit=spm_unit_df, - tax_unit=tax_unit_df, - ), - ) - - # Run simulation - simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - policy=policy, - ) - simulation.run() - - # Extract all output variables defined in entity_variables - output_data = simulation.output_dataset.data - - def safe_convert(value): - """Convert value to float if numeric, otherwise return as string.""" - 
try: - return float(value) - except (ValueError, TypeError): - return str(value) - - def extract_entity_outputs( - entity_name: str, entity_data, n_rows: int - ) -> list[dict[str, Any]]: - outputs = [] - for i in range(n_rows): - row_dict = {} - for var in us_latest.entity_variables[entity_name]: - row_dict[var] = safe_convert(entity_data[var].iloc[i]) - outputs.append(row_dict) - return outputs - - return USHouseholdOutput( - person=extract_entity_outputs("person", output_data.person, n_people), - marital_unit=extract_entity_outputs( - "marital_unit", output_data.marital_unit, 1 - ), - family=extract_entity_outputs("family", output_data.family, 1), - spm_unit=extract_entity_outputs("spm_unit", output_data.spm_unit, 1), - tax_unit=extract_entity_outputs("tax_unit", output_data.tax_unit, 1), - household={ - var: safe_convert(output_data.household[var].iloc[0]) - for var in us_latest.entity_variables["household"] - }, - ) - - -class PolicyReformAnalysis(BaseModel): - """Complete policy reform analysis result.""" - - decile_impacts: OutputCollection[DecileImpact] - program_statistics: OutputCollection[ProgramStatistics] - baseline_poverty: OutputCollection[Poverty] - reform_poverty: OutputCollection[Poverty] - baseline_inequality: Inequality - reform_inequality: Inequality - - -def economic_impact_analysis( - baseline_simulation: Simulation, - reform_simulation: Simulation, - inequality_preset: Union[USInequalityPreset, str] = USInequalityPreset.STANDARD, -) -> PolicyReformAnalysis: - """Perform comprehensive analysis of a policy reform. 
- - Args: - baseline_simulation: Baseline simulation - reform_simulation: Reform simulation - inequality_preset: Optional preset for the inequality outputs - - Returns: - PolicyReformAnalysis containing decile impacts and program statistics - """ - baseline_simulation.ensure() - reform_simulation.ensure() - - assert len(baseline_simulation.dataset.data.household) > 100, ( - "Baseline simulation must have more than 100 households" - ) - assert len(reform_simulation.dataset.data.household) > 100, ( - "Reform simulation must have more than 100 households" - ) - - # Decile impact (using household_net_income for US) - decile_impacts = calculate_decile_impacts( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - income_variable="household_net_income", - ) - - # Major programs to analyse - programs = { - # Federal taxes - "income_tax": {"entity": "tax_unit", "is_tax": True}, - "payroll_tax": {"entity": "person", "is_tax": True}, - # State and local taxes - "state_income_tax": {"entity": "tax_unit", "is_tax": True}, - # Benefits - "snap": {"entity": "spm_unit", "is_tax": False}, - "tanf": {"entity": "spm_unit", "is_tax": False}, - "ssi": {"entity": "person", "is_tax": False}, - "social_security": {"entity": "person", "is_tax": False}, - "medicare": {"entity": "person", "is_tax": False}, - "medicaid": {"entity": "person", "is_tax": False}, - "eitc": {"entity": "tax_unit", "is_tax": False}, - "ctc": {"entity": "tax_unit", "is_tax": False}, - } - - program_statistics = [] - - for program_name, program_info in programs.items(): - entity = program_info["entity"] - is_tax = program_info["is_tax"] - - stats = ProgramStatistics( - baseline_simulation=baseline_simulation, - reform_simulation=reform_simulation, - program_name=program_name, - entity=entity, - is_tax=is_tax, - ) - stats.run() - program_statistics.append(stats) - - # Create DataFrame - program_df = pd.DataFrame( - [ - { - "baseline_simulation_id": p.baseline_simulation.id, - 
"reform_simulation_id": p.reform_simulation.id, - "program_name": p.program_name, - "entity": p.entity, - "is_tax": p.is_tax, - "baseline_total": p.baseline_total, - "reform_total": p.reform_total, - "change": p.change, - "baseline_count": p.baseline_count, - "reform_count": p.reform_count, - "winners": p.winners, - "losers": p.losers, - } - for p in program_statistics - ] - ) - - program_collection = OutputCollection( - outputs=program_statistics, dataframe=program_df - ) - - # Calculate poverty rates for both simulations - baseline_poverty = calculate_us_poverty_rates(baseline_simulation) - reform_poverty = calculate_us_poverty_rates(reform_simulation) - - # Calculate inequality for both simulations - baseline_inequality = calculate_us_inequality( - baseline_simulation, preset=inequality_preset - ) - reform_inequality = calculate_us_inequality( - reform_simulation, preset=inequality_preset - ) - - return PolicyReformAnalysis( - decile_impacts=decile_impacts, - program_statistics=program_collection, - baseline_poverty=baseline_poverty, - reform_poverty=reform_poverty, - baseline_inequality=baseline_inequality, - reform_inequality=reform_inequality, - ) diff --git a/build/lib/policyengine/tax_benefit_models/us/datasets.py b/build/lib/policyengine/tax_benefit_models/us/datasets.py deleted file mode 100644 index da10733b..00000000 --- a/build/lib/policyengine/tax_benefit_models/us/datasets.py +++ /dev/null @@ -1,359 +0,0 @@ -import warnings -from pathlib import Path -from typing import Optional - -import pandas as pd -from microdf import MicroDataFrame -from pydantic import ConfigDict - -from policyengine.core import Dataset, YearData -from policyengine.core.release_manifest import ( - dataset_logical_name, - resolve_dataset_reference, -) - - -class USYearData(YearData): - """Entity-level data for a single year.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - person: MicroDataFrame - marital_unit: MicroDataFrame - family: MicroDataFrame - spm_unit: 
MicroDataFrame - tax_unit: MicroDataFrame - household: MicroDataFrame - - @property - def entity_data(self) -> dict[str, MicroDataFrame]: - """Return a dictionary of entity names to their data.""" - return { - "person": self.person, - "marital_unit": self.marital_unit, - "family": self.family, - "spm_unit": self.spm_unit, - "tax_unit": self.tax_unit, - "household": self.household, - } - - -class PolicyEngineUSDataset(Dataset): - """US dataset with multi-year entity-level data.""" - - data: Optional[USYearData] = None - - def model_post_init(self, __context) -> None: - """Called after Pydantic initialization.""" - # Make sure we are synchronised between in-memory and storage, at least on initialisation - if self.data is not None: - self.save() - elif self.filepath and not self.data: - self.load() - - def save(self) -> None: - """Save dataset to HDF5 file.""" - filepath = Path(self.filepath) - if not filepath.parent.exists(): - filepath.parent.mkdir(parents=True, exist_ok=True) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - category=pd.errors.PerformanceWarning, - message=".*PyTables will pickle object types.*", - ) - with pd.HDFStore(filepath, mode="w") as store: - store["person"] = pd.DataFrame(self.data.person) - store["marital_unit"] = pd.DataFrame(self.data.marital_unit) - store["family"] = pd.DataFrame(self.data.family) - store["spm_unit"] = pd.DataFrame(self.data.spm_unit) - store["tax_unit"] = pd.DataFrame(self.data.tax_unit) - store["household"] = pd.DataFrame(self.data.household) - - def load(self) -> None: - """Load dataset from HDF5 file into this instance.""" - filepath = self.filepath - with pd.HDFStore(filepath, mode="r") as store: - self.data = USYearData( - person=MicroDataFrame(store["person"], weights="person_weight"), - marital_unit=MicroDataFrame( - store["marital_unit"], weights="marital_unit_weight" - ), - family=MicroDataFrame(store["family"], weights="family_weight"), - spm_unit=MicroDataFrame(store["spm_unit"], 
weights="spm_unit_weight"), - tax_unit=MicroDataFrame(store["tax_unit"], weights="tax_unit_weight"), - household=MicroDataFrame( - store["household"], weights="household_weight" - ), - ) - - def __repr__(self) -> str: - if self.data is None: - return f"" - else: - n_people = len(self.data.person) - n_marital_units = len(self.data.marital_unit) - n_families = len(self.data.family) - n_spm_units = len(self.data.spm_unit) - n_tax_units = len(self.data.tax_unit) - n_households = len(self.data.household) - return f"" - - -def create_datasets( - datasets: list[str] = [ - "enhanced_cps_2024", - ], - years: list[int] = [2024, 2025, 2026, 2027, 2028], - data_folder: str = "./data", -) -> dict[str, PolicyEngineUSDataset]: - """Create PolicyEngineUSDataset instances from logical dataset names or URLs. - - Args: - datasets: List of logical dataset names or HuggingFace dataset URLs - years: List of years to extract data for - data_folder: Directory to save the dataset files - - Returns: - Dictionary mapping dataset keys (e.g., "enhanced_cps_2024") to PolicyEngineUSDataset objects - """ - from policyengine_us import Microsimulation - - result = {} - for dataset in datasets: - resolved_dataset = resolve_dataset_reference("us", dataset) - dataset_stem = dataset_logical_name(resolved_dataset) - sim = Microsimulation(dataset=resolved_dataset) - - for year in years: - # Get all input variables from the simulation - # We'll calculate each input variable for the specified year - entity_data = { - "person": {}, - "household": {}, - "marital_unit": {}, - "family": {}, - "spm_unit": {}, - "tax_unit": {}, - } - - # First, get ID columns which are structural (not input variables) - # These define entity membership and relationships - # For person-level links to group entities, use person_X_id naming - id_variables = { - "person": [ - "person_id", - "person_household_id", - "person_marital_unit_id", - "person_family_id", - "person_spm_unit_id", - "person_tax_unit_id", - ], - "household": 
["household_id"], - "marital_unit": ["marital_unit_id"], - "family": ["family_id"], - "spm_unit": ["spm_unit_id"], - "tax_unit": ["tax_unit_id"], - } - - for entity_key, var_names in id_variables.items(): - for id_var in var_names: - if id_var in sim.tax_benefit_system.variables: - values = sim.calculate(id_var, period=year).values - entity_data[entity_key][id_var] = values - - # Get input variables and calculate them for this year - for variable_name in sim.input_variables: - variable = sim.tax_benefit_system.variables[variable_name] - entity_key = variable.entity.key - - # Calculate the variable for the given year - values = sim.calculate(variable_name, period=year).values - - # Store in the appropriate entity dictionary - entity_data[entity_key][variable_name] = values - - # Build entity DataFrames - person_df = pd.DataFrame(entity_data["person"]) - household_df = pd.DataFrame(entity_data["household"]) - marital_unit_df = pd.DataFrame(entity_data["marital_unit"]) - family_df = pd.DataFrame(entity_data["family"]) - spm_unit_df = pd.DataFrame(entity_data["spm_unit"]) - tax_unit_df = pd.DataFrame(entity_data["tax_unit"]) - - # Add weight columns - household weights are primary, map to all entities - # Person weights = household weights (mapped via person_household_id) - if "household_weight" in household_df.columns: - # Only add person_weight if it doesn't already exist - if "person_weight" not in person_df.columns: - person_df = person_df.merge( - household_df[["household_id", "household_weight"]], - left_on="person_household_id", - right_on="household_id", - how="left", - ) - person_df = person_df.rename( - columns={"household_weight": "person_weight"} - ) - person_df = person_df.drop( - columns=["household_id"], errors="ignore" - ) - - # Map household weights to other group entities via person table - for entity_name, entity_df, person_id_col, entity_id_col in [ - ( - "marital_unit", - marital_unit_df, - "person_marital_unit_id", - "marital_unit_id", - ), - 
("family", family_df, "person_family_id", "family_id"), - ( - "spm_unit", - spm_unit_df, - "person_spm_unit_id", - "spm_unit_id", - ), - ( - "tax_unit", - tax_unit_df, - "person_tax_unit_id", - "tax_unit_id", - ), - ]: - # Only add entity weight if it doesn't already exist - if f"{entity_name}_weight" not in entity_df.columns: - # Get household_id for each entity from person table - entity_household_map = person_df[ - [person_id_col, "person_household_id"] - ].drop_duplicates() - entity_df = entity_df.merge( - entity_household_map, - left_on=entity_id_col, - right_on=person_id_col, - how="left", - ) - entity_df = entity_df.merge( - household_df[["household_id", "household_weight"]], - left_on="person_household_id", - right_on="household_id", - how="left", - ) - entity_df = entity_df.rename( - columns={"household_weight": f"{entity_name}_weight"} - ) - entity_df = entity_df.drop( - columns=[ - "household_id", - "person_household_id", - person_id_col, - ], - errors="ignore", - ) - - # Update the entity_data - if entity_name == "marital_unit": - marital_unit_df = entity_df - elif entity_name == "family": - family_df = entity_df - elif entity_name == "spm_unit": - spm_unit_df = entity_df - elif entity_name == "tax_unit": - tax_unit_df = entity_df - - us_dataset = PolicyEngineUSDataset( - id=f"{dataset_stem}_year_{year}", - name=f"{dataset_stem}-year-{year}", - description=f"US Dataset for year {year} based on {dataset_stem}", - filepath=f"{data_folder}/{dataset_stem}_year_{year}.h5", - year=int(year), - data=USYearData( - person=MicroDataFrame(person_df, weights="person_weight"), - household=MicroDataFrame(household_df, weights="household_weight"), - marital_unit=MicroDataFrame( - marital_unit_df, weights="marital_unit_weight" - ), - family=MicroDataFrame(family_df, weights="family_weight"), - spm_unit=MicroDataFrame(spm_unit_df, weights="spm_unit_weight"), - tax_unit=MicroDataFrame(tax_unit_df, weights="tax_unit_weight"), - ), - ) - us_dataset.save() - - dataset_key = 
f"{dataset_stem}_{year}" - result[dataset_key] = us_dataset - - return result - - -def load_datasets( - datasets: list[str] = [ - "enhanced_cps_2024", - ], - years: list[int] = [2024, 2025, 2026, 2027, 2028], - data_folder: str = "./data", -) -> dict[str, PolicyEngineUSDataset]: - """Load PolicyEngineUSDataset instances from saved HDF5 files. - - Args: - datasets: List of HuggingFace dataset paths (used to derive file names) - years: List of years to load data for - data_folder: Directory containing the dataset files - - Returns: - Dictionary mapping dataset keys (e.g., "enhanced_cps_2024") to PolicyEngineUSDataset objects - """ - result = {} - for dataset in datasets: - resolved_dataset = resolve_dataset_reference("us", dataset) - dataset_stem = dataset_logical_name(resolved_dataset) - for year in years: - filepath = f"{data_folder}/{dataset_stem}_year_{year}.h5" - us_dataset = PolicyEngineUSDataset( - name=f"{dataset_stem}-year-{year}", - description=f"US Dataset for year {year} based on {dataset_stem}", - filepath=filepath, - year=year, - ) - us_dataset.load() - - dataset_key = f"{dataset_stem}_{year}" - result[dataset_key] = us_dataset - - return result - - -def ensure_datasets( - datasets: list[str] = [ - "enhanced_cps_2024", - ], - years: list[int] = [2024, 2025, 2026, 2027, 2028], - data_folder: str = "./data", -) -> dict[str, PolicyEngineUSDataset]: - """Ensure datasets exist, loading if available or creating if not. 
- - Args: - datasets: List of HuggingFace dataset paths - years: List of years to load/create data for - data_folder: Directory containing or to save the dataset files - - Returns: - Dictionary mapping dataset keys to PolicyEngineUSDataset objects - """ - # Check if all dataset files exist - all_exist = True - for dataset in datasets: - resolved_dataset = resolve_dataset_reference("us", dataset) - dataset_stem = dataset_logical_name(resolved_dataset) - for year in years: - filepath = Path(f"{data_folder}/{dataset_stem}_year_{year}.h5") - if not filepath.exists(): - all_exist = False - break - if not all_exist: - break - - if all_exist: - return load_datasets(datasets=datasets, years=years, data_folder=data_folder) - else: - return create_datasets(datasets=datasets, years=years, data_folder=data_folder) diff --git a/build/lib/policyengine/tax_benefit_models/us/model.py b/build/lib/policyengine/tax_benefit_models/us/model.py deleted file mode 100644 index a896f5c4..00000000 --- a/build/lib/policyengine/tax_benefit_models/us/model.py +++ /dev/null @@ -1,650 +0,0 @@ -import datetime -from importlib import metadata -from pathlib import Path -from typing import TYPE_CHECKING, Optional - -import pandas as pd -from microdf import MicroDataFrame - -from policyengine.core import ( - Parameter, - ParameterNode, - TaxBenefitModel, - TaxBenefitModelVersion, - Variable, -) -from policyengine.core.release_manifest import ( - certify_data_release_compatibility, - dataset_logical_name, - get_release_manifest, - resolve_local_managed_dataset_source, - resolve_managed_dataset_reference, -) -from policyengine.utils.entity_utils import ( - build_entity_relationships, - filter_dataset_by_household_variable, -) -from policyengine.utils.parameter_labels import ( - build_scale_lookup, - generate_label_for_parameter, -) - -from .datasets import PolicyEngineUSDataset, USYearData - -if TYPE_CHECKING: - from policyengine.core.simulation import Simulation - -US_GROUP_ENTITIES = [ - "household", 
- "tax_unit", - "spm_unit", - "family", - "marital_unit", -] - - -class PolicyEngineUS(TaxBenefitModel): - id: str = "policyengine-us" - description: str = "The US's open-source dynamic tax and benefit microsimulation model maintained by PolicyEngine." - - -us_model = PolicyEngineUS() - - -def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: - try: - from policyengine_us.build_metadata import get_data_build_metadata - except ModuleNotFoundError as exc: - if exc.name != "policyengine_us.build_metadata": - raise - return {} - - return get_data_build_metadata() or {} - - -class PolicyEngineUSLatest(TaxBenefitModelVersion): - model: TaxBenefitModel = us_model - version: str = None - created_at: datetime.datetime = None - - entity_variables: dict[str, list[str]] = { - "person": [ - # IDs and weights - "person_id", - "marital_unit_id", - "family_id", - "spm_unit_id", - "tax_unit_id", - "household_id", - "person_weight", - # Demographics - "age", - "is_male", - "race", - "is_child", - "is_adult", - # Income - "employment_income", - # Benefits - "ssi", - "social_security", - "medicaid", - "unemployment_compensation", - ], - "marital_unit": [ - "marital_unit_id", - "marital_unit_weight", - ], - "family": [ - "family_id", - "family_weight", - ], - "spm_unit": [ - "spm_unit_id", - "spm_unit_weight", - "snap", - "tanf", - "spm_unit_net_income", - # Poverty measures - "spm_unit_is_in_spm_poverty", - "spm_unit_is_in_deep_spm_poverty", - ], - "tax_unit": [ - "tax_unit_id", - "tax_unit_weight", - "income_tax", - "employee_payroll_tax", - "household_state_income_tax", - "eitc", - "ctc", - ], - "household": [ - "household_id", - "household_weight", - "household_count_people", - "household_net_income", - "household_income_decile", - "household_benefits", - "household_tax", - "household_market_income", - "congressional_district_geoid", - ], - } - - def __init__(self, **kwargs: dict): - manifest = get_release_manifest("us") - if "version" not in kwargs or 
kwargs.get("version") is None: - kwargs["version"] = manifest.model_package.version - - installed_model_version = metadata.version("policyengine-us") - if installed_model_version != manifest.model_package.version: - raise ValueError( - "Installed policyengine-us version does not match the " - f"bundled policyengine.py manifest. Expected " - f"{manifest.model_package.version}, got {installed_model_version}." - ) - - model_build_metadata = _get_runtime_data_build_metadata() - data_certification = certify_data_release_compatibility( - "us", - runtime_model_version=installed_model_version, - runtime_data_build_fingerprint=model_build_metadata.get( - "data_build_fingerprint" - ), - ) - - super().__init__(**kwargs) - self.release_manifest = manifest - self.model_package = manifest.model_package - self.data_package = manifest.data_package - self.default_dataset_uri = manifest.default_dataset_uri - self.data_certification = data_certification - from policyengine_core.enums import Enum - from policyengine_us.system import system - - # Attach region registry - from policyengine.countries.us.regions import us_region_registry - - self.region_registry = us_region_registry - - self.id = f"{self.model.id}@{self.version}" - - for var_obj in system.variables.values(): - # Serialize default_value for JSON compatibility - default_val = var_obj.default_value - if var_obj.value_type is Enum: - default_val = default_val.name - elif var_obj.value_type is datetime.date: - default_val = default_val.isoformat() - - variable = Variable( - id=self.id + "-" + var_obj.name, - name=var_obj.name, - label=getattr(var_obj, "label", None), - tax_benefit_model_version=self, - entity=var_obj.entity.key, - description=var_obj.documentation, - data_type=var_obj.value_type if var_obj.value_type is not Enum else str, - default_value=default_val, - value_type=var_obj.value_type, - ) - if ( - hasattr(var_obj, "possible_values") - and var_obj.possible_values is not None - ): - variable.possible_values = 
list( - map( - lambda x: x.name, - var_obj.possible_values._value2member_map_.values(), - ) - ) - # Extract and resolve adds/subtracts. - # Core stores these as either list[str] or a parameter path string. - # Resolve parameter paths to lists so consumers always get list[str]. - if hasattr(var_obj, "adds") and var_obj.adds is not None: - if isinstance(var_obj.adds, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.adds) - variable.adds = list(param("2025-01-01")) - except (ValueError, Exception): - variable.adds = None - else: - variable.adds = var_obj.adds - if hasattr(var_obj, "subtracts") and var_obj.subtracts is not None: - if isinstance(var_obj.subtracts, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.subtracts) - variable.subtracts = list(param("2025-01-01")) - except (ValueError, Exception): - variable.subtracts = None - else: - variable.subtracts = var_obj.subtracts - self.add_variable(variable) - - from policyengine_core.parameters import Parameter as CoreParameter - from policyengine_core.parameters import ParameterNode as CoreParameterNode - - scale_lookup = build_scale_lookup(system) - - for param_node in system.parameters.get_descendants(): - if isinstance(param_node, CoreParameter): - parameter = Parameter( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=generate_label_for_parameter( - param_node, system, scale_lookup - ), - tax_benefit_model_version=self, - description=param_node.description, - data_type=type(param_node(2025)), - unit=param_node.metadata.get("unit"), - _core_param=param_node, - ) - self.add_parameter(parameter) - elif isinstance(param_node, CoreParameterNode): - node = ParameterNode( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=param_node.metadata.get("label"), - 
description=param_node.description, - tax_benefit_model_version=self, - ) - self.add_parameter_node(node) - - def _build_entity_relationships( - self, dataset: PolicyEngineUSDataset - ) -> pd.DataFrame: - """Build a DataFrame mapping each person to their containing entities.""" - person_data = pd.DataFrame(dataset.data.person) - return build_entity_relationships(person_data, US_GROUP_ENTITIES) - - def _filter_dataset_by_household_variable( - self, - dataset: PolicyEngineUSDataset, - variable_name: str, - variable_value: str, - ) -> PolicyEngineUSDataset: - """Filter a dataset to only include households where a variable matches.""" - filtered = filter_dataset_by_household_variable( - entity_data=dataset.data.entity_data, - group_entities=US_GROUP_ENTITIES, - variable_name=variable_name, - variable_value=variable_value, - ) - return PolicyEngineUSDataset( - id=dataset.id + f"_filtered_{variable_name}_{variable_value}", - name=dataset.name, - description=f"{dataset.description} (filtered: {variable_name}={variable_value})", - filepath=dataset.filepath, - year=dataset.year, - is_output_dataset=dataset.is_output_dataset, - data=USYearData( - person=filtered["person"], - marital_unit=filtered["marital_unit"], - family=filtered["family"], - spm_unit=filtered["spm_unit"], - tax_unit=filtered["tax_unit"], - household=filtered["household"], - ), - ) - - def run(self, simulation: "Simulation") -> "Simulation": - from policyengine_us import Microsimulation - from policyengine_us.system import system - - from policyengine.utils.parametric_reforms import ( - build_reform_dict, - merge_reform_dicts, - ) - - assert isinstance(simulation.dataset, PolicyEngineUSDataset) - - dataset = simulation.dataset - dataset.load() - - # Apply regional scoping if specified - if simulation.scoping_strategy: - scoped_data = simulation.scoping_strategy.apply( - entity_data=dataset.data.entity_data, - group_entities=US_GROUP_ENTITIES, - year=dataset.year, - ) - dataset = PolicyEngineUSDataset( - 
id=dataset.id + "_scoped", - name=dataset.name, - description=dataset.description, - filepath=dataset.filepath, - year=dataset.year, - is_output_dataset=dataset.is_output_dataset, - data=USYearData( - person=scoped_data["person"], - marital_unit=scoped_data["marital_unit"], - family=scoped_data["family"], - spm_unit=scoped_data["spm_unit"], - tax_unit=scoped_data["tax_unit"], - household=scoped_data["household"], - ), - ) - elif simulation.filter_field and simulation.filter_value: - dataset = self._filter_dataset_by_household_variable( - dataset, simulation.filter_field, simulation.filter_value - ) - - # Build reform dict from policy and dynamic parameter values. - # US requires reforms at Microsimulation construction time - # (unlike UK which supports p.update() after construction). - policy_reform = build_reform_dict(simulation.policy) - dynamic_reform = build_reform_dict(simulation.dynamic) - reform_dict = merge_reform_dicts(policy_reform, dynamic_reform) - - # Create Microsimulation with reform at construction time - microsim = Microsimulation(reform=reform_dict) - self._build_simulation_from_dataset(microsim, dataset, system) - - data = { - "person": pd.DataFrame(), - "marital_unit": pd.DataFrame(), - "family": pd.DataFrame(), - "spm_unit": pd.DataFrame(), - "tax_unit": pd.DataFrame(), - "household": pd.DataFrame(), - } - - # ID columns should be preserved from input dataset, not calculated - id_columns = { - "person_id", - "household_id", - "marital_unit_id", - "family_id", - "spm_unit_id", - "tax_unit_id", - } - weight_columns = { - "person_weight", - "household_weight", - "marital_unit_weight", - "family_weight", - "spm_unit_weight", - "tax_unit_weight", - } - - # First, copy ID and weight columns from input dataset - for entity in data.keys(): - input_df = pd.DataFrame(getattr(dataset.data, entity)) - entity_id_col = f"{entity}_id" - entity_weight_col = f"{entity}_weight" - - if entity_id_col in input_df.columns: - data[entity][entity_id_col] = 
input_df[entity_id_col].values - if entity_weight_col in input_df.columns: - data[entity][entity_weight_col] = input_df[entity_weight_col].values - - # For person entity, also copy person-level group ID columns - person_input_df = pd.DataFrame(dataset.data.person) - for col in person_input_df.columns: - if col.startswith("person_") and col.endswith("_id"): - # Map person_household_id -> household_id, etc. - target_col = col.replace("person_", "") - if target_col in id_columns: - data["person"][target_col] = person_input_df[col].values - - # Then calculate non-ID, non-weight variables from simulation - for entity, variables in self.entity_variables.items(): - for var in variables: - if var not in id_columns and var not in weight_columns: - data[entity][var] = microsim.calculate( - var, period=simulation.dataset.year, map_to=entity - ).values - - data["person"] = MicroDataFrame(data["person"], weights="person_weight") - data["marital_unit"] = MicroDataFrame( - data["marital_unit"], weights="marital_unit_weight" - ) - data["family"] = MicroDataFrame(data["family"], weights="family_weight") - data["spm_unit"] = MicroDataFrame(data["spm_unit"], weights="spm_unit_weight") - data["tax_unit"] = MicroDataFrame(data["tax_unit"], weights="tax_unit_weight") - data["household"] = MicroDataFrame( - data["household"], weights="household_weight" - ) - - simulation.output_dataset = PolicyEngineUSDataset( - id=simulation.id, - name=dataset.name, - description=dataset.description, - filepath=str( - Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") - ), - year=simulation.dataset.year, - is_output_dataset=True, - data=USYearData( - person=data["person"], - marital_unit=data["marital_unit"], - family=data["family"], - spm_unit=data["spm_unit"], - tax_unit=data["tax_unit"], - household=data["household"], - ), - ) - - def save(self, simulation: "Simulation"): - """Save the simulation's output dataset.""" - simulation.output_dataset.save() - - def load(self, simulation: 
"Simulation"): - """Load the simulation's output dataset.""" - import os - - filepath = str( - Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") - ) - - simulation.output_dataset = PolicyEngineUSDataset( - id=simulation.id, - name=simulation.dataset.name, - description=simulation.dataset.description, - filepath=filepath, - year=simulation.dataset.year, - is_output_dataset=True, - ) - - # Load timestamps from file system metadata - if os.path.exists(filepath): - simulation.created_at = datetime.datetime.fromtimestamp( - os.path.getctime(filepath) - ) - simulation.updated_at = datetime.datetime.fromtimestamp( - os.path.getmtime(filepath) - ) - - def _build_simulation_from_dataset(self, microsim, dataset, system): - """Build a PolicyEngine Core simulation from dataset entity IDs. - - This follows the same pattern as policyengine-uk, initializing - entities from IDs first, then using set_input() for variables. - - Args: - microsim: The Microsimulation object to populate - dataset: The dataset containing entity data - system: The tax-benefit system - """ - import numpy as np - from policyengine_core.simulations.simulation_builder import ( - SimulationBuilder, - ) - - # Create builder and instantiate entities - builder = SimulationBuilder() - builder.populations = system.instantiate_entities() - - # Extract entity IDs from dataset - person_data = pd.DataFrame(dataset.data.person) - - # Determine column naming convention - # Support both person_X_id (from create_datasets) and X_id (from custom datasets) - household_id_col = ( - "person_household_id" - if "person_household_id" in person_data.columns - else "household_id" - ) - marital_unit_id_col = ( - "person_marital_unit_id" - if "person_marital_unit_id" in person_data.columns - else "marital_unit_id" - ) - family_id_col = ( - "person_family_id" - if "person_family_id" in person_data.columns - else "family_id" - ) - spm_unit_id_col = ( - "person_spm_unit_id" - if "person_spm_unit_id" in 
person_data.columns - else "spm_unit_id" - ) - tax_unit_id_col = ( - "person_tax_unit_id" - if "person_tax_unit_id" in person_data.columns - else "tax_unit_id" - ) - - # Declare entities - builder.declare_person_entity("person", person_data["person_id"].values) - builder.declare_entity( - "household", np.unique(person_data[household_id_col].values) - ) - builder.declare_entity( - "spm_unit", np.unique(person_data[spm_unit_id_col].values) - ) - builder.declare_entity("family", np.unique(person_data[family_id_col].values)) - builder.declare_entity( - "tax_unit", np.unique(person_data[tax_unit_id_col].values) - ) - builder.declare_entity( - "marital_unit", np.unique(person_data[marital_unit_id_col].values) - ) - - # Join persons to group entities - builder.join_with_persons( - builder.populations["household"], - person_data[household_id_col].values, - np.array(["member"] * len(person_data)), - ) - builder.join_with_persons( - builder.populations["spm_unit"], - person_data[spm_unit_id_col].values, - np.array(["member"] * len(person_data)), - ) - builder.join_with_persons( - builder.populations["family"], - person_data[family_id_col].values, - np.array(["member"] * len(person_data)), - ) - builder.join_with_persons( - builder.populations["tax_unit"], - person_data[tax_unit_id_col].values, - np.array(["member"] * len(person_data)), - ) - builder.join_with_persons( - builder.populations["marital_unit"], - person_data[marital_unit_id_col].values, - np.array(["member"] * len(person_data)), - ) - - # Build simulation from populations - microsim.build_from_populations(builder.populations) - - # Set input variables for each entity - # Skip ID columns as they're structural and already used in entity building - # Support both naming conventions - id_columns = { - "person_id", - "household_id", - "person_household_id", - "spm_unit_id", - "person_spm_unit_id", - "family_id", - "person_family_id", - "tax_unit_id", - "person_tax_unit_id", - "marital_unit_id", - 
"person_marital_unit_id", - } - - for entity_name, entity_df in [ - ("person", dataset.data.person), - ("household", dataset.data.household), - ("spm_unit", dataset.data.spm_unit), - ("family", dataset.data.family), - ("tax_unit", dataset.data.tax_unit), - ("marital_unit", dataset.data.marital_unit), - ]: - df = pd.DataFrame(entity_df) - for column in df.columns: - # Skip ID columns and check if variable exists in system - if column not in id_columns and column in system.variables: - microsim.set_input(column, dataset.year, df[column].values) - - -def _managed_release_bundle( - dataset_uri: str, - dataset_source: Optional[str] = None, -) -> dict[str, Optional[str]]: - bundle = dict(us_latest.release_bundle) - bundle["runtime_dataset"] = dataset_logical_name(dataset_uri) - bundle["runtime_dataset_uri"] = dataset_uri - if dataset_source: - bundle["runtime_dataset_source"] = dataset_source - bundle["managed_by"] = "policyengine.py" - return bundle - - -def managed_microsimulation( - *, - dataset: Optional[str] = None, - allow_unmanaged: bool = False, - **kwargs, -): - """Construct a country-package Microsimulation pinned to this bundle. - - By default this enforces the dataset selection from the bundled - `policyengine.py` release manifest. Arbitrary dataset URIs require - `allow_unmanaged=True`. - """ - - from policyengine_us import Microsimulation - - if "dataset" in kwargs: - raise ValueError( - "Pass `dataset=` directly to managed_microsimulation, not through " - "**kwargs, so policyengine.py can enforce the release bundle." 
- ) - - dataset_uri = resolve_managed_dataset_reference( - "us", - dataset, - allow_unmanaged=allow_unmanaged, - ) - dataset_source = resolve_local_managed_dataset_source( - "us", - dataset_uri, - allow_local_mirror=not ( - allow_unmanaged and dataset is not None and "://" in dataset - ), - ) - microsim = Microsimulation(dataset=dataset_source, **kwargs) - microsim.policyengine_bundle = _managed_release_bundle( - dataset_uri, - dataset_source, - ) - return microsim - - -us_latest = PolicyEngineUSLatest() diff --git a/build/lib/policyengine/tax_benefit_models/us/outputs.py b/build/lib/policyengine/tax_benefit_models/us/outputs.py deleted file mode 100644 index 1dd6f001..00000000 --- a/build/lib/policyengine/tax_benefit_models/us/outputs.py +++ /dev/null @@ -1,105 +0,0 @@ -"""US-specific output templates.""" - -from typing import Optional - -from pydantic import ConfigDict - -from policyengine.core import Output, Simulation -from policyengine.outputs.aggregate import Aggregate, AggregateType -from policyengine.outputs.change_aggregate import ( - ChangeAggregate, - ChangeAggregateType, -) - - -class ProgramStatistics(Output): - """Single program's statistics from a policy reform - represents one database row.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: Simulation - reform_simulation: Simulation - program_name: str - entity: str - is_tax: bool = False - - # Results populated by run() - baseline_total: Optional[float] = None - reform_total: Optional[float] = None - change: Optional[float] = None - baseline_count: Optional[float] = None - reform_count: Optional[float] = None - winners: Optional[float] = None - losers: Optional[float] = None - - def run(self): - """Calculate statistics for this program.""" - # Baseline totals - baseline_total = Aggregate( - simulation=self.baseline_simulation, - variable=self.program_name, - aggregate_type=AggregateType.SUM, - entity=self.entity, - ) - baseline_total.run() - - # Reform totals - 
reform_total = Aggregate( - simulation=self.reform_simulation, - variable=self.program_name, - aggregate_type=AggregateType.SUM, - entity=self.entity, - ) - reform_total.run() - - # Count of recipients/payers (baseline) - baseline_count = Aggregate( - simulation=self.baseline_simulation, - variable=self.program_name, - aggregate_type=AggregateType.COUNT, - entity=self.entity, - filter_variable=self.program_name, - filter_variable_geq=0.01, - ) - baseline_count.run() - - # Count of recipients/payers (reform) - reform_count = Aggregate( - simulation=self.reform_simulation, - variable=self.program_name, - aggregate_type=AggregateType.COUNT, - entity=self.entity, - filter_variable=self.program_name, - filter_variable_geq=0.01, - ) - reform_count.run() - - # Winners and losers - winners = ChangeAggregate( - baseline_simulation=self.baseline_simulation, - reform_simulation=self.reform_simulation, - variable=self.program_name, - aggregate_type=ChangeAggregateType.COUNT, - entity=self.entity, - change_geq=0.01 if not self.is_tax else -0.01, - ) - winners.run() - - losers = ChangeAggregate( - baseline_simulation=self.baseline_simulation, - reform_simulation=self.reform_simulation, - variable=self.program_name, - aggregate_type=ChangeAggregateType.COUNT, - entity=self.entity, - change_leq=-0.01 if not self.is_tax else 0.01, - ) - losers.run() - - # Populate results - self.baseline_total = float(baseline_total.result) - self.reform_total = float(reform_total.result) - self.change = float(reform_total.result - baseline_total.result) - self.baseline_count = float(baseline_count.result) - self.reform_count = float(reform_count.result) - self.winners = float(winners.result) - self.losers = float(losers.result) diff --git a/build/lib/policyengine/utils/__init__.py b/build/lib/policyengine/utils/__init__.py deleted file mode 100644 index bf3cc681..00000000 --- a/build/lib/policyengine/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .dates import parse_safe_date as 
parse_safe_date -from .parameter_labels import build_scale_lookup as build_scale_lookup -from .parameter_labels import ( - generate_label_for_parameter as generate_label_for_parameter, -) -from .plotting import COLORS as COLORS -from .plotting import format_fig as format_fig diff --git a/build/lib/policyengine/utils/dates.py b/build/lib/policyengine/utils/dates.py deleted file mode 100644 index 46cec198..00000000 --- a/build/lib/policyengine/utils/dates.py +++ /dev/null @@ -1,43 +0,0 @@ -import calendar -from datetime import datetime - - -def parse_safe_date(date_string: str) -> datetime: - """ - Parse a YYYY-MM-DD date string and ensure the year is at least 1. - Handles invalid day values by capping to the last valid day of the month. - - Args: - date_string: Date string in YYYY-MM-DD format - - Returns: - Safe datetime object with year >= 1 - """ - try: - date_string = date_string.replace("0000-", "0001-") - date_obj = datetime.strptime(date_string, "%Y-%m-%d") - if date_obj.year < 1: - # Replace year 0 or negative years with year 1 - return date_obj.replace(year=1) - return date_obj - except ValueError as e: - # Try to handle invalid day values (e.g., 2021-06-31) - # Python <3.14: "day is out of range for month" - # Python 3.14+: "day N must be in range 1..M for month ..." - error_msg = str(e) - if "day is out of range for month" in error_msg or ( - "must be in range" in error_msg and "for month" in error_msg - ): - parts = date_string.split("-") - if len(parts) == 3: - year = int(parts[0]) - month = int(parts[1]) - # Get the last valid day of the month - last_day = calendar.monthrange(year, month)[1] - # Use the last valid day instead - corrected_date = f"{year:04d}-{month:02d}-{last_day:02d}" - date_obj = datetime.strptime(corrected_date, "%Y-%m-%d") - if date_obj.year < 1: - return date_obj.replace(year=1) - return date_obj - raise ValueError(f"Invalid date format: {date_string}. 
Expected YYYY-MM-DD") diff --git a/build/lib/policyengine/utils/entity_utils.py b/build/lib/policyengine/utils/entity_utils.py deleted file mode 100644 index f06b5d59..00000000 --- a/build/lib/policyengine/utils/entity_utils.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Shared utilities for entity relationship building and dataset filtering.""" - -import logging - -import pandas as pd -from microdf import MicroDataFrame - -logger = logging.getLogger(__name__) - - -def _resolve_id_column(person_data: pd.DataFrame, entity_name: str) -> str: - """Resolve the ID column name for a group entity in person data. - - Tries `person_{entity}_id` first (standard convention), falls back - to `{entity}_id` (custom datasets). - """ - prefixed = f"person_{entity_name}_id" - bare = f"{entity_name}_id" - if prefixed in person_data.columns: - return prefixed - if bare in person_data.columns: - return bare - raise ValueError( - f"No ID column found for entity '{entity_name}'. " - f"Tried '{prefixed}' and '{bare}'. " - f"Available columns: {list(person_data.columns)}" - ) - - -def build_entity_relationships( - person_data: pd.DataFrame, - group_entities: list[str], -) -> pd.DataFrame: - """Build a DataFrame mapping each person to their containing entities. - - Creates an explicit relationship map between persons and all specified - group entity types. This enables filtering at any entity level while - preserving the integrity of all related entities. - - Args: - person_data: DataFrame of person-level data with ID columns. - group_entities: List of group entity names (e.g., ["household", "tax_unit"]). - - Returns: - A DataFrame with person_id and one {entity}_id column per group entity. 
- """ - columns = {"person_id": person_data["person_id"].values} - for entity in group_entities: - id_col = _resolve_id_column(person_data, entity) - columns[f"{entity}_id"] = person_data[id_col].values - return pd.DataFrame(columns) - - -def filter_dataset_by_household_variable( - entity_data: dict[str, MicroDataFrame], - group_entities: list[str], - variable_name: str, - variable_value: str, -) -> dict[str, MicroDataFrame]: - """Filter dataset entities to only include households where a variable matches. - - Uses an entity relationship approach: builds an explicit map of all - entity relationships, filters at the household level, and keeps all - persons in matching households to preserve entity integrity. - - Args: - entity_data: Dict mapping entity names to their MicroDataFrames - (from YearData.entity_data). - group_entities: List of group entity names for this country. - variable_name: The household-level variable to filter on. - variable_value: The value to match. Handles both str and bytes encoding. - - Returns: - A dict mapping entity names to filtered MicroDataFrames. - - Raises: - ValueError: If variable_name is not found or no households match. - """ - person_data = pd.DataFrame(entity_data["person"]) - household_data = pd.DataFrame(entity_data["household"]) - - if variable_name not in household_data.columns: - raise ValueError( - f"Variable '{variable_name}' not found in household data. 
" - f"Available columns: {list(household_data.columns)}" - ) - - # Build entity relationships - entity_rel = build_entity_relationships(person_data, group_entities) - - # Find matching household IDs - hh_values = household_data[variable_name].values - hh_ids = household_data["household_id"].values - - if isinstance(variable_value, str): - hh_mask = (hh_values == variable_value) | (hh_values == variable_value.encode()) - else: - hh_mask = hh_values == variable_value - - matching_hh_ids = set(hh_ids[hh_mask]) - - if len(matching_hh_ids) == 0: - raise ValueError( - f"No households found matching {variable_name}={variable_value}" - ) - - # Filter persons to those in matching households - person_mask = entity_rel["household_id"].isin(matching_hh_ids) - filtered_rel = entity_rel[person_mask] - - # Collect filtered IDs for each entity - filtered_ids = {"person": set(filtered_rel["person_id"])} - for entity in group_entities: - filtered_ids[entity] = set(filtered_rel[f"{entity}_id"]) - - # Filter each entity DataFrame - result = {} - for entity_name, mdf in entity_data.items(): - df = pd.DataFrame(mdf) - id_col = f"{entity_name}_id" - if entity_name in filtered_ids and id_col in df.columns: - filtered_df = df[df[id_col].isin(filtered_ids[entity_name])] - else: - if entity_name != "person": - logger.warning( - "Entity '%s' not in filtered_ids or missing '%s' column; " - "passing through unfiltered.", - entity_name, - id_col, - ) - filtered_df = df - - weight_col = f"{entity_name}_weight" - weights = weight_col if weight_col in filtered_df.columns else None - result[entity_name] = MicroDataFrame( - filtered_df.reset_index(drop=True), - weights=weights, - ) - - return result diff --git a/build/lib/policyengine/utils/parameter_labels.py b/build/lib/policyengine/utils/parameter_labels.py deleted file mode 100644 index 6a574be8..00000000 --- a/build/lib/policyengine/utils/parameter_labels.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Utilities for generating human-readable labels for 
tax-benefit parameters.""" - -import re - - -def generate_label_for_parameter(param_node, system, scale_lookup): - """ - Generate a label for a parameter that doesn't have one. - - For breakdown parameters: Uses parent label + enum value - For bracket parameters: Uses scale label + bracket info - - Args: - param_node: The CoreParameter object - system: The tax-benefit system (has variables and parameters) - scale_lookup: Dict mapping scale names to ParameterScale objects - - Returns: - str or None: Generated label, or None if cannot generate - """ - if param_node.metadata.get("label"): - return param_node.metadata.get("label") - - param_name = param_node.name - - if "[" in param_name: - return _generate_bracket_label(param_name, scale_lookup) - - # Check for breakdown - either direct child or nested - breakdown_parent = _find_breakdown_parent(param_node) - if breakdown_parent: - return _generate_breakdown_label(param_node, system, breakdown_parent) - - return None - - -def _find_breakdown_parent(param_node): - """ - Walk up the tree to find the nearest ancestor with breakdown metadata. - - Args: - param_node: The CoreParameter object - - Returns: - The breakdown parent node, or None if not found - """ - current = param_node.parent - while current: - if current.metadata.get("breakdown"): - return current - current = getattr(current, "parent", None) - return None - - -def _generate_breakdown_label(param_node, system, breakdown_parent=None): - """ - Generate label for a breakdown parameter using enum values. - - Handles both single-level and nested breakdowns by walking up to the - breakdown parent and collecting all dimension values. 
- - Args: - param_node: The CoreParameter object - system: The tax-benefit system - breakdown_parent: The ancestor node with breakdown metadata (optional) - - Returns: - str or None: Generated label, or None if cannot generate - """ - # Find breakdown parent if not provided - if breakdown_parent is None: - breakdown_parent = _find_breakdown_parent(param_node) - if not breakdown_parent: - return None - - parent_label = breakdown_parent.metadata.get("label") - if not parent_label: - return None - - breakdown_vars = breakdown_parent.metadata.get("breakdown", []) - breakdown_labels = breakdown_parent.metadata.get("breakdown_labels", []) - - # Collect dimension values from breakdown parent to param_node - dimension_values = _collect_dimension_values(param_node, breakdown_parent) - - if not dimension_values: - return None - - # Generate labels for each dimension - formatted_parts = [] - for i, (dim_key, dim_value) in enumerate(dimension_values): - var_name = breakdown_vars[i] if i < len(breakdown_vars) else None - dim_label = breakdown_labels[i] if i < len(breakdown_labels) else None - - formatted_value = _format_dimension_value( - dim_value, var_name, dim_label, system - ) - formatted_parts.append(formatted_value) - - return f"{parent_label} ({', '.join(formatted_parts)})" - - -def _collect_dimension_values(param_node, breakdown_parent): - """ - Collect dimension keys and values from breakdown parent to param_node. 
- - Args: - param_node: The CoreParameter object - breakdown_parent: The ancestor node with breakdown metadata - - Returns: - list of (dimension_key, value) tuples, ordered from parent to child - """ - # Build path from param_node up to breakdown_parent - path = [] - current = param_node - while current and current != breakdown_parent: - path.append(current) - current = getattr(current, "parent", None) - - # Reverse to get parent-to-child order - path.reverse() - - # Extract dimension values - dimension_values = [] - for i, node in enumerate(path): - key = node.name.split(".")[-1] - dimension_values.append((i, key)) - - return dimension_values - - -def _format_dimension_value(value, var_name, dim_label, system): - """ - Format a single dimension value with semantic label if available. - - Args: - value: The raw dimension value (e.g., "SINGLE", "1", "CA") - var_name: The breakdown variable name (e.g., "filing_status", "range(1, 9)") - dim_label: The human-readable label for this dimension (e.g., "Household size") - system: The tax-benefit system - - Returns: - str: Formatted dimension value - """ - # First, try to get enum display value - if ( - var_name - and isinstance(var_name, str) - and not var_name.startswith("range(") - and not var_name.startswith("list(") - ): - var = system.variables.get(var_name) - if var and hasattr(var, "possible_values") and var.possible_values: - try: - enum_value = var.possible_values[value].value - return str(enum_value) - except (KeyError, AttributeError): - pass - - # For range() dimensions or when no enum found, use breakdown_label if available - if dim_label: - return f"{dim_label} {value}" - - return value - - -def _generate_bracket_label(param_name, scale_lookup): - """Generate label for a bracket parameter.""" - match = re.match(r"^(.+)\[(\d+)\]\.(\w+)$", param_name) - if not match: - return None - - scale_name = match.group(1) - bracket_index = int(match.group(2)) - field_name = match.group(3) - - scale = 
scale_lookup.get(scale_name) - if not scale: - return None - - scale_label = scale.metadata.get("label") - scale_type = scale.metadata.get("type", "") - - if not scale_label: - return None - - bracket_num = bracket_index + 1 - - if scale_type in ("marginal_rate", "marginal_amount"): - bracket_desc = f"bracket {bracket_num}" - elif scale_type == "single_amount": - bracket_desc = f"tier {bracket_num}" - else: - bracket_desc = f"bracket {bracket_num}" - - return f"{scale_label} ({bracket_desc} {field_name})" - - -def build_scale_lookup(system): - """ - Build a lookup dict mapping scale names to ParameterScale objects. - - Args: - system: The tax-benefit system - - Returns: - dict: Mapping of scale name -> ParameterScale object - """ - from policyengine_core.parameters import ParameterScale - - return { - p.name: p - for p in system.parameters.get_descendants() - if isinstance(p, ParameterScale) - } diff --git a/build/lib/policyengine/utils/parametric_reforms.py b/build/lib/policyengine/utils/parametric_reforms.py deleted file mode 100644 index 025df22e..00000000 --- a/build/lib/policyengine/utils/parametric_reforms.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations - -from collections.abc import Callable -from typing import TYPE_CHECKING, Optional, Union - -from policyengine_core.periods import period - -from policyengine.core import ParameterValue - -if TYPE_CHECKING: - from policyengine.core.dynamic import Dynamic - from policyengine.core.policy import Policy - - -def reform_dict_from_parameter_values( - parameter_values: Optional[list[ParameterValue]], -) -> Optional[dict]: - """ - Convert a list of ParameterValue objects to a reform dict format. - - This format is accepted by policyengine_us.Microsimulation(reform=...) and - policyengine_uk.Microsimulation(reform=...) at construction time. - - Args: - parameter_values: List of ParameterValue objects to convert. 
- - Returns: - A dict mapping parameter names to period-value dicts, e.g.: - { - "gov.irs.deductions.standard.amount.SINGLE": { - "2024-01-01": 29200 - } - } - """ - if not parameter_values: - return None - - reform_dict = {} - for pv in parameter_values: - param_name = pv.parameter.name - if param_name not in reform_dict: - reform_dict[param_name] = {} - - # Format the period string - period_str = pv.start_date.strftime("%Y-%m-%d") - if pv.end_date: - # Use period range format: "start.end" - period_str = f"{period_str}.{pv.end_date.strftime('%Y-%m-%d')}" - - reform_dict[param_name][period_str] = pv.value - - return reform_dict - - -def simulation_modifier_from_parameter_values( - parameter_values: list[ParameterValue], -) -> Callable: - """ - Create a simulation modifier function that applies the given parameter values to a simulation. - - Args: - parameter_values (list[ParameterValue]): List of ParameterValue objects to apply. - - Returns: - Callable: A function that takes a Simulation object and applies the parameter values. - """ - - def modifier(simulation): - for pv in parameter_values: - p = simulation.tax_benefit_system.parameters.get_child(pv.parameter.name) - start_period = period(pv.start_date.strftime("%Y-%m-%d")) - stop_period = ( - period(pv.end_date.strftime("%Y-%m-%d")) if pv.end_date else None - ) - p.update( - value=pv.value, - start=start_period, - stop=stop_period, - ) - return simulation - - return modifier - - -def build_reform_dict( - policy_or_dynamic: Optional[Union[Policy, Dynamic]], -) -> Optional[dict]: - """Extract a reform dict from a Policy or Dynamic object. - - If the object has parameter_values, converts them to reform dict format. - Returns None if the object is None or has no parameter values. - - Args: - policy_or_dynamic: A Policy or Dynamic object, or None. - - Returns: - A reform dict suitable for Microsimulation(reform=...), or None. 
- """ - if policy_or_dynamic is None: - return None - if policy_or_dynamic.parameter_values: - return reform_dict_from_parameter_values(policy_or_dynamic.parameter_values) - return None - - -def merge_reform_dicts( - base: Optional[dict], override: Optional[dict] -) -> Optional[dict]: - """Merge two reform dicts, with override values taking precedence. - - Either or both dicts can be None. When both have entries for the same - parameter, period-level values from override replace those in base. - - Args: - base: The base reform dict (e.g., from policy). - override: The override reform dict (e.g., from dynamic). - - Returns: - The merged reform dict, or None if both inputs are None. - """ - if base is None: - return override - if override is None: - return base - - merged = {k: dict(v) for k, v in base.items()} - for param_name, period_values in override.items(): - if param_name not in merged: - merged[param_name] = {} - merged[param_name].update(period_values) - return merged diff --git a/build/lib/policyengine/utils/plotting.py b/build/lib/policyengine/utils/plotting.py deleted file mode 100644 index 2ca8e48c..00000000 --- a/build/lib/policyengine/utils/plotting.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Plotting utilities for PolicyEngine visualisations.""" - -from typing import Optional - -import plotly.graph_objects as go - -# PolicyEngine brand colours -COLORS = { - "primary": "#319795", # Teal - "primary_light": "#E6FFFA", - "primary_dark": "#1D4044", - "success": "#22C55E", # Green (positive changes) - "warning": "#FEC601", # Yellow (cautions) - "error": "#EF4444", # Red (negative changes) - "info": "#1890FF", # Blue (neutral info) - "gray_light": "#F2F4F7", - "gray": "#667085", - "gray_dark": "#101828", - "blue_secondary": "#026AA2", -} - -# Typography -FONT_FAMILY = "Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif" -FONT_SIZE_LABEL = 12 -FONT_SIZE_DEFAULT = 14 -FONT_SIZE_TITLE = 16 - - -def format_fig( - fig: go.Figure, - title: 
Optional[str] = None, - xaxis_title: Optional[str] = None, - yaxis_title: Optional[str] = None, - show_legend: bool = True, - height: Optional[int] = None, - width: Optional[int] = None, -) -> go.Figure: - """Apply PolicyEngine visual style to a plotly figure. - - Applies professional, clean styling following PolicyEngine design principles: - - Data-driven clarity prioritising immediate understanding - - Professional brand colours (teal primary, semantic colours) - - Clean typography with Inter font family - - Minimal visual clutter - - Appropriate spacing and margins - - Args: - fig: Plotly figure to format - title: Optional title to set/override - xaxis_title: Optional x-axis title to set/override - yaxis_title: Optional y-axis title to set/override - show_legend: Whether to show the legend (default: True) - height: Optional height in pixels - width: Optional width in pixels - - Returns: - Formatted plotly figure (same object, modified in place) - - Example: - >>> import plotly.graph_objects as go - >>> from policyengine.utils import format_fig - >>> fig = go.Figure(data=go.Scatter(x=[1, 2, 3], y=[4, 5, 6])) - >>> format_fig(fig, title="Example chart", xaxis_title="X", yaxis_title="Y") - """ - # Build layout updates - layout_updates = { - "font": { - "family": FONT_FAMILY, - "size": FONT_SIZE_DEFAULT, - "color": COLORS["gray_dark"], - }, - "plot_bgcolor": "#FAFAFA", - "paper_bgcolor": "white", - "margin": {"l": 100, "r": 60, "t": 100, "b": 80}, - "showlegend": show_legend, - "xaxis": { - "title": { - "font": { - "size": FONT_SIZE_DEFAULT, - "family": FONT_FAMILY, - "color": COLORS["gray_dark"], - }, - "standoff": 20, - }, - "tickfont": { - "size": FONT_SIZE_LABEL, - "family": FONT_FAMILY, - "color": COLORS["gray"], - }, - "showgrid": False, - "showline": True, - "linewidth": 2, - "linecolor": COLORS["gray_light"], - "zeroline": False, - "ticks": "outside", - "tickwidth": 1, - "tickcolor": COLORS["gray_light"], - }, - "yaxis": { - "title": { - "font": { - "size": 
FONT_SIZE_DEFAULT, - "family": FONT_FAMILY, - "color": COLORS["gray_dark"], - }, - "standoff": 20, - }, - "tickfont": { - "size": FONT_SIZE_LABEL, - "family": FONT_FAMILY, - "color": COLORS["gray"], - }, - "showgrid": True, - "gridwidth": 1, - "gridcolor": "#E5E7EB", - "showline": False, - "zeroline": False, - }, - "legend": { - "bgcolor": "white", - "bordercolor": COLORS["gray_light"], - "borderwidth": 1, - "font": {"size": FONT_SIZE_LABEL, "family": FONT_FAMILY}, - "orientation": "v", - "yanchor": "top", - "y": 0.99, - "xanchor": "right", - "x": 0.99, - }, - } - - # Add optional parameters - if title is not None: - layout_updates["title"] = { - "text": title, - "font": { - "size": 18, - "family": FONT_FAMILY, - "color": COLORS["gray_dark"], - "weight": 600, - }, - "x": 0, - "xanchor": "left", - "y": 0.98, - "yanchor": "top", - } - - if xaxis_title is not None: - layout_updates["xaxis"]["title"]["text"] = xaxis_title - - if yaxis_title is not None: - layout_updates["yaxis"]["title"]["text"] = yaxis_title - - if height is not None: - layout_updates["height"] = height - - if width is not None: - layout_updates["width"] = width - - # Apply layout - fig.update_layout(**layout_updates) - - # Update all traces to have cleaner styling - fig.update_traces( - marker=dict(size=8, line=dict(width=0)), - line=dict(width=3), - selector=dict(mode="markers+lines"), - ) - fig.update_traces( - marker=dict(size=8, line=dict(width=0)), - selector=dict(mode="markers"), - ) - fig.update_traces( - line=dict(width=3), - selector=dict(mode="lines"), - ) - - return fig From bcfc97c71c33b3865fc783d137191d45796ee617 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 13:13:58 -0400 Subject: [PATCH 4/4] Fix StrEnum str() regression and .gitignore typo Two issues caught in review: 1. `str(poverty_type)` in `outputs/poverty.py:154,210` produced `"UKPovertyType.ABSOLUTE_BHC"` rather than `"absolute_bhc"` after the StrEnum -> `(str, Enum)` swap. 
The `Poverty.poverty_type` field and the output DataFrame column both carried the wrong string on 3.11+. Replaced `str(poverty_type)` with `poverty_type.value` to match the original StrEnum behavior on all Python versions. 2. `.gitignore:8` read `**/.DS_Storebuild/` because the previous commit appended `build/` without a trailing newline. Split into two entries. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 ++- src/policyengine/outputs/poverty.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index b7c5f008..3c351eab 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ *.ipynb _build/ .env -**/.DS_Storebuild/ +**/.DS_Store +build/ diff --git a/src/policyengine/outputs/poverty.py b/src/policyengine/outputs/poverty.py index 6fc59705..85a761a5 100644 --- a/src/policyengine/outputs/poverty.py +++ b/src/policyengine/outputs/poverty.py @@ -151,7 +151,7 @@ def calculate_uk_poverty_rates( poverty = Poverty( simulation=simulation, poverty_variable=poverty_variable, - poverty_type=str(poverty_type), + poverty_type=poverty_type.value, entity="person", filter_variable=filter_variable, filter_variable_eq=filter_variable_eq, @@ -207,7 +207,7 @@ def calculate_us_poverty_rates( poverty = Poverty( simulation=simulation, poverty_variable=poverty_variable, - poverty_type=str(poverty_type), + poverty_type=poverty_type.value, entity="person", filter_variable=filter_variable, filter_variable_eq=filter_variable_eq,