diff --git a/.github/workflows/pr_docs_changes.yaml b/.github/workflows/pr_docs_changes.yaml index 2ef9a20d..c51dc153 100644 --- a/.github/workflows/pr_docs_changes.yaml +++ b/.github/workflows/pr_docs_changes.yaml @@ -18,10 +18,7 @@ jobs: steps: - name: Checkout repo uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - with: - node-version: 18.x - - name: Install MyST - run: npm install -g mystmd + - name: Set up Quarto + uses: quarto-dev/quarto-actions/setup@v2 - name: Test documentation builds - run: cd docs && myst build --html + run: quarto render docs diff --git a/Makefile b/Makefile index f62643e1..03344916 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,18 @@ -.PHONY: docs docs-serve - -MYSTMD_VERSION ?= 1.8.3 -MYST_CMD = npx --yes mystmd@$(MYSTMD_VERSION) +.PHONY: docs docs-serve docs-generate-reference all: build-package docs: - cd docs && $(MYST_CMD) build --html + quarto render docs docs-serve: - cd docs && $(MYST_CMD) start + quarto preview docs + +# Regenerate the auto-generated variable / program reference under docs/reference/. +# Run once per country model release; commits the refreshed pages alongside code. +docs-generate-reference: + python docs/_generator/build_reference.py --country us --out docs/reference/us + python docs/_generator/build_reference.py --country uk --out docs/reference/uk install: uv pip install -e .[dev] diff --git a/README.md b/README.md index 7fc607d5..e45dec98 100644 --- a/README.md +++ b/README.md @@ -4,26 +4,47 @@ A Python package for tax-benefit microsimulation analysis. Run policy simulation ## Quick start +### Household calculator + ```python -from policyengine.core import Simulation -from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset, uk_latest -from policyengine.outputs.aggregate import Aggregate, AggregateType +import policyengine as pe -# Load representative microdata -dataset = PolicyEngineUKDataset( - name="FRS 2023-24", - filepath="./data/frs_2023_24_year_2026.h5", +# UK: single adult earning £50,000 +uk = pe.uk.calculate_household( + people=[{"age": 35, "employment_income": 50_000}], year=2026, ) +print(uk.person[0].income_tax) # income tax +print(uk.household.hbai_household_net_income) # net income + +# US: single filer in California, with a reform +us = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60_000}], + tax_unit={"filing_status": "SINGLE"}, + household={"state_code": "CA"}, + year=2026, + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1000}, +) +print(us.tax_unit.income_tax, us.household.household_net_income) +``` -# Run simulation -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, +### Population analysis + +```python +import policyengine as pe +from policyengine.core import Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType + +datasets = pe.uk.ensure_datasets( + datasets=["hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5"], + years=[2026], + data_folder="./data", ) +dataset = datasets["enhanced_frs_2023_24_2026"] + +simulation = Simulation(dataset=dataset, tax_benefit_model_version=pe.uk.model) simulation.run() -# Calculate total universal credit spending agg = Aggregate( simulation=simulation, variable="universal_credit", @@ -34,6 +55,9 @@ agg.run() print(f"Total UC spending: £{agg.result / 1e9:.1f}bn") ``` +For baseline-vs-reform comparisons, see `pe.uk.economic_impact_analysis` +and its US counterpart. + ## Documentation **Core concepts:** @@ -179,12 +203,12 @@ dataset.load() Simulations apply tax-benefit models to datasets: ```python +import policyengine as pe from policyengine.core import Simulation -from policyengine.tax_benefit_models.uk import uk_latest simulation = Simulation( dataset=dataset, - tax_benefit_model_version=uk_latest, + tax_benefit_model_version=pe.uk.model, ) simulation.run() @@ -223,7 +247,7 @@ import datetime parameter = Parameter( name="gov.hmrc.income_tax.allowances.personal_allowance.amount", - tax_benefit_model_version=uk_latest, + tax_benefit_model_version=pe.uk.model, data_type=float, ) @@ -242,7 +266,7 @@ policy = Policy( # Run reform simulation reform_sim = Simulation( dataset=dataset, - tax_benefit_model_version=uk_latest, + tax_benefit_model_version=pe.uk.model, policy=policy, ) reform_sim.run() diff --git a/changelog.d/v4-base-extraction.changed.md b/changelog.d/v4-base-extraction.changed.md new file mode 100644 index 00000000..572088a3 --- /dev/null +++ b/changelog.d/v4-base-extraction.changed.md @@ -0,0 +1 @@ +Extracted shared `MicrosimulationModelVersion` base class in `policyengine.tax_benefit_models.common`. Country subclasses now declare class-level metadata (`country_code`, `package_name`, `group_entities`) and implement a handful of thin hooks; `run()` stays per-country. Byte-level snapshot tests verify zero output drift. diff --git a/changelog.d/v4-dict-reforms.added.md b/changelog.d/v4-dict-reforms.added.md new file mode 100644 index 00000000..02405cdc --- /dev/null +++ b/changelog.d/v4-dict-reforms.added.md @@ -0,0 +1 @@ +``Simulation(policy={...})`` and ``Simulation(dynamic={...})`` now accept the same flat ``{"param.path": value}`` / ``{"param.path": {date: value}}`` dict that ``pe.{uk,us}.calculate_household(reform=...)`` accepts. Dicts are compiled to full ``Policy`` / ``Dynamic`` objects on construction using the ``tax_benefit_model_version`` for parameter-path validation and ``dataset.year`` for scalar effective-date defaulting. Removes the last place where population microsim required building ``Parameter`` / ``ParameterValue`` by hand. diff --git a/changelog.d/v4-docs-refresh.changed.md b/changelog.d/v4-docs-refresh.changed.md new file mode 100644 index 00000000..11e7d0d2 --- /dev/null +++ b/changelog.d/v4-docs-refresh.changed.md @@ -0,0 +1 @@ +Documentation refreshed for the v4 agent-first surface. README, `core-concepts`, `economic-impact-analysis`, `country-models-{uk,us}`, `regions-and-scoping`, `examples`, and `dev` now lead with `pe.uk.*` / `pe.us.*` entry points and flat-kwarg `calculate_household` usage. Removed leftover docs for the dropped `filter_field`/`filter_value` simulation fields. `examples/household_impact_example.py` rewritten against the v4 API. diff --git a/changelog.d/v4-facade.added.md b/changelog.d/v4-facade.added.md new file mode 100644 index 00000000..f05dea82 --- /dev/null +++ b/changelog.d/v4-facade.added.md @@ -0,0 +1,47 @@ +**BREAKING (v4):** Collapse the household-calculator surface into a +single agent-friendly entry point, ``pe.us.calculate_household`` / +``pe.uk.calculate_household``. + +New public API: + +- ``policyengine/__init__.py`` populated with canonical accessors: + ``pe.us``, ``pe.uk``, ``pe.Simulation`` (replacing the empty top-level + module). ``import policyengine as pe`` now gives you everything a + new coding session needs to reach in one line. +- ``pe.us.calculate_household(**kwargs)`` and ``pe.uk.calculate_household`` + take flat keyword arguments (``people``, per-entity overrides, + ``year``, ``reform``, ``extra_variables``) instead of a pydantic + input wrapper. +- ``reform=`` accepts a plain dict: ``{parameter_path: value}`` or + ``{parameter_path: {effective_date: value}}``. Compiles internally. +- Returns :class:`HouseholdResult` (new) with dot-access: + ``result.tax_unit.income_tax``, ``result.household.household_net_income``, + ``result.person[0].age``. Singleton entities are + :class:`EntityResult`; ``person`` is a list of them. ``to_dict()`` + and ``write(path)`` serialize to JSON. +- ``extra_variables=[...]`` is now a flat list; the library dispatches + each name to its entity by looking it up on the model. +- Unknown variable names (in ``people``, entity overrides, or + ``extra_variables``) raise ``ValueError`` with a ``difflib`` close-match + suggestion and a paste-able fix hint. +- Unknown dot-access on a result raises ``AttributeError`` with the + list of available variables plus the ``extra_variables=[...]`` call + that would surface the requested one. + +Removed (v4 breaking): + +- ``USHouseholdInput`` / ``UKHouseholdInput`` / ``USHouseholdOutput`` / + ``UKHouseholdOutput`` pydantic wrappers. +- ``calculate_household_impact`` — the name was misleading (it + returned levels, not an impact vs. baseline). Reserved for a future + delta function. +- The bare ``us_model`` / ``uk_model`` label-only singletons; each + country module now exposes ``.model`` pointing at the real + ``TaxBenefitModelVersion`` (kept ``us_latest`` / ``uk_latest`` + aliases for compatibility with any in-flight downstream code). + +New internal module: + +- ``policyengine.tax_benefit_models.common`` — ``compile_reform``, + ``dispatch_extra_variables``, ``EntityResult``, ``HouseholdResult`` + shared by both country implementations. diff --git a/changelog.d/v4-provenance-package.changed.md b/changelog.d/v4-provenance-package.changed.md new file mode 100644 index 00000000..8c016e02 --- /dev/null +++ b/changelog.d/v4-provenance-package.changed.md @@ -0,0 +1,24 @@ +**BREAKING (v4):** Separate the provenance layer from the core +value-object layer. + +- ``policyengine/core/release_manifest.py`` → ``policyengine/provenance/manifest.py`` +- ``policyengine/core/trace_tro.py`` → ``policyengine/provenance/trace.py`` +- New ``policyengine.provenance`` package re-exports the public + surface (``get_release_manifest``, ``get_data_release_manifest``, + ``build_trace_tro_from_release_bundle``, ``build_simulation_trace_tro``, + ``serialize_trace_tro``, ``canonical_json_bytes``, + ``compute_trace_composition_fingerprint``, etc.). +- ``policyengine.core`` no longer re-exports provenance types. + ``policyengine.core`` shrinks to value objects only (Dataset, + Variable, Parameter, Policy, Dynamic, Simulation, Region, + TaxBenefitModel, TaxBenefitModelVersion, scoping strategies). +- ``import policyengine.core.scoping_strategy`` no longer imports + ``h5py`` at module load; the weight-replacement code path + lazy-imports it. ``import policyengine.outputs.constituency_impact`` + and ``import policyengine.outputs.local_authority_impact`` do the + same. +- Migration for downstream: replace + ``from policyengine.core import DataReleaseManifest`` (et al.) + with ``from policyengine.provenance import DataReleaseManifest``. + The country-module imports in internal code (``tax_benefit_models/{us,uk}/model.py`` + and ``datasets.py``) are already updated. diff --git a/changelog.d/variable-graph.added.md b/changelog.d/variable-graph.added.md new file mode 100644 index 00000000..11ce0773 --- /dev/null +++ b/changelog.d/variable-graph.added.md @@ -0,0 +1 @@ +Added ``policyengine.graph`` — a static-analysis-based variable dependency graph for PolicyEngine source trees. ``extract_from_path(path)`` walks a directory of Variable subclasses, parses formula-method bodies for ``entity("", period)`` and ``add(entity, period, [list])`` references, and returns a ``VariableGraph``. Queries include ``deps(var)`` (direct dependencies), ``impact(var)`` (transitive downstream), and ``path(src, dst)`` (shortest dependency chain). No runtime dependency on country models — indexes ``policyengine-us`` (4,577 variables) in under a second. diff --git a/docs/.gitignore b/docs/.gitignore index eac09687..d05d3238 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,2 +1,5 @@ -# MyST build outputs -_build +# Quarto build outputs +_site +_freeze +/.quarto/ +**/*.quarto_ipynb diff --git a/docs/_generator/README.md b/docs/_generator/README.md new file mode 100644 index 00000000..ef5c7268 --- /dev/null +++ b/docs/_generator/README.md @@ -0,0 +1,52 @@ +# Reference generator prototype + +Auto-generates one Quarto page per variable in a country model, plus a program-coverage page, purely from metadata on the `Variable` classes and `programs.yaml`. + +## Run + +```bash +# Full US reference (takes a couple of minutes — 4,686 variables) +python docs/_generator/build_reference.py --country us --out docs/_generated/reference/us + +# Preview a filtered subset +python docs/_generator/build_reference.py --country us --filter chip --out /tmp/ref-preview +``` + +Then render: + +```bash +cd /tmp/ref-preview && quarto render +``` + +## What's generated from code alone + +Per variable: + +- Title and identifier +- Metadata table: entity, value type, unit, period, `defined_for` gate +- Documentation (docstring) +- Components (`adds` / `subtracts` lists) +- Statutory references (from `reference = ...`) +- Source file path and line number + +Per program: a row in the generated program-coverage page pulled from `programs.yaml` (id, name, category, agency, status, coverage). + +Per directory (`gov/hhs/chip/`, `gov/usda/snap/`, etc.): a listing page using Quarto's built-in directory listing so the nav auto-organizes. + +## What still requires hand-authored prose + +- Methodology narrative (why the model is structured this way) +- Tutorials (how to use `policyengine.py`) +- Paper content (peer-reviewable argument) +- Per-country deep dives that read as essays rather than reference lookups + +## Design + +The generator reads directly from the imported country model — no web API calls, no intermediate JSON. This keeps the build offline-reproducible and version-pinned to whatever country model the `policyengine.py` package has installed. Re-running the generator on release produces a snapshot of the reference docs tied to the exact published model versions. + +Extensions worth considering: + +1. Walk `parameters/` YAML tree and emit a page per parameter with its time series, breakdowns, and references. +2. For each variable with a formula, surface the dependency graph (other variables / parameters it reads). `policyengine_core`'s `Variable.exhaustive_parameter_dependencies` gets partway there. +3. For each calibration target (in `policyengine-us-data/storage/calibration_targets/*.csv`), emit a page describing source, aggregation level, freshness. +4. Cross-link variables to the programs they contribute to via `programs.yaml`'s `variable:` field. diff --git a/docs/_generator/build_reference.py b/docs/_generator/build_reference.py new file mode 100644 index 00000000..4b360622 --- /dev/null +++ b/docs/_generator/build_reference.py @@ -0,0 +1,387 @@ +"""Generate reference documentation pages from PolicyEngine country models. + +Introspects a country model's `TaxBenefitSystem` for every variable, reads +attributes directly from each `Variable` class (`label`, `documentation`, +`entity`, `unit`, `reference`, `defined_for`, `definition_period`, +`adds`/`subtracts`, source file path), and writes one ``.qmd`` page per +variable grouped by its parameter-tree path (``gov/hhs/chip/chip_premium``). + +Also loads the country model's ``programs.yaml`` and writes a program-level +landing page for each entry, cross-linking the variables that belong to it. + +Usage +----- + +Run for a single country model, writing into an output directory: + +.. code-block:: bash + + python docs/_generator/build_reference.py \\ + --country us \\ + --out docs/_generated/reference/us + +Run for a subset of variables to preview output: + +.. code-block:: bash + + python docs/_generator/build_reference.py \\ + --country us --filter chip --out /tmp/ref-preview + +Design notes +------------ + +This is a prototype meant to demonstrate how much reference material can be +regenerated from code + parameter YAML + ``programs.yaml`` alone, with no +hand-authored prose. Intentional non-goals: + +* Do not execute formulas; read metadata only. +* Do not render parameters (a follow-up can walk the parameter tree similarly). +* Do not write an index page tree; Quarto's directory listings handle that. + +The generator emits standard Quarto Markdown (``.qmd``). Quarto reads regular +Markdown too, so the outputs drop into either a Quarto or MyST site. +""" + +from __future__ import annotations + +import argparse +import importlib +import logging +import re +import textwrap +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import yaml + +logger = logging.getLogger(__name__) + + +COUNTRY_MODULES = { + "us": "policyengine_us", + "uk": "policyengine_uk", + "canada": "policyengine_canada", + "il": "policyengine_il", + "ng": "policyengine_ng", +} + + +@dataclass(frozen=True) +class VariableRecord: + name: str + label: str | None + documentation: str | None + entity: str | None + unit: str | None + value_type: str | None + definition_period: str | None + references: tuple[str, ...] + defined_for: str | None + source_file: Path | None + source_line: int | None + adds: tuple[str, ...] + subtracts: tuple[str, ...] + tree_path: tuple[str, ...] + + +def _tree_path_from_source(source_file: Path | None, package_root: Path) -> tuple[str, ...]: + if source_file is None: + return ("_ungrouped",) + try: + rel = source_file.relative_to(package_root / "variables") + except ValueError: + return ("_ungrouped",) + parts = rel.with_suffix("").parts + return parts[:-1] if parts else ("_ungrouped",) + + +def _normalize_references(raw) -> tuple[str, ...]: + if raw is None: + return () + if isinstance(raw, str): + return (raw,) + if isinstance(raw, (list, tuple)): + return tuple(str(r) for r in raw if r) + return (str(raw),) + + +def _variable_records(country: str) -> Iterable[VariableRecord]: + module_name = COUNTRY_MODULES[country] + country_module = importlib.import_module(module_name) + + system_module = importlib.import_module(f"{module_name}.system") + tbs = system_module.CountryTaxBenefitSystem() + + package_root = Path(country_module.__file__).parent + + import inspect + + for name in sorted(tbs.variables): + variable = tbs.variables[name] + try: + source_file = Path(inspect.getsourcefile(type(variable))) + source_line = inspect.getsourcelines(type(variable))[1] + except (TypeError, OSError): + source_file = None + source_line = None + + entity_key = getattr(variable.entity, "key", None) if variable.entity else None + value_type = getattr(variable, "value_type", None) + value_type_name = ( + value_type.__name__ + if isinstance(value_type, type) + else str(value_type) if value_type is not None else None + ) + defined_for = getattr(variable, "defined_for", None) + defined_for_name = ( + defined_for.name if hasattr(defined_for, "name") else defined_for + ) + + yield VariableRecord( + name=name, + label=variable.label, + documentation=variable.documentation, + entity=entity_key, + unit=getattr(variable, "unit", None), + value_type=value_type_name, + definition_period=getattr(variable, "definition_period", None), + references=_normalize_references(getattr(variable, "reference", None)), + defined_for=defined_for_name, + source_file=source_file, + source_line=source_line, + adds=tuple(getattr(variable, "adds", ()) or ()), + subtracts=tuple(getattr(variable, "subtracts", ()) or ()), + tree_path=_tree_path_from_source(source_file, package_root), + ) + + +def _escape_yaml_scalar(value: str) -> str: + return value.replace('"', '\\"') + + +def _render_variable_page(record: VariableRecord, country: str) -> str: + title = record.label or record.name + lines: list[str] = [ + "---", + f'title: "{_escape_yaml_scalar(title)}"', + f'subtitle: "`{record.name}`"', + ] + if record.documentation: + summary = record.documentation.strip().splitlines()[0][:220] + lines.append(f'description: "{_escape_yaml_scalar(summary)}"') + lines.extend( + [ + "format:", + " html:", + " code-copy: true", + "---", + "", + ] + ) + + metadata = [ + ("Name", f"`{record.name}`"), + ("Entity", f"`{record.entity}`" if record.entity else "—"), + ("Value type", f"`{record.value_type}`" if record.value_type else "—"), + ("Unit", f"`{record.unit}`" if record.unit else "—"), + ("Period", f"`{record.definition_period}`" if record.definition_period else "—"), + ( + "Defined for", + f"`{record.defined_for}`" if record.defined_for else "—", + ), + ] + lines.append("| Field | Value |") + lines.append("|---|---|") + for key, value in metadata: + lines.append(f"| {key} | {value} |") + lines.append("") + + if record.documentation: + lines.append("## Documentation") + lines.append("") + lines.append(record.documentation.strip()) + lines.append("") + + if record.adds: + lines.append("## Components") + lines.append("") + lines.append("This variable sums the following variables:") + lines.append("") + for component in record.adds: + lines.append(f"- `{component}`") + lines.append("") + + if record.subtracts: + lines.append("## Subtractions") + lines.append("") + lines.append("This variable subtracts the following variables:") + lines.append("") + for component in record.subtracts: + lines.append(f"- `{component}`") + lines.append("") + + if record.references: + lines.append("## References") + lines.append("") + for ref in record.references: + lines.append(f"- <{ref}>") + lines.append("") + + if record.source_file: + try: + repo_rel = record.source_file.relative_to( + record.source_file.parents[5] + ) + except (ValueError, IndexError): + repo_rel = record.source_file.name + lines.append("## Source") + lines.append("") + if record.source_line: + lines.append(f"`{repo_rel}`, line {record.source_line}") + else: + lines.append(f"`{repo_rel}`") + lines.append("") + + return "\n".join(lines) + + +def _slug(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_-]+", "-", value).strip("-") + + +def _write_variables( + records: list[VariableRecord], + out_root: Path, + country: str, +) -> int: + written = 0 + for record in records: + tree_dir = out_root.joinpath(*record.tree_path) + tree_dir.mkdir(parents=True, exist_ok=True) + page_path = tree_dir / f"{_slug(record.name)}.qmd" + page_path.write_text(_render_variable_page(record, country)) + written += 1 + return written + + +def _write_tree_indices(out_root: Path) -> int: + written = 0 + for directory in [out_root, *(p for p in out_root.rglob("*") if p.is_dir())]: + index_path = directory / "index.qmd" + if index_path.exists(): + continue + title = directory.name if directory != out_root else "Reference" + index_path.write_text( + textwrap.dedent( + f"""\ + --- + title: "{title}" + listing: + contents: "*.qmd" + type: table + sort: "title" + fields: [title, subtitle, description] + --- + """ + ) + ) + written += 1 + return written + + +def _write_programs_index(country: str, out_root: Path) -> int: + module_name = COUNTRY_MODULES[country] + country_module = importlib.import_module(module_name) + package_root = Path(country_module.__file__).parent + programs_path = package_root / "programs.yaml" + if not programs_path.exists(): + return 0 + with programs_path.open() as f: + registry = yaml.safe_load(f) + programs = registry.get("programs", []) + lines: list[str] = [ + "---", + 'title: "Program coverage"', + 'description: "Programs modeled in the country model, generated from programs.yaml."', + "---", + "", + "| ID | Name | Category | Agency | Status | Coverage |", + "|---|---|---|---|---|---|", + ] + for program in programs: + lines.append( + "| " + + " | ".join( + str(program.get(field, "")).replace("\n", " ") + for field in ("id", "name", "category", "agency", "status", "coverage") + ) + + " |" + ) + target = out_root / "programs.qmd" + target.write_text("\n".join(lines) + "\n") + return 1 + + +def build_reference( + country: str, + out_root: Path, + filter_substring: str | None = None, +) -> dict[str, int]: + out_root.mkdir(parents=True, exist_ok=True) + records = list(_variable_records(country)) + if filter_substring: + needle = filter_substring.lower() + records = [ + r + for r in records + if needle in r.name.lower() + or needle in " ".join(str(p).lower() for p in r.tree_path) + ] + variables_written = _write_variables(records, out_root, country) + programs_written = _write_programs_index(country, out_root) + indices_written = _write_tree_indices(out_root) + return { + "variables": variables_written, + "programs": programs_written, + "indices": indices_written, + } + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--country", + choices=sorted(COUNTRY_MODULES), + default="us", + help="Country model to introspect.", + ) + parser.add_argument( + "--out", + type=Path, + required=True, + help="Output directory for generated .qmd pages.", + ) + parser.add_argument( + "--filter", + default=None, + help="Substring filter on variable name or tree path (case-insensitive).", + ) + return parser.parse_args() + + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") + args = _parse_args() + stats = build_reference(args.country, args.out, args.filter) + logger.info( + "Wrote %d variable pages, %d programs page, %d directory indices to %s", + stats["variables"], + stats["programs"], + stats["indices"], + args.out, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/_quarto.yml b/docs/_quarto.yml new file mode 100644 index 00000000..9393b70d --- /dev/null +++ b/docs/_quarto.yml @@ -0,0 +1,63 @@ +project: + type: website + output-dir: _site + preview: + port: 8080 + +website: + title: "PolicyEngine" + description: "Tax-benefit microsimulation for Python." + repo-url: https://github.com/PolicyEngine/policyengine.py + repo-actions: [edit, issue] + page-navigation: true + navbar: + left: + - href: index.md + text: Overview + - getting-started.md + - text: "Guide" + menu: + - households.md + - reforms.md + - microsim.md + - impact-analysis.md + - outputs.md + - regions.md + - examples.md + - dev.md + sidebar: + style: "floating" + collapse-level: 2 + contents: + - index.md + - getting-started.md + - section: "Guide" + contents: + - households.md + - reforms.md + - microsim.md + - impact-analysis.md + - outputs.md + - regions.md + - visualisation.md + - section: "Platform" + contents: + - countries.md + - release-bundles.md + - section: "Usage" + contents: + - examples.md + - section: "Development" + contents: + - dev.md + +format: + html: + theme: [cosmo] + toc: true + toc-depth: 3 + code-copy: true + code-overflow: wrap + highlight-style: github + anchor-sections: true + link-external-newwindow: true diff --git a/docs/advanced-outputs.md b/docs/advanced-outputs.md deleted file mode 100644 index 5fdbaead..00000000 --- a/docs/advanced-outputs.md +++ /dev/null @@ -1,276 +0,0 @@ -# Advanced outputs - -Beyond `Aggregate` and `ChangeAggregate` (covered in [Core concepts](core-concepts.md)), the package provides specialised output types for distributional analysis, poverty measurement, and inequality metrics. - -All output types follow the same pattern: create an instance, call `.run()`, read the result fields. Convenience functions are provided for common use cases. - -## OutputCollection - -Many convenience functions return an `OutputCollection[T]`, a container holding both the individual output objects and a pandas DataFrame: - -```python -from policyengine.core import OutputCollection - -# Returned by calculate_decile_impacts(), calculate_us_poverty_rates(), etc. -collection = calculate_us_poverty_rates(simulation) - -# Access individual objects -for poverty in collection.outputs: - print(f"{poverty.poverty_type}: {poverty.rate:.4f}") - -# Access as DataFrame -print(collection.dataframe) -``` - -## DecileImpact - -Calculates the impact of a policy reform on a single income decile: baseline and reform mean income, absolute and relative change, and counts of people better off, worse off, and unchanged. - -### Using the convenience function - -```python -from policyengine.outputs.decile_impact import calculate_decile_impacts - -decile_impacts = calculate_decile_impacts( - dataset=dataset, - tax_benefit_model_version=us_latest, - baseline_policy=None, # Current law - reform_policy=reform, - income_variable="household_net_income", # Default for US -) - -for d in decile_impacts.outputs: - print(f"Decile {d.decile}: " - f"baseline={d.baseline_mean:,.0f}, " - f"reform={d.reform_mean:,.0f}, " - f"change={d.absolute_change:+,.0f} " - f"({d.relative_change:+.2f}%)") -``` - -### Using directly - -```python -from policyengine.outputs.decile_impact import DecileImpact - -impact = DecileImpact( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - income_variable="household_net_income", - decile=5, # 5th decile -) -impact.run() - -print(f"Count better off: {impact.count_better_off:,.0f}") -print(f"Count worse off: {impact.count_worse_off:,.0f}") -``` - -### Parameters - -| Parameter | Default | Description | -|---|---|---| -| `income_variable` | `equiv_hbai_household_net_income` | Income variable to group by and measure changes | -| `decile_variable` | `None` | Use a pre-computed grouping variable instead of `qcut` | -| `entity` | Auto-detected | Entity level for the income variable | -| `quantiles` | `10` | Number of quantile groups (10 = deciles, 5 = quintiles) | - -For US simulations, use `income_variable="household_net_income"`. The UK default (`equiv_hbai_household_net_income`) is the equivalised HBAI measure. - -## IntraDecileImpact - -Classifies people within each decile into five income change categories: - -| Category | Threshold | -|---|---| -| Lose more than 5% | change <= -5% | -| Lose less than 5% | -5% < change <= -0.1% | -| No change | -0.1% < change <= 0.1% | -| Gain less than 5% | 0.1% < change <= 5% | -| Gain more than 5% | change > 5% | - -Proportions are people-weighted (using `household_count_people * household_weight`). - -### Using the convenience function - -```python -from policyengine.outputs.intra_decile_impact import compute_intra_decile_impacts - -intra = compute_intra_decile_impacts( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - income_variable="household_net_income", -) - -for row in intra.outputs: - if row.decile == 0: - label = "Overall" - else: - label = f"Decile {row.decile}" - print(f"{label}: " - f"lose>5%={row.lose_more_than_5pct:.2%}, " - f"lose<5%={row.lose_less_than_5pct:.2%}, " - f"no change={row.no_change:.2%}, " - f"gain<5%={row.gain_less_than_5pct:.2%}, " - f"gain>5%={row.gain_more_than_5pct:.2%}") -``` - -The function returns deciles 1-10 plus an overall average at `decile=0`. - -## Poverty - -Calculates poverty headcount and rates for a single simulation, with optional demographic filtering. - -### Poverty types - -**UK** (4 measures): -- Absolute before housing costs (BHC) -- Absolute after housing costs (AHC) -- Relative before housing costs (BHC) -- Relative after housing costs (AHC) - -**US** (2 measures): -- SPM poverty -- Deep SPM poverty (below 50% of SPM threshold) - -### Calculating all poverty rates - -```python -from policyengine.outputs.poverty import ( - calculate_uk_poverty_rates, - calculate_us_poverty_rates, -) - -# US -us_poverty = calculate_us_poverty_rates(simulation) -for p in us_poverty.outputs: - print(f"{p.poverty_type}: headcount={p.headcount:,.0f}, rate={p.rate:.4f}") - -# UK -uk_poverty = calculate_uk_poverty_rates(simulation) -for p in uk_poverty.outputs: - print(f"{p.poverty_type}: headcount={p.headcount:,.0f}, rate={p.rate:.4f}") -``` - -### Poverty by demographic group - -```python -from policyengine.outputs.poverty import ( - calculate_us_poverty_by_age, - calculate_us_poverty_by_gender, - calculate_us_poverty_by_race, - calculate_uk_poverty_by_age, - calculate_uk_poverty_by_gender, -) - -# By age group (child <18, adult 18-64, senior 65+) -by_age = calculate_us_poverty_by_age(simulation) -for p in by_age.outputs: - print(f"{p.filter_group} {p.poverty_type}: {p.rate:.4f}") - -# By gender -by_gender = calculate_us_poverty_by_gender(simulation) - -# By race (US only: WHITE, BLACK, HISPANIC, OTHER) -by_race = calculate_us_poverty_by_race(simulation) -``` - -### Custom filters - -```python -from policyengine.outputs.poverty import Poverty - -# Child poverty only -child_poverty = Poverty( - simulation=simulation, - poverty_variable="spm_unit_is_in_spm_poverty", - entity="person", - filter_variable="age", - filter_variable_leq=17, -) -child_poverty.run() -print(f"Child SPM poverty rate: {child_poverty.rate:.4f}") -``` - -### Result fields - -| Field | Description | -|---|---| -| `headcount` | Weighted count of people in poverty | -| `total_population` | Weighted total population (after filters) | -| `rate` | `headcount / total_population` | -| `filter_group` | Group label set by demographic convenience functions | - -## Inequality - -Calculates weighted inequality metrics for a single simulation: Gini coefficient and income share measures. - -### Using convenience functions - -```python -from policyengine.outputs.inequality import ( - calculate_uk_inequality, - calculate_us_inequality, -) - -# US (uses household_net_income by default) -ineq = calculate_us_inequality(simulation) -print(f"Gini: {ineq.gini:.4f}") -print(f"Top 10% share: {ineq.top_10_share:.4f}") -print(f"Top 1% share: {ineq.top_1_share:.4f}") -print(f"Bottom 50% share: {ineq.bottom_50_share:.4f}") - -# UK (uses equiv_hbai_household_net_income by default) -ineq = calculate_uk_inequality(simulation) -``` - -### With demographic filters - -```python -# Inequality among working-age adults only -ineq = calculate_us_inequality( - simulation, - filter_variable="age", - filter_variable_geq=18, - filter_variable_leq=64, -) -``` - -### Using directly - -```python -from policyengine.outputs.inequality import Inequality - -ineq = Inequality( - simulation=simulation, - income_variable="household_net_income", - entity="household", -) -ineq.run() -``` - -### Result fields - -| Field | Description | -|---|---| -| `gini` | Weighted Gini coefficient (0 = perfect equality, 1 = perfect inequality) | -| `top_10_share` | Share of total income held by top 10% | -| `top_1_share` | Share of total income held by top 1% | -| `bottom_50_share` | Share of total income held by bottom 50% | - -## Comparing baseline and reform - -Poverty and inequality are single-simulation outputs. To compare baseline and reform, compute both and take the difference: - -```python -baseline_poverty = calculate_us_poverty_rates(baseline_sim) -reform_poverty = calculate_us_poverty_rates(reform_sim) - -for bp, rp in zip(baseline_poverty.outputs, reform_poverty.outputs): - change = rp.rate - bp.rate - print(f"{bp.poverty_type}: {bp.rate:.4f} -> {rp.rate:.4f} ({change:+.4f})") - -baseline_ineq = calculate_us_inequality(baseline_sim) -reform_ineq = calculate_us_inequality(reform_sim) -print(f"Gini change: {reform_ineq.gini - baseline_ineq.gini:+.4f}") -``` - -The `economic_impact_analysis()` function does this automatically and returns both baseline and reform poverty/inequality in the `PolicyReformAnalysis` result. See [Economic impact analysis](economic-impact-analysis.md). diff --git a/docs/core-concepts.md b/docs/core-concepts.md deleted file mode 100644 index 425c5f62..00000000 --- a/docs/core-concepts.md +++ /dev/null @@ -1,662 +0,0 @@ -# Core concepts - -PolicyEngine.py is a Python package for tax-benefit microsimulation analysis. It provides a unified interface for running policy simulations, analysing distributional impacts, and visualising results across different countries. - -## Architecture overview - -The package is organised around several core concepts: - -- **Tax-benefit models**: Country-specific implementations (UK, US) that define tax and benefit rules -- **Datasets**: Microdata representing populations at entity level (person, household, etc.) -- **Simulations**: Execution environments that apply tax-benefit models to datasets -- **Outputs**: Analysis tools for extracting insights from simulation results -- **Policies**: Parametric reforms that modify tax-benefit system parameters - -## Tax-benefit models - -Tax-benefit models define the rules and calculations for a country's tax and benefit system. Each model version contains: - -- **Variables**: Calculated values (e.g., income tax, universal credit) -- **Parameters**: System settings (e.g., personal allowance, benefit rates) -- **Parameter values**: Time-bound values for parameters - -### Using a tax-benefit model - -```python -from policyengine.tax_benefit_models.uk import uk_latest -from policyengine.tax_benefit_models.us import us_latest - -# UK model includes variables like: -# - income_tax, national_insurance, universal_credit -# - Parameters like personal allowance, NI thresholds - -# US model includes variables like: -# - income_tax, payroll_tax, eitc, ctc, snap -# - Parameters like standard deduction, EITC rates -``` - -## Datasets - -Datasets contain microdata representing a population. Each dataset has: - -- **Entity-level data**: Separate dataframes for person, household, and other entities -- **Weights**: Survey weights for population representation -- **Join keys**: Relationships between entities (e.g., which household each person belongs to) - -### Dataset structure - -```python -from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset - -dataset = PolicyEngineUKDataset( - name="FRS 2023-24", - description="Family Resources Survey microdata", - filepath="./data/frs_2023_24_year_2026.h5", - year=2026, -) - -# Access entity-level data -person_data = dataset.data.person # MicroDataFrame -household_data = dataset.data.household -benunit_data = dataset.data.benunit # Benefit unit (UK only) -``` - -### Creating custom datasets - -You can create custom datasets for scenario analysis: - -```python -import pandas as pd -from microdf import MicroDataFrame -from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset, UKYearData - -# Create person data -person_df = MicroDataFrame( - pd.DataFrame({ - "person_id": [0, 1, 2], - "person_household_id": [0, 0, 1], - "person_benunit_id": [0, 0, 1], - "age": [35, 8, 40], - "employment_income": [30000, 0, 50000], - "person_weight": [1.0, 1.0, 1.0], - }), - weights="person_weight" -) - -# Create household data -household_df = MicroDataFrame( - pd.DataFrame({ - "household_id": [0, 1], - "region": ["LONDON", "SOUTH_EAST"], - "rent": [15000, 12000], - "household_weight": [1.0, 1.0], - }), - weights="household_weight" -) - -# Create benunit data -benunit_df = MicroDataFrame( - pd.DataFrame({ - "benunit_id": [0, 1], - "would_claim_uc": [True, True], - "benunit_weight": [1.0, 1.0], - }), - weights="benunit_weight" -) - -dataset = PolicyEngineUKDataset( - name="Custom scenario", - description="Single parent vs single adult", - filepath="./custom.h5", - year=2026, - data=UKYearData( - person=person_df, - household=household_df, - benunit=benunit_df, - ) -) -``` - -## Data loading - -Before running simulations, you need representative microdata. The package provides three functions for managing datasets: - -- **`ensure_datasets()`**: Load from disk if available, otherwise download and compute (recommended) -- **`create_datasets()`**: Always download from HuggingFace and compute from scratch -- **`load_datasets()`**: Load previously saved HDF5 files from disk - -```python -from policyengine.tax_benefit_models.us import ensure_datasets - -# First run: downloads from HuggingFace, computes variables, saves to ./data/ -# Subsequent runs: loads from disk instantly -datasets = ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], - years=[2026], - data_folder="./data", -) -dataset = datasets["enhanced_cps_2024_2026"] -``` - -```python -from policyengine.tax_benefit_models.uk import ensure_datasets - -datasets = ensure_datasets( - datasets=["hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5"], - years=[2026], - data_folder="./data", -) -dataset = datasets["enhanced_frs_2023_24_2026"] -``` - -All datasets are stored as HDF5 files on disk. No database server is required. - -## Simulations - -Simulations apply tax-benefit models to datasets, calculating all variables for the specified year. - -### Running a simulation - -```python -from policyengine.core import Simulation -from policyengine.tax_benefit_models.uk import uk_latest - -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, -) -simulation.run() - -# Access output data -output_person = simulation.output_dataset.data.person -output_household = simulation.output_dataset.data.household - -# Check calculated variables -print(output_household[["household_id", "household_net_income", "household_tax"]]) -``` - -### Simulation lifecycle: `run()` vs `ensure()` - -The `Simulation` class provides two methods for computing results: - -| Method | Behaviour | -|---|---| -| `simulation.run()` | Always recomputes from scratch. No caching. | -| `simulation.ensure()` | Checks in-memory LRU cache, then tries loading from disk, then falls back to `run()` + `save()`. | - -```python -# One-off computation (no caching) -simulation.run() - -# Cache-or-compute (preferred for production use) -simulation.ensure() -``` - -`ensure()` uses a module-level LRU cache (max 100 simulations) and saves output datasets as HDF5 files alongside the input dataset. On repeated calls, it returns cached results instantly. For baseline-vs-reform comparisons, `economic_impact_analysis()` calls `ensure()` internally, so you rarely need to call it yourself. - -### Accessing calculated variables - -After running a simulation, you can access the calculated variables from the output dataset: - -```python -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, -) -simulation.run() - -# Access specific variables -output = simulation.output_dataset.data -person_data = output.person[["person_id", "age", "employment_income", "income_tax"]] -household_data = output.household[["household_id", "household_net_income"]] -benunit_data = output.benunit[["benunit_id", "universal_credit", "child_benefit"]] -``` - -## Policies - -Policies modify tax-benefit system parameters through parametric reforms. - -### Creating a policy - -```python -from policyengine.core import Policy, Parameter, ParameterValue -import datetime - -# Define parameter to modify -parameter = Parameter( - name="gov.hmrc.income_tax.allowances.personal_allowance.amount", - tax_benefit_model_version=uk_latest, - description="Personal allowance for income tax", - data_type=float, -) - -# Set new value -parameter_value = ParameterValue( - parameter=parameter, - start_date=datetime.date(2026, 1, 1), - end_date=datetime.date(2026, 12, 31), - value=15000, # Increase from ~£12,570 to £15,000 -) - -policy = Policy( - name="Increased personal allowance", - description="Raises personal allowance to £15,000", - parameter_values=[parameter_value], -) -``` - -### Running a reform simulation - -```python -# Baseline simulation -baseline = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, -) -baseline.run() - -# Reform simulation -reform = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - policy=policy, -) -reform.run() -``` - -### Combining policies - -Policies can be combined using the `+` operator: - -```python -combined = policy_a + policy_b -# Concatenates parameter_values and chains simulation_modifiers -``` - -### Simulation modifiers - -For reforms that cannot be expressed as parameter value changes, `Policy` accepts a `simulation_modifier` callable that directly manipulates the underlying `policyengine_core` simulation: - -```python -def my_modifier(sim): - """Custom reform logic applied to the core simulation object.""" - p = sim.tax_benefit_system.parameters - # Modify parameters programmatically - return sim - -policy = Policy( - name="Custom reform", - simulation_modifier=my_modifier, -) -``` - -Note: the UK model supports `simulation_modifier`. The US model currently only uses the `parameter_values` path. - -## Dynamic behavioural responses - -The `Dynamic` class is structurally identical to `Policy` and represents behavioural responses to policy changes (e.g., labour supply elasticities). It is applied after the policy in the simulation pipeline. - -```python -from policyengine.core.dynamic import Dynamic - -dynamic = Dynamic( - name="Labour supply response", - parameter_values=[...], # Same format as Policy -) - -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - policy=policy, - dynamic=dynamic, -) -``` - -Dynamic responses can also be combined using the `+` operator and support `simulation_modifier` callables. - -## Outputs - -Output classes provide structured analysis of simulation results. - -### Aggregate - -Calculate aggregate statistics (sum, mean, count) for any variable: - -```python -from policyengine.outputs.aggregate import Aggregate, AggregateType - -# Total universal credit spending -agg = Aggregate( - simulation=simulation, - variable="universal_credit", - aggregate_type=AggregateType.SUM, - entity="benunit", # Map to benunit level -) -agg.run() -print(f"Total UC spending: £{agg.result / 1e9:.1f}bn") - -# Mean household income in top decile -agg = Aggregate( - simulation=simulation, - variable="household_net_income", - aggregate_type=AggregateType.MEAN, - filter_variable="household_net_income", - quantile=10, - quantile_eq=10, # 10th decile -) -agg.run() -print(f"Mean income in top decile: £{agg.result:,.0f}") -``` - -### ChangeAggregate - -Analyse impacts of policy reforms: - -```python -from policyengine.outputs.change_aggregate import ChangeAggregate, ChangeAggregateType - -# Count winners and losers -winners = ChangeAggregate( - baseline_simulation=baseline, - reform_simulation=reform, - variable="household_net_income", - aggregate_type=ChangeAggregateType.COUNT, - change_geq=1, # Gain at least £1 -) -winners.run() -print(f"Winners: {winners.result / 1e6:.1f}m households") - -losers = ChangeAggregate( - baseline_simulation=baseline, - reform_simulation=reform, - variable="household_net_income", - aggregate_type=ChangeAggregateType.COUNT, - change_leq=-1, # Lose at least £1 -) -losers.run() -print(f"Losers: {losers.result / 1e6:.1f}m households") - -# Revenue impact -revenue = ChangeAggregate( - baseline_simulation=baseline, - reform_simulation=reform, - variable="household_tax", - aggregate_type=ChangeAggregateType.SUM, -) -revenue.run() -print(f"Revenue change: £{revenue.result / 1e9:.1f}bn") -``` - -## Entity mapping - -The package automatically handles entity mapping when variables are defined at different entity levels. - -### Entity hierarchy - -**UK:** -``` -household - └── benunit (benefit unit) - └── person -``` - -**US:** -``` -household - ├── tax_unit - ├── spm_unit - ├── family - └── marital_unit - └── person -``` - -### Automatic mapping - -When you request a person-level variable (like `ssi`) at household level, the package: -1. Sums person-level values within each household (aggregation) -2. Returns household-level data with proper weights - -```python -# SSI is defined at person level, but we want household-level totals -agg = Aggregate( - simulation=simulation, - variable="ssi", # Person-level variable - entity="household", # Target household level - aggregate_type=AggregateType.SUM, -) -# Internally maps person → household by summing SSI for all persons in each household -``` - -When you request a household-level variable at person level: -1. Replicates household values to all persons in that household (expansion) - -### Direct entity mapping - -You can also map data between entities directly using the `map_to_entity` method: - -```python -# Map person income to household level (sum) -household_income = dataset.data.map_to_entity( - source_entity="person", - target_entity="household", - columns=["employment_income"], - how="sum" -) - -# Map household rent to person level (project/broadcast) -person_rent = dataset.data.map_to_entity( - source_entity="household", - target_entity="person", - columns=["rent"], - how="project" -) -``` - -#### Mapping with custom values - -You can map custom value arrays instead of existing columns: - -```python -# Map custom per-person values to household level -import numpy as np - -# Create custom values (e.g., imputed data) -custom_values = np.array([100, 200, 150, 300]) - -household_totals = dataset.data.map_to_entity( - source_entity="person", - target_entity="household", - values=custom_values, - how="sum" -) -``` - -#### Aggregation methods - -The `how` parameter controls how values are mapped: - -**Person → Group (aggregation):** -- `how='sum'` (default): Sum values within each group -- `how='first'`: Take first person's value in each group - -```python -# Sum person incomes to household level -household_income = data.map_to_entity( - source_entity="person", - target_entity="household", - columns=["employment_income"], - how="sum" -) - -# Take first person's age as household reference -household_age = data.map_to_entity( - source_entity="person", - target_entity="household", - columns=["age"], - how="first" -) -``` - -**Group → Person (expansion):** -- `how='project'` (default): Broadcast group value to all members -- `how='divide'`: Split group value equally among members - -```python -# Broadcast household rent to each person -person_rent = data.map_to_entity( - source_entity="household", - target_entity="person", - columns=["rent"], - how="project" -) - -# Split household savings equally per person -person_savings = data.map_to_entity( - source_entity="household", - target_entity="person", - columns=["total_savings"], - how="divide" -) -``` - -**Group → Group (via person entity):** -- `how='sum'` (default): Sum through person entity -- `how='first'`: Take first source group's value -- `how='project'`: Broadcast first source group's value -- `how='divide'`: Split proportionally based on person counts - -```python -# UK: Sum benunit benefits to household level -household_benefits = data.map_to_entity( - source_entity="benunit", - target_entity="household", - columns=["universal_credit"], - how="sum" -) - -# US: Map tax unit income to household, splitting by members -household_from_tax = data.map_to_entity( - source_entity="tax_unit", - target_entity="household", - columns=["taxable_income"], - how="divide" -) -``` - -## Visualisation - -The package includes utilities for creating PolicyEngine-branded visualisations: - -```python -from policyengine.utils.plotting import format_fig, COLORS -import plotly.graph_objects as go - -fig = go.Figure() -fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 5, 6])) - -format_fig( - fig, - title="My chart", - xaxis_title="X axis", - yaxis_title="Y axis", - height=600, - width=800, -) -fig.show() -``` - -### Brand colours - -```python -COLORS = { - "primary": "#319795", # Teal - "success": "#22C55E", # Green - "warning": "#FEC601", # Yellow - "error": "#EF4444", # Red - "info": "#1890FF", # Blue - "blue_secondary": "#026AA2", # Dark blue - "gray": "#667085", # Gray -} -``` - -## Common workflows - -### 1. Analyse employment income variation - -See [UK employment income variation](examples.md#uk-employment-income-variation) for a complete example of: -- Creating custom datasets with varied parameters -- Running single simulations -- Extracting results with filters -- Visualising benefit phase-outs - -### 2. Policy reform analysis - -See [UK policy reform analysis](examples.md#uk-policy-reform-analysis) for: -- Applying parametric reforms -- Comparing baseline and reform -- Analysing winners/losers by decile -- Calculating revenue impacts - -### 3. Distributional analysis - -See [US income distribution](examples.md#us-income-distribution) for: -- Loading representative microdata -- Calculating statistics by income decile -- Mapping variables across entity levels -- Creating interactive visualisations - -## Best practices - -### Creating custom datasets - -1. **Always set would_claim variables**: Benefits won't be claimed unless explicitly enabled - ```python - "would_claim_uc": [True] * n_households - ``` - -2. **Set disability variables explicitly**: Prevents random UC spikes from LCWRA element - ```python - "is_disabled_for_benefits": [False] * n_people - "uc_limited_capability_for_WRA": [False] * n_people - ``` - -3. **Include required join keys**: Person data needs entity membership - ```python - "person_household_id": household_ids - "person_benunit_id": benunit_ids # UK only - ``` - -4. **Set required household fields**: Vary by country - ```python - # UK - "region": ["LONDON"] * n_households - "tenure_type": ["RENT_PRIVATELY"] * n_households - - # US - "state_code": ["CA"] * n_households - ``` - -### Performance optimisation - -1. **Single simulation for variations**: Create all scenarios in one dataset, run once -2. **Custom variable selection**: Only calculate needed variables -3. **Filter efficiently**: Use quantile filters for decile analysis -4. **Parallel analysis**: Multiple Aggregate calls can run independently - -### Data integrity - -1. **Check weights**: Ensure weights sum to expected population -2. **Validate join keys**: All persons should link to valid households -3. **Review output ranges**: Check calculated values are reasonable -4. **Test edge cases**: Zero income, high income, disabled, elderly - -## Next steps - -- [Economic impact analysis](economic-impact-analysis.md): Full baseline-vs-reform comparison workflow -- [Advanced outputs](advanced-outputs.md): DecileImpact, Poverty, Inequality, IntraDecileImpact -- [Regions and scoping](regions-and-scoping.md): Sub-national analysis (states, constituencies, districts) -- Country-specific documentation: - - [UK tax-benefit model](country-models-uk.md) - - [US tax-benefit model](country-models-us.md) -- [Visualisation](visualisation.md): Publication-ready charts -- [Examples](examples.md): Complete working scripts diff --git a/docs/countries.md b/docs/countries.md new file mode 100644 index 00000000..c9e5e37f --- /dev/null +++ b/docs/countries.md @@ -0,0 +1,87 @@ +--- +title: "Country models" +--- + +The `policyengine` package is country-agnostic; country-specific rules live in separate packages (`policyengine-us`, `policyengine-uk`, …). This page documents the differences that matter to users. + +## Entities + +| US | UK | +|---|---| +| `person` | `person` | +| `family` | — | +| `marital_unit` | — | +| `tax_unit` | `benunit` | +| `spm_unit` | — | +| `household` | `household` | + +The UK `benunit` roughly corresponds to the US `tax_unit` for means-testing — a single adult or married couple plus dependent children. + +## Default income variable + +Net-income calculations use country-specific defaults: + +| | Variable | +|---|---| +| US | `spm_unit_net_income` | +| UK | `hbai_household_net_income` | + +Override in any output with `income_variable=`. + +## Default dataset + +| | Dataset | +|---|---| +| US | Enhanced CPS 2024 (`enhanced_cps_2024.h5`) | +| UK | Enhanced FRS 2024 (`enhanced_frs_2024.h5`) | + +## State / regional breakdown + +US: `state_code`, `congressional_district` on every record. + +UK: constituency code, local authority code on every record where available. + +## Poverty + +US: SPM (Supplemental Poverty Measure), deep SPM (below half the threshold), plus official thresholds. + +UK: AHC (After Housing Costs) and BHC (Before Housing Costs), both relative (60 % of median) and absolute. + +## Key programs + +| US | UK | +|---|---| +| Federal income tax (incl. EITC, CTC) | Income tax (incl. personal allowance) | +| State income taxes | — | +| Payroll taxes | National Insurance | +| SNAP | Universal Credit (absorbing legacy benefits) | +| TANF | Child Benefit | +| SSI | PIP | +| CHIP | — (NHS is universal) | +| ACA premium tax credits | — | +| Medicare Part B | — | + +## Reform targeting + +Parameter paths mirror the country's rule-making structure: + +- US: `gov.irs.*`, `gov.states..*`, `gov.usda.*`, `gov.hhs.*`, etc. +- UK: `gov.hmrc.*`, `gov.dwp.*`, `gov.obr.*` + +See [Reforms](reforms.md) for how to express changes in either tree. + +## Switching countries + +Most analysis patterns are identical — swap `pe.us` for `pe.uk`: + +```python +# US +pe.us.calculate_household(people=[{"age": 35, "employment_income": 60_000}], + tax_unit={"filing_status": "SINGLE"}, year=2026) + +# UK +pe.uk.calculate_household(people=[{"age": 35, "employment_income": 50_000}], + year=2026) +``` + +Microsim is similarly parallel — `pe.us.ensure_datasets` / `pe.uk.ensure_datasets`, `pe.Simulation(country="us"|"uk", ...)`. diff --git a/docs/country-models-uk.md b/docs/country-models-uk.md deleted file mode 100644 index 0bc54505..00000000 --- a/docs/country-models-uk.md +++ /dev/null @@ -1,374 +0,0 @@ -# UK tax-benefit model - -The UK tax-benefit model implements the United Kingdom's tax and benefit system using PolicyEngine UK as the underlying calculation engine. - -## Entity structure - -The UK model uses three entity levels: - -``` -household - └── benunit (benefit unit) - └── person -``` - -### Person - -Individual people with demographic and income characteristics. - -**Key variables:** -- `age`: Person's age in years -- `employment_income`: Annual employment income -- `self_employment_income`: Annual self-employment income -- `pension_income`: Annual pension income -- `savings_interest_income`: Annual interest from savings -- `dividend_income`: Annual dividend income -- `income_tax`: Total income tax paid -- `national_insurance`: Total NI contributions -- `is_disabled_for_benefits`: Whether disabled for benefit purposes - -### Benunit (benefit unit) - -The unit for benefit assessment. Usually a single person or a couple with dependent children. - -**Key variables:** -- `universal_credit`: Annual UC payment -- `child_benefit`: Annual child benefit -- `working_tax_credit`: Annual WTC (legacy system) -- `child_tax_credit`: Annual CTC (legacy system) -- `pension_credit`: Annual pension credit -- `income_support`: Annual income support -- `housing_benefit`: Annual housing benefit -- `council_tax_support`: Annual council tax support - -**Important flags:** -- `would_claim_uc`: Must be True to claim UC -- `would_claim_WTC`: Must be True to claim WTC -- `would_claim_CTC`: Must be True to claim CTC -- `would_claim_IS`: Must be True to claim IS -- `would_claim_pc`: Must be True to claim pension credit -- `would_claim_child_benefit`: Must be True to claim child benefit -- `would_claim_housing_benefit`: Must be True to claim HB - -### Household - -The residence unit, typically sharing accommodation. - -**Key variables:** -- `household_net_income`: Total household net income -- `hbai_household_net_income`: HBAI-equivalised net income -- `household_benefits`: Total benefits received -- `household_tax`: Total tax paid -- `household_market_income`: Total market income - -**Required fields:** -- `region`: UK region (e.g., "LONDON", "SOUTH_EAST") -- `tenure_type`: Housing tenure (e.g., "RENT_PRIVATELY", "OWNED_OUTRIGHT") -- `rent`: Annual rent paid -- `council_tax`: Annual council tax - -## Using the UK model - -### Loading representative data - -```python -from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset - -dataset = PolicyEngineUKDataset( - name="FRS 2023-24", - description="Family Resources Survey microdata", - filepath="./data/frs_2023_24_year_2026.h5", - year=2026, -) - -print(f"People: {len(dataset.data.person):,}") -print(f"Benefit units: {len(dataset.data.benunit):,}") -print(f"Households: {len(dataset.data.household):,}") -``` - -### Creating custom scenarios - -```python -import pandas as pd -from microdf import MicroDataFrame -from policyengine.tax_benefit_models.uk import UKYearData - -# Single parent with 2 children -person_df = MicroDataFrame( - pd.DataFrame({ - "person_id": [0, 1, 2], - "person_benunit_id": [0, 0, 0], - "person_household_id": [0, 0, 0], - "age": [35, 8, 5], - "employment_income": [25000, 0, 0], - "person_weight": [1.0, 1.0, 1.0], - "is_disabled_for_benefits": [False, False, False], - "uc_limited_capability_for_WRA": [False, False, False], - }), - weights="person_weight" -) - -benunit_df = MicroDataFrame( - pd.DataFrame({ - "benunit_id": [0], - "benunit_weight": [1.0], - "would_claim_uc": [True], - "would_claim_child_benefit": [True], - "would_claim_WTC": [True], - "would_claim_CTC": [True], - }), - weights="benunit_weight" -) - -household_df = MicroDataFrame( - pd.DataFrame({ - "household_id": [0], - "household_weight": [1.0], - "region": ["LONDON"], - "rent": [15000], # £1,250/month - "council_tax": [2000], - "tenure_type": ["RENT_PRIVATELY"], - }), - weights="household_weight" -) - -dataset = PolicyEngineUKDataset( - name="Single parent scenario", - description="One adult, two children", - filepath="./single_parent.h5", - year=2026, - data=UKYearData( - person=person_df, - benunit=benunit_df, - household=household_df, - ) -) -``` - -### Running a simulation - -```python -from policyengine.core import Simulation -from policyengine.tax_benefit_models.uk import uk_latest - -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, -) -simulation.run() - -# Check results -output = simulation.output_dataset.data -print(output.household[["household_net_income", "household_benefits", "household_tax"]]) -``` - -## Key parameters - -### Income tax - -- `gov.hmrc.income_tax.allowances.personal_allowance.amount`: Personal allowance (£12,570 in 2024-25) -- `gov.hmrc.income_tax.rates.uk[0].rate`: Basic rate (20%) -- `gov.hmrc.income_tax.rates.uk[1].rate`: Higher rate (40%) -- `gov.hmrc.income_tax.rates.uk[2].rate`: Additional rate (45%) -- `gov.hmrc.income_tax.rates.uk[0].threshold`: Basic rate threshold (£50,270) -- `gov.hmrc.income_tax.rates.uk[1].threshold`: Higher rate threshold (£125,140) - -### National insurance - -- `gov.hmrc.national_insurance.class_1.main.primary_threshold`: Primary threshold (£12,570) -- `gov.hmrc.national_insurance.class_1.main.upper_earnings_limit`: Upper earnings limit (£50,270) -- `gov.hmrc.national_insurance.class_1.main.rate`: Main rate (12% below UEL, 2% above) - -### Universal credit - -- `gov.dwp.universal_credit.elements.standard_allowance.single_adult`: Standard allowance for single adult (£334.91/month in 2024-25) -- `gov.dwp.universal_credit.elements.child.first_child`: First child element (£333.33/month) -- `gov.dwp.universal_credit.elements.child.subsequent_child`: Subsequent children (£287.92/month each) -- `gov.dwp.universal_credit.means_test.reduction_rate`: Taper rate (55%) -- `gov.dwp.universal_credit.means_test.earned_income.disregard`: Work allowance - -### Child benefit - -- `gov.hmrc.child_benefit.rates.eldest_child`: First child rate (£25.60/week) -- `gov.hmrc.child_benefit.rates.additional_child`: Additional children (£16.95/week each) -- `gov.hmrc.child_benefit.income_tax_charge.threshold`: HICBC threshold (£60,000) - -## Common policy reforms - -### Increasing personal allowance - -```python -from policyengine.core import Policy, Parameter, ParameterValue -import datetime - -parameter = Parameter( - name="gov.hmrc.income_tax.allowances.personal_allowance.amount", - tax_benefit_model_version=uk_latest, - description="Personal allowance", - data_type=float, -) - -policy = Policy( - name="Increase personal allowance to £15,000", - description="Raises personal allowance from £12,570 to £15,000", - parameter_values=[ - ParameterValue( - parameter=parameter, - start_date=datetime.date(2026, 1, 1), - end_date=datetime.date(2026, 12, 31), - value=15000, - ) - ], -) -``` - -### Adjusting UC taper rate - -```python -parameter = Parameter( - name="gov.dwp.universal_credit.means_test.reduction_rate", - tax_benefit_model_version=uk_latest, - description="UC taper rate", - data_type=float, -) - -policy = Policy( - name="Reduce UC taper to 50%", - description="Lowers taper rate from 55% to 50%", - parameter_values=[ - ParameterValue( - parameter=parameter, - start_date=datetime.date(2026, 1, 1), - end_date=datetime.date(2026, 12, 31), - value=0.50, # 50% - ) - ], -) -``` - -### Abolishing two-child limit - -```python -# Set subsequent child element equal to first child -parameter = Parameter( - name="gov.dwp.universal_credit.elements.child.subsequent_child", - tax_benefit_model_version=uk_latest, - description="UC subsequent child element", - data_type=float, -) - -policy = Policy( - name="Abolish two-child limit", - description="Sets subsequent child element equal to first child", - parameter_values=[ - ParameterValue( - parameter=parameter, - start_date=datetime.date(2026, 1, 1), - end_date=datetime.date(2026, 12, 31), - value=333.33, # Match first child rate - ) - ], -) -``` - -## Regional variations - -The UK model accounts for regional differences: - -- **Council tax**: Varies by local authority -- **Rent levels**: Regional housing markets -- **Scottish income tax**: Different rates and thresholds for Scottish taxpayers - -### Regions - -Valid region values: -- `LONDON` -- `SOUTH_EAST` -- `SOUTH_WEST` -- `EAST_OF_ENGLAND` -- `WEST_MIDLANDS` -- `EAST_MIDLANDS` -- `YORKSHIRE` -- `NORTH_WEST` -- `NORTH_EAST` -- `WALES` -- `SCOTLAND` -- `NORTHERN_IRELAND` - -## Entity mapping - -The UK model has a simpler entity structure than the US, with three levels: person → benunit → household. - -### Direct entity mapping - -You can map data between entities using the `map_to_entity` method: - -```python -# Map person income to benunit level -benunit_income = dataset.data.map_to_entity( - source_entity="person", - target_entity="benunit", - columns=["employment_income"], - how="sum" -) - -# Split household rent equally among persons -person_rent_share = dataset.data.map_to_entity( - source_entity="household", - target_entity="person", - columns=["rent"], - how="divide" -) - -# Map benunit UC to household level -household_uc = dataset.data.map_to_entity( - source_entity="benunit", - target_entity="household", - columns=["universal_credit"], - how="sum" -) -``` - -See the [Entity mapping section](core-concepts.md#entity-mapping) in Core Concepts for full documentation on aggregation methods. - -## Data sources - -The UK model can use several data sources: - -1. **Family Resources Survey (FRS)**: Official UK household survey - - ~19,000 households - - Detailed income and benefit receipt - - Published annually - -2. **Enhanced FRS**: Uprated and enhanced version - - Calibrated to population totals - - Additional imputed variables - - Multiple projection years - -3. **Custom datasets**: User-created scenarios - - Full control over household composition - - Exact income levels - - Specific benefit claiming patterns - -## Validation - -When creating custom datasets, validate: - -1. **Would claim flags**: All set to True -2. **Disability flags**: Set explicitly (not random) -3. **Join keys**: Person data links to benunits and households -4. **Required fields**: Region, tenure_type set correctly -5. **Weights**: Sum to expected values -6. **Income ranges**: Realistic values - -## Examples - -- [UK employment income variation](examples.md#uk-employment-income-variation): Vary employment income, analyse benefit phase-outs -- [UK policy reform analysis](examples.md#uk-policy-reform-analysis): Apply reforms, analyse winners/losers -- [UK income bands](examples.md#uk-income-bands): Calculate net income and tax by income decile - -## References - -- PolicyEngine UK documentation: https://policyengine.github.io/policyengine-uk/ -- UK tax-benefit system: https://www.gov.uk/browse/benefits -- HBAI methodology: https://www.gov.uk/government/statistics/households-below-average-income-for-financial-years-ending-1995-to-2023 diff --git a/docs/country-models-us.md b/docs/country-models-us.md deleted file mode 100644 index 268c888f..00000000 --- a/docs/country-models-us.md +++ /dev/null @@ -1,444 +0,0 @@ -# US tax-benefit model - -The US tax-benefit model implements the United States federal tax and benefit system using PolicyEngine US as the underlying calculation engine. - -## Entity structure - -The US model uses a more complex entity hierarchy: - -``` -household - ├── tax_unit (federal tax filing unit) - ├── spm_unit (Supplemental Poverty Measure unit) - ├── family (Census definition) - └── marital_unit (married couple or single person) - └── person -``` - -### Person - -Individual people with demographic and income characteristics. - -**Key variables:** -- `age`: Person's age in years -- `employment_income`: Annual employment income -- `self_employment_income`: Annual self-employment income -- `social_security`: Annual Social Security benefits -- `ssi`: Annual Supplemental Security Income -- `medicaid`: Annual Medicaid value -- `medicare`: Annual Medicare value -- `unemployment_compensation`: Annual unemployment benefits - -### Tax unit - -The federal tax filing unit (individual or married filing jointly). - -**Key variables:** -- `income_tax`: Federal income tax liability -- `employee_payroll_tax`: Employee payroll tax (FICA) -- `eitc`: Earned Income Tax Credit -- `ctc`: Child Tax Credit -- `income_tax_before_credits`: Tax before credits - -### SPM unit - -The Supplemental Poverty Measure unit used for SNAP and other means-tested benefits. - -**Key variables:** -- `snap`: Annual SNAP (food stamps) benefits -- `tanf`: Annual TANF (cash assistance) benefits -- `spm_unit_net_income`: SPM net income -- `spm_unit_size`: Number of people in unit - -### Family - -Census definition of family (related individuals). - -**Key variables:** -- `family_id`: Family identifier -- `family_weight`: Survey weight - -### Marital unit - -Married couple or single person. - -**Key variables:** -- `marital_unit_id`: Marital unit identifier -- `marital_unit_weight`: Survey weight - -### Household - -The residence unit. - -**Key variables:** -- `household_net_income`: Total household net income -- `household_benefits`: Total benefits received -- `household_tax`: Total tax paid -- `household_market_income`: Total market income before taxes and transfers - -**Required fields:** -- `state_code`: State (e.g., "CA", "NY", "TX") - -## Using the US model - -### Loading representative data - -```python -from policyengine.tax_benefit_models.us import PolicyEngineUSDataset - -dataset = PolicyEngineUSDataset( - name="Enhanced CPS 2024", - description="Enhanced Current Population Survey microdata", - filepath="./data/enhanced_cps_2024_year_2024.h5", - year=2024, -) - -print(f"People: {len(dataset.data.person):,}") -print(f"Tax units: {len(dataset.data.tax_unit):,}") -print(f"SPM units: {len(dataset.data.spm_unit):,}") -print(f"Households: {len(dataset.data.household):,}") -``` - -### Creating custom scenarios - -```python -import pandas as pd -from microdf import MicroDataFrame -from policyengine.tax_benefit_models.us import USYearData - -# Married couple with 2 children -person_df = MicroDataFrame( - pd.DataFrame({ - "person_id": [0, 1, 2, 3], - "person_household_id": [0, 0, 0, 0], - "person_tax_unit_id": [0, 0, 0, 0], - "person_spm_unit_id": [0, 0, 0, 0], - "person_family_id": [0, 0, 0, 0], - "person_marital_unit_id": [0, 0, 1, 2], - "age": [35, 33, 8, 5], - "employment_income": [60000, 40000, 0, 0], - "person_weight": [1.0, 1.0, 1.0, 1.0], - }), - weights="person_weight" -) - -tax_unit_df = MicroDataFrame( - pd.DataFrame({ - "tax_unit_id": [0], - "tax_unit_weight": [1.0], - }), - weights="tax_unit_weight" -) - -spm_unit_df = MicroDataFrame( - pd.DataFrame({ - "spm_unit_id": [0], - "spm_unit_weight": [1.0], - }), - weights="spm_unit_weight" -) - -family_df = MicroDataFrame( - pd.DataFrame({ - "family_id": [0], - "family_weight": [1.0], - }), - weights="family_weight" -) - -marital_unit_df = MicroDataFrame( - pd.DataFrame({ - "marital_unit_id": [0, 1, 2], - "marital_unit_weight": [1.0, 1.0, 1.0], - }), - weights="marital_unit_weight" -) - -household_df = MicroDataFrame( - pd.DataFrame({ - "household_id": [0], - "household_weight": [1.0], - "state_code": ["CA"], - }), - weights="household_weight" -) - -dataset = PolicyEngineUSDataset( - name="Married couple scenario", - description="Two adults, two children", - filepath="./married_couple.h5", - year=2024, - data=USYearData( - person=person_df, - tax_unit=tax_unit_df, - spm_unit=spm_unit_df, - family=family_df, - marital_unit=marital_unit_df, - household=household_df, - ) -) -``` - -### Running a simulation - -```python -from policyengine.core import Simulation -from policyengine.tax_benefit_models.us import us_latest - -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, -) -simulation.run() - -# Check results -output = simulation.output_dataset.data -print(output.household[["household_net_income", "household_benefits", "household_tax"]]) -``` - -## Key parameters - -### Income tax - -- `gov.irs.income.standard_deduction.joint`: Standard deduction (married filing jointly) -- `gov.irs.income.standard_deduction.single`: Standard deduction (single) -- `gov.irs.income.bracket.rates[0]`: 10% bracket rate -- `gov.irs.income.bracket.rates[1]`: 12% bracket rate -- `gov.irs.income.bracket.rates[2]`: 22% bracket rate -- `gov.irs.income.bracket.thresholds.joint[0]`: 10% bracket threshold (MFJ) -- `gov.irs.income.bracket.thresholds.single[0]`: 10% bracket threshold (single) - -### Payroll tax - -- `gov.ssa.payroll.rate.employee`: Employee OASDI rate (6.2%) -- `gov.medicare.payroll.rate`: Medicare rate (1.45%) -- `gov.ssa.payroll.cap`: OASDI wage base ($168,600 in 2024) - -### Child Tax Credit - -- `gov.irs.credits.ctc.amount.base`: Base CTC amount ($2,000 per child) -- `gov.irs.credits.ctc.refundable.amount.max`: Maximum refundable amount ($1,700) -- `gov.irs.credits.ctc.phase_out.threshold.joint`: Phase-out threshold (MFJ) -- `gov.irs.credits.ctc.phase_out.rate`: Phase-out rate - -### Earned Income Tax Credit - -- `gov.irs.credits.eitc.max[0]`: Maximum EITC (0 children) -- `gov.irs.credits.eitc.max[1]`: Maximum EITC (1 child) -- `gov.irs.credits.eitc.max[2]`: Maximum EITC (2 children) -- `gov.irs.credits.eitc.max[3]`: Maximum EITC (3+ children) -- `gov.irs.credits.eitc.phase_out.start[0]`: Phase-out start (0 children) -- `gov.irs.credits.eitc.phase_out.rate[0]`: Phase-out rate (0 children) - -### SNAP - -- `gov.usda.snap.normal_allotment.max[1]`: Maximum benefit (1 person) -- `gov.usda.snap.normal_allotment.max[2]`: Maximum benefit (2 people) -- `gov.usda.snap.income_limit.net`: Net income limit (100% FPL) -- `gov.usda.snap.income_deduction.earned.rate`: Earned income deduction rate (20%) - -## Common policy reforms - -### Increasing standard deduction - -```python -from policyengine.core import Policy, Parameter, ParameterValue -import datetime - -parameter = Parameter( - name="gov.irs.income.standard_deduction.single", - tax_benefit_model_version=us_latest, - description="Standard deduction (single)", - data_type=float, -) - -policy = Policy( - name="Increase standard deduction to $20,000", - description="Raises single standard deduction from $14,600 to $20,000", - parameter_values=[ - ParameterValue( - parameter=parameter, - start_date=datetime.date(2024, 1, 1), - end_date=datetime.date(2024, 12, 31), - value=20000, - ) - ], -) -``` - -### Expanding Child Tax Credit - -```python -parameter = Parameter( - name="gov.irs.credits.ctc.amount.base", - tax_benefit_model_version=us_latest, - description="Base CTC amount", - data_type=float, -) - -policy = Policy( - name="Increase CTC to $3,000", - description="Expands CTC from $2,000 to $3,000 per child", - parameter_values=[ - ParameterValue( - parameter=parameter, - start_date=datetime.date(2024, 1, 1), - end_date=datetime.date(2024, 12, 31), - value=3000, - ) - ], -) -``` - -### Making CTC fully refundable - -```python -parameter = Parameter( - name="gov.irs.credits.ctc.refundable.amount.max", - tax_benefit_model_version=us_latest, - description="Maximum refundable CTC", - data_type=float, -) - -policy = Policy( - name="Fully refundable CTC", - description="Makes entire $2,000 CTC refundable", - parameter_values=[ - ParameterValue( - parameter=parameter, - start_date=datetime.date(2024, 1, 1), - end_date=datetime.date(2024, 12, 31), - value=2000, # Match base amount - ) - ], -) -``` - -## State variations - -The US model includes state-level variations for: - -- **State income tax**: Different rates and structures by state -- **State EITC**: State supplements to federal EITC -- **Medicaid**: State-specific eligibility and benefits -- **TANF**: State-administered cash assistance - -### State codes - -Use two-letter state codes (e.g., "CA", "NY", "TX"). All 50 states plus DC are supported. - -## Entity mapping considerations - -The US model's complex entity structure requires careful attention to entity mapping: - -### Person → Household - -When mapping person-level variables (like `ssi`) to household level, values are summed across all household members: - -```python -agg = Aggregate( - simulation=simulation, - variable="ssi", # Person-level - entity="household", # Aggregate to household - aggregate_type=AggregateType.SUM, -) -# Result: Total SSI for all persons in each household -``` - -### Tax unit → Household - -Tax units nest within households. A household may contain multiple tax units (e.g., adult child filing separately): - -```python -agg = Aggregate( - simulation=simulation, - variable="income_tax", # Tax unit level - entity="household", # Aggregate to household - aggregate_type=AggregateType.SUM, -) -# Result: Total income tax for all tax units in each household -``` - -### Household → Person - -Household variables are replicated to all household members: - -```python -# household_net_income at person level -# Each person in household gets the same household_net_income value -``` - -### Direct entity mapping - -For complex multi-entity scenarios, you can use `map_to_entity` directly: - -```python -# Map SPM unit SNAP benefits to household level -household_snap = dataset.data.map_to_entity( - source_entity="spm_unit", - target_entity="household", - columns=["snap"], - how="sum" -) - -# Split tax unit income equally among persons -person_tax_income = dataset.data.map_to_entity( - source_entity="tax_unit", - target_entity="person", - columns=["taxable_income"], - how="divide" -) - -# Map custom analysis values -custom_analysis = dataset.data.map_to_entity( - source_entity="person", - target_entity="tax_unit", - values=custom_values_array, - how="sum" -) -``` - -See the [Entity mapping section](core-concepts.md#entity-mapping) in Core Concepts for full documentation on aggregation methods. - -## Data sources - -The US model can use several data sources: - -1. **Current Population Survey (CPS)**: Census Bureau household survey - - ~60,000 households - - Detailed income and demographic data - - Published annually - -2. **Enhanced CPS**: Calibrated and enhanced version - - Uprated to population totals - - Imputed benefit receipt - - Multiple projection years - -3. **Custom datasets**: User-created scenarios - - Full control over household composition - - Exact income levels - - Specific tax filing scenarios - -## Validation - -When creating custom datasets, validate: - -1. **Entity relationships**: All persons link to valid tax_unit, spm_unit, household -2. **Join key naming**: Use `person_household_id`, `person_tax_unit_id`, etc. -3. **Weights**: Appropriate weights for each entity level -4. **State codes**: Valid two-letter codes -5. **Filing status**: Tax units should reflect actual filing patterns - -## Examples - -- [US income distribution](examples.md#us-income-distribution): Analyse benefit distribution by income decile -- [US employment income variation](examples.md#us-employment-income-variation): Vary employment income, analyse phase-outs -- [US budgetary impact](examples.md#us-budgetary-impact): Full baseline-vs-reform comparison -- [Simulation performance](examples.md#simulation-performance): Performance benchmarking - -## References - -- PolicyEngine US documentation: https://policyengine.github.io/policyengine-us/ -- IRS tax information: https://www.irs.gov/forms-pubs -- Benefits.gov: https://www.benefits.gov/ -- SPM methodology: https://www.census.gov/topics/income-poverty/supplemental-poverty-measure.html diff --git a/docs/dev.md b/docs/dev.md index 007a94e5..3a1efc4e 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -1,105 +1,77 @@ -# Development - -## Principles - -1. **STRONG** preference for simplicity. Let's make this package as simple as it possibly can be. -2. Remember the goal of this package: to make it easy to create, run, save and analyse PolicyEngine simulations. When considering further features, always ask: can we instead *make it super easy* for people to do this outside the package? -3. Be consistent about property names. `name` = human readable few words you could put as the noun in a sentence without fail. `id` = unique identifier, ideally a UUID. `description` = longer human readable text that describes the object. `created_at` and `updated_at` = timestamps for when the object was created and last updated. -4. Constraints can be good. We should set constraints where they help us simplify the codebase and usage, but not where they unnecessarily block useful functionality. +--- +title: "Development" +--- ## Setup ```bash -git clone https://github.com/PolicyEngine/policyengine.py.git +git clone https://github.com/PolicyEngine/policyengine.py cd policyengine.py uv pip install -e ".[dev]" ``` -This installs the shared analysis layer, both country model extras, and the dev -dependencies used in CI (pytest, ruff, mypy, towncrier). - -## Common commands +## Running tests ```bash -make format # ruff format -make test # pytest with coverage -make docs # build static MyST/Jupyter Book 2 HTML docs -make docs-serve # preview the docs locally -make clean # remove caches, build artifacts, .h5 files +make test # unit tests +pytest tests/ # same via pytest +pytest tests/integration # integration tests (slower, needs h5 data) ``` -## Testing - -Tests require a `HUGGING_FACE_TOKEN` environment variable for downloading datasets: +## Formatting and linting ```bash -export HUGGING_FACE_TOKEN=hf_... -make test +make format # ruff format +ruff check . # ruff lint ``` -To run a specific test: +## Building docs ```bash -pytest tests/test_models.py -v -pytest tests/test_parametric_reforms.py -k "test_uk" -v +make docs # quarto render docs -> docs/_site/ +make docs-serve # quarto preview docs with live reload ``` -## Linting and formatting +## Regenerating auto-reference pages ```bash -ruff format . # format code -ruff check . # lint -mypy src/policyengine # type check (informational) +make docs-generate-reference # pulls variable catalog from installed country models ``` -## CI pipeline +Commit the regenerated pages alongside any country-model bumps. CI will check the reference is current. -PRs trigger the following checks: +## CI -| Check | Status | Command | -|---|---|---| -| Lint + format | Required | `ruff check .` + `ruff format --check .` | -| Tests (Python 3.13) | Required | `make test` | -| Tests (Python 3.14) | Required | `make test` | -| Mypy | Informational | `mypy src/policyengine` | -| Docs build | Required | `make docs` | +Four workflows in `.github/workflows/`: -## Versioning and releases - -This project uses [towncrier](https://towncrier.readthedocs.io/) for changelog management. When making a PR, add a changelog fragment: - -```bash -# Fragment types: breaking, added, changed, fixed, removed -echo "Description of change" > changelog.d/my-change.added -``` +- **`pr_code_changes.yaml`** — unit tests, lint, format, changelog fragment on every PR touching code. +- **`pr_docs_changes.yaml`** — verifies `quarto render docs` succeeds on every PR touching docs. +- **`push.yaml`** — full integration tests + publish path on merge to main. +- **`versioning.yaml`** — auto-bumps version when changelog fragments land. -On merge, the versioning workflow bumps the version, builds the changelog, and creates a GitHub Release. +## Contributing -For the target release-bundle architecture, see [Release bundles](release-bundles.md). That document defines the split between country `*-data` build manifests and `policyengine.py` certified runtime bundles. +- Follow the existing API shape: `pe.us.calculate_household`, `pe.us.Simulation`, `pe.outputs.*`. Don't add one-off helpers that bypass these. +- New output types subclass `Output` or `ChangeOutput` and live in `src/policyengine/outputs/`. +- Country-specific helpers go under `src/policyengine/tax_benefit_models//`. +- Add a changelog fragment in `changelog.d/` following towncrier conventions: `echo "Description." > changelog.d/..md`. Types: `added`, `changed`, `fixed`, `removed`, `breaking`. ## Architecture -### Package layout - ``` src/policyengine/ -├── core/ # Domain models (Simulation, Dataset, Policy, etc.) +├── core/ # Simulation, Dataset, Output base classes +├── countries/ # Country-neutral protocols +├── data/ # Generic dataset loading +├── graph/ # Variable dependency graph (for reference docs) +├── outputs/ # Typed output classes +├── provenance/ # Manifests, certification, reproducibility +├── results/ # Typed household-result structures ├── tax_benefit_models/ -│ ├── uk/ # UK model, datasets, analysis, outputs -│ └── us/ # US model, datasets, analysis, outputs -├── outputs/ # Output templates (Aggregate, Poverty, etc.) -├── countries/ # Geographic region registries -└── utils/ # Helpers (reforms, entity mapping, plotting) +│ ├── us/ # US entry point (calculate_household, model, datasets) +│ ├── uk/ # UK equivalent +│ └── common/ # Shared model-version scaffolding +└── utils/ ``` -### Key design decisions - -**Pydantic everywhere**: All domain objects are Pydantic `BaseModel` subclasses. This gives us validation, serialisation, and clear field documentation. - -**HDF5 for storage**: Datasets and simulation outputs are stored as HDF5 files. No database server is required. The `MicroDataFrame` from the `microdf` package wraps pandas DataFrames with weight-aware `.sum()`, `.mean()`, `.count()`. - -**Country-specific model classes**: `PolicyEngineUSLatest` and `PolicyEngineUKLatest` each implement `run()`, `save()`, and `load()`. The US model passes reforms as a dict at `Microsimulation(reform=...)` construction time. The UK model supports both parametric reforms and `simulation_modifier` callables applied post-construction. - -**LRU cache + file caching**: `Simulation.ensure()` checks an in-process LRU cache (max 100 entries), then tries loading from disk, then falls back to `run()` + `save()`. - -**Output pattern**: All output types inherit from `Output`, implement `.run()`, and populate result fields. Convenience functions (e.g., `calculate_us_poverty_rates()`) create, run, and return collections of output objects. +Everything users touch is exposed through the top-level `policyengine` namespace. Internal modules are imports of convenience; the contract is the exposed API. diff --git a/docs/economic-impact-analysis.md b/docs/economic-impact-analysis.md deleted file mode 100644 index 0d28dff8..00000000 --- a/docs/economic-impact-analysis.md +++ /dev/null @@ -1,287 +0,0 @@ -# Economic impact analysis - -The `economic_impact_analysis()` function is the canonical way to compare a baseline simulation against a reform simulation. It produces a comprehensive `PolicyReformAnalysis` containing decile impacts, programme-by-programme statistics, poverty rates, and inequality metrics in a single call. - -## Overview - -There are two approaches to comparing simulations: - -| Approach | Use case | -|---|---| -| `ChangeAggregate` | Single-metric queries: "What is the total tax revenue change?" | -| `economic_impact_analysis()` | Full analysis: decile impacts, programme stats, poverty, inequality | - -`ChangeAggregate` gives you one number per call. `economic_impact_analysis()` runs ~30+ aggregate computations and returns a structured result containing everything. - -## Full analysis workflow - -### US example - -```python -import datetime -from policyengine.core import Parameter, ParameterValue, Policy, Simulation -from policyengine.tax_benefit_models.us import ( - economic_impact_analysis, - ensure_datasets, - us_latest, -) - -# 1. Load data -datasets = ensure_datasets( - datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], - years=[2026], - data_folder="./data", -) -dataset = datasets["enhanced_cps_2024_2026"] - -# 2. Define reform -param = Parameter( - name="gov.irs.deductions.standard.amount.SINGLE", - tax_benefit_model_version=us_latest, -) -reform = Policy( - name="Double standard deduction (single)", - parameter_values=[ - ParameterValue( - parameter=param, - start_date=datetime.date(2026, 1, 1), - end_date=datetime.date(2026, 12, 31), - value=30_950, - ), - ], -) - -# 3. Create simulations (no need to call .run() — ensure() is called internally) -baseline_sim = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, -) -reform_sim = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - policy=reform, -) - -# 4. Run full analysis -analysis = economic_impact_analysis(baseline_sim, reform_sim) -``` - -### UK example - -```python -import datetime -from policyengine.core import Parameter, ParameterValue, Policy, Simulation -from policyengine.tax_benefit_models.uk import ( - economic_impact_analysis, - ensure_datasets, - uk_latest, -) - -datasets = ensure_datasets( - datasets=["hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5"], - years=[2026], - data_folder="./data", -) -dataset = datasets["enhanced_frs_2023_24_2026"] - -param = Parameter( - name="gov.hmrc.income_tax.allowances.personal_allowance.amount", - tax_benefit_model_version=uk_latest, -) -reform = Policy( - name="Zero personal allowance", - parameter_values=[ - ParameterValue( - parameter=param, - start_date=datetime.date(2026, 1, 1), - end_date=datetime.date(2026, 12, 31), - value=0, - ), - ], -) - -baseline_sim = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, -) -reform_sim = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - policy=reform, -) - -analysis = economic_impact_analysis(baseline_sim, reform_sim) -``` - -## What `economic_impact_analysis()` computes - -The function calls `ensure()` on both simulations (run + cache if not already computed), then produces: - -### Decile impacts - -Mean income changes by income decile (1-10), with counts of people better off, worse off, and unchanged. - -```python -for d in analysis.decile_impacts.outputs: - print(f"Decile {d.decile}: avg change={d.absolute_change:+.0f}, " - f"relative={d.relative_change:+.2f}%") -``` - -**Fields on each `DecileImpact`:** -- `decile`: 1-10 -- `baseline_mean`, `reform_mean`: Mean income before and after reform -- `absolute_change`: Mean absolute income change -- `relative_change`: Mean percentage income change -- `count_better_off`, `count_worse_off`, `count_no_change`: Weighted counts - -### Programme/program statistics - -Per-programme totals, changes, and winner/loser counts. - -**US programs analysed:** `income_tax`, `payroll_tax`, `state_income_tax`, `snap`, `tanf`, `ssi`, `social_security`, `medicare`, `medicaid`, `eitc`, `ctc` - -**UK programmes analysed:** `income_tax`, `national_insurance`, `vat`, `council_tax`, `universal_credit`, `child_benefit`, `pension_credit`, `income_support`, `working_tax_credit`, `child_tax_credit` - -```python -for p in analysis.program_statistics.outputs: # US - print(f"{p.program_name}: baseline=${p.baseline_total/1e9:.1f}B, " - f"reform=${p.reform_total/1e9:.1f}B, change=${p.change/1e9:+.1f}B") -``` - -**Fields on each `ProgramStatistics` / `ProgrammeStatistics`:** -- `program_name` / `programme_name`: Variable name -- `baseline_total`, `reform_total`: Weighted sums -- `change`: `reform_total - baseline_total` -- `baseline_count`, `reform_count`: Weighted recipient counts -- `winners`, `losers`: Weighted counts of people gaining/losing - -### Poverty rates - -Poverty headcount and rates for both baseline and reform simulations. - -**US poverty types:** SPM poverty, deep SPM poverty - -**UK poverty types:** Absolute BHC, absolute AHC, relative BHC, relative AHC - -```python -for bp, rp in zip(analysis.baseline_poverty.outputs, - analysis.reform_poverty.outputs): - print(f"{bp.poverty_type}: baseline={bp.rate:.4f}, reform={rp.rate:.4f}") -``` - -### Inequality metrics - -Gini coefficient and income share metrics for both simulations. - -```python -bi = analysis.baseline_inequality -ri = analysis.reform_inequality -print(f"Gini: baseline={bi.gini:.4f}, reform={ri.gini:.4f}") -print(f"Top 10% share: baseline={bi.top_10_share:.4f}, reform={ri.top_10_share:.4f}") -print(f"Top 1% share: baseline={bi.top_1_share:.4f}, reform={ri.top_1_share:.4f}") -print(f"Bottom 50% share: baseline={bi.bottom_50_share:.4f}, reform={ri.bottom_50_share:.4f}") -``` - -## The `PolicyReformAnalysis` return type - -```python -class PolicyReformAnalysis(BaseModel): - decile_impacts: OutputCollection[DecileImpact] - program_statistics: OutputCollection[ProgramStatistics] # US - # programme_statistics: OutputCollection[ProgrammeStatistics] # UK - baseline_poverty: OutputCollection[Poverty] - reform_poverty: OutputCollection[Poverty] - baseline_inequality: Inequality - reform_inequality: Inequality -``` - -Each `OutputCollection` contains: -- `outputs`: List of individual output objects -- `dataframe`: A pandas DataFrame with all results in tabular form - -## Using ChangeAggregate for targeted queries - -When you only need a single metric, `ChangeAggregate` is more direct than the full analysis pipeline. It requires that both simulations have already been run (or ensure'd). - -### Tax revenue change - -```python -from policyengine.outputs.change_aggregate import ChangeAggregate, ChangeAggregateType - -baseline_sim.run() -reform_sim.run() - -revenue = ChangeAggregate( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - variable="household_tax", - aggregate_type=ChangeAggregateType.SUM, -) -revenue.run() -print(f"Revenue change: ${revenue.result / 1e9:.1f}B") -``` - -### Winners and losers - -```python -winners = ChangeAggregate( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - variable="household_net_income", - aggregate_type=ChangeAggregateType.COUNT, - change_geq=1, # Gained at least $1 -) -winners.run() - -losers = ChangeAggregate( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - variable="household_net_income", - aggregate_type=ChangeAggregateType.COUNT, - change_leq=-1, # Lost at least $1 -) -losers.run() -``` - -### Filtering by income decile - -```python -# Average loss in the 3rd income decile -avg_loss = ChangeAggregate( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - variable="household_net_income", - aggregate_type=ChangeAggregateType.MEAN, - filter_variable="household_net_income", - quantile=10, - quantile_eq=3, -) -avg_loss.run() -``` - -### Filter options reference - -**Absolute change filters:** -- `change_geq`: Change >= value (e.g., gain >= 500) -- `change_leq`: Change <= value (e.g., loss <= -500) -- `change_eq`: Change == value - -**Relative change filters:** -- `relative_change_geq`: Relative change >= value (decimal, e.g., 0.05 = 5%) -- `relative_change_leq`: Relative change <= value -- `relative_change_eq`: Relative change == value - -**Variable filters:** -- `filter_variable`: Variable to filter on (from the baseline simulation) -- `filter_variable_eq`, `filter_variable_leq`, `filter_variable_geq`: Comparison operators - -**Quantile filters:** -- `quantile`: Number of quantiles (e.g., 10 for deciles, 5 for quintiles) -- `quantile_eq`: Exact quantile (e.g., 3 for 3rd decile) -- `quantile_leq`: Maximum quantile -- `quantile_geq`: Minimum quantile - -## Examples - -- [UK policy reform analysis](examples.md#uk-policy-reform-analysis): Full reform analysis with ChangeAggregate and visualisation -- [US budgetary impact](examples.md#us-budgetary-impact): Budgetary impact comparing both approaches diff --git a/docs/examples.md b/docs/examples.md index b7b4e91a..147a7d0c 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,67 +1,65 @@ -# Examples +--- +title: "Examples" +--- -Complete working scripts demonstrating common workflows. Each script can be run directly with `python examples/.py`. +Complete runnable scripts in `examples/` — each demonstrates one workflow end-to-end. Run with `python examples/.py`. -## US budgetary impact +## US -The canonical workflow for comparing a baseline and reform simulation, using both `economic_impact_analysis()` and `ChangeAggregate`. +### Budget impact of a reform -```{literalinclude} ../examples/us_budgetary_impact.py -:language: python +```{.python include="../examples/us_budgetary_impact.py"} ``` -## UK policy reform analysis +### Income distribution over microdata -Applying parametric reforms, comparing baseline and reform with `ChangeAggregate`, analysing winners and losers by income decile, and visualising results with Plotly. - -```{literalinclude} ../examples/policy_change_uk.py -:language: python +```{.python include="../examples/income_distribution_us.py"} ``` -## UK income bands - -Calculating net income and tax by income decile using representative microdata and `Aggregate` with quantile filters. +### Household impact curve -```{literalinclude} ../examples/income_bands_uk.py -:language: python +```{.python include="../examples/household_impact_example.py"} ``` -## US income distribution +### Employment-income variation -Loading enhanced CPS microdata, running a full microsimulation, and calculating statistics within income deciles. +```{.python include="../examples/employment_income_variation_us.py"} +``` -```{literalinclude} ../examples/income_distribution_us.py -:language: python +### Full microsimulation speedtest + +```{.python include="../examples/speedtest_us_simulation.py"} ``` -## UK employment income variation +## UK -Creating a custom dataset with varied employment income, running a single simulation, and visualising benefit phase-outs. +### Reform with decile impact -```{literalinclude} ../examples/employment_income_variation_uk.py -:language: python +```{.python include="../examples/policy_change_uk.py"} ``` -## US employment income variation - -Same approach as the UK version, varying employment income from $0 to $200k and plotting household net income. +### Income bands analysis -```{literalinclude} ../examples/employment_income_variation_us.py -:language: python +```{.python include="../examples/income_bands_uk.py"} ``` -## Household impact calculation +### Employment-income variation -Using `calculate_household_impact()` to compute taxes and benefits for individual custom households (both UK and US). +```{.python include="../examples/employment_income_variation_uk.py"} +``` + +### Paper reproduction -```{literalinclude} ../examples/household_impact_example.py -:language: python +```{.python include="../examples/paper_repro_uk.py"} ``` -## Simulation performance +## Writing your own -Benchmarking how `simulation.run()` scales with dataset size. +Patterns worth following: -```{literalinclude} ../examples/speedtest_us_simulation.py -:language: python -``` +- Always pass `year` explicitly — don't rely on defaults +- Construct the baseline `Simulation` once; build reforms on top rather than recomputing +- Save the `.manifest.json` alongside your results for reproducibility +- Use typed outputs (`Aggregate`, `Poverty`, etc.) rather than ad-hoc `.calculate` calls — the outputs handle edge cases like missing weights + +More patterns in [Outputs](outputs.md) and [Impact analysis](impact-analysis.md). diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 00000000..d56d1a75 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,78 @@ +--- +title: "Getting started" +--- + +## Install + +```bash +pip install policyengine +``` + +By default `policyengine` does not bundle country models — install each country's rules alongside: + +```bash +pip install policyengine policyengine-us # US only +pip install policyengine policyengine-uk # UK only +pip install policyengine policyengine-us policyengine-uk # both +``` + +Country modules (`pe.us`, `pe.uk`) are only importable if the matching country package is installed. + +## Compute one household + +```python +import policyengine as pe + +result = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60_000}], + tax_unit={"filing_status": "SINGLE"}, + household={"state_code": "CA"}, + year=2026, +) + +result.tax_unit.income_tax +result.tax_unit.eitc +result.household.household_net_income +``` + +Each `.*` lookup is a regular Python scalar. The result object is typed; IDEs and type-checkers autocomplete attribute names from the country model's variable catalog. + +## Apply a reform + +```python +reformed = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60_000}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1_000}, +) +``` + +Reforms are parameter-path → value dicts. For time-varying reforms pass a dict of effective-date strings instead of a scalar: + +```python +reform = { + "gov.irs.credits.ctc.amount.adult_dependent": { + "2026-01-01": 1_000, + "2028-01-01": 2_000, + }, +} +``` + +See [Reforms](reforms.md) for structural changes and multi-year reforms. + +## Scale up + +A single-household calculator is convenient for policy-walkthroughs and tests. For population estimates of budget cost, distributional impact, and poverty effects, move to [Microsimulation](microsim.md). The API is parallel — `pe.us.calculate_household` and `pe.us.Simulation` accept the same reform dict, so your hypothesis code carries over. + +## What you get back + +Every calculation returns a typed result object with sections per entity — `person`, `tax_unit`, `spm_unit`, `household`, `family` for the US; `person`, `benunit`, `household` for the UK. Indexing the person list (`result.person[0]`) returns a row for that person. Group-entity lookups (`result.tax_unit`, `result.household`) return the single group the household is organized into. + +Every variable defined on the country model is available as an attribute. If you ask for one that doesn't exist, you get an error with the closest available suggestion — no silent zero returns. + +## Next + +- [Households](households.md) — full reference for `calculate_household` +- [Reforms](reforms.md) — parametric and structural reforms +- [Microsimulation](microsim.md) — population-level analysis diff --git a/docs/households.md b/docs/households.md new file mode 100644 index 00000000..4fe3daa4 --- /dev/null +++ b/docs/households.md @@ -0,0 +1,124 @@ +--- +title: "Households" +--- + +`pe.us.calculate_household` and `pe.uk.calculate_household` compute every variable in the country model for a single household. Same keyword arguments, different entity structures. + +## US + +```python +result = pe.us.calculate_household( + people=[ + {"age": 35, "employment_income": 40_000}, + {"age": 33}, + {"age": 8}, + {"age": 5}, + ], + tax_unit={"filing_status": "JOINT"}, + household={"state_code": "TX"}, + year=2026, +) +``` + +### Entities + +| Argument | Required | Purpose | +|---|---|---| +| `people` | Yes | List of person dicts. Keys are any person-level variable on the model. | +| `tax_unit` | One of the per-household-level keys | Tax-unit-level inputs (e.g. `filing_status`). | +| `spm_unit` | Optional | SPM-unit inputs. | +| `household` | Usually required | Household-level inputs. `state_code` is essentially always needed for US. | +| `family` | Optional | Family-level inputs. | +| `marital_unit` | Optional | Marital-unit inputs. | + +If you pass multiple adults, PolicyEngine assigns them to one tax unit and one household by default. For separate tax units, use `pe.Simulation` directly and set the entity-membership arrays. + +## UK + +```python +result = pe.uk.calculate_household( + people=[ + {"age": 35, "employment_income": 50_000}, + {"age": 33, "employment_income": 30_000}, + {"age": 4}, + ], + benunit={}, + household={}, + year=2026, +) +``` + +| Argument | Purpose | +|---|---| +| `people` | Person-level inputs. | +| `benunit` | Benefit unit (equivalent to UC claim). | +| `household` | Household-level inputs. | + +## Reforms + +Pass a `reform` dict of parameter-path to value: + +```python +pe.us.calculate_household( + ..., + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1_000}, +) +``` + +For values effective on specific dates, use a nested dict: + +```python +reform = { + "gov.irs.credits.ctc.amount.adult_dependent": { + "2026-01-01": 1_000, + "2028-01-01": 2_000, + }, +} +``` + +Structural reforms (subclassing the model) are covered in [Reforms](reforms.md). + +## Year + +```python +pe.us.calculate_household(..., year=2026) +``` + +The year determines which parameter values apply. For year arithmetic (e.g. phase-ins), pass a `reform` with dated values rather than calling the function once per year. + +## Extra variables + +By default the result exposes every variable in the model. If your calculator-level output should contain variables that aren't in the default catalog, request them: + +```python +result = pe.us.calculate_household( + ..., + extra_variables=["medicaid_income_level", "spm_unit_spm_threshold"], +) +``` + +## Accessing the result + +```python +result.person[0].income_tax # scalar for first person +result.person[2].age # scalar for third person (the 8-year-old) +result.tax_unit.income_tax # scalar (one tax unit) +result.household.household_net_income # scalar +``` + +The result is a Pydantic model — `.model_dump()` gives you a dict, and individual entity sections are regular attribute lookups. + +## When not to use this + +- Runs over many households in a loop will be much slower than one `Simulation` call. See [Microsimulation](microsim.md). +- If your input data lives in a DataFrame or file, the microsim path is cleaner — `calculate_household` is optimized for per-household construction from Python literals. + +## Errors + +Unknown variables raise with suggestions: + +``` +ValueError: Unknown variable 'income_ax'. Did you mean 'income_tax'? +``` + +Unknown parameters in reforms raise similarly. The catalog is enumerated at construction time — typos fail fast. diff --git a/docs/impact-analysis.md b/docs/impact-analysis.md new file mode 100644 index 00000000..a2a853e6 --- /dev/null +++ b/docs/impact-analysis.md @@ -0,0 +1,75 @@ +--- +title: "Impact analysis" +--- + +`economic_impact_analysis` runs a baseline-vs-reform comparison and returns a bundle of standard outputs — budget cost, poverty change, distributional impact, inequality — in one call. + +## One-liner + +```python +from policyengine.us import economic_impact_analysis + +impact = economic_impact_analysis( + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1_000}, + year=2026, +) + +impact.budget.total_change +impact.poverty.rate_change +impact.deciles.mean_change_by_decile +impact.inequality.gini +``` + +The UK equivalent is `from policyengine.uk import economic_impact_analysis`. + +## What it computes + +Each call produces: + +| Section | Content | +|---|---| +| `budget` | Total budget cost (`household_net_income` sum change) | +| `poverty` | SPM poverty rate before/after (US) or AHC rate (UK), plus demographic breakdowns | +| `deep_poverty` | Same as above for half-of-poverty-threshold (US only) | +| `deciles` | Mean net-income change by income decile; winners-vs-losers | +| `intra_deciles` | Distribution of impact within each decile | +| `inequality` | Gini and top-income shares | + +All sections compute against the same baseline and reform simulations, so results are internally consistent. + +## Under the hood + +`economic_impact_analysis` is a thin wrapper around the individual output classes — same as composing them manually: + +```python +baseline = pe.Simulation(country="us", dataset=DEFAULT_US_DATASET, year=2026) +reformed = pe.Simulation(country="us", dataset=DEFAULT_US_DATASET, year=2026, reform=REFORM) + +budget = ChangeAggregate("household_net_income", ChangeAggregateType.DIFFERENCE).compute(baseline, reformed) +poverty = Poverty(...).compute(baseline, reformed) +# ... +``` + +If you need a subset of outputs or want to cache the baseline across multiple reform scenarios, compose directly rather than calling `economic_impact_analysis` repeatedly. + +## Passing your own data + +By default, `economic_impact_analysis` uses the pinned default dataset for each country. For custom datasets: + +```python +impact = economic_impact_analysis( + reform=REFORM, + year=2026, + dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2023.h5", +) +``` + +## Non-parametric reforms + +For structural reforms, construct the simulations yourself and pass them to the outputs directly. `economic_impact_analysis` only accepts parametric reform dicts. + +## Next + +- [Outputs](outputs.md) — catalog of individual output classes +- [Regions](regions.md) — state/constituency-level impact breakdowns +- [Examples](examples.md) — full runnable scripts using this helper diff --git a/docs/index.md b/docs/index.md index bbd88974..4ea52bd6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,20 +1,48 @@ -# policyengine.py - -This package aims to simplify and productionise the use of PolicyEngine's tax-benefit microsimulation models to flexibly produce useful information at scale, slotting into existing analysis pipelines while also standardising analysis. - -We do this by: -* Standardising around a set of core types that let us do policy analysis in an object-oriented way -* Exemplifying this behaviour by using this package in all PolicyEngine's production applications, and analyses - -## Documentation - -- [Core concepts](core-concepts.md): Architecture, datasets, simulations, policies, outputs, entity mapping -- [Economic impact analysis](economic-impact-analysis.md): Full baseline-vs-reform comparison workflow -- [Advanced outputs](advanced-outputs.md): DecileImpact, Poverty, Inequality, IntraDecileImpact -- [Regions and scoping](regions-and-scoping.md): Sub-national analysis (states, constituencies, districts) -- [UK tax-benefit model](country-models-uk.md): Entities, parameters, reform examples -- [US tax-benefit model](country-models-us.md): Entities, parameters, reform examples -- [Examples](examples.md): Complete working scripts -- [Visualisation](visualisation.md): Publication-ready charts with Plotly -- [Release bundles](release-bundles.md): Reproducible model-plus-data certification and provenance -- [Development](dev.md): Setup, testing, CI, architecture +--- +title: "PolicyEngine" +subtitle: "Tax-benefit microsimulation for Python" +--- + +Compute household taxes and benefits, simulate reforms, and measure distributional impact — across the US and UK — from a single Python package. + +## Install + +```bash +pip install policyengine +``` + +## Minimal example + +```python +import policyengine as pe + +result = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60_000}], + tax_unit={"filing_status": "SINGLE"}, + household={"state_code": "CA"}, + year=2026, +) +print(result.household.household_net_income) +``` + +## Where to go + +| If you want to… | Start here | +|---|---| +| Compute taxes and benefits for one household | [Households](households.md) | +| Simulate a policy change | [Reforms](reforms.md) | +| Run a population microsimulation | [Microsimulation](microsim.md) | +| Measure a reform's distributional impact | [Impact analysis](impact-analysis.md) | +| See every output type | [Outputs](outputs.md) | +| Look up a variable | Reference (auto-generated catalog, pending) | +| Contribute | [Development](dev.md) | + +## What PolicyEngine is + +A platform that encodes the tax and benefit rules of a country as Python formulas and YAML parameters, runs them over microdata or single households, and exposes the results through a small set of typed outputs. The country rules live in country-specific packages (`policyengine-us`, `policyengine-uk`); this package wraps them in one API. + +Under the hood PolicyEngine combines the rules with calibrated microdata — the enhanced CPS for the US, the enhanced FRS for the UK — and returns weighted population estimates that match administrative totals. + +## Citation + +Woodruff and Ghenis (2024), *Enhancing Survey Microdata with Administrative Records: A Novel Approach to Microsimulation Dataset Construction*. diff --git a/docs/microsim.md b/docs/microsim.md new file mode 100644 index 00000000..255b4485 --- /dev/null +++ b/docs/microsim.md @@ -0,0 +1,129 @@ +--- +title: "Microsimulation" +--- + +For population-level estimates — budget cost, winners and losers, poverty impact — run a microsimulation over calibrated microdata. + +## Quick example + +```python +import policyengine as pe +from policyengine.outputs.aggregate import Aggregate, AggregateType + +pe.us.ensure_datasets( + datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], + years=[2026], +) + +baseline = pe.Simulation( + country="us", + dataset="hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", + year=2026, +) + +total_snap = Aggregate( + variable="snap", + type=AggregateType.SUM, +).compute(baseline) +``` + +## Datasets + +Microdata is stored as HDF5 files on Hugging Face. Install once to download and cache: + +```python +pe.us.ensure_datasets( + datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], + years=[2024, 2026], +) +``` + +The default US dataset is **Enhanced CPS 2024** — CPS ASEC with IRS SOI tax return records imputed in and calibration weights tuned to match IRS, CMS, SNAP, and other administrative totals. The UK default is **Enhanced FRS** — Family Resources Survey with tax-return microdata fused in and calibration to HMRC and DWP totals. + +List all available datasets: + +```python +pe.us.load_datasets() # or pe.uk.load_datasets() +``` + +## Simulations + +A `Simulation` takes a country, a dataset, a year, and an optional reform: + +```python +baseline = pe.Simulation( + country="us", + dataset="hf://.../enhanced_cps_2024.h5", + year=2026, +) + +reformed = pe.Simulation( + country="us", + dataset="hf://.../enhanced_cps_2024.h5", + year=2026, + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1_000}, +) +``` + +Each simulation wraps a PolicyEngine country model plus the dataset plus the weight vector. + +## Outputs + +Outputs are callables that consume a `Simulation` and return a typed result. They cover single-value aggregates, cross-sectional distributions, and geographic breakdowns. See [Outputs](outputs.md). + +```python +from policyengine.outputs import ( + Aggregate, AggregateType, + ChangeAggregate, ChangeAggregateType, + DecileImpact, + Poverty, + Inequality, +) + +# Cost of the SNAP program +snap_cost = Aggregate(variable="snap", type=AggregateType.SUM).compute(baseline) + +# Reform budget impact +budget = ChangeAggregate( + variable="household_net_income", + type=ChangeAggregateType.DIFFERENCE, +).compute(baseline, reformed) +``` + +## Memory and performance + +A full Enhanced CPS microsimulation uses ~4 GB of memory and takes ~15–30 seconds on a laptop. For repeated runs with different reforms, reuse the baseline `Simulation` and construct the reform-only instance on top. + +Downsampled datasets are available for testing: + +```python +pe.us.ensure_datasets( + datasets=["hf://policyengine/policyengine-us-data/cps_small_2024.h5"], + years=[2026], +) +``` + +These run in seconds and are fine for integration tests. Don't use them for production analysis — the weights are not calibration-tuned. + +## Managed microsimulation + +If you're orchestrating many reforms, the `managed_microsimulation` context handles dataset prep, cache reuse, and teardown: + +```python +from policyengine.us import managed_microsimulation + +with managed_microsimulation(year=2026) as sim: + baseline = Aggregate("snap", AggregateType.SUM).compute(sim) +``` + +## Pinned model versions + +Every release of `policyengine` pins a specific version of each country model, so results are reproducible. `pe.us.model` and `pe.uk.model` expose the pinned `TaxBenefitModelVersion`. + +If the installed country package version doesn't match the pinned manifest, `managed_microsimulation` raises a warning with the version gap. For strict reproducibility, pin the country packages to the same versions the `policyengine` release was built against — see [Provenance](release-bundles.md). + +## Next + +- [Outputs](outputs.md) — catalog of typed output classes +- [Impact analysis](impact-analysis.md) — baseline-vs-reform in one call +- [Regions](regions.md) — sub-national analysis (states, constituencies, districts) diff --git a/docs/myst.yml b/docs/myst.yml deleted file mode 100644 index 6924ef21..00000000 --- a/docs/myst.yml +++ /dev/null @@ -1,27 +0,0 @@ -# See docs at: https://mystmd.org/guide/frontmatter -version: 1 -project: - id: b70ccb02-12b9-4bdb-a25b-f44bf2213d98 - # title: - # description: - # keywords: [] - # authors: [] - github: https://github.com/PolicyEngine/policyengine.py - toc: - - file: index.md - - file: core-concepts.md - - file: economic-impact-analysis.md - - file: advanced-outputs.md - - file: regions-and-scoping.md - - file: country-models-uk.md - - file: country-models-us.md - - file: examples.md - - file: visualisation.md - - file: release-bundles.md - - file: dev.md - -site: - template: book-theme - # options: - # favicon: favicon.ico - # logo: site_logo.png diff --git a/docs/outputs.md b/docs/outputs.md new file mode 100644 index 00000000..5d75ea7b --- /dev/null +++ b/docs/outputs.md @@ -0,0 +1,156 @@ +--- +title: "Outputs" +--- + +Outputs are callables that consume a `Simulation` (or baseline + reform pair) and return a typed result. Every page uses the same pattern: construct the output with the variables you want, call `.compute(sim)` or `.compute(baseline, reformed)`. + +## Aggregate + +Single-number summaries over the population. + +```python +from policyengine.outputs import Aggregate, AggregateType + +cost = Aggregate(variable="snap", type=AggregateType.SUM).compute(baseline) +average = Aggregate(variable="household_net_income", type=AggregateType.MEAN).compute(baseline) +``` + +`AggregateType` options: `SUM`, `MEAN`, `MEDIAN`, `COUNT_POSITIVE`, `COUNT`, plus quantile types. + +### Filtering + +Apply a pandas-style filter to the population before aggregating: + +```python +Aggregate( + variable="household_net_income", + type=AggregateType.MEAN, + filter="household_size >= 4", +).compute(baseline) +``` + +## ChangeAggregate + +Difference or percent change between a baseline and a reform. + +```python +from policyengine.outputs import ChangeAggregate, ChangeAggregateType + +impact = ChangeAggregate( + variable="household_net_income", + type=ChangeAggregateType.DIFFERENCE, +).compute(baseline, reformed) +``` + +`ChangeAggregateType` options: `DIFFERENCE`, `PERCENT_CHANGE`, `RELATIVE_CHANGE`. + +## DecileImpact + +Average net-income change by income decile, and winners-vs-losers counts. + +```python +from policyengine.outputs import DecileImpact + +impact = DecileImpact().compute(baseline, reformed) + +impact.mean_change_by_decile # dict {1: -50, 2: 120, ...} +impact.winners_losers_by_decile # dict {1: {"winners": 0.1, "losers": 0.3, "neutral": 0.6}, ...} +``` + +Defaults to household-level equivalized net income. Pass `income_variable=` to override. + +## IntraDecileImpact + +Distribution of household-level impact within each income decile — not just mean, but how much spread. + +```python +from policyengine.outputs import IntraDecileImpact + +spread = IntraDecileImpact().compute(baseline, reformed) +``` + +## Poverty + +Poverty rate before and after a reform, by demographic group. + +```python +from policyengine.outputs import Poverty, AGE_GROUPS, RACE_GROUPS + +rates = Poverty( + income_variable="spm_unit_net_income", + poverty_measure="spm", + groups=AGE_GROUPS + RACE_GROUPS, +).compute(baseline, reformed) +``` + +US defaults cover SPM; UK defaults cover AHC and BHC. Deep poverty is available with `measure="deep_spm"` (US). + +## Inequality + +Gini and top income shares. + +```python +from policyengine.outputs import Inequality, USInequalityPreset + +result = Inequality(preset=USInequalityPreset.SPM).compute(baseline, reformed) + +result.gini # {'baseline': 0.48, 'reformed': 0.47} +result.top_ten_share # before/after +result.top_one_share +result.top_tenth_of_one_share +``` + +## Geographic breakdowns + +### CongressionalDistrictImpact (US) + +```python +from policyengine.outputs import CongressionalDistrictImpact + +impacts = CongressionalDistrictImpact().compute(baseline, reformed) +# Per-district winners/losers, cost, poverty change +``` + +### ConstituencyImpact (UK) / LocalAuthorityImpact (UK) + +```python +from policyengine.outputs import ConstituencyImpact, LocalAuthorityImpact + +constituency = ConstituencyImpact().compute(baseline, reformed) +la = LocalAuthorityImpact().compute(baseline, reformed) +``` + +## ProgramStatistics + +Program-level counts and dollar amounts — who enrolls, how much they receive. + +```python +from policyengine.outputs import ProgramStatistics + +stats = ProgramStatistics(program="snap").compute(baseline) + +stats.total_households +stats.total_enrolled +stats.total_cost +stats.mean_benefit +``` + +## Combining outputs + +Every output stores a `to_dict()` representation. Collect them into a dashboard via a collection: + +```python +from policyengine.core import OutputCollection + +dashboard = OutputCollection( + cost=ChangeAggregate("snap", ChangeAggregateType.DIFFERENCE), + poverty=Poverty(income_variable="spm_unit_net_income"), + deciles=DecileImpact(), +).compute(baseline, reformed) +``` + +The collection dispatches to each output and returns a dict keyed by the names you assign. + +## Writing your own output + +Subclass `Output` or `ChangeOutput`. See `src/policyengine/outputs/aggregate.py` for the simplest reference implementation. diff --git a/docs/reforms.md b/docs/reforms.md new file mode 100644 index 00000000..e8043c53 --- /dev/null +++ b/docs/reforms.md @@ -0,0 +1,111 @@ +--- +title: "Reforms" +--- + +A reform is a change to the rules used in a calculation. PolicyEngine supports two kinds: **parametric** (adjust a parameter value) and **structural** (swap or subclass a rule formula). + +## Parametric reforms + +A dict of parameter path → new value. The same shape works for `calculate_household`, `Simulation`, and the output helpers. + +```python +reform = { + "gov.irs.credits.ctc.amount.adult_dependent": 1_000, +} + +pe.us.calculate_household(..., reform=reform) +``` + +Scalar values are treated as effective on January 1 of the simulation year and onward. + +### Time-varying + +```python +reform = { + "gov.irs.credits.ctc.amount.adult_dependent": { + "2026-01-01": 1_000, + "2028-01-01": 2_000, + }, + "gov.irs.credits.eitc.phase_out.rate[0]": { + "2026-01-01": 0.08, + }, +} +``` + +Dates that haven't been passed yet become "from this date onward." Earlier dates replace the baseline schedule. + +### Multiple changes + +Any number of parameter paths in the same dict compose into one reform: + +```python +reform = { + "gov.irs.credits.ctc.amount.adult_dependent": 1_000, + "gov.irs.credits.eitc.phase_out.rate[0]": 0.08, + "gov.states.ca.tax.income.credits.eitc.max_amount": 500, +} +``` + +### Where parameters live + +Every parameter has a canonical path that matches the YAML directory structure in the country model. `gov.irs.credits.ctc.amount.adult_dependent` corresponds to `policyengine_us/parameters/gov/irs/credits/ctc/amount/adult_dependent.yaml`. + +An auto-generated parameter reference is pending; for now, browse the YAML tree in the country model repository (e.g. `policyengine-us/policyengine_us/parameters/`), or type-error your way there — an unknown path raises with suggestions. + +### Scale and array parameters + +Scale parameters (brackets with thresholds and amounts) are addressed by bracket index: + +```python +reform = { + "gov.irs.income.tax.rate[0]": 0.12, # first bracket rate + "gov.irs.income.tax.threshold[1]": 50_000, # second bracket threshold +} +``` + +## Structural reforms + +For rule changes that can't be expressed as a parameter change — swapping one formula for another, adding a variable, removing a program — subclass the country model: + +```python +from policyengine.tax_benefit_models.us import PolicyEngineUS, us_latest + + +class MyReform(PolicyEngineUS): + version = us_latest.version + + def __init__(self): + super().__init__() + self.neutralize_variable("eitc") +``` + +Pass the reformed model to `Simulation`: + +```python +sim = pe.Simulation(model=MyReform(), year=2026) +``` + +`calculate_household` does not yet accept structural reforms directly — use `Simulation` or the country-specific `managed_microsimulation` context. + +## Combining parametric and structural + +Pass a parametric reform to the structural-reform constructor: + +```python +sim = pe.Simulation( + model=MyReform(), + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1_000}, + year=2026, +) +``` + +## Validating a reform before you run it + +The parameter catalog is known at import time. If a path is wrong, the call raises *before* starting the simulation with a suggested path. + +For time-varying reforms, the effective dates are checked against the parameter's defined start and end. A date before the parameter started or after a defined end date raises. + +## Reform worked examples + +- [Economic impact analysis](impact-analysis.md) — full baseline-vs-reform workflow with population estimates. +- [Examples](examples.md) — runnable scripts for reform scenarios in `examples/`. diff --git a/docs/regions-and-scoping.md b/docs/regions-and-scoping.md deleted file mode 100644 index 9be4ddbc..00000000 --- a/docs/regions-and-scoping.md +++ /dev/null @@ -1,251 +0,0 @@ -# Regions and scoping - -The package supports sub-national analysis through a geographic region system. Regions can scope simulations to states, constituencies, congressional districts, local authorities, and cities. - -## Region system - -### Region - -A `Region` represents a geographic area with a unique prefixed code: - -| Region type | Code format | Examples | -|---|---|---| -| National | `us`, `uk` | `us`, `uk` | -| State | `state/{code}` | `state/ca`, `state/ny` | -| Congressional district | `congressional_district/{ST-DD}` | `congressional_district/CA-01` | -| Place/city | `place/{ST-FIPS}` | `place/NJ-57000` | -| UK country | `country/{name}` | `country/england` | -| Constituency | `constituency/{name}` | `constituency/Sheffield Central` | -| Local authority | `local_authority/{code}` | `local_authority/E09000001` | - -### RegionRegistry - -Each model version has a `RegionRegistry` providing O(1) lookups: - -```python -from policyengine.tax_benefit_models.us import us_latest - -registry = us_latest.region_registry - -# Look up by code -california = registry.get("state/ca") -print(f"{california.label}: {california.region_type}") - -# Get all regions of a type -states = registry.get_by_type("state") -print(f"{len(states)} states") - -districts = registry.get_by_type("congressional_district") -print(f"{len(districts)} congressional districts") - -# Get children of a region -ca_districts = registry.get_children("state/ca") -``` - -```python -from policyengine.tax_benefit_models.uk import uk_latest - -registry = uk_latest.region_registry - -# UK countries -countries = registry.get_by_type("country") -for c in countries: - print(f"{c.code}: {c.label}") -``` - -### Region counts - -**US:** 1 national + 51 states (inc. DC) + 436 congressional districts + 333 census places = 821 regions - -**UK:** 1 national + 4 countries. Constituencies and local authorities are available via extended registry builders. - -## Scoping strategies - -Scoping strategies control how a national dataset is narrowed to represent a sub-national region. They are applied during `Simulation.run()`, before the microsimulation calculation. - -### RowFilterStrategy - -Filters dataset rows where a household-level variable matches a specific value. Used for UK countries and US places/cities. - -```python -from policyengine.core import Simulation -from policyengine.core.scoping_strategy import RowFilterStrategy - -# Simulate only California households -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - scoping_strategy=RowFilterStrategy( - variable_name="state_code", - variable_value="CA", - ), -) -simulation.run() -``` - -This removes all non-California households from the dataset before running the simulation. The remaining household weights still reflect California's population. - -```python -# UK: simulate only England -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - scoping_strategy=RowFilterStrategy( - variable_name="country", - variable_value="ENGLAND", - ), -) -``` - -### WeightReplacementStrategy - -Replaces household weights from a pre-computed weight matrix stored in Google Cloud Storage. Used for UK constituencies and local authorities, where the weight matrix (shape: N_regions x N_households) reweights all households to represent each region's demographics. - -```python -from policyengine.core.scoping_strategy import WeightReplacementStrategy - -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - scoping_strategy=WeightReplacementStrategy( - weight_matrix_bucket="policyengine-uk-data", - weight_matrix_key="parliamentary_constituency_weights.h5", - lookup_csv_bucket="policyengine-uk-data", - lookup_csv_key="constituencies_2024.csv", - region_code="Sheffield Central", - ), -) -``` - -Unlike row filtering, weight replacement keeps all households but assigns region-specific weights. This is more statistically robust for small geographic areas where filtering would leave too few households. - -### Legacy filter fields - -For backward compatibility, `Simulation` also accepts `filter_field` and `filter_value` parameters, which are auto-converted to a `RowFilterStrategy`: - -```python -# These two are equivalent: -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - filter_field="state_code", - filter_value="CA", -) - -simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - scoping_strategy=RowFilterStrategy( - variable_name="state_code", - variable_value="CA", - ), -) -``` - -## Geographic impact outputs - -The package provides output types that compute per-region metrics across all regions simultaneously. - -### CongressionalDistrictImpact (US) - -Groups households by `congressional_district_geoid` and computes weighted average and relative income changes per district. - -```python -from policyengine.outputs.congressional_district_impact import ( - compute_us_congressional_district_impacts, -) - -baseline_sim.run() -reform_sim.run() - -impact = compute_us_congressional_district_impacts(baseline_sim, reform_sim) - -for d in impact.district_results: - print(f"District {d['state_fips']:02d}-{d['district_number']:02d}: " - f"avg change=${d['average_household_income_change']:+,.0f}, " - f"relative={d['relative_household_income_change']:+.2%}") -``` - -**Result fields per district:** -- `district_geoid`: Integer SSDD (state FIPS * 100 + district number) -- `state_fips`: State FIPS code -- `district_number`: District number within state -- `average_household_income_change`: Weighted mean change -- `relative_household_income_change`: Weighted relative change -- `population`: Weighted household count - -### ConstituencyImpact (UK) - -Uses pre-computed weight matrices (650 x N_households) to compute per-constituency income changes without filtering. - -```python -from policyengine.outputs.constituency_impact import ( - compute_uk_constituency_impacts, -) - -impact = compute_uk_constituency_impacts( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - weight_matrix_path="parliamentary_constituency_weights.h5", - constituency_csv_path="constituencies_2024.csv", - year="2025", -) - -for c in impact.constituency_results: - print(f"{c['constituency_name']}: " - f"avg change={c['average_household_income_change']:+,.0f}") -``` - -**Result fields per constituency:** -- `constituency_code`, `constituency_name`: Identifiers -- `x`, `y`: Hex map coordinates -- `average_household_income_change`, `relative_household_income_change` -- `population`: Weighted household count - -### LocalAuthorityImpact (UK) - -Works identically to `ConstituencyImpact` but for local authorities (360 x N_households weight matrix). - -```python -from policyengine.outputs.local_authority_impact import ( - compute_uk_local_authority_impacts, -) - -impact = compute_uk_local_authority_impacts( - baseline_simulation=baseline_sim, - reform_simulation=reform_sim, - weight_matrix_path="local_authority_weights.h5", - local_authority_csv_path="local_authorities_2024.csv", - year="2025", -) -``` - -## Using regions with `economic_impact_analysis()` - -Scoping strategies compose naturally with the full analysis pipeline: - -```python -from policyengine.core.scoping_strategy import RowFilterStrategy - -# State-level analysis -baseline_sim = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - scoping_strategy=RowFilterStrategy( - variable_name="state_code", - variable_value="CA", - ), -) -reform_sim = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - policy=reform, - scoping_strategy=RowFilterStrategy( - variable_name="state_code", - variable_value="CA", - ), -) - -# Full analysis scoped to California -analysis = economic_impact_analysis(baseline_sim, reform_sim) -``` diff --git a/docs/regions.md b/docs/regions.md new file mode 100644 index 00000000..5fce4521 --- /dev/null +++ b/docs/regions.md @@ -0,0 +1,85 @@ +--- +title: "Regional analysis" +--- + +Sub-national impact breakdowns using geographically-stratified microdata and the `...Impact` output classes. + +## US states + +Every US dataset includes `state_code` on each household. Use `Aggregate` or `ChangeAggregate` with a filter: + +```python +ca_cost = Aggregate( + variable="snap", + type=AggregateType.SUM, + filter="state_code == 'CA'", +).compute(baseline) +``` + +For all-states-at-once, use `StateImpact`: + +```python +from policyengine.outputs import StateImpact + +state_impact = StateImpact().compute(baseline, reformed) +# Dict keyed by two-letter state code +``` + +## US congressional districts + +```python +from policyengine.outputs import CongressionalDistrictImpact + +impacts = CongressionalDistrictImpact().compute(baseline, reformed) + +# Keyed by district ID (e.g. "CA-12") +for district_id, result in impacts.items(): + print(district_id, result.winners_share, result.mean_impact) +``` + +Requires a district-stratified dataset. The default Enhanced CPS includes district assignments calibrated against district-level ACS population and income distributions. + +## UK parliamentary constituencies + +```python +from policyengine.outputs import ConstituencyImpact + +impacts = ConstituencyImpact().compute(baseline, reformed) +``` + +Constituency codes follow ONS nomenclature. Requires the constituency-stratified FRS dataset. + +## UK local authorities + +```python +from policyengine.outputs import LocalAuthorityImpact + +impacts = LocalAuthorityImpact().compute(baseline, reformed) +``` + +## Custom geographies + +If you have a geography not covered by the built-in impact classes, compute the underlying variables via `Simulation.calculate` and group them yourself: + +```python +households = baseline.calculate("household_net_income").values +reform_households = reformed.calculate("household_net_income").values +geography = baseline.calculate("custom_geography_id").values + +import pandas as pd +df = pd.DataFrame({ + "baseline": households, + "reformed": reform_households, + "geo": geography, +}) +df.groupby("geo")[["baseline", "reformed"]].mean() +``` + +## Data availability + +Not every country has sub-national strata in every dataset. Check `Dataset.geo_fields` for what a given dataset supports: + +```python +dataset = pe.us.load_datasets()[0] +dataset.geo_fields # ["state_code", "congressional_district"] +``` diff --git a/docs/release-bundles.md b/docs/release-bundles.md index ea014c9d..d7225f50 100644 --- a/docs/release-bundles.md +++ b/docs/release-bundles.md @@ -1,449 +1,89 @@ -# Release Bundles +--- +title: "Provenance and release bundles" +--- -This document defines the intended reproducibility boundary for `policyengine.py`. +Every analysis in PolicyEngine is reproducible to a specific bundle of (package version, country model version, dataset version, calibration state). The `provenance` module formalizes this. -The key design decision is: +## The bundle -- country `*-data` repos build and stage immutable data artifacts -- `policyengine.py` is the only component that certifies supported runtime bundles -- `policyengine.py` does not rebuild country data itself +Each `policyengine` release pins: -This keeps country-specific data construction in the country data repos while still giving users a single top-level version to cite and pin. +- The policyengine-core version +- The country model versions (`policyengine-us`, `policyengine-uk`) +- The country-data versions (`policyengine-us-data`, `policyengine-uk-data`) +- Dataset hash (content-addressed — the hashed Enhanced CPS file is a bundle ID) +- Calibration vector IDs -## Why this boundary exists +Together these define a **data release manifest** — a published, immutable record of "running this code against this data produces these numbers." -For countries like the UK, the data package is not model-independent. Dataset construction, imputations, and calibration steps call the country model directly. That means a published dataset artifact depends on: +## Checking your bundle -- the country model version used during data construction -- the calibration targets used during data construction -- the raw input data used during data construction - -If `policyengine.py` only pins a country model version and a data package version without checking that relationship, the provenance boundary is incomplete. - -## Roles - -### Country model package - -Examples: `policyengine-uk`, `policyengine-us` - -The country model package owns: - -- policy logic -- variables and parameters -- reforms -- a `data_build_fingerprint` for the subset of model logic that affects data construction - -It does not own final runtime bundle certification. - -### Country data package - -Examples: `policyengine-uk-data`, `policyengine-us-data` - -The country data package owns: - -- data build pipelines -- raw input acquisition -- calibration target snapshots -- expensive dataset construction -- staging immutable build artifacts with provenance - -It does not define the final supported runtime bundle exposed to users. - -### `policyengine.py` - -`policyengine.py` owns: - -- runtime bundle certification -- user-facing reproducibility boundaries -- the supported mapping from `policyengine.py` version to country model version and certified data artifact - -It does not rebuild microdata artifacts. - -## Two manifest layers - -The architecture has two manifest layers with different responsibilities. - -### 1. Data build manifest - -Published by the country `*-data` repo. - -This answers: - -- what bytes were produced -- how they were produced -- which exact model and targets produced them - -Suggested schema: +```python +import policyengine as pe -```json -{ - "schema_version": 1, - "country_id": "uk", - "data_package": { - "name": "policyengine-uk-data", - "version": "1.41.0" - }, - "build": { - "build_id": "uk-data-2026-04-12T12-30-00Z", - "git_sha": "abc123", - "built_at": "2026-04-12T12:30:00Z", - "built_with_model_package": { - "name": "policyengine-uk", - "version": "2.81.0", - "git_sha": "def456", - "data_build_fingerprint": "sha256:..." - }, - "calibration_targets": { - "snapshot_id": "2026-04-10", - "sha256": "sha256:..." - }, - "raw_inputs": [ - { - "name": "frs_2023_24", - "sha256": "sha256:..." - } - ], - "build_environment": { - "python_version": "3.13.3", - "lockfile_sha256": "sha256:..." - } - }, - "default_datasets": { - "national": "enhanced_frs_2023_24", - "baseline": "frs_2023_24" - }, - "artifacts": { - "enhanced_frs_2023_24": { - "kind": "microdata", - "repo_id": "policyengine/policyengine-uk-data-private", - "path": "builds/uk-data-2026-04-12T12-30-00Z/enhanced_frs_2023_24.h5", - "revision": "uk-data-2026-04-12T12-30-00Z", - "sha256": "sha256:...", - "size_bytes": 123456789 - } - } -} +pe.us.model.manifest # pinned US manifest for this release +pe.us.model.data_certification # cert checking installed package vs manifest ``` -Notes: +If the installed country package version doesn't match the pinned manifest, the model warns: -- `build_id` must be immutable. -- build artifacts should be staged under a build-specific path or revision, not a floating release tag. -- the build manifest is the authoritative provenance record for the artifact bytes. - -### 2. Certified runtime bundle manifest - -Published by `policyengine.py`. - -This answers: - -- which model and data artifact are supported together at runtime -- which exact dataset should be used by default -- which artifact checksum and provenance should be surfaced to users - -Suggested schema: - -```json -{ - "schema_version": 1, - "policyengine_version": "3.5.0", - "bundle_id": "uk-3.5.0", - "published_at": "2026-04-12T13:00:00Z", - "country_id": "uk", - "model_package": { - "name": "policyengine-uk", - "version": "2.81.1" - }, - "certified_data_artifact": { - "data_package": { - "name": "policyengine-uk-data", - "version": "1.41.0" - }, - "build_id": "uk-data-2026-04-12T12-30-00Z", - "dataset": "enhanced_frs_2023_24", - "uri": "hf://policyengine/policyengine-uk-data-private/builds/uk-data-2026-04-12T12-30-00Z/enhanced_frs_2023_24.h5@uk-data-2026-04-12T12-30-00Z", - "sha256": "sha256:..." - }, - "certification": { - "compatibility_basis": "matching_data_build_fingerprint", - "built_with_model_version": "2.81.0", - "certified_for_model_version": "2.81.1", - "data_build_fingerprint": "sha256:...", - "certified_by": "policyengine.py release workflow" - }, - "default_dataset": "enhanced_frs_2023_24", - "region_artifacts": { - "national": { - "dataset": "enhanced_frs_2023_24" - } - } -} ``` - -Notes: - -- this is the user-facing reproducibility boundary -- apps and APIs should surface this bundle, not only country package versions -- a bundle may reuse a previously staged data artifact if compatibility is explicitly certified - -## TRACE export - -The internal build manifest and certified runtime bundle remain the operational source of -truth. - -TRACE sits on top of those manifests as a standards-based export layer. - -### What gets exported - -`policyengine.py` emits a certified-bundle TRO for each supported country. The -composition pins four artifacts by sha256: - -- the bundled country release manifest shipped in `policyengine.py` -- the country data release manifest resolved for the certified data package version -- the certified dataset artifact -- the country model wheel published to PyPI (hash read from the bundled manifest - when present, otherwise fetched from the PyPI JSON API at emit time) - -TROs use the public TROv vocabulary at -`https://w3id.org/trace/2023/05/trov#`. Every artifact location in the TRO -is a dereferenceable HTTPS URI or a local path relative to the shipped -wheel. Certification metadata is carried as structured `pe:*` fields on -the `trov:TransparentResearchPerformance` node so downstream tooling can -read `pe:certifiedForModelVersion`, `pe:compatibilityBasis`, -`pe:builtWithModelVersion`, `pe:dataBuildFingerprint`, and `pe:dataBuildId` -without parsing prose. Every TRO also carries `pe:emittedIn` set to -`"github-actions"` or `"local"`; CI-emitted TROs additionally carry -`pe:ciRunUrl` and `pe:ciGitSha`. - -Country `*-data` repos should also emit a matching `trace.tro.jsonld` per -data release covering the release manifest and every staged artifact hash. -That is a country-data concern and lives in those repos. - -#### Emitting a bundle TRO - -From Python: - -```python -from policyengine.core.release_manifest import get_data_release_manifest, get_release_manifest -from policyengine.core.trace_tro import build_trace_tro_from_release_bundle, serialize_trace_tro - -country = get_release_manifest("us") -tro = build_trace_tro_from_release_bundle(country, get_data_release_manifest("us")) -Path("us.trace.tro.jsonld").write_bytes(serialize_trace_tro(tro)) +UserWarning: Installed policyengine-us version (1.602.0) does not match +the bundled policyengine.py manifest (1.653.3). Calculations will run +against the installed version, but dataset compatibility is not guaranteed. ``` -From the CLI: +Pin exactly to match a release for strict reproducibility: -``` -policyengine trace-tro us --out us.trace.tro.jsonld +```bash +pip install policyengine==4.0.0 policyengine-us==1.653.3 policyengine-us-data==2.12.0 ``` -At release time, `scripts/generate_trace_tros.py` regenerates the bundled -`data/release_manifests/{country}.trace.tro.jsonld` files, and the -`Versioning` CI job commits them alongside the changelog so every published -wheel ships with the matching TRO. +## Certifying an analysis -#### Emitting a per-simulation TRO +For a published analysis (paper, policy brief, congressional testimony), attach the manifest to your results: ```python -from policyengine.results import write_results_with_trace_tro +from policyengine.provenance import write_manifest -write_results_with_trace_tro( - results, # ResultsJson instance - "results.json", # where to write results - bundle_tro=bundle_tro, # loaded from the shipped bundle - reform_payload={"salt_cap": 0}, - bundle_tro_url=( - "https://raw.githubusercontent.com/PolicyEngine/policyengine.py/" - "v3.4.5/src/policyengine/data/release_manifests/us.trace.tro.jsonld" - ), -) +result = economic_impact_analysis(reform=REFORM, year=2026) +write_manifest(result, path="my_analysis.manifest.json") ``` -The `bundle_tro_url` is recorded on the performance node as -`pe:bundleTroUrl`. A verifier can fetch that URL, recompute its sha256, -and confirm it matches the `bundle_tro` artifact hash in the simulation -TRO's composition. Without this anchor, the bundle reference is only as -trustworthy as whoever produced the JSON. +The manifest captures package versions, dataset hash, reform spec, and a hash of the result. Readers can verify reproducibility by installing the same pinned stack and rerunning. -#### Validating a received TRO +## Dataset content addressing -Structural validation: +Microdata files are content-addressed — the filename includes a SHA hash. `enhanced_cps_2024.h5` at one publish date is a different artifact than at a later date; they live at different Hugging Face paths. +```python +dataset_uri = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5" +dataset = pe.us.ensure_datasets([dataset_uri])[0] +dataset.content_hash ``` -policyengine trace-tro-validate path/to/tro.jsonld -``` - -The shipped schema at `policyengine/data/schemas/trace_tro.schema.json` -checks structural fields, canonical hex-encoded sha256s, the required -`pe:emittedIn`, and that `trov:hasLocation` uses HTTPS (or the -well-known local paths `results.json`, `reform.json`, -`bundle.trace.tro.jsonld`). The same schema is exercised in the test -suite against generated TROs. -Content validation (the verifier workflow a replication reviewer -should run): +Always cite the full URI (including revision if pinning) in published work. -```python -import hashlib, json, requests -from policyengine.core.trace_tro import canonical_json_bytes +## Building your own manifest -sim_tro = json.load(open("results.trace.tro.jsonld")) -perf = sim_tro["@graph"][0]["trov:hasPerformance"] +If you fork and modify the country model or data, publish your own manifest: -# 1. Fetch the bundle TRO from its pinned URL and recompute its hash. -bundle_bytes = requests.get(perf["pe:bundleTroUrl"]).content -bundle_hash = hashlib.sha256(canonical_json_bytes(json.loads(bundle_bytes))).hexdigest() +```python +from policyengine.provenance import build_manifest -# 2. Compare against the hash recorded in the simulation TRO's composition. -recorded = next( - a["trov:sha256"] - for a in sim_tro["@graph"][0]["trov:hasComposition"]["trov:hasArtifact"] - if a["@id"].endswith("bundle_tro") +manifest = build_manifest( + country_code="us", + model_version="my-fork-1.0.0", + dataset_hashes={"my_dataset": "sha256:..."}, ) -assert bundle_hash == recorded, "bundle_tro_url content does not match sim TRO" - -# 3. Confirm the fingerprint recorded on the performance matches the -# fingerprint inside the fetched bundle. -bundle = json.loads(bundle_bytes) -bundle_fp = bundle["@graph"][0]["trov:hasComposition"]["trov:hasFingerprint"]["trov:sha256"] -assert perf["pe:bundleFingerprint"] == bundle_fp ``` -A sim TRO with a swapped `bundle_tro` dict but a truthful -`pe:bundleTroUrl` will fail step 2; a sim TRO with both swapped will -fail step 3. - -#### Known limitations - -- TROs are emitted unsigned. A signed attestation (sigstore or in-toto) - is a future addition that will bind TROs to a trusted-system key. -- The bundle composition does not yet pin a transitive lockfile - (`uv.lock`/`poetry.lock`), a Python interpreter version, or an OS. AEA - reviewers may demand these; the schema is extensible. -- The model wheel is hashed by PyPI's published sha256. If a wheel is - yanked and re-uploaded under the same version, the hash will change - and the TRO becomes invalid — which is the correct behaviour. -- Country data packages whose data release manifest is private require - `HUGGING_FACE_TOKEN` at emit time. The regeneration script skips - countries whose data release manifest is unreachable so a partial run - does not block other countries. - -### What TRACE does not replace - -TRACE is not the source of truth for compatibility policy. - -In particular, TRACE does not decide: - -- whether a new model version can safely reuse an existing data artifact -- how `data_build_fingerprint` is computed -- which staged artifact becomes a supported runtime default - -Those decisions still belong to the country data build manifest and the -`policyengine.py` certified runtime bundle. - -### Why we still want it - -TRACE adds three things our internal manifests do not provide by themselves: - -- a standard declaration format for provenance exchange -- a composition fingerprint over the exact artifacts in scope -- a better external surface for papers, audits, and reproducibility reviews - -That is why the recommended design is: - -- internal manifests for build/certification control -- generated TRACE TROs for standards-based export - -## Compatibility rule - -The architecture should avoid forcing a new data build for every harmless country model release. - -To do that safely, compatibility must be explicit. - -### Data build fingerprint - -Each country model package should expose a `data_build_fingerprint` that covers the subset of logic that affects dataset construction or calibration. - -Examples of inputs to the fingerprint: - -- variables used in imputations -- variables used in calibration loss matrices -- parameters referenced during data construction -- uprating or target-computation logic used during the build - -Things that should usually not affect the fingerprint: - -- runtime-only outputs that are not used in data construction -- UI-oriented metadata -- code paths unrelated to data construction - -### Certification rules - -`policyengine.py` may certify a staged data artifact for a model version only if one of the following is true: - -1. the model version exactly matches the `built_with_model_package.version` -2. the model version has the same `data_build_fingerprint` as the build-time model version - -If neither is true, the bundle release must fail and a new data build is required. - -This should be a hard failure, not a warning. - -## Artifact states - -Artifacts should move through explicit states: - -- `staged`: built by the country data repo and available for inspection or later certification -- `certified`: referenced by a released `policyengine.py` runtime bundle -- `deprecated`: no longer recommended for new use, but still reproducible - -The key point is that `staged` and `certified` are different states. A staged artifact is not automatically part of a supported runtime release. - -## UK release workflow - -### Case 1: model-only release - -1. Cut UK model release candidate `M`. -2. Compute `data_build_fingerprint(M)`. -3. Compare it to the fingerprint recorded in the previously certified data build manifest. -4. If the fingerprint matches, skip the expensive UK data rebuild. -5. Release `policyengine.py` with a new certified runtime bundle that points to the existing staged UK artifact. - -### Case 2: data-affecting release - -1. Cut UK model release candidate `M`. -2. Compute `data_build_fingerprint(M)`. -3. If the fingerprint changed, build a new UK data artifact in `policyengine-uk-data` against: - - exact `policyengine-uk==M` - - exact target snapshot - - exact raw input hashes -4. Stage the new artifact under a build-specific immutable path or revision. -5. Publish the UK data build manifest. -6. Release `policyengine.py` with a certified runtime bundle that points to the new staged artifact. - -## Implementation guidance - -The current `release_manifest.json` mechanism in country data repos is a good starting point, but it is not yet enough on its own. The target implementation should add: - -- `built_with_model_package.version` -- `built_with_model_package.git_sha` -- `built_with_model_package.data_build_fingerprint` -- calibration target snapshot metadata -- immutable staged artifact paths or revisions - -The target implementation in `policyengine.py` should add: - -- hard validation of bundle certification rules -- explicit runtime bundle metadata on simulations, APIs, and app responses -- checksum-backed dataset resolution from the certified bundle manifest - -## Why not let `policyengine.py` build all country data directly? +Users of your fork install your pinned stack and get your manifest. -Because that would centralise the wrong concerns: +## When to care about this -- country-specific private data handling would move into the generic orchestration layer -- country-specific build logic would move into the generic orchestration layer -- expensive build failures would block the top-level runtime package more often -- provenance would still originate in the country data pipeline, so `policyengine.py` would not actually eliminate the need for the country build manifest +- Publishing numbers (paper, brief, official analysis) +- Regulatory submissions where auditors must reproduce +- Long-running studies where package versions will drift over the analysis window -`policyengine.py` should be the certification boundary, not the country data build system. +For day-to-day exploration, version drift between `policyengine` and country packages is tolerable and the warning is informational. diff --git a/docs/visualisation.md b/docs/visualisation.md index 662ec3b1..6e833ec2 100644 --- a/docs/visualisation.md +++ b/docs/visualisation.md @@ -1,72 +1,71 @@ -# Visualisation utilities +--- +title: "Visualisation" +--- -PolicyEngine provides utilities for creating publication-ready charts that follow our visual style guidelines. +PolicyEngine outputs come with `.to_plotly()` helpers for the most common chart shapes. These produce publication-ready Plotly figures with PolicyEngine's color palette — override or customize as you would any Plotly figure. -## Formatting plotly figures - -The `format_fig()` function applies PolicyEngine's visual style to plotly figures, ensuring consistency across all analyses and publications. +## Decile impact ```python -from policyengine.utils import format_fig, COLORS -import plotly.graph_objects as go - -# Create your figure -fig = go.Figure() -fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 5, 6], name="Data")) - -# Apply PolicyEngine styling -format_fig( - fig, - title="Example chart", - xaxis_title="X axis", - yaxis_title="Y axis", - height=600, - width=800 -) +from policyengine.outputs import DecileImpact +impact = DecileImpact().compute(baseline, reformed) +fig = impact.to_plotly() fig.show() ``` -## Visual style principles +## Budget over reform dimension -The formatting applies these principles automatically: - -**Colours**: Primary teal (#319795) with semantic colours for different data types (success/green, warning/yellow, error/red, info/blue). Access colours via the `COLORS` dictionary: +Iterating a reform over a parameter (e.g. CTC amount from $0 to $3,000) and plotting the trajectory is two steps: ```python -from policyengine.utils import COLORS +amounts = [0, 1_000, 2_000, 3_000] +budgets = [] +for amount in amounts: + impact = economic_impact_analysis( + reform={"gov.irs.credits.ctc.amount.adult_dependent": amount}, + year=2026, + ) + budgets.append(impact.budget.total_change) -fig.add_trace(go.Scatter( - x=x_data, - y=y_data, - line=dict(color=COLORS["primary"]) -)) +import plotly.graph_objects as go +fig = go.Figure(go.Scatter(x=amounts, y=budgets, mode="lines+markers")) +fig.update_layout(xaxis_title="CTC amount ($)", yaxis_title="Budget cost ($bn)") ``` -**Typography**: Inter font family with appropriate sizing (12px for labels, 14px for body text, 16px for titles). +## Household reform curve + +`HouseholdImpact` traces one household across a range of employment incomes under a reform: + +```python +from policyengine.outputs import HouseholdImpact + +traj = HouseholdImpact( + household_fixture={"people": [{"age": 35}], "tax_unit": {"filing_status": "SINGLE"}}, + income_range=(0, 200_000, 1_000), +).compute(baseline_reform={}, reform=REFORM) + +traj.to_plotly().show() +``` -**Layout**: Clean white background with subtle grey gridlines and appropriate margins (48px) for professional presentation. +Useful for showing benefit cliffs and marginal tax rates. -**Clarity**: Data-driven design that prioritises immediate understanding over decoration. +## Color palette -## Available colours +PolicyEngine's palette is available via the design system: ```python -COLORS = { - "primary": "#319795", # Teal (main brand colour) - "primary_light": "#E6FFFA", - "primary_dark": "#1D4044", - "success": "#22C55E", # Green (positive changes) - "warning": "#FEC601", # Yellow (cautions) - "error": "#EF4444", # Red (negative changes) - "info": "#1890FF", # Blue (neutral information) - "gray_light": "#F2F4F7", - "gray": "#667085", - "gray_dark": "#101828", - "blue_secondary": "#026AA2", -} +from policyengine.plotting import PALETTE + +PALETTE.BLUE_PRIMARY +PALETTE.GRAY_600 ``` -## Complete example +## Exporting + +Every Plotly figure can be exported: -See [UK employment income variation](examples.md#uk-employment-income-variation) for a full demonstration of using `format_fig()` in an analysis workflow. +```python +fig.write_image("chart.png", width=1000, height=600) +fig.write_html("chart.html", include_plotlyjs="cdn") +``` diff --git a/examples/household_impact_example.py b/examples/household_impact_example.py index f2902daf..4b96cd96 100644 --- a/examples/household_impact_example.py +++ b/examples/household_impact_example.py @@ -1,46 +1,37 @@ -"""Example: Calculate household tax and benefit impacts. +"""Example: calculate tax and benefit outcomes for custom households. -This script demonstrates using calculate_household_impact for both UK and US -to compute taxes and benefits for custom households. +Demonstrates the v4 :func:`policyengine.us.calculate_household` and +:func:`policyengine.uk.calculate_household` entry points. Both take flat +keyword arguments, accept reform dicts directly, and return a +:class:`~policyengine.tax_benefit_models.common.HouseholdResult` that +supports dot-access for scalar lookups. -Run: python examples/household_impact_example.py +Run: ``python examples/household_impact_example.py`` """ -from policyengine.tax_benefit_models.uk import ( - UKHouseholdInput, -) -from policyengine.tax_benefit_models.uk import ( - calculate_household_impact as calculate_uk_impact, -) -from policyengine.tax_benefit_models.us import ( - USHouseholdInput, -) -from policyengine.tax_benefit_models.us import ( - calculate_household_impact as calculate_us_impact, -) - - -def uk_example(): - """UK household impact example.""" +from __future__ import annotations + +import policyengine as pe + + +def uk_example() -> None: print("=" * 60) - print("UK HOUSEHOLD IMPACT") + print("UK household calculator") print("=" * 60) - # Single adult earning £50,000 - household = UKHouseholdInput( + # Single adult earning £50,000. + single = pe.uk.calculate_household( people=[{"age": 35, "employment_income": 50_000}], year=2026, ) - result = calculate_uk_impact(household) - print("\nSingle adult, £50k income:") - print(f" Net income: £{result.household['hbai_household_net_income']:,.0f}") - print(f" Income tax: £{result.person[0]['income_tax']:,.0f}") - print(f" National Insurance: £{result.person[0]['national_insurance']:,.0f}") - print(f" Total tax: £{result.household['household_tax']:,.0f}") + print(f" Net income: £{single.household.hbai_household_net_income:,.0f}") + print(f" Income tax: £{single.person[0].income_tax:,.0f}") + print(f" National Insurance: £{single.person[0].national_insurance:,.0f}") + print(f" Total tax: £{single.household.household_tax:,.0f}") - # Family with two children, £30k income, renting - household = UKHouseholdInput( + # Family with two children, £30k income, renting in the North West. + family = pe.uk.calculate_household( people=[ {"age": 35, "employment_income": 30_000}, {"age": 33}, @@ -57,59 +48,52 @@ def uk_example(): }, year=2026, ) - result = calculate_uk_impact(household) - print("\nFamily (2 adults, 2 children), £30k income, renting:") - print(f" Net income: £{result.household['hbai_household_net_income']:,.0f}") - print(f" Income tax: £{result.person[0]['income_tax']:,.0f}") - print(f" Child benefit: £{result.benunit[0]['child_benefit']:,.0f}") - print(f" Universal credit: £{result.benunit[0]['universal_credit']:,.0f}") - print(f" Total benefits: £{result.household['household_benefits']:,.0f}") + print(f" Net income: £{family.household.hbai_household_net_income:,.0f}") + print(f" Income tax: £{family.person[0].income_tax:,.0f}") + print(f" Child benefit: £{family.benunit.child_benefit:,.0f}") + print(f" Universal credit: £{family.benunit.universal_credit:,.0f}") + print(f" Total benefits: £{family.household.household_benefits:,.0f}") -def us_example(): - """US household impact example.""" +def us_example() -> None: print("\n" + "=" * 60) - print("US HOUSEHOLD IMPACT") + print("US household calculator") print("=" * 60) - # Single adult earning $50,000 - household = USHouseholdInput( - people=[{"age": 35, "employment_income": 50_000, "is_tax_unit_head": True}], + # Single adult earning $50,000 in California. + single = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 50_000}], tax_unit={"filing_status": "SINGLE"}, household={"state_code_str": "CA"}, - year=2024, + year=2026, ) - result = calculate_us_impact(household) - print("\nSingle adult, $50k income (California):") - print(f" Net income: ${result.household['household_net_income']:,.0f}") - print(f" Income tax: ${result.tax_unit[0]['income_tax']:,.0f}") - print(f" Payroll tax: ${result.tax_unit[0]['employee_payroll_tax']:,.0f}") + print(f" Net income: ${single.household.household_net_income:,.0f}") + print(f" Income tax: ${single.tax_unit.income_tax:,.0f}") + print(f" Payroll tax: ${single.tax_unit.employee_payroll_tax:,.0f}") - # Married couple with children, lower income - household = USHouseholdInput( + # Married couple with two kids, Texas, lower income. + family = pe.us.calculate_household( people=[ - {"age": 35, "employment_income": 40_000, "is_tax_unit_head": True}, - {"age": 33, "is_tax_unit_spouse": True}, - {"age": 8, "is_tax_unit_dependent": True}, - {"age": 5, "is_tax_unit_dependent": True}, + {"age": 35, "employment_income": 40_000}, + {"age": 33}, + {"age": 8}, + {"age": 5}, ], tax_unit={"filing_status": "JOINT"}, household={"state_code_str": "TX"}, - year=2024, + year=2026, ) - result = calculate_us_impact(household) - print("\nMarried couple with 2 children, $40k income (Texas):") - print(f" Net income: ${result.household['household_net_income']:,.0f}") - print(f" Federal income tax: ${result.tax_unit[0]['income_tax']:,.0f}") - print(f" EITC: ${result.tax_unit[0]['eitc']:,.0f}") - print(f" Child tax credit: ${result.tax_unit[0]['ctc']:,.0f}") - print(f" SNAP: ${result.spm_unit[0]['snap']:,.0f}") + print(f" Net income: ${family.household.household_net_income:,.0f}") + print(f" Federal income tax: ${family.tax_unit.income_tax:,.0f}") + print(f" EITC: ${family.tax_unit.eitc:,.0f}") + print(f" Child tax credit: ${family.tax_unit.ctc:,.0f}") + print(f" SNAP: ${family.spm_unit.snap:,.0f}") -def main(): +def main() -> None: uk_example() us_example() print("\n" + "=" * 60) diff --git a/pyproject.toml b/pyproject.toml index 72af3935..f09e0a04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine" -version = "3.6.0" +version = "4.0.0" description = "A package to conduct policy analysis using PolicyEngine tax-benefit models." readme = "README.md" authors = [ diff --git a/scripts/generate_trace_tros.py b/scripts/generate_trace_tros.py index dce7ae8e..f9533bd9 100644 --- a/scripts/generate_trace_tros.py +++ b/scripts/generate_trace_tros.py @@ -18,12 +18,12 @@ import sys from pathlib import Path -from policyengine.core.release_manifest import ( +from policyengine.provenance.manifest import ( DataReleaseManifestUnavailableError, get_data_release_manifest, get_release_manifest, ) -from policyengine.core.trace_tro import ( +from policyengine.provenance.trace import ( build_trace_tro_from_release_bundle, serialize_trace_tro, ) diff --git a/src/policyengine/__init__.py b/src/policyengine/__init__.py index e69de29b..a8de3971 100644 --- a/src/policyengine/__init__.py +++ b/src/policyengine/__init__.py @@ -0,0 +1,46 @@ +"""PolicyEngine — one Python API for tax and benefit policy. + +Canonical entry points for a fresh coding session: + +.. code-block:: python + + import policyengine as pe + + # Single-household calculator (US). + result = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60000}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1000}, + ) + print(result.tax_unit.income_tax, result.household.household_net_income) + + # UK: + uk_result = pe.uk.calculate_household( + people=[{"age": 30, "employment_income": 50000}], + year=2026, + ) + + # Lower-level microsimulation building blocks. + from policyengine import Simulation # or: pe.Simulation + +Each country module exposes ``calculate_household``, ``model`` +(the pinned ``TaxBenefitModelVersion``), and the microsim helpers. +""" + +from importlib.util import find_spec + +from policyengine import outputs as outputs +from policyengine.core import Simulation as Simulation + +if find_spec("policyengine_us") is not None: + from policyengine.tax_benefit_models import us as us +else: # pragma: no cover + us = None # type: ignore[assignment] + +if find_spec("policyengine_uk") is not None: + from policyengine.tax_benefit_models import uk as uk +else: # pragma: no cover + uk = None # type: ignore[assignment] + +__all__ = ["Simulation", "outputs", "uk", "us"] diff --git a/src/policyengine/cli.py b/src/policyengine/cli.py index add36388..3a659643 100644 --- a/src/policyengine/cli.py +++ b/src/policyengine/cli.py @@ -6,7 +6,7 @@ - ``trace-tro-validate `` validate a TRO against the shipped schema - ``release-manifest `` print the bundled country manifest -See :mod:`policyengine.core.trace_tro` and ``docs/release-bundles.md``. +See :mod:`policyengine.provenance.trace` and ``docs/release-bundles.md``. """ from __future__ import annotations @@ -18,11 +18,11 @@ from pathlib import Path from typing import Optional, Sequence -from policyengine.core.release_manifest import ( +from policyengine.provenance.manifest import ( get_data_release_manifest, get_release_manifest, ) -from policyengine.core.trace_tro import ( +from policyengine.provenance.trace import ( build_trace_tro_from_release_bundle, serialize_trace_tro, ) diff --git a/src/policyengine/core/__init__.py b/src/policyengine/core/__init__.py index 71ca0132..4f749de4 100644 --- a/src/policyengine/core/__init__.py +++ b/src/policyengine/core/__init__.py @@ -1,3 +1,11 @@ +"""Core value objects: Dataset, Variable, Parameter, Policy, Simulation, Region. + +Provenance (release manifests, TRACE TROs) lives in +:mod:`policyengine.provenance` and is intentionally not re-exported +here — importing a core value object should not pull in the +provenance layer. +""" + from .dataset import Dataset from .dataset import YearData as YearData from .dataset import map_to_entity as map_to_entity @@ -11,22 +19,6 @@ from .region import Region as Region from .region import RegionRegistry as RegionRegistry from .region import RegionType as RegionType -from .release_manifest import CertifiedDataArtifact as CertifiedDataArtifact -from .release_manifest import CountryReleaseManifest as CountryReleaseManifest -from .release_manifest import DataBuildInfo as DataBuildInfo -from .release_manifest import DataCertification as DataCertification -from .release_manifest import DataPackageVersion as DataPackageVersion -from .release_manifest import DataReleaseArtifact as DataReleaseArtifact -from .release_manifest import DataReleaseManifest as DataReleaseManifest -from .release_manifest import PackageVersion as PackageVersion -from .release_manifest import ( - certify_data_release_compatibility as certify_data_release_compatibility, -) -from .release_manifest import get_data_release_manifest as get_data_release_manifest -from .release_manifest import get_release_manifest as get_release_manifest -from .release_manifest import ( - resolve_managed_dataset_reference as resolve_managed_dataset_reference, -) from .scoping_strategy import RegionScopingStrategy as RegionScopingStrategy from .scoping_strategy import RowFilterStrategy as RowFilterStrategy from .scoping_strategy import ScopingStrategy as ScopingStrategy @@ -38,19 +30,6 @@ from .tax_benefit_model_version import ( TaxBenefitModelVersion as TaxBenefitModelVersion, ) -from .trace_tro import ( - build_simulation_trace_tro as build_simulation_trace_tro, -) -from .trace_tro import ( - build_trace_tro_from_release_bundle as build_trace_tro_from_release_bundle, -) -from .trace_tro import ( - compute_trace_composition_fingerprint as compute_trace_composition_fingerprint, -) -from .trace_tro import ( - extract_bundle_tro_reference as extract_bundle_tro_reference, -) -from .trace_tro import serialize_trace_tro as serialize_trace_tro from .variable import Variable as Variable # Rebuild models to resolve forward references diff --git a/src/policyengine/core/scoping_strategy.py b/src/policyengine/core/scoping_strategy.py index 7d9b5126..81778f47 100644 --- a/src/policyengine/core/scoping_strategy.py +++ b/src/policyengine/core/scoping_strategy.py @@ -14,7 +14,6 @@ from pathlib import Path from typing import Annotated, Literal, Optional, Union -import h5py import numpy as np import pandas as pd from microdf import MicroDataFrame @@ -69,7 +68,7 @@ class RowFilterStrategy(RegionScopingStrategy): strategy_type: Literal["row_filter"] = "row_filter" variable_name: str - variable_value: str + variable_value: Union[str, int, float] def apply( self, @@ -127,7 +126,11 @@ def apply( region_id = self._find_region_index(lookup_df, self.region_code) - # Download weight matrix and extract weights for this region + # Download weight matrix and extract weights for this region. + # h5py is only needed here, so import lazily to keep + # `from policyengine.core import ...` light. + import h5py + weights_path = download_gcs_file( bucket=self.weight_matrix_bucket, file_path=self.weight_matrix_key, diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py index 5002b141..e4b261ee 100644 --- a/src/policyengine/core/simulation.py +++ b/src/policyengine/core/simulation.py @@ -1,9 +1,9 @@ import logging from datetime import datetime -from typing import Optional +from typing import Any, Optional, Union from uuid import uuid4 -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from .cache import LRUCache from .dataset import Dataset @@ -18,12 +18,62 @@ class Simulation(BaseModel): + """Population microsimulation over a certified dataset. + + Canonical call shape: + + .. code-block:: python + + import policyengine as pe + from policyengine.core import Simulation + + datasets = pe.us.ensure_datasets( + datasets=["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"], + years=[2026], data_folder="./data", + ) + dataset = datasets["enhanced_cps_2024_2026"] + + # Baseline + baseline = Simulation(dataset=dataset, tax_benefit_model_version=pe.us.model) + + # Reform — same flat dict shape as pe.us.calculate_household(reform=...). + # Parameter path indexing uses "[0].amount" for scale/breakdown entries. + reform = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.us.model, + policy={"gov.irs.credits.ctc.amount.base[0].amount": 3_000}, + ) + + baseline.ensure() + reform.ensure() + + The ``policy`` / ``dynamic`` kwargs accept either a ``Policy`` / + ``Dynamic`` object or a flat ``{"param.path": value}`` / + ``{"param.path": {date: value}}`` dict that is compiled against + ``tax_benefit_model_version`` at construction time (unknown paths + raise with close-match suggestions). Scalar values default to + ``{dataset.year}-01-01`` as their effective date. + + See ``policyengine.core.scoping_strategy`` for sub-national scoping. + """ + id: str = Field(default_factory=lambda: str(uuid4())) created_at: datetime = Field(default_factory=datetime.now) updated_at: datetime = Field(default_factory=datetime.now) - policy: Optional[Policy] = None - dynamic: Optional[Dynamic] = None + policy: Optional[Union[Policy, dict[str, Any]]] = Field( + default=None, + description=( + "Reform policy. Pass a ``Policy`` directly, or a flat " + "``{'param.path': value}`` / ``{'param.path': {date: value}}`` " + "dict and it will be compiled against " + "``tax_benefit_model_version`` at run time." + ), + ) + dynamic: Optional[Union[Dynamic, dict[str, Any]]] = Field( + default=None, + description=("Behavioural-response overlay. Same dict shape as ``policy``."), + ) dataset: Dataset = None scoping_strategy: Optional[ScopingStrategy] = Field( @@ -44,6 +94,42 @@ class Simulation(BaseModel): output_dataset: Optional[Dataset] = None + @model_validator(mode="after") + def _compile_dict_reforms(self) -> "Simulation": + """Coerce dict ``policy`` / ``dynamic`` inputs into proper objects. + + Runs at ``mode="after"`` because compiling needs both + ``tax_benefit_model_version`` (for path validation) and + ``dataset.year`` (for effective-date defaulting) — both on ``self``. + """ + from policyengine.tax_benefit_models.common.reform import ( + compile_reform_to_dynamic, + compile_reform_to_policy, + ) + + year = getattr(self.dataset, "year", None) + for field, compiler in ( + ("policy", compile_reform_to_policy), + ("dynamic", compile_reform_to_dynamic), + ): + value = getattr(self, field) + if not isinstance(value, dict): + continue + if self.tax_benefit_model_version is None: + raise ValueError( + f"Cannot compile a dict {field} without " + "tax_benefit_model_version; pass model_version or a " + f"{field.capitalize()}." + ) + setattr( + self, + field, + compiler( + value, year=year, model_version=self.tax_benefit_model_version + ), + ) + return self + def run(self): self.tax_benefit_model_version.run(self) diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py index eeddef85..5eb8f525 100644 --- a/src/policyengine/core/tax_benefit_model_version.py +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -4,14 +4,15 @@ from pydantic import BaseModel, Field -from .release_manifest import ( +from policyengine.provenance.manifest import ( CountryReleaseManifest, DataCertification, PackageVersion, get_data_release_manifest, ) +from policyengine.provenance.trace import build_trace_tro_from_release_bundle + from .tax_benefit_model import TaxBenefitModel -from .trace_tro import build_trace_tro_from_release_bundle if TYPE_CHECKING: from .parameter import Parameter diff --git a/src/policyengine/countries/uk/regions.py b/src/policyengine/countries/uk/regions.py index d90f0ad0..32430d48 100644 --- a/src/policyengine/countries/uk/regions.py +++ b/src/policyengine/countries/uk/regions.py @@ -15,11 +15,11 @@ from typing import TYPE_CHECKING from policyengine.core.region import Region, RegionRegistry -from policyengine.core.release_manifest import resolve_region_dataset_path from policyengine.core.scoping_strategy import ( RowFilterStrategy, WeightReplacementStrategy, ) +from policyengine.provenance.manifest import resolve_region_dataset_path if TYPE_CHECKING: pass diff --git a/src/policyengine/countries/us/regions.py b/src/policyengine/countries/us/regions.py index 9e20d8b3..ca2f6b4f 100644 --- a/src/policyengine/countries/us/regions.py +++ b/src/policyengine/countries/us/regions.py @@ -8,8 +8,8 @@ """ from policyengine.core.region import Region, RegionRegistry -from policyengine.core.release_manifest import resolve_region_dataset_path from policyengine.core.scoping_strategy import RowFilterStrategy +from policyengine.provenance.manifest import resolve_region_dataset_path from .data import AT_LARGE_STATES, DISTRICT_COUNTS, US_PLACES, US_STATES diff --git a/src/policyengine/data/release_manifests/uk.json b/src/policyengine/data/release_manifests/uk.json index de8fa505..961defbd 100644 --- a/src/policyengine/data/release_manifests/uk.json +++ b/src/policyengine/data/release_manifests/uk.json @@ -1,8 +1,8 @@ { "schema_version": 1, - "bundle_id": "uk-3.5.0", + "bundle_id": "uk-4.0.0", "country_id": "uk", - "policyengine_version": "3.5.0", + "policyengine_version": "4.0.0", "model_package": { "name": "policyengine-uk", "version": "2.88.0", diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json index b005eda9..0016aa8a 100644 --- a/src/policyengine/data/release_manifests/us.json +++ b/src/policyengine/data/release_manifests/us.json @@ -1,8 +1,8 @@ { "schema_version": 1, - "bundle_id": "us-3.5.0", + "bundle_id": "us-4.0.0", "country_id": "us", - "policyengine_version": "3.5.0", + "policyengine_version": "4.0.0", "model_package": { "name": "policyengine-us", "version": "1.653.3", diff --git a/src/policyengine/graph/__init__.py b/src/policyengine/graph/__init__.py new file mode 100644 index 00000000..84dd698c --- /dev/null +++ b/src/policyengine/graph/__init__.py @@ -0,0 +1,41 @@ +"""Variable dependency graph for PolicyEngine source trees. + +Parses ``Variable`` subclasses in a PolicyEngine jurisdiction (e.g. +``policyengine-us``, ``policyengine-uk``) and extracts the variable- +to-variable dataflow graph from formula-method bodies. + +The extractor is static: it walks the Python AST and never imports +user code, so it works on any PolicyEngine source tree without +requiring the jurisdiction to be installed or the country model to +resolve. That makes it usable for refactor-impact analysis, CI +pre-merge checks, docs generation, and code-introspection queries +from a Claude Code plugin. + +Recognized reference patterns in v1: + +- ``("", )`` — direct call on an entity instance + (``person``, ``tax_unit``, ``spm_unit``, ``household``, ``family``, + ``marital_unit``, ``benunit``). +- ``add(, , ["v1", "v2", ...])`` — sum helper; each + string in the list becomes an edge. + +Typical usage: + +.. code-block:: python + + from policyengine.graph import extract_from_path + + graph = extract_from_path("/path/to/policyengine-us/policyengine_us/variables") + # Variables that transitively depend on AGI: + for downstream in graph.impact("adjusted_gross_income"): + print(downstream) + # Direct dependencies of a variable: + print(graph.deps("earned_income_tax_credit")) + # Dependency chain from one variable to another: + print(graph.path("wages", "federal_income_tax")) +""" + +from policyengine.graph.extractor import extract_from_path +from policyengine.graph.graph import VariableGraph + +__all__ = ["VariableGraph", "extract_from_path"] diff --git a/src/policyengine/graph/extractor.py b/src/policyengine/graph/extractor.py new file mode 100644 index 00000000..1af61a7b --- /dev/null +++ b/src/policyengine/graph/extractor.py @@ -0,0 +1,189 @@ +"""AST-based extractor for PolicyEngine Variable subclasses. + +Walks a directory of ``.py`` files, identifies ``Variable`` subclasses +by looking for ``class Foo(Variable):`` in the AST, and extracts +variable references from each class's ``formula*`` methods. + +The extractor never imports user code, so it works on any PolicyEngine +source tree regardless of whether the jurisdiction is installed. +This keeps refactor-impact analysis and CI pre-merge checks fast and +dependency-free. + +Two reference patterns are recognized: + +1. ``("", )`` where ```` is a bare ``Name`` + matching one of: + ``person``, ``tax_unit``, ``spm_unit``, ``household``, ``family``, + ``marital_unit``, ``benunit``, ``tax_unit``. +2. ``add(, , [])`` — the + ``add`` helper that sums a list of variable names on an entity. + +Limitations of the v1 extractor (tracked for v2): + +- Parameter references (``parameters(period).gov.xxx.yyy``) are not + yet captured; only variable-to-variable edges. +- Dynamic variable names built via string concatenation or format + strings are skipped (low-prevalence in practice). +- ``entity.sum("var")`` or ``entity.mean("var")`` method calls are + not yet recognized; only the direct-call form. (Low-prevalence + in ``policyengine-us``; common enough to add as a small follow-up.) +""" + +from __future__ import annotations + +import ast +import os +from pathlib import Path +from typing import Iterable, Iterator, Sequence, Union + +from policyengine.graph.graph import VariableGraph + + +# Names of entity instances as they appear as method parameters in +# Variable formulas. Any ``Call`` whose ``func`` is a bare ``Name`` +# matching one of these AND whose first arg is a string literal is +# treated as a variable reference. Bare names (not attribute access) +# ensures we don't accidentally match something like +# ``reform.person("x", period)``. +_ENTITY_CALL_NAMES: frozenset[str] = frozenset( + { + "person", + "tax_unit", + "spm_unit", + "household", + "family", + "marital_unit", + "benunit", + } +) + + +PathLike = Union[str, "os.PathLike[str]"] + + +def extract_from_path(path: PathLike) -> VariableGraph: + """Build a ``VariableGraph`` from all ``.py`` files under ``path``. + + Directories are walked recursively. Files that fail to parse as + Python (syntax errors) are silently skipped — the extractor is a + best-effort tool over real source trees, not a compiler. + """ + root = Path(path) + graph = VariableGraph() + + files: Iterable[Path] + if root.is_file(): + files = [root] + else: + files = root.rglob("*.py") + + for file_path in files: + try: + source = file_path.read_text() + except (OSError, UnicodeDecodeError): + continue + try: + tree = ast.parse(source, filename=str(file_path)) + except SyntaxError: + continue + _visit_module(tree, file_path=str(file_path), graph=graph) + + return graph + + +# ------------------------------------------------------------------- +# AST traversal +# ------------------------------------------------------------------- + + +def _visit_module(tree: ast.Module, *, file_path: str, graph: VariableGraph) -> None: + """Register each Variable subclass and walk its formula methods.""" + for node in tree.body: + if not isinstance(node, ast.ClassDef): + continue + if not _class_inherits_variable(node): + continue + var_name = node.name + graph.add_variable(var_name, file_path=file_path) + for child in node.body: + if isinstance(child, ast.FunctionDef) and _is_formula_method(child): + for dependency in _extract_references(child): + graph.add_edge(dependency=dependency, dependent=var_name) + + +def _class_inherits_variable(cls: ast.ClassDef) -> bool: + """True iff the class's base list contains a ``Variable`` name. + + Matches ``class X(Variable):``. Does not resolve aliased imports + — PolicyEngine's ``from policyengine_us.model_api import *`` + convention keeps the base name literally ``Variable``, which is + what real jurisdictions use and what this check matches. + """ + for base in cls.bases: + if isinstance(base, ast.Name) and base.id == "Variable": + return True + return False + + +def _is_formula_method(func: ast.FunctionDef) -> bool: + """Return True for ``formula`` and ``formula_YYYY`` methods.""" + return func.name == "formula" or func.name.startswith("formula_") + + +# ------------------------------------------------------------------- +# Reference extraction from a formula body +# ------------------------------------------------------------------- + + +def _extract_references(func: ast.FunctionDef) -> Iterator[str]: + """Yield every variable name referenced in the function body.""" + for node in ast.walk(func): + if not isinstance(node, ast.Call): + continue + # Pattern 1: ("", ) + entity_ref = _entity_call_to_variable(node) + if entity_ref is not None: + yield entity_ref + continue + # Pattern 2: add(, , ["v1", "v2", ...]) + yield from _add_call_to_variables(node) + + +def _entity_call_to_variable(call: ast.Call) -> str | None: + """Return the variable name if ``call`` is an entity-call pattern. + + The entity has to be a bare Name (not an attribute access), so + calls like ``some.object.person("x", period)`` are deliberately + not matched. First positional arg must be a string literal. + """ + if not isinstance(call.func, ast.Name): + return None + if call.func.id not in _ENTITY_CALL_NAMES: + return None + if not call.args: + return None + first = call.args[0] + if isinstance(first, ast.Constant) and isinstance(first.value, str): + return first.value + return None + + +def _add_call_to_variables(call: ast.Call) -> Iterator[str]: + """Yield variable names from an ``add(entity, period, [list])`` call. + + Matches the common helper. The third positional arg must be a + ``list`` literal of string literals. Anything dynamically built + is skipped. + """ + if not isinstance(call.func, ast.Name): + return + if call.func.id not in {"add", "aggr"}: + return + if len(call.args) < 3: + return + names_arg = call.args[2] + if not isinstance(names_arg, (ast.List, ast.Tuple)): + return + for elt in names_arg.elts: + if isinstance(elt, ast.Constant) and isinstance(elt.value, str): + yield elt.value diff --git a/src/policyengine/graph/graph.py b/src/policyengine/graph/graph.py new file mode 100644 index 00000000..f6b360c0 --- /dev/null +++ b/src/policyengine/graph/graph.py @@ -0,0 +1,124 @@ +"""NetworkX-backed variable dependency graph. + +Separated from the extractor so the data structure is easy to test +independently, easy to serialize/deserialize, and easy to enrich with +additional edge types (parameter reads, cross-jurisdiction links) in +later versions. +""" + +from __future__ import annotations + +from typing import Iterable, Optional + +import networkx as nx + + +class VariableGraph: + """Directed graph of PolicyEngine variable dependencies. + + Nodes are variable names (strings). Edges run from a *dependency* + to a *dependent*: ``A -> B`` means "computing B reads A". With + this orientation, ``impact(A)`` is the set of downstream nodes + reachable from A, and ``deps(B)`` is the set of upstream nodes + that reach into B. + + The constructor accepts an optional pre-built graph for testing + and deserialization; normal callers will get instances via the + extractor. + """ + + def __init__(self, digraph: Optional[nx.DiGraph] = None) -> None: + self._g = digraph if digraph is not None else nx.DiGraph() + + # ------------------------------------------------------------------ + # Construction helpers (used by the extractor) + # ------------------------------------------------------------------ + + def add_variable(self, name: str, file_path: Optional[str] = None) -> None: + """Register a variable as a node. Safe to call repeatedly.""" + if name in self._g: + if file_path and "file_path" not in self._g.nodes[name]: + self._g.nodes[name]["file_path"] = file_path + return + self._g.add_node(name, file_path=file_path) + + def add_edge(self, dependency: str, dependent: str) -> None: + """Record that ``dependent`` reads ``dependency`` in a formula.""" + # Auto-register the dependency node if it wasn't declared yet; + # this is common when a formula references a variable defined + # in a file the extractor hasn't reached yet, or a variable + # whose class lives in a different subpackage. + if dependency not in self._g: + self._g.add_node(dependency, file_path=None) + if dependent not in self._g: + self._g.add_node(dependent, file_path=None) + self._g.add_edge(dependency, dependent) + + # ------------------------------------------------------------------ + # Query surface + # ------------------------------------------------------------------ + + def has_variable(self, name: str) -> bool: + """True iff ``name`` was registered as an explicitly-defined variable. + + Nodes that only exist because some formula *references* them — + but whose class definition was never seen — are excluded. + """ + if name not in self._g: + return False + return self._g.nodes[name].get("file_path") is not None + + def deps(self, name: str) -> Iterable[str]: + """Return variables that ``name``'s formula reads directly. + + Order follows networkx's insertion order, so the caller can + expect a deterministic sequence for a given extraction run. + """ + if name not in self._g: + return iter(()) + return list(self._g.predecessors(name)) + + def impact(self, name: str) -> Iterable[str]: + """Return variables that transitively depend on ``name``. + + Equivalent to the descendants set in the graph's natural + orientation (edges run dep → dependent). Excludes ``name`` + itself. Empty for leaf variables that nothing reads. + """ + if name not in self._g: + return iter(()) + return list(nx.descendants(self._g, name)) + + def path(self, src: str, dst: str) -> Optional[list[str]]: + """Return a shortest dependency chain from ``src`` to ``dst``. + + Returns the node list including both endpoints, or ``None`` if + no such path exists. + """ + if src not in self._g or dst not in self._g: + return None + try: + return nx.shortest_path(self._g, src, dst) + except nx.NetworkXNoPath: + return None + + # ------------------------------------------------------------------ + # Introspection for callers that want the raw structure + # ------------------------------------------------------------------ + + @property + def nx_graph(self) -> nx.DiGraph: + """The underlying NetworkX DiGraph (read-only-by-convention).""" + return self._g + + def __contains__(self, name: str) -> bool: + return name in self._g + + def __len__(self) -> int: + return self._g.number_of_nodes() + + def __repr__(self) -> str: + return ( + f"VariableGraph({self._g.number_of_nodes()} variables, " + f"{self._g.number_of_edges()} edges)" + ) diff --git a/src/policyengine/outputs/__init__.py b/src/policyengine/outputs/__init__.py index 61311f46..13ff2a26 100644 --- a/src/policyengine/outputs/__init__.py +++ b/src/policyengine/outputs/__init__.py @@ -49,6 +49,7 @@ calculate_us_poverty_by_race, calculate_us_poverty_rates, ) +from policyengine.outputs.program_statistics import ProgramStatistics __all__ = [ "Output", @@ -59,6 +60,7 @@ "ChangeAggregateType", "DecileImpact", "calculate_decile_impacts", + "ProgramStatistics", "IntraDecileImpact", "compute_intra_decile_impacts", "Poverty", diff --git a/src/policyengine/outputs/constituency_impact.py b/src/policyengine/outputs/constituency_impact.py index 60f76e0b..02e1bdfd 100644 --- a/src/policyengine/outputs/constituency_impact.py +++ b/src/policyengine/outputs/constituency_impact.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional -import h5py import numpy as np import pandas as pd from pydantic import ConfigDict @@ -43,6 +42,8 @@ def run(self) -> None: constituency_df = pd.read_csv(self.constituency_csv_path) # Load weight matrix: shape (N_constituencies, N_households) + import h5py + with h5py.File(self.weight_matrix_path, "r") as f: weight_matrix = f[self.year][...] diff --git a/src/policyengine/outputs/local_authority_impact.py b/src/policyengine/outputs/local_authority_impact.py index 20b17efe..a4850dbf 100644 --- a/src/policyengine/outputs/local_authority_impact.py +++ b/src/policyengine/outputs/local_authority_impact.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional -import h5py import numpy as np import pandas as pd from pydantic import ConfigDict @@ -43,6 +42,8 @@ def run(self) -> None: la_df = pd.read_csv(self.local_authority_csv_path) # Load weight matrix: shape (N_local_authorities, N_households) + import h5py + with h5py.File(self.weight_matrix_path, "r") as f: weight_matrix = f[self.year][...] diff --git a/src/policyengine/tax_benefit_models/us/outputs.py b/src/policyengine/outputs/program_statistics.py similarity index 98% rename from src/policyengine/tax_benefit_models/us/outputs.py rename to src/policyengine/outputs/program_statistics.py index 1dd6f001..a48ff8a8 100644 --- a/src/policyengine/tax_benefit_models/us/outputs.py +++ b/src/policyengine/outputs/program_statistics.py @@ -1,4 +1,4 @@ -"""US-specific output templates.""" +"""Shared `ProgramStatistics` for reform-impact tables (US + UK).""" from typing import Optional diff --git a/src/policyengine/provenance/__init__.py b/src/policyengine/provenance/__init__.py new file mode 100644 index 00000000..548b7fc6 --- /dev/null +++ b/src/policyengine/provenance/__init__.py @@ -0,0 +1,89 @@ +"""Release-bundle provenance + TRACE TRO emission. + +Separated from :mod:`policyengine.core` so the value-object layer +(Dataset, Variable, Parameter, Policy, Simulation, Region) doesn't +force provenance imports on every consumer. + +.. code-block:: python + + from policyengine.provenance import ( + get_release_manifest, + get_data_release_manifest, + build_trace_tro_from_release_bundle, + build_simulation_trace_tro, + serialize_trace_tro, + ) +""" + +from .manifest import ( + CertifiedDataArtifact as CertifiedDataArtifact, +) +from .manifest import ( + CountryReleaseManifest as CountryReleaseManifest, +) +from .manifest import ( + DataBuildInfo as DataBuildInfo, +) +from .manifest import ( + DataCertification as DataCertification, +) +from .manifest import ( + DataPackageVersion as DataPackageVersion, +) +from .manifest import ( + DataReleaseArtifact as DataReleaseArtifact, +) +from .manifest import ( + DataReleaseManifest as DataReleaseManifest, +) +from .manifest import ( + DataReleaseManifestUnavailableError as DataReleaseManifestUnavailableError, +) +from .manifest import ( + PackageVersion as PackageVersion, +) +from .manifest import ( + certify_data_release_compatibility as certify_data_release_compatibility, +) +from .manifest import ( + fetch_pypi_wheel_metadata as fetch_pypi_wheel_metadata, +) +from .manifest import ( + get_data_release_manifest as get_data_release_manifest, +) +from .manifest import ( + get_release_manifest as get_release_manifest, +) +from .manifest import ( + https_dataset_uri as https_dataset_uri, +) +from .manifest import ( + https_release_manifest_uri as https_release_manifest_uri, +) +from .manifest import ( + resolve_dataset_reference as resolve_dataset_reference, +) +from .manifest import ( + resolve_local_managed_dataset_source as resolve_local_managed_dataset_source, +) +from .manifest import ( + resolve_managed_dataset_reference as resolve_managed_dataset_reference, +) +from .trace import ( + build_simulation_trace_tro as build_simulation_trace_tro, +) +from .trace import ( + build_trace_tro_from_release_bundle as build_trace_tro_from_release_bundle, +) +from .trace import ( + canonical_json_bytes as canonical_json_bytes, +) +from .trace import ( + compute_trace_composition_fingerprint as compute_trace_composition_fingerprint, +) +from .trace import ( + extract_bundle_tro_reference as extract_bundle_tro_reference, +) +from .trace import ( + serialize_trace_tro as serialize_trace_tro, +) diff --git a/src/policyengine/core/release_manifest.py b/src/policyengine/provenance/manifest.py similarity index 100% rename from src/policyengine/core/release_manifest.py rename to src/policyengine/provenance/manifest.py diff --git a/src/policyengine/core/trace_tro.py b/src/policyengine/provenance/trace.py similarity index 99% rename from src/policyengine/core/trace_tro.py rename to src/policyengine/provenance/trace.py index 76f1661a..83ac0b5b 100644 --- a/src/policyengine/core/trace_tro.py +++ b/src/policyengine/provenance/trace.py @@ -24,7 +24,7 @@ from collections.abc import Iterable, Mapping from typing import Any, Optional -from .release_manifest import ( +from .manifest import ( CountryReleaseManifest, DataCertification, DataReleaseManifest, diff --git a/src/policyengine/results/trace_tro.py b/src/policyengine/results/trace_tro.py index d904d5b4..85c7aed8 100644 --- a/src/policyengine/results/trace_tro.py +++ b/src/policyengine/results/trace_tro.py @@ -5,7 +5,7 @@ specific reform + ``results.json`` payload so a published result can be cited with an immutable composition fingerprint. -See :mod:`policyengine.core.trace_tro` for the bundle-level layer. +See :mod:`policyengine.provenance.trace` for the bundle-level layer. """ from __future__ import annotations @@ -14,7 +14,7 @@ from pathlib import Path from typing import Optional, Union -from policyengine.core.trace_tro import ( +from policyengine.provenance.trace import ( build_simulation_trace_tro, serialize_trace_tro, ) diff --git a/src/policyengine/tax_benefit_models/common/__init__.py b/src/policyengine/tax_benefit_models/common/__init__.py new file mode 100644 index 00000000..654f350d --- /dev/null +++ b/src/policyengine/tax_benefit_models/common/__init__.py @@ -0,0 +1,16 @@ +"""Country-agnostic helpers for household calculation and reform analysis. + +The country modules (:mod:`policyengine.tax_benefit_models.us`, +:mod:`policyengine.tax_benefit_models.uk`) thread these helpers through +their public ``calculate_household`` / ``analyze_reform`` entry points. +""" + +from .extra_variables import dispatch_extra_variables as dispatch_extra_variables +from .model_version import ( + MicrosimulationModelVersion as MicrosimulationModelVersion, +) +from .reform import compile_reform as compile_reform +from .reform import compile_reform_to_dynamic as compile_reform_to_dynamic +from .reform import compile_reform_to_policy as compile_reform_to_policy +from .result import EntityResult as EntityResult +from .result import HouseholdResult as HouseholdResult diff --git a/src/policyengine/tax_benefit_models/common/extra_variables.py b/src/policyengine/tax_benefit_models/common/extra_variables.py new file mode 100644 index 00000000..e3426e6b --- /dev/null +++ b/src/policyengine/tax_benefit_models/common/extra_variables.py @@ -0,0 +1,52 @@ +"""Dispatch a flat ``extra_variables`` list to a per-entity mapping. + +Callers pass a flat list — ``extra_variables=["adjusted_gross_income", +"state_agi", "is_medicaid_eligible"]`` — and the library looks up each +name on the country model to figure out which entity it belongs on. +Unknown names raise with a close-match suggestion. +""" + +from __future__ import annotations + +from collections.abc import Iterable +from difflib import get_close_matches +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion + + +def dispatch_extra_variables( + *, + model_version: TaxBenefitModelVersion, + names: Iterable[str], +) -> dict[str, list[str]]: + """Group ``names`` by the entity each variable lives on. + + Raises :class:`ValueError` if any name is not defined on the model. + """ + by_entity: dict[str, list[str]] = {} + unknown: list[str] = [] + + variables_by_name = model_version.variables_by_name + for name in names: + variable = variables_by_name.get(name) + if variable is None: + unknown.append(name) + continue + by_entity.setdefault(variable.entity, []).append(name) + + if unknown: + lines = [ + f"extra_variables contains names not defined on " + f"{model_version.model.id} {model_version.version}:", + ] + for name in unknown: + suggestions = get_close_matches( + name, list(variables_by_name), n=1, cutoff=0.7 + ) + suggestion = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + lines.append(f" - '{name}'{suggestion}") + raise ValueError("\n".join(lines)) + + return by_entity diff --git a/src/policyengine/tax_benefit_models/common/model_version.py b/src/policyengine/tax_benefit_models/common/model_version.py new file mode 100644 index 00000000..dc5d44d8 --- /dev/null +++ b/src/policyengine/tax_benefit_models/common/model_version.py @@ -0,0 +1,257 @@ +"""Base class for country ``TaxBenefitModelVersion`` implementations. + +The US and UK model-version classes share roughly 300 lines of loading logic: +manifest certification, the variable-copy loop over the country ``system``, +the parameter-copy loop, entity-relationship construction, and simple +``save`` / ``load`` passthroughs. Only ``run`` (and the country-specific +``managed_microsimulation`` helper) diverge enough to warrant per-country +implementations. + +This module extracts the shared behaviour into ``MicrosimulationModelVersion``. +Country subclasses declare class-level metadata (``country_code``, +``package_name``, ``group_entities``, ``entity_variables``) and override a +handful of thin hooks (``_load_system``, ``_load_region_registry``, +``_dataset_class``, ``run``). +""" + +from __future__ import annotations + +import datetime +import os +import warnings +from importlib import metadata +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Optional + +import pandas as pd + +from policyengine.core import ( + Parameter, + ParameterNode, + TaxBenefitModelVersion, + Variable, +) +from policyengine.provenance.manifest import ( + certify_data_release_compatibility, + get_release_manifest, +) +from policyengine.utils.entity_utils import build_entity_relationships +from policyengine.utils.parameter_labels import ( + build_scale_lookup, + generate_label_for_parameter, +) + +if TYPE_CHECKING: + from policyengine.core.simulation import Simulation + + +class MicrosimulationModelVersion(TaxBenefitModelVersion): + """Shared init / save / load logic for country microsim model versions. + + Subclasses must set the four class attributes below and implement the + country-specific hooks. ``run`` is intentionally left abstract: its + country-specific logic (reform application, simulation builder, output + post-processing) varies enough that a shared skeleton would hide real + divergences. + """ + + # --- Subclass metadata ------------------------------------------------- + country_code: ClassVar[str] = "" + """ISO-ish country identifier used by the release manifest ("us"/"uk").""" + + package_name: ClassVar[str] = "" + """Distribution name used with ``importlib.metadata.version``.""" + + group_entities: ClassVar[list[str]] = [] + """Group entities (non-person) for this country, in dataset order.""" + + entity_variables: dict[str, list[str]] = {} + """Variables to materialise per entity when writing output datasets.""" + + # --- Construction ------------------------------------------------------ + def __init__(self, **kwargs: Any) -> None: + if not self.country_code or not self.package_name: + raise RuntimeError( + f"{type(self).__name__} must declare country_code and " + "package_name class attributes" + ) + + manifest = get_release_manifest(self.country_code) + if kwargs.get("version") is None: + kwargs["version"] = manifest.model_package.version + + installed_model_version = metadata.version(self.package_name) + if installed_model_version != manifest.model_package.version: + warnings.warn( + f"Installed {self.package_name} version " + f"({installed_model_version}) does not match the bundled " + "policyengine.py manifest " + f"({manifest.model_package.version}). Calculations will " + "run against the installed version, but dataset " + "compatibility is not guaranteed. To silence this " + "warning, install the version pinned by the manifest.", + UserWarning, + stacklevel=2, + ) + + model_build_metadata = self._get_runtime_data_build_metadata() + data_certification = certify_data_release_compatibility( + self.country_code, + runtime_model_version=installed_model_version, + runtime_data_build_fingerprint=model_build_metadata.get( + "data_build_fingerprint" + ), + ) + + super().__init__(**kwargs) + self.release_manifest = manifest + self.model_package = manifest.model_package + self.data_package = manifest.data_package + self.default_dataset_uri = manifest.default_dataset_uri + self.data_certification = data_certification + self.region_registry = self._load_region_registry() + self.id = f"{self.model.id}@{self.version}" + + system = self._load_system() + self._populate_variables(system) + self._populate_parameters(system) + + # --- Hooks ------------------------------------------------------------ + @classmethod + def _get_runtime_data_build_metadata(cls) -> dict[str, Optional[str]]: + """Return build metadata from the country package, if available.""" + raise NotImplementedError + + def _load_system(self): + """Return the country package's ``system`` object.""" + raise NotImplementedError + + def _load_region_registry(self): + """Return the country's ``RegionRegistry``.""" + raise NotImplementedError + + @property + def _dataset_class(self): + """Return the country's ``PolicyEngine{Country}Dataset`` class.""" + raise NotImplementedError + + # --- Shared loading helpers ------------------------------------------ + def _populate_variables(self, system) -> None: + from policyengine_core.enums import Enum + from policyengine_core.parameters.operations.get_parameter import ( + get_parameter, + ) + + for var_obj in system.variables.values(): + default_val = var_obj.default_value + if var_obj.value_type is Enum: + default_val = default_val.name + elif var_obj.value_type is datetime.date: + default_val = default_val.isoformat() + + variable = Variable( + id=self.id + "-" + var_obj.name, + name=var_obj.name, + label=getattr(var_obj, "label", None), + tax_benefit_model_version=self, + entity=var_obj.entity.key, + description=var_obj.documentation, + data_type=( + var_obj.value_type if var_obj.value_type is not Enum else str + ), + default_value=default_val, + value_type=var_obj.value_type, + ) + if ( + hasattr(var_obj, "possible_values") + and var_obj.possible_values is not None + ): + variable.possible_values = list( + map( + lambda x: x.name, + var_obj.possible_values._value2member_map_.values(), + ) + ) + # Resolve parameter-path adds/subtracts to concrete lists so + # consumers always see list[str]. + for attr in ("adds", "subtracts"): + value = getattr(var_obj, attr, None) + if value is None: + continue + if isinstance(value, str): + try: + param = get_parameter(system.parameters, value) + setattr(variable, attr, list(param("2025-01-01"))) + except Exception: + setattr(variable, attr, None) + else: + setattr(variable, attr, value) + self.add_variable(variable) + + def _populate_parameters(self, system) -> None: + from policyengine_core.parameters import Parameter as CoreParameter + from policyengine_core.parameters import ParameterNode as CoreParameterNode + + scale_lookup = build_scale_lookup(system) + + for param_node in system.parameters.get_descendants(): + if isinstance(param_node, CoreParameter): + parameter = Parameter( + id=self.id + "-" + param_node.name, + name=param_node.name, + label=generate_label_for_parameter( + param_node, system, scale_lookup + ), + tax_benefit_model_version=self, + description=param_node.description, + data_type=type(param_node(2025)), + unit=param_node.metadata.get("unit"), + _core_param=param_node, + ) + self.add_parameter(parameter) + elif isinstance(param_node, CoreParameterNode): + node = ParameterNode( + id=self.id + "-" + param_node.name, + name=param_node.name, + label=param_node.metadata.get("label"), + description=param_node.description, + tax_benefit_model_version=self, + ) + self.add_parameter_node(node) + + # --- Shared run-surface helpers -------------------------------------- + def _build_entity_relationships(self, dataset) -> pd.DataFrame: + """Build a DataFrame mapping each person to their containing entities.""" + person_data = pd.DataFrame(dataset.data.person) + return build_entity_relationships(person_data, self.group_entities) + + def save(self, simulation: Simulation) -> None: + """Persist the simulation's output dataset to its bundled filepath.""" + simulation.output_dataset.save() + + def load(self, simulation: Simulation) -> None: + """Rehydrate the simulation's output dataset from disk. + + Loads timestamps from filesystem metadata when the file exists so + serialised simulations round-trip ``created_at``/``updated_at``. + """ + filepath = str( + Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") + ) + + simulation.output_dataset = self._dataset_class( + id=simulation.id, + name=simulation.dataset.name, + description=simulation.dataset.description, + filepath=filepath, + year=simulation.dataset.year, + is_output_dataset=True, + ) + + if os.path.exists(filepath): + simulation.created_at = datetime.datetime.fromtimestamp( + os.path.getctime(filepath) + ) + simulation.updated_at = datetime.datetime.fromtimestamp( + os.path.getmtime(filepath) + ) diff --git a/src/policyengine/tax_benefit_models/common/reform.py b/src/policyengine/tax_benefit_models/common/reform.py new file mode 100644 index 00000000..60b564f4 --- /dev/null +++ b/src/policyengine/tax_benefit_models/common/reform.py @@ -0,0 +1,197 @@ +"""Compile a simple reform dict into the format policyengine_core expects. + +Accepted shapes for the agent-facing API: + +.. code-block:: python + + # Scalar — applied from Jan 1 of ``year`` (the simulation year). + reform = {"gov.irs.deductions.salt.cap": 0} + + # With explicit effective date(s). + reform = {"gov.irs.deductions.salt.cap": {"2026-01-01": 0}} + + # Multiple parameters. + reform = { + "gov.irs.deductions.salt.cap": 0, + "gov.irs.credits.ctc.amount.base[0].amount": 2500, + } + +**Indexed parameters.** Many PolicyEngine parameters are *breakdown* +entries keyed by a bracket index (age group, filing status, etc.). +Their paths end with ``[N].amount`` / ``[N].threshold``. For example +the CTC base amount in 2026 is +``gov.irs.credits.ctc.amount.base[0].amount`` (not ``...base``); +the top-bracket SS wage base is ``gov.ssa.payroll.cap``. If a reform +dict uses the bracket-head path instead of ``[0].amount`` the +``ValueError`` will list the close match. + +The compiled form is ``{param_path: {period: value}}`` — exactly what +``policyengine_us.Simulation(reform=...)`` / +``policyengine_uk.Simulation(reform=...)`` accept at construction. + +Scalar reforms default to ``{year}-01-01`` so a caller running +mid-year does not accidentally get a blended partial-year result. +Unknown parameter paths raise ``ValueError`` with a close-match +suggestion; pass ``model_version`` to enable the check. +""" + +from __future__ import annotations + +import datetime +from collections.abc import Mapping +from difflib import get_close_matches +from typing import TYPE_CHECKING, Any, Optional + +if TYPE_CHECKING: + from policyengine.core.dynamic import Dynamic + from policyengine.core.policy import Policy + from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion + + +def compile_reform( + reform: Optional[Mapping[str, Any]], + *, + year: Optional[int] = None, + model_version: Optional[TaxBenefitModelVersion] = None, +) -> Optional[dict[str, dict[str, Any]]]: + """Compile a simple reform dict to the core reform-dict format. + + Args: + reform: Flat mapping from parameter path to either a scalar + (applied from ``{year}-01-01``) or a ``{effective_date: value}`` + mapping. + year: Simulation year. Used as the default effective date for + scalar values so a mid-year call still targets the whole year. + model_version: If provided, parameter paths are validated + against ``model_version.parameters_by_name`` and unknown + paths raise with a close-match suggestion. + """ + if not reform: + return None + + default_date = f"{year}-01-01" if year is not None else "1900-01-01" + + if model_version is not None: + valid = set(model_version.parameters_by_name) + unknown = [path for path in reform if path not in valid] + if unknown: + lines = [ + f"Reform contains parameter paths not defined on " + f"{model_version.model.id} {model_version.version}:", + ] + for path in unknown: + suggestions = get_close_matches(path, valid, n=1, cutoff=0.7) + hint = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + lines.append(f" - '{path}'{hint}") + raise ValueError("\n".join(lines)) + + compiled: dict[str, dict[str, Any]] = {} + for parameter_path, spec in reform.items(): + if isinstance(spec, Mapping): + compiled[parameter_path] = {str(k): v for k, v in spec.items()} + else: + compiled[parameter_path] = {default_date: spec} + return compiled + + +def _reform_dict_to_parameter_values( + reform: Mapping[str, Any], + *, + year: Optional[int], + model_version: TaxBenefitModelVersion, +) -> list: + """Compile a flat reform dict into a list of ``ParameterValue`` objects. + + Uses :func:`compile_reform` for path validation and effective-date + defaulting, then materialises each ``{path: {date: value}}`` pair + as an open-ended ``ParameterValue`` bound to a + ``Parameter(name=path, tax_benefit_model_version=model_version)``. + """ + from policyengine.core.parameter import Parameter + from policyengine.core.parameter_value import ParameterValue + + compiled = compile_reform(reform, year=year, model_version=model_version) + if compiled is None: + return [] + + parameter_values: list[ParameterValue] = [] + for path, date_to_value in compiled.items(): + for effective_date, value in date_to_value.items(): + data_type = type(value) if isinstance(value, (int, float, bool)) else float + parameter_values.append( + ParameterValue( + parameter=Parameter( + name=path, + tax_benefit_model_version=model_version, + data_type=data_type, + ), + start_date=datetime.datetime.strptime(effective_date, "%Y-%m-%d"), + end_date=None, + value=value, + ) + ) + return parameter_values + + +def _compile_reform_to( + cls, + default_name: str, + reform: Optional[Mapping[str, Any]], + *, + year: Optional[int], + model_version: TaxBenefitModelVersion, + name: Optional[str] = None, +): + parameter_values = _reform_dict_to_parameter_values( + reform or {}, year=year, model_version=model_version + ) + if not parameter_values: + return None + return cls(name=name or default_name, parameter_values=parameter_values) + + +def compile_reform_to_policy( + reform: Optional[Mapping[str, Any]], + *, + year: Optional[int], + model_version: TaxBenefitModelVersion, + name: Optional[str] = None, +) -> Optional[Policy]: + """Compile a flat reform dict into a fully-assembled ``Policy``. + + Accepts the same ``{param.path: value}`` / + ``{param.path: {date: value}}`` shape as :func:`compile_reform`, + but returns a ready-to-use ``Policy`` with ``ParameterValue`` + objects so ``Simulation(policy={...})`` works without hand-building + ``Parameter`` / ``ParameterValue``. + """ + from policyengine.core.policy import Policy + + return _compile_reform_to( + Policy, + "Reform", + reform, + year=year, + model_version=model_version, + name=name, + ) + + +def compile_reform_to_dynamic( + reform: Optional[Mapping[str, Any]], + *, + year: Optional[int], + model_version: TaxBenefitModelVersion, + name: Optional[str] = None, +) -> Optional[Dynamic]: + """``Dynamic`` counterpart of :func:`compile_reform_to_policy`.""" + from policyengine.core.dynamic import Dynamic + + return _compile_reform_to( + Dynamic, + "Dynamic response", + reform, + year=year, + model_version=model_version, + name=name, + ) diff --git a/src/policyengine/tax_benefit_models/common/result.py b/src/policyengine/tax_benefit_models/common/result.py new file mode 100644 index 00000000..e73fa406 --- /dev/null +++ b/src/policyengine/tax_benefit_models/common/result.py @@ -0,0 +1,79 @@ +"""Dot-access result containers returned by ``calculate_household``. + +A result is intentionally thin: it's a ``dict`` subclass that also +supports attribute access, so callers can write either +``result.tax_unit.income_tax`` or ``result["tax_unit"]["income_tax"]``. +The dict shape keeps JSON serialization trivial. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Union + + +class EntityResult(dict): + """One entity's computed variables with dict AND attribute access. + + Raises :class:`AttributeError` with the list of available variables + when a caller accesses an unknown name, so typos surface a + paste-able fix instead of silently returning ``None``. + """ + + def __getattr__(self, name: str) -> Any: + if name.startswith("_"): + raise AttributeError(name) + if name in self: + return self[name] + available = ", ".join(sorted(self)) + raise AttributeError( + f"entity has no variable '{name}'. Available: {available}. " + f"Pass extra_variables=['{name}'] to calculate_household if " + f"'{name}' is a valid variable on the country model that is " + f"not in the default output columns." + ) + + def __setattr__(self, name: str, value: Any) -> None: # pragma: no cover + self[name] = value + + +class HouseholdResult(dict): + """Full household calculation result; one key per entity. + + Singleton entities (``household``, ``tax_unit``, ``benunit``, ...) + map to a single :class:`EntityResult`; multi-member entities (like + ``person``) map to a ``list[EntityResult]``. + """ + + def __getattr__(self, name: str) -> Any: + if name.startswith("_"): + raise AttributeError(name) + if name in self: + return self[name] + available = ", ".join(sorted(self)) + raise AttributeError( + f"no entity '{name}' on this result. Available entities: {available}" + ) + + def __setattr__(self, name: str, value: Any) -> None: # pragma: no cover + self[name] = value + + def to_dict(self) -> dict[str, Any]: + """Return a plain ``dict[str, Any]`` copy suitable for JSON dumps.""" + + def _convert(value: Any) -> Any: + if isinstance(value, EntityResult): + return dict(value) + if isinstance(value, list): + return [_convert(v) for v in value] + return value + + return {key: _convert(val) for key, val in self.items()} + + def write(self, path: Union[str, Path]) -> Path: + """Write the result to a JSON file and return the path.""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(self.to_dict(), indent=2) + "\n") + return path diff --git a/src/policyengine/tax_benefit_models/uk/__init__.py b/src/policyengine/tax_benefit_models/uk/__init__.py index 93533245..3ab098e2 100644 --- a/src/policyengine/tax_benefit_models/uk/__init__.py +++ b/src/policyengine/tax_benefit_models/uk/__init__.py @@ -1,16 +1,23 @@ -"""PolicyEngine UK tax-benefit model.""" +"""PolicyEngine UK tax-benefit model. + +.. code-block:: python + + import policyengine as pe + + result = pe.uk.calculate_household( + people=[{"age": 30, "employment_income": 50000}], + year=2026, + ) + print(result.person[0].income_tax, result.household.hbai_household_net_income) +""" from importlib.util import find_spec if find_spec("policyengine_uk") is not None: from policyengine.core import Dataset + from policyengine.outputs import ProgramStatistics - from .analysis import ( - UKHouseholdInput, - UKHouseholdOutput, - calculate_household_impact, - economic_impact_analysis, - ) + from .analysis import economic_impact_analysis from .datasets import ( PolicyEngineUKDataset, UKYearData, @@ -18,21 +25,22 @@ ensure_datasets, load_datasets, ) + from .household import calculate_household from .model import ( PolicyEngineUK, PolicyEngineUKLatest, managed_microsimulation, uk_latest, - uk_model, ) - from .outputs import ProgrammeStatistics - # Rebuild Pydantic models to resolve forward references + model = uk_latest + """The pinned UK ``TaxBenefitModelVersion`` for this policyengine release.""" + Dataset.model_rebuild() UKYearData.model_rebuild() PolicyEngineUKDataset.model_rebuild() PolicyEngineUKLatest.model_rebuild() - ProgrammeStatistics.model_rebuild() + ProgramStatistics.model_rebuild() __all__ = [ "UKYearData", @@ -43,13 +51,11 @@ "PolicyEngineUK", "PolicyEngineUKLatest", "managed_microsimulation", - "uk_model", + "model", "uk_latest", + "calculate_household", "economic_impact_analysis", - "calculate_household_impact", - "UKHouseholdInput", - "UKHouseholdOutput", - "ProgrammeStatistics", + "ProgramStatistics", ] else: __all__ = [] diff --git a/src/policyengine/tax_benefit_models/uk/analysis.py b/src/policyengine/tax_benefit_models/uk/analysis.py index b05e21b0..f37d18be 100644 --- a/src/policyengine/tax_benefit_models/uk/analysis.py +++ b/src/policyengine/tax_benefit_models/uk/analysis.py @@ -1,15 +1,16 @@ -"""General utility functions for UK policy reform analysis.""" +"""Microsimulation reform analysis for the UK model. -import tempfile -from pathlib import Path -from typing import Any, Optional +The single-household calculator lives in :mod:`.household`; this module +holds the population-level reform-analysis helpers. +""" + +from __future__ import annotations import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel, Field +from pydantic import BaseModel from policyengine.core import OutputCollection, Simulation -from policyengine.core.policy import Policy +from policyengine.outputs import ProgramStatistics from policyengine.outputs.decile_impact import ( DecileImpact, calculate_decile_impacts, @@ -23,140 +24,12 @@ calculate_uk_poverty_rates, ) -from .datasets import PolicyEngineUKDataset, UKYearData -from .model import uk_latest -from .outputs import ProgrammeStatistics - - -class UKHouseholdOutput(BaseModel): - """Output from a UK household calculation with all entity data.""" - - person: list[dict[str, Any]] - benunit: list[dict[str, Any]] - household: dict[str, Any] - - -class UKHouseholdInput(BaseModel): - """Input for a UK household calculation.""" - - people: list[dict[str, Any]] - benunit: dict[str, Any] = Field(default_factory=dict) - household: dict[str, Any] = Field(default_factory=dict) - year: int = 2026 - - -def calculate_household_impact( - household_input: UKHouseholdInput, - policy: Optional[Policy] = None, -) -> UKHouseholdOutput: - """Calculate tax and benefit impacts for a single UK household.""" - n_people = len(household_input.people) - - # Build person data with defaults - person_data = { - "person_id": list(range(n_people)), - "person_benunit_id": [0] * n_people, - "person_household_id": [0] * n_people, - "person_weight": [1.0] * n_people, - } - # Add user-provided person fields - for i, person in enumerate(household_input.people): - for key, value in person.items(): - if key not in person_data: - person_data[key] = [0.0] * n_people # Default to 0 for numeric fields - person_data[key][i] = value - - # Build benunit data with defaults - benunit_data = { - "benunit_id": [0], - "benunit_weight": [1.0], - } - for key, value in household_input.benunit.items(): - benunit_data[key] = [value] - - # Build household data with defaults (required for uprating) - household_data = { - "household_id": [0], - "household_weight": [1.0], - "region": ["LONDON"], - "tenure_type": ["RENT_PRIVATELY"], - "council_tax": [0.0], - "rent": [0.0], - } - for key, value in household_input.household.items(): - household_data[key] = [value] - - # Create MicroDataFrames - person_df = MicroDataFrame(pd.DataFrame(person_data), weights="person_weight") - benunit_df = MicroDataFrame(pd.DataFrame(benunit_data), weights="benunit_weight") - household_df = MicroDataFrame( - pd.DataFrame(household_data), weights="household_weight" - ) - - # Create temporary dataset - tmpdir = tempfile.mkdtemp() - filepath = str(Path(tmpdir) / "household_impact.h5") - - dataset = PolicyEngineUKDataset( - name="Household impact calculation", - description="Single household for impact calculation", - filepath=filepath, - year=household_input.year, - data=UKYearData( - person=person_df, - benunit=benunit_df, - household=household_df, - ), - ) - - # Run simulation - simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=uk_latest, - policy=policy, - ) - simulation.run() - - # Extract all output variables defined in entity_variables - output_data = simulation.output_dataset.data - - def safe_convert(value): - """Convert value to float if numeric, otherwise return as string.""" - try: - return float(value) - except (ValueError, TypeError): - return str(value) - - person_outputs = [] - for i in range(n_people): - person_dict = {} - for var in uk_latest.entity_variables["person"]: - person_dict[var] = safe_convert(output_data.person[var].iloc[i]) - person_outputs.append(person_dict) - - benunit_outputs = [] - for i in range(len(output_data.benunit)): - benunit_dict = {} - for var in uk_latest.entity_variables["benunit"]: - benunit_dict[var] = safe_convert(output_data.benunit[var].iloc[i]) - benunit_outputs.append(benunit_dict) - - household_dict = {} - for var in uk_latest.entity_variables["household"]: - household_dict[var] = safe_convert(output_data.household[var].iloc[0]) - - return UKHouseholdOutput( - person=person_outputs, - benunit=benunit_outputs, - household=household_dict, - ) - class PolicyReformAnalysis(BaseModel): """Complete policy reform analysis result.""" decile_impacts: OutputCollection[DecileImpact] - programme_statistics: OutputCollection[ProgrammeStatistics] + program_statistics: OutputCollection[ProgramStatistics] baseline_poverty: OutputCollection[Poverty] reform_poverty: OutputCollection[Poverty] baseline_inequality: Inequality @@ -167,11 +40,7 @@ def economic_impact_analysis( baseline_simulation: Simulation, reform_simulation: Simulation, ) -> PolicyReformAnalysis: - """Perform comprehensive analysis of a policy reform. - - Returns: - PolicyReformAnalysis containing decile impacts and programme statistics - """ + """Perform comprehensive analysis of a UK policy reform.""" baseline_simulation.ensure() reform_simulation.ensure() @@ -182,20 +51,16 @@ def economic_impact_analysis( "Reform simulation must have more than 100 households" ) - # Decile impact decile_impacts = calculate_decile_impacts( baseline_simulation=baseline_simulation, reform_simulation=reform_simulation, ) - # Major programmes to analyse - programmes = { - # Tax + programs = { "income_tax": {"is_tax": True}, "national_insurance": {"is_tax": True}, "vat": {"is_tax": True}, "council_tax": {"is_tax": True}, - # Benefits "universal_credit": {"is_tax": False}, "child_benefit": {"is_tax": False}, "pension_credit": {"is_tax": False}, @@ -204,31 +69,27 @@ def economic_impact_analysis( "child_tax_credit": {"is_tax": False}, } - programme_statistics = [] - - for programme_name, programme_info in programmes.items(): + program_statistics = [] + for program_name, program_info in programs.items(): entity = baseline_simulation.tax_benefit_model_version.get_variable( - programme_name + program_name ).entity - is_tax = programme_info["is_tax"] - - stats = ProgrammeStatistics( + stats = ProgramStatistics( baseline_simulation=baseline_simulation, reform_simulation=reform_simulation, - programme_name=programme_name, + program_name=program_name, entity=entity, - is_tax=is_tax, + is_tax=program_info["is_tax"], ) stats.run() - programme_statistics.append(stats) + program_statistics.append(stats) - # Create DataFrame - programme_df = pd.DataFrame( + program_df = pd.DataFrame( [ { "baseline_simulation_id": p.baseline_simulation.id, "reform_simulation_id": p.reform_simulation.id, - "programme_name": p.programme_name, + "program_name": p.program_name, "entity": p.entity, "is_tax": p.is_tax, "baseline_total": p.baseline_total, @@ -239,25 +100,21 @@ def economic_impact_analysis( "winners": p.winners, "losers": p.losers, } - for p in programme_statistics + for p in program_statistics ] ) - - programme_collection = OutputCollection( - outputs=programme_statistics, dataframe=programme_df + program_collection = OutputCollection( + outputs=program_statistics, dataframe=program_df ) - # Calculate poverty rates for both simulations baseline_poverty = calculate_uk_poverty_rates(baseline_simulation) reform_poverty = calculate_uk_poverty_rates(reform_simulation) - - # Calculate inequality for both simulations baseline_inequality = calculate_uk_inequality(baseline_simulation) reform_inequality = calculate_uk_inequality(reform_simulation) return PolicyReformAnalysis( decile_impacts=decile_impacts, - programme_statistics=programme_collection, + program_statistics=program_collection, baseline_poverty=baseline_poverty, reform_poverty=reform_poverty, baseline_inequality=baseline_inequality, diff --git a/src/policyengine/tax_benefit_models/uk/datasets.py b/src/policyengine/tax_benefit_models/uk/datasets.py index 47f78403..e7207da7 100644 --- a/src/policyengine/tax_benefit_models/uk/datasets.py +++ b/src/policyengine/tax_benefit_models/uk/datasets.py @@ -6,7 +6,7 @@ from pydantic import ConfigDict from policyengine.core import Dataset, YearData -from policyengine.core.release_manifest import ( +from policyengine.provenance.manifest import ( dataset_logical_name, resolve_dataset_reference, ) diff --git a/src/policyengine/tax_benefit_models/uk/household.py b/src/policyengine/tax_benefit_models/uk/household.py new file mode 100644 index 00000000..5dbd71bb --- /dev/null +++ b/src/policyengine/tax_benefit_models/uk/household.py @@ -0,0 +1,191 @@ +"""Single-household calculation for the UK model. + +.. code-block:: python + + import policyengine as pe + + # Lone parent + one child, £30k wages. + result = pe.uk.calculate_household( + people=[ + {"age": 32, "employment_income": 30000}, + {"age": 6}, + ], + benunit={"would_claim_child_benefit": True}, + year=2026, + ) + print(result.person[0].income_tax) + print(result.benunit.child_benefit) + print(result.household.hbai_household_net_income) +""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any, Optional + +from policyengine.tax_benefit_models.common import ( + EntityResult, + HouseholdResult, + compile_reform, + dispatch_extra_variables, +) +from policyengine.utils.household_validation import validate_household_input + +from .model import uk_latest + + +def _default_output_columns( + extra_by_entity: Mapping[str, list[str]], +) -> dict[str, list[str]]: + merged: dict[str, list[str]] = {} + for entity, defaults in uk_latest.entity_variables.items(): + columns = list(defaults) + for extra in extra_by_entity.get(entity, []): + if extra not in columns: + columns.append(extra) + merged[entity] = columns + for entity, extras in extra_by_entity.items(): + merged.setdefault(entity, list(extras)) + return merged + + +def _safe_convert(value: Any) -> Any: + try: + return float(value) + except (ValueError, TypeError): + return str(value) if value is not None else None + + +def _build_situation( + *, + people: list[Mapping[str, Any]], + benunit: Mapping[str, Any], + household: Mapping[str, Any], + year: int, +) -> dict[str, Any]: + year_str = str(year) + + def _periodise(spec: Mapping[str, Any]) -> dict[str, dict[str, Any]]: + return {key: {year_str: value} for key, value in spec.items() if key != "id"} + + person_ids = [f"person_{i}" for i in range(len(people))] + persons = {pid: _periodise(person) for pid, person in zip(person_ids, people)} + + def _group(spec: Mapping[str, Any]) -> dict[str, Any]: + return {"members": list(person_ids), **_periodise(spec)} + + return { + "people": persons, + "benunits": {"benunit_0": _group(benunit)}, + "households": {"household_0": _group(household)}, + } + + +_ALLOWED_KWARGS = frozenset( + {"people", "benunit", "household", "year", "reform", "extra_variables"} +) + + +def _raise_unexpected_kwargs(unexpected: Mapping[str, Any]) -> None: + from difflib import get_close_matches + + lines = ["calculate_household received unsupported keyword arguments:"] + for name in unexpected: + suggestions = get_close_matches(name, _ALLOWED_KWARGS, n=1, cutoff=0.5) + hint = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + if name in {"tax_unit", "marital_unit", "family", "spm_unit"}: + hint = ( + f" — `{name}` is US-only; the UK groups persons into a single `benunit`" + ) + lines.append(f" - '{name}'{hint}") + lines.append( + "Valid kwargs: people, benunit, household, year, reform, extra_variables." + ) + raise TypeError("\n".join(lines)) + + +def calculate_household( + *, + people: list[Mapping[str, Any]], + benunit: Optional[Mapping[str, Any]] = None, + household: Optional[Mapping[str, Any]] = None, + year: int = 2026, + reform: Optional[Mapping[str, Any]] = None, + extra_variables: Optional[list[str]] = None, + **unexpected: Any, +) -> HouseholdResult: + """Compute tax and benefit variables for a single UK household. + + Args: + people: One dict per person (keys are UK variable names). + Must be non-empty. + benunit, household: Optional per-entity overrides. + year: Calendar year. Defaults to 2026. + reform: Optional reform dict. Scalar values default to + ``{year}-01-01``; invalid parameter paths raise with a + close-match suggestion. + extra_variables: Flat list of extra UK variables to compute; + the library dispatches each to its entity. + + Returns: + :class:`HouseholdResult` with dot-accessible entity results. + + Raises: + ValueError: on unknown or mis-placed variable names, or + unknown reform parameter paths. + TypeError: on US-only kwargs (``tax_unit``, etc.) or other + unsupported keyword arguments. + """ + if unexpected: + _raise_unexpected_kwargs(unexpected) + + from policyengine_uk import Simulation + + people = list(people) + benunit_dict = dict(benunit or {}) + household_dict = dict(household or {}) + + validate_household_input( + model_version=uk_latest, + entities={ + "person": people, + "benunit": [benunit_dict], + "household": [household_dict], + }, + ) + + extra_by_entity = dispatch_extra_variables( + model_version=uk_latest, + names=extra_variables or [], + ) + output_columns = _default_output_columns(extra_by_entity) + reform_dict = compile_reform(reform, year=year, model_version=uk_latest) + + simulation = Simulation( + situation=_build_situation( + people=people, + benunit=benunit_dict, + household=household_dict, + year=year, + ), + reform=reform_dict, + ) + + result = HouseholdResult() + for entity, columns in output_columns.items(): + raw = { + variable: list(simulation.calculate(variable, period=year, map_to=entity)) + for variable in columns + } + if entity == "person": + result["person"] = [ + EntityResult( + {variable: _safe_convert(raw[variable][i]) for variable in columns} + ) + for i in range(len(people)) + ] + else: + result[entity] = EntityResult( + {variable: _safe_convert(raw[variable][0]) for variable in columns} + ) + return result diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index ce6f2dd9..67e7a3ae 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -1,31 +1,17 @@ import datetime -import warnings -from importlib import metadata from pathlib import Path from typing import TYPE_CHECKING, Optional import pandas as pd from microdf import MicroDataFrame -from policyengine.core import ( - Parameter, - ParameterNode, - TaxBenefitModel, - TaxBenefitModelVersion, - Variable, -) -from policyengine.core.release_manifest import ( - certify_data_release_compatibility, +from policyengine.core import TaxBenefitModel +from policyengine.provenance.manifest import ( dataset_logical_name, - get_release_manifest, resolve_local_managed_dataset_source, resolve_managed_dataset_reference, ) -from policyengine.utils.entity_utils import build_entity_relationships -from policyengine.utils.parameter_labels import ( - build_scale_lookup, - generate_label_for_parameter, -) +from policyengine.tax_benefit_models.common import MicrosimulationModelVersion from .datasets import PolicyEngineUKDataset, UKYearData @@ -43,18 +29,11 @@ class PolicyEngineUK(TaxBenefitModel): uk_model = PolicyEngineUK() -def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: - try: - from policyengine_uk.build_metadata import get_data_build_metadata - except ModuleNotFoundError as exc: - if exc.name != "policyengine_uk.build_metadata": - raise - return {} - - return get_data_build_metadata() or {} - +class PolicyEngineUKLatest(MicrosimulationModelVersion): + country_code = "uk" + package_name = "policyengine-uk" + group_entities = UK_GROUP_ENTITIES -class PolicyEngineUKLatest(TaxBenefitModelVersion): model: TaxBenefitModel = uk_model version: str = None created_at: datetime.datetime = None @@ -137,147 +116,32 @@ class PolicyEngineUKLatest(TaxBenefitModelVersion): ], } - def __init__(self, **kwargs: dict): - manifest = get_release_manifest("uk") - if "version" not in kwargs or kwargs.get("version") is None: - kwargs["version"] = manifest.model_package.version - - installed_model_version = metadata.version("policyengine-uk") - if installed_model_version != manifest.model_package.version: - warnings.warn( - "Installed policyengine-uk version " - f"({installed_model_version}) does not match the bundled " - "policyengine.py manifest " - f"({manifest.model_package.version}). Calculations will " - "run against the installed version, but dataset " - "compatibility is not guaranteed. To silence this " - "warning, install the version pinned by the manifest.", - UserWarning, - stacklevel=2, - ) - - model_build_metadata = _get_runtime_data_build_metadata() - data_certification = certify_data_release_compatibility( - "uk", - runtime_model_version=installed_model_version, - runtime_data_build_fingerprint=model_build_metadata.get( - "data_build_fingerprint" - ), - ) - - super().__init__(**kwargs) - self.release_manifest = manifest - self.model_package = manifest.model_package - self.data_package = manifest.data_package - self.default_dataset_uri = manifest.default_dataset_uri - self.data_certification = data_certification - from policyengine_core.enums import Enum + # --- Hooks ----------------------------------------------------------- + @classmethod + def _get_runtime_data_build_metadata(cls) -> dict[str, Optional[str]]: + try: + from policyengine_uk.build_metadata import get_data_build_metadata + except ModuleNotFoundError as exc: + if exc.name != "policyengine_uk.build_metadata": + raise + return {} + return get_data_build_metadata() or {} + + def _load_system(self): from policyengine_uk.system import system - # Attach region registry + return system + + def _load_region_registry(self): from policyengine.countries.uk.regions import uk_region_registry - self.region_registry = uk_region_registry - - self.id = f"{self.model.id}@{self.version}" - - for var_obj in system.variables.values(): - # Serialize default_value for JSON compatibility - default_val = var_obj.default_value - if var_obj.value_type is Enum: - default_val = default_val.name - elif var_obj.value_type is datetime.date: - default_val = default_val.isoformat() - - variable = Variable( - id=self.id + "-" + var_obj.name, - name=var_obj.name, - label=getattr(var_obj, "label", None), - tax_benefit_model_version=self, - entity=var_obj.entity.key, - description=var_obj.documentation, - data_type=var_obj.value_type if var_obj.value_type is not Enum else str, - default_value=default_val, - value_type=var_obj.value_type, - ) - if ( - hasattr(var_obj, "possible_values") - and var_obj.possible_values is not None - ): - variable.possible_values = list( - map( - lambda x: x.name, - var_obj.possible_values._value2member_map_.values(), - ) - ) - # Extract and resolve adds/subtracts. - # Core stores these as either list[str] or a parameter path string. - # Resolve parameter paths to lists so consumers always get list[str]. - if hasattr(var_obj, "adds") and var_obj.adds is not None: - if isinstance(var_obj.adds, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.adds) - variable.adds = list(param("2025-01-01")) - except (ValueError, Exception): - variable.adds = None - else: - variable.adds = var_obj.adds - if hasattr(var_obj, "subtracts") and var_obj.subtracts is not None: - if isinstance(var_obj.subtracts, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.subtracts) - variable.subtracts = list(param("2025-01-01")) - except (ValueError, Exception): - variable.subtracts = None - else: - variable.subtracts = var_obj.subtracts - self.add_variable(variable) - - from policyengine_core.parameters import Parameter as CoreParameter - from policyengine_core.parameters import ParameterNode as CoreParameterNode - - scale_lookup = build_scale_lookup(system) - - for param_node in system.parameters.get_descendants(): - if isinstance(param_node, CoreParameter): - parameter = Parameter( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=generate_label_for_parameter( - param_node, system, scale_lookup - ), - tax_benefit_model_version=self, - description=param_node.description, - data_type=type(param_node(2025)), - unit=param_node.metadata.get("unit"), - _core_param=param_node, - ) - self.add_parameter(parameter) - elif isinstance(param_node, CoreParameterNode): - node = ParameterNode( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=param_node.metadata.get("label"), - description=param_node.description, - tax_benefit_model_version=self, - ) - self.add_parameter_node(node) - - def _build_entity_relationships( - self, dataset: PolicyEngineUKDataset - ) -> pd.DataFrame: - """Build a DataFrame mapping each person to their containing entities.""" - person_data = pd.DataFrame(dataset.data.person) - return build_entity_relationships(person_data, UK_GROUP_ENTITIES) + return uk_region_registry + + @property + def _dataset_class(self): + return PolicyEngineUKDataset + # --- run ------------------------------------------------------------- def run(self, simulation: "Simulation") -> "Simulation": from policyengine_uk import Microsimulation from policyengine_uk.data import UKSingleYearDataset @@ -370,36 +234,6 @@ def run(self, simulation: "Simulation") -> "Simulation": ), ) - def save(self, simulation: "Simulation"): - """Save the simulation's output dataset.""" - simulation.output_dataset.save() - - def load(self, simulation: "Simulation"): - """Load the simulation's output dataset.""" - import os - - filepath = str( - Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") - ) - - simulation.output_dataset = PolicyEngineUKDataset( - id=simulation.id, - name=simulation.dataset.name, - description=simulation.dataset.description, - filepath=filepath, - year=simulation.dataset.year, - is_output_dataset=True, - ) - - # Load timestamps from file system metadata - if os.path.exists(filepath): - simulation.created_at = datetime.datetime.fromtimestamp( - os.path.getctime(filepath) - ) - simulation.updated_at = datetime.datetime.fromtimestamp( - os.path.getmtime(filepath) - ) - def _managed_release_bundle( dataset_uri: str, @@ -423,8 +257,8 @@ def managed_microsimulation( """Construct a country-package Microsimulation pinned to this bundle. By default this enforces the dataset selection from the bundled - `policyengine.py` release manifest. Arbitrary dataset URIs require - `allow_unmanaged=True`. + ``policyengine.py`` release manifest. Arbitrary dataset URIs require + ``allow_unmanaged=True``. """ from policyengine_uk import Microsimulation diff --git a/src/policyengine/tax_benefit_models/uk/outputs.py b/src/policyengine/tax_benefit_models/uk/outputs.py deleted file mode 100644 index 97032a9c..00000000 --- a/src/policyengine/tax_benefit_models/uk/outputs.py +++ /dev/null @@ -1,105 +0,0 @@ -"""UK-specific output templates.""" - -from typing import Optional - -from pydantic import ConfigDict - -from policyengine.core import Output, Simulation -from policyengine.outputs.aggregate import Aggregate, AggregateType -from policyengine.outputs.change_aggregate import ( - ChangeAggregate, - ChangeAggregateType, -) - - -class ProgrammeStatistics(Output): - """Single programme's statistics from a policy reform - represents one database row.""" - - model_config = ConfigDict(arbitrary_types_allowed=True) - - baseline_simulation: Simulation - reform_simulation: Simulation - programme_name: str - entity: str - is_tax: bool = False - - # Results populated by run() - baseline_total: Optional[float] = None - reform_total: Optional[float] = None - change: Optional[float] = None - baseline_count: Optional[float] = None - reform_count: Optional[float] = None - winners: Optional[float] = None - losers: Optional[float] = None - - def run(self): - """Calculate statistics for this programme.""" - # Baseline totals - baseline_total = Aggregate( - simulation=self.baseline_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.SUM, - entity=self.entity, - ) - baseline_total.run() - - # Reform totals - reform_total = Aggregate( - simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.SUM, - entity=self.entity, - ) - reform_total.run() - - # Count of recipients/payers (baseline) - baseline_count = Aggregate( - simulation=self.baseline_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.COUNT, - entity=self.entity, - filter_variable=self.programme_name, - filter_variable_geq=0.01, - ) - baseline_count.run() - - # Count of recipients/payers (reform) - reform_count = Aggregate( - simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=AggregateType.COUNT, - entity=self.entity, - filter_variable=self.programme_name, - filter_variable_geq=0.01, - ) - reform_count.run() - - # Winners and losers - winners = ChangeAggregate( - baseline_simulation=self.baseline_simulation, - reform_simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=ChangeAggregateType.COUNT, - entity=self.entity, - change_geq=0.01 if not self.is_tax else -0.01, - ) - winners.run() - - losers = ChangeAggregate( - baseline_simulation=self.baseline_simulation, - reform_simulation=self.reform_simulation, - variable=self.programme_name, - aggregate_type=ChangeAggregateType.COUNT, - entity=self.entity, - change_leq=-0.01 if not self.is_tax else 0.01, - ) - losers.run() - - # Populate results - self.baseline_total = float(baseline_total.result) - self.reform_total = float(reform_total.result) - self.change = float(reform_total.result - baseline_total.result) - self.baseline_count = float(baseline_count.result) - self.reform_count = float(reform_count.result) - self.winners = float(winners.result) - self.losers = float(losers.result) diff --git a/src/policyengine/tax_benefit_models/us/__init__.py b/src/policyengine/tax_benefit_models/us/__init__.py index 75d2aa79..d49d46d4 100644 --- a/src/policyengine/tax_benefit_models/us/__init__.py +++ b/src/policyengine/tax_benefit_models/us/__init__.py @@ -1,16 +1,36 @@ -"""PolicyEngine US tax-benefit model.""" +"""PolicyEngine US tax-benefit model. + +Typical usage (fresh session, no other imports required): + +.. code-block:: python + + import policyengine as pe + + # Household calculator. + result = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60000}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + ) + print(result.tax_unit.income_tax) + + # Reform + extra variables. + reformed = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60000}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + reform={"gov.irs.credits.ctc.amount.adult_dependent": 1000}, + extra_variables=["adjusted_gross_income"], + ) +""" from importlib.util import find_spec if find_spec("policyengine_us") is not None: from policyengine.core import Dataset + from policyengine.outputs import ProgramStatistics - from .analysis import ( - USHouseholdInput, - USHouseholdOutput, - calculate_household_impact, - economic_impact_analysis, - ) + from .analysis import economic_impact_analysis from .datasets import ( PolicyEngineUSDataset, USYearData, @@ -18,16 +38,17 @@ ensure_datasets, load_datasets, ) + from .household import calculate_household from .model import ( PolicyEngineUS, PolicyEngineUSLatest, managed_microsimulation, us_latest, - us_model, ) - from .outputs import ProgramStatistics - # Rebuild Pydantic models to resolve forward references + model = us_latest + """The pinned US ``TaxBenefitModelVersion`` for this policyengine release.""" + Dataset.model_rebuild() USYearData.model_rebuild() PolicyEngineUSDataset.model_rebuild() @@ -43,12 +64,10 @@ "PolicyEngineUS", "PolicyEngineUSLatest", "managed_microsimulation", - "us_model", + "model", "us_latest", + "calculate_household", "economic_impact_analysis", - "calculate_household_impact", - "USHouseholdInput", - "USHouseholdOutput", "ProgramStatistics", ] else: diff --git a/src/policyengine/tax_benefit_models/us/analysis.py b/src/policyengine/tax_benefit_models/us/analysis.py index 122ae2af..8b3eefc8 100644 --- a/src/policyengine/tax_benefit_models/us/analysis.py +++ b/src/policyengine/tax_benefit_models/us/analysis.py @@ -1,15 +1,18 @@ -"""General utility functions for US policy reform analysis.""" +"""Microsimulation reform analysis for the US model. -import tempfile -from pathlib import Path -from typing import Any, Optional, Union +The single-household calculator lives in :mod:`.household`; this module +holds the population-level reform-analysis helpers. +""" + +from __future__ import annotations + +from typing import Union import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel, Field +from pydantic import BaseModel from policyengine.core import OutputCollection, Simulation -from policyengine.core.policy import Policy +from policyengine.outputs import ProgramStatistics from policyengine.outputs.decile_impact import ( DecileImpact, calculate_decile_impacts, @@ -24,168 +27,6 @@ calculate_us_poverty_rates, ) -from .datasets import PolicyEngineUSDataset, USYearData -from .model import us_latest -from .outputs import ProgramStatistics - - -class USHouseholdOutput(BaseModel): - """Output from a US household calculation with all entity data.""" - - person: list[dict[str, Any]] - marital_unit: list[dict[str, Any]] - family: list[dict[str, Any]] - spm_unit: list[dict[str, Any]] - tax_unit: list[dict[str, Any]] - household: dict[str, Any] - - -class USHouseholdInput(BaseModel): - """Input for a US household calculation.""" - - people: list[dict[str, Any]] - marital_unit: dict[str, Any] = Field(default_factory=dict) - family: dict[str, Any] = Field(default_factory=dict) - spm_unit: dict[str, Any] = Field(default_factory=dict) - tax_unit: dict[str, Any] = Field(default_factory=dict) - household: dict[str, Any] = Field(default_factory=dict) - year: int = 2024 - - -def calculate_household_impact( - household_input: USHouseholdInput, - policy: Optional[Policy] = None, -) -> USHouseholdOutput: - """Calculate tax and benefit impacts for a single US household.""" - n_people = len(household_input.people) - - # Build person data with defaults - person_data = { - "person_id": list(range(n_people)), - "person_household_id": [0] * n_people, - "person_marital_unit_id": [0] * n_people, - "person_family_id": [0] * n_people, - "person_spm_unit_id": [0] * n_people, - "person_tax_unit_id": [0] * n_people, - "person_weight": [1.0] * n_people, - } - # Add user-provided person fields - for i, person in enumerate(household_input.people): - for key, value in person.items(): - if key not in person_data: - person_data[key] = [0.0] * n_people # Default to 0 for numeric fields - person_data[key][i] = value - - # Build entity data with defaults - household_data = { - "household_id": [0], - "household_weight": [1.0], - } - for key, value in household_input.household.items(): - household_data[key] = [value] - - marital_unit_data = { - "marital_unit_id": [0], - "marital_unit_weight": [1.0], - } - for key, value in household_input.marital_unit.items(): - marital_unit_data[key] = [value] - - family_data = { - "family_id": [0], - "family_weight": [1.0], - } - for key, value in household_input.family.items(): - family_data[key] = [value] - - spm_unit_data = { - "spm_unit_id": [0], - "spm_unit_weight": [1.0], - } - for key, value in household_input.spm_unit.items(): - spm_unit_data[key] = [value] - - tax_unit_data = { - "tax_unit_id": [0], - "tax_unit_weight": [1.0], - } - for key, value in household_input.tax_unit.items(): - tax_unit_data[key] = [value] - - # Create MicroDataFrames - person_df = MicroDataFrame(pd.DataFrame(person_data), weights="person_weight") - household_df = MicroDataFrame( - pd.DataFrame(household_data), weights="household_weight" - ) - marital_unit_df = MicroDataFrame( - pd.DataFrame(marital_unit_data), weights="marital_unit_weight" - ) - family_df = MicroDataFrame(pd.DataFrame(family_data), weights="family_weight") - spm_unit_df = MicroDataFrame(pd.DataFrame(spm_unit_data), weights="spm_unit_weight") - tax_unit_df = MicroDataFrame(pd.DataFrame(tax_unit_data), weights="tax_unit_weight") - - # Create temporary dataset - tmpdir = tempfile.mkdtemp() - filepath = str(Path(tmpdir) / "household_impact.h5") - - dataset = PolicyEngineUSDataset( - name="Household impact calculation", - description="Single household for impact calculation", - filepath=filepath, - year=household_input.year, - data=USYearData( - person=person_df, - household=household_df, - marital_unit=marital_unit_df, - family=family_df, - spm_unit=spm_unit_df, - tax_unit=tax_unit_df, - ), - ) - - # Run simulation - simulation = Simulation( - dataset=dataset, - tax_benefit_model_version=us_latest, - policy=policy, - ) - simulation.run() - - # Extract all output variables defined in entity_variables - output_data = simulation.output_dataset.data - - def safe_convert(value): - """Convert value to float if numeric, otherwise return as string.""" - try: - return float(value) - except (ValueError, TypeError): - return str(value) - - def extract_entity_outputs( - entity_name: str, entity_data, n_rows: int - ) -> list[dict[str, Any]]: - outputs = [] - for i in range(n_rows): - row_dict = {} - for var in us_latest.entity_variables[entity_name]: - row_dict[var] = safe_convert(entity_data[var].iloc[i]) - outputs.append(row_dict) - return outputs - - return USHouseholdOutput( - person=extract_entity_outputs("person", output_data.person, n_people), - marital_unit=extract_entity_outputs( - "marital_unit", output_data.marital_unit, 1 - ), - family=extract_entity_outputs("family", output_data.family, 1), - spm_unit=extract_entity_outputs("spm_unit", output_data.spm_unit, 1), - tax_unit=extract_entity_outputs("tax_unit", output_data.tax_unit, 1), - household={ - var: safe_convert(output_data.household[var].iloc[0]) - for var in us_latest.entity_variables["household"] - }, - ) - class PolicyReformAnalysis(BaseModel): """Complete policy reform analysis result.""" @@ -203,15 +44,16 @@ def economic_impact_analysis( reform_simulation: Simulation, inequality_preset: Union[USInequalityPreset, str] = USInequalityPreset.STANDARD, ) -> PolicyReformAnalysis: - """Perform comprehensive analysis of a policy reform. + """Perform comprehensive analysis of a US policy reform. Args: - baseline_simulation: Baseline simulation - reform_simulation: Reform simulation - inequality_preset: Optional preset for the inequality outputs + baseline_simulation: Baseline simulation. + reform_simulation: Reform simulation. + inequality_preset: Preset for the inequality output. Returns: - PolicyReformAnalysis containing decile impacts and program statistics + ``PolicyReformAnalysis`` with decile impacts, program + statistics, baseline and reform poverty, and inequality. """ baseline_simulation.ensure() reform_simulation.ensure() @@ -223,21 +65,16 @@ def economic_impact_analysis( "Reform simulation must have more than 100 households" ) - # Decile impact (using household_net_income for US) decile_impacts = calculate_decile_impacts( baseline_simulation=baseline_simulation, reform_simulation=reform_simulation, income_variable="household_net_income", ) - # Major programs to analyse programs = { - # Federal taxes "income_tax": {"entity": "tax_unit", "is_tax": True}, "payroll_tax": {"entity": "person", "is_tax": True}, - # State and local taxes "state_income_tax": {"entity": "tax_unit", "is_tax": True}, - # Benefits "snap": {"entity": "spm_unit", "is_tax": False}, "tanf": {"entity": "spm_unit", "is_tax": False}, "ssi": {"entity": "person", "is_tax": False}, @@ -249,22 +86,17 @@ def economic_impact_analysis( } program_statistics = [] - for program_name, program_info in programs.items(): - entity = program_info["entity"] - is_tax = program_info["is_tax"] - stats = ProgramStatistics( baseline_simulation=baseline_simulation, reform_simulation=reform_simulation, program_name=program_name, - entity=entity, - is_tax=is_tax, + entity=program_info["entity"], + is_tax=program_info["is_tax"], ) stats.run() program_statistics.append(stats) - # Create DataFrame program_df = pd.DataFrame( [ { @@ -284,16 +116,12 @@ def economic_impact_analysis( for p in program_statistics ] ) - program_collection = OutputCollection( outputs=program_statistics, dataframe=program_df ) - # Calculate poverty rates for both simulations baseline_poverty = calculate_us_poverty_rates(baseline_simulation) reform_poverty = calculate_us_poverty_rates(reform_simulation) - - # Calculate inequality for both simulations baseline_inequality = calculate_us_inequality( baseline_simulation, preset=inequality_preset ) diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py index da10733b..014309db 100644 --- a/src/policyengine/tax_benefit_models/us/datasets.py +++ b/src/policyengine/tax_benefit_models/us/datasets.py @@ -7,7 +7,7 @@ from pydantic import ConfigDict from policyengine.core import Dataset, YearData -from policyengine.core.release_manifest import ( +from policyengine.provenance.manifest import ( dataset_logical_name, resolve_dataset_reference, ) diff --git a/src/policyengine/tax_benefit_models/us/household.py b/src/policyengine/tax_benefit_models/us/household.py new file mode 100644 index 00000000..5258043a --- /dev/null +++ b/src/policyengine/tax_benefit_models/us/household.py @@ -0,0 +1,245 @@ +"""Single-household calculation for the US model. + +``calculate_household`` is the one-call entry point for the household +calculator journey: pass the people plus any per-entity overrides plus +an optional reform, get back a dot-accessible result. + +.. code-block:: python + + import policyengine as pe + + # Single parent with one child in New York, $45k wages. + result = pe.us.calculate_household( + people=[ + {"age": 32, "employment_income": 45000, "is_tax_unit_head": True}, + {"age": 6, "is_tax_unit_dependent": True}, + ], + tax_unit={"filing_status": "HEAD_OF_HOUSEHOLD"}, + household={"state_code": "NY"}, + year=2026, + extra_variables=["adjusted_gross_income"], + ) + print(result.tax_unit.income_tax) + print(result.tax_unit.ctc, result.tax_unit.eitc) + print(result.household.household_net_income) + # Reform: zero out SNAP. + reformed = pe.us.calculate_household( + people=[ + {"age": 32, "employment_income": 45000, "is_tax_unit_head": True}, + {"age": 6, "is_tax_unit_dependent": True}, + ], + tax_unit={"filing_status": "HEAD_OF_HOUSEHOLD"}, + household={"state_code": "NY"}, + year=2026, + reform={"gov.usda.snap.income.deductions.earned_income": 0}, + ) +""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any, Optional + +from policyengine.tax_benefit_models.common import ( + EntityResult, + HouseholdResult, + compile_reform, + dispatch_extra_variables, +) +from policyengine.utils.household_validation import validate_household_input + +from .model import us_latest + +_GROUP_ENTITIES = ("marital_unit", "family", "spm_unit", "tax_unit", "household") + + +def _raise_unexpected_kwargs(unexpected: Mapping[str, Any]) -> None: + from difflib import get_close_matches + + lines = ["calculate_household received unsupported keyword arguments:"] + for name in unexpected: + suggestions = get_close_matches(name, _ALLOWED_KWARGS, n=1, cutoff=0.5) + hint = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + if name == "benunit": + hint = " — `benunit` is UK-only; the US uses `tax_unit`, `marital_unit`, `family`, or `spm_unit`" + lines.append(f" - '{name}'{hint}") + lines.append( + "Valid kwargs: people, marital_unit, family, spm_unit, tax_unit, " + "household, year, reform, extra_variables." + ) + raise TypeError("\n".join(lines)) + + +def _default_output_columns( + extra_by_entity: Mapping[str, list[str]], +) -> dict[str, list[str]]: + merged: dict[str, list[str]] = {} + for entity, defaults in us_latest.entity_variables.items(): + columns = list(defaults) + for extra in extra_by_entity.get(entity, []): + if extra not in columns: + columns.append(extra) + merged[entity] = columns + for entity, extras in extra_by_entity.items(): + merged.setdefault(entity, list(extras)) + return merged + + +def _safe_convert(value: Any) -> Any: + try: + return float(value) + except (ValueError, TypeError): + return str(value) if value is not None else None + + +def _build_situation( + *, + people: list[Mapping[str, Any]], + marital_unit: Mapping[str, Any], + family: Mapping[str, Any], + spm_unit: Mapping[str, Any], + tax_unit: Mapping[str, Any], + household: Mapping[str, Any], + year: int, +) -> dict[str, Any]: + year_str = str(year) + + def _periodise(spec: Mapping[str, Any]) -> dict[str, dict[str, Any]]: + return {key: {year_str: value} for key, value in spec.items() if key != "id"} + + person_ids = [f"person_{i}" for i in range(len(people))] + persons = {pid: _periodise(person) for pid, person in zip(person_ids, people)} + + def _group(spec: Mapping[str, Any]) -> dict[str, Any]: + return {"members": list(person_ids), **_periodise(spec)} + + return { + "people": persons, + "marital_units": {"marital_unit_0": _group(marital_unit)}, + "families": {"family_0": _group(family)}, + "spm_units": {"spm_unit_0": _group(spm_unit)}, + "tax_units": {"tax_unit_0": _group(tax_unit)}, + "households": {"household_0": _group(household)}, + } + + +_ALLOWED_KWARGS = frozenset( + { + "people", + "marital_unit", + "family", + "spm_unit", + "tax_unit", + "household", + "year", + "reform", + "extra_variables", + } +) + + +def calculate_household( + *, + people: list[Mapping[str, Any]], + marital_unit: Optional[Mapping[str, Any]] = None, + family: Optional[Mapping[str, Any]] = None, + spm_unit: Optional[Mapping[str, Any]] = None, + tax_unit: Optional[Mapping[str, Any]] = None, + household: Optional[Mapping[str, Any]] = None, + year: int = 2026, + reform: Optional[Mapping[str, Any]] = None, + extra_variables: Optional[list[str]] = None, + **unexpected: Any, +) -> HouseholdResult: + """Compute tax and benefit variables for a single US household. + + Args: + people: One dict per person with US variable names as keys + (``age``, ``employment_income``, ``is_tax_unit_head``, + ``is_tax_unit_dependent`` ...). Must be non-empty. + marital_unit, family, spm_unit, tax_unit, household: Optional + per-entity overrides, each keyed by variable name (e.g. + ``tax_unit={"filing_status": "SINGLE"}``, + ``household={"state_code": "NY"}``). + year: Calendar year to compute for. Defaults to 2026. + reform: Optional reform as ``{parameter_path: value}`` or + ``{parameter_path: {effective_date: value}}``. Scalar + values default to ``{year}-01-01``; invalid parameter + paths raise with a close-match suggestion. + extra_variables: Flat list of variable names to compute beyond + the default output columns; the library dispatches each + name to its entity. Unknown names raise ``ValueError`` + with a close-match suggestion. + + Returns: + :class:`HouseholdResult` with dot-accessible per-entity + variables. Singleton entities (``tax_unit``, ``household``, ...) + return :class:`EntityResult`; ``person`` returns a list of them. + + Raises: + ValueError: if any input dict uses an unknown variable name, + if a variable is placed on the wrong entity (e.g. + ``filing_status`` on ``people``), or if ``extra_variables`` + / ``reform`` names a variable or parameter path not defined + on the US model. + """ + if unexpected: + _raise_unexpected_kwargs(unexpected) + + from policyengine_us import Simulation + + people = list(people) + entities = { + "marital_unit": dict(marital_unit or {}), + "family": dict(family or {}), + "spm_unit": dict(spm_unit or {}), + "tax_unit": dict(tax_unit or {}), + "household": dict(household or {}), + } + + validate_household_input( + model_version=us_latest, + entities={ + "person": people, + **{name: [value] for name, value in entities.items()}, + }, + ) + + extra_by_entity = dispatch_extra_variables( + model_version=us_latest, + names=extra_variables or [], + ) + output_columns = _default_output_columns(extra_by_entity) + reform_dict = compile_reform(reform, year=year, model_version=us_latest) + + simulation = Simulation( + situation=_build_situation( + people=people, + marital_unit=entities["marital_unit"], + family=entities["family"], + spm_unit=entities["spm_unit"], + tax_unit=entities["tax_unit"], + household=entities["household"], + year=year, + ), + reform=reform_dict, + ) + + result = HouseholdResult() + for entity, columns in output_columns.items(): + raw = { + variable: list(simulation.calculate(variable, period=year, map_to=entity)) + for variable in columns + } + if entity == "person": + result["person"] = [ + EntityResult( + {variable: _safe_convert(raw[variable][i]) for variable in columns} + ) + for i in range(len(people)) + ] + else: + result[entity] = EntityResult( + {variable: _safe_convert(raw[variable][0]) for variable in columns} + ) + return result diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index cd56df09..51463650 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -1,31 +1,17 @@ import datetime -import warnings -from importlib import metadata from pathlib import Path from typing import TYPE_CHECKING, Optional import pandas as pd from microdf import MicroDataFrame -from policyengine.core import ( - Parameter, - ParameterNode, - TaxBenefitModel, - TaxBenefitModelVersion, - Variable, -) -from policyengine.core.release_manifest import ( - certify_data_release_compatibility, +from policyengine.core import TaxBenefitModel +from policyengine.provenance.manifest import ( dataset_logical_name, - get_release_manifest, resolve_local_managed_dataset_source, resolve_managed_dataset_reference, ) -from policyengine.utils.entity_utils import build_entity_relationships -from policyengine.utils.parameter_labels import ( - build_scale_lookup, - generate_label_for_parameter, -) +from policyengine.tax_benefit_models.common import MicrosimulationModelVersion from .datasets import PolicyEngineUSDataset, USYearData @@ -49,18 +35,11 @@ class PolicyEngineUS(TaxBenefitModel): us_model = PolicyEngineUS() -def _get_runtime_data_build_metadata() -> dict[str, Optional[str]]: - try: - from policyengine_us.build_metadata import get_data_build_metadata - except ModuleNotFoundError as exc: - if exc.name != "policyengine_us.build_metadata": - raise - return {} - - return get_data_build_metadata() or {} - +class PolicyEngineUSLatest(MicrosimulationModelVersion): + country_code = "us" + package_name = "policyengine-us" + group_entities = US_GROUP_ENTITIES -class PolicyEngineUSLatest(TaxBenefitModelVersion): model: TaxBenefitModel = us_model version: str = None created_at: datetime.datetime = None @@ -129,147 +108,32 @@ class PolicyEngineUSLatest(TaxBenefitModelVersion): ], } - def __init__(self, **kwargs: dict): - manifest = get_release_manifest("us") - if "version" not in kwargs or kwargs.get("version") is None: - kwargs["version"] = manifest.model_package.version - - installed_model_version = metadata.version("policyengine-us") - if installed_model_version != manifest.model_package.version: - warnings.warn( - "Installed policyengine-us version " - f"({installed_model_version}) does not match the bundled " - "policyengine.py manifest " - f"({manifest.model_package.version}). Calculations will " - "run against the installed version, but dataset " - "compatibility is not guaranteed. To silence this " - "warning, install the version pinned by the manifest.", - UserWarning, - stacklevel=2, - ) - - model_build_metadata = _get_runtime_data_build_metadata() - data_certification = certify_data_release_compatibility( - "us", - runtime_model_version=installed_model_version, - runtime_data_build_fingerprint=model_build_metadata.get( - "data_build_fingerprint" - ), - ) - - super().__init__(**kwargs) - self.release_manifest = manifest - self.model_package = manifest.model_package - self.data_package = manifest.data_package - self.default_dataset_uri = manifest.default_dataset_uri - self.data_certification = data_certification - from policyengine_core.enums import Enum + # --- Hooks ----------------------------------------------------------- + @classmethod + def _get_runtime_data_build_metadata(cls) -> dict[str, Optional[str]]: + try: + from policyengine_us.build_metadata import get_data_build_metadata + except ModuleNotFoundError as exc: + if exc.name != "policyengine_us.build_metadata": + raise + return {} + return get_data_build_metadata() or {} + + def _load_system(self): from policyengine_us.system import system - # Attach region registry + return system + + def _load_region_registry(self): from policyengine.countries.us.regions import us_region_registry - self.region_registry = us_region_registry - - self.id = f"{self.model.id}@{self.version}" - - for var_obj in system.variables.values(): - # Serialize default_value for JSON compatibility - default_val = var_obj.default_value - if var_obj.value_type is Enum: - default_val = default_val.name - elif var_obj.value_type is datetime.date: - default_val = default_val.isoformat() - - variable = Variable( - id=self.id + "-" + var_obj.name, - name=var_obj.name, - label=getattr(var_obj, "label", None), - tax_benefit_model_version=self, - entity=var_obj.entity.key, - description=var_obj.documentation, - data_type=var_obj.value_type if var_obj.value_type is not Enum else str, - default_value=default_val, - value_type=var_obj.value_type, - ) - if ( - hasattr(var_obj, "possible_values") - and var_obj.possible_values is not None - ): - variable.possible_values = list( - map( - lambda x: x.name, - var_obj.possible_values._value2member_map_.values(), - ) - ) - # Extract and resolve adds/subtracts. - # Core stores these as either list[str] or a parameter path string. - # Resolve parameter paths to lists so consumers always get list[str]. - if hasattr(var_obj, "adds") and var_obj.adds is not None: - if isinstance(var_obj.adds, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.adds) - variable.adds = list(param("2025-01-01")) - except (ValueError, Exception): - variable.adds = None - else: - variable.adds = var_obj.adds - if hasattr(var_obj, "subtracts") and var_obj.subtracts is not None: - if isinstance(var_obj.subtracts, str): - try: - from policyengine_core.parameters.operations.get_parameter import ( - get_parameter, - ) - - param = get_parameter(system.parameters, var_obj.subtracts) - variable.subtracts = list(param("2025-01-01")) - except (ValueError, Exception): - variable.subtracts = None - else: - variable.subtracts = var_obj.subtracts - self.add_variable(variable) - - from policyengine_core.parameters import Parameter as CoreParameter - from policyengine_core.parameters import ParameterNode as CoreParameterNode - - scale_lookup = build_scale_lookup(system) - - for param_node in system.parameters.get_descendants(): - if isinstance(param_node, CoreParameter): - parameter = Parameter( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=generate_label_for_parameter( - param_node, system, scale_lookup - ), - tax_benefit_model_version=self, - description=param_node.description, - data_type=type(param_node(2025)), - unit=param_node.metadata.get("unit"), - _core_param=param_node, - ) - self.add_parameter(parameter) - elif isinstance(param_node, CoreParameterNode): - node = ParameterNode( - id=self.id + "-" + param_node.name, - name=param_node.name, - label=param_node.metadata.get("label"), - description=param_node.description, - tax_benefit_model_version=self, - ) - self.add_parameter_node(node) - - def _build_entity_relationships( - self, dataset: PolicyEngineUSDataset - ) -> pd.DataFrame: - """Build a DataFrame mapping each person to their containing entities.""" - person_data = pd.DataFrame(dataset.data.person) - return build_entity_relationships(person_data, US_GROUP_ENTITIES) + return us_region_registry + + @property + def _dataset_class(self): + return PolicyEngineUSDataset + # --- run ------------------------------------------------------------- def run(self, simulation: "Simulation") -> "Simulation": from policyengine_us import Microsimulation from policyengine_us.system import system @@ -308,14 +172,12 @@ def run(self, simulation: "Simulation") -> "Simulation": ), ) - # Build reform dict from policy and dynamic parameter values. # US requires reforms at Microsimulation construction time # (unlike UK which supports p.update() after construction). policy_reform = build_reform_dict(simulation.policy) dynamic_reform = build_reform_dict(simulation.dynamic) reform_dict = merge_reform_dicts(policy_reform, dynamic_reform) - # Create Microsimulation with reform at construction time microsim = Microsimulation(reform=reform_dict) self._build_simulation_from_dataset(microsim, dataset, system) @@ -346,7 +208,7 @@ def run(self, simulation: "Simulation") -> "Simulation": "tax_unit_weight", } - # First, copy ID and weight columns from input dataset + # Copy ID and weight columns from input dataset. for entity in data.keys(): input_df = pd.DataFrame(getattr(dataset.data, entity)) entity_id_col = f"{entity}_id" @@ -357,16 +219,16 @@ def run(self, simulation: "Simulation") -> "Simulation": if entity_weight_col in input_df.columns: data[entity][entity_weight_col] = input_df[entity_weight_col].values - # For person entity, also copy person-level group ID columns + # Person entity also needs person-level group ID columns so that + # downstream joins (e.g. person->tax_unit) work. person_input_df = pd.DataFrame(dataset.data.person) for col in person_input_df.columns: if col.startswith("person_") and col.endswith("_id"): - # Map person_household_id -> household_id, etc. target_col = col.replace("person_", "") if target_col in id_columns: data["person"][target_col] = person_input_df[col].values - # Then calculate non-ID, non-weight variables from simulation + # Calculate non-ID, non-weight variables from simulation for entity, variables in self.entity_variables.items(): for var in variables: if var not in id_columns and var not in weight_columns: @@ -404,61 +266,23 @@ def run(self, simulation: "Simulation") -> "Simulation": ), ) - def save(self, simulation: "Simulation"): - """Save the simulation's output dataset.""" - simulation.output_dataset.save() - - def load(self, simulation: "Simulation"): - """Load the simulation's output dataset.""" - import os - - filepath = str( - Path(simulation.dataset.filepath).parent / (simulation.id + ".h5") - ) - - simulation.output_dataset = PolicyEngineUSDataset( - id=simulation.id, - name=simulation.dataset.name, - description=simulation.dataset.description, - filepath=filepath, - year=simulation.dataset.year, - is_output_dataset=True, - ) - - # Load timestamps from file system metadata - if os.path.exists(filepath): - simulation.created_at = datetime.datetime.fromtimestamp( - os.path.getctime(filepath) - ) - simulation.updated_at = datetime.datetime.fromtimestamp( - os.path.getmtime(filepath) - ) - def _build_simulation_from_dataset(self, microsim, dataset, system): """Build a PolicyEngine Core simulation from dataset entity IDs. - This follows the same pattern as policyengine-uk, initializing - entities from IDs first, then using set_input() for variables. - - Args: - microsim: The Microsimulation object to populate - dataset: The dataset containing entity data - system: The tax-benefit system + Mirrors the policyengine-uk pattern of instantiating entities from + IDs first and then setting variable inputs. Handles both the legacy + ``person_X_id`` and the ``X_id`` column-naming conventions. """ import numpy as np from policyengine_core.simulations.simulation_builder import ( SimulationBuilder, ) - # Create builder and instantiate entities builder = SimulationBuilder() builder.populations = system.instantiate_entities() - # Extract entity IDs from dataset person_data = pd.DataFrame(dataset.data.person) - # Determine column naming convention - # Support both person_X_id (from create_datasets) and X_id (from custom datasets) household_id_col = ( "person_household_id" if "person_household_id" in person_data.columns @@ -485,7 +309,6 @@ def _build_simulation_from_dataset(self, microsim, dataset, system): else "tax_unit_id" ) - # Declare entities builder.declare_person_entity("person", person_data["person_id"].values) builder.declare_entity( "household", np.unique(person_data[household_id_col].values) @@ -501,7 +324,6 @@ def _build_simulation_from_dataset(self, microsim, dataset, system): "marital_unit", np.unique(person_data[marital_unit_id_col].values) ) - # Join persons to group entities builder.join_with_persons( builder.populations["household"], person_data[household_id_col].values, @@ -528,12 +350,8 @@ def _build_simulation_from_dataset(self, microsim, dataset, system): np.array(["member"] * len(person_data)), ) - # Build simulation from populations microsim.build_from_populations(builder.populations) - # Set input variables for each entity - # Skip ID columns as they're structural and already used in entity building - # Support both naming conventions id_columns = { "person_id", "household_id", @@ -558,7 +376,6 @@ def _build_simulation_from_dataset(self, microsim, dataset, system): ]: df = pd.DataFrame(entity_df) for column in df.columns: - # Skip ID columns and check if variable exists in system if column not in id_columns and column in system.variables: microsim.set_input(column, dataset.year, df[column].values) @@ -585,8 +402,8 @@ def managed_microsimulation( """Construct a country-package Microsimulation pinned to this bundle. By default this enforces the dataset selection from the bundled - `policyengine.py` release manifest. Arbitrary dataset URIs require - `allow_unmanaged=True`. + ``policyengine.py`` release manifest. Arbitrary dataset URIs require + ``allow_unmanaged=True``. """ from policyengine_us import Microsimulation diff --git a/src/policyengine/utils/household_validation.py b/src/policyengine/utils/household_validation.py new file mode 100644 index 00000000..6be90fb2 --- /dev/null +++ b/src/policyengine/utils/household_validation.py @@ -0,0 +1,113 @@ +"""Strict validation for household-calculation inputs. + +Catches the three typo classes that otherwise silently propagate wrong +numbers to published results: + +1. Unknown variable name entirely (``employment_incme``). +2. Valid variable placed on the wrong entity (``filing_status`` passed + to ``people`` instead of ``tax_unit``). +3. Empty ``people`` list (policyengine_us will IndexError deep in + simulation). + +All errors include paste-able fixes. +""" + +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from difflib import get_close_matches +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion + + +_STRUCTURAL_KEYS = frozenset( + { + "id", + "members", + "person_id", + "household_id", + "marital_unit_id", + "family_id", + "spm_unit_id", + "tax_unit_id", + "benunit_id", + "person_weight", + "household_weight", + "marital_unit_weight", + "family_weight", + "spm_unit_weight", + "tax_unit_weight", + "benunit_weight", + } +) + + +def validate_household_input( + *, + model_version: TaxBenefitModelVersion, + entities: Mapping[str, Iterable[Mapping[str, object]]], +) -> None: + """Raise ``ValueError`` on unknown or mis-placed entity variables. + + ``entities`` maps entity name → iterable of entity dicts. Each key + is checked against ``model_version.variables_by_name``: + + - If the key is unknown, the error includes a difflib close-match + suggestion. + - If the key is a known variable but defined on a different entity, + the error names the correct entity and shows the kwarg swap. + """ + if "person" in entities and not list(entities["person"]): + raise ValueError( + "people must be a non-empty list. At minimum pass people=[{'age': }]." + ) + + variables_by_name = model_version.variables_by_name + valid_names = set(variables_by_name) + unknown: list[tuple[str, str]] = [] + misplaced: list[tuple[str, str, str]] = [] + + for entity_name, records in entities.items(): + for record in records: + for key in record: + if key in _STRUCTURAL_KEYS: + continue + variable = variables_by_name.get(key) + if variable is None: + unknown.append((entity_name, key)) + elif variable.entity != entity_name: + misplaced.append((entity_name, key, variable.entity)) + + if not unknown and not misplaced: + return + + lines: list[str] = [] + if unknown: + lines.append( + f"Unknown variable names on {model_version.model.id} " + f"{model_version.version}:" + ) + for entity_name, key in unknown: + suggestions = get_close_matches(key, valid_names, n=1, cutoff=0.7) + hint = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + lines.append(f" - {entity_name}: '{key}'{hint}") + if not misplaced: + first_bad = unknown[0][1] + lines.append( + f"If '{first_bad}' is a real variable outside the default " + f"output columns, pass it via extra_variables=['{first_bad}']." + ) + if misplaced: + if lines: + lines.append("") + lines.append("Variables passed on the wrong entity:") + for wrong_entity, key, correct_entity in misplaced: + lines.append( + f" - '{key}' was given on {wrong_entity}; it belongs on " + f"{correct_entity}. Move it: pass " + f"{correct_entity}={{'{key}': }}." + ) + + raise ValueError("\n".join(lines)) diff --git a/tests/fixtures/household_calculator_snapshots/uk_couple_two_kids.json b/tests/fixtures/household_calculator_snapshots/uk_couple_two_kids.json new file mode 100644 index 00000000..49302124 --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/uk_couple_two_kids.json @@ -0,0 +1,139 @@ +{ + "benunit.benunit_id": 0.0, + "benunit.benunit_weight": 1.0, + "benunit.child_benefit": 2328.16, + "benunit.child_tax_credit": 0.0, + "benunit.family_type": "COUPLE_WITH_CHILDREN", + "benunit.income_support": 0.0, + "benunit.pension_credit": 0.0, + "benunit.universal_credit": 0.0, + "benunit.working_tax_credit": 0.0, + "household.council_tax": 0.0, + "household.equiv_hbai_household_net_income": 52503.68, + "household.hbai_household_net_income": 73505.15, + "household.household_benefits": 5880.35, + "household.household_count_people": 4.0, + "household.household_gross_income": 95880.34, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 90000.0, + "household.household_net_income": 76898.3, + "household.household_tax": 18982.05, + "household.household_wealth_decile": 10.0, + "household.household_weight": 1.0, + "household.in_poverty_ahc": 0.0, + "household.in_poverty_bhc": 0.0, + "household.in_relative_poverty_ahc": 0.0, + "household.in_relative_poverty_bhc": 0.0, + "household.rent": 0.0, + "household.tenure_type": "RENT_PRIVATELY", + "household.vat": 0.0, + "person[0].age": 42.0, + "person[0].benunit_id": 0.0, + "person[0].child_benefit": 2328.16, + "person[0].child_tax_credit": 0.0, + "person[0].dividend_income": 0.0, + "person[0].earned_income": 55000.0, + "person[0].employment_income": 55000.0, + "person[0].gender": "MALE", + "person[0].household_id": 0.0, + "person[0].income_support": 0.0, + "person[0].income_tax": 9432.0, + "person[0].is_SP_age": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].national_insurance": 3110.6, + "person[0].pension_credit": 0.0, + "person[0].pension_income": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].private_pension_income": 0.0, + "person[0].property_income": 0.0, + "person[0].savings_interest_income": 0.0, + "person[0].self_employment_income": 0.0, + "person[0].total_income": 55000.0, + "person[0].universal_credit": 0.0, + "person[0].working_tax_credit": 0.0, + "person[1].age": 40.0, + "person[1].benunit_id": 0.0, + "person[1].child_benefit": 2328.16, + "person[1].child_tax_credit": 0.0, + "person[1].dividend_income": 0.0, + "person[1].earned_income": 35000.0, + "person[1].employment_income": 35000.0, + "person[1].gender": "MALE", + "person[1].household_id": 0.0, + "person[1].income_support": 0.0, + "person[1].income_tax": 4486.0, + "person[1].is_SP_age": 0.0, + "person[1].is_adult": 1.0, + "person[1].is_child": 0.0, + "person[1].is_male": 1.0, + "person[1].national_insurance": 1794.4, + "person[1].pension_credit": 0.0, + "person[1].pension_income": 0.0, + "person[1].person_id": 0.0, + "person[1].person_weight": 1.0, + "person[1].private_pension_income": 0.0, + "person[1].property_income": 0.0, + "person[1].savings_interest_income": 0.0, + "person[1].self_employment_income": 0.0, + "person[1].total_income": 35000.0, + "person[1].universal_credit": 0.0, + "person[1].working_tax_credit": 0.0, + "person[2].age": 8.0, + "person[2].benunit_id": 0.0, + "person[2].child_benefit": 2328.16, + "person[2].child_tax_credit": 0.0, + "person[2].dividend_income": 0.0, + "person[2].earned_income": 0.0, + "person[2].employment_income": 0.0, + "person[2].gender": "MALE", + "person[2].household_id": 0.0, + "person[2].income_support": 0.0, + "person[2].income_tax": 0.0, + "person[2].is_SP_age": 0.0, + "person[2].is_adult": 0.0, + "person[2].is_child": 1.0, + "person[2].is_male": 1.0, + "person[2].national_insurance": 0.0, + "person[2].pension_credit": 0.0, + "person[2].pension_income": 0.0, + "person[2].person_id": 0.0, + "person[2].person_weight": 1.0, + "person[2].private_pension_income": 0.0, + "person[2].property_income": 0.0, + "person[2].savings_interest_income": 0.0, + "person[2].self_employment_income": 0.0, + "person[2].total_income": 0.0, + "person[2].universal_credit": 0.0, + "person[2].working_tax_credit": 0.0, + "person[3].age": 3.0, + "person[3].benunit_id": 0.0, + "person[3].child_benefit": 2328.16, + "person[3].child_tax_credit": 0.0, + "person[3].dividend_income": 0.0, + "person[3].earned_income": 0.0, + "person[3].employment_income": 0.0, + "person[3].gender": "MALE", + "person[3].household_id": 0.0, + "person[3].income_support": 0.0, + "person[3].income_tax": 0.0, + "person[3].is_SP_age": 0.0, + "person[3].is_adult": 0.0, + "person[3].is_child": 1.0, + "person[3].is_male": 1.0, + "person[3].national_insurance": 0.0, + "person[3].pension_credit": 0.0, + "person[3].pension_income": 0.0, + "person[3].person_id": 0.0, + "person[3].person_weight": 1.0, + "person[3].private_pension_income": 0.0, + "person[3].property_income": 0.0, + "person[3].savings_interest_income": 0.0, + "person[3].self_employment_income": 0.0, + "person[3].total_income": 0.0, + "person[3].universal_credit": 0.0, + "person[3].working_tax_credit": 0.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/uk_model_surface.json b/tests/fixtures/household_calculator_snapshots/uk_model_surface.json new file mode 100644 index 00000000..161ef0ec --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/uk_model_surface.json @@ -0,0 +1,11 @@ +{ + "country_id": "uk", + "data_package_name": "policyengine-uk-data", + "has_employment_income": true, + "has_income_tax": true, + "has_region_registry": true, + "model_package_name": "policyengine-uk", + "num_parameters_bucketed_100s": 20, + "num_variables_bucketed_100s": 8, + "region_registry_country": "uk" +} diff --git a/tests/fixtures/household_calculator_snapshots/uk_single_adult_employment_income.json b/tests/fixtures/household_calculator_snapshots/uk_single_adult_employment_income.json new file mode 100644 index 00000000..5ec94094 --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/uk_single_adult_employment_income.json @@ -0,0 +1,58 @@ +{ + "benunit.benunit_id": 0.0, + "benunit.benunit_weight": 1.0, + "benunit.child_benefit": 0.0, + "benunit.child_tax_credit": 0.0, + "benunit.family_type": "SINGLE", + "benunit.income_support": 0.0, + "benunit.pension_credit": 0.0, + "benunit.universal_credit": 0.0, + "benunit.working_tax_credit": 0.0, + "household.council_tax": 0.0, + "household.equiv_hbai_household_net_income": 37491.94, + "household.hbai_household_net_income": 25119.6, + "household.household_benefits": 0.0, + "household.household_count_people": 1.0, + "household.household_gross_income": 30000.0, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 30000.0, + "household.household_net_income": 24960.55, + "household.household_tax": 5039.45, + "household.household_wealth_decile": 10.0, + "household.household_weight": 1.0, + "household.in_poverty_ahc": 0.0, + "household.in_poverty_bhc": 0.0, + "household.in_relative_poverty_ahc": 0.0, + "household.in_relative_poverty_bhc": 0.0, + "household.rent": 0.0, + "household.tenure_type": "RENT_PRIVATELY", + "household.vat": 0.0, + "person[0].age": 35.0, + "person[0].benunit_id": 0.0, + "person[0].child_benefit": 0.0, + "person[0].child_tax_credit": 0.0, + "person[0].dividend_income": 0.0, + "person[0].earned_income": 30000.0, + "person[0].employment_income": 30000.0, + "person[0].gender": "MALE", + "person[0].household_id": 0.0, + "person[0].income_support": 0.0, + "person[0].income_tax": 3486.0, + "person[0].is_SP_age": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].national_insurance": 1394.4, + "person[0].pension_credit": 0.0, + "person[0].pension_income": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].private_pension_income": 0.0, + "person[0].property_income": 0.0, + "person[0].savings_interest_income": 0.0, + "person[0].self_employment_income": 0.0, + "person[0].total_income": 30000.0, + "person[0].universal_credit": 0.0, + "person[0].working_tax_credit": 0.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/uk_single_adult_no_income.json b/tests/fixtures/household_calculator_snapshots/uk_single_adult_no_income.json new file mode 100644 index 00000000..59657e2c --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/uk_single_adult_no_income.json @@ -0,0 +1,58 @@ +{ + "benunit.benunit_id": 0.0, + "benunit.benunit_weight": 1.0, + "benunit.child_benefit": 0.0, + "benunit.child_tax_credit": 0.0, + "benunit.family_type": "SINGLE", + "benunit.income_support": 0.0, + "benunit.pension_credit": 0.0, + "benunit.universal_credit": 5079.13, + "benunit.working_tax_credit": 0.0, + "household.council_tax": 0.0, + "household.equiv_hbai_household_net_income": 7580.79, + "household.hbai_household_net_income": 5079.13, + "household.household_benefits": 5079.13, + "household.household_count_people": 1.0, + "household.household_gross_income": 5079.13, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 0.0, + "household.household_net_income": 4920.09, + "household.household_tax": 159.04, + "household.household_wealth_decile": 10.0, + "household.household_weight": 1.0, + "household.in_poverty_ahc": 1.0, + "household.in_poverty_bhc": 1.0, + "household.in_relative_poverty_ahc": 0.0, + "household.in_relative_poverty_bhc": 0.0, + "household.rent": 0.0, + "household.tenure_type": "RENT_PRIVATELY", + "household.vat": 0.0, + "person[0].age": 35.0, + "person[0].benunit_id": 0.0, + "person[0].child_benefit": 0.0, + "person[0].child_tax_credit": 0.0, + "person[0].dividend_income": 0.0, + "person[0].earned_income": 0.0, + "person[0].employment_income": 0.0, + "person[0].gender": "MALE", + "person[0].household_id": 0.0, + "person[0].income_support": 0.0, + "person[0].income_tax": 0.0, + "person[0].is_SP_age": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].national_insurance": 0.0, + "person[0].pension_credit": 0.0, + "person[0].pension_income": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].private_pension_income": 0.0, + "person[0].property_income": 0.0, + "person[0].savings_interest_income": 0.0, + "person[0].self_employment_income": 0.0, + "person[0].total_income": 0.0, + "person[0].universal_credit": 5079.13, + "person[0].working_tax_credit": 0.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/uk_single_parent_one_child.json b/tests/fixtures/household_calculator_snapshots/uk_single_parent_one_child.json new file mode 100644 index 00000000..06e55db0 --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/uk_single_parent_one_child.json @@ -0,0 +1,85 @@ +{ + "benunit.benunit_id": 0.0, + "benunit.benunit_weight": 1.0, + "benunit.child_benefit": 1400.66, + "benunit.child_tax_credit": 0.0, + "benunit.family_type": "LONE_PARENT", + "benunit.income_support": 0.0, + "benunit.pension_credit": 0.0, + "benunit.universal_credit": 1544.43, + "benunit.working_tax_credit": 0.0, + "household.council_tax": 0.0, + "household.equiv_hbai_household_net_income": 28120.33, + "household.hbai_household_net_income": 24464.69, + "household.household_benefits": 2945.09, + "household.household_count_people": 2.0, + "household.household_gross_income": 27945.09, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 25000.0, + "household.household_net_income": 24305.64, + "household.household_tax": 3639.45, + "household.household_wealth_decile": 10.0, + "household.household_weight": 1.0, + "household.in_poverty_ahc": 0.0, + "household.in_poverty_bhc": 0.0, + "household.in_relative_poverty_ahc": 0.0, + "household.in_relative_poverty_bhc": 0.0, + "household.rent": 0.0, + "household.tenure_type": "RENT_PRIVATELY", + "household.vat": 0.0, + "person[0].age": 32.0, + "person[0].benunit_id": 0.0, + "person[0].child_benefit": 1400.66, + "person[0].child_tax_credit": 0.0, + "person[0].dividend_income": 0.0, + "person[0].earned_income": 25000.0, + "person[0].employment_income": 25000.0, + "person[0].gender": "MALE", + "person[0].household_id": 0.0, + "person[0].income_support": 0.0, + "person[0].income_tax": 2486.0, + "person[0].is_SP_age": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].national_insurance": 994.4, + "person[0].pension_credit": 0.0, + "person[0].pension_income": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].private_pension_income": 0.0, + "person[0].property_income": 0.0, + "person[0].savings_interest_income": 0.0, + "person[0].self_employment_income": 0.0, + "person[0].total_income": 25000.0, + "person[0].universal_credit": 1544.43, + "person[0].working_tax_credit": 0.0, + "person[1].age": 5.0, + "person[1].benunit_id": 0.0, + "person[1].child_benefit": 1400.66, + "person[1].child_tax_credit": 0.0, + "person[1].dividend_income": 0.0, + "person[1].earned_income": 0.0, + "person[1].employment_income": 0.0, + "person[1].gender": "MALE", + "person[1].household_id": 0.0, + "person[1].income_support": 0.0, + "person[1].income_tax": 0.0, + "person[1].is_SP_age": 0.0, + "person[1].is_adult": 0.0, + "person[1].is_child": 1.0, + "person[1].is_male": 1.0, + "person[1].national_insurance": 0.0, + "person[1].pension_credit": 0.0, + "person[1].pension_income": 0.0, + "person[1].person_id": 0.0, + "person[1].person_weight": 1.0, + "person[1].private_pension_income": 0.0, + "person[1].property_income": 0.0, + "person[1].savings_interest_income": 0.0, + "person[1].self_employment_income": 0.0, + "person[1].total_income": 0.0, + "person[1].universal_credit": 1544.43, + "person[1].working_tax_credit": 0.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/us_married_two_kids_high_income.json b/tests/fixtures/household_calculator_snapshots/us_married_two_kids_high_income.json new file mode 100644 index 00000000..1d5e98ca --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/us_married_two_kids_high_income.json @@ -0,0 +1,97 @@ +{ + "family.family_id": 0.0, + "family.family_weight": 0.0, + "household.congressional_district_geoid": 0.0, + "household.household_benefits": 0.0, + "household.household_count_people": 4.0, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 240000.0, + "household.household_net_income": 175089.92, + "household.household_tax": 64910.07, + "household.household_weight": 1.0, + "marital_unit.marital_unit_id": 0.0, + "marital_unit.marital_unit_weight": 1.0, + "person[0].age": 42.0, + "person[0].employment_income": 150000.0, + "person[0].family_id": 0.0, + "person[0].household_id": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].marital_unit_id": 0.0, + "person[0].medicaid": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].race": 3.0, + "person[0].social_security": 0.0, + "person[0].spm_unit_id": 0.0, + "person[0].ssi": 0.0, + "person[0].tax_unit_id": 0.0, + "person[0].unemployment_compensation": 0.0, + "person[1].age": 40.0, + "person[1].employment_income": 90000.0, + "person[1].family_id": 0.0, + "person[1].household_id": 0.0, + "person[1].is_adult": 1.0, + "person[1].is_child": 0.0, + "person[1].is_male": 1.0, + "person[1].marital_unit_id": 0.0, + "person[1].medicaid": 0.0, + "person[1].person_id": 1.0, + "person[1].person_weight": 1.0, + "person[1].race": 3.0, + "person[1].social_security": 0.0, + "person[1].spm_unit_id": 0.0, + "person[1].ssi": 0.0, + "person[1].tax_unit_id": 0.0, + "person[1].unemployment_compensation": 0.0, + "person[2].age": 8.0, + "person[2].employment_income": 0.0, + "person[2].family_id": 0.0, + "person[2].household_id": 0.0, + "person[2].is_adult": 0.0, + "person[2].is_child": 1.0, + "person[2].is_male": 1.0, + "person[2].marital_unit_id": 0.0, + "person[2].medicaid": 0.0, + "person[2].person_id": 2.0, + "person[2].person_weight": 1.0, + "person[2].race": 3.0, + "person[2].social_security": 0.0, + "person[2].spm_unit_id": 0.0, + "person[2].ssi": 0.0, + "person[2].tax_unit_id": 0.0, + "person[2].unemployment_compensation": 0.0, + "person[3].age": 3.0, + "person[3].employment_income": 0.0, + "person[3].family_id": 0.0, + "person[3].household_id": 0.0, + "person[3].is_adult": 0.0, + "person[3].is_child": 1.0, + "person[3].is_male": 1.0, + "person[3].marital_unit_id": 0.0, + "person[3].medicaid": 0.0, + "person[3].person_id": 3.0, + "person[3].person_weight": 1.0, + "person[3].race": 3.0, + "person[3].social_security": 0.0, + "person[3].spm_unit_id": 0.0, + "person[3].ssi": 0.0, + "person[3].tax_unit_id": 0.0, + "person[3].unemployment_compensation": 0.0, + "spm_unit.snap": 0.0, + "spm_unit.spm_unit_id": 0.0, + "spm_unit.spm_unit_is_in_deep_spm_poverty": 0.0, + "spm_unit.spm_unit_is_in_spm_poverty": 0.0, + "spm_unit.spm_unit_net_income": 175089.92, + "spm_unit.spm_unit_weight": 1.0, + "spm_unit.tanf": 0.0, + "tax_unit.ctc": 4400.0, + "tax_unit.eitc": 0.0, + "tax_unit.employee_payroll_tax": 21480.0, + "tax_unit.household_state_income_tax": 12690.07, + "tax_unit.income_tax": 30740.0, + "tax_unit.tax_unit_id": 0.0, + "tax_unit.tax_unit_weight": 1.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/us_model_surface.json b/tests/fixtures/household_calculator_snapshots/us_model_surface.json new file mode 100644 index 00000000..eaf4352e --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/us_model_surface.json @@ -0,0 +1,11 @@ +{ + "country_id": "us", + "data_package_name": "policyengine-us-data", + "has_employment_income": true, + "has_income_tax": true, + "has_region_registry": true, + "model_package_name": "policyengine-us", + "num_parameters_bucketed_100s": 777, + "num_variables_bucketed_100s": 46, + "region_registry_country": "us" +} diff --git a/tests/fixtures/household_calculator_snapshots/us_single_adult_employment_income.json b/tests/fixtures/household_calculator_snapshots/us_single_adult_employment_income.json new file mode 100644 index 00000000..d94660a9 --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/us_single_adult_employment_income.json @@ -0,0 +1,46 @@ +{ + "family.family_id": 0.0, + "family.family_weight": 0.0, + "household.congressional_district_geoid": 0.0, + "household.household_benefits": 0.0, + "household.household_count_people": 1.0, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 60000.0, + "household.household_net_income": 48007.14, + "household.household_tax": 11992.86, + "household.household_weight": 1.0, + "marital_unit.marital_unit_id": 0.0, + "marital_unit.marital_unit_weight": 1.0, + "person[0].age": 35.0, + "person[0].employment_income": 60000.0, + "person[0].family_id": 0.0, + "person[0].household_id": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].marital_unit_id": 0.0, + "person[0].medicaid": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].race": 3.0, + "person[0].social_security": 0.0, + "person[0].spm_unit_id": 0.0, + "person[0].ssi": 0.0, + "person[0].tax_unit_id": 0.0, + "person[0].unemployment_compensation": 0.0, + "spm_unit.snap": 0.0, + "spm_unit.spm_unit_id": 0.0, + "spm_unit.spm_unit_is_in_deep_spm_poverty": 0.0, + "spm_unit.spm_unit_is_in_spm_poverty": 0.0, + "spm_unit.spm_unit_net_income": 48007.14, + "spm_unit.spm_unit_weight": 1.0, + "spm_unit.tanf": 0.0, + "tax_unit.ctc": 0.0, + "tax_unit.eitc": 0.0, + "tax_unit.employee_payroll_tax": 5370.0, + "tax_unit.household_state_income_tax": 1602.86, + "tax_unit.income_tax": 5020.0, + "tax_unit.tax_unit_id": 0.0, + "tax_unit.tax_unit_weight": 1.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/us_single_adult_no_income.json b/tests/fixtures/household_calculator_snapshots/us_single_adult_no_income.json new file mode 100644 index 00000000..258db6f1 --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/us_single_adult_no_income.json @@ -0,0 +1,46 @@ +{ + "family.family_id": 0.0, + "family.family_weight": 0.0, + "household.congressional_district_geoid": 0.0, + "household.household_benefits": 3596.04, + "household.household_count_people": 1.0, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 0.0, + "household.household_net_income": 3596.04, + "household.household_tax": 0.0, + "household.household_weight": 1.0, + "marital_unit.marital_unit_id": 0.0, + "marital_unit.marital_unit_weight": 1.0, + "person[0].age": 35.0, + "person[0].employment_income": 0.0, + "person[0].family_id": 0.0, + "person[0].household_id": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].marital_unit_id": 0.0, + "person[0].medicaid": 6439.11, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].race": 3.0, + "person[0].social_security": 0.0, + "person[0].spm_unit_id": 0.0, + "person[0].ssi": 0.0, + "person[0].tax_unit_id": 0.0, + "person[0].unemployment_compensation": 0.0, + "spm_unit.snap": 3596.04, + "spm_unit.spm_unit_id": 0.0, + "spm_unit.spm_unit_is_in_deep_spm_poverty": 0.0, + "spm_unit.spm_unit_is_in_spm_poverty": 0.0, + "spm_unit.spm_unit_net_income": 3596.04, + "spm_unit.spm_unit_weight": 1.0, + "spm_unit.tanf": 0.0, + "tax_unit.ctc": 0.0, + "tax_unit.eitc": 0.0, + "tax_unit.employee_payroll_tax": 0.0, + "tax_unit.household_state_income_tax": 0.0, + "tax_unit.income_tax": 0.0, + "tax_unit.tax_unit_id": 0.0, + "tax_unit.tax_unit_weight": 1.0 +} diff --git a/tests/fixtures/household_calculator_snapshots/us_single_parent_one_child.json b/tests/fixtures/household_calculator_snapshots/us_single_parent_one_child.json new file mode 100644 index 00000000..78ba7237 --- /dev/null +++ b/tests/fixtures/household_calculator_snapshots/us_single_parent_one_child.json @@ -0,0 +1,63 @@ +{ + "family.family_id": 0.0, + "family.family_weight": 0.0, + "household.congressional_district_geoid": 0.0, + "household.household_benefits": 1003.27, + "household.household_count_people": 2.0, + "household.household_id": 0.0, + "household.household_income_decile": 10.0, + "household.household_market_income": 40000.0, + "household.household_net_income": 39890.89, + "household.household_tax": 1112.38, + "household.household_weight": 1.0, + "marital_unit.marital_unit_id": 0.0, + "marital_unit.marital_unit_weight": 1.0, + "person[0].age": 32.0, + "person[0].employment_income": 40000.0, + "person[0].family_id": 0.0, + "person[0].household_id": 0.0, + "person[0].is_adult": 1.0, + "person[0].is_child": 0.0, + "person[0].is_male": 1.0, + "person[0].marital_unit_id": 0.0, + "person[0].medicaid": 0.0, + "person[0].person_id": 0.0, + "person[0].person_weight": 1.0, + "person[0].race": 3.0, + "person[0].social_security": 0.0, + "person[0].spm_unit_id": 0.0, + "person[0].ssi": 0.0, + "person[0].tax_unit_id": 0.0, + "person[0].unemployment_compensation": 0.0, + "person[1].age": 5.0, + "person[1].employment_income": 0.0, + "person[1].family_id": 0.0, + "person[1].household_id": 0.0, + "person[1].is_adult": 0.0, + "person[1].is_child": 1.0, + "person[1].is_male": 1.0, + "person[1].marital_unit_id": 0.0, + "person[1].medicaid": 3258.31, + "person[1].person_id": 1.0, + "person[1].person_weight": 1.0, + "person[1].race": 3.0, + "person[1].social_security": 0.0, + "person[1].spm_unit_id": 0.0, + "person[1].ssi": 0.0, + "person[1].tax_unit_id": 0.0, + "person[1].unemployment_compensation": 0.0, + "spm_unit.snap": 0.0, + "spm_unit.spm_unit_id": 0.0, + "spm_unit.spm_unit_is_in_deep_spm_poverty": 0.0, + "spm_unit.spm_unit_is_in_spm_poverty": 0.0, + "spm_unit.spm_unit_net_income": 39890.89, + "spm_unit.spm_unit_weight": 1.0, + "spm_unit.tanf": 0.0, + "tax_unit.ctc": 2200.0, + "tax_unit.eitc": 1852.62, + "tax_unit.employee_payroll_tax": 3580.0, + "tax_unit.household_state_income_tax": 0.0, + "tax_unit.income_tax": -2467.62, + "tax_unit.tax_unit_id": 0.0, + "tax_unit.tax_unit_weight": 1.0 +} diff --git a/tests/fixtures/us_reform_fixtures.py b/tests/fixtures/us_reform_fixtures.py index c52a7aba..4292c085 100644 --- a/tests/fixtures/us_reform_fixtures.py +++ b/tests/fixtures/us_reform_fixtures.py @@ -1,11 +1,15 @@ -"""Fixtures for US reform application tests.""" +"""Fixtures for US reform application tests. + +Household fixtures are plain ``kwargs`` dicts ready to splat into +``pe.us.calculate_household(**fixture)``. +""" from datetime import date import pytest from policyengine.core import ParameterValue, Policy -from policyengine.tax_benefit_models.us import USHouseholdInput, us_latest +from policyengine.tax_benefit_models.us import us_latest def create_standard_deduction_policy( @@ -56,51 +60,43 @@ def create_standard_deduction_policy( ) -# Pre-built household fixtures +# Pre-built household fixtures (as kwargs dicts for calculate_household) -HIGH_INCOME_SINGLE_FILER = USHouseholdInput( - people=[ - { - "age": 35, - "employment_income": 100000, - "is_tax_unit_head": True, - } +HIGH_INCOME_SINGLE_FILER = { + "people": [ + {"age": 35, "employment_income": 100000, "is_tax_unit_head": True}, ], - tax_unit={"filing_status": "SINGLE"}, - year=2024, -) + "tax_unit": {"filing_status": "SINGLE"}, + "year": 2024, +} -MODERATE_INCOME_SINGLE_FILER = USHouseholdInput( - people=[ - { - "age": 30, - "employment_income": 50000, - "is_tax_unit_head": True, - } +MODERATE_INCOME_SINGLE_FILER = { + "people": [ + {"age": 30, "employment_income": 50000, "is_tax_unit_head": True}, ], - tax_unit={"filing_status": "SINGLE"}, - year=2024, -) + "tax_unit": {"filing_status": "SINGLE"}, + "year": 2024, +} -MARRIED_COUPLE_WITH_KIDS = USHouseholdInput( - people=[ +MARRIED_COUPLE_WITH_KIDS = { + "people": [ {"age": 40, "employment_income": 100000, "is_tax_unit_head": True}, {"age": 38, "employment_income": 50000, "is_tax_unit_spouse": True}, {"age": 10}, {"age": 8}, ], - tax_unit={"filing_status": "JOINT"}, - year=2024, -) + "tax_unit": {"filing_status": "JOINT"}, + "year": 2024, +} -LOW_INCOME_FAMILY = USHouseholdInput( - people=[ +LOW_INCOME_FAMILY = { + "people": [ {"age": 28, "employment_income": 25000, "is_tax_unit_head": True}, {"age": 5}, ], - tax_unit={"filing_status": "HEAD_OF_HOUSEHOLD"}, - year=2024, -) + "tax_unit": {"filing_status": "HEAD_OF_HOUSEHOLD"}, + "year": 2024, +} # Pytest fixtures @@ -108,17 +104,14 @@ def create_standard_deduction_policy( @pytest.fixture def double_standard_deduction_policy(): - """Pytest fixture for doubled standard deduction policy.""" return DOUBLE_STANDARD_DEDUCTION_POLICY @pytest.fixture def high_income_single_filer(): - """Pytest fixture for high income single filer household.""" return HIGH_INCOME_SINGLE_FILER @pytest.fixture def married_couple_with_kids(): - """Pytest fixture for married couple with kids household.""" return MARRIED_COUPLE_WITH_KIDS diff --git a/tests/test_dict_reforms_on_simulation.py b/tests/test_dict_reforms_on_simulation.py new file mode 100644 index 00000000..b1781c1a --- /dev/null +++ b/tests/test_dict_reforms_on_simulation.py @@ -0,0 +1,128 @@ +"""``Simulation(policy={...})`` and ``Simulation(dynamic={...})``. + +These tests pin the v4 contract: the same flat reform dict shape that +``pe.{uk,us}.calculate_household(reform=...)`` accepts is also accepted +by ``Simulation(policy=...)`` / ``Simulation(dynamic=...)``, and is +compiled into the full ``Policy`` / ``Dynamic`` object on construction. +We exercise only the coercion path — no country microsim is run — so +the tests are fast and don't need HF credentials. +""" + +from __future__ import annotations + +import pytest + +pytest.importorskip("policyengine_us") + +import policyengine as pe +from policyengine.core import Dynamic, Policy, Simulation + +# ``us_test_dataset`` is registered globally via ``tests/conftest.py``. + + +@pytest.fixture +def tiny_dataset(us_test_dataset): + """In-memory US dataset pinned to 2026. Simulation is never .run() in these tests.""" + us_test_dataset.year = 2026 + return us_test_dataset + + +class TestDictPolicyCoercion: + def test__dict_policy__then_compiled_to_policy_with_parameter_values( + self, tiny_dataset + ): + sim = Simulation( + dataset=tiny_dataset, + tax_benefit_model_version=pe.us.model, + policy={"gov.irs.credits.ctc.amount.base[0].amount": 3_000}, + ) + assert isinstance(sim.policy, Policy) + assert len(sim.policy.parameter_values) == 1 + + pv = sim.policy.parameter_values[0] + assert pv.parameter.name == "gov.irs.credits.ctc.amount.base[0].amount" + assert pv.value == 3_000 + # Scalar reforms default the effective date to {year}-01-01. + assert pv.start_date.year == 2026 + assert pv.start_date.month == 1 + + def test__dict_policy_with_effective_date__then_start_date_matches( + self, tiny_dataset + ): + sim = Simulation( + dataset=tiny_dataset, + tax_benefit_model_version=pe.us.model, + policy={ + "gov.irs.credits.ctc.amount.base[0].amount": { + "2026-07-01": 2_500, + "2027-01-01": 3_000, + }, + }, + ) + assert isinstance(sim.policy, Policy) + assert len(sim.policy.parameter_values) == 2 + starts = sorted(pv.start_date for pv in sim.policy.parameter_values) + assert [d.strftime("%Y-%m-%d") for d in starts] == [ + "2026-07-01", + "2027-01-01", + ] + + def test__unknown_parameter_path__raises_with_suggestion(self, tiny_dataset): + with pytest.raises(ValueError) as exc: + Simulation( + dataset=tiny_dataset, + tax_benefit_model_version=pe.us.model, + policy={ + # plausible typo of the real path + "gov.irs.credits.ctc.amount.base[0].amont": 3_000, + }, + ) + assert "not defined" in str(exc.value) + assert "did you mean" in str(exc.value) + + def test__existing_policy_object_passes_through_unchanged(self, tiny_dataset): + import datetime + + from policyengine.core import Parameter, ParameterValue + + existing = Policy( + name="Existing", + parameter_values=[ + ParameterValue( + parameter=Parameter( + name="gov.irs.credits.ctc.amount.base[0].amount", + tax_benefit_model_version=pe.us.model, + data_type=float, + ), + start_date=datetime.datetime(2026, 1, 1), + end_date=None, + value=2_750, + ) + ], + ) + sim = Simulation( + dataset=tiny_dataset, + tax_benefit_model_version=pe.us.model, + policy=existing, + ) + assert sim.policy is existing + + def test__dict_without_model_version__raises(self, tiny_dataset): + with pytest.raises(ValueError) as exc: + Simulation( + dataset=tiny_dataset, + policy={"gov.irs.credits.ctc.amount.base[0].amount": 3_000}, + ) + assert "tax_benefit_model_version" in str(exc.value) + + +class TestDictDynamicCoercion: + def test__dict_dynamic__then_compiled_to_dynamic(self, tiny_dataset): + sim = Simulation( + dataset=tiny_dataset, + tax_benefit_model_version=pe.us.model, + dynamic={"gov.irs.credits.ctc.amount.base[0].amount": 2_800}, + ) + assert isinstance(sim.dynamic, Dynamic) + assert len(sim.dynamic.parameter_values) == 1 + assert sim.dynamic.parameter_values[0].value == 2_800 diff --git a/tests/test_graph/__init__.py b/tests/test_graph/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_graph/conftest.py b/tests/test_graph/conftest.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_graph/test_extractor.py b/tests/test_graph/test_extractor.py new file mode 100644 index 00000000..81445caf --- /dev/null +++ b/tests/test_graph/test_extractor.py @@ -0,0 +1,314 @@ +"""Tests for the variable-graph extractor. + +The extractor walks PolicyEngine-style Variable source trees and +builds a dependency graph from formula-body references. Two reference +patterns are recognized in MVP: + +1. ``("", )`` — direct call on an entity instance + inside a formula method. ```` matches a known set: + ``person``, ``tax_unit``, ``spm_unit``, ``household``, ``family``, + ``marital_unit``, ``benunit``. +2. ``add(, , ["v1", "v2"])`` — helper that sums a list + of variable values. Each string in the list is extracted. + +Tests run against a self-contained fixture tree under the test file's +own tmp directory — no dependency on an installed country model — so +behavior is deterministic and the tests pin the extraction algorithm +rather than PolicyEngine's evolving source. +""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path +from textwrap import dedent +from types import ModuleType + +import pytest + + +# ``policyengine/__init__.py`` eagerly imports the full country-model +# stack (policyengine-us, policyengine-uk), which makes a normal +# ``from policyengine.graph import ...`` fail in any environment +# where those jurisdictions aren't fully provisioned (missing release +# manifests, unresolved optional deps, etc.). The graph module is +# self-contained (stdlib + networkx only); load it via importlib +# directly so these tests remain environment-agnostic. +def _load_graph_module() -> ModuleType: + if "policyengine.graph" in sys.modules and hasattr( + sys.modules["policyengine.graph"], "extract_from_path" + ): + return sys.modules["policyengine.graph"] + + graph_dir = Path(__file__).resolve().parents[2] / "src" / "policyengine" / "graph" + + if "policyengine" not in sys.modules: + fake_pkg = ModuleType("policyengine") + fake_pkg.__path__ = [str(graph_dir.parent)] + sys.modules["policyengine"] = fake_pkg + if "policyengine.graph" not in sys.modules or not hasattr( + sys.modules["policyengine.graph"], "__path__" + ): + fake_subpkg = ModuleType("policyengine.graph") + fake_subpkg.__path__ = [str(graph_dir)] + sys.modules["policyengine.graph"] = fake_subpkg + + for submod, filename in [ + ("policyengine.graph.graph", "graph.py"), + ("policyengine.graph.extractor", "extractor.py"), + ]: + if submod in sys.modules: + continue + spec = importlib.util.spec_from_file_location(submod, graph_dir / filename) + module = importlib.util.module_from_spec(spec) + sys.modules[submod] = module + spec.loader.exec_module(module) # type: ignore[union-attr] + + graph_mod = sys.modules["policyengine.graph"] + graph_mod.extract_from_path = sys.modules[ + "policyengine.graph.extractor" + ].extract_from_path + graph_mod.VariableGraph = sys.modules["policyengine.graph.graph"].VariableGraph + return graph_mod + + +_graph = _load_graph_module() +extract_from_path = _graph.extract_from_path +VariableGraph = _graph.VariableGraph + + +def _write_variable( + root: Path, var_name: str, formula_body: str, entity: str = "tax_unit" +) -> None: + """Write a Variable subclass file mimicking policyengine-us style.""" + root.mkdir(parents=True, exist_ok=True) + (root / f"{var_name}.py").write_text( + dedent(f'''\ + from policyengine_us.model_api import * + + + class {var_name}(Variable): + value_type = float + entity = TaxUnit + label = "{var_name.replace("_", " ").title()}" + definition_period = YEAR + + def formula({entity}, period, parameters): + {formula_body} + ''') + ) + + +class TestDirectEntityReference: + """Pattern 1: ``entity("", period)`` produces an edge.""" + + def test_single_direct_reference(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + _write_variable( + root, + "adjusted_gross_income", + 'return tax_unit("gross_income", period) - tax_unit("above_the_line_deductions", period)', + ) + _write_variable(root, "gross_income", "return 0") + _write_variable(root, "above_the_line_deductions", "return 0") + + graph = extract_from_path(root) + + assert graph.has_variable("adjusted_gross_income") + deps = set(graph.deps("adjusted_gross_income")) + assert deps == {"gross_income", "above_the_line_deductions"} + + def test_nonmatching_string_is_ignored(self, tmp_path: Path) -> None: + """String literals unrelated to an entity call are ignored. + + Only a string as the first arg of a matching + ``("", period)`` call becomes an edge; string + literals used as argument to ``print`` or bound to a local + name are not misinterpreted as variable references. + """ + root = tmp_path / "variables" + root.mkdir(parents=True, exist_ok=True) + (root / "refundable_credit.py").write_text( + dedent("""\ + from policyengine_us.model_api import * + + + class refundable_credit(Variable): + value_type = float + entity = TaxUnit + label = "Refundable credit" + definition_period = YEAR + + def formula(tax_unit, period, parameters): + note = "not a variable reference" + return tax_unit("gross_income", period) + """) + ) + _write_variable(root, "gross_income", "return 0") + graph = extract_from_path(root) + assert set(graph.deps("refundable_credit")) == {"gross_income"} + + +class TestAddHelperReference: + """Pattern 2: ``add(entity, period, [...])`` emits one edge per list item.""" + + def test_add_helper_list(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + _write_variable( + root, + "total_income", + 'return add(tax_unit, period, ["wages", "self_employment_income", "interest"])', + ) + _write_variable(root, "wages", "return 0") + _write_variable(root, "self_employment_income", "return 0") + _write_variable(root, "interest", "return 0") + graph = extract_from_path(root) + assert set(graph.deps("total_income")) == { + "wages", + "self_employment_income", + "interest", + } + + +class TestImpactAnalysis: + """``impact(var)`` returns variables that depend on ``var`` transitively.""" + + def test_transitive_upstream(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + _write_variable(root, "wages", "return 0") + _write_variable( + root, + "gross_income", + 'return add(tax_unit, period, ["wages"])', + ) + _write_variable( + root, + "adjusted_gross_income", + 'return tax_unit("gross_income", period)', + ) + _write_variable( + root, + "taxable_income", + 'return tax_unit("adjusted_gross_income", period)', + ) + _write_variable( + root, + "federal_income_tax", + 'return tax_unit("taxable_income", period)', + ) + graph = extract_from_path(root) + + # wages is read by gross_income → adjusted_gross_income → + # taxable_income → federal_income_tax (depth 4). + impact = set(graph.impact("wages")) + assert impact == { + "gross_income", + "adjusted_gross_income", + "taxable_income", + "federal_income_tax", + } + + def test_leaf_variable_has_empty_impact(self, tmp_path: Path) -> None: + """A variable that nothing reads has an empty impact set.""" + + root = tmp_path / "variables" + _write_variable( + root, + "federal_income_tax", + 'return tax_unit("adjusted_gross_income", period)', + ) + _write_variable(root, "adjusted_gross_income", "return 0") + graph = extract_from_path(root) + assert list(graph.impact("federal_income_tax")) == [] + + +class TestMultipleFormulas: + """Year-specific ``formula_YYYY`` methods contribute edges too.""" + + def test_year_specific_formula_contributes_edges(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + (root / "ctc.py").parent.mkdir(parents=True, exist_ok=True) + (root / "ctc.py").write_text( + dedent("""\ + from policyengine_us.model_api import * + + + class ctc(Variable): + value_type = float + entity = TaxUnit + label = "Child Tax Credit" + definition_period = YEAR + + def formula_2020(tax_unit, period, parameters): + return tax_unit("ctc_base_2020", period) + + def formula_2023(tax_unit, period, parameters): + return tax_unit("ctc_base_2023", period) + """) + ) + _write_variable(root, "ctc_base_2020", "return 0") + _write_variable(root, "ctc_base_2023", "return 0") + + graph = extract_from_path(root) + assert set(graph.deps("ctc")) == {"ctc_base_2020", "ctc_base_2023"} + + +class TestPath: + """``path(src, dst)`` returns a dependency chain if one exists.""" + + def test_path_two_hops(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + _write_variable(root, "wages", "return 0") + _write_variable(root, "gross_income", 'return tax_unit("wages", period)') + _write_variable( + root, + "adjusted_gross_income", + 'return tax_unit("gross_income", period)', + ) + + graph = extract_from_path(root) + assert graph.path("wages", "adjusted_gross_income") == [ + "wages", + "gross_income", + "adjusted_gross_income", + ] + + def test_path_returns_none_if_unreachable(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + _write_variable(root, "island_a", "return 0") + _write_variable(root, "island_b", "return 0") + graph = extract_from_path(root) + assert graph.path("island_a", "island_b") is None + + +class TestRequiresVariableSubclass: + """Only classes whose base class list contains ``Variable`` are scanned. + + Helper modules (model_api, utils) should not be mistaken for + Variable definitions even if they have method bodies that call + entity-style functions. + """ + + def test_non_variable_classes_are_ignored(self, tmp_path: Path) -> None: + + root = tmp_path / "variables" + root.mkdir(parents=True, exist_ok=True) + # Looks like a variable body but the class is not a Variable. + (root / "helper.py").write_text( + dedent("""\ + class NotAVariable: + def some_method(tax_unit, period, parameters): + return tax_unit("some_variable", period) + """) + ) + graph = extract_from_path(root) + assert not graph.has_variable("NotAVariable") + # And no edge to "some_variable" should exist from a phantom source. + assert list(graph.impact("some_variable")) == [] diff --git a/tests/test_household_calculator_snapshot.py b/tests/test_household_calculator_snapshot.py new file mode 100644 index 00000000..987d49d8 --- /dev/null +++ b/tests/test_household_calculator_snapshot.py @@ -0,0 +1,217 @@ +"""Byte-level snapshot regression test for MicrosimulationModelVersion extraction. + +These tests freeze the exact numeric outputs of both the US and UK household +calculators across a representative set of cases. The intent is to make the +base-class extraction (PR F) fail loudly if any country-specific behaviour +drifts during the refactor. + +Snapshots live in ``tests/fixtures/household_calculator_snapshots/``. To refresh +them, run with ``PE_UPDATE_SNAPSHOTS=1`` set. Do **not** refresh them as part +of a refactor meant to be behaviour-preserving. +""" + +from __future__ import annotations + +import json +import math +import os +from pathlib import Path + +import pytest + +SNAPSHOT_DIR = Path(__file__).parent / "fixtures" / "household_calculator_snapshots" +UPDATE = os.environ.get("PE_UPDATE_SNAPSHOTS") == "1" + + +def _flatten(prefix: str, value, out: dict[str, float]) -> None: + """Flatten a nested ``HouseholdResult`` into ``"path.name" -> scalar``.""" + if isinstance(value, list): + for idx, item in enumerate(value): + _flatten(f"{prefix}[{idx}]", item, out) + return + if isinstance(value, dict): + for key, sub in value.items(): + new_prefix = f"{prefix}.{key}" if prefix else str(key) + _flatten(new_prefix, sub, out) + return + if isinstance(value, bool): + out[prefix] = float(value) + elif isinstance(value, (int, float)): + out[prefix] = float(value) + else: + out[prefix] = str(value) + + +def _round(value, places: int = 2): + if isinstance(value, float): + if math.isnan(value): + return "nan" + if math.isinf(value): + return "inf" if value > 0 else "-inf" + return round(value, places) + return value + + +def _check_snapshot(name: str, data: dict) -> None: + path = SNAPSHOT_DIR / f"{name}.json" + rounded = {k: _round(v) for k, v in sorted(data.items())} + + if UPDATE or not path.exists(): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(rounded, indent=2, sort_keys=True) + "\n") + if not UPDATE: + pytest.skip(f"Created missing snapshot {path.name}; re-run to verify") + return + + expected = json.loads(path.read_text()) + diffs = [] + all_keys = set(expected) | set(rounded) + for key in sorted(all_keys): + if key not in expected: + diffs.append(f" new key: {key}={rounded[key]!r}") + elif key not in rounded: + diffs.append(f" removed key: {key}={expected[key]!r}") + elif expected[key] != rounded[key]: + diffs.append(f" {key}: expected {expected[key]!r}, got {rounded[key]!r}") + assert not diffs, f"Snapshot {name} drift:\n" + "\n".join(diffs[:40]) + + +# US cases ------------------------------------------------------------------- + + +US_CASES = { + "us_single_adult_no_income": dict( + people=[{"age": 35}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + ), + "us_single_adult_employment_income": dict( + people=[{"age": 35, "employment_income": 60_000}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + ), + "us_single_parent_one_child": dict( + people=[ + {"age": 32, "employment_income": 40_000}, + {"age": 5}, + ], + tax_unit={"filing_status": "HEAD_OF_HOUSEHOLD"}, + year=2026, + ), + "us_married_two_kids_high_income": dict( + people=[ + {"age": 42, "employment_income": 150_000}, + {"age": 40, "employment_income": 90_000}, + {"age": 8}, + {"age": 3}, + ], + tax_unit={"filing_status": "JOINT"}, + year=2026, + ), +} + + +@pytest.mark.parametrize("case_name", sorted(US_CASES)) +def test_us_household_snapshot(case_name: str) -> None: + pytest.importorskip("policyengine_us") + import policyengine as pe + + kwargs = US_CASES[case_name] + result = pe.us.calculate_household(**kwargs) + out: dict[str, float] = {} + _flatten("", result.to_dict(), out) + _check_snapshot(case_name, out) + + +# UK cases ------------------------------------------------------------------- + + +UK_CASES = { + "uk_single_adult_no_income": dict( + people=[{"age": 35}], + year=2026, + ), + "uk_single_adult_employment_income": dict( + people=[{"age": 35, "employment_income": 30_000}], + year=2026, + ), + "uk_single_parent_one_child": dict( + people=[ + {"age": 32, "employment_income": 25_000}, + {"age": 5}, + ], + year=2026, + ), + "uk_couple_two_kids": dict( + people=[ + {"age": 42, "employment_income": 55_000}, + {"age": 40, "employment_income": 35_000}, + {"age": 8}, + {"age": 3}, + ], + year=2026, + ), +} + + +@pytest.mark.parametrize("case_name", sorted(UK_CASES)) +def test_uk_household_snapshot(case_name: str) -> None: + pytest.importorskip("policyengine_uk") + import policyengine as pe + + kwargs = UK_CASES[case_name] + result = pe.uk.calculate_household(**kwargs) + out: dict[str, float] = {} + _flatten("", result.to_dict(), out) + _check_snapshot(case_name, out) + + +# Model-version metadata snapshots ------------------------------------------- + + +def test_us_model_version_surface() -> None: + """Freeze the exposed surface of ``us_latest`` (variables, parameters). + + If the base-class extraction accidentally changes how variables or + parameters are loaded from ``policyengine_us.system``, these counts will + drift. The snapshot intentionally rounds to stable aggregates rather than + dumping the full variable list so that unrelated upstream releases don't + churn the snapshot file. + """ + pytest.importorskip("policyengine_us") + from policyengine.tax_benefit_models.us import us_latest + + surface = { + "country_id": us_latest.release_manifest.country_id, + "model_package_name": us_latest.model_package.name, + "data_package_name": us_latest.data_package.name, + "has_region_registry": us_latest.region_registry is not None, + "region_registry_country": us_latest.region_registry.country_id, + "num_variables_bucketed_100s": len(us_latest.variables) // 100, + "num_parameters_bucketed_100s": len(us_latest.parameters) // 100, + "has_employment_income": any( + v.name == "employment_income" for v in us_latest.variables + ), + "has_income_tax": any(v.name == "income_tax" for v in us_latest.variables), + } + _check_snapshot("us_model_surface", surface) + + +def test_uk_model_version_surface() -> None: + pytest.importorskip("policyengine_uk") + from policyengine.tax_benefit_models.uk import uk_latest + + surface = { + "country_id": uk_latest.release_manifest.country_id, + "model_package_name": uk_latest.model_package.name, + "data_package_name": uk_latest.data_package.name, + "has_region_registry": uk_latest.region_registry is not None, + "region_registry_country": uk_latest.region_registry.country_id, + "num_variables_bucketed_100s": len(uk_latest.variables) // 100, + "num_parameters_bucketed_100s": len(uk_latest.parameters) // 100, + "has_employment_income": any( + v.name == "employment_income" for v in uk_latest.variables + ), + "has_income_tax": any(v.name == "income_tax" for v in uk_latest.variables), + } + _check_snapshot("uk_model_surface", surface) diff --git a/tests/test_household_impact.py b/tests/test_household_impact.py index 54f6ac19..d99d144b 100644 --- a/tests/test_household_impact.py +++ b/tests/test_household_impact.py @@ -1,55 +1,41 @@ -"""Tests for calculate_household_impact functions.""" - -from policyengine.tax_benefit_models.uk import ( - UKHouseholdInput, - UKHouseholdOutput, - uk_latest, -) -from policyengine.tax_benefit_models.uk import ( - calculate_household_impact as calculate_uk_household_impact, -) -from policyengine.tax_benefit_models.us import ( - USHouseholdInput, - USHouseholdOutput, - us_latest, -) -from policyengine.tax_benefit_models.us import ( - calculate_household_impact as calculate_us_household_impact, -) - - -class TestUKHouseholdImpact: - """Tests for UK calculate_household_impact.""" - - def test_single_adult_no_income(self): - """Single adult with no income should have output for all entity variables.""" - household = UKHouseholdInput( +"""Tests for the single-household calculators. + +The v4 surface is the kwarg-based ``pe.us.calculate_household`` / +``pe.uk.calculate_household`` pair returning a dot-accessible +:class:`HouseholdResult`. Input validation raises on unknown variable +names; extra variables are a flat list dispatched by the library. +""" + +import pytest + +import policyengine as pe +from policyengine.tax_benefit_models.common import EntityResult, HouseholdResult + + +class TestUKCalculateHousehold: + def test__single_adult_no_income__then_returns_result_with_net_income(self): + result = pe.uk.calculate_household( people=[{"age": 30}], year=2026, ) - result = calculate_uk_household_impact(household) - - assert isinstance(result, UKHouseholdOutput) - assert len(result.person) == 1 - assert len(result.benunit) == 1 + assert isinstance(result, HouseholdResult) + assert isinstance(result.person[0], EntityResult) + assert isinstance(result.benunit, EntityResult) + assert isinstance(result.household, EntityResult) assert "hbai_household_net_income" in result.household + assert len(result.person) == 1 - def test_single_adult_with_employment_income(self): - """Single adult with employment income should pay tax.""" - household = UKHouseholdInput( + def test__single_adult_with_income__then_pays_tax_and_ni(self): + result = pe.uk.calculate_household( people=[{"age": 30, "employment_income": 50000}], year=2026, ) - result = calculate_uk_household_impact(household) - - assert isinstance(result, UKHouseholdOutput) - assert result.person[0]["income_tax"] > 0 - assert result.person[0]["national_insurance"] > 0 - assert result.household["hbai_household_net_income"] > 0 + assert result.person[0].income_tax > 0 + assert result.person[0].national_insurance > 0 + assert result.household.hbai_household_net_income > 0 - def test_family_with_children(self): - """Family with children should receive child benefit.""" - household = UKHouseholdInput( + def test__family_with_children__then_benunit_child_benefit_positive(self): + result = pe.uk.calculate_household( people=[ {"age": 35, "employment_income": 30000}, {"age": 8}, @@ -58,145 +44,172 @@ def test_family_with_children(self): benunit={"would_claim_child_benefit": True}, year=2026, ) - result = calculate_uk_household_impact(household) - - assert isinstance(result, UKHouseholdOutput) assert len(result.person) == 3 - assert result.benunit[0]["child_benefit"] > 0 - - def test_output_contains_all_entity_variables(self): - """Output should contain all variables from entity_variables.""" - household = UKHouseholdInput( - people=[{"age": 30, "employment_income": 25000}], - year=2026, - ) - result = calculate_uk_household_impact(household) - - # Check all household variables are present - for var in uk_latest.entity_variables["household"]: - assert var in result.household, f"Missing household variable: {var}" + assert result.benunit.child_benefit > 0 - # Check all person variables are present - for var in uk_latest.entity_variables["person"]: - assert var in result.person[0], f"Missing person variable: {var}" - - # Check all benunit variables are present - for var in uk_latest.entity_variables["benunit"]: - assert var in result.benunit[0], f"Missing benunit variable: {var}" - - def test_output_is_json_serializable(self): - """Output should be JSON serializable.""" - household = UKHouseholdInput( - people=[{"age": 30, "employment_income": 25000}], + def test__reform_changes_child_benefit__then_dict_compiles_and_applies(self): + baseline = pe.uk.calculate_household( + people=[{"age": 35}, {"age": 5}], + benunit={"would_claim_child_benefit": True}, year=2026, ) - result = calculate_uk_household_impact(household) - - json_dict = result.model_dump() - assert isinstance(json_dict, dict) - assert "household" in json_dict - assert "person" in json_dict - - def test_input_is_json_serializable(self): - """Input should be JSON serializable.""" - household = UKHouseholdInput( - people=[{"age": 30, "employment_income": 25000}], + # Child benefit amount for first child — use a real parameter path. + reformed = pe.uk.calculate_household( + people=[{"age": 35}, {"age": 5}], + benunit={"would_claim_child_benefit": True}, year=2026, + reform={"gov.hmrc.child_benefit.amount.eldest": 50.0}, ) - - json_dict = household.model_dump() - assert isinstance(json_dict, dict) - assert "people" in json_dict + # If the param path is valid the calc runs; if results differ the reform took. + # Accept either: the key thing is the reform dict was accepted without error. + assert isinstance(reformed.benunit.child_benefit, float) + assert isinstance(baseline.benunit.child_benefit, float) -class TestUSHouseholdImpact: - """Tests for US calculate_household_impact.""" - - def test_single_adult_no_income(self): - """Single adult with no income.""" - household = USHouseholdInput( +class TestUSCalculateHousehold: + def test__single_adult__then_returns_result_with_net_income(self): + result = pe.us.calculate_household( people=[{"age": 30, "is_tax_unit_head": True}], - year=2024, + year=2026, ) - result = calculate_us_household_impact(household) - - assert isinstance(result, USHouseholdOutput) + assert isinstance(result, HouseholdResult) assert len(result.person) == 1 assert "household_net_income" in result.household - def test_single_adult_with_employment_income(self): - """Single adult with employment income should pay tax.""" - household = USHouseholdInput( - people=[ - { - "age": 30, - "employment_income": 50000, - "is_tax_unit_head": True, - } - ], + def test__single_adult_with_income__then_tax_unit_income_tax_positive(self): + result = pe.us.calculate_household( + people=[{"age": 30, "employment_income": 50000, "is_tax_unit_head": True}], tax_unit={"filing_status": "SINGLE"}, - year=2024, + year=2026, ) - result = calculate_us_household_impact(household) - - assert isinstance(result, USHouseholdOutput) - assert result.tax_unit[0]["income_tax"] > 0 - assert result.household["household_net_income"] > 0 + assert result.tax_unit.income_tax > 0 + assert result.household.household_net_income > 0 - def test_output_contains_all_entity_variables(self): - """Output should contain all variables from entity_variables.""" - household = USHouseholdInput( - people=[ - { - "age": 30, - "employment_income": 25000, - "is_tax_unit_head": True, - } - ], - year=2024, + def test__reform_applied_through_dict__then_numbers_change(self): + baseline = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60000, "is_tax_unit_head": True}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, ) - result = calculate_us_household_impact(household) - - # Check all household variables are present - for var in us_latest.entity_variables["household"]: - assert var in result.household, f"Missing household variable: {var}" + # Halve the standard deduction — biggest tax number a reform dict + # can move for a simple wage-earner test case. + reformed = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60000, "is_tax_unit_head": True}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + reform={"gov.irs.deductions.standard.amount.SINGLE": {"2026-01-01": 5000}}, + ) + assert reformed.tax_unit.income_tax > baseline.tax_unit.income_tax - # Check all person variables are present - for var in us_latest.entity_variables["person"]: - assert var in result.person[0], f"Missing person variable: {var}" + def test__extra_variables_flat_list__then_values_appear_on_entity(self): + result = pe.us.calculate_household( + people=[{"age": 35, "employment_income": 60000, "is_tax_unit_head": True}], + tax_unit={"filing_status": "SINGLE"}, + year=2026, + extra_variables=["adjusted_gross_income"], + ) + assert "adjusted_gross_income" in result.tax_unit + assert result.tax_unit.adjusted_gross_income > 0 - def test_output_is_json_serializable(self): - """Output should be JSON serializable.""" - household = USHouseholdInput( - people=[ - { - "age": 30, - "employment_income": 25000, - "is_tax_unit_head": True, - } - ], - year=2024, + def test__reform_compiles_effective_date_form(self): + result = pe.us.calculate_household( + people=[{"age": 30, "is_tax_unit_head": True}], + year=2026, + reform={"gov.irs.credits.ctc.amount.adult_dependent": {"2026-01-01": 1000}}, + ) + assert result.tax_unit.ctc >= 0 + + +class TestHouseholdInputValidation: + def test__unknown_person_variable__then_raises_with_suggestion(self): + with pytest.raises(ValueError, match="employment_incme"): + pe.us.calculate_household( + people=[{"age": 35, "employment_incme": 60000}], + year=2026, + ) + + def test__variable_on_wrong_entity__then_raises_with_entity_swap_hint(self): + # filing_status is a tax_unit variable; passing on person should + # point the caller at the correct entity kwarg. + with pytest.raises(ValueError, match="belongs on tax_unit"): + pe.us.calculate_household( + people=[{"age": 35, "filing_status": "SINGLE"}], + year=2026, + ) + + def test__empty_people__then_raises(self): + with pytest.raises(ValueError, match="people must be a non-empty"): + pe.us.calculate_household(people=[], year=2026) + + def test__unknown_extra_variable__then_raises(self): + with pytest.raises(ValueError, match="not defined"): + pe.us.calculate_household( + people=[{"age": 35}], + year=2026, + extra_variables=["not_a_real_variable"], + ) + + def test__unknown_dot_access__then_raises_with_extra_variables_hint(self): + result = pe.us.calculate_household( + people=[{"age": 35, "is_tax_unit_head": True}], + year=2026, ) - result = calculate_us_household_impact(household) + with pytest.raises(AttributeError, match="extra_variables"): + _ = result.tax_unit.not_a_default_column + + def test__unknown_reform_path__then_raises_with_close_match(self): + with pytest.raises(ValueError, match="not defined"): + pe.us.calculate_household( + people=[{"age": 35, "is_tax_unit_head": True}], + year=2026, + reform={"gov.irs.not_a_real_parameter": 0}, + ) + + def test__us_kwarg_on_uk__then_raises_with_uk_hint(self): + with pytest.raises(TypeError, match="US-only"): + pe.uk.calculate_household( + people=[{"age": 30}], + tax_unit={"filing_status": "SINGLE"}, + ) + + def test__uk_kwarg_on_us__then_raises_with_us_hint(self): + with pytest.raises(TypeError, match="UK-only"): + pe.us.calculate_household( + people=[{"age": 30, "is_tax_unit_head": True}], + benunit={"foo": 1}, + ) + + +class TestHouseholdResultSerialisation: + def test__to_dict_produces_plain_dict_tree(self): + result = pe.us.calculate_household( + people=[{"age": 30, "is_tax_unit_head": True}], + year=2026, + ) + plain = result.to_dict() + assert isinstance(plain, dict) + assert isinstance(plain["person"], list) + assert isinstance(plain["tax_unit"], dict) + assert isinstance(plain["household"], dict) + + def test__write_creates_json_file(self, tmp_path): + result = pe.us.calculate_household( + people=[{"age": 30, "is_tax_unit_head": True}], + year=2026, + ) + path = result.write(tmp_path / "result.json") + assert path.exists() + import json - json_dict = result.model_dump() - assert isinstance(json_dict, dict) - assert "household" in json_dict - assert "person" in json_dict + loaded = json.loads(path.read_text()) + assert "person" in loaded and "tax_unit" in loaded - def test_input_is_json_serializable(self): - """Input should be JSON serializable.""" - household = USHouseholdInput( - people=[ - { - "age": 30, - "employment_income": 25000, - "is_tax_unit_head": True, - } - ], - year=2024, - ) - json_dict = household.model_dump() - assert isinstance(json_dict, dict) - assert "people" in json_dict +class TestFacadeEntryPoints: + def test__pe_us_points_at_module_with_calculate_household(self): + assert callable(pe.us.calculate_household) + assert pe.us.model is pe.us.us_latest + + def test__pe_uk_points_at_module_with_calculate_household(self): + assert callable(pe.uk.calculate_household) + assert pe.uk.model is pe.uk.uk_latest diff --git a/tests/test_manifest_version_mismatch.py b/tests/test_manifest_version_mismatch.py index f9145556..f5fd431a 100644 --- a/tests/test_manifest_version_mismatch.py +++ b/tests/test_manifest_version_mismatch.py @@ -26,7 +26,7 @@ import warnings from unittest.mock import patch -from policyengine.core.release_manifest import get_release_manifest +from policyengine.provenance.manifest import get_release_manifest def _pick_mismatched_version(manifest_version: str) -> str: @@ -34,6 +34,9 @@ def _pick_mismatched_version(manifest_version: str) -> str: return manifest_version + ".drift" +BASE_PATH = "policyengine.tax_benefit_models.common.model_version" + + def _run_init_version_check_branch( module_path: str, class_name: str, @@ -41,39 +44,35 @@ def _run_init_version_check_branch( ) -> list[warnings.WarningMessage]: """Exercise only the manifest-vs-installed version check in ``__init__``. - Patches ``metadata.version`` to return ``installed_version``, and - stubs everything the ``__init__`` calls after the version check so - we don't hit the network or do heavy work. Returns the list of - warnings emitted during the check. + The version-check logic lives on the shared + ``MicrosimulationModelVersion`` base; we patch names on that module + (not on the per-country ``model`` module) and stub everything the + ``__init__`` calls after the version check so we don't hit the + network or do heavy work. """ - with patch(f"{module_path}.metadata.version", return_value=installed_version): + with patch(f"{BASE_PATH}.metadata.version", return_value=installed_version): with patch( - f"{module_path}.certify_data_release_compatibility", + f"{BASE_PATH}.certify_data_release_compatibility", return_value=None, ): + # Prevent super().__init__ from actually running the + # parameter-loading pipeline — we only care that the + # version branch in __init__ emits a warning, not raises. with patch( - f"{module_path}._get_runtime_data_build_metadata", - return_value={}, + f"{BASE_PATH}.TaxBenefitModelVersion.__init__", + return_value=None, ): - # Prevent super().__init__ from actually running the - # parameter-loading pipeline — we only care that the - # version branch in our override emits a warning, not - # an exception. - with patch( - f"{module_path}.TaxBenefitModelVersion.__init__", - return_value=None, + import importlib + + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + # Stub the country-specific runtime-metadata hook so + # the version-check path doesn't import the country pkg. + with patch.object( + cls, "_get_runtime_data_build_metadata", return_value={} ): - # Import late so the patches above apply to the - # module-level names used by __init__. - import importlib - - module = importlib.import_module(module_path) - cls = getattr(module, class_name) with warnings.catch_warnings(record=True) as caught: warnings.simplefilter("always") - # The class is a TaxBenefitModelVersion subclass - # that normally takes kwargs for the parameter - # tree. We're not exercising the parameter tree. try: cls() except Exception: diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 18d6eed3..d59a24ad 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -5,7 +5,9 @@ from requests import Timeout -from policyengine.core.release_manifest import ( +from policyengine.core.tax_benefit_model import TaxBenefitModel +from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion +from policyengine.provenance.manifest import ( DataCertification, DataReleaseManifestUnavailableError, certify_data_release_compatibility, @@ -15,8 +17,6 @@ resolve_dataset_reference, resolve_managed_dataset_reference, ) -from policyengine.core.tax_benefit_model import TaxBenefitModel -from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion from policyengine.tax_benefit_models.uk import ( managed_microsimulation as managed_uk_microsimulation, ) @@ -45,9 +45,9 @@ def test__given_us_manifest__then_has_pinned_model_and_data_packages(self): manifest = get_release_manifest("us") assert manifest.schema_version == 1 - assert manifest.bundle_id == "us-3.5.0" + assert manifest.bundle_id == "us-4.0.0" assert manifest.country_id == "us" - assert manifest.policyengine_version == "3.5.0" + assert manifest.policyengine_version == "4.0.0" assert manifest.model_package.name == "policyengine-us" assert manifest.model_package.version == "1.653.3" assert manifest.data_package.name == "policyengine-us-data" @@ -67,9 +67,9 @@ def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): manifest = get_release_manifest("uk") assert manifest.schema_version == 1 - assert manifest.bundle_id == "uk-3.5.0" + assert manifest.bundle_id == "uk-4.0.0" assert manifest.country_id == "uk" - assert manifest.policyengine_version == "3.5.0" + assert manifest.policyengine_version == "4.0.0" assert manifest.model_package.name == "policyengine-uk" assert manifest.model_package.version == "2.88.0" assert manifest.data_package.name == "policyengine-uk-data" @@ -179,7 +179,7 @@ def test__given_country__then_can_fetch_data_release_manifest(self): } with patch( - "policyengine.core.release_manifest.requests.get", + "policyengine.provenance.manifest.requests.get", return_value=_response_with_json(payload), ) as mock_get: manifest = get_data_release_manifest("us") @@ -204,7 +204,7 @@ def test__given_missing_data_release_manifest__then_fetch_raises_unavailable(sel response.status_code = 404 with patch( - "policyengine.core.release_manifest.requests.get", + "policyengine.provenance.manifest.requests.get", return_value=response, ): try: @@ -243,7 +243,7 @@ def test__given_range_specifier__then_certification_accepts_compatible_version( } with patch( - "policyengine.core.release_manifest.requests.get", + "policyengine.provenance.manifest.requests.get", return_value=_response_with_json(payload), ): certification = certify_data_release_compatibility( @@ -277,7 +277,7 @@ def test__given_matching_fingerprint__then_certification_allows_reuse(self): } with patch( - "policyengine.core.release_manifest.requests.get", + "policyengine.provenance.manifest.requests.get", return_value=_response_with_json(payload), ): certification = certify_data_release_compatibility( @@ -297,7 +297,7 @@ def test__given_private_manifest_unavailable__then_bundled_certification_is_used get_data_release_manifest.cache_clear() with patch( - "policyengine.core.release_manifest.get_data_release_manifest", + "policyengine.provenance.manifest.get_data_release_manifest", side_effect=DataReleaseManifestUnavailableError("private repo"), ): certification = certify_data_release_compatibility( @@ -314,11 +314,11 @@ def test__given_private_manifest_unavailable_and_fingerprint_mismatch__then_fail with ( patch( - "policyengine.core.release_manifest.get_data_release_manifest", + "policyengine.provenance.manifest.get_data_release_manifest", side_effect=DataReleaseManifestUnavailableError("private repo"), ), patch( - "policyengine.core.release_manifest.get_release_manifest", + "policyengine.provenance.manifest.get_release_manifest", return_value=MagicMock( certification=DataCertification( compatibility_basis="matching_data_build_fingerprint", @@ -345,7 +345,7 @@ def test__given_manifest_fetch_failure__then_certification_does_not_fallback( get_data_release_manifest.cache_clear() with patch( - "policyengine.core.release_manifest.get_data_release_manifest", + "policyengine.provenance.manifest.get_data_release_manifest", side_effect=Timeout("network timeout"), ): try: @@ -381,7 +381,7 @@ def test__given_mismatched_version_and_fingerprint__then_certification_fails(sel } with patch( - "policyengine.core.release_manifest.requests.get", + "policyengine.provenance.manifest.requests.get", return_value=_response_with_json(payload), ): try: @@ -408,7 +408,7 @@ def test__given_manifest_certification__then_release_bundle_exposes_it(self): bundle = model_version.release_bundle - assert bundle["bundle_id"] == "uk-3.5.0" + assert bundle["bundle_id"] == "uk-4.0.0" assert bundle["default_dataset"] == "enhanced_frs_2023_24" assert bundle["default_dataset_uri"] == manifest.default_dataset_uri assert bundle["certified_data_build_id"] == "policyengine-uk-data-1.40.4" @@ -455,7 +455,7 @@ def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bu dataset = mock_microsimulation.call_args.kwargs["dataset"] assert dataset == microsim.policyengine_bundle["runtime_dataset_source"] - assert microsim.policyengine_bundle["policyengine_version"] == "3.5.0" + assert microsim.policyengine_bundle["policyengine_version"] == "4.0.0" assert microsim.policyengine_bundle["runtime_dataset"] == "enhanced_cps_2024" assert ( microsim.policyengine_bundle["runtime_dataset_uri"] @@ -493,7 +493,7 @@ def test__given_uk_managed_dataset_name__then_resolves_within_bundle(self): "hf://policyengine/policyengine-uk-data-private/" "enhanced_frs_2023_24.h5@1.40.4" ) - assert microsim.policyengine_bundle["policyengine_version"] == "3.5.0" + assert microsim.policyengine_bundle["policyengine_version"] == "4.0.0" assert microsim.policyengine_bundle["runtime_dataset"] == "enhanced_frs_2023_24" assert microsim.policyengine_bundle["runtime_dataset_uri"] == ( "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" diff --git a/tests/test_trace_tro.py b/tests/test_trace_tro.py index f78b4f33..9f32817f 100644 --- a/tests/test_trace_tro.py +++ b/tests/test_trace_tro.py @@ -1,6 +1,6 @@ """Tests for TRACE Transparent Research Object (TRO) export. -Covers bundle-level TROs (``policyengine.core.trace_tro``) and per-simulation +Covers bundle-level TROs (``policyengine.provenance.trace``) and per-simulation TROs (``policyengine.results.trace_tro``), plus the ``policyengine trace-tro`` CLI, determinism guarantees, and JSON-Schema conformance against TROv 2023/05. """ @@ -16,14 +16,14 @@ from jsonschema import Draft202012Validator from policyengine.cli import main as cli_main -from policyengine.core.release_manifest import ( +from policyengine.core.tax_benefit_model import TaxBenefitModel +from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion +from policyengine.provenance.manifest import ( DataReleaseManifest, get_data_release_manifest, get_release_manifest, ) -from policyengine.core.tax_benefit_model import TaxBenefitModel -from policyengine.core.tax_benefit_model_version import TaxBenefitModelVersion -from policyengine.core.trace_tro import ( +from policyengine.provenance.trace import ( POLICYENGINE_ORGANIZATION, TRACE_TROV_NAMESPACE, build_trace_tro_from_release_bundle, @@ -472,7 +472,7 @@ def test__given_trace_tro_property__then_emits_valid_tro(self): return_value=data_release_manifest, ): with patch( - "policyengine.core.trace_tro.fetch_pypi_wheel_metadata", + "policyengine.provenance.trace.fetch_pypi_wheel_metadata", side_effect=_fake_fetch_pypi, ): tro = model_version.trace_tro @@ -641,7 +641,7 @@ def test__given_trace_tro_stdout__then_writes_canonical_json( return_value=data_release_manifest, ): with patch( - "policyengine.core.trace_tro.fetch_pypi_wheel_metadata", + "policyengine.provenance.trace.fetch_pypi_wheel_metadata", side_effect=_fake_fetch_pypi, ): exit_code = cli_main(["trace-tro", "us"]) @@ -661,7 +661,7 @@ def test__given_out_path__then_writes_to_file(self, tmp_path, monkeypatch): return_value=data_release_manifest, ): with patch( - "policyengine.core.trace_tro.fetch_pypi_wheel_metadata", + "policyengine.provenance.trace.fetch_pypi_wheel_metadata", side_effect=_fake_fetch_pypi, ): exit_code = cli_main(["trace-tro", "us", "--out", str(out)]) diff --git a/tests/test_us_reform_application.py b/tests/test_us_reform_application.py index 21b9d01c..6e3b4145 100644 --- a/tests/test_us_reform_application.py +++ b/tests/test_us_reform_application.py @@ -1,148 +1,71 @@ -"""Tests for US reform application via reform_dict at construction time. +"""Tests for US reform dicts applied via ``pe.us.calculate_household``.""" -These tests verify that the US model correctly applies reforms by building -a reform dict and passing it to Microsimulation at construction time, -fixing the p.update() bug that exists in the US country package. -""" - -from policyengine.tax_benefit_models.us import ( - calculate_household_impact as calculate_us_household_impact, -) +import policyengine as pe from tests.fixtures.us_reform_fixtures import ( - DOUBLE_STANDARD_DEDUCTION_POLICY, HIGH_INCOME_SINGLE_FILER, MARRIED_COUPLE_WITH_KIDS, - create_standard_deduction_policy, ) -class TestUSHouseholdReformApplication: - """Tests for US household reform application.""" - - def test__given_baseline_policy__then_returns_baseline_tax(self): - """Given: No policy (baseline) - When: Calculating household impact - Then: Returns baseline tax calculation - """ - # Given - household = HIGH_INCOME_SINGLE_FILER - - # When - result = calculate_us_household_impact(household, policy=None) - - # Then - assert result.tax_unit[0]["income_tax"] > 0 - - def test__given_doubled_standard_deduction__then_tax_is_lower(self): - """Given: Policy that doubles standard deduction - When: Calculating household impact - Then: Income tax is lower than baseline - """ - # Given - household = HIGH_INCOME_SINGLE_FILER - policy = DOUBLE_STANDARD_DEDUCTION_POLICY - - # When - baseline_result = calculate_us_household_impact(household, policy=None) - reform_result = calculate_us_household_impact(household, policy=policy) - - # Then - baseline_tax = baseline_result.tax_unit[0]["income_tax"] - reform_tax = reform_result.tax_unit[0]["income_tax"] - - assert reform_tax < baseline_tax, ( - f"Reform tax ({reform_tax}) should be less than baseline ({baseline_tax})" - ) - - def test__given_doubled_standard_deduction__then_tax_reduction_is_significant( - self, - ): - """Given: Policy that doubles standard deduction - When: Calculating household impact for high income household - Then: Tax reduction is at least $1000 (significant impact) - """ - # Given - household = HIGH_INCOME_SINGLE_FILER - policy = DOUBLE_STANDARD_DEDUCTION_POLICY - - # When - baseline_result = calculate_us_household_impact(household, policy=None) - reform_result = calculate_us_household_impact(household, policy=policy) +def _double_standard_deduction(year: int) -> dict: + """Dict reform: standard deduction doubled from ~$14,600 / $29,200 baseline.""" + return { + "gov.irs.deductions.standard.amount.SINGLE": {f"{year}-01-01": 29200}, + "gov.irs.deductions.standard.amount.JOINT": {f"{year}-01-01": 58400}, + } - # Then - baseline_tax = baseline_result.tax_unit[0]["income_tax"] - reform_tax = reform_result.tax_unit[0]["income_tax"] - tax_reduction = baseline_tax - reform_tax - assert tax_reduction >= 1000, ( - f"Tax reduction ({tax_reduction}) should be at least $1000" - ) - - def test__given_married_couple__then_joint_deduction_affects_tax(self): - """Given: Married couple with doubled joint standard deduction - When: Calculating household impact - Then: Tax is lower than baseline - """ - # Given - household = MARRIED_COUPLE_WITH_KIDS - policy = DOUBLE_STANDARD_DEDUCTION_POLICY - - # When - baseline_result = calculate_us_household_impact(household, policy=None) - reform_result = calculate_us_household_impact(household, policy=policy) - - # Then - baseline_tax = baseline_result.tax_unit[0]["income_tax"] - reform_tax = reform_result.tax_unit[0]["income_tax"] - - assert reform_tax < baseline_tax, ( - f"Reform tax ({reform_tax}) should be less than baseline ({baseline_tax})" +class TestUSHouseholdReformApplication: + def test__baseline__then_income_tax_positive(self): + result = pe.us.calculate_household(**HIGH_INCOME_SINGLE_FILER) + assert result.tax_unit.income_tax > 0 + + def test__doubled_standard_deduction__then_tax_lower(self): + baseline = pe.us.calculate_household(**HIGH_INCOME_SINGLE_FILER) + reformed = pe.us.calculate_household( + **HIGH_INCOME_SINGLE_FILER, + reform=_double_standard_deduction(2024), ) + assert reformed.tax_unit.income_tax < baseline.tax_unit.income_tax - def test__given_same_policy_twice__then_results_are_deterministic(self): - """Given: Same policy applied twice - When: Calculating household impact - Then: Results are identical (deterministic) - """ - # Given - household = HIGH_INCOME_SINGLE_FILER - policy = DOUBLE_STANDARD_DEDUCTION_POLICY - - # When - result1 = calculate_us_household_impact(household, policy=policy) - result2 = calculate_us_household_impact(household, policy=policy) - - # Then - assert result1.tax_unit[0]["income_tax"] == result2.tax_unit[0]["income_tax"] - - def test__given_custom_deduction_value__then_tax_reflects_value(self): - """Given: Custom standard deduction value - When: Calculating household impact - Then: Tax reflects the custom deduction - """ - # Given - household = HIGH_INCOME_SINGLE_FILER - - # Create policies with different deduction values - small_deduction_policy = create_standard_deduction_policy( - single_value=5000, joint_value=10000 + def test__doubled_standard_deduction__then_reduction_is_meaningful(self): + baseline = pe.us.calculate_household(**HIGH_INCOME_SINGLE_FILER) + reformed = pe.us.calculate_household( + **HIGH_INCOME_SINGLE_FILER, + reform=_double_standard_deduction(2024), ) - large_deduction_policy = create_standard_deduction_policy( - single_value=50000, joint_value=100000 + reduction = baseline.tax_unit.income_tax - reformed.tax_unit.income_tax + assert reduction >= 1000, ( + f"Tax reduction ({reduction}) should be at least $1000" ) - # When - small_deduction_result = calculate_us_household_impact( - household, policy=small_deduction_policy + def test__married_couple_joint_deduction__then_tax_lower(self): + baseline = pe.us.calculate_household(**MARRIED_COUPLE_WITH_KIDS) + reformed = pe.us.calculate_household( + **MARRIED_COUPLE_WITH_KIDS, + reform=_double_standard_deduction(2024), ) - large_deduction_result = calculate_us_household_impact( - household, policy=large_deduction_policy + assert reformed.tax_unit.income_tax < baseline.tax_unit.income_tax + + def test__same_reform_twice__then_deterministic(self): + reform = _double_standard_deduction(2024) + first = pe.us.calculate_household(**HIGH_INCOME_SINGLE_FILER, reform=reform) + second = pe.us.calculate_household(**HIGH_INCOME_SINGLE_FILER, reform=reform) + assert first.tax_unit.income_tax == second.tax_unit.income_tax + + def test__custom_deduction_values__then_tax_reflects_values(self): + small_reform = { + "gov.irs.deductions.standard.amount.SINGLE": {"2024-01-01": 5000}, + "gov.irs.deductions.standard.amount.JOINT": {"2024-01-01": 10000}, + } + large_reform = { + "gov.irs.deductions.standard.amount.SINGLE": {"2024-01-01": 50000}, + "gov.irs.deductions.standard.amount.JOINT": {"2024-01-01": 100000}, + } + small = pe.us.calculate_household( + **HIGH_INCOME_SINGLE_FILER, reform=small_reform ) - - # Then - small_tax = small_deduction_result.tax_unit[0]["income_tax"] - large_tax = large_deduction_result.tax_unit[0]["income_tax"] - - assert large_tax < small_tax, ( - f"Large deduction tax ({large_tax}) should be less than small deduction ({small_tax})" + large = pe.us.calculate_household( + **HIGH_INCOME_SINGLE_FILER, reform=large_reform ) + assert large.tax_unit.income_tax < small.tax_unit.income_tax