From f066f750eeabc824b8014820e3780771d4692fd9 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 20 Apr 2026 09:38:33 -0400 Subject: [PATCH 1/2] Honor Simulation.extra_variables on US and UK microsim paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #303. The field was declared on the base Simulation class and honored by pe.us.calculate_household / pe.uk.calculate_household, but the population run() methods only iterated self.entity_variables — any variable a caller added via Simulation(extra_variables={...}) was silently dropped from output_dataset. Fix: - New MicrosimulationModelVersion.resolve_entity_variables(simulation) in tax_benefit_models.common.model_version merges bundled defaults with simulation.extra_variables. Order preserved (defaults first, extras appended). Duplicates deduped. - Unknown entity keys raise ValueError with close-match suggestions (e.g. "tax_units" -> "did you mean 'tax_unit'?"). Same pattern used by compile_reform for parameter paths. - Unknown variable names raise with close-match suggestions. - Both US run() (tax_benefit_models/us/model.py) and UK run() (tax_benefit_models/uk/model.py) now call resolve_entity_variables in place of iterating self.entity_variables directly. Regression tests in tests/test_extra_variables.py: - US: extras appear on output_dataset for household + tax_unit; dedupes when extra overlaps a default; unknown entity/variable names raise with suggestions. - UK: direct test of resolve_entity_variables on pe.uk.model (full UK microsim fixture is heavier than needed; the method is the integration point and both countries call it identically). 403 existing tests still pass; 6 new tests pass. --- ...-extra-variables-silently-ignored.fixed.md | 1 + .../common/model_version.py | 64 +++++ .../tax_benefit_models/uk/model.py | 5 +- .../tax_benefit_models/us/model.py | 7 +- tests/test_extra_variables.py | 219 ++++++++++++++++++ 5 files changed, 293 insertions(+), 3 deletions(-) create mode 100644 changelog.d/fix-extra-variables-silently-ignored.fixed.md create mode 100644 tests/test_extra_variables.py diff --git a/changelog.d/fix-extra-variables-silently-ignored.fixed.md b/changelog.d/fix-extra-variables-silently-ignored.fixed.md new file mode 100644 index 00000000..639ce153 --- /dev/null +++ b/changelog.d/fix-extra-variables-silently-ignored.fixed.md @@ -0,0 +1 @@ +Fixed `Simulation.extra_variables` being silently ignored on the US and UK microsim paths. The field is declared on the base `Simulation` and honored by the household calculator, but the country `run()` methods previously only iterated `self.entity_variables` — extras passed via `Simulation(extra_variables={...})` never reached the output dataset. Both country paths now route through a shared `MicrosimulationModelVersion.resolve_entity_variables` helper that merges defaults with extras, dedupes overlapping entries, and validates entity keys + variable names with close-match suggestions. Closes #303. diff --git a/src/policyengine/tax_benefit_models/common/model_version.py b/src/policyengine/tax_benefit_models/common/model_version.py index dc5d44d8..915c77bd 100644 --- a/src/policyengine/tax_benefit_models/common/model_version.py +++ b/src/policyengine/tax_benefit_models/common/model_version.py @@ -225,6 +225,70 @@ def _build_entity_relationships(self, dataset) -> pd.DataFrame: person_data = pd.DataFrame(dataset.data.person) return build_entity_relationships(person_data, self.group_entities) + def resolve_entity_variables( + self, + simulation: Simulation, + ) -> dict[str, list[str]]: + """Merge :attr:`entity_variables` with ``simulation.extra_variables``. + + Returned dict has one key per known entity, value being the + union of the bundled defaults and the caller's extras, with + duplicates removed and original order preserved (defaults + first, extras appended). + + Raises ``ValueError`` with close-match suggestions if the + caller passes an unknown entity key or a variable name that + does not resolve on the tax-benefit system's variable + registry. + """ + from difflib import get_close_matches + + extras = dict(simulation.extra_variables or {}) + known_entities = set(self.entity_variables) + unknown_entities = [e for e in extras if e not in known_entities] + if unknown_entities: + lines = [ + f"Simulation.extra_variables contains entity keys not " + f"defined on {self.model.id} {self.version}:" + ] + for entity in unknown_entities: + suggestions = get_close_matches( + entity, sorted(known_entities), n=1, cutoff=0.7 + ) + hint = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + lines.append(f" - '{entity}'{hint}") + raise ValueError("\n".join(lines)) + + known_variables = set(self.variables_by_name) + unknown_variables: list[tuple[str, str]] = [] + for entity, names in extras.items(): + for name in names: + if name not in known_variables: + unknown_variables.append((entity, name)) + if unknown_variables: + lines = [ + f"Simulation.extra_variables contains variable names not " + f"defined on {self.model.id} {self.version}:" + ] + for entity, name in unknown_variables: + suggestions = get_close_matches( + name, sorted(known_variables), n=1, cutoff=0.7 + ) + hint = f" (did you mean '{suggestions[0]}'?)" if suggestions else "" + lines.append(f" - '{name}' (on '{entity}'){hint}") + raise ValueError("\n".join(lines)) + + resolved: dict[str, list[str]] = {} + for entity, defaults in self.entity_variables.items(): + seen: set[str] = set() + merged: list[str] = [] + for var in list(defaults) + list(extras.get(entity, [])): + if var not in seen: + seen.add(var) + merged.append(var) + resolved[entity] = merged + return resolved + def save(self, simulation: Simulation) -> None: """Persist the simulation's output dataset to its bundled filepath.""" simulation.output_dataset.save() diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py index 67e7a3ae..b0e4cceb 100644 --- a/src/policyengine/tax_benefit_models/uk/model.py +++ b/src/policyengine/tax_benefit_models/uk/model.py @@ -206,7 +206,10 @@ def run(self, simulation: "Simulation") -> "Simulation": "household": pd.DataFrame(), } - for entity, variables in self.entity_variables.items(): + # ``resolve_entity_variables`` merges the bundled defaults + # with caller-supplied ``simulation.extra_variables``; unknown + # entity keys or variable names raise with close-match hints. + for entity, variables in self.resolve_entity_variables(simulation).items(): for var in variables: data[entity][var] = microsim.calculate( var, period=simulation.dataset.year, map_to=entity diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 51463650..19d32e50 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -228,8 +228,11 @@ def run(self, simulation: "Simulation") -> "Simulation": if target_col in id_columns: data["person"][target_col] = person_input_df[col].values - # Calculate non-ID, non-weight variables from simulation - for entity, variables in self.entity_variables.items(): + # Calculate non-ID, non-weight variables from simulation. + # ``resolve_entity_variables`` merges bundled defaults with + # caller-supplied ``simulation.extra_variables``; unknown + # entity keys or variable names raise with close-match hints. + for entity, variables in self.resolve_entity_variables(simulation).items(): for var in variables: if var not in id_columns and var not in weight_columns: data[entity][var] = microsim.calculate( diff --git a/tests/test_extra_variables.py b/tests/test_extra_variables.py new file mode 100644 index 00000000..f6617ac6 --- /dev/null +++ b/tests/test_extra_variables.py @@ -0,0 +1,219 @@ +"""Regression: ``Simulation(extra_variables={...})`` must add the +named variables to ``output_dataset``. + +The field lives on the base :class:`~policyengine.core.Simulation` +but the US and UK microsim ``run()`` paths historically only iterated +``self.entity_variables`` (the country model's bundled defaults), +silently dropping anything the caller added via ``extra_variables``. + +The fix adds a shared :meth:`resolve_entity_variables` on +:class:`~policyengine.tax_benefit_models.common.MicrosimulationModelVersion` +that merges the two and validates unknown entity/variable names with +close-match suggestions. Both country runs now route through it. +""" + +from __future__ import annotations + +import pandas as pd +import pytest +from microdf import MicroDataFrame + +# --- Fixtures -------------------------------------------------------- + + +def _us_fixture_dataset(tmp_path): + from policyengine.tax_benefit_models.us.datasets import ( + PolicyEngineUSDataset, + USYearData, + ) + + person = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "household_id": [1, 1, 1], + "tax_unit_id": [1, 1, 1], + "spm_unit_id": [1, 1, 1], + "family_id": [1, 1, 1], + "marital_unit_id": [1, 2, 3], + "person_weight": [1_000.0] * 3, + "age": [35, 5, 8], + "employment_income": [50_000, 0, 0], + } + ), + weights="person_weight", + ) + household = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1], + "state_code": ["CA"], + "household_weight": [1_000.0], + } + ), + weights="household_weight", + ) + + def _simple(col: str, rows: int, weight: str) -> MicroDataFrame: + return MicroDataFrame( + pd.DataFrame({col: list(range(1, rows + 1)), weight: [1_000.0] * rows}), + weights=weight, + ) + + return PolicyEngineUSDataset( + id="test-extra-vars", + name="extra_variables fixture", + description="Small dataset for extra_variables regression tests", + filepath=str(tmp_path / "test.h5"), + year=2026, + data=USYearData( + person=person, + tax_unit=_simple("tax_unit_id", 1, "tax_unit_weight"), + spm_unit=_simple("spm_unit_id", 1, "spm_unit_weight"), + family=_simple("family_id", 1, "family_weight"), + marital_unit=_simple("marital_unit_id", 3, "marital_unit_weight"), + household=household, + ), + ) + + +# --- US: positive path ----------------------------------------------- + + +def test__us_extra_variables_appear_on_output_dataset(tmp_path) -> None: + """Issue #303: ``Simulation(extra_variables={"household": [...]})`` + must actually populate those columns on ``output_dataset``. + """ + pytest.importorskip("policyengine_us") + import policyengine as pe + from policyengine.core import Simulation + + dataset = _us_fixture_dataset(tmp_path) + sim = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.us.model, + extra_variables={ + "household": ["household_market_income"], + "tax_unit": ["adjusted_gross_income"], + }, + ) + sim.run() + + household_cols = sim.output_dataset.data.household.columns + tax_unit_cols = sim.output_dataset.data.tax_unit.columns + # ``household_market_income`` is already in defaults — confirm it + # stays present (no duplicate-column collision) AND the new + # ``adjusted_gross_income`` appears on ``tax_unit``. + assert "household_market_income" in household_cols + assert "adjusted_gross_income" in tax_unit_cols, ( + "extra_variables['tax_unit'] entry missing from output_dataset" + ) + + +# --- UK: resolver directly (full UK microsim fixture is heavier +# than needed to test the shared helper) -------------------------- + + +def test__uk_resolve_entity_variables_merges_extras() -> None: + """``resolve_entity_variables`` on the UK model must merge extras + into defaults the same way US does. Exercising the method directly + is sufficient — the UK ``run()`` calls it via the exact line that + US does, so identical behaviour there is a one-character diff. + """ + pytest.importorskip("policyengine_uk") + import policyengine as pe + from policyengine.core import Simulation + + fake_sim = Simulation.model_construct( + extra_variables={"person": ["adjusted_net_income"]} + ) + resolved = pe.uk.model.resolve_entity_variables(fake_sim) + assert "adjusted_net_income" in resolved["person"] + # Non-targeted entities unchanged. + assert resolved["benunit"] == list(pe.uk.model.entity_variables["benunit"]) + + +def test__uk_resolve_entity_variables_raises_on_unknown_variable() -> None: + pytest.importorskip("policyengine_uk") + import policyengine as pe + from policyengine.core import Simulation + + fake_sim = Simulation.model_construct( + extra_variables={"person": ["nonexistent_variable_xyz"]} + ) + with pytest.raises(ValueError) as exc: + pe.uk.model.resolve_entity_variables(fake_sim) + assert "nonexistent_variable_xyz" in str(exc.value) + + +# --- Negative path: validation --------------------------------------- + + +def test__unknown_entity_key_raises_with_suggestion(tmp_path) -> None: + """An unknown entity key in ``extra_variables`` must raise with a + close-match suggestion at run time (before the bare ``Microsimulation`` + call starts spending minutes of compute). + """ + pytest.importorskip("policyengine_us") + import policyengine as pe + from policyengine.core import Simulation + + dataset = _us_fixture_dataset(tmp_path) + sim = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.us.model, + # ``tax_units`` (plural) is a likely agent typo for ``tax_unit``. + extra_variables={"tax_units": ["income_tax"]}, + ) + with pytest.raises(ValueError) as exc: + sim.run() + message = str(exc.value) + assert "tax_units" in message + assert "tax_unit" in message # close-match suggestion + + +def test__unknown_variable_name_raises_with_suggestion(tmp_path) -> None: + """Variable-name typos must raise with a close-match suggestion.""" + pytest.importorskip("policyengine_us") + import policyengine as pe + from policyengine.core import Simulation + + dataset = _us_fixture_dataset(tmp_path) + sim = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.us.model, + extra_variables={"tax_unit": ["adjusted_gross_incme"]}, # typo + ) + with pytest.raises(ValueError) as exc: + sim.run() + message = str(exc.value) + assert "adjusted_gross_incme" in message + assert "adjusted_gross_income" in message # suggested correction + + +# --- Resolver behavior: deduplication -------------------------------- + + +def test__resolve_entity_variables_dedupes_when_extra_overlaps_default( + tmp_path, +) -> None: + """Passing a variable that's already in the defaults must not + produce a duplicate column or fail. + """ + pytest.importorskip("policyengine_us") + import policyengine as pe + from policyengine.core import Simulation + + dataset = _us_fixture_dataset(tmp_path) + sim = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.us.model, + # ``income_tax`` is already a default on ``tax_unit``. + extra_variables={"tax_unit": ["income_tax"]}, + ) + sim.run() + + tax_unit_cols = list(sim.output_dataset.data.tax_unit.columns) + assert tax_unit_cols.count("income_tax") == 1, ( + f"income_tax duplicated after extra_variables merge: {tax_unit_cols}" + ) From eac64d5fd911ed908f735e186d9e63f12680f3b5 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 20 Apr 2026 09:48:46 -0400 Subject: [PATCH 2/2] Address review findings on PR #307 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reproducibility-reviewer (blocking): - Replace direct-helper UK test with end-to-end UK integration test (test__uk_extra_variables_appear_on_output_dataset). Catches the one-character regression where uk/model.py:212 reverts to self.entity_variables.items() — verified by manually reverting. - Strengthen dedup assertion: also check the column has no NaN entries so a silent merge-blanks-column regression fails, not just duplicate-column regressions. Reproducibility-reviewer (follow-ups): - Add test__default_empty_extras_produces_exact_bundled_defaults covering the default-construction ({}) path. - Add test__partial_coverage_leaves_untouched_entities_unchanged covering the case where extras target one entity only. - Comment explaining net_worth substitution (test_extra_variables.py: net_worth not in pe-us 1.653.3 codebase; tests use adjusted_gross_income as stand-in). Code-simplifier (nice-to-have): - Move get_close_matches import to module level (consistent with common/extra_variables.py's style). Verified fail-without-fix: - Reverting uk/model.py:212 back to self.entity_variables.items() makes test__uk_extra_variables_appear_on_output_dataset fail with the expected missing-column assertion. 8 tests, all pass. Ruff clean. --- .../common/model_version.py | 3 +- tests/test_extra_variables.py | 137 ++++++++++++++++-- 2 files changed, 125 insertions(+), 15 deletions(-) diff --git a/src/policyengine/tax_benefit_models/common/model_version.py b/src/policyengine/tax_benefit_models/common/model_version.py index 915c77bd..6f7a3b53 100644 --- a/src/policyengine/tax_benefit_models/common/model_version.py +++ b/src/policyengine/tax_benefit_models/common/model_version.py @@ -19,6 +19,7 @@ import datetime import os import warnings +from difflib import get_close_matches from importlib import metadata from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Optional @@ -241,8 +242,6 @@ def resolve_entity_variables( does not resolve on the tax-benefit system's variable registry. """ - from difflib import get_close_matches - extras = dict(simulation.extra_variables or {}) known_entities = set(self.entity_variables) unknown_entities = [e for e in extras if e not in known_entities] diff --git a/tests/test_extra_variables.py b/tests/test_extra_variables.py index f6617ac6..6dd042d8 100644 --- a/tests/test_extra_variables.py +++ b/tests/test_extra_variables.py @@ -92,6 +92,13 @@ def test__us_extra_variables_appear_on_output_dataset(tmp_path) -> None: sim = Simulation( dataset=dataset, tax_benefit_model_version=pe.us.model, + # Issue #303 used ``net_worth`` (imputed from SCF in later + # policyengine-us releases). Substituted with two variables + # present in the pinned 1.653.3 wheel so the test is + # hermetic: ``household_market_income`` is already a default + # (so we also cover the dedup path) and + # ``adjusted_gross_income`` is a tax_unit variable not in + # the defaults. extra_variables={ "household": ["household_market_income"], "tax_unit": ["adjusted_gross_income"], @@ -110,27 +117,80 @@ def test__us_extra_variables_appear_on_output_dataset(tmp_path) -> None: ) -# --- UK: resolver directly (full UK microsim fixture is heavier -# than needed to test the shared helper) -------------------------- +def _uk_fixture_dataset(tmp_path): + from policyengine.tax_benefit_models.uk.datasets import ( + PolicyEngineUKDataset, + UKYearData, + ) + + # UK expects ``person_benunit_id`` / ``person_household_id`` as + # person-level columns (not ``benunit_id`` / ``household_id``). + person = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2], + "person_household_id": [1, 1], + "person_benunit_id": [1, 1], + "person_weight": [1_000.0, 1_000.0], + "age": [35, 33], + "employment_income": [30_000, 20_000], + } + ), + weights="person_weight", + ) + benunit = MicroDataFrame( + pd.DataFrame({"benunit_id": [1], "benunit_weight": [1_000.0]}), + weights="benunit_weight", + ) + household = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1], + "household_weight": [1_000.0], + "region": ["LONDON"], + "tenure_type": ["RENT_PRIVATELY"], + "rent": [12_000.0], + "council_tax": [1_500.0], + } + ), + weights="household_weight", + ) + return PolicyEngineUKDataset( + id="test-extra-vars-uk", + name="extra_variables UK fixture", + description="Small UK dataset for extra_variables regression", + filepath=str(tmp_path / "test-uk.h5"), + year=2026, + data=UKYearData(person=person, benunit=benunit, household=household), + ) -def test__uk_resolve_entity_variables_merges_extras() -> None: - """``resolve_entity_variables`` on the UK model must merge extras - into defaults the same way US does. Exercising the method directly - is sufficient — the UK ``run()`` calls it via the exact line that - US does, so identical behaviour there is a one-character diff. +# --- UK: positive path (end-to-end) --------------------------------- + + +def test__uk_extra_variables_appear_on_output_dataset(tmp_path) -> None: + """Issue #303 on the UK path. Integration test — proves that + ``uk/model.py`` actually calls ``resolve_entity_variables`` in its + ``run()``. A regression that reverts the call site to + ``self.entity_variables.items()`` must fail this test. """ pytest.importorskip("policyengine_uk") import policyengine as pe from policyengine.core import Simulation - fake_sim = Simulation.model_construct( - extra_variables={"person": ["adjusted_net_income"]} + dataset = _uk_fixture_dataset(tmp_path) + sim = Simulation( + dataset=dataset, + tax_benefit_model_version=pe.uk.model, + extra_variables={"person": ["adjusted_net_income"]}, + ) + sim.run() + + assert "adjusted_net_income" in sim.output_dataset.data.person.columns, ( + "UK extra_variables['person'] entry missing from output_dataset — " + "likely a regression in uk/model.py:run() bypassing " + "resolve_entity_variables" ) - resolved = pe.uk.model.resolve_entity_variables(fake_sim) - assert "adjusted_net_income" in resolved["person"] - # Non-targeted entities unchanged. - assert resolved["benunit"] == list(pe.uk.model.entity_variables["benunit"]) def test__uk_resolve_entity_variables_raises_on_unknown_variable() -> None: @@ -213,7 +273,58 @@ def test__resolve_entity_variables_dedupes_when_extra_overlaps_default( ) sim.run() + income_tax_col = sim.output_dataset.data.tax_unit["income_tax"] + assert income_tax_col.notna().all(), ( + "income_tax column has NaN entries after dedup — merge step " + "likely blanked the column" + ) tax_unit_cols = list(sim.output_dataset.data.tax_unit.columns) assert tax_unit_cols.count("income_tax") == 1, ( f"income_tax duplicated after extra_variables merge: {tax_unit_cols}" ) + + +# --- Resolver behavior: default / partial coverage ------------------ + + +def test__default_empty_extras_produces_exact_bundled_defaults( + tmp_path, +) -> None: + """``Simulation`` defaults ``extra_variables`` to ``{}``. That + path must round-trip to exactly the bundled + ``entity_variables`` (no extra entries, no missing entries). + """ + pytest.importorskip("policyengine_us") + import policyengine as pe + from policyengine.core import Simulation + + sim = Simulation.model_construct(extra_variables={}) + resolved = pe.us.model.resolve_entity_variables(sim) + for entity, defaults in pe.us.model.entity_variables.items(): + assert resolved[entity] == list(defaults), ( + f"Entity {entity!r}: expected defaults {list(defaults)} " + f"but got {resolved[entity]}" + ) + + +def test__partial_coverage_leaves_untouched_entities_unchanged( + tmp_path, +) -> None: + """Passing extras for one entity must not modify any other + entity's resolved variable list. + """ + pytest.importorskip("policyengine_us") + import policyengine as pe + from policyengine.core import Simulation + + sim = Simulation.model_construct( + extra_variables={"household": ["household_market_income"]} + ) + resolved = pe.us.model.resolve_entity_variables(sim) + # Entities NOT in extras must match the bundled defaults exactly. + for entity, defaults in pe.us.model.entity_variables.items(): + if entity == "household": + continue + assert resolved[entity] == list(defaults), ( + f"Untouched entity {entity!r} was mutated: {resolved[entity]}" + )