From 3860d6a9f752d530fac5c67139955e82e24c2a1f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 4 Jun 2026 11:00:18 +0100 Subject: [PATCH 1/2] Make property_purchased assignment deterministic (seeded RNG) The data build set property_purchased via unseeded np.random.random(), so every build drew a different vector of purchasers. That made the dataset non-reproducible and intermittently spiked the first income decile's effective tax rate (the draw occasionally marked too many high-property, low-income households as purchasers), failing test_first_decile_tax_rate_reasonable and blocking releases. Draw from a seeded numpy Generator (default_rng(0)) instead of the global RNG, whose state depends on whatever ran earlier in the build. The same FRS input now always yields the same ~3.85% purchaser assignment on a given NumPy version (Generator streams are not guaranteed to be identical across NumPy releases). Pairs with the policyengine-uk fix flipping property_purchased's default to False, which fail-safes any household this build does not explicitly set. --- .../deterministic-property-purchased.fixed.md | 1 + policyengine_uk_data/datasets/frs.py | 20 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 changelog.d/deterministic-property-purchased.fixed.md diff --git a/changelog.d/deterministic-property-purchased.fixed.md b/changelog.d/deterministic-property-purchased.fixed.md new file mode 100644 index 00000000..a087fe9f --- /dev/null +++ b/changelog.d/deterministic-property-purchased.fixed.md @@ -0,0 +1 @@ +Make property_purchased assignment deterministic by drawing from a seeded numpy Generator instead of the global unseeded np.random. Dataset builds are now reproducible: the same FRS input yields the same property_purchased vector, instead of an arbitrary share of households being marked as purchasers on each build. 
diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index fbfea69f..5d1cb7f8 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -1430,9 +1430,15 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray: pe_benunit["is_married"] = frs["benunit"].famtypb2.isin([5, 7]) - # Stochastically set property_purchased based on UK housing transaction rate. - # Previously defaulted to True in policyengine-uk, causing all households - # to be charged SDLT as if they just bought their property (£370bn total). + # Assign property_purchased to a share of households matching the UK + # housing transaction rate, so only genuine purchasers are charged SDLT. + # + # This MUST be deterministic: a rules engine's inputs have to be + # reproducible across builds. Use a seeded Generator (not global + # np.random, whose state depends on whatever ran earlier in the build) + # so the same FRS input always yields the same assignment. An unseeded + # draw previously made the build non-reproducible and intermittently + # spiked the first decile's effective tax rate. 
# # Sources: # - Transactions: HMRC 2024 - 1.1m/year @@ -1443,11 +1449,13 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray: # # Verification against official SDLT revenue (2024-25): # - Official SDLT: £13.9bn (https://www.gov.uk/government/statistics/uk-stamp-tax-statistics) - # - With fix (3.85%): £15.7bn (close to official) - # - Without fix (100%): £370bn (26x too high) + # - With 3.85% purchasers: £15.7bn (close to official) + # - With every household a purchaser: £370bn (26x too high) PROPERTY_PURCHASE_RATE = 0.0385 + PROPERTY_PURCHASE_SEED = 0 + purchase_rng = np.random.default_rng(PROPERTY_PURCHASE_SEED) pe_household["property_purchased"] = ( - np.random.random(len(pe_household)) < PROPERTY_PURCHASE_RATE + purchase_rng.random(len(pe_household)) < PROPERTY_PURCHASE_RATE ) if not include_internal_disability_reported_amounts: From 4213dd2a8e4f3afa2aa23de1b5e278e8abb66f81 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 4 Jun 2026 11:46:51 +0100 Subject: [PATCH 2/2] Fix review findings: seed capital gains and BRMA sampling too Independent review found the property_purchased seed was necessary but not sufficient for a reproducible build: two more assignments drew from the unseeded global numpy RNG. - imputations/capital_gains.py: quantile draws for the capital gains amount imputation now come from a seeded default_rng(0), so capital gains (and CGT revenue) are reproducible. - frs.py BRMA assignment: both pandas .sample() calls (region/category rent sampling and the household-level pick) now take a seeded random_state generator instead of the global RNG. The SPI synthetic sampling (income.py) was already seeded. The only remaining unseeded np.random is childcare/takeup_rate.py, which is not reached by the dataset build (test-only); left for separate cleanup. Broadened the changelog to reflect that the whole FRS build is now deterministic. 
--- changelog.d/deterministic-property-purchased.fixed.md | 2 +- policyengine_uk_data/datasets/frs.py | 11 ++++++++--- .../datasets/imputations/capital_gains.py | 7 ++++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/changelog.d/deterministic-property-purchased.fixed.md b/changelog.d/deterministic-property-purchased.fixed.md index a087fe9f..6e8918e8 100644 --- a/changelog.d/deterministic-property-purchased.fixed.md +++ b/changelog.d/deterministic-property-purchased.fixed.md @@ -1 +1 @@ -Make property_purchased assignment deterministic by drawing from a seeded numpy Generator instead of the global unseeded np.random. Dataset builds are now reproducible: the same FRS input yields the same property_purchased vector, instead of an arbitrary share of households being marked as purchasers on each build. +Make the FRS dataset build deterministic. Several assignments drew from the unseeded global numpy RNG, so otherwise-identical builds produced different datasets: property_purchased (which households are charged stamp duty), capital gains imputation quantiles (CGT revenue), and BRMA assignment (LHA/housing-benefit geography). Each now draws from a seeded generator, so the same inputs always produce the same dataset. diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 5d1cb7f8..58e2ac6b 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -1251,9 +1251,12 @@ def determine_education_level(fted_val, typeed2_val, age_val): lha_category = sim.calculate("LHA_category", year) brma = np.empty(len(region), dtype=object) - # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA + # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA. + # Use a seeded generator so the assignment is reproducible across builds; + # pandas .sample() otherwise draws from the unseeded global numpy RNG. 
lha_list_of_rents = pd.read_csv(STORAGE_FOLDER / "lha_list_of_rents.csv.gz") lha_list_of_rents = lha_list_of_rents.copy() + brma_rng = np.random.default_rng(0) for possible_region in lha_list_of_rents.region.unique(): for possible_lha_category in lha_list_of_rents.lha_category.unique(): @@ -1262,7 +1265,7 @@ def determine_education_level(fted_val, typeed2_val, age_val): ) mask = (region == possible_region) & (lha_category == possible_lha_category) brma[mask] = lha_list_of_rents[lor_mask].brma.sample( - n=len(region[mask]), replace=True + n=len(region[mask]), replace=True, random_state=brma_rng ) # Convert benunit-level BRMAs to household-level BRMAs (pick a random one) @@ -1276,7 +1279,9 @@ def determine_education_level(fted_val, typeed2_val, age_val): } ) - df = df.groupby("household_id").brma.aggregate(lambda x: x.sample(n=1).iloc[0]) + df = df.groupby("household_id").brma.aggregate( + lambda x: x.sample(n=1, random_state=brma_rng).iloc[0] + ) brmas = df[sim.calculate("household_id")].values pe_household["brma"] = brmas diff --git a/policyengine_uk_data/datasets/imputations/capital_gains.py b/policyengine_uk_data/datasets/imputations/capital_gains.py index 9a4790cd..22fe38d0 100644 --- a/policyengine_uk_data/datasets/imputations/capital_gains.py +++ b/policyengine_uk_data/datasets/imputations/capital_gains.py @@ -117,6 +117,11 @@ def loss(blend_factor): logging.info("Imputing capital gains among those with gains") + # Draw imputation quantiles from a seeded generator so the build is + # reproducible: an unseeded global np.random made capital gains (and hence + # CGT revenue) differ between otherwise identical builds. 
+ cg_rng = np.random.default_rng(0) + for i in range(len(capital_gains)): row = capital_gains.iloc[i] spline = UnivariateSpline( @@ -128,7 +133,7 @@ def loss(blend_factor): upper = row.maximum_total_income ti_in_range = (ti >= lower) * (ti < upper) in_target_range = has_cg * ti_in_range > 0 - quantiles = np.random.random(int(in_target_range.sum())) + quantiles = cg_rng.random(int(in_target_range.sum())) pred_capital_gains = spline(quantiles) new_cg[in_target_range] = pred_capital_gains