diff --git a/Makefile b/Makefile index 777619dea..7d5e99618 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test test-unit test-integration install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch +.PHONY: all format test test-unit test-integration install download upload docker documentation data validate-data calibrate calibrate-build publish-local-area upload-calibration upload-dataset push-to-modal build-data-modal build-matrices calibrate-modal calibrate-modal-national calibrate-both stage-h5s stage-national-h5 stage-all-h5s pipeline validate-staging validate-staging-full upload-validation check-staging check-sanity clean build paper clean-paper presentations database database-refresh promote-dataset promote build-h5s validate-local refresh-soi-targets push-pr-branch benchmarking-install-python benchmarking-install-r benchmarking-export benchmarking-run-l0 benchmarking-run-greg benchmarking-run-ipf SOI_SOURCE_YEAR ?= 2021 SOI_TARGET_YEAR ?= 2023 @@ -13,6 +13,8 @@ BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD) NUM_WORKERS ?= 8 N_CLONES ?= 430 VERSION ?= +MANIFEST ?= +RUN_DIR ?= SOI_SOURCE_YEAR ?= 2021 SOI_TARGET_YEAR ?= 2023 @@ -37,6 +39,32 @@ install: pip install policyengine-us pip install -e ".[dev]" --config-settings editable_mode=compat +benchmarking-install-python: + pip install -r paper-l0/benchmarking/requirements-python.txt + +benchmarking-install-r: + Rscript paper-l0/benchmarking/install_r_packages.R + +benchmarking-export: + python 
paper-l0/benchmarking/benchmark_cli.py export \ + --manifest $(MANIFEST) \ + --output-dir $(RUN_DIR) + +benchmarking-run-l0: + python paper-l0/benchmarking/benchmark_cli.py run \ + --method l0 \ + --run-dir $(RUN_DIR) + +benchmarking-run-greg: + python paper-l0/benchmarking/benchmark_cli.py run \ + --method greg \ + --run-dir $(RUN_DIR) + +benchmarking-run-ipf: + python paper-l0/benchmarking/benchmark_cli.py run \ + --method ipf \ + --run-dir $(RUN_DIR) + changelog: python .github/bump_version.py towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))") diff --git a/paper-l0/.gitignore b/paper-l0/.gitignore new file mode 100644 index 000000000..02d9b69b6 --- /dev/null +++ b/paper-l0/.gitignore @@ -0,0 +1,35 @@ +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb +*.nav +*.snm +*.vrb + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync diff --git a/paper-l0/benchmarking/README.md b/paper-l0/benchmarking/README.md new file mode 100644 index 000000000..c46d70c12 --- /dev/null +++ b/paper-l0/benchmarking/README.md @@ -0,0 +1,275 @@ +# Benchmarking Scaffold + +This directory contains the implementation scaffold for benchmarking the +`L0` calibration pipeline against: + +- `GREG` via R's `survey` package +- `IPF` via R's `surveysd` package + +## Experimental Setup + +The benchmark is organized around one shared exported bundle and multiple +method adapters. + +- `L0` and `GREG` are compared on the shared calibration representation: + a sparse target-by-unit matrix, the selected target table, and + initial .npy weights. 
+- `IPF` is benchmarked from the same target selection, but it requires a + conversion step because `surveysd::ipf` consumes a microdata table plus + IPF constraints rather than a generic sparse linear system. +- The intended benchmark tiers are: + - a practical reduced-size comparison tier, used for like-for-like `L0` + versus `GREG` runs that are small enough to execute routinely during + development + - an IPF-focused reduced-size tier on count-style targets, used because + classical `IPF` is most naturally evaluated on count or indicator margins + rather than the full arbitrary target set + - a scaling ladder over increasing target counts, used to show how runtime, + memory use, convergence, and outright failure change as the benchmark moves + from small target subsets toward the full calibration problem + - a production-feasibility tier, used to test which methods can still run at + something close to the full production clone count and target volume + +Methodologically, the benchmark treats the methods as related but not +identical: + +- `L0` and `GREG` can consume arbitrary linear calibration targets. +- `IPF` is most natural for count-style or indicator-style targets, so the + current automatic conversion path supports `person_count` and + `household_count`. + +The core workflow is: + +1. select a benchmark target subset with a manifest +2. export a shared benchmark bundle from a saved calibration package +3. auto-convert the bundle to IPF inputs when needed +4. run `L0`, `GREG`, or `IPF` +5. score each method against the target set that matches its benchmark contract + +## Layout + +- `benchmark_cli.py` + Main CLI for exporting benchmark bundles and running methods. +- `benchmark_manifest.py` + Manifest schema and target-filter logic. +- `benchmark_export.py` + Export utilities for shared benchmark artifacts. +- `ipf_conversion.py` + Automatic conversion from the saved calibration package to IPF-ready + unit and target metadata. 
+- `benchmark_metrics.py` + Common diagnostics and summary generation. +- `runners/greg_runner.R` + R backend for `survey`-based GREG. +- `runners/ipf_runner.R` + R backend for `surveysd`-based IPF. +- `runners/read_npy.R` + Minimal `.npy` reader used by the R scripts. +- `requirements-python.txt` + Python dependencies for the benchmarking scaffold. +- `install_r_packages.R` + Installs the required R packages for the benchmark runners. +- `manifests/*.example.json` + Example benchmark manifests. + +## Environment Setup + +Python: + +```bash +pip install -r paper-l0/benchmarking/requirements-python.txt +``` + +R: + +```bash +Rscript paper-l0/benchmarking/install_r_packages.R +``` + +Or, from the repo root: + +```bash +make benchmarking-install-python +make benchmarking-install-r +``` + +## Chosen Interchange Formats + +- sparse matrix: Matrix Market `.mtx` +- target metadata: `.csv` +- unit metadata: `.csv` +- initial weights: `.npy` +- benchmark manifest: `.json` +- method result summary: `.json` +- fitted weights: `.npy` + +## Notes + +### Shared calibration package + +The exporter reads the saved calibration package directly from pickle rather +than importing the full calibration CLI. This keeps the benchmark I/O path +lightweight. + +### IPF inputs + +The exporter auto-generates IPF inputs when the manifest includes `ipf` and no +external overrides are supplied. It reconstructs an IPF microdata table from: + +- the saved calibration package +- the package metadata's `dataset_path` +- the package metadata's `db_path` +- the selected count-like targets and their stratum constraints + +The generated `unit_metadata.csv` is built for `person_count` and +`household_count` targets. It expands cloned households to a person-level table +when person targets are present, carries a repeated household `unit_index` so +per-person weights collapse cleanly back to per-household, and adds one +string-valued derived category column per declared bucket schema (e.g. 
+`age_bracket`, `agi_bracket_district`, `snap_positive`). + +The generated `ipf_target_metadata.csv` contains one `categorical_margin` row +per retained IPF cell after validation. That means: + +- authored cells that belong to a closed categorical system are kept +- binary subset families may gain exactly-derived complement cells when an + authored parent total exists on the exact reduced key +- open subset families are dropped rather than emitted as 1-cell margins + +The exporter also writes: + +- `ipf_scoring_target_metadata.csv` +- `ipf_scoring_X_targets_by_units.mtx` + +These score IPF on its retained authored targets only. Derived complements are +recorded for transparency in `ipf_conversion_diagnostics.json`, but they are +not part of the main benchmark metric set. + +When comparing `L0` or `GREG` against that same subset, pass: + +```bash +python paper-l0/benchmarking/benchmark_cli.py run \ + --method l0 \ + --run-dir \ + --score-on ipf_retained_authored +``` + +External CSVs are still supported through `external_inputs.*` and override the +automatic conversion path when provided. The external-IPF contract is strict: + +- `external_inputs.ipf_unit_metadata_csv` +- `external_inputs.ipf_target_metadata_csv` +- `external_inputs.ipf_scoring_target_metadata_csv` +- `external_inputs.ipf_scoring_matrix_mtx` + +must be provided together. An optional +`external_inputs.ipf_conversion_diagnostics_json` can also be supplied and will +be copied through for reporting. External CSVs must also follow the +`categorical_margin` schema below; the runner rejects `numeric_total` rows. + +### IPF conversion step by step + +The IPF conversion is implemented in +[ipf_conversion.py](./ipf_conversion.py) and runs during +`benchmark_cli.py export`. + +1. Load the saved calibration package and apply the manifest target filters. +2. Read `dataset_path`, `db_path`, and `n_clones` from the package metadata. +3. Query `stratum_constraints` for the selected targets from the target DB. +4. 
Identify the source variables needed to evaluate those constraints, such as + `age`, `snap`, or `medicaid_enrolled`. +5. Reconstruct the cloned household universe from `initial_weights`, + `block_geoid`, and `cd_geoid`. This yields one benchmark unit per matrix + column. +6. If any selected IPF target is `person_count`, expand that cloned household + universe to a person-level table using the source dataset's person-to- + household links. Multiple person rows may therefore share the same + household-clone `unit_index`. +7. Calculate the needed source variables from the dataset and attach them to + the IPF unit table. +8. Materialize the string-valued derived category columns the margins cover + (e.g. `age_bracket`, `snap_positive`) on that unit table. +9. Group the resolved targets into margin families, validate them against the + observed unit-table support, and keep only families that are already closed + or can be closed exactly from authored parent totals. +10. Emit one `categorical_margin` row per retained authored or exactly-derived + cell, sharing a `margin_id` within each family. +11. Write diagnostics (`dropped_targets`, retained-authored counts, derived + complements, and any coherence issues) to + `inputs/ipf_conversion_diagnostics.json`. +12. Run `surveysd::ipf` once on the generated unit table and full validated + IPF target metadata. +13. Collapse the fitted IPF row weights back to one weight per shared + benchmark `unit_index`, so the fitted result can be scored against the + retained-authored sparse target subset used for the IPF benchmark. + +This means the benchmark keeps a shared requested target space for the export, +but an IPF-specific retained-authored scoring space for the actual IPF +comparison. + +### Why the IPF conversion exists + +`L0` and `GREG` can work directly with a sparse linear system of the form +`X w = t`. + +Classical `IPF` does not start from that object. 
It expects: + +- a unit-record table +- categorical or indicator variables on that table +- target totals over those variables + +So the benchmark exporter translates selected count-style calibration targets +into that IPF-friendly representation instead of trying to feed the sparse +matrix directly into `surveysd::ipf`. + +### IPF target metadata schema + +`ipf_runner.R` accepts one encoding: `categorical_margin`. One row per +authored margin cell: + +- `scope`: `person` or `household` +- `target_type`: `categorical_margin` +- `margin_id`: identifier for a margin block. Rows sharing a `margin_id` are + grouped into one `surveysd::ipf` constraint (via `xtabs`). +- `variables`: pipe-separated variable names, e.g. + `congressional_district_geoid|age_bracket` +- `cell`: pipe-separated assignments, e.g. + `congressional_district_geoid=0601|age_bracket=0-4` +- `target_value`: numeric target + +Open subset systems are not exported. If a subset family cannot be closed from +an authored parent total, it is dropped before the R call. 
+ +## Example Commands + +Export a benchmark bundle: + +```bash +python paper-l0/benchmarking/benchmark_cli.py export \ + --manifest paper-l0/benchmarking/manifests/greg_demo_small.example.json \ + --output-dir paper-l0/benchmarking/runs/greg_demo_small +``` + +Run a GREG benchmark from an exported bundle: + +```bash +python paper-l0/benchmarking/benchmark_cli.py run \ + --method greg \ + --run-dir paper-l0/benchmarking/runs/greg_demo_small +``` + +Run `L0` on an exported bundle: + +```bash +python paper-l0/benchmarking/benchmark_cli.py run \ + --method l0 \ + --run-dir paper-l0/benchmarking/runs/greg_demo_small +``` + +Equivalent root Make targets: + +```bash +make benchmarking-export MANIFEST=paper-l0/benchmarking/manifests/greg_demo_small.example.json RUN_DIR=paper-l0/benchmarking/runs/greg_demo_small +make benchmarking-run-greg RUN_DIR=paper-l0/benchmarking/runs/greg_demo_small +make benchmarking-run-l0 RUN_DIR=paper-l0/benchmarking/runs/greg_demo_small +``` diff --git a/paper-l0/benchmarking/benchmark_cli.py b/paper-l0/benchmarking/benchmark_cli.py new file mode 100644 index 000000000..a6dc800ea --- /dev/null +++ b/paper-l0/benchmarking/benchmark_cli.py @@ -0,0 +1,320 @@ +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd + +from benchmark_export import export_bundle +from benchmark_manifest import load_manifest +from benchmark_metrics import ( + compute_common_metrics, + load_targets_csv, + write_method_summary, +) + + +ROOT = Path(__file__).resolve().parent +RUNNERS_DIR = ROOT / "runners" + + +def _run_subprocess(cmd, cwd=None): + started = time.time() + proc = subprocess.run(cmd, cwd=cwd, check=False) + elapsed = time.time() - started + return proc, elapsed + + +def cmd_export(args): + manifest = load_manifest(args.manifest) + output_dir, info = export_bundle(manifest=manifest, output_dir=args.output_dir) + 
print(json.dumps({"output_dir": str(output_dir), **info}, indent=2, sort_keys=True)) + return 0 + + +def _run_l0(run_dir: Path): + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + + from scipy.io import mmread + from policyengine_us_data.calibration.unified_calibration import fit_l0_weights + + with open(inputs / "benchmark_manifest.json") as f: + manifest = json.load(f) + + options = manifest.get("method_options", {}).get("l0", {}) + X_sparse = mmread(str(inputs / "X_targets_by_units.mtx")).tocsr() + targets_df = pd.read_csv(inputs / "target_metadata.csv") + initial_weights = np.load(inputs / "initial_weights.npy") + + weights = fit_l0_weights( + X_sparse=X_sparse, + targets=targets_df["value"].to_numpy(dtype=np.float64), + lambda_l0=float(options.get("lambda_l0", 1e-8)), + epochs=int(options.get("epochs", 1000)), + device=str(options.get("device", "cpu")), + beta=float(options.get("beta", 0.65)), + lambda_l2=float(options.get("lambda_l2", 1e-12)), + learning_rate=float(options.get("learning_rate", 0.15)), + target_names=targets_df["target_name"].tolist(), + initial_weights=initial_weights, + targets_df=targets_df, + ) + + weights_path = outputs / "fitted_weights.npy" + np.save(weights_path, weights.astype(np.float64)) + return weights_path + + +def _run_greg(run_dir: Path): + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + temp_csv = outputs / "_greg_weights.csv" + + with open(inputs / "benchmark_manifest.json") as f: + manifest = json.load(f) + options = manifest.get("method_options", {}).get("greg", {}) + + cmd = [ + "Rscript", + str(RUNNERS_DIR / "greg_runner.R"), + str(inputs / "X_targets_by_units.mtx"), + str(inputs / "target_metadata.csv"), + str(inputs / "initial_weights.npy"), + str(temp_csv), + str(int(options.get("maxit", 50))), + str(float(options.get("epsilon", 1e-7))), + ] + proc, elapsed = _run_subprocess(cmd) + if proc.returncode != 0: + raise RuntimeError(f"GREG runner failed with exit code {proc.returncode}") + + 
weights = pd.read_csv(temp_csv)["fitted_weight"].to_numpy(dtype=np.float64) + weights_path = outputs / "fitted_weights.npy" + np.save(weights_path, weights) + temp_csv.unlink(missing_ok=True) + return weights_path, elapsed + + +def _collapse_ipf_rows_to_unit_weights( + raw_weights: pd.DataFrame, n_units: int +) -> np.ndarray: + """Validate a per-row IPF output and collapse it to a length-`n_units` vector. + + surveysd::ipf with `meanHH = TRUE` guarantees every row that shares a + `unit_index` carries the same fitted weight; the spread check keeps that + assumption honest. + """ + if "unit_index" not in raw_weights.columns: + raise RuntimeError("IPF runner output must include a unit_index column") + if raw_weights["unit_index"].isna().any(): + raise RuntimeError("IPF runner output contains missing unit_index values") + + raw_weights = raw_weights.copy() + raw_weights["unit_index"] = raw_weights["unit_index"].astype(np.int64) + if (raw_weights["unit_index"] < 0).any() or ( + raw_weights["unit_index"] >= n_units + ).any(): + raise RuntimeError("IPF runner output contains out-of-range unit_index values") + + per_unit_spread = raw_weights.groupby("unit_index", sort=True)["fitted_weight"].agg( + lambda series: float(series.max() - series.min()) + ) + if (per_unit_spread > 1e-9).any(): + raise RuntimeError( + "IPF runner produced inconsistent fitted weights within the same unit_index" + ) + + weights_by_unit = ( + raw_weights.groupby("unit_index", sort=True)["fitted_weight"] + .first() + .reindex(np.arange(n_units, dtype=np.int64)) + ) + if weights_by_unit.isna().any(): + raise RuntimeError( + "Aggregated IPF weights do not cover the full benchmark unit range" + ) + return weights_by_unit.to_numpy(dtype=np.float64) + + +def _run_ipf(run_dir: Path): + """Run one coherent IPF problem in a single `surveysd::ipf` call.""" + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + + with open(inputs / "benchmark_manifest.json") as f: + manifest = json.load(f) + options = 
manifest.get("method_options", {}).get("ipf", {}) + + target_metadata_path = inputs / "ipf_target_metadata.csv" + if not target_metadata_path.exists(): + raise FileNotFoundError( + "IPF run requires inputs/ipf_target_metadata.csv. " + "Provide external_inputs.ipf_target_metadata_csv in the manifest." + ) + unit_metadata_path = inputs / "unit_metadata.csv" + if not unit_metadata_path.exists(): + raise FileNotFoundError("IPF run requires inputs/unit_metadata.csv.") + + full_targets = pd.read_csv(target_metadata_path) + if full_targets.empty: + raise RuntimeError("IPF target metadata is empty; nothing to run.") + unit_metadata = pd.read_csv(unit_metadata_path) + if "unit_index" not in unit_metadata.columns: + raise RuntimeError("Unit metadata must include a unit_index column for IPF") + + weight_col = str(options.get("weight_col", "base_weight")) + household_id_col = str(options.get("household_id_col", "household_id")) + + initial_weights = np.load(inputs / "initial_weights.npy").astype(np.float64) + n_units = len(initial_weights) + unit_indices = unit_metadata["unit_index"].astype(np.int64).to_numpy() + if unit_indices.min() < 0 or unit_indices.max() >= n_units: + raise RuntimeError( + "Unit metadata unit_index values fall outside the initial weight vector" + ) + temp_csv = outputs / "_ipf_weights.csv" + unit_with_weights = unit_metadata.copy() + unit_with_weights[weight_col] = initial_weights[unit_indices] + temp_unit_csv = outputs / "_ipf_unit_metadata.csv" + unit_with_weights.to_csv(temp_unit_csv, index=False) + + cmd = [ + "Rscript", + str(RUNNERS_DIR / "ipf_runner.R"), + str(temp_unit_csv), + str(target_metadata_path), + str(inputs / "initial_weights.npy"), + str(temp_csv), + str(int(options.get("max_iter", 200))), + str(float(options.get("bound", 4.0))), + str(float(options.get("epsP", 1e-6))), + str(float(options.get("epsH", 1e-2))), + household_id_col, + weight_col, + ] + proc, elapsed = _run_subprocess(cmd) + if proc.returncode != 0: + raise 
RuntimeError(f"IPF runner failed with exit code {proc.returncode}") + + raw_weights = pd.read_csv(temp_csv) + current_weights = _collapse_ipf_rows_to_unit_weights(raw_weights, n_units) + weights_path = outputs / "fitted_weights.npy" + np.save(weights_path, current_weights) + temp_csv.unlink(missing_ok=True) + temp_unit_csv.unlink(missing_ok=True) + return weights_path, elapsed + + +def _select_scoring_inputs( + run_dir: Path, method: str, score_on: str +) -> tuple[Path, Path, str]: + inputs = run_dir / "inputs" + ipf_targets = inputs / "ipf_scoring_target_metadata.csv" + ipf_matrix = inputs / "ipf_scoring_X_targets_by_units.mtx" + has_ipf_scoring = ipf_targets.exists() and ipf_matrix.exists() + + if score_on == "ipf_retained_authored": + if not has_ipf_scoring: + raise FileNotFoundError( + "Requested score_on=ipf_retained_authored, but " + "inputs/ipf_scoring_target_metadata.csv and " + "inputs/ipf_scoring_X_targets_by_units.mtx are not both present." + ) + return ipf_targets, ipf_matrix, "ipf_retained_authored" + + if score_on == "auto" and method == "ipf" and has_ipf_scoring: + return ipf_targets, ipf_matrix, "ipf_retained_authored" + return ( + inputs / "target_metadata.csv", + inputs / "X_targets_by_units.mtx", + "shared_requested", + ) + + +def cmd_run(args): + run_dir = Path(args.run_dir) + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + outputs.mkdir(parents=True, exist_ok=True) + targets_path, matrix_path, scoring_target_set = _select_scoring_inputs( + run_dir, + args.method, + getattr(args, "score_on", "auto"), + ) + targets_df = load_targets_csv(targets_path) + + started = time.time() + if args.method == "l0": + weights_path = _run_l0(run_dir) + elif args.method == "greg": + weights_path, _ = _run_greg(run_dir) + elif args.method == "ipf": + weights_path, _ = _run_ipf(run_dir) + else: + raise ValueError(f"Unsupported method: {args.method}") + elapsed = time.time() - started + + weights = np.load(weights_path) + summary = 
compute_common_metrics( + weights=weights, + targets_df=targets_df, + matrix_path=matrix_path, + ) + summary["method"] = args.method + summary["run_dir"] = str(run_dir.resolve()) + summary["runtime_seconds"] = elapsed + summary["scoring_target_set"] = scoring_target_set + write_method_summary(summary, outputs / f"{args.method}_summary.json") + print(json.dumps(summary, indent=2, sort_keys=True)) + return 0 + + +def build_parser(): + parser = argparse.ArgumentParser(description="Benchmark scaffold CLI") + subparsers = parser.add_subparsers(dest="command", required=True) + + export_parser = subparsers.add_parser("export", help="Export a benchmark bundle") + export_parser.add_argument( + "--manifest", required=True, help="Path to benchmark manifest JSON" + ) + export_parser.add_argument( + "--output-dir", required=True, help="Output bundle directory" + ) + export_parser.set_defaults(func=cmd_export) + + run_parser = subparsers.add_parser( + "run", help="Run one method on an exported bundle" + ) + run_parser.add_argument("--method", required=True, choices=["l0", "greg", "ipf"]) + run_parser.add_argument( + "--run-dir", required=True, help="Exported benchmark bundle directory" + ) + run_parser.add_argument( + "--score-on", + default="auto", + choices=["auto", "shared_requested", "ipf_retained_authored"], + help=( + "Scoring target set. 'auto' uses IPF-retained-authored targets only " + "for method=ipf when available; the other methods default to the " + "shared requested target set unless explicitly overridden." 
+ ), + ) + run_parser.set_defaults(func=cmd_run) + + return parser + + +def main(argv=None): + parser = build_parser() + args = parser.parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/paper-l0/benchmarking/benchmark_export.py b/paper-l0/benchmarking/benchmark_export.py new file mode 100644 index 000000000..6c870754a --- /dev/null +++ b/paper-l0/benchmarking/benchmark_export.py @@ -0,0 +1,322 @@ +from __future__ import annotations + +import json +import pickle +import shutil +from dataclasses import asdict +from pathlib import Path +from typing import Dict, Tuple + +import numpy as np +import pandas as pd +from scipy.io import mmread, mmwrite + +from benchmark_manifest import BenchmarkManifest, filter_targets +from ipf_conversion import IPFConversionError, build_ipf_inputs + + +def load_calibration_package_raw(path: str | Path) -> Dict: + with open(path, "rb") as f: + return pickle.load(f) + + +def build_shared_unit_metadata(package: Dict) -> pd.DataFrame: + initial_weights = package.get("initial_weights") + n_units = ( + int(initial_weights.shape[0]) + if initial_weights is not None + else int(package["X_sparse"].shape[1]) + ) + data = {"unit_index": np.arange(n_units, dtype=np.int64)} + + if initial_weights is not None: + data["base_weight"] = np.asarray(initial_weights, dtype=np.float64) + + if package.get("cd_geoid") is not None: + data["cd_geoid"] = np.asarray(package["cd_geoid"]).astype(str) + + if package.get("block_geoid") is not None: + data["block_geoid"] = np.asarray(package["block_geoid"]).astype(str) + + return pd.DataFrame(data) + + +def _validate_external_ipf_inputs( + manifest: BenchmarkManifest, + *, + n_units: int, +) -> None: + ext = manifest.external_inputs + provided = { + "ipf_unit_metadata_csv": ext.ipf_unit_metadata_csv, + "ipf_target_metadata_csv": ext.ipf_target_metadata_csv, + "ipf_scoring_target_metadata_csv": ext.ipf_scoring_target_metadata_csv, + "ipf_scoring_matrix_mtx": 
ext.ipf_scoring_matrix_mtx, + } + if not any(provided.values()): + return + + missing = [name for name, path in provided.items() if not path] + if missing: + raise ValueError( + "External IPF overrides must provide all of: " + "ipf_unit_metadata_csv, ipf_target_metadata_csv, " + "ipf_scoring_target_metadata_csv, ipf_scoring_matrix_mtx. " + f"Missing: {missing}" + ) + + target_meta = pd.read_csv(ext.ipf_target_metadata_csv) + required_target_cols = { + "margin_id", + "scope", + "target_type", + "variables", + "cell", + "target_value", + } + missing_target_cols = sorted(required_target_cols - set(target_meta.columns)) + if missing_target_cols: + raise ValueError( + "External ipf_target_metadata_csv is missing required columns: " + f"{missing_target_cols}" + ) + unsupported_types = set(target_meta["target_type"].astype(str)) - { + "categorical_margin" + } + if unsupported_types: + raise ValueError( + "External ipf_target_metadata_csv contains unsupported target_type " + f"values: {sorted(unsupported_types)}" + ) + unsupported_scopes = set(target_meta["scope"].astype(str)) - {"person", "household"} + if unsupported_scopes: + raise ValueError( + "External ipf_target_metadata_csv contains unsupported scope values: " + f"{sorted(unsupported_scopes)}" + ) + + unit_meta = pd.read_csv(ext.ipf_unit_metadata_csv) + weight_col = str(manifest.method_options.ipf.get("weight_col", "base_weight")) + household_id_col = str( + manifest.method_options.ipf.get("household_id_col", "household_id") + ) + if "unit_index" not in unit_meta.columns and weight_col not in unit_meta.columns: + raise ValueError( + "External ipf_unit_metadata_csv must include either unit_index or the " + f"configured weight column '{weight_col}'." 
+ ) + if "household" in set(target_meta["scope"].astype(str)) and ( + household_id_col not in unit_meta.columns + ): + raise ValueError( + "External ipf_unit_metadata_csv must include the configured household " + f"id column '{household_id_col}' when household-scope margins are present." + ) + + scoring_meta = pd.read_csv(ext.ipf_scoring_target_metadata_csv) + if "value" not in scoring_meta.columns: + raise ValueError( + "External ipf_scoring_target_metadata_csv must include a 'value' column." + ) + scoring_matrix = mmread(str(ext.ipf_scoring_matrix_mtx)).tocsr() + if scoring_matrix.shape[0] != len(scoring_meta): + raise ValueError( + "External ipf_scoring_matrix_mtx row count does not match " + "ipf_scoring_target_metadata_csv." + ) + if scoring_matrix.shape[1] != n_units: + raise ValueError( + "External ipf_scoring_matrix_mtx column count does not match the " + "shared benchmark unit count." + ) + + +def export_bundle( + manifest: BenchmarkManifest, output_dir: str | Path +) -> Tuple[Path, Dict]: + output_dir = Path(output_dir) + inputs_dir = output_dir / "inputs" + outputs_dir = output_dir / "outputs" + inputs_dir.mkdir(parents=True, exist_ok=True) + outputs_dir.mkdir(parents=True, exist_ok=True) + + package = load_calibration_package_raw(manifest.package_path) + targets_df = package["targets_df"].copy() + target_names = list(package["target_names"]) + X_sparse = package["X_sparse"] + + filtered_targets, filtered_names, filtered_matrix, kept_indices = filter_targets( + targets_df=targets_df, + target_names=target_names, + X_sparse=X_sparse, + filters=manifest.target_filters, + ) + + filtered_targets.to_csv(inputs_dir / "target_metadata.csv", index=False) + mmwrite(str(inputs_dir / "X_targets_by_units.mtx"), filtered_matrix) + + initial_weights = np.asarray(package["initial_weights"], dtype=np.float64) + np.save(inputs_dir / "initial_weights.npy", initial_weights) + + unit_metadata = build_shared_unit_metadata(package) + _validate_external_ipf_inputs( + manifest, 
+ n_units=int(filtered_matrix.shape[1]), + ) + has_external_ipf_inputs = bool( + manifest.external_inputs.ipf_unit_metadata_csv + or manifest.external_inputs.ipf_target_metadata_csv + or manifest.external_inputs.ipf_scoring_target_metadata_csv + or manifest.external_inputs.ipf_scoring_matrix_mtx + ) + + ipf_diagnostics: Dict = {} + if manifest.external_inputs.ipf_unit_metadata_csv: + shutil.copyfile( + manifest.external_inputs.ipf_unit_metadata_csv, + inputs_dir / "unit_metadata.csv", + ) + shutil.copyfile( + manifest.external_inputs.ipf_target_metadata_csv, + inputs_dir / "ipf_target_metadata.csv", + ) + shutil.copyfile( + manifest.external_inputs.ipf_scoring_target_metadata_csv, + inputs_dir / "ipf_scoring_target_metadata.csv", + ) + shutil.copyfile( + manifest.external_inputs.ipf_scoring_matrix_mtx, + inputs_dir / "ipf_scoring_X_targets_by_units.mtx", + ) + if manifest.external_inputs.ipf_conversion_diagnostics_json: + shutil.copyfile( + manifest.external_inputs.ipf_conversion_diagnostics_json, + inputs_dir / "ipf_conversion_diagnostics.json", + ) + with open(inputs_dir / "ipf_conversion_diagnostics.json") as f: + ipf_diagnostics = json.load(f) + else: + ipf_diagnostics = { + "requested_target_count": int(filtered_matrix.shape[0]), + "retained_authored_target_count": int( + len(pd.read_csv(inputs_dir / "ipf_scoring_target_metadata.csv")) + ), + "derived_complement_count": None, + "dropped_targets": {}, + "dropped_target_details": [], + "margin_consistency_issues": [], + "derived_complement_rows": [], + "converted_target_rows": int( + len(pd.read_csv(inputs_dir / "ipf_target_metadata.csv")) + ), + "source": "external_ipf_override", + } + elif "ipf" in manifest.methods and not has_external_ipf_inputs: + try: + ipf_unit_metadata, ipf_target_metadata = build_ipf_inputs( + package=package, + manifest=manifest, + filtered_targets=filtered_targets, + ) + except IPFConversionError as exc: + if exc.diagnostics: + with open(inputs_dir / "ipf_conversion_diagnostics.json", 
"w") as f: + json.dump(exc.diagnostics, f, indent=2, sort_keys=True, default=str) + raise + ipf_unit_metadata.to_csv(inputs_dir / "unit_metadata.csv", index=False) + ipf_target_metadata.to_csv(inputs_dir / "ipf_target_metadata.csv", index=False) + retained_target_ids = list( + ipf_target_metadata.attrs.get("retained_authored_target_ids", []) + ) + if retained_target_ids and "target_id" in filtered_targets.columns: + retained_mask = filtered_targets["target_id"].isin(retained_target_ids) + ipf_scoring_targets = filtered_targets.loc[retained_mask].reset_index( + drop=True + ) + ipf_scoring_matrix = filtered_matrix[retained_mask.to_numpy(), :] + ipf_scoring_targets.to_csv( + inputs_dir / "ipf_scoring_target_metadata.csv", index=False + ) + mmwrite( + str(inputs_dir / "ipf_scoring_X_targets_by_units.mtx"), + ipf_scoring_matrix, + ) + ipf_diagnostics = { + "requested_target_count": int( + ipf_target_metadata.attrs.get("requested_target_count", len(filtered_targets)) + ), + "retained_authored_target_count": int( + ipf_target_metadata.attrs.get( + "retained_authored_target_count", 0 + ) + ), + "derived_complement_count": int( + ipf_target_metadata.attrs.get("derived_complement_count", 0) + ), + "dropped_targets": dict( + ipf_target_metadata.attrs.get("dropped_targets", {}) + ), + "dropped_target_details": list( + ipf_target_metadata.attrs.get("dropped_target_details", []) + ), + "margin_consistency_issues": list( + ipf_target_metadata.attrs.get("margin_consistency_issues", []) + ), + "derived_complement_rows": list( + ipf_target_metadata.attrs.get("derived_complement_rows", []) + ), + "converted_target_rows": int(len(ipf_target_metadata)), + } + with open(inputs_dir / "ipf_conversion_diagnostics.json", "w") as f: + json.dump(ipf_diagnostics, f, indent=2, sort_keys=True, default=str) + else: + unit_metadata.to_csv(inputs_dir / "unit_metadata.csv", index=False) + + runtime_manifest = manifest.to_dict() + runtime_manifest["resolved"] = { + "output_dir": 
str(output_dir.resolve()), + "inputs_dir": str(inputs_dir.resolve()), + "outputs_dir": str(outputs_dir.resolve()), + "n_targets": int(filtered_matrix.shape[0]), + "n_units": int(filtered_matrix.shape[1]), + "kept_target_indices": [int(i) for i in kept_indices.tolist()], + "target_names": filtered_names, + "package_metadata": package.get("metadata", {}), + } + + with open(inputs_dir / "benchmark_manifest.json", "w") as f: + json.dump(runtime_manifest, f, indent=2, sort_keys=True) + + export_info = { + "n_targets": int(filtered_matrix.shape[0]), + "n_units": int(filtered_matrix.shape[1]), + "inputs_dir": str(inputs_dir), + "outputs_dir": str(outputs_dir), + } + if ipf_diagnostics: + dropped = ipf_diagnostics.get("dropped_targets", {}) + export_info["ipf_requested_target_count"] = int( + ipf_diagnostics.get("requested_target_count", len(filtered_targets)) + ) + export_info["ipf_retained_authored_target_count"] = int( + ipf_diagnostics.get("retained_authored_target_count", 0) + ) + derived_complement_count = ipf_diagnostics.get("derived_complement_count", 0) + export_info["ipf_derived_complement_count"] = ( + int(derived_complement_count) + if derived_complement_count is not None + else 0 + ) + export_info["ipf_converted_target_rows"] = int( + ipf_diagnostics.get("converted_target_rows", 0) + ) + export_info["ipf_dropped_non_count_count"] = int( + dropped.get("non_count_style", 0) + ) + export_info["ipf_dropped_unresolvable_count"] = int( + dropped.get("unresolvable_constraints", 0) + ) + export_info["ipf_margin_consistency_issue_count"] = int( + len(ipf_diagnostics.get("margin_consistency_issues", [])) + ) + return output_dir, export_info diff --git a/paper-l0/benchmarking/benchmark_manifest.py b/paper-l0/benchmarking/benchmark_manifest.py new file mode 100644 index 000000000..e0794e7db --- /dev/null +++ b/paper-l0/benchmarking/benchmark_manifest.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field 
+from pathlib import Path +from typing import Any, Dict, List, Optional + +import numpy as np +import pandas as pd + + +COUNT_LIKE_VARIABLES = { + "person_count", + "household_count", + "tax_unit_count", + "spm_unit_count", + "family_count", + "marital_unit_count", +} + + +def _normalize_string_list(values: Optional[List[str]]) -> Optional[List[str]]: + if values is None: + return None + return [str(v) for v in values] + + +@dataclass +class TargetFilters: + include_geo_levels: Optional[List[str]] = None + include_national: bool = True + state_ids: Optional[List[str]] = None + district_ids: Optional[List[str]] = None + variables: Optional[List[str]] = None + domain_variables: Optional[List[str]] = None + count_like_only: bool = False + max_targets: Optional[int] = None + + @classmethod + def from_dict(cls, raw: Optional[Dict[str, Any]]) -> "TargetFilters": + raw = raw or {} + return cls( + include_geo_levels=_normalize_string_list(raw.get("include_geo_levels")), + include_national=bool(raw.get("include_national", True)), + state_ids=_normalize_string_list(raw.get("state_ids")), + district_ids=_normalize_string_list(raw.get("district_ids")), + variables=_normalize_string_list(raw.get("variables")), + domain_variables=_normalize_string_list(raw.get("domain_variables")), + count_like_only=bool(raw.get("count_like_only", False)), + max_targets=raw.get("max_targets"), + ) + + +@dataclass +class ExternalInputs: + ipf_unit_metadata_csv: Optional[str] = None + ipf_target_metadata_csv: Optional[str] = None + ipf_scoring_target_metadata_csv: Optional[str] = None + ipf_scoring_matrix_mtx: Optional[str] = None + ipf_conversion_diagnostics_json: Optional[str] = None + + @classmethod + def from_dict(cls, raw: Optional[Dict[str, Any]]) -> "ExternalInputs": + raw = raw or {} + return cls( + ipf_unit_metadata_csv=raw.get("ipf_unit_metadata_csv"), + ipf_target_metadata_csv=raw.get("ipf_target_metadata_csv"), + ipf_scoring_target_metadata_csv=raw.get( + 
"ipf_scoring_target_metadata_csv" + ), + ipf_scoring_matrix_mtx=raw.get("ipf_scoring_matrix_mtx"), + ipf_conversion_diagnostics_json=raw.get( + "ipf_conversion_diagnostics_json" + ), + ) + + +@dataclass +class MethodOptions: + l0: Dict[str, Any] = field(default_factory=dict) + greg: Dict[str, Any] = field(default_factory=dict) + ipf: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, raw: Optional[Dict[str, Any]]) -> "MethodOptions": + raw = raw or {} + return cls( + l0=dict(raw.get("l0", {})), + greg=dict(raw.get("greg", {})), + ipf=dict(raw.get("ipf", {})), + ) + + +@dataclass +class BenchmarkManifest: + name: str + tier: str + description: str + package_path: str + methods: List[str] + target_filters: TargetFilters = field(default_factory=TargetFilters) + external_inputs: ExternalInputs = field(default_factory=ExternalInputs) + method_options: MethodOptions = field(default_factory=MethodOptions) + + @classmethod + def from_dict(cls, raw: Dict[str, Any]) -> "BenchmarkManifest": + return cls( + name=str(raw["name"]), + tier=str(raw["tier"]), + description=str(raw.get("description", "")), + package_path=str(raw["package_path"]), + methods=[str(m) for m in raw.get("methods", [])], + target_filters=TargetFilters.from_dict(raw.get("target_filters")), + external_inputs=ExternalInputs.from_dict(raw.get("external_inputs")), + method_options=MethodOptions.from_dict(raw.get("method_options")), + ) + + def to_dict(self) -> Dict[str, Any]: + payload = asdict(self) + payload["package_path"] = str(self.package_path) + return payload + + +def load_manifest(path: str | Path) -> BenchmarkManifest: + with open(path) as f: + return BenchmarkManifest.from_dict(json.load(f)) + + +def save_manifest(manifest: BenchmarkManifest, path: str | Path) -> None: + with open(path, "w") as f: + json.dump(manifest.to_dict(), f, indent=2, sort_keys=True) + + +def is_count_like_variable(variable: str) -> bool: + return variable in COUNT_LIKE_VARIABLES or 
variable.endswith("_count") + + +def _build_geo_mask(targets_df: pd.DataFrame, filters: TargetFilters) -> np.ndarray: + geo_level = targets_df["geo_level"].astype(str) + geographic_id = targets_df["geographic_id"].astype(str) + mask = np.ones(len(targets_df), dtype=bool) + + if filters.include_geo_levels: + mask &= geo_level.isin(filters.include_geo_levels).to_numpy() + + geo_keep = np.zeros(len(targets_df), dtype=bool) + national_mask = geo_level.eq("national").to_numpy() + state_mask = geo_level.eq("state").to_numpy() + district_mask = geo_level.eq("district").to_numpy() + + if filters.include_national: + geo_keep |= national_mask + + if filters.state_ids: + geo_keep |= state_mask & geographic_id.isin(filters.state_ids).to_numpy() + else: + geo_keep |= state_mask + + if filters.district_ids: + geo_keep |= district_mask & geographic_id.isin(filters.district_ids).to_numpy() + else: + geo_keep |= district_mask + + other_mask = ~(national_mask | state_mask | district_mask) + geo_keep |= other_mask + return mask & geo_keep + + +def filter_targets( + targets_df: pd.DataFrame, + target_names: List[str], + X_sparse, + filters: TargetFilters, +): + mask = _build_geo_mask(targets_df, filters) + + if filters.variables: + mask &= targets_df["variable"].astype(str).isin(filters.variables).to_numpy() + + if filters.domain_variables: + domain_series = targets_df.get( + "domain_variable", pd.Series("", index=targets_df.index) + ) + mask &= ( + domain_series.fillna("") + .astype(str) + .isin(filters.domain_variables) + .to_numpy() + ) + + if filters.count_like_only: + mask &= ( + targets_df["variable"].astype(str).map(is_count_like_variable).to_numpy() + ) + + indices = np.where(mask)[0] + if filters.max_targets is not None: + indices = indices[: int(filters.max_targets)] + + filtered_targets = targets_df.iloc[indices].reset_index(drop=True).copy() + filtered_targets["target_name"] = [target_names[i] for i in indices] + filtered_names = [target_names[i] for i in indices] + 
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, Iterable

import numpy as np
import pandas as pd
from scipy.io import mmread


def load_targets_csv(path: str | Path) -> pd.DataFrame:
    """Load the exported target-metadata CSV."""
    return pd.read_csv(path)


def compute_common_metrics(
    weights: np.ndarray,
    targets_df: pd.DataFrame,
    matrix_path: str | Path,
) -> Dict:
    """Score a weight vector against the shared targets-by-units matrix.

    Parameters
    ----------
    weights:
        Final unit weights, one per matrix column.
    targets_df:
        Target metadata with a ``value`` column; optional ``geo_level`` and
        ``variable`` columns produce grouped error breakdowns.
    matrix_path:
        MatrixMarket (.mtx) file holding the sparse design matrix.

    Returns
    -------
    Dict of aggregate relative-error statistics, weight-distribution
    statistics, and optional per-geo-level / per-variable breakdowns.
    """
    X_sparse = mmread(str(matrix_path)).tocsr()
    estimates = X_sparse.dot(weights)
    true_values = targets_df["value"].to_numpy(dtype=np.float64)
    # Relative error where the target is nonzero, exactly 0.0 elsewhere.
    # np.divide with `where=` avoids the spurious divide-by-zero warnings
    # that `np.where(cond, a / b, 0.0)` emits by evaluating both branches.
    denom = np.abs(true_values)
    nonzero = denom > 0
    rel_errors = np.divide(
        estimates - true_values,
        denom,
        out=np.zeros_like(denom),
        where=nonzero,
    )
    abs_rel = np.abs(rel_errors)
    # A target is "achievable" only if at least one unit loads on it.
    achievable = np.asarray(X_sparse.sum(axis=1)).ravel() > 0

    active_weights = weights[weights > 0]
    weight_sum = float(weights.sum())
    # Hoisted: the squared-weight sum was previously computed twice.
    sq_sum = float(np.square(weights).sum())
    ess = float(weight_sum**2 / sq_sum) if sq_sum > 0 else 0.0

    metrics = {
        "n_targets": int(len(true_values)),
        "n_units": int(len(weights)),
        "n_achievable_targets": int(achievable.sum()),
        "mean_abs_rel_error": float(abs_rel.mean()),
        "median_abs_rel_error": float(np.median(abs_rel)),
        "p95_abs_rel_error": float(np.quantile(abs_rel, 0.95)),
        "max_abs_rel_error": float(abs_rel.max()),
        "ess": ess,
        "active_record_count": int((weights > 0).sum()),
        "negative_weight_share": float((weights < 0).mean()),
        "weight_min": float(weights.min()),
        "weight_max": float(weights.max()),
        "weight_mean": float(weights.mean()),
        "weight_median": float(np.median(weights)),
        "nonzero_weight_min": float(active_weights.min())
        if len(active_weights)
        else 0.0,
        "nonzero_weight_max": float(active_weights.max())
        if len(active_weights)
        else 0.0,
    }

    if "geo_level" in targets_df.columns:
        by_geo = {}
        for geo_level, group in targets_df.assign(abs_rel_error=abs_rel).groupby(
            "geo_level"
        ):
            vals = group["abs_rel_error"].to_numpy(dtype=np.float64)
            by_geo[str(geo_level)] = {
                "n_targets": int(len(vals)),
                "mean_abs_rel_error": float(vals.mean()),
                "median_abs_rel_error": float(np.median(vals)),
                "p95_abs_rel_error": float(np.quantile(vals, 0.95)),
                "max_abs_rel_error": float(vals.max()),
            }
        metrics["by_geo_level"] = by_geo

    if "variable" in targets_df.columns:
        by_variable = {}
        enriched = targets_df.assign(abs_rel_error=abs_rel)
        for variable, group in enriched.groupby("variable"):
            vals = group["abs_rel_error"].to_numpy(dtype=np.float64)
            by_variable[str(variable)] = {
                "n_targets": int(len(vals)),
                "mean_abs_rel_error": float(vals.mean()),
                "median_abs_rel_error": float(np.median(vals)),
            }
        metrics["by_variable"] = by_variable

    return metrics


def write_method_summary(summary: Dict, path: str | Path) -> None:
    """Write one method's metric summary as deterministic JSON."""
    with open(path, "w") as f:
        json.dump(summary, f, indent=2, sort_keys=True)
paste(failed, collapse = ", "))) +} + +cat("Benchmarking R packages installed successfully.\n") diff --git a/paper-l0/benchmarking/ipf_conversion.py b/paper-l0/benchmarking/ipf_conversion.py new file mode 100644 index 000000000..e803c12a5 --- /dev/null +++ b/paper-l0/benchmarking/ipf_conversion.py @@ -0,0 +1,1511 @@ +"""IPF-benchmark input conversion. + +Turns a filtered slice of the calibration package into the unit-table + +categorical-margin representation `surveysd::ipf` consumes. + +The converter is intentionally stricter than the shared benchmark harness: +it keeps only count-style targets that can be represented as one coherent +closed categorical margin system. When a requested target family is a binary +subset and an authored parent total exists on the exact reduced key, the +missing complement is derived from that authored total. Otherwise the family is +dropped with explicit diagnostics instead of being run as an open 1-cell +margin or sequentialized externally. +""" + +from __future__ import annotations + +import math +import sqlite3 +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple + +import numpy as np +import pandas as pd + +from benchmark_manifest import BenchmarkManifest + + +# --------------------------------------------------------------------------- +# Geography constants +# --------------------------------------------------------------------------- + +_GEO_VARS = {"state_fips", "congressional_district_geoid"} + + +# --------------------------------------------------------------------------- +# Value coercion helpers +# --------------------------------------------------------------------------- + + +def _coerce_value(v) -> float: + if isinstance(v, (int, float)): + return float(v) + s = str(v).strip().lower() + if s in ("-inf", "-infinity"): + return -math.inf + if s in ("inf", "infinity", "+inf", "+infinity"): + return math.inf + return float(s) + + +def _equality_value(v) -> 
object: + """Canonicalize an `==` constraint value to a hashable label.""" + try: + f = _coerce_value(v) + if math.isfinite(f) and f.is_integer(): + return int(f) + return f + except ValueError: + return str(v) + + +# --------------------------------------------------------------------------- +# Bucket schemas +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class RangeBucket: + label: str + constraints: frozenset # frozenset of (op, float_value) pairs + + +def _bucket(label: str, lo_op: str, lo_val, hi_op: str, hi_val) -> RangeBucket: + return RangeBucket( + label=label, + constraints=frozenset( + [(lo_op, _coerce_value(lo_val)), (hi_op, _coerce_value(hi_val))] + ), + ) + + +AGE_BUCKETS: Tuple[RangeBucket, ...] = tuple( + _bucket(label, ">", lo, "<", hi) + for label, lo, hi in [ + ("0-4", -1, 5), + ("5-9", 4, 10), + ("10-14", 9, 15), + ("15-19", 14, 20), + ("20-24", 19, 25), + ("25-29", 24, 30), + ("30-34", 29, 35), + ("35-39", 34, 40), + ("40-44", 39, 45), + ("45-49", 44, 50), + ("50-54", 49, 55), + ("55-59", 54, 60), + ("60-64", 59, 65), + ("65-69", 64, 70), + ("70-74", 69, 75), + ("75-79", 74, 80), + ("80-84", 79, 85), + # The target DB uses two different upper bounds for the 85+ bucket + # (`<999` and `<1000`). Both map to the same label so either encoding + # matches cleanly. + ("85+", 84, 999), + ("85+", 84, 1000), + ] +) + + +# `eitc_child_count` uses discrete values {0, 1, 2} plus an open-upper ">2" +# bucket. Declared explicitly so the matcher routes every authored target into +# the same derived column `eitc_child_count_bracket`. +EITC_CHILD_COUNT_BUCKETS: Tuple[RangeBucket, ...] = ( + RangeBucket("0", frozenset([("==", 0.0)])), + RangeBucket("1", frozenset([("==", 1.0)])), + RangeBucket("2", frozenset([("==", 2.0)])), + RangeBucket(">2", frozenset([(">", 2.0)])), +) + + +AGI_BUCKETS_DISTRICT: Tuple[RangeBucket, ...] 
AGI_BUCKETS_DISTRICT: Tuple[RangeBucket, ...] = tuple(
    _bucket(label, ">=", lo, "<", hi)
    for label, lo, hi in [
        ("(-inf,1)", -math.inf, 1.0),
        ("[1,10k)", 1.0, 10_000.0),
        ("[10k,25k)", 10_000.0, 25_000.0),
        ("[25k,50k)", 25_000.0, 50_000.0),
        ("[50k,75k)", 50_000.0, 75_000.0),
        ("[75k,100k)", 75_000.0, 100_000.0),
        ("[100k,200k)", 100_000.0, 200_000.0),
        ("[200k,500k)", 200_000.0, 500_000.0),
        ("[500k,inf)", 500_000.0, math.inf),
    ]
)

# State-level AGI brackets share the district brackets below $500k, then
# split the open top bracket at $1M.
AGI_BUCKETS_STATE: Tuple[RangeBucket, ...] = AGI_BUCKETS_DISTRICT[:-1] + tuple(
    _bucket(label, ">=", lo, "<", hi)
    for label, lo, hi in [
        ("[500k,1M)", 500_000.0, 1_000_000.0),
        ("[1M,inf)", 1_000_000.0, math.inf),
    ]
)


@dataclass(frozen=True)
class BucketSchema:
    """A named family of buckets that discretizes one source variable."""

    name: str  # derived-column name to emit on the unit table
    variable: str  # source variable the schema bucketizes
    buckets: Tuple[RangeBucket, ...]
    applies_to_geo_levels: Tuple[str, ...]

    def match_constraints(self, pairs: frozenset) -> Optional[str]:
        """Label of the bucket whose constraint set equals ``pairs``, if any."""
        return next(
            (b.label for b in self.buckets if b.constraints == pairs), None
        )


AGE_SCHEMA = BucketSchema(
    name="age_bracket",
    variable="age",
    buckets=AGE_BUCKETS,
    applies_to_geo_levels=("national", "state", "district"),
)

AGI_DISTRICT_SCHEMA = BucketSchema(
    name="agi_bracket_district",
    variable="adjusted_gross_income",
    buckets=AGI_BUCKETS_DISTRICT,
    applies_to_geo_levels=("district",),
)

AGI_STATE_SCHEMA = BucketSchema(
    name="agi_bracket_state",
    variable="adjusted_gross_income",
    buckets=AGI_BUCKETS_STATE,
    applies_to_geo_levels=("state", "national"),
)

EITC_CHILD_COUNT_SCHEMA = BucketSchema(
    name="eitc_child_count_bracket",
    variable="eitc_child_count",
    buckets=EITC_CHILD_COUNT_BUCKETS,
    applies_to_geo_levels=("national", "state", "district"),
)

# Matching order matters: for AGI, the district schema is tried first.
RANGE_SCHEMAS_BY_VARIABLE: Dict[str, Tuple[BucketSchema, ...]] = {
    "age": (AGE_SCHEMA,),
    "adjusted_gross_income": (AGI_DISTRICT_SCHEMA, AGI_STATE_SCHEMA),
    "eitc_child_count": (EITC_CHILD_COUNT_SCHEMA,),
}


# Cell label used for the synthetic positive/non-positive binary bucket.
POSITIVE_LABEL = "positive"
NON_POSITIVE_LABEL = "non_positive"


class IPFConversionError(ValueError):
    """Raised when the requested target slice cannot form one valid IPF run."""

    def __init__(self, message: str, diagnostics: Optional[Dict[str, object]] = None):
        super().__init__(message)
        # Structured details for the diagnostics JSON the caller persists.
        self.diagnostics = diagnostics or {}


def _positive_column_name(variable: str) -> str:
    """Name of the derived positive/non-positive indicator column."""
    return f"{variable}_positive"


# ---------------------------------------------------------------------------
# Target normalization
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class GeoKey:
    """The geography constraint of a target (or none for national)."""

    variable: Optional[str]
    value: Optional[object]

    def level(self) -> str:
        """Geography level implied by the constraint variable."""
        return {
            "state_fips": "state",
            "congressional_district_geoid": "district",
        }.get(self.variable, "national")


@dataclass(frozen=True)
class ResolvedTarget:
    """One authored target mapped onto the derived categorical cell system."""

    target_id: int
    target_name: str
    target_value: float
    scope: str  # "person" | "household" | "tax_unit" | ...
    source_variable: str
    geo: GeoKey
    cell: Tuple[Tuple[str, str], ...]  # tuple of (derived_column, cell_label), sorted
    raw_constraints: Tuple[Tuple[str, str, str], ...]  # preserved for diagnostics
# preserved for diagnostics + + +def _group_constraints_by_variable( + records: Iterable[dict], + exclude_vars: Iterable[str], +) -> Dict[str, frozenset]: + exclude = set(exclude_vars) + grouped: Dict[str, List[Tuple[str, object]]] = {} + for r in records: + var = str(r["variable"]) + if var in exclude: + continue + op = str(r["operation"]) + raw_val = r["value"] + if op == "==": + value: object = _equality_value(raw_val) + else: + value = _coerce_value(raw_val) + grouped.setdefault(var, []).append((op, value)) + return {v: frozenset(pairs) for v, pairs in grouped.items()} + + +def _resolve_variable_cell( + variable: str, + pairs: frozenset, + geo_level: str, +) -> Optional[Tuple[str, str]]: + """Return (derived_column_name, cell_label) for this variable's constraints.""" + ops = {op for op, _ in pairs} + + # Declared range or discrete schemas take precedence: they pull equality + # cases into the same derived column as the range cases (e.g. EITC). + for schema in RANGE_SCHEMAS_BY_VARIABLE.get(variable, ()): + if geo_level not in schema.applies_to_geo_levels: + continue + lbl = schema.match_constraints(pairs) + if lbl is not None: + return (schema.name, lbl) + + # Pure equality → raw categorical pass-through (value identifies the cell). + if ops == {"=="}: + if len(pairs) != 1: + return None + _, val = next(iter(pairs)) + return (variable, str(val)) + + # Positive-dollar `>`-only single constraint at zero. 
+ if ops == {">"} and len(pairs) == 1: + _, val = next(iter(pairs)) + if isinstance(val, float) and val == 0.0: + return (_positive_column_name(variable), POSITIVE_LABEL) + + return None + + +def _resolve_target( + target_row: pd.Series, + constraints: List[dict], +) -> Optional[ResolvedTarget]: + geo = _extract_geo(constraints) + grouped = _group_constraints_by_variable(constraints, exclude_vars=_GEO_VARS) + cell: List[Tuple[str, str]] = [] + for var, pairs in grouped.items(): + label = _resolve_variable_cell(var, pairs, geo.level()) + if label is None: + return None + cell.append(label) + raw = tuple( + sorted( + (str(r["variable"]), str(r["operation"]), str(r["value"])) + for r in constraints + ) + ) + return ResolvedTarget( + target_id=int(target_row["target_id"]) + if "target_id" in target_row.index + else -1, + target_name=str( + target_row.get("target_name") + or f"target_{target_row.get('stratum_id', 'unknown')}" + ), + target_value=float(target_row["value"]), + scope=_target_scope(str(target_row["variable"])), + source_variable=str(target_row["variable"]), + geo=geo, + cell=tuple(sorted(cell)), + raw_constraints=raw, + ) + + +def _extract_geo(records: Iterable[dict]) -> GeoKey: + for r in records: + v = str(r["variable"]) + if v in _GEO_VARS: + raw = r["value"] + try: + val: object = int(raw) + except (ValueError, TypeError): + val = str(raw) + return GeoKey(variable=v, value=val) + return GeoKey(variable=None, value=None) + + +_SCOPE_BY_VARIABLE = { + "person_count": "person", + "household_count": "household", +} + + +def _target_scope(target_variable: str) -> str: + try: + return _SCOPE_BY_VARIABLE[target_variable] + except KeyError as exc: + raise ValueError( + f"IPF conversion does not support target variable " + f"'{target_variable}'. Currently supported: " + f"{sorted(_SCOPE_BY_VARIABLE)}. " + "`tax_unit_count` and `spm_unit_count` remain outside the core " + "household/person IPF path in this pass." 
+ ) from exc + + +# --------------------------------------------------------------------------- +# Margin assembly +# --------------------------------------------------------------------------- + + +@dataclass +class MarginBlock: + margin_id: str + scope: str + source_variable: str + cell_dims: Tuple[str, ...] # sorted non-geo derived-column names + cell_vars: Tuple[str, ...] # sorted derived-column names used as cell dimensions + geo_var: Optional[str] # geo dimension included in the margin + targets: List[ResolvedTarget] + # cells that exist in the target package ( (geo_value, cell_tuple) ) + target_cells: set + + +@dataclass(frozen=True) +class MarginCell: + geo_value: Optional[object] + cell: Tuple[Tuple[str, str], ...] + target_value: float + target_name: str + is_authored: bool + authored_target_id: Optional[int] + source_target_ids: Tuple[int, ...] + derivation_reason: Optional[str] = None + + +@dataclass +class ClosedMarginBlock: + margin_id: str + scope: str + source_variable: str + cell_dims: Tuple[str, ...] + cell_vars: Tuple[str, ...] 
+ geo_var: Optional[str] + closure_status: str + cells: List[MarginCell] + + +def _margin_key(t: ResolvedTarget) -> Tuple[str, Optional[str], Tuple[str, ...]]: + return ( + t.source_variable, + t.geo.variable, + tuple(sorted({dc for dc, _ in t.cell})), + ) + + +def _assemble_margins( + resolved: List[ResolvedTarget], +) -> List[MarginBlock]: + blocks: Dict[Tuple, MarginBlock] = {} + for t in resolved: + key = _margin_key(t) + if key not in blocks: + source_variable, geo_var, cell_vars = key + all_vars = (*cell_vars,) if geo_var is None else (geo_var, *cell_vars) + blocks[key] = MarginBlock( + margin_id=f"margin_{len(blocks):04d}", + scope=t.scope, + source_variable=source_variable, + cell_dims=cell_vars, + cell_vars=tuple(sorted(all_vars)), + geo_var=geo_var, + targets=[], + target_cells=set(), + ) + blocks[key].targets.append(t) + blocks[key].target_cells.add((t.geo.value, t.cell)) + return list(blocks.values()) + + +def _is_single_cell_per_geo(block: MarginBlock) -> bool: + """True iff the block carries exactly one cell per geography. + + surveysd handles these as 1-cell `xtabs` arrays: the authored cell is + raked to its target and units outside that cell are left unconstrained + by the margin. Kept for diagnostics and reporting — no longer drives + emission branching. 
+ """ + if block.geo_var is None: + return len({cell for _, cell in block.target_cells}) == 1 + cells_per_geo: Dict[object, set] = {} + for geo_value, cell in block.target_cells: + cells_per_geo.setdefault(geo_value, set()).add(cell) + return all(len(cells) == 1 for cells in cells_per_geo.values()) and ( + len({cell for _, cell in block.target_cells}) == 1 + ) + + +def check_margin_consistency( + blocks: List[object], + tolerance: float = 1e-3, +) -> List[Dict[str, object]]: + """Return per-geo population-total disagreements across closed blocks.""" + totals_by_scope_geo: Dict[Tuple[str, Optional[str], object], Dict[str, float]] = {} + for block in blocks: + by_geo: Dict[object, float] = {} + if hasattr(block, "cells"): + for cell in block.cells: + by_geo[cell.geo_value] = by_geo.get(cell.geo_value, 0.0) + float( + cell.target_value + ) + scope = block.scope + geo_var = block.geo_var + else: + for t in block.targets: + by_geo[t.geo.value] = by_geo.get(t.geo.value, 0.0) + float( + t.target_value + ) + scope = block.scope + geo_var = block.geo_var + for geo_val, total in by_geo.items(): + key = (scope, geo_var, geo_val) + totals_by_scope_geo.setdefault(key, {})[block.margin_id] = total + + issues: List[Dict[str, object]] = [] + for (scope, geo_var, geo_val), margin_totals in totals_by_scope_geo.items(): + if len(margin_totals) < 2: + continue + tmax = max(margin_totals.values()) + tmin = min(margin_totals.values()) + if tmax == 0 or (tmax - tmin) / tmax <= tolerance: + continue + issues.append( + { + "scope": scope, + "geo_var": geo_var, + "geo_value": geo_val, + "margin_totals": dict(margin_totals), + "relative_spread": (tmax - tmin) / tmax, + } + ) + return issues + + +# --------------------------------------------------------------------------- +# Derived-column materialization on the unit table +# --------------------------------------------------------------------------- + + +def _bracket_age(age_values: np.ndarray) -> np.ndarray: + out = np.array(["unmatched"] 
* len(age_values), dtype=object) + for b in AGE_BUCKETS: + lo = next(v for op, v in b.constraints if op == ">") + hi = next(v for op, v in b.constraints if op == "<") + mask = (age_values > lo) & (age_values < hi) + out[mask] = b.label + return out + + +def _bracket_agi(agi_values: np.ndarray, schema: BucketSchema) -> np.ndarray: + out = np.array(["unmatched"] * len(agi_values), dtype=object) + for b in schema.buckets: + lo = next(v for op, v in b.constraints if op == ">=") + hi = next(v for op, v in b.constraints if op == "<") + mask = (agi_values >= lo) & (agi_values < hi) + out[mask] = b.label + return out + + +def _bracket_eitc_child_count(values: np.ndarray) -> np.ndarray: + v = np.asarray(values, dtype=float).round().astype(int) + out = np.where(v == 0, "0", np.where(v == 1, "1", np.where(v == 2, "2", ">2"))) + return out.astype(object) + + +def _materialize_derived_columns( + unit_data: pd.DataFrame, + derived_columns_needed: set, + raw_variable_values: Dict[str, np.ndarray], +) -> pd.DataFrame: + df = unit_data.copy() + for col in derived_columns_needed: + if col in df.columns: + continue + if col == AGE_SCHEMA.name: + df[col] = _bracket_age(np.asarray(raw_variable_values["age"], dtype=float)) + elif col == AGI_DISTRICT_SCHEMA.name: + df[col] = _bracket_agi( + np.asarray(raw_variable_values["adjusted_gross_income"], dtype=float), + AGI_DISTRICT_SCHEMA, + ) + elif col == AGI_STATE_SCHEMA.name: + df[col] = _bracket_agi( + np.asarray(raw_variable_values["adjusted_gross_income"], dtype=float), + AGI_STATE_SCHEMA, + ) + elif col == EITC_CHILD_COUNT_SCHEMA.name: + df[col] = _bracket_eitc_child_count( + np.asarray(raw_variable_values["eitc_child_count"], dtype=float) + ) + elif col.endswith("_positive"): + src = col[: -len("_positive")] + if src not in raw_variable_values: + raise KeyError( + f"Cannot build derived column '{col}' — raw values for '{src}' not loaded" + ) + df[col] = np.where( + np.asarray(raw_variable_values[src], dtype=float) > 0.0, + 
POSITIVE_LABEL, + NON_POSITIVE_LABEL, + ).astype(object) + else: + # Equality pass-through: derived column == source column, cast to str. + if col not in raw_variable_values: + raise KeyError( + f"Cannot build derived column '{col}' — raw values for '{col}' not loaded" + ) + df[col] = np.asarray(raw_variable_values[col]).astype(str) + return df + + +def _collect_required_source_variables(blocks: List[MarginBlock]) -> set: + """Return the set of raw policyengine-us variables we need loaded.""" + needed = set() + for block in blocks: + for cell_var in block.cell_vars: + if cell_var == AGE_SCHEMA.name: + needed.add("age") + elif cell_var in (AGI_DISTRICT_SCHEMA.name, AGI_STATE_SCHEMA.name): + needed.add("adjusted_gross_income") + elif cell_var == EITC_CHILD_COUNT_SCHEMA.name: + needed.add("eitc_child_count") + elif cell_var.endswith("_positive"): + needed.add(cell_var[: -len("_positive")]) + elif cell_var in _GEO_VARS: + continue + else: + # Equality pass-through uses the variable name directly. + needed.add(cell_var) + return needed + + +# --------------------------------------------------------------------------- +# Clone-unit table construction (carried over from the prior implementation) +# --------------------------------------------------------------------------- + + +def _detect_time_period(sim) -> int: + raw_keys = sim.dataset.load_dataset()["household_id"] + try: + return int(next(iter(raw_keys))) + except Exception: + return 2024 + + +def _load_stratum_constraints( + db_path: str | Path, stratum_ids: Iterable[int] +) -> Dict[int, List[dict]]: + ids = sorted({int(sid) for sid in stratum_ids}) + if not ids: + return {} + placeholders = ",".join("?" 
for _ in ids) + query = f""" + SELECT stratum_id, constraint_variable AS variable, operation, value + FROM stratum_constraints + WHERE stratum_id IN ({placeholders}) + ORDER BY stratum_id + """ + with sqlite3.connect(str(db_path)) as conn: + rows = pd.read_sql_query(query, conn, params=ids) + grouped: Dict[int, List[dict]] = {} + for stratum_id, group in rows.groupby("stratum_id", sort=False): + grouped[int(stratum_id)] = group[["variable", "operation", "value"]].to_dict( + "records" + ) + return grouped + + +def _build_household_clone_arrays( + package: Dict, sim +) -> Tuple[pd.DataFrame, np.ndarray]: + household_ids = sim.calculate("household_id", map_to="household").values + n_households = len(household_ids) + n_clones = int(package["metadata"]["n_clones"]) + expected_units = n_households * n_clones + initial_weights = np.asarray(package["initial_weights"], dtype=np.float64) + if len(initial_weights) != expected_units: + raise ValueError( + "Initial weight length does not match dataset households x n_clones: " + f"{len(initial_weights)} != {n_households} * {n_clones}" + ) + + if package.get("cd_geoid") is None or package.get("block_geoid") is None: + raise ValueError( + "Automatic IPF conversion requires cd_geoid and block_geoid in the package" + ) + + unit_index = np.arange(expected_units, dtype=np.int64) + block_geoid = np.asarray(package["block_geoid"]).astype(str) + cd_geoid = np.asarray(package["cd_geoid"]).astype(str) + if len(block_geoid) != expected_units or len(cd_geoid) != expected_units: + raise ValueError("Geography arrays do not match expected cloned unit count") + + household_df = pd.DataFrame( + { + "unit_index": unit_index, + "household_id": unit_index, + "base_weight": initial_weights, + "state_fips": block_geoid, + "congressional_district_geoid": cd_geoid, + } + ) + household_df["state_fips"] = household_df["state_fips"].str.slice(0, 2).astype(int) + household_df["congressional_district_geoid"] = household_df[ + "congressional_district_geoid" + 
].astype(int) + return household_df, household_ids + + +def _load_sim_columns( + sim, variables: Iterable[str], level: str +) -> Dict[str, np.ndarray]: + columns: Dict[str, np.ndarray] = {} + for variable in variables: + values = sim.calculate(variable, map_to=level).values + values = np.asarray(values) + if hasattr(values, "decode_to_str"): + values = values.decode_to_str() + if values.dtype.kind == "S": + values = values.astype(str) + columns[variable] = values + return columns + + +def _build_person_level_unit_data( + package: Dict, + household_df: pd.DataFrame, + sim, + needed_variables: Iterable[str], +) -> Tuple[pd.DataFrame, Dict[str, np.ndarray]]: + household_ids = sim.calculate("household_id", map_to="household").values + person_hh_ids = sim.calculate("household_id", map_to="person").values + hh_index = {int(hid): idx for idx, hid in enumerate(household_ids)} + person_hh_index = np.array( + [hh_index[int(hid)] for hid in person_hh_ids], dtype=np.int64 + ) + n_households = len(household_ids) + n_clones = int(package["metadata"]["n_clones"]) + + person_columns = _load_sim_columns(sim, needed_variables, level="person") + person_frames = [] + stacked_raw: Dict[str, List[np.ndarray]] = {v: [] for v in person_columns} + for clone_idx in range(n_clones): + unit_index = person_hh_index + clone_idx * n_households + frame = pd.DataFrame( + { + "unit_index": unit_index, + "household_id": unit_index, + "base_weight": household_df["base_weight"].to_numpy()[unit_index], + "state_fips": household_df["state_fips"].to_numpy()[unit_index], + "congressional_district_geoid": household_df[ + "congressional_district_geoid" + ].to_numpy()[unit_index], + } + ) + for variable, values in person_columns.items(): + frame[variable] = values + stacked_raw[variable].append(values) + person_frames.append(frame) + raw_values = {v: np.concatenate(a) for v, a in stacked_raw.items()} + return pd.concat(person_frames, ignore_index=True), raw_values + + +def _build_household_level_unit_data( + 
household_df: pd.DataFrame, + sim, + needed_variables: Iterable[str], +) -> Tuple[pd.DataFrame, Dict[str, np.ndarray]]: + frame = household_df.copy() + household_columns = _load_sim_columns(sim, needed_variables, level="household") + repeated = { + name: np.tile(values, len(frame) // max(len(values), 1)) + for name, values in household_columns.items() + } + for name, values in repeated.items(): + frame[name] = values + return frame, repeated + + +# --------------------------------------------------------------------------- +# Closed-system validation and exact closure +# --------------------------------------------------------------------------- + + +def _targeted_unit_slice(unit_data: pd.DataFrame, block: MarginBlock) -> pd.DataFrame: + if block.geo_var is None: + return unit_data + geo_values = {geo_value for geo_value, _ in block.target_cells} + return unit_data.loc[unit_data[block.geo_var].isin(list(geo_values))].copy() + + +def _row_to_cell( + row: pd.Series | dict, + *, + geo_var: Optional[str], + cell_dims: Tuple[str, ...], +) -> Tuple[Optional[object], Tuple[Tuple[str, str], ...]]: + geo_value = row[geo_var] if geo_var is not None else None + cell = tuple(sorted((col, str(row[col])) for col in cell_dims)) + return geo_value, cell + + +def _observed_cells_for_block( + unit_data: pd.DataFrame, block: MarginBlock +) -> set[Tuple[Optional[object], Tuple[Tuple[str, str], ...]]]: + targeted = _targeted_unit_slice(unit_data, block) + if targeted.empty: + return set() + columns = [*block.cell_dims] + if block.geo_var is not None: + columns = [block.geo_var, *columns] + observed = targeted[columns].drop_duplicates() + return { + _row_to_cell(row._asdict(), geo_var=block.geo_var, cell_dims=block.cell_dims) + for row in observed.itertuples(index=False) + } + + +def _parent_key_for_cell( + geo_value: Optional[object], + cell: Tuple[Tuple[str, str], ...], + subset_dim: str, +) -> Tuple[Optional[object], Tuple[Tuple[str, str], ...]]: + return ( + geo_value, + 
tuple(sorted((col, label) for col, label in cell if col != subset_dim)), + ) + + +def _group_authored_cells_by_parent( + block: MarginBlock, subset_dim: str +) -> Dict[Tuple[Optional[object], Tuple[Tuple[str, str], ...]], List[ResolvedTarget]]: + grouped: Dict[Tuple[Optional[object], Tuple[Tuple[str, str], ...]], List[ResolvedTarget]] = {} + for target in block.targets: + grouped.setdefault( + _parent_key_for_cell(target.geo.value, target.cell, subset_dim), [] + ).append(target) + return grouped + + +def _group_observed_labels_by_parent( + unit_data: pd.DataFrame, block: MarginBlock, subset_dim: str +) -> Dict[Tuple[Optional[object], Tuple[Tuple[str, str], ...]], set[str]]: + targeted = _targeted_unit_slice(unit_data, block) + if targeted.empty: + return {} + columns = [subset_dim, *(dim for dim in block.cell_dims if dim != subset_dim)] + if block.geo_var is not None: + columns = [block.geo_var, *columns] + observed = targeted[columns].drop_duplicates() + grouped: Dict[Tuple[Optional[object], Tuple[Tuple[str, str], ...]], set[str]] = {} + for row in observed.to_dict("records"): + geo_value = row[block.geo_var] if block.geo_var is not None else None + parent_assignments = tuple( + sorted( + (dim, str(row[dim])) for dim in block.cell_dims if dim != subset_dim + ) + ) + grouped.setdefault((geo_value, parent_assignments), set()).add( + str(row[subset_dim]) + ) + return grouped + + +def _build_parent_total_lookup( + block: MarginBlock, +) -> Tuple[ + Dict[Tuple[Optional[object], Tuple[Tuple[str, str], ...]], ResolvedTarget], + Tuple[int, ...], +]: + lookup: Dict[Tuple[Optional[object], Tuple[Tuple[str, str], ...]], ResolvedTarget] = {} + ambiguous_target_ids: set[int] = set() + for target in block.targets: + key = (target.geo.value, tuple(sorted(target.cell))) + if key in lookup: + ambiguous_target_ids.add(int(target.target_id)) + ambiguous_target_ids.add(int(lookup[key].target_id)) + continue + lookup[key] = target + return lookup, tuple(sorted(ambiguous_target_ids)) + 
+ +def _full_partition_cells(block: MarginBlock) -> List[MarginCell]: + return [ + MarginCell( + geo_value=target.geo.value, + cell=target.cell, + target_value=float(target.target_value), + target_name=target.target_name, + is_authored=True, + authored_target_id=target.target_id, + source_target_ids=(target.target_id,), + ) + for target in block.targets + ] + + +def _try_close_binary_subset( + block: MarginBlock, + blocks_by_key: Dict[Tuple[str, Optional[str], Tuple[str, ...]], MarginBlock], + unit_data: pd.DataFrame, + tolerance: float, +) -> Tuple[Optional[ClosedMarginBlock], Optional[Dict[str, object]]]: + missing_parent_reason: Optional[Dict[str, object]] = None + ambiguous_parent_reason: Optional[Dict[str, object]] = None + for subset_dim in block.cell_dims: + observed_by_parent = _group_observed_labels_by_parent(unit_data, block, subset_dim) + if not observed_by_parent: + return None, { + "reason": "missing_unit_support", + "margin_id": block.margin_id, + "target_ids": [int(target.target_id) for target in block.targets], + } + + global_labels = set().union(*observed_by_parent.values()) + if len(global_labels) > 2: + continue + + parent_key = ( + block.source_variable, + block.geo_var, + tuple(dim for dim in block.cell_dims if dim != subset_dim), + ) + parent_block = blocks_by_key.get(parent_key) + if parent_block is None: + missing_parent_reason = { + "reason": "missing_parent_total", + "margin_id": block.margin_id, + "subset_dimension": subset_dim, + "parent_margin_key": { + "source_variable": block.source_variable, + "geo_var": block.geo_var, + "cell_dims": [dim for dim in block.cell_dims if dim != subset_dim], + }, + "target_ids": [int(target.target_id) for target in block.targets], + } + continue + + parent_totals, ambiguous_target_ids = _build_parent_total_lookup(parent_block) + if ambiguous_target_ids: + ambiguous_parent_reason = { + "reason": "ambiguous_parent_total", + "margin_id": block.margin_id, + "subset_dimension": subset_dim, + 
"parent_margin_id": parent_block.margin_id, + "parent_target_ids": list(ambiguous_target_ids), + "target_ids": [int(target.target_id) for target in block.targets], + } + continue + authored_by_parent = _group_authored_cells_by_parent(block, subset_dim) + emitted_cells: List[MarginCell] = [] + valid = True + invalid_reason: Optional[Dict[str, object]] = None + + for parent_lookup_key, observed_labels in observed_by_parent.items(): + authored_targets = authored_by_parent.get(parent_lookup_key, []) + if not authored_targets: + continue + + parent_target = parent_totals.get(parent_lookup_key) + if parent_target is None: + valid = False + invalid_reason = { + "reason": "missing_parent_total", + "margin_id": block.margin_id, + "parent_margin_key": { + "source_variable": block.source_variable, + "geo_var": block.geo_var, + "cell_dims": [dim for dim in block.cell_dims if dim != subset_dim], + }, + "target_ids": [int(target.target_id) for target in block.targets], + } + break + + authored_labels = { + dict(target.cell)[subset_dim] for target in authored_targets + } + if len(observed_labels) > 2 or len(observed_labels - authored_labels) > 1: + valid = False + invalid_reason = { + "reason": "unsupported_partial_margin", + "margin_id": block.margin_id, + "subset_dimension": subset_dim, + "target_ids": [int(target.target_id) for target in block.targets], + } + break + + authored_sum = float( + sum(float(target.target_value) for target in authored_targets) + ) + complement_value = float(parent_target.target_value) - authored_sum + if complement_value < -tolerance: + valid = False + invalid_reason = { + "reason": "negative_derived_complement", + "margin_id": block.margin_id, + "subset_dimension": subset_dim, + "parent_target_id": int(parent_target.target_id), + "target_ids": [int(target.target_id) for target in block.targets], + "derived_value": complement_value, + } + break + + for target in authored_targets: + emitted_cells.append( + MarginCell( + geo_value=target.geo.value, + 
cell=target.cell, + target_value=float(target.target_value), + target_name=target.target_name, + is_authored=True, + authored_target_id=target.target_id, + source_target_ids=(target.target_id,), + ) + ) + + missing_labels = observed_labels - authored_labels + if len(missing_labels) == 1: + missing_label = next(iter(missing_labels)) + if complement_value > tolerance and missing_label not in observed_labels: + valid = False + invalid_reason = { + "reason": "missing_unit_support", + "margin_id": block.margin_id, + "subset_dimension": subset_dim, + "target_ids": [int(target.target_id) for target in block.targets], + } + break + parent_geo_value, parent_assignments = parent_lookup_key + derived_cell = tuple( + sorted((*parent_assignments, (subset_dim, missing_label))) + ) + emitted_cells.append( + MarginCell( + geo_value=parent_geo_value, + cell=derived_cell, + target_value=max(complement_value, 0.0), + target_name=( + f"derived::{block.margin_id}::{subset_dim}={missing_label}" + ), + is_authored=False, + authored_target_id=None, + source_target_ids=tuple( + sorted( + { + int(parent_target.target_id), + *[int(target.target_id) for target in authored_targets], + } + ) + ), + derivation_reason="authored_parent_total", + ) + ) + + if not valid: + return None, invalid_reason + + unique_cells = { + (cell.geo_value, cell.cell, cell.is_authored): cell for cell in emitted_cells + } + return ( + ClosedMarginBlock( + margin_id=block.margin_id, + scope=block.scope, + source_variable=block.source_variable, + cell_dims=block.cell_dims, + cell_vars=block.cell_vars, + geo_var=block.geo_var, + closure_status="binary_subset_with_parent_total", + cells=list(unique_cells.values()), + ), + None, + ) + + if ambiguous_parent_reason is not None: + return None, ambiguous_parent_reason + if missing_parent_reason is not None: + return None, missing_parent_reason + return None, { + "reason": "unsupported_partial_margin", + "margin_id": block.margin_id, + "target_ids": [int(target.target_id) for 
target in block.targets], + } + + +def _validate_household_margin_invariance( + unit_data: pd.DataFrame, + blocks: List[MarginBlock], +) -> Tuple[List[MarginBlock], List[Dict[str, object]]]: + valid_blocks: List[MarginBlock] = [] + dropped: List[Dict[str, object]] = [] + for block in blocks: + if block.scope != "household": + valid_blocks.append(block) + continue + varying_columns = [] + for column in block.cell_vars: + if column not in unit_data.columns: + continue + nunique = unit_data.groupby("household_id", sort=False)[column].nunique( + dropna=False + ) + if (nunique > 1).any(): + varying_columns.append(column) + if varying_columns: + dropped.append( + { + "reason": "non_invariant_household_constraint_variable", + "margin_id": block.margin_id, + "columns": varying_columns, + "target_ids": [int(target.target_id) for target in block.targets], + } + ) + continue + valid_blocks.append(block) + return valid_blocks, dropped + + +def _close_margin_blocks( + blocks: List[MarginBlock], + unit_data: pd.DataFrame, + tolerance: float = 1e-9, +) -> Tuple[List[ClosedMarginBlock], List[Dict[str, object]]]: + blocks_by_key = { + (block.source_variable, block.geo_var, block.cell_dims): block for block in blocks + } + closed_blocks: List[ClosedMarginBlock] = [] + dropped: List[Dict[str, object]] = [] + + for block in blocks: + observed_cells = _observed_cells_for_block(unit_data, block) + authored_cells = { + (target.geo.value, tuple(sorted(target.cell))) for target in block.targets + } + unsupported_authored = [ + target + for target in block.targets + if (target.geo.value, tuple(sorted(target.cell))) not in observed_cells + and abs(float(target.target_value)) > tolerance + ] + if unsupported_authored: + dropped.append( + { + "reason": "missing_unit_support", + "margin_id": block.margin_id, + "target_ids": [int(target.target_id) for target in unsupported_authored], + } + ) + continue + + if authored_cells == observed_cells: + closed_blocks.append( + ClosedMarginBlock( + 
margin_id=block.margin_id, + scope=block.scope, + source_variable=block.source_variable, + cell_dims=block.cell_dims, + cell_vars=block.cell_vars, + geo_var=block.geo_var, + closure_status="full_partition", + cells=_full_partition_cells(block), + ) + ) + continue + + closed_block, reason = _try_close_binary_subset( + block=block, + blocks_by_key=blocks_by_key, + unit_data=unit_data, + tolerance=tolerance, + ) + if closed_block is None: + dropped.append(reason or {"reason": "unsupported_partial_margin"}) + continue + closed_blocks.append(closed_block) + + return closed_blocks, dropped + + +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- + + +def build_ipf_inputs( + package: Dict, + manifest: BenchmarkManifest, + filtered_targets: pd.DataFrame, +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Return `(unit_metadata, ipf_target_metadata)` ready for `ipf_runner.R`. + + Consumes the same `filtered_targets` slice that the GREG and L0 runners + see. Internally keeps only IPF-eligible count targets, resolves their + categorical cells, closes only exact binary subset systems backed by + authored parent totals, and rejects any remaining open or incoherent + system rather than sequentializing it. 
+ """ + if filtered_targets.empty: + raise ValueError("filtered_targets is empty; nothing to convert.") + + metadata = package.get("metadata", {}) + dataset_path = metadata.get("dataset_path") + db_path = metadata.get("db_path") + if not dataset_path or not Path(dataset_path).exists(): + raise FileNotFoundError( + "Automatic IPF conversion requires metadata.dataset_path to exist locally" + ) + if not db_path or not Path(db_path).exists(): + raise FileNotFoundError( + "Automatic IPF conversion requires metadata.db_path to exist locally" + ) + + # --- Count check: drop non-count-style targets ------------------------ + supported_mask = ( + filtered_targets["variable"].astype(str).isin(_SCOPE_BY_VARIABLE.keys()) + ) + dropped_non_count = filtered_targets.loc[~supported_mask].copy() + targets = filtered_targets.loc[supported_mask].reset_index(drop=True) + if targets.empty: + raise ValueError( + "No count-style targets in filtered_targets; IPF has nothing to run. " + f"Supported variables: {sorted(_SCOPE_BY_VARIABLE)}." + ) + dropped_target_details: List[Dict[str, object]] = [ + { + "reason": "non_count_style", + "target_id": int(row.get("target_id", -1)), + "target_name": str(row.get("target_name", "?")), + } + for _, row in dropped_non_count.iterrows() + ] + + stratum_constraints = _load_stratum_constraints( + db_path=db_path, + stratum_ids=targets["stratum_id"].astype(int).tolist(), + ) + + # --- Resolver check: keep only targets that map to a declared cell ---- + resolved: List[ResolvedTarget] = [] + dropped_unresolvable: List[Tuple[int, str]] = [] + for _, row in targets.iterrows(): + constraints = stratum_constraints.get(int(row["stratum_id"]), []) + rt = _resolve_target(row, constraints) + if rt is None: + dropped_unresolvable.append( + (int(row.get("target_id", -1)), str(row.get("target_name", "?"))) + ) + continue + resolved.append(rt) + if not resolved: + raise ValueError( + "No targets in filtered_targets resolved through the declared " + "bucket schemas. 
Nothing for IPF to run." + ) + dropped_target_details.extend( + { + "reason": "unresolvable_constraints", + "target_id": target_id, + "target_name": target_name, + } + for target_id, target_name in dropped_unresolvable + ) + + blocks = _assemble_margins(resolved) + + # --- Build the cloned-unit table with needed source variables ---------- + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=str(dataset_path)) + _ = _detect_time_period(sim) + household_df, _ = _build_household_clone_arrays(package, sim) + + needed_source_vars = _collect_required_source_variables(blocks) + + has_person_scoped = any(b.scope == "person" for b in blocks) + if has_person_scoped: + unit_data, raw_values = _build_person_level_unit_data( + package=package, + household_df=household_df, + sim=sim, + needed_variables=needed_source_vars, + ) + else: + unit_data, raw_values = _build_household_level_unit_data( + household_df=household_df, + sim=sim, + needed_variables=needed_source_vars, + ) + + # --- Materialize derived columns on the unit table --------------------- + derived_cols = {cv for b in blocks for cv in b.cell_vars if cv not in _GEO_VARS} + unit_data = _materialize_derived_columns(unit_data, derived_cols, raw_values) + + if has_person_scoped and any(block.scope == "household" for block in blocks): + blocks, invariance_drops = _validate_household_margin_invariance( + unit_data=unit_data, + blocks=blocks, + ) + dropped_target_details.extend( + { + **detail, + "target_name": None, + } + for detail in invariance_drops + ) + + closed_blocks, closure_drops = _close_margin_blocks(blocks=blocks, unit_data=unit_data) + dropped_target_details.extend(closure_drops) + dropped_counts: Dict[str, int] = {} + for detail in dropped_target_details: + reason = str(detail.get("reason", "unknown")) + count = len(detail.get("target_ids", [])) or 1 + dropped_counts[reason] = dropped_counts.get(reason, 0) + int(count) + if not closed_blocks: + raise IPFConversionError( + "No closed 
categorical IPF margins remain after validation.", + diagnostics={ + "requested_target_count": int(len(filtered_targets)), + "retained_authored_target_count": 0, + "derived_complement_count": 0, + "dropped_targets": dropped_counts, + "dropped_target_details": dropped_target_details, + "margin_consistency_issues": [], + "derived_complement_rows": [], + }, + ) + + issues = check_margin_consistency(closed_blocks) + if issues: + incompatible_details = [] + for issue in issues: + margin_ids = list(issue.get("margin_totals", {}).keys()) + target_ids = sorted( + { + int(cell.authored_target_id) + for block in closed_blocks + if block.margin_id in margin_ids + for cell in block.cells + if cell.is_authored and cell.authored_target_id is not None + } + ) + incompatible_details.append( + { + "reason": "incompatible_totals", + "scope": issue.get("scope"), + "geo_var": issue.get("geo_var"), + "geo_value": issue.get("geo_value"), + "margin_ids": margin_ids, + "target_ids": target_ids, + } + ) + dropped_target_details.extend(incompatible_details) + dropped_counts["incompatible_totals"] = dropped_counts.get( + "incompatible_totals", 0 + ) + sum(len(detail["target_ids"]) or 1 for detail in incompatible_details) + raise IPFConversionError( + "IPF-retained margins do not form one coherent IPF problem.", + diagnostics={ + "requested_target_count": int(len(filtered_targets)), + "retained_authored_target_count": int( + len( + { + int(cell.authored_target_id) + for block in closed_blocks + for cell in block.cells + if cell.is_authored + and cell.authored_target_id is not None + } + ) + ), + "derived_complement_count": int( + sum( + 1 + for block in closed_blocks + for cell in block.cells + if not cell.is_authored + ) + ), + "dropped_targets": dropped_counts, + "dropped_target_details": dropped_target_details, + "margin_consistency_issues": issues, + "derived_complement_rows": [ + { + "margin_id": block.margin_id, + "cell": "|".join(_cell_assignments_from_cell(block, cell)), + 
"target_value": float(cell.target_value), + "source_target_ids": list(cell.source_target_ids), + "derivation_reason": cell.derivation_reason, + } + for block in closed_blocks + for cell in block.cells + if not cell.is_authored + ], + }, + ) + + target_metadata = emit_target_rows(closed_blocks) + retained_authored_target_ids = sorted( + { + int(cell.authored_target_id) + for block in closed_blocks + for cell in block.cells + if cell.is_authored and cell.authored_target_id is not None + } + ) + derived_complement_rows = [ + { + "margin_id": block.margin_id, + "cell": "|".join(_cell_assignments_from_cell(block, cell)), + "target_value": float(cell.target_value), + "source_target_ids": list(cell.source_target_ids), + "derivation_reason": cell.derivation_reason, + } + for block in closed_blocks + for cell in block.cells + if not cell.is_authored + ] + target_metadata.attrs["margin_consistency_issues"] = issues + target_metadata.attrs["retained_authored_target_ids"] = retained_authored_target_ids + target_metadata.attrs["derived_complement_rows"] = derived_complement_rows + target_metadata.attrs["dropped_targets"] = dropped_counts + target_metadata.attrs["dropped_target_details"] = dropped_target_details + target_metadata.attrs["requested_target_count"] = int(len(filtered_targets)) + target_metadata.attrs["retained_authored_target_count"] = int( + len(retained_authored_target_ids) + ) + target_metadata.attrs["derived_complement_count"] = int(len(derived_complement_rows)) + return unit_data, target_metadata + + +def split_target_metadata_by_margin( + target_metadata: pd.DataFrame, +) -> Dict[str, pd.DataFrame]: + """Return one sub-frame per margin_id. + + Kept as a notebook/test helper. The benchmark no longer chains separate + IPF calls across these blocks. 
+ """ + return { + mid: sub.reset_index(drop=True) + for mid, sub in target_metadata.groupby("margin_id", sort=False) + } + + +def _cell_assignments(t: ResolvedTarget, block: MarginBlock) -> List[str]: + assignments: Dict[str, str] = {} + if block.geo_var is not None: + assignments[block.geo_var] = str(t.geo.value) + for col, lbl in t.cell: + assignments[col] = str(lbl) + return [f"{col}={assignments[col]}" for col in block.cell_vars] + + +def _cell_assignments_from_cell(block: ClosedMarginBlock, cell: MarginCell) -> List[str]: + assignments: Dict[str, str] = {} + if block.geo_var is not None: + assignments[block.geo_var] = str(cell.geo_value) + for col, lbl in cell.cell: + assignments[col] = str(lbl) + return [f"{col}={assignments[col]}" for col in block.cell_vars] + + +def emit_target_rows(blocks: List[ClosedMarginBlock]) -> pd.DataFrame: + """Emit one `categorical_margin` row per authored or derived IPF cell.""" + out_rows = [] + for block in blocks: + if not hasattr(block, "cells"): + raise TypeError( + "emit_target_rows expects closed margin blocks. " + "Call close_margins_for_testing(...) or build_ipf_inputs(...) " + "before emitting target rows." 
+ ) + margin_vars_joined = "|".join(block.cell_vars) + for cell in block.cells: + cell_assignments = _cell_assignments_from_cell(block, cell) + out_rows.append( + { + "margin_id": block.margin_id, + "scope": block.scope, + "target_type": "categorical_margin", + "variables": margin_vars_joined, + "cell": "|".join(cell_assignments), + "target_value": float(cell.target_value), + "target_name": cell.target_name, + "source_variable": block.source_variable, + "is_authored": bool(cell.is_authored), + "authored_target_id": ( + int(cell.authored_target_id) + if cell.authored_target_id is not None + else np.nan + ), + "source_target_ids": "|".join( + str(target_id) for target_id in cell.source_target_ids + ), + "closure_status": block.closure_status, + "derivation_reason": cell.derivation_reason, + } + ) + return pd.DataFrame(out_rows) + + +# --------------------------------------------------------------------------- +# Back-compat / diagnostics helpers exported for notebooks and tests +# --------------------------------------------------------------------------- + + +def resolve_targets_for_testing( + targets_df: pd.DataFrame, + stratum_constraints: Dict[int, List[dict]], +) -> Tuple[List[ResolvedTarget], List[Tuple[int, str]]]: + """Pure-Python helper used by the notebook walkthrough and unit tests. + + Resolves each target against the declared bucket schemas without touching + the saved calibration package or the microsimulation dataset. Returns a + `(resolved, unresolved)` pair. 
+ """ + resolved: List[ResolvedTarget] = [] + unresolved: List[Tuple[int, str]] = [] + for _, row in targets_df.reset_index(drop=True).iterrows(): + constraints = stratum_constraints.get(int(row["stratum_id"]), []) + rt = _resolve_target(row, constraints) + if rt is None: + unresolved.append( + (int(row.get("target_id", -1)), str(row.get("target_name", "?"))) + ) + else: + resolved.append(rt) + return resolved, unresolved + + +def assemble_margins_for_testing( + resolved: List[ResolvedTarget], +) -> List[MarginBlock]: + """Expose margin assembly for the notebook walkthrough and tests.""" + return _assemble_margins(resolved) + + +def close_margins_for_testing( + resolved: List[ResolvedTarget], + unit_data: pd.DataFrame, + tolerance: float = 1e-9, +) -> Tuple[List[ClosedMarginBlock], List[Dict[str, object]]]: + """Pure-Python closure helper for unit tests and notebook walkthroughs.""" + blocks = _assemble_margins(resolved) + return _close_margin_blocks(blocks=blocks, unit_data=unit_data, tolerance=tolerance) diff --git a/paper-l0/benchmarking/manifests/greg_demo_small.example.json b/paper-l0/benchmarking/manifests/greg_demo_small.example.json new file mode 100644 index 000000000..01f02696a --- /dev/null +++ b/paper-l0/benchmarking/manifests/greg_demo_small.example.json @@ -0,0 +1,54 @@ +{ + "name": "greg_demo_small", + "tier": "tier_a", + "description": "Example GREG benchmark manifest for a reduced package and a coherent geography subset.", + "package_path": "policyengine_us_data/storage/calibration/calibration_package.pkl", + "methods": [ + "l0", + "greg" + ], + "target_filters": { + "include_geo_levels": [ + "national", + "state", + "district" + ], + "include_national": true, + "state_ids": [ + "06", + "12", + "36", + "48", + "53" + ], + "district_ids": [ + "0601", + "0602", + "1201", + "1202", + "3601", + "3602", + "4801", + "4802", + "5301", + "5302" + ], + "count_like_only": false, + "max_targets": 1000 + }, + "external_inputs": {}, + "method_options": { + 
"l0": { + "lambda_l0": 1e-08, + "epochs": 1000, + "device": "cpu", + "beta": 0.65, + "lambda_l2": 1e-12, + "learning_rate": 0.15 + }, + "greg": { + "maxit": 200, + "epsilon": 1e-07 + } + } +} diff --git a/paper-l0/benchmarking/manifests/ipf_demo_small.example.json b/paper-l0/benchmarking/manifests/ipf_demo_small.example.json new file mode 100644 index 000000000..15939a7d7 --- /dev/null +++ b/paper-l0/benchmarking/manifests/ipf_demo_small.example.json @@ -0,0 +1,42 @@ +{ + "name": "ipf_demo_small", + "tier": "tier_1", + "description": "Example IPF benchmark. Uses the same target_filters path as GREG and L0; the IPF converter internally keeps only count-style targets whose constraints resolve through the declared bucket schemas and form a closed categorical margin system. Binary subset families survive only when authored parent totals exist on the exact reduced key; open subset families are dropped.", + "package_path": "policyengine_us_data/storage/calibration/calibration_package.pkl", + "methods": [ + "l0", + "ipf" + ], + "target_filters": { + "include_geo_levels": [ + "district" + ], + "include_national": false, + "district_ids": [ + "0601", + "0602", + "0603", + "0604", + "0605" + ] + }, + "external_inputs": {}, + "method_options": { + "l0": { + "lambda_l0": 1e-08, + "epochs": 1000, + "device": "cpu", + "beta": 0.65, + "lambda_l2": 1e-12, + "learning_rate": 0.15 + }, + "ipf": { + "max_iter": 5000, + "bound": 10.0, + "epsP": 1e-04, + "epsH": 1e-02, + "household_id_col": "household_id", + "weight_col": "base_weight" + } + } +} diff --git a/paper-l0/benchmarking/notebooks/01_greg_walkthrough.ipynb b/paper-l0/benchmarking/notebooks/01_greg_walkthrough.ipynb new file mode 100644 index 000000000..a01991e80 --- /dev/null +++ b/paper-l0/benchmarking/notebooks/01_greg_walkthrough.ipynb @@ -0,0 +1,1275 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e90d4c4c", + "metadata": {}, + "source": [ + "# GREG walkthrough: linear calibration via R's `survey` package\n", + 
"\n", + "This notebook walks through a single end-to-end GREG calibration run on a ten-household\n", + "synthetic survey. The goal is to show the exact input and output formats the GREG runner\n", + "consumes so that the paper's benchmark scaffold is easy to audit.\n", + "\n", + "GREG (Generalized Regression Estimator) is a classical survey-calibration method. It\n", + "rescales each unit's design weight so that, taken together, the weighted sample exactly\n", + "reproduces a set of known population totals. Its natural input is a linear system:\n", + "\n", + "$$\n", + "X\\, w_{\\text{fitted}} = t\n", + "$$\n", + "\n", + "where `X` is a `(n_targets, n_units)` matrix of per-unit coefficients, `t` is the target\n", + "vector, and `w_{fitted}` is the fitted per-unit weight. GREG finds `w_{fitted}` that hits\n", + "`t` exactly (when feasible) while staying as close as possible to the design weights under\n", + "a chi-squared-style distance." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "33a8b994", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:53.419894Z", + "iopub.status.busy": "2026-04-22T10:29:53.419671Z", + "iopub.status.idle": "2026-04-22T10:29:54.086542Z", + "shell.execute_reply": "2026-04-22T10:29:54.086232Z" + } + }, + "outputs": [], + "source": [ + "import subprocess\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.io import mmwrite\n", + "from scipy.sparse import csr_matrix\n", + "\n", + "from toy_data import (\n", + " HOUSEHOLDS,\n", + " TARGETS,\n", + " build_target_matrix,\n", + " diagnostic_table,\n", + " household_table_with_coefficients,\n", + ")\n", + "\n", + "NOTEBOOK_DIR = Path().resolve()\n", + "GREG_RUNNER = NOTEBOOK_DIR.parent / \"runners\" / \"greg_runner.R\"\n", + "assert GREG_RUNNER.exists(), GREG_RUNNER" + ] + }, + { + "cell_type": "markdown", + "id": "24edf197", + "metadata": {}, + "source": [ + "## 1. 
The toy survey\n", + "\n", + "Ten households split across two districts (`A` and `B`). Each household has a design\n", + "weight of `100`, a household size, an adult/child breakdown, and a household income.\n", + "The design weights, summed, represent 1000 households nationally." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9c97b4d8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.088038Z", + "iopub.status.busy": "2026-04-22T10:29:54.087901Z", + "iopub.status.idle": "2026-04-22T10:29:54.094804Z", + "shell.execute_reply": "2026-04-22T10:29:54.094581Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_iddistrictn_adultsn_childrenincomedesign_weighthh_size
01A2030000100.02
12A2260000100.04
23A1025000100.01
34A2145000100.03
45A2070000100.02
56B2340000100.05
67B1120000100.02
78B2180000100.03
89B3190000100.04
910B1015000100.01
\n", + "
" + ], + "text/plain": [ + " hh_id district n_adults n_children income design_weight hh_size\n", + "0 1 A 2 0 30000 100.0 2\n", + "1 2 A 2 2 60000 100.0 4\n", + "2 3 A 1 0 25000 100.0 1\n", + "3 4 A 2 1 45000 100.0 3\n", + "4 5 A 2 0 70000 100.0 2\n", + "5 6 B 2 3 40000 100.0 5\n", + "6 7 B 1 1 20000 100.0 2\n", + "7 8 B 2 1 80000 100.0 3\n", + "8 9 B 3 1 90000 100.0 4\n", + "9 10 B 1 0 15000 100.0 1" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HOUSEHOLDS" + ] + }, + { + "cell_type": "markdown", + "id": "05de5230", + "metadata": {}, + "source": [ + "## 2. Targets\n", + "\n", + "We want to calibrate to eight administrative totals: four per district covering household\n", + "count, adults, children, and total household income. The baseline column shows where the\n", + "design weights land before calibration." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0426309d", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.095989Z", + "iopub.status.busy": "2026-04-22T10:29:54.095918Z", + "iopub.status.idle": "2026-04-22T10:29:54.106231Z", + "shell.execute_reply": "2026-04-22T10:29:54.106008Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_nametarget_valuebaseline_weighted_total
0district_A_households520.0500.0
1district_A_adults940.0900.0
2district_A_children310.0300.0
3district_A_income23500000.023000000.0
4district_B_households480.0500.0
5district_B_adults1000.0900.0
6district_B_children600.0600.0
7district_B_income25000000.024500000.0
\n", + "
" + ], + "text/plain": [ + " target_name target_value baseline_weighted_total\n", + "0 district_A_households 520.0 500.0\n", + "1 district_A_adults 940.0 900.0\n", + "2 district_A_children 310.0 300.0\n", + "3 district_A_income 23500000.0 23000000.0\n", + "4 district_B_households 480.0 500.0\n", + "5 district_B_adults 1000.0 900.0\n", + "6 district_B_children 600.0 600.0\n", + "7 district_B_income 25000000.0 24500000.0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hh = household_table_with_coefficients()\n", + "base_w = hh[\"design_weight\"].to_numpy(dtype=np.float64)\n", + "X = build_target_matrix(hh, TARGETS)\n", + "\n", + "baseline_df = pd.DataFrame(\n", + " {\n", + " \"target_name\": TARGETS[\"target_name\"],\n", + " \"target_value\": TARGETS[\"value\"].astype(float),\n", + " \"baseline_weighted_total\": X @ base_w,\n", + " }\n", + ")\n", + "baseline_df" + ] + }, + { + "cell_type": "markdown", + "id": "7b3731d6", + "metadata": {}, + "source": [ + "## 3. Input format 1/3 — the `(n_targets, n_units)` matrix `X`\n", + "\n", + "Each row of `X` is one target; each column is one household. Entry `X[t, i]` is \"how\n", + "much household `i` would contribute to target `t` if its fitted weight were `1`\".\n", + "\n", + "In the benchmark scaffold this matrix is held in [Matrix Market](https://math.nist.gov/MatrixMarket/)\n", + "format (`.mtx`) so the R runner can read it sparsely via `Matrix::readMM`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "701a7ffa", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.107415Z", + "iopub.status.busy": "2026-04-22T10:29:54.107338Z", + "iopub.status.idle": "2026-04-22T10:29:54.113255Z", + "shell.execute_reply": "2026-04-22T10:29:54.113049Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_1hh_2hh_3hh_4hh_5hh_6hh_7hh_8hh_9hh_10
target_name
district_A_households1.01.01.01.01.00.00.00.00.00.0
district_A_adults2.02.01.02.02.00.00.00.00.00.0
district_A_children0.02.00.01.00.00.00.00.00.00.0
district_A_income30000.060000.025000.045000.070000.00.00.00.00.00.0
district_B_households0.00.00.00.00.01.01.01.01.01.0
district_B_adults0.00.00.00.00.02.01.02.03.01.0
district_B_children0.00.00.00.00.03.01.01.01.00.0
district_B_income0.00.00.00.00.040000.020000.080000.090000.015000.0
\n", + "
" + ], + "text/plain": [ + " hh_1 hh_2 hh_3 hh_4 hh_5 hh_6 \\\n", + "target_name \n", + "district_A_households 1.0 1.0 1.0 1.0 1.0 0.0 \n", + "district_A_adults 2.0 2.0 1.0 2.0 2.0 0.0 \n", + "district_A_children 0.0 2.0 0.0 1.0 0.0 0.0 \n", + "district_A_income 30000.0 60000.0 25000.0 45000.0 70000.0 0.0 \n", + "district_B_households 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "district_B_adults 0.0 0.0 0.0 0.0 0.0 2.0 \n", + "district_B_children 0.0 0.0 0.0 0.0 0.0 3.0 \n", + "district_B_income 0.0 0.0 0.0 0.0 0.0 40000.0 \n", + "\n", + " hh_7 hh_8 hh_9 hh_10 \n", + "target_name \n", + "district_A_households 0.0 0.0 0.0 0.0 \n", + "district_A_adults 0.0 0.0 0.0 0.0 \n", + "district_A_children 0.0 0.0 0.0 0.0 \n", + "district_A_income 0.0 0.0 0.0 0.0 \n", + "district_B_households 1.0 1.0 1.0 1.0 \n", + "district_B_adults 1.0 2.0 3.0 1.0 \n", + "district_B_children 1.0 1.0 1.0 0.0 \n", + "district_B_income 20000.0 80000.0 90000.0 15000.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_df = pd.DataFrame(\n", + " X,\n", + " index=TARGETS[\"target_name\"],\n", + " columns=[f\"hh_{i}\" for i in HOUSEHOLDS[\"hh_id\"]],\n", + ")\n", + "X_df" + ] + }, + { + "cell_type": "markdown", + "id": "37917e5c", + "metadata": {}, + "source": [ + "## 4. Input format 2/3 — target metadata\n", + "\n", + "A CSV with one row per target. GREG only needs the `value` column; the `target_name`\n", + "column is carried through for diagnostics. In the production benchmark this file also\n", + "carries geography and target-family columns used by the scaffold's filtering logic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5d6d8d89", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.114281Z", + "iopub.status.busy": "2026-04-22T10:29:54.114212Z", + "iopub.status.idle": "2026-04-22T10:29:54.117336Z", + "shell.execute_reply": "2026-04-22T10:29:54.117109Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_namevalue
0district_A_households520.0
1district_A_adults940.0
2district_A_children310.0
3district_A_income23500000.0
4district_B_households480.0
5district_B_adults1000.0
6district_B_children600.0
7district_B_income25000000.0
\n", + "
" + ], + "text/plain": [ + " target_name value\n", + "0 district_A_households 520.0\n", + "1 district_A_adults 940.0\n", + "2 district_A_children 310.0\n", + "3 district_A_income 23500000.0\n", + "4 district_B_households 480.0\n", + "5 district_B_adults 1000.0\n", + "6 district_B_children 600.0\n", + "7 district_B_income 25000000.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_metadata = pd.DataFrame(\n", + " {\n", + " \"target_name\": TARGETS[\"target_name\"],\n", + " \"value\": TARGETS[\"value\"].astype(float),\n", + " }\n", + ")\n", + "target_metadata" + ] + }, + { + "cell_type": "markdown", + "id": "0dd15424", + "metadata": {}, + "source": [ + "## 5. Input format 3/3 — initial weights\n", + "\n", + "A plain `.npy` array with one entry per unit. GREG will scale each entry up or down to\n", + "meet the targets." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d9568fd2", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.118341Z", + "iopub.status.busy": "2026-04-22T10:29:54.118271Z", + "iopub.status.idle": "2026-04-22T10:29:54.120286Z", + "shell.execute_reply": "2026-04-22T10:29:54.120032Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([100., 100., 100., 100., 100., 100., 100., 100., 100., 100.])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "initial_weights = HOUSEHOLDS[\"design_weight\"].to_numpy(dtype=np.float64)\n", + "initial_weights" + ] + }, + { + "cell_type": "markdown", + "id": "9970f7ef", + "metadata": {}, + "source": [ + "## 6. Write the bundle and invoke the R runner\n", + "\n", + "The R runner lives at `paper-l0/benchmarking/runners/greg_runner.R`. It is a thin wrapper\n", + "around `survey::grake` with `cal.linear` (classical GREG). We call it as a subprocess so\n", + "this notebook can be run from Python." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f243a04f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.121387Z", + "iopub.status.busy": "2026-04-22T10:29:54.121318Z", + "iopub.status.idle": "2026-04-22T10:29:54.877325Z", + "shell.execute_reply": "2026-04-22T10:29:54.876992Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "return code: 0\n" + ] + }, + { + "data": { + "text/plain": [ + "['X_targets_by_units.mtx',\n", + " 'fitted_weights.csv',\n", + " 'initial_weights.npy',\n", + " 'target_metadata.csv']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = Path(tempfile.mkdtemp(prefix=\"greg_toy_\"))\n", + "\n", + "mmwrite(str(tmp / \"X_targets_by_units.mtx\"), csr_matrix(X))\n", + "target_metadata.to_csv(tmp / \"target_metadata.csv\", index=False)\n", + "np.save(tmp / \"initial_weights.npy\", initial_weights)\n", + "\n", + "out_csv = tmp / \"fitted_weights.csv\"\n", + "\n", + "cmd = [\n", + " \"Rscript\",\n", + " str(GREG_RUNNER),\n", + " str(tmp / \"X_targets_by_units.mtx\"),\n", + " str(tmp / \"target_metadata.csv\"),\n", + " str(tmp / \"initial_weights.npy\"),\n", + " str(out_csv),\n", + " \"200\", # maxit\n", + " \"1e-7\", # epsilon\n", + "]\n", + "proc = subprocess.run(cmd, capture_output=True, text=True)\n", + "print(\"return code:\", proc.returncode)\n", + "if proc.returncode != 0:\n", + " print(\"STDERR:\\n\", proc.stderr)\n", + "sorted(p.name for p in tmp.iterdir())" + ] + }, + { + "cell_type": "markdown", + "id": "92601777", + "metadata": {}, + "source": [ + "## 7. Output — fitted weights\n", + "\n", + "The runner writes a two-column CSV (`unit_index`, `fitted_weight`). The benchmark CLI\n", + "immediately converts that to a `.npy` at the per-unit level; we do the same here for\n", + "symmetry with the IPF notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "10bddbe0", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.878793Z", + "iopub.status.busy": "2026-04-22T10:29:54.878683Z", + "iopub.status.idle": "2026-04-22T10:29:54.885418Z", + "shell.execute_reply": "2026-04-22T10:29:54.885215Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_iddistrictdesign_weightfitted_weightratio_to_design
01A100.0116.8751.16875
12A100.0100.6251.00625
23A100.0100.0001.00000
34A100.0108.7501.08750
45A100.093.7500.93750
56B100.0120.5001.20500
67B100.046.5000.46500
78B100.0-15.500-0.15500
89B100.0207.5002.07500
910B100.0121.0001.21000
\n", + "
" + ], + "text/plain": [ + " hh_id district design_weight fitted_weight ratio_to_design\n", + "0 1 A 100.0 116.875 1.16875\n", + "1 2 A 100.0 100.625 1.00625\n", + "2 3 A 100.0 100.000 1.00000\n", + "3 4 A 100.0 108.750 1.08750\n", + "4 5 A 100.0 93.750 0.93750\n", + "5 6 B 100.0 120.500 1.20500\n", + "6 7 B 100.0 46.500 0.46500\n", + "7 8 B 100.0 -15.500 -0.15500\n", + "8 9 B 100.0 207.500 2.07500\n", + "9 10 B 100.0 121.000 1.21000" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fitted_df = pd.read_csv(out_csv)\n", + "fitted_weights = fitted_df[\"fitted_weight\"].to_numpy(dtype=np.float64)\n", + "weights_view = pd.DataFrame(\n", + " {\n", + " \"hh_id\": HOUSEHOLDS[\"hh_id\"],\n", + " \"district\": HOUSEHOLDS[\"district\"],\n", + " \"design_weight\": initial_weights,\n", + " \"fitted_weight\": fitted_weights,\n", + " \"ratio_to_design\": fitted_weights / initial_weights,\n", + " }\n", + ")\n", + "weights_view" + ] + }, + { + "cell_type": "markdown", + "id": "2926226c", + "metadata": {}, + "source": [ + "## 8. Did we hit the targets?\n", + "\n", + "Compare the weighted totals under the fitted weights against the declared target values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "22253313", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.886473Z", + "iopub.status.busy": "2026-04-22T10:29:54.886395Z", + "iopub.status.idle": "2026-04-22T10:29:54.931192Z", + "shell.execute_reply": "2026-04-22T10:29:54.930895Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 target_nametarget_valuebaseline_weighted_totalfitted_weighted_totalabs_rel_error
0district_A_households520.0500.0520.03.61e-13
1district_A_adults940.0900.0940.03.23e-13
2district_A_children310.0300.0310.01.26e-13
3district_A_income23,500,000.023,000,000.023,500,000.03.52e-14
4district_B_households480.0500.0480.05.65e-11
5district_B_adults1,000.0900.01,000.02.09e-11
6district_B_children600.0600.0600.02.18e-11
7district_B_income25,000,000.024,500,000.025,000,000.02.07e-11
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diagnostic_table(hh, TARGETS, fitted_weights).style.format(\n", + " {\n", + " \"target_value\": \"{:,.1f}\",\n", + " \"baseline_weighted_total\": \"{:,.1f}\",\n", + " \"fitted_weighted_total\": \"{:,.1f}\",\n", + " \"abs_rel_error\": \"{:.2e}\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0d5fa748", + "metadata": {}, + "source": [ + "## 9. Known pathology — negative weights\n", + "\n", + "With `cal.linear`, GREG imposes no sign constraint on the fitted weights. On a small,\n", + "tightly-constrained problem like this one it can drive individual weights negative. The\n", + "`survey` package supports logit / raking variants to avoid this, but those variants\n", + "introduce their own trade-offs and do not cleanly generalize to the full subnational\n", + "calibration problem. In the paper this is discussed as one of the limitations of GREG\n", + "at production scale." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cd7fb43c", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T10:29:54.932519Z", + "iopub.status.busy": "2026-04-22T10:29:54.932430Z", + "iopub.status.idle": "2026-04-22T10:29:54.937006Z", + "shell.execute_reply": "2026-04-22T10:29:54.936786Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Negative-weight units: 1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_iddistrictdesign_weightfitted_weightratio_to_design
78B100.0-15.5-0.155
\n", + "
" + ], + "text/plain": [ + " hh_id district design_weight fitted_weight ratio_to_design\n", + "7 8 B 100.0 -15.5 -0.155" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_negative = int((fitted_weights < 0).sum())\n", + "print(f\"Negative-weight units: {n_negative}\")\n", + "weights_view[weights_view[\"fitted_weight\"] < 0]" + ] + }, + { + "cell_type": "markdown", + "id": "2ed96e52", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "Inputs to the GREG runner:\n", + "\n", + "| Artifact | Format | Shape |\n", + "| --- | --- | --- |\n", + "| `X_targets_by_units.mtx` | Matrix Market sparse | `(n_targets, n_units)` |\n", + "| `target_metadata.csv` | CSV with a `value` column | `n_targets` rows |\n", + "| `initial_weights.npy` | float64 array | `n_units` |\n", + "\n", + "Output of the GREG runner:\n", + "\n", + "| Artifact | Format | Shape |\n", + "| --- | --- | --- |\n", + "| `fitted_weights.csv` | CSV (`unit_index`, `fitted_weight`) | `n_units` rows |\n", + "\n", + "Next notebook: `02_ipf_walkthrough.ipynb` — same toy data, but through the IPF runner,\n", + "which needs a very different input representation." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe3.13 (3.13.0)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/paper-l0/benchmarking/notebooks/02_ipf_walkthrough.ipynb b/paper-l0/benchmarking/notebooks/02_ipf_walkthrough.ipynb new file mode 100644 index 000000000..4f151d1bc --- /dev/null +++ b/paper-l0/benchmarking/notebooks/02_ipf_walkthrough.ipynb @@ -0,0 +1,2268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "52f383f9", + "metadata": {}, + "source": [ + "# IPF walkthrough: raking via R's `surveysd` package\n", + "\n", + "This notebook is the companion to `01_greg_walkthrough.ipynb`. It runs the same toy\n", + "survey through the IPF runner that sits in `paper-l0/benchmarking/runners/ipf_runner.R`.\n", + "The focus is the input and output formats IPF requires, which look very different from\n", + "GREG's because IPF is structurally not a generic-linear-system method.\n", + "\n", + "IPF (Iterative Proportional Fitting, a.k.a. raking) adjusts unit weights so that the\n", + "weighted *counts* of a set of cross-tabulated categories match known population totals.\n", + "It works on categorical or indicator margins, not on arbitrary linear combinations of\n", + "unit attributes. This shows up as two concrete limitations in our benchmark:\n", + "\n", + "1. Targets must be count-like. Dollar totals such as `district_A_income` cannot be\n", + " calibrated with classical IPF and are dropped from this tier.\n", + "2. The input is a unit-record table with category columns plus a target table that\n", + " lists each margin cell, not a sparse `(n_targets, n_units)` matrix." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "58b0ecc8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:55.818351Z", + "iopub.status.busy": "2026-04-22T15:52:55.817976Z", + "iopub.status.idle": "2026-04-22T15:52:57.713782Z", + "shell.execute_reply": "2026-04-22T15:52:57.712543Z" + } + }, + "outputs": [], + "source": [ + "import subprocess\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from toy_data import (\n", + " HOUSEHOLDS,\n", + " TARGETS,\n", + " diagnostic_table,\n", + " household_table_with_coefficients,\n", + ")\n", + "\n", + "NOTEBOOK_DIR = Path().resolve()\n", + "IPF_RUNNER = NOTEBOOK_DIR.parent / \"runners\" / \"ipf_runner.R\"\n", + "assert IPF_RUNNER.exists(), IPF_RUNNER" + ] + }, + { + "cell_type": "markdown", + "id": "7ba5d741", + "metadata": {}, + "source": [ + "## 1. The toy survey (same as the GREG walkthrough)\n", + "\n", + "Ten households across two districts with adult/child breakdowns and household incomes.\n", + "The design weight is 100 for every household." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "942aa620", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:57.731752Z", + "iopub.status.busy": "2026-04-22T15:52:57.730813Z", + "iopub.status.idle": "2026-04-22T15:52:57.756288Z", + "shell.execute_reply": "2026-04-22T15:52:57.755107Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_iddistrictn_adultsn_childrenincomedesign_weighthh_size
01A2030000100.02
12A2260000100.04
23A1025000100.01
34A2145000100.03
45A2070000100.02
56B2340000100.05
67B1120000100.02
78B2180000100.03
89B3190000100.04
910B1015000100.01
\n", + "
" + ], + "text/plain": [ + " hh_id district n_adults n_children income design_weight hh_size\n", + "0 1 A 2 0 30000 100.0 2\n", + "1 2 A 2 2 60000 100.0 4\n", + "2 3 A 1 0 25000 100.0 1\n", + "3 4 A 2 1 45000 100.0 3\n", + "4 5 A 2 0 70000 100.0 2\n", + "5 6 B 2 3 40000 100.0 5\n", + "6 7 B 1 1 20000 100.0 2\n", + "7 8 B 2 1 80000 100.0 3\n", + "8 9 B 3 1 90000 100.0 4\n", + "9 10 B 1 0 15000 100.0 1" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HOUSEHOLDS" + ] + }, + { + "cell_type": "markdown", + "id": "09a679e5", + "metadata": {}, + "source": [ + "## 2. Target set — IPF-eligible subset\n", + "\n", + "The GREG walkthrough calibrates eight targets, four per district. Of those, only\n", + "**six** are count-style and therefore runnable through classical IPF: adults and children\n", + "in each district, plus a household-level total in each district. The two `_income`\n", + "targets are dollar magnitudes and are dropped here.\n", + "\n", + "For this walkthrough we stay with the person-level count targets only and use them to\n", + "build a single categorical margin over a `district_role` factor. Household counts would\n", + "require a separate household-level margin, which the `ipf_runner.R` supports, but we\n", + "skip them in the demo to keep the narrative concentrated on one margin construction." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7473b3bb", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:57.767375Z", + "iopub.status.busy": "2026-04-22T15:52:57.766686Z", + "iopub.status.idle": "2026-04-22T15:52:57.781093Z", + "shell.execute_reply": "2026-04-22T15:52:57.780513Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_namecell_labeltarget_value
0district_A_adultsA_adult940.0
1district_A_childrenA_child310.0
2district_B_adultsB_adult1000.0
3district_B_childrenB_child600.0
\n", + "
" + ], + "text/plain": [ + " target_name cell_label target_value\n", + "0 district_A_adults A_adult 940.0\n", + "1 district_A_children A_child 310.0\n", + "2 district_B_adults B_adult 1000.0\n", + "3 district_B_children B_child 600.0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipf_target_specs = pd.DataFrame(\n", + " [\n", + " (\"district_A_adults\", \"A_adult\", 940.0),\n", + " (\"district_A_children\", \"A_child\", 310.0),\n", + " (\"district_B_adults\", \"B_adult\", 1000.0),\n", + " (\"district_B_children\", \"B_child\", 600.0),\n", + " ],\n", + " columns=[\"target_name\", \"cell_label\", \"target_value\"],\n", + ")\n", + "ipf_target_specs" + ] + }, + { + "cell_type": "markdown", + "id": "d196b0b2", + "metadata": {}, + "source": [ + "## 3. Input format 1/3 — a unit-record microdata table\n", + "\n", + "IPF consumes one row per *observation* in the dimension of the constraints. Because our\n", + "targets are person-level counts, we expand the ten households into 27 person-level rows.\n", + "Every person in the same household keeps the same `unit_index` and the same\n", + "`base_weight`, so the fitted weights collapse cleanly back to one weight per household\n", + "after the IPF run.\n", + "\n", + "The key column is `district_role`, a categorical that combines district and role. IPF\n", + "will read the target margin against this column." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a38a3df8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:57.785356Z", + "iopub.status.busy": "2026-04-22T15:52:57.785050Z", + "iopub.status.idle": "2026-04-22T15:52:57.816369Z", + "shell.execute_reply": "2026-04-22T15:52:57.815478Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_iddistrictroleunit_indexhousehold_iddistrict_rolebase_weight
01Aadult01A_adult100.0
11Aadult01A_adult100.0
22Aadult12A_adult100.0
32Aadult12A_adult100.0
42Achild12A_child100.0
52Achild12A_child100.0
63Aadult23A_adult100.0
74Aadult34A_adult100.0
84Aadult34A_adult100.0
94Achild34A_child100.0
\n", + "
" + ], + "text/plain": [ + " hh_id district role unit_index household_id district_role base_weight\n", + "0 1 A adult 0 1 A_adult 100.0\n", + "1 1 A adult 0 1 A_adult 100.0\n", + "2 2 A adult 1 2 A_adult 100.0\n", + "3 2 A adult 1 2 A_adult 100.0\n", + "4 2 A child 1 2 A_child 100.0\n", + "5 2 A child 1 2 A_child 100.0\n", + "6 3 A adult 2 3 A_adult 100.0\n", + "7 4 A adult 3 4 A_adult 100.0\n", + "8 4 A adult 3 4 A_adult 100.0\n", + "9 4 A child 3 4 A_child 100.0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rows = []\n", + "for _, r in HOUSEHOLDS.iterrows():\n", + " hid = int(r[\"hh_id\"])\n", + " for _ in range(int(r[\"n_adults\"])):\n", + " rows.append({\"hh_id\": hid, \"district\": r[\"district\"], \"role\": \"adult\"})\n", + " for _ in range(int(r[\"n_children\"])):\n", + " rows.append({\"hh_id\": hid, \"district\": r[\"district\"], \"role\": \"child\"})\n", + "\n", + "persons = pd.DataFrame(rows)\n", + "persons[\"unit_index\"] = persons[\"hh_id\"] - 1 # 0-indexed household clone id\n", + "persons[\"household_id\"] = persons[\"hh_id\"] # surveysd::ipf 'hid' column\n", + "persons[\"district_role\"] = persons[\"district\"] + \"_\" + persons[\"role\"]\n", + "persons[\"base_weight\"] = persons[\"hh_id\"].map(\n", + " dict(zip(HOUSEHOLDS[\"hh_id\"], HOUSEHOLDS[\"design_weight\"]))\n", + ")\n", + "persons.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "d6f1495f", + "metadata": {}, + "source": [ + "The `unit_index` column matters: `ipf_runner.R` uses it to collapse per-person weights\n", + "back to per-unit weights. All persons sharing an `unit_index` must end the IPF run with\n", + "the same fitted weight; the runner enforces this and errors otherwise. (surveysd's\n", + "default `meanHH = TRUE` guarantees it.)" + ] + }, + { + "cell_type": "markdown", + "id": "0d1d21de", + "metadata": {}, + "source": [ + "## 4. 
Input format 2/3 — target metadata\n", + "\n", + "The IPF runner accepts one target encoding: `categorical_margin`. One row per\n", + "authored margin cell; rows sharing a `margin_id` are grouped into a single\n", + "`surveysd::ipf` constraint via `xtabs`.\n", + "\n", + "For this textbook-style demo we write the four person-count targets as four\n", + "`categorical_margin` rows over a single `district_role` margin.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7b01aae8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:57.820513Z", + "iopub.status.busy": "2026-04-22T15:52:57.820253Z", + "iopub.status.idle": "2026-04-22T15:52:57.838627Z", + "shell.execute_reply": "2026-04-22T15:52:57.837691Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
margin_idscopetarget_typevariablescelltarget_valuetarget_name
0margin_rolepersoncategorical_margindistrict_roledistrict_role=A_adult940.0district_A_adults
1margin_rolepersoncategorical_margindistrict_roledistrict_role=A_child310.0district_A_children
2margin_rolepersoncategorical_margindistrict_roledistrict_role=B_adult1000.0district_B_adults
3margin_rolepersoncategorical_margindistrict_roledistrict_role=B_child600.0district_B_children
\n", + "
" + ], + "text/plain": [ + " margin_id scope target_type variables \\\n", + "0 margin_role person categorical_margin district_role \n", + "1 margin_role person categorical_margin district_role \n", + "2 margin_role person categorical_margin district_role \n", + "3 margin_role person categorical_margin district_role \n", + "\n", + " cell target_value target_name \n", + "0 district_role=A_adult 940.0 district_A_adults \n", + "1 district_role=A_child 310.0 district_A_children \n", + "2 district_role=B_adult 1000.0 district_B_adults \n", + "3 district_role=B_child 600.0 district_B_children " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipf_targets = pd.DataFrame(\n", + " {\n", + " \"margin_id\": [\"margin_role\"] * 4,\n", + " \"scope\": [\"person\"] * 4,\n", + " \"target_type\": [\"categorical_margin\"] * 4,\n", + " \"variables\": [\"district_role\"] * 4,\n", + " \"cell\": [f\"district_role={c}\" for c in ipf_target_specs[\"cell_label\"]],\n", + " \"target_value\": ipf_target_specs[\"target_value\"].astype(float),\n", + " \"target_name\": ipf_target_specs[\"target_name\"], # carried for diagnostics only\n", + " }\n", + ")\n", + "ipf_targets" + ] + }, + { + "cell_type": "markdown", + "id": "d5e14a28", + "metadata": {}, + "source": [ + "## 5. Input format 3/3 — initial weights\n", + "\n", + "Same `.npy` format as GREG: one weight per unit. Unit index here is the 0-indexed\n", + "household clone, so the vector has length 10 even though the unit table has 27 rows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b8bbecff", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:57.843331Z", + "iopub.status.busy": "2026-04-22T15:52:57.843045Z", + "iopub.status.idle": "2026-04-22T15:52:57.847716Z", + "shell.execute_reply": "2026-04-22T15:52:57.847077Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([100., 100., 100., 100., 100., 100., 100., 100., 100., 100.])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "initial_weights = HOUSEHOLDS[\"design_weight\"].to_numpy(dtype=np.float64)\n", + "initial_weights" + ] + }, + { + "cell_type": "markdown", + "id": "12676e8e", + "metadata": {}, + "source": [ + "## 6. Write the bundle and invoke the R runner\n", + "\n", + "The IPF runner signature is longer than GREG's because surveysd exposes more convergence\n", + "knobs. The ones that matter for a small problem like this one:\n", + "\n", + "| argument | meaning | our choice |\n", + "| --- | --- | --- |\n", + "| `max_iter` | hard iteration cap | `5000` |\n", + "| `bound` | per-step weight change limit | `10.0` |\n", + "| `epsP` | person-constraint tolerance | `1e-4` |\n", + "| `epsH` | household-constraint tolerance | `1e-2` |\n", + "| `household_id_col` | hid column in the unit table | `household_id` |\n", + "| `weight_col` | base weight column | `base_weight` |" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a05a01ee", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:57.850576Z", + "iopub.status.busy": "2026-04-22T15:52:57.850350Z", + "iopub.status.idle": "2026-04-22T15:52:59.988165Z", + "shell.execute_reply": "2026-04-22T15:52:59.986772Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "return code: 0\n" + ] + }, + { + "data": { + "text/plain": [ + "['fitted_weights.csv',\n", + " 'initial_weights.npy',\n", + " 
'ipf_target_metadata.csv',\n", + " 'unit_metadata.csv']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = Path(tempfile.mkdtemp(prefix=\"ipf_toy_\"))\n", + "\n", + "persons_cols = [\n", + " \"unit_index\",\n", + " \"household_id\",\n", + " \"district\",\n", + " \"role\",\n", + " \"district_role\",\n", + " \"base_weight\",\n", + "]\n", + "persons[persons_cols].to_csv(tmp / \"unit_metadata.csv\", index=False)\n", + "ipf_targets.to_csv(tmp / \"ipf_target_metadata.csv\", index=False)\n", + "np.save(tmp / \"initial_weights.npy\", initial_weights)\n", + "\n", + "out_csv = tmp / \"fitted_weights.csv\"\n", + "cmd = [\n", + " \"Rscript\",\n", + " str(IPF_RUNNER),\n", + " str(tmp / \"unit_metadata.csv\"),\n", + " str(tmp / \"ipf_target_metadata.csv\"),\n", + " str(tmp / \"initial_weights.npy\"),\n", + " str(out_csv),\n", + " \"5000\", # max_iter\n", + " \"10.0\", # bound\n", + " \"1e-4\", # epsP\n", + " \"1e-2\", # epsH\n", + " \"household_id\",\n", + " \"base_weight\",\n", + "]\n", + "proc = subprocess.run(cmd, capture_output=True, text=True)\n", + "print(\"return code:\", proc.returncode)\n", + "if proc.returncode != 0:\n", + " print(\"STDERR:\\n\", proc.stderr)\n", + "sorted(p.name for p in tmp.iterdir())" + ] + }, + { + "cell_type": "markdown", + "id": "5443c219", + "metadata": {}, + "source": [ + "## 7. Output — per-row fitted weights, collapsed to per-unit\n", + "\n", + "The IPF runner writes one row per person in `fitted_weights.csv`, keyed by `unit_index`.\n", + "Because surveysd enforces equal weights within each household (`meanHH = TRUE`), we can\n", + "take the first row per `unit_index` as the per-household fitted weight. This collapse\n", + "step is exactly what `benchmark_cli.py._run_ipf` does in production." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a55772a1", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:52:59.993937Z", + "iopub.status.busy": "2026-04-22T15:52:59.993551Z", + "iopub.status.idle": "2026-04-22T15:53:00.020116Z", + "shell.execute_reply": "2026-04-22T15:53:00.019300Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rows returned by IPF (one per person): 27\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hh_iddistrictdesign_weightfitted_weightratio_to_design
01A100.0105.2386871.052387
12A100.0103.0967261.030967
23A100.0105.2386871.052387
34A100.0103.8065531.038066
45A100.0105.2386871.052387
56B100.090.8067900.908068
67B100.098.0136830.980137
78B100.0111.2108081.112108
89B100.0118.4086741.184087
910B100.0142.6715641.426716
\n", + "
" + ], + "text/plain": [ + " hh_id district design_weight fitted_weight ratio_to_design\n", + "0 1 A 100.0 105.238687 1.052387\n", + "1 2 A 100.0 103.096726 1.030967\n", + "2 3 A 100.0 105.238687 1.052387\n", + "3 4 A 100.0 103.806553 1.038066\n", + "4 5 A 100.0 105.238687 1.052387\n", + "5 6 B 100.0 90.806790 0.908068\n", + "6 7 B 100.0 98.013683 0.980137\n", + "7 8 B 100.0 111.210808 1.112108\n", + "8 9 B 100.0 118.408674 1.184087\n", + "9 10 B 100.0 142.671564 1.426716" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw = pd.read_csv(out_csv)\n", + "raw[\"unit_index\"] = raw[\"unit_index\"].astype(int)\n", + "\n", + "print(\"rows returned by IPF (one per person):\", len(raw))\n", + "per_unit = (\n", + " raw.groupby(\"unit_index\", sort=True)[\"fitted_weight\"]\n", + " .first()\n", + " .reindex(np.arange(len(HOUSEHOLDS), dtype=np.int64))\n", + ")\n", + "fitted_weights = per_unit.to_numpy(dtype=np.float64)\n", + "\n", + "weights_view = pd.DataFrame(\n", + " {\n", + " \"hh_id\": HOUSEHOLDS[\"hh_id\"],\n", + " \"district\": HOUSEHOLDS[\"district\"],\n", + " \"design_weight\": initial_weights,\n", + " \"fitted_weight\": fitted_weights,\n", + " \"ratio_to_design\": fitted_weights / initial_weights,\n", + " }\n", + ")\n", + "weights_view" + ] + }, + { + "cell_type": "markdown", + "id": "50394ac8", + "metadata": {}, + "source": [ + "## 8. Did we hit the person-count targets?\n", + "\n", + "We diagnose only the four person-level count targets IPF was actually asked to fit.\n", + "The household-count and income targets that GREG received are outside IPF's natural\n", + "scope and are excluded from this diagnostic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "79f26958", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:00.022698Z", + "iopub.status.busy": "2026-04-22T15:53:00.022525Z", + "iopub.status.idle": "2026-04-22T15:53:00.131161Z", + "shell.execute_reply": "2026-04-22T15:53:00.130561Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 target_nametarget_valuebaseline_weighted_totalfitted_weighted_totalabs_rel_error
0district_A_adults940.0900.0940.06.07e-09
1district_A_children310.0300.0310.01.84e-08
2district_B_adults1,000.0900.0999.95.35e-05
3district_B_children600.0600.0600.18.92e-05
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ipf_scored_targets = TARGETS[\n", + " TARGETS[\"target_name\"].isin(ipf_target_specs[\"target_name\"])\n", + "].reset_index(drop=True)\n", + "diagnostic_table(\n", + " household_table_with_coefficients(),\n", + " ipf_scored_targets,\n", + " fitted_weights,\n", + ").style.format(\n", + " {\n", + " \"target_value\": \"{:,.1f}\",\n", + " \"baseline_weighted_total\": \"{:,.1f}\",\n", + " \"fitted_weighted_total\": \"{:,.1f}\",\n", + " \"abs_rel_error\": \"{:.2e}\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9560723f", + "metadata": {}, + "source": [ + "## 9. Known pathology — targets that IPF cannot accept\n", + "\n", + "What classical IPF *cannot* do on this same toy survey:\n", + "\n", + "- **`district_A_income` and `district_B_income`** — these are dollar totals, not cell\n", + " counts. surveysd's raking expects margin structure, not arbitrary linear-combination\n", + " targets. We drop them in this walkthrough.\n", + "- **Targets over unit-level attributes that cross household/person boundaries** — mixing\n", + " these with person-level count margins needs care because `meanHH = TRUE` is what lets\n", + " us collapse person weights back to one per household.\n", + "\n", + "These are the core reasons the benchmark plan treats IPF as a *reference method on\n", + "count-style target subsets*, not a direct competitor to GREG or L0 on the full mixed\n", + "target set." + ] + }, + { + "cell_type": "markdown", + "id": "0ba12597", + "metadata": {}, + "source": [ + "## 10. How the benchmark scaffold produces these inputs from raw target constraints\n", + "\n", + "Sections 3-5 hand-built the unit table, the `district_role` category, and the margin rows directly. 
In production, those artifacts come out of `paper-l0/benchmarking/ipf_conversion.py`, which takes the **same `filtered_targets` DataFrame** that the GREG and L0 runners see — the manifest's `target_filters` controls target selection for all three methods — and then applies three checks before anything reaches `surveysd::ipf`:\n", + "\n", + "1. **Count check.** Keep only targets whose `variable` is a supported count (`person_count`, `household_count`). Dollar-total targets and tax-unit / SPM-unit counts stay in the shared sparse matrix for GREG and L0 but are dropped here.\n", + "2. **Resolver check.** For each surviving target, map its stratum constraints to a categorical cell label using declared bucket schemas:\n", + "\n", + " | variable | schema | cell labels |\n", + " | --- | --- | --- |\n", + " | `age` | 18 five-year buckets | `0-4`, `5-9`, …, `85+` |\n", + " | `adjusted_gross_income` (district) | 9 brackets | `(-inf,1)`, `[1,10k)`, …, `[500k,inf)` |\n", + " | `adjusted_gross_income` (state) | 10 brackets | extends district up to `[1M,inf)` |\n", + " | `eitc_child_count` | 4 discrete | `0`, `1`, `2`, `>2` |\n", + " | any dollar var used with `> 0` | binary | `positive`, `non_positive` |\n", + " | everything else with `==` | raw equality | the value itself |\n", + "\n", + " Targets whose constraints don't map to any declared schema are dropped.\n", + "3. **Closure check.** Resolved targets are grouped into margin blocks and kept only if they form a closed categorical system. Full partitions stay in. Binary subset families stay in only when an authored parent total exists on the exact reduced key so the complement can be derived exactly. Open subset families are dropped rather than emitted as 1-cell margins.\n", + "\n", + "The next cells walk through that exact sequence on a tiny illustrative slice.\n" + ] + }, + { + "cell_type": "markdown", + "id": "a31399a7", + "metadata": {}, + "source": [ + "### 10a. 
Input: a slice of `filtered_targets`\n", + "\n", + "The converter takes exactly the same DataFrame the GREG and L0 paths take. Below is a tiny illustrative slice including a mix of count-style and non-count targets so the count check is visible. Four person-count targets span age buckets in two districts. Two household-count targets ask for `snap > 0` in each district, but **do not** include household totals; under the corrected IPF contract those targets illustrate the fail-closed behavior and will be reported as non-runnable subset margins. The remaining two rows are non-count targets that the count check will drop.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2df63fc2", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:00.134922Z", + "iopub.status.busy": "2026-04-22T15:53:00.134679Z", + "iopub.status.idle": "2026-04-22T15:53:00.146923Z", + "shell.execute_reply": "2026-04-22T15:53:00.146243Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_idstratum_idvariablevaluetarget_name
011person_count210.0pc_age_0-4_d601
122person_count205.0pc_age_5-9_d601
233person_count200.0pc_age_0-4_d602
344person_count198.0pc_age_5-9_d602
455household_count80.0hh_snap_pos_d601
566household_count70.0hh_snap_pos_d602
675adjusted_gross_income12000000.0agi_total_d601
786tax_unit_count200.0tax_units_d601
\n", + "
" + ], + "text/plain": [ + " target_id stratum_id variable value target_name\n", + "0 1 1 person_count 210.0 pc_age_0-4_d601\n", + "1 2 2 person_count 205.0 pc_age_5-9_d601\n", + "2 3 3 person_count 200.0 pc_age_0-4_d602\n", + "3 4 4 person_count 198.0 pc_age_5-9_d602\n", + "4 5 5 household_count 80.0 hh_snap_pos_d601\n", + "5 6 6 household_count 70.0 hh_snap_pos_d602\n", + "6 7 5 adjusted_gross_income 12000000.0 agi_total_d601\n", + "7 8 6 tax_unit_count 200.0 tax_units_d601" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Mirror of what benchmark_manifest.filter_targets would produce for a small scope.\n", + "filtered_targets = pd.DataFrame(\n", + " [\n", + " {\n", + " \"target_id\": 1,\n", + " \"stratum_id\": 1,\n", + " \"variable\": \"person_count\",\n", + " \"value\": 210.0,\n", + " \"target_name\": \"pc_age_0-4_d601\",\n", + " },\n", + " {\n", + " \"target_id\": 2,\n", + " \"stratum_id\": 2,\n", + " \"variable\": \"person_count\",\n", + " \"value\": 205.0,\n", + " \"target_name\": \"pc_age_5-9_d601\",\n", + " },\n", + " {\n", + " \"target_id\": 3,\n", + " \"stratum_id\": 3,\n", + " \"variable\": \"person_count\",\n", + " \"value\": 200.0,\n", + " \"target_name\": \"pc_age_0-4_d602\",\n", + " },\n", + " {\n", + " \"target_id\": 4,\n", + " \"stratum_id\": 4,\n", + " \"variable\": \"person_count\",\n", + " \"value\": 198.0,\n", + " \"target_name\": \"pc_age_5-9_d602\",\n", + " },\n", + " {\n", + " \"target_id\": 5,\n", + " \"stratum_id\": 5,\n", + " \"variable\": \"household_count\",\n", + " \"value\": 80.0,\n", + " \"target_name\": \"hh_snap_pos_d601\",\n", + " },\n", + " {\n", + " \"target_id\": 6,\n", + " \"stratum_id\": 6,\n", + " \"variable\": \"household_count\",\n", + " \"value\": 70.0,\n", + " \"target_name\": \"hh_snap_pos_d602\",\n", + " },\n", + " # Non-count rows — dropped by the count check:\n", + " {\n", + " \"target_id\": 7,\n", + " \"stratum_id\": 5,\n", + " \"variable\": 
\"adjusted_gross_income\",\n", + " \"value\": 1.2e7,\n", + " \"target_name\": \"agi_total_d601\",\n", + " },\n", + " {\n", + " \"target_id\": 8,\n", + " \"stratum_id\": 6,\n", + " \"variable\": \"tax_unit_count\",\n", + " \"value\": 200.0,\n", + " \"target_name\": \"tax_units_d601\",\n", + " },\n", + " ]\n", + ")\n", + "filtered_targets" + ] + }, + { + "cell_type": "markdown", + "id": "d3b17f56", + "metadata": {}, + "source": [ + "Here are the stratum-constraint records for each of the six count-style targets (the remaining two rows have no bearing on the walkthrough because they'll be filtered out before resolution runs)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "79cc71b9", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:00.150211Z", + "iopub.status.busy": "2026-04-22T15:53:00.149982Z", + "iopub.status.idle": "2026-04-22T15:53:00.163106Z", + "shell.execute_reply": "2026-04-22T15:53:00.162477Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stratum_idraw_constraints
01congressional_district_geoid==601; age>-1; age<5
12congressional_district_geoid==601; age>4; age<10
23congressional_district_geoid==602; age>-1; age<5
34congressional_district_geoid==602; age>4; age<10
45congressional_district_geoid==601; snap>0
56congressional_district_geoid==602; snap>0
\n", + "
" + ], + "text/plain": [ + " stratum_id raw_constraints\n", + "0 1 congressional_district_geoid==601; age>-1; age<5\n", + "1 2 congressional_district_geoid==601; age>4; age<10\n", + "2 3 congressional_district_geoid==602; age>-1; age<5\n", + "3 4 congressional_district_geoid==602; age>4; age<10\n", + "4 5 congressional_district_geoid==601; snap>0\n", + "5 6 congressional_district_geoid==602; snap>0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stratum_constraints = {\n", + " 1: [\n", + " {\"variable\": \"congressional_district_geoid\", \"operation\": \"==\", \"value\": \"601\"},\n", + " {\"variable\": \"age\", \"operation\": \">\", \"value\": \"-1\"},\n", + " {\"variable\": \"age\", \"operation\": \"<\", \"value\": \"5\"},\n", + " ],\n", + " 2: [\n", + " {\"variable\": \"congressional_district_geoid\", \"operation\": \"==\", \"value\": \"601\"},\n", + " {\"variable\": \"age\", \"operation\": \">\", \"value\": \"4\"},\n", + " {\"variable\": \"age\", \"operation\": \"<\", \"value\": \"10\"},\n", + " ],\n", + " 3: [\n", + " {\"variable\": \"congressional_district_geoid\", \"operation\": \"==\", \"value\": \"602\"},\n", + " {\"variable\": \"age\", \"operation\": \">\", \"value\": \"-1\"},\n", + " {\"variable\": \"age\", \"operation\": \"<\", \"value\": \"5\"},\n", + " ],\n", + " 4: [\n", + " {\"variable\": \"congressional_district_geoid\", \"operation\": \"==\", \"value\": \"602\"},\n", + " {\"variable\": \"age\", \"operation\": \">\", \"value\": \"4\"},\n", + " {\"variable\": \"age\", \"operation\": \"<\", \"value\": \"10\"},\n", + " ],\n", + " 5: [\n", + " {\"variable\": \"congressional_district_geoid\", \"operation\": \"==\", \"value\": \"601\"},\n", + " {\"variable\": \"snap\", \"operation\": \">\", \"value\": \"0\"},\n", + " ],\n", + " 6: [\n", + " {\"variable\": \"congressional_district_geoid\", \"operation\": \"==\", \"value\": \"602\"},\n", + " {\"variable\": \"snap\", \"operation\": \">\", 
\"value\": \"0\"},\n", + " ],\n", + "}\n", + "pd.DataFrame(\n", + " [\n", + " {\n", + " \"stratum_id\": sid,\n", + " \"raw_constraints\": \"; \".join(\n", + " f\"{c['variable']}{c['operation']}{c['value']}\" for c in cs\n", + " ),\n", + " }\n", + " for sid, cs in stratum_constraints.items()\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f38a89a6", + "metadata": {}, + "source": [ + "### 10b. The count check and the resolver check\n", + "\n", + "The converter keeps only `person_count` and `household_count` targets, then walks each surviving target, groups its constraints by variable, and matches them against the declared schemas. The result is one `ResolvedTarget` per surviving target, with a `cell` tuple listing the `(derived_column, cell_label)` pairs that identify the cell." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0180aac4", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:00.166735Z", + "iopub.status.busy": "2026-04-22T15:53:00.166460Z", + "iopub.status.idle": "2026-04-22T15:53:31.531154Z", + "shell.execute_reply": "2026-04-22T15:53:31.529991Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "count check: kept 6 / 8 targets; dropped 2: ['agi_total_d601', 'tax_units_d601']\n", + "resolver check: kept 6, dropped 0\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_namegeoderived_cellscopetarget_value
0pc_age_0-4_d601congressional_district_geoid=601age_bracket=0-4person210.0
1pc_age_5-9_d601congressional_district_geoid=601age_bracket=5-9person205.0
2pc_age_0-4_d602congressional_district_geoid=602age_bracket=0-4person200.0
3pc_age_5-9_d602congressional_district_geoid=602age_bracket=5-9person198.0
4hh_snap_pos_d601congressional_district_geoid=601snap_positive=positivehousehold80.0
5hh_snap_pos_d602congressional_district_geoid=602snap_positive=positivehousehold70.0
\n", + "
" + ], + "text/plain": [ + " target_name geo derived_cell \\\n", + "0 pc_age_0-4_d601 congressional_district_geoid=601 age_bracket=0-4 \n", + "1 pc_age_5-9_d601 congressional_district_geoid=601 age_bracket=5-9 \n", + "2 pc_age_0-4_d602 congressional_district_geoid=602 age_bracket=0-4 \n", + "3 pc_age_5-9_d602 congressional_district_geoid=602 age_bracket=5-9 \n", + "4 hh_snap_pos_d601 congressional_district_geoid=601 snap_positive=positive \n", + "5 hh_snap_pos_d602 congressional_district_geoid=602 snap_positive=positive \n", + "\n", + " scope target_value \n", + "0 person 210.0 \n", + "1 person 205.0 \n", + "2 person 200.0 \n", + "3 person 198.0 \n", + "4 household 80.0 \n", + "5 household 70.0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import sys\n", + "\n", + "sys.path.insert(0, str(NOTEBOOK_DIR.parent))\n", + "from ipf_conversion import (\n", + " resolve_targets_for_testing,\n", + " assemble_margins_for_testing,\n", + " close_margins_for_testing,\n", + " check_margin_consistency,\n", + " emit_target_rows,\n", + " _SCOPE_BY_VARIABLE,\n", + ")\n", + "\n", + "# 1. Count check: keep only supported count-style target variables.\n", + "supported = set(_SCOPE_BY_VARIABLE.keys())\n", + "count_style = filtered_targets[\n", + " filtered_targets[\"variable\"].isin(supported)\n", + "].reset_index(drop=True)\n", + "dropped_non_count = filtered_targets[~filtered_targets[\"variable\"].isin(supported)]\n", + "print(\n", + " f\"count check: kept {len(count_style)} / {len(filtered_targets)} targets; \"\n", + " f\"dropped {len(dropped_non_count)}: {dropped_non_count['target_name'].tolist()}\"\n", + ")\n", + "\n", + "# 2. 
Resolver check: drop anything whose stratum constraints don't map to a declared cell.\n", + "resolved, unresolved = resolve_targets_for_testing(count_style, stratum_constraints)\n", + "print(f\"resolver check: kept {len(resolved)}, dropped {len(unresolved)}\")\n", + "\n", + "pd.DataFrame(\n", + " [\n", + " {\n", + " \"target_name\": r.target_name,\n", + " \"geo\": f\"{r.geo.variable}={r.geo.value}\",\n", + " \"derived_cell\": \" & \".join(f\"{c}={lbl}\" for c, lbl in r.cell),\n", + " \"scope\": r.scope,\n", + " \"target_value\": r.target_value,\n", + " }\n", + " for r in resolved\n", + " ]\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "7aba403d", + "metadata": {}, + "source": [ + "Note the two distinct derived columns that got produced:\n", + "\n", + "- `age_bracket` (from the `age > X, age < Y` range constraints) → labels `0-4`, `5-9`.\n", + "- `snap_positive` (from the `snap > 0` single-threshold constraint) → label `positive`.\n", + "\n", + "The congressional-district value passes through unchanged because geography variables\n", + "are already categorical." + ] + }, + { + "cell_type": "markdown", + "id": "28742af9", + "metadata": {}, + "source": [ + "### 10c. Group resolved targets into margin blocks" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e7ea3547", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:31.564079Z", + "iopub.status.busy": "2026-04-22T15:53:31.563724Z", + "iopub.status.idle": "2026-04-22T15:53:31.601885Z", + "shell.execute_reply": "2026-04-22T15:53:31.600362Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
margin_idscopecell_varsn_targetsn_geosn_cells
0margin_0000personage_bracket | congressional_district_geoid422
1margin_0001householdcongressional_district_geoid | snap_positive221
\n", + "
" + ], + "text/plain": [ + " margin_id scope cell_vars \\\n", + "0 margin_0000 person age_bracket | congressional_district_geoid \n", + "1 margin_0001 household congressional_district_geoid | snap_positive \n", + "\n", + " n_targets n_geos n_cells \n", + "0 4 2 2 \n", + "1 2 2 1 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "blocks = assemble_margins_for_testing(resolved)\n", + "blocks_view = pd.DataFrame(\n", + " [\n", + " {\n", + " \"margin_id\": b.margin_id,\n", + " \"scope\": b.scope,\n", + " \"cell_vars\": \" | \".join(b.cell_vars),\n", + " \"n_targets\": len(b.targets),\n", + " \"n_geos\": len({t.geo.value for t in b.targets}),\n", + " \"n_cells\": len({t.cell for t in b.targets}),\n", + " }\n", + " for b in blocks\n", + " ]\n", + ")\n", + "blocks_view" + ] + }, + { + "cell_type": "markdown", + "id": "e4d86166", + "metadata": {}, + "source": [ + "### 10d. Close only supported margin systems\n", + "\n", + "The age-bracket block is already a complete 2×2 partition (two age buckets per district), so it is IPF-runnable as-is.\n", + "\n", + "The `snap > 0` block is different: it is only a subset margin and this toy slice does **not** include authored household totals by district. Under the corrected implementation that means the block is not runnable as classical IPF and is dropped rather than emitted as an open 1-cell margin.\n", + "\n", + "The next cell builds a tiny resolved unit table and runs the same closure helper used by the converter tests.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9ba8c9cc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:31.627386Z", + "iopub.status.busy": "2026-04-22T15:53:31.627037Z", + "iopub.status.idle": "2026-04-22T15:53:31.797958Z", + "shell.execute_reply": "2026-04-22T15:53:31.795446Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
margin_idscopestatusn_emitted_cells
0margin_0000personfull_partition4
1margin_0001NaNdropped::unsupported_partial_margin0
\n", + "
" + ], + "text/plain": [ + " margin_id scope status n_emitted_cells\n", + "0 margin_0000 person full_partition 4\n", + "1 margin_0001 NaN dropped::unsupported_partial_margin 0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "demo_unit_data = pd.DataFrame(\n", + " [\n", + " {\"unit_index\": 0, \"household_id\": 0, \"congressional_district_geoid\": 601, \"age_bracket\": \"0-4\", \"snap_positive\": \"positive\"},\n", + " {\"unit_index\": 0, \"household_id\": 0, \"congressional_district_geoid\": 601, \"age_bracket\": \"5-9\", \"snap_positive\": \"positive\"},\n", + " {\"unit_index\": 1, \"household_id\": 1, \"congressional_district_geoid\": 601, \"age_bracket\": \"0-4\", \"snap_positive\": \"non_positive\"},\n", + " {\"unit_index\": 1, \"household_id\": 1, \"congressional_district_geoid\": 601, \"age_bracket\": \"5-9\", \"snap_positive\": \"non_positive\"},\n", + " {\"unit_index\": 2, \"household_id\": 2, \"congressional_district_geoid\": 602, \"age_bracket\": \"0-4\", \"snap_positive\": \"positive\"},\n", + " {\"unit_index\": 2, \"household_id\": 2, \"congressional_district_geoid\": 602, \"age_bracket\": \"5-9\", \"snap_positive\": \"positive\"},\n", + " {\"unit_index\": 3, \"household_id\": 3, \"congressional_district_geoid\": 602, \"age_bracket\": \"0-4\", \"snap_positive\": \"non_positive\"},\n", + " {\"unit_index\": 3, \"household_id\": 3, \"congressional_district_geoid\": 602, \"age_bracket\": \"5-9\", \"snap_positive\": \"non_positive\"},\n", + " ]\n", + ")\n", + "\n", + "closed_blocks, dropped = close_margins_for_testing(resolved, demo_unit_data)\n", + "\n", + "pd.DataFrame(\n", + " [\n", + " {\n", + " \"margin_id\": b.margin_id,\n", + " \"scope\": b.scope,\n", + " \"status\": b.closure_status,\n", + " \"n_emitted_cells\": len(b.cells),\n", + " }\n", + " for b in closed_blocks\n", + " ]\n", + " + [\n", + " {\n", + " \"margin_id\": d[\"margin_id\"],\n", + " \"scope\": None,\n", + " \"status\": 
f\"dropped::{d['reason']}\",\n", + " \"n_emitted_cells\": 0,\n", + " }\n", + " for d in dropped\n", + " ]\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "0d203ac8", + "metadata": {}, + "source": [ + "### 10e. The emitted `ipf_target_metadata.csv` rows\n", + "\n", + "Only the retained closed blocks are emitted. If a subset family had an authored parent total, any exactly-derived complement cells would appear here alongside the authored cells. In this toy slice the open `snap > 0` household block is dropped, so only the person-scope age block survives to the emitted CSV rows.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4d20c2a3", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:31.804688Z", + "iopub.status.busy": "2026-04-22T15:53:31.804344Z", + "iopub.status.idle": "2026-04-22T15:53:31.861703Z", + "shell.execute_reply": "2026-04-22T15:53:31.861165Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
margin_idscopetarget_typevariablescelltarget_valuetarget_namesource_variableis_authoredauthored_target_idsource_target_idsclosure_statusderivation_reason
0margin_0000personcategorical_marginage_bracket|congressional_district_geoidage_bracket=0-4|congressional_district_geoid=601210.0pc_age_0-4_d601person_countTrue11full_partitionNone
1margin_0000personcategorical_marginage_bracket|congressional_district_geoidage_bracket=5-9|congressional_district_geoid=601205.0pc_age_5-9_d601person_countTrue22full_partitionNone
2margin_0000personcategorical_marginage_bracket|congressional_district_geoidage_bracket=0-4|congressional_district_geoid=602200.0pc_age_0-4_d602person_countTrue33full_partitionNone
3margin_0000personcategorical_marginage_bracket|congressional_district_geoidage_bracket=5-9|congressional_district_geoid=602198.0pc_age_5-9_d602person_countTrue44full_partitionNone
\n", + "
" + ], + "text/plain": [ + " margin_id scope target_type \\\n", + "0 margin_0000 person categorical_margin \n", + "1 margin_0000 person categorical_margin \n", + "2 margin_0000 person categorical_margin \n", + "3 margin_0000 person categorical_margin \n", + "\n", + " variables \\\n", + "0 age_bracket|congressional_district_geoid \n", + "1 age_bracket|congressional_district_geoid \n", + "2 age_bracket|congressional_district_geoid \n", + "3 age_bracket|congressional_district_geoid \n", + "\n", + " cell target_value \\\n", + "0 age_bracket=0-4|congressional_district_geoid=601 210.0 \n", + "1 age_bracket=5-9|congressional_district_geoid=601 205.0 \n", + "2 age_bracket=0-4|congressional_district_geoid=602 200.0 \n", + "3 age_bracket=5-9|congressional_district_geoid=602 198.0 \n", + "\n", + " target_name source_variable is_authored authored_target_id \\\n", + "0 pc_age_0-4_d601 person_count True 1 \n", + "1 pc_age_5-9_d601 person_count True 2 \n", + "2 pc_age_0-4_d602 person_count True 3 \n", + "3 pc_age_5-9_d602 person_count True 4 \n", + "\n", + " source_target_ids closure_status derivation_reason \n", + "0 1 full_partition None \n", + "1 2 full_partition None \n", + "2 3 full_partition None \n", + "3 4 full_partition None " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emitted = emit_target_rows(closed_blocks)\n", + "emitted\n" + ] + }, + { + "cell_type": "markdown", + "id": "4c68c47b", + "metadata": {}, + "source": [ + "### 10f. Coherence check on retained blocks\n", + "\n", + "`surveysd::ipf` still requires retained margins of the same scope and geography to imply compatible totals. 
The check below now runs on the **closed retained blocks**, not on the pre-validation raw blocks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e4eaa3de", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:31.867738Z", + "iopub.status.busy": "2026-04-22T15:53:31.867216Z", + "iopub.status.idle": "2026-04-22T15:53:31.910411Z", + "shell.execute_reply": "2026-04-22T15:53:31.909443Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 mismatched scope/geography combinations\n" + ] + } + ], + "source": [ + "issues = check_margin_consistency(closed_blocks)\n", + "print(f\"{len(issues)} mismatched scope/geography combinations\")\n", + "for iss in issues:\n", + " totals = \", \".join(f\"{m}={v:,.0f}\" for m, v in iss[\"margin_totals\"].items())\n", + " print(\n", + " f\" scope={iss['scope']} geo={iss['geo_value']} spread={iss['relative_spread']:.2%} {totals}\"\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "id": "f8f9633c", + "metadata": {}, + "source": [ + "For a concrete example of a *real* mismatch, the `age × district` block and the `agi × district × is_filer` block in the full calibration package are both person-scope and share district geographies, but they represent different populations (all persons vs. persons in filer tax units) and therefore imply different totals per district. Under the corrected benchmark scaffold that does **not** trigger sequential external IPF calls anymore. 
Instead, the export step fails the combined IPF problem and records the mismatch in diagnostics so the run can be narrowed to a coherent closed system.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "0af2c9f7", + "metadata": { + "execution": { + "iopub.execute_input": "2026-04-22T15:53:31.928994Z", + "iopub.status.busy": "2026-04-22T15:53:31.928008Z", + "iopub.status.idle": "2026-04-22T15:53:31.964082Z", + "shell.execute_reply": "2026-04-22T15:53:31.962566Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Retained target rows per closed margin:\n", + " margin_0000: 4 rows, per-geo totals: congressional_district_geoid=601->415, congressional_district_geoid=602->398\n" + ] + } + ], + "source": [ + "print(\"Retained target rows per closed margin:\")\n", + "for margin_id, sub in emitted.groupby(\"margin_id\", sort=False):\n", + " totals_by_geo = {}\n", + " for _, r in sub.iterrows():\n", + " geo_parts = [\n", + " a\n", + " for a in r[\"cell\"].split(\"|\")\n", + " if a.startswith(\"congressional_district_geoid=\")\n", + " ]\n", + " geo_part = geo_parts[0] if geo_parts else \"national\"\n", + " totals_by_geo[geo_part] = totals_by_geo.get(geo_part, 0.0) + r[\"target_value\"]\n", + " totals_str = \", \".join(f\"{g}->{v:,.0f}\" for g, v in sorted(totals_by_geo.items()))\n", + " print(f\" {margin_id}: {len(sub)} rows, per-geo totals: {totals_str}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "438b3847", + "metadata": {}, + "source": [ + "### 10g. How the same path runs against the real calibration package\n", + "\n", + "In production, `benchmark_export.build_ipf_inputs` receives the `filtered_targets` slice computed by `benchmark_manifest.filter_targets` from the manifest's `target_filters`. 
The count check keeps only supported count targets, the resolver check keeps only rows that map cleanly to declared categorical cells, and the closure step keeps only fully closed categorical systems or binary subsets with exact authored parent totals.\n", + "\n", + "The exported diagnostics distinguish between:\n", + "\n", + "- requested targets\n", + "- retained authored IPF targets\n", + "- derived complement rows used only to close a runnable IPF problem\n", + "- requested but non-runnable IPF targets, with reasons such as `missing_parent_total`, `unsupported_partial_margin`, or `non_invariant_household_constraint_variable`\n", + "\n", + "That is the contract the benchmark now uses before calling `surveysd::ipf` once on the retained closed system.\n" + ] + }, + { + "cell_type": "markdown", + "id": "efdf0275", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "Inputs to the IPF runner:\n", + "\n", + "| Artifact | Format | Shape |\n", + "| --- | --- | --- |\n", + "| `unit_metadata.csv` | person-level CSV with category columns + `unit_index`, `household_id`, `base_weight` | `n_persons` rows |\n", + "| `ipf_target_metadata.csv` | CSV with one row per **retained closed margin cell** (`categorical_margin` only) | `n_cells` rows |\n", + "| `initial_weights.npy` | float64 array | `n_units` |\n", + "\n", + "Output of the IPF runner:\n", + "\n", + "| Artifact | Format | Shape |\n", + "| --- | --- | --- |\n", + "| `fitted_weights.csv` | CSV (`unit_index`, `fitted_weight`) | `n_persons` rows, collapsible to `n_units` |\n", + "\n", + "Comparison to the GREG walkthrough:\n", + "\n", + "| Aspect | GREG | IPF |\n", + "| --- | --- | --- |\n", + "| Primary input | sparse `(n_targets, n_units)` matrix | unit-record table with categorical columns |\n", + "| Target vocabulary | arbitrary linear functionals | closed categorical margin totals |\n", + "| Handles dollar-total targets? | yes | no (without an ad-hoc encoding) |\n", + "| Negative weights? 
| possible with `cal.linear` | no (multiplicative updates) |\n", + "| Collapse step needed? | no, weights already per-unit | yes, per-person weights → per-unit |\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe3.13 (3.13.0)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/paper-l0/benchmarking/notebooks/toy_data.py b/paper-l0/benchmarking/notebooks/toy_data.py new file mode 100644 index 000000000..80de561bb --- /dev/null +++ b/paper-l0/benchmarking/notebooks/toy_data.py @@ -0,0 +1,114 @@ +"""Synthetic toy household survey used by the GREG and IPF walkthrough notebooks. + +The dataset is ten households distributed across two districts. Each household +has a design weight of 100, a household size, an adult/child breakdown, and a +household income. The targets defined below are internally consistent so any +reasonable calibration run should hit them to within a small tolerance. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd + + +HOUSEHOLDS = pd.DataFrame( + { + "hh_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "district": ["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"], + "n_adults": [2, 2, 1, 2, 2, 2, 1, 2, 3, 1], + "n_children": [0, 2, 0, 1, 0, 3, 1, 1, 1, 0], + "income": [ + 30_000, + 60_000, + 25_000, + 45_000, + 70_000, + 40_000, + 20_000, + 80_000, + 90_000, + 15_000, + ], + "design_weight": [100.0] * 10, + } +) +HOUSEHOLDS["hh_size"] = HOUSEHOLDS["n_adults"] + HOUSEHOLDS["n_children"] + + +# Targets, defined as (target_name, target_value, coefficient_column). +# Each coefficient column already exists on the household table. 
+TARGETS = pd.DataFrame( + [ + ("district_A_households", 520.0, "is_A"), + ("district_A_adults", 940.0, "adults_in_A"), + ("district_A_children", 310.0, "children_in_A"), + ("district_A_income", 23_500_000.0, "income_in_A"), + ("district_B_households", 480.0, "is_B"), + ("district_B_adults", 1_000.0, "adults_in_B"), + ("district_B_children", 600.0, "children_in_B"), + ("district_B_income", 25_000_000.0, "income_in_B"), + ], + columns=["target_name", "value", "coef_col"], +) + + +# IPF-eligible subset: count-style targets only. +IPF_TARGETS = TARGETS[~TARGETS["target_name"].str.endswith("_income")].reset_index( + drop=True +) + + +def household_table_with_coefficients() -> pd.DataFrame: + """Return HOUSEHOLDS augmented with the per-target coefficient columns.""" + hh = HOUSEHOLDS.copy() + hh["is_A"] = (hh["district"] == "A").astype(float) + hh["is_B"] = (hh["district"] == "B").astype(float) + hh["adults_in_A"] = hh["n_adults"] * hh["is_A"] + hh["children_in_A"] = hh["n_children"] * hh["is_A"] + hh["income_in_A"] = hh["income"] * hh["is_A"] + hh["adults_in_B"] = hh["n_adults"] * hh["is_B"] + hh["children_in_B"] = hh["n_children"] * hh["is_B"] + hh["income_in_B"] = hh["income"] * hh["is_B"] + return hh + + +def build_target_matrix(hh: pd.DataFrame, targets: pd.DataFrame) -> np.ndarray: + """Return a (n_targets, n_units) dense matrix of per-unit coefficients.""" + return np.stack([hh[col].to_numpy(dtype=float) for col in targets["coef_col"]]) + + +def baseline_totals(hh: pd.DataFrame, targets: pd.DataFrame) -> pd.DataFrame: + """Realized weighted totals using the design weights.""" + X = build_target_matrix(hh, targets) + w = hh["design_weight"].to_numpy(dtype=float) + realized = X @ w + return pd.DataFrame( + { + "target_name": targets["target_name"].to_numpy(), + "target_value": targets["value"].to_numpy(dtype=float), + "baseline_weighted_total": realized, + } + ) + + +def diagnostic_table( + hh: pd.DataFrame, + targets: pd.DataFrame, + fitted_weights: np.ndarray, +) 
-> pd.DataFrame: + """Compare fitted weighted totals to target values.""" + X = build_target_matrix(hh, targets) + base_w = hh["design_weight"].to_numpy(dtype=float) + out = pd.DataFrame( + { + "target_name": targets["target_name"].to_numpy(), + "target_value": targets["value"].to_numpy(dtype=float), + "baseline_weighted_total": X @ base_w, + "fitted_weighted_total": X @ fitted_weights, + } + ) + out["abs_rel_error"] = ( + out["fitted_weighted_total"] - out["target_value"] + ).abs() / out["target_value"].abs() + return out diff --git a/paper-l0/benchmarking/requirements-python.txt b/paper-l0/benchmarking/requirements-python.txt new file mode 100644 index 000000000..515179319 --- /dev/null +++ b/paper-l0/benchmarking/requirements-python.txt @@ -0,0 +1,2 @@ +-e ".[dev,l0]" +PyYAML>=6.0 diff --git a/paper-l0/benchmarking/runners/greg_runner.R b/paper-l0/benchmarking/runners/greg_runner.R new file mode 100644 index 000000000..c8234a73c --- /dev/null +++ b/paper-l0/benchmarking/runners/greg_runner.R @@ -0,0 +1,53 @@ +script_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE) +if (length(script_arg) != 1L) { + stop("Could not determine greg_runner.R path") +} +script_dir <- dirname(normalizePath(sub("^--file=", "", script_arg))) +source(file.path(script_dir, "read_npy.R")) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 6L) { + stop("Usage: greg_runner.R X.mtx target_metadata.csv initial_weights.npy output_weights.csv maxit epsilon") +} + +matrix_path <- args[[1]] +target_csv <- args[[2]] +weights_npy <- args[[3]] +output_csv <- args[[4]] +maxit <- as.integer(args[[5]]) +epsilon <- as.numeric(args[[6]]) + +library(Matrix) +library(survey) + +X_targets_by_units <- readMM(matrix_path) +target_meta <- read.csv(target_csv, stringsAsFactors = FALSE) +base_weights <- read_npy_vector(weights_npy) +population <- as.numeric(target_meta$value) + +mm <- Matrix::t(X_targets_by_units) +if (nrow(mm) != length(base_weights)) { + stop("Unit count 
mismatch between matrix and initial weights") +} +if (ncol(mm) != length(population)) { + stop("Target count mismatch between matrix and target metadata") +} + +cal_linear <- get("cal.linear", envir = asNamespace("survey")) +g <- survey:::grake( + mm = mm, + ww = base_weights, + calfun = cal_linear, + bounds = list(lower = -Inf, upper = Inf), + population = population, + epsilon = epsilon, + verbose = FALSE, + maxit = maxit +) + +fitted_weights <- as.numeric(base_weights * g) +write.csv( + data.frame(unit_index = seq_along(fitted_weights) - 1L, fitted_weight = fitted_weights), + output_csv, + row.names = FALSE +) diff --git a/paper-l0/benchmarking/runners/ipf_runner.R b/paper-l0/benchmarking/runners/ipf_runner.R new file mode 100644 index 000000000..e548734c8 --- /dev/null +++ b/paper-l0/benchmarking/runners/ipf_runner.R @@ -0,0 +1,120 @@ +script_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE) +if (length(script_arg) != 1L) { + stop("Could not determine ipf_runner.R path") +} +script_dir <- dirname(normalizePath(sub("^--file=", "", script_arg))) +source(file.path(script_dir, "read_npy.R")) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 10L) { + stop("Usage: ipf_runner.R unit_metadata.csv ipf_target_metadata.csv initial_weights.npy output_weights.csv max_iter bound epsP epsH household_id_col weight_col") +} + +unit_csv <- args[[1]] +target_csv <- args[[2]] +weights_npy <- args[[3]] +output_csv <- args[[4]] +max_iter <- as.integer(args[[5]]) +bound <- as.numeric(args[[6]]) +epsP <- as.numeric(args[[7]]) +epsH <- as.numeric(args[[8]]) +household_id_col <- args[[9]] +weight_col <- args[[10]] + +if (!requireNamespace("surveysd", quietly = TRUE)) { + stop("The surveysd package is required for IPF benchmarks") +} +if (!requireNamespace("data.table", quietly = TRUE)) { + stop("The data.table package is required for IPF benchmarks") +} + +build_margin_array <- function(df) { + variables <- strsplit(df$variables[[1]], "\\|")[[1]] + 
rows <- lapply(seq_len(nrow(df)), function(i) { + cell <- df$cell[[i]] + parts <- strsplit(cell, "\\|")[[1]] + entries <- strsplit(parts, "=") + row <- as.list(setNames(vapply(entries, `[`, "", 2L), vapply(entries, `[`, "", 1L))) + row$Freq <- as.numeric(df$target_value[[i]]) + as.data.frame(row, stringsAsFactors = FALSE) + }) + frame <- do.call(rbind, rows) + stats::xtabs( + stats::as.formula(paste("Freq ~", paste(variables, collapse = " + "))), + data = frame + ) +} + +unit_data <- read.csv(unit_csv, stringsAsFactors = FALSE) +target_meta <- read.csv(target_csv, stringsAsFactors = FALSE) +base_weights <- read_npy_vector(weights_npy) +unit_data <- data.table::as.data.table(unit_data) + +if (!(weight_col %in% names(unit_data))) { + if (!("unit_index" %in% names(unit_data))) { + stop("Unit metadata must include either base weights or a unit_index column") + } + if (max(unit_data$unit_index) >= length(base_weights)) { + stop("unit_index contains values outside the initial weight vector") + } + unit_data[[weight_col]] <- base_weights[unit_data$unit_index + 1L] +} + +if (!("target_type" %in% names(target_meta))) { + stop("ipf target metadata must include a target_type column") +} +unsupported_types <- setdiff(unique(target_meta$target_type), "categorical_margin") +if (length(unsupported_types) > 0L) { + stop(sprintf( + "ipf_runner.R only supports target_type='categorical_margin'; got: %s", + paste(unsupported_types, collapse = ", ") + )) +} + +conP <- list() +conH <- list() +margin_rows_all <- target_meta +for (margin_id in unique(margin_rows_all$margin_id)) { + margin_rows <- margin_rows_all[margin_rows_all$margin_id == margin_id, , drop = FALSE] + margin_array <- build_margin_array(margin_rows) + scope <- unique(margin_rows$scope) + if (length(scope) != 1L) { + stop(sprintf("Margin %s has inconsistent scope values", margin_id)) + } + if (scope[[1]] == "person") { + conP[[length(conP) + 1L]] <- margin_array + } else if (scope[[1]] == "household") { + 
conH[[length(conH) + 1L]] <- margin_array + } else { + stop(sprintf("Unsupported margin scope: %s", scope[[1]])) + } +} + +ipf_result <- surveysd::ipf( + dat = unit_data, + hid = if (household_id_col %in% names(unit_data)) household_id_col else NULL, + conP = if (length(conP)) conP else NULL, + conH = if (length(conH)) conH else NULL, + epsP = epsP, + epsH = epsH, + verbose = FALSE, + w = weight_col, + bound = bound, + maxIter = max_iter, + meanHH = TRUE, + returnNA = TRUE, + nameCalibWeight = "calibWeight" +) + +if (!("calibWeight" %in% names(ipf_result))) { + stop("surveysd::ipf did not return a calibWeight column") +} + +write.csv( + data.frame( + unit_index = if ("unit_index" %in% names(ipf_result)) ipf_result$unit_index else seq_len(nrow(ipf_result)) - 1L, + fitted_weight = as.numeric(ipf_result$calibWeight) + ), + output_csv, + row.names = FALSE +) diff --git a/paper-l0/benchmarking/runners/read_npy.R b/paper-l0/benchmarking/runners/read_npy.R new file mode 100644 index 000000000..04a4ef797 --- /dev/null +++ b/paper-l0/benchmarking/runners/read_npy.R @@ -0,0 +1,66 @@ +read_npy_vector <- function(path) { + con <- file(path, "rb") + on.exit(close(con), add = TRUE) + + magic <- readBin(con, what = "raw", n = 6, endian = "little") + if (!identical(as.integer(magic), as.integer(charToRaw("\x93NUMPY")))) { + stop("Unsupported .npy file: bad magic header") + } + + major <- readBin(con, what = "integer", n = 1, size = 1, signed = FALSE) + minor <- readBin(con, what = "integer", n = 1, size = 1, signed = FALSE) + + header_len <- if (major == 1L) { + readBin(con, what = "integer", n = 1, size = 2, signed = FALSE, endian = "little") + } else if (major %in% c(2L, 3L)) { + readBin(con, what = "integer", n = 1, size = 4, signed = FALSE, endian = "little") + } else { + stop("Unsupported .npy version") + } + + header_raw <- readBin(con, what = "raw", n = header_len, endian = "little") + header <- rawToChar(header_raw) + + descr_match <- regmatches(header, regexpr("'descr': 
*'[^']+'", header)) + shape_match <- regmatches(header, regexpr("'shape': *\\([^\\)]*\\)", header)) + fortran_match <- regmatches(header, regexpr("'fortran_order': *(True|False)", header)) + + if (length(descr_match) == 0 || length(shape_match) == 0 || length(fortran_match) == 0) { + stop("Could not parse .npy header") + } + + descr <- sub("^'descr': *'([^']+)'$", "\\1", descr_match) + fortran_order <- sub("^'fortran_order': *(True|False)$", "\\1", fortran_match) + if (fortran_order != "False") { + stop("Only C-order .npy arrays are supported") + } + + shape_text <- sub("^'shape': *\\(([^\\)]*)\\)$", "\\1", shape_match) + shape_parts <- trimws(unlist(strsplit(shape_text, ","))) + shape_parts <- shape_parts[nzchar(shape_parts)] + dims <- as.integer(shape_parts) + + if (length(dims) != 1L || is.na(dims[1])) { + stop("Only 1D .npy vectors are supported") + } + + n <- dims[1] + + if (descr == " 0$ ($\times$ 436 district-level units) & \$ & 9{,}156 \\ +Tax unit count for each of the 21 domains ($\times$ 436 district-level units) & count & 9{,}156 \\ +\midrule +\multicolumn{3}{l}{\textit{Census ACS S2201}} \\ +SNAP household count ($\times$ 436 district-level units) & count & 436 \\ +\midrule +& & \textbf{33{,}572} \\ +\bottomrule +\end{tabular} +} +\caption{District-level calibration targets. The 436 district-level units correspond to the 435 +congressional districts plus the District of Columbia. 
IRS SOI provides paired dollar and count +targets for each income and deduction domain.} +\label{tab:cd_targets} +\end{table} + +\subsection{State targets (4,080)} + +\begin{table}[H] +\centering +{\tablefont +\begin{tabular}{p{0.55\textwidth}lr} +\toprule +Target domain & Type & Count \\ +\midrule +\multicolumn{3}{l}{\textit{Census ACS S0101}} \\ +Person count by age band (18 bands $\times$ 50 states + DC) & count & 918 \\ +\midrule +\multicolumn{3}{l}{\textit{IRS SOI}} \\ +Person count by AGI bracket (9 bins $\times$ 50 states + DC) & count & 459 \\ +EITC dollars by qualifying children (4 bins $\times$ 50 states + DC) & \$ & 204 \\ +Tax unit count by qualifying children (4 bins $\times$ 50 states + DC) & count & 204 \\ +Aggregate AGI (unconditional, $\times$ 50 states + DC) & \$ & 51 \\ +20 income/deduction dollar totals (domain $> 0$, $\times$ 50 states + DC) & \$ & 1{,}020 \\ +Tax unit count for each of the 21 domains ($\times$ 50 states + DC) & count & 1{,}071 \\ +\midrule +\multicolumn{3}{l}{\textit{USDA FNS SNAP}} \\ +SNAP spending ($\times$ 50 states + DC) & \$ & 51 \\ +SNAP household count ($\times$ 50 states + DC) & count & 51 \\ +\midrule +\multicolumn{3}{l}{\textit{CMS Medicaid}} \\ +Medicaid enrollment ($\times$ 50 states + DC) & count & 51 \\ +\midrule +\multicolumn{3}{l}{\textit{Census STC}} \\ +State income tax collections ($\times$ 50 states + DC) & \$ & 51 \\ +\midrule +& & \textbf{4{,}080} \\ +\bottomrule +\end{tabular} +} +\caption{State-level calibration targets for the 50 states plus the District of Columbia. IRS SOI +variables mirror the district-level structure. 
USDA provides both SNAP spending and household +counts, and CMS provides Medicaid enrollment.} +\label{tab:state_targets} +\end{table} + +\subsection{National targets (106)} + +\begin{table}[H] +\centering +{\tablefont +\begin{tabular}{p{0.55\textwidth}lr} +\toprule +Target domain & Type & Count \\ +\midrule +\multicolumn{3}{l}{\textit{Demographics (Census ACS + curated)}} \\ +Person count by age band (18 bands) & count & 18 \\ +Person count by SSN card type (4 categories) & count & 4 \\ +Person count: Medicaid enrolled & count & 1 \\ +Person count: ACA PTC recipients & count & 1 \\ +\midrule +\multicolumn{3}{l}{\textit{IRS SOI --- domain-constrained aggregates}} \\ +AGI (unconditional) & \$ & 1 \\ +EITC by qualifying children (0--3+) & \$ & 4 \\ +Tax unit count by qualifying children (0--3+) & count & 4 \\ +21 income/deduction dollar totals (domain $> 0$) & \$ & 21 \\ +Tax unit count for each of the 21 domains & count & 21 \\ +\midrule +\multicolumn{3}{l}{\textit{CBO budget projections}} \\ +SNAP, Social Security, SSI, unemployment comp., income tax & \$ & 5 \\ +\midrule +\multicolumn{3}{l}{\textit{SSA benefit decomposition}} \\ +Retirement, disability, survivors, dependents & \$ & 4 \\ +\midrule +\multicolumn{3}{l}{\textit{JCT tax expenditure estimates}} \\ +SALT ded., charitable ded., mortgage interest ded., medical expense ded., QBI ded., EITC & \$ & 6 \\ +\midrule +\multicolumn{3}{l}{\textit{Healthcare spending (MEPS/NHEA/CMS)}} \\ +Medicaid, health insurance premiums, Medicare Part B, other medical expenses, OTC health & \$ & 5 \\ +\midrule +\multicolumn{3}{l}{\textit{Housing, transfers, and other}} \\ +Rent, real estate taxes, housing subsidy, work/childcare expenses, TANF, alimony (paid + received), child support (paid + received), tip income, net worth & \$ & 10 \\ +\midrule +\multicolumn{3}{l}{\textit{Retirement contributions (IRS SOI)}} \\ +Traditional IRA, Roth IRA contributions & \$ & 2 \\ +\midrule +& & \textbf{106} \\ +\bottomrule +\end{tabular} +} 
+\caption{National-level calibration targets. CBO, JCT, SSA, CMS, and Census values are curated from +the cited administrative sources and stored in the ETL pipeline. Dollar values are +inflation-adjusted to the calibration year.} +\label{tab:national_targets} +\end{table} + +\section{Algorithm pseudocode} +\label{app:algorithm} + +Algorithm~\ref{alg:l0} presents the complete $L_0$-regularized calibration procedure. + +\begin{algorithm}[ht] +\caption{$L_0$-regularized calibration with Hard Concrete gates} +\label{alg:l0} +\begin{algorithmic}[1] +\Require Calibration matrix $\mathbf{M} \in \R^{m \times n}$, targets $\mathbf{t} \in \R^m$, initial weights $\mathbf{w}_0 \in \R^n$ +\Require Hyperparameters: $\lambda_{L_0}$, $\lambda_{L_2}$, $\beta$, $\gamma$, $\zeta$, learning rate $\eta$, epochs $E$ +\Ensure Calibrated sparse weight vector $\hat{\mathbf{w}} \in \R^n$ + +\State Initialize $\log w_i \gets \log w_{0,i} + \mathcal{N}(0, 0.05^2)$ for all $i$ +\State Initialize $\log \alpha_i \gets \text{logit}(0.999) + \mathcal{N}(0, 0.01^2)$ for all $i$ +\State Initialize Adam optimizer with parameters $\{\log w_i, \log \alpha_i\}$ and learning rate $\eta$ + +\For{epoch $= 1$ to $E$} + \State \textbf{Sample Hard Concrete gates (training):} + \For{$i = 1$ to $n$} + \State $u_i \sim \text{Uniform}(\epsilon, 1-\epsilon)$ + \State $\bar{s}_i \gets \sigma\!\left(\frac{\log u_i - \log(1-u_i) + \log \alpha_i}{\beta}\right)$ + \State $s_i \gets \bar{s}_i(\zeta - \gamma) + \gamma$ + \State $z_i \gets \min(1, \max(0, s_i))$ + \EndFor + \State $w_i^{\text{eff}} \gets \exp(\log w_i) \cdot z_i$ for all $i$ + \State $\hat{t}_j \gets \sum_i M_{ji} \cdot w_i^{\text{eff}}$ for all $j$ + \State $\mathcal{L}_{\text{cal}} \gets \frac{1}{m}\sum_{j=1}^{m}\left(\frac{\hat{t}_j - t_j}{|t_j|}\right)^2$ + \State $\mathcal{L}_{L_0} \gets \sum_{i=1}^{n} \sigma\!\left(\log \alpha_i - \beta \log \frac{-\gamma}{\zeta}\right)$ + \State $\mathcal{L}_{L_2} \gets \sum_{i=1}^{n} w_i^2$ + \State $\mathcal{L} 
\gets \mathcal{L}_{\text{cal}} + \lambda_{L_0} \mathcal{L}_{L_0} + \lambda_{L_2} \mathcal{L}_{L_2}$ + \State Backpropagate $\nabla_{\log w, \log \alpha} \mathcal{L}$ + \State Adam step +\EndFor + +\State \textbf{Deterministic inference:} +\For{$i = 1$ to $n$} + \State $z_i^{\text{det}} \gets \min\!\left(1, \max\!\left(0,\; \sigma(\log \alpha_i)(\zeta - \gamma) + \gamma\right)\right)$ + \State $\hat{w}_i \gets \exp(\log w_i) \cdot z_i^{\text{det}}$ +\EndFor +\State \Return $\hat{\mathbf{w}}$ +\end{algorithmic} +\end{algorithm} + +\section{Dataset assembly details} +\label{app:assembly} + +After optimization, the fitted weight vector is reshaped back to clone-by-household form and used +to build deployable H5 datasets. The assembly step keeps only active cloned units, reconstructs the +associated person and tax-unit memberships, derives geography from the stored census block +assignment, and recomputes geography-dependent quantities such as Supplemental Poverty Measure +threshold adjustments. + +Two implementation details matter for fidelity. First, geography is derived from the same cloned +block assignments that were used to build the calibration matrix, so the assembled dataset matches +the unit universe that the optimizer saw. Second, take-up draws are regenerated with the same +deterministic seeds used during matrix construction, which keeps take-up-dependent targets +consistent between the calibration package and the published dataset. 
diff --git a/paper-l0/sections/background.tex b/paper-l0/sections/background.tex new file mode 100644 index 000000000..759da035c --- /dev/null +++ b/paper-l0/sections/background.tex @@ -0,0 +1,114 @@ +\section{Background and related work} +\label{sec:background} + +\subsection{Subnational microsimulation and spatial reweighting} + +Subnational microsimulation usually starts from the same basic problem: a national survey contains +the behavioural and demographic detail needed for policy modelling, but it does not contain enough +observations in every local area to support direct estimation. Spatial microsimulation addresses +that gap either by reweighting existing survey records or by constructing synthetic small-area +populations from aggregate constraints \citep{tanton2014review, odonoghue2014review}. + +This distinction matters for the present paper. Our pipeline remains a reweighting system built on +observed CPS households. We do clone households across sampled geographies, but the core empirical +problem is still calibration of survey-based microdata to administrative targets. That makes +classical calibration methods the closest reference point for the benchmark design. + +The spatial microsimulation literature also provides broader context. \citet{williamson1998}, +\citet{huang2001}, and \citet{harland2012} describe combinatorial and synthetic-population +approaches that search over record assignments area by area. \citet{tanton2011} and +\citet{lovelace2016} discuss reweighting approaches for small-area estimation and policy analysis. +These methods are useful precedents, but they usually target one geographic layer at a time. Our +setting requires one weighted microdata system to reproduce district-level, state-level, and +national targets jointly. 
+ +\subsection{Survey calibration} + +Let $i = 1, \ldots, n$ index sampled units, let $d_i$ denote the initial design weight for unit +$i$, and let $\mathbf{x}_i \in \mathbb{R}^p$ denote the vector of auxiliary variables used for +calibration. Survey calibration seeks adjusted weights $w_i$ such that the weighted sample totals +match known population totals: +\begin{equation} + \sum_{i=1}^{n} w_i \mathbf{x}_i = \mathbf{T}, + \label{eq:calibration_constraint} +\end{equation} +where $\mathbf{T} \in \mathbb{R}^p$ is the vector of published totals +\citep{deville1992, sarndal2007}. In practice, the auxiliary variables may be counts, indicators, +or continuous quantities such as income components. When the target system is internally +consistent, classical calibration methods seek exact reproduction of those totals. + +The subnational problem in this paper extends that setup in two ways. First, the target vector +contains variables from multiple administrative sources and therefore mixes count and dollar +constraints. Second, those constraints are nested by geography: district-level totals should add to +state totals, which should add to national totals after uprating and reconciliation. In the full +production problem, that yields tens of thousands of targets with substantial collinearity across +rows. + +\subsection{Generalized regression} + +Generalized regression (GREG) calibration minimizes a distance from the initial weights while +enforcing the calibration equations \citep{deville1992, sarndal2007}. With the usual chi-squared +distance, the calibrated weight for unit $i$ can be written as +\begin{equation} + w_i = d_i + d_i \mathbf{x}_i^\top + \left(\sum_{j=1}^{n} d_j \mathbf{x}_j \mathbf{x}_j^\top\right)^{-1} + \left(\mathbf{T} - \sum_{j=1}^{n} d_j \mathbf{x}_j\right). + \label{eq:greg} +\end{equation} +GREG is attractive because it can accommodate quantitative targets directly and is well understood +in survey statistics. 
+ +Its weaknesses are also clear in a microsimulation setting. The matrix in +Equation~\ref{eq:greg} becomes harder to invert reliably as the target system grows and the rows +become more collinear. GREG can also assign negative weights. That is acceptable in some +estimation settings, but it is awkward in a microdata file that will later be used for tax-benefit +simulation, where analysts expect households to represent positive population mass. + +\subsection{Iterative proportional fitting and raking} + +Iterative proportional fitting \citep[IPF;][]{deming1940, ireland1968}, often called raking in +survey practice, adjusts weights multiplicatively so that selected margins match published totals. +When the margins are compatible and the algorithm converges, the fitted margins are matched +exactly. IPF preserves non-negativity and avoids matrix inversion, which makes it a natural +baseline for count-style calibration problems. + +IPF is narrower than GREG in scope. Classical IPF is built for categorical or margin-style +constraints, not arbitrary linear dollar targets. It also does not naturally encode a large mixed +system of district, state, and national constraints in one pass. For that reason, the benchmark in +this paper uses IPF only on target subsets that fit its assumptions rather than forcing it onto the +full production matrix. + +\subsection{Adjacent weighting literatures} + +Two adjacent literatures inform the framing even though they are not the main empirical baselines +here. Small area estimation borrows strength across areas to improve local estimates from limited +samples \citep{anderson2013}. Covariate-balancing methods in causal inference choose weights to +balance observed covariates between treated and control units \citep{imai2014}. 
Both literatures +share the broader idea that weighting can repair deficiencies in the observed sample, but they +optimize for different inferential goals than the hierarchical administrative-target problem studied +here. + +\subsection{\texorpdfstring{$L_0$}{L0} regularization and the Hard Concrete distribution} + +$L_0$ regularization penalizes the number of active parameters in a model. In our setting, the +parameters are household-geography combinations rather than neural-network weights. Direct +optimization of the $L_0$ norm is combinatorial, so we use the Hard Concrete relaxation proposed by +\citet{louizos2018}. Each unit receives a gate $z_i \in [0, 1]$ that is sampled during training and +that collapses toward 0 or 1 after clipping. The expected number of active gates is differentiable, +which makes it possible to optimize calibration error and sparsity jointly. + +That adaptation changes the role of regularization in calibration. Classical calibration methods +return one fitted weight vector for the chosen target system. The Hard Concrete gate makes the +number of retained records a direct object of optimization. This is the mechanism that lets one +pipeline produce both compact national datasets and larger subnational datasets. + +\subsection{National-level predecessor} + +\citet{woodruff2024} developed a national pipeline that imputes tax variables from the IRS Public +Use File onto CPS records and then calibrates the resulting dataset to national administrative +totals. The present paper extends that work in three directions. First, the pipeline now assigns +households to district-level units and states rather than keeping a single national geography. +Second, the optimizer now uses $L_0$ gates instead of dropout-style regularization, which produces +exact sparsity at inference time. 
Third, the paper evaluates the resulting system as a benchmark +study, with explicit comparisons to classical reference methods and an experimental design that +tracks what happens as the target system approaches production scale. diff --git a/paper-l0/sections/conclusion.tex b/paper-l0/sections/conclusion.tex new file mode 100644 index 000000000..a82e2161a --- /dev/null +++ b/paper-l0/sections/conclusion.tex @@ -0,0 +1,23 @@ +\section{Conclusion} +\label{sec:conclusion} + +This paper studies $L_0$ regularization as a calibration method for subnational microsimulation. In +our setting, the problem is not only to match administrative targets. It is also to do so across +nested geographies while producing a dataset that remains usable in an operational policy model. + +The paper's main empirical design reflects that goal. Rather than presenting $L_0$ only as a new +optimizer, we benchmark it against classical reference methods on shared exported calibration +packages. The tractable benchmark tests whether $L_0$ is competitive when classical methods can run +comfortably. The scaling frontier identifies where those methods begin to fail or fall outside +their natural scope. The production-feasibility case connects the comparison back to the +\texttt{policyengine-us-data} deployment problem. + +That framing is important for the substantive claim. A useful subnational calibration method must +do more than fit a small benchmark. It must support a large hierarchical target system and produce a +manageable microdata file. The $L_0$ gate is valuable here because it makes sparsity a direct +optimization target rather than an afterthought. + +The broader contribution is therefore twofold: a sparse calibration method and an open-source +pipeline that makes the method deployable. Future work can extend the benchmark to finer +geographies, refine the target families, and test alternative sparse optimizers on the same +exported calibration packages. 
diff --git a/paper-l0/sections/data.tex b/paper-l0/sections/data.tex new file mode 100644 index 000000000..8c5bc6f39 --- /dev/null +++ b/paper-l0/sections/data.tex @@ -0,0 +1,121 @@ +\section{Data} +\label{sec:data} + +The calibration pipeline consumes two classes of inputs: household microdata that define the unit +universe and administrative targets that define the calibration problem. This section describes +both. + +\subsection{Base microdata} + +\subsubsection{Current Population Survey} + +The primary microdata source is the Current Population Survey Annual Social and Economic Supplement +\citep[CPS ASEC;][]{census2024}, a nationally representative household survey conducted by the US +Census Bureau and the Bureau of Labor Statistics. The CPS provides detailed demographic structure, +household relationships, labour income, transfer income, and reported participation in major +programmes such as SNAP, Medicaid, SSI, and TANF. + +The CPS is not a complete tax-benefit dataset by itself. Some tax variables are missing entirely, +others are measured with substantial error, and top incomes are top-coded or otherwise imperfectly +captured relative to tax records \citep{burkhauser2012}. Benefit receipt is also underreported in +many surveys relative to administrative sources \citep{rothbaum2021, meyer2021}. The pipeline +therefore augments the CPS before calibration. + +\subsubsection{IRS Public Use File imputation} + +We impute 72 tax variables from the IRS Public Use File (PUF) onto CPS records following the +national pipeline of \citet{woodruff2024}. The PUF provides detailed tax-return information but +lacks the household structure and demographic coverage needed for microsimulation. Quantile +regression forests \citep{meinshausen2006quantile} bridge that gap by predicting tax variables on +the CPS from the PUF training sample. + +The imputation proceeds sequentially so that later variables condition on earlier imputed values. 
+This preserves more of the joint tax distribution than an independent-variable approach. The PUF +training sample retains the top 0.5\% of the AGI distribution in full and samples from the +remainder to keep training practical. The resulting enhanced CPS keeps the CPS household structure +while importing tax information that is missing or unreliable in the raw survey. + +\subsubsection{Additional source imputations} + +The pipeline also fills selected gaps from other surveys: + +\begin{itemize} + \item The American Community Survey supplies rent and real-estate tax information. + \item The Survey of Income and Program Participation supplies tip income and several asset + components. + \item The Survey of Consumer Finances supplies net worth and selected debt variables. +\end{itemize} + +These source imputations are part of the base microdata construction rather than the calibration +step. They define the values that cloned households can contribute to later targets. + +\subsection{Calibration targets} + +The target system combines district-level, state-level, and national aggregates from multiple +administrative sources. The pipeline stores these targets in a target database, +\texttt{policy\_data.db}. Table~\ref{tab:target_summary} summarizes the full active target +inventory stored in that database. Individual benchmark runs use subsets of that inventory, so the +exact target count in the experiments is reported by benchmark manifest rather than inferred from +the database-wide total alone. + +\input{tables/target_summary} + +\subsubsection{District-level targets} + +District-level targets cover 435 congressional districts plus the District of Columbia. The main +sources are: + +\begin{itemize} + \item IRS Statistics of Income (SOI) targets for AGI, EITC, income components, deductions, and + tax-unit counts. + \item Census age-distribution targets from ACS tables. + \item SNAP household counts derived for district-level use. 
+\end{itemize} + +The district layer is numerically dominant because each target family is repeated across hundreds +of geographic units. + +\subsubsection{State targets} + +State targets cover the 50 states plus the District of Columbia. They include: + +\begin{itemize} + \item IRS SOI targets aggregated to the state level. + \item Census age targets. + \item USDA SNAP spending and household counts. + \item CMS Medicaid enrollment totals. + \item Census state income tax collections. +\end{itemize} + +\subsubsection{National targets} + +National targets provide the top layer of the hierarchy. They combine IRS totals with curated +aggregates from agencies such as CBO, JCT, SSA, CMS, and Census. Some domains rely on curated +published totals stored in the ETL pipeline because there is no single machine-readable source that +matches the calibration year exactly. + +\subsection{Hierarchical uprating and reconciliation} +\label{sec:uprating} + +Target sources reference different years and do not always aggregate cleanly across geographic +levels. The pipeline applies two adjustments before calibration. + +The first is an uprating factor that moves a source-year total to the calibration year. Dollar +targets are usually uprated with price or programme-specific growth factors, while count targets +use population or enrolment growth where appropriate. + +The second is a hierarchy inconsistency factor. For source families where district totals do not +sum to the corresponding state totals, the pipeline rescales the district values so that the +district layer matches the uprated state layer: +\begin{equation} + \text{HIF}_{s,v} = \frac{T_{s,v}^{\text{state}}} + {\sum_{d \in s} T_{d,v}^{\text{district}}}, +\end{equation} +and +\begin{equation} + T_{d,v}^{\text{adj}} = + T_{d,v}^{\text{district}} \times \text{HIF}_{s,v} \times \text{UF}_{s,v}. 
+\end{equation} + +This reconciliation step does not remove all modelling judgment, but it gives the optimizer a +target system that is internally coherent across district, state, and national layers. diff --git a/paper-l0/sections/discussion.tex b/paper-l0/sections/discussion.tex new file mode 100644 index 000000000..6850b4d21 --- /dev/null +++ b/paper-l0/sections/discussion.tex @@ -0,0 +1,88 @@ +\section{Discussion} +\label{sec:discussion} + +\subsection{What the tractable benchmark establishes} + +The tractable benchmark is the cleanest answer to the most obvious objection to the method: does +$L_0$ only look good because it is the only method built for the largest problem? That objection is +fair, which is why Tier 1 matters. If $L_0$ performs competitively with GREG on the mixed-target +tractable benchmark and remains competitive with IPF on the count-style benchmark, then its later +advantages cannot be dismissed as a scale-only story. + +This tier should also clarify what kind of advantage $L_0$ does and does not offer. If GREG +matches the tractable targets with similar accuracy but produces negative weights, that is a +different kind of limitation from simple inaccuracy. If IPF performs well on count margins but does +not extend to mixed dollar-and-count targets, that is a scope limit rather than a poor +implementation. The benchmark design is meant to keep those distinctions visible. + +\subsection{What the scaling frontier establishes} + +The scaling frontier shifts the question from local accuracy to method limits. A useful result in +this section is not just that one method is faster than another. It is where each method stops +being a credible tool for the target system under study. For $L_0$, the main trade-off is usually +between fit, sparsity, and runtime. For GREG, the limits are more likely to come from unstable +linear algebra, memory use, or negative weights. For IPF, the limits are more likely to come from +scope and convergence. 
+ +This frontier is important rhetorically because it distinguishes two claims that are easy to blur. +One claim is that $L_0$ can solve a hard production problem. The stronger claim is that classical +reference methods become difficult to apply before the production target volume is reached. The +scaling ladder is where that stronger claim lives. + +\subsection{What the production-feasibility case establishes} + +The production-feasibility tier connects the statistical benchmark back to deployment. A production +run has to do more than minimize loss on a benchmark subset. It has to produce a microdata file +that can be published, loaded, and simulated. That is where the sparsity mechanism matters most. +Without it, the cloned unit universe remains too large for many downstream uses. + +This is also the tier where the paper can be most concrete about failure modes. A table that says a +method ran out of memory, returned negative weights, failed to converge, or completed in a given +runtime is more informative than a generic statement that the problem is ``large.'' The production +case therefore should remain a feasibility table first and a performance table second. + +\subsection{Trade-offs and limitations} + +The benchmark does not remove all modelling judgment. Several choices remain consequential. + +\subsubsection{Target-set construction} + +The exact target count in a run depends on the benchmark manifest, the active target families, and +the achievable-target mask. Database-wide counts are useful for context, but the paper should base +its empirical claims on the exact exported target set for each run. + +\subsubsection{Clone count and geography scope} + +Clone count affects both feasibility and fidelity. More clones enlarge the feasible support for +district-level targets, but they also enlarge the optimization problem and the output dataset. The +same logic applies to geography scope.
Extending the current setup to counties or other local areas +is possible in principle, but it would require new target data and a larger benchmark package. + +\subsubsection{Hyperparameter sensitivity} + +The $L_0$ method is not parameter free. The gate temperature, initialization, and sparsity penalty +all affect the path the optimizer takes. This is why the hyperparameter configuration belongs in an +appendix rather than disappearing into prose. The benchmark should treat those settings as part of +the method specification. + +\subsubsection{Computational cost} + +The pipeline is computationally heavier than a small classical calibration run. Matrix +construction, repeated policy simulation, and gradient-based optimization all cost time. That cost +is acceptable only if the pipeline solves a problem that simpler methods do not solve reliably. The +benchmark is therefore essential to justifying the added infrastructure. + +\subsubsection{Comparability of IPF} + +IPF is the least like-for-like baseline in the paper because its natural target representation is +different. Restricting IPF to count-style margins is the right methodological choice, but it also +means the reader should not interpret every $L_0$ versus IPF result as a full-system comparison. +The paper should say that directly. + +\subsection{Generalizability} + +The specific data engineering in this paper is US-specific, but the benchmark question is broader. +Many microsimulation projects need to reconcile survey microdata with administrative targets across +multiple geographic levels. Where those projects also care about keeping the resulting dataset +small enough to use, a sparse calibration method may be more useful than a classical exact-fit +procedure even if the classical procedure remains attractive on smaller subproblems. 
diff --git a/paper-l0/sections/introduction.tex b/paper-l0/sections/introduction.tex new file mode 100644 index 000000000..255ca30ab --- /dev/null +++ b/paper-l0/sections/introduction.tex @@ -0,0 +1,52 @@ +\section{Introduction} +\label{sec:introduction} + +Tax-benefit microsimulation models apply policy rules to household microdata to estimate fiscal +and distributional effects. At the national level, that usually means starting from a survey such +as the Current Population Survey (CPS), improving measured incomes and program participation, and +adjusting weights so that the resulting dataset reproduces known aggregates. Subnational work is +harder. Analysts often want estimates for states, congressional districts, or local areas, but the +underlying survey was not designed to support all of those geographies at once. + +That gap matters in practice. Many reforms are proposed, administered, or debated below the +national level. A state tax proposal needs state-specific incomes, filing patterns, and benefit +participation. District-level analysis needs those same outcomes to aggregate correctly within each +district while remaining consistent with state and national totals. A usable subnational dataset +therefore has to satisfy a hierarchical calibration problem rather than a single national one. + +Public survey microdata alone do not solve that problem. The CPS provides demographic structure and +program coverage, but some tax variables are missing or noisy, top incomes are imperfectly +measured, and geographic detail is limited. Administrative targets help, but they arrive from +different agencies, with different concepts, and at different geographic levels. In our setting, +the calibration problem mixes count and dollar targets across district-level units, the 50 states +plus the District of Columbia, and the nation. + +The methodological goal is straightforward to state, even if the implementation is not. 
We want a +calibration method that can absorb a large hierarchical target system, keep weights usable for +microsimulation, and let us control how large the final dataset becomes. Exact fit is attractive, +but a production dataset also needs to remain computationally practical. A file with millions of +active cloned records is useful for detailed subnational work and still expensive to simulate. A +much smaller file is easier to ship and run, but only if the loss in fidelity is acceptable. + +This paper studies an $L_0$-regularized calibration system built in PolicyEngine's US data +pipeline. The system clones CPS households across sampled geographies, builds a sparse calibration +matrix from tax-benefit simulations, and jointly optimizes positive weights and binary-like gates. +The gates give the optimizer a direct sparsity mechanism. In practical terms, that means the same +framework can target a compact national dataset or a larger subnational dataset without switching +to a different calibration procedure. + +The paper's contribution is empirical as well as methodological. We do not treat generalized +regression (GREG) and iterative proportional fitting (IPF, or raking) as universal substitutes for +the full production problem. Instead, we use them as classical reference methods within scopes that +match their assumptions. The benchmark design uses shared exported calibration packages and asks +three questions. First, on a tractable benchmark, does $L_0$ perform competitively with classical +methods when all methods can plausibly run? Second, how do the methods degrade as the number of +targets increases? Third, which methods remain usable near the full production calibration problem? + +The rest of the paper follows that structure. Section~\ref{sec:background} places the work in the +literature on spatial microsimulation, survey calibration, and sparse optimization. +Section~\ref{sec:data} describes the base microdata and administrative targets. 
Section~\ref{sec:methodology} presents the calibration pipeline and the benchmark design. +Section~\ref{sec:results} reports the tractable benchmark, the scaling frontier, and the +production-feasibility case. Section~\ref{sec:discussion} interprets those results and discusses +their limits. diff --git a/paper-l0/sections/methodology.tex b/paper-l0/sections/methodology.tex new file mode 100644 index 000000000..392297f5f --- /dev/null +++ b/paper-l0/sections/methodology.tex @@ -0,0 +1,194 @@ +\section{Methodology} +\label{sec:methodology} + +This section does two things. First, it describes the calibration system that produces the weighted +microdata. Second, it defines the benchmark design used to evaluate that system against classical +reference methods. + +\subsection{Pipeline overview} + +The pipeline transforms enhanced CPS microdata into a calibrated subnational dataset in four broad +steps: cloned geography assignment, calibration-matrix construction, $L_0$-regularized +optimization, and dataset assembly. Figure~\ref{fig:pipeline} summarizes that flow. + +\begin{figure}[ht] + \centering + \fbox{\parbox{0.9\textwidth}{\centering\textit{[Pipeline overview diagram to be inserted]}}} + \caption{Pipeline overview. The benchmarked system clones CPS households across sampled + geographies, builds a sparse calibration matrix from tax-benefit simulations, optimizes weights + with $L_0$ gates, and then assembles the retained records into deployable datasets.} + \label{fig:pipeline} +\end{figure} + +The benchmark experiments do not rebuild separate pipelines for each method. Instead, they export a +shared calibration package from the existing pipeline. That package contains the cloned unit +universe, the selected target vector, target metadata, the achievable-target mask, and the initial +weights. $L_0$ and GREG consume that representation directly. IPF uses the same target selection +after conversion to an IPF-compatible margin representation.
+ +\subsection{Cloned geography assignment} +\label{sec:cloning} + +The base CPS contains roughly 100{,}000 household records. A stratified subsample reduces that to a +smaller working dataset while preserving the upper tail of the AGI distribution. This keeps the +downstream simulations tractable without collapsing the range of household types the optimizer can +use. + +Each retained household is then cloned $N$ times, with $N = 430$ in the current production setup. +Each clone is assigned to a census block, and the block determines the associated district-level +unit, county, tract, and state. The baseline assignment distribution is population weighted. For +households above the 90th percentile of AGI, the pipeline tilts that distribution toward +high-income districts so that the initial clone universe contains more plausible placements for +high-income records: +\begin{equation} + \Pr(B_h = b) = + \frac{P_{\text{pop}}(b) R_h(d(b))} + {\sum_{b'} P_{\text{pop}}(b') R_h(d(b'))}, +\end{equation} +where $P_{\text{pop}}(b)$ is the population-based block probability, $d(b)$ is the district-level +unit containing block $b$, and $R_h(d) = A_d$ for high-AGI households and $R_h(d) = 1$ otherwise. +Here $A_d$ is the district-level AGI target. + +The assignment step also enforces a no-collision rule: the same source household should not appear +twice in the same district-level unit across clones unless repeated resampling fails to find an +alternative. With 430 clones and the current working dataset, the cloned universe contains roughly +5.2 million household-geography candidates. + +\subsection{Calibration-matrix construction} +\label{sec:matrix} + +The calibration matrix $\mathbf{M} \in \mathbb{R}^{m \times n}$ maps $n$ cloned household +candidates to $m$ calibration targets. Entry $M_{ji}$ is the contribution of cloned unit $i$ to +target $j$. For count targets, that is an indicator or count contribution. 
For dollar targets, it +is the simulated value of the relevant tax or benefit variable. + +The matrix is built from \policyengine{} simulations. Because many target variables depend on +state-specific policy rules, the pipeline evaluates the cloned households under separate state +configurations and reattaches the results according to each clone's assigned geography. For +county-specific rules, such as ACA premium tax credits in the current setup, the pipeline can run +additional county-level loops. This design keeps the geographic information in the calibration +representation rather than in a post-processing step. + +Several targets depend on programme take-up. To make the matrix consistent with the final dataset, +the pipeline re-randomizes take-up at the clone level using deterministic seeds derived from the +source household, the clone index, and the variable name. This lets two clones of the same source +household take up a programme at different rates when they are assigned to different places. + +The matrix is emitted in sparse coordinate form and stored as a compressed sparse row matrix. After +construction, a target-configuration file selects the rows used for a given optimization or +benchmark run. Rows with zero feasible support are marked unachievable and excluded from the loss. + +\subsection{\texorpdfstring{$L_0$}{L0} optimization} +\label{sec:l0} + +Given the sparse matrix $\mathbf{M}$, target vector $\mathbf{t}$, and weight vector +$\mathbf{w} \in \mathbb{R}^n$, the optimizer minimizes +\begin{equation} + \mathcal{L}(\mathbf{w}, \boldsymbol{\alpha}) = + \frac{1}{m} \sum_{j=1}^{m} + \left(\frac{\hat{t}_j - t_j}{|t_j|}\right)^2 + + \lambda_{L_0} \sum_{i=1}^{n} \bar{z}_i + + \lambda_{L_2} \|\mathbf{w}\|_2^2, + \label{eq:loss} +\end{equation} +where +\begin{equation} + \hat{t}_j = \sum_{i=1}^{n} M_{ji} w_i z_i. + \label{eq:estimate} +\end{equation} + +The weights are parameterized in log space so that the fitted weights remain positive. 
The gate +$z_i$ is a Hard Concrete random variable with expected activation $\bar{z}_i$, following +\citet{louizos2018}. The first term in Equation~\ref{eq:loss} is a relative calibration loss, so a +1\% error on a small target and a 1\% error on a large target receive comparable weight. The +second term is the $L_0$ penalty. It penalizes the expected number of active cloned units and +therefore regulates sparsity directly. The third term is an $L_2$ penalty on the weight vector, +that is, a penalty on the squared Euclidean norm $\|\mathbf{w}\|_2^2$. This term shrinks the +magnitudes of the fitted weights and discourages solutions that satisfy the targets by assigning +very large weights to a small number of retained units. + +An $L_0$ penalty on its own can produce a sparse solution, but sparsity +alone does not guarantee a usable microsimulation dataset. The optimizer could still concentrate too +much population mass on a small subset of cloned households, yielding unstable or implausible +weights even if the support is small. The $L_2$ term addresses a different problem: it discourages +large weight magnitudes conditional on the chosen support. In that sense, the two penalties play +complementary roles. $\lambda_{L_0}$ controls how many units remain active, while $\lambda_{L_2}$ +controls how concentrated the fitted weights can become among those active units. + +The separate tuning parameters are therefore a practical advantage of the objective. Increasing +$\lambda_{L_0}$ pushes the solution toward a smaller retained dataset. Increasing $\lambda_{L_2}$ +pushes the solution toward more moderate fitted weights, even if the support size is unchanged. The +two knobs let the pipeline trade off calibration fit, sparsity, and weight stability more flexibly +than a single regularization term would allow. The full hyperparameter configuration, including the +gate parameters and initialization choices, appears in Appendix~\ref{app:hyperparameters}. 
+Algorithm~\ref{alg:l0} gives pseudocode for the full optimization loop. + +After optimization, the retained cloned units are assembled into deployable H5 datasets for +\policyengine{}. Those assembly details matter operationally but are not the main object of the +benchmark paper, so Appendix~\ref{app:assembly} summarizes them. + +\subsection{Benchmark design} +\label{sec:benchmark_design} + +The empirical design is organized around one shared benchmark package and three benchmark tiers. +The package is exported from the saved calibration pipeline rather than rebuilt separately for each +method. This keeps the comparison focused on calibration method, not on differences in upstream +data preparation. + +\subsubsection{Shared benchmark package} + +Each benchmark package contains: + +\begin{itemize} + \item the cloned unit universe + \item the selected target vector and target metadata + \item the achievable-target mask + \item the shared initial weights + \item the sparse calibration matrix for scoring fitted weights +\end{itemize} + +GREG uses the exported sparse matrix and target vector directly. IPF requires a conversion step +because it works on unit-record tables and margin constraints rather than on a generic sparse linear +system. The exporter therefore converts selected count-style targets into an IPF-ready +representation while preserving the same cloned unit universe and the same target selection. + +\subsubsection{Experimental tiers} + +Tier 1 is a tractable benchmark. It asks whether $L_0$ performs competitively when the problem is +small enough for classical methods to run routinely. The mixed-target tractable benchmark compares +$L_0$ with GREG. A separate count-style tractable benchmark compares $L_0$ with IPF on a target set +that fits IPF's assumptions. + +Tier 2 is a scaling frontier. Starting from a controlled benchmark package, we increase the number +of targets while holding the basic unit universe fixed. 
This identifies where each method slows +down, destabilizes, or becomes methodologically inappropriate. + +Tier 3 is a production-feasibility case. It moves as close as possible to the full production +calibration problem, with the current clone count and the current production-style target families. +The goal is to document which methods complete, +which methods fail, and why. + +\subsubsection{Method scope} + +The comparison is intentionally asymmetric where the methods are asymmetric. $L_0$ and GREG can +both consume arbitrary linear targets, including mixed count and dollar constraints. IPF is +evaluated only on count-style or indicator-style margins. This is a methodological choice rather +than a concession. It avoids misrepresenting IPF as a general solver for the full sparse matrix +problem. + +\subsection{Evaluation metrics} +\label{sec:metrics} + +Each benchmark run reports: + +\begin{itemize} + \item mean, median, and maximum absolute relative error across achievable targets + \item runtime + \item peak memory where available + \item convergence or failure status + \item any method-specific pathologies, such as negative fitted weights for GREG + \item retained-record counts and effective sample size for $L_0$ outputs +\end{itemize} + +These metrics support the paper's three main empirical questions: comparative performance on +tractable problems, degradation as scale increases, and feasibility near production size. diff --git a/paper-l0/sections/results.tex b/paper-l0/sections/results.tex new file mode 100644 index 000000000..e5709262b --- /dev/null +++ b/paper-l0/sections/results.tex @@ -0,0 +1,90 @@ +\section{Results} +\label{sec:results} + +The results are organized around the three benchmark tiers defined in +Section~\ref{sec:benchmark_design}.
The point of that structure is to separate three questions +that are easily conflated: how $L_0$ behaves on a tractable comparison, how the +methods degrade as the target system grows, and which methods remain usable near the production +problem. Numerical entries are still marked as placeholders pending completed benchmark runs. + +\subsection{Tier 1: tractable benchmark comparisons} + +Table~\ref{tab:comparison} reports the tractable benchmark. The mixed-target setting compares $L_0$ +with GREG on the same exported calibration package. The count-style setting compares $L_0$ with IPF +on an IPF-eligible subset derived from that same package. + +\input{tables/comparison} + +The tractable benchmark is the most direct test of whether $L_0$ performs well only because it was +designed for the full production setting. In the mixed-target benchmark, the key comparisons are +accuracy, runtime, and whether GREG produces negative weights. In the count-style benchmark, the +key question is whether $L_0$ remains competitive with IPF even when the target system is limited +to the class of margins that IPF handles naturally. + +\subsection{Tier 2: scaling frontier} + +Table~\ref{tab:scaling_frontier} summarizes the scaling ladder. Each rung uses the same basic +cloned unit universe while increasing the number of benchmark targets. This isolates how method +performance changes with constraint volume rather than with a changing data-preparation pipeline. + +\input{tables/scaling_frontier} + +The scaling frontier is expected to show more than runtime growth. It should also identify where a +method becomes numerically unstable, fails to converge, or stops being interpretable as a credible +comparison. For GREG, the relevant failure modes include memory pressure, unstable linear algebra, +and negative weights. For IPF, the relevant limits include inapplicability outside count-style +margins and non-convergence when the margin system becomes too hard to reconcile.
+ +\subsection{Tier 3: production feasibility} + +Table~\ref{tab:production_feasibility} reports the production-feasibility case. This table is +designed to be concrete. It should state which method completed, which method failed, how long the +run took, and what the failure looked like when a method did not finish. + +\input{tables/production_feasibility} + +This tier matters because a tractable benchmark alone does not establish practical usefulness. A +method can look reasonable at a few hundred targets and still fail to support a deployable +subnational dataset. The production-feasibility case links the benchmark back to the actual +\texttt{policyengine-us-data} use case. + +\subsection{Supporting diagnostics} + +The tiered benchmark tables provide the main narrative. The following diagnostics support that +narrative by showing how the fitted $L_0$ solution behaves within a completed run. + +\subsubsection{Accuracy by geography} + +Table~\ref{tab:calibration_accuracy} reports absolute relative error by geographic level for the +selected benchmark or production configuration. + +\input{tables/calibration_accuracy} + +The benchmark manifests determine the exact number of targets in each row. Those counts should come +from the exported package rather than from the full target database used by the pipeline, which can +include multiple years or inactive target families that are not part of a given experiment. + +\subsubsection{Sparsity and convergence} + +The sparsity penalty produces exact zeros in the retained weight vector, so dataset size is an +explicit output of the optimization rather than a post-processing step. For the settings reported +here, the main summary objects are the retained-record count, the effective sample size, and the +distribution of fitted weights. + +Figure~\ref{fig:convergence} plots mean absolute relative error over training epochs for the +selected $L_0$ runs. 
+ +\begin{figure}[ht] + \centering + \fbox{\parbox{0.9\textwidth}{\centering\textit{[Convergence curves to be generated from + calibration logs]}}} + \caption{Convergence diagnostics for representative $L_0$ runs. The final figure will report + the calibration loss over epochs for the tractable benchmark and the production-feasibility + run.} + \label{fig:convergence} +\end{figure} + +These diagnostics are secondary to the benchmark tiers, but they are still important. They show +whether a feasible production run is achieving that feasibility by keeping errors low, by keeping +weights stable, and by pruning the cloned universe in a controlled way rather than through ad hoc +thresholding. diff --git a/paper-l0/tables/calibration_accuracy.tex b/paper-l0/tables/calibration_accuracy.tex new file mode 100644 index 000000000..e4d69cde1 --- /dev/null +++ b/paper-l0/tables/calibration_accuracy.tex @@ -0,0 +1,21 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{lrrrr} +\toprule +Geographic level & Targets & Mean ARE (\%) & Median ARE (\%) & Max ARE (\%) \\ +\midrule +National & \tbc & \tbc & \tbc & \tbc \\ +State & \tbc & \tbc & \tbc & \tbc \\ +Congressional district & \tbc & \tbc & \tbc & \tbc \\ +\midrule +\textbf{All achievable} & \tbc & \tbc & \tbc & \tbc \\ +\bottomrule +\end{tabular} +} +\caption{Supporting diagnostic: calibration accuracy by geographic level, measured as absolute +relative error (ARE) across achievable targets for the selected benchmark or production run.} +\label{tab:calibration_accuracy} +\tablenote{Target counts should be read from the exported benchmark package rather than from the +full target database used by the pipeline.} +\end{table} diff --git a/paper-l0/tables/comparison.tex b/paper-l0/tables/comparison.tex new file mode 100644 index 000000000..8161211d0 --- /dev/null +++ b/paper-l0/tables/comparison.tex @@ -0,0 +1,23 @@ +\begin{table}[ht] +\centering +{\tablefont +\resizebox{\textwidth}{!}{% 
+\begin{tabular}{p{0.27\textwidth}p{0.14\textwidth}rrrr} +\toprule +Setting & Method & Targets & Mean ARE (\%) & Runtime & Notes \\ +\midrule +Mixed-target tractable benchmark & $L_0$ & \tbc & \tbc & \tbc & Retained records: \tbc \\ +Mixed-target tractable benchmark & GREG & \tbc & \tbc & \tbc & Negative weights: \tbc \\ +Count-style tractable benchmark & $L_0$ & \tbc & \tbc & \tbc & Retained records: \tbc \\ +Count-style tractable benchmark & IPF & \tbc & \tbc & \tbc & Converged: \tbc \\ +\bottomrule +\end{tabular} +} +} +\caption{Tier 1 tractable benchmark comparisons. The mixed-target benchmark compares $L_0$ with +GREG on the same exported package. The count-style benchmark compares $L_0$ with IPF on an +IPF-eligible subset.} +\label{tab:comparison} +\tablenote{ARE = absolute relative error across achievable targets. Exact target counts come from +the benchmark manifests, not the full target database used by the pipeline.} +\end{table} diff --git a/paper-l0/tables/hyperparameters.tex b/paper-l0/tables/hyperparameters.tex new file mode 100644 index 000000000..468299b25 --- /dev/null +++ b/paper-l0/tables/hyperparameters.tex @@ -0,0 +1,21 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{lll} +\toprule +Parameter & Value & Role \\ +\midrule +$\beta$ (gate temperature) & 0.35 & Sharpness of 0/1 gate transition \\ +$\gamma$ (left stretch) & $-0.1$ & Enables exact-zero gates after clipping \\ +$\zeta$ (right stretch) & 1.1 & Enables exact-one gates after clipping \\ +Initial keep probability & 0.999 & All records start nearly fully active \\ +$\lambda_{L_2}$ & $10^{-12}$ & Mild weight regularization \\ +Learning rate & 0.15 & Adam optimizer step size \\ +Clone count $N$ & 430 & Geographic replicas per CPS household \\ +\bottomrule +\end{tabular} +} +\caption{Hard Concrete gate and optimization hyperparameters. 
The stretch parameters $\gamma$ and $\zeta$ follow \citet{louizos2018}; other values are tuned for the calibration setting.} +\label{tab:hyperparameters} +\tablenote{Source: \texttt{unified\_calibration.py} in the \texttt{policyengine-us-data} repository.} +\end{table} diff --git a/paper-l0/tables/presets.tex b/paper-l0/tables/presets.tex new file mode 100644 index 000000000..a574a1b4f --- /dev/null +++ b/paper-l0/tables/presets.tex @@ -0,0 +1,16 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{llrrl} +\toprule +Preset & $\lambda_{L_0}$ & Epochs & Retained records & Use case \\ +\midrule +National & $10^{-4}$ & 4,000 & $\sim$50,000 & Web-based interactive simulation \\ +Local & $10^{-8}$ & 100 & $\sim$3--4 million & Subnational policy analysis \\ +\bottomrule +\end{tabular} +} +\caption{Sparsity presets. The only difference between presets is $\lambda_{L_0}$ and epoch count; all other hyperparameters (Table~\ref{tab:hyperparameters}) are shared.} +\label{tab:presets} +\tablenote{Note: Retained record counts are approximate and vary with the base CPS size and clone count.} +\end{table} diff --git a/paper-l0/tables/production_feasibility.tex b/paper-l0/tables/production_feasibility.tex new file mode 100644 index 000000000..b0598762e --- /dev/null +++ b/paper-l0/tables/production_feasibility.tex @@ -0,0 +1,21 @@ +\begin{table}[ht] +\centering +{\tablefont +\resizebox{\textwidth}{!}{% +\begin{tabular}{p{0.16\textwidth}p{0.17\textwidth}p{0.17\textwidth}rrp{0.19\textwidth}} +\toprule +Method & Target set & Status & Runtime & Peak memory & Notes \\ +\midrule +$L_0$ & Full production mixed targets & \tbc & \tbc & \tbc & Retained records: \tbc \\ +GREG & Full production mixed targets & \tbc & \tbc & \tbc & Negative weights / solver status: \tbc \\ +IPF & Production count-style subset & \tbc & \tbc & \tbc & Iterations or failure mode: \tbc \\ +\bottomrule +\end{tabular} +} +} +\caption{Tier 3 production-feasibility benchmark. 
This table should report completion and failure +states as directly as possible.} +\label{tab:production_feasibility} +\tablenote{The IPF row is reported on a count-style production subset because IPF is not used as a +general solver for the full mixed-target system.} +\end{table} diff --git a/paper-l0/tables/scaling_frontier.tex b/paper-l0/tables/scaling_frontier.tex new file mode 100644 index 000000000..244659bb3 --- /dev/null +++ b/paper-l0/tables/scaling_frontier.tex @@ -0,0 +1,26 @@ +\begin{table}[ht] +\centering +{\tablefont +\resizebox{\textwidth}{!}{% +\begin{tabular}{rrrrll} +\toprule +Tier & Targets & $L_0$ runtime & GREG runtime & IPF runtime & Main outcome \\ +\midrule +1 & 250 & \tbc & \tbc & \tbc & \tbc \\ +2 & 500 & \tbc & \tbc & \tbc & \tbc \\ +3 & 1{,}000 & \tbc & \tbc & \tbc & \tbc \\ +4 & 2{,}500 & \tbc & \tbc & \tbc & \tbc \\ +5 & 5{,}000 & \tbc & \tbc & \tbc & \tbc \\ +6 & 10{,}000 & \tbc & \tbc & \tbc & \tbc \\ +7 & \tbc[full selected tier] & \tbc & \tbc & \tbc & \tbc \\ +\bottomrule +\end{tabular} +} +} +\caption{Tier 2 scaling frontier. Each row uses the same cloned unit universe and increases the +number of selected targets. 
Runtime cells should be replaced by failure labels when a method does +not complete.} +\label{tab:scaling_frontier} +\tablenote{The final paper may split this table into separate runtime and failure-mode tables if +the notes column becomes too dense.} +\end{table} diff --git a/paper-l0/tables/target_summary.tex b/paper-l0/tables/target_summary.tex new file mode 100644 index 000000000..0bf9d1407 --- /dev/null +++ b/paper-l0/tables/target_summary.tex @@ -0,0 +1,35 @@ +\begin{table}[ht] +\centering +{\tablefont +\begin{tabular}{p{0.55\textwidth}rr} +\toprule +Source & Targets & Geographies \\ +\midrule +\multicolumn{3}{l}{\textit{District level}} \\ +IRS SOI (AGI, income tax, EITC, 21 income/ded.\ domains) & 25{,}288 & 436 district-level units \\ +Census ACS S0101 (person count by 18 age bands) & 7{,}848 & 436 district-level units \\ +Census ACS S2201 (SNAP household count) & 436 & 436 district-level units \\ +\midrule +\multicolumn{3}{l}{\textit{State level}} \\ +IRS SOI (same domains as district) & 2{,}958 & 50 states + DC \\ +Census ACS S0101 (person count by 18 age bands) & 918 & 50 states + DC \\ +USDA FNS SNAP (spending + household count) & 102 & 50 states + DC \\ +CMS Medicaid (enrollment) & 51 & 50 states + DC \\ +Census STC (state income tax collections) & 51 & 50 states + DC \\ +\midrule +\multicolumn{3}{l}{\textit{National level}} \\ +IRS SOI (domain-constrained aggregates, EITC) & 51 & 1 \\ +Curated totals (CBO, JCT, SSA, CMS, Census) & 37 & 1 \\ +Census ACS S0101 (person count by 18 age bands) & 18 & 1 \\ +\midrule +\textbf{Total} & \textbf{37{,}758} & \\ +\bottomrule +\end{tabular} +} +\caption{Calibration targets by geographic level and source. District-level targets dominate by +count. Full variable listing appears in Appendix~\ref{app:targets}.} +\label{tab:target_summary} +\tablenote{Source: active targets stored in \texttt{policy\_data.db}. 
Individual benchmark runs use +subsets of this inventory, so experimental target counts should be reported from the benchmark +manifests.} +\end{table} diff --git a/tests/unit/test_benchmark_export.py b/tests/unit/test_benchmark_export.py new file mode 100644 index 000000000..779eeee15 --- /dev/null +++ b/tests/unit/test_benchmark_export.py @@ -0,0 +1,286 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest +from scipy.io import mmwrite +from scipy.sparse import csr_matrix + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BENCHMARK_DIR = REPO_ROOT / "paper-l0" / "benchmarking" +BENCHMARK_EXPORT_PATH = BENCHMARK_DIR / "benchmark_export.py" + + +def _load_module(name: str, path: Path): + benchmark_dir_str = str(BENCHMARK_DIR) + if benchmark_dir_str not in sys.path: + sys.path.insert(0, benchmark_dir_str) + spec = importlib.util.spec_from_file_location(name, path) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + sys.modules[name] = module + spec.loader.exec_module(module) + return module + + +def test_export_bundle_writes_ipf_scoring_subset(tmp_path, monkeypatch): + benchmark_export = _load_module( + "benchmark_export_for_tests", BENCHMARK_EXPORT_PATH + ) + benchmark_manifest = _load_module( + "benchmark_manifest_for_tests", BENCHMARK_DIR / "benchmark_manifest.py" + ) + + package = { + "targets_df": pd.DataFrame( + { + "target_id": [1, 2, 3], + "value": [2.0, 3.0, 5.0], + "variable": [ + "household_count", + "household_count", + "household_count", + ], + "geo_level": ["national", "national", "national"], + "geographic_id": ["0", "0", "0"], + } + ), + "target_names": ["requested_a", "requested_b", "requested_c"], + "X_sparse": csr_matrix( + np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=np.float64) + ), + "initial_weights": np.array([1.0, 1.0], dtype=np.float64), + "metadata": {}, + } + + 
monkeypatch.setattr( + benchmark_export, "load_calibration_package_raw", lambda _path: package + ) + + def _fake_build_ipf_inputs(package, manifest, filtered_targets): + unit_metadata = pd.DataFrame( + { + "unit_index": [0, 1], + "household_id": [0, 1], + "base_weight": [1.0, 1.0], + } + ) + ipf_target_metadata = pd.DataFrame( + { + "margin_id": ["m0", "m1", "m1"], + "scope": ["household", "household", "household"], + "target_type": ["categorical_margin"] * 3, + "variables": ["district", "district|snap", "district|snap"], + "cell": ["district=A", "district=A|snap=yes", "district=A|snap=no"], + "target_value": [2.0, 1.0, 1.0], + "is_authored": [True, True, False], + } + ) + ipf_target_metadata.attrs["retained_authored_target_ids"] = [1, 2] + ipf_target_metadata.attrs["requested_target_count"] = 3 + ipf_target_metadata.attrs["retained_authored_target_count"] = 2 + ipf_target_metadata.attrs["derived_complement_count"] = 1 + ipf_target_metadata.attrs["dropped_targets"] = {"unsupported_partial_margin": 1} + ipf_target_metadata.attrs["dropped_target_details"] = [ + {"reason": "unsupported_partial_margin", "target_ids": [3]} + ] + ipf_target_metadata.attrs["margin_consistency_issues"] = [] + ipf_target_metadata.attrs["derived_complement_rows"] = [ + {"cell": "district=A|snap=no", "target_value": 1.0} + ] + return unit_metadata, ipf_target_metadata + + monkeypatch.setattr(benchmark_export, "build_ipf_inputs", _fake_build_ipf_inputs) + + manifest = benchmark_manifest.BenchmarkManifest( + name="ipf-export-test", + tier="unit", + description="", + package_path="/tmp/fake-package.pkl", + methods=["ipf"], + ) + + output_dir, info = benchmark_export.export_bundle( + manifest=manifest, + output_dir=tmp_path / "bundle", + ) + + scoring_targets = pd.read_csv( + output_dir / "inputs" / "ipf_scoring_target_metadata.csv" + ) + diagnostics = json.loads( + (output_dir / "inputs" / "ipf_conversion_diagnostics.json").read_text() + ) + + assert len(scoring_targets) == 2 + assert 
scoring_targets["target_id"].tolist() == [1, 2] + assert diagnostics["retained_authored_target_count"] == 2 + assert diagnostics["derived_complement_count"] == 1 + assert info["ipf_retained_authored_target_count"] == 2 + + +def test_export_bundle_requires_external_ipf_scoring_artifacts( + tmp_path, monkeypatch +): + benchmark_export = _load_module( + "benchmark_export_for_external_contract", BENCHMARK_EXPORT_PATH + ) + benchmark_manifest = _load_module( + "benchmark_manifest_for_external_contract", + BENCHMARK_DIR / "benchmark_manifest.py", + ) + + package = { + "targets_df": pd.DataFrame( + { + "target_id": [1], + "value": [2.0], + "variable": ["household_count"], + "geo_level": ["national"], + "geographic_id": ["0"], + } + ), + "target_names": ["requested_a"], + "X_sparse": csr_matrix(np.array([[1.0, 0.0]], dtype=np.float64)), + "initial_weights": np.array([1.0, 1.0], dtype=np.float64), + "metadata": {}, + } + + monkeypatch.setattr( + benchmark_export, "load_calibration_package_raw", lambda _path: package + ) + + unit_csv = tmp_path / "external_unit.csv" + target_csv = tmp_path / "external_targets.csv" + pd.DataFrame( + {"unit_index": [0, 1], "household_id": [0, 1], "base_weight": [1.0, 1.0]} + ).to_csv(unit_csv, index=False) + pd.DataFrame( + { + "margin_id": ["m0"], + "scope": ["household"], + "target_type": ["categorical_margin"], + "variables": ["district"], + "cell": ["district=A"], + "target_value": [2.0], + } + ).to_csv(target_csv, index=False) + + manifest = benchmark_manifest.BenchmarkManifest( + name="ipf-external-contract", + tier="unit", + description="", + package_path="/tmp/fake-package.pkl", + methods=["ipf"], + external_inputs=benchmark_manifest.ExternalInputs( + ipf_unit_metadata_csv=str(unit_csv), + ipf_target_metadata_csv=str(target_csv), + ), + ) + + with pytest.raises(ValueError, match="must provide all of"): + benchmark_export.export_bundle(manifest=manifest, output_dir=tmp_path / "bundle") + + +def 
test_export_bundle_accepts_fully_specified_external_ipf_inputs( + tmp_path, monkeypatch +): + benchmark_export = _load_module( + "benchmark_export_for_external_copy", BENCHMARK_EXPORT_PATH + ) + benchmark_manifest = _load_module( + "benchmark_manifest_for_external_copy", BENCHMARK_DIR / "benchmark_manifest.py" + ) + + package = { + "targets_df": pd.DataFrame( + { + "target_id": [1, 2], + "value": [2.0, 3.0], + "variable": ["household_count", "household_count"], + "geo_level": ["national", "national"], + "geographic_id": ["0", "0"], + } + ), + "target_names": ["requested_a", "requested_b"], + "X_sparse": csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float64)), + "initial_weights": np.array([1.0, 1.0], dtype=np.float64), + "metadata": {}, + } + + monkeypatch.setattr( + benchmark_export, "load_calibration_package_raw", lambda _path: package + ) + + unit_csv = tmp_path / "external_unit.csv" + target_csv = tmp_path / "external_targets.csv" + scoring_csv = tmp_path / "external_scoring.csv" + scoring_mtx = tmp_path / "external_scoring.mtx" + diagnostics_json = tmp_path / "external_diag.json" + + pd.DataFrame( + {"unit_index": [0, 1], "household_id": [0, 1], "base_weight": [1.0, 1.0]} + ).to_csv(unit_csv, index=False) + pd.DataFrame( + { + "margin_id": ["m0", "m0"], + "scope": ["household", "household"], + "target_type": ["categorical_margin", "categorical_margin"], + "variables": ["district", "district"], + "cell": ["district=A", "district=B"], + "target_value": [2.0, 3.0], + } + ).to_csv(target_csv, index=False) + pd.DataFrame( + { + "value": [2.0], + "variable": ["household_count"], + "geo_level": ["national"], + "target_name": ["retained_a"], + } + ).to_csv(scoring_csv, index=False) + mmwrite(str(scoring_mtx), csr_matrix(np.array([[1.0, 0.0]], dtype=np.float64))) + diagnostics_json.write_text( + json.dumps( + { + "requested_target_count": 2, + "retained_authored_target_count": 1, + "derived_complement_count": 0, + "dropped_targets": {"missing_parent_total": 
1}, + } + ) + ) + + manifest = benchmark_manifest.BenchmarkManifest( + name="ipf-external-complete", + tier="unit", + description="", + package_path="/tmp/fake-package.pkl", + methods=["ipf"], + external_inputs=benchmark_manifest.ExternalInputs( + ipf_unit_metadata_csv=str(unit_csv), + ipf_target_metadata_csv=str(target_csv), + ipf_scoring_target_metadata_csv=str(scoring_csv), + ipf_scoring_matrix_mtx=str(scoring_mtx), + ipf_conversion_diagnostics_json=str(diagnostics_json), + ), + ) + + output_dir, info = benchmark_export.export_bundle( + manifest=manifest, + output_dir=tmp_path / "bundle", + ) + + assert (output_dir / "inputs" / "ipf_target_metadata.csv").exists() + assert (output_dir / "inputs" / "ipf_scoring_target_metadata.csv").exists() + copied_diag = json.loads( + (output_dir / "inputs" / "ipf_conversion_diagnostics.json").read_text() + ) + assert copied_diag["retained_authored_target_count"] == 1 + assert info["ipf_retained_authored_target_count"] == 1 diff --git a/tests/unit/test_benchmarking_runners.py b/tests/unit/test_benchmarking_runners.py new file mode 100644 index 000000000..149e3f9db --- /dev/null +++ b/tests/unit/test_benchmarking_runners.py @@ -0,0 +1,503 @@ +from __future__ import annotations + +import importlib.util +import json +import shutil +import subprocess +import sys +from types import SimpleNamespace +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest +from scipy.io import mmwrite +from scipy.sparse import csr_matrix + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BENCHMARK_DIR = REPO_ROOT / "paper-l0" / "benchmarking" +BENCHMARK_CLI_PATH = BENCHMARK_DIR / "benchmark_cli.py" + + +def _r_package_available(package: str) -> bool: + proc = subprocess.run( + [ + "Rscript", + "-e", + f"quit(status = if (requireNamespace('{package}', quietly = TRUE)) 0 else 1)", + ], + check=False, + capture_output=True, + text=True, + ) + return proc.returncode == 0 + + +def _load_benchmark_cli_module(): + 
benchmark_dir_str = str(BENCHMARK_DIR) + if benchmark_dir_str not in sys.path: + sys.path.insert(0, benchmark_dir_str) + spec = importlib.util.spec_from_file_location( + "benchmark_cli_for_tests", BENCHMARK_CLI_PATH + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +def _write_common_inputs( + run_dir: Path, + matrix, + target_values: list[float], + variables: list[str], + geo_levels: list[str] | None = None, + target_names: list[str] | None = None, + initial_weights: np.ndarray | None = None, + method_options: dict | None = None, +) -> Path: + inputs = run_dir / "inputs" + outputs = run_dir / "outputs" + inputs.mkdir(parents=True, exist_ok=True) + outputs.mkdir(parents=True, exist_ok=True) + + mmwrite(str(inputs / "X_targets_by_units.mtx"), matrix) + if initial_weights is None: + initial_weights = np.ones(matrix.shape[1], dtype=np.float64) + np.save( + inputs / "initial_weights.npy", np.asarray(initial_weights, dtype=np.float64) + ) + + if geo_levels is None: + geo_levels = ["national"] * len(target_values) + if target_names is None: + target_names = [f"target_{idx}" for idx in range(len(target_values))] + + target_metadata = pd.DataFrame( + { + "value": np.asarray(target_values, dtype=np.float64), + "variable": variables, + "geo_level": geo_levels, + "target_name": target_names, + } + ) + target_metadata.to_csv(inputs / "target_metadata.csv", index=False) + + manifest = { + "method_options": method_options or {}, + } + with open(inputs / "benchmark_manifest.json", "w") as f: + json.dump(manifest, f) + + return inputs + + +@pytest.fixture +def benchmark_cli_module(monkeypatch, tmp_path_factory): + cache_root = tmp_path_factory.mktemp("benchmarking-cache") + monkeypatch.setenv("MPLCONFIGDIR", str(cache_root / "mpl")) + monkeypatch.setenv("XDG_CACHE_HOME", str(cache_root / "xdg")) + return _load_benchmark_cli_module() + + +@pytest.fixture(autouse=True) +def _require_rscript(): 
+ if shutil.which("Rscript") is None: + pytest.skip("Rscript is required for benchmarking runner tests") + + +def test_greg_runner_end_to_end_exact_fit(benchmark_cli_module, tmp_path): + if not _r_package_available("survey"): + pytest.skip("R package 'survey' is required for this test") + + run_dir = tmp_path / "greg-run" + matrix = csr_matrix(np.eye(2, dtype=np.float64)) + _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[2.0, 3.0], + variables=["household_count", "person_count"], + method_options={"greg": {"maxit": 50, "epsilon": 1e-10}}, + ) + + weights_path, _ = benchmark_cli_module._run_greg(run_dir) + fitted_weights = np.load(weights_path) + + np.testing.assert_allclose( + fitted_weights, np.array([2.0, 3.0]), atol=1e-8, rtol=1e-8 + ) + np.testing.assert_allclose( + matrix.dot(fitted_weights), np.array([2.0, 3.0]), atol=1e-8 + ) + + +def test_ipf_runner_end_to_end_categorical_margin_person_scope( + benchmark_cli_module, tmp_path +): + if not _r_package_available("surveysd"): + pytest.skip("R package 'surveysd' is required for this test") + + run_dir = tmp_path / "ipf-person-margin-run" + # Two household units, each with two person rows (same unit_index). One unit's + # persons are age_bracket="0-4", the other's are "5-9". Person-scope margin + # targets 4 people total in each bucket -> each unit's weight doubles. 
+ matrix = csr_matrix(np.array([[2.0, 0.0], [0.0, 2.0]], dtype=np.float64)) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[4.0, 4.0], + variables=["person_count", "person_count"], + method_options={ + "ipf": {"max_iter": 50, "bound": 10.0, "epsP": 1e-9, "epsH": 1e-9} + }, + ) + + unit_metadata = pd.DataFrame( + { + "unit_index": [0, 0, 1, 1], + "household_id": [0, 0, 1, 1], + "age_bracket": ["0-4", "0-4", "5-9", "5-9"], + } + ) + unit_metadata.to_csv(inputs / "unit_metadata.csv", index=False) + + ipf_target_metadata = pd.DataFrame( + { + "scope": ["person", "person"], + "target_type": ["categorical_margin", "categorical_margin"], + "margin_id": ["age_margin", "age_margin"], + "variables": ["age_bracket", "age_bracket"], + "cell": ["age_bracket=0-4", "age_bracket=5-9"], + "target_value": [4.0, 4.0], + } + ) + ipf_target_metadata.to_csv(inputs / "ipf_target_metadata.csv", index=False) + + weights_path, _ = benchmark_cli_module._run_ipf(run_dir) + fitted_weights = np.load(weights_path) + + np.testing.assert_allclose( + fitted_weights, np.array([2.0, 2.0]), atol=1e-8, rtol=1e-8 + ) + np.testing.assert_allclose( + matrix.dot(fitted_weights), np.array([4.0, 4.0]), atol=1e-8 + ) + + +def test_ipf_runner_end_to_end_categorical_margin_household_scope( + benchmark_cli_module, tmp_path +): + if not _r_package_available("surveysd"): + pytest.skip("R package 'surveysd' is required for this test") + + run_dir = tmp_path / "ipf-margin-run" + matrix = csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0]], dtype=np.float64)) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[2.0, 2.0], + variables=["household_count", "household_count"], + method_options={ + "ipf": {"max_iter": 50, "bound": 10.0, "epsP": 1e-9, "epsH": 1e-9} + }, + ) + + unit_metadata = pd.DataFrame( + { + "unit_index": [0, 1], + "household_id": [0, 1], + "snap": ["yes", "no"], + } + ) + unit_metadata.to_csv(inputs / "unit_metadata.csv", index=False) + 
+ ipf_target_metadata = pd.DataFrame( + { + "scope": ["household", "household"], + "target_type": ["categorical_margin", "categorical_margin"], + "margin_id": ["snap_margin", "snap_margin"], + "variables": ["snap", "snap"], + "cell": ["snap=yes", "snap=no"], + "target_value": [2.0, 2.0], + } + ) + ipf_target_metadata.to_csv(inputs / "ipf_target_metadata.csv", index=False) + + weights_path, _ = benchmark_cli_module._run_ipf(run_dir) + fitted_weights = np.load(weights_path) + + np.testing.assert_allclose( + fitted_weights, np.array([2.0, 2.0]), atol=1e-8, rtol=1e-8 + ) + np.testing.assert_allclose( + matrix.dot(fitted_weights), np.array([2.0, 2.0]), atol=1e-8 + ) + + +def test_ipf_runner_end_to_end_single_cell_margin_leaves_complement_untouched( + benchmark_cli_module, tmp_path +): + """A 1-cell categorical_margin must rake the authored cell and leave + units outside that cell at their base weights. This is the semantic the + converter now relies on instead of synthesizing a baseline complement. + """ + if not _r_package_available("surveysd"): + pytest.skip("R package 'surveysd' is required for this test") + + run_dir = tmp_path / "ipf-single-cell-run" + # Two households: unit 0 is snap=yes, unit 1 is snap=no. Target: 1 yes-hh + # (authored), no complement constraint. Expect calib[0]=2.0 (rescaled from + # base 4.0 -> 2.0 so 1 weighted hh = 1), calib[1]=4.0 (untouched base). 
+ matrix = csr_matrix(np.array([[1.0, 0.0]], dtype=np.float64)) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[2.0], + variables=["household_count"], + initial_weights=np.array([4.0, 4.0], dtype=np.float64), + method_options={ + "ipf": {"max_iter": 100, "bound": 10.0, "epsP": 1e-9, "epsH": 1e-9} + }, + ) + + unit_metadata = pd.DataFrame( + { + "unit_index": [0, 1], + "household_id": [0, 1], + "snap": ["yes", "no"], + "base_weight": [4.0, 4.0], + } + ) + unit_metadata.to_csv(inputs / "unit_metadata.csv", index=False) + + # 1-cell authored margin: only snap=yes has a target, no complement row. + ipf_target_metadata = pd.DataFrame( + { + "scope": ["household"], + "target_type": ["categorical_margin"], + "margin_id": ["snap_yes_only"], + "variables": ["snap"], + "cell": ["snap=yes"], + "target_value": [2.0], + } + ) + ipf_target_metadata.to_csv(inputs / "ipf_target_metadata.csv", index=False) + + weights_path, _ = benchmark_cli_module._run_ipf(run_dir) + fitted_weights = np.load(weights_path) + + # Authored cell (snap=yes, unit 0): rescaled to hit target = 2.0. + np.testing.assert_allclose(fitted_weights[0], 2.0, atol=1e-8, rtol=1e-8) + # Complement unit (snap=no, unit 1): untouched, equals its base weight. + np.testing.assert_allclose(fitted_weights[1], 4.0, atol=1e-8, rtol=1e-8) + + +def test_ipf_runner_rejects_numeric_total_target_type(benchmark_cli_module, tmp_path): + """The runner supports only `categorical_margin`. A CSV with + `target_type='numeric_total'` must fail fast with a clear error so old + external pipelines cannot silently fall back to the (removed) + `benchmark_all` raking path. 
+ """ + if not _r_package_available("surveysd"): + pytest.skip("R package 'surveysd' is required for this test") + + run_dir = tmp_path / "ipf-numeric-rejected" + matrix = csr_matrix(np.array([[1.0, 0.0]], dtype=np.float64)) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[1.0], + variables=["person_count"], + method_options={ + "ipf": {"max_iter": 10, "bound": 10.0, "epsP": 1e-4, "epsH": 1e-4} + }, + ) + pd.DataFrame( + { + "unit_index": [0, 1], + "household_id": [0, 1], + "snap": ["yes", "no"], + "base_weight": [1.0, 1.0], + } + ).to_csv(inputs / "unit_metadata.csv", index=False) + pd.DataFrame( + { + "scope": ["person"], + "target_type": ["numeric_total"], + "margin_id": ["_numeric"], + "value_column": ["snap"], + "variables": ["snap"], + "cell": ["snap=yes"], + "target_value": [1.0], + } + ).to_csv(inputs / "ipf_target_metadata.csv", index=False) + + with pytest.raises(RuntimeError, match="IPF runner failed"): + benchmark_cli_module._run_ipf(run_dir) + + +def test_ipf_runner_single_call_multi_margin_exact_fit(benchmark_cli_module, tmp_path): + """Compatible closed margins should run in one `surveysd::ipf` call.""" + if not _r_package_available("surveysd"): + pytest.skip("R package 'surveysd' is required for this test") + + run_dir = tmp_path / "ipf-joint-run" + matrix = csr_matrix( + np.array( + [ + [1.0, 1.0, 0.0, 0.0], # district A count + [0.0, 0.0, 1.0, 1.0], # district B count + [1.0, 0.0, 0.0, 0.0], # district A snap=yes + [0.0, 1.0, 0.0, 0.0], # district A snap=no + [0.0, 0.0, 1.0, 0.0], # district B snap=yes + [0.0, 0.0, 0.0, 1.0], # district B snap=no + ], + dtype=np.float64, + ) + ) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[4.0, 4.0, 2.0, 2.0, 2.0, 2.0], + variables=["household_count"] * 6, + initial_weights=np.ones(4, dtype=np.float64), + method_options={ + "ipf": {"max_iter": 200, "bound": 10.0, "epsP": 1e-9, "epsH": 1e-9} + }, + ) + pd.DataFrame( + { + "unit_index": 
[0, 1, 2, 3], + "household_id": [0, 1, 2, 3], + "district": ["A", "A", "B", "B"], + "snap": ["yes", "no", "yes", "no"], + "base_weight": [1.0, 1.0, 1.0, 1.0], + } + ).to_csv(inputs / "unit_metadata.csv", index=False) + + pd.DataFrame( + { + "scope": ["household"] * 6, + "target_type": ["categorical_margin"] * 6, + "margin_id": [ + "district_margin", + "district_margin", + "district_snap_margin", + "district_snap_margin", + "district_snap_margin", + "district_snap_margin", + ], + "variables": [ + "district", + "district", + "district|snap", + "district|snap", + "district|snap", + "district|snap", + ], + "cell": [ + "district=A", + "district=B", + "district=A|snap=yes", + "district=A|snap=no", + "district=B|snap=yes", + "district=B|snap=no", + ], + "target_value": [4.0, 4.0, 2.0, 2.0, 2.0, 2.0], + } + ).to_csv(inputs / "ipf_target_metadata.csv", index=False) + + weights_path, _ = benchmark_cli_module._run_ipf(run_dir) + fitted_weights = np.load(weights_path) + + np.testing.assert_allclose( + matrix.dot(fitted_weights), + np.array([4.0, 4.0, 2.0, 2.0, 2.0, 2.0]), + atol=1e-6, + ) + assert not (run_dir / "outputs" / "_ipf_blocks").exists() + + +def test_cmd_run_ipf_uses_retained_authored_scoring_subset( + benchmark_cli_module, tmp_path +): + run_dir = tmp_path / "ipf-summary-run" + matrix = csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=float)) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[2.0, 3.0, 5.0], + variables=["household_count", "household_count", "household_count"], + target_names=["requested_a", "requested_b", "requested_c"], + ) + subset_matrix = csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0]], dtype=float)) + mmwrite(str(inputs / "ipf_scoring_X_targets_by_units.mtx"), subset_matrix) + pd.DataFrame( + { + "value": [2.0, 3.0], + "variable": ["household_count", "household_count"], + "geo_level": ["national", "national"], + "target_name": ["retained_a", "retained_b"], + } + ).to_csv(inputs / 
"ipf_scoring_target_metadata.csv", index=False) + + weights_path = run_dir / "outputs" / "fitted_weights.npy" + np.save(weights_path, np.array([2.0, 3.0], dtype=np.float64)) + + def _fake_run_ipf(_run_dir): + return weights_path, 0.0 + + benchmark_cli_module._run_ipf = _fake_run_ipf + exit_code = benchmark_cli_module.cmd_run( + SimpleNamespace(method="ipf", run_dir=str(run_dir)) + ) + + assert exit_code == 0 + summary = json.loads((run_dir / "outputs" / "ipf_summary.json").read_text()) + assert summary["n_targets"] == 2 + assert summary["scoring_target_set"] == "ipf_retained_authored" + + +def test_cmd_run_l0_can_opt_into_retained_authored_scoring_subset( + benchmark_cli_module, tmp_path +): + run_dir = tmp_path / "l0-summary-run" + matrix = csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=float)) + inputs = _write_common_inputs( + run_dir=run_dir, + matrix=matrix, + target_values=[2.0, 3.0, 5.0], + variables=["household_count", "household_count", "household_count"], + target_names=["requested_a", "requested_b", "requested_c"], + ) + subset_matrix = csr_matrix(np.array([[1.0, 0.0], [0.0, 1.0]], dtype=float)) + mmwrite(str(inputs / "ipf_scoring_X_targets_by_units.mtx"), subset_matrix) + pd.DataFrame( + { + "value": [2.0, 3.0], + "variable": ["household_count", "household_count"], + "geo_level": ["national", "national"], + "target_name": ["retained_a", "retained_b"], + } + ).to_csv(inputs / "ipf_scoring_target_metadata.csv", index=False) + + weights_path = run_dir / "outputs" / "fitted_weights.npy" + np.save(weights_path, np.array([2.0, 3.0], dtype=np.float64)) + + def _fake_run_l0(_run_dir): + return weights_path + + benchmark_cli_module._run_l0 = _fake_run_l0 + exit_code = benchmark_cli_module.cmd_run( + SimpleNamespace( + method="l0", + run_dir=str(run_dir), + score_on="ipf_retained_authored", + ) + ) + + assert exit_code == 0 + summary = json.loads((run_dir / "outputs" / "l0_summary.json").read_text()) + assert summary["n_targets"] == 2 + assert 
summary["scoring_target_set"] == "ipf_retained_authored" diff --git a/tests/unit/test_ipf_conversion.py b/tests/unit/test_ipf_conversion.py new file mode 100644 index 000000000..72bebd427 --- /dev/null +++ b/tests/unit/test_ipf_conversion.py @@ -0,0 +1,350 @@ +"""Unit tests for paper-l0/benchmarking/ipf_conversion.py. + +These tests stay on the pure-Python side of the converter: target resolution, +closed-margin classification, exact complement derivation from authored parent +totals, and mixed-scope invariance guards. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import pandas as pd +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BENCHMARK_DIR = REPO_ROOT / "paper-l0" / "benchmarking" + + +@pytest.fixture(scope="module") +def ipf_conversion(): + benchmark_dir_str = str(BENCHMARK_DIR) + if benchmark_dir_str not in sys.path: + sys.path.insert(0, benchmark_dir_str) + import ipf_conversion as module # noqa: PLC0415 + + return module + + +def _make_age_partition_targets(): + targets = pd.DataFrame( + [ + { + "target_id": 1, + "stratum_id": 1, + "variable": "person_count", + "value": 210.0, + "target_name": "pc_0_4_d601", + }, + { + "target_id": 2, + "stratum_id": 2, + "variable": "person_count", + "value": 205.0, + "target_name": "pc_5_9_d601", + }, + ] + ) + constraints = { + 1: [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "601", + }, + {"variable": "age", "operation": ">", "value": "-1"}, + {"variable": "age", "operation": "<", "value": "5"}, + ], + 2: [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "601", + }, + {"variable": "age", "operation": ">", "value": "4"}, + {"variable": "age", "operation": "<", "value": "10"}, + ], + } + unit_data = pd.DataFrame( + { + "unit_index": [0, 1, 2, 3], + "household_id": [0, 0, 1, 1], + "congressional_district_geoid": [601, 601, 601, 601], + "age_bracket": ["0-4", "0-4", "5-9", "5-9"], + } + ) + 
return targets, constraints, unit_data + + +def _make_snap_subset_and_total_targets(): + targets = pd.DataFrame( + [ + { + "target_id": 10, + "stratum_id": 10, + "variable": "household_count", + "value": 10.0, + "target_name": "hh_total_d601", + }, + { + "target_id": 11, + "stratum_id": 11, + "variable": "household_count", + "value": 8.0, + "target_name": "hh_total_d602", + }, + { + "target_id": 12, + "stratum_id": 12, + "variable": "household_count", + "value": 4.0, + "target_name": "hh_snap_pos_d601", + }, + { + "target_id": 13, + "stratum_id": 13, + "variable": "household_count", + "value": 3.0, + "target_name": "hh_snap_pos_d602", + }, + ] + ) + constraints = { + 10: [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "601", + } + ], + 11: [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "602", + } + ], + 12: [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "601", + }, + {"variable": "snap", "operation": ">", "value": "0"}, + ], + 13: [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "602", + }, + {"variable": "snap", "operation": ">", "value": "0"}, + ], + } + unit_data = pd.DataFrame( + { + "unit_index": [0, 1, 2, 3], + "household_id": [0, 1, 2, 3], + "congressional_district_geoid": [601, 601, 602, 602], + "snap_positive": [ + "positive", + "non_positive", + "positive", + "non_positive", + ], + } + ) + return targets, constraints, unit_data + + +def test_close_margins_full_partition_keeps_authored_cells(ipf_conversion): + targets, constraints, unit_data = _make_age_partition_targets() + resolved, unresolved = ipf_conversion.resolve_targets_for_testing( + targets, constraints + ) + assert unresolved == [] + + closed_blocks, dropped = ipf_conversion.close_margins_for_testing( + resolved=resolved, + unit_data=unit_data, + ) + assert dropped == [] + assert len(closed_blocks) == 1 + block = closed_blocks[0] + assert 
block.closure_status == "full_partition" + assert all(cell.is_authored for cell in block.cells) + + emitted = ipf_conversion.emit_target_rows(closed_blocks) + assert len(emitted) == 2 + assert emitted["is_authored"].tolist() == [True, True] + assert set(emitted["cell"]) == { + "age_bracket=0-4|congressional_district_geoid=601", + "age_bracket=5-9|congressional_district_geoid=601", + } + + +def test_close_margins_binary_subset_derives_complement_from_parent_total( + ipf_conversion, +): + targets, constraints, unit_data = _make_snap_subset_and_total_targets() + resolved, unresolved = ipf_conversion.resolve_targets_for_testing( + targets, constraints + ) + assert unresolved == [] + + closed_blocks, dropped = ipf_conversion.close_margins_for_testing( + resolved=resolved, + unit_data=unit_data, + ) + assert dropped == [] + assert len(closed_blocks) == 2 + assert {block.closure_status for block in closed_blocks} == { + "full_partition", + "binary_subset_with_parent_total", + } + + emitted = ipf_conversion.emit_target_rows(closed_blocks) + subset_rows = emitted.loc[ + emitted["closure_status"] == "binary_subset_with_parent_total" + ].reset_index(drop=True) + derived_rows = subset_rows.loc[~subset_rows["is_authored"]].sort_values( + "cell" + ).reset_index(drop=True) + assert len(derived_rows) == 2 + assert set(derived_rows["cell"]) == { + "congressional_district_geoid=601|snap_positive=non_positive", + "congressional_district_geoid=602|snap_positive=non_positive", + } + assert derived_rows["target_value"].tolist() == [6.0, 5.0] + assert (derived_rows["derivation_reason"] == "authored_parent_total").all() + + +def test_close_margins_missing_parent_total_drops_open_subset(ipf_conversion): + targets, constraints, unit_data = _make_snap_subset_and_total_targets() + open_subset_targets = targets.loc[targets["target_id"].isin([12, 13])].reset_index( + drop=True + ) + open_subset_constraints = {12: constraints[12], 13: constraints[13]} + + resolved, unresolved = 
ipf_conversion.resolve_targets_for_testing( + open_subset_targets, open_subset_constraints + ) + assert unresolved == [] + + closed_blocks, dropped = ipf_conversion.close_margins_for_testing( + resolved=resolved, + unit_data=unit_data, + ) + assert closed_blocks == [] + assert len(dropped) == 1 + assert dropped[0]["reason"] == "missing_parent_total" + + +def test_close_margins_ambiguous_parent_total_drops_subset(ipf_conversion): + targets, constraints, unit_data = _make_snap_subset_and_total_targets() + ambiguous_targets = pd.concat( + [ + targets.loc[targets["target_id"].isin([10, 11, 12, 13])], + pd.DataFrame( + [ + { + "target_id": 14, + "stratum_id": 14, + "variable": "household_count", + "value": 9.0, + "target_name": "hh_total_d601_duplicate", + } + ] + ), + ], + ignore_index=True, + ) + ambiguous_constraints = dict(constraints) + ambiguous_constraints[14] = [ + { + "variable": "congressional_district_geoid", + "operation": "==", + "value": "601", + } + ] + + resolved, unresolved = ipf_conversion.resolve_targets_for_testing( + ambiguous_targets, ambiguous_constraints + ) + assert unresolved == [] + + closed_blocks, dropped = ipf_conversion.close_margins_for_testing( + resolved=resolved, + unit_data=unit_data, + ) + assert len(closed_blocks) == 1 + assert len(dropped) == 1 + assert dropped[0]["reason"] == "ambiguous_parent_total" + + +def test_margin_consistency_uses_closed_subset_totals(ipf_conversion): + targets, constraints, unit_data = _make_snap_subset_and_total_targets() + resolved, _ = ipf_conversion.resolve_targets_for_testing(targets, constraints) + closed_blocks, dropped = ipf_conversion.close_margins_for_testing( + resolved=resolved, + unit_data=unit_data, + ) + assert dropped == [] + issues = ipf_conversion.check_margin_consistency(closed_blocks) + assert issues == [] + + +def test_household_invariance_guard_drops_person_varying_household_margin( + ipf_conversion, +): + targets = pd.DataFrame( + [ + { + "target_id": 30, + "stratum_id": 30, + 
"variable": "household_count", + "value": 1.0, + "target_name": "hh_age_u5", + } + ] + ) + constraints = { + 30: [ + {"variable": "age", "operation": ">", "value": "-1"}, + {"variable": "age", "operation": "<", "value": "5"}, + ] + } + resolved, unresolved = ipf_conversion.resolve_targets_for_testing( + targets, constraints + ) + assert unresolved == [] + blocks = ipf_conversion.assemble_margins_for_testing(resolved) + unit_data = pd.DataFrame( + { + "household_id": [0, 0, 1, 1], + "age_bracket": ["0-4", "5-9", "0-4", "5-9"], + } + ) + + valid_blocks, dropped = ipf_conversion._validate_household_margin_invariance( + unit_data=unit_data, + blocks=blocks, + ) + assert valid_blocks == [] + assert len(dropped) == 1 + assert dropped[0]["reason"] == "non_invariant_household_constraint_variable" + assert dropped[0]["columns"] == ["age_bracket"] + + +def test_emit_target_rows_rejects_unclosed_margin_blocks(ipf_conversion): + targets, constraints, _ = _make_age_partition_targets() + resolved, unresolved = ipf_conversion.resolve_targets_for_testing( + targets, constraints + ) + assert unresolved == [] + blocks = ipf_conversion.assemble_margins_for_testing(resolved) + + with pytest.raises(TypeError, match="expects closed margin blocks"): + ipf_conversion.emit_target_rows(blocks)