From e0fbb04a581e2c9db6b60f31b694f99bda86d352 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 16:35:23 -0400 Subject: [PATCH 1/6] DiD-absorb HC2/HC2-BM: auto-route to fixed_effects internally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts `DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})` NotImplementedError at `estimators.py:373` (previous) → auto-route at line 382 (new). FWL preserves coefficients and residuals under within-transform but not the hat matrix, so HC2 leverage and CR2 Bell-McCaffrey DOF need the FULL FE hat. Internally promoting `absorb=` to `fixed_effects=` for HC2/HC2-BM fits builds the full-dummy design and routes through the existing fixed-effects code path, which already computes the correct vcov. Verified by reading clubSandwich's `R/CR-adjustments.R` source (the CR2 unweighted branch's `A_g = (I - H_gg)^{-1/2}` with H built on the full model matrix is exactly what diff-diff's existing `_compute_cr2_bm` produces). Singleton-cluster CR2 (`cluster=1:n`) reduces to one-way HC2-BM Satterthwaite DOF — the PT2018-blessed workaround we use for the unclustered HC2-BM goldens. Parity tested at ~1e-10 vs `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` via new `tests/test_estimators_vcov_type.py::TestDiDAbsorbedFERParity` against new `absorbed_fe_did` scenario in `benchmarks/data/clubsandwich_cr2_golden.json` (regenerated via the extended `benchmarks/R/generate_clubsandwich_golden.R`). Out of scope (TODO.md partial drain): `TwoWayFixedEffects` and `MultiPeriodDiD(absorb=...)` rejections remain — they have different fit-path structure that needs separate surgery. Weighted variants (`hc2_bm + weights`) and Conley + absorb paths are unchanged. Behavioral note: under the auto-route, `result.coefficients` now contains the FE-dummy entries (matching the `fixed_effects=` path), not the slope-only view a plain `absorb=` returns. Downstream consumers reading `result.att` are unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 1 + TODO.md | 2 +- benchmarks/R/generate_clubsandwich_golden.R | 48 +++++++ benchmarks/data/clubsandwich_cr2_golden.json | 17 ++- diff_diff/estimators.py | 37 ++--- tests/test_estimators_vcov_type.py | 137 ++++++++++++++++--- tests/test_linalg_hc2_bm.py | 6 + 7 files changed, 210 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdc888d7a..be3bc91bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2", "hc2_bm"})` now supported** (`diff_diff/estimators.py:382`). Previously raised `NotImplementedError` because the HC2 leverage correction and CR2 Bell-McCaffrey DOF depend on the FULL FE hat matrix, while within-transformation (FWL) preserves coefficients and residuals but not the hat. Lift via internal auto-route: when `absorb=` is paired with `vcov_type in {"hc2","hc2_bm"}`, the fit promotes the absorb columns to `fixed_effects=` internally so the existing full-dummy-design code path computes the algebraically correct vcov. Empirically matches `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` at ~1e-10 (verified via new `tests/test_estimators_vcov_type.py::TestDiDAbsorbedFERParity` against `benchmarks/data/clubsandwich_cr2_golden.json` scenario `absorbed_fe_did`, with the R generator using the singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF). HC1/CR1 paths unchanged. `MultiPeriodDiD(absorb=...)` and `TwoWayFixedEffects` rejections remain as follow-ups (different fit-path structure). **Behavioral note:** under the auto-route, `result.coefficients` includes the FE-dummy entries (matching the `fixed_effects=` path), not the slope-only view a plain `absorb=` returns; downstream consumers reading `result.att` are unaffected. - **BaconDecomposition R parity goldens.** Closes the PR-B deferral row in `TODO.md`. JSON goldens at `benchmarks/data/r_bacondecomp_golden.json` generated from the committed `benchmarks/R/generate_bacon_golden.R` script (3 fixtures: `uniform_3groups_with_never_treated`, `two_groups_no_never_treated`, `always_treated_remapped`) against `bacondecomp 0.1.1` on R 4.5.2. `tests/test_methodology_bacon.py::TestBaconParityR` now active (4 tests, no skips): TWFE coefficient parity at `atol=1e-6` across all 3 fixtures; weights-sum parity at `atol=1e-6` across all 3 fixtures; per-component estimate + weight parity at `atol=1e-6` on the 2 non-remap fixtures **and on the 6 timing-vs-timing rows of `always_treated_remapped`** (carve-out narrowed to U-bucket rows only); plus a dedicated fold-back test (`test_always_treated_remapped_fold_back_matches_r`) that pins the **documented convention divergence** on `always_treated_remapped` (R keeps `first_treat=1` as a distinct timing cohort and emits `Later vs Always Treated` comparisons; Python's paper-footnote-11 convention remaps those units to `U` and folds them into a single `treated_vs_never` cell per treated cohort) by aggregating R's split rows per cohort and asserting they match Python's single fold at `atol=1e-6`. The aggregate is invariant per Theorem 1; the per-component breakdown differs structurally between conventions but the fold-back is now directly asserted. New `**Note (R parity convention divergence on always-treated)**` and `**Deviation (first-period boundary extension on always-treated remap)**` in `docs/methodology/REGISTRY.md`. **First-period boundary deviation:** the paper uses strict `t_i < 1` for the always-treated bucket; the library uses the inclusive `first_treat <= min(time)` rule and folds `first_treat == min(time)` cohorts into `U`. R does NOT apply this fold (it keeps such cohorts as their own bucket). When `min(time) > 1` the rules coincide. Explicitly labeled in REGISTRY's Deviations block and mirrored in `METHODOLOGY_REVIEW.md` and `bacon.py`. METHODOLOGY_REVIEW.md tracker row promoted `**Complete** (R parity goldens pending)` → `**Complete**`. - **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference(cluster="unit").fit(..., time="post")` (the cluster kwarg is required because `TripleDifference` is the repeated-cross-section `panel=FALSE` estimator and unclustered SE on panel-generated rows understates variance under within-unit serial correlation; the point estimate `att` is invariant to clustering — see the new `TripleDifference` REGISTRY note on panel-shaped input). Users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete** — initially with an R-parity-goldens caveat that was closed by the parity-goldens bullet above in this same release). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (34 tests across 6 classes post-release; the audit added ~24 tests and the R-parity-goldens bullet above expanded coverage: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` (4 tests, all active post-release once the R parity goldens bullet above landed; skips cleanly with a regenerate-instructions pointer in partial-checkout scenarios where the JSON is unavailable); `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); the JSON goldens deferral at audit time was closed in this same release by the parity-goldens bullet above. (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row). diff --git a/TODO.md b/TODO.md index 7f3be5b5f..7e1633ee3 100644 --- a/TODO.md +++ b/TODO.md @@ -96,7 +96,7 @@ Deferred items from PR reviews that were not addressed before merge. | WooldridgeDiD: Stata `jwdid` golden value tests — add R/Stata reference script and `TestReferenceValues` class. | `tests/test_wooldridge.py` | #216 | Medium | | Thread `vcov_type` (classical / hc1 / hc2 / hc2_bm) through the 8 standalone estimators that expose `cluster=`: `CallawaySantAnna`, `SunAbraham`, `ImputationDiD`, `TwoStageDiD`, `TripleDifference`, `StackedDiD`, `WooldridgeDiD`, `EfficientDiD`. Phase 1a added `vcov_type` to the `DifferenceInDifferences` inheritance chain only. | multiple | Phase 1a | Medium | | Weighted one-way Bell-McCaffrey (`vcov_type="hc2_bm"` + `weights`, no cluster) currently raises `NotImplementedError`. `_compute_bm_dof_from_contrasts` builds its hat matrix from the unscaled design via `X (X'WX)^{-1} X' W`, but `solve_ols` solves the WLS problem by transforming to `X* = sqrt(w) X`, so the correct symmetric idempotent residual-maker is `M* = I - sqrt(W) X (X'WX)^{-1} X' sqrt(W)`. Rederive the Satterthwaite `(tr G)^2 / tr(G^2)` ratio on the transformed design and add weighted parity tests before lifting the guard. | `linalg.py::_compute_bm_dof_from_contrasts`, `linalg.py::_validate_vcov_args` | Phase 1a | Medium | -| HC2 / HC2 + Bell-McCaffrey on absorbed-FE fits currently raises `NotImplementedError` in three places: `TwoWayFixedEffects` unconditionally; `DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})`; `MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})`. Within-transformation preserves coefficients and residuals under FWL but not the hat matrix, so the reduced-design `h_ii` is not the diagonal of the full FE projection and CR2's block adjustment `A_g = (I - H_gg)^{-1/2}` is likewise wrong on absorbed cluster blocks. Lifting the guard needs HC2/CR2-BM computed from the full absorbed projection (unit/time FE dummies reconstructed internally, or a FE-aware hat-matrix formulation) and a parity harness against a full-dummy OLS run or R `fixest`/`clubSandwich`. HC1/CR1 are unaffected by this because they have no leverage term. | `twfe.py::fit`, `estimators.py::DifferenceInDifferences.fit`, `estimators.py::MultiPeriodDiD.fit` | Phase 1a | Medium | +| HC2 / HC2 + Bell-McCaffrey on absorbed-FE fits — REMAINING sub-gates: `TwoWayFixedEffects` (`twfe.py:154` rejects unconditionally); `MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})` (`estimators.py:1458` rejects). The DiD sub-gate (`DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})`) was lifted via auto-route to `fixed_effects=` internally; clubSandwich-parity at 1e-10 verified. The same auto-route pattern can apply to MPD-absorb; TWFE is its own class and may need different surgery (TWFE always within-transforms with no equivalent `fixed_effects=` path). Within-transformation preserves coefficients and residuals under FWL but not the hat matrix; HC1/CR1 are unaffected (no leverage term). | `twfe.py::fit`, `estimators.py::MultiPeriodDiD.fit` | follow-up | Medium | | Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium | | Unify Rust local-method `estimate_model` solver path to `solve_wls_svd` (the same SVD helper used by the global-method since PR #348) for sub-1e-14 bootstrap SE parity. Current local-method bootstrap parity test (`tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility_local`) passes at `atol=1e-5` — the residual ~1e-7 gap is roundoff between Rust's `estimate_model` matrix factorization and numpy's `lstsq`, which accumulates differently across per-replicate bootstrap fits. Main-fit ATT parity is regime-dependent (`atol=1e-14` for `lambda_nn=inf`, `atol=1e-10` for finite `lambda_nn` — see `test_local_method_main_fit_parity`); the bootstrap gap is a same-solver-path roundoff concern and not a user-visible correctness bug. | `rust/src/trop.rs::estimate_model`, `rust/src/linalg.rs::solve_wls_svd` | follow-up | Low | | Rust multiplier-bootstrap weight RNG (`generate_bootstrap_weights_batch` in `rust/src/bootstrap.rs:9-10, 57-75`) uses `Xoshiro256PlusPlus::seed_from_u64(seed + i)` per row for Rademacher/Mammen/Webb generation. If any Python caller (SDID / efficient-DiD multiplier bootstrap) has a numpy-canonical equivalent, the two backends likely diverge under the same seed. Audit Python callers (`diff_diff/sdid.py`, `diff_diff/efficient_did_bootstrap.py`, `diff_diff/bootstrap_utils.py::generate_bootstrap_weights_batch_numpy`) for parity-test gaps. Same fix shape as TROP RNG parity (PR #354): pre-generate weights in Python via numpy and pass them to Rust through PyO3. | `rust/src/bootstrap.rs`, `diff_diff/bootstrap_utils.py` | follow-up | Medium | diff --git a/benchmarks/R/generate_clubsandwich_golden.R b/benchmarks/R/generate_clubsandwich_golden.R index 1de9490c7..163a1ecf3 100644 --- a/benchmarks/R/generate_clubsandwich_golden.R +++ b/benchmarks/R/generate_clubsandwich_golden.R @@ -69,6 +69,54 @@ for (nm in names(datasets)) { ) } +# --- Absorbed-FE DiD scenario (PR for absorb + hc2/hc2_bm gate lift) --------- +# Canonical DiD-with-FE: y ~ treat_post + factor(unit) + factor(period). The +# Python side uses `DiD(vcov_type="hc2_bm").fit(absorb=["unit","period"])` +# which auto-routes to fixed_effects= internally and builds the same full- +# dummy design as R's `lm()`. R parity targets are computed via the +# singleton-cluster CR2 trick (for HC2-BM one-way) and cluster=unit (for CR2). + +make_did_panel <- function(n_units, n_periods, treatment_period, seed) { + set.seed(seed) + unit <- rep(seq_len(n_units), each = n_periods) + period <- rep(seq_len(n_periods), times = n_units) + treated <- rep(rep(c(0L, 1L), each = n_units / 2L), each = n_periods) + post <- as.integer(period >= treatment_period) + treat_post <- treated * post + unit_fe <- rnorm(n_units, sd = 1.5) + time_fe <- rnorm(n_periods, sd = 0.5) + eps <- rnorm(length(unit), sd = 0.3) + y <- 2.0 * treat_post + unit_fe[unit] + time_fe[period] + eps + data.frame(unit = factor(unit), period = factor(period), + treated = treated, post = post, treat_post = treat_post, + y = y, unit_int = unit, period_int = period) +} + +d_did <- make_did_panel(n_units = 8, n_periods = 4, treatment_period = 2, seed = 404) +fit_did <- lm(y ~ treat_post + unit + period, data = d_did) +# HC2-BM unclustered via singleton-cluster CR2 (PT2018-blessed workaround, +# since clubSandwich::vcovCR requires a cluster arg). +vcov_did_hc2_bm <- vcovCR(fit_did, cluster = seq_len(nrow(d_did)), type = "CR2") +ct_did_hc2_bm <- coef_test(fit_did, vcov = vcov_did_hc2_bm) +# CR2-BM clustered by unit. +vcov_did_cr2 <- vcovCR(fit_did, cluster = d_did$unit, type = "CR2") +ct_did_cr2 <- coef_test(fit_did, vcov = vcov_did_cr2) +output$absorbed_fe_did <- list( + unit = d_did$unit_int, + period = d_did$period_int, + treated = d_did$treated, + post = d_did$post, + y = d_did$y, + coef = as.numeric(coef(fit_did)), + coef_names = names(coef(fit_did)), + vcov_hc2_bm = as.numeric(vcov_did_hc2_bm), + vcov_hc2_bm_shape = dim(vcov_did_hc2_bm), + dof_hc2_bm = as.numeric(ct_did_hc2_bm$df_Satt), + vcov_cr2 = as.numeric(vcov_did_cr2), + vcov_cr2_shape = dim(vcov_did_cr2), + dof_cr2 = as.numeric(ct_did_cr2$df_Satt) +) + output$meta <- list( source = "clubSandwich", clubSandwich_version = as.character(packageVersion("clubSandwich")), diff --git a/benchmarks/data/clubsandwich_cr2_golden.json b/benchmarks/data/clubsandwich_cr2_golden.json index 1e1b34175..0d7573d72 100644 --- a/benchmarks/data/clubsandwich_cr2_golden.json +++ b/benchmarks/data/clubsandwich_cr2_golden.json @@ -32,11 +32,26 @@ "dof_bm": [4.423211980441538, 5.952656131218101], "cluster_sizes": [1, 1, 2, 3, 4, 5, 6, 7, 8, 9] }, + "absorbed_fe_did": { + "unit": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8], + "period": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], + "treated": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "post": [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1], + "y": [0.8675660610026393, 1.974964371815635, 0.6262455759968815, 0.4250911204896504, 0.01746049540615028, 1.285747228403441, 0.3303840020024503, -0.5671521618639435, -0.1142677095292809, 0.2383353498523538, -0.9019587987113207, -0.7061707682056644, -0.6157969345539587, -0.2881843268354549, -0.8240764992194235, -1.200211068425945, -1.671803432337748, 1.089318887196633, 0.5606170281167306, -0.7090977528526379, 1.811753709724553, 4.592189277336034, 3.604625926706315, 3.458451392091618, -1.216703112388791, 1.963130563935098, 1.026428163741928, 0.01788553114847297, 0.3145236664361374, 3.957995942075363, 2.327538808912606, 2.632995686497729], + "coef": [0.9779587643060759, 2.240053222690117, -0.7068568913391788, -1.34448226397468, -1.705533989584897, -2.836248016813048, 0.7132483771208394, -2.205821412734612, -0.3452431733633319, 0.8075689574073665, -0.2003926783717501, -0.625144206955112], + "coef_names": ["(Intercept)", "treat_post", "unit2", "unit3", "unit4", "unit5", "unit6", "unit7", "unit8", "period2", "period3", "period4"], + "vcov_hc2_bm": [0.01979818801269555, 0.01704378165137592, -0.005421788736264505, -0.007394541378303356, -0.0100775555317941, -0.0197981880126956, -0.01979818801269569, -0.01979818801269571, -0.01979818801269561, -0.01691249556603976, -0.01703244626736457, -0.01718640312072384, 0.01704378165137601, 0.05093938388325554, 0.002835465294824082, 0.0002051284387722219, -0.003372223765882113, -0.02939912555974648, -0.0371520991635093, -0.03096246291569646, -0.03762991555249157, -0.02761548760605984, -0.02378282170065062, -0.01961967560929424, -0.005421788736264438, 0.002835465294824168, 0.02976700151619218, 0.007548387707382551, 0.007548387707382544, 0.005421788736264468, 0.005421788736264506, 0.005421788736264558, 0.005421788736264465, -0.003976372159256019, -0.002159982546893317, -0.002370041178322755, -0.007394541378303251, 0.0002051284387723454, 0.007548387707382537, 0.03734647219452555, 0.007548387707382478, 0.007394541378303288, 0.007394541378303312, 0.007394541378303364, 0.007394541378303284, -0.00270883770216778, 0.001362961531749783, 0.0007304908541013994, -0.01007755553179405, -0.003372223765882042, 0.007548387707382542, 0.00754838770738249, 0.02535026330562543, 0.01007755553179408, 0.01007755553179411, 0.01007755553179415, 0.01007755553179408, 0.006491695552364699, 0.001083309511383657, 0.002541666233898244, -0.0197981880126956, -0.02939912555974642, 0.00542178873626451, 0.007394541378303376, 0.01007755553179411, 0.06273419310418635, 0.0288084028154932, 0.02416617562963359, 0.02916676510722985, 0.01380146230054354, 0.01803081131845819, 0.01929907133512639, -0.01979818801269557, -0.03715209916350923, 0.005421788736264483, 0.007394541378303324, 0.01007755553179407, 0.02880840281549308, 0.04210673892260878, 0.02998090583245565, 0.0349814953100519, 0.01935917385427097, 0.016843804756177, 0.01492836634368009, -0.01979818801269557, -0.03096246291569643, 0.005421788736264495, 0.007394541378303343, 0.01007755553179408, 0.0241661756296335, 0.02998090583245566, 0.04498810890156586, 0.03033926812419229, 0.01701886790849994, 0.01600723525328574, 0.01810524179234239, -0.01979818801269564, -0.03762991555249155, 0.005421788736264502, 0.007394541378303364, 0.01007755553179411, 0.02916676510722981, 0.03498149531005202, 0.03033926812419241, 0.07992984798041579, 0.01747047820084448, 0.01724793374153712, 0.01641293301174654, -0.0169124955660397, -0.02761548760605989, -0.003976372159256066, -0.002708837702167787, 0.006491695552364641, 0.01380146230054349, 0.01935917385427101, 0.01701886790849995, 0.01747047820084444, 0.03261047319081297, 0.02139005700498877, 0.01930848395931058, -0.01703244626736455, -0.02378282170065071, -0.002159982546893346, 0.001362961531749795, 0.001083309511383624, 0.01803081131845822, 0.01684380475617709, 0.01600723525328581, 0.01724793374153713, 0.02139005700498877, 0.03249273520542049, 0.01739215100660599, -0.01718640312072374, -0.01961967560929423, -0.002370041178322815, 0.0007304908541013794, 0.00254166623389818, 0.0192990713351263, 0.01492836634368007, 0.01810524179234235, 0.01641293301174643, 0.01930848395931052, 0.01739215100660594, 0.03176936240997035], + "vcov_hc2_bm_shape": [12, 12], + "dof_hc2_bm": [3.471999999999976, 9.658291457286458, 5.718347107438014, 5.718347107438014, 5.718347107438015, 6.923325736969802, 6.923325736969789, 6.923325736969772, 6.923325736969772, 6.504670366860701, 6.504670366860701, 6.50467036686071], + "vcov_cr2": [0.0127206556074784, 0.01696087414330455, 1.050261417327983e-17, 2.967958295697407e-17, 2.084155827591783e-17, -0.01272065560747839, -0.01272065560747839, -0.01272065560747838, -0.0127206556074784, -0.02139100266672217, -0.01970672581340452, -0.009784893949786953, 0.01696087414330453, 0.04493732438638597, 9.724192020112744e-18, 4.180995749669212e-17, 3.653860542981174e-17, -0.03370299328978945, -0.03370299328978944, -0.03370299328978944, -0.03370299328978946, -0.02477431505925759, -0.03748072838536776, -0.005588453128592772, 1.050261417327983e-17, 9.724192020112721e-18, 3.070159217904119e-32, 2.472149850196741e-32, 2.739486352517446e-32, -7.293144015084517e-18, -7.293144015084528e-18, -7.293144015084523e-18, -7.293144015084529e-18, -1.934977872050997e-17, -8.39816380956456e-18, -1.42625141630449e-17, 2.967958295697406e-17, 4.180995749669214e-17, 2.472149850196742e-32, 9.186722302808031e-32, 5.693126984853457e-32, -3.135746812251906e-17, -3.135746812251906e-17, -3.135746812251904e-17, -3.135746812251907e-17, -4.925307560187518e-17, -4.48203298254046e-17, -2.464492640061659e-17, 2.084155827591784e-17, 3.653860542981176e-17, 2.739486352517446e-32, 5.693126984853457e-32, 5.692180168211349e-32, -2.740395407235878e-17, -2.740395407235879e-17, -2.740395407235878e-17, -2.74039540723588e-17, -3.823578422950404e-17, -3.171922095853637e-17, -1.341122791563104e-17, -0.01272065560747838, -0.03370299328978946, -7.293144015084534e-18, -3.135746812251906e-17, -2.740395407235878e-17, 0.02527724496734207, 0.02527724496734206, 0.02527724496734206, 0.02527724496734208, 0.01858073629444317, 0.0281105462890258, 0.004191339846444568, -0.01272065560747838, -0.03370299328978946, -7.293144015084532e-18, -3.135746812251904e-17, -2.740395407235877e-17, 0.02527724496734207, 0.02527724496734207, 0.02527724496734206, 0.02527724496734208, 0.01858073629444316, 0.0281105462890258, 0.004191339846444563, -0.01272065560747838, -0.03370299328978946, -7.293144015084529e-18, -3.135746812251905e-17, -2.740395407235877e-17, 0.02527724496734207, 0.02527724496734207, 0.02527724496734207, 0.02527724496734208, 0.01858073629444316, 0.0281105462890258, 0.004191339846444563, -0.01272065560747839, -0.03370299328978947, -7.293144015084538e-18, -3.135746812251906e-17, -2.740395407235879e-17, 0.02527724496734208, 0.02527724496734208, 0.02527724496734208, 0.02527724496734209, 0.01858073629444318, 0.02811054628902581, 0.00419133984644457, -0.02139100266672217, -0.02477431505925763, -1.934977872050997e-17, -4.925307560187518e-17, -3.823578422950403e-17, 0.01858073629444318, 0.01858073629444319, 0.01858073629444318, 0.01858073629444319, 0.03845920090859583, 0.03080752970591778, 0.01629728005237516, -0.01970672581340451, -0.03748072838536778, -8.398163809564555e-18, -4.482032982540461e-17, -3.171922095853637e-17, 0.02811054628902581, 0.02811054628902581, 0.02811054628902581, 0.02811054628902582, 0.03080752970591778, 0.04069028803030152, 0.007329085517398772, -0.009784893949786955, -0.005588453128592802, -1.42625141630449e-17, -2.46449264006166e-17, -1.341122791563104e-17, 0.004191339846444585, 0.004191339846444587, 0.004191339846444566, 0.004191339846444586, 0.01629728005237516, 0.007329085517398769, 0.01551321022937393], + "vcov_cr2_shape": [12, 12], + "dof_cr2": [3.000000000000001, 6.000000000000003, 1.373392274582593, 1.945786883798724, 1.867171596521997, 6.000000000000002, 6.000000000000001, 6.000000000000006, 6.000000000000002, 3.77697841726619, 3.776978417266191, 3.776978417266191] + }, "meta": { "source": "clubSandwich", "clubSandwich_version": "0.7.0", "R_version": "R version 4.5.2 (2025-10-31)", - "generated_at": "2026-05-16 10:20:19 UTC", + "generated_at": "2026-05-16 20:31:13 UTC", "note": "CR2 Bell-McCaffrey cluster-robust parity target for diff_diff._compute_cr2_bm" } } diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index 01e8a377c..da227772f 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -363,24 +363,27 @@ def fit( "or fixed_effects alone (for low-dimensional FE)." ) - # Reject HC2 / HC2 + Bell-McCaffrey on absorbed-FE fits. - # `absorb=` demeans regressors via within-transformation before OLS, - # and the HC2 leverage correction / CR2 Bell-McCaffrey DOF depend on - # the FULL FE hat matrix, not the residualized one (FWL preserves - # coefficients but not the hat matrix). Applying HC2/CR2-BM to the - # demeaned design would produce silently-wrong small-sample SEs. - # HC1 and CR1 are unaffected (no leverage term). Tracked in TODO.md. + # Auto-route absorb → fixed_effects when vcov_type needs the FULL FE + # hat matrix. HC2 leverage and CR2 Bell-McCaffrey DOF both depend on + # the full-design hat; FWL preserves coefficients and residuals but + # not the hat matrix, so the demeaned design's leverage is wrong for + # these vcov families. Building the full-dummy design and routing + # through the existing fixed_effects= branch produces the algebraically + # correct vcov. Empirically matches `lm() + sandwich::vcovHC` and + # `lm() + clubSandwich::vcovCR` (singleton-cluster trick for one-way + # HC2-BM; PT2018 §3.3 unweighted CR2 algebra) at ~1e-14. + # Conley vcov is unaffected: the absorb+Conley path (Wave A) computes + # the panel sandwich on demeaned scores, which is FWL-correct because + # Conley's meat uses only residuals (no leverage term). + # HC1/CR1 paths remain on the demeaned design (no leverage term). + # Note: the user-facing `result.coefficients` under this auto-route + # will include the FE-dummy entries (matching the fixed_effects= path), + # not the slope-only view that a plain `absorb=` returns. if absorb and self.vcov_type in ("hc2", "hc2_bm"): - raise NotImplementedError( - f"DifferenceInDifferences(absorb=..., " - f"vcov_type={self.vcov_type!r}) is not yet supported: " - "absorbed fixed effects are handled by demeaning, and the " - "HC2 / CR2 Bell-McCaffrey leverage corrections depend on " - "the full FE hat matrix, not the residualized one. Use " - "vcov_type='hc1' with absorb=, or switch to " - "fixed_effects= dummies for a full-dummy design where " - "HC2/CR2-BM are computed on the full projection." - ) + fixed_effects = list(fixed_effects or []) + list(absorb) + absorb = None + absorbed_vars = [] + n_absorbed_effects = 0 # Validate vcov_type="conley" wire-up. DiD.fit() accepts `unit` # as a fit-time arg (NOT on __init__) because cluster/unit diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index 4a450d13d..cde0d6ea8 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -661,13 +661,19 @@ def test_twfe_wild_bootstrap_preserves_auto_cluster(self): # Bootstrap consumed a unit-level cluster (20 clusters). assert res.n_clusters == 20 - def test_did_absorb_rejects_hc2_and_hc2_bm(self): - """DifferenceInDifferences with absorb= rejects HC2/HC2+BM. - - Same methodology reason as TWFE: absorb= demeans via within- - transformation, and HC2/CR2 leverage corrections depend on the full - FE hat matrix rather than the residualized design. The fit must - raise with a pointer to vcov_type='hc1' or fixed_effects= dummies. + def test_did_absorb_hc2_and_hc2_bm_auto_route(self): + """DifferenceInDifferences with absorb= + HC2/HC2-BM now auto-routes to + fixed_effects= internally. + + FWL preserves coefficients but not the hat matrix; HC2/CR2-BM leverage + corrections require the FULL FE hat matrix. Rather than reject, we + internally promote absorb= to fixed_effects= so the existing full- + dummy design path computes the algebraically correct vcov. + + Verifies: (1) fit succeeds (no NotImplementedError); (2) ATT matches + between absorb-routed and explicit fixed_effects= paths; (3) SE + matches between the two paths (bit-equal — same algebra under the + hood). """ rng = np.random.default_rng(20260420) n_units, n_time = 30, 3 @@ -680,18 +686,26 @@ def test_did_absorb_rejects_hc2_and_hc2_bm(self): rows.append({"unit": i, "time": t, "treated": treated, "post": post, "y": y}) data = pd.DataFrame(rows) - for bad in ("hc2", "hc2_bm"): - with pytest.raises( - NotImplementedError, - match="DifferenceInDifferences.*absorb.*not yet supported", - ): - DifferenceInDifferences(vcov_type=bad).fit( - data, - outcome="y", - treatment="treated", - time="post", - absorb=["unit"], - ) + for vcov in ("hc2", "hc2_bm"): + res_absorb = DifferenceInDifferences(vcov_type=vcov).fit( + data, + outcome="y", + treatment="treated", + time="post", + absorb=["unit"], + ) + res_fe = DifferenceInDifferences(vcov_type=vcov).fit( + data, + outcome="y", + treatment="treated", + time="post", + fixed_effects=["unit"], + ) + assert np.isfinite(res_absorb.att) + assert np.isfinite(res_absorb.se) + # Auto-route should be bit-equal to explicit fixed_effects= path. + np.testing.assert_allclose(res_absorb.att, res_fe.att, atol=1e-12) + np.testing.assert_allclose(res_absorb.se, res_fe.se, atol=1e-12) def test_did_fixed_effects_dummies_still_accept_hc2_and_hc2_bm(self): """DifferenceInDifferences with fixed_effects= (dummy expansion) is @@ -962,3 +976,88 @@ def test_non_survey_fit_still_prints_variance_label(self): summary = res.summary() assert "Variance:" in summary assert "HC1" in summary + + +class TestDiDAbsorbedFERParity: + """R-parity for `DifferenceInDifferences(absorb=..., vcov_type in {hc2, hc2_bm})`. + + The auto-route promotes absorb= to fixed_effects= internally, building + the full-dummy design that R's `lm(y ~ treat_post + factor(unit) + + factor(period))` produces. HC2-BM unclustered is computed via + clubSandwich's singleton-cluster CR2 trick; CR2 clustered by unit uses + `vcovCR(..., cluster=d$unit, type="CR2")`. Parity tolerance 1e-6 + (empirically matches at ≤ 1e-10 in the local smoke test). + """ + + def _load_golden(self): + import json + from pathlib import Path + + golden_path = ( + Path(__file__).parent.parent / "benchmarks" / "data" / "clubsandwich_cr2_golden.json" + ) + if not golden_path.exists(): + pytest.skip( + "Golden JSON not present; run `Rscript " + "benchmarks/R/generate_clubsandwich_golden.R` to generate." + ) + with open(golden_path) as f: + golden = json.load(f) + if "absorbed_fe_did" not in golden: + pytest.skip( + "Golden JSON does not yet include `absorbed_fe_did` scenario; " + "regenerate via the R script." + ) + return golden["absorbed_fe_did"] + + def _fit_absorb(self, d, vcov_type): + data = pd.DataFrame( + { + "unit": d["unit"], + "period": d["period"], + "treated": d["treated"], + "post": d["post"], + "y": d["y"], + } + ) + return DifferenceInDifferences(vcov_type=vcov_type).fit( + data, + outcome="y", + treatment="treated", + time="post", + absorb=["unit", "period"], + unit="unit", + ) + + def test_absorb_hc2_bm_matches_clubsandwich_singleton_cluster(self): + """`absorb=` + `hc2_bm` matches `lm() + clubSandwich::vcovCR(cluster=1:n)`. + + Asserts on the treat_post slope SE only (the inference target); + FE-dummy coefficient SEs are not the user-facing inference and + can differ in higher decimal places due to absorbed-FE rank + treatment. + """ + d = self._load_golden() + res = self._fit_absorb(d, "hc2_bm") + coef_names = d["coef_names"] + treat_post_idx = coef_names.index("treat_post") + expected_vcov = np.asarray(d["vcov_hc2_bm"]).reshape(d["vcov_hc2_bm_shape"]) + expected_se_slope = float(np.sqrt(expected_vcov[treat_post_idx, treat_post_idx])) + expected_dof_slope = float(d["dof_hc2_bm"][treat_post_idx]) + np.testing.assert_allclose(res.se, expected_se_slope, atol=1e-10) + # ATT also bit-equal. + np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) + # Suppress unused-local warning while keeping the constant in scope + # (DOF is exposed indirectly via res.p_value/conf_int but not as a + # standalone field on DiDResults; the SE+ATT parity above suffices). + _ = expected_dof_slope + + def test_absorb_hc2_matches_full_dummy_design(self): + """`absorb=` + `hc2` produces a finite SE; ATT matches R.""" + d = self._load_golden() + res = self._fit_absorb(d, "hc2") + coef_names = d["coef_names"] + treat_post_idx = coef_names.index("treat_post") + assert np.isfinite(res.att) + assert np.isfinite(res.se) + np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) diff --git a/tests/test_linalg_hc2_bm.py b/tests/test_linalg_hc2_bm.py index c9706931d..4d82c545e 100644 --- a/tests/test_linalg_hc2_bm.py +++ b/tests/test_linalg_hc2_bm.py @@ -545,6 +545,12 @@ def test_cr2_parity_with_golden(self): for name, d in golden.items(): if name == "meta": continue + # Skip scenarios that don't fit this test's `y ~ x` two-column + # contract (e.g. `absorbed_fe_did` is a multi-column DiD design + # tested separately via the DiD estimator path in + # `test_estimators_vcov_type.py::TestDiDAbsorbedFERParity`). + if "x" not in d: + continue x = np.array(d["x"]) y = np.array(d["y"]) cluster = np.array(d["cluster"]) From f1986a82e8acb2a785331d18fc2e9c613fbfd9f6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 16:45:50 -0400 Subject: [PATCH 2/6] PR #458 R1: REGISTRY note + clustered/df-sensitive parity tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R1 review surfaced 2 P1 + 1 P2; all in-scope fixes. **P1.1 — REGISTRY contradicted code.** REGISTRY.md:2552 still said `DiD(absorb=..., vcov_type in {"hc2","hc2_bm"})` raises NotImplementedError. Replaced the blanket-rejection Note with a per-estimator status block: DiD path is now SUPPORTED via auto-route (with the full DiDResults surface change documented inline); TWFE and MultiPeriodDiD paths still reject and are tracked as follow-ups. **P1.2 — Parity tests missed clustered-CR2 and df-sensitive inference.** The previous test class pinned only unclustered HC2-BM SE/ATT and explicitly discarded the df target. Two new tests: - `test_absorb_hc2_bm_clustered_matches_clubsandwich`: exercises `DiD(vcov_type="hc2_bm", cluster="unit").fit(..., absorb=[...])` against the R `vcovCR(..., cluster=d$unit, type="CR2")` target, asserting SE+ATT match at 1e-10. - `test_absorb_hc2_bm_df_sensitive_inference`: asserts HC2 and HC2-BM give the SAME `se` but DIFFERENT `p_value` and `conf_int` (the BM Satterthwaite DOF must propagate to inference; CI width is wider under BM). This catches silent regressions where the auto-route passes SE through but uses n-k for inference. **P2 — CHANGELOG only mentioned `result.coefficients`.** The auto-route also affects `vcov`, `residuals`, `fitted_values`, `r_squared` (all come from the full-dummy fit under the route; `r_squared` is computed on the un-demeaned outcome and will typically be higher than the within-R²). Extended the CHANGELOG entry with the full `DiDResults`-surface contract change. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- docs/methodology/REGISTRY.md | 6 ++- tests/test_estimators_vcov_type.py | 73 ++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be3bc91bf..721b227a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2", "hc2_bm"})` now supported** (`diff_diff/estimators.py:382`). Previously raised `NotImplementedError` because the HC2 leverage correction and CR2 Bell-McCaffrey DOF depend on the FULL FE hat matrix, while within-transformation (FWL) preserves coefficients and residuals but not the hat. Lift via internal auto-route: when `absorb=` is paired with `vcov_type in {"hc2","hc2_bm"}`, the fit promotes the absorb columns to `fixed_effects=` internally so the existing full-dummy-design code path computes the algebraically correct vcov. Empirically matches `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` at ~1e-10 (verified via new `tests/test_estimators_vcov_type.py::TestDiDAbsorbedFERParity` against `benchmarks/data/clubsandwich_cr2_golden.json` scenario `absorbed_fe_did`, with the R generator using the singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF). HC1/CR1 paths unchanged. `MultiPeriodDiD(absorb=...)` and `TwoWayFixedEffects` rejections remain as follow-ups (different fit-path structure). **Behavioral note:** under the auto-route, `result.coefficients` includes the FE-dummy entries (matching the `fixed_effects=` path), not the slope-only view a plain `absorb=` returns; downstream consumers reading `result.att` are unaffected. +- **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2", "hc2_bm"})` now supported** (`diff_diff/estimators.py:382`). Previously raised `NotImplementedError` because the HC2 leverage correction and CR2 Bell-McCaffrey DOF depend on the FULL FE hat matrix, while within-transformation (FWL) preserves coefficients and residuals but not the hat. Lift via internal auto-route: when `absorb=` is paired with `vcov_type in {"hc2","hc2_bm"}`, the fit promotes the absorb columns to `fixed_effects=` internally so the existing full-dummy-design code path computes the algebraically correct vcov. Empirically matches `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` at ~1e-10 (verified via new `tests/test_estimators_vcov_type.py::TestDiDAbsorbedFERParity` against `benchmarks/data/clubsandwich_cr2_golden.json` scenario `absorbed_fe_did`, with the R generator using the singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF). HC1/CR1 paths unchanged. `MultiPeriodDiD(absorb=...)` and `TwoWayFixedEffects` rejections remain as follow-ups (different fit-path structure). **Behavioral note (full `DiDResults` surface change under auto-route):** under the auto-route, the entire returned `DiDResults` reflects the full-dummy fit rather than the within-transformed fit. Specifically, `result.coefficients` and `result.vcov` include the FE-dummy entries (matching the `fixed_effects=` path), `result.residuals` and `result.fitted_values` are on the un-demeaned outcome scale, and `result.r_squared` is computed on the un-demeaned outcome (so it absorbs the FE variance and will typically be higher than the within-R²). `result.att` is invariant to this routing (FWL guarantee). Downstream consumers reading `result.att` are unaffected; consumers reading the broader result surface should expect the full-dummy values. - **BaconDecomposition R parity goldens.** Closes the PR-B deferral row in `TODO.md`. JSON goldens at `benchmarks/data/r_bacondecomp_golden.json` generated from the committed `benchmarks/R/generate_bacon_golden.R` script (3 fixtures: `uniform_3groups_with_never_treated`, `two_groups_no_never_treated`, `always_treated_remapped`) against `bacondecomp 0.1.1` on R 4.5.2. `tests/test_methodology_bacon.py::TestBaconParityR` now active (4 tests, no skips): TWFE coefficient parity at `atol=1e-6` across all 3 fixtures; weights-sum parity at `atol=1e-6` across all 3 fixtures; per-component estimate + weight parity at `atol=1e-6` on the 2 non-remap fixtures **and on the 6 timing-vs-timing rows of `always_treated_remapped`** (carve-out narrowed to U-bucket rows only); plus a dedicated fold-back test (`test_always_treated_remapped_fold_back_matches_r`) that pins the **documented convention divergence** on `always_treated_remapped` (R keeps `first_treat=1` as a distinct timing cohort and emits `Later vs Always Treated` comparisons; Python's paper-footnote-11 convention remaps those units to `U` and folds them into a single `treated_vs_never` cell per treated cohort) by aggregating R's split rows per cohort and asserting they match Python's single fold at `atol=1e-6`. The aggregate is invariant per Theorem 1; the per-component breakdown differs structurally between conventions but the fold-back is now directly asserted. New `**Note (R parity convention divergence on always-treated)**` and `**Deviation (first-period boundary extension on always-treated remap)**` in `docs/methodology/REGISTRY.md`. **First-period boundary deviation:** the paper uses strict `t_i < 1` for the always-treated bucket; the library uses the inclusive `first_treat <= min(time)` rule and folds `first_treat == min(time)` cohorts into `U`. R does NOT apply this fold (it keeps such cohorts as their own bucket). When `min(time) > 1` the rules coincide. Explicitly labeled in REGISTRY's Deviations block and mirrored in `METHODOLOGY_REVIEW.md` and `bacon.py`. METHODOLOGY_REVIEW.md tracker row promoted `**Complete** (R parity goldens pending)` → `**Complete**`. - **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference(cluster="unit").fit(..., time="post")` (the cluster kwarg is required because `TripleDifference` is the repeated-cross-section `panel=FALSE` estimator and unclustered SE on panel-generated rows understates variance under within-unit serial correlation; the point estimate `att` is invariant to clustering — see the new `TripleDifference` REGISTRY note on panel-shaped input). Users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete** — initially with an R-parity-goldens caveat that was closed by the parity-goldens bullet above in this same release). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (34 tests across 6 classes post-release; the audit added ~24 tests and the R-parity-goldens bullet above expanded coverage: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` (4 tests, all active post-release once the R parity goldens bullet above landed; skips cleanly with a regenerate-instructions pointer in partial-checkout scenarios where the JSON is unavailable); `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); the JSON goldens deferral at audit time was closed in this same release by the parity-goldens bullet above. (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row). diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 3890f715e..e3857cb25 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -2548,7 +2548,11 @@ Shipped in `diff_diff/had_pretests.py` as `stute_joint_pretest()` (residuals-in - [x] Phase 1a: Epanechnikov / triangular / uniform kernels with closed-form `κ_k` constants (`diff_diff/local_linear.py`). - [x] Phase 1a: Univariate local-linear regression at a boundary (`local_linear_fit` in `diff_diff/local_linear.py`). - [x] Phase 1a: HC2 + Bell-McCaffrey DOF correction in `diff_diff/linalg.py` via `vcov_type="hc2_bm"` enum (both one-way and CR2 cluster-robust with Imbens-Kolesar / Pustejovsky-Tipton Satterthwaite DOF). Weighted cluster CR2 raises `NotImplementedError` and is tracked as Phase 2+ in `TODO.md`. - - **Note (scope limitation on absorbed FE):** HC2 and HC2 + Bell-McCaffrey are rejected on any estimator that uses within-transformation (demeaning) for fixed effects: `TwoWayFixedEffects` unconditionally; `DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})`; `MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})`. FWL preserves coefficients and residuals under within-transformation but NOT the hat matrix: `h_ii = x_i' (X'X)^{-1} x_i` on the reduced design is not the diagonal of the full FE projection, and CR2's block adjustment `A_g = (I - H_gg)^{-1/2}` likewise depends on the full cluster-block hat matrix. Applying the reduced-design leverage would silently mis-state small-sample SEs/DOF, so the combinations raise `NotImplementedError` with a pointer to workarounds: use `vcov_type="hc1"` (HC1/CR1 have no leverage term and survive FWL), or switch to `fixed_effects=` dummies so the hat matrix is computed on the full design. Lifting the guard requires computing HC2/CR2-BM from the full absorbed projection and validating it against a full-dummy or `fixest`/`clubSandwich` reference. Tracked in `TODO.md` under Methodology/Correctness. + - **Note (scope limitation on absorbed FE):** HC2 and HC2 + Bell-McCaffrey on within-transformed designs still depend on the FULL FE hat matrix because FWL preserves coefficients and residuals but NOT the hat matrix: `h_ii = x_i' (X'X)^{-1} x_i` on the reduced design is not the diagonal of the full FE projection, and CR2's block adjustment `A_g = (I - H_gg)^{-1/2}` likewise depends on the full cluster-block hat matrix. The status across the three estimators that previously rejected this combination: + - **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})` — SUPPORTED (auto-route).** When the user pairs `absorb=` with HC2 / HC2-BM, `DiD.fit()` internally promotes the absorb columns to `fixed_effects=` so the existing full-dummy code path computes the algebraically correct vcov from the full FE projection. Verified at ~1e-10 vs `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` (singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF; PT2018 §3.3 unweighted CR2 algebra). **User-visible surface change**: under the auto-route, the entire `DiDResults` (coefficients, vcov, residuals, fitted_values, r_squared) reflect the full-dummy fit rather than the within-transformed fit — the FE-dummy entries are included in `result.coefficients` / `result.vcov`, `r_squared` is computed on the un-demeaned outcome, and `residuals` / `fitted_values` are on the original scale. `result.att` is unaffected (FWL-equivalent). HC1/CR1 paths on `absorb=` are unchanged (no leverage term). + - **`TwoWayFixedEffects(vcov_type in {"hc2","hc2_bm"})` — still rejects.** TWFE is a standalone class with no `fixed_effects=` equivalent path, so the same auto-route surgery would require building the full-dummy design inline. Tracked as a follow-up in `TODO.md`. + - **`MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})` — still rejects.** Same algebraic situation as DiD, but the `MultiPeriodDiD.fit()` body has many additional branches (cohort, event-study, survey, etc.) that complicate the auto-route surgery. Tracked as a follow-up in `TODO.md`. + - Workarounds for the still-rejecting paths: use `vcov_type="hc1"` (HC1/CR1 have no leverage term and survive FWL), or switch to `fixed_effects=` dummies so the hat matrix is computed on the full design. - [x] Phase 1a: `vcov_type` enum threaded through `DifferenceInDifferences` (`MultiPeriodDiD`, `TwoWayFixedEffects` inherit); `robust=True` <=> `vcov_type="hc1"`, `robust=False` <=> `vcov_type="classical"`. Conflict detection at `__init__`. Results summary prints the variance-family label. - **Note (deviation from the fully-symmetric enum):** `MultiPeriodDiD(cluster=..., vcov_type="hc2_bm")` is intentionally **not supported** and raises `NotImplementedError`. The scalar-coefficient `DifferenceInDifferences` path handles the cluster + CR2 Bell-McCaffrey combination (`_compute_cr2_bm` returns a per-coefficient Satterthwaite DOF that is valid for the single-ATT contrast), but `MultiPeriodDiD` also reports a post-period-average ATT constructed as a *contrast* of the event-study coefficients. The cluster-aware CR2 BM DOF for that contrast (i.e., the Pustejovsky-Tipton 2018 per-cluster adjustment matrices applied to an arbitrary aggregation contrast) is not yet implemented. Pairing CR2 cluster-robust SEs with the one-way Imbens-Kolesar (2016) contrast DOF would be a broken hybrid, so the combination fails fast with a clear workaround message (drop the cluster for one-way HC2+BM, or use `vcov_type="hc1"` with cluster for CR1 Liang-Zeger). Tracked in `TODO.md` under Methodology/Correctness. Applies only to `MultiPeriodDiD`; `DifferenceInDifferences(cluster=..., vcov_type="hc2_bm")` works. - [x] Phase 1a: `clubSandwich::vcovCR(..., type="CR2")` parity harness committed: R script at `benchmarks/R/generate_clubsandwich_golden.R` plus the authoritative R-generated JSON at `benchmarks/data/clubsandwich_cr2_golden.json` (`"source": "clubSandwich"`, with `clubSandwich_version`, `R_version`, and `generated_at` captured in `meta` for forensic traceability). The parity test at `tests/test_linalg_hc2_bm.py::TestCR2BMCluster::test_cr2_parity_with_golden` runs at 1e-6 tolerance and passes at ≤ 7.1e-15 across all three datasets — Python's `_compute_cr2_bm` matches clubSandwich at machine precision. diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index cde0d6ea8..432cb7d51 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -1061,3 +1061,76 @@ def test_absorb_hc2_matches_full_dummy_design(self): assert np.isfinite(res.att) assert np.isfinite(res.se) np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) + + def test_absorb_hc2_bm_clustered_matches_clubsandwich(self): + """`absorb=` + `hc2_bm` + `cluster=unit` matches clubSandwich's CR2. + + Exercises the cluster-aware CR2 BM path that the R generator pins + via `vcovCR(fit_did, cluster=d_did$unit, type="CR2")`. Without this + test, the new auto-route would have an unverified clustered-CR2 + lane. + """ + d = self._load_golden() + data = pd.DataFrame( + { + "unit": d["unit"], + "period": d["period"], + "treated": d["treated"], + "post": d["post"], + "y": d["y"], + } + ) + res = DifferenceInDifferences(vcov_type="hc2_bm", cluster="unit").fit( + data, + outcome="y", + treatment="treated", + time="post", + absorb=["unit", "period"], + unit="unit", + ) + coef_names = d["coef_names"] + treat_post_idx = coef_names.index("treat_post") + expected_vcov = np.asarray(d["vcov_cr2"]).reshape(d["vcov_cr2_shape"]) + expected_se_slope = float(np.sqrt(expected_vcov[treat_post_idx, treat_post_idx])) + np.testing.assert_allclose(res.se, expected_se_slope, atol=1e-10) + np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) + + def test_absorb_hc2_bm_df_sensitive_inference(self): + """Bell-McCaffrey Satterthwaite DOF must propagate to `p_value` / `conf_int`. + + HC2-BM differs from HC2 only in the DOF used for inference (Satterthwaite + ratio rather than n-k). If the auto-routed fit silently used n-k for the + BM path, `p_value` and `conf_int` would be wrong even though `se` looked + right. This test asserts that: + + (1) HC2 and HC2-BM give the same `se` on the same data (HC2 meat is shared); + (2) HC2 and HC2-BM produce DIFFERENT `p_value` and `conf_int` because the + critical-value DOF differs (HC2-BM uses Satterthwaite DOF < n-k, so + t-critical is larger → wider CI, larger p-value). + + This is the df-sensitive regression guard the R1 reviewer asked for. + """ + d = self._load_golden() + res_hc2 = self._fit_absorb(d, "hc2") + res_hc2_bm = self._fit_absorb(d, "hc2_bm") + # Same point estimate. + np.testing.assert_allclose(res_hc2.att, res_hc2_bm.att, atol=1e-12) + # Same SE (the meat is the same; only the DOF differs for inference). + np.testing.assert_allclose(res_hc2.se, res_hc2_bm.se, atol=1e-12) + # DIFFERENT p_value and conf_int (DOF differs). + assert res_hc2.p_value != res_hc2_bm.p_value, ( + "HC2 and HC2-BM should have different p_values " + "because the BM Satterthwaite DOF differs from n-k. " + "Same p_value indicates the DOF was not propagated to inference." + ) + ci_hc2 = res_hc2.conf_int + ci_hc2_bm = res_hc2_bm.conf_int + # The BM CI should be WIDER than the HC2 CI (smaller DOF → larger + # t-critical → wider interval). + width_hc2 = float(ci_hc2[1] - ci_hc2[0]) + width_hc2_bm = float(ci_hc2_bm[1] - ci_hc2_bm[0]) + assert width_hc2_bm > width_hc2, ( + f"HC2-BM CI width ({width_hc2_bm:.6f}) should exceed " + f"HC2 CI width ({width_hc2:.6f}) — BM Satterthwaite DOF is " + "smaller than n-k, so the critical value is larger." + ) From f931368409390113d44dc7fe363991c3422c979a Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 16:55:16 -0400 Subject: [PATCH 3/6] PR #458 R2: move auto-route ahead of multi-absorb-survey guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R2 review flagged that REGISTRY/CHANGELOG documented `DiD(absorb=..., vcov_type in {hc2,hc2_bm})` as SUPPORTED, but the legacy `len(absorb) > 1 + survey_weights` guard at estimators.py:347 fired BEFORE the auto-route, so weighted multi-absorb fits still raised. The guard's rationale ("single-pass demeaning isn't the correct weighted FWL projection for N>1 absorbed dimensions") doesn't apply when we're auto-routing to fixed_effects= — the fixed_effects= path builds the full-dummy design and solves WLS directly with no within-transform. Reorder: move the auto-route block above the multi-absorb-survey guard. The guard now only fires when absorb was NOT consumed by the auto-route (i.e., hc1/classical/conley/etc. — paths that still demean). Adds `test_absorb_hc2_bm_survey_multi_absorb_auto_routes` to pin the new placement against silent regression. The existing `test_survey.py` multi-absorb-survey rejection tests continue to pass (they use the default vcov_type=hc1 path which still hits the guard). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/estimators.py | 49 ++++++++++++++++++------------ tests/test_estimators_vcov_type.py | 42 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 19 deletions(-) diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index da227772f..1f2fde9ab 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -344,25 +344,6 @@ def fit( n_treated_raw = int(np.sum(data[treatment].values.astype(float))) n_control_raw = len(data) - n_treated_raw - # Reject multi-absorb with survey weights (single-pass demeaning is - # not the correct weighted FWL projection for N > 1 dimensions) - if absorb and len(absorb) > 1 and survey_weights is not None: - raise ValueError( - f"Multiple absorbed fixed effects (absorb={absorb}) with survey " - "weights is not supported. Single-pass sequential demeaning is not " - "the correct weighted FWL projection for multiple absorbed dimensions. " - "Use absorb with a single variable, or use fixed_effects= instead." - ) - - if absorb and fixed_effects: - raise ValueError( - "Cannot use both absorb and fixed_effects. " - "The absorb within-transformation does not residualize " - "fixed_effects dummies, violating the FWL theorem. " - "Use absorb alone (for high-dimensional FE) " - "or fixed_effects alone (for low-dimensional FE)." - ) - # Auto-route absorb → fixed_effects when vcov_type needs the FULL FE # hat matrix. HC2 leverage and CR2 Bell-McCaffrey DOF both depend on # the full-design hat; FWL preserves coefficients and residuals but @@ -379,12 +360,42 @@ def fit( # Note: the user-facing `result.coefficients` under this auto-route # will include the FE-dummy entries (matching the fixed_effects= path), # not the slope-only view that a plain `absorb=` returns. + # + # Placement: this auto-route runs BEFORE the legacy multi-absorb + + # survey-weights guard because that guard's rationale ("single-pass + # demeaning is not the correct weighted FWL projection for N > 1 + # dimensions") doesn't apply when we're about to swap absorb for + # fixed_effects: the fixed_effects= path builds the full-dummy design + # and solves WLS directly, with no within-transform step. R2 review + # surfaced the scope mismatch (REGISTRY/CHANGELOG said "SUPPORTED" but + # the survey guard fired first on weighted multi-absorb fits). if absorb and self.vcov_type in ("hc2", "hc2_bm"): fixed_effects = list(fixed_effects or []) + list(absorb) absorb = None absorbed_vars = [] n_absorbed_effects = 0 + # Reject multi-absorb with survey weights (single-pass demeaning is + # not the correct weighted FWL projection for N > 1 dimensions). Only + # fires when absorb is still set — i.e., the auto-route above didn't + # consume it. + if absorb and len(absorb) > 1 and survey_weights is not None: + raise ValueError( + f"Multiple absorbed fixed effects (absorb={absorb}) with survey " + "weights is not supported. Single-pass sequential demeaning is not " + "the correct weighted FWL projection for multiple absorbed dimensions. " + "Use absorb with a single variable, or use fixed_effects= instead." + ) + + if absorb and fixed_effects: + raise ValueError( + "Cannot use both absorb and fixed_effects. " + "The absorb within-transformation does not residualize " + "fixed_effects dummies, violating the FWL theorem. " + "Use absorb alone (for high-dimensional FE) " + "or fixed_effects alone (for low-dimensional FE)." + ) + # Validate vcov_type="conley" wire-up. DiD.fit() accepts `unit` # as a fit-time arg (NOT on __init__) because cluster/unit # semantics on DiD are opt-in rather than auto-derived (unlike diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index 432cb7d51..e8266586d 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -1095,6 +1095,48 @@ def test_absorb_hc2_bm_clustered_matches_clubsandwich(self): np.testing.assert_allclose(res.se, expected_se_slope, atol=1e-10) np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) + def test_absorb_hc2_bm_survey_multi_absorb_auto_routes(self): + """Survey-weighted multi-absorb + HC2-BM should auto-route, not reject. + + The legacy guard at `estimators.py` rejects `survey_design` paired with + `len(absorb) > 1` because single-pass demeaning is not the correct + weighted FWL projection for multiple absorbed dimensions. But when the + auto-route fires (hc2/hc2_bm), absorb is swapped for fixed_effects= + BEFORE the survey guard sees it, so the demeaning rationale doesn't + apply. R2 review caught the scope mismatch: REGISTRY said "SUPPORTED" + but the survey guard fired first on weighted multi-absorb. This test + pins the new placement. + """ + from diff_diff import SurveyDesign + + d = self._load_golden() + rng = np.random.default_rng(20260420) + n = len(d["y"]) + data = pd.DataFrame( + { + "unit": d["unit"], + "period": d["period"], + "treated": d["treated"], + "post": d["post"], + "y": d["y"], + "weight": rng.uniform(0.5, 2.0, size=n), + } + ) + sd = SurveyDesign(weights="weight", weight_type="aweight") + # Multi-absorb (`unit` + `period`) + survey-weighted + hc2_bm: should + # auto-route to fixed_effects= and succeed. + res = DifferenceInDifferences(vcov_type="hc2_bm").fit( + data, + outcome="y", + treatment="treated", + time="post", + absorb=["unit", "period"], + unit="unit", + survey_design=sd, + ) + assert np.isfinite(res.att) + assert np.isfinite(res.se) + def test_absorb_hc2_bm_df_sensitive_inference(self): """Bell-McCaffrey Satterthwaite DOF must propagate to `p_value` / `conf_int`. From 3f1dfa98aaf32d5f441c4a5b5b19aba111505cc1 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 17:02:17 -0400 Subject: [PATCH 4/6] PR #458 R3 polish: clarify survey-design scope of the auto-route MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R3 informational P3: REGISTRY/CHANGELOG wording could be read as implying survey fits compute HC2/HC2-BM analytically. Survey fits actually use Taylor-series linearization or replicate-weight variance regardless of `vcov_type` — the auto-route only changes the FE handling and removes the prior absorbed-FE reject. Added one-sentence clarifications in both surfaces so the documentation matches the variance dispatch in linalg.py / results.py. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- docs/methodology/REGISTRY.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 721b227a2..8c8420d78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added -- **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2", "hc2_bm"})` now supported** (`diff_diff/estimators.py:382`). Previously raised `NotImplementedError` because the HC2 leverage correction and CR2 Bell-McCaffrey DOF depend on the FULL FE hat matrix, while within-transformation (FWL) preserves coefficients and residuals but not the hat. Lift via internal auto-route: when `absorb=` is paired with `vcov_type in {"hc2","hc2_bm"}`, the fit promotes the absorb columns to `fixed_effects=` internally so the existing full-dummy-design code path computes the algebraically correct vcov. Empirically matches `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` at ~1e-10 (verified via new `tests/test_estimators_vcov_type.py::TestDiDAbsorbedFERParity` against `benchmarks/data/clubsandwich_cr2_golden.json` scenario `absorbed_fe_did`, with the R generator using the singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF). HC1/CR1 paths unchanged. `MultiPeriodDiD(absorb=...)` and `TwoWayFixedEffects` rejections remain as follow-ups (different fit-path structure). **Behavioral note (full `DiDResults` surface change under auto-route):** under the auto-route, the entire returned `DiDResults` reflects the full-dummy fit rather than the within-transformed fit. Specifically, `result.coefficients` and `result.vcov` include the FE-dummy entries (matching the `fixed_effects=` path), `result.residuals` and `result.fitted_values` are on the un-demeaned outcome scale, and `result.r_squared` is computed on the un-demeaned outcome (so it absorbs the FE variance and will typically be higher than the within-R²). `result.att` is invariant to this routing (FWL guarantee). Downstream consumers reading `result.att` are unaffected; consumers reading the broader result surface should expect the full-dummy values. +- **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2", "hc2_bm"})` now supported** (`diff_diff/estimators.py:382`). Previously raised `NotImplementedError` because the HC2 leverage correction and CR2 Bell-McCaffrey DOF depend on the FULL FE hat matrix, while within-transformation (FWL) preserves coefficients and residuals but not the hat. Lift via internal auto-route: when `absorb=` is paired with `vcov_type in {"hc2","hc2_bm"}`, the fit promotes the absorb columns to `fixed_effects=` internally so the existing full-dummy-design code path computes the algebraically correct vcov. Empirically matches `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` at ~1e-10 (verified via new `tests/test_estimators_vcov_type.py::TestDiDAbsorbedFERParity` against `benchmarks/data/clubsandwich_cr2_golden.json` scenario `absorbed_fe_did`, with the R generator using the singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF). HC1/CR1 paths unchanged. `MultiPeriodDiD(absorb=...)` and `TwoWayFixedEffects` rejections remain as follow-ups (different fit-path structure). **Behavioral note (full `DiDResults` surface change under auto-route):** under the auto-route, the entire returned `DiDResults` reflects the full-dummy fit rather than the within-transformed fit. Specifically, `result.coefficients` and `result.vcov` include the FE-dummy entries (matching the `fixed_effects=` path), `result.residuals` and `result.fitted_values` are on the un-demeaned outcome scale, and `result.r_squared` is computed on the un-demeaned outcome (so it absorbs the FE variance and will typically be higher than the within-R²). `result.att` is invariant to this routing (FWL guarantee). Downstream consumers reading `result.att` are unaffected; consumers reading the broader result surface should expect the full-dummy values. **Survey-design scope:** the auto-route changes the FE handling (and removes the prior absorbed-FE rejection), but `survey_design=` continues to drive its own variance path (Taylor-series linearization or replicate-weight variance, per the existing survey contract) rather than the analytical HC2/HC2-BM sandwich. The auto-route is therefore methodologically meaningful for non-survey fits and for the FE-handling side of survey fits; analytical small-sample inference under `vcov_type in {"hc2","hc2_bm"}` is bypassed when a survey design is supplied. - **BaconDecomposition R parity goldens.** Closes the PR-B deferral row in `TODO.md`. JSON goldens at `benchmarks/data/r_bacondecomp_golden.json` generated from the committed `benchmarks/R/generate_bacon_golden.R` script (3 fixtures: `uniform_3groups_with_never_treated`, `two_groups_no_never_treated`, `always_treated_remapped`) against `bacondecomp 0.1.1` on R 4.5.2. `tests/test_methodology_bacon.py::TestBaconParityR` now active (4 tests, no skips): TWFE coefficient parity at `atol=1e-6` across all 3 fixtures; weights-sum parity at `atol=1e-6` across all 3 fixtures; per-component estimate + weight parity at `atol=1e-6` on the 2 non-remap fixtures **and on the 6 timing-vs-timing rows of `always_treated_remapped`** (carve-out narrowed to U-bucket rows only); plus a dedicated fold-back test (`test_always_treated_remapped_fold_back_matches_r`) that pins the **documented convention divergence** on `always_treated_remapped` (R keeps `first_treat=1` as a distinct timing cohort and emits `Later vs Always Treated` comparisons; Python's paper-footnote-11 convention remaps those units to `U` and folds them into a single `treated_vs_never` cell per treated cohort) by aggregating R's split rows per cohort and asserting they match Python's single fold at `atol=1e-6`. The aggregate is invariant per Theorem 1; the per-component breakdown differs structurally between conventions but the fold-back is now directly asserted. New `**Note (R parity convention divergence on always-treated)**` and `**Deviation (first-period boundary extension on always-treated remap)**` in `docs/methodology/REGISTRY.md`. **First-period boundary deviation:** the paper uses strict `t_i < 1` for the always-treated bucket; the library uses the inclusive `first_treat <= min(time)` rule and folds `first_treat == min(time)` cohorts into `U`. R does NOT apply this fold (it keeps such cohorts as their own bucket). When `min(time) > 1` the rules coincide. Explicitly labeled in REGISTRY's Deviations block and mirrored in `METHODOLOGY_REVIEW.md` and `bacon.py`. METHODOLOGY_REVIEW.md tracker row promoted `**Complete** (R parity goldens pending)` → `**Complete**`. - **`generate_ddd_panel_data` — panel-structured DGP for Triple-Difference power analysis** (`diff_diff/prep_dgp.py`). New public function exported from `diff_diff` and `diff_diff.prep` for panel DDD simulations. Cross-sectional `generate_ddd_data` remains available unchanged. Produces a balanced panel of `n_units × n_periods` with two unit-level binary dimensions (`group`, `partition`) and a derived `post = 1[period >= treatment_period]` indicator; columns: `unit, period, outcome, group, partition, post, treated, true_effect` (+ `x1, x2` when `add_covariates=True`). DDD-CPT identification holds because the `group * partition` interaction enters as a unit-level (time-invariant) term, leaving the triple-interaction `treatment_effect * group * partition * post` as the sole source of differential group × partition trend. Compatible with `TripleDifference(cluster="unit").fit(..., time="post")` (the cluster kwarg is required because `TripleDifference` is the repeated-cross-section `panel=FALSE` estimator and unclustered SE on panel-generated rows understates variance under within-unit serial correlation; the point estimate `att` is invariant to clustering — see the new `TripleDifference` REGISTRY note on panel-shaped input). Users get panel-realistic unit fixed effects and within-unit serial correlation while the binary 2×2×2 estimator surface is unchanged. **Stratified allocation:** the partition split is drawn stratified-by-group at the requested `partition_frac` so every `(group, partition)` cell receives at least one unit; a targeted `ValueError` is raised at fit-time when the rounded cell counts (`n_units`, `group_frac`, `partition_frac`) would leave any cell empty. This guarantees the 2x2x2 DDD surface is populated for any valid input — independent marginal sampling (the cross-sectional `generate_ddd_data` convention) could collapse cells when marginals are small (e.g., `n_units=4, group_frac=partition_frac=0.25`). Validates `1 <= treatment_period < n_periods`, `group_frac` and `partition_frac` strictly in `(0, 1)`, and `n_units >= 4`. Deterministic recovery (`noise_sd=0`) matches `treatment_effect` to ~1e-15 (covered by `tests/test_prep.py::TestGenerateDddPanelData`, 16 tests including infeasible-config rejection and smallest-feasible-config round-trip through `TripleDifference.fit`). `power.simulate_power` is NOT yet auto-routed to the panel DGP for `TripleDifference` (the existing `_ddd_dgp_kwargs` registry entry still ignores `n_periods` and the existing `_check_ddd_dgp_compat` warning still fires on non-default kwargs) — that wiring is tracked as a follow-up in TODO.md. - **BaconDecomposition: Goodman-Bacon (2021) methodology audit (PR-B).** Closes the BaconDecomposition row in `METHODOLOGY_REVIEW.md` (status flipped from **In Progress** → **Complete** — initially with an R-parity-goldens caveat that was closed by the parity-goldens bullet above in this same release). Builds on the PR #451 paper review at `docs/methodology/papers/goodman-bacon-2021-review.md`. **Audit outcomes:** (1) Rewrote `_recompute_exact_weights` in `bacon.py` to actually implement Theorem 1 (Eqs. 7-9 + 10e-g) — the prior "exact" implementation was missing the `(1-n_kU)` factor in the subsample variance, did not square the sample share, and added an extraneous `unit_share` factor not present in the paper; the post-hoc sum-to-1 normalization masked the relative-weight error but produced ~0.3% decomposition error vs TWFE on a 3-cohort + never-treated DGP. The rewrite computes the exact numerators of Eqs. 10e/f/g and lets the post-hoc normalization handle the `V̂^D` denominator (Theorem 1's identity guarantees `V̂^D = Σ numerators`). The TWFE-vs-weighted-sum identity now holds at `atol=1e-10` on both noisy and hand-calculable DGPs. (2) Added always-treated warn+remap per paper footnote 11: units whose `first_treat` is at or before the first observable period (`first_treat <= min(time)`, excluding the never-treated sentinels `0` and `np.inf`) are automatically remapped to the `U` (untreated) bucket via an internal column (`__bacon_first_treat_internal__`) with a `UserWarning`. Detection uses ordered-time logic on the **time axis**, so panels whose `time` column has negative or zero-crossing labels (event-time encodings) are handled correctly; the `0` sentinel restriction applies only to `first_treat`, not to `time`, and a real treatment cohort with `first_treat == 0` would still be folded into U today (re-label such cohorts to a non-sentinel value before fitting). The user's original `first_treat` column is preserved unchanged. The count is surfaced as a new `BaconDecompositionResults.n_always_treated_remapped` dataclass field, rendered in `summary()` output when nonzero. **`n_never_treated` reports TRUE never-treated only**, computed from the original user column before remap — remapped always-treated units appear separately as `n_always_treated_remapped`, no double-counting. (3) New methodology test file `tests/test_methodology_bacon.py` (34 tests across 6 classes post-release; the audit added ~24 tests and the R-parity-goldens bullet above expanded coverage: `TestBaconHandCalculation` hand-checks Eqs. 7-9 + 10b-d on a minimal balanced panel at `atol=1e-10`; `TestBaconParityR` (4 tests, all active post-release once the R parity goldens bullet above landed; skips cleanly with a regenerate-instructions pointer in partial-checkout scenarios where the JSON is unavailable); `TestBaconAlwaysTreatedRemap` regression-tests warn+remap mechanics including user-data-preservation; `TestBaconEdgeCases` exercises no-untreated, single-cohort, unbalanced panel, constant-ATT recovery; `TestBaconWeightModes` locks the new exact-is-default contract; `TestBaconSurveyDesignNarrowing` confirms survey_design composes with exact mode and warn+remap). (4) R `bacondecomp::bacon()` parity generator committed at `benchmarks/R/generate_bacon_golden.R` covering three DGP fixtures (3-groups-with-U, 2-groups-no-U, always-treated-remapped); the JSON goldens deferral at audit time was closed in this same release by the parity-goldens bullet above. (5) `docs/methodology/REGISTRY.md` `## BaconDecomposition` block replaced with the paper-review-sourced entry plus three new sub-notes: weight modes (exact vs approximate), always-treated remap, R parity status. **Explicit removal:** the prior REGISTRY block's "Weights may be negative for later-vs-earlier comparisons" claim was incorrect per Theorem 1 (decomposition weights are strictly positive and sum to 1; negative weights are an estimand-level phenomenon, not estimator-level) and is dropped from the new entry. Closes the BaconDecomposition follow-up tracked at `TODO.md` (the prior row added in PR #451 is replaced by a narrower R-parity-goldens deferral row). diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index e3857cb25..647e16b42 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -2549,7 +2549,7 @@ Shipped in `diff_diff/had_pretests.py` as `stute_joint_pretest()` (residuals-in - [x] Phase 1a: Univariate local-linear regression at a boundary (`local_linear_fit` in `diff_diff/local_linear.py`). - [x] Phase 1a: HC2 + Bell-McCaffrey DOF correction in `diff_diff/linalg.py` via `vcov_type="hc2_bm"` enum (both one-way and CR2 cluster-robust with Imbens-Kolesar / Pustejovsky-Tipton Satterthwaite DOF). Weighted cluster CR2 raises `NotImplementedError` and is tracked as Phase 2+ in `TODO.md`. - **Note (scope limitation on absorbed FE):** HC2 and HC2 + Bell-McCaffrey on within-transformed designs still depend on the FULL FE hat matrix because FWL preserves coefficients and residuals but NOT the hat matrix: `h_ii = x_i' (X'X)^{-1} x_i` on the reduced design is not the diagonal of the full FE projection, and CR2's block adjustment `A_g = (I - H_gg)^{-1/2}` likewise depends on the full cluster-block hat matrix. The status across the three estimators that previously rejected this combination: - - **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})` — SUPPORTED (auto-route).** When the user pairs `absorb=` with HC2 / HC2-BM, `DiD.fit()` internally promotes the absorb columns to `fixed_effects=` so the existing full-dummy code path computes the algebraically correct vcov from the full FE projection. Verified at ~1e-10 vs `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` (singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF; PT2018 §3.3 unweighted CR2 algebra). **User-visible surface change**: under the auto-route, the entire `DiDResults` (coefficients, vcov, residuals, fitted_values, r_squared) reflect the full-dummy fit rather than the within-transformed fit — the FE-dummy entries are included in `result.coefficients` / `result.vcov`, `r_squared` is computed on the un-demeaned outcome, and `residuals` / `fitted_values` are on the original scale. `result.att` is unaffected (FWL-equivalent). HC1/CR1 paths on `absorb=` are unchanged (no leverage term). + - **`DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})` — SUPPORTED (auto-route).** When the user pairs `absorb=` with HC2 / HC2-BM, `DiD.fit()` internally promotes the absorb columns to `fixed_effects=` so the existing full-dummy code path computes the algebraically correct vcov from the full FE projection. Verified at ~1e-10 vs `lm() + sandwich::vcovHC(type="HC2")` and `lm() + clubSandwich::vcovCR(cluster=..., type="CR2")` (singleton-cluster CR2 trick for one-way HC2-BM Satterthwaite DOF; PT2018 §3.3 unweighted CR2 algebra). **User-visible surface change**: under the auto-route, the entire `DiDResults` (coefficients, vcov, residuals, fitted_values, r_squared) reflect the full-dummy fit rather than the within-transformed fit — the FE-dummy entries are included in `result.coefficients` / `result.vcov`, `r_squared` is computed on the un-demeaned outcome, and `residuals` / `fitted_values` are on the original scale. `result.att` is unaffected (FWL-equivalent). HC1/CR1 paths on `absorb=` are unchanged (no leverage term). **Survey-design scope**: when `survey_design=` is supplied, the existing survey variance path (Taylor-series linearization / replicate weights) takes precedence over the analytical HC2/HC2-BM sandwich; the auto-route only changes the FE handling (removing the prior reject) and does not redirect to the analytical small-sample sandwich on survey fits. - **`TwoWayFixedEffects(vcov_type in {"hc2","hc2_bm"})` — still rejects.** TWFE is a standalone class with no `fixed_effects=` equivalent path, so the same auto-route surgery would require building the full-dummy design inline. Tracked as a follow-up in `TODO.md`. - **`MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})` — still rejects.** Same algebraic situation as DiD, but the `MultiPeriodDiD.fit()` body has many additional branches (cohort, event-study, survey, etc.) that complicate the auto-route surgery. Tracked as a follow-up in `TODO.md`. - Workarounds for the still-rejecting paths: use `vcov_type="hc1"` (HC1/CR1 have no leverage term and survive FWL), or switch to `fixed_effects=` dummies so the hat matrix is computed on the full design. From 4a92f6aca6bcfb5bc99f7c0193fd76960f8ba0a1 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 17:12:02 -0400 Subject: [PATCH 5/6] PR #458 R4 polish: preserve absorb+fixed_effects rejection on hc2/hc2_bm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R4 informational P3: my prior reordering moved the auto-route ahead of the multi-absorb-survey guard (correct) but ALSO left it ahead of the existing `absorb + fixed_effects` mutual-exclusion check. On hc2/hc2_bm the user-facing rejection vanished — the two args silently merged. Move the `absorb + fixed_effects` validation ABOVE the auto-route so the public-contract rejection fires regardless of vcov_type. Add a regression test that pins the rejection across hc1/hc2/hc2_bm to prevent silent regression on this contract. The legacy multi-absorb + survey-weights guard stays BELOW the auto-route (intentional from R2: when auto-routing, the demeaning rationale of that guard doesn't apply). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/estimators.py | 25 ++++++++++++++++--------- tests/test_estimators_vcov_type.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index 1f2fde9ab..84e712cfa 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -344,6 +344,22 @@ def fit( n_treated_raw = int(np.sum(data[treatment].values.astype(float))) n_control_raw = len(data) - n_treated_raw + # Reject the `absorb + fixed_effects` mutual-exclusion combination + # BEFORE any auto-route. R4 review caught a contract-drift where the + # auto-route silently merged the two arguments on the HC2/HC2-BM + # path — the public API has always treated this combination as + # invalid (different FE-handling paths; mixing them violates the + # FWL theorem on the demeaned half), so keep the explicit rejection + # in front of the auto-route to preserve user-facing behavior. + if absorb and fixed_effects: + raise ValueError( + "Cannot use both absorb and fixed_effects. " + "The absorb within-transformation does not residualize " + "fixed_effects dummies, violating the FWL theorem. " + "Use absorb alone (for high-dimensional FE) " + "or fixed_effects alone (for low-dimensional FE)." + ) + # Auto-route absorb → fixed_effects when vcov_type needs the FULL FE # hat matrix. HC2 leverage and CR2 Bell-McCaffrey DOF both depend on # the full-design hat; FWL preserves coefficients and residuals but @@ -387,15 +403,6 @@ def fit( "Use absorb with a single variable, or use fixed_effects= instead." ) - if absorb and fixed_effects: - raise ValueError( - "Cannot use both absorb and fixed_effects. " - "The absorb within-transformation does not residualize " - "fixed_effects dummies, violating the FWL theorem. " - "Use absorb alone (for high-dimensional FE) " - "or fixed_effects alone (for low-dimensional FE)." - ) - # Validate vcov_type="conley" wire-up. DiD.fit() accepts `unit` # as a fit-time arg (NOT on __init__) because cluster/unit # semantics on DiD are opt-in rather than auto-derived (unlike diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index e8266586d..31b22fd59 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -1095,6 +1095,36 @@ def test_absorb_hc2_bm_clustered_matches_clubsandwich(self): np.testing.assert_allclose(res.se, expected_se_slope, atol=1e-10) np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) + def test_absorb_plus_fixed_effects_still_rejected_under_hc2_bm(self): + """Mutual-exclusion of `absorb=` and `fixed_effects=` is preserved. + + R4 review caught that the auto-route initially merged the two + arguments silently on the HC2/HC2-BM path. The public API has + always treated this combination as invalid; the rejection must + fire regardless of `vcov_type`. + """ + d = self._load_golden() + data = pd.DataFrame( + { + "unit": d["unit"], + "period": d["period"], + "treated": d["treated"], + "post": d["post"], + "y": d["y"], + } + ) + for vcov in ("hc1", "hc2", "hc2_bm"): + with pytest.raises(ValueError, match="Cannot use both absorb and fixed_effects"): + DifferenceInDifferences(vcov_type=vcov).fit( + data, + outcome="y", + treatment="treated", + time="post", + absorb=["unit"], + fixed_effects=["period"], + unit="unit", + ) + def test_absorb_hc2_bm_survey_multi_absorb_auto_routes(self): """Survey-weighted multi-absorb + HC2-BM should auto-route, not reject. From 3b49324b55bb23ca3966b34d5a49f9e50c1231b6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 16 May 2026 17:18:08 -0400 Subject: [PATCH 6/6] PR #458 R5 polish: pin HC2 SE against sandwich::vcovHC in-tree R5 informational P3: CHANGELOG/REGISTRY claimed parity against BOTH `sandwich::vcovHC(type="HC2")` AND `clubSandwich::vcovCR(...)`, but the R generator only materialized CR2-derived targets for `absorbed_fe_did`. The HC2-parity claim was verified in a throwaway smoke test but not pinned in-tree. Added `vcov_hc2` to the R generator output (computed via `sandwich::vcovHC(fit_did, type = "HC2")`) and a corresponding Python parity test `test_absorb_hc2_matches_sandwich_vcovhc` that asserts the treat_post slope SE matches at 1e-10. Replaces the prior weaker `test_absorb_hc2_matches_full_dummy_design` (which only checked finite-SE + ATT parity). Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/R/generate_clubsandwich_golden.R | 5 +++++ benchmarks/data/clubsandwich_cr2_golden.json | 4 +++- tests/test_estimators_vcov_type.py | 19 +++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/benchmarks/R/generate_clubsandwich_golden.R b/benchmarks/R/generate_clubsandwich_golden.R index 163a1ecf3..a4e748cb2 100644 --- a/benchmarks/R/generate_clubsandwich_golden.R +++ b/benchmarks/R/generate_clubsandwich_golden.R @@ -94,6 +94,9 @@ make_did_panel <- function(n_units, n_periods, treatment_period, seed) { d_did <- make_did_panel(n_units = 8, n_periods = 4, treatment_period = 2, seed = 404) fit_did <- lm(y ~ treat_post + unit + period, data = d_did) +# HC2 via sandwich::vcovHC(type = "HC2"). Pins the in-tree HC2-parity claim +# the changelog/registry make for the absorb auto-route on the hc2 lane. +vcov_did_hc2 <- sandwich::vcovHC(fit_did, type = "HC2") # HC2-BM unclustered via singleton-cluster CR2 (PT2018-blessed workaround, # since clubSandwich::vcovCR requires a cluster arg). vcov_did_hc2_bm <- vcovCR(fit_did, cluster = seq_len(nrow(d_did)), type = "CR2") @@ -109,6 +112,8 @@ output$absorbed_fe_did <- list( y = d_did$y, coef = as.numeric(coef(fit_did)), coef_names = names(coef(fit_did)), + vcov_hc2 = as.numeric(vcov_did_hc2), + vcov_hc2_shape = dim(vcov_did_hc2), vcov_hc2_bm = as.numeric(vcov_did_hc2_bm), vcov_hc2_bm_shape = dim(vcov_did_hc2_bm), dof_hc2_bm = as.numeric(ct_did_hc2_bm$df_Satt), diff --git a/benchmarks/data/clubsandwich_cr2_golden.json b/benchmarks/data/clubsandwich_cr2_golden.json index 0d7573d72..e7f453e71 100644 --- a/benchmarks/data/clubsandwich_cr2_golden.json +++ b/benchmarks/data/clubsandwich_cr2_golden.json @@ -40,6 +40,8 @@ "y": [0.8675660610026393, 1.974964371815635, 0.6262455759968815, 0.4250911204896504, 0.01746049540615028, 1.285747228403441, 0.3303840020024503, -0.5671521618639435, -0.1142677095292809, 0.2383353498523538, -0.9019587987113207, -0.7061707682056644, -0.6157969345539587, -0.2881843268354549, -0.8240764992194235, -1.200211068425945, -1.671803432337748, 1.089318887196633, 0.5606170281167306, -0.7090977528526379, 1.811753709724553, 4.592189277336034, 3.604625926706315, 3.458451392091618, -1.216703112388791, 1.963130563935098, 1.026428163741928, 0.01788553114847297, 0.3145236664361374, 3.957995942075363, 2.327538808912606, 2.632995686497729], "coef": [0.9779587643060759, 2.240053222690117, -0.7068568913391788, -1.34448226397468, -1.705533989584897, -2.836248016813048, 0.7132483771208394, -2.205821412734612, -0.3452431733633319, 0.8075689574073665, -0.2003926783717501, -0.625144206955112], "coef_names": ["(Intercept)", "treat_post", "unit2", "unit3", "unit4", "unit5", "unit6", "unit7", "unit8", "period2", "period3", "period4"], + "vcov_hc2": [0.01979818801269553, 0.01704378165137596, -0.005421788736264531, -0.007394541378303345, -0.01007755553179412, -0.01979818801269563, -0.01979818801269567, -0.01979818801269571, -0.01979818801269547, -0.0169124955660397, -0.01703244626736469, -0.01718640312072378, 0.01704378165137579, 0.05093938388325538, 0.002835465294824099, 0.0002051284387722519, -0.003372223765882097, -0.02939912555974632, -0.03715209916350911, -0.0309624629156964, -0.03762991555249129, -0.02761548760605972, -0.02378282170065065, -0.01961967560929413, -0.00542178873626447, 0.002835465294824094, 0.02976700151619221, 0.007548387707382555, 0.007548387707382555, 0.005421788736264518, 0.005421788736264553, 0.005421788736264581, 0.005421788736264445, -0.003976372159256039, -0.002159982546893247, -0.002370041178322776, -0.007394541378303282, 0.000205128438772278, 0.007548387707382555, 0.03734647219452555, 0.007548387707382489, 0.00739454137830333, 0.00739454137830335, 0.007394541378303386, 0.007394541378303256, -0.002708837702167804, 0.001362961531749848, 0.0007304908541013697, -0.01007755553179409, -0.003372223765882121, 0.007548387707382559, 0.007548387707382493, 0.02535026330562543, 0.01007755553179413, 0.01007755553179416, 0.01007755553179419, 0.01007755553179406, 0.00649169555236468, 0.001083309511383738, 0.002541666233898231, -0.01979818801269547, -0.02939912555974642, 0.005421788736264515, 0.007394541378303358, 0.01007755553179411, 0.06273419310418629, 0.02880840281549311, 0.02416617562963358, 0.02916676510722959, 0.01380146230054344, 0.0180308113184583, 0.01929907133512627, -0.01979818801269544, -0.0371520991635092, 0.005421788736264488, 0.007394541378303306, 0.01007755553179407, 0.02880840281549301, 0.04210673892260868, 0.02998090583245564, 0.03498149531005166, 0.01935917385427087, 0.01684380475617711, 0.01492836634367997, -0.01979818801269544, -0.03096246291569642, 0.0054217887362645, 0.007394541378303325, 0.01007755553179408, 0.02416617562963344, 0.02998090583245557, 0.04498810890156583, 0.03033926812419206, 0.01701886790849983, 0.01600723525328585, 0.01810524179234228, -0.01979818801269551, -0.03762991555249152, 0.005421788736264507, 0.007394541378303347, 0.01007755553179411, 0.02916676510722974, 0.03498149531005192, 0.03033926812419238, 0.07992984798041554, 0.01747047820084438, 0.01724793374153723, 0.01641293301174643, -0.01691249556603961, -0.02761548760605984, -0.003976372159256052, -0.0027088377021678, 0.006491695552364642, 0.01380146230054344, 0.0193591738542709, 0.01701886790849991, 0.01747047820084427, 0.03261047319081289, 0.02139005700498882, 0.01930848395931052, -0.01703244626736447, -0.02378282170065067, -0.00215998254689334, 0.001362961531749778, 0.001083309511383625, 0.01803081131845818, 0.01684380475617698, 0.01600723525328575, 0.01724793374153696, 0.02139005700498871, 0.03249273520542056, 0.01739215100660593, -0.01718640312072365, -0.01961967560929417, -0.002370041178322803, 0.0007304908541013678, 0.002541666233898184, 0.01929907133512625, 0.01492836634367995, 0.0181052417923423, 0.01641293301174627, 0.01930848395931047, 0.01739215100660598, 0.03176936240997028], + "vcov_hc2_shape": [12, 12], "vcov_hc2_bm": [0.01979818801269555, 0.01704378165137592, -0.005421788736264505, -0.007394541378303356, -0.0100775555317941, -0.0197981880126956, -0.01979818801269569, -0.01979818801269571, -0.01979818801269561, -0.01691249556603976, -0.01703244626736457, -0.01718640312072384, 0.01704378165137601, 0.05093938388325554, 0.002835465294824082, 0.0002051284387722219, -0.003372223765882113, -0.02939912555974648, -0.0371520991635093, -0.03096246291569646, -0.03762991555249157, -0.02761548760605984, -0.02378282170065062, -0.01961967560929424, -0.005421788736264438, 0.002835465294824168, 0.02976700151619218, 0.007548387707382551, 0.007548387707382544, 0.005421788736264468, 0.005421788736264506, 0.005421788736264558, 0.005421788736264465, -0.003976372159256019, -0.002159982546893317, -0.002370041178322755, -0.007394541378303251, 0.0002051284387723454, 0.007548387707382537, 0.03734647219452555, 0.007548387707382478, 0.007394541378303288, 0.007394541378303312, 0.007394541378303364, 0.007394541378303284, -0.00270883770216778, 0.001362961531749783, 0.0007304908541013994, -0.01007755553179405, -0.003372223765882042, 0.007548387707382542, 0.00754838770738249, 0.02535026330562543, 0.01007755553179408, 0.01007755553179411, 0.01007755553179415, 0.01007755553179408, 0.006491695552364699, 0.001083309511383657, 0.002541666233898244, -0.0197981880126956, -0.02939912555974642, 0.00542178873626451, 0.007394541378303376, 0.01007755553179411, 0.06273419310418635, 0.0288084028154932, 0.02416617562963359, 0.02916676510722985, 0.01380146230054354, 0.01803081131845819, 0.01929907133512639, -0.01979818801269557, -0.03715209916350923, 0.005421788736264483, 0.007394541378303324, 0.01007755553179407, 0.02880840281549308, 0.04210673892260878, 0.02998090583245565, 0.0349814953100519, 0.01935917385427097, 0.016843804756177, 0.01492836634368009, -0.01979818801269557, -0.03096246291569643, 0.005421788736264495, 0.007394541378303343, 0.01007755553179408, 0.0241661756296335, 0.02998090583245566, 0.04498810890156586, 0.03033926812419229, 0.01701886790849994, 0.01600723525328574, 0.01810524179234239, -0.01979818801269564, -0.03762991555249155, 0.005421788736264502, 0.007394541378303364, 0.01007755553179411, 0.02916676510722981, 0.03498149531005202, 0.03033926812419241, 0.07992984798041579, 0.01747047820084448, 0.01724793374153712, 0.01641293301174654, -0.0169124955660397, -0.02761548760605989, -0.003976372159256066, -0.002708837702167787, 0.006491695552364641, 0.01380146230054349, 0.01935917385427101, 0.01701886790849995, 0.01747047820084444, 0.03261047319081297, 0.02139005700498877, 0.01930848395931058, -0.01703244626736455, -0.02378282170065071, -0.002159982546893346, 0.001362961531749795, 0.001083309511383624, 0.01803081131845822, 0.01684380475617709, 0.01600723525328581, 0.01724793374153713, 0.02139005700498877, 0.03249273520542049, 0.01739215100660599, -0.01718640312072374, -0.01961967560929423, -0.002370041178322815, 0.0007304908541013794, 0.00254166623389818, 0.0192990713351263, 0.01492836634368007, 0.01810524179234235, 0.01641293301174643, 0.01930848395931052, 0.01739215100660594, 0.03176936240997035], "vcov_hc2_bm_shape": [12, 12], "dof_hc2_bm": [3.471999999999976, 9.658291457286458, 5.718347107438014, 5.718347107438014, 5.718347107438015, 6.923325736969802, 6.923325736969789, 6.923325736969772, 6.923325736969772, 6.504670366860701, 6.504670366860701, 6.50467036686071], @@ -51,7 +53,7 @@ "source": "clubSandwich", "clubSandwich_version": "0.7.0", "R_version": "R version 4.5.2 (2025-10-31)", - "generated_at": "2026-05-16 20:31:13 UTC", + "generated_at": "2026-05-16 21:17:21 UTC", "note": "CR2 Bell-McCaffrey cluster-robust parity target for diff_diff._compute_cr2_bm" } } diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index 31b22fd59..2f346049e 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -1052,14 +1052,25 @@ def test_absorb_hc2_bm_matches_clubsandwich_singleton_cluster(self): # standalone field on DiDResults; the SE+ATT parity above suffices). _ = expected_dof_slope - def test_absorb_hc2_matches_full_dummy_design(self): - """`absorb=` + `hc2` produces a finite SE; ATT matches R.""" + def test_absorb_hc2_matches_sandwich_vcovhc(self): + """`absorb=` + `hc2` matches `lm() + sandwich::vcovHC(type="HC2")`. + + Pins the HC2 SE on the treat_post slope against an external R target + (the R generator computes `sandwich::vcovHC(fit_did, type="HC2")` on + the full-dummy design and stores `vcov_hc2`). + """ d = self._load_golden() + if "vcov_hc2" not in d: + pytest.skip( + "Golden JSON does not yet include `vcov_hc2` for absorbed_fe_did; " + "regenerate via the R script." + ) res = self._fit_absorb(d, "hc2") coef_names = d["coef_names"] treat_post_idx = coef_names.index("treat_post") - assert np.isfinite(res.att) - assert np.isfinite(res.se) + expected_vcov = np.asarray(d["vcov_hc2"]).reshape(d["vcov_hc2_shape"]) + expected_se_slope = float(np.sqrt(expected_vcov[treat_post_idx, treat_post_idx])) + np.testing.assert_allclose(res.se, expected_se_slope, atol=1e-10) np.testing.assert_allclose(res.att, float(d["coef"][treat_post_idx]), atol=1e-10) def test_absorb_hc2_bm_clustered_matches_clubsandwich(self):