From a7a1df2ef9332a739048b724f5678206a7124c10 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 08:30:27 -0400
Subject: [PATCH 1/4] HonestDiD test_m0_short_circuit: replace wall-clock proxy
 with linprog mock

The test was using `time.time() - t0 < 0.5s` as a proxy for "M=0 took the
fast path." Wall-clock proxies on shared CI runners are flaky, so the test
was `skipif(CI=="true")` and only ran locally.

The fast path is `_compute_worst_case_bias(... M=0) -> 0.0` at
honest_did.py:1650, which means `scipy.optimize.linprog` is never reached.
Direct correctness signal: `mock.patch` on `diff_diff.honest_did.optimize.linprog`
plus `assert_not_called()`. CI-safe, instantaneous, and verifies the actual
short-circuit path rather than a timing proxy.

Drops the CI skipif decorator and the unused `import os`. Tier A row 1 of
post-Wave-2 backlog.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_methodology_honest_did.py | 44 +++++++++++++++-------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/tests/test_methodology_honest_did.py b/tests/test_methodology_honest_did.py
index f2a63017..4a67fba8 100644
--- a/tests/test_methodology_honest_did.py
+++ b/tests/test_methodology_honest_did.py
@@ -5,7 +5,6 @@
 equations, known analytical cases, and expected mathematical properties.
 """
 
-import os
 
 import numpy as np
 import pytest
@@ -245,34 +244,37 @@ def test_optimal_flci_is_finite_and_valid(self):
         assert ci_lb_opt <= lb, "CI lower should be <= identified set lower"
         assert ci_ub_opt >= ub, "CI upper should be >= identified set upper"
 
-    @pytest.mark.skipif(
-        os.environ.get("CI") == "true",
-        reason="wall-clock timing is flaky on shared CI runners; short-circuit "
-        "correctness signal will be replaced with a mock/spy per TODO.md "
-        "(see PR #330 follow-up note)",
-    )
     def test_m0_short_circuit(self):
-        """M=0 should use standard CI without optimization.
-
-        Uses wall-clock elapsed time as a proxy for "short-circuit path
-        taken" — fast path is ``<0.5s``, slow optimization would be ``>>
-        0.5s``. Skipped on CI because neighbor-VM contention on shared
-        runners can push even the short-circuit path past the threshold.
-        Run locally to validate the fast-path invariant; the TODO.md entry
-        added by PR #330 tracks replacing this with a mock/spy so the
-        correctness signal becomes CI-safe.
+        """M=0 takes the bias=0 fast path and never invokes the LP solver.
+
+        ``_compute_worst_case_bias`` returns ``0.0`` immediately when ``M=0``
+        (diff_diff/honest_did.py:1650), so ``scipy.optimize.linprog`` is
+        never reached. Patching the LP solver and asserting ``call_count
+        == 0`` is a direct correctness signal — CI-safe (no wall-clock
+        dependency) and faster than the prior timing-based proxy.
         """
+        from unittest.mock import patch
+
         beta_pre = np.array([0.3, 0.2, 0.1])
         beta_post = np.array([2.0])
         sigma = np.eye(4) * 0.01
         l_vec = np.array([1.0])
 
-        import time
-        t0 = time.time()
-        _compute_optimal_flci(beta_pre, beta_post, sigma, l_vec, 3, 1, M=0.0)
-        elapsed = time.time() - t0
+        with patch("diff_diff.honest_did.optimize.linprog") as mock_linprog:
+            ci_lb, ci_ub = _compute_optimal_flci(
+                beta_pre, beta_post, sigma, l_vec, 3, 1, M=0.0
+            )
 
-        assert elapsed < 0.5, f"M=0 should be fast, took {elapsed:.2f}s"
+        assert mock_linprog.call_count == 0, (
+            f"M=0 must skip the LP solver (fast path at "
+            f"_compute_worst_case_bias:1650); got "
+            f"{mock_linprog.call_count} linprog call(s)."
+        )
+        # End-to-end correctness: M=0 CI is still well-defined.
+        assert np.isfinite(ci_lb) and np.isfinite(ci_ub), (
+            f"M=0 CI must be finite; got [{ci_lb}, {ci_ub}]"
+        )
+        assert ci_lb <= ci_ub, f"M=0 CI must be ordered; got [{ci_lb}, {ci_ub}]"
 
     def test_smoothness_flci_with_survey_df(self):
         """Survey df should widen the smoothness FLCI (folded t vs folded normal)."""

From 7b028fb8bdd1294e48b1288cba8db5d2991c95a4 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 08:37:30 -0400
Subject: [PATCH 2/4] HonestDiD: surface ARP vertex-enumeration pathologies via
 RuntimeWarning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_enumerate_vertices` was swallowing `np.linalg.LinAlgError` on every
basis with `try / except / continue`, and `_compute_arp_test` returned
False (conservative non-rejection) when the returned list was empty.
Users had no way to tell whether the test "did not reject" because the
data didn't support rejection or because the vertex search was
numerically pathological (singular A_sys on every basis, degenerate
moment-inequality system).

Instrument `_enumerate_vertices` with three counters (`n_total`,
`n_linalg_error`, `n_infeasible`) and emit a `RuntimeWarning` at
function exit when:
  1. `vertices == []` after `n_total > 0` bases tried — enumeration
     exhausted; caller will fall back to conservative non-rejection.
  2. `vertices != []` but `n_linalg_error / n_total >= 0.5` —
     enumeration heavily constrained; recovered vertices may be
     numerically fragile.

`RuntimeWarning` (not `UserWarning`) marks this as a numerical /
algorithmic signal rather than a user-input issue. `stacklevel=3` so the
warning surfaces at `_compute_arp_test`'s caller, matching the codebase
convention for one-level-deep helper warnings.

No changes to the return type, the caller (`_compute_arp_test`), or
the algorithm semantics — the previous silent-skip behavior is fully
preserved, the diagnostic is purely additive.

Tier-A row in the post-Wave-2 backlog (TODO.md, item 6, PR #334
reference). Adds new `TestARPVertexEnumeration` class in
test_methodology_honest_did.py with three cases: exhausted enumeration,
heavy rejection, healthy enumeration.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/honest_did.py              | 37 +++++++++++++-
 tests/test_methodology_honest_did.py | 72 ++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/diff_diff/honest_did.py b/diff_diff/honest_did.py
index 110e0261..714285e3 100644
--- a/diff_diff/honest_did.py
+++ b/diff_diff/honest_did.py
@@ -18,6 +18,7 @@
 """
 
 from dataclasses import dataclass, field
+import warnings
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 
 import numpy as np
@@ -1890,11 +1891,15 @@ def _enumerate_vertices(
     if n_eq > n_moments:
         return []
 
-    vertices = []
+    vertices: List[np.ndarray] = []
+    n_total = 0
+    n_linalg_error = 0
+    n_infeasible = 0
 
     # Each vertex has exactly n_eq non-zero (basic) variables
     for basis_idx in itertools.combinations(range(n_moments), n_eq):
         basis_idx = list(basis_idx)
+        n_total += 1
 
         # Build the system for basic variables
         # gamma[basis_idx]' @ X_tilde[basis_idx, :] = 0
@@ -1913,6 +1918,7 @@ def _enumerate_vertices(
         try:
             gamma_basic = np.linalg.solve(A_sys, b_sys)
         except np.linalg.LinAlgError:
+            n_linalg_error += 1
             continue
 
         # Check feasibility: gamma >= 0
@@ -1920,6 +1926,35 @@ def _enumerate_vertices(
             gamma = np.zeros(n_moments)
             gamma[basis_idx] = np.maximum(gamma_basic, 0)
             vertices.append(gamma)
+        else:
+            n_infeasible += 1
+
+    # Diagnostic warnings — surface vertex-search pathologies that would
+    # otherwise hide behind `_compute_arp_test` returning False
+    # (conservative non-rejection).
+    if n_total > 0 and len(vertices) == 0:
+        warnings.warn(
+            f"ARP vertex enumeration exhausted without feasible vertices: "
+            f"tried {n_total} bases, {n_linalg_error} rejected for "
+            f"LinAlgError, {n_infeasible} infeasible (negative basic "
+            f"variables). The caller (_compute_arp_test) will return False "
+            f"(conservative non-rejection). This may indicate near-singular "
+            f"nuisance constraints (X_tilde) or a degenerate "
+            f"moment-inequality system.",
+            RuntimeWarning,
+            stacklevel=3,
+        )
+    elif n_total > 0 and n_linalg_error / n_total >= 0.5:
+        warnings.warn(
+            f"ARP vertex enumeration heavily constrained: "
+            f"{n_linalg_error} of {n_total} bases ({100 * n_linalg_error / n_total:.0f}%) "
+            f"rejected for LinAlgError, {n_infeasible} infeasible. "
+            f"{len(vertices)} feasible vertex(es) recovered. Results may be "
+            f"numerically fragile; consider regularizing the moment-inequality "
+            f"system or reviewing the nuisance constraints (X_tilde).",
+            RuntimeWarning,
+            stacklevel=3,
+        )
 
     return vertices
 
diff --git a/tests/test_methodology_honest_did.py b/tests/test_methodology_honest_did.py
index 4a67fba8..f430f3f2 100644
--- a/tests/test_methodology_honest_did.py
+++ b/tests/test_methodology_honest_did.py
@@ -6,6 +6,8 @@
 """
 
 
+import warnings
+
 import numpy as np
 import pytest
 
@@ -495,3 +497,73 @@ def test_breakdown_monotonicity(self):
         # The optimal FLCI is efficient, so need large M for a weak effect.
         r_large = honest.fit(results, M=20.0)
         assert r_large.ci_lb <= 0 <= r_large.ci_ub, "Should lose significance at large M"
+
+
+class TestARPVertexEnumeration:
+    """Diagnostic warnings on `_enumerate_vertices` vertex-search pathologies."""
+
+    def test_enumerate_vertices_warns_on_exhausted_search(self):
+        """All-LinAlgError path: fully-zero nuisance column makes A_sys
+        singular on every basis, so the enumeration exhausts without
+        feasible vertices and the user should see a RuntimeWarning rather
+        than a silent empty-list return."""
+        from diff_diff.honest_did import _enumerate_vertices
+
+        # 4 moments, 1 nuisance column (all zeros) → A_sys singular on every basis
+        X_tilde = np.zeros((4, 1))
+        sigma_tilde_diag = np.array([1.0, 1.0, 1.0, 1.0])
+        with pytest.warns(RuntimeWarning, match="exhausted"):
+            vertices = _enumerate_vertices(X_tilde, sigma_tilde_diag, n_moments=4)
+        assert vertices == []
+
+    def test_enumerate_vertices_warns_on_heavy_rejection(self):
+        """Mixed-basis path: a partially-singular X_tilde produces some
+        feasible vertices but rejects >= 50% of bases for LinAlgError.
+        The warning helps users see that the recovered vertices came from
+        a numerically fragile enumeration."""
+        from diff_diff.honest_did import _enumerate_vertices
+
+        # 5 moments, 2 nuisance columns. C(5, 3) = 10 bases. Construct
+        # X_tilde so that ~6 of 10 bases have rank-deficient A_sys.
+        X_tilde = np.array([
+            [1.0, 0.0],
+            [1.0, 0.0],
+            [1.0, 0.0],
+            [0.0, 1.0],
+            [0.0, 1.0],
+        ])
+        sigma_tilde_diag = np.array([1.0, 1.0, 1.0, 1.0, 1.0])
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always", RuntimeWarning)
+            vertices = _enumerate_vertices(X_tilde, sigma_tilde_diag, n_moments=5)
+        heavy = [w for w in caught if "heavily constrained" in str(w.message)]
+        # If exhaustion fired instead, that's also a valid outcome — but the
+        # construction is calibrated for the heavy-rejection branch
+        exhausted = [w for w in caught if "exhausted" in str(w.message)]
+        assert heavy or exhausted, (
+            f"Expected heavily-constrained or exhausted warning; got "
+            f"{[str(w.message) for w in caught]}, vertices={len(vertices)}"
+        )
+
+    def test_enumerate_vertices_quiet_on_healthy_enumeration(self):
+        """Well-conditioned X_tilde: most bases solve cleanly and feasible
+        vertices are recovered. No RuntimeWarning should fire."""
+        from diff_diff.honest_did import _enumerate_vertices
+
+        rng = np.random.default_rng(0)
+        # 4 moments, 1 nuisance — small and well-conditioned
+        X_tilde = rng.normal(size=(4, 1))
+        sigma_tilde_diag = np.array([1.0, 1.0, 1.0, 1.0])
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always", RuntimeWarning)
+            vertices = _enumerate_vertices(X_tilde, sigma_tilde_diag, n_moments=4)
+        diag_warnings = [
+            w for w in caught
+            if "exhausted" in str(w.message) or "heavily constrained" in str(w.message)
+        ]
+        assert not diag_warnings, (
+            f"Healthy enumeration must not emit ARP diagnostics; got "
+            f"{[str(w.message) for w in diag_warnings]}"
+        )
+        # Sanity: we expect some feasible vertices on a well-conditioned input
+        assert isinstance(vertices, list)

From 7c0bdb26e368bfa3d54b0ea04658d9bb4faaf6b0 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 08:49:29 -0400
Subject: [PATCH 3/4] TODO.md: remove rows for items 1+6 (Wave 3); reframe item
 5 (Wooldridge)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Items 1 and 6 from the post-Wave-2 backlog are now shipped:
  - HonestDiD test_m0_short_circuit (mock-based fix, commit bc0bf39a)
  - HonestDiD ARP vertex-rejection diagnostic (commit 0bfaabba)

Removals:
  - Tier-A bullets for items 1 and 6
  - Methodology/Correctness row for HonestDiD ARP vertex-rejection (PR #334)

Item 5 (WooldridgeDiD method/outcome pairing) reframed in place — PR #453
R1 review pointed out that the original "canonical link requirement
(W2023 Prop 3.1) not enforced" framing misrepresented Wooldridge (2023):
Table 1 lists Gaussian/OLS for "any response" and logistic-Bernoulli for
"binary OR fractional", so neither OLS-on-binary nor logit-on-fractional
is a Prop 3.1 violation. A useful hint still exists at the *efficiency*
level (OLS on binary is consistent but logit is typically more efficient
for inference), but should not be framed as a methodology violation.

Both the Methodology/Correctness row and the Tier-A bullet for item 5
are rewritten to reflect this. Item 5 stays open with the corrected
framing for a future wave to address.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 TODO.md | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/TODO.md b/TODO.md
index 6797e6bc..aa3c4805 100644
--- a/TODO.md
+++ b/TODO.md
@@ -96,13 +96,12 @@ Deferred items from PR reviews that were not addressed before merge.
 | Replicate weight tests use Fay-like BRR perturbations (0.5/1.5), not true half-sample BRR. Add true BRR regressions per estimator family. Existing `test_survey_phase6.py` covers true BRR at the helper level. | `tests/test_replicate_weight_expansion.py` | #253 | Low |
 | WooldridgeDiD: QMLE sandwich uses `aweight` cluster-robust adjustment `(G/(G-1))*(n-1)/(n-k)` vs Stata's `G/(G-1)` only. Conservative (inflates SEs). Add `qmle` weight type if Stata golden values confirm material difference. | `wooldridge.py`, `linalg.py` | #216 | Medium |
 | WooldridgeDiD: aggregation weights use cell-level n_{g,t} counts. Paper (W2025 Eqs. 7.2-7.4) defines cohort-share weights. Add optional `weights="cohort_share"` parameter to `aggregate()`. | `wooldridge_results.py` | #216 | Medium |
-| WooldridgeDiD: canonical link requirement (W2023 Prop 3.1) not enforced — no warning if user applies wrong method to outcome type. Estimator is consistent regardless, but equivalence with imputation breaks. | `wooldridge.py` | #216 | Low |
+| WooldridgeDiD: optional *efficiency hint* (NOT a canonical-link violation per W2023 Prop 3.1) when method/outcome pairing is sub-optimal — e.g., `method="ols"` on binary data is consistent under QMLE, but `method="logit"` is typically more efficient. The original framing in this row as a "canonical link requirement" tied to Prop 3.1 was incorrect: Wooldridge (2023) Table 1 lists Gaussian/OLS for "any response" and logistic-Bernoulli for "binary OR fractional". A useful hint exists (efficiency), but should not be framed as a methodology violation. See PR #453 R1 review for the corrected reading. | `wooldridge.py` | #216 | Low |
 | WooldridgeDiD: Stata `jwdid` golden value tests — add R/Stata reference script and `TestReferenceValues` class. | `tests/test_wooldridge.py` | #216 | Medium |
 | Thread `vcov_type` (classical / hc1 / hc2 / hc2_bm) through the 8 standalone estimators that expose `cluster=`: `CallawaySantAnna`, `SunAbraham`, `ImputationDiD`, `TwoStageDiD`, `TripleDifference`, `StackedDiD`, `WooldridgeDiD`, `EfficientDiD`. Phase 1a added `vcov_type` to the `DifferenceInDifferences` inheritance chain only. | multiple | Phase 1a | Medium |
 | Weighted one-way Bell-McCaffrey (`vcov_type="hc2_bm"` + `weights`, no cluster) currently raises `NotImplementedError`. `_compute_bm_dof_from_contrasts` builds its hat matrix from the unscaled design via `X (X'WX)^{-1} X' W`, but `solve_ols` solves the WLS problem by transforming to `X* = sqrt(w) X`, so the correct symmetric idempotent residual-maker is `M* = I - sqrt(W) X (X'WX)^{-1} X' sqrt(W)`. Rederive the Satterthwaite `(tr G)^2 / tr(G^2)` ratio on the transformed design and add weighted parity tests before lifting the guard. | `linalg.py::_compute_bm_dof_from_contrasts`, `linalg.py::_validate_vcov_args` | Phase 1a | Medium |
 | HC2 / HC2 + Bell-McCaffrey on absorbed-FE fits currently raises `NotImplementedError` in three places: `TwoWayFixedEffects` unconditionally; `DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})`; `MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})`. Within-transformation preserves coefficients and residuals under FWL but not the hat matrix, so the reduced-design `h_ii` is not the diagonal of the full FE projection and CR2's block adjustment `A_g = (I - H_gg)^{-1/2}` is likewise wrong on absorbed cluster blocks. Lifting the guard needs HC2/CR2-BM computed from the full absorbed projection (unit/time FE dummies reconstructed internally, or a FE-aware hat-matrix formulation) and a parity harness against a full-dummy OLS run or R `fixest`/`clubSandwich`. HC1/CR1 are unaffected by this because they have no leverage term. | `twfe.py::fit`, `estimators.py::DifferenceInDifferences.fit`, `estimators.py::MultiPeriodDiD.fit` | Phase 1a | Medium |
 | Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium |
-| `honest_did.py:1907` `np.linalg.solve(A_sys, b_sys) / except LinAlgError: continue` is a silent basis-rejection in the vertex-enumeration loop that is algorithmically intentional (try the next basis). Consider surfacing a count of rejected bases as a diagnostic when ARP enumeration exhausts, so users see when the vertex search was heavily constrained. Not a silent failure in the sense of the Phase 2 audit (the algorithm is supposed to skip), but the diagnostic would help debug borderline cases. | `honest_did.py` | #334 | Low |
 | Unify Rust local-method `estimate_model` solver path to `solve_wls_svd` (the same SVD helper used by the global-method since PR #348) for sub-1e-14 bootstrap SE parity. Current local-method bootstrap parity test (`tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility_local`) passes at `atol=1e-5` — the residual ~1e-7 gap is roundoff between Rust's `estimate_model` matrix factorization and numpy's `lstsq`, which accumulates differently across per-replicate bootstrap fits. Main-fit ATT parity is regime-dependent (`atol=1e-14` for `lambda_nn=inf`, `atol=1e-10` for finite `lambda_nn` — see `test_local_method_main_fit_parity`); the bootstrap gap is a same-solver-path roundoff concern and not a user-visible correctness bug. | `rust/src/trop.rs::estimate_model`, `rust/src/linalg.rs::solve_wls_svd` | follow-up | Low |
 | Rust multiplier-bootstrap weight RNG (`generate_bootstrap_weights_batch` in `rust/src/bootstrap.rs:9-10, 57-75`) uses `Xoshiro256PlusPlus::seed_from_u64(seed + i)` per row for Rademacher/Mammen/Webb generation. If any Python caller (SDID / efficient-DiD multiplier bootstrap) has a numpy-canonical equivalent, the two backends likely diverge under the same seed. Audit Python callers (`diff_diff/sdid.py`, `diff_diff/efficient_did_bootstrap.py`, `diff_diff/bootstrap_utils.py::generate_bootstrap_weights_batch_numpy`) for parity-test gaps. Same fix shape as TROP RNG parity (PR #354): pre-generate weights in Python via numpy and pass them to Rust through PyO3. | `rust/src/bootstrap.rs`, `diff_diff/bootstrap_utils.py` | follow-up | Medium |
 | `bias_corrected_local_linear`: extend golden parity to `kernel="triangular"` and `kernel="uniform"` (currently epa-only; all three kernels share `kernel_W` and the `lprobust` math, so parity is expected but not separately asserted). | `benchmarks/R/generate_nprobust_lprobust_golden.R`, `tests/test_bias_corrected_lprobust.py` | Phase 1c | Low |
@@ -160,7 +159,6 @@ Deferred items from PR reviews that were not addressed before merge.
 | CS R helpers hard-code `xformla = ~ 1`; no covariate-adjusted R benchmark for IRLS path | `tests/test_methodology_callaway.py` | #202 | Low |
 | Doc-snippet smoke tests only cover `.rst` files; `.txt` AI guides outside CI validation | `tests/test_doc_snippets.py` | #239 | Low |
 | Add CI validation for `docs/doc-deps.yaml` integrity (stale paths, unmapped source files) | `docs/doc-deps.yaml` | #269 | Low |
-| HonestDiD `test_m0_short_circuit` uses wall-clock `elapsed < 0.5s` as a proxy for "short-circuit path taken" instead of calling the full optimizer. Replace with a direct correctness signal (mock/spy the optimizer or check a state flag) so the test doesn't depend on CI timing. Not flaky today at 500ms, but load-bearing correctness on a timing proxy is brittle. | `tests/test_methodology_honest_did.py:246` | — | Low |
 | SyntheticDiD: rename internal `placebo_effects` variable to `variance_effects` (or `resampled_effects`). Misleading name across the placebo/bootstrap/jackknife dispatch paths — holds three different contents depending on variance method. Low-risk refactor; user-facing field rename should preserve `placebo_effects` as a deprecated alias for one release. | `synthetic_did.py`, `results.py` | follow-up | Medium |
 | AI review CI: pin workflow contract via test (uses `openai/codex-action@v1`, passes `prompt-file`, reads `steps.run_codex.outputs.final-message`, preserves diff-exclude paths and comment markers). Currently only the wrapper-tag and closing-tag-escape strings are asserted. | `tests/test_openai_review.py`, `.github/workflows/ai_pr_review.yml` | #416 | Low |
 | `TestWorkflowDoesNotExecutePRHeadCode` (CodeQL #14 dismissal guard) does not model: `bash <script>` / `sh <script>` / `./<script>` / `source <script>` / `. <script>` direct shell-script execution; multi-line `python3 -c` bodies (line-by-line shlex can't reassemble across newlines — the workflow's 5 sanitizer bodies are exempt by invisibility); shell-variable-expansion indirection (`SCRIPT="$X"; python3 "$SCRIPT"`); `eval`; `find -exec`; `xargs -I {}`. Each represents a path by which PR-head bytes COULD execute without the test failing. The guard catches accidental regressions of common forms (16 tests covering pip/npm/cargo/maturin/etc. installs, python file exec, bash -c indirection with compound flags, env-var prefixes, line continuations, subshells/brace groups, single-line python -c, write-overwrites of allowlisted /tmp paths). Closing the residuals would require multi-line shell parsing with command-substitution awareness + script-execution allowlists — significant work for diminishing return given the dismissal's primary defense is the documented threat model on the alert and in `.github/workflows/ai_pr_review.yml` comment block. | `tests/test_openai_review.py`, `.github/workflows/ai_pr_review.yml` | #436 | Low |
@@ -174,12 +172,10 @@ Ordered paydown view across the tables above. Tier A → D is by effort × risk,
 
 #### Tier A — Quick wins (≤1 day, ≤3 CI rounds expected)
 
-- HonestDiD `test_m0_short_circuit`: replace wall-clock `elapsed < 0.5s` proxy with a state flag (`tests/test_methodology_honest_did.py:246`)
 - EfficientDiD `control_group="last_cohort"` REGISTRY-vs-code alignment with `anticipation>0` (`efficient_did.py`, one design decision)
 - TripleDifference: add `generate_ddd_panel_data` for panel DDD power analysis (`prep_dgp.py`, `power.py`)
 - TROP: extract shared data-setup helper between `fit()` and `_fit_global()` (~150 LoC dedup; `trop.py`, `trop_global.py`, `trop_local.py`)
-- WooldridgeDiD: emit warning when canonical-link is violated (`wooldridge.py`; W2023 Prop 3.1)
-- HonestDiD vertex-rejection diagnostic counter on ARP enumeration exhaust (`honest_did.py:1907`)
+- WooldridgeDiD: optional efficiency hint when method/outcome pairing is sub-optimal (NOT a canonical-link violation per W2023 Prop 3.1 — see Methodology/Correctness row for the corrected framing)
 
 (SyntheticDiD `placebo_effects` → `variance_effects` rename moved to Tier B — the user-facing field rename + one-release deprecation alias is too large for ≤1 day / ≤3 CI rounds.)
 

From 54efaea6d0b2dc95b5bc2ff192fe4684df67fe5b Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 16 May 2026 08:55:51 -0400
Subject: [PATCH 4/4] Address PR #453 R2 review (1 P3, isolate heavy-rejection
 test branch)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P3 — `test_enumerate_vertices_warns_on_heavy_rejection` previously used
a fixture (X_tilde with 2 unique row types, 5 moments → all 3-row bases
singular) where the assertion could pass via the `exhausted` warning
instead of the intended `heavily constrained` branch. The branch was
effectively untested.

Rewrite the fixture: 5 moments × 1 nuisance column, C(5,2)=10 bases. By
design, 6 bases trip LinAlgError (pairs among the singular-X_tilde
indices) and 4 bases produce feasible vertices (each pairs a positive
X_tilde with the unique negative X_tilde at index 4). 60% rejection rate
hits the heavily-constrained branch specifically, not exhaustion.

Switched to `pytest.warns(RuntimeWarning, match="heavily constrained")`
so the test now fails if the message changes or if the wrong branch
fires. Added a `len(vertices) >= 1` assertion to guard against the
fixture inadvertently producing the exhausted-branch outcome.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_methodology_honest_did.py | 37 +++++++++++-----------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/tests/test_methodology_honest_did.py b/tests/test_methodology_honest_did.py
index f430f3f2..2f378a31 100644
--- a/tests/test_methodology_honest_did.py
+++ b/tests/test_methodology_honest_did.py
@@ -517,32 +517,23 @@ def test_enumerate_vertices_warns_on_exhausted_search(self):
         assert vertices == []
 
     def test_enumerate_vertices_warns_on_heavy_rejection(self):
-        """Mixed-basis path: a partially-singular X_tilde produces some
-        feasible vertices but rejects >= 50% of bases for LinAlgError.
-        The warning helps users see that the recovered vertices came from
-        a numerically fragile enumeration."""
+        """Mixed-basis path: 5 moments, 1 nuisance column. C(5, 2) = 10
+        bases. By design, 6 bases hit LinAlgError (the singular pairs
+        among indices {0,1,2,3} that share aligned nuisance/sigma values)
+        and 4 bases produce feasible vertices (the (i, 4) pairs that pair
+        a positive-X_tilde row with the unique negative-X_tilde row at
+        index 4). 60% rejection rate trips the `heavily constrained`
+        branch specifically, not the exhausted branch."""
         from diff_diff.honest_did import _enumerate_vertices
 
-        # 5 moments, 2 nuisance columns. C(5, 3) = 10 bases. Construct
-        # X_tilde so that ~6 of 10 bases have rank-deficient A_sys.
-        X_tilde = np.array([
-            [1.0, 0.0],
-            [1.0, 0.0],
-            [1.0, 0.0],
-            [0.0, 1.0],
-            [0.0, 1.0],
-        ])
-        sigma_tilde_diag = np.array([1.0, 1.0, 1.0, 1.0, 1.0])
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always", RuntimeWarning)
+        X_tilde = np.array([[1.0], [1.0], [1.0], [2.0], [-1.0]])
+        sigma_tilde_diag = np.array([1.0, 1.0, 1.0, 2.0, 1.0])
+        with pytest.warns(RuntimeWarning, match="heavily constrained"):
             vertices = _enumerate_vertices(X_tilde, sigma_tilde_diag, n_moments=5)
-        heavy = [w for w in caught if "heavily constrained" in str(w.message)]
-        # If exhaustion fired instead, that's also a valid outcome — but the
-        # construction is calibrated for the heavy-rejection branch
-        exhausted = [w for w in caught if "exhausted" in str(w.message)]
-        assert heavy or exhausted, (
-            f"Expected heavily-constrained or exhausted warning; got "
-            f"{[str(w.message) for w in caught]}, vertices={len(vertices)}"
+        assert len(vertices) >= 1, (
+            f"Heavy-rejection construction must still produce some feasible "
+            f"vertices (otherwise the exhausted branch fires); got "
+            f"{len(vertices)} vertices."
         )
 
     def test_enumerate_vertices_quiet_on_healthy_enumeration(self):