diff --git a/changelog.d/quantreg-dead-code-and-vectorize.fixed.md b/changelog.d/quantreg-dead-code-and-vectorize.fixed.md new file mode 100644 index 0000000..cd393dd --- /dev/null +++ b/changelog.d/quantreg-dead-code-and-vectorize.fixed.md @@ -0,0 +1 @@ +Removed dead `random_quantile` RNG code from `QuantReg._fit` (#11) — the RNG was initialised and never used while `q` was hardcoded to 0.5, and the log message "Fitting quantile regression for random quantile" was misleading. Also vectorised the per-row random-quantile path in `QuantRegResults._predict`: the previous implementation allocated an object-dtype DataFrame and wrote numeric predictions into it via `result_df.loc[idx, variable] = ...` inside a Python loop (quadratic in rows x variables, silently demoting numeric output to `object`, contributing to OOM pressure in issue #96). The new implementation uses `np.column_stack` + per-row index to select predictions in one pass, returning a numeric dtype. diff --git a/microimpute/models/quantreg.py b/microimpute/models/quantreg.py index a808303..01da3b4 100644 --- a/microimpute/models/quantreg.py +++ b/microimpute/models/quantreg.py @@ -181,22 +181,35 @@ def _predict( imputed_df[variable] = predictions random_q_imputations[q] = imputed_df - # Create a final dataframe to hold the random quantile imputed values - result_df = pd.DataFrame( - index=random_q_imputations[quantiles[0]].index, - columns=self.imputed_variables, - ) - - # Sample one quantile per row + # Vectorised per-row random quantile selection. The + # previous implementation allocated an object-dtype + # DataFrame via pd.DataFrame(index=..., columns=...) + # and assigned per-row via result_df.loc[idx, var], + # which is O(rows * vars) in Python-level attribute + # lookups and silently demotes numeric predictions to + # object dtype — a major contributor to issue #96 + # (OOM with many variables). rng = np.random.default_rng(self.seed) - for idx in result_df.index: - sampled_q = rng.choice(quantiles) + index = random_q_imputations[quantiles[0]].index + n_rows = len(index) + quantiles_arr = np.asarray(quantiles) + # Sampled quantile index per row. + sampled_idx = rng.integers(0, len(quantiles_arr), size=n_rows) - # For all variables, use the sampled quantile for this row - for variable in self.imputed_variables: - result_df.loc[idx, variable] = random_q_imputations[ - sampled_q - ].loc[idx, variable] + result_df = pd.DataFrame(index=index) + for variable in self.imputed_variables: + # Stack predictions for this variable across all + # quantiles into an (n_rows, n_quantiles) array, + # then select per-row with np.take_along_axis so + # each row uses its own sampled quantile. + stacked = np.column_stack( + [ + np.asarray(random_q_imputations[q][variable].values) + for q in quantiles + ] + ) + row_idx = np.arange(n_rows) + result_df[variable] = stacked[row_idx, sampled_idx] # Add to imputations dictionary using the mean quantile as key imputations[mean_quantile] = result_df @@ -364,11 +377,14 @@ def _fit( self.models[variable][q] = sm.QuantReg(Y, X_with_const).fit(q=q) self.logger.info(f"Model for q={q} fitted successfully") else: - random_generator = np.random.default_rng(self.seed) + # Default to fitting a single median model when no + # quantiles were supplied. The previous implementation + # created ``random_generator = np.random.default_rng(seed)`` + # and logged "random quantile" — but the generator was + # never used and q was hardcoded to 0.5. That was dead + # code and misleading logging. q = 0.5 - self.logger.info( - f"Fitting quantile regression for random quantile {q:.4f}" - ) + self.logger.info(f"Fitting quantile regression for q={q}") for variable in imputed_variables: self.logger.info(f"Imputing variable {variable}") # Handle constant targets diff --git a/tests/test_models/test_quantreg.py b/tests/test_models/test_quantreg.py index 826cc78..35a2468 100644 --- a/tests/test_models/test_quantreg.py +++ b/tests/test_models/test_quantreg.py @@ -322,3 +322,39 @@ def test_quantreg_prediction_quality(diabetes_data: pd.DataFrame) -> None: # Check that predictions have reasonable variance assert np.var(pred_values) > 0, "QuantReg predictions have no variance" + + +def test_quantreg_random_quantile_sample_returns_numeric_dtype() -> None: + """Regression test for #11: the per-row random-quantile path previously + allocated an object-dtype DataFrame and wrote numeric predictions + into it via .loc, silently demoting the result to object. After + vectorisation the output must be a numeric dtype.""" + import numpy as np + import pandas as pd + + rng = np.random.default_rng(0) + n = 100 + data = pd.DataFrame( + { + "x": rng.normal(size=n), + "y": rng.normal(size=n) + rng.normal(size=n), + } + ) + + model = QuantReg() + fitted = model.fit(data, ["x"], ["y"], quantiles=[0.1, 0.5, 0.9]) + + test = pd.DataFrame({"x": rng.normal(size=20)}) + preds = fitted.predict(test, random_quantile_sample=True) + + # With random_quantile_sample=True and no quantiles at predict time, + # the implementation keys the result by the mean of the fitted + # quantiles. + mean_q = np.mean([0.1, 0.5, 0.9]) + assert mean_q in preds + out = preds[mean_q] + assert pd.api.types.is_numeric_dtype(out["y"]), ( + f"Vectorised random-quantile path must return a numeric dtype; " + f"got {out['y'].dtype}" + ) + assert np.all(np.isfinite(out["y"].values))