PolicyEngine · MaxGhenis · Apr 17, 2026 · Apr 17, 2026
diff --git a/changelog.d/imputer-weights.fixed.md b/changelog.d/imputer-weights.fixed.md
@@ -0,0 +1 @@
+Fixed `Imputer.fit(weight_col=...)` silently discarding weights (#4). Previously, weights were used only as bootstrap-resample probabilities over `X_train`, with the resampled data then fed unweighted into the underlying estimator; effective sample size shrank, rare donors were dropped, and variance was inflated relative to the correct weighted estimator. The base `Imputer.fit` now threads `sample_weight` through to each learner's native weighted-fit API: `RandomForestQuantileRegressor.fit(sample_weight=...)`, `sm.WLS` (instead of `sm.OLS`), `LogisticRegression.fit(sample_weight=...)`, `RandomForestClassifier.fit(sample_weight=...)`, and StatMatch's `NND.hotdeck` via `weight.don`. Models that do not support weighted fit (`QuantReg`, `MDN`) now raise `NotImplementedError` rather than silently ignoring weights. NaN weights are now rejected explicitly (previously `(weights <= 0).any()` returned `False` on NaN and let the NaN propagate into `.sample()` probabilities).
diff --git a/microimpute/models/imputer.py b/microimpute/models/imputer.py
@@ -271,9 +271,21 @@ def fit(
             weights = X_train[weight_col]
         elif weight_col is not None and isinstance(weight_col, np.ndarray):
             weights = pd.Series(weight_col, index=X_train.index)
+        elif weight_col is not None and isinstance(weight_col, pd.Series):
+            weights = weight_col.reindex(X_train.index)
 
-        if weights is not None and (weights <= 0).any():
-            raise ValueError("Weights must be positive")
+        if weights is not None:
+            # Check for NaN AND non-positive values together. Previously only
+            # (weights <= 0).any() was checked, which returns False for NaN
+            # weights — those then propagated into .sample() as NaN
+            # probabilities or corrupted sample_weight passed to learners.
+            weights_arr = np.asarray(weights, dtype=float)
+            invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0)
+            if invalid_mask.any():
+                raise ValueError(
+                    "Weights must be positive and finite; found "
+                    f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
+                )
 
         # Identify target types BEFORE preprocessing
         self.identify_target_types(X_train, imputed_variables, not_numeric_categorical)
@@ -284,21 +296,28 @@ def fit(
             )
         )
 
-        if weights is not None:
-            weights_normalized = weights / weights.sum()
-            X_train = X_train.sample(
-                n=len(X_train),
-                replace=True,
-                weights=weights_normalized,
-                random_state=self.seed,
-            ).reset_index(drop=True)
-
         # Save predictors and imputed variables
         self.predictors = predictors
         self.imputed_variables = imputed_variables
         self.imputed_vars_dummy_info = imputed_vars_dummy_info
         self.original_predictors = original_predictors
 
+        # Pass sample_weight through to the subclass so it can use each
+        # learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all
+        # support sample_weight). This replaces the previous bootstrap
+        # resample, which silently discarded weights for the underlying
+        # estimator and inflated variance / shrank effective sample size.
+        sample_weight = None
+        if weights is not None:
+            sample_weight = np.asarray(weights_arr, dtype=float)
+            # Reindex if preprocess_data_types changed the row ordering
+            # (it currently does not, but guard against future drift).
+            if len(sample_weight) != len(X_train):
+                raise RuntimeError(
+                    "Internal error: sample_weight length no longer matches "
+                    "X_train after preprocessing"
+                )
+
         # Defer actual training to subclass with all parameters
         fitted_model = self._fit(
             X_train,
@@ -309,6 +328,7 @@ def fit(
             boolean_targets=self.boolean_targets,
             numeric_targets=self.numeric_targets,
             constant_targets=self.constant_targets,
+            sample_weight=sample_weight,
             **kwargs,
         )
         return fitted_model

diff --git a/microimpute/models/matching.py b/microimpute/models/matching.py
@@ -449,6 +449,7 @@ def _fit(
         numeric_targets: Optional[List[str]] = None,
         constant_targets: Optional[Dict[str, Dict]] = None,
         tune_hyperparameters: bool = False,
+        sample_weight: Optional[np.ndarray] = None,
         **matching_kwargs: Any,
     ) -> MatchingResults:
         """Fit the matching model by storing the donor data and variable names.
@@ -457,6 +458,11 @@ def _fit(
             X_train: DataFrame containing the donor data.
             predictors: List of column names to use as predictors.
             imputed_variables: List of column names to impute.
+            sample_weight: Optional per-row sample weights for the donor
+                dataset. When provided, weights are passed to R StatMatch's
+                ``NND.hotdeck`` via ``weight.don`` so that donor records are
+                matched in proportion to their survey weights rather than
+                uniformly.
             matching_kwargs: Additional keyword arguments for hyperparameter
                 tuning of the matching function.
 
@@ -468,6 +474,13 @@ def _fit(
         """
         try:
             self.donor_data = X_train.copy()
+            if sample_weight is not None:
+                # Attach donor weights to the matching hyperparameters so
+                # they're forwarded into the StatMatch R call (weight.don).
+                matching_kwargs = {
+                    **matching_kwargs,
+                    "donor_sample_weight": np.asarray(sample_weight, dtype=float),
+                }
 
             if tune_hyperparameters:
                 self.logger.info("Tuning hyperparameters for the matching model")

diff --git a/microimpute/models/mdn.py b/microimpute/models/mdn.py
@@ -926,6 +926,7 @@ def _fit(
         numeric_targets: Optional[List[str]] = None,
         constant_targets: Optional[Dict[str, Dict]] = None,
         tune_hyperparameters: bool = False,
+        sample_weight: Optional[np.ndarray] = None,
         **kwargs: Any,
     ) -> Union[MDNResults, Tuple[MDNResults, Dict[str, Any]]]:
         """Fit the MDN model to the training data.
@@ -940,12 +941,22 @@ def _fit(
             numeric_targets: List of numeric target names.
             constant_targets: Dict of constant target info.
             tune_hyperparameters: If True, tune hyperparameters before fitting.
+            sample_weight: Optional per-row sample weights. The underlying
+                pytorch_tabular MDN implementation does not accept sample
+                weights; when provided, the model raises
+                ``NotImplementedError`` so callers do not silently get an
+                unweighted fit.
             **kwargs: Additional parameters.
 
         Returns:
             MDNResults instance with fitted models.
             If tune_hyperparameters=True, returns (MDNResults, best_params).
         """
+        if sample_weight is not None:
+            raise NotImplementedError(
+                "MDN does not yet support sample weights. Use QRF, OLS, or "
+                "Matching for weighted imputation."
+            )
         try:
             best_params = None
 

diff --git a/microimpute/models/ols.py b/microimpute/models/ols.py
@@ -31,6 +31,7 @@ def fit(
         y: pd.Series,
         var_type: str,
         categories: List = None,
+        sample_weight: Optional[np.ndarray] = None,
         **lr_kwargs: Any,
     ) -> None:
         """Fit logistic regression for categorical/boolean target.
@@ -71,7 +72,10 @@ def fit(
         }
 
         self.classifier = LogisticRegression(**classifier_params)
-        self.classifier.fit(X, y_encoded)
+        fit_kwargs = {}
+        if sample_weight is not None:
+            fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float)
+        self.classifier.fit(X, y_encoded, **fit_kwargs)
 
     def predict(
         self,
@@ -137,11 +141,26 @@ def __init__(self, seed: int, logger):
         self.model = None
         self.output_column = None
 
-    def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None:
-        """Fit OLS model."""
+    def fit(
+        self,
+        X: pd.DataFrame,
+        y: pd.Series,
+        sample_weight: Optional[np.ndarray] = None,
+        **kwargs,
+    ) -> None:
+        """Fit OLS (or WLS when sample_weight is provided).
+
+        When ``sample_weight`` is provided, uses ``statsmodels.api.WLS`` to
+        perform a genuine weighted least-squares fit rather than ignoring
+        the weights.
+        """
         self.output_column = y.name
         X_with_const = sm.add_constant(X)
-        self.model = sm.OLS(y, X_with_const).fit()
+        if sample_weight is not None:
+            weights = np.asarray(sample_weight, dtype=float)
+            self.model = sm.WLS(y, X_with_const, weights=weights).fit()
+        else:
+            self.model = sm.OLS(y, X_with_const).fit()
         self.scale = self.model.scale
 
     def predict(self, X: pd.DataFrame) -> np.ndarray:
@@ -431,6 +450,7 @@ def _fit(
         boolean_targets: Optional[Dict[str, Dict]] = None,
         numeric_targets: Optional[List[str]] = None,
         constant_targets: Optional[Dict[str, Dict]] = None,
+        sample_weight: Optional[np.ndarray] = None,
         **kwargs: Any,
     ) -> OLSResults:
         """Fit the OLS model to the training data.
@@ -439,6 +459,9 @@ def _fit(
             X_train: DataFrame containing the training data.
             predictors: List of column names to use as predictors.
             imputed_variables: List of column names to impute.
+            sample_weight: Optional per-row sample weights, threaded through
+                to ``sm.WLS`` (for numeric targets) or
+                ``LogisticRegression.fit`` (for categorical/boolean).
 
         Returns:
             The fitted model instance.
@@ -476,6 +499,7 @@ def _fit(
                         Y,
                         var_type=categorical_targets[variable]["type"],
                         categories=categorical_targets[variable].get("categories"),
+                        sample_weight=sample_weight,
                         **kwargs,
                     )
                     self.logger.info(
@@ -484,14 +508,22 @@ def _fit(
                 elif variable in (boolean_targets or {}):
                     # Use logistic regression for boolean targets
                     model = _LogisticRegressionModel(seed=self.seed, logger=self.logger)
-                    model.fit(X_train[predictors], Y, var_type="boolean", **kwargs)
+                    model.fit(
+                        X_train[predictors],
+                        Y,
+                        var_type="boolean",
+                        sample_weight=sample_weight,
+                        **kwargs,
+                    )
                     self.logger.info(
                         f"Logistic regression fitted for boolean variable {variable}"
                     )
                 else:
                     # Use OLS for numeric targets
                     model = _OLSModel(seed=self.seed, logger=self.logger)
-                    model.fit(X_train[predictors], Y, **kwargs)
+                    model.fit(
+                        X_train[predictors], Y, sample_weight=sample_weight, **kwargs
+                    )
                     self.logger.info(
                         f"OLS regression fitted for numeric variable {variable}"
                     )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Fixed `Imputer.fit(weight_col=...)` silently discarding weights (#4). Previously, weights were used only as bootstrap-resample probabilities over `X_train`, with the resampled data then fed unweighted into the underlying estimator; effective sample size shrank, rare donors were dropped, and variance was inflated relative to the correct weighted estimator. The base `Imputer.fit` now threads `sample_weight` through to each learner's native weighted-fit API: `RandomForestQuantileRegressor.fit(sample_weight=...)`, `sm.WLS` (instead of `sm.OLS`), `LogisticRegression.fit(sample_weight=...)`, `RandomForestClassifier.fit(sample_weight=...)`, and StatMatch's `NND.hotdeck` via `weight.don`. Models that do not support weighted fit (`QuantReg`, `MDN`) now raise `NotImplementedError` rather than silently ignoring weights. NaN weights are now rejected explicitly (previously `(weights <= 0).any()` returned `False` on NaN and let the NaN propagate into `.sample()` probabilities).