Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/imputer-weights.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed `Imputer.fit(weight_col=...)` silently discarding weights (#4). Previously, weights were used only as bootstrap-resample probabilities over `X_train`, with the resampled data then fed unweighted into the underlying estimator; effective sample size shrank, rare donors were dropped, and variance was inflated relative to the correct weighted estimator. The base `Imputer.fit` now threads `sample_weight` through to each learner's native weighted-fit API: `RandomForestQuantileRegressor.fit(sample_weight=...)`, `sm.WLS` (instead of `sm.OLS`), `LogisticRegression.fit(sample_weight=...)`, `RandomForestClassifier.fit(sample_weight=...)`, and StatMatch's `NND.hotdeck` via `weight.don`. Models that do not support weighted fit (`QuantReg`, `MDN`) now raise `NotImplementedError` rather than silently ignoring weights. NaN weights are now rejected explicitly (previously `(weights <= 0).any()` returned `False` on NaN and let the NaN propagate into `.sample()` probabilities).
42 changes: 31 additions & 11 deletions microimpute/models/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,21 @@ def fit(
weights = X_train[weight_col]
elif weight_col is not None and isinstance(weight_col, np.ndarray):
weights = pd.Series(weight_col, index=X_train.index)
elif weight_col is not None and isinstance(weight_col, pd.Series):
weights = weight_col.reindex(X_train.index)

if weights is not None and (weights <= 0).any():
raise ValueError("Weights must be positive")
if weights is not None:
# Check for NaN AND non-positive values together. Previously only
# (weights <= 0).any() was checked, which returns False for NaN
# weights — those then propagated into .sample() as NaN
# probabilities or corrupted sample_weight passed to learners.
weights_arr = np.asarray(weights, dtype=float)
invalid_mask = np.isnan(weights_arr) | (weights_arr <= 0)
if invalid_mask.any():
raise ValueError(
"Weights must be positive and finite; found "
f"{int(invalid_mask.sum())} non-positive or NaN weight(s)"
)

# Identify target types BEFORE preprocessing
self.identify_target_types(X_train, imputed_variables, not_numeric_categorical)
Expand All @@ -284,21 +296,28 @@ def fit(
)
)

if weights is not None:
weights_normalized = weights / weights.sum()
X_train = X_train.sample(
n=len(X_train),
replace=True,
weights=weights_normalized,
random_state=self.seed,
).reset_index(drop=True)

# Save predictors and imputed variables
self.predictors = predictors
self.imputed_variables = imputed_variables
self.imputed_vars_dummy_info = imputed_vars_dummy_info
self.original_predictors = original_predictors

# Pass sample_weight through to the subclass so it can use each
# learner's native weighted-fit API (QRF, OLS→WLS, logistic, RFC all
# support sample_weight). This replaces the previous bootstrap
# resample, which silently discarded weights for the underlying
# estimator and inflated variance / shrank effective sample size.
sample_weight = None
if weights is not None:
sample_weight = np.asarray(weights_arr, dtype=float)
# Reindex if preprocess_data_types changed the row ordering
# (it currently does not, but guard against future drift).
if len(sample_weight) != len(X_train):
raise RuntimeError(
"Internal error: sample_weight length no longer matches "
"X_train after preprocessing"
)

# Defer actual training to subclass with all parameters
fitted_model = self._fit(
X_train,
Expand All @@ -309,6 +328,7 @@ def fit(
boolean_targets=self.boolean_targets,
numeric_targets=self.numeric_targets,
constant_targets=self.constant_targets,
sample_weight=sample_weight,
**kwargs,
)
return fitted_model
Expand Down
13 changes: 13 additions & 0 deletions microimpute/models/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ def _fit(
numeric_targets: Optional[List[str]] = None,
constant_targets: Optional[Dict[str, Dict]] = None,
tune_hyperparameters: bool = False,
sample_weight: Optional[np.ndarray] = None,
**matching_kwargs: Any,
) -> MatchingResults:
"""Fit the matching model by storing the donor data and variable names.
Expand All @@ -457,6 +458,11 @@ def _fit(
X_train: DataFrame containing the donor data.
predictors: List of column names to use as predictors.
imputed_variables: List of column names to impute.
sample_weight: Optional per-row sample weights for the donor
dataset. When provided, weights are passed to R StatMatch's
``NND.hotdeck`` via ``weight.don`` so that donor records are
matched in proportion to their survey weights rather than
uniformly.
matching_kwargs: Additional keyword arguments for hyperparameter
tuning of the matching function.

Expand All @@ -468,6 +474,13 @@ def _fit(
"""
try:
self.donor_data = X_train.copy()
if sample_weight is not None:
# Attach donor weights to the matching hyperparameters so
# they're forwarded into the StatMatch R call (weight.don).
matching_kwargs = {
**matching_kwargs,
"donor_sample_weight": np.asarray(sample_weight, dtype=float),
}

if tune_hyperparameters:
self.logger.info("Tuning hyperparameters for the matching model")
Expand Down
11 changes: 11 additions & 0 deletions microimpute/models/mdn.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,7 @@ def _fit(
numeric_targets: Optional[List[str]] = None,
constant_targets: Optional[Dict[str, Dict]] = None,
tune_hyperparameters: bool = False,
sample_weight: Optional[np.ndarray] = None,
**kwargs: Any,
) -> Union[MDNResults, Tuple[MDNResults, Dict[str, Any]]]:
"""Fit the MDN model to the training data.
Expand All @@ -940,12 +941,22 @@ def _fit(
numeric_targets: List of numeric target names.
constant_targets: Dict of constant target info.
tune_hyperparameters: If True, tune hyperparameters before fitting.
sample_weight: Optional per-row sample weights. The underlying
pytorch_tabular MDN implementation does not accept sample
weights; when provided, the model raises
``NotImplementedError`` so callers do not silently get an
unweighted fit.
**kwargs: Additional parameters.

Returns:
MDNResults instance with fitted models.
If tune_hyperparameters=True, returns (MDNResults, best_params).
"""
if sample_weight is not None:
raise NotImplementedError(
"MDN does not yet support sample weights. Use QRF, OLS, or "
"Matching for weighted imputation."
)
try:
best_params = None

Expand Down
44 changes: 38 additions & 6 deletions microimpute/models/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def fit(
y: pd.Series,
var_type: str,
categories: List = None,
sample_weight: Optional[np.ndarray] = None,
**lr_kwargs: Any,
) -> None:
"""Fit logistic regression for categorical/boolean target.
Expand Down Expand Up @@ -71,7 +72,10 @@ def fit(
}

self.classifier = LogisticRegression(**classifier_params)
self.classifier.fit(X, y_encoded)
fit_kwargs = {}
if sample_weight is not None:
fit_kwargs["sample_weight"] = np.asarray(sample_weight, dtype=float)
self.classifier.fit(X, y_encoded, **fit_kwargs)

def predict(
self,
Expand Down Expand Up @@ -137,11 +141,26 @@ def __init__(self, seed: int, logger):
self.model = None
self.output_column = None

def fit(self, X: pd.DataFrame, y: pd.Series, **kwargs) -> None:
"""Fit OLS model."""
def fit(
self,
X: pd.DataFrame,
y: pd.Series,
sample_weight: Optional[np.ndarray] = None,
**kwargs,
) -> None:
"""Fit OLS (or WLS when sample_weight is provided).

When ``sample_weight`` is provided, uses ``statsmodels.api.WLS`` to
perform a genuine weighted least-squares fit rather than ignoring
the weights.
"""
self.output_column = y.name
X_with_const = sm.add_constant(X)
self.model = sm.OLS(y, X_with_const).fit()
if sample_weight is not None:
weights = np.asarray(sample_weight, dtype=float)
self.model = sm.WLS(y, X_with_const, weights=weights).fit()
else:
self.model = sm.OLS(y, X_with_const).fit()
self.scale = self.model.scale

def predict(self, X: pd.DataFrame) -> np.ndarray:
Expand Down Expand Up @@ -431,6 +450,7 @@ def _fit(
boolean_targets: Optional[Dict[str, Dict]] = None,
numeric_targets: Optional[List[str]] = None,
constant_targets: Optional[Dict[str, Dict]] = None,
sample_weight: Optional[np.ndarray] = None,
**kwargs: Any,
) -> OLSResults:
"""Fit the OLS model to the training data.
Expand All @@ -439,6 +459,9 @@ def _fit(
X_train: DataFrame containing the training data.
predictors: List of column names to use as predictors.
imputed_variables: List of column names to impute.
sample_weight: Optional per-row sample weights, threaded through
to ``sm.WLS`` (for numeric targets) or
``LogisticRegression.fit`` (for categorical/boolean).

Returns:
The fitted model instance.
Expand Down Expand Up @@ -476,6 +499,7 @@ def _fit(
Y,
var_type=categorical_targets[variable]["type"],
categories=categorical_targets[variable].get("categories"),
sample_weight=sample_weight,
**kwargs,
)
self.logger.info(
Expand All @@ -484,14 +508,22 @@ def _fit(
elif variable in (boolean_targets or {}):
# Use logistic regression for boolean targets
model = _LogisticRegressionModel(seed=self.seed, logger=self.logger)
model.fit(X_train[predictors], Y, var_type="boolean", **kwargs)
model.fit(
X_train[predictors],
Y,
var_type="boolean",
sample_weight=sample_weight,
**kwargs,
)
self.logger.info(
f"Logistic regression fitted for boolean variable {variable}"
)
else:
# Use OLS for numeric targets
model = _OLSModel(seed=self.seed, logger=self.logger)
model.fit(X_train[predictors], Y, **kwargs)
model.fit(
X_train[predictors], Y, sample_weight=sample_weight, **kwargs
)
self.logger.info(
f"OLS regression fitted for numeric variable {variable}"
)
Expand Down
Loading
Loading