Skip to content

Commit 4094650

Browse files
0.26.28 stats
1 parent 3532b3c commit 4094650

2 files changed

Lines changed: 60 additions & 14 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.26.27"
10+
version = "0.26.28"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/utils/stats.py

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from statsmodels.formula.api import ols
1010
from statsmodels.stats.outliers_influence import variance_inflation_factor
1111
import statsmodels.api as sm
12+
from sklearn.preprocessing import OneHotEncoder
1213

1314

1415
def cov_to_cor(covariance_matrix) -> np.ndarray:
@@ -722,19 +723,64 @@ def compute_coefficients_table(model, X_encoded, y, vif_table=None) -> pd.DataFr
722723
# 4) Tolerance & 5) VIF
723724
# -------------------------------------------------------------------
724725
if vif_table is None:
725-
results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r})
726+
vif_table = vif(X_encoded)
727+
# results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r})
728+
# Get the VIF for this predictor
729+
vif_row = vif_table.loc[vif_table["feature"] == var, "VIF"]
730+
if len(vif_row) == 0:
731+
var_vif = np.nan
726732
else:
727-
# Get the VIF for this predictor
728-
vif_row = vif_table.loc[vif_table["feature"] == var, "VIF"]
729-
if len(vif_row) == 0:
730-
var_vif = np.nan
731-
else:
732-
var_vif = vif_row.iloc[0]
733-
if var_vif <= 0 or np.isnan(var_vif):
734-
tolerance = np.nan
735-
else:
736-
tolerance = 1.0 / var_vif
737-
# Collect results
738-
results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r, "Tolerance": tolerance, "VIF": var_vif})
733+
var_vif = vif_row.iloc[0]
734+
if var_vif <= 0 or np.isnan(var_vif):
735+
tolerance = np.nan
736+
else:
737+
tolerance = 1.0 / var_vif
738+
# Collect results
739+
results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r, "Tolerance": tolerance, "VIF": var_vif})
739740

740741
return pd.DataFrame(results)
742+
743+
744+
def preprocess_df_for_ols(df, independent_var_columns, target_col, categorical_cols=None) -> tuple:
    """
    Preprocess a DataFrame for fitting an OLS regression model.

    Numeric predictors are coerced with ``pd.to_numeric`` and missing values
    are imputed with the column median. Categorical predictors are one-hot
    encoded (first level dropped) and appended to the numeric predictors.
    A constant term is added via ``sm.add_constant``.

    Args:
        df (pd.DataFrame): Input DataFrame containing the data.
        independent_var_columns (list of str): List of names for predictor columns.
            Names that also appear in ``categorical_cols`` are not coerced to
            numeric; they only enter the design matrix in one-hot encoded form.
        target_col (str): Name of the target/dependent variable column.
        categorical_cols (list of str or str, optional): Names of the columns
            to one-hot encode. Defaults to ``["type"]``, which was the
            previously hard-coded value. Pass an empty list to skip the
            categorical encoding entirely.

    Returns:
        X_encoded (pd.DataFrame): Encoded predictors with a constant term.
        y (pd.Series): Target variable. Non-numeric and missing target
            values are replaced by 0.

    Raises:
        KeyError: If a requested categorical column is not present in ``df``.
        ValueError: If the target is not 1-dimensional after squeezing, or
            if predictors and target end up with different numbers of rows.
    """
    # Backward-compatible default: earlier versions always encoded "type".
    if categorical_cols is None:
        categorical_cols = ["type"]
    elif isinstance(categorical_cols, str):
        categorical_cols = [categorical_cols]
    else:
        categorical_cols = list(categorical_cols)

    missing_cols = [col for col in categorical_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Categorical column(s) {missing_cols} not found in the DataFrame.")

    # Ensure the target column is numeric and 1D
    y = pd.to_numeric(df[target_col], errors="coerce").fillna(0).squeeze()
    if y.ndim != 1:
        raise ValueError(f"Target column '{target_col}' must be 1-dimensional.")

    # Coerce only the non-categorical predictors: pushing a categorical column
    # through pd.to_numeric would turn it into an all-NaN column that the
    # median imputation cannot repair.
    categorical_set = set(categorical_cols)
    numeric_cols = [col for col in independent_var_columns if col not in categorical_set]
    X = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
    # Impute missing values
    X = X.fillna(X.median())

    if categorical_cols:
        encoder = OneHotEncoder(drop="first", sparse_output=False)
        X_categorical_encoded = encoder.fit_transform(df[categorical_cols])

        # Convert encoded data into a DataFrame; reuse df.index so that the
        # concat below aligns row by row with the numeric part.
        X_categorical_encoded_df = pd.DataFrame(
            X_categorical_encoded,
            columns=encoder.get_feature_names_out(categorical_cols),
            index=df.index,
        )

        # Combine numeric and categorical (encoded) parts
        X_encoded = pd.concat([X, X_categorical_encoded_df], axis=1)
    else:
        # Nothing to encode: the encoder would reject a zero-column input.
        X_encoded = X

    # Add a constant term
    X_encoded = sm.add_constant(X_encoded)

    # Ensure alignment between X_encoded and y
    if X_encoded.shape[0] != y.shape[0]:
        raise ValueError(f"Mismatch in rows: predictors (X_encoded) have {X_encoded.shape[0]} rows, " f"but target (y) has {y.shape[0]} rows.")

    return X_encoded, y

0 commit comments

Comments
 (0)