|
9 | 9 | from statsmodels.formula.api import ols |
10 | 10 | from statsmodels.stats.outliers_influence import variance_inflation_factor |
11 | 11 | import statsmodels.api as sm |
| 12 | +from sklearn.preprocessing import OneHotEncoder |
12 | 13 |
|
13 | 14 |
|
14 | 15 | def cov_to_cor(covariance_matrix) -> np.ndarray: |
@@ -722,19 +723,64 @@ def compute_coefficients_table(model, X_encoded, y, vif_table=None) -> pd.DataFr |
722 | 723 | # 4) Tolerance & 5) VIF |
723 | 724 | # ------------------------------------------------------------------- |
724 | 725 | if vif_table is None: |
725 | | - results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r}) |
| 726 | + vif_table = vif(X_encoded) |
| 727 | + # results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r}) |
| 728 | + # Get the VIF for this predictor |
| 729 | + vif_row = vif_table.loc[vif_table["feature"] == var, "VIF"] |
| 730 | + if len(vif_row) == 0: |
| 731 | + var_vif = np.nan |
726 | 732 | else: |
727 | | - # Get the VIF for this predictor |
728 | | - vif_row = vif_table.loc[vif_table["feature"] == var, "VIF"] |
729 | | - if len(vif_row) == 0: |
730 | | - var_vif = np.nan |
731 | | - else: |
732 | | - var_vif = vif_row.iloc[0] |
733 | | - if var_vif <= 0 or np.isnan(var_vif): |
734 | | - tolerance = np.nan |
735 | | - else: |
736 | | - tolerance = 1.0 / var_vif |
737 | | - # Collect results |
738 | | - results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r, "Tolerance": tolerance, "VIF": var_vif}) |
| 733 | + var_vif = vif_row.iloc[0] |
| 734 | + if var_vif <= 0 or np.isnan(var_vif): |
| 735 | + tolerance = np.nan |
| 736 | + else: |
| 737 | + tolerance = 1.0 / var_vif |
| 738 | + # Collect results |
| 739 | + results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r, "Tolerance": tolerance, "VIF": var_vif}) |
739 | 740 |
|
740 | 741 | return pd.DataFrame(results) |
| 742 | + |
| 743 | + |
def preprocess_df_for_ols(df, independent_var_columns, target_col, categorical_cols=None) -> tuple:
    """
    Preprocess a DataFrame for fitting an OLS regression model.

    Numeric predictors are coerced with ``pd.to_numeric`` and missing values
    are imputed with the column median. Categorical predictors are one-hot
    encoded with the first level dropped. A constant term is then added via
    ``sm.add_constant``.

    Args:
        df (pd.DataFrame): Input DataFrame containing the data.
        independent_var_columns (list of str): Names of the predictor columns.
            Any name that also appears in ``categorical_cols`` is treated as
            categorical only and is not coerced to numeric.
        target_col (str): Name of the target/dependent variable column.
        categorical_cols (list of str, optional): Columns of ``df`` to one-hot
            encode. Defaults to ``["type"]``. Pass an empty list to skip
            categorical encoding entirely.

    Returns:
        tuple: ``(X_encoded, y)`` where ``X_encoded`` is a pd.DataFrame holding
        the numeric and encoded predictors plus the constant term, and ``y`` is
        the numeric target as a pd.Series.

    Raises:
        KeyError: If a requested column is not present in ``df``.
        ValueError: If the target is not 1-dimensional after squeezing, or if
            the predictor and target row counts differ.
    """
    if categorical_cols is None:
        categorical_cols = ["type"]
    else:
        categorical_cols = list(categorical_cols)

    # Ensure the target column is numeric and 1D.
    # NOTE: unparseable or missing target values are replaced with 0 rather
    # than dropped; callers that need different handling must clean ``df`` first.
    y = pd.to_numeric(df[target_col], errors="coerce").fillna(0).squeeze()
    if y.ndim != 1:
        raise ValueError(f"Target column '{target_col}' must be 1-dimensional.")

    # Keep categorical columns out of the numeric path: coercing string labels
    # with ``errors="coerce"`` yields an all-NaN column whose median is also
    # NaN, so the NaN column would survive next to its one-hot dummies.
    categorical_names = set(categorical_cols)
    numeric_cols = [col for col in independent_var_columns if col not in categorical_names]

    # Ensure predictors are numeric, then impute missing values with the median.
    X = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
    X = X.fillna(X.median())

    if categorical_cols:
        # drop="first" removes one level per feature so the dummies are not
        # perfectly collinear with the constant term added below.
        encoder = OneHotEncoder(drop="first", sparse_output=False)
        encoded_values = encoder.fit_transform(df[categorical_cols])

        # Reuse df.index so the encoded frame aligns row-for-row with X.
        encoded_df = pd.DataFrame(
            encoded_values,
            columns=encoder.get_feature_names_out(categorical_cols),
            index=df.index,
        )

        # Combine numeric and categorical (encoded) parts.
        X_encoded = pd.concat([X, encoded_df], axis=1)
    else:
        X_encoded = X

    # Add a constant term.
    X_encoded = sm.add_constant(X_encoded)

    # Ensure alignment between X_encoded and y.
    if X_encoded.shape[0] != y.shape[0]:
        raise ValueError(f"Mismatch in rows: predictors (X_encoded) have {X_encoded.shape[0]} rows, " f"but target (y) has {y.shape[0]} rows.")

    return X_encoded, y
0 commit comments