|
| 1 | +from sklearn.compose import ColumnTransformer |
| 2 | +from sklearn.impute import SimpleImputer |
| 3 | +from sklearn.model_selection import train_test_split |
| 4 | +from sklearn.pipeline import Pipeline |
| 5 | +from sklearn.preprocessing import OneHotEncoder, RobustScaler |
| 6 | +import numpy as np |
| 7 | +import pandas as pd |
| 8 | + |
| 9 | + |
def get_num_cols(df: pd.DataFrame) -> list:
    """
    Identifies numerical columns in a DataFrame.

    Columns are selected with ``df.select_dtypes(include=[np.number])``, i.e.
    integer and floating point columns (a float column containing NaN still
    counts as numerical). Boolean and object columns are not selected.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        list: The labels of the numerical columns, in the order in which they
            appear in ``df``. Empty if there are none.

    Example:
        >>> import pandas as pd
        >>> import numpy as np
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, np.nan, 35],
        ...     "gender": ["M", "F", "M", "F"],
        ...     "income": [50000, 60000, 55000, np.nan]
        ... })
        >>> get_num_cols(df)
        ['age', 'income']
    """
    return df.select_dtypes(include=[np.number]).columns.tolist()
| 35 | + |
| 36 | + |
def get_cat_cols(df: pd.DataFrame) -> list:
    """
    Identifies categorical columns in a DataFrame.

    A column is treated as categorical if it has object dtype (e.g., strings)
    or if every value in it is NaN, whatever its dtype. Each column label is
    returned at most once, even when a column satisfies both conditions.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        list: Column labels of the categorical columns: object-dtype columns
            first (in DataFrame order), followed by the remaining all-NaN
            columns (in DataFrame order).

    Example:
        >>> import pandas as pd
        >>> import numpy as np
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, np.nan, 35],
        ...     "gender": ["M", "F", "M", "F"],
        ...     "income": [50000, 60000, 55000, np.nan]
        ... })
        >>> get_cat_cols(df)
        ['gender']
    """
    object_cols = df.select_dtypes(include=["object"]).columns.tolist()
    # One vectorised pass over the frame instead of one Series lookup per column.
    all_missing_mask = df.isna().all(axis=0).to_numpy(dtype=bool)
    all_missing_cols = df.columns[all_missing_mask].tolist()
    # An all-NaN object column appears in both lists; dict.fromkeys removes the
    # duplicate while keeping first-seen order.
    return list(dict.fromkeys(object_cols + all_missing_cols))
| 62 | + |
| 63 | + |
def generic_preprocess_df(
    df: pd.DataFrame,
    target: str,
    imputer_num=SimpleImputer(strategy="mean"),
    imputer_cat=SimpleImputer(strategy="most_frequent"),
    encoder_cat=OneHotEncoder(categories="auto", drop=None, handle_unknown="ignore", sparse_output=False),
    scaler_num=RobustScaler(),
    test_size=0.2,
    random_state=42,
    shuffle=True,
    n_jobs=None,
) -> tuple:
    """
    Preprocesses a DataFrame by handling numerical and categorical features,
    splitting the data into training and testing sets, and applying transformations.

    This function performs the following steps:
    - Separates the target column from the features.
    - Identifies numerical and categorical columns (columns that are neither
      are dropped by the transformer).
    - Casts the categorical columns to strings.
    - Splits the data into training and testing sets.
    - Fits the imputers, encoder, and scaler on the training set only and
      applies the fitted transformations to both sets.

    Args:
        df (pd.DataFrame): The input DataFrame to preprocess. It is not modified.
        target (str): The name of the target column to predict.
        imputer_num (SimpleImputer, optional): Imputer for numerical columns.
            Defaults to `SimpleImputer(strategy="mean")`.
        imputer_cat (SimpleImputer, optional): Imputer for categorical columns.
            Defaults to `SimpleImputer(strategy="most_frequent")`.
        encoder_cat (OneHotEncoder, optional): Encoder for categorical columns.
            Defaults to `OneHotEncoder(categories="auto", drop=None,
            handle_unknown="ignore", sparse_output=False)`.
        scaler_num (RobustScaler, optional): Scaler for numerical columns.
            Defaults to `RobustScaler()`.
        test_size (float, optional): Proportion of the dataset to include in the test split.
            Defaults to 0.2.
        random_state (int, optional): Random seed for reproducibility. Defaults to 42.
        shuffle (bool, optional): Whether to shuffle the data before splitting. Defaults to True.
        n_jobs (int, optional): Number of jobs to run in parallel for the `ColumnTransformer`.
            Defaults to None.

    Returns:
        Tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
            A tuple containing:
            - X_train (np.ndarray): Transformed training feature set.
            - X_test (np.ndarray): Transformed testing feature set.
            - y_train (pd.Series): Training target values.
            - y_test (pd.Series): Testing target values.

    Raises:
        ValueError: If the input DataFrame is empty.
        ValueError: If the target column is not found in the DataFrame.

    Examples:
        >>> from spotpython.utils.preprocess import generic_preprocess_df
        >>> import numpy as np
        >>> import pandas as pd
        >>> from sklearn.impute import SimpleImputer
        >>> from sklearn.preprocessing import OneHotEncoder, RobustScaler
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, np.nan, 35],
        ...     "gender": ["M", "F", "M", "F"],
        ...     "income": [50000, 60000, 55000, np.nan],
        ...     "target": [1, 0, 1, 0]
        ... })
        >>> X_train, X_test, y_train, y_test = generic_preprocess_df(
        ...     df,
        ...     target="target",
        ...     imputer_num=SimpleImputer(strategy="mean"),
        ...     imputer_cat=SimpleImputer(strategy="most_frequent"),
        ...     encoder_cat=OneHotEncoder(),
        ...     scaler_num=RobustScaler(),
        ...     test_size=0.25,
        ...     random_state=42
        ... )
    """
    if df.empty:
        raise ValueError("The input DataFrame is empty.")
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in the DataFrame.")

    # drop() returns a new frame, so the string cast below does not touch `df`.
    X = df.drop(target, axis=1)
    y = df[target]

    # De-duplicate defensively: a column must be handed to the transformer once.
    cat_cols = list(dict.fromkeys(get_cat_cols(X)))
    # An all-NaN numeric column is reported by both helpers; treat it as
    # categorical only, so it is not routed to both pipelines after the cast.
    cat_lookup = set(cat_cols)
    num_cols = [col for col in get_num_cols(X) if col not in cat_lookup]

    if cat_cols:
        # NOTE(review): astype(str) turns missing values into the literal
        # string "nan", so `imputer_cat` never sees them as missing and "nan"
        # is encoded as its own category. Kept as-is to preserve existing
        # output; confirm whether this is intended.
        X[cat_cols] = X[cat_cols].astype(str)

    numerical_transformer = Pipeline(steps=[("imputer", imputer_num), ("scaler", scaler_num)])
    categorical_transformer = Pipeline(steps=[("imputer", imputer_cat), ("encoder", encoder_cat)])
    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", numerical_transformer, num_cols),
            ("categorical", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=0,
        n_jobs=n_jobs,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )
    # Fit on the training split only, then reuse the fitted statistics for the
    # test split so no information from the test data leaks into the fit.
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test
0 commit comments