|
1 | 1 | import numpy as np |
2 | 2 | import matplotlib.pyplot as plt |
3 | 3 | import pandas as pd |
| 4 | +from sklearn.ensemble import GradientBoostingRegressor |
| 5 | +from sklearn.inspection import PartialDependenceDisplay |
| 6 | +from sklearn.model_selection import train_test_split |
4 | 7 |
|
5 | 8 |
|
6 | 9 | def randorient(k, p, xi): |
@@ -138,3 +141,74 @@ def screening(X, fun, xi, p, labels, range=None, print=False) -> pd.DataFrame: |
138 | 141 | plt.gca().tick_params(labelsize=10) |
139 | 142 | plt.grid(True) |
140 | 143 | plt.show() |
| 144 | + |
| 145 | + |
def _make_regressor(model):
    """Instantiate the regressor selected by class name, seeded with ``random_state=42``.

    Raises:
        ValueError: If ``model`` is not one of the supported class names.
    """
    if model == "GradientBoostingRegressor":
        return GradientBoostingRegressor(random_state=42)
    if model == "RandomForestRegressor":
        from sklearn.ensemble import RandomForestRegressor

        return RandomForestRegressor(random_state=42)
    if model == "DecisionTreeRegressor":
        from sklearn.tree import DecisionTreeRegressor

        return DecisionTreeRegressor(random_state=42)
    raise ValueError(f"Unsupported model: {model}")


def plot_all_partial_dependence(df, df_target, model="GradientBoostingRegressor", nrows=5, ncols=6, figsize=(20, 15)) -> None:
    """
    Generates Partial Dependence Plots (PDPs) for every feature in a DataFrame against a target variable,
    arranged in a grid.

    The selected model is fitted on an 80 % training split (``random_state=42``) of the data, and
    one PDP per column of ``df`` is drawn into its own subplot. Subplots that are not needed are
    removed from the figure. The figure is shown with ``plt.show()``.

    Args:
        df (pd.DataFrame): DataFrame containing the features.
        df_target (pd.Series): Series containing the target variable.
        model (str, optional): Name of the model class to use. Supported values are
            "GradientBoostingRegressor", "RandomForestRegressor" and "DecisionTreeRegressor".
            Defaults to "GradientBoostingRegressor".
        nrows (int, optional): Number of rows in the grid of subplots. Defaults to 5.
            If ``nrows * ncols`` is smaller than the number of features, the number of rows is
            increased (with a warning) so that every feature gets a subplot.
        ncols (int, optional): Number of columns in the grid of subplots. Defaults to 6.
        figsize (tuple, optional): Figure size (width, height) in inches. Defaults to (20, 15).

    Returns:
        None

    Raises:
        ValueError: If ``model`` is not a supported model name, or if ``nrows`` or ``ncols``
            is smaller than 1.

    Examples:
        >>> from spotpython.utils.effects import plot_all_partial_dependence
        >>> from sklearn.datasets import load_diabetes
        >>> import pandas as pd
        >>> data = load_diabetes()
        >>> df = pd.DataFrame(data.data, columns=data.feature_names)
        >>> df_target = pd.Series(data.target, name="target")
        >>> plot_all_partial_dependence(df, df_target, model="GradientBoostingRegressor", nrows=2, ncols=5, figsize=(20, 8))

    """
    import warnings

    # Validate the cheap arguments first so that bad input fails before any model is trained.
    if nrows < 1 or ncols < 1:
        raise ValueError(f"nrows and ncols must be positive integers, got nrows={nrows}, ncols={ncols}")
    estimator = _make_regressor(model)

    features = list(df.columns)
    n_features = len(features)
    if n_features > nrows * ncols:
        # Indexing past the end of the axes array would raise an IndexError half-way through
        # plotting; grow the grid instead so every feature gets its own subplot.
        required_rows = -(-n_features // ncols)  # ceiling division
        warnings.warn(
            f"Grid of {nrows}x{ncols} subplots is too small for {n_features} features; "
            f"using {required_rows} rows instead.",
            stacklevel=2,
        )
        nrows = required_rows

    # Only the training part of the split is used; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(df, df_target, test_size=0.2, random_state=42)
    estimator.fit(X_train, y_train)

    # squeeze=False guarantees a 2D array of axes, even for a 1x1 or single-row grid.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    # One partial dependence plot per feature, titled with the feature name.
    for ax, feature in zip(axes, features):
        PartialDependenceDisplay.from_estimator(estimator, X_train, [feature], ax=ax)
        ax.set_title(feature)

    # Remove the subplots that were not needed.
    for unused_ax in axes[n_features:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()
0 commit comments