Skip to content

Commit 79ad72c

Browse files
preprocess
1 parent 3b2aacf commit 79ad72c

4 files changed

Lines changed: 481 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.26.24"
10+
version = "0.26.25"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]
@@ -28,6 +28,7 @@ dependencies = [
2828
"captum",
2929
"lightning>=2.0.0rc0",
3030
"graphviz",
31+
"mapie",
3132
"matplotlib",
3233
"mkdocs>=1.6.0",
3334
"mkdocs-material>=9.5.33",

src/spotpython/uc/plot.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import numpy as np
2+
import matplotlib.pyplot as plt
3+
4+
5+
def plot_predictionintervals(
    y_train,
    y_train_pred,
    y_train_pred_low,
    y_train_pred_high,
    y_test,
    y_test_pred,
    y_test_pred_low,
    y_test_pred_high,
    suptitle: str,
) -> None:
    """
    Plots prediction intervals for training and testing data.

    This function generates four subplots arranged in a 2x2 grid:
    1. True vs predicted values with error bars representing prediction intervals.
    2. Prediction interval width vs true values.
    3. Ordered prediction interval widths for both training and testing data.
    4. Histograms of the interval widths for training and testing data.

    All array arguments are converted with `np.asarray`, so lists and other
    array-likes are accepted in addition to NumPy arrays.

    Args:
        y_train (array-like): True values for the training set.
        y_train_pred (array-like): Predicted values for the training set.
        y_train_pred_low (array-like): Lower bounds of prediction intervals for the training set.
        y_train_pred_high (array-like): Upper bounds of prediction intervals for the training set.
        y_test (array-like): True values for the testing set.
        y_test_pred (array-like): Predicted values for the testing set.
        y_test_pred_low (array-like): Lower bounds of prediction intervals for the testing set.
        y_test_pred_high (array-like): Upper bounds of prediction intervals for the testing set.
        suptitle (str): The title for the entire figure.

    Returns:
        None: The function displays the plots but does not return any value.

    Notes:
        - The identity (perfect prediction) line in the first subplot spans the
          combined range of the training and testing true values.
        - The y-axis of the second subplot is scaled to the widest interval found
          in either the training or the testing data, so no point is clipped.
        - The third subplot sorts all interval widths jointly and colours each
          point by the data set it belongs to.

    References:
        Function adapted from: https://github.com/scikit-learn-contrib/MAPIE/blob/master/notebooks/regression/exoplanets.ipynb

    Examples:
        >>> import numpy as np
        >>> from spotpython.uc.plot import plot_predictionintervals
        >>> y_train = np.array([1, 2, 3, 4, 5])
        >>> y_train_pred = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
        >>> y_train_pred_low = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
        >>> y_train_pred_high = np.array([1.2, 2.4, 3.6, 4.8, 6.0])
        >>> y_test = np.array([6, 7, 8])
        >>> y_test_pred = np.array([6.1, 7.2, 8.3])
        >>> y_test_pred_low = np.array([6.0, 7.0, 8.0])
        >>> y_test_pred_high = np.array([6.2, 7.4, 8.6])
        >>> suptitle = "Prediction Intervals"
        >>> plot_predictionintervals(y_train, y_train_pred, y_train_pred_low, y_train_pred_high, y_test, y_test_pred, y_test_pred_low, y_test_pred_high, suptitle)
    """
    # Convert once so that plain lists work with the arithmetic below.
    y_train = np.asarray(y_train)
    y_train_pred = np.asarray(y_train_pred)
    y_train_pred_low = np.asarray(y_train_pred_low)
    y_train_pred_high = np.asarray(y_train_pred_high)
    y_test = np.asarray(y_test)
    y_test_pred = np.asarray(y_test_pred)
    y_test_pred_low = np.asarray(y_test_pred_low)
    y_test_pred_high = np.asarray(y_test_pred_high)

    # Interval widths are reused by subplots 2, 3 and 4.
    width_train = y_train_pred_high - y_train_pred_low
    width_test = y_test_pred_high - y_test_pred_low
    width_all = np.concatenate([width_train, width_test])

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))

    # Subplot 1: true vs predicted values with asymmetric error bars.
    ax1.errorbar(
        x=y_train,
        y=y_train_pred,
        yerr=(y_train_pred - y_train_pred_low, y_train_pred_high - y_train_pred),
        alpha=0.8,
        label="train",
        fmt=".",
    )
    ax1.errorbar(
        x=y_test,
        y=y_test_pred,
        yerr=(y_test_pred - y_test_pred_low, y_test_pred_high - y_test_pred),
        alpha=0.8,
        label="test",
        fmt=".",
    )
    # Identity line over the combined range, so test points that lie outside
    # the training range still have a reference.
    y_true_all = np.concatenate([y_train, y_test])
    y_true_finite = y_true_all[np.isfinite(y_true_all)]
    if y_true_finite.size:
        diag = [y_true_finite.min(), y_true_finite.max()]
        ax1.plot(diag, diag, color="gray", alpha=0.5)
    ax1.set_xlabel("True values", fontsize=12)
    ax1.set_ylabel("Predicted values", fontsize=12)
    ax1.legend()
    ax1.set_title("True vs predicted values")

    # Subplot 2: interval width as a function of the true value.
    ax2.scatter(x=y_train, y=width_train, alpha=0.8, label="train", marker=".")
    ax2.scatter(x=y_test, y=width_test, alpha=0.8, label="test", marker=".")
    ax2.set_xlabel("True values", fontsize=12)
    ax2.set_ylabel("Interval width", fontsize=12)
    ax2.set_xscale("linear")
    # Scale to the widest interval of BOTH sets; ignore NaN/inf and skip the
    # call when there is nothing positive to scale to.
    width_finite = width_all[np.isfinite(width_all)]
    if width_finite.size and width_finite.max() > 0:
        ax2.set_ylim([0, width_finite.max() * 1.1])
    ax2.legend()
    ax2.set_title("Prediction interval width vs true values")

    # Subplot 3: all widths sorted jointly, coloured by origin.
    type_all = np.array(["train"] * len(width_train) + ["test"] * len(width_test))
    x_all = np.arange(len(width_all))
    order_all = np.argsort(width_all)
    std_order = width_all[order_all]
    type_order = type_all[order_all]
    for label in ("train", "test"):
        mask = type_order == label
        ax3.scatter(
            x=x_all[mask],
            y=std_order[mask],
            alpha=0.8,
            label=label,
            marker=".",
        )
    ax3.set_xlabel("Order", fontsize=12)
    ax3.set_ylabel("Interval width", fontsize=12)
    ax3.legend()
    ax3.set_title("Ordered prediction interval width")

    # Subplot 4: distribution of the interval widths.
    ax4.hist(width_train, alpha=0.5, label="train")
    ax4.hist(width_test, alpha=0.5, label="test")
    ax4.set_xlabel("Interval width", fontsize=12)
    ax4.set_ylabel("Frequency", fontsize=12)
    ax4.legend()
    ax4.set_title("Histogram of interval widths")

    fig.suptitle(suptitle, size=20)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to make room for suptitle
    plt.show()

src/spotpython/utils/preprocess.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
from sklearn.compose import ColumnTransformer
2+
from sklearn.impute import SimpleImputer
3+
from sklearn.model_selection import train_test_split
4+
from sklearn.pipeline import Pipeline
5+
from sklearn.preprocessing import OneHotEncoder, RobustScaler
6+
import numpy as np
7+
import pandas as pd
8+
9+
10+
def get_num_cols(df: pd.DataFrame) -> list:
    """
    Returns the names of the numerical columns of a DataFrame.

    Columns are selected by dtype: every column whose dtype is a subtype of
    `np.number` is reported, in the order in which it appears in `df`.

    Args:
        df (pd.DataFrame): The DataFrame to inspect.

    Returns:
        list: Names of the columns with a numerical dtype.

    Example:
        >>> import pandas as pd
        >>> import numpy as np
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, np.nan, 35],
        ...     "gender": ["M", "F", "M", "F"],
        ...     "income": [50000, 60000, 55000, np.nan]
        ... })
        >>> get_num_cols(df)
        ['age', 'income']
    """
    numeric_part = df.select_dtypes(include=[np.number])
    return numeric_part.columns.tolist()
35+
36+
37+
def get_cat_cols(df: pd.DataFrame) -> list:
    """
    Identifies categorical columns in a DataFrame.

    A column is reported as categorical if it has the object dtype
    (e.g., strings) or if all of its values are missing. Object columns come
    first, followed by the remaining all-NaN columns; each column name is
    reported only once, even if it satisfies both criteria.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        list: A list of column names corresponding to categorical columns.

    Example:
        >>> import pandas as pd
        >>> import numpy as np
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, np.nan, 35],
        ...     "gender": ["M", "F", "M", "F"],
        ...     "income": [50000, 60000, 55000, np.nan]
        ... })
        >>> get_cat_cols(df)
        ['gender']
    """
    object_cols = df.select_dtypes(include=["object"]).columns.tolist()
    # An object column that is entirely missing matches both criteria; skip it
    # in the second pass so that its name is not returned twice.
    already_listed = set(object_cols)
    all_nan_cols = [col for col in df.columns if col not in already_listed and df[col].isna().all()]
    return object_cols + all_nan_cols
62+
63+
64+
def generic_preprocess_df(
    df: pd.DataFrame,
    target: str,
    imputer_num=SimpleImputer(strategy="mean"),
    imputer_cat=SimpleImputer(strategy="most_frequent"),
    encoder_cat=OneHotEncoder(categories="auto", drop=None, handle_unknown="ignore", sparse_output=False),
    scaler_num=RobustScaler(),
    test_size=0.2,
    random_state=42,
    shuffle=True,
    n_jobs=None,
) -> tuple:
    """
    Preprocesses a DataFrame by handling numerical and categorical features,
    splitting the data into training and testing sets, and applying transformations.

    This function performs the following steps:
    - Separates the target column from the features.
    - Identifies numerical and categorical columns. A column that is reported as
      both (e.g., a numerical column that contains only NaN) is treated as
      categorical only.
    - Casts the categorical columns to strings.
    - Splits the data into training and testing sets.
    - Fits the imputers, encoder, and scaler on the training set and applies
      them to both the training and the testing set.

    Columns that are neither numerical nor categorical are dropped
    (`remainder="drop"`).

    Args:
        df (pd.DataFrame): The input DataFrame to preprocess.
        target (str): The name of the target column to predict.
        imputer_num (SimpleImputer, optional): Imputer for numerical columns.
            Defaults to `SimpleImputer(strategy="mean")`.
        imputer_cat (SimpleImputer, optional): Imputer for categorical columns.
            Defaults to `SimpleImputer(strategy="most_frequent")`.
        encoder_cat (OneHotEncoder, optional): Encoder for categorical columns.
            Defaults to `OneHotEncoder(categories="auto", drop=None, handle_unknown="ignore", sparse_output=False)`.
        scaler_num (RobustScaler, optional): Scaler for numerical columns.
            Defaults to `RobustScaler()`.
        test_size (float, optional): Proportion of the dataset to include in the test split.
            Defaults to 0.2.
        random_state (int, optional): Random seed for reproducibility. Defaults to 42.
        shuffle (bool, optional): Whether to shuffle the data before splitting. Defaults to True.
        n_jobs (int, optional): Number of jobs to run in parallel for the `ColumnTransformer`.
            Defaults to None (1 job).

    Returns:
        Tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
            A tuple containing:
            - X_train (np.ndarray): Transformed training feature set.
            - X_test (np.ndarray): Transformed testing feature set.
            - y_train (pd.Series): Training target values.
            - y_test (pd.Series): Testing target values.

    Raises:
        ValueError: If the input DataFrame is empty.
        ValueError: If the target column is not found in the DataFrame.

    Examples:
        >>> from spotpython.utils.preprocess import generic_preprocess_df
        >>> import numpy as np
        >>> import pandas as pd
        >>> from sklearn.impute import SimpleImputer
        >>> from sklearn.preprocessing import OneHotEncoder, RobustScaler
        >>> df = pd.DataFrame({
        ...     "age": [25, 30, np.nan, 35],
        ...     "gender": ["M", "F", "M", "F"],
        ...     "income": [50000, 60000, 55000, np.nan],
        ...     "target": [1, 0, 1, 0]
        ... })
        >>> X_train, X_test, y_train, y_test = generic_preprocess_df(
        ...     df,
        ...     target="target",
        ...     imputer_num=SimpleImputer(strategy="mean"),
        ...     imputer_cat=SimpleImputer(strategy="most_frequent"),
        ...     encoder_cat=OneHotEncoder(),
        ...     scaler_num=RobustScaler(),
        ...     test_size=0.25,
        ...     random_state=42
        ... )
    """
    if df.empty:
        raise ValueError("The input DataFrame is empty.")
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in the DataFrame.")

    X = df.drop(target, axis=1)
    y = df[target]

    # De-duplicate while keeping order, so that no column is encoded twice.
    cat_cols = list(dict.fromkeys(get_cat_cols(X)))
    # An all-NaN numerical column is reported by both helpers. It is cast to
    # strings below, so it must not be routed through the numerical pipeline
    # as well (the numerical imputer/scaler cannot handle string data).
    cat_set = set(cat_cols)
    num_cols = [col for col in get_num_cols(X) if col not in cat_set]

    if cat_cols:
        # NOTE(review): after this cast missing entries are ordinary strings
        # (e.g. "nan"), so `imputer_cat` presumably never sees a missing value
        # and they are encoded as a category of their own - confirm intended.
        X[cat_cols] = X[cat_cols].astype(str)

    numerical_transformer = Pipeline(steps=[("imputer", imputer_num), ("scaler", scaler_num)])
    categorical_transformer = Pipeline(steps=[("imputer", imputer_cat), ("encoder", encoder_cat)])
    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", numerical_transformer, num_cols),
            ("categorical", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=0,  # always return a dense array, even for sparse encoders
        n_jobs=n_jobs,
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)
    # Fit on the training split only, so no information from the test split
    # leaks into the imputers, scaler, or encoder.
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test

0 commit comments

Comments
 (0)