preprocess

bartzbeielstein · bartzbeielstein · commit 79ad72c104d0 · 2025-03-18T00:04:40.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.26.24"
+version = "0.26.25"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
@@ -28,6 +28,7 @@ dependencies = [
   "captum",
   "lightning>=2.0.0rc0",
   "graphviz",
+  "mapie",
   "matplotlib",
   "mkdocs>=1.6.0",
   "mkdocs-material>=9.5.33",
diff --git a/src/spotpython/uc/plot.py b/src/spotpython/uc/plot.py
@@ -0,0 +1,136 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def plot_predictionintervals(
+    y_train,
+    y_train_pred,
+    y_train_pred_low,
+    y_train_pred_high,
+    y_test,
+    y_test_pred,
+    y_test_pred_low,
+    y_test_pred_high,
+    suptitle: str,
+) -> None:
+    """
+    Plots prediction intervals for training and testing data.
+    This function generates four subplots arranged in a 2x2 grid:
+    1. True vs predicted values with error bars representing prediction intervals.
+    2. Prediction interval width vs true values.
+    3. Ordered prediction interval widths for both training and testing data.
+    4. Histograms of the interval widths for training and testing data.
+
+    Args:
+        y_train (array-like): True values for the training set.
+        y_train_pred (array-like): Predicted values for the training set.
+        y_train_pred_low (array-like): Lower bounds of prediction intervals for the training set.
+        y_train_pred_high (array-like): Upper bounds of prediction intervals for the training set.
+        y_test (array-like): True values for the testing set.
+        y_test_pred (array-like): Predicted values for the testing set.
+        y_test_pred_low (array-like): Lower bounds of prediction intervals for the testing set.
+        y_test_pred_high (array-like): Upper bounds of prediction intervals for the testing set.
+        suptitle (str): The title for the entire figure.
+
+    Returns:
+        None: The function displays the plots but does not return any value.
+
+    Notes:
+        - All array arguments are converted with ``np.asarray``, so lists and pandas objects are combined by position.
+        - The diagonal of the first subplot and the y-limit of the second subplot cover both training and testing data.
+
+    References:
+        Function adapted from: https://github.com/scikit-learn-contrib/MAPIE/blob/master/notebooks/regression/exoplanets.ipynb
+
+    Examples:
+        >>> import numpy as np
+        >>> from spotpython.uc.plot import plot_predictionintervals
+        >>> y_train = np.array([1, 2, 3, 4, 5])
+        >>> y_train_pred = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
+        >>> y_train_pred_low = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+        >>> y_train_pred_high = np.array([1.2, 2.4, 3.6, 4.8, 6.0])
+        >>> y_test = np.array([6, 7, 8])
+        >>> y_test_pred = np.array([6.1, 7.2, 8.3])
+        >>> y_test_pred_low = np.array([6.0, 7.0, 8.0])
+        >>> y_test_pred_high = np.array([6.2, 7.4, 8.6])
+        >>> suptitle = "Prediction Intervals"
+        >>> plot_predictionintervals(y_train, y_train_pred, y_train_pred_low, y_train_pred_high, y_test, y_test_pred, y_test_pred_low, y_test_pred_high, suptitle)
+    """
+    y_train, y_train_pred, y_train_pred_low, y_train_pred_high = map(
+        np.asarray, (y_train, y_train_pred, y_train_pred_low, y_train_pred_high)
+    )
+    y_test, y_test_pred, y_test_pred_low, y_test_pred_high = map(
+        np.asarray, (y_test, y_test_pred, y_test_pred_low, y_test_pred_high)
+    )
+    width_train = y_train_pred_high - y_train_pred_low
+    width_test = y_test_pred_high - y_test_pred_low
+    width_all = np.concatenate([width_train, width_test])  # train first, then test: type_all below relies on this order
+
+    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))
+
+    ax1.errorbar(
+        x=y_train,
+        y=y_train_pred,
+        yerr=(y_train_pred - y_train_pred_low, y_train_pred_high - y_train_pred),
+        alpha=0.8,
+        label="train",
+        fmt=".",
+    )
+    ax1.errorbar(
+        x=y_test,
+        y=y_test_pred,
+        yerr=(y_test_pred - y_test_pred_low, y_test_pred_high - y_test_pred),
+        alpha=0.8,
+        label="test",
+        fmt=".",
+    )
+    lim_lo, lim_hi = min(y_train.min(), y_test.min()), max(y_train.max(), y_test.max())  # span train AND test values
+    ax1.plot([lim_lo, lim_hi], [lim_lo, lim_hi], color="gray", alpha=0.5)  # identity line: perfect predictions
+    ax1.set_xlabel("True values", fontsize=12)
+    ax1.set_ylabel("Predicted values", fontsize=12)
+    ax1.legend()
+    ax1.set_title("True vs predicted values")
+
+    ax2.scatter(x=y_train, y=width_train, alpha=0.8, label="train", marker=".")
+    ax2.scatter(x=y_test, y=width_test, alpha=0.8, label="test", marker=".")
+    ax2.set_xlabel("True values", fontsize=12)
+    ax2.set_ylabel("Interval width", fontsize=12)
+    ax2.set_xscale("linear")
+    ax2.set_ylim([0, np.max(width_all) * 1.1])  # widest interval of either set, so no point is clipped
+    ax2.legend()
+    ax2.set_title("Prediction interval width vs true values")
+
+    type_all = np.array(["train"] * len(y_train) + ["test"] * len(y_test))
+    x_all = np.arange(len(width_all))
+    order_all = np.argsort(width_all)
+    width_order = width_all[order_all]
+    type_order = type_all[order_all]
+    ax3.scatter(
+        x=x_all[type_order == "train"],
+        y=width_order[type_order == "train"],
+        alpha=0.8,
+        label="train",
+        marker=".",
+    )
+    ax3.scatter(
+        x=x_all[type_order == "test"],
+        y=width_order[type_order == "test"],
+        alpha=0.8,
+        label="test",
+        marker=".",
+    )
+    ax3.set_xlabel("Order", fontsize=12)
+    ax3.set_ylabel("Interval width", fontsize=12)
+    ax3.legend()
+    ax3.set_title("Ordered prediction interval width")
+
+    ax4.hist(width_train, alpha=0.5, label="train")
+    ax4.hist(width_test, alpha=0.5, label="test")
+    ax4.set_xlabel("Interval width", fontsize=12)
+    ax4.set_ylabel("Frequency", fontsize=12)
+    ax4.legend()
+    ax4.set_title("Histogram of interval widths")
+
+    plt.suptitle(suptitle, size=20)
+    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to make room for suptitle
+    plt.show()
diff --git a/src/spotpython/utils/preprocess.py b/src/spotpython/utils/preprocess.py
@@ -0,0 +1,162 @@
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder, RobustScaler
+import numpy as np
+import pandas as pd
+
+
+def get_num_cols(df: pd.DataFrame) -> list:
+    """
+    Return the names of all numerical columns of a DataFrame.
+
+    Columns are picked with ``select_dtypes(include=[np.number])`` and keep their order in ``df``.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+
+    Returns:
+        list: The labels of the columns that hold numerical data.
+
+    Example:
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> df = pd.DataFrame({
+        ...     "age": [25, 30, np.nan, 35],
+        ...     "gender": ["M", "F", "M", "F"],
+        ...     "income": [50000, 60000, 55000, np.nan]
+        ... })
+        >>> get_num_cols(df)
+        ['age', 'income']
+    """
+    numeric_part = df.select_dtypes(include=[np.number])
+    return numeric_part.columns.tolist()
+
+
+def get_cat_cols(df: pd.DataFrame) -> list:
+    """
+    Identifies categorical columns in a DataFrame.
+
+    Object-dtype columns (e.g., strings) come first, then all-NaN columns; no name is listed twice.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+
+    Returns:
+        list: A list of unique column names corresponding to categorical columns.
+
+    Example:
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> df = pd.DataFrame({
+        ...     "age": [25, 30, np.nan, 35],
+        ...     "gender": ["M", "F", "M", "F"],
+        ...     "income": [50000, 60000, 55000, np.nan]
+        ... })
+        >>> get_cat_cols(df)
+        ['gender']
+    """
+    candidates = df.select_dtypes(include=["object"]).columns.tolist() + [col for col in df.columns if df[col].isna().all()]
+    return list(dict.fromkeys(candidates))  # an all-NaN object column matches both criteria; keep it once, in order
+
+
+def generic_preprocess_df(
+    df: pd.DataFrame,
+    target: str,
+    imputer_num=SimpleImputer(strategy="mean"),
+    imputer_cat=SimpleImputer(strategy="most_frequent"),
+    encoder_cat=OneHotEncoder(categories="auto", drop=None, handle_unknown="ignore", sparse_output=False),
+    scaler_num=RobustScaler(),
+    test_size=0.2,
+    random_state=42,
+    shuffle=True,
+    n_jobs=None,
+) -> tuple:
+    """
+    Preprocesses a DataFrame by handling numerical and categorical features,
+    splitting the data into training and testing sets, and applying transformations.
+
+    This function performs the following steps:
+    - Separates the target column from the features.
+    - Identifies numerical and categorical columns (a column flagged as categorical is never also numerical).
+    - Applies imputers, encoders, and scalers to the respective columns.
+    - Splits the data into training and testing sets.
+    - Fits the preprocessing pipelines on the training set and applies them to both sets.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame to preprocess.
+        target (str): The name of the target column to predict.
+        imputer_num (SimpleImputer, optional): Imputer for numerical columns.
+            Defaults to `SimpleImputer(strategy="mean")`.
+        imputer_cat (SimpleImputer, optional): Imputer for categorical columns.
+            Defaults to `SimpleImputer(strategy="most_frequent")`.
+        encoder_cat (OneHotEncoder, optional): Encoder for categorical columns.
+            Defaults to `OneHotEncoder(categories="auto", drop=None, handle_unknown="ignore", sparse_output=False)`.
+        scaler_num (RobustScaler, optional): Scaler for numerical columns.
+            Defaults to `RobustScaler()`.
+        test_size (float, optional): Proportion of the dataset to include in the test split.
+            Defaults to 0.2.
+        random_state (int, optional): Random seed for reproducibility. Defaults to 42.
+        shuffle (bool, optional): Whether to shuffle the data before splitting. Defaults to True.
+        n_jobs (int, optional): Number of jobs to run in parallel for the `ColumnTransformer`.
+            Defaults to None (1 job).
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
+            A tuple containing:
+            - X_train (np.ndarray): Transformed training feature set.
+            - X_test (np.ndarray): Transformed testing feature set.
+            - y_train (pd.Series): Training target values.
+            - y_test (pd.Series): Testing target values.
+
+    Raises:
+        ValueError: If the DataFrame is empty or the target column is not found in the DataFrame.
+
+    Examples:
+        >>> from spotpython.utils.preprocess import generic_preprocess_df
+        >>> import numpy as np
+        >>> import pandas as pd
+        >>> from sklearn.impute import SimpleImputer
+        >>> from sklearn.preprocessing import OneHotEncoder, RobustScaler
+        >>> df = pd.DataFrame({
+        ...     "age": [25, 30, np.nan, 35],
+        ...     "gender": ["M", "F", "M", "F"],
+        ...     "income": [50000, 60000, 55000, np.nan],
+        ...     "target": [1, 0, 1, 0]
+        ... })
+        >>> X_train, X_test, y_train, y_test = generic_preprocess_df(
+        ...     df,
+        ...     target="target",
+        ...     imputer_num=SimpleImputer(strategy="mean"),
+        ...     imputer_cat=SimpleImputer(strategy="most_frequent"),
+        ...     encoder_cat=OneHotEncoder(),
+        ...     scaler_num=RobustScaler(),
+        ...     test_size=0.25,
+        ...     random_state=42
+        ... )
+    """
+    if df.empty:
+        raise ValueError("The input DataFrame is empty.")
+    if target not in df.columns:
+        raise ValueError(f"Target column '{target}' not found in the DataFrame.")
+    X = df.drop(target, axis=1)
+    y = df[target]
+    cat_cols = list(dict.fromkeys(get_cat_cols(X)))  # each categorical column once, so no feature is encoded twice
+    num_cols = [col for col in get_num_cols(X) if col not in cat_cols]  # all-NaN numeric columns are cast to str below
+    X[cat_cols] = X[cat_cols].astype(str)  # NOTE(review): may turn missing values into literal strings before imputation - confirm
+    numerical_transformer = Pipeline(steps=[("imputer", imputer_num), ("scaler", scaler_num)])
+    categorical_transformer = Pipeline(steps=[("imputer", imputer_cat), ("encoder", encoder_cat)])
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("numerical", numerical_transformer, num_cols),
+            ("categorical", categorical_transformer, cat_cols),
+        ],
+        remainder="drop",
+        sparse_threshold=0,
+        n_jobs=n_jobs,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, shuffle=shuffle)
+    X_train = preprocessor.fit_transform(X_train)  # fit on the training split only
+    X_test = preprocessor.transform(X_test)  # reuse the transformers fitted on the training split
+    return X_train, X_test, y_train, y_test
diff --git a/test/test_preprocess.py b/test/test_preprocess.py