0.14.13

bartzbeielstein · bartzbeielstein · commit d56eaca40b17 · 2024-04-20T23:01:13.000+02:00
xai
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.14.11"
+version = "0.14.13"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/data/california.py b/src/spotPython/data/california.py
@@ -0,0 +1,144 @@
+import torch
+from torch.utils.data import Dataset
+from sklearn.datasets import fetch_california_housing
+
+
+class CaliforniaHousing(Dataset):
+    """
+    A PyTorch Dataset for regression, loaded with scikit-learn's `fetch_california_housing`.
+    Features:
+        * MedInc median income in block group
+        * HouseAge median house age in block group
+        * AveRooms average number of rooms per household
+        * AveBedrms average number of bedrooms per household
+        * Population block group population
+        * AveOccup average number of household members
+        * Latitude block group latitude
+        * Longitude block group longitude
+    The target variable is the median house value for California districts,
+    expressed in hundreds of thousands of Dollars ($100,000).
+    Samples total: 20640, Dimensionality: 8, Features: real, Target: real 0.15 - 5.
+    This dataset was derived from the 1990 U.S. census, using one row per census block group.
+    A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data
+    (a block group typically has a population of 600 to 3,000 people).
+
+    Args:
+        feature_type (torch.dtype): The data type of the features. Defaults to torch.float.
+        target_type (torch.dtype): The data type of the targets. Defaults to torch.float.
+        train (bool): Flag stored on the instance and reported by `extra_repr`. Defaults to True.
+        n_samples (int): Keep only the first `n_samples` rows. Defaults to None (use the entire dataset).
+
+    Attributes:
+        data (Tensor): The data features.
+        targets (Tensor): The data targets.
+
+    Examples:
+        >>> from torch.utils.data import DataLoader
+            from spotPython.data.california import CaliforniaHousing
+            import torch
+            dataset = CaliforniaHousing(feature_type=torch.float32, target_type=torch.float32)
+            # Set batch size for DataLoader
+            batch_size = 5
+            # Create DataLoader
+            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+            # Iterate over the data in the DataLoader
+            for batch in dataloader:
+                inputs, targets = batch
+                print(f"Batch Size: {inputs.size(0)}")
+                print("---------------")
+                print(f"Inputs: {inputs}")
+                print(f"Targets: {targets}")
+    """
+
+    def __init__(
+        self,
+        feature_type: torch.dtype = torch.float,
+        target_type: torch.dtype = torch.float,
+        train: bool = True,
+        n_samples: int = None,
+    ) -> None:
+        super().__init__()
+        self.feature_type = feature_type
+        self.target_type = target_type
+        self.train = train  # only read by extra_repr; it does not select a data split
+        self.n_samples = n_samples
+        self.data, self.targets = self._load_data()  # the data is loaded at construction time
+
+    def _load_data(self) -> tuple:
+        """Loads the data from scikit-learn and returns the features and targets as tensors.
+
+        Returns:
+            tuple: A tuple containing the features and targets, truncated to `n_samples` rows if set.
+
+        Examples:
+            >>> from spotPython.data.california import CaliforniaHousing
+                dataset = CaliforniaHousing()
+                print(dataset.data.shape)
+                print(dataset.targets.shape)
+                torch.Size([20640, 8])
+                torch.Size([20640])
+        """
+        feature_df, target_df = fetch_california_housing(return_X_y=True, as_frame=True)
+        if self.n_samples is not None:
+            feature_df = feature_df[: self.n_samples]
+            target_df = target_df[: self.n_samples]
+        # Convert DataFrames to PyTorch tensors
+        feature_tensor = torch.tensor(feature_df.values, dtype=self.feature_type)
+        target_tensor = torch.tensor(target_df.values, dtype=self.target_type)
+
+        return feature_tensor, target_tensor
+
+    def __getitem__(self, idx: int) -> tuple:
+        """
+        Returns the feature and target at the given index.
+
+        Args:
+            idx (int): The index.
+
+        Returns:
+            tuple: A tuple containing the feature and target at the given index.
+
+        Examples:
+            >>> from spotPython.data.california import CaliforniaHousing
+                feature, target = CaliforniaHousing(n_samples=10)[0]
+                print(feature.shape)
+                print(target.shape)
+                torch.Size([8])
+                torch.Size([])
+        """
+        feature = self.data[idx]
+        target = self.targets[idx]
+        return feature, target
+
+    def __len__(self) -> int:
+        """
+        Returns the length of the dataset.
+
+        Returns:
+            int: The number of rows in `data`.
+
+        Examples:
+            >>> from spotPython.data.california import CaliforniaHousing
+            >>> dataset = CaliforniaHousing(n_samples=100)
+            >>> print(len(dataset))
+            100
+
+        """
+        return len(self.data)
+
+    def extra_repr(self) -> str:
+        """
+        Returns a string naming the split ("Train" or "Test") given by the `train` flag.
+
+        Returns:
+            str: "Split: Train" if `train` is True, otherwise "Split: Test".
+
+        Examples:
+            >>> from spotPython.data.california import CaliforniaHousing
+            >>> dataset = CaliforniaHousing()
+            >>> print(dataset.extra_repr())
+            Split: Train
+
+        """
+        split = "Train" if self.train else "Test"
+        return f"Split: {split}"
diff --git a/src/spotPython/light/loadmodel.py b/src/spotPython/light/loadmodel.py
@@ -21,6 +21,7 @@ def load_light_from_checkpoint(config: dict, fun_control: dict, postfix: str = "
             A dictionary containing the function control parameters.
         postfix (str):
             The postfix to append to the configuration ID when generating the checkpoint path.
+            Default is "_TEST". Can be set to "_TRAIN" for training checkpoints.
 
     Returns:
         Any: The loaded model.
diff --git a/src/spotPython/light/trainmodel.py b/src/spotPython/light/trainmodel.py
@@ -4,16 +4,22 @@
 from pytorch_lightning.loggers import TensorBoardLogger
 from lightning.pytorch.callbacks.early_stopping import EarlyStopping
 from spotPython.torch.initialization import kaiming_init, xavier_init
+from lightning.pytorch.callbacks import ModelCheckpoint
 import os
 
 
-def train_model(config: dict, fun_control: dict) -> float:
+def train_model(config: dict, fun_control: dict, timestamp: bool = True) -> float:
     """
     Trains a model using the given configuration and function control parameters.
 
     Args:
-        config (dict): A dictionary containing the configuration parameters for the model.
-        fun_control (dict): A dictionary containing the function control parameters.
+        config (dict):
+            A dictionary containing the configuration parameters for the model.
+        fun_control (dict):
+            A dictionary containing the function control parameters.
+        timestamp (bool):
+            A boolean value indicating whether to include a timestamp in the config id. Default is True.
+            If False, the string "_TRAIN" is appended to the config id.
 
     Returns:
         float: The validation loss of the trained model.
@@ -72,9 +78,15 @@ def train_model(config: dict, fun_control: dict) -> float:
         enable_progress_bar = False
     else:
         enable_progress_bar = fun_control["enable_progress_bar"]
-    # config id is unique. Since the model is not loaded from a checkpoint,
-    # the config id is generated here with a timestamp.
-    config_id = generate_config_id(config, timestamp=True)
+    if timestamp:
+        # config id is unique. Since the model is not loaded from a checkpoint,
+        # the config id is generated here with a timestamp.
+        config_id = generate_config_id(config, timestamp=True)
+    else:
+        # config id is not time-dependent and therefore reproducible,
+        # so that the model can be loaded from a checkpoint,
+        # the config id is generated here without a timestamp.
+        config_id = generate_config_id(config, timestamp=False) + "_TRAIN"
     model = fun_control["core_model"](**config, _L_in=_L_in, _L_out=_L_out, _torchmetric=_torchmetric)
     initialization = config["initialization"]
     if initialization == "Xavier":
@@ -97,6 +109,16 @@ def train_model(config: dict, fun_control: dict) -> float:
     # print(f"train_model(): Train set size: {len(dm.data_train)}")
     # print(f"train_model(): Batch size: {config['batch_size']}")
 
+    # Callbacks
+    callbacks = [
+        EarlyStopping(monitor="val_loss", patience=config["patience"], mode="min", strict=False, verbose=False)
+    ]
+    if not timestamp:
+        # add ModelCheckpoint only if timestamp is False
+        callbacks.append(
+            ModelCheckpoint(dirpath=os.path.join(fun_control["CHECKPOINT_PATH"], config_id), save_last=True)
+        )  # Save the last checkpoint
+
     # Init trainer
     trainer = L.Trainer(
         # Where to save models
@@ -110,9 +132,7 @@ def train_model(config: dict, fun_control: dict) -> float:
             default_hp_metric=True,
             log_graph=fun_control["log_graph"],
         ),
-        callbacks=[
-            EarlyStopping(monitor="val_loss", patience=config["patience"], mode="min", strict=False, verbose=False)
-        ],
+        callbacks=callbacks,
         enable_progress_bar=enable_progress_bar,
     )
     # Pass the datamodule as arg to trainer.fit to override model hooks :)
diff --git a/src/spotPython/plot/xai.py b/src/spotPython/plot/xai.py
@@ -7,6 +7,13 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import matplotlib.colors as colors
+from spotPython.hyperparameters.values import get_tuned_architecture
+from spotPython.light.trainmodel import train_model
+from spotPython.light.loadmodel import load_light_from_checkpoint
+from spotPython.utils.classes import get_removed_attributes_and_base_net
+import pandas as pd
+from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
+from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation
 
 
 def get_activations(net, fun_control, batch_size, device="cpu") -> dict:
@@ -472,3 +479,119 @@ def visualize_gradients(net, fun_control, batch_size, absolute=True, cmap="gray"
         batch_size=batch_size,
     )
     plot_nn_values_scatter(nn_values=grads, nn_values_names="Gradients", absolute=absolute, cmap=cmap, figsize=figsize)
+
+
+def get_attributions(
+    spot_tuner,
+    fun_control,
+    attr_method="IntegratedGradients",
+    baseline=None,
+    abs_attr=True,
+    n_rel=5,
+    feature_names=None,
+):
+    """Train the tuned model and rank its input features by mean attribution.
+
+    The tuned architecture is trained without a timestamp, reloaded from its "_TRAIN"
+    checkpoint, and attributed on the CPU over all batches of `fun_control["data_set"]`.
+
+    Args:
+        spot_tuner (object): The spot tuner object.
+        fun_control (dict): A dictionary with the function control; "data_set" is read here.
+        attr_method (str, optional): "IntegratedGradients" (default), "DeepLift",
+            "GradientShap", or "FeatureAblation".
+        baseline (torch.Tensor, optional): Baseline passed to the attribution method.
+            Defaults to None; must be given for "GradientShap".
+        abs_attr (bool, optional): Rank by absolute attribution values. Defaults to True.
+        n_rel (int, optional): The number of relevant features returned. Defaults to 5.
+        feature_names (list, optional): The feature names. Defaults to None (x0, x1, ...).
+
+    Returns:
+        pd.DataFrame: Columns "Feature Index", "Feature" and `attr_method + "Attribution"`,
+            ordered from the highest-ranked feature downwards.
+
+    Raises:
+        ValueError: If `attr_method` is unsupported, if "GradientShap" is requested
+            without a baseline, or if the data set yields no batches.
+    """
+    # Validate the request before the (expensive) training run is started.
+    attr_classes = {
+        "IntegratedGradients": IntegratedGradients,
+        "DeepLift": DeepLift,
+        "GradientShap": GradientShap,
+        "FeatureAblation": FeatureAblation,
+    }
+    if attr_method not in attr_classes:
+        raise ValueError(
+            """
+            Unsupported attribution method.
+            Please choose from 'IntegratedGradients', 'DeepLift', 'GradientShap', or 'FeatureAblation'.
+            """
+        )
+    if attr_method == "GradientShap" and baseline is None:
+        raise ValueError("baseline cannot be 'None' for GradientShap")
+
+    config = get_tuned_architecture(spot_tuner, fun_control)
+    train_model(config, fun_control, timestamp=False)
+    model_loaded = load_light_from_checkpoint(config, fun_control, postfix="_TRAIN")
+    _, model = get_removed_attributes_and_base_net(net=model_loaded)
+    model = model.to("cpu")
+    model.eval()
+    dataset = fun_control["data_set"]
+    n_features = dataset.data.shape[1]
+    if feature_names is None:
+        feature_names = [f"x{i}" for i in range(n_features)]
+    test_loader = DataLoader(dataset, batch_size=config["batch_size"])
+    attr = attr_classes[attr_method](model)
+
+    # Accumulate per-feature sums and the sample count so that every batch,
+    # including a smaller final one, contributes to a true per-sample mean.
+    attribution_sum = None
+    n_seen = 0
+    for inputs, _ in test_loader:
+        attributions = attr.attribute(inputs, return_convergence_delta=False, baselines=baseline)
+        batch_sum = attributions.detach().sum(dim=0)
+        attribution_sum = batch_sum if attribution_sum is None else attribution_sum + batch_sum
+        n_seen += attributions.shape[0]
+    if attribution_sum is None:
+        raise ValueError("The data set did not yield any batches; cannot compute attributions.")
+    avg_attributions = (attribution_sum / n_seen).numpy()
+
+    # Rank by magnitude if abs_attr is set, otherwise by signed value (largest first).
+    ranking_values = abs(avg_attributions) if abs_attr is True else avg_attributions
+    top_n_indices = ranking_values.argsort()[-n_rel:][::-1]
+
+    # Get the (signed) importance values for the top n features
+    top_n_importances = avg_attributions[top_n_indices]
+
+    df = pd.DataFrame(
+        {
+            "Feature Index": top_n_indices,
+            "Feature": [feature_names[i] for i in top_n_indices],
+            attr_method + "Attribution": top_n_importances,
+        }
+    )
+    return df
+
+
+def plot_attributions(df, attr_method="IntegratedGradients"):
+    """Draw a bar chart of the attribution values in `df` and show it.
+
+    Args:
+        df (pd.DataFrame):
+            Must contain the columns "Feature" and `attr_method + "Attribution"`.
+        attr_method (str, optional):
+            Selects the value column and labels the plot. Defaults to "IntegratedGradients".
+
+    Returns:
+        None
+    """
+    value_column = attr_method + "Attribution"
+    n_shown = df.shape[0]
+    sns.set_theme(style="whitegrid")
+    plt.figure(figsize=(10, 6))
+    sns.barplot(data=df, x=value_column, y="Feature", hue="Feature", palette="viridis")
+    plt.title(f"Top {n_shown} Features by {attr_method} Attribution")
+    plt.xlabel(f"{attr_method} Attribution Value")
+    plt.ylabel("Feature")
+    plt.show()
diff --git a/src/spotPython/spot/spot.py b/src/spotPython/spot/spot.py
@@ -708,7 +708,9 @@ def write_db_dict(self) -> None:
         # Generate a description of the results:
         # if spot_tuner_control['min_y'] exists:
         try:
-            result = f"Results for {ident}: Finally, the best value is {spot_tuner_control['min_y']} at {spot_tuner_control['min_X']}."
+            result = f"""
+                      Results for {ident}: Finally, the best value is {spot_tuner_control['min_y']}
+                      at {spot_tuner_control['min_X']}."""
             #
             db_dict = {
                 "data": {

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.14.11"`
	`10`	`+version = "0.14.13"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`