0.29.26 20.6.2025 Kriging updated

bartzbeielstein · bartzbeielstein · commit 6ac2a532fc8a · 2025-06-20T21:09:33.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.29.25"
+version = "0.29.26"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotpython/surrogate/kriging.py b/src/spotpython/surrogate/kriging.py
@@ -194,12 +194,18 @@ def fit(self, X: np.ndarray, y: np.ndarray, bounds: Optional[List[Tuple[float, f
             y (np.ndarray):
                 Target values of shape (n_samples,) or (n_samples, 1).
             bounds (Optional[List[Tuple[float, float]]]):
-                Bounds for each dimension of log(theta). If None, defaults to [(-3, 2)] * n_features.
+                Bounds for each dimension of log(theta). If None, defaults to
+                [(-3, 2)] * n_features for interpolation, or
+                [(-3, 2)] * n_features + [(-6, 0)] for regression/reinterpolation.
 
         Returns:
             Kriging:
                 The fitted Kriging model instance (self).
 
+        Raises:
+            ValueError: If input data has invalid shape or contains invalid values.
+            RuntimeError: If optimization fails or correlation matrix is singular.
+
         Examples:
             >>> import numpy as np
             >>> from spotpython.surrogate.kriging import Kriging
@@ -211,42 +217,141 @@ def fit(self, X: np.ndarray, y: np.ndarray, bounds: Optional[List[Tuple[float, f
             >>> model.fit(X_train, y_train)
             >>> print("Fitted log(theta):", model.logtheta_lambda_)
         """
-        X = np.asarray(X)
-        y = np.asarray(y).flatten()
-        self.X_ = X
-        self.y_ = y
+        # Input validation and preprocessing
+        X = np.asarray(X, dtype=np.float64)
+        y = np.asarray(y, dtype=np.float64).flatten()
+
+        # Validate input shapes
+        if X.ndim != 2:
+            raise ValueError(f"X must be a 2D array, got {X.ndim}D array with shape {X.shape}")
+
+        if y.ndim != 1:
+            raise ValueError(f"y must be a 1D array, got {y.ndim}D array with shape {y.shape}")
+
+        if X.shape[0] != y.shape[0]:
+            raise ValueError(f"Number of samples in X ({X.shape[0]}) must match number of samples in y ({y.shape[0]})")
+
+        # Check for minimum number of samples
+        if X.shape[0] < 2:
+            raise ValueError("At least 2 samples are required for fitting")
+
+        # Check for invalid values
+        if not np.all(np.isfinite(X)):
+            raise ValueError("X contains non-finite values (NaN or inf)")
+
+        if not np.all(np.isfinite(y)):
+            raise ValueError("y contains non-finite values (NaN or inf)")
+
+        # Store training data FIRST before aggregation
+        self.X_ = X.copy()  # Create a copy to avoid external modifications
+        self.y_ = y.copy()
         self.n, self.k = X.shape
-        # Calculate and store min and max of X
+
+        # Calculate and store min and max of X for plotting and validation
         self.min_X = np.min(self.X_, axis=0)
         self.max_X = np.max(self.X_, axis=0)
 
-        _, aggregated_mean_y, _ = aggregate_mean_var(X=self.X_, y=self.y_)
-        self.aggregated_mean_y = np.copy(aggregated_mean_y)
+        # Aggregate data for duplicates (if any) - NOW self.X_ and self.y_ are available
+        try:
+            _, aggregated_mean_y, _ = aggregate_mean_var(X=self.X_, y=self.y_)
+            self.aggregated_mean_y = np.copy(aggregated_mean_y)
+        except Exception as e:
+            raise RuntimeError(f"Failed to aggregate training data: {e}")
+
+        # Check for duplicate rows (which can cause numerical issues)
+        if X.shape[0] > 1:
+            unique_rows = np.unique(X, axis=0)
+            if len(unique_rows) != X.shape[0] and self.method == "interpolation":
+                logger.warning(f"Found {X.shape[0] - len(unique_rows)} duplicate rows in X. " "This may cause numerical issues with interpolation method.")
+
+        # Check for zero variance in any dimension
+        if np.any(self.max_X - self.min_X == 0):
+            zero_var_dims = np.where(self.max_X - self.min_X == 0)[0]
+            logger.warning(f"Zero variance detected in dimensions {zero_var_dims}. " "This may cause numerical issues.")
+
+        # Set optimization bounds
         if bounds is None:
             if self.method == "interpolation":
-                bounds = [(-3.0, 2.0)] * self.k
+                bounds = [(self.min_theta, self.max_theta)] * self.k
             else:
                 # regression and reinterpolation use lambda_ as well
-                bounds = [(-3.0, 2.0)] * self.k + [(-6.0, 0.0)]
+                bounds = [(self.min_theta, self.max_theta)] * self.k + [(np.log10(self.min_Lambda), np.log10(self.max_Lambda))]
+        else:
+            # Validate user-provided bounds
+            expected_length = self.k if self.method == "interpolation" else self.k + 1
+            if len(bounds) != expected_length:
+                raise ValueError(f"bounds must have length {expected_length} for method '{self.method}', " f"got {len(bounds)}")
+
+            # Validate individual bounds
+            for i, (low, high) in enumerate(bounds):
+                if not (isinstance(low, (int, float)) and isinstance(high, (int, float))):
+                    raise ValueError(f"bounds[{i}] must contain numeric values")
+                if low >= high:
+                    raise ValueError(f"bounds[{i}]: lower bound ({low}) must be less than upper bound ({high})")
+
+        # Optimize hyperparameters
+        try:
+            logger.info(f"Starting hyperparameter optimization with bounds: {bounds}")
+            self.logtheta_lambda_, final_likelihood = self.max_likelihood(bounds)
+            logger.info(f"Optimization completed. Final likelihood: {final_likelihood}")
+        except Exception as e:
+            raise RuntimeError(f"Hyperparameter optimization failed: {e}")
 
-        self.logtheta_lambda_, _ = self.max_likelihood(bounds)
+        # Validate optimization results
+        if not np.all(np.isfinite(self.logtheta_lambda_)):
+            raise RuntimeError("Optimization resulted in non-finite hyperparameters")
 
-        # store theta and Lambda in log scale
+        # Extract and store theta and Lambda parameters
         if (self.method == "regression") or (self.method == "reinterpolation"):
-            # case noise is True
             self.theta = self.logtheta_lambda_[:-1]
             self.Lambda = self.logtheta_lambda_[-1]
         else:
             self.theta = self.logtheta_lambda_
             self.Lambda = None
-        # store p for future use
+
+        # Store p for future use (currently fixed at 2)
         self.p = 2
 
-        # Once logtheta_lambda is found, compute the final correlation matrix
-        self.negLnLike, self.Psi_, self.U_ = self.likelihood(self.logtheta_lambda_)
+        # Compute final correlation matrix and validate
+        try:
+            self.negLnLike, self.Psi_, self.U_ = self.likelihood(self.logtheta_lambda_)
+
+            # Check if correlation matrix is well-conditioned
+            if self.U_ is None:
+                raise RuntimeError("Failed to compute Cholesky decomposition of correlation matrix")
+
+            # Check condition number
+            if hasattr(self, "Psi_") and self.Psi_ is not None:
+                try:
+                    cond_num = np.linalg.cond(self.Psi_)
+                    if cond_num > 1e12:
+                        logger.warning(f"Correlation matrix is ill-conditioned (condition number: {cond_num:.2e})")
+                except np.linalg.LinAlgError:
+                    logger.warning("Could not compute condition number of correlation matrix")
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to compute final correlation matrix: {e}")
+
+        # Final validation
+        if not np.isfinite(self.negLnLike):
+            raise RuntimeError("Final likelihood is not finite")
+
+        # Update logging information
+        try:
+            self._update_log()
+        except Exception as e:
+            logger.warning(f"Failed to update log: {e}")
+
+        # Log fitting summary
+        logger.info("Kriging model fitted successfully:")
+        logger.info(f"  - Method: {self.method}")
+        logger.info(f"  - Training samples: {self.n}")
+        logger.info(f"  - Features: {self.k}")
+        logger.info(f"  - Final negative log-likelihood: {self.negLnLike:.6f}")
+        logger.info(f"  - Theta parameters: {self.theta}")
+        if self.Lambda is not None:
+            logger.info(f"  - Lambda parameter: {self.Lambda:.6f}")
 
-        # Update log with the current values
-        self._update_log()
         return self
 
     def predict(self, X: np.ndarray, return_std=False, return_val: str = "y") -> np.ndarray:
diff --git a/src/spotpython/utils/aggregate.py b/src/spotpython/utils/aggregate.py
@@ -3,7 +3,7 @@
 from sklearn.cluster import KMeans
 
 
-def aggregate_mean_var(X, y, sort=False, var_empirical=False) -> (np.ndarray, np.ndarray, np.ndarray):
+def aggregate_mean_var_base(X, y, sort=False, var_empirical=False) -> (np.ndarray, np.ndarray, np.ndarray):
     """
     Aggregate array to mean and variance per group.
     Note: The empirical variance might result in nan values.
@@ -120,6 +120,123 @@ def theoretical_var(group):
     return X_agg, y_mean, y_var
 
 
+def aggregate_mean_var(X: np.ndarray, y: np.ndarray, sort: bool = False, var_empirical: bool = False) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Aggregate y to its mean and variance per group of identical rows of X.
+
+    Pure NumPy implementation that avoids the pandas overhead. Rows of X that
+    are exactly equal form one group, and the per-group statistics are computed
+    with vectorized `np.bincount` calls rather than a Python loop over groups.
+
+    Args:
+        X (np.ndarray): Feature array, shape (n, k).
+        y (np.ndarray): Target values, shape (n,).
+        sort (bool): If True, the groups are returned in lexicographic order of
+            their rows (first column is the primary key). If False (default),
+            the groups are returned in order of first appearance in X.
+        var_empirical (bool): Whether to calculate the empirical (sample) variance,
+            which is nan for groups with a single member. Default is False,
+            which uses the theoretical (population) variance.
+
+    Returns:
+        tuple[np.ndarray, np.ndarray, np.ndarray]: A tuple containing:
+            - X_agg: Aggregated unique X values, shape (n_groups, k)
+            - y_mean: Mean of y values per group, shape (n_groups,)
+            - y_var: Variance of y values per group, shape (n_groups,)
+
+    Raises:
+        ValueError: If X is not 2D, y is not 1D, or their lengths differ.
+
+    Examples:
+        >>> import numpy as np
+        >>> from spotpython.utils.aggregate import aggregate_mean_var
+        >>> X = np.array([[1, 2], [3, 4], [1, 2]])
+        >>> y = np.array([1, 2, 3])
+        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y)
+        >>> print(X_agg)
+        [[1 2]
+         [3 4]]
+        >>> print(y_mean)
+        [2. 2.]
+        >>> print(y_var)
+        [1. 0.]
+        >>> # Empirical variance might result in nan values, see the example below
+        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y, var_empirical=True)
+        >>> print(y_var)
+        [ 2. nan]
+        >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2]])
+        >>> y = np.array([1, 2, 3, 4, 5])
+        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y, var_empirical=True)
+        >>> print(y_mean)
+        [3. 3.]
+        >>> print(y_var)
+        [4. 2.]
+        >>> # Without sort=True the groups keep their order of first appearance
+        >>> X = np.array([[3, 4], [1, 2], [3, 4]])
+        >>> y = np.array([1, 5, 3])
+        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y)
+        >>> print(X_agg)
+        [[3 4]
+         [1 2]]
+        >>> print(y_mean)
+        [2. 5.]
+        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y, sort=True)
+        >>> print(X_agg)
+        [[1 2]
+         [3 4]]
+        >>> print(y_mean)
+        [5. 2.]
+        >>> X_1 = np.ones((2, 3))
+        >>> X = np.concatenate([X_1, 2 * X_1, X_1])
+        >>> y = np.array([3.0, 3.0, 6.0, 6.0, 6.0, 6.0])
+        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y, var_empirical=True)
+        >>> print(X_agg)
+        [[1. 1. 1.]
+         [2. 2. 2.]]
+        >>> print(y_mean)
+        [4.5 6. ]
+        >>> print(y_var)
+        [3. 0.]
+    """
+    X = np.asarray(X)
+    y = np.asarray(y)
+
+    if X.ndim != 2 or y.ndim != 1 or X.shape[0] != y.shape[0]:
+        raise ValueError("Invalid input shapes")
+
+    if X.shape[0] == 0:
+        return np.empty((0, X.shape[1])), np.array([]), np.array([])
+
+    # np.unique numbers the groups in lexicographic row order: first_idx[g] is
+    # the first row of X in group g and group_of_row[i] is the group of row i.
+    _, first_idx, group_of_row = np.unique(X, axis=0, return_index=True, return_inverse=True)
+    # Flatten: the inverse is not 1-D on every NumPy 2.x release when axis is set.
+    group_of_row = group_of_row.reshape(-1)
+
+    if not sort:
+        # Renumber the groups by first appearance so the input order is kept.
+        order = np.argsort(first_idx)
+        new_label = np.empty_like(order)
+        new_label[order] = np.arange(order.size)
+        first_idx = first_idx[order]
+        group_of_row = new_label[group_of_row]
+
+    X_agg = X[first_idx]
+
+    # Per-group mean and sum of squared deviations in linear time
+    counts = np.bincount(group_of_row)
+    y_mean = np.bincount(group_of_row, weights=y) / counts
+    sq_dev = np.bincount(group_of_row, weights=(y - y_mean[group_of_row]) ** 2)
+    if var_empirical:
+        # ddof=1 is undefined for singleton groups; those entries stay nan
+        y_var = np.full(counts.shape, np.nan)
+        np.divide(sq_dev, counts - 1, out=y_var, where=counts > 1)
+    else:
+        y_var = sq_dev / counts
+
+    return X_agg, y_mean, y_var
+
+
 def get_ranks(x):
     """
     Returns a numpy array containing ranks of numbers within an input numpy array x.

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.29.25"`
	`10`	`+version = "0.29.26"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`