0.16.14

bartzbeielstein · bartzbeielstein · commit 8a177fe8d670 · 2024-11-07T17:48:42.000+01:00
updated kriging.py
diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt
@@ -1,3 +1,21 @@
+spotpython-0.16.14:
+
+- spotpython.utils.aggregate.py: aggregate_mean_var() updated:
+    1. Imports: Ensured that necessary imports (`numpy` and `pandas`) are included.
+    2. Input Validation: Added checks to ensure that `X` and `y` are `numpy` arrays, that `X` is 2D and `y` is 1D, and that the number of rows in `X` matches the length of `y`, helping prevent common input errors.
+    3. Use of Pandas Group Functionality: Improved grouping and aggregation by computing the mean and variance of `y` in a single `agg` call and explicitly handling the resulting multi-index columns.
+    4. Return Type Consistency: Ensured that the description in the documentation matches the actual operation and expected shapes for return arrays `y_mean` and `y_var`.
+
+- initialize_variables:
+    1. Input Validation: Added checks to ensure correct shapes for `nat_X` and `nat_y`, which prevent unexpected errors during execution.
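+    2. Tuple Unpacking Instead of Separate Assignments: `n` and `k` are now unpacked directly from `nat_X.shape`, and the result of `aggregate_mean_var()` is unpacked into named values rather than accessed by index, for clarity.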
+
+- set_variable_types:
+    1. Corrected Initialization of `var_type`: If the length of `var_type` is less than `k`, all `k` variable types are now consistently set to `'num'`, rather than repeating the given list `k` times.
+    2. NumPy Vectorization: Moved from `map` with `lambda` functions to NumPy vectorized comparisons, which are more efficient and readable when handling large data.
+    3. Simplified Ordered Mask Logic: Used NumPy's `isin` function, which directly checks membership in a list of types relevant for `ordered_mask`.
+
+
 spotpython-0.16.12:
 
 - kriging.py: method extract_from_bounds() updated.
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.16.13"
+version = "0.16.14"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotpython/build/kriging.py b/src/spotpython/build/kriging.py
@@ -1,7 +1,7 @@
 import copy
 from math import erf
 import matplotlib.pyplot as plt
-from numpy import max, min, var
+from numpy import min, var
 from numpy import sqrt
 from numpy import exp
 from numpy import array
@@ -436,13 +436,11 @@ def optimize_model(self) -> Union[List[float], Tuple[float]]:
     def update_log(self) -> None:
         """
         Update the log with the current values of negLnLike, theta, p, and Lambda.
-
         This method appends the current values of negLnLike, theta, p (if optim_p is True),
         and Lambda (if noise is True)
         to their respective lists in the log dictionary.
         It also updates the log_length attribute with the current length
         of the negLnLike list in the log.
-
         If spot_writer is not None, this method also writes the current values of
         negLnLike, theta, p (if optim_p is True),
         and Lambda (if noise is True) to the spot_writer object.
@@ -505,7 +503,8 @@ def fit(self, nat_X: np.ndarray, nat_y: np.ndarray) -> object:
         Fits the hyperparameters (`theta`, `p`, `Lambda`) of the Kriging model.
         The function computes the following internal values:
         1. `theta`, `p`, and `Lambda` values via optimization of the function `fun_likelihood()`.
-        2. Correlation matrix `Psi` via `rebuildPsi()`.
+        2. Correlation matrix `Psi` via `buildPsi()`.
+        3. U matrix via `buildU()`.
 
         Args:
             self (object): The Kriging object.
@@ -591,25 +590,33 @@ def initialize_variables(self, nat_X: np.ndarray, nat_y: np.ndarray) -> None:
                 S.nat_y: [1 2]
 
         """
+        # Validate input dimensions
+        if nat_X.ndim != 2 or nat_y.ndim != 1:
+            raise ValueError("nat_X must be a 2D array and nat_y must be a 1D array.")
+        if nat_X.shape[0] != nat_y.shape[0]:
+            raise ValueError("The number of samples in nat_X and nat_y must be equal.")
+
+        # Initialize instance variables
         self.nat_X = copy.deepcopy(nat_X)
         self.nat_y = copy.deepcopy(nat_y)
-        self.n = self.nat_X.shape[0]
-        self.k = self.nat_X.shape[1]
+        self.n, self.k = self.nat_X.shape
 
-        self.min_X = min(self.nat_X, axis=0)
-        self.max_X = max(self.nat_X, axis=0)
+        # Calculate and store min and max of X
+        self.min_X = np.min(self.nat_X, axis=0)
+        self.max_X = np.max(self.nat_X, axis=0)
 
-        Z = aggregate_mean_var(X=self.nat_X, y=self.nat_y)
-        # aggregated y values:
-        mu = Z[1]
-        self.aggregated_mean_y = np.copy(mu)
+        # Calculate the aggregated mean of y
+        _, aggregated_mean_y, _ = aggregate_mean_var(X=self.nat_X, y=self.nat_y)
+        self.aggregated_mean_y = np.copy(aggregated_mean_y)
+
+        # Logging the initialized variables
         logger.debug("In initialize_variables(): self.nat_X: %s", self.nat_X)
         logger.debug("In initialize_variables(): self.nat_y: %s", self.nat_y)
         logger.debug("In initialize_variables(): self.aggregated_mean_y: %s", self.aggregated_mean_y)
         logger.debug("In initialize_variables(): self.min_X: %s", self.min_X)
         logger.debug("In initialize_variables(): self.max_X: %s", self.max_X)
-        logger.debug("In initialize_variables(): self.n: %s", self.n)
-        logger.debug("In initialize_variables(): self.k: %s", self.k)
+        logger.debug("In initialize_variables(): self.n: %d", self.n)
+        logger.debug("In initialize_variables(): self.k: %d", self.k)
 
     def set_variable_types(self) -> None:
         """
@@ -645,16 +652,18 @@ def set_variable_types(self) -> None:
         """
         logger.debug("In set_variable_types(): self.k: %s", self.k)
         logger.debug("In set_variable_types(): self.var_type: %s", self.var_type)
-        # assume all variable types are "num" if "num" is
-        # specified once:
+
+        # Ensure var_type has appropriate length by defaulting to 'num'
         if len(self.var_type) < self.k:
-            self.var_type = self.var_type * self.k
+            self.var_type = ['num'] * self.k  # Corrected to fill with 'num' instead of duplicating
             logger.warning("In set_variable_types(): All variable types forced to 'num'.")
             logger.debug("In set_variable_types(): self.var_type: %s", self.var_type)
-        self.num_mask = np.array(list(map(lambda x: x == "num", self.var_type)))
-        self.factor_mask = np.array(list(map(lambda x: x == "factor", self.var_type)))
-        self.int_mask = np.array(list(map(lambda x: x == "int", self.var_type)))
-        self.ordered_mask = np.array(list(map(lambda x: x == "int" or x == "num" or x == "float", self.var_type)))
+        # Create masks for each type using numpy vectorized operations
+        var_type_array = np.array(self.var_type)
+        self.num_mask = (var_type_array == "num")
+        self.factor_mask = (var_type_array == "factor")
+        self.int_mask = (var_type_array == "int")
+        self.ordered_mask = np.isin(var_type_array, ["int", "num", "float"])
         logger.debug("In set_variable_types(): self.num_mask: %s", self.num_mask)
         logger.debug("In set_variable_types(): self.factor_mask: %s", self.factor_mask)
         logger.debug("In set_variable_types(): self.int_mask: %s", self.int_mask)
diff --git a/src/spotpython/utils/aggregate.py b/src/spotpython/utils/aggregate.py
@@ -21,7 +21,8 @@ def aggregate_mean_var(X, y, sort=False) -> (np.ndarray, np.ndarray, np.ndarray)
             aggregated (variance per group) `y` values, shape `(1,)`, if `m` duplicates in `X`.
 
     Examples:
-        >>> X = np.array([[1, 2], [3, 4], [1, 2]])
+        >>> from spotpython.utils.aggregate import aggregate_mean_var
+            X = np.array([[1, 2], [3, 4], [1, 2]])
             y = np.array([1, 2, 3])
             X_agg, y_mean, y_var = aggregate_mean_var(X, y)
             print(X_agg)
@@ -32,23 +33,28 @@ def aggregate_mean_var(X, y, sort=False) -> (np.ndarray, np.ndarray, np.ndarray)
             print(y_var)
             [1. 0.]
     """
-    # Create a DataFrame from X and y
+    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
+        raise TypeError("X and y must be numpy arrays.")
+
+    if X.ndim != 2 or y.ndim != 1:
+        raise ValueError("X must be a 2D array and y must be a 1D array.")
+
+    if X.shape[0] != y.shape[0]:
+        raise ValueError("The number of rows in X must match the length of y.")
+
+    # Create a DataFrame from X with y as the group target
     df = pd.DataFrame(X)
     df["y"] = y
 
-    # Group by all columns except 'y' and calculate the mean and variance of 'y' for each group
-    grouped = df.groupby(list(df.columns.difference(["y"])), as_index=False, sort=sort)
-    df_mean = grouped.mean()
-    df_var = grouped.var()
+    # Group by all X columns, calculating the mean and variance of y for each group
+    grouped = df.groupby(list(df.columns[:-1]), as_index=False, sort=sort).agg({"y": ["mean", "var"]})
 
-    # Convert the resulting DataFrames to numpy arrays
-    mean_array = df_mean.to_numpy()
-    var_array = df_var.to_numpy()
+    # Extract mean and variance results from the multi-index DataFrame columns
+    y_mean = grouped[("y", "mean")].to_numpy()
+    y_var = grouped[("y", "var")].to_numpy()
 
-    # Split the resulting arrays into separate arrays for X and y
-    X_agg = np.delete(mean_array, -1, 1)
-    y_mean = mean_array[:, -1]
-    y_var = var_array[:, -1]
+    # Extract the unique X values
+    X_agg = grouped.iloc[:, :-2].to_numpy()
 
     return X_agg, y_mean, y_var
 

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.16.13"`
	`10`	`+version = "0.16.14"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`