Skip to content

Commit 8a177fe

Browse files
0.16.14
updated kriging.py
1 parent 7203dad commit 8a177fe

4 files changed

Lines changed: 68 additions & 35 deletions

File tree

RELEASE_NOTES.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
spotpython-0.16.14:
2+
3+
- spotpython.utils.aggregate.py: aggregate_mean_var() updated:
4+
1. Imports: Ensured that necessary imports (`numpy` and `pandas`) are included.
5+
2. Input Validation: Added checks to ensure that `X` and `y` are `numpy` arrays, `X` is 2D, `y` is 1D, and the number of rows in `X` matches the length of `y`, helping prevent common input errors.
6+
3. Use of Pandas Group Functionality: Improved grouping and aggregation by explicitly handling the multi-index columns that result from `agg`.
7+
4. Return Type Consistency: Ensured that the description in the documentation matches the actual operation and expected shapes for return arrays `y_mean` and `y_var`.
8+
9+
- kriging.py: method initialize_variables() updated:
10+
1. Input Validation: Added checks to ensure correct shapes for `nat_X` and `nat_y`, which prevent unexpected errors during execution.
11+
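2. Tuple Unpacking Instead of Separate Assignments: `n` and `k` are now unpacked directly from `nat_X.shape`, and the aggregated mean of `y` is unpacked from the return value of `aggregate_mean_var()` instead of being selected by index, for clarity.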
12+
13+
- kriging.py: method set_variable_types() updated:
14+
1. Corrected Initialization of `var_type`: If the length of `var_type` is less than `k`, all `k` variable types are now set to `'num'` consistently, rather than repeating the given `var_type` list `k` times.
15+
2. NumPy Vectorization: Replaced the `map`/`lambda` constructions with NumPy vectorized comparisons, which are more efficient and readable when handling large data.
16+
3. Simplified Ordered Mask Logic: Used NumPy's `isin` function, which directly checks membership in a list of types relevant for `ordered_mask`.
17+
18+
119
spotpython-0.16.12:
220

321
- kriging.py: method extract_from_bounds() updated.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.16.13"
10+
version = "0.16.14"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/build/kriging.py

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import copy
22
from math import erf
33
import matplotlib.pyplot as plt
4-
from numpy import max, min, var
4+
from numpy import min, var
55
from numpy import sqrt
66
from numpy import exp
77
from numpy import array
@@ -436,13 +436,11 @@ def optimize_model(self) -> Union[List[float], Tuple[float]]:
436436
def update_log(self) -> None:
437437
"""
438438
Update the log with the current values of negLnLike, theta, p, and Lambda.
439-
440439
This method appends the current values of negLnLike, theta, p (if optim_p is True),
441440
and Lambda (if noise is True)
442441
to their respective lists in the log dictionary.
443442
It also updates the log_length attribute with the current length
444443
of the negLnLike list in the log.
445-
446444
If spot_writer is not None, this method also writes the current values of
447445
negLnLike, theta, p (if optim_p is True),
448446
and Lambda (if noise is True) to the spot_writer object.
@@ -505,7 +503,8 @@ def fit(self, nat_X: np.ndarray, nat_y: np.ndarray) -> object:
505503
Fits the hyperparameters (`theta`, `p`, `Lambda`) of the Kriging model.
506504
The function computes the following internal values:
507505
1. `theta`, `p`, and `Lambda` values via optimization of the function `fun_likelihood()`.
508-
2. Correlation matrix `Psi` via `rebuildPsi()`.
506+
2. Correlation matrix `Psi` via `buildPsi()`.
507+
3. U matrix via `buildU()`.
509508
510509
Args:
511510
self (object): The Kriging object.
@@ -591,25 +590,33 @@ def initialize_variables(self, nat_X: np.ndarray, nat_y: np.ndarray) -> None:
591590
S.nat_y: [1 2]
592591
593592
"""
593+
# Validate input dimensions
594+
if nat_X.ndim != 2 or nat_y.ndim != 1:
595+
raise ValueError("nat_X must be a 2D array and nat_y must be a 1D array.")
596+
if nat_X.shape[0] != nat_y.shape[0]:
597+
raise ValueError("The number of samples in nat_X and nat_y must be equal.")
598+
599+
# Initialize instance variables
594600
self.nat_X = copy.deepcopy(nat_X)
595601
self.nat_y = copy.deepcopy(nat_y)
596-
self.n = self.nat_X.shape[0]
597-
self.k = self.nat_X.shape[1]
602+
self.n, self.k = self.nat_X.shape
598603

599-
self.min_X = min(self.nat_X, axis=0)
600-
self.max_X = max(self.nat_X, axis=0)
604+
# Calculate and store min and max of X
605+
self.min_X = np.min(self.nat_X, axis=0)
606+
self.max_X = np.max(self.nat_X, axis=0)
601607

602-
Z = aggregate_mean_var(X=self.nat_X, y=self.nat_y)
603-
# aggregated y values:
604-
mu = Z[1]
605-
self.aggregated_mean_y = np.copy(mu)
608+
# Calculate the aggregated mean of y
609+
_, aggregated_mean_y, _ = aggregate_mean_var(X=self.nat_X, y=self.nat_y)
610+
self.aggregated_mean_y = np.copy(aggregated_mean_y)
611+
612+
# Logging the initialized variables
606613
logger.debug("In initialize_variables(): self.nat_X: %s", self.nat_X)
607614
logger.debug("In initialize_variables(): self.nat_y: %s", self.nat_y)
608615
logger.debug("In initialize_variables(): self.aggregated_mean_y: %s", self.aggregated_mean_y)
609616
logger.debug("In initialize_variables(): self.min_X: %s", self.min_X)
610617
logger.debug("In initialize_variables(): self.max_X: %s", self.max_X)
611-
logger.debug("In initialize_variables(): self.n: %s", self.n)
612-
logger.debug("In initialize_variables(): self.k: %s", self.k)
618+
logger.debug("In initialize_variables(): self.n: %d", self.n)
619+
logger.debug("In initialize_variables(): self.k: %d", self.k)
613620

614621
def set_variable_types(self) -> None:
615622
"""
@@ -645,16 +652,18 @@ def set_variable_types(self) -> None:
645652
"""
646653
logger.debug("In set_variable_types(): self.k: %s", self.k)
647654
logger.debug("In set_variable_types(): self.var_type: %s", self.var_type)
648-
# assume all variable types are "num" if "num" is
649-
# specified once:
655+
656+
# Ensure var_type has appropriate length by defaulting to 'num'
650657
if len(self.var_type) < self.k:
651-
self.var_type = self.var_type * self.k
658+
self.var_type = ['num'] * self.k # Corrected to fill with 'num' instead of duplicating
652659
logger.warning("In set_variable_types(): All variable types forced to 'num'.")
653660
logger.debug("In set_variable_types(): self.var_type: %s", self.var_type)
654-
self.num_mask = np.array(list(map(lambda x: x == "num", self.var_type)))
655-
self.factor_mask = np.array(list(map(lambda x: x == "factor", self.var_type)))
656-
self.int_mask = np.array(list(map(lambda x: x == "int", self.var_type)))
657-
self.ordered_mask = np.array(list(map(lambda x: x == "int" or x == "num" or x == "float", self.var_type)))
661+
# Create masks for each type using numpy vectorized operations
662+
var_type_array = np.array(self.var_type)
663+
self.num_mask = (var_type_array == "num")
664+
self.factor_mask = (var_type_array == "factor")
665+
self.int_mask = (var_type_array == "int")
666+
self.ordered_mask = np.isin(var_type_array, ["int", "num", "float"])
658667
logger.debug("In set_variable_types(): self.num_mask: %s", self.num_mask)
659668
logger.debug("In set_variable_types(): self.factor_mask: %s", self.factor_mask)
660669
logger.debug("In set_variable_types(): self.int_mask: %s", self.int_mask)

src/spotpython/utils/aggregate.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ def aggregate_mean_var(X, y, sort=False) -> (np.ndarray, np.ndarray, np.ndarray)
2121
aggregated (variance per group) `y` values, shape `(1,)`, if `m` duplicates in `X`.
2222
2323
Examples:
24-
>>> X = np.array([[1, 2], [3, 4], [1, 2]])
24+
>>> from spotpython.utils.aggregate import aggregate_mean_var
25+
X = np.array([[1, 2], [3, 4], [1, 2]])
2526
y = np.array([1, 2, 3])
2627
X_agg, y_mean, y_var = aggregate_mean_var(X, y)
2728
print(X_agg)
@@ -32,23 +33,28 @@ def aggregate_mean_var(X, y, sort=False) -> (np.ndarray, np.ndarray, np.ndarray)
3233
print(y_var)
3334
[1. 0.]
3435
"""
35-
# Create a DataFrame from X and y
36+
if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
37+
raise TypeError("X and y must be numpy arrays.")
38+
39+
if X.ndim != 2 or y.ndim != 1:
40+
raise ValueError("X must be a 2D array and y must be a 1D array.")
41+
42+
if X.shape[0] != y.shape[0]:
43+
raise ValueError("The number of rows in X must match the length of y.")
44+
45+
# Create a DataFrame from X with y as the group target
3646
df = pd.DataFrame(X)
3747
df["y"] = y
3848

39-
# Group by all columns except 'y' and calculate the mean and variance of 'y' for each group
40-
grouped = df.groupby(list(df.columns.difference(["y"])), as_index=False, sort=sort)
41-
df_mean = grouped.mean()
42-
df_var = grouped.var()
49+
# Group by all X columns, calculating the mean and variance of y for each group
50+
grouped = df.groupby(list(df.columns[:-1]), as_index=False, sort=sort).agg({"y": ["mean", "var"]})
4351

44-
# Convert the resulting DataFrames to numpy arrays
45-
mean_array = df_mean.to_numpy()
46-
var_array = df_var.to_numpy()
52+
# Extract mean and variance results from the multi-index DataFrame columns
53+
y_mean = grouped[("y", "mean")].to_numpy()
54+
y_var = grouped[("y", "var")].to_numpy()
4755

48-
# Split the resulting arrays into separate arrays for X and y
49-
X_agg = np.delete(mean_array, -1, 1)
50-
y_mean = mean_array[:, -1]
51-
y_var = var_array[:, -1]
56+
# Extract the unique X values
57+
X_agg = grouped.iloc[:, :-2].to_numpy()
5258

5359
return X_agg, y_mean, y_var
5460

0 commit comments

Comments
 (0)