Skip to content

Commit cb44ac4

Browse files
0.26.2 kmeans partitions
1 parent e43807d commit cb44ac4

3 files changed

Lines changed: 429 additions & 100 deletions

File tree

notebooks/00_spotPython_tests.ipynb

Lines changed: 112 additions & 99 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.26.1"
10+
version = "0.26.2"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/gp/treegp.py

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
import numpy as np
2+
from sklearn.base import BaseEstimator, RegressorMixin
3+
from sklearn.cluster import KMeans
4+
from sklearn.neighbors import NearestNeighbors
5+
from spotpython.gp.gp_sep import GPsep
6+
import copy
7+
8+
9+
class TreeGP(BaseEstimator, RegressorMixin):
10+
"""A tree-based Gaussian Process model that partitions the input space
11+
and fits local GP models on each partition.
12+
13+
This model divides the input space using clustering, trains a separate
14+
GPsep model on each cluster, and combines their predictions to cover
15+
the whole input space.
16+
17+
Attributes:
18+
n_clusters: Number of clusters to partition the input space.
19+
min_points: Minimum number of points required in each cluster.
20+
weighting: Method for combining predictions ('hard' or 'distance').
21+
distance_power: Power parameter for distance-based weighting.
22+
gp_params: Parameters passed to each local GPsep model.
23+
cluster_model: The clustering algorithm used for partitioning.
24+
local_models: List of GPsep models for each partition.
25+
cluster_centers: Centers of each cluster.
26+
X_bounds: The bounds of the input space from training data.
27+
y_bounds: The bounds of the output space from training data.
28+
"""
29+
30+
def __init__(self, n_clusters=3, min_points=10, weighting="distance", distance_power=2, gp_params=None):
31+
"""Initialize the TreeGP model with partitioning and GP parameters.
32+
33+
Args:
34+
n_clusters: Number of clusters to create. Defaults to 3.
35+
min_points: Minimum points required per cluster. If a cluster has
36+
fewer points, it will be merged with nearest cluster.
37+
Defaults to 10.
38+
weighting: Method for combining predictions ('hard' or 'distance').
39+
Defaults to 'distance'.
40+
distance_power: Power parameter for distance-based weighting.
41+
Higher values make the weighting more local. Defaults to 2.
42+
gp_params: Dictionary of parameters to pass to each GPsep model.
43+
Defaults to None.
44+
"""
45+
self.n_clusters = n_clusters
46+
self.min_points = min_points
47+
self.weighting = weighting
48+
self.distance_power = distance_power
49+
self.gp_params = gp_params or {}
50+
51+
# These will be set during fitting
52+
self.cluster_model = None
53+
self.local_models = []
54+
self.cluster_centers = None
55+
self.X_bounds = None
56+
self.y_bounds = None
57+
self.cluster_indices = None
58+
59+
def get_params(self, deep=True):
60+
"""Get parameters for this estimator.
61+
62+
Args:
63+
deep: If True, will return the parameters for this estimator and
64+
contained subobjects that are estimators. Defaults to True.
65+
66+
Returns:
67+
dict: Parameter names mapped to their values.
68+
"""
69+
params = {
70+
"n_clusters": self.n_clusters,
71+
"min_points": self.min_points,
72+
"weighting": self.weighting,
73+
"distance_power": self.distance_power,
74+
"gp_params": self.gp_params,
75+
}
76+
77+
if deep and self.gp_params:
78+
gp_params_copy = copy.deepcopy(self.gp_params)
79+
params["gp_params"] = gp_params_copy
80+
81+
return params
82+
83+
def set_params(self, **parameters):
84+
"""Set the parameters of this estimator.
85+
86+
Args:
87+
**parameters: Estimator parameters as keyword arguments.
88+
89+
Returns:
90+
self: Estimator instance.
91+
"""
92+
for parameter, value in parameters.items():
93+
if parameter == "gp_params" and isinstance(value, dict):
94+
self.gp_params = copy.deepcopy(value)
95+
else:
96+
setattr(self, parameter, value)
97+
return self
98+
99+
def fit(self, X, y):
100+
"""Fit the TreeGP model by partitioning the input space and
101+
fitting local GP models to each partition.
102+
103+
Args:
104+
X: Training input samples of shape (n_samples, n_features).
105+
y: Target values of shape (n_samples,).
106+
107+
Returns:
108+
self: Returns self.
109+
110+
Raises:
111+
ValueError: If the number of samples is less than n_clusters or min_points.
112+
"""
113+
# Convert pandas objects to numpy arrays if necessary
114+
if hasattr(X, "to_numpy"):
115+
X = X.to_numpy()
116+
if hasattr(y, "to_numpy"):
117+
y = y.to_numpy()
118+
119+
y = y.reshape(-1)
120+
n_samples = X.shape[0]
121+
122+
if n_samples < max(self.n_clusters, self.min_points):
123+
raise ValueError(f"Number of samples ({n_samples}) must be at least " f"max(n_clusters, min_points) = {max(self.n_clusters, self.min_points)}")
124+
125+
# Store data bounds for later normalization
126+
self.X_bounds = (np.min(X, axis=0), np.max(X, axis=0))
127+
self.y_bounds = (np.min(y), np.max(y))
128+
129+
# Partition the input space using KMeans clustering
130+
self.cluster_model = KMeans(n_clusters=self.n_clusters, random_state=42, n_init="auto")
131+
132+
cluster_labels = self.cluster_model.fit_predict(X)
133+
self.cluster_centers = self.cluster_model.cluster_centers_
134+
135+
# Handle clusters with too few points by merging them
136+
valid_clusters = []
137+
clusters_to_merge = []
138+
139+
for i in range(self.n_clusters):
140+
cluster_size = np.sum(cluster_labels == i)
141+
if cluster_size >= self.min_points:
142+
valid_clusters.append(i)
143+
else:
144+
clusters_to_merge.append(i)
145+
146+
# If there are clusters to merge, reassign their points to nearest valid cluster
147+
if clusters_to_merge:
148+
if not valid_clusters:
149+
# If no valid clusters, just use a single GPsep model
150+
print("No valid clusters with enough points. Falling back to single model.")
151+
self.n_clusters = 1
152+
self.cluster_centers = np.mean(X, axis=0, keepdims=True)
153+
cluster_labels = np.zeros(n_samples, dtype=int)
154+
else:
155+
# Reassign points from small clusters to nearest valid cluster
156+
nn = NearestNeighbors(n_neighbors=1).fit(self.cluster_centers[valid_clusters])
157+
158+
for i in clusters_to_merge:
159+
# Find points in this cluster
160+
mask = cluster_labels == i
161+
if np.any(mask):
162+
# Find nearest valid cluster for each point
163+
_, indices = nn.kneighbors(X[mask])
164+
# Reassign to nearest valid cluster
165+
cluster_labels[mask] = [valid_clusters[idx[0]] for idx in indices]
166+
167+
# Save cluster assignments
168+
self.cluster_indices = [np.where(cluster_labels == i)[0] for i in range(self.n_clusters)]
169+
170+
# Fit local GPsep models for each cluster
171+
self.local_models = []
172+
173+
for i in range(self.n_clusters):
174+
idx = self.cluster_indices[i]
175+
if len(idx) > 0: # Make sure we have points in this cluster
176+
# Create and fit a GPsep model for this cluster
177+
gp = GPsep(**self.gp_params)
178+
gp.fit(X[idx], y[idx])
179+
self.local_models.append(gp)
180+
else:
181+
# This shouldn't happen after our merging step, but just in case
182+
self.local_models.append(None)
183+
184+
return self
185+
186+
def predict(self, X, return_std=False, **kwargs):
187+
"""Predict using the TreeGP model.
188+
189+
Args:
190+
X: Query points of shape (n_samples, n_features).
191+
return_std: Whether to return standard deviations. Defaults to False.
192+
**kwargs: Additional keyword arguments passed to the local models' predict method.
193+
194+
Returns:
195+
If return_std is False, returns predicted values.
196+
If return_std is True, returns (predicted_values, standard_deviations).
197+
"""
198+
if hasattr(X, "to_numpy"):
199+
X = X.to_numpy()
200+
201+
n_samples = X.shape[0]
202+
y_pred = np.zeros(n_samples)
203+
204+
if return_std:
205+
y_std = np.zeros(n_samples)
206+
207+
# Get distances to cluster centers for weighting
208+
distances = np.zeros((n_samples, self.n_clusters))
209+
210+
for i in range(self.n_clusters):
211+
# Calculate Euclidean distance to each cluster center
212+
diff = X - self.cluster_centers[i]
213+
distances[:, i] = np.sqrt(np.sum(diff**2, axis=1))
214+
215+
if self.weighting == "hard":
216+
# Hard assignment: use closest cluster's model for prediction
217+
cluster_assignments = np.argmin(distances, axis=1)
218+
219+
for i in range(self.n_clusters):
220+
mask = cluster_assignments == i
221+
if np.any(mask) and self.local_models[i] is not None:
222+
if return_std:
223+
# Use return_full=True to get the full prediction dictionary
224+
preds = self.local_models[i].predict(X[mask], return_full=True)
225+
y_pred[mask] = preds["mean"]
226+
227+
# Extract standard deviations from the covariance matrix
228+
if "Sigma" in preds:
229+
# If full covariance matrix is returned, extract diagonal
230+
if len(preds["Sigma"].shape) == 2:
231+
y_std[mask] = np.sqrt(np.diag(preds["Sigma"]))
232+
else:
233+
# If already diagonal elements, just use directly
234+
y_std[mask] = np.sqrt(preds["Sigma"])
235+
elif "s2" in preds:
236+
# Some implementations return variances as "s2"
237+
y_std[mask] = np.sqrt(preds["s2"])
238+
else:
239+
y_pred[mask] = self.local_models[i].predict(X[mask], return_full=False)
240+
241+
else: # distance weighting
242+
# Compute weights from distances (closer = higher weight)
243+
weights = 1.0 / np.maximum(distances**self.distance_power, 1e-10)
244+
# Normalize weights
245+
weights_sum = np.sum(weights, axis=1, keepdims=True)
246+
weights = weights / np.maximum(weights_sum, 1e-10)
247+
248+
# Get predictions from each model and combine with weights
249+
all_preds = np.zeros((n_samples, self.n_clusters))
250+
all_var = np.zeros((n_samples, self.n_clusters)) # Store variances instead of std devs
251+
252+
for i in range(self.n_clusters):
253+
if self.local_models[i] is not None:
254+
if return_std:
255+
# Use return_full=True to get the full prediction dictionary
256+
preds = self.local_models[i].predict(X, return_full=True)
257+
all_preds[:, i] = preds["mean"]
258+
259+
# Extract variances from the covariance matrix
260+
if "Sigma" in preds:
261+
# If full covariance matrix is returned, extract diagonal
262+
if len(preds["Sigma"].shape) == 2:
263+
all_var[:, i] = np.diag(preds["Sigma"])
264+
else:
265+
# If already diagonal elements, use directly
266+
all_var[:, i] = preds["Sigma"]
267+
elif "s2" in preds:
268+
all_var[:, i] = preds["s2"]
269+
else:
270+
all_preds[:, i] = self.local_models[i].predict(X)
271+
272+
# Combine predictions using weights
273+
y_pred = np.sum(all_preds * weights, axis=1)
274+
275+
if return_std:
276+
# Combine variances (with appropriate weighting)
277+
y_var = np.sum(all_var * (weights**2), axis=1)
278+
y_std = np.sqrt(y_var)
279+
280+
if return_std:
281+
return y_pred, y_std
282+
else:
283+
return y_pred
284+
285+
def score(self, X, y, **kwargs):
286+
"""Calculate the coefficient of determination R^2 of the prediction.
287+
288+
Args:
289+
X: Test input samples.
290+
y: True values for X.
291+
**kwargs: Additional arguments passed to predict().
292+
293+
Returns:
294+
float: R^2 score.
295+
"""
296+
from sklearn.metrics import r2_score
297+
298+
return r2_score(y, self.predict(X, **kwargs))
299+
300+
301+
# Helper function to create and fit a TreeGP model in a single call
302+
def newTreeGP(X, y, n_clusters=3, **kwargs):
303+
"""
304+
Create and fit a TreeGP model in a single function call.
305+
306+
Args:
307+
X: Training input samples.
308+
y: Target values.
309+
n_clusters: Number of clusters to partition the data. Defaults to 3.
310+
**kwargs: Additional parameters passed to TreeGP constructor.
311+
312+
Returns:
313+
TreeGP: The fitted TreeGP model.
314+
"""
315+
model = TreeGP(n_clusters=n_clusters, **kwargs)
316+
return model.fit(X, y)

0 commit comments

Comments
 (0)