0.12.9

bartzbeielstein · bartzbeielstein · commit 43218bfd007b · 2024-03-17T00:28:20.000+01:00
max_surrogate_points
diff --git a/notebooks/00_spotPython_tests.ipynb b/notebooks/00_spotPython_tests.ipynb
@@ -3558,6 +3558,74 @@
       "outputs": [],
       "source": []
     },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Subset Select"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import numpy as np\n",
+        "from sklearn.cluster import KMeans\n",
+        "\n",
+        "def select_distant_points(X, y, k):\n",
+        "    \"\"\"\n",
+        "    Selects k points that are distant from each other using a clustering approach.\n",
+        "    \n",
+        "    :param X: np.array of shape (n, d), with n points in d-dimensional space.\n",
+        "    :param y: np.array of length n, with values corresponding to each point in X.\n",
+        "    :param k: The number of distant points to select.\n",
+        "    :return: Selected k points from X and their corresponding y values.\n",
+        "    \"\"\"\n",
+        "    # Perform k-means clustering to find k clusters\n",
+        "    kmeans = KMeans(n_clusters=k, random_state=0, n_init=\"auto\").fit(X)\n",
+        "    \n",
+        "    # Find the closest point in X to each cluster center\n",
+        "    selected_points = np.array([X[np.argmin(np.linalg.norm(X - center, axis=1))] for center in kmeans.cluster_centers_])\n",
+        "    \n",
+        "    # Find indices of the selected points in the original X array\n",
+        "    indices = np.array([np.where(np.all(X==point, axis=1))[0][0] for point in selected_points])\n",
+        "    \n",
+        "    # Select the corresponding y values\n",
+        "    selected_y = y[indices]\n",
+        "    \n",
+        "    return selected_points, selected_y\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Selected Points: [[0.77482755 0.11776665]\n",
+            " [0.1600672  0.5466571 ]\n",
+            " [0.87752562 0.66913902]\n",
+            " [0.37216814 0.33013892]\n",
+            " [0.37977024 0.83643457]]\n",
+            "Corresponding y values: [0.79945132 0.63677214 0.17382713 0.97910053 0.26962361]\n"
+          ]
+        }
+      ],
+      "source": [
+        "X = np.random.rand(100, 2)  # Generate some random points\n",
+        "y = np.random.rand(100)     # Random corresponding y values\n",
+        "k = 5\n",
+        "\n",
+        "selected_points, selected_y = select_distant_points(X, y, k)\n",
+        "print(\"Selected Points:\", selected_points)\n",
+        "print(\"Corresponding y values:\", selected_y)"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotPython"
-version = "0.12.8"
+version = "0.12.9"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/spot/spot.py b/src/spotPython/spot/spot.py
@@ -22,7 +22,7 @@
 from numpy import min, max
 from spotPython.utils.init import fun_control_init, optimizer_control_init, surrogate_control_init, design_control_init
 from spotPython.utils.compare import selectNew
-from spotPython.utils.aggregate import aggregate_mean_var
+from spotPython.utils.aggregate import aggregate_mean_var, select_distant_points
 from spotPython.utils.repair import remove_nan
 from spotPython.budget.ocba import get_ocba_X
 import logging
@@ -247,6 +247,7 @@ def __init__(
         self.show_progress = self.fun_control["show_progress"]
         self.infill_criterion = self.fun_control["infill_criterion"]
         self.n_points = self.fun_control["n_points"]
+        self.max_surrogate_points = self.fun_control["max_surrogate_points"]
 
         # if the key "spot_writer" is not in the dictionary fun_control,
         # set self.spot_writer to None else to the value of the key "spot_writer"
@@ -912,8 +913,15 @@ def fit_surrogate(self) -> None:
         logger.debug("In fit_surrogate(): self.y: %s", self.y)
         logger.debug("In fit_surrogate(): self.X.shape: %s", self.X.shape)
         logger.debug("In fit_surrogate(): self.y.shape: %s", self.y.shape)
-        if self.X.shape[0] == self.y.shape[0]:
-            self.surrogate.fit(self.X, self.y)
+        X_points = self.X.shape[0]
+        y_points = self.y.shape[0]
+        if X_points == y_points:
+            if X_points > self.max_surrogate_points:
+                X_S, y_S = select_distant_points(X=self.X, y=self.y, k=self.max_surrogate_points)
+            else:
+                X_S = self.X
+                y_S = self.y
+            self.surrogate.fit(X_S, y_S)
         else:
             logger.warning("X and y have different sizes. Surrogate not fitted.")
         if self.show_models:
diff --git a/src/spotPython/utils/aggregate.py b/src/spotPython/utils/aggregate.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+from sklearn.cluster import KMeans
 
 
 def aggregate_mean_var(X, y, sort=False) -> (np.ndarray, np.ndarray, np.ndarray):
@@ -72,3 +73,42 @@ def get_ranks(x):
     ranks = np.empty_like(ts)
     ranks[ts] = np.arange(len(x))
     return ranks
+
+
+def select_distant_points(X, y, k):
+    """
+    Selects k points that are distant from each other using a clustering approach.
+
+    Args:
+        X (numpy.ndarray): points, shape `(n, d)` (`n` points in `d` dimensions).
+        y (numpy.ndarray): values, shape `(n,)`.
+        k (int): number of points to select.
+
+    Returns:
+        (numpy.ndarray):
+            selected `X` values, shape `(k, d)`.
+        (numpy.ndarray):
+            selected `y` values, shape `(k,)`.
+
+    Examples:
+        >>> from spotPython.utils.aggregate import select_distant_points
+            X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
+            y = np.array([1, 2, 3, 4, 5])
+            selected_points, selected_y = select_distant_points(X, y, 3)
+            print(selected_points)
+            [[1 2]
+            [7 8]
+            [9 10]]
+            print(selected_y)
+            [1 4 5]
+
+    """
+    # Perform k-means clustering to find k clusters
+    kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(X)
+    # Find the closest point in X to each cluster center
+    selected_points = np.array([X[np.argmin(np.linalg.norm(X - center, axis=1))] for center in kmeans.cluster_centers_])
+    # Find indices of the selected points in the original X array
+    indices = np.array([np.where(np.all(X == point, axis=1))[0][0] for point in selected_points])
+    # Select the corresponding y values
+    selected_y = y[indices]
+    return selected_points, selected_y
diff --git a/src/spotPython/utils/init.py b/src/spotPython/utils/init.py
@@ -6,6 +6,7 @@
 import datetime
 from dateutil.tz import tzlocal
 from torch.utils.tensorboard import SummaryWriter
+from math import inf
 
 
 def fun_control_init(
@@ -36,6 +37,7 @@ def fun_control_init(
     log_level=50,
     lower=None,
     max_time=1,
+    max_surrogate_points=inf,
     metric_sklearn=None,
     noise=False,
     n_points=1,
@@ -132,6 +134,8 @@ def fun_control_init(
             lower bound
         max_time (int):
             The maximum time in minutes.
+        max_surrogate_points (int):
+            Maximum number of points used to fit the surrogate. Larger sets are subsampled. Default is inf (no limit).
         metric_sklearn (object):
             The metric object from the scikit-learn library. Default is None.
         noise (bool):
@@ -234,6 +238,7 @@ def fun_control_init(
                 'k_folds': None,
                 'loss_function': None,
                 'lower': None,
+                'max_surrogate_points': 100,
                 'metric_river': None,
                 'metric_sklearn': None,
                 'metric_torch': None,
@@ -333,6 +338,7 @@ def fun_control_init(
         "loss_function": None,
         "lower": lower,
         "max_time": max_time,
+        "max_surrogate_points": max_surrogate_points,
         "metric_river": None,
         "metric_sklearn": metric_sklearn,
         "metric_torch": None,

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotPython"`
`10`		`-version = "0.12.8"`
	`10`	`+version = "0.12.9"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`