0.18.6

bartzbeielstein · bartzbeielstein · commit 1c3b712c49ab · 2024-11-22T14:56:09.000+01:00
diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt
@@ -1,7 +1,10 @@
-spotpython 0.18.5:
+spotpython 0.18.6:
 
 - split.py:
-    computation fixed
+    New function: compute_lengths_from_fractions()
+
+- lightdatamodule.py:
+    train, val, test set computation updated
 
 spotpython 0.18.4:
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.18.5"
+version = "0.18.6"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotpython/data/lightdatamodule.py b/src/spotpython/data/lightdatamodule.py
@@ -2,7 +2,7 @@
 import torch
 from torch.utils.data import DataLoader, random_split, TensorDataset
 from typing import Optional
-from spotpython.utils.split import calculate_data_split
+from math import floor
 
 
 class LightDataModule(L.LightningDataModule):
@@ -166,12 +166,19 @@ def setup(self, stage: Optional[str] = None) -> None:
                 Training set size: 3
 
         """
-        full_train_size, val_size, train_size, test_size = calculate_data_split(
-            test_size=self.test_size,
-            full_size=len(self.data_full),
-            verbosity=self.verbosity,
-            stage=stage,
-        )
+        full_size = len(self.data_full)
+        test_size = self.test_size
+
+        # consider the case when test_size is a float
+        if isinstance(self.test_size, float):
+            full_train_size = 1.0 - self.test_size
+            val_size = full_train_size * self.test_size
+            train_size = full_train_size - val_size
+        else:
+            # test_size is an int, training size calculation directly based on it
+            full_train_size = full_size - self.test_size
+            val_size = floor(full_train_size * self.test_size / full_size)
+            train_size = full_size - val_size - test_size
 
         # Assign train/val datasets for use in dataloaders
         if stage == "fit" or stage is None:
@@ -188,7 +195,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             if self.verbosity > 0:
                 print(f"test_size: {test_size} used for test dataset.")
             generator_test = torch.Generator().manual_seed(self.test_seed)
-            self.data_test, _ = random_split(self.data_full, [test_size, full_train_size], generator=generator_test)
+            self.data_test, _, _ = random_split(self.data_full, [test_size, train_size, val_size], generator=generator_test)
             if self.scaler is not None:
                 # Transform the test data
                 self.data_test = self.transform_dataset(self.data_test)
@@ -198,7 +205,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             if self.verbosity > 0:
                 print(f"test_size: {test_size} used for predict dataset.")
             generator_predict = torch.Generator().manual_seed(self.test_seed)
-            self.data_predict, _ = random_split(self.data_full, [test_size, full_train_size], generator=generator_predict)
+            self.data_predict, _, _ = random_split(self.data_full, [test_size, train_size, val_size], generator=generator_predict)
             if self.scaler is not None:
                 # Transform the predict data
                 self.data_predict = self.transform_dataset(self.data_predict)
diff --git a/src/spotpython/utils/split.py b/src/spotpython/utils/split.py
@@ -1,3 +1,68 @@
+import math
+import warnings
+from typing import List
+
+
+def compute_lengths_from_fractions(fractions: List[float], dataset_length: int) -> List[int]:
+    """Compute lengths of dataset splits from given fractions.
+
+    Given a list of fractions that sum up to 1, compute the lengths of each
+    corresponding partition of a dataset with a specified length. Each length is
+    determined as `floor(frac * dataset_length)`. Any remaining items (due to flooring)
+    are distributed among the partitions in a round-robin fashion.
+
+    Args:
+        fractions (List[float]): A list of fractions that should sum to 1.
+        dataset_length (int): The length of the dataset.
+
+    Returns:
+        List[int]: A list of lengths corresponding to each fraction.
+
+    Raises:
+        ValueError: If the fractions do not sum to 1 (a sum even marginally above 1 is rejected).
+        ValueError: If any fraction is outside the range [0, 1].
+        ValueError: If the sum of computed lengths does not equal the dataset length.
+
+    Examples:
+        >>> from spotpython.utils.split import compute_lengths_from_fractions
+        >>> dataset_length = 5
+        >>> fractions = [0.2, 0.3, 0.5]
+        >>> compute_lengths_from_fractions(fractions, dataset_length)
+        [2, 1, 2]
+
+        In this example, 'dataset_length' is 5 and the 'fractions' specify the
+        desired size distribution. Flooring gives [1, 1, 2]; the one remaining
+        item goes to the first split (round-robin), so the result is [2, 1, 2].
+
+    """
+    if not math.isclose(sum(fractions), 1) or sum(fractions) > 1:
+        raise ValueError("Fractions must sum up to 1.")
+
+    subset_lengths: List[int] = []
+    for i, frac in enumerate(fractions):
+        if frac < 0 or frac > 1:
+            raise ValueError(f"Fraction at index {i} is not between 0 and 1")
+        n_items_in_split = int(math.floor(dataset_length * frac))
+        subset_lengths.append(n_items_in_split)
+
+    remainder = dataset_length - sum(subset_lengths)
+
+    # Add 1 to all the lengths in a round-robin fashion until the remainder is 0
+    for i in range(remainder):
+        idx_to_add_at = i % len(subset_lengths)
+        subset_lengths[idx_to_add_at] += 1
+
+    lengths = subset_lengths
+    for i, length in enumerate(lengths):
+        if length == 0:
+            warnings.warn(f"Length of split at index {i} is 0. " f"This might result in an empty dataset.")
+
+    if sum(lengths) != dataset_length:
+        raise ValueError("Sum of computed lengths does not equal the input dataset length!")
+
+    return lengths
+
+
 def calculate_data_split(test_size, full_size, verbosity=0, stage=None) -> tuple:
     """
     Calculates the split sizes for training, validation, and test datasets.
@@ -52,7 +117,7 @@ def calculate_data_split(test_size, full_size, verbosity=0, stage=None) -> tuple
         val_size = int(full_train_size * test_size / full_size)
         train_size = full_train_size - val_size
         # check if the sizes are correct, i.e., full_size = train_size + val_size + test_size
-        if full_train_size + test_size != full_size:
+        if train_size + val_size + test_size != full_size:
             raise ValueError(f"full_size ({full_size}) != full_train_size ({full_train_size}) + test_size ({test_size})")
 
     if verbosity > 0:

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.18.5"`
	`10`	`+version = "0.18.6"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`