|
| 1 | +import math |
| 2 | +import warnings |
| 3 | +from typing import List |
| 4 | + |
| 5 | + |
def compute_lengths_from_fractions(fractions: List[float], dataset_length: int) -> List[int]:
    """Compute lengths of dataset splits from given fractions.

    Given a list of fractions that sum up to 1, compute the length of each
    corresponding partition of a dataset with a specified length. Each length
    starts as `floor(frac * dataset_length)`. Any items left over because of
    the flooring are handed out one at a time in round-robin order, starting
    with the first partition.

    Args:
        fractions (List[float]): A list of fractions, each in [0, 1]. Their sum
            must be close to 1 (as judged by `math.isclose`) and must not
            exceed 1.
        dataset_length (int): The length of the dataset. Must be non-negative.

    Returns:
        List[int]: A list of lengths, one per fraction, summing to
            `dataset_length`.

    Raises:
        ValueError: If the fractions do not sum to 1 (or sum to more than 1).
        ValueError: If `dataset_length` is negative.
        ValueError: If any fraction is outside the range [0, 1].
        ValueError: If the sum of computed lengths does not equal the dataset length.

    Warns:
        UserWarning: For every partition whose computed length is 0.

    Examples:
        >>> from spotpython.utils.split import compute_lengths_from_fractions
        >>> dataset_length = 5
        >>> fractions = [0.2, 0.3, 0.5]
        >>> compute_lengths_from_fractions(fractions, dataset_length)
        [2, 1, 2]

        The floored lengths are [1, 1, 2] (from 1.0, 1.5 and 2.5), which leaves
        one item unassigned. Round-robin distribution gives that item to the
        first partition, resulting in [2, 1, 2].

    """
    total_fraction = sum(fractions)
    # Sums slightly below 1 (float rounding) are accepted; sums above 1 are
    # rejected even when they are within the isclose tolerance.
    if not math.isclose(total_fraction, 1) or total_fraction > 1:
        raise ValueError("Fractions must sum up to 1.")

    # Flooring a negative product would yield negative split sizes that still
    # add up to dataset_length, so the final consistency check cannot catch it.
    if dataset_length < 0:
        raise ValueError(f"dataset_length must be non-negative, got {dataset_length}.")

    subset_lengths: List[int] = []
    for i, frac in enumerate(fractions):
        if frac < 0 or frac > 1:
            raise ValueError(f"Fraction at index {i} is not between 0 and 1")
        subset_lengths.append(int(math.floor(dataset_length * frac)))

    remainder = dataset_length - sum(subset_lengths)

    # Add 1 to the lengths in a round-robin fashion until the remainder is used up.
    # `fractions` cannot be empty here: an empty list sums to 0 and is rejected above.
    n_splits = len(subset_lengths)
    for i in range(remainder):
        subset_lengths[i % n_splits] += 1

    for i, length in enumerate(subset_lengths):
        if length == 0:
            warnings.warn(
                f"Length of split at index {i} is 0. This might result in an empty dataset.",
                stacklevel=2,
            )

    # Safety net: a negative remainder (floors exceeding the dataset length)
    # is not corrected by the loop above and must be reported.
    if sum(subset_lengths) != dataset_length:
        raise ValueError("Sum of computed lengths does not equal the input dataset length!")

    return subset_lengths
| 64 | + |
| 65 | + |
1 | 66 | def calculate_data_split(test_size, full_size, verbosity=0, stage=None) -> tuple: |
2 | 67 | """ |
3 | 68 | Calculates the split sizes for training, validation, and test datasets. |
@@ -52,7 +117,7 @@ def calculate_data_split(test_size, full_size, verbosity=0, stage=None) -> tuple |
52 | 117 | val_size = int(full_train_size * test_size / full_size) |
53 | 118 | train_size = full_train_size - val_size |
54 | 119 | # check if the sizes are correct, i.e., full_size = train_size + val_size + test_size |
55 | | - if full_train_size + test_size != full_size: |
| 120 | + if train_size + val_size + test_size != full_size: |
56 | 121 | raise ValueError(f"full_size ({full_size}) != full_train_size ({full_train_size}) + test_size ({test_size})") |
57 | 122 |
|
58 | 123 | if verbosity > 0: |
|
0 commit comments