0.24.27

bartzbeielstein · bartzbeielstein · commit c3e7104ea7bc · 2025-02-13T11:07:57.000+01:00
check id cols and rows
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.24.26"
+version = "0.24.27"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotpython/utils/compare.py b/src/spotpython/utils/compare.py
@@ -2,6 +2,7 @@
 from typing import List
 
 import numpy as np
+import pandas as pd
 
 
 def selectNew(A: np.ndarray, X: np.ndarray, tolerance: float = 0) -> Tuple[np.ndarray, np.ndarray]:
@@ -56,3 +57,105 @@ def find_equal_in_lists(a: List[int], b: List[int]) -> List[int]:
     """
     equal = [1 if a[i] == b[i] else 0 for i in range(len(a))]
     return equal
+
+
+def check_identical_columns_and_rows(df: pd.DataFrame, name: str, remove: bool = False, verbosity: int = 1) -> pd.DataFrame:
+    """
+    Checks for exact identical columns and rows in the DataFrame.
+
+    Note:
+        This is an efficient method for checking exact duplicates in a DataFrame.
+        If checks with tolerance are needed, use `check_identical_columns_and_rows_with_tol()`.
+        Only the repeats are reported; the first occurrence is never listed or removed.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to check.
+        name (str): Name of the DataFrame for reporting.
+        remove (bool): Whether to remove duplicate columns/rows.
+        verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
+
+    Returns:
+        pd.DataFrame: The DataFrame with duplicates removed if specified.
+
+    Example:
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
+        >>> _ = check_identical_columns_and_rows(df, "Example DataFrame", remove=False, verbosity=1)
+        <BLANKLINE>
+        Exact identical columns in Example DataFrame:
+        ['B']
+    """
+    # keep="first" flags only the repeats, so the first occurrence always survives
+    col_mask = df.T.duplicated(keep="first")
+    if verbosity > 0 and col_mask.any():
+        print(f"\nExact identical columns in {name}:")
+        print(list(df.columns[col_mask]))
+    if remove:
+        df = df.loc[:, ~col_mask]
+
+    # Rows are compared on whatever columns remain at this point
+    row_mask = df.duplicated(keep="first")
+    if verbosity > 0 and row_mask.any():
+        print(f"\nExact identical rows in {name}:")
+        print(list(df.index[row_mask]))
+    if remove:
+        df = df.loc[~row_mask]
+
+    return df
+
+
+def check_identical_columns_and_rows_with_tol(df: pd.DataFrame, name: str, tolerance: float, remove: bool = False, verbosity: int = 1) -> pd.DataFrame:
+    """
+    Checks for identical columns and rows within a given tolerance.
+
+    Two columns (or rows) count as identical when all corresponding entries differ
+    by at most `tolerance` in absolute value; no relative tolerance is applied.
+    Duplicates are removed by position, so repeated labels keep their first occurrence.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to check; entries are compared with `np.allclose`.
+        name (str): Name of the DataFrame for reporting.
+        tolerance (float): The absolute tolerance for checking equivalence.
+        remove (bool): Whether to remove duplicates found within the tolerance.
+        verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
+
+    Returns:
+        pd.DataFrame: The DataFrame with duplicates removed if specified.
+
+    Example:
+        >>> df = pd.DataFrame({"A": [1, 1, 3], "B": [1, 1.01, 3], "C": [4, 5, 6]})
+        >>> _ = check_identical_columns_and_rows_with_tol(df, "Example DataFrame", tolerance=0.05, remove=False, verbosity=1)
+        <BLANKLINE>
+        Identical columns within tolerance in Example DataFrame:
+        ('A', 'B')
+    """
+    def close_pairs(vectors):
+        # Positions (i, j), i < j, of vectors that agree; rtol=0.0 keeps the tolerance purely absolute
+        n = len(vectors)
+        return [(i, j) for i in range(n) for j in range(i + 1, n) if np.allclose(vectors[i], vectors[j], rtol=0.0, atol=tolerance)]
+
+    def keep_mask(size, pairs):
+        # Positional mask that drops the later member of every matching pair
+        mask = np.ones(size, dtype=bool)
+        for _, later in pairs:
+            mask[later] = False
+        return mask
+
+    # Check for identical columns within tolerance (arrays are materialised once, not per comparison)
+    col_pairs = close_pairs([df.iloc[:, k].to_numpy() for k in range(df.shape[1])])
+    if col_pairs and verbosity > 0:
+        print(f"\nIdentical columns within tolerance in {name}:")
+        for i, j in col_pairs:
+            print((df.columns[i], df.columns[j]))
+    if remove:
+        df = df.iloc[:, keep_mask(df.shape[1], col_pairs)]
+
+    # Check for identical rows within tolerance, on whatever columns remain at this point
+    row_pairs = close_pairs(df.to_numpy())
+    if row_pairs and verbosity > 0:
+        print(f"\nIdentical rows within tolerance in {name}:")
+        for i, j in row_pairs:
+            print((df.index[i], df.index[j]))
+    if remove:
+        df = df.iloc[keep_mask(df.shape[0], row_pairs)]
+
+    return df
diff --git a/test/test_check_identical_columns_and_rows.py b/test/test_check_identical_columns_and_rows.py
@@ -0,0 +1,98 @@
+import pandas as pd
+import pytest
+from spotpython.utils.compare import check_identical_columns_and_rows, check_identical_columns_and_rows_with_tol
+
+def test_check_exact_identical_columns_and_rows():
+    """Exact check drops the repeated column and keeps distinct ones."""
+    # Frame in which column B repeats column A
+    with_dupes = pd.DataFrame(
+        {"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]}
+    )
+    # Frame whose columns are all distinct
+    without_dupes = pd.DataFrame(
+        {"X": [7, 8, 9], "Y": [10, 11, 12]}
+    )
+
+    # B duplicates A, so only A and C may survive
+    cleaned = check_identical_columns_and_rows(with_dupes, "Test DataFrame 1", remove=True)
+    remaining = list(cleaned.columns)
+    assert remaining == ["A", "C"], "Failed to remove duplicate columns accurately"
+
+    # Nothing is duplicated, so every column must survive
+    cleaned = check_identical_columns_and_rows(without_dupes, "Test DataFrame 2", remove=True)
+    remaining = list(cleaned.columns)
+    assert remaining == ["X", "Y"], "Incorrectly removed columns when there were none to remove"
+
+def test_check_identical_columns_and_rows_with_tol():
+    """Tolerance check drops B only when the tolerance covers the A/B gap."""
+    # A and B differ by about 0.01 in every row
+    frame = pd.DataFrame(
+        {"A": [1.00, 2.01, 3.00], "B": [1.01, 2.00, 3.01], "C": [4.00, 5.00, 6.00]}
+    )
+    label = "Test DataFrame 1"
+
+    # 0.05 exceeds the gap: B counts as a duplicate of A
+    loose = check_identical_columns_and_rows_with_tol(frame, label, tolerance=0.05, remove=True)
+    assert list(loose.columns) == ["A", "C"], "Failed to remove near-duplicate columns accurately"
+
+    # 0.001 is below the gap: all columns stay
+    strict = check_identical_columns_and_rows_with_tol(frame, label, tolerance=0.001, remove=True)
+    assert list(strict.columns) == ["A", "B", "C"], "Incorrectly removed columns when they are not near duplicates"
+
+def test_check_exact_identical_columns_and_rows_remove_true():
+    """With remove=True the exact check drops the repeated column B."""
+    columns = {"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]}
+    frame = pd.DataFrame(columns)
+
+    # B is an exact copy of A
+    cleaned = check_identical_columns_and_rows(frame, "Test DataFrame 1", remove=True)
+    kept = list(cleaned.columns)
+    assert kept == ["A", "C"], "Failed to remove duplicate columns accurately"
+
+def test_check_exact_identical_columns_and_rows_remove_false():
+    """With remove=False the exact check leaves every column in place."""
+    columns = {"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]}
+    frame = pd.DataFrame(columns)
+
+    # B repeats A, but nothing may be dropped in report-only mode
+    checked = check_identical_columns_and_rows(frame, "Test DataFrame 1", remove=False)
+    kept = list(checked.columns)
+    expected = ["A", "B", "C"]
+    assert kept == expected, "Incorrectly identified or removed columns when remove=False"
+
+def test_check_identical_columns_and_rows_with_tol_remove_true():
+    """With remove=True the tolerance check drops near-duplicate column B."""
+    columns = {"A": [1.00, 2.01, 3.00], "B": [1.01, 2.00, 3.01], "C": [4.00, 5.00, 6.00]}
+    frame = pd.DataFrame(columns)
+
+    # A and B differ by about 0.01 per row, inside the 0.05 tolerance
+    cleaned = check_identical_columns_and_rows_with_tol(frame, "Test DataFrame 1", tolerance=0.05, remove=True)
+    kept = list(cleaned.columns)
+    assert kept == ["A", "C"], "Failed to remove near-duplicate columns accurately with tolerance"
+
+def test_check_identical_columns_and_rows_with_tol_remove_false():
+    """With remove=False the tolerance check leaves every column in place."""
+    columns = {"A": [1.00, 2.01, 3.00], "B": [1.01, 2.00, 3.01], "C": [4.00, 5.00, 6.00]}
+    frame = pd.DataFrame(columns)
+
+    # B is within tolerance of A, but nothing may be dropped in report-only mode
+    checked = check_identical_columns_and_rows_with_tol(frame, "Test DataFrame 1", tolerance=0.05, remove=False)
+    kept = list(checked.columns)
+    expected = ["A", "B", "C"]
+    assert kept == expected, "Incorrectly identified or removed columns when remove=False"
+
+def test_with_no_duplicates():
+    """Neither check removes columns from a frame without duplicates."""
+    frame = pd.DataFrame({"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]})
+    label = "Test DataFrame with No Duplicates"
+    expected = ["X", "Y", "Z"]
+
+    exact = check_identical_columns_and_rows(frame, label, remove=True)
+    assert list(exact.columns) == expected, "Incorrectly removed columns in a no-duplicates scenario"
+
+    approx = check_identical_columns_and_rows_with_tol(frame, label, tolerance=0.1, remove=True)
+    assert list(approx.columns) == expected, "Incorrectly removed columns in a no-duplicates scenario with tolerance"
+
+
+if __name__ == "__main__":  # true only when this file is executed as a script
+    pytest.main()  # invoke pytest with no explicit arguments
diff --git a/test/test_save_experiment.py b/test/test_save_experiment.py
@@ -12,6 +12,7 @@ def test_save_experiment(tmp_path, capsys):
     
     # Initialize function control
     fun_control = fun_control_init(
+        save_experiment=True,
         PREFIX=PREFIX,
         lower=np.array([-1, -1]),
         upper=np.array([1, 1])

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.24.26"`
	`10`	`+version = "0.24.27"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`