0.24.29

bartzbeielstein · bartzbeielstein · commit 07912ee57f32 · 2025-02-13T11:39:37.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotpython"
-version = "0.24.28"
+version = "0.24.29"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotpython/utils/compare.py b/src/spotpython/utils/compare.py
@@ -2,7 +2,6 @@
 from typing import List
 
 import numpy as np
-import pandas as pd
 
 
 def selectNew(A: np.ndarray, X: np.ndarray, tolerance: float = 0) -> Tuple[np.ndarray, np.ndarray]:
@@ -59,7 +58,7 @@ def find_equal_in_lists(a: List[int], b: List[int]) -> List[int]:
     return equal
 
 
-def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> pd.DataFrame:
+def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> tuple:
     """
     Checks for exact identical columns and rows in the DataFrame.
 
@@ -73,34 +72,50 @@ def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> pd.DataFr
         verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
 
     Returns:
-        pd.DataFrame: The DataFrame with duplicates removed if specified.
+        tuple: A tuple containing the DataFrame with duplicates removed if specified,
+               a list of tuples indicating which columns are duplicates,
+               and a list of tuples indicating which rows are duplicates.
 
     Example:
         >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
-        >>> check_identical_columns_and_rows(df, "Example DataFrame", remove=False, verbosity=1)
-        Identical columns in Example DataFrame:
-        ['A', 'B']
+        >>> check_identical_columns_and_rows(df, remove=False, verbosity=1)
+        Identical columns in DataFrame:
+        ('A', 'B')
     """
     # Check for exact identical columns
-    col_mask = df.T.duplicated(keep="first")
-    if col_mask.any() and verbosity > 0:
-        print(list(df.columns[col_mask]))
+    identical_columns = []
+    for i in range(len(df.columns)):
+        for j in range(i + 1, len(df.columns)):
+            if df.iloc[:, i].equals(df.iloc[:, j]):
+                identical_columns.append((df.columns[i], df.columns[j]))
 
-    if remove:
-        df = df.loc[:, ~col_mask]
+    if identical_columns and verbosity > 0:
+        print("Identical columns in DataFrame:")
+        for col_pair in identical_columns:
+            print(col_pair)
+
+    if remove and identical_columns:
+        df = df.drop(columns=[col_pair[1] for col_pair in identical_columns])
 
     # Check for exact identical rows
-    row_mask = df.duplicated(keep="first")
-    if row_mask.any() and verbosity > 0:
-        print(list(df.index[row_mask]))
+    identical_rows = []
+    for i in range(len(df.index)):
+        for j in range(i + 1, len(df.index)):
+            if df.iloc[i, :].equals(df.iloc[j, :]):
+                identical_rows.append((df.index[i], df.index[j]))
 
-    if remove:
-        df = df.loc[~row_mask]
+    if identical_rows and verbosity > 0:
+        print("Identical rows in DataFrame:")
+        for row_pair in identical_rows:
+            print(row_pair)
+
+    if remove and identical_rows:
+        df = df.drop(index=[row_pair[1] for row_pair in identical_rows])
 
-    return df
+    return df, identical_columns, identical_rows
 
 
-def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbosity=1) -> pd.DataFrame:
+def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbosity=1) -> tuple:
     """
     Checks for identical columns and rows within a given tolerance.
 
@@ -111,13 +126,15 @@ def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbo
         verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
 
     Returns:
-        pd.DataFrame: The DataFrame with duplicates removed if specified.
+        tuple: A tuple containing the DataFrame with duplicates removed if specified,
+               a list of tuples indicating which columns are duplicates within the tolerance,
+               and a list of tuples indicating which rows are duplicates within the tolerance.
 
     Example:
         >>> df = pd.DataFrame({"A": [1, 1, 3], "B": [1, 1.01, 3], "C": [4, 5, 6]})
-        >>> check_identical_columns_and_rows_with_tol(df, "Example DataFrame", tolerance=0.05, remove=False, verbosity=1)
-        Identical columns within tolerance in Example DataFrame:
-        ('A', 'B')
+        >>> check_identical_columns_and_rows_with_tol(df, tolerance=0.05, remove=False, verbosity=1)
+        Identical columns within tolerance in DataFrame:
+        ('A', 'B')
     """
 
     # Function to compare rows/columns with tolerance
@@ -132,10 +149,11 @@ def is_identical_with_tolerance(series1, series2, tol):
                 identical_columns.append((df.columns[i], df.columns[j]))
 
     if identical_columns and verbosity > 0:
+        print("Identical columns within tolerance in DataFrame:")
         for col_pair in identical_columns:
             print(col_pair)
 
-    if remove:
+    if remove and identical_columns:
         df = df.drop(columns=[col_pair[1] for col_pair in identical_columns])
 
     # Check for identical rows within tolerance
@@ -146,10 +164,11 @@ def is_identical_with_tolerance(series1, series2, tol):
                 identical_rows.append((df.index[i], df.index[j]))
 
     if identical_rows and verbosity > 0:
+        print("Identical rows within tolerance in DataFrame:")
         for row_pair in identical_rows:
             print(row_pair)
 
-    if remove:
+    if remove and identical_rows:
         df = df.drop(index=[row_pair[1] for row_pair in identical_rows])
 
-    return df
+    return df, identical_columns, identical_rows
diff --git a/test/test_check_identical_columns_and_rows.py b/test/test_check_identical_columns_and_rows.py
@@ -2,52 +2,17 @@
 import pytest
 from spotpython.utils.compare import check_identical_columns_and_rows, check_identical_columns_and_rows_with_tol
 
-def test_check_exact_identical_columns_and_rows():
-    # Test DataFrames
-    df1 = pd.DataFrame({
-        "A": [1, 2, 3],
-        "B": [1, 2, 3],
-        "C": [4, 5, 6]
-    })
-    
-    df2 = pd.DataFrame({
-        "X": [7, 8, 9],
-        "Y": [10, 11, 12]
-    })
-
-    # Exact duplicates - should identify and remove B
-    result_df = check_identical_columns_and_rows(df1, remove=True)
-    assert list(result_df.columns) == ["A", "C"], "Failed to remove duplicate columns accurately"
-
-    # No duplicates - should not remove any columns
-    result_df = check_identical_columns_and_rows(df2, remove=True)
-    assert list(result_df.columns) == ["X", "Y"], "Incorrectly removed columns when there were none to remove"
-
-def test_check_identical_columns_and_rows_with_tol():
-    # Test DataFrame
-    df1 = pd.DataFrame({
-        "A": [1.00, 2.01, 3.00],
-        "B": [1.01, 2.00, 3.01],
-        "C": [4.00, 5.00, 6.00]
-    })
-
-    # Within-tolerance duplicates - should identify and remove B
-    result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=True)
-    assert list(result_df.columns) == ["A", "C"], "Failed to remove near-duplicate columns accurately"
-
-    # No near duplicates within a small tolerance
-    result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.001, remove=True)
-    assert list(result_df.columns) == ["A", "B", "C"], "Incorrectly removed columns when they are not near duplicates"
-
 def test_check_exact_identical_columns_and_rows_remove_true():
     df1 = pd.DataFrame({
         "A": [1, 2, 3],
         "B": [1, 2, 3],
         "C": [4, 5, 6]
     })
     
-    result_df = check_identical_columns_and_rows(df1, remove=True)
+    result_df, identical_cols, identical_rows = check_identical_columns_and_rows(df1, remove=True)
     assert list(result_df.columns) == ["A", "C"], "Failed to remove duplicate columns accurately"
+    assert identical_cols == [("A", "B")], "Failed to identify exact duplicate columns"
+    assert identical_rows == [], "Incorrectly identified duplicate rows where none exist"
 
 def test_check_exact_identical_columns_and_rows_remove_false():
     df1 = pd.DataFrame({
@@ -56,9 +21,10 @@ def test_check_exact_identical_columns_and_rows_remove_false():
         "C": [4, 5, 6]
     })
     
-    # Check without removing duplicates
-    result_df = check_identical_columns_and_rows(df1, remove=False)
+    result_df, identical_cols, identical_rows = check_identical_columns_and_rows(df1, remove=False)
     assert list(result_df.columns) == ["A", "B", "C"], "Incorrectly identified or removed columns when remove=False"
+    assert identical_cols == [("A", "B")], "Failed to identify exact duplicate columns"
+    assert identical_rows == [], "Incorrectly found duplicate rows"
 
 def test_check_identical_columns_and_rows_with_tol_remove_true():
     df1 = pd.DataFrame({
@@ -67,8 +33,10 @@ def test_check_identical_columns_and_rows_with_tol_remove_true():
         "C": [4.00, 5.00, 6.00]
     })
 
-    result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=True)
+    result_df, identical_cols, identical_rows = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=True)
     assert list(result_df.columns) == ["A", "C"], "Failed to remove near-duplicate columns accurately with tolerance"
+    assert identical_cols == [("A", "B")], "Failed to identify near-duplicate columns within tolerance"
+    assert identical_rows == [], "Incorrectly found duplicate rows"
 
 def test_check_identical_columns_and_rows_with_tol_remove_false():
     df1 = pd.DataFrame({
@@ -77,22 +45,26 @@ def test_check_identical_columns_and_rows_with_tol_remove_false():
         "C": [4.00, 5.00, 6.00]
     })
 
-    # Check without removing duplicates
-    result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=False)
+    result_df, identical_cols, identical_rows = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=False)
     assert list(result_df.columns) == ["A", "B", "C"], "Incorrectly identified or removed columns when remove=False"
+    assert identical_cols == [("A", "B")], "Failed to identify near-duplicate columns within tolerance"
+    assert identical_rows == [], "Incorrectly found duplicate rows"
 
 def test_with_no_duplicates():
     df = pd.DataFrame({
         "X": [1, 2, 3],
         "Y": [4, 5, 6],
         "Z": [7, 8, 9]
     })
-    result_df = check_identical_columns_and_rows(df, remove=True)
+    result_df, identical_cols, identical_rows = check_identical_columns_and_rows(df, remove=True)
     assert list(result_df.columns) == ["X", "Y", "Z"], "Incorrectly removed columns in a no-duplicates scenario"
+    assert identical_cols == [], "Incorrectly found duplicate columns where none exist"
+    assert identical_rows == [], "Incorrectly found duplicate rows where none exist"
 
-    result_df_with_tol = check_identical_columns_and_rows_with_tol(df, tolerance=0.1, remove=True)
+    result_df_with_tol, identical_cols_with_tol, identical_rows_with_tol = check_identical_columns_and_rows_with_tol(df, tolerance=0.1, remove=True)
     assert list(result_df_with_tol.columns) == ["X", "Y", "Z"], "Incorrectly removed columns in a no-duplicates scenario with tolerance"
-
+    assert identical_cols_with_tol == [], "Incorrectly found duplicate columns where none exist"
+    assert identical_rows_with_tol == [], "Incorrectly found duplicate rows where none exist"
 
 if __name__ == "__main__":
-    pytest.main()
+    pytest.main()

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotpython"`
`10`		`-version = "0.24.28"`
	`10`	`+version = "0.24.29"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`