Skip to content

Commit c3e7104

Browse files
0.24.27
check id cols and rows
1 parent 6d53ad9 commit c3e7104

4 files changed

Lines changed: 203 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.24.26"
10+
version = "0.24.27"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/utils/compare.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import List
33

44
import numpy as np
5+
import pandas as pd
56

67

78
def selectNew(A: np.ndarray, X: np.ndarray, tolerance: float = 0) -> Tuple[np.ndarray, np.ndarray]:
@@ -56,3 +57,105 @@ def find_equal_in_lists(a: List[int], b: List[int]) -> List[int]:
5657
"""
5758
equal = [1 if a[i] == b[i] else 0 for i in range(len(a))]
5859
return equal
60+
61+
62+
def check_identical_columns_and_rows(df, name, remove=False, verbosity=1) -> pd.DataFrame:
    """
    Report, and optionally drop, columns and rows that are exact duplicates.

    Columns are examined first. When `remove` is True the duplicated columns
    are dropped before the rows are examined, so the row comparison runs on
    the reduced frame. Within each group of duplicates the first occurrence
    is kept; only the later occurrences are reported or dropped.

    Note:
        This is an efficient method for checking exact duplicates in a DataFrame.
        If checks with tolerance are needed, use `check_identical_columns_and_rows_with_tol()`.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        name (str): Name of the DataFrame, used in the printed messages.
        remove (bool): Whether to remove duplicate columns/rows.
        verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.

    Returns:
        pd.DataFrame: `df` itself when `remove` is False, otherwise a selection
            of `df` without the duplicated columns and rows.

    Example:
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
        >>> out = check_identical_columns_and_rows(df, "Example DataFrame", remove=True, verbosity=1)
        <BLANKLINE>
        Exact identical columns in Example DataFrame:
        ['B']
        >>> list(out.columns)
        ['A', 'C']
    """

    def _report(header, labels):
        # Two-line message: a header followed by the affected labels.
        print(header)
        print(list(labels))

    # Transposing lets DataFrame.duplicated compare whole columns.
    repeated_columns = df.T.duplicated(keep="first")
    if repeated_columns.any() and verbosity > 0:
        _report(f"\nExact identical columns in {name}:", df.columns[repeated_columns])
    if remove:
        df = df.loc[:, ~repeated_columns]

    # Rows are compared on the (possibly column-reduced) frame.
    repeated_rows = df.duplicated(keep="first")
    if repeated_rows.any() and verbosity > 0:
        _report(f"\nExact identical rows in {name}:", df.index[repeated_rows])
    if remove:
        df = df.loc[~repeated_rows]

    return df
104+
105+
106+
def _pairs_within_tolerance(vectors, tol):
    """Return positional pairs (i, j), i < j, of vectors that agree element-wise within `tol`."""
    pairs = []
    for i, reference in enumerate(vectors):
        for j in range(i + 1, len(vectors)):
            # rtol=0.0 makes `tol` a purely absolute bound; np.allclose
            # defaults to rtol=1e-05, which widens the bound for large values.
            if np.allclose(reference, vectors[j], rtol=0.0, atol=tol):
                pairs.append((i, j))
    return pairs


def _positions_to_drop(pairs):
    """Pick positions to drop so that every dropped item matches an item that is retained."""
    dropped = set()
    # Pairs are ordered by their first position, so whether `first` is
    # dropped is already final when its pairs are processed.
    for first, second in pairs:
        if first not in dropped:
            dropped.add(second)
    return dropped


def check_identical_columns_and_rows_with_tol(
    df: pd.DataFrame,
    name: str,
    tolerance: float,
    remove: bool = False,
    verbosity: int = 1,
) -> pd.DataFrame:
    """
    Checks for identical columns and rows within a given absolute tolerance.

    Two columns (or rows) count as identical when every pair of corresponding
    values differs by at most `tolerance`. Columns are examined first; when
    `remove` is True the near-duplicate columns are dropped before the rows
    are examined.

    All matching pairs are reported. On removal, the earlier member of a
    match is kept and the later one is dropped, but only if the earlier
    member itself is retained. With A close to B and B close to C, but A not
    close to C, only B is dropped, so every dropped column/row is within
    tolerance of one that remains.

    Args:
        df (pd.DataFrame): The DataFrame to check. The comparison uses
            `np.allclose`, so the data is expected to be numeric.
        name (str): Name of the DataFrame, used in the printed messages.
        tolerance (float): Maximum absolute difference for two values to be
            treated as equal.
        remove (bool): Whether to remove duplicates found within the tolerance.
        verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.

    Returns:
        pd.DataFrame: `df` itself when `remove` is False, otherwise a
            positional selection of `df` without the near-duplicates.

    Example:
        >>> df = pd.DataFrame({"A": [1, 1, 3], "B": [1, 1.01, 3], "C": [4, 5, 6]})
        >>> out = check_identical_columns_and_rows_with_tol(df, "Example DataFrame", tolerance=0.05, remove=False, verbosity=1)
        <BLANKLINE>
        Identical columns within tolerance in Example DataFrame:
        ('A', 'B')
    """
    # Extract each column once instead of once per comparison.
    n_cols = df.shape[1]
    column_vectors = [df.iloc[:, k].to_numpy() for k in range(n_cols)]
    column_pairs = _pairs_within_tolerance(column_vectors, tolerance)

    if column_pairs and verbosity > 0:
        print(f"\nIdentical columns within tolerance in {name}:")
        for i, j in column_pairs:
            print((df.columns[i], df.columns[j]))

    if remove:
        # Positional selection stays correct even when column labels repeat.
        rejected = _positions_to_drop(column_pairs)
        df = df.iloc[:, [k for k in range(n_cols) if k not in rejected]]

    # Rows are compared on the (possibly column-reduced) frame.
    n_rows = df.shape[0]
    row_vectors = [df.iloc[k, :].to_numpy() for k in range(n_rows)]
    row_pairs = _pairs_within_tolerance(row_vectors, tolerance)

    if row_pairs and verbosity > 0:
        print(f"\nIdentical rows within tolerance in {name}:")
        for i, j in row_pairs:
            print((df.index[i], df.index[j]))

    if remove:
        rejected = _positions_to_drop(row_pairs)
        df = df.iloc[[k for k in range(n_rows) if k not in rejected]]

    return df
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import pandas as pd
2+
import pytest
3+
from spotpython.utils.compare import check_identical_columns_and_rows, check_identical_columns_and_rows_with_tol
4+
5+
def test_check_exact_identical_columns_and_rows():
    """Exact duplicate columns are dropped; a frame without duplicates keeps all columns."""
    with_duplicate = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
    without_duplicate = pd.DataFrame({"X": [7, 8, 9], "Y": [10, 11, 12]})

    # "B" repeats "A" exactly, so only "A" and "C" should survive.
    deduplicated = check_identical_columns_and_rows(with_duplicate, "Test DataFrame 1", remove=True)
    assert deduplicated.columns.tolist() == ["A", "C"], "Failed to remove duplicate columns accurately"

    # Nothing repeats here, so the column set must be untouched.
    untouched = check_identical_columns_and_rows(without_duplicate, "Test DataFrame 2", remove=True)
    assert untouched.columns.tolist() == ["X", "Y"], "Incorrectly removed columns when there were none to remove"
25+
26+
def test_check_identical_columns_and_rows_with_tol():
    """Near-duplicate columns are dropped at a loose tolerance and kept at a tight one."""
    frame = pd.DataFrame(
        {
            "A": [1.00, 2.01, 3.00],
            "B": [1.01, 2.00, 3.01],
            "C": [4.00, 5.00, 6.00],
        }
    )

    # "A" and "B" differ by 0.01 everywhere, which is inside a 0.05 tolerance.
    loose = check_identical_columns_and_rows_with_tol(frame, "Test DataFrame 1", tolerance=0.05, remove=True)
    assert loose.columns.tolist() == ["A", "C"], "Failed to remove near-duplicate columns accurately"

    # The same 0.01 gap is outside a 0.001 tolerance, so nothing is removed.
    tight = check_identical_columns_and_rows_with_tol(frame, "Test DataFrame 1", tolerance=0.001, remove=True)
    assert tight.columns.tolist() == ["A", "B", "C"], "Incorrectly removed columns when they are not near duplicates"
41+
42+
def test_check_exact_identical_columns_and_rows_remove_true():
    """With remove=True the exact duplicate column "B" is dropped."""
    frame = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})

    remaining = check_identical_columns_and_rows(frame, "Test DataFrame 1", remove=True).columns.tolist()

    assert remaining == ["A", "C"], "Failed to remove duplicate columns accurately"
51+
52+
def test_check_exact_identical_columns_and_rows_remove_false():
    """With remove=False duplicates are only reported; all columns are returned."""
    frame = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})

    # "B" duplicates "A", but nothing may be dropped in report-only mode.
    remaining = check_identical_columns_and_rows(frame, "Test DataFrame 1", remove=False).columns.tolist()

    assert remaining == ["A", "B", "C"], "Incorrectly identified or removed columns when remove=False"
62+
63+
def test_check_identical_columns_and_rows_with_tol_remove_true():
    """With remove=True the near-duplicate column "B" is dropped at tolerance 0.05."""
    frame = pd.DataFrame(
        {
            "A": [1.00, 2.01, 3.00],
            "B": [1.01, 2.00, 3.01],
            "C": [4.00, 5.00, 6.00],
        }
    )

    reduced = check_identical_columns_and_rows_with_tol(frame, "Test DataFrame 1", tolerance=0.05, remove=True)
    remaining = reduced.columns.tolist()

    assert remaining == ["A", "C"], "Failed to remove near-duplicate columns accurately with tolerance"
72+
73+
def test_check_identical_columns_and_rows_with_tol_remove_false():
    """With remove=False near-duplicates are only reported; all columns are returned."""
    frame = pd.DataFrame(
        {
            "A": [1.00, 2.01, 3.00],
            "B": [1.01, 2.00, 3.01],
            "C": [4.00, 5.00, 6.00],
        }
    )

    # "A" and "B" are within tolerance, but nothing may be dropped in report-only mode.
    reported = check_identical_columns_and_rows_with_tol(frame, "Test DataFrame 1", tolerance=0.05, remove=False)
    remaining = reported.columns.tolist()

    assert remaining == ["A", "B", "C"], "Incorrectly identified or removed columns when remove=False"
83+
84+
def test_with_no_duplicates():
    """Neither the exact nor the tolerance-based check removes columns from a duplicate-free frame."""
    frame = pd.DataFrame({"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]})
    expected = ["X", "Y", "Z"]

    exact = check_identical_columns_and_rows(frame, "Test DataFrame with No Duplicates", remove=True)
    assert exact.columns.tolist() == expected, "Incorrectly removed columns in a no-duplicates scenario"

    approximate = check_identical_columns_and_rows_with_tol(
        frame,
        "Test DataFrame with No Duplicates",
        tolerance=0.1,
        remove=True,
    )
    assert approximate.columns.tolist() == expected, "Incorrectly removed columns in a no-duplicates scenario with tolerance"
95+
96+
97+
if __name__ == "__main__":
    # Run pytest when this module is executed directly as a script.
    pytest.main()

test/test_save_experiment.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def test_save_experiment(tmp_path, capsys):
1212

1313
# Initialize function control
1414
fun_control = fun_control_init(
15+
save_experiment=True,
1516
PREFIX=PREFIX,
1617
lower=np.array([-1, -1]),
1718
upper=np.array([1, 1])

0 commit comments

Comments
 (0)