Skip to content

Commit 07912ee

Browse files
0.24.29
1 parent 1f753ee commit 07912ee

3 files changed

Lines changed: 64 additions & 73 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotpython"
10-
version = "0.24.28"
10+
version = "0.24.29"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotpython/utils/compare.py

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from typing import List
33

44
import numpy as np
5-
import pandas as pd
65

76

87
def selectNew(A: np.ndarray, X: np.ndarray, tolerance: float = 0) -> Tuple[np.ndarray, np.ndarray]:
@@ -59,7 +58,7 @@ def find_equal_in_lists(a: List[int], b: List[int]) -> List[int]:
5958
return equal
6059

6160

62-
def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> pd.DataFrame:
61+
def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> tuple:
6362
"""
6463
Checks for exact identical columns and rows in the DataFrame.
6564
@@ -73,34 +72,50 @@ def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> pd.DataFr
7372
verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
7473
7574
Returns:
76-
pd.DataFrame: The DataFrame with duplicates removed if specified.
75+
tuple: A tuple containing the DataFrame with duplicates removed if specified,
76+
a list of tuples indicating which columns are duplicates,
77+
and a list of tuples indicating which rows are duplicates.
7778
7879
Example:
7980
>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
80-
>>> check_identical_columns_and_rows(df, "Example DataFrame", remove=False, verbosity=1)
81-
Identical columns in Example DataFrame:
82-
['A', 'B']
81+
>>> check_identical_columns_and_rows(df, remove=False, verbosity=1)
82+
Identical columns in DataFrame:
83+
('A', 'B')
8384
"""
8485
# Check for exact identical columns
85-
col_mask = df.T.duplicated(keep="first")
86-
if col_mask.any() and verbosity > 0:
87-
print(list(df.columns[col_mask]))
86+
identical_columns = []
87+
for i in range(len(df.columns)):
88+
for j in range(i + 1, len(df.columns)):
89+
if df.iloc[:, i].equals(df.iloc[:, j]):
90+
identical_columns.append((df.columns[i], df.columns[j]))
8891

89-
if remove:
90-
df = df.loc[:, ~col_mask]
92+
if identical_columns and verbosity > 0:
93+
print("Identical columns in DataFrame:")
94+
for col_pair in identical_columns:
95+
print(col_pair)
96+
97+
if remove and identical_columns:
98+
df = df.drop(columns=[col_pair[1] for col_pair in identical_columns])
9199

92100
# Check for exact identical rows
93-
row_mask = df.duplicated(keep="first")
94-
if row_mask.any() and verbosity > 0:
95-
print(list(df.index[row_mask]))
101+
identical_rows = []
102+
for i in range(len(df.index)):
103+
for j in range(i + 1, len(df.index)):
104+
if df.iloc[i, :].equals(df.iloc[j, :]):
105+
identical_rows.append((df.index[i], df.index[j]))
96106

97-
if remove:
98-
df = df.loc[~row_mask]
107+
if identical_rows and verbosity > 0:
108+
print("Identical rows in DataFrame:")
109+
for row_pair in identical_rows:
110+
print(row_pair)
111+
112+
if remove and identical_rows:
113+
df = df.drop(index=[row_pair[1] for row_pair in identical_rows])
99114

100-
return df
115+
return df, identical_columns, identical_rows
101116

102117

103-
def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbosity=1) -> pd.DataFrame:
118+
def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbosity=1) -> tuple:
104119
"""
105120
Checks for identical columns and rows within a given tolerance.
106121
@@ -111,13 +126,15 @@ def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbo
111126
verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
112127
113128
Returns:
114-
pd.DataFrame: The DataFrame with duplicates removed if specified.
129+
tuple: A tuple containing the DataFrame with duplicates removed if specified,
130+
a list of tuples indicating which columns are duplicates within the tolerance,
131+
and a list of tuples indicating which rows are duplicates within the tolerance.
115132
116133
Example:
117134
>>> df = pd.DataFrame({"A": [1, 1, 3], "B": [1, 1.01, 3], "C": [4, 5, 6]})
118-
>>> check_identical_columns_and_rows_with_tol(df, "Example DataFrame", tolerance=0.05, remove=False, verbosity=1)
119-
Identical columns within tolerance in Example DataFrame:
120-
('A', 'B')
135+
>>> check_identical_columns_and_rows_with_tol(df, tolerance=0.05, remove=False, verbosity=1)
136+
Identical columns within tolerance in DataFrame:
137+
('A', 'B')
121138
"""
122139

123140
# Function to compare rows/columns with tolerance
@@ -132,10 +149,11 @@ def is_identical_with_tolerance(series1, series2, tol):
132149
identical_columns.append((df.columns[i], df.columns[j]))
133150

134151
if identical_columns and verbosity > 0:
152+
print("Identical columns within tolerance in DataFrame:")
135153
for col_pair in identical_columns:
136154
print(col_pair)
137155

138-
if remove:
156+
if remove and identical_columns:
139157
df = df.drop(columns=[col_pair[1] for col_pair in identical_columns])
140158

141159
# Check for identical rows within tolerance
@@ -146,10 +164,11 @@ def is_identical_with_tolerance(series1, series2, tol):
146164
identical_rows.append((df.index[i], df.index[j]))
147165

148166
if identical_rows and verbosity > 0:
167+
print("Identical rows within tolerance in DataFrame:")
149168
for row_pair in identical_rows:
150169
print(row_pair)
151170

152-
if remove:
171+
if remove and identical_rows:
153172
df = df.drop(index=[row_pair[1] for row_pair in identical_rows])
154173

155-
return df
174+
return df, identical_columns, identical_rows

test/test_check_identical_columns_and_rows.py

Lines changed: 19 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2,52 +2,17 @@
22
import pytest
33
from spotpython.utils.compare import check_identical_columns_and_rows, check_identical_columns_and_rows_with_tol
44

5-
def test_check_exact_identical_columns_and_rows():
6-
# Test DataFrames
7-
df1 = pd.DataFrame({
8-
"A": [1, 2, 3],
9-
"B": [1, 2, 3],
10-
"C": [4, 5, 6]
11-
})
12-
13-
df2 = pd.DataFrame({
14-
"X": [7, 8, 9],
15-
"Y": [10, 11, 12]
16-
})
17-
18-
# Exact duplicates - should identify and remove B
19-
result_df = check_identical_columns_and_rows(df1, remove=True)
20-
assert list(result_df.columns) == ["A", "C"], "Failed to remove duplicate columns accurately"
21-
22-
# No duplicates - should not remove any columns
23-
result_df = check_identical_columns_and_rows(df2, remove=True)
24-
assert list(result_df.columns) == ["X", "Y"], "Incorrectly removed columns when there were none to remove"
25-
26-
def test_check_identical_columns_and_rows_with_tol():
27-
# Test DataFrame
28-
df1 = pd.DataFrame({
29-
"A": [1.00, 2.01, 3.00],
30-
"B": [1.01, 2.00, 3.01],
31-
"C": [4.00, 5.00, 6.00]
32-
})
33-
34-
# Within-tolerance duplicates - should identify and remove B
35-
result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=True)
36-
assert list(result_df.columns) == ["A", "C"], "Failed to remove near-duplicate columns accurately"
37-
38-
# No near duplicates within a small tolerance
39-
result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.001, remove=True)
40-
assert list(result_df.columns) == ["A", "B", "C"], "Incorrectly removed columns when they are not near duplicates"
41-
425
def test_check_exact_identical_columns_and_rows_remove_true():
436
df1 = pd.DataFrame({
447
"A": [1, 2, 3],
458
"B": [1, 2, 3],
469
"C": [4, 5, 6]
4710
})
4811

49-
result_df = check_identical_columns_and_rows(df1, remove=True)
12+
result_df, identical_cols, identical_rows = check_identical_columns_and_rows(df1, remove=True)
5013
assert list(result_df.columns) == ["A", "C"], "Failed to remove duplicate columns accurately"
14+
assert identical_cols == [("A", "B")], "Failed to identify exact duplicate columns"
15+
assert identical_rows == [], "Incorrectly identified duplicate rows where none exist"
5116

5217
def test_check_exact_identical_columns_and_rows_remove_false():
5318
df1 = pd.DataFrame({
@@ -56,9 +21,10 @@ def test_check_exact_identical_columns_and_rows_remove_false():
5621
"C": [4, 5, 6]
5722
})
5823

59-
# Check without removing duplicates
60-
result_df = check_identical_columns_and_rows(df1, remove=False)
24+
result_df, identical_cols, identical_rows = check_identical_columns_and_rows(df1, remove=False)
6125
assert list(result_df.columns) == ["A", "B", "C"], "Incorrectly identified or removed columns when remove=False"
26+
assert identical_cols == [("A", "B")], "Failed to identify exact duplicate columns"
27+
assert identical_rows == [], "Incorrectly found duplicate rows"
6228

6329
def test_check_identical_columns_and_rows_with_tol_remove_true():
6430
df1 = pd.DataFrame({
@@ -67,8 +33,10 @@ def test_check_identical_columns_and_rows_with_tol_remove_true():
6733
"C": [4.00, 5.00, 6.00]
6834
})
6935

70-
result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=True)
36+
result_df, identical_cols, identical_rows = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=True)
7137
assert list(result_df.columns) == ["A", "C"], "Failed to remove near-duplicate columns accurately with tolerance"
38+
assert identical_cols == [("A", "B")], "Failed to identify near-duplicate columns within tolerance"
39+
assert identical_rows == [], "Incorrectly found duplicate rows"
7240

7341
def test_check_identical_columns_and_rows_with_tol_remove_false():
7442
df1 = pd.DataFrame({
@@ -77,22 +45,26 @@ def test_check_identical_columns_and_rows_with_tol_remove_false():
7745
"C": [4.00, 5.00, 6.00]
7846
})
7947

80-
# Check without removing duplicates
81-
result_df = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=False)
48+
result_df, identical_cols, identical_rows = check_identical_columns_and_rows_with_tol(df1, tolerance=0.05, remove=False)
8249
assert list(result_df.columns) == ["A", "B", "C"], "Incorrectly identified or removed columns when remove=False"
50+
assert identical_cols == [("A", "B")], "Failed to identify near-duplicate columns within tolerance"
51+
assert identical_rows == [], "Incorrectly found duplicate rows"
8352

8453
def test_with_no_duplicates():
8554
df = pd.DataFrame({
8655
"X": [1, 2, 3],
8756
"Y": [4, 5, 6],
8857
"Z": [7, 8, 9]
8958
})
90-
result_df = check_identical_columns_and_rows(df, remove=True)
59+
result_df, identical_cols, identical_rows = check_identical_columns_and_rows(df, remove=True)
9160
assert list(result_df.columns) == ["X", "Y", "Z"], "Incorrectly removed columns in a no-duplicates scenario"
61+
assert identical_cols == [], "Incorrectly found duplicate columns where none exist"
62+
assert identical_rows == [], "Incorrectly found duplicate rows where none exist"
9263

93-
result_df_with_tol = check_identical_columns_and_rows_with_tol(df, tolerance=0.1, remove=True)
64+
result_df_with_tol, identical_cols_with_tol, identical_rows_with_tol = check_identical_columns_and_rows_with_tol(df, tolerance=0.1, remove=True)
9465
assert list(result_df_with_tol.columns) == ["X", "Y", "Z"], "Incorrectly removed columns in a no-duplicates scenario with tolerance"
95-
66+
assert identical_cols_with_tol == [], "Incorrectly found duplicate columns where none exist"
67+
assert identical_rows_with_tol == [], "Incorrectly found duplicate rows where none exist"
9668

9769
if __name__ == "__main__":
98-
pytest.main()
70+
pytest.main()

0 commit comments

Comments
 (0)