22from typing import List
33
44import numpy as np
5- import pandas as pd
65
76
87def selectNew (A : np .ndarray , X : np .ndarray , tolerance : float = 0 ) -> Tuple [np .ndarray , np .ndarray ]:
@@ -59,7 +58,7 @@ def find_equal_in_lists(a: List[int], b: List[int]) -> List[int]:
5958 return equal
6059
6160
62- def check_identical_columns_and_rows (df , remove = False , verbosity = 1 ) -> pd . DataFrame :
61+ def check_identical_columns_and_rows (df , remove = False , verbosity = 1 ) -> tuple :
6362 """
6463 Checks for exact identical columns and rows in the DataFrame.
6564
@@ -73,34 +72,50 @@ def check_identical_columns_and_rows(df, remove=False, verbosity=1) -> pd.DataFr
7372 verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
7473
7574 Returns:
76- pd.DataFrame: The DataFrame with duplicates removed if specified.
75+ tuple: A tuple containing the DataFrame with duplicates removed if specified,
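76+ a list of (label_i, label_j) tuples, one per pair of identical columns (label_j is the one dropped when remove is True),
77+ and a list of (label_i, label_j) tuples, one per pair of identical rows (label_j is the one dropped when remove is True).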
7778
7879 Example:
7980 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
80- >>> check_identical_columns_and_rows(df, "Example DataFrame", remove=False, verbosity=1)
81- Identical columns in Example DataFrame:
82- ['A', 'B']
81+ >>> check_identical_columns_and_rows(df, remove=False, verbosity=1)
82+ Identical columns in DataFrame:
83+ ('A', 'B')
8384 """
8485 # Check for exact identical columns
85- col_mask = df .T .duplicated (keep = "first" )
86- if col_mask .any () and verbosity > 0 :
87- print (list (df .columns [col_mask ]))
86+ identical_columns = []
87+ for i in range (len (df .columns )):
88+ for j in range (i + 1 , len (df .columns )):
89+ if df .iloc [:, i ].equals (df .iloc [:, j ]):
90+ identical_columns .append ((df .columns [i ], df .columns [j ]))
8891
89- if remove :
90- df = df .loc [:, ~ col_mask ]
92+ if identical_columns and verbosity > 0 :
93+ print ("Identical columns in DataFrame:" )
94+ for col_pair in identical_columns :
95+ print (col_pair )
96+
97+ if remove and identical_columns :
98+ df = df .drop (columns = [col_pair [1 ] for col_pair in identical_columns ])
9199
92100 # Check for exact identical rows
93- row_mask = df .duplicated (keep = "first" )
94- if row_mask .any () and verbosity > 0 :
95- print (list (df .index [row_mask ]))
101+ identical_rows = []
102+ for i in range (len (df .index )):
103+ for j in range (i + 1 , len (df .index )):
104+ if df .iloc [i , :].equals (df .iloc [j , :]):
105+ identical_rows .append ((df .index [i ], df .index [j ]))
96106
97- if remove :
98- df = df .loc [~ row_mask ]
107+ if identical_rows and verbosity > 0 :
108+ print ("Identical rows in DataFrame:" )
109+ for row_pair in identical_rows :
110+ print (row_pair )
111+
112+ if remove and identical_rows :
113+ df = df .drop (index = [row_pair [1 ] for row_pair in identical_rows ])
99114
100- return df
115+ return df , identical_columns , identical_rows
101116
102117
103- def check_identical_columns_and_rows_with_tol (df , tolerance , remove = False , verbosity = 1 ) -> pd . DataFrame :
118+ def check_identical_columns_and_rows_with_tol (df , tolerance , remove = False , verbosity = 1 ) -> tuple :
104119 """
105120 Checks for identical columns and rows within a given tolerance.
106121
@@ -111,13 +126,15 @@ def check_identical_columns_and_rows_with_tol(df, tolerance, remove=False, verbo
111126 verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.
112127
113128 Returns:
114- pd.DataFrame: The DataFrame with duplicates removed if specified.
129+ tuple: A tuple containing the DataFrame with duplicates removed if specified,
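130+ a list of (label_i, label_j) tuples, one per pair of columns that match within the tolerance (label_j is the one dropped when remove is True),
131+ and a list of (label_i, label_j) tuples, one per pair of rows that match within the tolerance (label_j is the one dropped when remove is True).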
115132
116133 Example:
117134 >>> df = pd.DataFrame({"A": [1, 1, 3], "B": [1, 1.01, 3], "C": [4, 5, 6]})
118- >>> check_identical_columns_and_rows_with_tol(df, "Example DataFrame", tolerance=0.05, remove=False, verbosity=1)
119- Identical columns within tolerance in Example DataFrame:
120- ('A', 'B')
135+ >>> check_identical_columns_and_rows_with_tol(df, tolerance=0.05, remove=False, verbosity=1)
136+ Identical columns within tolerance in DataFrame:
137+ ('A', 'B')
121138 """
122139
123140 # Function to compare rows/columns with tolerance
@@ -132,10 +149,11 @@ def is_identical_with_tolerance(series1, series2, tol):
132149 identical_columns .append ((df .columns [i ], df .columns [j ]))
133150
134151 if identical_columns and verbosity > 0 :
152+ print ("Identical columns within tolerance in DataFrame:" )
135153 for col_pair in identical_columns :
136154 print (col_pair )
137155
138- if remove :
156+ if remove and identical_columns :
139157 df = df .drop (columns = [col_pair [1 ] for col_pair in identical_columns ])
140158
141159 # Check for identical rows within tolerance
@@ -146,10 +164,11 @@ def is_identical_with_tolerance(series1, series2, tol):
146164 identical_rows .append ((df .index [i ], df .index [j ]))
147165
148166 if identical_rows and verbosity > 0 :
167+ print ("Identical rows within tolerance in DataFrame:" )
149168 for row_pair in identical_rows :
150169 print (row_pair )
151170
152- if remove :
171+ if remove and identical_rows :
153172 df = df .drop (index = [row_pair [1 ] for row_pair in identical_rows ])
154173
155- return df
174+ return df , identical_columns , identical_rows
0 commit comments