|
2 | 2 | from typing import List |
3 | 3 |
|
4 | 4 | import numpy as np |
| 5 | +import pandas as pd |
5 | 6 |
|
6 | 7 |
|
7 | 8 | def selectNew(A: np.ndarray, X: np.ndarray, tolerance: float = 0) -> Tuple[np.ndarray, np.ndarray]: |
@@ -56,3 +57,105 @@ def find_equal_in_lists(a: List[int], b: List[int]) -> List[int]: |
56 | 57 | """ |
57 | 58 | equal = [1 if a[i] == b[i] else 0 for i in range(len(a))] |
58 | 59 | return equal |
| 60 | + |
| 61 | + |
def check_identical_columns_and_rows(
    df: pd.DataFrame, name: str, remove: bool = False, verbosity: int = 1
) -> pd.DataFrame:
    """
    Report (and optionally remove) exactly duplicated columns and rows.

    Columns are checked first. When ``remove`` is true the duplicated columns
    are dropped before the rows are checked, so the row check runs on the
    reduced frame. Only the later occurrences are reported/removed; the first
    occurrence of each duplicate group is always kept.

    Note:
        This relies on ``DataFrame.duplicated`` and only finds exact
        duplicates. If checks with tolerance are needed, use
        `check_identical_columns_and_rows_with_tol()`.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        name (str): Name of the DataFrame for reporting.
        remove (bool): Whether to remove duplicate columns/rows.
        verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.

    Returns:
        pd.DataFrame: The DataFrame with duplicates removed if ``remove`` is
        true, otherwise the input DataFrame itself.

    Example:
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [4, 5, 6]})
        >>> _ = check_identical_columns_and_rows(df, "Example DataFrame", remove=False, verbosity=1)
        <BLANKLINE>
        Exact identical columns in Example DataFrame:
        ['B']
    """

    def duplicate_mask(frame: pd.DataFrame) -> np.ndarray:
        """Return a positional boolean mask flagging repeated rows of ``frame``."""
        mask = frame.duplicated(keep="first").to_numpy(dtype=bool)
        # For an empty frame (no rows or no columns) `duplicated` can hand back
        # a zero-length result; using that as an indexer would fail, so treat
        # such a frame as having no duplicates at all.
        if mask.size != frame.shape[0]:
            return np.zeros(frame.shape[0], dtype=bool)
        return mask

    # Check for exact identical columns (rows of the transposed frame).
    col_mask = duplicate_mask(df.T)
    if col_mask.any() and verbosity > 0:
        print(f"\nExact identical columns in {name}:")
        print(list(df.columns[col_mask]))

    if remove:
        df = df.loc[:, ~col_mask]

    # Check for exact identical rows (on the possibly column-reduced frame).
    row_mask = duplicate_mask(df)
    if row_mask.any() and verbosity > 0:
        print(f"\nExact identical rows in {name}:")
        print(list(df.index[row_mask]))

    if remove:
        df = df.loc[~row_mask]

    return df
| 104 | + |
| 105 | + |
def check_identical_columns_and_rows_with_tol(
    df: pd.DataFrame, name: str, tolerance: float, remove: bool = False, verbosity: int = 1
) -> pd.DataFrame:
    """
    Report (and optionally remove) columns and rows that match within a tolerance.

    Every pair of columns, then every pair of rows, is compared element-wise.
    Two vectors count as identical when all element differences are at most
    ``tolerance`` in absolute value; no relative tolerance is applied, so
    ``tolerance=0`` means exact equality regardless of magnitude. Columns are
    processed first; when ``remove`` is true the row check runs on the
    column-reduced frame.

    The comparison is pairwise (quadratic in the number of columns/rows) and
    goes through ``np.allclose``, so it is intended for numeric data.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        name (str): Name of the DataFrame for reporting.
        tolerance (float): Absolute tolerance for checking equivalence.
        remove (bool): Whether to remove duplicates found within the tolerance.
            For each reported pair the later member is removed, by position,
            so repeated index/column labels do not cause the retained first
            member to be dropped.
        verbosity (int): Level of verbosity; 0 for no output, 1 for standard messages.

    Returns:
        pd.DataFrame: The DataFrame with duplicates removed if specified.

    Example:
        >>> df = pd.DataFrame({"A": [1, 1, 3], "B": [1, 1.01, 3], "C": [4, 5, 6]})
        >>> _ = check_identical_columns_and_rows_with_tol(df, "Example DataFrame", tolerance=0.05, remove=False, verbosity=1)
        <BLANKLINE>
        Identical columns within tolerance in Example DataFrame:
        ('A', 'B')
    """

    def find_close_pairs(vectors, labels):
        """Return the label pairs that match and a positional mask of the later members."""
        pairs = []
        is_later_duplicate = np.zeros(len(vectors), dtype=bool)
        for i, reference in enumerate(vectors):
            for j in range(i + 1, len(vectors)):
                # rtol=0.0: np.allclose otherwise adds a relative term on top
                # of atol, which would exceed the requested tolerance for
                # large values.
                if np.allclose(reference, vectors[j], rtol=0.0, atol=tolerance):
                    pairs.append((labels[i], labels[j]))
                    is_later_duplicate[j] = True
        return pairs, is_later_duplicate

    # Check for identical columns within tolerance (extract each column once).
    column_vectors = [np.asarray(df.iloc[:, pos]) for pos in range(df.shape[1])]
    identical_columns, duplicate_columns = find_close_pairs(column_vectors, df.columns)

    if identical_columns and verbosity > 0:
        print(f"\nIdentical columns within tolerance in {name}:")
        for col_pair in identical_columns:
            print(col_pair)

    if remove:
        df = df.loc[:, ~duplicate_columns]

    # Check for identical rows within tolerance (on the possibly reduced frame).
    row_vectors = [np.asarray(df.iloc[pos, :]) for pos in range(df.shape[0])]
    identical_rows, duplicate_rows = find_close_pairs(row_vectors, df.index)

    if identical_rows and verbosity > 0:
        print(f"\nIdentical rows within tolerance in {name}:")
        for row_pair in identical_rows:
            print(row_pair)

    if remove:
        df = df.loc[~duplicate_rows]

    return df
0 commit comments