v0.2.36

bartzbeielstein · bartzbeielstein · commit 0a826f22c2af · 2023-06-17T23:38:00.000+02:00
combine_features moved to add_logical_columns
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotPython"
-version = "0.2.35"
+version = "0.2.36"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/data/vbdp.py b/src/spotPython/data/vbdp.py
@@ -1,5 +1,4 @@
-import itertools
-import pandas as pd
+# Purpose: Functions for the VBDP project
 
 
 def cluster_features(X):
@@ -63,102 +62,3 @@ def affinity_propagation_features(X):
     print("Estimated number of clusters: %d" % n_clusters_)
     X["cluster"] = af.labels_
     return X
-
-
-def combine_features(X):
-    """Combines all features in a dataframe with each other using bitwise operations
-
-    Args:
-        X (pd.DataFrame): dataframe with features
-    Returns:
-        X (pd.DataFrame): dataframe with new features
-        Examples:
-            >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
-            >>> df
-                a      b      c
-            0  True   True  False
-            1 False   True  False
-            2  True  False   True
-            >>> combine_features(df)
-                a      b      c  a_and_b  a_or_b  a_xor_b  a_and_c  a_or_c  a_xor_c  b_and_c  b_or_c  b_xor_c
-            0  True   True  False     True    True    False    False    True     True    False    True     True
-            1 False   True  False    False    True     True    False   False    False    False   False    False
-            2  True  False   True    False    True     True     True    True    False    False    True     True
-    """
-    new_cols = []
-    # Iterate over all pairs of columns
-    for col1, col2 in itertools.combinations(X.columns, 2):
-        # Create new columns for the bitwise AND, OR and XOR operations
-        and_col = X[[col1, col2]].apply(lambda x: x[col1] & x[col2], axis=1)
-        or_col = X[[col1, col2]].apply(lambda x: x[col1] | x[col2], axis=1)
-        xor_col = X[[col1, col2]].apply(lambda x: x[col1] ^ x[col2], axis=1)
-        new_cols.extend([and_col, or_col, xor_col])
-    # Join all the new columns at once
-    X = pd.concat([X] + new_cols, axis=1)
-    return X
-
-
-def symptom_features(X, y):
-    """Generate new features based on the joint symptoms of a disease
-    Args:
-        X (pd.DataFrame): dataframe with features
-        y (pd.Series): series with target values
-    """
-    # Combine X and y into one dataframe
-    Xy = pd.concat([X, y], axis=1)
-    # Add names to the columns: x1, x2, ..., xn, y
-    Xy.columns = ["x" + str(i) for i in range(1, X.shape[1] + 1)] + ["y"]
-    # full train data with X and y values
-    marginals = Xy.groupby("y").mean()
-    top_2_symptoms = {}
-    bot_2_symptoms = {}
-    # for feature generation
-    combinations = []
-    for i in range(marginals.shape[0]):
-        symptoms = marginals.iloc[i]
-        # for b in True, False:
-        sorted = symptoms.sort_values(ascending=False)
-        top_1 = sorted.keys()[0]
-        top_1_per = sorted.values[0]
-        top_2 = sorted.keys()[1]
-        top_2_per = sorted.values[1]
-
-        bot_1 = sorted.keys()[-1]
-        bot_1_per = sorted.values[-1]
-        bot_2 = sorted.keys()[-2]
-        bot_2_per = sorted.values[-2]
-
-        name = marginals.index[i]
-        dic = {top_1: top_1_per, top_2: top_2_per}
-        dic_bot = {bot_1: bot_1_per, bot_2: bot_2_per}
-        top_2_symptoms[name] = dic
-        bot_2_symptoms[name] = dic_bot
-        combinations.append(((top_1, top_2), (bot_1, bot_2)))
-    Xy_mod = Xy.copy()
-    convert = Xy.drop(columns=["y"]).columns.values
-    for val in convert:
-        Xy_mod[val] = Xy_mod[val].astype("int")
-    for group in combinations:
-        for comb in group:
-            col1, col2 = comb
-            new_columns = pd.DataFrame(
-                {
-                    f"{col1}_and_{col2}": Xy_mod[col1] & Xy_mod[col2],
-                    f"{col1}_or_{col2}": Xy_mod[col1] | Xy_mod[col2],
-                    f"{col1}_xor_{col2}": Xy_mod[col1] ^ Xy_mod[col2],
-                }
-            )
-            Xy_mod = pd.concat([Xy_mod, new_columns], axis=1)
-    # removing duplicate features
-    Xy_mod = Xy_mod.loc[:, ~Xy_mod.columns.duplicated()].copy()
-    print(f"Number of features: {Xy_mod.shape[1]}")
-    print(f"Number of samples: {Xy_mod.shape[0]}")
-    # remove the column y from the Xy_mod data frame
-    X_mod = Xy_mod.drop(columns=["y"])
-    # print the column names
-    print(f"Column names: {Xy_mod.columns.values}")
-    # X_new = add_logical_columns(X_mod, 2)
-    X_new = combine_features(X_mod)
-    print(f"Number of features: {X_new.shape[1]}")
-    print(f"Number of samples: {X_new.shape[0]}")
-    return X_new, top_2_symptoms, bot_2_symptoms
diff --git a/src/spotPython/utils/convert.py b/src/spotPython/utils/convert.py
@@ -71,47 +71,37 @@ def series_to_array(series):
         return series.to_numpy()
 
 
-def add_logical_columns(df, arity):
-    """Adds logical columns to a DataFrame.
+def add_logical_columns(X, arity=2, operations=["and", "or", "xor"]):
+    """Combines all features in a dataframe with each other using bitwise operations
+
     Args:
-        df (pandas.DataFrame): The input DataFrame.
-        arity (int): The arity of the logical columns.
+        X (pd.DataFrame): dataframe with features
+        arity (int): the number of columns to combine at once
+        operations (list of str): the operations to apply. Possible values are 'and', 'or' and 'xor'
     Returns:
-        pandas.DataFrame: The output DataFrame.
-    Example:
-        >>> from spotPython.utils.convert import add_logical_columns
-        >>> import pandas as pd
-        >>> df = pd.DataFrame({'A': [True, False, True], 'B': [False, True, True], 'C': [True, True, False]})
-        >>> result = add_logical_columns(df, 2)
-        >>> print(result)
-            A      B      C  and_A_B  or_A_B  xor_A_B  and_A_C  or_A_C  xor_A_C  and_B_C  or_B_C  xor_B_C
-        0   True  False   True    False    True     True     True    True    False   False    True     True
-        1  False   True   True    False    True     True    False    True     True   False    True     True
-        2   True   True  False     True    True    False    False    True     True    True    True    False
-    """
-    # Create a copy of the input DataFrame to avoid modifying it
-    result = df.copy()
-
-    # Create empty DataFrames for the additional columns
-    and_df = pd.DataFrame(index=df.index)
-    or_df = pd.DataFrame(index=df.index)
-    xor_df = pd.DataFrame(index=df.index)
+        X (pd.DataFrame): dataframe with new features
+    Examples:
+        >>> X = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
+        >>> add_logical_columns(X)
+            a      b      c  a_and_b  a_and_c  b_and_c  a_or_b  a_or_c  b_or_c  a_xor_b  a_xor_c  b_xor_c
+        0  True   True  False     True    False    False    True    True    True    False     True     True
+        1 False   True  False    False    False    False    True   False    True     True    False     True
+        2  True  False   True    False     True    False    True    True    True     True    False     True
 
-    # Get all combinations of columns with the specified arity
-    column_combinations = list(combinations(df.columns, arity))
-
-    # Apply the logical_and, logical_or and logical_xor functions to all combinations of columns
-    for cols in column_combinations:
-        col_name = "_".join(cols)
-        and_df[f"and_{col_name}"] = result[cols[0]]
-        or_df[f"or_{col_name}"] = result[cols[0]]
-        xor_df[f"xor_{col_name}"] = result[cols[0]]
-        for col in cols[1:]:
-            and_df[f"and_{col_name}"] &= result[col]
-            or_df[f"or_{col_name}"] |= result[col]
-            xor_df[f"xor_{col_name}"] ^= result[col]
-
-    # Concatenate the input DataFrame with the additional columns
-    result = pd.concat([result, and_df, or_df, xor_df], axis=1)
-
-    return result
+    """
+    new_cols = []
+    # Iterate over all combinations of columns of the given arity
+    for cols in combinations(X.columns, arity):
+        # Create new columns for the specified operations
+        if "and" in operations:
+            and_col = X[list(cols)].apply(lambda x: x.all(), axis=1)
+            new_cols.append(and_col)
+        if "or" in operations:
+            or_col = X[list(cols)].apply(lambda x: x.any(), axis=1)
+            new_cols.append(or_col)
+        if "xor" in operations:
+            xor_col = X[list(cols)].apply(lambda x: x.sum() % 2 == 1, axis=1)
+            new_cols.append(xor_col)
+    # Join all the new columns at once
+    X = pd.concat([X] + new_cols, axis=1)
+    return X

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotPython"`
`10`		`-version = "0.2.35"`
	`10`	`+version = "0.2.36"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`