|
1 | | -import itertools |
2 | | -import pandas as pd |
| 1 | +# Purpose: Functions for the VBDP project |
3 | 2 |
|
4 | 3 |
|
5 | 4 | def cluster_features(X): |
@@ -63,102 +62,3 @@ def affinity_propagation_features(X): |
63 | 62 | print("Estimated number of clusters: %d" % n_clusters_) |
64 | 63 | X["cluster"] = af.labels_ |
65 | 64 | return X |
66 | | - |
67 | | - |
def combine_features(X):
    """Combine every pair of columns using bitwise AND, OR and XOR.

    For each unordered pair of columns ``(c1, c2)``, taken in column order,
    three new columns named ``{c1}_and_{c2}``, ``{c1}_or_{c2}`` and
    ``{c1}_xor_{c2}`` are appended after the original columns. The input
    dataframe itself is not modified.

    Args:
        X (pd.DataFrame): dataframe with features; the column values must
            support the ``&``, ``|`` and ``^`` operators (e.g. bool or int).
    Returns:
        X (pd.DataFrame): new dataframe with the original columns followed
            by the combined columns.
    Examples:
        >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False]})
        >>> combined = combine_features(df)
        >>> list(combined.columns)
        ['a', 'b', 'a_and_b', 'a_or_b', 'a_xor_b']
        >>> combined["a_xor_b"].tolist()
        [False, True, True]
    """
    new_cols = []
    # Iterate over all pairs of columns
    for col1, col2 in itertools.combinations(X.columns, 2):
        left, right = X[col1], X[col2]
        # Column-wise operators replace the row-wise apply; each result is
        # named explicitly, otherwise pd.concat would number the unnamed
        # Series 0, 1, 2, ... instead of using the documented names.
        new_cols.extend(
            [
                (left & right).rename(f"{col1}_and_{col2}"),
                (left | right).rename(f"{col1}_or_{col2}"),
                (left ^ right).rename(f"{col1}_xor_{col2}"),
            ]
        )
    # Join all the new columns at once
    return pd.concat([X, *new_cols], axis=1)
99 | | - |
100 | | - |
def symptom_features(X, y):
    """Generate new features based on the joint symptoms of a disease.

    The features are renamed to ``x1 .. xn`` and averaged per value of ``y``.
    For every ``y`` value the two features with the highest and the two with
    the lowest mean are selected; each selected pair is combined with bitwise
    AND, OR and XOR on the integer-cast features. The resulting frame
    (without ``y``) is then passed through ``combine_features``.

    Args:
        X (pd.DataFrame): dataframe with features; needs at least two
            columns whose values can be cast to int
        y (pd.Series): series with target values
    Returns:
        X_new (pd.DataFrame): output of ``combine_features`` applied to the
            renamed features plus the pairwise symptom columns
        top_2_symptoms (dict): maps each ``y`` value to a dict
            ``{feature name: mean}`` for its two highest-mean features
        bot_2_symptoms (dict): maps each ``y`` value to a dict
            ``{feature name: mean}`` for its two lowest-mean features
    """
    # Combine X and y into one dataframe (pd.concat aligns rows on the index)
    Xy = pd.concat([X, y], axis=1)
    # Add names to the columns: x1, x2, ..., xn, y
    feature_cols = ["x" + str(i) for i in range(1, X.shape[1] + 1)]
    Xy.columns = feature_cols + ["y"]
    # Mean of every feature for each distinct y value
    marginals = Xy.groupby("y").mean()
    top_2_symptoms = {}
    bot_2_symptoms = {}
    # Pairs of feature names used for feature generation
    combinations = []
    for i, name in enumerate(marginals.index):
        # "ranked" instead of the original "sorted", which shadowed the builtin
        ranked = marginals.iloc[i].sort_values(ascending=False)
        top_1, top_2 = ranked.index[0], ranked.index[1]
        bot_1, bot_2 = ranked.index[-1], ranked.index[-2]
        top_2_symptoms[name] = {top_1: ranked.values[0], top_2: ranked.values[1]}
        bot_2_symptoms[name] = {bot_1: ranked.values[-1], bot_2: ranked.values[-2]}
        combinations.append(((top_1, top_2), (bot_1, bot_2)))

    # Cast every feature column (not y) to int in one call; astype returns a copy
    Xy_mod = Xy.astype({col: "int" for col in feature_cols})

    # Collect the pairwise columns once. setdefault keeps the first occurrence
    # of a name, which replaces the concat-then-drop-duplicated-columns step.
    new_columns = {}
    for group in combinations:
        for col1, col2 in group:
            new_columns.setdefault(f"{col1}_and_{col2}", Xy_mod[col1] & Xy_mod[col2])
            new_columns.setdefault(f"{col1}_or_{col2}", Xy_mod[col1] | Xy_mod[col2])
            new_columns.setdefault(f"{col1}_xor_{col2}", Xy_mod[col1] ^ Xy_mod[col2])
    if new_columns:
        Xy_mod = pd.concat([Xy_mod, pd.DataFrame(new_columns)], axis=1)

    print(f"Number of features: {Xy_mod.shape[1]}")
    print(f"Number of samples: {Xy_mod.shape[0]}")
    # remove the column y from the Xy_mod data frame
    X_mod = Xy_mod.drop(columns=["y"])
    # print the column names (this list still includes y)
    print(f"Column names: {Xy_mod.columns.values}")
    X_new = combine_features(X_mod)
    print(f"Number of features: {X_new.shape[1]}")
    print(f"Number of samples: {X_new.shape[0]}")
    return X_new, top_2_symptoms, bot_2_symptoms
0 commit comments