-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmoving_average_python_polars.py
More file actions
123 lines (93 loc) · 5.87 KB
/
moving_average_python_polars.py
File metadata and controls
123 lines (93 loc) · 5.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import polars as pl
import numpy as np
import time
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp
import os
def calculate_features_polars(df):
    """Calculate quantitative price features with Polars expressions.

    Args:
        df: Polars DataFrame providing the columns "Open", "High", "Low",
            "Close" and "volume" (exactly these spellings are read).

    Returns:
        A float64 NumPy array of shape (n_rows, 101). Column ``k`` holds
        ``feature_k``; columns whose feature is not computed here
        (10, 18-22 and 38-100) are filled with zeros.

    Notes:
        Rolling and shifted features have no value for their first rows;
        those Polars nulls are expected to show up as NaN after the NumPy
        conversion (NOTE(review): confirm against the installed Polars).
    """
    total_features = 101  # feature_0 .. feature_100

    open_ = pl.col("Open")
    high = pl.col("High")
    low = pl.col("Low")
    close = pl.col("Close")
    prev_close = close.shift(1)

    # All features are computed in a single select so Polars can evaluate
    # the expressions as vectorized column operations.
    result_df = df.select([
        # Basic price features
        close.alias("feature_0"),  # Close price
        open_.alias("feature_1"),  # Open price
        high.alias("feature_2"),  # High price
        low.alias("feature_3"),  # Low price
        pl.col("volume").alias("feature_4"),  # Volume
        # Return calculations
        ((close - open_) / open_).alias("feature_5"),  # Open-to-close return
        # High-low range relative to open (same expression as feature_23)
        ((high - low) / open_).alias("feature_6"),
        # Position of the close inside the high-low range; the division is
        # by zero whenever High == Low.
        ((close - low) / (high - low)).alias("feature_7"),
        # Moving averages using rolling windows
        close.rolling_mean(window_size=5).alias("feature_8"),  # 5-period SMA
        close.rolling_mean(window_size=10).alias("feature_11"),  # 10-period SMA
        close.rolling_mean(window_size=20).alias("feature_13"),  # 20-period SMA
        # Volatility measures
        close.rolling_std(window_size=5).alias("feature_9"),  # 5-period volatility
        close.rolling_std(window_size=10).alias("feature_12"),  # 10-period volatility
        close.rolling_std(window_size=20).alias("feature_14"),  # 20-period volatility
        # Momentum indicators
        ((close - prev_close) / prev_close).alias("feature_15"),  # 1-period return
        ((close - close.shift(3)) / close.shift(3)).alias("feature_16"),  # 3-period return
        ((close - close.shift(5)) / close.shift(5)).alias("feature_17"),  # 5-period return
        # Range-based features
        ((high - low) / open_).alias("feature_23"),  # Daily range
        ((high - close) / open_).alias("feature_24"),  # Upper shadow
        ((close - low) / open_).alias("feature_25"),  # Lower shadow
        ((close - open_).abs() / open_).alias("feature_26"),  # Body size
        # Log returns
        (close / prev_close).log().alias("feature_27"),
        # Directional features (booleans stored as 0.0 / 1.0)
        (close > open_).cast(pl.Float64).alias("feature_28"),  # Bullish/Bearish
        (high > high.shift(1)).cast(pl.Float64).alias("feature_29"),  # New high
        (low < low.shift(1)).cast(pl.Float64).alias("feature_30"),  # New low
        # Gap features
        ((open_ - prev_close) / prev_close).alias("feature_31"),  # Gap up/down
        (open_ > prev_close).cast(pl.Float64).alias("feature_32"),  # Gap up indicator
        (open_ < prev_close).cast(pl.Float64).alias("feature_33"),  # Gap down indicator
        # Price level features
        close.log().alias("feature_34"),  # Log price
        (close - open_).alias("feature_35"),  # Body difference
        # Upper shadow normalized: high minus the larger of open/close
        (
            (high - pl.when(open_ > close).then(open_).otherwise(close)) / open_
        ).alias("feature_36"),
        # Lower shadow normalized: the smaller of open/close minus low
        (
            (pl.when(open_ < close).then(open_).otherwise(close) - low) / open_
        ).alias("feature_37"),
    ])

    # Place every feature in the column that matches its number. Stacking
    # only the existing columns would shift everything after a gap in the
    # numbering (e.g. feature_11 would end up in column 10).
    features = np.zeros((len(result_df), total_features))
    for name in result_df.columns:
        column_index = int(name.rsplit("_", 1)[1])
        features[:, column_index] = result_df[name].to_numpy()
    return features
def calculate_moving_averages_polars(df, periods):
    """Compute one simple moving average of "Close" per requested period.

    Args:
        df: Polars DataFrame containing a "Close" column.
        periods: Iterable of window sizes.

    Returns:
        Dict mapping "MA_<period>" to a NumPy array of the rolling mean
        with null entries removed.
    """
    def _rolling_close_mean(window, label):
        # One single-column frame per window, reduced to its non-null values.
        frame = df.select(
            pl.col("Close").rolling_mean(window_size=window).alias(label)
        )
        return frame[label].drop_nulls().to_numpy()

    return {
        f"MA_{window}": _rolling_close_mean(window, f"MA_{window}")
        for window in periods
    }
def main():
    """Run the benchmark: load the CSV, build features and MAs, report timing.

    Reads "USDJPY2.csv" from the current working directory, computes the
    feature matrix and the 200- to 220-period moving averages of "Close",
    and prints progress plus the total elapsed time in milliseconds.
    """
    # perf_counter is monotonic and high-resolution, so the measured
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()

    print("Reading CSV file with Polars...")
    df = pl.read_csv("USDJPY2.csv")
    print(f"Loaded {len(df)} records.")

    print("Calculating 100+ quantitative features using Polars...")
    features = calculate_features_polars(df)
    print(f"Calculated {features.shape[1]} quantitative features for {features.shape[0]} rows.")

    # Calculate moving averages for periods 200-220 using Polars
    print("Calculating moving averages for periods 200-220 using Polars...")
    ma_periods = list(range(200, 221))  # 200 to 220 inclusive
    all_mas = calculate_moving_averages_polars(df, ma_periods)
    # Keys are labels of the form "MA_<period>".
    for ma_label, ma_values in all_mas.items():
        print(f"Calculated {len(ma_values)} {ma_label} values.")

    elapsed_ms = (time.perf_counter() - start_time) * 1000  # seconds -> ms
    print(f"Total execution time: {elapsed_ms:.2f} ms")
    print(f"Features shape: {features.shape}")
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()