|
import os
import pandas as pd
from collections import defaultdict
import pickle
import json

# Location of the per-day aggregate CSV exports.
data_dir = './aggregates_day/'

# ticker -> list of {date, trades, close_price} records, filled in below.
trades_data = defaultdict(list)

# Collect the daily CSV files; sorting keeps processing in chronological
# order (assumes the file names sort by date).
files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

print("Starting to process files...")
| 17 | + |
# Ingest each daily file (sorted order keeps dates chronological).
for file in files:
    print(f"Processing {file}")
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    # Convert the nanosecond epoch timestamps for the whole file in one
    # vectorized call instead of once per row — calling pd.to_datetime
    # inside a per-row loop is dramatically slower on large files.
    dates = pd.to_datetime(df['window_start'], unit='ns').dt.date
    # itertuples is much faster than iterrows and, unlike iterrows, does
    # not upcast every row to a single dtype.
    for row, date in zip(df.itertuples(index=False), dates):
        trades_data[row.ticker].append({
            'date': date,
            'trades': row.transactions,
            'close_price': row.close,  # requires a 'close' column in the CSV
        })

print("Finished processing files.")
print("Building lookup table...")
# Build the lookup table: ticker -> 'YYYY-MM-DD' -> per-day stats, where the
# rolling features are computed from the *previous* five trading days only.
lookup_table = defaultdict(dict)  # Nested dict: ticker -> date -> stats


def _nan_to_none(value):
    """Map pandas NaN/NaT to None so the table serializes to *valid* JSON.

    json.dump would otherwise emit a bare ``NaN`` token, which is not legal
    JSON (RFC 8259) and breaks strict parsers reading the file later.
    """
    return None if pd.isnull(value) else value


for ticker, records in trades_data.items():
    # One row per trading day for this ticker, indexed and sorted by date.
    df_ticker = pd.DataFrame(records)
    df_ticker.sort_values('date', inplace=True)
    df_ticker.set_index('date', inplace=True)

    # Day-over-day percentage change of the close price (NaN on the first day).
    df_ticker['price_diff'] = df_ticker['close_price'].pct_change() * 100

    # Shift by one day so each rolling window covers only *prior* days,
    # excluding the current day from its own statistics.
    shifted_trades = df_ticker['trades'].shift(1)
    df_ticker['avg_trades'] = shifted_trades.rolling(window=5).mean()
    df_ticker['std_trades'] = shifted_trades.rolling(window=5).std()

    for date, row in df_ticker.iterrows():
        # Keys are ISO date strings for JSON serialization. A single dict
        # literal replaces the original duplicated if/else branches: rows
        # without enough history simply get None for the rolling stats.
        lookup_table[ticker][date.strftime('%Y-%m-%d')] = {
            'trades': row['trades'],
            'close_price': row['close_price'],
            'price_diff': _nan_to_none(row['price_diff']),
            'avg_trades': _nan_to_none(row['avg_trades']),
            'std_trades': _nan_to_none(row['std_trades']),
        }

print("Lookup table built successfully.")
# defaultdict -> plain dict so the serialized payload is a standard mapping.
lookup_table = dict(lookup_table)

# Persist a human-readable JSON copy.
with open('lookup_table.json', 'w') as f:
    json.dump(lookup_table, f, indent=4)

print("Lookup table saved to 'lookup_table.json'.")

# Persist a pickle copy for fast programmatic reloading later.
with open('lookup_table.pkl', 'wb') as f:
    pickle.dump(lookup_table, f)

print("Lookup table saved to 'lookup_table.pkl'.")
0 commit comments