Skip to content

Commit 7368f58

Browse files
committed
fix: Add dataframe.py to git
1 parent d87b076 commit 7368f58

File tree

2 files changed

+86
-1
lines changed

2 files changed

+86
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,5 @@ poetry.lock
 .pytest_cache/
 
 # Project-specific
-dataframe*
+dataframe.pkl
+dataframe_test.pkl

src/preprocessing/dataframe.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from typing import List
2+
import pandas as pd
3+
4+
from .snapshot import load_and_process_snapshot
5+
6+
7+
def save_dataframe(df: pd.DataFrame, filepath: str):
    """
    Write a DataFrame to disk in pickle format and announce where it went.

    Parameters:
        df (pd.DataFrame): The DataFrame to serialize.
        filepath (str): Destination path of the pickle file.
    """
    df.to_pickle(filepath)
    # Only reached when the write above succeeded, so the message is truthful.
    confirmation = f"DataFrame saved to {filepath}"
    print(confirmation)
17+
18+
19+
def load_dataframe(filepath: str) -> pd.DataFrame:
    """
    Read a pickled DataFrame back from disk and announce its origin.

    NOTE: unpickling executes arbitrary code embedded in the file, so this
    should only be pointed at pickle files from a trusted source.

    Parameters:
        filepath (str): Path of the pickle file to read.

    Returns:
        pd.DataFrame: The object stored in the pickle file.
    """
    loaded = pd.read_pickle(filepath)
    print(f"DataFrame loaded from {filepath}")
    return loaded
32+
33+
34+
def extract_timestamp_from_filepath(filepath: str) -> pd.Timestamp:
    """
    Extracts a timestamp from the filepath.

    Assumes the filepath is of the format: 'data/YYYY-MM-DD/orderbook_HH-MM.json'

    Parameters:
        filepath (str): The full path to the snapshot file.

    Returns:
        pd.Timestamp: The extracted timestamp, or pd.NaT when the path does
        not follow the expected layout or encodes an impossible date/time
        (e.g. month 13 or hour 25).
    """
    import re
    from datetime import datetime

    # The dot before 'json' is escaped so that only a literal '.json'
    # extension is accepted (an unescaped '.' would match any character).
    pattern = r'.*/(\d{4}-\d{2}-\d{2})/orderbook_(\d{2}-\d{2})\.json$'
    match = re.match(pattern, filepath)
    if not match:
        # If no timestamp found, return NaT (Not a Time)
        return pd.NaT

    date_str, time_str = match.groups()  # 'YYYY-MM-DD', 'HH-MM'
    try:
        # The regex only checks digit counts; strptime validates the actual
        # calendar/clock values and raises ValueError for nonsense such as
        # '2024-13-45'. Treat that the same as "no timestamp found".
        parsed = datetime.strptime(f"{date_str} {time_str}", '%Y-%m-%d %H-%M')
        return pd.Timestamp(parsed)
    except ValueError:
        return pd.NaT
61+
62+
63+
def load_snapshots_to_dataframe(filepaths: List[str]) -> pd.DataFrame:
    """
    Loads and processes snapshots from a list of filepaths to create a DataFrame.

    Each filepath is first run through extract_timestamp_from_filepath; paths
    that yield a null timestamp are skipped. The remaining paths are handed
    to load_and_process_snapshot together with their timestamp, and the
    results are assembled into one DataFrame.

    Parameters:
        filepaths (List[str]): List of snapshot filepaths.

    Returns:
        pd.DataFrame: One row per processed snapshot, indexed by 'timestamp'
        and sorted by that index. When no snapshot could be processed (empty
        input or every path skipped) an empty DataFrame with an empty
        'timestamp' DatetimeIndex is returned.
    """
    records = []
    for filepath in filepaths:
        timestamp = extract_timestamp_from_filepath(filepath)
        if pd.isnull(timestamp):
            continue  # Skip files without a valid timestamp
        # NOTE(review): each result is presumably a mapping that includes a
        # 'timestamp' entry, since that column is used as the index below;
        # confirm against load_and_process_snapshot.
        records.append(load_and_process_snapshot(filepath, timestamp))

    if not records:
        # pd.DataFrame([]) has no 'timestamp' column, so set_index would raise
        # KeyError; return an empty frame with the index callers expect.
        return pd.DataFrame(index=pd.DatetimeIndex([], name='timestamp'))

    return pd.DataFrame(records).set_index('timestamp').sort_index()

0 commit comments

Comments
 (0)