Skip to content

Commit 4a05581

Browse files
Add flat files stock trades tutorial (#643)
* Add flat files stock trades tutorial * Fix linting (ignore imports for examples)
1 parent dfec732 commit 4a05581

File tree

7 files changed

+265
-0
lines changed

7 files changed

+265
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
# We can use a Python script that aggregates trades by exchange into 30-minute
# chunks, setting the stage for a visual analysis. This approach will highlight
# trade flows, including opening hours and peak activity times, across the
# exchanges. Please see https://polygon.io/blog/insights-from-trade-level-data
#
import pandas as pd  # type: ignore
import seaborn as sns  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import numpy as np  # type: ignore
import pytz  # type: ignore  # noqa: F401  (listed in the tutorial's dependencies; tz data backend)

# Replace '2024-04-05.csv' with the path to your actual file
file_path = "2024-04-05.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Convert 'participant_timestamp' to datetime (assuming nanoseconds Unix timestamp)
df["participant_timestamp"] = pd.to_datetime(
    df["participant_timestamp"], unit="ns", utc=True
)

# Convert to Eastern Time (ET), accounting for both EST and EDT
df["participant_timestamp"] = df["participant_timestamp"].dt.tz_convert(
    "America/New_York"
)

# Create a new column for 30-minute time intervals, now in ET.
# "30min" is the current spelling of the deprecated "30T" offset alias.
df["time_interval"] = df["participant_timestamp"].dt.floor("30min").dt.time

# Ensure full 24-hour coverage by generating all possible 30-minute intervals
all_intervals = pd.date_range(start="00:00", end="23:59", freq="30min").time
all_exchanges = df["exchange"].unique()
full_index = pd.MultiIndex.from_product(
    [all_exchanges, all_intervals], names=["exchange", "time_interval"]
)

# Group by 'exchange' and 'time_interval', count trades, and reset index
grouped = (
    df.groupby(["exchange", "time_interval"])
    .size()
    .reindex(full_index, fill_value=0)
    .reset_index(name="trade_count")
)

# Pivot the DataFrame for the heatmap, ensuring all intervals and exchanges are
# represented. NOTE: DataFrame.pivot takes keyword-only arguments since
# pandas 2.0 — the old positional call raises a TypeError there.
pivot_table = grouped.pivot(
    index="exchange", columns="time_interval", values="trade_count"
).fillna(0)

# Apply a log scale transformation to the trade counts + 1 to handle zero trades correctly
log_scale_data = np.log1p(pivot_table.values)

# Plotting the heatmap using the log scale data
plt.figure(figsize=(20, 10))
sns.heatmap(
    log_scale_data,
    annot=False,
    cmap="Reds",
    linewidths=0.5,
    cbar=False,
    xticklabels=[t.strftime("%H:%M") for t in all_intervals],
    yticklabels=pivot_table.index,
)
plt.title("Trade Count Heatmap by Exchange and Time Interval (Log Scale, ET)")
plt.ylabel("Exchange")
plt.xlabel("Time Interval (ET)")
plt.xticks(rotation=45)
plt.tight_layout()  # Adjust layout to not cut off labels
plt.show()
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Here's a Python script for analyzing the dataset, that identifies the
2+
# distribution of trades across different exchanges and calculates their
3+
# respective percentages of the total trades. Please see
4+
# https://polygon.io/blog/insights-from-trade-level-data
5+
#
6+
import pandas as pd # type: ignore
7+
8+
# Replace '2024-04-05.csv' with the path to your actual file
9+
file_path = "2024-04-05.csv"
10+
11+
# Load the CSV file into a pandas DataFrame
12+
df = pd.read_csv(file_path)
13+
14+
# Count the number of trades for each exchange
15+
exchange_counts = df["exchange"].value_counts()
16+
17+
# Calculate the total number of trades
18+
total_trades = exchange_counts.sum()
19+
20+
# Print out all exchanges and their percentage of total trades
21+
for exchange, count in exchange_counts.items():
22+
percentage = (count / total_trades) * 100
23+
print(f"Exchange {exchange}: {count} trades, {percentage:.2f}% of total trades")
49.2 KB
Loading
67 KB
Loading
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Polygon.io Flat Files Stock Trades Analysis Scripts
2+
3+
This repository contains Python scripts for analyzing stock market trading data using Flat Files from Polygon.io. These scripts demonstrate various ways to dissect and visualize trade data for comprehensive market analysis.
4+
5+
Please see the tutorial: [Deep Dive into Trade-Level Data with Flat Files](https://polygon.io/blog/insights-from-trade-level-data)
6+
7+
## Scripts Overview
8+
9+
### **exchange-heatmap.py**
10+
This script aggregates trades by exchange into 30-minute chunks and creates a heatmap visualization. It highlights the flow of trades and peak activity times across different exchanges, providing insights into how different exchanges operate throughout the day.
11+
12+
![Heatmap Visualization](./heatmap.png)
13+
14+
### **exchanges-seen.py**
15+
Analyzes the distribution of trades across different exchanges and calculates their respective percentages of total trades. This script helps identify which exchanges handle the most trading volume, offering a perspective on market structure.
16+
17+
```
18+
Exchange 4: 25,570,324 trades, 36.32% of total trades
19+
Exchange 12: 15,147,689 trades, 21.52% of total trades
20+
Exchange 11: 6,877,306 trades, 9.77% of total trades
21+
Exchange 19: 5,098,852 trades, 7.24% of total trades
22+
Exchange 10: 4,006,611 trades, 5.69% of total trades
23+
Exchange 8: 3,686,168 trades, 5.24% of total trades
24+
Exchange 15: 2,446,340 trades, 3.47% of total trades
25+
Exchange 21: 2,173,744 trades, 3.09% of total trades
26+
Exchange 7: 1,509,083 trades, 2.14% of total trades
27+
Exchange 20: 1,296,811 trades, 1.84% of total trades
28+
Exchange 18: 674,553 trades, 0.96% of total trades
29+
Exchange 13: 527,767 trades, 0.75% of total trades
30+
Exchange 2: 417,295 trades, 0.59% of total trades
31+
Exchange 3: 393,919 trades, 0.56% of total trades
32+
Exchange 17: 230,210 trades, 0.33% of total trades
33+
Exchange 1: 183,010 trades, 0.26% of total trades
34+
Exchange 9: 159,020 trades, 0.23% of total trades
35+
Exchange 14: 1,211 trades, 0.00% of total trades
36+
```
37+
38+
### **top-10-tickers.py**
39+
Identifies the top 10 most traded stocks and calculates their respective percentages of the total trades. This script provides a clear view of the market's most active stocks, highlighting where the most trading activity is concentrated.
40+
41+
```
42+
TSLA: 1,549,605 trades, 2.20% of total trades
43+
NVDA: 788,331 trades, 1.12% of total trades
44+
SPY: 669,762 trades, 0.95% of total trades
45+
AMD: 587,140 trades, 0.83% of total trades
46+
MDIA: 561,698 trades, 0.80% of total trades
47+
AAPL: 540,870 trades, 0.77% of total trades
48+
SOXL: 533,511 trades, 0.76% of total trades
49+
QQQ: 508,822 trades, 0.72% of total trades
50+
CADL: 466,604 trades, 0.66% of total trades
51+
AMZN: 465,526 trades, 0.66% of total trades
52+
```
53+
54+
### **trades-histogram.py**
55+
Creates a histogram that aggregates trades into 30-minute intervals throughout the day. This visualization helps understand the distribution of trading volume across different times, including pre-market, regular trading hours, and after-hours.
56+
57+
![Histogram Visualization](./histogram.png)
58+
59+
## Download the Data
60+
61+
First, let's download an actual file and explore the data and see what we can learn. We start by downloading the trades for 2024-04-05 via the [File Browser](https://polygon.io/flat-files/stocks-trades/2024/04). The `us_stocks_sip/trades_v1/2024/04/2024-04-05.csv.gz` file is about 1.35GB and is in a compressed gzip format.
62+
63+
```
64+
gunzip 2024-04-05.csv.gz
65+
```
66+
67+
## Getting Started
68+
69+
To run these scripts, you will need Python 3 and several dependencies installed, including pandas, matplotlib, seaborn, and pytz. Ensure that you have the trading data file available and modify the `file_path` variable in each script to point to your data file location.
70+
71+
```
72+
pip install pandas matplotlib seaborn pytz
73+
```
74+
75+
## Usage
76+
77+
Each script is designed to be run independently:
78+
79+
```bash
80+
python exchange-heatmap.py
81+
python exchanges-seen.py
82+
python top-10-tickers.py
83+
python trades-histogram.py
84+
```
85+
86+
Adjust the script parameters as necessary to fit your specific analysis needs or to accommodate different datasets.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Here's a Python script for analyzing the dataset, that identifies the top 10
2+
# most traded stocks and calculates their respective percentages of the total
3+
# trades. Please see https://polygon.io/blog/insights-from-trade-level-data
4+
#
5+
import pandas as pd # type: ignore
6+
7+
# Replace '2024-04-05.csv' with the path to your actual file
8+
file_path = "2024-04-05.csv"
9+
10+
# Load the CSV file into a pandas DataFrame
11+
df = pd.read_csv(file_path)
12+
13+
# Count the number of trades for each ticker
14+
trade_counts = df["ticker"].value_counts()
15+
16+
# Calculate the total number of trades
17+
total_trades = trade_counts.sum()
18+
19+
# Get the top 10 traded stocks
20+
top_10_traded = trade_counts.head(10)
21+
22+
# Print out the top 10 traded stocks and their percentage of total trades
23+
for ticker, count in top_10_traded.items():
24+
percentage = (count / total_trades) * 100
25+
print(f"{ticker}: {count} trades, {percentage:.2f}% of total trades")
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# To visualize these dynamics, we can use a Python script to create a histogram
2+
# aggregating trades into 30-minute intervals, providing a clear view of when
3+
# trading activity concentrates during the day. This analysis aims to highlight
4+
# the distribution of trading volume across the day, from pre-market to after-
5+
# hours. Please see https://polygon.io/blog/insights-from-trade-level-data
6+
#
7+
import pandas as pd # type: ignore
8+
import matplotlib.pyplot as plt # type: ignore
9+
10+
# Replace '2024-04-05.csv' with the path to your actual file
11+
file_path = "2024-04-05.csv"
12+
13+
# Load the CSV file into a pandas DataFrame
14+
df = pd.read_csv(file_path)
15+
16+
# Convert 'participant_timestamp' to datetime (assuming nanoseconds Unix timestamp)
17+
df["participant_timestamp"] = pd.to_datetime(
18+
df["participant_timestamp"], unit="ns", utc=True
19+
)
20+
21+
# Convert to Eastern Time (ET), accounting for both EST and EDT
22+
df["participant_timestamp"] = df["participant_timestamp"].dt.tz_convert(
23+
"America/New_York"
24+
)
25+
26+
# Create a new column for 30-minute time intervals, now in ET
27+
df["time_interval"] = df["participant_timestamp"].dt.floor("30T")
28+
29+
# Aggregate trades into 30-minute intervals for the entire dataset
30+
trade_counts_per_interval = df.groupby("time_interval").size()
31+
32+
# Prepare the plot
33+
plt.figure(figsize=(15, 7))
34+
35+
# Plotting the histogram/bar chart
36+
bars = plt.bar(
37+
trade_counts_per_interval.index, trade_counts_per_interval.values, width=0.02
38+
)
39+
40+
# Adding trade count annotations on each bar
41+
for bar in bars:
42+
height = bar.get_height()
43+
plt.annotate(
44+
f"{int(height)}",
45+
xy=(bar.get_x() + bar.get_width() / 2, height),
46+
xytext=(0, 3), # 3 points vertical offset
47+
textcoords="offset points",
48+
ha="center",
49+
va="bottom",
50+
)
51+
52+
plt.title("Trade Counts Aggregated by 30-Minute Intervals (ET)")
53+
plt.xlabel("Time Interval (ET)")
54+
plt.ylabel("Number of Trades")
55+
plt.xticks(rotation=45, ha="right")
56+
57+
# Ensure that every 30-minute interval is represented on the x-axis
58+
plt.gca().set_xticklabels(
59+
[t.strftime("%Y-%m-%d %H:%M") for t in trade_counts_per_interval.index], rotation=90
60+
)
61+
62+
plt.tight_layout()
63+
plt.show()

0 commit comments

Comments
 (0)