Skip to content

Commit 4a05581

Browse files
Add flat files stock trades tutorial (#643)
* Add flat files stock trades tutorial * Fix linting (ignore imports for examples)
1 parent dfec732 commit 4a05581

File tree

7 files changed

+265
-0
lines changed

7 files changed

+265
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
# We can use a Python script that aggregates trades by exchange into 30-minute
# chunks, setting the stage for a visual analysis. This approach will highlight
# trade flows, including opening hours and peak activity times, across the
# exchanges. Please see https://polygon.io/blog/insights-from-trade-level-data
#
import pandas as pd  # type: ignore
import seaborn as sns  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import numpy as np  # type: ignore
import pytz  # type: ignore  # noqa: F401  (listed in the tutorial's dependencies; tz data backend)

# Replace '2024-04-05.csv' with the path to your actual file
file_path = "2024-04-05.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Convert 'participant_timestamp' to datetime (assuming nanoseconds Unix timestamp)
df["participant_timestamp"] = pd.to_datetime(
    df["participant_timestamp"], unit="ns", utc=True
)

# Convert to Eastern Time (ET), accounting for both EST and EDT
df["participant_timestamp"] = df["participant_timestamp"].dt.tz_convert(
    "America/New_York"
)

# Create a new column for 30-minute time intervals, now in ET.
# "30min" is the current spelling of the deprecated "30T" offset alias.
df["time_interval"] = df["participant_timestamp"].dt.floor("30min").dt.time

# Ensure full 24-hour coverage by generating all possible 30-minute intervals
all_intervals = pd.date_range(start="00:00", end="23:59", freq="30min").time
all_exchanges = df["exchange"].unique()
full_index = pd.MultiIndex.from_product(
    [all_exchanges, all_intervals], names=["exchange", "time_interval"]
)

# Group by 'exchange' and 'time_interval', count trades, and reset index
grouped = (
    df.groupby(["exchange", "time_interval"])
    .size()
    .reindex(full_index, fill_value=0)
    .reset_index(name="trade_count")
)

# Pivot the DataFrame for the heatmap, ensuring all intervals and exchanges are
# represented. NOTE: DataFrame.pivot takes keyword-only arguments since
# pandas 2.0 — the old positional call raises a TypeError there.
pivot_table = grouped.pivot(
    index="exchange", columns="time_interval", values="trade_count"
).fillna(0)

# Apply a log scale transformation to the trade counts + 1 to handle zero trades correctly
log_scale_data = np.log1p(pivot_table.values)

# Plotting the heatmap using the log scale data
plt.figure(figsize=(20, 10))
sns.heatmap(
    log_scale_data,
    annot=False,
    cmap="Reds",
    linewidths=0.5,
    cbar=False,
    xticklabels=[t.strftime("%H:%M") for t in all_intervals],
    yticklabels=pivot_table.index,
)
plt.title("Trade Count Heatmap by Exchange and Time Interval (Log Scale, ET)")
plt.ylabel("Exchange")
plt.xlabel("Time Interval (ET)")
plt.xticks(rotation=45)
plt.tight_layout()  # Adjust layout to not cut off labels
plt.show()
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Here's a Python script for analyzing the dataset, that identifies the
2+
# distribution of trades across different exchanges and calculates their
3+
# respective percentages of the total trades. Please see
4+
# https://polygon.io/blog/insights-from-trade-level-data
5+
#
6+
import pandas as pd # type: ignore
7+
8+
# Replace '2024-04-05.csv' with the path to your actual file
9+
file_path = "2024-04-05.csv"
10+
11+
# Load the CSV file into a pandas DataFrame
12+
df = pd.read_csv(file_path)
13+
14+
# Count the number of trades for each exchange
15+
exchange_counts = df["exchange"].value_counts()
16+
17+
# Calculate the total number of trades
18+
total_trades = exchange_counts.sum()
19+
20+
# Print out all exchanges and their percentage of total trades
21+
for exchange, count in exchange_counts.items():
22+
percentage = (count / total_trades) * 100
23+
print(f"Exchange {exchange}: {count} trades, {percentage:.2f}% of total trades")
49.2 KB
Loading
67 KB
Loading
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Polygon.io Flat Files Stock Trades Analysis Scripts
2+
3+
This repository contains Python scripts for analyzing stock market trading data using Flat Files from Polygon.io. These scripts demonstrate various ways to dissect and visualize trade data for comprehensive market analysis.
4+
5+
Please see the tutorial: [Deep Dive into Trade-Level Data with Flat Files](https://polygon.io/blog/insights-from-trade-level-data)
6+
7+
## Scripts Overview
8+
9+
### **exchange-heatmap.py**
10+
This script aggregates trades by exchange into 30-minute chunks and creates a heatmap visualization. It highlights the flow of trades and peak activity times across different exchanges, providing insights into how different exchanges operate throughout the day.
11+
12+
![Heatmap Visualization](./heatmap.png)
13+
14+
### **exchanges-seen.py**
15+
Analyzes the distribution of trades across different exchanges and calculates their respective percentages of total trades. This script helps identify which exchanges handle the most trading volume, offering a perspective on market structure.
16+
17+
```
18+
Exchange 4: 25,570,324 trades, 36.32% of total trades
19+
Exchange 12: 15,147,689 trades, 21.52% of total trades
20+
Exchange 11: 6,877,306 trades, 9.77% of total trades
21+
Exchange 19: 5,098,852 trades, 7.24% of total trades
22+
Exchange 10: 4,006,611 trades, 5.69% of total trades
23+
Exchange 8: 3,686,168 trades, 5.24% of total trades
24+
Exchange 15: 2,446,340 trades, 3.47% of total trades
25+
Exchange 21: 2,173,744 trades, 3.09% of total trades
26+
Exchange 7: 1,509,083 trades, 2.14% of total trades
27+
Exchange 20: 1,296,811 trades, 1.84% of total trades
28+
Exchange 18: 674,553 trades, 0.96% of total trades
29+
Exchange 13: 527,767 trades, 0.75% of total trades
30+
Exchange 2: 417,295 trades, 0.59% of total trades
31+
Exchange 3: 393,919 trades, 0.56% of total trades
32+
Exchange 17: 230,210 trades, 0.33% of total trades
33+
Exchange 1: 183,010 trades, 0.26% of total trades
34+
Exchange 9: 159,020 trades, 0.23% of total trades
35+
Exchange 14: 1,211 trades, 0.00% of total trades
36+
```
37+
38+
### **top-10-tickers.py**
39+
Identifies the top 10 most traded stocks and calculates their respective percentages of the total trades. This script provides a clear view of the market's most active stocks, highlighting where the most trading activity is concentrated.
40+
41+
```
42+
TSLA: 1,549,605 trades, 2.20% of total trades
43+
NVDA: 788,331 trades, 1.12% of total trades
44+
SPY: 669,762 trades, 0.95% of total trades
45+
AMD: 587,140 trades, 0.83% of total trades
46+
MDIA: 561,698 trades, 0.80% of total trades
47+
AAPL: 540,870 trades, 0.77% of total trades
48+
SOXL: 533,511 trades, 0.76% of total trades
49+
QQQ: 508,822 trades, 0.72% of total trades
50+
CADL: 466,604 trades, 0.66% of total trades
51+
AMZN: 465,526 trades, 0.66% of total trades
52+
```
53+
54+
### **trades-histogram.py**
55+
Creates a histogram that aggregates trades into 30-minute intervals throughout the day. This visualization helps understand the distribution of trading volume across different times, including pre-market, regular trading hours, and after-hours.
56+
57+
![Histogram Visualization](./histogram.png)
58+
59+
## Download the Data
60+
61+
First, let's download an actual file and explore the data and see what we can learn. We start by downloading the trades for 2024-04-05 via the [File Browser](https://polygon.io/flat-files/stocks-trades/2024/04). The `us_stocks_sip/trades_v1/2024/04/2024-04-05.csv.gz` file is about 1.35GB and is in a compressed gzip format.
62+
63+
```
64+
gunzip 2024-04-05.csv.gz
65+
```
66+
67+
## Getting Started
68+
69+
To run these scripts, you will need Python 3 and several dependencies installed, including pandas, matplotlib, seaborn, and pytz. Ensure that you have the trading data file available and modify the `file_path` variable in each script to point to your data file location.
70+
71+
```
72+
pip install pandas matplotlib seaborn pytz
73+
```
74+
75+
## Usage
76+
77+
Each script is designed to be run independently:
78+
79+
```bash
80+
python exchange-heatmap.py
81+
python exchanges-seen.py
82+
python top-10-tickers.py
83+
python trades-histogram.py
84+
```
85+
86+
Adjust the script parameters as necessary to fit your specific analysis needs or to accommodate different datasets.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Here's a Python script for analyzing the dataset, that identifies the top 10
2+
# most traded stocks and calculates their respective percentages of the total
3+
# trades. Please see https://polygon.io/blog/insights-from-trade-level-data
4+
#
5+
import pandas as pd # type: ignore
6+
7+
# Replace '2024-04-05.csv' with the path to your actual file
8+
file_path = "2024-04-05.csv"
9+
10+
# Load the CSV file into a pandas DataFrame
11+
df = pd.read_csv(file_path)
12+
13+
# Count the number of trades for each ticker
14+
trade_counts = df["ticker"].value_counts()
15+
16+
# Calculate the total number of trades
17+
total_trades = trade_counts.sum()
18+
19+
# Get the top 10 traded stocks
20+
top_10_traded = trade_counts.head(10)
21+
22+
# Print out the top 10 traded stocks and their percentage of total trades
23+
for ticker, count in top_10_traded.items():
24+
percentage = (count / total_trades) * 100
25+
print(f"{ticker}: {count} trades, {percentage:.2f}% of total trades")
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# To visualize these dynamics, we can use a Python script to create a histogram
2+
# aggregating trades into 30-minute intervals, providing a clear view of when
3+
# trading activity concentrates during the day. This analysis aims to highlight
4+
# the distribution of trading volume across the day, from pre-market to after-
5+
# hours. Please see https://polygon.io/blog/insights-from-trade-level-data
6+
#
7+
import pandas as pd # type: ignore
8+
import matplotlib.pyplot as plt # type: ignore
9+
10+
# Replace '2024-04-05.csv' with the path to your actual file
11+
file_path = "2024-04-05.csv"
12+
13+
# Load the CSV file into a pandas DataFrame
14+
df = pd.read_csv(file_path)
15+
16+
# Convert 'participant_timestamp' to datetime (assuming nanoseconds Unix timestamp)
17+
df["participant_timestamp"] = pd.to_datetime(
18+
df["participant_timestamp"], unit="ns", utc=True
19+
)
20+
21+
# Convert to Eastern Time (ET), accounting for both EST and EDT
22+
df["participant_timestamp"] = df["participant_timestamp"].dt.tz_convert(
23+
"America/New_York"
24+
)
25+
26+
# Create a new column for 30-minute time intervals, now in ET
27+
df["time_interval"] = df["participant_timestamp"].dt.floor("30T")
28+
29+
# Aggregate trades into 30-minute intervals for the entire dataset
30+
trade_counts_per_interval = df.groupby("time_interval").size()
31+
32+
# Prepare the plot
33+
plt.figure(figsize=(15, 7))
34+
35+
# Plotting the histogram/bar chart
36+
bars = plt.bar(
37+
trade_counts_per_interval.index, trade_counts_per_interval.values, width=0.02
38+
)
39+
40+
# Adding trade count annotations on each bar
41+
for bar in bars:
42+
height = bar.get_height()
43+
plt.annotate(
44+
f"{int(height)}",
45+
xy=(bar.get_x() + bar.get_width() / 2, height),
46+
xytext=(0, 3), # 3 points vertical offset
47+
textcoords="offset points",
48+
ha="center",
49+
va="bottom",
50+
)
51+
52+
plt.title("Trade Counts Aggregated by 30-Minute Intervals (ET)")
53+
plt.xlabel("Time Interval (ET)")
54+
plt.ylabel("Number of Trades")
55+
plt.xticks(rotation=45, ha="right")
56+
57+
# Ensure that every 30-minute interval is represented on the x-axis
58+
plt.gca().set_xticklabels(
59+
[t.strftime("%Y-%m-%d %H:%M") for t in trade_counts_per_interval.index], rotation=90
60+
)
61+
62+
plt.tight_layout()
63+
plt.show()

0 commit comments

Comments
 (0)