Skip to content

Commit c5b4833

Browse files
authored
Benchmarking MPP reading on YDB Prestable Analytics (#338)
1 parent f90c799 commit c5b4833

File tree

4 files changed

+103
-1
lines changed

4 files changed

+103
-1
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,3 @@ __pycache__
1515
.DS_Store
1616

1717
.ipynb_checkpoints
18-
*.csv
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
MaxTasksPerStage,duration
2+
4,14m 20s
3+
8,7m 8s
4+
16,4m 47s
5+
32,3m 22s
6+
64,3m 9s
7+
128,3m 9s
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Massively-parallel reading from MDB PostgreSQL on YDB Analytics Prestable
6+
"""
7+
8+
import pandas as pd
9+
import matplotlib.pyplot as plt
10+
import numpy as np
11+
import re
12+
from matplotlib.ticker import MultipleLocator
13+
14+
# Load the benchmark measurements from the CSV file next to the working
# directory and echo them so the raw input is visible in the console output
df = pd.read_csv("./ydb_20251006.csv")
print("Raw data:", df, sep="\n")
18+
19+
# Convert a textual duration such as "14m 20s" to a whole number of seconds
def duration_to_seconds(duration_str):
    """Return the number of seconds described by *duration_str*.

    Accepts any non-empty combination of hour, minute and second components,
    in that order and optionally separated by whitespace, for example
    "14m 20s", "3m9s", "45s", "2m" or "1h 2m 3s".

    Args:
        duration_str: The duration text. It is passed through ``str()``
            first, so a missing CSV cell (NaN) is reported as an invalid
            duration instead of failing inside the regex engine.

    Returns:
        The duration as an integer number of seconds.

    Raises:
        ValueError: If the text is not a duration in the format above.
            A silent fallback to 0 is deliberately avoided: a zero duration
            used as a divisor would show up as an infinite throughput.
    """
    # fullmatch (not match) so trailing garbage is rejected rather than ignored
    match = re.fullmatch(
        r'\s*(?:(\d+)h)?\s*(?:(\d+)m)?\s*(?:(\d+)s)?\s*',
        str(duration_str),
    )
    # Every component is optional in the pattern, so an empty/blank string
    # also "matches"; require at least one component to be present.
    if match is None or all(part is None for part in match.groups()):
        raise ValueError(f"Unrecognized duration: {duration_str!r}")

    hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return hours * 3600 + minutes * 60 + seconds
27+
28+
# Derive a numeric duration (in seconds) from each textual duration
df['duration_seconds'] = df['duration'].apply(duration_to_seconds)
print("\nData with duration in seconds:", df, sep="\n")

# Size of the data set in GiB; the same value is applied to every row
DATA_SIZE_GIB = 143.95
# The same size expressed in MiB (1 GiB = 1024 MiB)
DATA_SIZE_MIB = 1024 * DATA_SIZE_GIB

# Throughput in MiB/s: data size divided by each row's duration in seconds
df['throughput_mibs'] = DATA_SIZE_MIB / df['duration_seconds']
print("\nData with calculated throughput:", df, sep="\n")
42+
43+
# Open a 10x6 inch figure for the throughput chart
plt.figure(figsize=(10, 6))

# Draw throughput against the task limit as a line with round markers
tasks = df['MaxTasksPerStage']
throughput = df['throughput_mibs']
plt.plot(tasks, throughput, marker='o', linewidth=2, markersize=8)

# Title and axis captions
plt.title(
    'Massively-parallel reading from MDB PostgreSQL on YDB Analytics Prestable',
    fontsize=14,
)
plt.xlabel('MaxTasksPerStage', fontsize=12)
plt.ylabel('Throughput (MiB/s)', fontsize=12)

# Dashed, slightly transparent grid
plt.grid(True, linestyle='--', alpha=0.7)

# Put an x tick on every measured MaxTasksPerStage value
plt.xticks(tasks)

# Label each point with its throughput, 10 points above the marker
for task_count, mibs in zip(tasks, throughput):
    plt.annotate(
        f'{mibs:.2f}',
        (task_count, mibs),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
    )

# Save the chart to disk first, then display it interactively
plt.tight_layout()
plt.savefig('ydb_20251006.png')
plt.show()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
Replicas,Tasks,Time
2+
8,64,27
3+
8,32,30
4+
8,16,38
5+
8,8,66
6+
8,4,130
7+
8,2,242
8+
8,1,517
9+
4,64,41
10+
4,32,43
11+
4,16,50
12+
4,8,77
13+
4,4,129
14+
4,2,249
15+
4,1,480
16+
2,64,63
17+
2,32,57
18+
2,16,62
19+
2,8,79
20+
2,4,136
21+
2,2,256
22+
2,1,515
23+
1,64,102
24+
1,32,92
25+
1,16,101
26+
1,8,107
27+
1,4,134
28+
1,2,242
29+
1,1,463

0 commit comments

Comments
 (0)