Benchmarking MPP reading on YDB Prestable Analytics (#338)

vitalyisaev2 · web-flow · commit c5b4833f87ad · 2025-10-06T12:18:26.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,3 @@ __pycache__
 .DS_Store
 
 .ipynb_checkpoints
-*.csv
diff --git a/scripts/parallel_read/ydb_20251006.csv b/scripts/parallel_read/ydb_20251006.csv
@@ -0,0 +1,7 @@
+MaxTasksPerStage,duration
+4,14m 20s
+8,7m 8s
+16,4m 47s
+32,3m 22s
+64,3m 9s
+128,3m 9s
diff --git a/scripts/parallel_read/ydb_20251006.py b/scripts/parallel_read/ydb_20251006.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Massively-parallel reading from MDB PostgreSQL on YDB Analytics Prestable
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import re
+from matplotlib.ticker import MultipleLocator
+
+# Read the CSV data
+df = pd.read_csv("./ydb_20251006.csv")
+print("Raw data:")
+print(df)
+
+# Function to convert duration in format "Xm Ys" to seconds
+def duration_to_seconds(duration_str):
+    """Convert an "Xm Ys" duration string to whole seconds; raise ValueError if it does not match."""
+    match = re.match(r'(\d+)m\s+(\d+)s', duration_str)
+    if match is None:  # fail loudly: a silent 0 would later be used as a divisor
+        raise ValueError(f"Unrecognized duration format: {duration_str!r}")
+    minutes, seconds = map(int, match.groups())
+    return minutes * 60 + seconds
+
+# Convert duration strings to seconds
+df['duration_seconds'] = df['duration'].apply(duration_to_seconds)
+print("\nData with duration in seconds:")
+print(df)
+
+# Define the data size constant (143.95 GiB)
+DATA_SIZE_GIB = 143.95
+# Convert GiB to MiB (1 GiB = 1024 MiB)
+DATA_SIZE_MIB = DATA_SIZE_GIB * 1024
+
+# Calculate throughput (MiB/s) by dividing data size by duration in seconds
+df['throughput_mibs'] = DATA_SIZE_MIB / df['duration_seconds']
+print("\nData with calculated throughput:")
+print(df)
+
+# Create the plot
+plt.figure(figsize=(10, 6))
+
+# Plot the data
+plt.plot(df['MaxTasksPerStage'], df['throughput_mibs'], marker='o', linewidth=2, markersize=8)
+
+# Set the title and labels
+plt.title('Massively-parallel reading from MDB PostgreSQL on YDB Analytics Prestable', fontsize=14)
+plt.xlabel('MaxTasksPerStage', fontsize=12)
+plt.ylabel('Throughput (MiB/s)', fontsize=12)
+
+# Set grid
+plt.grid(True, linestyle='--', alpha=0.7)
+
+# Customize x-axis to show all task values
+plt.xticks(df['MaxTasksPerStage'])
+
+# Add value annotations
+for x, y in zip(df['MaxTasksPerStage'], df['throughput_mibs']):
+    plt.annotate(f'{y:.2f}', (x, y), textcoords="offset points", 
+                 xytext=(0, 10), ha='center')
+
+plt.tight_layout()
+plt.savefig('ydb_20251006.png')
+plt.show()
diff --git a/scripts/parallel_read/yq_dev_20250810.csv b/scripts/parallel_read/yq_dev_20250810.csv
@@ -0,0 +1,29 @@
+Replicas,Tasks,Time
+8,64,27
+8,32,30
+8,16,38
+8,8,66
+8,4,130
+8,2,242
+8,1,517
+4,64,41
+4,32,43
+4,16,50
+4,8,77
+4,4,129
+4,2,249
+4,1,480
+2,64,63
+2,32,57
+2,16,62
+2,8,79
+2,4,136
+2,2,256
+2,1,515
+1,64,102
+1,32,92
+1,16,101
+1,8,107
+1,4,134
+1,2,242
+1,1,463

Original file line number	Diff line number	Diff line change
`@@ -15,4 +15,3 @@ __pycache__`
`15`	`15`	`.DS_Store`
`16`	`16`
`17`	`17`	`.ipynb_checkpoints`
`18`		`-*.csv`