|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +""" |
| 5 | +Massively-parallel reading from MDB PostgreSQL on YDB Analytics Prestable |
| 6 | +""" |
| 7 | + |
| 8 | +import pandas as pd |
| 9 | +import matplotlib.pyplot as plt |
| 10 | +import numpy as np |
| 11 | +import re |
| 12 | +from matplotlib.ticker import MultipleLocator |
| 13 | + |
# Load the benchmark results from CSV and echo them as read
df = pd.read_csv("./ydb_20251006.csv")
print("Raw data:", df, sep="\n")
| 18 | + |
# Matches optional "<N>h", "<N>m" and "<N>s" components, in that order, with
# optional whitespace around them (e.g. "3m 25s", "45s", "1m30s", "1h 2m 3s").
# The negative lookahead after each unit letter keeps "500ms" from being read
# as 500 minutes.
_DURATION_RE = re.compile(
    r'\s*(?:(\d+)h(?![A-Za-z]))?'
    r'\s*(?:(\d+)m(?![A-Za-z]))?'
    r'\s*(?:(\d+)s(?![A-Za-z]))?'
)


def duration_to_seconds(duration_str):
    """Convert a duration such as "3m 25s" to a whole number of seconds.

    Hours, minutes and seconds are each optional, so "45s", "2m", "1m30s"
    and "1h 2m 3s" are all accepted. Parsing is anchored at the start of
    the string and ignores any trailing text.

    Args:
        duration_str: Textual duration made of integer "h", "m" and "s"
            components.

    Returns:
        The duration in seconds as an int, or 0 when no component could
        be recognised at the start of the string.
    """
    # Every component is optional, so the pattern always matches (possibly
    # the empty string); what matters is whether any group captured.
    parts = _DURATION_RE.match(duration_str).groups()
    if all(part is None for part in parts):
        return 0
    hours, minutes, seconds = (int(part) if part is not None else 0
                               for part in parts)
    return hours * 3600 + minutes * 60 + seconds
| 27 | + |
# Derive a numeric duration column (in seconds) from the textual one
df['duration_seconds'] = df['duration'].map(duration_to_seconds)
print("\nData with duration in seconds:", df, sep="\n")
| 32 | + |
# Data size constant used for every row: 143.95 GiB,
# also expressed in MiB (1 GiB = 1024 MiB)
DATA_SIZE_GIB = 143.95
DATA_SIZE_MIB = DATA_SIZE_GIB * 1024

# Throughput in MiB/s = data size in MiB / duration in seconds
df['throughput_mibs'] = DATA_SIZE_MIB / df['duration_seconds']
print("\nData with calculated throughput:", df, sep="\n")
| 42 | + |
# Build the figure through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(10, 6))

task_counts = df['MaxTasksPerStage']
rates = df['throughput_mibs']

# Throughput as a function of MaxTasksPerStage
ax.plot(task_counts, rates, marker='o', linewidth=2, markersize=8)

# Title and axis labels
ax.set_title('Massively-parallel reading from MDB PostgreSQL on YDB Analytics Prestable', fontsize=14)
ax.set_xlabel('MaxTasksPerStage', fontsize=12)
ax.set_ylabel('Throughput (MiB/s)', fontsize=12)

# Dashed, semi-transparent grid
ax.grid(True, linestyle='--', alpha=0.7)

# One x tick per MaxTasksPerStage value present in the data
ax.set_xticks(task_counts)

# Label every point with its throughput, placed 10 points above the marker
for task_count, rate in zip(task_counts, rates):
    ax.annotate(
        f'{rate:.2f}',
        (task_count, rate),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
    )

fig.tight_layout()
fig.savefig('ydb_20251006.png')
plt.show()
0 commit comments