
Commit 0308a83

chore: update benchmarking for SSB onboarding (#1001)
* tests: add boilerplate for transfer manager profiling
* del files
* modularize benchmarking scripts
* cleanup files
* add initial support for cloud monitoring
* add cloud monitoring and rearrange tests
* update parameter inputs
* lint
* add range read to w1r3
* update outputs and range reads
* refactor results recording and handle failure messages
* remove transfer manager profiling, update logging and outputs
* update param object_size and parse range
* address comments
1 parent 3071832 commit 0308a83


4 files changed: +561 -223 lines changed

tests/perf/README.md

Lines changed: 14 additions & 6 deletions
@@ -18,12 +18,20 @@ $ python3 benchmarking.py --num_samples 10000 --max_size 16384
  | Parameter | Description | Possible values | Default |
  | --------- | ----------- | --------------- |:-------:|
- | --min_size | minimum object size in bytes | any positive integer | `5120` (5 KiB) |
- | --max_size | maximum object size in bytes | any positive integer | `2147483648` (2 GiB) |
- | --num_samples | number of W1R3 iterations | any positive integer | `1000` |
- | --r | bucket region for benchmarks | any GCS region | `US` |
- | --p | number of processes (multiprocessing enabled) | any positive integer | 16 (recommend not to exceed 16) |
- | --o | file to output results to | any file path | `benchmarking<TIMESTAMP>.csv` |
+ | --project | GCP project identifier | a project id | * |
+ | --api | API to use | only JSON is currently supported in python benchmarking | `JSON` |
+ | --output_type | output results as csv records or cloud monitoring | `csv`, `cloud-monitoring` | `cloud-monitoring` |
+ | --object_size | object size in bytes; can be a range min..max | string | `1048576` (1 MiB) |
+ | --range_read_size | size of the range to read in bytes | any positive integer <br> <=0 reads the full object | `0` |
+ | --minimum_read_offset | minimum offset for the start of the range to be read in bytes | any integer >0 | `0` |
+ | --maximum_read_offset | maximum offset for the start of the range to be read in bytes | any integer >0 | `0` |
+ | --samples | number of W1R3 iterations | any positive integer | `8000` |
+ | --bucket | storage bucket name | a bucket name | `pybench<TIMESTAMP>` |
+ | --bucket_region | bucket region for benchmarks | any GCS region | `US-WEST1` |
+ | --workers | number of processes (multiprocessing enabled) | any positive integer | 16 (recommend not to exceed 16) |
+ | --test_type | test type to run benchmarking | `w1r3`, `range` | `w1r3` |
+ | --output_file | file to output results to | any file path | `output_bench<TIMESTAMP>.csv` |
+ | --tmp_dir | temp directory path on file system | any file path | `tm-perf-metrics` |
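Per the table above, `--object_size` accepts either a single byte count or a `min..max` range, and a `--range_read_size` of zero or less reads the full object. Below is a minimal sketch of how those inputs are interpreted, mirroring the `get_min_max_size` helper added in `tests/perf/_perf_utils.py` later in this commit; the variable names here are illustrative only:

```python
# Illustrative interpretation of --object_size and --range_read_size.
object_size = "1048576..16777216"  # either a single value like "1048576" or a "min..max" range

if ".." in object_size:
    low, high = object_size.split("..")
    min_size, max_size = int(low), int(high)
else:
    min_size = max_size = int(object_size)

range_read_size = 0                # <= 0 means the full object is read
reads_full_object = range_read_size <= 0
```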
## Workload definition and CSV headers

tests/perf/_perf_utils.py

Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Performance benchmarking helper methods. This is not an officially supported Google product."""

import csv
import logging
import os
import random
import shutil
import time
import uuid

from google.cloud import storage


##### DEFAULTS & CONSTANTS #####
HEADER = [
    "Op",
    "ObjectSize",
    "AppBufferSize",
    "LibBufferSize",
    "Crc32cEnabled",
    "MD5Enabled",
    "ApiName",
    "ElapsedTimeUs",
    "CpuTimeUs",
    "Status",
]
CHECKSUM = ["md5", "crc32c", None]
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
DEFAULT_API = "JSON"
DEFAULT_BUCKET_NAME = f"pybench{TIMESTAMP}"
DEFAULT_BUCKET_REGION = "US-WEST1"
DEFAULT_OBJECT_RANGE_SIZE_BYTES = "1048576"  # 1 MiB
DEFAULT_NUM_SAMPLES = 8000
DEFAULT_NUM_PROCESSES = 16
DEFAULT_LIB_BUFFER_SIZE = 104857600  # 100MB
DEFAULT_CHUNKSIZE = 104857600  # 100 MB https://github.com/googleapis/python-storage/blob/main/google/cloud/storage/blob.py#L139
NOT_SUPPORTED = -1
DEFAULT_BASE_DIR = "tm-perf-metrics"
DEFAULT_OUTPUT_FILE = f"output_bench{TIMESTAMP}.csv"
DEFAULT_CREATE_SUBDIR_PROBABILITY = 0.1
SSB_SIZE_THRESHOLD_BYTES = 1048576


##### UTILITY METHODS #####


# Returns a boolean value with the provided probability.
def weighted_random_boolean(create_subdir_probability):
    return random.uniform(0.0, 1.0) <= create_subdir_probability


# Creates a random file with the given file name, path and size.
def generate_random_file(file_name, file_path, size):
    with open(os.path.join(file_path, file_name), "wb") as file_obj:
        file_obj.write(os.urandom(size))


# Creates a random directory structure consisting of subdirectories and random files.
# Returns an array of all the generated paths and total size in bytes of all generated files.
def generate_random_directory(
    max_objects,
    min_file_size,
    max_file_size,
    base_dir,
    create_subdir_probability=DEFAULT_CREATE_SUBDIR_PROBABILITY,
):
    directory_info = {
        "paths": [],
        "total_size_in_bytes": 0,
    }

    file_path = base_dir
    os.makedirs(file_path, exist_ok=True)
    for i in range(max_objects):
        if weighted_random_boolean(create_subdir_probability):
            file_path = f"{file_path}/{uuid.uuid4().hex}"
            os.makedirs(file_path, exist_ok=True)
            directory_info["paths"].append(file_path)
        else:
            file_name = uuid.uuid4().hex
            rand_size = random.randint(min_file_size, max_file_size)
            generate_random_file(file_name, file_path, rand_size)
            directory_info["total_size_in_bytes"] += rand_size
            directory_info["paths"].append(os.path.join(file_path, file_name))

    return directory_info
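As a quick illustration (not part of this commit), the directory helpers above could be exercised as follows, assuming the module is importable as `_perf_utils`; the import path, object count, and size bounds here are arbitrary:

```python
from tests.perf import _perf_utils  # import path is illustrative

# Build a small random tree under the default temp directory, then remove it.
info = _perf_utils.generate_random_directory(
    max_objects=10,
    min_file_size=1024,
    max_file_size=4096,
    base_dir=_perf_utils.DEFAULT_BASE_DIR,
)
print(len(info["paths"]), info["total_size_in_bytes"])
_perf_utils.cleanup_directory_tree(_perf_utils.DEFAULT_BASE_DIR)
```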
def results_to_csv(res):
    results = []
    for metric in HEADER:
        results.append(res.get(metric, -1))
    return results


def convert_to_csv(filename, results, workers):
    with open(filename, "w") as file:
        writer = csv.writer(file)
        writer.writerow(HEADER)
        # Benchmarking main script uses Multiprocessing Pool.map(),
        # thus results is structured as List[List[Dict[str, any]]].
        for result in results:
            for row in result:
                writer.writerow(results_to_csv(row))


def convert_to_cloud_monitoring(bucket_name, results, workers):
    # Benchmarking main script uses Multiprocessing Pool.map(),
    # thus results is structured as List[List[Dict[str, any]]].
    for result in results:
        for res in result:
            range_read_size = res.get("RangeReadSize", 0)
            object_size = res.get("ObjectSize")
            elapsed_time_us = res.get("ElapsedTimeUs")
            status = res.get("Status").pop()  # convert ["OK"] --> "OK"

            # Handle range reads and calculate throughput using range_read_size.
            if range_read_size > 0:
                size = range_read_size
            else:
                size = object_size

            # If size is greater than the defined threshold, report in MiB/s, otherwise report in KiB/s.
            if size >= SSB_SIZE_THRESHOLD_BYTES:
                throughput = (size / 1024 / 1024) / (elapsed_time_us / 1_000_000)
            else:
                throughput = (size / 1024) / (elapsed_time_us / 1_000_000)

            cloud_monitoring_output = (
                "throughput{"
                + "library=python-storage,"
                + "api={},".format(res.get("ApiName"))
                + "op={},".format(res.get("Op"))
                + "workers={},".format(workers)
                + "object_size={},".format(object_size)
                + "transfer_offset={},".format(res.get("TransferOffset", 0))
                + "transfer_size={},".format(res.get("TransferSize", object_size))
                + "app_buffer_size={},".format(res.get("AppBufferSize"))
                + "chunksize={},".format(res.get("TransferSize", object_size))
                + "crc32c_enabled={},".format(res.get("Crc32cEnabled"))
                + "md5_enabled={},".format(res.get("MD5Enabled"))
                + "cpu_time_us={},".format(res.get("CpuTimeUs"))
                + "peer='',"
                + f"bucket_name={bucket_name},"
                + "retry_count='',"
                + f"status={status}"
                + "}"
                f"{throughput}"
            )

            print(cloud_monitoring_output)
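For illustration only (not part of the commit), a single hand-written result record, shaped like the rows named in `HEADER` above and with made-up field values, could be pushed through both converters:

```python
# The nested list mirrors the List[List[Dict]] shape produced by Pool.map().
sample_results = [[{
    "Op": "WRITE",
    "ObjectSize": 1048576,
    "AppBufferSize": -1,
    "LibBufferSize": DEFAULT_LIB_BUFFER_SIZE,
    "Crc32cEnabled": False,
    "MD5Enabled": False,
    "ApiName": "JSON",
    "ElapsedTimeUs": 250_000,
    "CpuTimeUs": 180_000,
    "Status": ["OK"],
}]]

convert_to_csv("example_bench.csv", sample_results, workers=16)

# ObjectSize meets the 1 MiB threshold, so throughput is reported in MiB/s:
# (1048576 / 1024 / 1024) / (250000 / 1_000_000) = 4.0, printed as a line like
#   throughput{library=python-storage,api=JSON,op=WRITE,...,status=OK}4.0
convert_to_cloud_monitoring("example-bucket", sample_results, workers=16)
```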
def cleanup_directory_tree(directory):
    """Clean up directory tree on disk."""
    try:
        shutil.rmtree(directory)
    except Exception as e:
        logging.exception(f"Caught an exception while deleting local directory\n {e}")


def cleanup_file(file_path):
    """Clean up local file on disk."""
    try:
        os.remove(file_path)
    except Exception as e:
        logging.exception(f"Caught an exception while deleting local file\n {e}")


def get_bucket_instance(bucket_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    if not bucket.exists():
        client.create_bucket(bucket)
    return bucket


def cleanup_bucket(bucket):
    # Delete blobs first as the bucket may contain more than 256 blobs.
    try:
        blobs = bucket.list_blobs()
        for blob in blobs:
            blob.delete()
    except Exception as e:
        logging.exception(f"Caught an exception while deleting blobs\n {e}")
    # Delete bucket.
    try:
        bucket.delete(force=True)
    except Exception as e:
        logging.exception(f"Caught an exception while deleting bucket\n {e}")
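A minimal sketch of the bucket lifecycle these helpers imply (create or reuse, benchmark, tear down); running it for real requires application default credentials and permission to create buckets in the active project:

```python
# Create the bucket if it does not already exist, then delete it when done.
bucket = get_bucket_instance(DEFAULT_BUCKET_NAME)
try:
    pass  # upload/download samples would run here in the real benchmark
finally:
    cleanup_bucket(bucket)
```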
def get_min_max_size(object_size):
    # Object size accepts a single value in bytes or a range in bytes min..max
    if object_size.find("..") < 0:
        min_size = int(object_size)
        max_size = int(object_size)
    else:
        split_sizes = object_size.split("..")
        min_size = int(split_sizes[0])
        max_size = int(split_sizes[1])
    return min_size, max_size

0 commit comments
