Commit 87e1257

[Monitor][Ingestion] Improve chunking logic (Azure#29584)
When chunking, the length of each log entry was not properly measured, leading to overestimating the size of an entry in most cases. This change fixes that.

Signed-off-by: Paul Van Eck <[email protected]>
1 parent 5a2aaa4 commit 87e1257
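To illustrate the measurement change described in the commit message, a minimal sketch; the `entry` dict and its values are hypothetical, and only the two size expressions come from the diff below:

    import json

    entry = {"foo": "bar"}  # hypothetical log record

    # Previous approach: assume every serialized character costs 4 bytes.
    overestimate = len(json.dumps(entry)) * 4        # 14 chars * 4 = 56

    # Fixed approach: measure the UTF-8 payload that is actually sent.
    actual = len(json.dumps(entry).encode("utf-8"))  # 14 bytes for ASCII-only content

    print(overestimate, actual)  # 56 14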

File tree

3 files changed: +68 -7 lines


sdk/monitor/azure-monitor-ingestion/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 ### Breaking Changes

 ### Bugs Fixed
+- Fixed an issue where log entry sizes were miscalculated when chunking. ([#29584](https://github.com/Azure/azure-sdk-for-python/pull/29584))

 ### Other Changes

sdk/monitor/azure-monitor-ingestion/azure/monitor/ingestion/_helpers.py

Lines changed: 6 additions & 7 deletions
@@ -18,22 +18,21 @@
 JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object

 MAX_CHUNK_SIZE_BYTES = 1024 * 1024 # 1 MiB
-CHAR_SIZE_BYTES = 4
 GZIP_MAGIC_NUMBER = b"\x1f\x8b"


-def _split_chunks(logs: List[JSON]) -> Generator[List[JSON], None, None]:
+def _split_chunks(logs: List[JSON], max_size_bytes: int = MAX_CHUNK_SIZE_BYTES) -> Generator[List[JSON], None, None]:
     chunk_size = 0
     curr_chunk = []
     for log in logs:
-        # each char is 4 bytes
-        size = len(json.dumps(log)) * CHAR_SIZE_BYTES
-        if chunk_size + size <= MAX_CHUNK_SIZE_BYTES:
+        size = len(json.dumps(log).encode("utf-8"))
+        if chunk_size + size <= max_size_bytes:
             curr_chunk.append(log)
             chunk_size += size
         else:
-            _LOGGER.debug('Yielding chunk with size: %d', chunk_size)
-            yield curr_chunk
+            if curr_chunk:
+                _LOGGER.debug('Yielding chunk with size: %d', chunk_size)
+                yield curr_chunk
             curr_chunk = [log]
             chunk_size = size
     if len(curr_chunk) > 0:
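For context, a rough usage sketch of the updated helper shown in the diff above; `_split_chunks` is a private helper (the new tests below import it the same way), and the sample `logs` entries are hypothetical:

    import json
    from azure.monitor.ingestion._helpers import _split_chunks, MAX_CHUNK_SIZE_BYTES

    # Hypothetical log entries; callers pass whatever JSON-serializable dicts they ingest.
    logs = [{"Time": "2023-04-05T00:00:00Z", "Computer": "web-01", "Detail": "x" * 200}
            for _ in range(50000)]

    # max_size_bytes defaults to MAX_CHUNK_SIZE_BYTES (1 MiB), now measured in UTF-8 bytes.
    for chunk in _split_chunks(logs):
        chunk_bytes = sum(len(json.dumps(entry).encode("utf-8")) for entry in chunk)
        # A chunk only exceeds the limit when a single entry is itself larger than the limit.
        assert chunk_bytes <= MAX_CHUNK_SIZE_BYTES or len(chunk) == 1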
New test file for the ingestion helpers (file path not shown in this capture)

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for
# license information.
# -------------------------------------------------------------------------
import json
import random
import string
import zlib

import pytest

from azure.monitor.ingestion._helpers import (
    _create_gzip_requests,
    _split_chunks,
    MAX_CHUNK_SIZE_BYTES,
    GZIP_MAGIC_NUMBER
)


ALPHANUMERIC_CHARACTERS = string.ascii_letters + string.digits

random.seed(42)  # For repeatability


def _get_random_string(length: int):
    return ''.join(random.choice(ALPHANUMERIC_CHARACTERS) for _ in range(length))


class TestHelpers:

    @pytest.mark.parametrize("content", ["bar", "\uc548\ub155\ud558\uc138\uc694"])
    def test_split_chunks(self, content):
        obj = {"foo": content}
        logs = [obj] * 100

        entry_size = len(json.dumps(obj).encode("utf-8"))

        chunks = list(_split_chunks(logs, max_size_bytes=entry_size))
        assert len(chunks) == 100

        chunks = list(_split_chunks(logs, max_size_bytes=entry_size*2))
        assert len(chunks) == 50

        chunks = list(_split_chunks(logs, max_size_bytes=entry_size*100))
        assert len(chunks) == 1

    def test_split_chunks_larger_than_max(self):
        obj = {"foo": "some-long-string"}
        logs = [obj] * 3
        # If each entry in the log is greater than the max chunk size, then each entry should be its own chunk.
        chunks = list(_split_chunks(logs, max_size_bytes=10))
        assert len(chunks) == 3

    @pytest.mark.parametrize("num_entries", [100, 10000])
    def test_create_gzip_requests(self, num_entries):
        logs = [{_get_random_string(20): _get_random_string(500)} for _ in range(num_entries)]
        for compressed_bytes, raw_data in _create_gzip_requests(logs):
            assert len(compressed_bytes) < MAX_CHUNK_SIZE_BYTES
            assert compressed_bytes[:2] == GZIP_MAGIC_NUMBER
            assert zlib.decompress(compressed_bytes, 16+zlib.MAX_WBITS) == json.dumps(raw_data).encode("utf-8")
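As a side note on the non-ASCII parametrization above, a small check of what the byte-based measurement sees for that input, assuming `json.dumps` defaults (`ensure_ascii=True`):

    import json

    obj = {"foo": "\uc548\ub155\ud558\uc138\uc694"}  # same Korean sample used in test_split_chunks

    serialized = json.dumps(obj)            # non-ASCII characters are escaped to \uXXXX by default
    print(len(serialized))                  # 41 characters
    print(len(serialized.encode("utf-8")))  # 41 bytes -- the value the fixed code measures
    print(len(serialized) * 4)              # 164 -- what the removed 4-bytes-per-char heuristic reported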
