Commit 87e1257

[Monitor][Ingestion] Improve chunking logic (Azure#29584)
When chunking, the length of each log entry was not properly measured, leading to overestimating the size of an entry in most cases. This change fixes that.

Signed-off-by: Paul Van Eck <[email protected]>
1 parent 5a2aaa4 commit 87e1257
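To illustrate the measurement change described in the commit message, a minimal sketch; the `entry` dict and its values are hypothetical, and only the two size expressions come from the diff below:

    import json

    entry = {"foo": "bar"}  # hypothetical log record

    # Previous approach: assume every serialized character costs 4 bytes.
    overestimate = len(json.dumps(entry)) * 4        # 14 chars * 4 = 56

    # Fixed approach: measure the UTF-8 payload that is actually sent.
    actual = len(json.dumps(entry).encode("utf-8"))  # 14 bytes for ASCII-only content

    print(overestimate, actual)  # 56 14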

File tree

3 files changed: +68 -7 lines


sdk/monitor/azure-monitor-ingestion/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 ### Breaking Changes

 ### Bugs Fixed
+- Fixed an issue where log entry sizes were miscalculated when chunking. ([#29584](https://github.com/Azure/azure-sdk-for-python/pull/29584))

 ### Other Changes

sdk/monitor/azure-monitor-ingestion/azure/monitor/ingestion/_helpers.py

Lines changed: 6 additions & 7 deletions
@@ -18,22 +18,21 @@
 JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object

 MAX_CHUNK_SIZE_BYTES = 1024 * 1024 # 1 MiB
-CHAR_SIZE_BYTES = 4
 GZIP_MAGIC_NUMBER = b"\x1f\x8b"


-def _split_chunks(logs: List[JSON]) -> Generator[List[JSON], None, None]:
+def _split_chunks(logs: List[JSON], max_size_bytes: int = MAX_CHUNK_SIZE_BYTES) -> Generator[List[JSON], None, None]:
     chunk_size = 0
     curr_chunk = []
     for log in logs:
-        # each char is 4 bytes
-        size = len(json.dumps(log)) * CHAR_SIZE_BYTES
-        if chunk_size + size <= MAX_CHUNK_SIZE_BYTES:
+        size = len(json.dumps(log).encode("utf-8"))
+        if chunk_size + size <= max_size_bytes:
             curr_chunk.append(log)
             chunk_size += size
         else:
-            _LOGGER.debug('Yielding chunk with size: %d', chunk_size)
-            yield curr_chunk
+            if curr_chunk:
+                _LOGGER.debug('Yielding chunk with size: %d', chunk_size)
+                yield curr_chunk
             curr_chunk = [log]
             chunk_size = size
     if len(curr_chunk) > 0:
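For context, a rough usage sketch of the updated helper shown in the diff above; `_split_chunks` is a private helper (the new tests below import it the same way), and the sample `logs` entries are hypothetical:

    import json
    from azure.monitor.ingestion._helpers import _split_chunks, MAX_CHUNK_SIZE_BYTES

    # Hypothetical log entries; callers pass whatever JSON-serializable dicts they ingest.
    logs = [{"Time": "2023-04-05T00:00:00Z", "Computer": "web-01", "Detail": "x" * 200}
            for _ in range(50000)]

    # max_size_bytes defaults to MAX_CHUNK_SIZE_BYTES (1 MiB), now measured in UTF-8 bytes.
    for chunk in _split_chunks(logs):
        chunk_bytes = sum(len(json.dumps(entry).encode("utf-8")) for entry in chunk)
        # A chunk only exceeds the limit when a single entry is itself larger than the limit.
        assert chunk_bytes <= MAX_CHUNK_SIZE_BYTES or len(chunk) == 1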
New test file for the ingestion helpers (file path not shown in this capture)

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for
# license information.
# -------------------------------------------------------------------------
import json
import random
import string
import zlib

import pytest

from azure.monitor.ingestion._helpers import (
    _create_gzip_requests,
    _split_chunks,
    MAX_CHUNK_SIZE_BYTES,
    GZIP_MAGIC_NUMBER
)


ALPHANUMERIC_CHARACTERS = string.ascii_letters + string.digits

random.seed(42)  # For repeatability


def _get_random_string(length: int):
    return ''.join(random.choice(ALPHANUMERIC_CHARACTERS) for _ in range(length))


class TestHelpers:

    @pytest.mark.parametrize("content", ["bar", "\uc548\ub155\ud558\uc138\uc694"])
    def test_split_chunks(self, content):
        obj = {"foo": content}
        logs = [obj] * 100

        entry_size = len(json.dumps(obj).encode("utf-8"))

        chunks = list(_split_chunks(logs, max_size_bytes=entry_size))
        assert len(chunks) == 100

        chunks = list(_split_chunks(logs, max_size_bytes=entry_size*2))
        assert len(chunks) == 50

        chunks = list(_split_chunks(logs, max_size_bytes=entry_size*100))
        assert len(chunks) == 1

    def test_split_chunks_larger_than_max(self):
        obj = {"foo": "some-long-string"}
        logs = [obj] * 3
        # If each entry in the log is greater than the max chunk size, then each entry should be its own chunk.
        chunks = list(_split_chunks(logs, max_size_bytes=10))
        assert len(chunks) == 3

    @pytest.mark.parametrize("num_entries", [100, 10000])
    def test_create_gzip_requests(self, num_entries):
        logs = [{_get_random_string(20): _get_random_string(500)} for _ in range(num_entries)]
        for compressed_bytes, raw_data in _create_gzip_requests(logs):
            assert len(compressed_bytes) < MAX_CHUNK_SIZE_BYTES
            assert compressed_bytes[:2] == GZIP_MAGIC_NUMBER
            assert zlib.decompress(compressed_bytes, 16+zlib.MAX_WBITS) == json.dumps(raw_data).encode("utf-8")
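As a side note on the non-ASCII parametrization above, a small check of what the byte-based measurement sees for that input, assuming `json.dumps` defaults (`ensure_ascii=True`):

    import json

    obj = {"foo": "\uc548\ub155\ud558\uc138\uc694"}  # same Korean sample used in test_split_chunks

    serialized = json.dumps(obj)            # non-ASCII characters are escaped to \uXXXX by default
    print(len(serialized))                  # 41 characters
    print(len(serialized.encode("utf-8")))  # 41 bytes -- the value the fixed code measures
    print(len(serialized) * 4)              # 164 -- what the removed 4-bytes-per-char heuristic reported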
