Commit 7885c49

Support batching of logs - by default 10s windows, 100 logs per batch
1 parent 41f55db commit 7885c49

File tree

3 files changed: +71 additions, -8 deletions


README.md

Lines changed: 9 additions & 0 deletions
@@ -56,6 +56,15 @@ gcloud dataflow flex-template run "pubsub-to-betterstack-$(date +%Y%m%d-%H%M%S)"
   --region=$(gcloud config get-value compute/region)
 ```

+## Optional parameters
+
+The template supports the following optional parameters:
+
+- `batch_size`: Number of messages to batch before sending to Better Stack. Default: 100
+- `window_size`: Window size in seconds for batching messages. Default: 10
+
+You can include these parameters in your Dataflow job by adding them to the run command, e.g. `gcloud dataflow flex-template run ... --parameters window_size=30`.
+
 ## License

 ISC License. See [LICENSE.md](LICENSE.md) for details.

metadata.json

Lines changed: 14 additions & 0 deletions
@@ -20,6 +20,20 @@
       "label": "Better Stack Ingesting Host",
       "helpText": "The ingesting host of your telemetry source in Better Stack",
       "isOptional": false
+    },
+    {
+      "name": "batch_size",
+      "label": "Batch Size",
+      "helpText": "Number of messages to batch before sending to Better Stack",
+      "isOptional": true,
+      "defaultValue": "100"
+    },
+    {
+      "name": "window_size",
+      "label": "Window Size",
+      "helpText": "Window size in seconds for batching messages",
+      "isOptional": true,
+      "defaultValue": "10"
     }
   ]
 }

pipeline.py

Lines changed: 48 additions & 8 deletions
@@ -3,40 +3,64 @@
 import os
 import apache_beam as beam
 from apache_beam.options.pipeline_options import PipelineOptions
+from apache_beam.transforms.window import FixedWindows
 import requests
-from typing import Dict, Any
+from typing import Dict, Any, List
+from datetime import timedelta

 class PubSubToBetterStack(beam.DoFn):
-    def __init__(self, source_token: str, ingesting_host: str):
+    def __init__(self, source_token: str, ingesting_host: str, batch_size: int):
         self.source_token = source_token
         self.ingesting_url = ingesting_host if '://' in ingesting_host else f'https://{ingesting_host}'
+        self.batch_size = batch_size
         self.headers = {
             'Authorization': f'Bearer {source_token}',
             'Content-Type': 'application/json'
         }
+        self.batch = []

     def process(self, element: bytes) -> None:
         try:
             # Parse the Pub/Sub data
             data = json.loads(element.decode('utf-8'))
-
+
             # Rename timestamp key to dt to be understood by Better Stack
             if 'timestamp' in data:
                 data['dt'] = data.pop('timestamp')
+
+            self.batch.append(data)
+
+            # If we've reached the batch size, send the batch
+            if len(self.batch) >= self.batch_size:
+                self._send_batch()
+
+        except Exception as e:
+            # Log the error but don't fail the pipeline
+            print(f"Error processing message: {str(e)}")
+
+    def finish_bundle(self):
+        # Send any remaining messages in the batch
+        if self.batch:
+            self._send_batch()

-            # Send to Better Stack
+    def _send_batch(self):
+        try:
+            # Send batch to Better Stack
             response = requests.post(
                 self.ingesting_url,
                 headers=self.headers,
-                json=data
+                json=self.batch
             )

             if response.status_code != 202:
                 raise Exception(f"Failed to send to Better Stack: {response.text}")

+            # Clear the batch after successful send
+            self.batch = []
+
         except Exception as e:
             # Log the error but don't fail the pipeline
-            print(f"Error processing message: {str(e)}")
+            print(f"Error sending batch to Better Stack: {str(e)}")

 def run(argv=None):
     parser = argparse.ArgumentParser()
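The core of this hunk is the batch-and-flush logic in the DoFn: parsed messages accumulate in `self.batch`, a flush happens as soon as `batch_size` is reached, and `finish_bundle` drains whatever is left when Beam closes the bundle. Below is a minimal local sketch of that same pattern, not the template's own class: the HTTP POST is swapped for a `print` so it runs on the DirectRunner without a Better Stack token, and `BatchingDoFn` and its inputs are illustrative names.

```python
# Minimal sketch of the batch-and-flush pattern above, runnable with
# `pip install apache-beam`. The HTTP send is replaced by a print;
# BatchingDoFn and batch_size=2 are illustrative, not part of the commit.
import json

import apache_beam as beam


class BatchingDoFn(beam.DoFn):
    def __init__(self, batch_size: int):
        self.batch_size = batch_size

    def start_bundle(self):
        # Start every bundle with an empty buffer.
        self.batch = []

    def process(self, element: bytes):
        self.batch.append(json.loads(element.decode('utf-8')))
        # Flush as soon as the buffer reaches batch_size.
        if len(self.batch) >= self.batch_size:
            self._flush()

    def finish_bundle(self):
        # Flush whatever is left when the bundle ends.
        if self.batch:
            self._flush()

    def _flush(self):
        print(f"would POST {len(self.batch)} logs: {self.batch}")
        self.batch = []


if __name__ == '__main__':
    logs = [json.dumps({'message': f'log {i}'}).encode('utf-8') for i in range(5)]
    with beam.Pipeline() as p:
        _ = p | beam.Create(logs) | beam.ParDo(BatchingDoFn(batch_size=2))
```

One small difference in the sketch: the buffer is reset in `start_bundle` rather than in `__init__`, since runners may reuse a DoFn instance across bundles. The commit's version still drains correctly because `finish_bundle` always empties the list, but a per-bundle reset is the more conventional place for it.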
@@ -55,6 +79,18 @@ def run(argv=None):
         required=True,
         help='The ingesting host of your telemetry source in Better Stack'
     )
+    parser.add_argument(
+        '--batch_size',
+        default=100,
+        type=int,
+        help='Number of messages to batch before sending to Better Stack'
+    )
+    parser.add_argument(
+        '--window_size',
+        default=10,
+        type=int,
+        help='Window size in seconds for batching messages'
+    )
     known_args, pipeline_args = parser.parse_known_args(argv)

     pipeline_options = PipelineOptions(
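The two new flags ride the same `argparse` path as the required ones: `parse_known_args` keeps `--batch_size` and `--window_size` in `known_args` and hands everything it does not recognise to Beam as `pipeline_args`. A small sketch of that split, using a made-up argv list:

```python
# Sketch of how parse_known_args() separates template parameters from Beam
# options. The argv values below are made up for illustration.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=100, type=int)
parser.add_argument('--window_size', default=10, type=int)

known_args, pipeline_args = parser.parse_known_args(
    ['--window_size=30', '--runner=DirectRunner', '--streaming']
)

print(known_args.batch_size)   # 100 (default kept)
print(known_args.window_size)  # 30
print(pipeline_args)           # ['--runner=DirectRunner', '--streaming'] -> forwarded to PipelineOptions
```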
@@ -68,13 +104,17 @@
             | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                 subscription=known_args.input_subscription
             )
+            | 'Window into fixed windows' >> beam.WindowInto(
+                FixedWindows(known_args.window_size)
+            )
             | 'Send to Better Stack' >> beam.ParDo(
                 PubSubToBetterStack(
                     known_args.better_stack_source_token,
-                    known_args.better_stack_ingesting_host
+                    known_args.better_stack_ingesting_host,
+                    known_args.batch_size
                 )
             )
         )

 if __name__ == '__main__':
-    run()
+    run()
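The new `WindowInto` step assigns every element to a fixed window of `window_size` seconds based on its timestamp (for `ReadFromPubSub`, by default the publish time). A small local sketch of that assignment, using hand-picked timestamps instead of Pub/Sub, shows where the 10-second boundaries fall:

```python
# Local sketch of FixedWindows assignment. Timestamps are attached by hand here;
# in the template they come from Pub/Sub. Element/window pairs are printed so the
# 10-second boundaries are visible.
import apache_beam as beam
from apache_beam.transforms.window import FixedWindows, TimestampedValue


class PrintWindow(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        # Print each element together with the window it was assigned to.
        print(element, '->', window)


if __name__ == '__main__':
    with beam.Pipeline() as p:
        _ = (
            p
            | beam.Create([('log-a', 1.0), ('log-b', 9.0), ('log-c', 12.0)])
            | beam.Map(lambda kv: TimestampedValue(kv[0], kv[1]))
            | beam.WindowInto(FixedWindows(10))  # same transform as the pipeline above
            | beam.ParDo(PrintWindow())          # log-a and log-b share [0, 10); log-c falls in [10, 20)
        )
```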
