http collector: Add Chunking parameters

sebix · sebix · commit d55fb3358560 · 2026-02-14T11:04:54.000+01:00
To handle big files in the queue, file splitting is necessary.
Chunking was previously only available for the file and mail URL collectors;
this adds it to the HTTP collector.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,6 +23,7 @@ Please refer to the [NEWS](NEWS.md) for a list of changes which have an affect o
 
 ### Bots
 #### Collectors
+- `intelmq.bots.collectors.http.collector_http`: Add chunking parameters `chunk_size` and `chunk_replicate_header` to handle big files (PR#2684 by Sebastian Wagner).
 
 #### Parsers
 
diff --git a/docs/user/bots.md b/docs/user/bots.md
@@ -321,6 +321,14 @@ This requires the [python-gnupg](https://pypi.org/project/python-gnupg/) library
 (optional, string) If specified, the string represents path to keyring file. Otherwise the PGP keyring file of the
 current `intelmq` user is used.
 
+**Chunking**
+
+For line-based inputs, the bot can split large reports into smaller chunks. This is particularly important for setups
+that use Redis as a message queue, which has a per-message size limit of 512 MB. To configure chunking,
+set `chunk_size` to a value in bytes. `chunk_replicate_header` determines whether the header line is repeated for
+each chunk that is passed on to a parser bot. For example, to make a large file input work around the Redis size
+limit, set `chunk_size` to something like 384000000 (~384 MB).
+
 ---
 
 ### Generic URL Stream Fetcher <div id="intelmq.bots.collectors.http.collector_http_stream" />
diff --git a/intelmq/bots/collectors/http/collector_http.py b/intelmq/bots/collectors/http/collector_http.py
@@ -27,11 +27,14 @@
     gpg_keyring: none (defaults to user's GPG keyring) or string (path to keyring file)
 """
 from datetime import datetime, timedelta
+from typing import Optional
+from io import BytesIO
 
 from intelmq.lib.bot import CollectorBot
 from intelmq.lib.mixins import HttpMixin
 from intelmq.lib.utils import unzip
 from intelmq.lib.exceptions import MissingDependencyError
+from intelmq.lib.splitreports import generate_reports
 
 try:
     import gnupg
@@ -64,6 +67,9 @@ class HTTPCollectorBot(CollectorBot, HttpMixin):
     signature_url_formatting: bool = False
     ssl_client_certificate: str = None  # TODO: pathlib.Path
     verify_pgp_signatures: bool = False
+    # splitreports
+    chunk_replicate_header: bool = True
+    chunk_size: Optional[int] = None
 
     def init(self):
         self.use_gpg = self.verify_pgp_signatures
@@ -130,12 +136,15 @@ def process(self):
                                 return_names=True, logger=self.logger)
 
         for file_name, raw_report in raw_reports:
-            report = self.new_report()
-            report.add("raw", raw_report)
-            report.add("feed.url", http_url)
+            template = self.new_report()
+            template.add("raw", raw_report)
+            template.add("feed.url", http_url)
             if file_name:
                 report.add("extra.file_name", file_name)
-            self.send_message(report)
+            for report in generate_reports(template, BytesIO(resp.content),
+                                           self.chunk_size,
+                                           self.chunk_replicate_header):
+                self.send_message(report)
 
     def format_url(self, url: str, formatting) -> str:
         try: