1 | 1 | import logging |
2 | | - |
3 | | -import boto3 |
4 | 2 | import os |
5 | 3 |
| 4 | +import boto3 |
| 5 | +from smart_open import open |
6 | 6 |
7 | 7 | DESTINATION_BUCKET_NAME = os.getenv("DESTINATION_BUCKET_NAME") |
8 | 8 |
@@ -31,38 +31,98 @@ def parse_header_value(header_value: str): |
31 | 31 |     return main_value, parsed_params |
32 | 32 |
33 | 33 |
| 34 | +def read_until_part_start(input_file, boundary): |
| 35 | +    while line := input_file.readline(): |
| 36 | +        if line == b"--" + boundary + b"\r\n": |
| 37 | +            return |
| 38 | +    else: |
| 39 | +        raise ValueError("Unexpected EOF") |
| 40 | + |
| 41 | + |
| 42 | +def read_headers_bytes(input_file): |
| 43 | +    headers_bytes = b'' |
| 44 | +    while line := input_file.readline(): |
| 45 | +        if line == b"\r\n": |
| 46 | +            return headers_bytes |
| 47 | +        headers_bytes += line |
| 48 | +    else: |
| 49 | +        raise ValueError("Unexpected EOF") |
| 50 | + |
| 51 | + |
| 52 | +def read_part_headers(input_file): |
| 53 | +    headers_bytes = read_headers_bytes(input_file) |
| 54 | +    headers_str = headers_bytes.decode("utf-8") |
| 55 | +    return parse_headers(headers_str) |
| 56 | + |
| 57 | + |
| 58 | +def stream_part_body(input_file, boundary, output_file): |
| 59 | +    previous_line = None |
| 60 | +    found_part_end = False |
| 61 | +    while not found_part_end: |
| 62 | +        if not (line := input_file.readline()): |
| 63 | +            raise ValueError("Unexpected EOF") |
| 64 | + |
| 65 | +        if line == b"--" + boundary + b"\r\n": |
| 66 | +            logger.warning("Found additional part which will not be processed") |
| 67 | +            found_part_end = True |
| 68 | +        if line == b"--" + boundary + b"--\r\n": |
| 69 | +            found_part_end = True |
| 70 | + |
| 71 | +        if previous_line is not None: |
| 72 | +            if found_part_end: |
| 73 | +                # The final \r\n is part of the encapsulation boundary, so should not be included |
| 74 | +                output_file.write(previous_line.removesuffix(b"\r\n")) |
| 75 | +            else: |
| 76 | +                output_file.write(previous_line) |
| 77 | + |
| 78 | +        previous_line = line |
| 79 | + |
| 80 | + |
| 81 | +def transfer_multipart_content(bucket_name, file_key, boundary, filename): |
| 82 | +    with open( |
| 83 | +        f"s3://{bucket_name}/{file_key}", |
| 84 | +        "rb", |
| 85 | +        transport_params={"client": s3_client} |
| 86 | +    ) as input_file: |
| 87 | +        read_until_part_start(input_file, boundary) |
| 88 | + |
| 89 | +        headers = read_part_headers(input_file) |
| 90 | +        content_disposition = headers.get("Content-Disposition") |
| 91 | +        if content_disposition: |
| 92 | +            _, content_disposition_params = parse_header_value(content_disposition) |
| 93 | +            filename = content_disposition_params.get("filename") or filename |
| 94 | + |
| 95 | +        with open( |
| 96 | +            f"s3://{DESTINATION_BUCKET_NAME}/{filename}", |
| 97 | +            "wb", |
| 98 | +            transport_params={"client": s3_client} |
| 99 | +        ) as output_file: |
| 100 | +            stream_part_body(input_file, boundary, output_file) |
| 101 | + |
| 102 | + |
34 | 103 | def process_record(record): |
35 | 104 |     bucket_name = record["s3"]["bucket"]["name"] |
36 | 105 |     file_key = record["s3"]["object"]["key"] |
37 | 106 |     logger.info(f"Processing {file_key}") |
38 | 107 |
39 | | -    response = s3_client.get_object(Bucket=bucket_name, Key=file_key) |
40 | | -    filename = response["Metadata"].get("mex-filename") or file_key |
41 | | -    # TODO - this will read everything into memory - look at streaming instead |
42 | | -    content = response["Body"].read().decode("utf-8") |
43 | | - |
| 108 | +    response = s3_client.head_object(Bucket=bucket_name, Key=file_key) |
44 | 109 |     content_type = response['ContentType'] |
45 | 110 |     media_type, content_type_params = parse_header_value(content_type) |
| 111 | +    filename = response["Metadata"].get("mex-filename") or file_key |
46 | 112 |
47 | | -    # Handle multipart content by parsing the filename and content from the first part |
| 113 | +    # Handle multipart content by parsing the filename from headers and streaming the content from the first part |
48 | 114 |     if media_type.startswith("multipart/"): |
49 | 115 |         logger.info("Found multipart content") |
50 | | -        boundary = content_type_params["boundary"] |
51 | | -        parts = [ |
52 | | -            part.lstrip(f"--{boundary}") |
53 | | -            for part in content.split(f"\r\n--{boundary}") |
54 | | -            if part.strip() != "" and part.strip() != "--" |
55 | | -        ] |
56 | | -        if len(parts) > 1: |
57 | | -            logger.warning(f"Got {len(parts)} parts, but will only process the first") |
58 | | - |
59 | | -        headers_str, content = parts[0].split("\r\n\r\n", 1) |
60 | | -        headers = parse_headers(headers_str) |
61 | | -        content_disposition = headers["Content-Disposition"] |
62 | | -        _, content_disposition_params = parse_header_value(content_disposition) |
63 | | -        filename = content_disposition_params.get("filename") or filename |
64 | | - |
65 | | -    s3_client.put_object(Bucket=DESTINATION_BUCKET_NAME, Key=filename, Body=content.encode("utf-8")) |
| 116 | +        boundary = content_type_params["boundary"].encode("utf-8") |
| 117 | +        transfer_multipart_content(bucket_name, file_key, boundary, filename) |
| 118 | +    else: |
| 119 | +        s3_client.copy_object( |
| 120 | +            Bucket=DESTINATION_BUCKET_NAME, |
| 121 | +            CopySource={"Bucket": bucket_name, "Key": file_key}, |
| 122 | +            Key=filename |
| 123 | +        ) |
| 124 | + |
| 125 | +    logger.info(f"Transfer complete for {file_key}") |
66 | 126 |
67 | 127 |
68 | 128 | def lambda_handler(event, _): |
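For reference, a rough in-memory sketch of how the new streaming helpers compose. io.BytesIO stands in for the smart_open S3 streams, and the boundary and payload below are made-up values; read_part_headers is skipped because parse_headers lives elsewhere in the module.

import io

boundary = b"example-boundary"
raw_body = (
    b"--example-boundary\r\n"
    b'Content-Disposition: form-data; name="file"; filename="report.csv"\r\n'
    b"\r\n"
    b"col1,col2\r\n"
    b"1,2\r\n"
    b"--example-boundary--\r\n"
)

input_file = io.BytesIO(raw_body)
output_file = io.BytesIO()

read_until_part_start(input_file, boundary)      # consume the opening boundary line
headers_bytes = read_headers_bytes(input_file)   # raw header block of the first part
stream_part_body(input_file, boundary, output_file)

assert output_file.getvalue() == b"col1,col2\r\n1,2"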
|