airbytehq · aldogonzalez8 · Mar 31, 2025 · Mar 27, 2025 · Mar 31, 2025 · Mar 31, 2025
diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml
@@ -1449,6 +1449,15 @@ definitions:
             anyOf:
               - "$ref": "#/definitions/CustomRecordExtractor"
               - "$ref": "#/definitions/DpathExtractor"
+          filename_extractor:
+            description: Defines the name to store the file. Stream name is automatically added to the file path. File unique ID can be used to avoid overwriting files. Random UUID will be used if the extractor is not provided.
+            type: string
+            interpolation_context:
+              - config
+              - record
+            examples:
+              - "{{ record.id }}/{{ record.file_name }}/"
+              - "{{ record.id }}_{{ record.file_name }}/"
       $parameters:
         type: object
         additional_properties: true

diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -2003,6 +2003,15 @@ class FileUploader(BaseModel):
         None,
         description="Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content",
     )
+    filename_extractor: Optional[str] = Field(
+        None,
+        description="Defines the name to store the file. Stream name is automatically added to the file path. File unique ID can be used to avoid overwriting files. Random UUID will be used if the extractor is not provided.",
+        examples=[
+            "{{ record.id }}/{{ record.file_name }}/",
+            "{{ record.id }}_{{ record.file_name }}/",
+        ],
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
 class DeclarativeStream(BaseModel):

diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -3348,7 +3348,13 @@ def create_file_uploader(
             name=name,
             **kwargs,
         )
-        return FileUploader(requester, download_target_extractor)
+        return FileUploader(
+            requester=requester,
+            download_target_extractor=download_target_extractor,
+            config=config,
+            parameters=model.parameters or {},
+            filename_extractor=model.filename_extractor if model.filename_extractor else None,
+        )
 
     def create_moving_window_call_rate_policy(
         self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any

diff --git a/airbyte_cdk/sources/declarative/retrievers/file_uploader.py b/airbyte_cdk/sources/declarative/retrievers/file_uploader.py
@@ -1,52 +1,74 @@
+#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+#
+
 import json
 import logging
+import uuid
+from dataclasses import InitVar, dataclass, field
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Mapping, Union, Any
 
+from airbyte_cdk.sources.declarative.interpolation.interpolated_string import (
+    InterpolatedString,
+)
 from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
 from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import (
     SafeResponse,
 )
 from airbyte_cdk.sources.declarative.requesters import Requester
 from airbyte_cdk.sources.declarative.types import Record, StreamSlice
+from airbyte_cdk.sources.types import Config
 from airbyte_cdk.sources.utils.files_directory import get_files_directory
 
 logger = logging.getLogger("airbyte")
 
+
+@dataclass
 class FileUploader:
-    def __init__(
-        self,
-        requester: Requester,
-        download_target_extractor: RecordExtractor,
-        content_extractor: Optional[RecordExtractor] = None,
-    ) -> None:
-        self._requester = requester
-        self._download_target_extractor = download_target_extractor
-        self._content_extractor = content_extractor
+    requester: Requester
+    download_target_extractor: RecordExtractor
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    filename_extractor: Optional[Union[InterpolatedString, str]] = None
+    content_extractor: Optional[RecordExtractor] = None
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        if self.filename_extractor:
+            self.filename_extractor = InterpolatedString.create(
+                self.filename_extractor,
+                parameters=parameters,
+            )
 
     def upload(self, record: Record) -> None:
         mocked_response = SafeResponse()
         mocked_response.content = json.dumps(record.data).encode("utf-8")
-        download_target = list(self._download_target_extractor.extract_records(mocked_response))[0]
+        download_target = list(self.download_target_extractor.extract_records(mocked_response))[0]
         if not isinstance(download_target, str):
             raise ValueError(
                 f"download_target is expected to be a str but was {type(download_target)}: {download_target}"
             )
 
-        response = self._requester.send_request(
+        response = self.requester.send_request(
             stream_slice=StreamSlice(
                 partition={}, cursor_slice={}, extra_fields={"download_target": download_target}
             ),
         )
 
-        if self._content_extractor:
+        if self.content_extractor:
             raise NotImplementedError("TODO")
         else:
             files_directory = Path(get_files_directory())
-            # TODO:: we could either interpolate record data if some relative_path is provided or
-            #  use partition_field value in the slice {"partition_field": some_value_id} to create a path
-            file_relative_path = Path(record.stream_name) / record.data["file_name"]
+
+            file_name = (
+                self.filename_extractor.eval(self.config, record=record)
+                if self.filename_extractor
+                else str(uuid.uuid4())
+            )
+            file_name = file_name.lstrip("/")
+            file_relative_path = Path(record.stream_name) / Path(file_name)
 
             full_path = files_directory / file_relative_path
             full_path.parent.mkdir(parents=True, exist_ok=True)
@@ -56,7 +78,7 @@ def upload(self, record: Record) -> None:
             file_size_bytes = full_path.stat().st_size
 
             logger.info("File uploaded successfully")
-            logger.info(f"File url: {str(full_path)} ")
+            logger.info(f"File url: {str(full_path)}")
             logger.info(f"File size: {file_size_bytes / 1024} KB")
             logger.info(f"File relative path: {str(file_relative_path)}")
 

diff --git a/unit_tests/sources/declarative/file/test_file_stream.py b/unit_tests/sources/declarative/file/test_file_stream.py
@@ -1,3 +1,5 @@
+import re
+
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from unittest import TestCase
@@ -29,9 +31,12 @@ def _source(
     catalog: ConfiguredAirbyteCatalog,
     config: Dict[str, Any],
     state: Optional[List[AirbyteStateMessage]] = None,
+    yaml_file: Optional[str] = None,
 ) -> YamlDeclarativeSource:
+    if not yaml_file:
+        yaml_file = "file_stream_manifest.yaml"
     return YamlDeclarativeSource(
-        path_to_yaml=str(Path(__file__).parent / "file_stream_manifest.yaml"),
+        path_to_yaml=str(Path(__file__).parent / yaml_file),
         catalog=catalog,
         config=config,
         state=state,
@@ -43,11 +48,12 @@ def read(
     catalog: ConfiguredAirbyteCatalog,
     state_builder: Optional[StateBuilder] = None,
     expecting_exception: bool = False,
+    yaml_file: Optional[str] = None,
 ) -> EntrypointOutput:
     config = config_builder.build()
     state = state_builder.build() if state_builder else StateBuilder().build()
     return entrypoint_read(
-        _source(catalog, config, state), config, catalog, state, expecting_exception
+        _source(catalog, config, state, yaml_file), config, catalog, state, expecting_exception
     )
 
 
@@ -96,7 +102,32 @@ def test_get_article_attachments(self) -> None:
         file_reference = output.records[0].record.file_reference
         assert file_reference
         assert file_reference.file_url
+        assert re.match(r"^.*/article_attachments/[0-9a-fA-F-]{36}$", file_reference.file_url)
         assert file_reference.file_relative_path
+        assert re.match(
+            r"^article_attachments/[0-9a-fA-F-]{36}$", file_reference.file_relative_path
+        )
+        assert file_reference.file_size_bytes
+
+    def test_get_article_attachments_with_filename_extractor(self) -> None:
+        output = read(
+            self._config(),
+            CatalogBuilder()
+            .with_stream(ConfiguredAirbyteStreamBuilder().with_name("article_attachments"))
+            .build(),
+            yaml_file="test_file_stream_with_filename_extractor.yaml",
+        )
+
+        assert output.records
+        file_reference = output.records[0].record.file_reference
+        assert file_reference
+        assert file_reference.file_url
+        # TODO: once the response is mocked, assert the exact file name instead
+        assert not re.match(r"^.*/article_attachments/[0-9a-fA-F-]{36}$", file_reference.file_url)
+        assert file_reference.file_relative_path
+        assert not re.match(
+            r"^article_attachments/[0-9a-fA-F-]{36}$", file_reference.file_relative_path
+        )
         assert file_reference.file_size_bytes
 
     def test_discover_article_attachments(self) -> None: