From a488ab37a41d7a5a981fdc969af92627a43cb0a6 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Wed, 26 Mar 2025 19:35:03 +0200 Subject: [PATCH 01/27] deduplication version 1 --- .../declarative/parsers/custom_exceptions.py | 9 + .../parsers/manifest_deduplicator.py | 321 ++++++++++++++++++ .../parsers/manifest_reference_resolver.py | 10 +- .../test_manifest_reference_resolver.py | 244 +++++++++++++ 4 files changed, 582 insertions(+), 2 deletions(-) create mode 100644 airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py diff --git a/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py b/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py index d6fdee695..a5c192511 100644 --- a/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +++ b/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py @@ -19,3 +19,12 @@ class UndefinedReferenceException(Exception): def __init__(self, path: str, reference: str) -> None: super().__init__(f"Undefined reference {reference} from {path}") + + +class ManifestDeduplicationException(Exception): + """ + Raised when a circular reference is detected in a manifest. + """ + + def __init__(self, exception: str) -> None: + super().__init__(f"Failed to deduplicate manifest: {exception}") diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py new file mode 100644 index 000000000..32a750fa8 --- /dev/null +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -0,0 +1,321 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import copy +import hashlib +import json +from collections import defaultdict +from typing import Any, DefaultDict, Dict, Hashable, List, Optional, Tuple + +from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ManifestDeduplicationException + +# Type definitions for better readability +ManifestType = Dict[str, Any] +DefinitionsType = Dict[str, Any] +FieldDuplicatesType = DefaultDict[Tuple[str, Any], List[Tuple[List[str], Dict]]] +ComponentDuplicatesType = DefaultDict[str, List[Tuple[List[str], Dict, Dict]]] + +# Configuration constants +N_OCCURANCES = 2 + +DEF_TAG = "definitions" +SHARED_TAG = "shared" + +# SPECIFY COMPONENT TAGS FOR DEDUPLICATION +COMPONENT_TAGS = [ + "authenticator", +] + +# SPECIFY FIELD TAGS FOR DEDUPLICATION +FIELD_TAGS = [ + "url_base", +] + + +def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: + """ + Find commonalities in the input JSON structure and refactor it to avoid redundancy. + + Args: + resolved_manifest: A dictionary representing a JSON structure to be analyzed. + + Returns: + A refactored JSON structure with common properties extracted. + """ + + try: + _manifest = copy.deepcopy(resolved_manifest) + definitions = _manifest.get(DEF_TAG, {}) + field_duplicates, component_duplicates = _collect_all_duplicates(definitions) + _process_duplicates(definitions, field_duplicates, component_duplicates) + return _manifest + except ManifestDeduplicationException: + # we don't want to fix every single error which might occur, + # due to the varaety of possible manifest configurations, + # if any arror occurs, we just return the original manifest. + return resolved_manifest + + +def _process_duplicates( + definitions: DefinitionsType, + field_duplicates: FieldDuplicatesType, + component_duplicates: ComponentDuplicatesType, +) -> None: + """ + Process the duplicates and replace them with references. 
+ + Args: + field_duplicates: Dictionary of duplicate primitive values + component_duplicates: Dictionary of duplicate objects + """ + # process duplicates only if there are any + if len(field_duplicates) > 0 or len(component_duplicates) > 0: + if not SHARED_TAG in definitions: + definitions[SHARED_TAG] = {} + + try: + _process_component_duplicates(definitions, component_duplicates) + _process_field_duplicates(definitions, field_duplicates) + except Exception as e: + raise ManifestDeduplicationException(str(e)) + + +def _is_allowed_component(key: str) -> bool: + """ + Check if the key is an allowed component tag. + + Args: + key: The key to check + + Returns: + True if the key is allowed, False otherwise + """ + return key in COMPONENT_TAGS + + +def _is_allowed_field(key: str) -> bool: + """ + Check if the key is an allowed field tag. + + Args: + key: The key to check + + Returns: + True if the key is allowed, False otherwise + """ + return key in FIELD_TAGS + + +def _collect_all_duplicates( + node: ManifestType, +) -> Tuple[FieldDuplicatesType, ComponentDuplicatesType]: + """ + Traverse the JSON object and collect all potential duplicate values and objects. 
+ + Args: + node: The JSON object to analyze + + Returns: + A tuple of (field_duplicates, component_duplicates) + """ + + field_duplicates: FieldDuplicatesType = defaultdict(list) + component_duplicates: ComponentDuplicatesType = defaultdict(list) + + def collect_duplicates(obj: Dict, path: Optional[List[str]] = None) -> None: + if not isinstance(obj, dict): + return + + path = [] if path is None else path + # Check if the object is empty + for key, value in obj.items(): + current_path = path + [key] + + if isinstance(value, dict): + # First process nested dictionaries + collect_duplicates(value, current_path) + + # Process allowed-only component tags + if _is_allowed_component(key): + obj_hash = _hash_object(value) + if obj_hash: + component_duplicates[obj_hash].append((current_path, obj, value)) + + # handle list[dict] cases + elif isinstance(value, list): + for i, item in enumerate(value): + collect_duplicates(item, current_path + [str(i)]) + + # Process allowed-only field tags + elif _is_allowed_field(key): + hashable_value = _make_hashable(value) + field_duplicates[(key, hashable_value)].append((current_path, obj)) + + try: + collect_duplicates(node) + except Exception as e: + raise ManifestDeduplicationException(str(e)) + + return field_duplicates, component_duplicates + + +def _hash_object(node: Dict) -> Optional[str]: + """ + Create a unique hash for a dictionary object. + + Args: + node: The dictionary to hash + + Returns: + A hash string or None if not hashable + """ + if isinstance(node, Dict): + # Sort keys to ensure consistent hash for same content + return hashlib.md5(json.dumps(node, sort_keys=True).encode()).hexdigest() + + return None + + +def _make_hashable(value: Any) -> Any: + """ + Convert a value to a hashable representation. 
+ + Args: + value: The value to make hashable + + Returns: + A hashable representation of the value + """ + return json.dumps(value) if not isinstance(value, Hashable) else value + + +def _create_reference_key( + definitions: DefinitionsType, key: str, value: Optional[Any] = None +) -> str: + """ + Create a unique reference key and handle collisions. + + Args: + key: The base key to use + definitions: The definitions dictionary with definitions + + Returns: + A unique reference key + """ + + counter = 1 + while key in definitions[SHARED_TAG]: + # If the value is already in shared definitions with this key, no need to rename + if value is not None and _is_same_value(definitions[SHARED_TAG].get(key), value): + return key + key = f"{key}_{counter}" + counter += 1 + return key + + +def _create_ref_object(ref_key: str) -> Dict[str, str]: + """ + Create a reference object using the specified key. + + Args: + ref_key: The reference key to use + + Returns: + A reference object in the proper format + """ + return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{ref_key}"} + + +def _is_same_value(val1: Any, val2: Any) -> bool: + """ + Check if two values are the same by comparing their JSON representation. + + Args: + val1: First value + val2: Second value + + Returns: + True if the values are the same, False otherwise + """ + return json.dumps(val1, sort_keys=True) == json.dumps(val2, sort_keys=True) + + +def _process_component_duplicates( + definitions: ManifestType, + component_duplicates: ComponentDuplicatesType, +) -> None: + """ + Process duplicate objects and replace them with references. 
+ + Args: + definitions: The definitions dictionary to modify + component_duplicates: Dictionary of duplicate objects + """ + for obj_hash, occurrences in component_duplicates.items(): + # Skip non-duplicates + if len(occurrences) < N_OCCURANCES: + continue + + # Take the value from the first occurrence, as they are the same + path, _, value = occurrences[0] + # take the component's name as the last part of it's path + key = path[-1] + # Create a meaningful reference key + ref_key = _create_reference_key(definitions, key) + # Add to definitions + _add_to_shared_definitions(definitions, ref_key, value) + + # Replace all occurrences with references + for path, parent_obj, _ in occurrences: + if path: # Make sure the path is valid + key = path[-1] + parent_obj[key] = _create_ref_object(ref_key) + + +def _add_to_shared_definitions( + definitions: DefinitionsType, + key: str, + value: Any, +) -> DefinitionsType: + """ + Add a value to the shared definitions under the specified key. + + Args: + definitions: The definitions dictionary to modify + key: The key to use + value: The value to add + """ + + if key not in definitions[SHARED_TAG]: + definitions[SHARED_TAG][key] = value + + return definitions + + +def _process_field_duplicates( + definitions: ManifestType, + field_duplicates: FieldDuplicatesType, +) -> None: + """ + Process duplicate primitive values and replace them with references. 
+ + Args: + definitions: The definitions dictionary to modify + field_duplicates: Dictionary of duplicate primitive values + """ + + for (key, value), occurrences in field_duplicates.items(): + # Skip non-duplicates + if len(occurrences) < N_OCCURANCES: + continue + + ref_key = _create_reference_key(definitions, key, value) + # Add to definitions if not already there + _add_to_shared_definitions(definitions, ref_key, value) + + # Replace all occurrences with references + for path, parent_obj in occurrences: + if path: # Make sure the path is valid + key = path[-1] + parent_obj[key] = _create_ref_object(ref_key) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 045ea9a2c..5d68b197b 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -9,6 +9,7 @@ CircularReferenceException, UndefinedReferenceException, ) +from airbyte_cdk.sources.declarative.parsers.manifest_deduplicator import deduplicate_definitions REF_TAG = "$ref" @@ -102,9 +103,14 @@ class ManifestReferenceResolver: def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]: """ :param manifest: incoming manifest that could have references to previously defined components - :return: """ - return self._evaluate_node(manifest, manifest, set()) # type: ignore[no-any-return] + + preprocessed_manifest = self._evaluate_node(manifest, manifest, set()) + + # we need to reduce commonalities in the manifest after the references have been resolved + reduced_manifest = deduplicate_definitions(preprocessed_manifest) + + return reduced_manifest def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: if isinstance(node, dict): diff --git a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py 
b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py index 1fdbf2d55..29012757b 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py @@ -162,3 +162,247 @@ def test_circular_reference(): content = {"elem_ref1": "#/elem_ref2", "elem_ref2": "#/elem_ref1"} with pytest.raises(CircularReferenceException): resolver.preprocess_manifest(content) + + +def test_deduplicate_manifest_definitions(): + content = { + "definitions": { + "streams": { + "A": { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": "https://pokeapi.co/api/v2/", + "authenticator": { + "type": "BasicHttpAuthenticator", + "api_key": '{{ config["api_token"] }}', + }, + "path": "path_to_A", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "a": { + "type": "string", + } + }, + "additionalProperties": True, + }, + }, + }, + "A_1": { + "type": "DeclarativeStream", + "name": "A_1", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": "https://pokeapi.co/api/v2/", + "authenticator": { + "type": "BasicHttpAuthenticator", + "api_key": '{{ config["api_token"] }}', + }, + "path": "path_to_A", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "a_1": { + "type": "string", + } + }, + "additionalProperties": True, + }, + }, + }, + "B": { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + 
"requester": { + "type": "HttpRequester", + "url_base": "https://pokeapi.co/api/v2/", + "authenticator": { + "type": "BasicHttpAuthenticator", + "api_key": '{{ config["api_token"] }}', + }, + "path": "path_to_B", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "b": { + "type": "string", + } + }, + "additionalProperties": True, + }, + }, + }, + "B_1": { + "type": "DeclarativeStream", + "name": "B_1", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": "https://pokeapi.co/api/v2/", + "authenticator": { + "type": "BasicHttpAuthenticator", + "api_key": '{{ config["api_token"] }}', + }, + "path": "path_to_B", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "b": { + "type": "string", + } + }, + "additionalProperties": True, + }, + }, + }, + } + } + } + expected = { + "definitions": { + "streams": { + "A": { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "authenticator": {"$ref": "#/definitions/shared/authenticator"}, + "path": "path_to_A", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": {"a": {"type": "string"}}, + "additionalProperties": True, + }, + }, + }, + "A_1": { + "type": "DeclarativeStream", + "name": "A_1", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": 
"#/definitions/shared/url_base"}, + "authenticator": {"$ref": "#/definitions/shared/authenticator"}, + "path": "path_to_A", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": {"a_1": {"type": "string"}}, + "additionalProperties": True, + }, + }, + }, + "B": { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "authenticator": {"$ref": "#/definitions/shared/authenticator"}, + "path": "path_to_B", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": {"b": {"type": "string"}}, + "additionalProperties": True, + }, + }, + }, + "B_1": { + "type": "DeclarativeStream", + "name": "B_1", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "authenticator": {"$ref": "#/definitions/shared/authenticator"}, + "path": "path_to_B", + "http_method": "GET", + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": {"b": {"type": "string"}}, + "additionalProperties": True, + }, + }, + }, + }, + "shared": { + "authenticator": { + "type": "BasicHttpAuthenticator", + "api_key": '{{ config["api_token"] }}', + }, + "url_base": "https://pokeapi.co/api/v2/", + }, + } + } + config = resolver.preprocess_manifest(content) + assert config == expected From 7d910ee8b07d8b563617c20fcf7dd95df7f53067 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Wed, 26 Mar 2025 19:35:26 +0200 Subject: [PATCH 
02/27] deduplication version 2 --- .../parsers/manifest_deduplicator.py | 284 +++++++----------- .../parsers/manifest_reference_resolver.py | 4 +- 2 files changed, 109 insertions(+), 179 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py index 32a750fa8..347e6a0ee 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -6,15 +6,14 @@ import hashlib import json from collections import defaultdict -from typing import Any, DefaultDict, Dict, Hashable, List, Optional, Tuple +from typing import Any, DefaultDict, Dict, List, Optional, Tuple from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ManifestDeduplicationException # Type definitions for better readability ManifestType = Dict[str, Any] DefinitionsType = Dict[str, Any] -FieldDuplicatesType = DefaultDict[Tuple[str, Any], List[Tuple[List[str], Dict]]] -ComponentDuplicatesType = DefaultDict[str, List[Tuple[List[str], Dict, Dict]]] +DuplicatesType = DefaultDict[str, List[Tuple[List[str], Dict, Dict]]] # Configuration constants N_OCCURANCES = 2 @@ -22,16 +21,15 @@ DEF_TAG = "definitions" SHARED_TAG = "shared" -# SPECIFY COMPONENT TAGS FOR DEDUPLICATION -COMPONENT_TAGS = [ +# SPECIFY TAGS FOR DEDUPLICATION +TAGS = [ "authenticator", -] - -# SPECIFY FIELD TAGS FOR DEDUPLICATION -FIELD_TAGS = [ "url_base", ] +# the placeholder for collected duplicates +DUPLICATES: DuplicatesType = defaultdict(list, {}) + def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: """ @@ -41,49 +39,72 @@ def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: resolved_manifest: A dictionary representing a JSON structure to be analyzed. Returns: - A refactored JSON structure with common properties extracted. 
+ A refactored JSON structure with common properties extracted to `definitions.shared`, + the duplicated properties replaced with references """ try: _manifest = copy.deepcopy(resolved_manifest) definitions = _manifest.get(DEF_TAG, {}) - field_duplicates, component_duplicates = _collect_all_duplicates(definitions) - _process_duplicates(definitions, field_duplicates, component_duplicates) + + _collect_duplicates(definitions) + _handle_duplicates(definitions) + return _manifest except ManifestDeduplicationException: - # we don't want to fix every single error which might occur, - # due to the varaety of possible manifest configurations, # if any arror occurs, we just return the original manifest. return resolved_manifest -def _process_duplicates( - definitions: DefinitionsType, - field_duplicates: FieldDuplicatesType, - component_duplicates: ComponentDuplicatesType, -) -> None: +def _replace_duplicates_with_refs(definitions: ManifestType) -> None: + """ + Process duplicate objects and replace them with references. + + Args: + definitions: The definitions dictionary to modify + """ + for _, occurrences in DUPLICATES.items(): + # Skip non-duplicates + if len(occurrences) < N_OCCURANCES: + continue + + # Take the value from the first occurrence, as they are the same + path, _, value = occurrences[0] + # take the component's name as the last part of it's path + key = path[-1] + # Create a meaningful reference key + ref_key = _create_reference_key(definitions, key) + # Add to definitions + _add_to_shared_definitions(definitions, ref_key, value) + + # Replace all occurrences with references + for path, parent_obj, _ in occurrences: + if path: # Make sure the path is valid + key = path[-1] + parent_obj[key] = _create_ref_object(ref_key) + + +def _handle_duplicates(definitions: DefinitionsType) -> None: """ - Process the duplicates and replace them with references. + Process the DUPLICATES and replace them with references. 
Args: - field_duplicates: Dictionary of duplicate primitive values - component_duplicates: Dictionary of duplicate objects + DUPLICATES: Dictionary of duplicate objects """ # process duplicates only if there are any - if len(field_duplicates) > 0 or len(component_duplicates) > 0: + if len(DUPLICATES) > 0: if not SHARED_TAG in definitions: definitions[SHARED_TAG] = {} try: - _process_component_duplicates(definitions, component_duplicates) - _process_field_duplicates(definitions, field_duplicates) + _replace_duplicates_with_refs(definitions) except Exception as e: raise ManifestDeduplicationException(str(e)) -def _is_allowed_component(key: str) -> bool: +def _is_allowed_tag(key: str) -> bool: """ - Check if the key is an allowed component tag. + Check if the key is an allowed tag for deduplication. Args: key: The key to check @@ -91,25 +112,56 @@ def _is_allowed_component(key: str) -> bool: Returns: True if the key is allowed, False otherwise """ - return key in COMPONENT_TAGS + return key in TAGS -def _is_allowed_field(key: str) -> bool: +def _add_duplicate( + current_path: List[str], + obj: Dict, + value: Any, + key: Optional[str] = None, +) -> None: """ - Check if the key is an allowed field tag. + Adds a duplicate record of an observed object by computing a unique hash for the provided value. - Args: - key: The key to check + This function computes a hash for the given value (or a dictionary composed of the key and value if a key is provided) + and appends a tuple containing the current path, the original object, and the value to the global DUPLICATES + dictionary under the corresponding hash. - Returns: - True if the key is allowed, False otherwise + Parameters: + current_path (List[str]): The list of keys or indices representing the current location in the object hierarchy. + obj (Dict): The original dictionary object where the duplicate is observed. + value (Any): The value to be hashed and used for identifying duplicates. 
+ key (Optional[str]): An optional key that, if provided, wraps the value in a dictionary before hashing. + """ + # create hash for the duplicate observed + value_to_hash = value if key is None else {key: value} + obj_hash = _hash_object(value_to_hash) + if obj_hash: + DUPLICATES[obj_hash].append((current_path, obj, value)) + + +def _add_to_shared_definitions( + definitions: DefinitionsType, + key: str, + value: Any, +) -> DefinitionsType: """ - return key in FIELD_TAGS + Add a value to the shared definitions under the specified key. + + Args: + definitions: The definitions dictionary to modify + key: The key to use + value: The value to add + """ + + if key not in definitions[SHARED_TAG]: + definitions[SHARED_TAG][key] = value + + return definitions -def _collect_all_duplicates( - node: ManifestType, -) -> Tuple[FieldDuplicatesType, ComponentDuplicatesType]: +def _collect_duplicates(node: ManifestType, path: Optional[List[str]] = None) -> None: """ Traverse the JSON object and collect all potential duplicate values and objects. 
@@ -117,48 +169,39 @@ def _collect_all_duplicates( node: The JSON object to analyze Returns: - A tuple of (field_duplicates, component_duplicates) + DUPLICATES: A dictionary of duplicate objects """ - field_duplicates: FieldDuplicatesType = defaultdict(list) - component_duplicates: ComponentDuplicatesType = defaultdict(list) - - def collect_duplicates(obj: Dict, path: Optional[List[str]] = None) -> None: - if not isinstance(obj, dict): + try: + if not isinstance(node, dict): return path = [] if path is None else path + # Check if the object is empty - for key, value in obj.items(): + for key, value in node.items(): current_path = path + [key] if isinstance(value, dict): # First process nested dictionaries - collect_duplicates(value, current_path) - + _collect_duplicates(value, current_path) # Process allowed-only component tags - if _is_allowed_component(key): - obj_hash = _hash_object(value) - if obj_hash: - component_duplicates[obj_hash].append((current_path, obj, value)) + if _is_allowed_tag(key): + _add_duplicate(current_path, node, value) + + # handle primitive types + elif isinstance(value, (str, int, float, bool)): + # Process allowed-only field tags + if _is_allowed_tag(key): + _add_duplicate(current_path, node, value, key) - # handle list[dict] cases + # handle list cases elif isinstance(value, list): for i, item in enumerate(value): - collect_duplicates(item, current_path + [str(i)]) - - # Process allowed-only field tags - elif _is_allowed_field(key): - hashable_value = _make_hashable(value) - field_duplicates[(key, hashable_value)].append((current_path, obj)) - - try: - collect_duplicates(node) + _collect_duplicates(item, current_path + [str(i)]) except Exception as e: raise ManifestDeduplicationException(str(e)) - return field_duplicates, component_duplicates - def _hash_object(node: Dict) -> Optional[str]: """ @@ -173,26 +216,10 @@ def _hash_object(node: Dict) -> Optional[str]: if isinstance(node, Dict): # Sort keys to ensure consistent hash for same 
content return hashlib.md5(json.dumps(node, sort_keys=True).encode()).hexdigest() - return None -def _make_hashable(value: Any) -> Any: - """ - Convert a value to a hashable representation. - - Args: - value: The value to make hashable - - Returns: - A hashable representation of the value - """ - return json.dumps(value) if not isinstance(value, Hashable) else value - - -def _create_reference_key( - definitions: DefinitionsType, key: str, value: Optional[Any] = None -) -> str: +def _create_reference_key(definitions: DefinitionsType, key: str) -> str: """ Create a unique reference key and handle collisions. @@ -206,9 +233,6 @@ def _create_reference_key( counter = 1 while key in definitions[SHARED_TAG]: - # If the value is already in shared definitions with this key, no need to rename - if value is not None and _is_same_value(definitions[SHARED_TAG].get(key), value): - return key key = f"{key}_{counter}" counter += 1 return key @@ -225,97 +249,3 @@ def _create_ref_object(ref_key: str) -> Dict[str, str]: A reference object in the proper format """ return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{ref_key}"} - - -def _is_same_value(val1: Any, val2: Any) -> bool: - """ - Check if two values are the same by comparing their JSON representation. - - Args: - val1: First value - val2: Second value - - Returns: - True if the values are the same, False otherwise - """ - return json.dumps(val1, sort_keys=True) == json.dumps(val2, sort_keys=True) - - -def _process_component_duplicates( - definitions: ManifestType, - component_duplicates: ComponentDuplicatesType, -) -> None: - """ - Process duplicate objects and replace them with references. 
- - Args: - definitions: The definitions dictionary to modify - component_duplicates: Dictionary of duplicate objects - """ - for obj_hash, occurrences in component_duplicates.items(): - # Skip non-duplicates - if len(occurrences) < N_OCCURANCES: - continue - - # Take the value from the first occurrence, as they are the same - path, _, value = occurrences[0] - # take the component's name as the last part of it's path - key = path[-1] - # Create a meaningful reference key - ref_key = _create_reference_key(definitions, key) - # Add to definitions - _add_to_shared_definitions(definitions, ref_key, value) - - # Replace all occurrences with references - for path, parent_obj, _ in occurrences: - if path: # Make sure the path is valid - key = path[-1] - parent_obj[key] = _create_ref_object(ref_key) - - -def _add_to_shared_definitions( - definitions: DefinitionsType, - key: str, - value: Any, -) -> DefinitionsType: - """ - Add a value to the shared definitions under the specified key. - - Args: - definitions: The definitions dictionary to modify - key: The key to use - value: The value to add - """ - - if key not in definitions[SHARED_TAG]: - definitions[SHARED_TAG][key] = value - - return definitions - - -def _process_field_duplicates( - definitions: ManifestType, - field_duplicates: FieldDuplicatesType, -) -> None: - """ - Process duplicate primitive values and replace them with references. 
- - Args: - definitions: The definitions dictionary to modify - field_duplicates: Dictionary of duplicate primitive values - """ - - for (key, value), occurrences in field_duplicates.items(): - # Skip non-duplicates - if len(occurrences) < N_OCCURANCES: - continue - - ref_key = _create_reference_key(definitions, key, value) - # Add to definitions if not already there - _add_to_shared_definitions(definitions, ref_key, value) - - # Replace all occurrences with references - for path, parent_obj in occurrences: - if path: # Make sure the path is valid - key = path[-1] - parent_obj[key] = _create_ref_object(ref_key) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 5d68b197b..987187fb5 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -108,9 +108,9 @@ def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]: preprocessed_manifest = self._evaluate_node(manifest, manifest, set()) # we need to reduce commonalities in the manifest after the references have been resolved - reduced_manifest = deduplicate_definitions(preprocessed_manifest) + deduplicated_manifest = deduplicate_definitions(preprocessed_manifest) - return reduced_manifest + return deduplicated_manifest def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: if isinstance(node, dict): From 691d16abfc72319b8b1bc57246a20151669865db Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 27 Mar 2025 15:05:43 +0200 Subject: [PATCH 03/27] updated duplicates collection --- .../parsers/manifest_deduplicator.py | 67 +++++++++++-------- .../test_connector_builder_handler.py | 1 + .../declarative/interpolation/test_macros.py | 2 + 3 files changed, 42 insertions(+), 28 deletions(-) diff --git 
a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py index 347e6a0ee..e0da3c3a0 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -13,7 +13,7 @@ # Type definitions for better readability ManifestType = Dict[str, Any] DefinitionsType = Dict[str, Any] -DuplicatesType = DefaultDict[str, List[Tuple[List[str], Dict, Dict]]] +DuplicatesType = DefaultDict[str, List[Tuple[List[str], Dict[str, Any], Dict[str, Any]]]] # Configuration constants N_OCCURANCES = 2 @@ -27,9 +27,6 @@ "url_base", ] -# the placeholder for collected duplicates -DUPLICATES: DuplicatesType = defaultdict(list, {}) - def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: """ @@ -47,8 +44,8 @@ def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: _manifest = copy.deepcopy(resolved_manifest) definitions = _manifest.get(DEF_TAG, {}) - _collect_duplicates(definitions) - _handle_duplicates(definitions) + duplicates = _collect_duplicates(definitions) + _handle_duplicates(definitions, duplicates) return _manifest except ManifestDeduplicationException: @@ -56,14 +53,14 @@ def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: return resolved_manifest -def _replace_duplicates_with_refs(definitions: ManifestType) -> None: +def _replace_duplicates_with_refs(definitions: ManifestType, duplicates: DuplicatesType) -> None: """ Process duplicate objects and replace them with references. 
Args: definitions: The definitions dictionary to modify """ - for _, occurrences in DUPLICATES.items(): + for _, occurrences in duplicates.items(): # Skip non-duplicates if len(occurrences) < N_OCCURANCES: continue @@ -84,20 +81,20 @@ def _replace_duplicates_with_refs(definitions: ManifestType) -> None: parent_obj[key] = _create_ref_object(ref_key) -def _handle_duplicates(definitions: DefinitionsType) -> None: +def _handle_duplicates(definitions: DefinitionsType, duplicates: DuplicatesType) -> None: """ - Process the DUPLICATES and replace them with references. + Process the duplicates and replace them with references. Args: - DUPLICATES: Dictionary of duplicate objects + duplicates: Dictionary of duplicate objects """ # process duplicates only if there are any - if len(DUPLICATES) > 0: + if len(duplicates) > 0: if not SHARED_TAG in definitions: definitions[SHARED_TAG] = {} try: - _replace_duplicates_with_refs(definitions) + _replace_duplicates_with_refs(definitions, duplicates) except Exception as e: raise ManifestDeduplicationException(str(e)) @@ -116,8 +113,9 @@ def _is_allowed_tag(key: str) -> bool: def _add_duplicate( + duplicates: DuplicatesType, current_path: List[str], - obj: Dict, + obj: Dict[str, Any], value: Any, key: Optional[str] = None, ) -> None: @@ -125,10 +123,11 @@ def _add_duplicate( Adds a duplicate record of an observed object by computing a unique hash for the provided value. This function computes a hash for the given value (or a dictionary composed of the key and value if a key is provided) - and appends a tuple containing the current path, the original object, and the value to the global DUPLICATES + and appends a tuple containing the current path, the original object, and the value to the duplicates dictionary under the corresponding hash. Parameters: + duplicates (DuplicatesType): The dictionary to store duplicate records. current_path (List[str]): The list of keys or indices representing the current location in the object hierarchy. 
obj (Dict): The original dictionary object where the duplicate is observed. value (Any): The value to be hashed and used for identifying duplicates. @@ -138,7 +137,7 @@ def _add_duplicate( value_to_hash = value if key is None else {key: value} obj_hash = _hash_object(value_to_hash) if obj_hash: - DUPLICATES[obj_hash].append((current_path, obj, value)) + duplicates[obj_hash].append((current_path, obj, value)) def _add_to_shared_definitions( @@ -161,49 +160,61 @@ def _add_to_shared_definitions( return definitions -def _collect_duplicates(node: ManifestType, path: Optional[List[str]] = None) -> None: +def _collect_duplicates(node: ManifestType) -> DuplicatesType: """ Traverse the JSON object and collect all potential duplicate values and objects. Args: - node: The JSON object to analyze + node: The JSON object to analyze. Returns: - DUPLICATES: A dictionary of duplicate objects + duplicates: A dictionary of duplicate objects. """ - try: - if not isinstance(node, dict): + def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: + """ + The closure to recursively collect duplicates in the JSON object. + + Args: + obj: The current object being analyzed. + path: The current path in the object hierarchy. 
+ """ + if not isinstance(obj, dict): return path = [] if path is None else path - # Check if the object is empty - for key, value in node.items(): + for key, value in obj.items(): current_path = path + [key] if isinstance(value, dict): # First process nested dictionaries - _collect_duplicates(value, current_path) + _collect(value, current_path) # Process allowed-only component tags if _is_allowed_tag(key): - _add_duplicate(current_path, node, value) + _add_duplicate(duplicates, current_path, obj, value) # handle primitive types elif isinstance(value, (str, int, float, bool)): # Process allowed-only field tags if _is_allowed_tag(key): - _add_duplicate(current_path, node, value, key) + _add_duplicate(duplicates, current_path, obj, value, key) # handle list cases elif isinstance(value, list): for i, item in enumerate(value): - _collect_duplicates(item, current_path + [str(i)]) + _collect(item, current_path + [str(i)]) + + duplicates: DuplicatesType = defaultdict(list, {}) + + try: + _collect(node) + return duplicates except Exception as e: raise ManifestDeduplicationException(str(e)) -def _hash_object(node: Dict) -> Optional[str]: +def _hash_object(node: Dict[str, Any]) -> Optional[str]: """ Create a unique hash for a dictionary object. 
diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index af5968faa..f703c23e4 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -384,6 +384,7 @@ def test_resolve_manifest(valid_resolve_manifest_config_file): }, "record_selector": {"extractor": {"field_path": ["result"]}}, }, + "shared": {}, }, "streams": [ { diff --git a/unit_tests/sources/declarative/interpolation/test_macros.py b/unit_tests/sources/declarative/interpolation/test_macros.py index 5fbea2601..d7a491557 100644 --- a/unit_tests/sources/declarative/interpolation/test_macros.py +++ b/unit_tests/sources/declarative/interpolation/test_macros.py @@ -5,6 +5,7 @@ import datetime import pytest +from freezegun import freeze_time from airbyte_cdk.sources.declarative.interpolation.macros import macros @@ -29,6 +30,7 @@ def test_macros_export(test_name, fn_name, found_in_macros): assert fn_name not in macros +@freeze_time("2022-01-01") @pytest.mark.parametrize( "input_value, format, input_format, expected_output", [ From 081e7a8bfbb16bb97a9787d81a8943d4006b4a0e Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 31 Mar 2025 17:49:01 +0300 Subject: [PATCH 04/27] deduplicate most frequent tags, use existing refs if definitions.shared. 
is already provided --- .../declarative_component_schema.yaml | 3 + .../declarative/parsers/custom_exceptions.py | 4 +- .../parsers/manifest_deduplicator.py | 271 +++++-- .../parsers/manifest_reference_resolver.py | 22 +- .../test_connector_builder_handler.py | 1 - .../test_manifest_reference_resolver.py | 663 ++++++++++++++---- 6 files changed, 750 insertions(+), 214 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index b7c0d84a0..999336f64 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1583,6 +1583,7 @@ definitions: title: URL Base description: The base URL (scheme and host, e.g. "https://api.example.com") to match. type: string + sharable: True url_path_pattern: title: URL Path Pattern description: A regular expression pattern to match the URL path. @@ -1841,6 +1842,7 @@ definitions: - "{{ config['base_url'] or 'https://app.posthog.com'}}/api" - "https://connect.squareup.com/v2/quotes/{{ stream_partition['id'] }}/quote_line_groups" - "https://example.com/api/v1/resource/{{ next_page_token['id'] }}" + sharable: True path: title: URL Path description: Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. @@ -1872,6 +1874,7 @@ definitions: - "$ref": "#/definitions/SessionTokenAuthenticator" - "$ref": "#/definitions/LegacySessionTokenAuthenticator" - "$ref": "#/definitions/SelectiveAuthenticator" + sharable: True error_handler: title: Error Handler description: Error handler component that defines how to handle errors. 
diff --git a/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py b/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py index a5c192511..b847ee26c 100644 --- a/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +++ b/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py @@ -26,5 +26,5 @@ class ManifestDeduplicationException(Exception): Raised when a circular reference is detected in a manifest. """ - def __init__(self, exception: str) -> None: - super().__init__(f"Failed to deduplicate manifest: {exception}") + def __init__(self, message: str) -> None: + super().__init__(f"Failed to deduplicate manifest: {message}") diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py index e0da3c3a0..296e12d74 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -5,30 +5,34 @@ import copy import hashlib import json +import pkgutil from collections import defaultdict from typing import Any, DefaultDict, Dict, List, Optional, Tuple +import yaml + from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ManifestDeduplicationException # Type definitions for better readability ManifestType = Dict[str, Any] DefinitionsType = Dict[str, Any] -DuplicatesType = DefaultDict[str, List[Tuple[List[str], Dict[str, Any], Dict[str, Any]]]] +DuplicateOccurancesType = List[Tuple[List[str], Dict[str, Any], Dict[str, Any]]] +DuplicatesType = DefaultDict[str, DuplicateOccurancesType] # Configuration constants N_OCCURANCES = 2 DEF_TAG = "definitions" +STREAMS_TAG = "streams" SHARED_TAG = "shared" - -# SPECIFY TAGS FOR DEDUPLICATION -TAGS = [ - "authenticator", - "url_base", -] +SHARABLE_TAG = "sharable" +SCHEMA_LOADER_TAG = "schema_loader" +SCHEMAS_TAG = "schemas" +SCHEMA_TAG = "schema" +PROPERTIES_TAG = "properties" -def deduplicate_definitions(resolved_manifest: 
ManifestType) -> ManifestType: +def deduplicate_minifest(resolved_manifest: ManifestType) -> ManifestType: """ Find commonalities in the input JSON structure and refactor it to avoid redundancy. @@ -42,18 +46,99 @@ def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType: try: _manifest = copy.deepcopy(resolved_manifest) - definitions = _manifest.get(DEF_TAG, {}) - duplicates = _collect_duplicates(definitions) - _handle_duplicates(definitions, duplicates) + # prepare the `definitions` tag + _prepare_definitions(_manifest) + # collect duplicates for a given manifest + duplicates = _collect_duplicates(_manifest) + # replace duplicates with references, if any + _handle_duplicates(_manifest, duplicates) + # post processing the manifest + _reference_schemas(_manifest) return _manifest except ManifestDeduplicationException: - # if any arror occurs, we just return the original manifest. + # if any error occurs, we just return the original manifest. return resolved_manifest -def _replace_duplicates_with_refs(definitions: ManifestType, duplicates: DuplicatesType) -> None: +def _get_sharable_tags() -> List[str]: + # we need to recursively find the tags in the component schema that has a `sharable` key inside. 
+ tags = [] + + # we need to find the `sharable` tags in the component schema + def _find_sharable(schema: Dict[str, Any]) -> None: + for root_key, root_value in schema.items(): + properties = root_value.get(PROPERTIES_TAG, {}) + + for inner_key, inner_value in properties.items(): + if SHARABLE_TAG in inner_value.keys(): + tags.append(inner_key) + + _find_sharable(_get_declarative_component_schema().get(DEF_TAG, {})) + + # return unique tags only + return list(set(tags)) + + +def _get_declarative_component_schema() -> Dict[str, Any]: + try: + raw_component_schema = pkgutil.get_data( + "airbyte_cdk", "sources/declarative/declarative_component_schema.yaml" + ) + if raw_component_schema is not None: + declarative_component_schema = yaml.load(raw_component_schema, Loader=yaml.SafeLoader) + return declarative_component_schema # type: ignore + else: + raise RuntimeError( + "Failed to read manifest component json schema required for deduplication" + ) + except FileNotFoundError as e: + raise FileNotFoundError( + f"Failed to read manifest component json schema required for deduplication: {e}" + ) + + +def _prepare_definitions(manifest: ManifestType) -> None: + """ + Clean the definitions in the manifest by removing unnecessary properties. + This function modifies the manifest in place. + Args: + manifest: The manifest to clean + """ + # Check if the definitions tag exists + if not DEF_TAG in manifest: + manifest[DEF_TAG] = {} + + # remove everything from definitions tag except of `shared`, after processing + for key in list(manifest[DEF_TAG].keys()): + if key != SHARED_TAG: + manifest[DEF_TAG].pop(key, None) + + +def _reference_schemas(manifest: ManifestType) -> None: + """ + Process the definitions in the manifest to move streams from definitions to the main stream list. + This function modifies the manifest in place. 
+ + Args: + manifest: The manifest to process + """ + + # create the ref tag to each stream in the manifest + if STREAMS_TAG in manifest: + for stream in manifest[STREAMS_TAG]: + stream_name = stream.get("name") + # reference the stream schema for the stream to where it's storred + if stream_name in manifest[SCHEMAS_TAG].keys(): + stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = _create_schema_ref(stream_name) + else: + raise ManifestDeduplicationException( + f"Stream {stream_name} not found in `schemas`. Please check the manifest." + ) + + +def _replace_duplicates_with_refs(manifest: ManifestType, duplicates: DuplicatesType) -> None: """ Process duplicate objects and replace them with references. @@ -61,57 +146,42 @@ def _replace_duplicates_with_refs(definitions: ManifestType, duplicates: Duplica definitions: The definitions dictionary to modify """ for _, occurrences in duplicates.items(): - # Skip non-duplicates - if len(occurrences) < N_OCCURANCES: - continue - - # Take the value from the first occurrence, as they are the same - path, _, value = occurrences[0] # take the component's name as the last part of it's path - key = path[-1] - # Create a meaningful reference key - ref_key = _create_reference_key(definitions, key) - # Add to definitions - _add_to_shared_definitions(definitions, ref_key, value) + key, value = _get_key_value_from_occurances(occurrences) + is_shared_def = _is_shared_definition(manifest, key) + + # Add to definitions if not there already + if not is_shared_def: + _add_to_shared_definitions(manifest, key, value) - # Replace all occurrences with references - for path, parent_obj, _ in occurrences: - if path: # Make sure the path is valid - key = path[-1] - parent_obj[key] = _create_ref_object(ref_key) + # Replace occurrences with references + for path, parent_obj, value in occurrences: + if is_shared_def: + if value == _get_shared_definition_value(manifest, key): + parent_obj[key] = _create_shared_definition_ref(key) + else: + parent_obj[key] = 
_create_shared_definition_ref(key) -def _handle_duplicates(definitions: DefinitionsType, duplicates: DuplicatesType) -> None: +def _handle_duplicates(manifest: DefinitionsType, duplicates: DuplicatesType) -> None: """ Process the duplicates and replace them with references. Args: duplicates: Dictionary of duplicate objects """ - # process duplicates only if there are any + if len(duplicates) > 0: - if not SHARED_TAG in definitions: - definitions[SHARED_TAG] = {} + # Check if the shared tag exists + if not SHARED_TAG in manifest[DEF_TAG]: + manifest[DEF_TAG][SHARED_TAG] = {} try: - _replace_duplicates_with_refs(definitions, duplicates) + _replace_duplicates_with_refs(manifest, duplicates) except Exception as e: raise ManifestDeduplicationException(str(e)) -def _is_allowed_tag(key: str) -> bool: - """ - Check if the key is an allowed tag for deduplication. - - Args: - key: The key to check - - Returns: - True if the key is allowed, False otherwise - """ - return key in TAGS - - def _add_duplicate( duplicates: DuplicatesType, current_path: List[str], @@ -141,7 +211,7 @@ def _add_duplicate( def _add_to_shared_definitions( - definitions: DefinitionsType, + manifest: DefinitionsType, key: str, value: Any, ) -> DefinitionsType: @@ -154,10 +224,10 @@ def _add_to_shared_definitions( value: The value to add """ - if key not in definitions[SHARED_TAG]: - definitions[SHARED_TAG][key] = value + if key not in manifest[DEF_TAG][SHARED_TAG]: + manifest[DEF_TAG][SHARED_TAG][key] = value - return definitions + return manifest def _collect_duplicates(node: ManifestType) -> DuplicatesType: @@ -179,25 +249,30 @@ def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: obj: The current object being analyzed. path: The current path in the object hierarchy. 
""" + if not isinstance(obj, dict): return path = [] if path is None else path # Check if the object is empty for key, value in obj.items(): + # do not collect duplicates from `definitions` tag + if key == DEF_TAG: + continue + current_path = path + [key] if isinstance(value, dict): # First process nested dictionaries _collect(value, current_path) # Process allowed-only component tags - if _is_allowed_tag(key): + if key in sharable_tags: _add_duplicate(duplicates, current_path, obj, value) # handle primitive types elif isinstance(value, (str, int, float, bool)): # Process allowed-only field tags - if _is_allowed_tag(key): + if key in sharable_tags: _add_duplicate(duplicates, current_path, obj, value, key) # handle list cases @@ -205,15 +280,37 @@ def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: for i, item in enumerate(value): _collect(item, current_path + [str(i)]) - duplicates: DuplicatesType = defaultdict(list, {}) + # get the tags marked as `sharable` in the component schema + sharable_tags = _get_sharable_tags() + duplicates: DuplicatesType = defaultdict(list, {}) try: - _collect(node) + if sharable_tags: + _collect(node) + # clean non-duplicates and sort based on the count of occurrences + return clean_and_sort_duplicates(duplicates) return duplicates except Exception as e: raise ManifestDeduplicationException(str(e)) +def clean_and_sort_duplicates(duplicates: DuplicatesType) -> DuplicatesType: + """ + Clean non-duplicates and sort the duplicates by their occurrences. 
+ + Args: + duplicates: The duplicates dictionary to sort + Returns: + A sorted duplicates dictionary + """ + # clean non-duplicates + duplicates = defaultdict(list, {k: v for k, v in duplicates.items() if len(v) >= N_OCCURANCES}) + # sort the duplicates by their occurrences + return defaultdict( + list, {k: v for k, v in sorted(duplicates.items(), key=lambda x: len(x[1]), reverse=True)} + ) + + def _hash_object(node: Dict[str, Any]) -> Optional[str]: """ Create a unique hash for a dictionary object. @@ -230,28 +327,57 @@ def _hash_object(node: Dict[str, Any]) -> Optional[str]: return None -def _create_reference_key(definitions: DefinitionsType, key: str) -> str: +def _is_shared_definition(manifest: DefinitionsType, key: str) -> bool: + """ + Check if the key already exists in the shared definitions. + + Args: + key: The key to check + definitions: The definitions dictionary with definitions + + Returns: + True if the key exists in the shared definitions, False otherwise + """ + return key in manifest[DEF_TAG][SHARED_TAG] + + +def _get_shared_definition_value(manifest: DefinitionsType, key: str) -> Any: """ - Create a unique reference key and handle collisions. + Get the value of a shared definition by its key. Args: - key: The base key to use + key: The key to check definitions: The definitions dictionary with definitions + Returns: + The value of the shared definition + """ + if key in manifest[DEF_TAG][SHARED_TAG]: + return manifest[DEF_TAG][SHARED_TAG][key] + else: + raise ManifestDeduplicationException( + f"Key {key} not found in shared definitions. Please check the manifest." + ) + + +def _get_key_value_from_occurances(occurrences: DuplicateOccurancesType) -> Tuple[str, Any]: + """ + Get the key from the occurrences list. 
+ + Args: + occurrences: The occurrences list Returns: - A unique reference key + The key and value from the occurrences """ - counter = 1 - while key in definitions[SHARED_TAG]: - key = f"{key}_{counter}" - counter += 1 - return key + # Take the value from the first occurrence, as they are the same + path, _, value = occurrences[0] + return path[-1], value # Return the component's name as the last part of its path -def _create_ref_object(ref_key: str) -> Dict[str, str]: +def _create_shared_definition_ref(ref_key: str) -> Dict[str, str]: """ - Create a reference object using the specified key. + Create a reference object for the shared definitions using the specified key. Args: ref_key: The reference key to use @@ -260,3 +386,16 @@ def _create_ref_object(ref_key: str) -> Dict[str, str]: A reference object in the proper format """ return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{ref_key}"} + + +def _create_schema_ref(ref_key: str) -> Dict[str, str]: + """ + Create a reference object for stream schema using the specified key. 
+ + Args: + ref_key: The reference key to use + + Returns: + A reference object in the proper format + """ + return {"$ref": f"#/{SCHEMAS_TAG}/{ref_key}"} diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 987187fb5..6efb47dc0 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -3,13 +3,13 @@ # import re -from typing import Any, Mapping, Set, Tuple, Union +from typing import Any, Mapping, Optional, Set, Tuple, Union from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ( CircularReferenceException, UndefinedReferenceException, ) -from airbyte_cdk.sources.declarative.parsers.manifest_deduplicator import deduplicate_definitions +from airbyte_cdk.sources.declarative.parsers.manifest_deduplicator import deduplicate_minifest REF_TAG = "$ref" @@ -100,17 +100,25 @@ class ManifestReferenceResolver: until we find a key with the given path, or until there is nothing to traverse. """ - def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]: + def preprocess_manifest( + self, + manifest: Mapping[str, Any], + reduce_commons: Optional[bool] = False, + ) -> Mapping[str, Any]: """ :param manifest: incoming manifest that could have references to previously defined components + :param reduce_commons: whether to deduplicate the commonalities in the manifest after pre-processing. """ - preprocessed_manifest = self._evaluate_node(manifest, manifest, set()) - # we need to reduce commonalities in the manifest after the references have been resolved - deduplicated_manifest = deduplicate_definitions(preprocessed_manifest) + # we need to reduce commonalities in the manifest after the references have been resolved, + # used mostly for Connector Builder use cases. 
+ if reduce_commons: + deduplicated_manifest = deduplicate_minifest(preprocessed_manifest) + return deduplicated_manifest - return deduplicated_manifest + # return deduplicated_manifest + return preprocessed_manifest # type: ignore def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: if isinstance(node, dict): diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index f703c23e4..af5968faa 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -384,7 +384,6 @@ def test_resolve_manifest(valid_resolve_manifest_config_file): }, "record_selector": {"extractor": {"field_path": ["result"]}}, }, - "shared": {}, }, "streams": [ { diff --git a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py index 29012757b..f589b8f19 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py @@ -16,7 +16,6 @@ resolver = ManifestReferenceResolver() -# @ def test_refer(): content = {"limit": 50, "limit_ref": "#/limit"} config = resolver.preprocess_manifest(content) @@ -152,7 +151,10 @@ def test_list_of_dicts(): def test_multiple_levels_of_indexing(): - content = {"list": [{"A": ["a1", "a2"]}, {"B": ["b1", "b2"]}], "elem_ref": "#/list/1/B/0"} + content = { + "list": [{"A": ["a1", "a2"]}, {"B": ["b1", "b2"]}], + "elem_ref": "#/list/1/B/0", + } config = resolver.preprocess_manifest(content) elem_ref = config["elem_ref"] assert elem_ref == "b1" @@ -164,8 +166,9 @@ def test_circular_reference(): resolver.preprocess_manifest(content) -def test_deduplicate_manifest_definitions(): +def 
test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequent_is_shared(): content = { + "type": "DeclarativeSource", "definitions": { "streams": { "A": { @@ -173,236 +176,620 @@ def test_deduplicate_manifest_definitions(): "name": "A", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": "https://pokeapi.co/api/v2/", - "authenticator": { - "type": "BasicHttpAuthenticator", - "api_key": '{{ config["api_token"] }}', - }, - "path": "path_to_A", + "$ref": "#/definitions/requester_A", + "path": "A", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": { - "a": { - "type": "string", - } - }, - "additionalProperties": True, - }, + "schema": {"$ref": "#/schemas/A"}, }, }, - "A_1": { + "B": { "type": "DeclarativeStream", - "name": "A_1", + "name": "B", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": "https://pokeapi.co/api/v2/", - "authenticator": { - "type": "BasicHttpAuthenticator", - "api_key": '{{ config["api_token"] }}', - }, - "path": "path_to_A", + "$ref": "#/definitions/requester_B", + "path": "B", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": { - "a_1": { - "type": "string", - } - }, - "additionalProperties": True, - }, + "schema": {"$ref": "#/schemas/B"}, }, }, - "B": { + "C": { "type": "DeclarativeStream", - 
"name": "B", + "name": "C", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": "https://pokeapi.co/api/v2/", - "authenticator": { - "type": "BasicHttpAuthenticator", - "api_key": '{{ config["api_token"] }}', - }, - "path": "path_to_B", + "$ref": "#/definitions/requester_A", + "path": "C", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": { - "b": { - "type": "string", - } - }, - "additionalProperties": True, - }, + "schema": {"$ref": "#/schemas/C"}, }, }, - "B_1": { + "D": { "type": "DeclarativeStream", - "name": "B_1", + "name": "D", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": "https://pokeapi.co/api/v2/", - "authenticator": { - "type": "BasicHttpAuthenticator", - "api_key": '{{ config["api_token"] }}', - }, - "path": "path_to_B", + "$ref": "#/definitions/requester_B", + "path": "D", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": { - "b": { - "type": "string", - } - }, - "additionalProperties": True, + "schema": {"$ref": "#/schemas/D"}, + }, + }, + "E": { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": 
"DpathExtractor", "field_path": []}, }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, }, }, - } - } + }, + # dummy requesters to be resolved and deduplicated + # to the shared `url_base` in the `definitions.shared` section + "requester_A": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + }, + "requester_B": { + "type": "HttpRequester", + "url_base": "https://example.com/v2/", + }, + }, + "streams": [ + {"$ref": "#/definitions/streams/A"}, + {"$ref": "#/definitions/streams/B"}, + {"$ref": "#/definitions/streams/C"}, + {"$ref": "#/definitions/streams/D"}, + {"$ref": "#/definitions/streams/E"}, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, } expected = { + "type": "DeclarativeSource", + "definitions": {"shared": {"url_base": "https://example.com/v2/"}}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "A", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": 
"InlineSchemaLoader", + "schema": {"$ref": "#/schemas/A"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "path": "B", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/B"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "C", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "C", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/C"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "D", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "path": "D", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/D"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": 
"InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, + }, + }, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, + } + config = resolver.preprocess_manifest(content, reduce_commons=True) + assert config == expected + + +def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): + content = { + "type": "DeclarativeSource", "definitions": { + "shared": {"url_base": "https://example.com/v2/"}, "streams": { "A": { "type": "DeclarativeStream", "name": "A", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, - "authenticator": {"$ref": "#/definitions/shared/authenticator"}, - "path": "path_to_A", + "$ref": "#/definitions/requester_A", + "path": "A", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": {"a": {"type": "string"}}, - "additionalProperties": True, - }, + "schema": {"$ref": "#/schemas/A"}, }, }, - "A_1": { + "B": { "type": "DeclarativeStream", - "name": "A_1", 
+ "name": "B", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, - "authenticator": {"$ref": "#/definitions/shared/authenticator"}, - "path": "path_to_A", + "$ref": "#/definitions/requester_B", + "path": "B", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": {"a_1": {"type": "string"}}, - "additionalProperties": True, - }, + "schema": {"$ref": "#/schemas/B"}, }, }, - "B": { + "C": { "type": "DeclarativeStream", - "name": "B", + "name": "C", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, - "authenticator": {"$ref": "#/definitions/shared/authenticator"}, - "path": "path_to_B", + "$ref": "#/definitions/requester_A", + "path": "C", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": {"b": {"type": "string"}}, - "additionalProperties": True, - }, + "schema": {"$ref": "#/schemas/C"}, }, }, - "B_1": { + "D": { "type": "DeclarativeStream", - "name": "B_1", + "name": "D", "retriever": { "type": "SimpleRetriever", - "decoder": {"type": "JsonDecoder"}, "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, - "authenticator": {"$ref": "#/definitions/shared/authenticator"}, - "path": "path_to_B", + "$ref": 
"#/definitions/requester_B", + "path": "D", "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, }, "schema_loader": { "type": "InlineSchemaLoader", - "schema": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "properties": {"b": {"type": "string"}}, - "additionalProperties": True, + "schema": {"$ref": "#/schemas/D"}, + }, + }, + "E": { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "E", + "http_method": "GET", }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, }, }, }, - "shared": { - "authenticator": { - "type": "BasicHttpAuthenticator", - "api_key": '{{ config["api_token"] }}', + # dummy requesters to be resolved and deduplicated + # to the shared `url_base` in the `definitions.shared` section + "requester_A": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + }, + "requester_B": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + }, + }, + "streams": [ + {"$ref": "#/definitions/streams/A"}, + {"$ref": "#/definitions/streams/B"}, + {"$ref": "#/definitions/streams/C"}, + {"$ref": "#/definitions/streams/D"}, + {"$ref": "#/definitions/streams/E"}, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + 
"additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, + } + expected = { + "type": "DeclarativeSource", + "definitions": {"shared": {"url_base": "https://example.com/v2/"}}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "A", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/A"}, }, - "url_base": "https://pokeapi.co/api/v2/", }, - } + { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "path": "B", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/B"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "C", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "C", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/C"}, + }, + }, + { + 
"type": "DeclarativeStream", + "name": "D", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "path": "D", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/D"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/url_base"}, + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, + }, + }, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, } - config = resolver.preprocess_manifest(content) + config = resolver.preprocess_manifest(content, reduce_commons=True) assert config == expected From 138b607adc3a44289d86e6e5c5b5526340a38e19 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 31 Mar 2025 
17:56:32 +0300 Subject: [PATCH 05/27] formatted" --- .../sources/declarative/parsers/manifest_deduplicator.py | 2 +- .../sources/declarative/parsers/manifest_reference_resolver.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py index 296e12d74..a18553625 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -88,7 +88,7 @@ def _get_declarative_component_schema() -> Dict[str, Any]: ) if raw_component_schema is not None: declarative_component_schema = yaml.load(raw_component_schema, Loader=yaml.SafeLoader) - return declarative_component_schema # type: ignore + return declarative_component_schema # type: ignore else: raise RuntimeError( "Failed to read manifest component json schema required for deduplication" diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 6efb47dc0..1e39bcd50 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -118,7 +118,7 @@ def preprocess_manifest( return deduplicated_manifest # return deduplicated_manifest - return preprocessed_manifest # type: ignore + return preprocessed_manifest # type: ignore def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: if isinstance(node, dict): From f10e60121bb9c73f22a43e93a8742196d546252e Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 31 Mar 2025 18:56:53 +0300 Subject: [PATCH 06/27] updated to account type for the given duplicated key --- .../parsers/manifest_deduplicator.py | 52 +++++++++++-------- .../test_manifest_reference_resolver.py | 20 +++---- 2 files changed, 40 insertions(+), 32 
deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py index a18553625..14193b869 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -49,10 +49,8 @@ def deduplicate_minifest(resolved_manifest: ManifestType) -> ManifestType: # prepare the `definitions` tag _prepare_definitions(_manifest) - # collect duplicates for a given manifest - duplicates = _collect_duplicates(_manifest) # replace duplicates with references, if any - _handle_duplicates(_manifest, duplicates) + _handle_duplicates(_manifest, _collect_duplicates(_manifest)) # post processing the manifest _reference_schemas(_manifest) @@ -147,20 +145,20 @@ def _replace_duplicates_with_refs(manifest: ManifestType, duplicates: Duplicates """ for _, occurrences in duplicates.items(): # take the component's name as the last part of it's path - key, value = _get_key_value_from_occurances(occurrences) - is_shared_def = _is_shared_definition(manifest, key) + type_key, key, value = _get_key_value_from_occurances(occurrences) + is_shared_def = _is_shared_definition(manifest, type_key, key) # Add to definitions if not there already if not is_shared_def: - _add_to_shared_definitions(manifest, key, value) + _add_to_shared_definitions(manifest, type_key, key, value) # Replace occurrences with references for path, parent_obj, value in occurrences: if is_shared_def: - if value == _get_shared_definition_value(manifest, key): - parent_obj[key] = _create_shared_definition_ref(key) + if value == _get_shared_definition_value(manifest, type_key, key): + parent_obj[key] = _create_shared_definition_ref(type_key, key) else: - parent_obj[key] = _create_shared_definition_ref(key) + parent_obj[key] = _create_shared_definition_ref(type_key, key) def _handle_duplicates(manifest: DefinitionsType, duplicates: DuplicatesType) -> None: @@ -212,6 
+210,7 @@ def _add_duplicate( def _add_to_shared_definitions( manifest: DefinitionsType, + type_key: str, key: str, value: Any, ) -> DefinitionsType: @@ -223,9 +222,11 @@ def _add_to_shared_definitions( key: The key to use value: The value to add """ + if type_key not in manifest[DEF_TAG][SHARED_TAG].keys(): + manifest[DEF_TAG][SHARED_TAG][type_key] = {} - if key not in manifest[DEF_TAG][SHARED_TAG]: - manifest[DEF_TAG][SHARED_TAG][key] = value + if key not in manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + manifest[DEF_TAG][SHARED_TAG][type_key][key] = value return manifest @@ -327,7 +328,7 @@ def _hash_object(node: Dict[str, Any]) -> Optional[str]: return None -def _is_shared_definition(manifest: DefinitionsType, key: str) -> bool: +def _is_shared_definition(manifest: DefinitionsType, type_key: str, key: str) -> bool: """ Check if the key already exists in the shared definitions. @@ -338,10 +339,16 @@ def _is_shared_definition(manifest: DefinitionsType, key: str) -> bool: Returns: True if the key exists in the shared definitions, False otherwise """ - return key in manifest[DEF_TAG][SHARED_TAG] + if type_key in manifest[DEF_TAG][SHARED_TAG].keys(): + # Check if the key exists in the shared definitions + if key in manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + return True -def _get_shared_definition_value(manifest: DefinitionsType, key: str) -> Any: + return False + + +def _get_shared_definition_value(manifest: DefinitionsType, type_key: str, key: str) -> Any: """ Get the value of a shared definition by its key. 
@@ -351,15 +358,16 @@ def _get_shared_definition_value(manifest: DefinitionsType, key: str) -> Any: Returns: The value of the shared definition """ - if key in manifest[DEF_TAG][SHARED_TAG]: - return manifest[DEF_TAG][SHARED_TAG][key] + if type_key in manifest[DEF_TAG][SHARED_TAG].keys(): + if key in manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + return manifest[DEF_TAG][SHARED_TAG][type_key][key] else: raise ManifestDeduplicationException( f"Key {key} not found in shared definitions. Please check the manifest." ) -def _get_key_value_from_occurances(occurrences: DuplicateOccurancesType) -> Tuple[str, Any]: +def _get_key_value_from_occurances(occurrences: DuplicateOccurancesType) -> Tuple[str, str, Any]: """ Get the key from the occurrences list. @@ -367,15 +375,15 @@ def _get_key_value_from_occurances(occurrences: DuplicateOccurancesType) -> Tupl occurrences: The occurrences list Returns: - The key and value from the occurrences + The key, type and value from the occurrences """ # Take the value from the first occurrence, as they are the same - path, _, value = occurrences[0] - return path[-1], value # Return the component's name as the last part of its path + path, obj, value = occurrences[0] + return obj["type"], path[-1], value # Return the component's name as the last part of its path -def _create_shared_definition_ref(ref_key: str) -> Dict[str, str]: +def _create_shared_definition_ref(type_key: str, key: str) -> Dict[str, str]: """ Create a reference object for the shared definitions using the specified key. 
@@ -385,7 +393,7 @@ def _create_shared_definition_ref(ref_key: str) -> Dict[str, str]: Returns: A reference object in the proper format """ - return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{ref_key}"} + return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{type_key}/{key}"} def _create_schema_ref(ref_key: str) -> Dict[str, str]: diff --git a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py index f589b8f19..fcb933ec8 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py @@ -330,7 +330,7 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ } expected = { "type": "DeclarativeSource", - "definitions": {"shared": {"url_base": "https://example.com/v2/"}}, + "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, "streams": [ { "type": "DeclarativeStream", @@ -361,7 +361,7 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, "path": "B", "http_method": "GET", }, @@ -405,7 +405,7 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, "path": "D", "http_method": "GET", }, @@ -427,7 +427,7 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, "path": 
"E", "http_method": "GET", }, @@ -484,7 +484,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): content = { "type": "DeclarativeSource", "definitions": { - "shared": {"url_base": "https://example.com/v2/"}, + "shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}, "streams": { "A": { "type": "DeclarativeStream", @@ -600,7 +600,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): }, "requester_B": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, }, }, "streams": [ @@ -645,7 +645,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): } expected = { "type": "DeclarativeSource", - "definitions": {"shared": {"url_base": "https://example.com/v2/"}}, + "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, "streams": [ { "type": "DeclarativeStream", @@ -676,7 +676,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, "path": "B", "http_method": "GET", }, @@ -720,7 +720,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, "path": "D", "http_method": "GET", }, @@ -742,7 +742,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/url_base"}, + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, "path": "E", "http_method": "GET", }, From 
66fe38ebdc951eef38e6164777819f2daff2baca Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 31 Mar 2025 19:33:03 +0300 Subject: [PATCH 07/27] add the reduce_commons: true, for Connector Builder case --- .../sources/declarative/manifest_declarative_source.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index ba0ea5eca..6a8c651b9 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -86,7 +86,10 @@ def __init__( # If custom components are needed, locate and/or register them. self.components_module: ModuleType | None = get_registered_components_module(config=config) - resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) + self._reduce_manifest_commons = True if emit_connector_builder_messages else False + resolved_source_config = ManifestReferenceResolver().preprocess_manifest( + manifest, self._reduce_manifest_commons + ) propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( "", resolved_source_config, {} ) From 8798042c27f2fba064c276f72c19ae53513e8541 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 31 Mar 2025 19:48:04 +0300 Subject: [PATCH 08/27] enabled the reduce_commons: True for Connector Builder case --- .../declarative/parsers/manifest_deduplicator.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py index 14193b869..576456d74 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py @@ -124,16 +124,17 @@ def _reference_schemas(manifest: ManifestType) -> None: """ # create the 
ref tag to each stream in the manifest - if STREAMS_TAG in manifest: + if STREAMS_TAG in manifest.keys(): for stream in manifest[STREAMS_TAG]: stream_name = stream.get("name") # reference the stream schema for the stream to where it's storred - if stream_name in manifest[SCHEMAS_TAG].keys(): - stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = _create_schema_ref(stream_name) - else: - raise ManifestDeduplicationException( - f"Stream {stream_name} not found in `schemas`. Please check the manifest." - ) + if SCHEMAS_TAG in manifest.keys(): + if stream_name in manifest[SCHEMAS_TAG].keys(): + stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = _create_schema_ref(stream_name) + else: + raise ManifestDeduplicationException( + f"Stream {stream_name} not found in `schemas`. Please check the manifest." + ) def _replace_duplicates_with_refs(manifest: ManifestType, duplicates: DuplicatesType) -> None: From 1d425ee11bdb601a60e1d7121684bfabdac41640 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 1 Apr 2025 12:49:07 +0300 Subject: [PATCH 09/27] refactorred and cleaned up the code, moved to use the class instead --- .../manifest_declarative_source.py | 54 ++- .../declarative/parsers/custom_exceptions.py | 2 +- .../parsers/manifest_deduplicator.py | 410 ---------------- .../parsers/manifest_normalizer.py | 451 ++++++++++++++++++ .../parsers/manifest_reference_resolver.py | 20 +- .../test_manifest_reference_resolver.py | 24 +- 6 files changed, 506 insertions(+), 455 deletions(-) delete mode 100644 airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py create mode 100644 airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 6a8c651b9..6249e91d3 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -39,6 +39,9 @@ from 
airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import ( ManifestComponentTransformer, ) +from airbyte_cdk.sources.declarative.parsers.manifest_normalizer import ( + ManifestNormalizer, +) from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ( ManifestReferenceResolver, ) @@ -57,6 +60,24 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException +def _get_declarative_component_schema() -> Dict[str, Any]: + try: + raw_component_schema = pkgutil.get_data( + "airbyte_cdk", "sources/declarative/declarative_component_schema.yaml" + ) + if raw_component_schema is not None: + declarative_component_schema = yaml.load(raw_component_schema, Loader=yaml.SafeLoader) + return declarative_component_schema # type: ignore + else: + raise RuntimeError( + "Failed to read manifest component json schema required for deduplication" + ) + except FileNotFoundError as e: + raise FileNotFoundError( + f"Failed to read manifest component json schema required for deduplication: {e}" + ) + + class ManifestDeclarativeSource(DeclarativeSource): """Declarative source defined by a manifest of low-code components that define source connector behavior""" @@ -78,6 +99,8 @@ def __init__( component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked. """ self.logger = logging.getLogger(f"airbyte.{self.name}") + + self._declarative_component_schema = _get_declarative_component_schema() # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing manifest = dict(source_config) if "type" not in manifest: @@ -86,10 +109,15 @@ def __init__( # If custom components are needed, locate and/or register them. 
self.components_module: ModuleType | None = get_registered_components_module(config=config) - self._reduce_manifest_commons = True if emit_connector_builder_messages else False - resolved_source_config = ManifestReferenceResolver().preprocess_manifest( - manifest, self._reduce_manifest_commons - ) + resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) + + if emit_connector_builder_messages: + # reduce commonalities in the manifest after the references have been resolved, + # used mostly for Connector Builder use cases. + resolved_source_config = ManifestNormalizer( + resolved_source_config, self._declarative_component_schema + ).normalize() + propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( "", resolved_source_config, {} ) @@ -269,22 +297,6 @@ def _validate_source(self) -> None: """ Validates the connector manifest against the declarative component schema """ - try: - raw_component_schema = pkgutil.get_data( - "airbyte_cdk", "sources/declarative/declarative_component_schema.yaml" - ) - if raw_component_schema is not None: - declarative_component_schema = yaml.load( - raw_component_schema, Loader=yaml.SafeLoader - ) - else: - raise RuntimeError( - "Failed to read manifest component json schema required for validation" - ) - except FileNotFoundError as e: - raise FileNotFoundError( - f"Failed to read manifest component json schema required for validation: {e}" - ) streams = self._source_config.get("streams") dynamic_streams = self._source_config.get("dynamic_streams") @@ -294,7 +306,7 @@ def _validate_source(self) -> None: ) try: - validate(self._source_config, declarative_component_schema) + validate(self._source_config, self._declarative_component_schema) except ValidationError as e: raise ValidationError( "Validation against json schema defined in declarative_component_schema.yaml schema failed" diff --git a/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py 
b/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py index b847ee26c..6c5847d3f 100644 --- a/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +++ b/airbyte_cdk/sources/declarative/parsers/custom_exceptions.py @@ -21,7 +21,7 @@ def __init__(self, path: str, reference: str) -> None: super().__init__(f"Undefined reference {reference} from {path}") -class ManifestDeduplicationException(Exception): +class ManifestNormalizationException(Exception): """ Raised when a circular reference is detected in a manifest. """ diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py b/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py deleted file mode 100644 index 576456d74..000000000 --- a/airbyte_cdk/sources/declarative/parsers/manifest_deduplicator.py +++ /dev/null @@ -1,410 +0,0 @@ -# -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. -# - -import copy -import hashlib -import json -import pkgutil -from collections import defaultdict -from typing import Any, DefaultDict, Dict, List, Optional, Tuple - -import yaml - -from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ManifestDeduplicationException - -# Type definitions for better readability -ManifestType = Dict[str, Any] -DefinitionsType = Dict[str, Any] -DuplicateOccurancesType = List[Tuple[List[str], Dict[str, Any], Dict[str, Any]]] -DuplicatesType = DefaultDict[str, DuplicateOccurancesType] - -# Configuration constants -N_OCCURANCES = 2 - -DEF_TAG = "definitions" -STREAMS_TAG = "streams" -SHARED_TAG = "shared" -SHARABLE_TAG = "sharable" -SCHEMA_LOADER_TAG = "schema_loader" -SCHEMAS_TAG = "schemas" -SCHEMA_TAG = "schema" -PROPERTIES_TAG = "properties" - - -def deduplicate_minifest(resolved_manifest: ManifestType) -> ManifestType: - """ - Find commonalities in the input JSON structure and refactor it to avoid redundancy. - - Args: - resolved_manifest: A dictionary representing a JSON structure to be analyzed. 
- - Returns: - A refactored JSON structure with common properties extracted to `definitions.shared`, - the duplicated properties replaced with references - """ - - try: - _manifest = copy.deepcopy(resolved_manifest) - - # prepare the `definitions` tag - _prepare_definitions(_manifest) - # replace duplicates with references, if any - _handle_duplicates(_manifest, _collect_duplicates(_manifest)) - # post processing the manifest - _reference_schemas(_manifest) - - return _manifest - except ManifestDeduplicationException: - # if any error occurs, we just return the original manifest. - return resolved_manifest - - -def _get_sharable_tags() -> List[str]: - # we need to recursively find the tags in the component schema that has a `sharable` key inside. - tags = [] - - # we need to find the `sharable` tags in the component schema - def _find_sharable(schema: Dict[str, Any]) -> None: - for root_key, root_value in schema.items(): - properties = root_value.get(PROPERTIES_TAG, {}) - - for inner_key, inner_value in properties.items(): - if SHARABLE_TAG in inner_value.keys(): - tags.append(inner_key) - - _find_sharable(_get_declarative_component_schema().get(DEF_TAG, {})) - - # return unique tags only - return list(set(tags)) - - -def _get_declarative_component_schema() -> Dict[str, Any]: - try: - raw_component_schema = pkgutil.get_data( - "airbyte_cdk", "sources/declarative/declarative_component_schema.yaml" - ) - if raw_component_schema is not None: - declarative_component_schema = yaml.load(raw_component_schema, Loader=yaml.SafeLoader) - return declarative_component_schema # type: ignore - else: - raise RuntimeError( - "Failed to read manifest component json schema required for deduplication" - ) - except FileNotFoundError as e: - raise FileNotFoundError( - f"Failed to read manifest component json schema required for deduplication: {e}" - ) - - -def _prepare_definitions(manifest: ManifestType) -> None: - """ - Clean the definitions in the manifest by removing unnecessary 
properties. - This function modifies the manifest in place. - Args: - manifest: The manifest to clean - """ - # Check if the definitions tag exists - if not DEF_TAG in manifest: - manifest[DEF_TAG] = {} - - # remove everything from definitions tag except of `shared`, after processing - for key in list(manifest[DEF_TAG].keys()): - if key != SHARED_TAG: - manifest[DEF_TAG].pop(key, None) - - -def _reference_schemas(manifest: ManifestType) -> None: - """ - Process the definitions in the manifest to move streams from definitions to the main stream list. - This function modifies the manifest in place. - - Args: - manifest: The manifest to process - """ - - # create the ref tag to each stream in the manifest - if STREAMS_TAG in manifest.keys(): - for stream in manifest[STREAMS_TAG]: - stream_name = stream.get("name") - # reference the stream schema for the stream to where it's storred - if SCHEMAS_TAG in manifest.keys(): - if stream_name in manifest[SCHEMAS_TAG].keys(): - stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = _create_schema_ref(stream_name) - else: - raise ManifestDeduplicationException( - f"Stream {stream_name} not found in `schemas`. Please check the manifest." - ) - - -def _replace_duplicates_with_refs(manifest: ManifestType, duplicates: DuplicatesType) -> None: - """ - Process duplicate objects and replace them with references. 
- - Args: - definitions: The definitions dictionary to modify - """ - for _, occurrences in duplicates.items(): - # take the component's name as the last part of it's path - type_key, key, value = _get_key_value_from_occurances(occurrences) - is_shared_def = _is_shared_definition(manifest, type_key, key) - - # Add to definitions if not there already - if not is_shared_def: - _add_to_shared_definitions(manifest, type_key, key, value) - - # Replace occurrences with references - for path, parent_obj, value in occurrences: - if is_shared_def: - if value == _get_shared_definition_value(manifest, type_key, key): - parent_obj[key] = _create_shared_definition_ref(type_key, key) - else: - parent_obj[key] = _create_shared_definition_ref(type_key, key) - - -def _handle_duplicates(manifest: DefinitionsType, duplicates: DuplicatesType) -> None: - """ - Process the duplicates and replace them with references. - - Args: - duplicates: Dictionary of duplicate objects - """ - - if len(duplicates) > 0: - # Check if the shared tag exists - if not SHARED_TAG in manifest[DEF_TAG]: - manifest[DEF_TAG][SHARED_TAG] = {} - - try: - _replace_duplicates_with_refs(manifest, duplicates) - except Exception as e: - raise ManifestDeduplicationException(str(e)) - - -def _add_duplicate( - duplicates: DuplicatesType, - current_path: List[str], - obj: Dict[str, Any], - value: Any, - key: Optional[str] = None, -) -> None: - """ - Adds a duplicate record of an observed object by computing a unique hash for the provided value. - - This function computes a hash for the given value (or a dictionary composed of the key and value if a key is provided) - and appends a tuple containing the current path, the original object, and the value to the duplicates - dictionary under the corresponding hash. - - Parameters: - duplicates (DuplicatesType): The dictionary to store duplicate records. - current_path (List[str]): The list of keys or indices representing the current location in the object hierarchy. 
- obj (Dict): The original dictionary object where the duplicate is observed. - value (Any): The value to be hashed and used for identifying duplicates. - key (Optional[str]): An optional key that, if provided, wraps the value in a dictionary before hashing. - """ - # create hash for the duplicate observed - value_to_hash = value if key is None else {key: value} - obj_hash = _hash_object(value_to_hash) - if obj_hash: - duplicates[obj_hash].append((current_path, obj, value)) - - -def _add_to_shared_definitions( - manifest: DefinitionsType, - type_key: str, - key: str, - value: Any, -) -> DefinitionsType: - """ - Add a value to the shared definitions under the specified key. - - Args: - definitions: The definitions dictionary to modify - key: The key to use - value: The value to add - """ - if type_key not in manifest[DEF_TAG][SHARED_TAG].keys(): - manifest[DEF_TAG][SHARED_TAG][type_key] = {} - - if key not in manifest[DEF_TAG][SHARED_TAG][type_key].keys(): - manifest[DEF_TAG][SHARED_TAG][type_key][key] = value - - return manifest - - -def _collect_duplicates(node: ManifestType) -> DuplicatesType: - """ - Traverse the JSON object and collect all potential duplicate values and objects. - - Args: - node: The JSON object to analyze. - - Returns: - duplicates: A dictionary of duplicate objects. - """ - - def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: - """ - The closure to recursively collect duplicates in the JSON object. - - Args: - obj: The current object being analyzed. - path: The current path in the object hierarchy. 
- """ - - if not isinstance(obj, dict): - return - - path = [] if path is None else path - # Check if the object is empty - for key, value in obj.items(): - # do not collect duplicates from `definitions` tag - if key == DEF_TAG: - continue - - current_path = path + [key] - - if isinstance(value, dict): - # First process nested dictionaries - _collect(value, current_path) - # Process allowed-only component tags - if key in sharable_tags: - _add_duplicate(duplicates, current_path, obj, value) - - # handle primitive types - elif isinstance(value, (str, int, float, bool)): - # Process allowed-only field tags - if key in sharable_tags: - _add_duplicate(duplicates, current_path, obj, value, key) - - # handle list cases - elif isinstance(value, list): - for i, item in enumerate(value): - _collect(item, current_path + [str(i)]) - - # get the tags marked as `sharable` in the component schema - sharable_tags = _get_sharable_tags() - - duplicates: DuplicatesType = defaultdict(list, {}) - try: - if sharable_tags: - _collect(node) - # clean non-duplicates and sort based on the count of occurrences - return clean_and_sort_duplicates(duplicates) - return duplicates - except Exception as e: - raise ManifestDeduplicationException(str(e)) - - -def clean_and_sort_duplicates(duplicates: DuplicatesType) -> DuplicatesType: - """ - Clean non-duplicates and sort the duplicates by their occurrences. - - Args: - duplicates: The duplicates dictionary to sort - Returns: - A sorted duplicates dictionary - """ - # clean non-duplicates - duplicates = defaultdict(list, {k: v for k, v in duplicates.items() if len(v) >= N_OCCURANCES}) - # sort the duplicates by their occurrences - return defaultdict( - list, {k: v for k, v in sorted(duplicates.items(), key=lambda x: len(x[1]), reverse=True)} - ) - - -def _hash_object(node: Dict[str, Any]) -> Optional[str]: - """ - Create a unique hash for a dictionary object. 
- - Args: - node: The dictionary to hash - - Returns: - A hash string or None if not hashable - """ - if isinstance(node, Dict): - # Sort keys to ensure consistent hash for same content - return hashlib.md5(json.dumps(node, sort_keys=True).encode()).hexdigest() - return None - - -def _is_shared_definition(manifest: DefinitionsType, type_key: str, key: str) -> bool: - """ - Check if the key already exists in the shared definitions. - - Args: - key: The key to check - definitions: The definitions dictionary with definitions - - Returns: - True if the key exists in the shared definitions, False otherwise - """ - - if type_key in manifest[DEF_TAG][SHARED_TAG].keys(): - # Check if the key exists in the shared definitions - if key in manifest[DEF_TAG][SHARED_TAG][type_key].keys(): - return True - - return False - - -def _get_shared_definition_value(manifest: DefinitionsType, type_key: str, key: str) -> Any: - """ - Get the value of a shared definition by its key. - - Args: - key: The key to check - definitions: The definitions dictionary with definitions - Returns: - The value of the shared definition - """ - if type_key in manifest[DEF_TAG][SHARED_TAG].keys(): - if key in manifest[DEF_TAG][SHARED_TAG][type_key].keys(): - return manifest[DEF_TAG][SHARED_TAG][type_key][key] - else: - raise ManifestDeduplicationException( - f"Key {key} not found in shared definitions. Please check the manifest." - ) - - -def _get_key_value_from_occurances(occurrences: DuplicateOccurancesType) -> Tuple[str, str, Any]: - """ - Get the key from the occurrences list. 
- - Args: - occurrences: The occurrences list - - Returns: - The key, type and value from the occurrences - """ - - # Take the value from the first occurrence, as they are the same - path, obj, value = occurrences[0] - return obj["type"], path[-1], value # Return the component's name as the last part of its path - - -def _create_shared_definition_ref(type_key: str, key: str) -> Dict[str, str]: - """ - Create a reference object for the shared definitions using the specified key. - - Args: - ref_key: The reference key to use - - Returns: - A reference object in the proper format - """ - return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{type_key}/{key}"} - - -def _create_schema_ref(ref_key: str) -> Dict[str, str]: - """ - Create a reference object for stream schema using the specified key. - - Args: - ref_key: The reference key to use - - Returns: - A reference object in the proper format - """ - return {"$ref": f"#/{SCHEMAS_TAG}/{ref_key}"} diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py new file mode 100644 index 000000000..7010a23eb --- /dev/null +++ b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py @@ -0,0 +1,451 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +import copy +import hashlib +import json +from collections import defaultdict +from itertools import chain +from typing import Any, Callable, DefaultDict, Dict, Iterable, List, Optional, Tuple + +from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ManifestNormalizationException + +# Type definitions for better readability +ManifestType = Dict[str, Any] +DefinitionsType = Dict[str, Any] +DuplicateOccurancesType = List[Tuple[List[str], Dict[str, Any], Dict[str, Any]]] +DuplicatesType = DefaultDict[str, DuplicateOccurancesType] + +# Configuration constants +N_OCCURANCES = 2 +DEF_TAG = "definitions" +STREAMS_TAG = "streams" +SHARED_TAG = "shared" +SHARABLE_TAG = "sharable" +SCHEMA_LOADER_TAG = "schema_loader" +SCHEMAS_TAG = "schemas" +SCHEMA_TAG = "schema" +PROPERTIES_TAG = "properties" + + +def _get_sharable_schema_tags(schema: DefinitionsType) -> List[str]: + """ + Extracts sharable tags from schema definitions. + This function identifies properties within a schema's definitions that are marked as sharable. + It traverses through each definition in the schema, examines its properties, and collects + the keys of properties that contain the SHARABLE_TAG. 
+ + Args: + schema (DefinitionsType): The schema definition dictionary to process + + Returns: + List[str]: A deduplicated list of property keys that are marked as sharable + """ + + # the sharable scope: ['definitions.*'] + schema_definitions = schema.get(DEF_TAG, {}) + + sharable_tags: List[str] = [] + # Extract sharable keys from properties + + extract_sharable_keys: Callable[[Dict[str, Dict[str, Any]]], List[str]] = lambda properties: [ + key for key, value in properties.items() if SHARABLE_TAG in value.keys() + ] + + # Process each root value to get its sharable keys + process_root: Callable[[Dict[str, Any]], List[str]] = lambda root_value: extract_sharable_keys( + root_value.get(PROPERTIES_TAG, {}) + ) + + # Map the process_root function over all schema values and flatten the results + all_sharable_tags = chain.from_iterable(map(process_root, schema_definitions.values())) + + # Add all found sharable tags to the tags list + sharable_tags.extend(all_sharable_tags) + + # return unique tags only + return list(set(sharable_tags)) + + +class ManifestNormalizer: + """ + This class is responsible for normalizing the manifest by applying processing such as: + - removing duplicated definitions + - replacing them with references. + + To extend the functionality, use the `normalize()` method to include any additional processing steps. + """ + + def __init__( + self, + resolved_manifest: ManifestType, + declarative_schema: DefinitionsType, + ) -> None: + self._resolved_manifest = resolved_manifest + self._declarative_schema = declarative_schema + # get the tags marked as `sharable` in the component schema + self._sharable_tags = _get_sharable_schema_tags(self._declarative_schema) + self._normalized_manifest: ManifestType = copy.deepcopy(self._resolved_manifest) + + def normalize(self) -> ManifestType: + """ + Normalizes the manifest by deduplicating and resolving schema references. + + This method processes the manifest in two steps: + 1.
Deduplicates elements within the manifest + 2. Resolves and references schemas + + Returns: + ManifestType: The normalized manifest if processing succeeds, + or the original resolved manifest if normalization fails. + + Raises: + ManifestNormalizationException: Caught internally and handled by returning the original manifest. + """ + try: + # process the manifest + self._deduplicate_minifest() + # post processing the manifest + self._reference_schemas() + + return self._normalized_manifest + except ManifestNormalizationException: + # if any error occurs, we just return the original manifest. + return self._resolved_manifest + + def _get_manifest_streams(self) -> Iterable[Dict[str, Any]]: + """ + Get the streams from the manifest. + + Returns: + A list of streams + """ + if STREAMS_TAG in self._normalized_manifest.keys(): + for stream in self._normalized_manifest[STREAMS_TAG]: + yield stream + + yield from [] + + def _deduplicate_minifest(self) -> None: + """ + Find commonalities in the input JSON structure and refactor it to avoid redundancy. + + Args: + resolved_manifest: A dictionary representing a JSON structure to be analyzed. + + Returns: + A refactored JSON structure with common properties extracted to `definitions.shared`, + the duplicated properties replaced with references + """ + try: + # prepare the `definitions` tag + self._prepare_definitions() + # replace duplicates with references, if any + self._handle_duplicates( + self._collect_duplicates(), + ) + except Exception as e: + raise ManifestNormalizationException(str(e)) + + def _prepare_definitions(self) -> None: + """ + Clean the definitions in the manifest by removing unnecessary properties. + This function modifies the manifest in place. 
+ + Args: + manifest: The manifest to clean + """ + # Check if the definitions tag exists + if not DEF_TAG in self._normalized_manifest: + self._normalized_manifest[DEF_TAG] = {} + + # Check if the shared tag exists + if not SHARED_TAG in self._normalized_manifest[DEF_TAG]: + self._normalized_manifest[DEF_TAG][SHARED_TAG] = {} + + # remove everything from definitions tag except of `shared`, after processing + for key in list(self._normalized_manifest[DEF_TAG].keys()): + if key != SHARED_TAG: + self._normalized_manifest[DEF_TAG].pop(key, None) + + def _reference_schemas(self) -> None: + """ + Process the definitions in the manifest to move streams from definitions to the main stream list. + This function modifies the manifest in place. + + Args: + manifest: The manifest to process + """ + + # reference the stream schema for the stream to where it's stored + if SCHEMAS_TAG in self._normalized_manifest.keys(): + for stream in self._get_manifest_streams(): + stream_name = stream["name"] + + if stream_name not in self._normalized_manifest[SCHEMAS_TAG].keys(): + raise ManifestNormalizationException( + f"Stream {stream_name} not found in `schemas`. Please check the manifest." + ) + + stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = self._create_schema_ref(stream_name) + + def _replace_duplicates_with_refs(self, duplicates: DuplicatesType) -> None: + """ + Process duplicate objects and replace them with references. 
+ + Args: + definitions: The definitions dictionary to modify + """ + for _, occurrences in duplicates.items(): + type_key, key, value = self._get_occurance_samples(occurrences) + is_shared_def = self._is_shared_definition(type_key, key) + + # Add to definitions if not there already + if not is_shared_def: + self._add_to_shared_definitions(type_key, key, value) + + # Replace occurrences with references + for path, parent_obj, value in occurrences: + if is_shared_def: + if value == self._get_shared_definition_value(type_key, key): + parent_obj[key] = self._create_shared_definition_ref(type_key, key) + else: + parent_obj[key] = self._create_shared_definition_ref(type_key, key) + + def _handle_duplicates(self, duplicates: DuplicatesType) -> None: + """ + Process the duplicates and replace them with references. + + Args: + duplicates: Dictionary of duplicate objects + """ + + if len(duplicates) > 0: + self._replace_duplicates_with_refs(duplicates) + + def _add_duplicate( + self, + duplicates: DuplicatesType, + current_path: List[str], + obj: Dict[str, Any], + value: Any, + key: Optional[str] = None, + ) -> None: + """ + Adds a duplicate record of an observed object by computing a unique hash for the provided value. + + This function computes a hash for the given value (or a dictionary composed of the key and value if a key is provided) + and appends a tuple containing the current path, the original object, and the value to the duplicates + dictionary under the corresponding hash. + + Parameters: + duplicates (DuplicatesType): The dictionary to store duplicate records. + current_path (List[str]): The list of keys or indices representing the current location in the object hierarchy. + obj (Dict): The original dictionary object where the duplicate is observed. + value (Any): The value to be hashed and used for identifying duplicates. + key (Optional[str]): An optional key that, if provided, wraps the value in a dictionary before hashing. 
+ """ + # create hash for each duplicate observed + value_to_hash = value if key is None else {key: value} + duplicates[self._hash_object(value_to_hash)].append((current_path, obj, value)) + + def _add_to_shared_definitions( + self, + type_key: str, + key: str, + value: Any, + ) -> None: + """ + Add a value to the shared definitions under the specified key. + + Args: + definitions: The definitions dictionary to modify + key: The key to use + value: The value to add + """ + if type_key not in self._normalized_manifest[DEF_TAG][SHARED_TAG].keys(): + self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key] = {} + + if key not in self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key][key] = value + + def _collect_duplicates(self) -> DuplicatesType: + """ + Traverse the JSON object and collect all potential duplicate values and objects. + + Args: + node: The JSON object to analyze. + + Returns: + duplicates: A dictionary of duplicate objects. + """ + + def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: + """ + The closure to recursively collect duplicates in the JSON object. + + Args: + obj: The current object being analyzed. + path: The current path in the object hierarchy. 
+ """ + + if not isinstance(obj, dict): + return + + path = [] if path is None else path + # Check if the object is empty + for key, value in obj.items(): + # do not collect duplicates from `definitions` tag + if key == DEF_TAG: + continue + + current_path = path + [key] + + if isinstance(value, dict): + # First process nested dictionaries + _collect(value, current_path) + # Process allowed-only component tags + if key in self._sharable_tags: + self._add_duplicate(duplicates, current_path, obj, value) + + # handle primitive types + elif isinstance(value, (str, int, float, bool)): + # Process allowed-only field tags + if key in self._sharable_tags: + self._add_duplicate(duplicates, current_path, obj, value, key) + + # handle list cases + elif isinstance(value, list): + for i, item in enumerate(value): + _collect(item, current_path + [str(i)]) + + duplicates: DuplicatesType = defaultdict(list, {}) + try: + if self._sharable_tags: + _collect(self._normalized_manifest) + # clean non-duplicates and sort based on the count of occurrences + return self._clean_and_sort_duplicates(duplicates) + return duplicates + except Exception as e: + raise ManifestNormalizationException(str(e)) + + def _clean_and_sort_duplicates(self, duplicates: DuplicatesType) -> DuplicatesType: + """ + Clean non-duplicates and sort the duplicates by their occurrences. + + Args: + duplicates: The duplicates dictionary to sort + + Returns: + A sorted duplicates dictionary + """ + + # clean non-duplicates + duplicates = defaultdict( + list, + {k: v for k, v in duplicates.items() if len(v) >= N_OCCURANCES}, + ) + + # sort the duplicates by their occurrences, more frequent ones go first + duplicates = defaultdict( + list, + {k: v for k, v in sorted(duplicates.items(), key=lambda x: len(x[1]), reverse=True)}, + ) + + return duplicates + + def _hash_object(self, obj: Dict[str, Any]) -> str: + """ + Create a unique hash for a dictionary object. 
+ + Args: + node: The dictionary to hash + + Returns: + A hashed string + """ + + # Sort keys to ensure consistent hash for same content + return hashlib.md5(json.dumps(obj, sort_keys=True).encode()).hexdigest() + + def _is_shared_definition(self, type_key: str, key: str) -> bool: + """ + Check if the key already exists in the shared definitions. + + Args: + key: The key to check + definitions: The definitions dictionary with definitions + + Returns: + True if the key exists in the shared definitions, False otherwise + """ + + if type_key in self._normalized_manifest[DEF_TAG][SHARED_TAG].keys(): + # Check if the key exists in the shared definitions + if key in self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + return True + + return False + + def _get_shared_definition_value(self, type_key: str, key: str) -> Any: + """ + Get the value of a shared definition by its key. + + Args: + key: The key to check + definitions: The definitions dictionary with definitions + + Returns: + The value of the shared definition + """ + if type_key in self._normalized_manifest[DEF_TAG][SHARED_TAG].keys(): + if key in self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + return self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key][key] + else: + raise ManifestNormalizationException( + f"Key {key} not found in shared definitions. Please check the manifest." + ) + + def _get_occurance_samples(self, occurrences: DuplicateOccurancesType) -> Tuple[str, str, Any]: + """ + Get the key from the occurrences list. 
+ + Args: + occurrences: The occurrences list + + Returns: + The key, type and value from the occurrences + """ + + # Take the value from the first occurrence, as they are the same + path, obj, value = occurrences[0] + return ( + obj["type"], + path[-1], + value, + ) # Return the component's name as the last part of its path + + def _create_shared_definition_ref(self, type_key: str, key: str) -> Dict[str, str]: + """ + Create a reference object for the shared definitions using the specified key. + + Args: + ref_key: The reference key to use + + Returns: + A reference object in the proper format + """ + return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{type_key}/{key}"} + + def _create_schema_ref(self, ref_key: str) -> Dict[str, str]: + """ + Create a reference object for stream schema using the specified key. + + Args: + ref_key: The reference key to use + + Returns: + A reference object in the proper format + """ + return {"$ref": f"#/{SCHEMAS_TAG}/{ref_key}"} diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 1e39bcd50..12e1444ad 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -3,13 +3,12 @@ # import re -from typing import Any, Mapping, Optional, Set, Tuple, Union +from typing import Any, Dict, Mapping, Set, Tuple, Union from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ( CircularReferenceException, UndefinedReferenceException, ) -from airbyte_cdk.sources.declarative.parsers.manifest_deduplicator import deduplicate_minifest REF_TAG = "$ref" @@ -100,25 +99,12 @@ class ManifestReferenceResolver: until we find a key with the given path, or until there is nothing to traverse. 
""" - def preprocess_manifest( - self, - manifest: Mapping[str, Any], - reduce_commons: Optional[bool] = False, - ) -> Mapping[str, Any]: + def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Dict[str, Any]: """ :param manifest: incoming manifest that could have references to previously defined components :param reduce_commons: whether to deduplicate the commonalities in the manifest after pre-processing. """ - preprocessed_manifest = self._evaluate_node(manifest, manifest, set()) - - # we need to reduce commonalities in the manifest after the references have been resolved, - # used mostly for Connector Builder use cases. - if reduce_commons: - deduplicated_manifest = deduplicate_minifest(preprocessed_manifest) - return deduplicated_manifest - - # return deduplicated_manifest - return preprocessed_manifest # type: ignore + return self._evaluate_node(manifest, manifest, set()) # type: ignore def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: if isinstance(node, dict): diff --git a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py index fcb933ec8..f7a8d145e 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py @@ -4,10 +4,16 @@ import pytest +from airbyte_cdk.sources.declarative.manifest_declarative_source import ( + _get_declarative_component_schema, +) from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ( CircularReferenceException, UndefinedReferenceException, ) +from airbyte_cdk.sources.declarative.parsers.manifest_normalizer import ( + ManifestNormalizer, +) from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ( ManifestReferenceResolver, _parse_path, @@ -166,7 +172,9 @@ def test_circular_reference(): resolver.preprocess_manifest(content) -def 
test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequent_is_shared(): +def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequent_is_shared() -> ( + None +): content = { "type": "DeclarativeSource", "definitions": { @@ -476,11 +484,13 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ }, }, } - config = resolver.preprocess_manifest(content, reduce_commons=True) - assert config == expected + resolved_manifest = resolver.preprocess_manifest(content) + schema = _get_declarative_component_schema() + normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + assert normalized_manifest == expected -def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): +def test_deduplicate_manifest_with_shared_definitions_url_base_are_present() -> None: content = { "type": "DeclarativeSource", "definitions": { @@ -791,5 +801,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present(): }, }, } - config = resolver.preprocess_manifest(content, reduce_commons=True) - assert config == expected + resolved_manifest = resolver.preprocess_manifest(content) + schema = _get_declarative_component_schema() + normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + assert normalized_manifest == expected From 06b183a9946cdaac31903ae7e5f4791a4f7d904b Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 1 Apr 2025 12:52:18 +0300 Subject: [PATCH 10/27] formatted --- .../declarative/parsers/manifest_reference_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 12e1444ad..2b73e5222 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -102,9 
+102,9 @@ class ManifestReferenceResolver: def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Dict[str, Any]: """ :param manifest: incoming manifest that could have references to previously defined components - :param reduce_commons: whether to deduplicate the commonalities in the manifest after pre-processing. + :return: a new manifest with all references resolved """ - return self._evaluate_node(manifest, manifest, set()) # type: ignore + return self._evaluate_node(manifest, manifest, set()) # type: ignore[no-any-return] def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any: if isinstance(node, dict): From 1fa891c600c916e0216392b085ee06e6bcab836d Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 1 Apr 2025 12:53:08 +0300 Subject: [PATCH 11/27] formatted --- .../sources/declarative/parsers/manifest_reference_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py index 2b73e5222..1c5ae0485 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py @@ -102,7 +102,7 @@ class ManifestReferenceResolver: def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Dict[str, Any]: """ :param manifest: incoming manifest that could have references to previously defined components - :return: a new manifest with all references resolved + :return: """ return self._evaluate_node(manifest, manifest, set()) # type: ignore[no-any-return] From 00e31a70d2a84006bc5c18e8fc80f282386518f4 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 1 Apr 2025 12:57:42 +0300 Subject: [PATCH 12/27] cleaned up --- .../sources/declarative/parsers/manifest_normalizer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git 
a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py index 7010a23eb..dc5b8a48d 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py @@ -104,9 +104,7 @@ def normalize(self) -> ManifestType: ManifestNormalizationException: Caught internally and handled by returning the original manifest. """ try: - # process the manifest self._deduplicate_minifest() - # post processing the manifest self._reference_schemas() return self._normalized_manifest @@ -142,9 +140,7 @@ def _deduplicate_minifest(self) -> None: # prepare the `definitions` tag self._prepare_definitions() # replace duplicates with references, if any - self._handle_duplicates( - self._collect_duplicates(), - ) + self._handle_duplicates(self._collect_duplicates()) except Exception as e: raise ManifestNormalizationException(str(e)) From a5aba82612bdd20d12e3067ade67b23ca23aac67 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 1 Apr 2025 16:31:02 +0300 Subject: [PATCH 13/27] added the dedicated tests --- .../sources/declarative/parsers/conftest.py | 642 ++++++++++++++++++ .../parsers/test_manifest_normalizer.py | 36 + .../test_manifest_reference_resolver.py | 641 ----------------- 3 files changed, 678 insertions(+), 641 deletions(-) create mode 100644 unit_tests/sources/declarative/parsers/conftest.py create mode 100644 unit_tests/sources/declarative/parsers/test_manifest_normalizer.py diff --git a/unit_tests/sources/declarative/parsers/conftest.py b/unit_tests/sources/declarative/parsers/conftest.py new file mode 100644 index 000000000..4b3312b2f --- /dev/null +++ b/unit_tests/sources/declarative/parsers/conftest.py @@ -0,0 +1,642 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + +from typing import Any, Dict + +import pytest + + +@pytest.fixture +def manifest_with_multiple_url_base() -> Dict[str, Any]: + return { + "type": "DeclarativeSource", + "definitions": { + "streams": { + "A": { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_A", + "path": "A", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/A"}, + }, + }, + "B": { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "B", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/B"}, + }, + }, + "C": { + "type": "DeclarativeStream", + "name": "C", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_A", + "path": "C", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/C"}, + }, + }, + "D": { + "type": "DeclarativeStream", + "name": "D", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "D", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": 
"InlineSchemaLoader", + "schema": {"$ref": "#/schemas/D"}, + }, + }, + "E": { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, + }, + }, + }, + # dummy requesters to be resolved and deduplicated + # to the shared `url_base` in the `definitions.shared` section + "requester_A": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + }, + "requester_B": { + "type": "HttpRequester", + "url_base": "https://example.com/v2/", + }, + }, + "streams": [ + {"$ref": "#/definitions/streams/A"}, + {"$ref": "#/definitions/streams/B"}, + {"$ref": "#/definitions/streams/C"}, + {"$ref": "#/definitions/streams/D"}, + {"$ref": "#/definitions/streams/E"}, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, + } + + +@pytest.fixture +def expected_manifest_with_multiple_url_base_normalized() -> Dict[str, Any]: + return { + "type": "DeclarativeSource", + "definitions": {"shared": {"HttpRequester": {"url_base": 
"https://example.com/v2/"}}}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "A", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/A"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "path": "B", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/B"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "C", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "C", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/C"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "D", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "path": "D", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + 
"schema": {"$ref": "#/schemas/D"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, + }, + }, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, + } + + +@pytest.fixture +def manifest_with_url_base_shared_definition() -> Dict[str, Any]: + return { + "type": "DeclarativeSource", + "definitions": { + "shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}, + "streams": { + "A": { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_A", + "path": "A", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/A"}, + }, + }, + 
"B": { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "B", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/B"}, + }, + }, + "C": { + "type": "DeclarativeStream", + "name": "C", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_A", + "path": "C", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/C"}, + }, + }, + "D": { + "type": "DeclarativeStream", + "name": "D", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "D", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/D"}, + }, + }, + "E": { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "$ref": "#/definitions/requester_B", + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, + }, + }, + }, + # dummy requesters to be resolved and deduplicated + # to the shared `url_base` in the `definitions.shared` section + "requester_A": { + "type": 
"HttpRequester", + "url_base": "https://example.com/v1/", + }, + "requester_B": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + }, + }, + "streams": [ + {"$ref": "#/definitions/streams/A"}, + {"$ref": "#/definitions/streams/B"}, + {"$ref": "#/definitions/streams/C"}, + {"$ref": "#/definitions/streams/D"}, + {"$ref": "#/definitions/streams/E"}, + ], + "schemas": { + "A": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, + } + + +@pytest.fixture +def expected_manifest_with_url_base_shared_definition_normalized() -> Dict[str, Any]: + return { + "type": "DeclarativeSource", + "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, + "streams": [ + { + "type": "DeclarativeStream", + "name": "A", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "A", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/A"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "B", + "retriever": { + "type": "SimpleRetriever", + "requester": { + 
"type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "path": "B", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/B"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "C", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://example.com/v1/", + "path": "C", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/C"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "D", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "path": "D", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/D"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "E", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "path": "E", + "http_method": "GET", + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + "decoder": {"type": "JsonDecoder"}, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/E"}, + }, + }, + ], + "schemas": { + "A": { + "type": "object", + "$schema": 
"http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "B": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "C": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "D": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + "E": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": True, + "properties": {}, + }, + }, + } diff --git a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py new file mode 100644 index 000000000..20fd4e1d0 --- /dev/null +++ b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
+# + + +from airbyte_cdk.sources.declarative.manifest_declarative_source import ( + _get_declarative_component_schema, +) +from airbyte_cdk.sources.declarative.parsers.manifest_normalizer import ( + ManifestNormalizer, +) +from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ( + ManifestReferenceResolver, +) + +resolver = ManifestReferenceResolver() + + +def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequent_is_shared( + manifest_with_multiple_url_base, + expected_manifest_with_multiple_url_base_normalized, +) -> None: + resolved_manifest = resolver.preprocess_manifest(manifest_with_multiple_url_base) + schema = _get_declarative_component_schema() + normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + assert normalized_manifest == expected_manifest_with_multiple_url_base_normalized + + +def test_deduplicate_manifest_with_shared_definitions_url_base_are_present( + manifest_with_url_base_shared_definition, + expected_manifest_with_url_base_shared_definition_normalized, +) -> None: + resolved_manifest = resolver.preprocess_manifest(manifest_with_url_base_shared_definition) + schema = _get_declarative_component_schema() + normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + assert normalized_manifest == expected_manifest_with_url_base_shared_definition_normalized diff --git a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py index f7a8d145e..29afa0e70 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py @@ -4,16 +4,10 @@ import pytest -from airbyte_cdk.sources.declarative.manifest_declarative_source import ( - _get_declarative_component_schema, -) from airbyte_cdk.sources.declarative.parsers.custom_exceptions import ( 
CircularReferenceException, UndefinedReferenceException, ) -from airbyte_cdk.sources.declarative.parsers.manifest_normalizer import ( - ManifestNormalizer, -) from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ( ManifestReferenceResolver, _parse_path, @@ -170,638 +164,3 @@ def test_circular_reference(): content = {"elem_ref1": "#/elem_ref2", "elem_ref2": "#/elem_ref1"} with pytest.raises(CircularReferenceException): resolver.preprocess_manifest(content) - - -def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequent_is_shared() -> ( - None -): - content = { - "type": "DeclarativeSource", - "definitions": { - "streams": { - "A": { - "type": "DeclarativeStream", - "name": "A", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_A", - "path": "A", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/A"}, - }, - }, - "B": { - "type": "DeclarativeStream", - "name": "B", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_B", - "path": "B", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/B"}, - }, - }, - "C": { - "type": "DeclarativeStream", - "name": "C", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_A", - "path": "C", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": 
"InlineSchemaLoader", - "schema": {"$ref": "#/schemas/C"}, - }, - }, - "D": { - "type": "DeclarativeStream", - "name": "D", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_B", - "path": "D", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/D"}, - }, - }, - "E": { - "type": "DeclarativeStream", - "name": "E", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_B", - "path": "E", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/E"}, - }, - }, - }, - # dummy requesters to be resolved and deduplicated - # to the shared `url_base` in the `definitions.shared` section - "requester_A": { - "type": "HttpRequester", - "url_base": "https://example.com/v1/", - }, - "requester_B": { - "type": "HttpRequester", - "url_base": "https://example.com/v2/", - }, - }, - "streams": [ - {"$ref": "#/definitions/streams/A"}, - {"$ref": "#/definitions/streams/B"}, - {"$ref": "#/definitions/streams/C"}, - {"$ref": "#/definitions/streams/D"}, - {"$ref": "#/definitions/streams/E"}, - ], - "schemas": { - "A": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "B": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "C": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "D": { - "type": "object", - 
"$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "E": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - }, - } - expected = { - "type": "DeclarativeSource", - "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, - "streams": [ - { - "type": "DeclarativeStream", - "name": "A", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://example.com/v1/", - "path": "A", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/A"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "B", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - "path": "B", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/B"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "C", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://example.com/v1/", - "path": "C", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/C"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "D", - "retriever": { - "type": "SimpleRetriever", - 
"requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - "path": "D", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/D"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "E", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - "path": "E", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/E"}, - }, - }, - ], - "schemas": { - "A": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "B": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "C": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "D": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "E": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - }, - } - resolved_manifest = resolver.preprocess_manifest(content) - schema = _get_declarative_component_schema() - normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() - assert normalized_manifest == expected - - -def test_deduplicate_manifest_with_shared_definitions_url_base_are_present() -> None: - 
content = { - "type": "DeclarativeSource", - "definitions": { - "shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}, - "streams": { - "A": { - "type": "DeclarativeStream", - "name": "A", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_A", - "path": "A", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/A"}, - }, - }, - "B": { - "type": "DeclarativeStream", - "name": "B", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_B", - "path": "B", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/B"}, - }, - }, - "C": { - "type": "DeclarativeStream", - "name": "C", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_A", - "path": "C", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/C"}, - }, - }, - "D": { - "type": "DeclarativeStream", - "name": "D", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_B", - "path": "D", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/D"}, - }, - }, - "E": 
{ - "type": "DeclarativeStream", - "name": "E", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "$ref": "#/definitions/requester_B", - "path": "E", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/E"}, - }, - }, - }, - # dummy requesters to be resolved and deduplicated - # to the shared `url_base` in the `definitions.shared` section - "requester_A": { - "type": "HttpRequester", - "url_base": "https://example.com/v1/", - }, - "requester_B": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - }, - }, - "streams": [ - {"$ref": "#/definitions/streams/A"}, - {"$ref": "#/definitions/streams/B"}, - {"$ref": "#/definitions/streams/C"}, - {"$ref": "#/definitions/streams/D"}, - {"$ref": "#/definitions/streams/E"}, - ], - "schemas": { - "A": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "B": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "C": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "D": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "E": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - }, - } - expected = { - "type": "DeclarativeSource", - "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, - "streams": [ - { - "type": "DeclarativeStream", - "name": "A", - "retriever": { - "type": "SimpleRetriever", - 
"requester": { - "type": "HttpRequester", - "url_base": "https://example.com/v1/", - "path": "A", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/A"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "B", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - "path": "B", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/B"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "C", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": "https://example.com/v1/", - "path": "C", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/C"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "D", - "retriever": { - "type": "SimpleRetriever", - "requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - "path": "D", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/D"}, - }, - }, - { - "type": "DeclarativeStream", - "name": "E", - "retriever": { - "type": "SimpleRetriever", - 
"requester": { - "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, - "path": "E", - "http_method": "GET", - }, - "record_selector": { - "type": "RecordSelector", - "extractor": {"type": "DpathExtractor", "field_path": []}, - }, - "decoder": {"type": "JsonDecoder"}, - }, - "schema_loader": { - "type": "InlineSchemaLoader", - "schema": {"$ref": "#/schemas/E"}, - }, - }, - ], - "schemas": { - "A": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "B": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "C": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "D": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - "E": { - "type": "object", - "$schema": "http://json-schema.org/draft-07/schema#", - "additionalProperties": True, - "properties": {}, - }, - }, - } - resolved_manifest = resolver.preprocess_manifest(content) - schema = _get_declarative_component_schema() - normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() - assert normalized_manifest == expected From 9f7d498310a2f5996ab0219f8e44b85f429d0817 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Wed, 2 Apr 2025 17:44:30 +0300 Subject: [PATCH 14/27] formatted --- .../parsers/manifest_normalizer.py | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py index dc5b8a48d..962120122 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py @@ -74,7 +74,7 @@ 
class ManifestNormalizer: - removing duplicated definitions - replacing them with references. - To extend the functionality, use the `normilize()` method to include any additional processing steps. + To extend the functionality, use the `normilize()` method to include any additional processing steps. """ def __init__( @@ -117,8 +117,9 @@ def _get_manifest_streams(self) -> Iterable[Dict[str, Any]]: Get the streams from the manifest. Returns: - A list of streams + An Iterable of streams. """ + if STREAMS_TAG in self._normalized_manifest.keys(): for stream in self._normalized_manifest[STREAMS_TAG]: yield stream @@ -128,14 +129,8 @@ def _get_manifest_streams(self) -> Iterable[Dict[str, Any]]: def _deduplicate_minifest(self) -> None: """ Find commonalities in the input JSON structure and refactor it to avoid redundancy. - - Args: - resolved_manifest: A dictionary representing a JSON structure to be analyzed. - - Returns: - A refactored JSON structure with common properties extracted to `definitions.shared`, - the duplicated properties replaced with references """ + try: # prepare the `definitions` tag self._prepare_definitions() @@ -148,10 +143,8 @@ def _prepare_definitions(self) -> None: """ Clean the definitions in the manifest by removing unnecessary properties. This function modifies the manifest in place. - - Args: - manifest: The manifest to clean """ + # Check if the definitions tag exists if not DEF_TAG in self._normalized_manifest: self._normalized_manifest[DEF_TAG] = {} @@ -169,9 +162,6 @@ def _reference_schemas(self) -> None: """ Process the definitions in the manifest to move streams from definitions to the main stream list. This function modifies the manifest in place. - - Args: - manifest: The manifest to process """ # reference the stream schema for the stream to where it's stored @@ -191,8 +181,9 @@ def _replace_duplicates_with_refs(self, duplicates: DuplicatesType) -> None: Process duplicate objects and replace them with references. 
Args: - definitions: The definitions dictionary to modify + duplicates: The duplicates dictionary collected from the given manifest. """ + for _, occurrences in duplicates.items(): type_key, key, value = self._get_occurance_samples(occurrences) is_shared_def = self._is_shared_definition(type_key, key) @@ -214,7 +205,7 @@ def _handle_duplicates(self, duplicates: DuplicatesType) -> None: Process the duplicates and replace them with references. Args: - duplicates: Dictionary of duplicate objects + duplicates: The duplicates dictionary collected from the given manifest. """ if len(duplicates) > 0: @@ -242,6 +233,7 @@ def _add_duplicate( value (Any): The value to be hashed and used for identifying duplicates. key (Optional[str]): An optional key that, if provided, wraps the value in a dictionary before hashing. """ + # create hash for each duplicate observed value_to_hash = value if key is None else {key: value} duplicates[self._hash_object(value_to_hash)].append((current_path, obj, value)) @@ -334,7 +326,7 @@ def _clean_and_sort_duplicates(self, duplicates: DuplicatesType) -> DuplicatesTy duplicates: The duplicates dictionary to sort Returns: - A sorted duplicates dictionary + A sorted duplicates dictionary. 
""" # clean non-duplicates @@ -432,6 +424,7 @@ def _create_shared_definition_ref(self, type_key: str, key: str) -> Dict[str, st Returns: A reference object in the proper format """ + return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{type_key}/{key}"} def _create_schema_ref(self, ref_key: str) -> Dict[str, str]: @@ -444,4 +437,5 @@ def _create_schema_ref(self, ref_key: str) -> Dict[str, str]: Returns: A reference object in the proper format """ + return {"$ref": f"#/{SCHEMAS_TAG}/{ref_key}"} From 6ec240ab6c643665f66a94c93b8b8567af35905d Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 8 Apr 2025 12:50:59 +0300 Subject: [PATCH 15/27] updated normalizer --- .../declarative_component_schema.yaml | 9 ++- .../manifest_declarative_source.py | 56 +++++++++++++------ .../models/declarative_component_schema.py | 6 +- .../parsers/manifest_normalizer.py | 9 ++- .../declarative/interpolation/test_macros.py | 1 - .../parsers/test_manifest_normalizer.py | 16 +++++- 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 95d8ceba6..156b3d3c0 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1604,7 +1604,6 @@ definitions: title: URL Base description: The base URL (scheme and host, e.g. "https://api.example.com") to match. type: string - sharable: True url_path_pattern: title: URL Path Pattern description: A regular expression pattern to match the URL path. @@ -1846,8 +1845,9 @@ definitions: type: string enum: [HttpRequester] url_base: + sharable: true title: API Base URL - description: Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. + description: The Base URL of the API source. Do not put sensitive information (e.g. 
API tokens) into this field - Use the Authentication component for this. type: string interpolation_context: - config @@ -1863,10 +1863,9 @@ definitions: - "{{ config['base_url'] or 'https://app.posthog.com'}}/api" - "https://connect.squareup.com/v2/quotes/{{ stream_partition['id'] }}/quote_line_groups" - "https://example.com/api/v1/resource/{{ next_page_token['id'] }}" - sharable: True path: title: URL Path - description: Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. + description: The Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. type: string interpolation_context: - config @@ -1882,6 +1881,7 @@ definitions: - "/quotes/{{ stream_partition['id'] }}/quote_line_groups" - "/trades/{{ config['symbol_id'] }}/history" authenticator: + sharable: true title: Authenticator description: Authentication method to use for requests sent to the API. anyOf: @@ -1895,7 +1895,6 @@ definitions: - "$ref": "#/definitions/SessionTokenAuthenticator" - "$ref": "#/definitions/LegacySessionTokenAuthenticator" - "$ref": "#/definitions/SelectiveAuthenticator" - sharable: True error_handler: title: Error Handler description: Error handler component that defines how to handle errors. 
diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 578b0f655..9d8f5b536 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -29,7 +29,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( DeclarativeStream as DeclarativeStreamModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + Spec as SpecModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( StateDelegatingStream as StateDelegatingStreamModel, ) @@ -89,7 +91,7 @@ def __init__( debug: bool = False, emit_connector_builder_messages: bool = False, component_factory: Optional[ModelToComponentFactory] = None, - ): + ) -> None: """ Args: config: The provided config dict. @@ -109,18 +111,24 @@ def __init__( # If custom components are needed, locate and/or register them. self.components_module: ModuleType | None = get_registered_components_module(config=config) + # resolve all `$ref` references in the manifest resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) + # resolve all components in the manifest + propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( + "", resolved_source_config, {} + ) if emit_connector_builder_messages: - # reduce commonalities in the manifest after the references have been resolved, - # used mostly for Connector Builder use cases. - resolved_source_config = ManifestNormalizer( - resolved_source_config, self._declarative_component_schema + # Connector Builder Ui rendering requires the manifest to be in a specific format. 
+ # 1) references have been resolved + # 2) deprecated fields have been migrated + # 3) the commonly used definitions are extracted to the `definitions.shared.*` + # 4) ! the normalized manifest could be validated after the additional UI post-processing. + propagated_source_config = ManifestNormalizer( + propagated_source_config, + self._declarative_component_schema, ).normalize() - propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( - "", resolved_source_config, {} - ) self._source_config = propagated_source_config self._debug = debug self._emit_connector_builder_messages = emit_connector_builder_messages @@ -151,7 +159,9 @@ def message_repository(self) -> MessageRepository: @property def dynamic_streams(self) -> List[Dict[str, Any]]: return self._dynamic_stream_configs( - manifest=self._source_config, config=self._config, with_dynamic_stream_name=True + manifest=self._source_config, + config=self._config, + with_dynamic_stream_name=True, ) @property @@ -174,7 +184,10 @@ def connection_checker(self) -> ConnectionChecker: def streams(self, config: Mapping[str, Any]) -> List[Stream]: self._emit_manifest_debug_message( - extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} + extra_args={ + "source_name": self.name, + "parsed_config": json.dumps(self._source_config), + } ) stream_configs = self._stream_configs(self._source_config) + self._dynamic_stream_configs( @@ -187,9 +200,11 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: source_streams = [ self._constructor.create_component( - StateDelegatingStreamModel - if stream_config.get("type") == StateDelegatingStreamModel.__name__ - else DeclarativeStreamModel, + ( + StateDelegatingStreamModel + if stream_config.get("type") == StateDelegatingStreamModel.__name__ + else DeclarativeStreamModel + ), stream_config, config, emit_connector_builder_messages=self._emit_connector_builder_messages, @@ -205,7 +220,9 @@ def 
_initialize_cache_for_parent_streams( ) -> List[Dict[str, Any]]: parent_streams = set() - def update_with_cache_parent_configs(parent_configs: list[dict[str, Any]]) -> None: + def update_with_cache_parent_configs( + parent_configs: list[dict[str, Any]], + ) -> None: for parent_config in parent_configs: parent_streams.add(parent_config["stream"]["name"]) if parent_config["stream"]["type"] == "StateDelegatingStream": @@ -260,7 +277,10 @@ def spec(self, logger: logging.Logger) -> ConnectorSpecification: """ self._configure_logger_level(logger) self._emit_manifest_debug_message( - extra_args={"source_name": self.name, "parsed_config": json.dumps(self._source_config)} + extra_args={ + "source_name": self.name, + "parsed_config": json.dumps(self._source_config), + } ) spec = self._source_config.get("spec") @@ -404,7 +424,9 @@ def _dynamic_stream_configs( # Create a resolver for dynamic components based on type components_resolver = self._constructor.create_component( - COMPONENTS_RESOLVER_TYPE_MAPPING[resolver_type], components_resolver_config, config + COMPONENTS_RESOLVER_TYPE_MAPPING[resolver_type], + components_resolver_config, + config, ) stream_template_config = dynamic_definition["stream_template"] diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index c67cd958b..954c01289 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -49,7 +49,7 @@ class DynamicStreamCheckConfig(BaseModel): ) stream_count: Optional[int] = Field( 0, - description="Numbers of the streams to try reading from when running a check operation.", + description="The number of streams to attempt reading from during a check operation. 
If `stream_count` exceeds the total number of available streams, the minimum of the two values will be used.", title="Stream Count", ) @@ -2113,7 +2113,7 @@ class HttpRequester(BaseModel): type: Literal["HttpRequester"] url_base: str = Field( ..., - description="Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.", + description="The Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.", examples=[ "https://connect.squareup.com/v2", "{{ config['base_url'] or 'https://app.posthog.com'}}/api", @@ -2124,7 +2124,7 @@ class HttpRequester(BaseModel): ) path: Optional[str] = Field( None, - description="Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this.", + description="The Path the specific API endpoint that this stream represents. Do not put sensitive information (e.g. 
API tokens) into this field - Use the Authentication component for this.", examples=[ "/products", "/quotes/{{ stream_partition['id'] }}/quote_line_groups", diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py index 962120122..cd26e2a8f 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py @@ -84,9 +84,12 @@ def __init__( ) -> None: self._resolved_manifest = resolved_manifest self._declarative_schema = declarative_schema + self._normalized_manifest: ManifestType = copy.deepcopy(self._resolved_manifest) # get the tags marked as `sharable` in the component schema self._sharable_tags = _get_sharable_schema_tags(self._declarative_schema) - self._normalized_manifest: ManifestType = copy.deepcopy(self._resolved_manifest) + + def to_json_str(self) -> str: + return json.dumps(self._normalized_manifest, indent=2) def normalize(self) -> ManifestType: """ @@ -193,7 +196,7 @@ def _replace_duplicates_with_refs(self, duplicates: DuplicatesType) -> None: self._add_to_shared_definitions(type_key, key, value) # Replace occurrences with references - for path, parent_obj, value in occurrences: + for _, parent_obj, value in occurrences: if is_shared_def: if value == self._get_shared_definition_value(type_key, key): parent_obj[key] = self._create_shared_definition_ref(type_key, key) @@ -235,7 +238,7 @@ def _add_duplicate( """ # create hash for each duplicate observed - value_to_hash = value if key is None else {key: value} + value_to_hash = {key: value} if key is not None else value duplicates[self._hash_object(value_to_hash)].append((current_path, obj, value)) def _add_to_shared_definitions( diff --git a/unit_tests/sources/declarative/interpolation/test_macros.py b/unit_tests/sources/declarative/interpolation/test_macros.py index 526fc2fcc..be055c8a7 100644 --- 
a/unit_tests/sources/declarative/interpolation/test_macros.py +++ b/unit_tests/sources/declarative/interpolation/test_macros.py @@ -30,7 +30,6 @@ def test_macros_export(test_name, fn_name, found_in_macros): assert fn_name not in macros -@freeze_time("2022-01-01") @pytest.mark.parametrize( "input_value, format, input_format, expected_output", [ diff --git a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py index 20fd4e1d0..cb93b20c5 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py @@ -20,9 +20,15 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ manifest_with_multiple_url_base, expected_manifest_with_multiple_url_base_normalized, ) -> None: - resolved_manifest = resolver.preprocess_manifest(manifest_with_multiple_url_base) + """ + This test is to check that the manifest is normalized when multiple url_base are resolved + and the most frequent one is shared. + """ + schema = _get_declarative_component_schema() + resolved_manifest = resolver.preprocess_manifest(manifest_with_multiple_url_base) normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + assert normalized_manifest == expected_manifest_with_multiple_url_base_normalized @@ -30,7 +36,13 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present( manifest_with_url_base_shared_definition, expected_manifest_with_url_base_shared_definition_normalized, ) -> None: - resolved_manifest = resolver.preprocess_manifest(manifest_with_url_base_shared_definition) + """ + This test is to check that the manifest is normalized when the `url_base` is shared + between the definitions and the `url_base` is present in the manifest. 
+ """ + schema = _get_declarative_component_schema() + resolved_manifest = resolver.preprocess_manifest(manifest_with_url_base_shared_definition) normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + assert normalized_manifest == expected_manifest_with_url_base_shared_definition_normalized From 5f5c6b1ba5d7c76930f4b6119f9da3d7a313b5f9 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 8 Apr 2025 14:12:48 +0300 Subject: [PATCH 16/27] attempt to fix the Connector Builder tests --- .../connector_builder_handler.py | 9 +++-- airbyte_cdk/connector_builder/main.py | 4 +-- .../manifest_declarative_source.py | 8 +++++ .../parsers/manifest_component_transformer.py | 4 +-- .../test_connector_builder_handler.py | 33 ++++++++++++++----- 5 files changed, 43 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/connector_builder/connector_builder_handler.py b/airbyte_cdk/connector_builder/connector_builder_handler.py index 27929dfa2..fff9b137b 100644 --- a/airbyte_cdk/connector_builder/connector_builder_handler.py +++ b/airbyte_cdk/connector_builder/connector_builder_handler.py @@ -4,7 +4,7 @@ from dataclasses import asdict, dataclass, field -from typing import Any, Dict, List, Mapping +from typing import Any, Dict, List, Mapping, Optional from airbyte_cdk.connector_builder.test_reader import TestReader from airbyte_cdk.models import ( @@ -54,12 +54,17 @@ def get_limits(config: Mapping[str, Any]) -> TestLimits: return TestLimits(max_records, max_pages_per_slice, max_slices, max_streams) -def create_source(config: Mapping[str, Any], limits: TestLimits) -> ManifestDeclarativeSource: +def create_source( + config: Mapping[str, Any], + limits: TestLimits, + post_resolve_manifest: Optional[bool] = False, +) -> ManifestDeclarativeSource: manifest = config["__injected_declarative_manifest"] return ManifestDeclarativeSource( config=config, emit_connector_builder_messages=True, source_config=manifest, + post_resolve_manifest=post_resolve_manifest, 
component_factory=ModelToComponentFactory( emit_connector_builder_messages=True, limit_pages_fetched_per_slice=limits.max_pages_per_slice, diff --git a/airbyte_cdk/connector_builder/main.py b/airbyte_cdk/connector_builder/main.py index ad2d6650f..392d95e88 100644 --- a/airbyte_cdk/connector_builder/main.py +++ b/airbyte_cdk/connector_builder/main.py @@ -88,10 +88,10 @@ def handle_connector_builder_request( raise ValueError(f"Unrecognized command {command}.") -def handle_request(args: List[str]) -> str: +def handle_request(args: List[str], post_resolve_manifest: Optional[bool] = False) -> str: command, config, catalog, state = get_config_and_catalog_from_args(args) limits = get_limits(config) - source = create_source(config, limits) + source = create_source(config, limits, post_resolve_manifest=post_resolve_manifest) return orjson.dumps( AirbyteMessageSerializer.dump( handle_connector_builder_request(source, command, config, catalog, state, limits) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index 9d8f5b536..a2e25eb02 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -91,6 +91,7 @@ def __init__( debug: bool = False, emit_connector_builder_messages: bool = False, component_factory: Optional[ModelToComponentFactory] = None, + post_resolve_manifest: Optional[bool] = False, ) -> None: """ Args: @@ -129,6 +130,13 @@ def __init__( self._declarative_component_schema, ).normalize() + # The manifest is now in a format that the Connector Builder UI can use. + # however, the local tests may depend on the completely resolved manifest. 
+ if post_resolve_manifest: + propagated_source_config = ManifestReferenceResolver().preprocess_manifest( + propagated_source_config + ) + self._source_config = propagated_source_config self._debug = debug self._emit_connector_builder_messages = emit_connector_builder_messages diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py index 6779b54ab..44f414343 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py @@ -4,7 +4,7 @@ import copy import typing -from typing import Any, Mapping, Optional +from typing import Any, Dict, Mapping, Optional PARAMETERS_STR = "$parameters" @@ -95,7 +95,7 @@ def propagate_types_and_parameters( declarative_component: Mapping[str, Any], parent_parameters: Mapping[str, Any], use_parent_parameters: Optional[bool] = None, - ) -> Mapping[str, Any]: + ) -> Dict[str, Any]: """ Recursively transforms the specified declarative component and subcomponents to propagate parameters and insert the default component type if it was not already present. 
The resulting transformed components are a deep copy of the input diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index 5c537811b..2b57dad88 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -503,7 +503,8 @@ def test_handle_resolve_manifest(valid_resolve_manifest_config_file, dummy_catal str(valid_resolve_manifest_config_file), "--catalog", str(dummy_catalog), - ] + ], + post_resolve_manifest=True, ) assert patched_handle.call_count == 1 @@ -515,7 +516,14 @@ def test_handle_test_read(valid_read_config_file, configured_catalog): return_value=AirbyteMessage(type=MessageType.RECORD), ) as patch: handle_request( - ["read", "--config", str(valid_read_config_file), "--catalog", str(configured_catalog)] + [ + "read", + "--config", + str(valid_read_config_file), + "--catalog", + str(configured_catalog), + ], + post_resolve_manifest=True, ) assert patch.call_count == 1 @@ -867,7 +875,7 @@ def test_handle_429_response(): config = TEST_READ_CONFIG limits = TestLimits() - source = create_source(config, limits) + source = create_source(config, limits, post_resolve_manifest=True) with patch("requests.Session.send", return_value=response) as mock_send: response = handle_connector_builder_request( @@ -919,7 +927,14 @@ def test_missing_config(valid_resolve_manifest_config_file): def test_invalid_config_command(invalid_config_file, dummy_catalog): with pytest.raises(ValueError): handle_request( - ["read", "--config", str(invalid_config_file), "--catalog", str(dummy_catalog)] + [ + "read", + "--config", + str(invalid_config_file), + "--catalog", + str(dummy_catalog), + ], + post_resolve_manifest=True, ) @@ -987,7 +1002,7 @@ def test_create_source(): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config, limits) + source = create_source(config, limits, 
post_resolve_manifest=True) assert isinstance(source, ManifestDeclarativeSource) assert source._constructor._limit_pages_fetched_per_slice == limits.max_pages_per_slice @@ -1081,7 +1096,7 @@ def test_read_source(mock_http_stream): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config, limits) + source = create_source(config, limits, post_resolve_manifest=True) output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data slices = output_data["slices"] @@ -1128,7 +1143,7 @@ def test_read_source_single_page_single_slice(mock_http_stream): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config, limits) + source = create_source(config, limits, post_resolve_manifest=True) output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data slices = output_data["slices"] @@ -1214,7 +1229,7 @@ def test_handle_read_external_requests(deployment_mode, url_base, expected_error test_manifest["streams"][0]["$parameters"]["url_base"] = url_base config = {"__injected_declarative_manifest": test_manifest} - source = create_source(config, limits) + source = create_source(config, limits, post_resolve_manifest=True) with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): output_data = read_stream( @@ -1310,7 +1325,7 @@ def test_handle_read_external_oauth_request(deployment_mode, token_url, expected ) config = {"__injected_declarative_manifest": test_manifest} - source = create_source(config, limits) + source = create_source(config, limits, post_resolve_manifest=True) with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): output_data = read_stream( From be3bab10272fa19da88cdb219755dac2b9643a11 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Fri, 11 Apr 2025 17:28:42 +0300 Subject: [PATCH 17/27] revert test --- unit_tests/sources/declarative/interpolation/test_macros.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/unit_tests/sources/declarative/interpolation/test_macros.py b/unit_tests/sources/declarative/interpolation/test_macros.py index 4948ba57b..cd42f9d08 100644 --- a/unit_tests/sources/declarative/interpolation/test_macros.py +++ b/unit_tests/sources/declarative/interpolation/test_macros.py @@ -5,7 +5,6 @@ import datetime import pytest -from freezegun import freeze_time from airbyte_cdk.sources.declarative.interpolation.macros import macros From b10d7a191e06b83825755c36893318a69176e439 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 15 Apr 2025 19:26:22 +0300 Subject: [PATCH 18/27] removed post_resolve_manifest flag --- .../connector_builder_handler.py | 11 ++-------- airbyte_cdk/connector_builder/main.py | 4 ++-- .../manifest_declarative_source.py | 20 +++++++------------ .../test_connector_builder_handler.py | 15 ++++++-------- 4 files changed, 17 insertions(+), 33 deletions(-) diff --git a/airbyte_cdk/connector_builder/connector_builder_handler.py b/airbyte_cdk/connector_builder/connector_builder_handler.py index e3152df80..812f19a22 100644 --- a/airbyte_cdk/connector_builder/connector_builder_handler.py +++ b/airbyte_cdk/connector_builder/connector_builder_handler.py @@ -4,7 +4,7 @@ from dataclasses import asdict, dataclass, field -from typing import Any, ClassVar, Dict, List, Mapping, Optional +from typing import Any, ClassVar, Dict, List, Mapping from airbyte_cdk.connector_builder.test_reader import TestReader from airbyte_cdk.models import ( @@ -55,6 +55,7 @@ def get_limits(config: Mapping[str, Any]) -> TestLimits: max_streams = command_config.get(MAX_STREAMS_KEY) or DEFAULT_MAXIMUM_STREAMS return TestLimits(max_records, max_pages_per_slice, max_slices, max_streams) + def normalize_manifest(config: Mapping[str, Any]) -> bool: """ Check if the manifest should be normalized. 
@@ -63,13 +64,6 @@ def normalize_manifest(config: Mapping[str, Any]) -> bool: """ return config.get("__requires_normalization", False) -def post_resolve_manifest(config: Mapping[str, Any]) -> bool: - """ - Check if the manifest should be post-resolved. - :param config: The configuration to check - :return: True if the manifest should be post-resolved, False otherwise. - """ - return config.get("__post_resolve_manifest", False) def create_source( config: Mapping[str, Any], @@ -81,7 +75,6 @@ def create_source( emit_connector_builder_messages=True, source_config=manifest, normalize_manifest=normalize_manifest(config), - post_resolve_manifest=post_resolve_manifest(config), component_factory=ModelToComponentFactory( emit_connector_builder_messages=True, limit_pages_fetched_per_slice=limits.max_pages_per_slice, diff --git a/airbyte_cdk/connector_builder/main.py b/airbyte_cdk/connector_builder/main.py index 4a39a907e..80cf4afa9 100644 --- a/airbyte_cdk/connector_builder/main.py +++ b/airbyte_cdk/connector_builder/main.py @@ -88,10 +88,10 @@ def handle_connector_builder_request( raise ValueError(f"Unrecognized command {command}.") -def handle_request(args: List[str], post_resolve_manifest: Optional[bool] = False) -> str: +def handle_request(args: List[str]) -> str: command, config, catalog, state = get_config_and_catalog_from_args(args) limits = get_limits(config) - source = create_source(config, limits, post_resolve_manifest=post_resolve_manifest) + source = create_source(config, limits) return orjson.dumps( AirbyteMessageSerializer.dump( handle_connector_builder_request(source, command, config, catalog, state, limits) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index a2e25eb02..d6897dec4 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -91,7 +91,7 @@ def __init__( debug: bool = 
False, emit_connector_builder_messages: bool = False, component_factory: Optional[ModelToComponentFactory] = None, - post_resolve_manifest: Optional[bool] = False, + normalize_manifest: Optional[bool] = False, ) -> None: """ Args: @@ -100,6 +100,8 @@ def __init__( debug: True if debug mode is enabled. emit_connector_builder_messages: True if messages should be emitted to the connector builder. component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked. + normalize_manifest: Optional flag to indicate if the manifest should be normalized. + post_resolve_manifest: Optional flag to indicate if the manifest should be resolved after normalization. """ self.logger = logging.getLogger(f"airbyte.{self.name}") @@ -119,24 +121,16 @@ def __init__( "", resolved_source_config, {} ) - if emit_connector_builder_messages: - # Connector Builder Ui rendering requires the manifest to be in a specific format. + if normalize_manifest: + # Connector Builder UI rendering requires the manifest to be in a specific format. # 1) references have been resolved - # 2) deprecated fields have been migrated - # 3) the commonly used definitions are extracted to the `definitions.shared.*` - # 4) ! the normalized manifest could be validated after the additional UI post-processing. + # 2) the commonly used definitions are extracted to the `definitions.shared.*` + # 3) ! the normalized manifest could be validated only after the additional UI post-processing. propagated_source_config = ManifestNormalizer( propagated_source_config, self._declarative_component_schema, ).normalize() - # The manifest is now in a format that the Connector Builder UI can use. - # however, the local tests may depend on the completely resolved manifest. 
- if post_resolve_manifest: - propagated_source_config = ManifestReferenceResolver().preprocess_manifest( - propagated_source_config - ) - self._source_config = propagated_source_config self._debug = debug self._emit_connector_builder_messages = emit_connector_builder_messages diff --git a/unit_tests/connector_builder/test_connector_builder_handler.py b/unit_tests/connector_builder/test_connector_builder_handler.py index f095c9448..36680a6bb 100644 --- a/unit_tests/connector_builder/test_connector_builder_handler.py +++ b/unit_tests/connector_builder/test_connector_builder_handler.py @@ -504,7 +504,6 @@ def test_handle_resolve_manifest(valid_resolve_manifest_config_file, dummy_catal "--catalog", str(dummy_catalog), ], - post_resolve_manifest=True, ) assert patched_handle.call_count == 1 @@ -523,7 +522,6 @@ def test_handle_test_read(valid_read_config_file, configured_catalog): "--catalog", str(configured_catalog), ], - post_resolve_manifest=True, ) assert patch.call_count == 1 @@ -875,7 +873,7 @@ def test_handle_429_response(): config = TEST_READ_CONFIG limits = TestLimits() - source = create_source(config, limits, post_resolve_manifest=True) + source = create_source(config, limits) with patch("requests.Session.send", return_value=response) as mock_send: response = handle_connector_builder_request( @@ -934,7 +932,6 @@ def test_invalid_config_command(invalid_config_file, dummy_catalog): "--catalog", str(dummy_catalog), ], - post_resolve_manifest=True, ) @@ -1002,7 +999,7 @@ def test_create_source(): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config, limits, post_resolve_manifest=True) + source = create_source(config, limits) assert isinstance(source, ManifestDeclarativeSource) assert source._constructor._limit_pages_fetched_per_slice == limits.max_pages_per_slice @@ -1096,7 +1093,7 @@ def test_read_source(mock_http_stream): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config, limits, 
post_resolve_manifest=True) + source = create_source(config, limits) output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data slices = output_data["slices"] @@ -1143,7 +1140,7 @@ def test_read_source_single_page_single_slice(mock_http_stream): config = {"__injected_declarative_manifest": MANIFEST} - source = create_source(config, limits, post_resolve_manifest=True) + source = create_source(config, limits) output_data = read_stream(source, config, catalog, _A_PER_PARTITION_STATE, limits).record.data slices = output_data["slices"] @@ -1229,7 +1226,7 @@ def test_handle_read_external_requests(deployment_mode, url_base, expected_error test_manifest["streams"][0]["$parameters"]["url_base"] = url_base config = {"__injected_declarative_manifest": test_manifest} - source = create_source(config, limits, post_resolve_manifest=True) + source = create_source(config, limits) with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): output_data = read_stream( @@ -1325,7 +1322,7 @@ def test_handle_read_external_oauth_request(deployment_mode, token_url, expected ) config = {"__injected_declarative_manifest": test_manifest} - source = create_source(config, limits, post_resolve_manifest=True) + source = create_source(config, limits) with mock.patch.dict(os.environ, {"DEPLOYMENT_MODE": deployment_mode}, clear=False): output_data = read_stream( From 0587481061bfd43b53ca66a946423998cdc5a4fb Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 15 Apr 2025 20:07:45 +0300 Subject: [PATCH 19/27] nit --- airbyte_cdk/sources/declarative/manifest_declarative_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index d6897dec4..c2238d29e 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -101,7 
+101,6 @@ def __init__( emit_connector_builder_messages: True if messages should be emitted to the connector builder. component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked. normalize_manifest: Optional flag to indicate if the manifest should be normalized. - post_resolve_manifest: Optional flag to indicate if the manifest should be resolved after normalization. """ self.logger = logging.getLogger(f"airbyte.{self.name}") From d9291671fdd3a822ffe54618c5f286fb9809b7e3 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 17 Apr 2025 15:55:58 +0300 Subject: [PATCH 20/27] add _-should_normalize flag handling --- airbyte_cdk/connector_builder/connector_builder_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte_cdk/connector_builder/connector_builder_handler.py b/airbyte_cdk/connector_builder/connector_builder_handler.py index 812f19a22..2c60c319a 100644 --- a/airbyte_cdk/connector_builder/connector_builder_handler.py +++ b/airbyte_cdk/connector_builder/connector_builder_handler.py @@ -56,13 +56,13 @@ def get_limits(config: Mapping[str, Any]) -> TestLimits: return TestLimits(max_records, max_pages_per_slice, max_slices, max_streams) -def normalize_manifest(config: Mapping[str, Any]) -> bool: +def should_normalize_manifest(config: Mapping[str, Any]) -> bool: """ Check if the manifest should be normalized. :param config: The configuration to check :return: True if the manifest should be normalized, False otherwise. 
""" - return config.get("__requires_normalization", False) + return config.get("__should_normalize", False) def create_source( @@ -74,7 +74,7 @@ def create_source( config=config, emit_connector_builder_messages=True, source_config=manifest, - normalize_manifest=normalize_manifest(config), + normalize_manifest=should_normalize_manifest(config), component_factory=ModelToComponentFactory( emit_connector_builder_messages=True, limit_pages_fetched_per_slice=limits.max_pages_per_slice, From 9de27ef8663916a45000d30a97505bbb96c565dd Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 17 Apr 2025 15:57:53 +0300 Subject: [PATCH 21/27] formatted --- airbyte_cdk/connector_builder/connector_builder_handler.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/airbyte_cdk/connector_builder/connector_builder_handler.py b/airbyte_cdk/connector_builder/connector_builder_handler.py index 2c60c319a..b70e2c85f 100644 --- a/airbyte_cdk/connector_builder/connector_builder_handler.py +++ b/airbyte_cdk/connector_builder/connector_builder_handler.py @@ -65,10 +65,7 @@ def should_normalize_manifest(config: Mapping[str, Any]) -> bool: return config.get("__should_normalize", False) -def create_source( - config: Mapping[str, Any], - limits: TestLimits, -) -> ManifestDeclarativeSource: +def create_source(config: Mapping[str, Any], limits: TestLimits) -> ManifestDeclarativeSource: manifest = config["__injected_declarative_manifest"] return ManifestDeclarativeSource( config=config, From c403a0e59156dc30c4c810216c31d9dbe6b5e0b9 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 17 Apr 2025 16:08:38 +0300 Subject: [PATCH 22/27] rename sharable > linkable, shared > linked --- .../declarative_component_schema.yaml | 4 +- .../parsers/manifest_normalizer.py | 121 +++++++++--------- .../sources/declarative/parsers/conftest.py | 28 ++-- .../parsers/test_manifest_normalizer.py | 8 +- 4 files changed, 81 insertions(+), 80 deletions(-) diff --git 
a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 5ae3b55b6..438102e61 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1869,7 +1869,7 @@ definitions: type: string enum: [HttpRequester] url_base: - sharable: true + linkable: true title: API Base URL description: The Base URL of the API source. Do not put sensitive information (e.g. API tokens) into this field - Use the Authentication component for this. type: string @@ -1905,7 +1905,7 @@ definitions: - "/quotes/{{ stream_partition['id'] }}/quote_line_groups" - "/trades/{{ config['symbol_id'] }}/history" authenticator: - sharable: true + linkable: true title: Authenticator description: Authentication method to use for requests sent to the API. anyOf: diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py index cd26e2a8f..60029d08c 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py @@ -19,53 +19,54 @@ # Configuration constants N_OCCURANCES = 2 + DEF_TAG = "definitions" -STREAMS_TAG = "streams" -SHARED_TAG = "shared" -SHARABLE_TAG = "sharable" +LINKABLE_TAG = "linkable" +LINKED_TAG = "linked" +PROPERTIES_TAG = "properties" SCHEMA_LOADER_TAG = "schema_loader" -SCHEMAS_TAG = "schemas" SCHEMA_TAG = "schema" -PROPERTIES_TAG = "properties" +SCHEMAS_TAG = "schemas" +STREAMS_TAG = "streams" -def _get_sharable_schema_tags(schema: DefinitionsType) -> List[str]: +def _get_linkable_schema_tags(schema: DefinitionsType) -> List[str]: """ - Extracts sharable tags from schema definitions. - This function identifies properties within a schema's definitions that are marked as sharable. + Extracts linkable tags from schema definitions. 
+ This function identifies properties within a schema's definitions that are marked as linkable. It traverses through each definition in the schema, examines its properties, and collects - the keys of properties that contain the SHARABLE_TAG. + the keys of properties that contain the LINKABLE_TAG. Args: schema (DefinitionsType): The schema definition dictionary to process Returns: - List[str]: A deduplicated list of property keys that are marked as sharable + List[str]: A deduplicated list of property keys that are marked as linkable """ - # the sharable scope: ['definitions.*'] + # the linkable scope: ['definitions.*'] schema_definitions = schema.get(DEF_TAG, {}) - sharable_tags: List[str] = [] - # Extract sharable keys from properties + linkable_tags: List[str] = [] + # Extract linkable keys from properties - extract_sharable_keys: Callable[[Dict[str, Dict[str, Any]]], List[str]] = lambda properties: [ - key for key, value in properties.items() if SHARABLE_TAG in value.keys() + extract_linkable_keys: Callable[[Dict[str, Dict[str, Any]]], List[str]] = lambda properties: [ + key for key, value in properties.items() if LINKABLE_TAG in value.keys() ] - # Process each root value to get its sharable keys - process_root: Callable[[Dict[str, Any]], List[str]] = lambda root_value: extract_sharable_keys( + # Process each root value to get its linkable keys + process_root: Callable[[Dict[str, Any]], List[str]] = lambda root_value: extract_linkable_keys( root_value.get(PROPERTIES_TAG, {}) ) # Map the process_root function over all schema values and flatten the results - all_sharable_tags = chain.from_iterable(map(process_root, schema_definitions.values())) + all_linkable_tags = chain.from_iterable(map(process_root, schema_definitions.values())) - # Add all found sharable tags to the tags list - sharable_tags.extend(all_sharable_tags) + # Add all found linkable tags to the tags list + linkable_tags.extend(all_linkable_tags) # return unique tags only - return 
list(set(sharable_tags)) + return list(set(linkable_tags)) class ManifestNormalizer: @@ -85,8 +86,8 @@ def __init__( self._resolved_manifest = resolved_manifest self._declarative_schema = declarative_schema self._normalized_manifest: ManifestType = copy.deepcopy(self._resolved_manifest) - # get the tags marked as `sharable` in the component schema - self._sharable_tags = _get_sharable_schema_tags(self._declarative_schema) + # get the tags marked as `linkable` in the component schema + self._linkable_tags = _get_linkable_schema_tags(self._declarative_schema) def to_json_str(self) -> str: return json.dumps(self._normalized_manifest, indent=2) @@ -152,13 +153,13 @@ def _prepare_definitions(self) -> None: if not DEF_TAG in self._normalized_manifest: self._normalized_manifest[DEF_TAG] = {} - # Check if the shared tag exists - if not SHARED_TAG in self._normalized_manifest[DEF_TAG]: - self._normalized_manifest[DEF_TAG][SHARED_TAG] = {} + # Check if the linked tag exists + if not LINKED_TAG in self._normalized_manifest[DEF_TAG]: + self._normalized_manifest[DEF_TAG][LINKED_TAG] = {} - # remove everything from definitions tag except of `shared`, after processing + # remove everything from definitions tag except of `linked`, after processing for key in list(self._normalized_manifest[DEF_TAG].keys()): - if key != SHARED_TAG: + if key != LINKED_TAG: self._normalized_manifest[DEF_TAG].pop(key, None) def _reference_schemas(self) -> None: @@ -189,19 +190,19 @@ def _replace_duplicates_with_refs(self, duplicates: DuplicatesType) -> None: for _, occurrences in duplicates.items(): type_key, key, value = self._get_occurance_samples(occurrences) - is_shared_def = self._is_shared_definition(type_key, key) + is_linked_def = self._is_linked_definition(type_key, key) # Add to definitions if not there already - if not is_shared_def: - self._add_to_shared_definitions(type_key, key, value) + if not is_linked_def: + self._add_to_linked_definitions(type_key, key, value) # Replace occurrences 
with references for _, parent_obj, value in occurrences: - if is_shared_def: - if value == self._get_shared_definition_value(type_key, key): - parent_obj[key] = self._create_shared_definition_ref(type_key, key) + if is_linked_def: + if value == self._get_linked_definition_value(type_key, key): + parent_obj[key] = self._create_linked_definition_ref(type_key, key) else: - parent_obj[key] = self._create_shared_definition_ref(type_key, key) + parent_obj[key] = self._create_linked_definition_ref(type_key, key) def _handle_duplicates(self, duplicates: DuplicatesType) -> None: """ @@ -241,25 +242,25 @@ def _add_duplicate( value_to_hash = {key: value} if key is not None else value duplicates[self._hash_object(value_to_hash)].append((current_path, obj, value)) - def _add_to_shared_definitions( + def _add_to_linked_definitions( self, type_key: str, key: str, value: Any, ) -> None: """ - Add a value to the shared definitions under the specified key. + Add a value to the linked definitions under the specified key. 
Args: definitions: The definitions dictionary to modify key: The key to use value: The value to add """ - if type_key not in self._normalized_manifest[DEF_TAG][SHARED_TAG].keys(): - self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key] = {} + if type_key not in self._normalized_manifest[DEF_TAG][LINKED_TAG].keys(): + self._normalized_manifest[DEF_TAG][LINKED_TAG][type_key] = {} - if key not in self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key].keys(): - self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key][key] = value + if key not in self._normalized_manifest[DEF_TAG][LINKED_TAG][type_key].keys(): + self._normalized_manifest[DEF_TAG][LINKED_TAG][type_key][key] = value def _collect_duplicates(self) -> DuplicatesType: """ @@ -297,13 +298,13 @@ def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: # First process nested dictionaries _collect(value, current_path) # Process allowed-only component tags - if key in self._sharable_tags: + if key in self._linkable_tags: self._add_duplicate(duplicates, current_path, obj, value) # handle primitive types elif isinstance(value, (str, int, float, bool)): # Process allowed-only field tags - if key in self._sharable_tags: + if key in self._linkable_tags: self._add_duplicate(duplicates, current_path, obj, value, key) # handle list cases @@ -313,7 +314,7 @@ def _collect(obj: Dict[str, Any], path: Optional[List[str]] = None) -> None: duplicates: DuplicatesType = defaultdict(list, {}) try: - if self._sharable_tags: + if self._linkable_tags: _collect(self._normalized_manifest) # clean non-duplicates and sort based on the count of occurrences return self._clean_and_sort_duplicates(duplicates) @@ -360,42 +361,42 @@ def _hash_object(self, obj: Dict[str, Any]) -> str: # Sort keys to ensure consistent hash for same content return hashlib.md5(json.dumps(obj, sort_keys=True).encode()).hexdigest() - def _is_shared_definition(self, type_key: str, key: str) -> bool: + def _is_linked_definition(self, type_key: 
str, key: str) -> bool: """ - Check if the key already exists in the shared definitions. + Check if the key already exists in the linked definitions. Args: key: The key to check definitions: The definitions dictionary with definitions Returns: - True if the key exists in the shared definitions, False otherwise + True if the key exists in the linked definitions, False otherwise """ - if type_key in self._normalized_manifest[DEF_TAG][SHARED_TAG].keys(): - # Check if the key exists in the shared definitions - if key in self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key].keys(): + if type_key in self._normalized_manifest[DEF_TAG][LINKED_TAG].keys(): + # Check if the key exists in the linked definitions + if key in self._normalized_manifest[DEF_TAG][LINKED_TAG][type_key].keys(): return True return False - def _get_shared_definition_value(self, type_key: str, key: str) -> Any: + def _get_linked_definition_value(self, type_key: str, key: str) -> Any: """ - Get the value of a shared definition by its key. + Get the value of a linked definition by its key. Args: key: The key to check definitions: The definitions dictionary with definitions Returns: - The value of the shared definition + The value of the linked definition """ - if type_key in self._normalized_manifest[DEF_TAG][SHARED_TAG].keys(): - if key in self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key].keys(): - return self._normalized_manifest[DEF_TAG][SHARED_TAG][type_key][key] + if type_key in self._normalized_manifest[DEF_TAG][LINKED_TAG].keys(): + if key in self._normalized_manifest[DEF_TAG][LINKED_TAG][type_key].keys(): + return self._normalized_manifest[DEF_TAG][LINKED_TAG][type_key][key] else: raise ManifestNormalizationException( - f"Key {key} not found in shared definitions. Please check the manifest." + f"Key {key} not found in linked definitions. Please check the manifest." 
) def _get_occurance_samples(self, occurrences: DuplicateOccurancesType) -> Tuple[str, str, Any]: @@ -417,9 +418,9 @@ def _get_occurance_samples(self, occurrences: DuplicateOccurancesType) -> Tuple[ value, ) # Return the component's name as the last part of its path - def _create_shared_definition_ref(self, type_key: str, key: str) -> Dict[str, str]: + def _create_linked_definition_ref(self, type_key: str, key: str) -> Dict[str, str]: """ - Create a reference object for the shared definitions using the specified key. + Create a reference object for the linked definitions using the specified key. Args: ref_key: The reference key to use @@ -428,7 +429,7 @@ def _create_shared_definition_ref(self, type_key: str, key: str) -> Dict[str, st A reference object in the proper format """ - return {"$ref": f"#/{DEF_TAG}/{SHARED_TAG}/{type_key}/{key}"} + return {"$ref": f"#/{DEF_TAG}/{LINKED_TAG}/{type_key}/{key}"} def _create_schema_ref(self, ref_key: str) -> Dict[str, str]: """ diff --git a/unit_tests/sources/declarative/parsers/conftest.py b/unit_tests/sources/declarative/parsers/conftest.py index 4b3312b2f..74553165f 100644 --- a/unit_tests/sources/declarative/parsers/conftest.py +++ b/unit_tests/sources/declarative/parsers/conftest.py @@ -120,7 +120,7 @@ def manifest_with_multiple_url_base() -> Dict[str, Any]: }, }, # dummy requesters to be resolved and deduplicated - # to the shared `url_base` in the `definitions.shared` section + # to the linked `url_base` in the `definitions.linked` section "requester_A": { "type": "HttpRequester", "url_base": "https://example.com/v1/", @@ -176,7 +176,7 @@ def manifest_with_multiple_url_base() -> Dict[str, Any]: def expected_manifest_with_multiple_url_base_normalized() -> Dict[str, Any]: return { "type": "DeclarativeSource", - "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, + "definitions": {"linked": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, "streams": [ { "type": 
"DeclarativeStream", @@ -207,7 +207,7 @@ def expected_manifest_with_multiple_url_base_normalized() -> Dict[str, Any]: "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, "path": "B", "http_method": "GET", }, @@ -251,7 +251,7 @@ def expected_manifest_with_multiple_url_base_normalized() -> Dict[str, Any]: "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, "path": "D", "http_method": "GET", }, @@ -273,7 +273,7 @@ def expected_manifest_with_multiple_url_base_normalized() -> Dict[str, Any]: "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, "path": "E", "http_method": "GET", }, @@ -325,11 +325,11 @@ def expected_manifest_with_multiple_url_base_normalized() -> Dict[str, Any]: @pytest.fixture -def manifest_with_url_base_shared_definition() -> Dict[str, Any]: +def manifest_with_url_base_linked_definition() -> Dict[str, Any]: return { "type": "DeclarativeSource", "definitions": { - "shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}, + "linked": {"HttpRequester": {"url_base": "https://example.com/v2/"}}, "streams": { "A": { "type": "DeclarativeStream", @@ -438,14 +438,14 @@ def manifest_with_url_base_shared_definition() -> Dict[str, Any]: }, }, # dummy requesters to be resolved and deduplicated - # to the shared `url_base` in the `definitions.shared` section + # to the linked `url_base` in the `definitions.linked` section "requester_A": { "type": "HttpRequester", "url_base": "https://example.com/v1/", }, "requester_B": { "type": "HttpRequester", - "url_base": {"$ref": 
"#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, }, }, "streams": [ @@ -491,10 +491,10 @@ def manifest_with_url_base_shared_definition() -> Dict[str, Any]: @pytest.fixture -def expected_manifest_with_url_base_shared_definition_normalized() -> Dict[str, Any]: +def expected_manifest_with_url_base_linked_definition_normalized() -> Dict[str, Any]: return { "type": "DeclarativeSource", - "definitions": {"shared": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, + "definitions": {"linked": {"HttpRequester": {"url_base": "https://example.com/v2/"}}}, "streams": [ { "type": "DeclarativeStream", @@ -525,7 +525,7 @@ def expected_manifest_with_url_base_shared_definition_normalized() -> Dict[str, "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, "path": "B", "http_method": "GET", }, @@ -569,7 +569,7 @@ def expected_manifest_with_url_base_shared_definition_normalized() -> Dict[str, "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, "path": "D", "http_method": "GET", }, @@ -591,7 +591,7 @@ def expected_manifest_with_url_base_shared_definition_normalized() -> Dict[str, "type": "SimpleRetriever", "requester": { "type": "HttpRequester", - "url_base": {"$ref": "#/definitions/shared/HttpRequester/url_base"}, + "url_base": {"$ref": "#/definitions/linked/HttpRequester/url_base"}, "path": "E", "http_method": "GET", }, diff --git a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py index cb93b20c5..9e64c981d 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py +++ 
b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py @@ -33,8 +33,8 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present( - manifest_with_url_base_shared_definition, - expected_manifest_with_url_base_shared_definition_normalized, + manifest_with_url_base_linked_definition, + expected_manifest_with_url_base_linked_definition_normalized, ) -> None: """ This test is to check that the manifest is normalized when the `url_base` is shared @@ -42,7 +42,7 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present( """ schema = _get_declarative_component_schema() - resolved_manifest = resolver.preprocess_manifest(manifest_with_url_base_shared_definition) + resolved_manifest = resolver.preprocess_manifest(manifest_with_url_base_linked_definition) normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() - assert normalized_manifest == expected_manifest_with_url_base_shared_definition_normalized + assert normalized_manifest == expected_manifest_with_url_base_linked_definition_normalized From 38f7da6287363dfcc2e4329eaae51daa4b0051f8 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Sat, 19 Apr 2025 13:14:28 +0300 Subject: [PATCH 23/27] updated the order of operations; normalization should go after pre-processing > type propagation --- .../manifest_declarative_source.py | 88 ++++++++++++++----- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index c2238d29e..b5a79970b 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -103,34 +103,18 @@ def __init__( normalize_manifest: Optional flag to indicate if the manifest should be normalized. 
""" self.logger = logging.getLogger(f"airbyte.{self.name}") - + self._should_normalize = normalize_manifest self._declarative_component_schema = _get_declarative_component_schema() # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing manifest = dict(source_config) - if "type" not in manifest: - manifest["type"] = "DeclarativeSource" - + self._fix_source_type(manifest) # If custom components are needed, locate and/or register them. self.components_module: ModuleType | None = get_registered_components_module(config=config) - # resolve all `$ref` references in the manifest - resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest) + self._preprocess_manifest(manifest) # resolve all components in the manifest - propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters( - "", resolved_source_config, {} - ) - - if normalize_manifest: - # Connector Builder UI rendering requires the manifest to be in a specific format. - # 1) references have been resolved - # 2) the commonly used definitions are extracted to the `definitions.shared.*` - # 3) ! the normalized manifest could be validated only after the additional UI post-processing. 
- propagated_source_config = ManifestNormalizer( - propagated_source_config, - self._declarative_component_schema, - ).normalize() - - self._source_config = propagated_source_config + self._propagate_types_and_parameters(manifest) + self._source_config = manifest self._debug = debug self._emit_connector_builder_messages = emit_connector_builder_messages self._constructor = ( @@ -145,14 +129,74 @@ def __init__( self._slice_logger: SliceLogger = ( AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger() ) - self._config = config or {} + # validate resolved manifest against the declarative component schema self._validate_source() + # apply additional post-processing to the manifest + self._normalize_manifest() @property def resolved_manifest(self) -> Mapping[str, Any]: + """ + Returns the resolved manifest configuration for the source. + + This property provides access to the internal source configuration as a mapping, + which contains all settings and parameters required to define the source's behavior. + + Returns: + Mapping[str, Any]: The resolved source configuration manifest. + """ return self._source_config + def _preprocess_manifest(self, manifest: Dict[str, Any]) -> None: + """ + Preprocesses the provided manifest dictionary by resolving any manifest references. + + This method modifies the input manifest in place, resolving references using the + ManifestReferenceResolver to ensure all references within the manifest are properly handled. + + Args: + manifest (Dict[str, Any]): The manifest dictionary to preprocess and resolve references in. + + Returns: + None + """ + ManifestReferenceResolver().preprocess_manifest(manifest) + + def _propagate_types_and_parameters(self, manifest: Dict[str, Any]) -> None: + """ + Propagates types and parameters throughout the provided manifest. 
+ + This method utilizes the ManifestComponentTransformer to traverse and update the manifest dictionary, + ensuring that types and parameters are correctly propagated from the root to all nested components. + + Args: + manifest (Dict[str, Any]): The manifest dictionary to update with propagated types and parameters. + + Returns: + None + """ + ManifestComponentTransformer().propagate_types_and_parameters("", manifest, {}) + + def _normalize_manifest(self) -> None: + """ + This method is used to normalize the manifest. It should be called after the manifest has been validated. + + Connector Builder UI rendering requires the manifest to be in a specific format. + - references have been resolved + - the commonly used definitions are extracted to the `definitions.linked.*` + """ + if self._should_normalize: + normalizer = ManifestNormalizer(self._source_config, self._declarative_component_schema) + self._source_config = normalizer.normalize() + + def _fix_source_type(self, manifest: Dict[str, Any]) -> None: + """ + Fix the source type in the manifest. This is necessary because the source type is not always set in the manifest. 
+ """ + if "type" not in manifest: + manifest["type"] = "DeclarativeSource" + @property def message_repository(self) -> MessageRepository: return self._message_repository From 7d71f4be1ced102dd2d4ea6dfb1e33fa8d16cdc8 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Sat, 19 Apr 2025 13:42:04 +0300 Subject: [PATCH 24/27] fixed --- .../manifest_declarative_source.py | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index b5a79970b..ef58ab9a3 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -105,16 +105,11 @@ def __init__( self.logger = logging.getLogger(f"airbyte.{self.name}") self._should_normalize = normalize_manifest self._declarative_component_schema = _get_declarative_component_schema() - # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing - manifest = dict(source_config) - self._fix_source_type(manifest) # If custom components are needed, locate and/or register them. 
self.components_module: ModuleType | None = get_registered_components_module(config=config) - # resolve all `$ref` references in the manifest - self._preprocess_manifest(manifest) # resolve all components in the manifest - self._propagate_types_and_parameters(manifest) - self._source_config = manifest + self._source_config = self._preprocess_manifest(dict(source_config)) + self._debug = debug self._emit_connector_builder_messages = emit_connector_builder_messages self._constructor = ( @@ -130,10 +125,12 @@ def __init__( AlwaysLogSliceLogger() if emit_connector_builder_messages else DebugSliceLogger() ) self._config = config or {} + # validate resolved manifest against the declarative component schema self._validate_source() + # apply additional post-processing to the manifest - self._normalize_manifest() + self._postprocess_manifest() @property def resolved_manifest(self) -> Mapping[str, Any]: @@ -148,7 +145,7 @@ def resolved_manifest(self) -> Mapping[str, Any]: """ return self._source_config - def _preprocess_manifest(self, manifest: Dict[str, Any]) -> None: + def _preprocess_manifest(self, manifest: Dict[str, Any]) -> Dict[str, Any]: """ Preprocesses the provided manifest dictionary by resolving any manifest references. @@ -161,22 +158,25 @@ def _preprocess_manifest(self, manifest: Dict[str, Any]) -> None: Returns: None """ - ManifestReferenceResolver().preprocess_manifest(manifest) - - def _propagate_types_and_parameters(self, manifest: Dict[str, Any]) -> None: - """ - Propagates types and parameters throughout the provided manifest. - - This method utilizes the ManifestComponentTransformer to traverse and update the manifest dictionary, - ensuring that types and parameters are correctly propagated from the root to all nested components. 
+ # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing + manifest = self._fix_source_type(manifest) + # Resolve references in the manifest + resolved_manifest = ManifestReferenceResolver().preprocess_manifest(manifest) + # Propagate types and parameters throughout the manifest + propagated_manifest = ManifestComponentTransformer().propagate_types_and_parameters( + "", resolved_manifest, {} + ) - Args: - manifest (Dict[str, Any]): The manifest dictionary to update with propagated types and parameters. + return propagated_manifest - Returns: - None + def _postprocess_manifest(self) -> None: + """ + Post-processes the manifest after validation. + This method is responsible for any additional modifications or transformations needed + after the manifest has been validated and before it is used in the source. """ - ManifestComponentTransformer().propagate_types_and_parameters("", manifest, {}) + # apply manifest normalization, if required + self._normalize_manifest() def _normalize_manifest(self) -> None: """ @@ -190,13 +190,15 @@ def _normalize_manifest(self) -> None: normalizer = ManifestNormalizer(self._source_config, self._declarative_component_schema) self._source_config = normalizer.normalize() - def _fix_source_type(self, manifest: Dict[str, Any]) -> None: + def _fix_source_type(self, manifest: Dict[str, Any]) -> Dict[str, Any]: """ Fix the source type in the manifest. This is necessary because the source type is not always set in the manifest. 
""" if "type" not in manifest: manifest["type"] = "DeclarativeSource" + return manifest + @property def message_repository(self) -> MessageRepository: return self._message_repository From 304235cd94710d5b81ea78d35b658ece48b411c1 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 22 Apr 2025 01:57:32 +0300 Subject: [PATCH 25/27] add schema extraction + unit test --- .../parsers/manifest_normalizer.py | 45 ++-- .../sources/declarative/parsers/conftest.py | 237 ++++++++++++++++++ .../resources/abnormal_schemas_manifest.yaml | 212 ++++++++++++++++ .../parsers/test_manifest_normalizer.py | 26 +- 4 files changed, 504 insertions(+), 16 deletions(-) create mode 100644 unit_tests/sources/declarative/parsers/resources/abnormal_schemas_manifest.yaml diff --git a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py index 60029d08c..ad6de6ac1 100644 --- a/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py +++ b/airbyte_cdk/sources/declarative/parsers/manifest_normalizer.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2023 Airbyte, Inc., all rights reserved. +# Copyright (c) 2025 Airbyte, Inc., all rights reserved. # import copy @@ -114,6 +114,7 @@ def normalize(self) -> ManifestType: return self._normalized_manifest except ManifestNormalizationException: # if any error occurs, we just return the original manifest. + # TODO: enable debug logging return self._resolved_manifest def _get_manifest_streams(self) -> Iterable[Dict[str, Any]]: @@ -162,22 +163,41 @@ def _prepare_definitions(self) -> None: if key != LINKED_TAG: self._normalized_manifest[DEF_TAG].pop(key, None) + def _extract_stream_schema(self, stream: Dict[str, Any]) -> None: + """ + Extract the schema from the stream and add it to the `schemas` tag. 
+ """ + + stream_name = stream["name"] + # copy the value of the SCHEMA_TAG to the SCHEMAS_TAG with the stream name as key + schema = stream.get(SCHEMA_LOADER_TAG, {}).get(SCHEMA_TAG) + if not SCHEMAS_TAG in self._normalized_manifest.keys(): + self._normalized_manifest[SCHEMAS_TAG] = {} + # add stream schema to the SCHEMAS_TAG + if not stream_name in self._normalized_manifest[SCHEMAS_TAG].keys(): + # add the schema to the SCHEMAS_TAG with the stream name as key + self._normalized_manifest[SCHEMAS_TAG][stream_name] = schema + def _reference_schemas(self) -> None: """ - Process the definitions in the manifest to move streams from definitions to the main stream list. + Set the schema reference for the given stream in the manifest. This function modifies the manifest in place. """ # reference the stream schema for the stream to where it's stored if SCHEMAS_TAG in self._normalized_manifest.keys(): for stream in self._get_manifest_streams(): - stream_name = stream["name"] - - if stream_name not in self._normalized_manifest[SCHEMAS_TAG].keys(): - raise ManifestNormalizationException( - f"Stream {stream_name} not found in `schemas`. Please check the manifest." - ) + self._extract_stream_schema(stream) + self._set_stream_schema_ref(stream) + def _set_stream_schema_ref(self, stream: Dict[str, Any]) -> None: + """ + Set the schema reference for the given stream in the manifest. + This function modifies the manifest in place. + """ + stream_name = stream["name"] + if SCHEMAS_TAG in self._normalized_manifest.keys(): + if stream_name in self._normalized_manifest[SCHEMAS_TAG]: stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = self._create_schema_ref(stream_name) def _replace_duplicates_with_refs(self, duplicates: DuplicatesType) -> None: @@ -266,9 +286,6 @@ def _collect_duplicates(self) -> DuplicatesType: """ Traverse the JSON object and collect all potential duplicate values and objects. - Args: - node: The JSON object to analyze. - Returns: duplicates: A dictionary of duplicate objects. 
""" @@ -431,15 +448,15 @@ def _create_linked_definition_ref(self, type_key: str, key: str) -> Dict[str, st return {"$ref": f"#/{DEF_TAG}/{LINKED_TAG}/{type_key}/{key}"} - def _create_schema_ref(self, ref_key: str) -> Dict[str, str]: + def _create_schema_ref(self, key: str) -> Dict[str, str]: """ Create a reference object for stream schema using the specified key. Args: - ref_key: The reference key to use + key: The reference key to use Returns: A reference object in the proper format """ - return {"$ref": f"#/{SCHEMAS_TAG}/{ref_key}"} + return {"$ref": f"#/{SCHEMAS_TAG}/{key}"} diff --git a/unit_tests/sources/declarative/parsers/conftest.py b/unit_tests/sources/declarative/parsers/conftest.py index 74553165f..3f653ebb1 100644 --- a/unit_tests/sources/declarative/parsers/conftest.py +++ b/unit_tests/sources/declarative/parsers/conftest.py @@ -5,6 +5,7 @@ from typing import Any, Dict import pytest +import yaml @pytest.fixture @@ -640,3 +641,239 @@ def expected_manifest_with_url_base_linked_definition_normalized() -> Dict[str, }, }, } + + +@pytest.fixture +def manifest_with_linked_definitions_url_base_authenticator_abnormal_schemas() -> Dict[str, Any]: + with open( + "unit_tests/sources/declarative/parsers/resources/abnormal_schemas_manifest.yaml", + "r", + ) as file: + return dict(yaml.safe_load(file)) + + +@pytest.fixture +def expected_manifest_with_linked_definitions_url_base_authenticator_normalized() -> Dict[str, Any]: + return { + "version": "6.44.0", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["pokemon"]}, + "definitions": { + "linked": { + "HttpRequester": { + "url_base": "https://pokeapi.co/api/v1/", + "authenticator": { + "type": "ApiKeyAuthenticator", + "api_token": '{{ config["api_key"] }}', + "inject_into": { + "type": "RequestOption", + "field_name": "API_KEY", + "inject_into": "header", + }, + }, + } + } + }, + "streams": [ + { + "type": "DeclarativeStream", + "name": "pokemon", + "retriever": { + "type": 
"SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "path": "pokemon", + "url_base": { + "$ref": "#/definitions/linked/HttpRequester/url_base", + }, + "http_method": "GET", + "authenticator": { + "$ref": "#/definitions/linked/HttpRequester/authenticator", + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/pokemon"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "trainers", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "path": "pokemon", + "url_base": { + "$ref": "#/definitions/linked/HttpRequester/url_base", + }, + "http_method": "GET", + "authenticator": { + "$ref": "#/definitions/linked/HttpRequester/authenticator", + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/trainers"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "items", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "path": "pokemon", + "url_base": "https://pokeapi.co/api/v2/", + "http_method": "GET", + "authenticator": { + "$ref": "#/definitions/linked/HttpRequester/authenticator" + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/items"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "location", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "path": "location", + "url_base": 
"https://pokeapi.co/api/v2/", + "http_method": "GET", + "authenticator": { + "$ref": "#/definitions/linked/HttpRequester/authenticator" + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/location"}, + }, + }, + { + "type": "DeclarativeStream", + "name": "berries", + "retriever": { + "type": "SimpleRetriever", + "decoder": {"type": "JsonDecoder"}, + "requester": { + "type": "HttpRequester", + "path": "berries", + "url_base": "https://pokeapi.co/api/v2/", + "http_method": "GET", + "authenticator": { + "$ref": "#/definitions/linked/HttpRequester/authenticator" + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": []}, + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": {"$ref": "#/schemas/berries"}, + }, + }, + ], + "spec": { + "type": "Spec", + "connection_specification": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "required": ["api_key"], + "properties": { + "api_key": { + "type": "string", + "order": 0, + "title": "API Key", + "airbyte_secret": True, + } + }, + "additionalProperties": True, + }, + }, + "metadata": { + "assist": {}, + "testedStreams": { + "berries": {"streamHash": None}, + "pokemon": {"streamHash": None}, + "location": {"streamHash": None}, + "trainers": {"streamHash": "ca4ee51a2aaa2a53b9c0b91881a84ad621da575f"}, + "items": {"streamHash": "12e624ecf47c6357c74c27d6a65c72e437b1534a"}, + }, + "autoImportSchema": {"berries": True, "pokemon": True, "location": True}, + }, + "schemas": { + "berries": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "name": {"type": "string"}, + "berry_type": {"type": "integer"}, + }, + "additionalProperties": True, + }, + "pokemon": { + "type": "object", + "$schema": 
"http://json-schema.org/draft-07/schema#", + "properties": { + "name": {"type": "string"}, + "pokemon_type": {"type": "integer"}, + }, + }, + "location": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "name": {"type": "string"}, + "location_type": {"type": "string"}, + }, + }, + "trainers": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "name": {"type": "string"}, + "pokemon_type": {"type": "integer"}, + }, + }, + "items": { + "type": "object", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "name": {"type": "string"}, + "pokemon_type": {"type": "integer"}, + }, + }, + }, + } diff --git a/unit_tests/sources/declarative/parsers/resources/abnormal_schemas_manifest.yaml b/unit_tests/sources/declarative/parsers/resources/abnormal_schemas_manifest.yaml new file mode 100644 index 000000000..5b972334c --- /dev/null +++ b/unit_tests/sources/declarative/parsers/resources/abnormal_schemas_manifest.yaml @@ -0,0 +1,212 @@ +version: 6.44.0 + +type: DeclarativeSource + +check: + type: CheckStream + stream_names: + - pokemon + +definitions: + linked: + HttpRequester: + url_base: https://pokeapi.co/api/v1/ + +streams: + - type: DeclarativeStream + name: pokemon + retriever: + type: SimpleRetriever + decoder: + type: JsonDecoder + requester: + type: HttpRequester + path: pokemon + url_base: + $ref: "#/definitions/linked/HttpRequester/url_base" + http_method: GET + authenticator: + type: ApiKeyAuthenticator + api_token: "{{ config[\"api_key\"] }}" + inject_into: + type: RequestOption + field_name: API_KEY + inject_into: header + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + $ref: "#/schemas/pokemon" + - type: DeclarativeStream + name: trainers + retriever: + type: SimpleRetriever + decoder: + type: JsonDecoder + requester: + type: HttpRequester + path: 
pokemon + url_base: + $ref: "#/definitions/linked/HttpRequester/url_base" + http_method: GET + authenticator: + type: ApiKeyAuthenticator + api_token: "{{ config[\"api_key\"] }}" + inject_into: + type: RequestOption + field_name: API_KEY + inject_into: header + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + $ref: "#/schemas/pokemon" + - type: DeclarativeStream + name: items + retriever: + type: SimpleRetriever + decoder: + type: JsonDecoder + requester: + type: HttpRequester + path: pokemon + url_base: https://pokeapi.co/api/v2/ + http_method: GET + authenticator: + type: ApiKeyAuthenticator + api_token: "{{ config[\"api_key\"] }}" + inject_into: + type: RequestOption + field_name: API_KEY + inject_into: header + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + $ref: "#/schemas/pokemon" + - type: DeclarativeStream + name: location + retriever: + type: SimpleRetriever + decoder: + type: JsonDecoder + requester: + type: HttpRequester + path: location + url_base: https://pokeapi.co/api/v2/ + http_method: GET + authenticator: + type: ApiKeyAuthenticator + api_token: "{{ config[\"api_key\"] }}" + inject_into: + type: RequestOption + field_name: API_KEY + inject_into: header + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + $ref: "#/schemas/location" + - type: DeclarativeStream + name: berries + retriever: + type: SimpleRetriever + decoder: + type: JsonDecoder + requester: + type: HttpRequester + path: berries + url_base: https://pokeapi.co/api/v2/ + http_method: GET + authenticator: + type: ApiKeyAuthenticator + api_token: "{{ config[\"api_key\"] }}" + inject_into: + type: RequestOption + field_name: API_KEY + inject_into: header + record_selector: + type: RecordSelector + 
extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + $ref: "#/schemas/berries" + +spec: + type: Spec + connection_specification: + type: object + $schema: http://json-schema.org/draft-07/schema# + required: + - api_key + properties: + api_key: + type: string + order: 0 + title: API Key + airbyte_secret: true + additionalProperties: true + +metadata: + assist: {} + testedStreams: + berries: + streamHash: null + pokemon: + streamHash: null + location: + streamHash: null + trainers: + streamHash: ca4ee51a2aaa2a53b9c0b91881a84ad621da575f + items: + streamHash: 12e624ecf47c6357c74c27d6a65c72e437b1534a + autoImportSchema: + berries: true + pokemon: true + location: true + +schemas: + berries: + type: object + $schema: http://json-schema.org/draft-07/schema# + properties: + name: + type: string + berry_type: + type: integer + additionalProperties: true + pokemon: + type: object + $schema: http://json-schema.org/draft-07/schema# + properties: + name: + type: string + pokemon_type: + type: integer + location: + type: object + $schema: http://json-schema.org/draft-07/schema# + properties: + name: + type: string + location_type: + type: string \ No newline at end of file diff --git a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py index 9e64c981d..addb495ce 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py @@ -16,7 +16,7 @@ resolver = ManifestReferenceResolver() -def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequent_is_shared( +def test_when_multiple_url_base_are_resolved_and_most_frequent_is_shared( manifest_with_multiple_url_base, expected_manifest_with_multiple_url_base_normalized, ) -> None: @@ -32,7 +32,7 @@ def test_deduplicate_manifest_when_multiple_url_base_are_resolved_and_most_frequ assert 
normalized_manifest == expected_manifest_with_multiple_url_base_normalized -def test_deduplicate_manifest_with_shared_definitions_url_base_are_present( +def test_with_shared_definitions_url_base_are_present( manifest_with_url_base_linked_definition, expected_manifest_with_url_base_linked_definition_normalized, ) -> None: @@ -46,3 +46,25 @@ def test_deduplicate_manifest_with_shared_definitions_url_base_are_present( normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() assert normalized_manifest == expected_manifest_with_url_base_linked_definition_normalized + + +def test_with_linked_definitions_url_base_authenticator_abnormal_schemas( + manifest_with_linked_definitions_url_base_authenticator_abnormal_schemas, + expected_manifest_with_linked_definitions_url_base_authenticator_normalized, +) -> None: + """ + This test is to check that the manifest is normalized when the `url_base` and the `authenticator` is linked + between the definitions and the `url_base` is present in the manifest. + The `authenticator` is not a normal schema, but a reference to another schema. 
+ """ + + schema = _get_declarative_component_schema() + resolved_manifest = resolver.preprocess_manifest( + manifest_with_linked_definitions_url_base_authenticator_abnormal_schemas + ) + normalized_manifest = ManifestNormalizer(resolved_manifest, schema).normalize() + + assert ( + normalized_manifest + == expected_manifest_with_linked_definitions_url_base_authenticator_normalized + ) From 2c8d1645cf53a45e3f74e9b8f64a94bdb081e05d Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 24 Apr 2025 14:36:48 +0300 Subject: [PATCH 26/27] updated test comments --- .../sources/declarative/parsers/test_manifest_normalizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py index addb495ce..f25c1bbd6 100644 --- a/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py +++ b/unit_tests/sources/declarative/parsers/test_manifest_normalizer.py @@ -48,7 +48,7 @@ def test_with_shared_definitions_url_base_are_present( assert normalized_manifest == expected_manifest_with_url_base_linked_definition_normalized -def test_with_linked_definitions_url_base_authenticator_abnormal_schemas( +def test_with_linked_definitions_url_base_authenticator_when_multiple_streams_reference_the_same_schema( manifest_with_linked_definitions_url_base_authenticator_abnormal_schemas, expected_manifest_with_linked_definitions_url_base_authenticator_normalized, ) -> None: @@ -56,6 +56,9 @@ def test_with_linked_definitions_url_base_authenticator_abnormal_schemas( This test is to check that the manifest is normalized when the `url_base` and the `authenticator` is linked between the definitions and the `url_base` is present in the manifest. The `authenticator` is not a normal schema, but a reference to another schema. + + The test also verifies the `stream.schema_loader.schema` is properly extracted to + the `schemas.{stream_name}`. 
""" schema = _get_declarative_component_schema() From 8d7be4e5511de320aab351cefa8bb7d650402a11 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Fri, 25 Apr 2025 15:31:11 +0300 Subject: [PATCH 27/27] updated linked --- .../sources/declarative/declarative_component_schema.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index be75e8836..5376505c8 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1965,6 +1965,7 @@ definitions: - POST authenticator: title: Authenticator + linkable: true description: Authentication method to use for requests sent to the API. anyOf: - "$ref": "#/definitions/NoAuth"