Fix: Resolve issues in Postgres name normalization when names are >63 characters (#359)

aaronsteers · web-flow · commit ddbf6f472e4b · 2024-09-09T20:34:24.000Z
diff --git a/airbyte/_executors/python.py b/airbyte/_executors/python.py
@@ -10,7 +10,7 @@
 from typing import TYPE_CHECKING, Literal
 
 from overrides import overrides
-from rich import print
+from rich import print  # noqa: A004  # Allow shadowing the built-in
 
 from airbyte import exceptions as exc
 from airbyte._executors.base import Executor
diff --git a/airbyte/_executors/util.py b/airbyte/_executors/util.py
@@ -8,7 +8,7 @@
 import requests
 import yaml
 from requests import HTTPError
-from rich import print
+from rich import print  # noqa: A004  # Allow shadowing the built-in
 
 from airbyte import exceptions as exc
 from airbyte._executors.declarative import DeclarativeExecutor
diff --git a/airbyte/_processors/sql/postgres.py b/airbyte/_processors/sql/postgres.py
@@ -3,8 +3,11 @@
 
 from __future__ import annotations
 
+import functools
+
 from overrides import overrides
 
+from airbyte._util.name_normalizers import LowerCaseNormalizer
 from airbyte._writers.jsonl import JsonlWriter
 from airbyte.secrets.base import SecretString
 from airbyte.shared.sql_processor import SqlConfig, SqlProcessorBase
@@ -35,6 +38,24 @@ def get_database_name(self) -> str:
         return self.database
 
 
+class PostgresNormalizer(LowerCaseNormalizer):
+    """A name normalizer for Postgres.
+
+    Postgres has specific field name length limits:
+    - Table names are limited to 63 characters.
+    - Column names are limited to 63 characters.
+
+    The postgres normalizer inherits from the default LowerCaseNormalizer class, and
+    additionally truncates column and table names to 63 characters.
+    """
+
+    @staticmethod
+    @functools.cache
+    def normalize(name: str) -> str:
+        """Normalize the name, truncating to 63 characters."""
+        return LowerCaseNormalizer.normalize(name)[:63]
+
+
 class PostgresSqlProcessor(SqlProcessorBase):
     """A Postgres implementation of the cache.
 
@@ -49,3 +70,6 @@ class PostgresSqlProcessor(SqlProcessorBase):
     supports_merge_insert = False
     file_writer_class = JsonlWriter
     sql_config: PostgresConfig
+
+    normalizer = PostgresNormalizer
+    """A Postgres-specific name normalizer for table and column name normalization."""
diff --git a/airbyte/_util/temp_files.py b/airbyte/_util/temp_files.py
@@ -23,7 +23,7 @@ def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any,
     try:
         for content in files_contents:
             use_json = isinstance(content, dict)
-            temp_file = tempfile.NamedTemporaryFile(
+            temp_file = tempfile.NamedTemporaryFile(  # noqa: SIM115  # Avoiding context manager
                 mode="w+t",
                 delete=False,
                 encoding="utf-8",
diff --git a/airbyte/_writers/jsonl.py b/airbyte/_writers/jsonl.py
@@ -35,7 +35,7 @@ def _open_new_file(
         """Open a new file for writing."""
         return cast(
             IO[str],
-            gzip.open(
+            gzip.open(  # noqa: SIM115  # Avoiding context manager
                 file_path,
                 mode="wt",
                 encoding="utf-8",
diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py
@@ -215,10 +215,10 @@ def _deploy_connection(
         source_id: str
         if isinstance(source, Source):
             selected_streams = selected_streams or source.get_selected_streams()
-            if source._deployed_source_id:  # noqa: SLF001
-                source_id = source._deployed_source_id  # noqa: SLF001
-            else:
-                source_id = self._deploy_source(source)
+            source_id = (
+                source._deployed_source_id  # noqa: SLF001  # Access to non-public API
+                or self._deploy_source(source)
+            )
         else:
             source_id = source
             if not selected_streams:
diff --git a/airbyte/secrets/config.py b/airbyte/secrets/config.py
@@ -77,6 +77,6 @@ def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None:
         return
 
     # Else, remove by name
-    for s in _SECRETS_SOURCES:
+    for s in list(_SECRETS_SOURCES).copy():
         if s.name == str(source):
             _SECRETS_SOURCES.remove(s)
diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py
@@ -497,8 +497,20 @@ def _get_temp_table_name(
         batch_id: str | None = None,  # ULID of the batch
     ) -> str:
         """Return a new (unique) temporary table name."""
-        batch_id = batch_id or str(ulid.ULID())
-        return self.normalizer.normalize(f"{stream_name}_{batch_id}")
+        if not batch_id:
+            batch_id = str(ulid.ULID())
+
+        # Use the first 6 and last 3 characters of the ULID. This gives great uniqueness while
+        # limiting the table name suffix to 10 characters, including the underscore.
+        suffix = (
+            f"{batch_id[:6]}{batch_id[-3:]}"
+            if len(batch_id) > 9  # noqa: PLR2004  # Allow magic int value
+            else batch_id
+        )
+
+        # Note: The normalizer may truncate the table name if the database has a name length limit.
+        # For instance, the Postgres normalizer will enforce a 63-character limit on table names.
+        return self.normalizer.normalize(f"{stream_name}_{suffix}")
 
     def _fully_qualified(
         self,
diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py
@@ -9,7 +9,7 @@
 from typing import TYPE_CHECKING, Any, Literal
 
 import yaml
-from rich import print
+from rich import print  # noqa: A004  # Allow shadowing the built-in
 from rich.syntax import Syntax
 
 from airbyte_protocol.models import (
@@ -405,9 +405,25 @@ def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:
 
         return found[0].json_schema
 
-    def get_records(self, stream: str) -> LazyDataset:
+    def get_records(
+        self,
+        stream: str,
+        *,
+        normalize_field_names: bool = False,
+        prune_undeclared_fields: bool = True,
+    ) -> LazyDataset:
         """Read a stream from the connector.
 
+        Args:
+            stream: The name of the stream to read.
+            normalize_field_names: When `True`, field names will be normalized to lower case, with
+                special characters removed. This matches the behavior of PyAirbyte caches and most
+                Airbyte destinations.
+            prune_undeclared_fields: When `True`, undeclared fields will be pruned from the records,
+                which generally matches the behavior of PyAirbyte caches and most Airbyte
+                destinations, specifically when you expect the catalog may be stale. You can disable
+                this to keep all fields in the records.
+
         This involves the following steps:
         * Call discover to get the catalog
         * Generate a configured catalog that syncs the given stream in full_refresh mode
@@ -445,8 +461,8 @@ def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]
 
         stream_record_handler = StreamRecordHandler(
             json_schema=self.get_stream_json_schema(stream),
-            prune_extra_fields=True,
-            normalize_keys=False,
+            prune_extra_fields=prune_undeclared_fields,
+            normalize_keys=normalize_field_names,
         )
 
         # This method is non-blocking, so we use "PLAIN" to avoid a live progress display
diff --git a/airbyte/types.py b/airbyte/types.py
@@ -1,3 +1,4 @@
+# noqa: A005  # Allow shadowing the built-in 'types' module
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 
 """Type conversion methods for SQL Caches."""
@@ -7,7 +8,7 @@
 from typing import cast
 
 import sqlalchemy
-from rich import print
+from rich import print  # noqa: A004  # Allow shadowing the built-in
 
 
 # Compare to documentation here: https://docs.airbyte.com/understanding-airbyte/supported-data-types
diff --git a/airbyte/validate.py b/airbyte/validate.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 
 import yaml
-from rich import print
+from rich import print  # noqa: A004  # Allow shadowing the built-in
 
 import airbyte as ab
 from airbyte import exceptions as exc
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,7 +59,7 @@ pdoc = "^14.3.0"
 pytest = "^8.2.0"
 pytest-docker = "^3.1.1"
 pytest-mypy = "^0.10.3"
-ruff = "0.4.1"
+ruff = "^0.6.4"
 types-jsonschema = "^4.20.0.0"
 types-requests = "2.31.0.4"
 freezegun = "^1.4.0"
@@ -188,7 +188,7 @@ ignore = [
     "S",       # flake8-bandit (noisy, security related)
     "SIM910",  # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit."
     "TD002",   # Require author for TODOs
-    "TRIO",    # flake8-trio (opinionated, noisy)
+    "ASYNC1",  # flake8-async (formerly flake8-trio; opinionated, noisy)
     "INP001",  # Dir 'examples' is part of an implicit namespace package. Add an __init__.py.
 
     # TODO: Consider re-enabling these before release:
diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py
diff --git a/tests/unit_tests/test_text_normalization.py b/tests/unit_tests/test_text_normalization.py