-
Notifications
You must be signed in to change notification settings - Fork 71
feat(connectors): add universal source and destination using PyAirbyte #969
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
c9285f8
dbe9884
561093b
9e19587
4e326cd
bc64448
ab6ed52
74dbb59
f405021
a3ebe6e
ef338c4
60c502b
271673c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| # Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
| """CLI modules for PyAirbyte.""" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
# Base image pinned by digest for reproducible builds.
FROM docker.io/airbyte/python-connector-base:4.0.2@sha256:9fdb1888c4264cf6fee473649ecb593f56f58e5d0096a87ee0b231777e2e3e73

WORKDIR /airbyte/integration_code

# Copy dependency manifests and the package source into the image.
COPY pyproject.toml uv.lock ./
COPY airbyte ./airbyte

# Install the connector package.
# NOTE(review): uv.lock is copied above but pip (not uv) performs the install,
# so the lock file does not pin dependency versions here — confirm intended.
RUN pip install --no-cache-dir .

# Airbyte platform reads AIRBYTE_ENTRYPOINT; it must match the ENTRYPOINT below.
ENV AIRBYTE_ENTRYPOINT="destination-pyairbyte"
ENTRYPOINT ["destination-pyairbyte"]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Universal connectors using PyAirbyte as backends."""

from airbyte.cli.universal_connector.destination import DestinationPyAirbyteUniversal
from airbyte.cli.universal_connector.source import SourcePyAirbyteUniversal


# Public API of this package; kept sorted alphabetically.
__all__ = [
    "DestinationPyAirbyteUniversal",
    "SourcePyAirbyteUniversal",
]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,320 @@ | ||
| # Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
| """Universal destination implementation using PyAirbyte caches.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import datetime | ||
| import json | ||
| import logging | ||
| import uuid | ||
| from collections import defaultdict | ||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| from sqlalchemy import text | ||
|
|
||
| from airbyte_cdk.destinations import Destination | ||
| from airbyte_cdk.models import ( | ||
| AirbyteConnectionStatus, | ||
| AirbyteMessage, | ||
| ConfiguredAirbyteCatalog, | ||
| ConnectorSpecification, | ||
| DestinationSyncMode, | ||
| Status, | ||
| Type, | ||
| ) | ||
|
|
||
| from airbyte.caches.bigquery import BigQueryCache | ||
| from airbyte.caches.duckdb import DuckDBCache | ||
| from airbyte.caches.motherduck import MotherDuckCache | ||
| from airbyte.caches.postgres import PostgresCache | ||
| from airbyte.caches.snowflake import SnowflakeCache | ||
| from airbyte.secrets.base import SecretString | ||
|
|
||
|
|
||
| if TYPE_CHECKING: | ||
| from collections.abc import Iterable, Mapping | ||
|
|
||
| from sqlalchemy.engine import Engine | ||
|
|
||
| from airbyte.caches.base import CacheBase | ||
|
|
||
|
|
||
# Shared logger for connector output (the Airbyte CDK routes this to the platform).
logger = logging.getLogger("airbyte")


# Supported backend identifiers. Values must match the "destination_type" enum
# declared in DestinationPyAirbyteUniversal.spec().
DESTINATION_TYPE_DUCKDB = "duckdb"
DESTINATION_TYPE_POSTGRES = "postgres"
DESTINATION_TYPE_SNOWFLAKE = "snowflake"
DESTINATION_TYPE_BIGQUERY = "bigquery"
DESTINATION_TYPE_MOTHERDUCK = "motherduck"
|
|
||
|
|
||
class DestinationPyAirbyteUniversal(Destination):
    """Universal destination that writes to any PyAirbyte-supported cache backend.

    The concrete backend (DuckDB, Postgres, Snowflake, BigQuery, MotherDuck) is
    selected at runtime from the ``destination_type`` key of the connector
    config. Records are buffered per stream and written to raw tables named
    ``_airbyte_raw_<stream>``; the buffer is flushed on every STATE message so
    checkpoints only pass downstream after their records are persisted.
    """

    @staticmethod
    def _quoted_schema(engine: Engine, schema_name: str) -> str:
        """Return *schema_name* quoted for the engine's SQL dialect."""
        return engine.dialect.identifier_preparer.quote(schema_name)

    @staticmethod
    def _quoted_table(engine: Engine, schema_name: str, table_name: str) -> str:
        """Return a fully qualified, dialect-quoted ``schema.table`` reference.

        Quoting via SQLAlchemy's identifier preparer prevents SQL syntax errors
        (and identifier-based SQL injection) when a stream or schema name
        contains characters that are invalid in unquoted SQL identifiers —
        e.g. dashes, dots, spaces, or SQL keywords.
        """
        preparer = engine.dialect.identifier_preparer
        return f"{preparer.quote(schema_name)}.{preparer.quote(table_name)}"

    def spec(self, logger: logging.Logger) -> ConnectorSpecification:  # noqa: ARG002
        """Return the connector specification (JSON-schema config contract)."""
        return ConnectorSpecification(
            documentationUrl="https://docs.airbyte.com/integrations/destinations/pyairbyte-universal",
            connectionSpecification={
                "$schema": "http://json-schema.org/draft-07/schema#",
                "title": "PyAirbyte Universal Destination Spec",
                "type": "object",
                "required": ["destination_type"],
                "properties": {
                    "destination_type": {
                        "type": "string",
                        "title": "Destination Type",
                        "description": "The type of destination to write to.",
                        "enum": ["duckdb", "postgres", "snowflake", "bigquery", "motherduck"],
                    },
                    "duckdb": {
                        "type": "object",
                        "title": "DuckDB Configuration",
                        "properties": {
                            "db_path": {"type": "string", "default": "/local/pyairbyte.duckdb"},
                            "schema_name": {"type": "string", "default": "main"},
                        },
                    },
                    "postgres": {
                        "type": "object",
                        "title": "PostgreSQL Configuration",
                        "properties": {
                            "host": {"type": "string", "default": "localhost"},
                            "port": {"type": "integer", "default": 5432},
                            "username": {"type": "string"},
                            "password": {"type": "string", "airbyte_secret": True},
                            "database": {"type": "string"},
                            "schema_name": {"type": "string", "default": "public"},
                        },
                    },
                    "snowflake": {
                        "type": "object",
                        "title": "Snowflake Configuration",
                        "properties": {
                            "account": {"type": "string"},
                            "username": {"type": "string"},
                            "password": {"type": "string", "airbyte_secret": True},
                            "warehouse": {"type": "string"},
                            "database": {"type": "string"},
                            "schema_name": {"type": "string", "default": "PUBLIC"},
                            "role": {"type": "string"},
                        },
                    },
                    "bigquery": {
                        "type": "object",
                        "title": "BigQuery Configuration",
                        "properties": {
                            "project_name": {"type": "string"},
                            "dataset_name": {"type": "string"},
                            "credentials_path": {"type": "string"},
                        },
                    },
                    "motherduck": {
                        "type": "object",
                        "title": "MotherDuck Configuration",
                        "properties": {
                            "database": {"type": "string", "default": "my_db"},
                            "schema_name": {"type": "string", "default": "main"},
                            "api_key": {"type": "string", "airbyte_secret": True},
                        },
                    },
                },
            },
        )

    def _get_cache(self, config: Mapping[str, Any]) -> CacheBase:
        """Create and return the appropriate cache based on configuration.

        Args:
            config: Connector config; ``destination_type`` selects the backend
                and the same-named sub-object carries its settings.

        Raises:
            ValueError: If ``destination_type`` is missing or unsupported.
        """
        destination_type = config.get("destination_type")

        if destination_type == DESTINATION_TYPE_DUCKDB:
            duckdb_config = config.get("duckdb", {})
            return DuckDBCache(
                db_path=duckdb_config.get("db_path", "/local/pyairbyte.duckdb"),
                schema_name=duckdb_config.get("schema_name", "main"),
            )

        if destination_type == DESTINATION_TYPE_POSTGRES:
            pg_config = config.get("postgres", {})
            return PostgresCache(
                host=pg_config.get("host", "localhost"),
                port=pg_config.get("port", 5432),
                username=pg_config.get("username", "postgres"),
                password=SecretString(pg_config.get("password", "")),
                database=pg_config.get("database", "postgres"),
                schema_name=pg_config.get("schema_name", "public"),
            )

        if destination_type == DESTINATION_TYPE_SNOWFLAKE:
            sf_config = config.get("snowflake", {})
            password = sf_config.get("password")
            return SnowflakeCache(
                account=sf_config.get("account", ""),
                username=sf_config.get("username", ""),
                # Only wrap a real password; pass None through for key-less auth.
                password=SecretString(password) if password else None,
                warehouse=sf_config.get("warehouse", ""),
                database=sf_config.get("database", ""),
                schema_name=sf_config.get("schema_name", "PUBLIC"),
                role=sf_config.get("role", ""),
            )

        if destination_type == DESTINATION_TYPE_BIGQUERY:
            bq_config = config.get("bigquery", {})
            return BigQueryCache(
                project_name=bq_config.get("project_name", ""),
                dataset_name=bq_config.get("dataset_name", ""),
                credentials_path=bq_config.get("credentials_path"),
            )

        if destination_type == DESTINATION_TYPE_MOTHERDUCK:
            md_config = config.get("motherduck", {})
            return MotherDuckCache(
                database=md_config.get("database", "my_db"),
                schema_name=md_config.get("schema_name", "main"),
                api_key=SecretString(md_config.get("api_key", "")),
            )

        raise ValueError(f"Unsupported destination type: {destination_type}")

    def check(
        self,
        logger: logging.Logger,  # noqa: ARG002
        config: Mapping[str, Any],
    ) -> AirbyteConnectionStatus:
        """Test the connection to the destination with a trivial round-trip query."""
        try:
            cache = self._get_cache(config)
            engine = cache.get_sql_engine()
            with engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)
        except Exception as e:
            # Surface the failure to the platform instead of raising.
            return AirbyteConnectionStatus(
                status=Status.FAILED, message=f"Connection failed: {e!r}"
            )

    def write(
        self,
        config: Mapping[str, Any],
        configured_catalog: ConfiguredAirbyteCatalog,
        input_messages: Iterable[AirbyteMessage],
    ) -> Iterable[AirbyteMessage]:
        """Write data to the destination using a PyAirbyte cache.

        Processes messages in a streaming fashion, buffering records and
        flushing on STATE messages so that a checkpoint is only emitted after
        its records are persisted (fault tolerance).

        All schema/table names are quoted per-dialect before interpolation into
        SQL — stream names may contain characters invalid in bare identifiers.
        """
        cache = self._get_cache(config)
        streams = {s.stream.name for s in configured_catalog.streams}
        schema_name = cache.schema_name

        logger.info(f"Starting write to PyAirbyte Universal with {len(streams)} streams")

        # Get SQL engine and ensure the target schema exists.
        engine = cache.get_sql_engine()
        with engine.connect() as conn:
            conn.execute(
                text(f"CREATE SCHEMA IF NOT EXISTS {self._quoted_schema(engine, schema_name)}")
            )
            conn.commit()  # pyrefly: ignore[missing-attribute]

        # Create raw tables for each stream (dropping first for overwrite mode).
        for configured_stream in configured_catalog.streams:
            name = configured_stream.stream.name
            table_name = f"_airbyte_raw_{name}"
            qualified = self._quoted_table(engine, schema_name, table_name)

            with engine.connect() as conn:
                if configured_stream.destination_sync_mode == DestinationSyncMode.overwrite:
                    logger.info(f"Dropping table for overwrite: {table_name}")
                    conn.execute(text(f"DROP TABLE IF EXISTS {qualified}"))
                    conn.commit()  # pyrefly: ignore[missing-attribute]

                # Create the raw table if needed.
                create_sql = f"""
                    CREATE TABLE IF NOT EXISTS {qualified} (
                        _airbyte_ab_id VARCHAR(36) PRIMARY KEY,
                        _airbyte_emitted_at TIMESTAMP,
                        _airbyte_data JSON
                    )
                """
                conn.execute(text(create_sql))
                conn.commit()  # pyrefly: ignore[missing-attribute]

        # Per-stream buffer of column -> list-of-values, flushed on STATE.
        buffer: dict[str, dict[str, list[Any]]] = defaultdict(lambda: defaultdict(list))

        for message in input_messages:
            if message.type == Type.STATE:
                # Flush the buffer before yielding state so the checkpoint is durable.
                for stream_name in list(buffer.keys()):
                    self._flush_buffer(
                        engine=engine,
                        buffer=buffer,
                        schema_name=schema_name,
                        stream_name=stream_name,
                    )
                buffer = defaultdict(lambda: defaultdict(list))
                yield message

            elif message.type == Type.RECORD:
                record = message.record
                if record is None:
                    continue
                stream_name = record.stream
                if stream_name not in streams:
                    logger.debug(f"Stream {stream_name} not in configured streams, skipping")
                    continue

                # Add to buffer.
                buffer[stream_name]["_airbyte_ab_id"].append(str(uuid.uuid4()))
                buffer[stream_name]["_airbyte_emitted_at"].append(
                    datetime.datetime.now(datetime.timezone.utc).isoformat()
                )
                buffer[stream_name]["_airbyte_data"].append(json.dumps(record.data))

            else:
                logger.debug(f"Message type {message.type} not handled, skipping")

        # Flush any remaining records after the input stream is exhausted.
        for stream_name in list(buffer.keys()):
            self._flush_buffer(
                engine=engine,
                buffer=buffer,
                schema_name=schema_name,
                stream_name=stream_name,
            )

        # Close the cache to release backend resources.
        cache.close()

    def _flush_buffer(
        self,
        *,
        engine: Engine,
        buffer: dict[str, dict[str, list[Any]]],
        schema_name: str,
        stream_name: str,
    ) -> None:
        """Flush buffered records for one stream to its raw table.

        Values are bound as SQL parameters; only the (quoted) identifiers are
        interpolated into the statement text.
        """
        if not buffer[stream_name]["_airbyte_ab_id"]:
            return

        table_name = f"_airbyte_raw_{stream_name}"
        entries = buffer[stream_name]

        logger.info(f"Flushing {len(entries['_airbyte_ab_id'])} records to {table_name}")

        qualified = self._quoted_table(engine, schema_name, table_name)
        with engine.connect() as conn:
            for i in range(len(entries["_airbyte_ab_id"])):
                insert_sql = text(f"""
                    INSERT INTO {qualified}
                    (_airbyte_ab_id, _airbyte_emitted_at, _airbyte_data)
                    VALUES (:ab_id, :emitted_at, :data)
                """)
                conn.execute(
                    insert_sql,
                    {
                        "ab_id": entries["_airbyte_ab_id"][i],
                        "emitted_at": entries["_airbyte_emitted_at"][i],
                        "data": entries["_airbyte_data"][i],
                    },
                )
            conn.commit()  # pyrefly: ignore[missing-attribute]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Entry point for the PyAirbyte Universal destination."""

import sys

from airbyte.cli.universal_connector import DestinationPyAirbyteUniversal


def run() -> None:
    """Run the destination with the CLI arguments (program name excluded)."""
    destination = DestinationPyAirbyteUniversal()
    destination.run(sys.argv[1:])


if __name__ == "__main__":
    run()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""Entry point for the PyAirbyte Universal source."""

import sys

from airbyte_cdk.entrypoint import launch

from airbyte.cli.universal_connector import SourcePyAirbyteUniversal


def run() -> None:
    """Launch the source via the Airbyte CDK entrypoint with the CLI arguments."""
    cli_args = sys.argv[1:]
    launch(SourcePyAirbyteUniversal(), cli_args)


if __name__ == "__main__":
    run()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 1430
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 7486
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 90
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 45
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 45
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 3232
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 101
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 7591
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 45
🏁 Script executed:
Repository: airbytehq/PyAirbyte
Length of output: 2399
Add early validation to reject empty strings for required Snowflake fields, wdyt?
The constructor accepts all these parameters, but passing empty strings for required fields like
`account`, `username`, `warehouse`, `database`, and `role` will slip through validation and only fail at runtime during connection. Consider adding validation in SnowflakeConfig to catch these misconfigurations early with clear error messages.
🤖 Prompt for AI Agents