Skip to content

Commit 919a812

Browse files
authored
Feat(CLI): Add support for docker images and local python dev executables in poetry (#415)
1 parent 53246a3 commit 919a812

File tree

2 files changed

+136
-83
lines changed

2 files changed

+136
-83
lines changed

airbyte/cli.py

Lines changed: 90 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
pyab --help
1515
```
1616
17-
You can also use the fast and powerful `uv` tool to run the CLI without pre-installing:
17+
You can also use `pipx` or the fast and powerful `uv` tool to run the PyAirbyte CLI
18+
without pre-installing:
1819
1920
```
2021
# Install `uv` if you haven't already:
@@ -23,6 +24,43 @@
2324
# Run the PyAirbyte CLI using `uvx`:
2425
uvx --from=airbyte pyab --help
2526
```
27+
28+
Example `benchmark` Usage:
29+
30+
```
31+
# PyAirbyte System Benchmark (no-op):
32+
pyab benchmark --num-records=2.4e6
33+
34+
# Source Benchmark:
35+
pyab benchmark --source=source-hardcoded-records --config='{count: 400000}'
36+
pyab benchmark --source=source-hardcoded-records --config='{count: 400000}' --streams='*'
37+
pyab benchmark --source=source-hardcoded-records --config='{count: 4000}' --streams=dummy_fields
38+
39+
# Source Benchmark from Docker Image:
40+
pyab benchmark --source=airbyte/source-hardcoded-records:latest --config='{count: 400_000}'
41+
pyab benchmark --source=airbyte/source-hardcoded-records:dev --config='{count: 400_000}'
42+
43+
# Destination Benchmark:
44+
pyab benchmark --destination=destination-dev-null --config=/path/to/config.json
45+
46+
# Benchmark a Local Python Source (source-s3):
47+
pyab benchmark --source=$(poetry run which source-s3) --config=./secrets/config.json
48+
# Equivalent to:
49+
LOCAL_EXECUTABLE=$(poetry run which source-s3)
50+
CONFIG_PATH=$(realpath ./secrets/config.json)
51+
pyab benchmark --source=$LOCAL_EXECUTABLE --config=$CONFIG_PATH
52+
```
53+
54+
Example Usage with `uv`:
55+
56+
57+
58+
Example `validate` Usage:
59+
60+
```
61+
pyab validate --connector=source-hardcoded-records
62+
pyab validate --connector=source-hardcoded-records --config='{count: 400_000}'
63+
```
2664
"""
2765

2866
from __future__ import annotations
@@ -118,6 +156,22 @@ def _inject_secrets(config_dict: dict[str, Any]) -> None:
118156
return config_dict
119157

120158

159+
def _is_docker_image(image: str | None) -> bool:
160+
"""Check if the source or destination is a docker image."""
161+
return image is not None and ":" in image
162+
163+
164+
def _is_executable_path(connector_str: str) -> bool:
165+
return connector_str.startswith(".") or "/" in connector_str
166+
167+
168+
def _get_connector_name(connector: str) -> str:
169+
if _is_docker_image(connector):
170+
return connector.split(":")[0].split("/")[-1]
171+
172+
return connector
173+
174+
121175
def _resolve_source_job(
122176
*,
123177
source: str | None = None,
@@ -127,16 +181,29 @@ def _resolve_source_job(
127181
"""Resolve the source job into a configured Source object.
128182
129183
Args:
130-
source: The source name, with an optional version declaration.
184+
source: The source name or source reference.
131185
If a path is provided, the source will be loaded from the local path.
132-
If the string `'.'` is provided, the source will be loaded from the current
133-
working directory.
186+
If the source contains a colon (':'), it will be interpreted as a docker image and tag.
134187
config: The path to a configuration file for the named source or destination.
135188
streams: A comma-separated list of stream names to select for reading. If set to "*",
136189
all streams will be selected. If not provided, all streams will be selected.
137190
"""
191+
config_dict = _resolve_config(config) if config else None
192+
streams_list: str | list[str] = streams or "*"
193+
if isinstance(streams, str) and streams != "*":
194+
streams_list = [stream.strip() for stream in streams.split(",")]
195+
138196
source_obj: Source
139-
if source and (source.startswith(".") or "/" in source):
197+
if source and _is_docker_image(source):
198+
source_obj = get_source(
199+
name=_get_connector_name(source),
200+
docker_image=source,
201+
config=config_dict,
202+
streams=streams_list,
203+
)
204+
return source_obj
205+
206+
if source and _is_executable_path(source):
140207
# Treat the source as a path.
141208
source_executable = Path(source)
142209
if not source_executable.exists():
@@ -149,26 +216,22 @@ def _resolve_source_job(
149216
source_obj = get_source(
150217
name=source_executable.stem,
151218
local_executable=source_executable,
219+
config=config_dict,
220+
streams=streams_list,
152221
)
153222
return source_obj
154-
if not config:
155-
raise PyAirbyteInputError(
156-
message="No configuration found.",
157-
)
223+
158224
if not source or not source.startswith("source-"):
159225
raise PyAirbyteInputError(
160-
message="Expected a source name or path to executable.",
226+
message="Expected a source name, docker image, or path to executable.",
161227
input_value=source,
162228
)
163229

164230
source_name: str = source
165-
streams_list: str | list[str] = streams or "*"
166-
if isinstance(streams, str) and streams != "*":
167-
streams_list = [stream.strip() for stream in streams.split(",")]
168231

169232
return get_source(
170233
name=source_name,
171-
config=_resolve_config(config) if config else {},
234+
config=config_dict,
172235
streams=streams_list,
173236
)
174237

@@ -181,10 +244,10 @@ def _resolve_destination_job(
181244
"""Resolve the destination job into a configured Destination object.
182245
183246
Args:
184-
destination: The destination name, with an optional version declaration.
185-
If a path is provided, the destination will be loaded from the local path.
186-
If the string `'.'` is provided, the destination will be loaded from the current
187-
working directory.
247+
destination: The destination name or destination reference.
248+
If a path is provided, the destination will be loaded from the local path.
249+
If the destination contains a colon (':'), it will be interpreted as a docker image
250+
and tag.
188251
config: The path to a configuration file for the named source or destination.
189252
"""
190253
if not config:
@@ -236,62 +299,27 @@ def _resolve_destination_job(
236299
required=False,
237300
help=CONFIG_HELP,
238301
)
239-
@click.option(
240-
"--install",
241-
is_flag=True,
242-
default=False,
243-
help=(
244-
"Whether to install the connector if it is not available locally. "
245-
"Defaults to False, meaning the connector is expected to be already be installed."
246-
),
247-
)
248302
def validate(
249303
connector: str | None = None,
250304
config: str | None = None,
251-
*,
252-
install: bool = False,
253305
) -> None:
254306
"""Validate the connector."""
255-
local_executable: Path | None = None
256307
if not connector:
257308
raise PyAirbyteInputError(
258309
message="No connector provided.",
259310
)
260-
if connector.startswith(".") or "/" in connector:
261-
# Treat the connector as a path.
262-
local_executable = Path(connector)
263-
if not local_executable.exists():
264-
raise PyAirbyteInputError(
265-
message="Connector executable not found.",
266-
context={
267-
"connector": connector,
268-
},
269-
)
270-
connector_name = local_executable.stem
271-
else:
272-
connector_name = connector
273-
274-
if not connector_name.startswith("source-") and not connector_name.startswith("destination-"):
275-
raise PyAirbyteInputError(
276-
message=(
277-
"Expected a connector name or path to executable. "
278-
"Connector names are expected to begin with 'source-' or 'destination-'."
279-
),
280-
input_value=connector,
281-
)
282311

283312
connector_obj: Source | Destination
284-
if connector_name.startswith("source-"):
285-
connector_obj = get_source(
286-
name=connector_name,
287-
local_executable=local_executable,
288-
install_if_missing=install,
313+
if "source-" in connector:
314+
connector_obj = _resolve_source_job(
315+
source=connector,
316+
config=None,
317+
streams=None,
289318
)
290319
else: # destination
291-
connector_obj = get_destination(
292-
name=connector_name,
293-
local_executable=local_executable,
294-
install_if_missing=install,
320+
connector_obj = _resolve_destination_job(
321+
destination=connector,
322+
config=None,
295323
)
296324

297325
print("Getting `spec` output from connector...")
@@ -310,7 +338,7 @@ def validate(
310338
type=str,
311339
help=(
312340
"The source name, with an optional version declaration. "
313-
"If a path is provided, it will be interpreted as a path to the local executable. "
341+
"If the name contains a colon (':'), it will be interpreted as a docker image and tag. "
314342
),
315343
)
316344
@click.option(

examples/run_perf_test_reads.py

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,25 @@
2323
2424
```
2525
# Run with 5_000 records
26-
poetry run python ./examples/run_perf_test_reads.py -n=1e3
26+
poetry run python ./examples/run_perf_test_reads.py -n=5e3
2727
# Run with 500_000 records
28-
poetry run python ./examples/run_perf_test_reads.py -n=1e5
28+
poetry run python ./examples/run_perf_test_reads.py -n=5e5
29+
30+
# Load 1 million records to Snowflake cache
31+
poetry run python ./examples/run_perf_test_reads.py -n=1e6 --cache=snowflake
2932
30-
# Load 5_000 records to Snowflake
31-
poetry run python ./examples/run_perf_test_reads.py -n=1e3 --cache=snowflake
33+
# Load 1 million records to Snowflake destination
34+
poetry run python ./examples/run_perf_test_reads.py -n=1e6 --destination=snowflake
3235
3336
# Load 5_000 records to BigQuery
34-
poetry run python ./examples/run_perf_test_reads.py -n=1e3 --cache=bigquery
37+
poetry run python ./examples/run_perf_test_reads.py -n=5e3 --cache=bigquery
3538
```
3639
3740
You can also use this script to test destination load performance:
3841
3942
```bash
4043
# Load 5_000 records to the no-op (e2e) destination
41-
poetry run python ./examples/run_perf_test_reads.py -n=1e3 --destination=e2e
44+
poetry run python ./examples/run_perf_test_reads.py -n=5e3 --destination=e2e
4245
```
4346
4447
Testing raw PyAirbyte throughput with and without caching:
@@ -74,6 +77,7 @@
7477
from airbyte.secrets.google_gsm import GoogleGSMSecretManager
7578
from airbyte.sources import get_benchmark_source
7679
from typing_extensions import Literal
80+
from ulid import ULID
7781

7882
if TYPE_CHECKING:
7983
from airbyte.sources.base import Source
@@ -82,6 +86,12 @@
8286
AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing"
8387

8488

89+
def _random_suffix() -> str:
    """Build a short suffix for test resource names from a newly generated ULID.

    The suffix is the first six characters of the ULID's string form followed by
    its last three characters.
    """
    ulid_text = str(ULID())
    leading, trailing = ulid_text[:6], ulid_text[-3:]
    return f"{leading}{trailing}"
93+
94+
8595
def get_gsm_secret_json(secret_name: str) -> dict:
8696
secret_mgr = GoogleGSMSecretManager(
8797
project=AIRBYTE_INTERNAL_GCP_PROJECT,
@@ -95,25 +105,26 @@ def get_gsm_secret_json(secret_name: str) -> dict:
95105

96106

97107
def get_cache(
98-
cache_type: Literal["duckdb", "snowflake", "bigquery", False],
108+
cache_type: Literal["duckdb", "snowflake", "bigquery", "disabled", False],
99109
) -> CacheBase | Literal[False]:
100-
if cache_type is False:
110+
if cache_type is False or cache_type == "disabled":
101111
return False
102112

103113
if cache_type == "duckdb":
104114
return ab.new_local_cache()
105115

106116
if cache_type == "snowflake":
107-
secret_config = get_gsm_secret_json(
117+
snowflake_config = get_gsm_secret_json(
108118
secret_name="AIRBYTE_LIB_SNOWFLAKE_CREDS",
109119
)
110120
return SnowflakeCache(
111-
account=secret_config["account"],
112-
username=secret_config["username"],
113-
password=secret_config["password"],
114-
database=secret_config["database"],
115-
warehouse=secret_config["warehouse"],
116-
role=secret_config["role"],
121+
account=snowflake_config["account"],
122+
username=snowflake_config["username"],
123+
password=snowflake_config["password"],
124+
database=snowflake_config["database"],
125+
warehouse=snowflake_config["warehouse"],
126+
role=snowflake_config["role"],
127+
schema_name=f"INTEGTEST_{_random_suffix()}",
117128
)
118129

119130
if cache_type == "bigquery":
@@ -171,12 +182,26 @@ def get_destination(destination_type: str) -> ab.Destination:
171182
if destination_type in ["e2e", "noop"]:
172183
return get_noop_destination()
173184

185+
if destination_type.removeprefix("destination-") == "snowflake":
186+
snowflake_config = get_gsm_secret_json(
187+
secret_name="AIRBYTE_LIB_SNOWFLAKE_CREDS",
188+
)
189+
snowflake_config["host"] = (
190+
f"{snowflake_config['account']}.snowflakecomputing.com"
191+
)
192+
snowflake_config["schema"] = f"INTEGTEST_{_random_suffix()}"
193+
return ab.get_destination(
194+
"destination-snowflake",
195+
config=snowflake_config,
196+
docker_image=True,
197+
)
198+
174199
raise ValueError(f"Unknown destination type: {destination_type}") # noqa: TRY003
175200

176201

177202
def main(
178203
n: int | str = "5e5",
179-
cache_type: Literal["duckdb", "bigquery", "snowflake", False] = "duckdb",
204+
cache_type: Literal["duckdb", "bigquery", "snowflake", "disabled"] = "disabled",
180205
source_alias: str = "e2e",
181206
destination_type: str | None = None,
182207
) -> None:
@@ -222,8 +247,8 @@ def main(
222247
"--cache",
223248
type=str,
224249
help="The cache type to use.",
225-
choices=["duckdb", "snowflake", "bigquery"],
226-
default="duckdb",
250+
choices=["duckdb", "snowflake", "bigquery", "disabled"],
251+
default="disabled",
227252
)
228253
parser.add_argument(
229254
"--no-cache",
@@ -244,20 +269,20 @@ def main(
244269
"hardcoded",
245270
"faker",
246271
],
247-
default="hardcoded",
272+
default="benchmark",
248273
)
249274
parser.add_argument(
250275
"--destination",
251276
type=str,
252277
help=("The destination to use (optional)."),
253-
choices=["e2e"],
278+
choices=["e2e", "noop", "snowflake"],
254279
default=None,
255280
)
256281
args = parser.parse_args()
257282

258283
main(
259284
n=args.n,
260-
cache_type=args.cache if not args.no_cache else False,
285+
cache_type=args.cache if not args.no_cache else "disabled",
261286
source_alias=args.source,
262287
destination_type=args.destination,
263288
)

0 commit comments

Comments
 (0)