Skip to content

Commit 2f27c87

Browse files
authored
Feat: Easy persistent cache with new ab.get_colab_cache helper function (#361)
1 parent c5fea25 commit 2f27c87

File tree

4 files changed

+119
-3
lines changed

4 files changed

+119
-3
lines changed

airbyte/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@
126126
from airbyte import (
127127
caches,
128128
cloud,
129+
constants,
129130
datasets,
130131
destinations,
131132
documents,
@@ -139,7 +140,7 @@
139140
)
140141
from airbyte.caches.bigquery import BigQueryCache
141142
from airbyte.caches.duckdb import DuckDBCache
142-
from airbyte.caches.util import get_default_cache, new_local_cache
143+
from airbyte.caches.util import get_colab_cache, get_default_cache, new_local_cache
143144
from airbyte.datasets import CachedDataset
144145
from airbyte.destinations.base import Destination
145146
from airbyte.destinations.util import get_destination
@@ -154,8 +155,9 @@
154155

155156
__all__ = [
156157
# Modules
157-
"cloud",
158158
"caches",
159+
"cloud",
160+
"constants",
159161
"datasets",
160162
"destinations",
161163
"documents",
@@ -169,6 +171,7 @@
169171
"sources",
170172
# Factories
171173
"get_available_connectors",
174+
"get_colab_cache",
172175
"get_default_cache",
173176
"get_destination",
174177
"get_secret",

airbyte/caches/base.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from airbyte_protocol.models import ConfiguredAirbyteCatalog
1515

16+
from airbyte import constants
1617
from airbyte._writers.base import AirbyteWriterInterface
1718
from airbyte.caches._catalog_backend import CatalogBackendBase, SqlCatalogBackend
1819
from airbyte.caches._state_backend import SqlStateBackend
@@ -50,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface):
5051
to the SQL backend specified in the `SqlConfig` class.
5152
"""
5253

53-
cache_dir: Path = Field(default=Path(".cache"))
54+
cache_dir: Path = Field(default=Path(constants.DEFAULT_CACHE_ROOT))
5455
"""The directory to store the cache in."""
5556

5657
cleanup: bool = TEMP_FILE_CLEANUP

airbyte/caches/util.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,18 @@
1111
from airbyte.caches.duckdb import DuckDBCache
1212

1313

14+
# Google Drive constants:
15+
16+
_MY_DRIVE = "MyDrive"
17+
"""The default name of the user's personal Google Drive."""
18+
19+
_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive"
20+
"""The recommended path to mount Google Drive to."""
21+
22+
23+
# Utility functions:
24+
25+
1426
def get_default_cache() -> DuckDBCache:
1527
"""Get a local cache for storing data, using the default database path.
1628
@@ -63,3 +75,88 @@ def new_local_cache(
6375
cache_dir=cache_dir,
6476
cleanup=cleanup,
6577
)
78+
79+
80+
def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache that is stored in Google Drive, for use in Google Colab.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". The database file
            is named `{cache_name}.duckdb`. Override this if you want to use a different database
            for different projects.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive.
        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
            if you want to mount Google Drive to a different path (not recommended).

    Returns:
        A `DuckDBCache` whose database file and cache directory are both located in the
        `sub_dir` folder of the selected drive.

    Raises:
        ImportError: If the `google.colab` package cannot be imported, meaning the code is not
            running in a Google Colab notebook.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.util import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    try:
        # Imported lazily: `google.colab` only exists inside a Colab runtime.
        from google.colab import drive  # noqa: PLC0415 # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        # `from None` hides the low-level import traceback in favor of the friendlier message.
        raise ImportError(msg) from None

    drive.mount(mount_path)

    # The personal drive sits directly under the mount point; any other drive name is
    # looked up under the "Shareddrives" folder.
    drive_root = (
        Path(mount_path) / drive_name
        if drive_name == _MY_DRIVE
        else Path(mount_path) / "Shareddrives" / drive_name
    )

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )

airbyte/constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from __future__ import annotations
55

66
import os
7+
from pathlib import Path
78

89

910
DEBUG_MODE = False # Set to True to enable additional debug logging.
@@ -41,6 +42,20 @@
4142
Specific caches may override this value with a different schema name.
4243
"""
4344

45+
DEFAULT_CACHE_ROOT: Path = (
46+
Path() / ".cache"
47+
if "AIRBYTE_CACHE_ROOT" not in os.environ
48+
else Path(os.environ["AIRBYTE_CACHE_ROOT"])
49+
)
50+
"""Default cache root is `.cache` in the current working directory.
51+
52+
The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable.
53+
54+
Overriding this can be useful if you always want to store cache files in a specific location.
55+
For example, in ephemeral environments like Google Colab, you might want to store cache files in
56+
your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`.
57+
"""
58+
4459
DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
4560
"""The default number of records to include in each batch of an Arrow dataset."""
4661

0 commit comments

Comments
 (0)