|
11 | 11 | from airbyte.caches.duckdb import DuckDBCache |
12 | 12 |
|
13 | 13 |
|
| 14 | +# Google Drive constants: |
| 15 | + |
| 16 | +_MY_DRIVE = "MyDrive" |
| 17 | +"""The default name of the user's personal Google Drive.""" |
| 18 | + |
| 19 | +_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive" |
| 20 | +"""The recommended path to mount Google Drive to.""" |
| 21 | + |
| 22 | + |
| 23 | +# Utility functions: |
| 24 | + |
| 25 | + |
14 | 26 | def get_default_cache() -> DuckDBCache: |
15 | 27 | """Get a local cache for storing data, using the default database path. |
16 | 28 |
|
@@ -63,3 +75,88 @@ def new_local_cache( |
63 | 75 | cache_dir=cache_dir, |
64 | 76 | cleanup=cleanup, |
65 | 77 | ) |
| 78 | + |
| 79 | + |
def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions, because the database file is written to a mounted Google Drive folder.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". The database file
            is named `<cache_name>.duckdb`. Override this if you want to use a different database
            for different projects.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive. Any name other
            than "MyDrive" is resolved under the `Shareddrives` folder of the mount path.
        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
            if you want to mount Google Drive to a different path (not recommended).

    Returns:
        A `DuckDBCache` whose database file and cache directory both live in the resolved
        Google Drive subdirectory.

    Raises:
        ImportError: If the `google.colab` package cannot be imported, i.e. the code is not
            running inside Google Colab.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.colab import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    # Imported lazily: `google.colab` only exists inside the Colab runtime.
    try:
        from google.colab import drive  # noqa: PLC0415  # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        # Suppress the original traceback; the message above is the actionable part.
        raise ImportError(msg) from None

    drive.mount(mount_path)

    # The personal drive sits directly under the mount point; every other drive name is
    # looked up under the "Shareddrives" folder.
    drive_root = (
        Path(mount_path) / drive_name
        if drive_name == _MY_DRIVE
        else Path(mount_path) / "Shareddrives" / drive_name
    )

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )
0 commit comments