
Commit fcb1d30

feat/add support for databricks personal access token (#304)
* add token field to databricks access configs
* add integration test for pat support
* add github ci secrets mapping
1 parent 7d8fa92 commit fcb1d30
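
In practice, the new field lets a connection config authenticate with a personal access token instead of OAuth client credentials. A minimal sketch based on the config classes in this diff (the import path and the host/token values are assumptions):

from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (  # assumed module path
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
)

connection_config = DatabricksNativeVolumesConnectionConfig(
    host="https://example.cloud.databricks.com",  # hypothetical workspace host
    access_config=DatabricksNativeVolumesAccessConfig(
        token="dapi-example-token",  # hypothetical PAT
    ),
)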

12 files changed (+127 −28 lines)

.github/workflows/e2e.yml

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ jobs:
         KAFKA_API_KEY: ${{ secrets.KAFKA_API_KEY }}
         KAFKA_SECRET: ${{ secrets.KAFKA_SECRET }}
         KAFKA_BOOTSTRAP_SERVER: ${{ secrets.KAFKA_BOOTSTRAP_SERVER }}
+        DATABRICKS_PAT: ${{ secrets.DATABRICKS_PAT }}
       run : |
         source .venv/bin/activate
         make install-test
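
The new secret only reaches the integration test if the job exports it in this env block; the test itself is gated by requires_env. A hypothetical stand-in for such a guard (the repo's actual helper may differ):

import os

import pytest


def requires_env(*names: str):
    # Hypothetical sketch: skip the decorated test unless all named env vars are set.
    missing = [name for name in names if name not in os.environ]
    return pytest.mark.skipif(bool(missing), reason=f"missing env vars: {missing}")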

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -1,3 +1,9 @@
+## 0.3.11-dev0
+
+### Enhancements
+
+* **Support Databricks personal access token**
+
 ## 0.3.10
 
 ### Enhancements

test/integration/connectors/databricks_tests/test_volumes_native.py renamed to test/integration/connectors/databricks/test_volumes_native.py

Lines changed: 75 additions & 19 deletions
@@ -1,10 +1,10 @@
 import json
 import os
-import tempfile
 import uuid
 from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
+from unittest import mock
 
 import pytest
 from databricks.sdk import WorkspaceClient
@@ -31,11 +31,15 @@
 
 
 @dataclass
-class EnvData:
+class BaseEnvData:
     host: str
+    catalog: str
+
+
+@dataclass
+class BasicAuthEnvData(BaseEnvData):
     client_id: str
     client_secret: str
-    catalog: str
 
     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         return DatabricksNativeVolumesConnectionConfig(
@@ -47,32 +51,52 @@ def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
         )
 
 
-def get_env_data() -> EnvData:
-    return EnvData(
+@dataclass
+class PATEnvData(BaseEnvData):
+    token: str
+
+    def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
+        return DatabricksNativeVolumesConnectionConfig(
+            host=self.host,
+            access_config=DatabricksNativeVolumesAccessConfig(
+                token=self.token,
+            ),
+        )
+
+
+def get_basic_auth_env_data() -> BasicAuthEnvData:
+    return BasicAuthEnvData(
         host=os.environ["DATABRICKS_HOST"],
         client_id=os.environ["DATABRICKS_CLIENT_ID"],
         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
         catalog=os.environ["DATABRICKS_CATALOG"],
     )
 
 
+def get_pat_env_data() -> PATEnvData:
+    return PATEnvData(
+        host=os.environ["DATABRICKS_HOST"],
+        catalog=os.environ["DATABRICKS_CATALOG"],
+        token=os.environ["DATABRICKS_PAT"],
+    )
+
+
 @pytest.mark.asyncio
 @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
 @requires_env(
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
-async def test_volumes_native_source():
-    env_data = get_env_data()
-    indexer_config = DatabricksNativeVolumesIndexerConfig(
-        recursive=True,
-        volume="test-platform",
-        volume_path="databricks-volumes-test-input",
-        catalog=env_data.catalog,
-    )
-    connection_config = env_data.get_connection_config()
-    with tempfile.TemporaryDirectory() as tempdir:
-        tempdir_path = Path(tempdir)
-        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tempdir_path)
+async def test_volumes_native_source(tmp_path: Path):
+    env_data = get_basic_auth_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
     indexer = DatabricksNativeVolumesIndexer(
         connection_config=connection_config, index_config=indexer_config
     )
@@ -89,12 +113,44 @@ async def test_volumes_native_source():
     )
 
 
+@pytest.mark.asyncio
+@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG)
+@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
+async def test_volumes_native_source_pat(tmp_path: Path):
+    env_data = get_pat_env_data()
+    with mock.patch.dict(os.environ, clear=True):
+        indexer_config = DatabricksNativeVolumesIndexerConfig(
+            recursive=True,
+            volume="test-platform",
+            volume_path="databricks-volumes-test-input",
+            catalog=env_data.catalog,
+        )
+        connection_config = env_data.get_connection_config()
+        download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
+    indexer = DatabricksNativeVolumesIndexer(
+        connection_config=connection_config, index_config=indexer_config
+    )
+    downloader = DatabricksNativeVolumesDownloader(
+        connection_config=connection_config, download_config=download_config
+    )
+    await source_connector_validation(
+        indexer=indexer,
+        downloader=downloader,
+        configs=SourceValidationConfigs(
+            test_id="databricks_volumes_native_pat",
+            expected_num_files=1,
+        ),
+    )
+
+
 def _get_volume_path(catalog: str, volume: str, volume_path: str):
     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
 
 
 @contextmanager
-def databricks_destination_context(env_data: EnvData, volume: str, volume_path) -> WorkspaceClient:
+def databricks_destination_context(
+    env_data: BasicAuthEnvData, volume: str, volume_path
+) -> WorkspaceClient:
     client = WorkspaceClient(
         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
     )
@@ -137,7 +193,7 @@ def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_p
     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
 )
 async def test_volumes_native_destination(upload_file: Path):
-    env_data = get_env_data()
+    env_data = get_basic_auth_env_data()
     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
     file_data = FileData(
         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
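
Note the mock.patch.dict(os.environ, clear=True) block in both source tests: the environment is emptied while the configs are constructed, so the test would fail if the connector silently fell back to ambient DATABRICKS_* variables instead of the explicitly passed credentials. A self-contained sketch of the pattern (hypothetical value):

import os
from unittest import mock

os.environ["DATABRICKS_PAT"] = "dapi-example"  # hypothetical value
with mock.patch.dict(os.environ, clear=True):
    # Inside the block the environment is empty, so any implicit
    # credential lookup would raise a KeyError.
    assert "DATABRICKS_PAT" not in os.environ
# On exit the original environment is restored intact.
assert os.environ["DATABRICKS_PAT"] == "dapi-example"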
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+{
+  "directory_structure": [
+    "fake-memo.pdf"
+  ]
+}
@@ -0,0 +1,26 @@
+{
+  "identifier": "9a6eb650-98d6-5465-8f1d-aa7118eee87e",
+  "connector_type": "databricks_volumes",
+  "source_identifiers": {
+    "filename": "fake-memo.pdf",
+    "fullpath": "/Volumes/utic-dev-tech-fixtures/default/test-platform/databricks-volumes-test-input/fake-memo.pdf",
+    "rel_path": "fake-memo.pdf"
+  },
+  "metadata": {
+    "url": "/Volumes/utic-dev-tech-fixtures/default/test-platform/databricks-volumes-test-input/fake-memo.pdf",
+    "version": null,
+    "record_locator": null,
+    "date_created": null,
+    "date_modified": "1729186569000",
+    "date_processed": null,
+    "permissions_data": null,
+    "filesize_bytes": null
+  },
+  "additional_metadata": {
+    "catalog": "utic-dev-tech-fixtures",
+    "path": "/Volumes/utic-dev-tech-fixtures/default/test-platform/databricks-volumes-test-input/fake-memo.pdf"
+  },
+  "reprocess": false,
+  "local_download_path": "/private/var/folders/n8/rps3wl195pj4p_0vyxqj5jrw0000gn/T/pytest-of-romanisecke/pytest-9/test_volumes_native_source_pat0/fake-memo.pdf",
+  "display_name": null
+}

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.3.10" # pragma: no cover
+__version__ = "0.3.11-dev0" # pragma: no cover

unstructured_ingest/v2/processes/connectors/databricks/volumes.py

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,7 @@
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -52,6 +53,10 @@ def path(self) -> str:
         return path
 
 
+class DatabricksVolumesAccessConfig(AccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+
+
 class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
     host: Optional[str] = Field(
         default=None,
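
Because the platform-specific access configs inherit from this new base (see the AWS and Azure diffs below), every variant gains the token field alongside its own fields. A minimal sketch of that inheritance, using pydantic.BaseModel as a stand-in for the repo's AccessConfig base:

from typing import Optional

from pydantic import BaseModel, Field


class DatabricksVolumesAccessConfig(BaseModel):  # BaseModel stands in for AccessConfig
    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")


class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
    account_id: Optional[str] = Field(default=None)


# The subclass now accepts either credential style:
print(DatabricksAWSVolumesAccessConfig(token="dapi-example"))  # token-based
print(DatabricksAWSVolumesAccessConfig(account_id="12345"))    # account-based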

unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py

Lines changed: 2 additions & 2 deletions
@@ -3,12 +3,12 @@
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@
 CONNECTOR_TYPE = "databricks_volumes_aws"
 
 
-class DatabricksAWSVolumesAccessConfig(AccessConfig):
+class DatabricksAWSVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint",

unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py

Lines changed: 2 additions & 2 deletions
@@ -3,12 +3,12 @@
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@
 CONNECTOR_TYPE = "databricks_volumes_azure"
 
 
-class DatabricksAzureVolumesAccessConfig(AccessConfig):
+class DatabricksAzureVolumesAccessConfig(DatabricksVolumesAccessConfig):
     account_id: Optional[str] = Field(
         default=None,
         description="The Databricks account ID for the Databricks " "accounts endpoint.",
