Skip to content

Commit d58e37f

Browse files
[DS-285]: Add id_name field to Couchbase connection_config (#252)
* Add id_name field to couchbase downloader config
* Couchbase downloader config id_name field; changelog and version update
* Include id_name in couchbase src e2e test
* Fix id_name for e2e test
* Fix spellcheck
* Use id instead of cbmid as id_name for couchbase src test
* parse string
* Rename id_name to collection_id
* Fix formatting
* Update description of Couchbase collection_id field
1 parent b1f0974 commit d58e37f

File tree

6 files changed

+22
-7
lines changed

6 files changed

+22
-7
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.3.4-dev1
2+
3+
### Features
4+
5+
* **Add `collection_id` field to Couchbase `downloader_config`**
6+
17
## 0.3.3
28

39
### Enhancements

test_e2e/env_setup/couchbase/common/constants.env

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ CB_PASSWORD=password
44
CB_BUCKET=unstructured
55
CB_SCOPE=_default
66
CB_COLLECTION=_default
7-
CB_INDEX_NAME=unstructured_test_search
7+
CB_INDEX_NAME=unstructured_test_search
8+
CB_COLLECTION_ID=id

test_e2e/env_setup/couchbase/source_connector/ingest_source_setup_cluster.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class ClusterConfig:
1717
bucket_name: str
1818
scope_name: str
1919
collection_name: str
20+
collection_id: str
2021

2122

2223
def get_client(cluster_config: ClusterConfig) -> Cluster:
@@ -33,6 +34,7 @@ def setup_cluster(cluster_config: ClusterConfig, source_file: str):
3334
bucket = cluster.bucket(cluster_config.bucket_name)
3435
scope = bucket.scope(cluster_config.scope_name)
3536
collection = scope.collection(cluster_config.collection_name)
37+
collection_id = cluster_config.collection_id
3638

3739
cluster.query(
3840
f"Create primary index on "
@@ -45,8 +47,7 @@ def setup_cluster(cluster_config: ClusterConfig, source_file: str):
4547
for line in file:
4648
try:
4749
doc = json.loads(line)
48-
# Assuming 'cbmid' is the document ID. Adjust as necessary.
49-
doc_id = doc.get("cbmid", uuid.uuid4())
50+
doc_id = str(doc.get(collection_id, uuid.uuid4()))
5051
if doc_id:
5152
collection.upsert(doc_id, doc)
5253
else:
@@ -67,6 +68,7 @@ def setup_cluster(cluster_config: ClusterConfig, source_file: str):
6768
parser.add_argument("--bucket_name", required=True, help="Couchbase bucket name")
6869
parser.add_argument("--scope_name", required=True, help="Couchbase scope name")
6970
parser.add_argument("--collection_name", required=True, help="Couchbase collection name")
71+
parser.add_argument("--collection_id", required=True, help="Couchbase collection id key")
7072
parser.add_argument("--source_file", required=True, help="Source file to ingest")
7173

7274
args = parser.parse_args()
@@ -78,6 +80,7 @@ def setup_cluster(cluster_config: ClusterConfig, source_file: str):
7880
bucket_name=args.bucket_name,
7981
scope_name=args.scope_name,
8082
collection_name=args.collection_name,
83+
collection_id=args.collection_id,
8184
)
8285

8386
setup_cluster(config, args.source_file)

test_e2e/src/couchbase.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ source "$SCRIPT_DIR"/cleanup.sh
1919
source "$SCRIPT_DIR"/env_setup/couchbase/common/constants.env
2020

2121
# Check if all necessary environment variables are set
22-
if [ -z "$CB_USERNAME" ] || [ -z "$CB_PASSWORD" ] || [ -z "$CB_CONN_STR" ] || [ -z "$CB_BUCKET" ] || [ -z "$CB_SCOPE" ] || [ -z "$CB_COLLECTION" ]; then
23-
echo "Error: One or more environment variables are not set. Please set CB_CONN_STR, CB_USERNAME, CB_PASSWORD, CB_BUCKET, CB_SCOPE, and CB_COLLECTION."
22+
if [ -z "$CB_USERNAME" ] || [ -z "$CB_PASSWORD" ] || [ -z "$CB_CONN_STR" ] || [ -z "$CB_BUCKET" ] || [ -z "$CB_SCOPE" ] || [ -z "$CB_COLLECTION" ] || [ -z "$CB_COLLECTION_ID" ]; then
23+
echo "Error: One or more environment variables are not set. Please set CB_CONN_STR, CB_USERNAME, CB_PASSWORD, CB_BUCKET, CB_SCOPE, CB_COLLECTION and CB_COLLECTION_ID."
2424
exit 1
2525
fi
2626

@@ -51,6 +51,7 @@ python "$SCRIPT_DIR"/env_setup/couchbase/source_connector/ingest_source_setup_cl
5151
--bucket_name "$CB_BUCKET" \
5252
--scope_name "$CB_SCOPE" \
5353
--collection_name "$CB_COLLECTION" \
54+
--collection_id "$CB_COLLECTION_ID" \
5455
--source_file "$SCRIPT_DIR"/env_setup/couchbase/source_connector/airline_sample.jsonlines
5556
wait
5657

@@ -69,6 +70,7 @@ PYTHONPATH=. ./unstructured_ingest/main.py \
6970
--password "$CB_PASSWORD" \
7071
--scope "$CB_SCOPE" \
7172
--collection "$CB_COLLECTION" \
73+
--collection-id "$CB_COLLECTION_ID" \
7274
--work-dir "$WORK_DIR" \
7375
--preserve-downloads \
7476
--reprocess \

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.3" # pragma: no cover
1+
__version__ = "0.3.4-dev1" # pragma: no cover

unstructured_ingest/v2/processes/connectors/couchbase.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
219219

220220

221221
class CouchbaseDownloaderConfig(DownloaderConfig):
222+
collection_id: str = Field(
223+
default="id", description="The unique key of the id field in the collection"
224+
)
222225
fields: list[str] = field(default_factory=list)
223226

224227

@@ -250,7 +253,7 @@ def map_cb_results(self, cb_results: dict) -> str:
250253
def generate_download_response(
251254
self, result: dict, bucket: str, file_data: FileData
252255
) -> DownloadResponse:
253-
record_id = result["id"]
256+
record_id = result[self.download_config.collection_id]
254257
filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
255258
filename = f"{filename_id}.txt"
256259
download_path = self.download_dir / Path(filename)

0 commit comments

Comments
 (0)