Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions dsnotebooks/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def set_kg_key(cls, v):
return v or input("Knowledge graph key: ")


class CopyCollDocumentNotebookSettings(ProjectNotebookSettings):
    """Settings for the "copy document to another collection" example notebook.

    Declares the two notebook-specific fields on top of whatever
    ``ProjectNotebookSettings`` provides. No validator is declared on this
    class, so both fields remain empty strings unless a value is supplied.
    """

    # key of the data index (collection) that holds the document to copy
    index_key: str = ""

    # name of the document to look up in that collection
    document_name: str = ""


class CollOptionalNotebookSettings(NotebookSettings):
proj_key: Optional[str] = None
index_key: Optional[str] = None
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,10 @@
" \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n",
" \"Title\": metadata.get(\"title\", \"\"),\n",
" \"Authors\": \", \".join(\n",
" [author[\"name\"] for author in metadata.get(\"authors\")]) if metadata.get(\"authors\") is not None else \"\"\n",
" ,\n",
" [author[\"name\"] for author in metadata.get(\"authors\")]\n",
" )\n",
" if metadata.get(\"authors\") is not None\n",
" else \"\",\n",
" }\n",
" )\n",
"\n",
Expand Down
1 change: 1 addition & 0 deletions examples/data_query_chemistry/chemistry.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@
],
"source": [
"from datetime import datetime\n",
"\n",
"# Visualize summary table\n",
"results = [\n",
" {\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@
"\n",
" with open(json_file) as f:\n",
" doc_jsondata = json.loads(f.read())\n",
" visualize_document_tables(doc_jsondata)\n"
" visualize_document_tables(doc_jsondata)"
]
}
],
Expand Down
16 changes: 16 additions & 0 deletions examples/document_copy_to_collection/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copy documents from one collection to another

This example shows how an index item (document) can be copied from one collection to another.

:point_right: Run the [copy_document_to_new_collection.ipynb](./copy_document_to_new_collection.ipynb)
example.

Note that the example focuses on the case of a "deepsearch-db" project data index; other data index types are not supported.

## Access required

The content of this notebook requires access to Deep Search capabilities which are not
available on the public access system.

[Contact us](https://ds4sd.github.io) if you are interested in exploring
these Deep Search capabilities.
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Copy documents from one collection to another\n",
"\n",
"This example explores how index item can be copy to a new/other collection.\n",
"\n",
"\n",
"### Access required\n",
"\n",
"The content of this notebook requires access to Deep Search capabilities which are not\n",
"available on the public access system.\n",
"\n",
"[Contact us](https://ds4sd.github.io) if you are interested in exploring\n",
"these Deep Search capabilities."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set notebooks parameters"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from dsnotebooks.settings import CopyCollDocumentNotebookSettings\n",
"\n",
"# notebook settings auto-loaded from .env / env vars\n",
"notebook_settings = CopyCollDocumentNotebookSettings()\n",
"\n",
"PROFILE_NAME = notebook_settings.profile # profile to use\n",
"PROJ_KEY = notebook_settings.proj_key # project to use\n",
"INDEX_KEY = notebook_settings.index_key # index to use\n",
"NEW_INDEX_NAME = notebook_settings.new_idx_name # new index to use\n",
"DOC_NAME = notebook_settings.document_name # document to copy\n",
"CLEANUP = notebook_settings.cleanup # whether to clean up\n",
"\n",
"WAIT_S = 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Connect to Deep Search"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from typing import Optional\n",
"\n",
"import deepsearch as ds\n",
"from deepsearch.cps.client.components.data_indices import (\n",
" DataIndexItemUrls,\n",
" ElasticProjectDataCollectionSource,\n",
")\n",
"\n",
"api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### List collections in project"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"index_name: 208afeb_3595e30-ocr, index_key: a5bc1ca67923f9934a51f86315a1aa5e7e7af5a2\n",
"index_name: 208afeb_3595e30-raw, index_key: db5d279cf71f3e61b0bdfd5c85adafe592cdb7f9\n",
"index_name: 244fa8d_53dcc8f, index_key: d5768ecd3c9caf4e39291ba12fa0a943c0a2146c\n",
"index_name: 38c7afb_eaeb165-ocr, index_key: d0c1553e82a9eb6f22acc47d255187328ab9034b\n",
"index_name: 38c7afb_eaeb165-raw, index_key: 9713088d370c4fde8376e2d570a92a48b1433b60\n",
"index_name: 40a7236_317d2b4-ocr, index_key: 69431c668920c6e16a921ef16a5adefc0da5108f\n",
"index_name: 6dcfb01_5ab483a, index_key: 30e01985ff09b0f4f9c9ffe0511a0cf694cdefac\n",
"index_name: 9a765ea_e2fc9b3, index_key: b5adb77a5471f75fa7f6b2de33a6b411069e61b9\n",
"index_name: 9a765ea_e2fc9b3-ocr, index_key: 9391aa17f5b7ba5609e23767b24460a45e9aba71\n",
"index_name: 9a765ea_e2fc9b3-raw, index_key: 1ff725e720bf2b6ebe4ff45bccddfd60616998db\n",
"index_name: 9b8ea5e_8006e4a-ocr, index_key: 499f25c5e736e3bd7ae481f464205fbc4d57d950\n",
"index_name: Annual Reports, index_key: d82c8a03126801837a459e8e26601cb5edfe001d\n",
"index_name: Annual Reports2, index_key: 9c1b4ee0bab81102b701b4292fb8e0e5a05acf06\n",
"index_name: automation_sandbox, index_key: 273194cd4e2f2cf9a0ab1ee053cb68b754f5a347\n",
"index_name: codelab, index_key: ae7c27bbe442d480e15e2109a62c5d93799d3cd8\n",
"index_name: current_events_books, index_key: d7ebcf78c4bfda543908a690837996c7e943ac20\n",
"index_name: e85c89e_18dbb53-ocr, index_key: 1d8865b68d3dfadd4e4a612fc59c854135da6fba\n",
"index_name: ef6a02a_f5d0895-ocr, index_key: 039a896ca877e0a10c2e6456b654c30c86041350\n",
"index_name: f40ab11_6a16159-ocr, index_key: c6ae597dd58a7d9e3f27c8f9ad893d66a23e8b78\n",
"index_name: FSAE, index_key: b7e37b4201796671aff2129e347f5d4ebf62ed67\n",
"index_name: invoice, index_key: 8a05889e40f28ddd189dd7bcbae8507104dd9db5\n",
"index_name: PMC-valve-replacement, index_key: ead52d7a34c7c0835dacb0fc1256f9f79ca92329\n",
"index_name: pva-test-20240711, index_key: 776fcc3068f557175b01f6db98272c240b6c5750\n",
"index_name: red-books, index_key: f3b4fb43a95255556cd8514ca9381ff5407c73b7\n",
"index_name: redhat-latest-20240604, index_key: 2fa37b4750e014b99e8467b41c8820fd92c4f4f0\n",
"index_name: test, index_key: 7d87a006e4692a2d522b757c5c0ba00453646f09\n"
]
}
],
"source": [
"collections = api.data_indices.list(proj_key=PROJ_KEY)\n",
"\n",
"for dataindex in collections:\n",
" print(f\"index_name: {dataindex.name}, index_key: {dataindex.source.index_key}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get document from collection"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"doc_id: 6627d1b67955c51ff1aa8858de671bb5a62ad70c77e62e0ac57c153d0078b7ea\n"
]
}
],
"source": [
"indices = api.data_indices.list(proj_key=PROJ_KEY)\n",
"\n",
"dataindex = next((x for x in indices if x.source.index_key == INDEX_KEY), None)\n",
"\n",
"search_query = dataindex.list_items(api, DOC_NAME)\n",
"\n",
"doc_id: Optional[str] = None\n",
"for item in search_query:\n",
" doc_id = item[\"id\"]\n",
"\n",
"if not doc_id:\n",
" print(\"No document found\")\n",
"else:\n",
" print(f\"doc_id: {doc_id}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Target collection"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Option 1: Create New Collection\n",
"- Fill new_collection_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"new_data_index = api.data_indices.create(\n",
" proj_key=PROJ_KEY, name=NEW_INDEX_NAME, type=\"deepsearch-doc\"\n",
")\n",
"\n",
"new_index_key = new_data_index.source.index_key\n",
"print(new_index_key)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Option 2: Fill index key (collection already created) from previous list of collections"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# new_index_key = \"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get document urls"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"item: DataIndexItemUrls = dataindex.get_item_urls(api, doc_id)\n",
"\n",
"pdf_url = item.pdf_url\n",
"print(pdf_url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload document to target collection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from deepsearch.cps.data_indices import utils as data_indices_utils\n",
"\n",
"coords = ElasticProjectDataCollectionSource(proj_key=PROJ_KEY, index_key=new_index_key)\n",
"\n",
"data_indices_utils.upload_files(api=api, coords=coords, url=pdf_url)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if CLEANUP:\n",
" api.data_indices.delete(coords)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading