diff --git a/dsnotebooks/settings.py b/dsnotebooks/settings.py index 8fa7cdc..3f3e437 100644 --- a/dsnotebooks/settings.py +++ b/dsnotebooks/settings.py @@ -32,6 +32,11 @@ def set_kg_key(cls, v): return v or input("Knowledge graph key: ") +class CopyCollDocumentNotebookSettings(ProjectNotebookSettings): + index_key: str = "" + document_name: str = "" + + class CollOptionalNotebookSettings(NotebookSettings): proj_key: Optional[str] = None index_key: Optional[str] = None diff --git a/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb b/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb index 4e7772f..5e496be 100644 --- a/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb +++ b/examples/bring_your_own_pdf/upload_explore_download_pdfs.ipynb @@ -381,8 +381,10 @@ " \"Filename\": row[\"_source\"][\"file-info\"][\"filename\"],\n", " \"Title\": metadata.get(\"title\", \"\"),\n", " \"Authors\": \", \".join(\n", - " [author[\"name\"] for author in metadata.get(\"authors\")]) if metadata.get(\"authors\") is not None else \"\"\n", - " ,\n", + " [author[\"name\"] for author in metadata.get(\"authors\")]\n", + " )\n", + " if metadata.get(\"authors\") is not None\n", + " else \"\",\n", " }\n", " )\n", "\n", diff --git a/examples/data_query_chemistry/chemistry.ipynb b/examples/data_query_chemistry/chemistry.ipynb index b2cac90..a855956 100644 --- a/examples/data_query_chemistry/chemistry.ipynb +++ b/examples/data_query_chemistry/chemistry.ipynb @@ -340,6 +340,7 @@ ], "source": [ "from datetime import datetime\n", + "\n", "# Visualize summary table\n", "results = [\n", " {\n", diff --git a/examples/document_conversion_extract_tables/extract_tables.ipynb b/examples/document_conversion_extract_tables/extract_tables.ipynb index 7452dda..9985ebf 100644 --- a/examples/document_conversion_extract_tables/extract_tables.ipynb +++ b/examples/document_conversion_extract_tables/extract_tables.ipynb @@ -224,7 +224,7 @@ "\n", " with open(json_file) as f:\n", " 
doc_jsondata = json.loads(f.read())\n", - " visualize_document_tables(doc_jsondata)\n" + " visualize_document_tables(doc_jsondata)" ] } ], diff --git a/examples/document_copy_to_collection/README.md b/examples/document_copy_to_collection/README.md new file mode 100644 index 0000000..450db0c --- /dev/null +++ b/examples/document_copy_to_collection/README.md @@ -0,0 +1,16 @@ +# Copy documents from one collection to another + +This example shows how an index item can be uploaded to another collection. + +:point_right: Run the [copy_document_to_new_collection.ipynb](./copy_document_to_new_collection.ipynb) + example. + +Note that the example focuses on the case of a "deepsearch-db" project data index; other data index types are not supported. + +## Access required + +The content of this notebook requires access to Deep Search capabilities which are not +available on the public access system. + +[Contact us](https://ds4sd.github.io) if you are interested in exploring +these Deep Search capabilities. diff --git a/examples/document_copy_to_collection/copy_document_to_new_collection.ipynb b/examples/document_copy_to_collection/copy_document_to_new_collection.ipynb new file mode 100644 index 0000000..0b0b632 --- /dev/null +++ b/examples/document_copy_to_collection/copy_document_to_new_collection.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Copy documents from one collection to another\n", + "\n", + "This example shows how an index item can be copied to a new or existing collection.\n", + "\n", + "\n", + "### Access required\n", + "\n", + "The content of this notebook requires access to Deep Search capabilities which are not\n", + "available on the public access system.\n", + "\n", + "[Contact us](https://ds4sd.github.io) if you are interested in exploring\n", + "these Deep Search capabilities." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set notebook parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dsnotebooks.settings import CopyCollDocumentNotebookSettings\n", + "\n", + "# notebook settings auto-loaded from .env / env vars\n", + "notebook_settings = CopyCollDocumentNotebookSettings()\n", + "\n", + "PROFILE_NAME = notebook_settings.profile # profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # project to use\n", + "INDEX_KEY = notebook_settings.index_key # index to use\n", + "NEW_INDEX_NAME = notebook_settings.new_idx_name # new index to use\n", + "DOC_NAME = notebook_settings.document_name # document to copy\n", + "CLEANUP = notebook_settings.cleanup # whether to clean up\n", + "\n", + "WAIT_S = 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Connect to Deep Search" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Optional\n", + "\n", + "import deepsearch as ds\n", + "from deepsearch.cps.client.components.data_indices import (\n", + " DataIndexItemUrls,\n", + " ElasticProjectDataCollectionSource,\n", + ")\n", + "\n", + "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List collections in project" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "index_name: 208afeb_3595e30-ocr, index_key: a5bc1ca67923f9934a51f86315a1aa5e7e7af5a2\n", + "index_name: 208afeb_3595e30-raw, index_key: db5d279cf71f3e61b0bdfd5c85adafe592cdb7f9\n", + "index_name: 244fa8d_53dcc8f, index_key: d5768ecd3c9caf4e39291ba12fa0a943c0a2146c\n", + "index_name: 38c7afb_eaeb165-ocr, index_key: d0c1553e82a9eb6f22acc47d255187328ab9034b\n", + "index_name: 
38c7afb_eaeb165-raw, index_key: 9713088d370c4fde8376e2d570a92a48b1433b60\n", + "index_name: 40a7236_317d2b4-ocr, index_key: 69431c668920c6e16a921ef16a5adefc0da5108f\n", + "index_name: 6dcfb01_5ab483a, index_key: 30e01985ff09b0f4f9c9ffe0511a0cf694cdefac\n", + "index_name: 9a765ea_e2fc9b3, index_key: b5adb77a5471f75fa7f6b2de33a6b411069e61b9\n", + "index_name: 9a765ea_e2fc9b3-ocr, index_key: 9391aa17f5b7ba5609e23767b24460a45e9aba71\n", + "index_name: 9a765ea_e2fc9b3-raw, index_key: 1ff725e720bf2b6ebe4ff45bccddfd60616998db\n", + "index_name: 9b8ea5e_8006e4a-ocr, index_key: 499f25c5e736e3bd7ae481f464205fbc4d57d950\n", + "index_name: Annual Reports, index_key: d82c8a03126801837a459e8e26601cb5edfe001d\n", + "index_name: Annual Reports2, index_key: 9c1b4ee0bab81102b701b4292fb8e0e5a05acf06\n", + "index_name: automation_sandbox, index_key: 273194cd4e2f2cf9a0ab1ee053cb68b754f5a347\n", + "index_name: codelab, index_key: ae7c27bbe442d480e15e2109a62c5d93799d3cd8\n", + "index_name: current_events_books, index_key: d7ebcf78c4bfda543908a690837996c7e943ac20\n", + "index_name: e85c89e_18dbb53-ocr, index_key: 1d8865b68d3dfadd4e4a612fc59c854135da6fba\n", + "index_name: ef6a02a_f5d0895-ocr, index_key: 039a896ca877e0a10c2e6456b654c30c86041350\n", + "index_name: f40ab11_6a16159-ocr, index_key: c6ae597dd58a7d9e3f27c8f9ad893d66a23e8b78\n", + "index_name: FSAE, index_key: b7e37b4201796671aff2129e347f5d4ebf62ed67\n", + "index_name: invoice, index_key: 8a05889e40f28ddd189dd7bcbae8507104dd9db5\n", + "index_name: PMC-valve-replacement, index_key: ead52d7a34c7c0835dacb0fc1256f9f79ca92329\n", + "index_name: pva-test-20240711, index_key: 776fcc3068f557175b01f6db98272c240b6c5750\n", + "index_name: red-books, index_key: f3b4fb43a95255556cd8514ca9381ff5407c73b7\n", + "index_name: redhat-latest-20240604, index_key: 2fa37b4750e014b99e8467b41c8820fd92c4f4f0\n", + "index_name: test, index_key: 7d87a006e4692a2d522b757c5c0ba00453646f09\n" + ] + } + ], + "source": [ + "collections = 
api.data_indices.list(proj_key=PROJ_KEY)\n", + "\n", + "for dataindex in collections:\n", + " print(f\"index_name: {dataindex.name}, index_key: {dataindex.source.index_key}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get document from collection" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "doc_id: 6627d1b67955c51ff1aa8858de671bb5a62ad70c77e62e0ac57c153d0078b7ea\n" + ] + } + ], + "source": [ + "indices = api.data_indices.list(proj_key=PROJ_KEY)\n", + "\n", + "dataindex = next((x for x in indices if x.source.index_key == INDEX_KEY), None)\n", + "\n", + "search_query = dataindex.list_items(api, DOC_NAME)\n", + "\n", + "doc_id: Optional[str] = None\n", + "for item in search_query:\n", + " doc_id = item[\"id\"]\n", + "\n", + "if not doc_id:\n", + " print(\"No document found\")\n", + "else:\n", + " print(f\"doc_id: {doc_id}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Target collection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: Create a new collection\n", + "- Set `NEW_INDEX_NAME` to the name of the collection to create" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_data_index = api.data_indices.create(\n", + " proj_key=PROJ_KEY, name=NEW_INDEX_NAME, type=\"deepsearch-doc\"\n", + ")\n", + "\n", + "new_index_key = new_data_index.source.index_key\n", + "print(new_index_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: Use an existing collection (fill in its index key from the list of collections above)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# new_index_key = \"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get document URLs" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "item: DataIndexItemUrls = dataindex.get_item_urls(api, doc_id)\n", + "\n", + "pdf_url = item.pdf_url\n", + "print(pdf_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload document to target collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deepsearch.cps.data_indices import utils as data_indices_utils\n", + "\n", + "coords = ElasticProjectDataCollectionSource(proj_key=PROJ_KEY, index_key=new_index_key)\n", + "\n", + "data_indices_utils.upload_files(api=api, coords=coords, url=pdf_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if CLEANUP:\n", + " api.data_indices.delete(coords)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/integration_argilla/argilla_upload.ipynb b/examples/integration_argilla/argilla_upload.ipynb index c2c4b89..8cd63ed 100644 --- a/examples/integration_argilla/argilla_upload.ipynb +++ b/examples/integration_argilla/argilla_upload.ipynb @@ -113,6 +113,7 @@ "source": [ "from typing import Optional\n", "\n", + "\n", "class DocTextSegment(BaseModel):\n", " page: int # page number\n", " idx: int # index of text segment in the document\n", @@ -174,7 +175,7 @@ "converted_docs = {}\n", "# group output files and visualize the output\n", "for output_file in Path(output_dir).rglob(\"*.json\"):\n", - " 
with open(output_file, 'r') as file:\n", + " with open(output_file, \"r\") as file:\n", " doc_jsondata = json.loads(file.read())\n", " converted_docs[f\"{output_file}//{output_file.name}\"] = doc_jsondata\n", "\n", @@ -255,11 +256,18 @@ "for segment in text_segments:\n", " records_text_classificaiton.append(\n", " rg.Record(\n", - " fields={\"text\":segment.text},\n", + " fields={\"text\": segment.text},\n", " vectors={},\n", " suggestions=segment.text_classification,\n", " metadata=segment.dict(\n", - " exclude={\"text\", \"text_classification\", \"token_classification\", \"idx\", \"title\", \"name\"}\n", + " exclude={\n", + " \"text\",\n", + " \"text_classification\",\n", + " \"token_classification\",\n", + " \"idx\",\n", + " \"title\",\n", + " \"name\",\n", + " }\n", " ),\n", " )\n", " )" @@ -285,10 +293,12 @@ " rg.TermsMetadataProperty(name=\"page\"),\n", " ],\n", " vectors=[\n", - " rg.VectorField(name='mini-lm-sentence-transformers', dimensions=384),\n", + " rg.VectorField(name=\"mini-lm-sentence-transformers\", dimensions=384),\n", " ],\n", ")\n", - "dataset = rg.Dataset(name=f\"{ARGILLA_DATASET}-text\", workspace=\"argilla\", settings=settings)\n", + "dataset = rg.Dataset(\n", + " name=f\"{ARGILLA_DATASET}-text\", workspace=\"argilla\", settings=settings\n", + ")\n", "dataset.create()\n", "dataset.records.log(records_text_classificaiton)" ] @@ -311,7 +321,14 @@ " vectors={},\n", " suggestions=segment.token_classification,\n", " metadata=segment.dict(\n", - " exclude={\"text\", \"text_classification\", \"token_classification\", \"idx\", \"title\", \"name\"}\n", + " exclude={\n", + " \"text\",\n", + " \"text_classification\",\n", + " \"token_classification\",\n", + " \"idx\",\n", + " \"title\",\n", + " \"name\",\n", + " }\n", " ),\n", " )\n", " )" @@ -338,10 +355,12 @@ " rg.TermsMetadataProperty(name=\"page\"),\n", " ],\n", " vectors=[\n", - " rg.VectorField(name='mini-lm-sentence-transformers', dimensions=384),\n", + " 
rg.VectorField(name=\"mini-lm-sentence-transformers\", dimensions=384),\n", " ],\n", ")\n", - "dataset = rg.Dataset(name=f\"{ARGILLA_DATASET}-token\", workspace=\"argilla\", settings=settings)\n", + "dataset = rg.Dataset(\n", + " name=f\"{ARGILLA_DATASET}-token\", workspace=\"argilla\", settings=settings\n", + ")\n", "dataset.create()\n", "dataset.records.log(records_token_classificaiton)" ] diff --git a/examples/kg_download_quick_start/kg_download_quick_start.ipynb b/examples/kg_download_quick_start/kg_download_quick_start.ipynb index a864c4c..d4d84fc 100755 --- a/examples/kg_download_quick_start/kg_download_quick_start.ipynb +++ b/examples/kg_download_quick_start/kg_download_quick_start.ipynb @@ -384,9 +384,10 @@ "source": [ "# Find the relevant edges\n", "edges = jsonl2df(os.path.join(BASE_DIR, \"unzipped_data\", \"_edges.jsonl\"))\n", - "if len(edges)>0:\n", + "if len(edges) > 0:\n", " edges = edges[\n", - " (edges.source_collection == \"material\") & (edges.target_collection == \"property\")\n", + " (edges.source_collection == \"material\")\n", + " & (edges.target_collection == \"property\")\n", " ]\n", " edges = [edges[\"source_hash\"].to_list(), edges[\"target_hash\"].to_list()]\n", "\n", @@ -518,7 +519,9 @@ " perovskite_edges = (\n", " hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[0, :] == m_idx\n", " )\n", - " prop_idx = hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[1, perovskite_edges]\n", + " prop_idx = hetero_kg[\"material\", \"mat2prop\", \"property\"].edge_index[\n", + " 1, perovskite_edges\n", + " ]\n", " properties = [hetero_kg[\"property\"][\"_name\"][idx] for idx in prop_idx.tolist()]\n", "\n", " # Check if the desired properties are linked\n",