From 0835ff0a5013d199b6a67665e7dac2cbc331c836 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 2 Apr 2024 07:20:12 +0200 Subject: [PATCH 1/3] added the nlp-for-metadata Signed-off-by: Peter Staar --- .../data_query_quick_start/quick_start.ipynb | 10 +- examples/nlp_for_metadata/README.md | 5 + .../nlp_for_metadata/nlp_for_metadata.ipynb | 2995 +++++++++++++++++ poetry.lock | 41 +- pyproject.toml | 2 +- 5 files changed, 3025 insertions(+), 28 deletions(-) create mode 100644 examples/nlp_for_metadata/README.md create mode 100644 examples/nlp_for_metadata/nlp_for_metadata.ipynb diff --git a/examples/data_query_quick_start/quick_start.ipynb b/examples/data_query_quick_start/quick_start.ipynb index 6f3e334..bfecac8 100644 --- a/examples/data_query_quick_start/quick_start.ipynb +++ b/examples/data_query_quick_start/quick_start.ipynb @@ -132,9 +132,7 @@ "cell_type": "code", "execution_count": 5, "id": "f915761b", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -370,9 +368,7 @@ "cell_type": "code", "execution_count": 6, "id": "81df4c38", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -786,7 +782,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/examples/nlp_for_metadata/README.md b/examples/nlp_for_metadata/README.md new file mode 100644 index 0000000..1504c62 --- /dev/null +++ b/examples/nlp_for_metadata/README.md @@ -0,0 +1,5 @@ +# NLP on Documents - Extracting meta data + +:point_right: Run the [nlp_for_metadata.ipynb](./nlp_for_metadata.ipynb) +notebook to extract metadata from (scientific) reports + diff --git a/examples/nlp_for_metadata/nlp_for_metadata.ipynb b/examples/nlp_for_metadata/nlp_for_metadata.ipynb new file mode 100644 index 0000000..69f039d --- /dev/null +++ b/examples/nlp_for_metadata/nlp_for_metadata.ipynb @@ -0,0 +1,2995 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "id": "37d96e78", + "metadata": {}, + "source": [ + "# Document MetaData Extraction" + ] + }, + { + "cell_type": "markdown", + "id": "4edb626f", + "metadata": {}, + "source": [ + "## Getting started\n" + ] + }, + { + "cell_type": "markdown", + "id": "a8f9c441", + "metadata": {}, + "source": [ + "### Set notebook parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b01a4fd1", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Project key: 1234567890abcdefghijklmnopqrstvwyz123456\n" + ] + } + ], + "source": [ + "from dsnotebooks.settings import ProjectNotebookSettings\n", + "\n", + "# notebook settings auto-loaded from .env / env vars\n", + "notebook_settings = ProjectNotebookSettings()\n", + "\n", + "PROFILE_NAME = notebook_settings.profile # the profile to use\n", + "PROJ_KEY = notebook_settings.proj_key # the project to use\n", + "\n", + "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456" + ] + }, + { + "cell_type": "markdown", + "id": "239dc0f1", + "metadata": {}, + "source": [ + "### Import example dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "502cdef8", + "metadata": { + "ExecuteTime": { + "end_time": "2022-08-02T12:14:25.377422Z", + "start_time": "2022-08-02T12:14:25.152485Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " -> already downloaded part-of-speech\n", + " -> already downloaded reference\n", + " -> already downloaded material\n", + " -> already downloaded language\n", + " -> already downloaded name\n", + " -> already downloaded semantic\n", + " -> already downloaded geoloc\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "import textwrap\n", + "\n", + "import pandas as pd\n", + "\n", + "import deepsearch as ds\n", + "\n", + "from pathlib import Path\n", + "from zipfile import ZipFile\n", + "\n", + "from deepsearch.documents.core.export import 
export_to_markdown\n", + "from IPython.display import display, Markdown, HTML, display_html\n", + "\n", + "from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models\n", + "\n", + "from deepsearch_glm.nlp_utils import (\n", + " extract_references_from_doc,\n", + " init_nlp_model,\n", + " list_nlp_model_configs,\n", + ")\n", + "\n", + "from tabulate import tabulate\n", + "\n", + "models = load_pretrained_nlp_models(verbose=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e6e4dcda", + "metadata": {}, + "source": [ + "### Connect to Deep Search" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f44fbf08", + "metadata": {}, + "outputs": [], + "source": [ + "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)" + ] + }, + { + "cell_type": "markdown", + "id": "6f1200c5-1138-4491-bc33-3b2d5aabe949", + "metadata": {}, + "source": [ + "## Convert Document" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ec83eb0b", + "metadata": { + "ExecuteTime": { + "end_time": "2022-08-02T12:14:49.216045Z", + "start_time": "2022-08-02T12:14:25.380757Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 78.98it/s]\u001b[38;2;15;98;254m \u001b[0m\n", + "Submitting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:03<00:00, 3.27s/it]\u001b[38;2;15;98;254m \u001b[0m\n", + "Converting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:27<00:00, 27.58s/it]\u001b[38;2;15;98;254m \u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Total documents': 1, 'Successfully converted documents': 1}\n" + ] + } + ], + "source": [ + "output_dir = Path(\"./converted_docs\")\n", + "\n", + "fname = \"2206.00785.pdf\"\n", + "\n", + "documents = ds.convert_documents(\n", + " api=api,\n", + " 
proj_key=PROJ_KEY,\n", + " source_path=f\"../../data/samples/{fname}\",\n", + " progress_bar=True\n", + ") \n", + "documents.download_all(result_dir=output_dir)\n", + "info = documents.generate_report(result_dir=output_dir)\n", + "print(info) " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "382c4869-cca9-43fc-8052-c0ab7e9c175d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "writing converted_docs/2206.00785.json\n", + "writing converted_docs/2206.00785.md\n" + ] + } + ], + "source": [ + "# Iterate output files and visualize the output\n", + "for output_file in output_dir.rglob(\"json*.zip\"):\n", + " with ZipFile(output_file) as archive:\n", + " all_files = archive.namelist()\n", + " for name in all_files:\n", + " if not name.endswith(\".json\"):\n", + " continue\n", + " \n", + " #basename = name.rstrip('.json')\n", + " doc_json = json.loads(archive.read(name))\n", + " \n", + " ofile = output_dir / name\n", + " print(f\"writing {ofile}\")\n", + " with ofile.open(\"w\") as fw:\n", + " fw.write(json.dumps(doc_json, indent=2))\n", + " \n", + " doc_md = export_to_markdown(doc_json)\n", + "\n", + " ofile = output_dir / name.replace(\".json\", \".md\")\n", + " print(f\"writing {ofile}\")\n", + " with ofile.open(\"w\") as fw:\n", + " fw.write(doc_md)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "b19f7678-b650-484b-a994-150d0c4ec3a2", + "metadata": {}, + "outputs": [], + "source": [ + "# display last document\n", + "# display(Markdown(doc_md))" + ] + }, + { + "cell_type": "markdown", + "id": "6784c8a9-4b96-4385-a04e-40ddbf6c613f", + "metadata": {}, + "source": [ + "## Extract references from converted Document" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "710cc200-e2ba-46f3-9ca0-efd2baab7ee1", + "metadata": {}, + "outputs": [], + "source": [ + "def resolve(path, doc):\n", + "\n", + " if len(path)>1 and path[0]==\"#\":\n", + " return
resolve(path[1:], doc)\n", + " \n", + " if len(path)==1 and isinstance(doc, dict):\n", + " return doc[path[0]]\n", + "\n", + " elif len(path)==1 and isinstance(doc, list):\n", + " ind = int(path[0])\n", + " return doc[ind]\n", + " \n", + " elif len(path)>1 and isinstance(doc, dict):\n", + " return resolve(path[1:], doc[path[0]])\n", + "\n", + " elif len(path)>1 and isinstance(doc, list):\n", + " ind = int(path[0])\n", + " return resolve(path[1:], doc[ind])\n", + "\n", + " else:\n", + " return None\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ed3612b4-bbd2-42d0-ba2d-f8f994565380", + "metadata": {}, + "outputs": [], + "source": [ + "ifile = \"converted_docs/2206.00785.json\"\n", + "\n", + "with open(ifile) as fr:\n", + " doc = json.load(fr)\n", + "\n", + "model = init_nlp_model(\"language;reference;metadata\")\n", + "res = model.apply_on_doc(doc)\n", + "\n", + "props = pd.DataFrame(res[\"properties\"][\"data\"], columns=res[\"properties\"][\"headers\"])\n", + "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6d98745c-e0f3-41d2-8261-b7953d835dec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " subtype subj_path name\n", + "0 abstract # Abstract-Document understanding is a key busin...\n", + "1 title #/texts/1 Delivering Document Conversion as a Cloud Serv...\n", + "2 author #/texts/1 Christoph Auer\n", + "3 author #/texts/1 Research Ruschlikon\n", + "4 author #/texts/2 Research Ruschlikon\n", + "5 author #/texts/3 Michele Dolfi\n", + "6 author #/texts/3 Research Ruschlikon\n", + "7 author #/texts/4 J Staar\n", + "8 author #/texts/4 Research Ruschlikon\n" + ] + } + ], + "source": [ + "#print(res[\"instances\"][\"headers\"])\n", + "\n", + "doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n", + "print(doc_insts)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 10, + "id": "01771757-70c3-44cb-824c-1fd9b716a99f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R.\n", + "Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, 'IDC\n", + "FutureScape: Worldwide future of intelligence 2022 predictions,'\n", + "International Data Group, Inc., Needham, MA, Research Report\n", + "US47913321, Oct. 2021. [Online]. Available:\n", + "https://www.idc.com/getdoc.jsp?containerId=US47913321\n", + "\n", + "entities:\n", + " subtype name\n", + "214 reference-number 1\n", + "215 authors C. Gopal, C. L. Marshall, D. Vesset, N. Ward-D...\n", + "216 person-name C Gopal\n", + "217 person-name C L Marshall\n", + "218 person-name D Vesset\n", + "219 person-name N Ward\n", + "220 person-name J Hamel\n", + "221 person-name R Jyoti\n", + "222 person-name P Rutten\n", + "223 person-name C W Olofson\n", + "224 person-name J Rydning\n", + "225 person-name S Rau\n", + "226 person-name J Duke\n", + "227 title 'IDC FutureScape: Worldwide future of intellig...\n", + "228 journal International Data Group, Inc., Needham, MA, R...\n", + "229 person-name International Data Group\n", + "230 person-name Research Report\n", + "231 volume 47913321\n", + "232 date Oct. 2021\n", + "233 note Online]. Available:\n", + "234 url https://www.idc.com/getdoc.jsp?containerId=US4... \n", + "\n", + "\n", + "[2] D. Vile, 'The road to becoming a data driven business,' Freeform\n", + "Dynamics Ltd., New Milton, United Kingdom, Research Report US47913321,\n", + "Nov. 2020. [Online]. Available: https://www.freeformdynamics.com/wp-\n", + "content/uploads/2020/11/ 2020-The road to becoming a data driven\n", + "business.pdf\n", + "\n", + "entities:\n", + " subtype name\n", + "235 reference-number 2\n", + "236 authors D. 
Vile\n", + "237 person-name D Vile\n", + "238 title 'The road to becoming a data driven business,'\n", + "239 journal Freeform Dynamics Ltd., New Milton, United Kin...\n", + "240 person-name Freeform Dynamics Ltd\n", + "241 country United Kingdom\n", + "242 person-name United Kingdom\n", + "243 person-name Research Report\n", + "244 volume 47913321\n", + "245 date Nov. 2020\n", + "246 note Online]. Available:\n", + "247 url https://www.freeformdynamics.com/wp-content/up...\n", + "248 title data driven business.pdf \n", + "\n", + "\n", + "[3] M. Aslett and N. Patience, 'Data platforms market map 2021,' S&P\n", + "Global Market Intelligence, Tech. Rep., Sep. 2021.\n", + "\n", + "entities:\n", + " subtype name\n", + "249 reference-number 3\n", + "250 authors M. Aslett and N. Patience\n", + "251 person-name M Aslett\n", + "252 person-name N Patience\n", + "253 title 'Data platforms market map 2021,'\n", + "254 journal S&P Global Market Intelligence, Tech. Rep., Sep\n", + "255 person-name Global Market Intelligence\n", + "256 abbreviation-name Tech Rep\n", + "257 date 2021 \n", + "\n", + "\n", + "[4] G. Aggarwal. (2021, Jan.) How the pandemic has accelerated cloud\n", + "adoption. Forbes. Jersey City, NJ. [Online]. Available:\n", + "https://www.forbes.com/sites/forbestechcouncil/2021/01/15/ how-the-\n", + "pandemic-has-accelerated-cloud-adoption\n", + "\n", + "entities:\n", + " subtype name\n", + "258 reference-number 4\n", + "259 authors G. Aggarwal\n", + "260 person-name G Aggarwal\n", + "261 date 2021, Jan\n", + "262 title How the pandemic has accelerated cloud adoptio...\n", + "263 person-name Jersey City\n", + "264 note Available:\n", + "265 url https://www.forbes.com/sites/forbestechcouncil... \n", + "\n", + "\n", + "[5] 'Enterprise survey series: DevOps and the cloud,' Evans Data\n", + "Corporation, Santa Cruz, CA, Research Report, Aug. 2021. 
[Online].\n", + "Available: https://evansdata.com/reports/viewRelease.php?reportID=45\n", + "\n", + "entities:\n", + " subtype name\n", + "266 reference-number 5\n", + "267 title 'Enterprise survey series: DevOps and the cloud,'\n", + "268 journal Evans Data Corporation, Santa Cruz, CA, Resear...\n", + "269 person-name Evans Data Corporation\n", + "270 person-name Santa Cruz\n", + "271 person-name Research Report\n", + "272 date 2021\n", + "273 note Online]. Available:\n", + "274 url https://evansdata.com/reports/viewRelease.php?... \n", + "\n", + "\n", + "[6] J. Arundel and J. Domingus, Cloud Native DevOps with Kubernetes:\n", + "Building, Deploying, and Scaling Modern Applications in the Cloud.\n", + "Sebastopol, CA: O'Reilly Media, Apr. 2019.\n", + "\n", + "entities:\n", + " subtype name\n", + "275 reference-number 6\n", + "276 authors J. Arundel and J. Domingus\n", + "277 person-name J Arundel\n", + "278 person-name J Domingus\n", + "279 title Cloud Native DevOps with Kubernetes: Building,...\n", + "280 person-name Cloud Native DevOps\n", + "281 journal Reilly Media, Apr\n", + "282 date 2019 \n", + "\n", + "\n" + ] + } + ], + "source": [ + "\n", + "\n", + "refs = props[(props[\"label\"]==\"reference\") & (props[\"confidence\"]>0.8)]\n", + "\n", + "cnt = 0\n", + "for i,ref in refs.iterrows():\n", + " #print(ref)\n", + "\n", + " item = resolve(ref[\"subj_path\"].split(\"/\"), res)\n", + " print(\"\\n\".join(textwrap.wrap(item[\"text\"], 70)))\n", + "\n", + " ents = insts[insts[\"subj_hash\"]==item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n", + " print(\"\\nentities:\\n\", ents, \"\\n\\n\")\n", + "\n", + " \n", + " cnt+=1\n", + " if cnt>5:\n", + " break\n" + ] + }, + { + "cell_type": "markdown", + "id": "cfeca54d-bbc1-4022-851d-0b29027de761", + "metadata": {}, + "source": [ + "## Extract Metadata from ingested documents" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8bb459a8-7b26-4dc3-98da-b1b4a1b59fcc", + "metadata": {}, + "outputs": [], +
"source": [ + "# Import standard dependencies\n", + "from copy import deepcopy\n", + "import pandas as pd\n", + "from numerize.numerize import numerize\n", + "from tqdm.notebook import tqdm\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# IPython utilities\n", + "from IPython.display import display, HTML\n", + "\n", + "# Import the deepsearch-toolkit\n", + "import deepsearch as ds\n", + "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n", + "from deepsearch.cps.queries import DataQuery" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d064166b-7578-437c-b3a6-b16eb3d95c1f", + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch list of all data collections\n", + "collections = api.elastic.list()\n", + "collections.sort(key=lambda c: c.name.lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "db9da464-23db-4562-a1ce-259a717f404a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameTypeNum entriesDateCoords
0AAAIDocument16.02K2023-08-29default/aaai
1ACL AnthologyDocument55.28K2023-08-22default/acl
2Annual ReportsDocument107.38K2024-01-12default/annual-report
3arXiv abstractsDocument2.37M2023-12-07default/arxiv-abstract
4arXiv category taxonomyRecord1552023-12-05default/arxiv-category
5arXiv full documentsDocument2.29M2023-10-29default/arxiv
6BioRxivDocument357.76K2023-11-09default/biorxiv
7BrendaRecord7.12K2023-01-03default/brenda
8ChEMBLRecord2.11M2023-01-03default/chembl
9ChemRxivDocument8.82K2023-11-23default/chemrxiv
10ClinicalTrialsDocument426.42K2023-01-03default/clinical-trials
11CODRecord503.78K2023-07-24default/cod
12Cord19Document655.45K2022-11-17default/cord19
13CrossrefDocument131.86M2023-02-22default/crossref
14Crossref journal listReference100.52K2022-02-22default/crossref-journal
15D&B HooversRecord10K2021-04-16default/swot-report
16DeepSearch materialsRecord360.54K2023-01-03default/ds4sd-material
17DOABDocument8.8K2023-12-04default/doab
18DrugBankRecord4.44K2022-11-03default/drugbank
19engrXivDocument1.84K2023-01-03default/engrxiv
20ESG ReportsDocument17.36K2024-01-08default/esg-report
21FDA Adverse Event Reporting System (FAERS)Document435.62K2023-01-03default/faers
22GenBankRecord260.36M2023-01-24default/genbank
23HBCP Open Access CorpusDocument902023-01-03default/hbcp
24IBM RedbooksDocument2.75K2023-06-08default/ibm-redbooks
25IEEEDocument61.95K2024-01-16default/ieee
26International Patent Classification (IPC)Reference78.52K2022-02-22default/wipo-ipc
27IPCCDocument8192023-06-14default/ipcc
28Legal Entity IdentifierRecord2.1M2023-08-16default/lei
29Material ComponentsDocument16.32K2023-01-30default/experiment
30MedRxivDocument69.18K2023-11-02default/medrxiv
31NeurIPSDocument16.9K2023-09-24default/neurips
32NewsDocument9.82M2023-09-10default/news
33NMRShiftRecord44.33K2023-01-03default/nmrshift
34OpenCVFDocument26.94K2023-10-04default/opencvf
35OpenStaxDocument762024-02-01default/openstax
36OpenStreetMapGeneric296.31M2023-03-12default/osm
37PatCIDRecord13.03M2023-09-15default/patcid
38Patent SMILESDocument2.84M2023-10-11default/patent-smiles
39Patents from CNIPRDocument22022-12-19default/patent-cnipr
40Patents from EPODocument7.09M2023-07-06default/patent-epo
41Patents from JPODocument2.54M2024-01-08default/patent-jpo
42Patents from KIPODocument1.8M2022-12-19default/patent-kipo
43Patents from USPTODocument16.16M2024-02-09default/patent-uspto
44Patents from USPTO (TEST)Document6.81K2024-03-13default/patent-uspto-test
45PLOSDocument340.28K2024-01-10default/plos
46PubChemRecord118.24M2023-07-06default/pubchem
47PubMed CentralDocument5.08M2023-03-01default/pubmed
48PubMed Central (PDF)Document27.66K2024-01-22default/pmc-pdf
49Red HatDocument7.17K2024-01-23default/redhat
50RxNormRecord374.18K2023-01-03default/rxnorm
51SEC Edgar CIK LookupReference786K2022-02-22default/sec-cik
52SEC Edgar filingsDocument56.38K2021-07-06default/sec-filing
53Semantic Scholar Academic GraphDocument216.85M2024-03-11default/semantic-scholar
54SMILES from USPTORecord116.48M2022-12-25default/patent-uspto-smiles
55SMILES from USPTO (fingerprints)Record85.81M2023-02-23default/patent-uspto-smiles-fp
56UMLSRecord2.69M2023-01-03default/umls
57UniProtRecord567.48K2023-01-03default/uniprot
58USPTO patents for NERDocument2.64K2023-03-20default/uspto-for-ner
59WikipediaDocument6.45M2024-02-26default/wikipedia
\n", + "
" + ], + "text/plain": [ + " Name Type Num entries \\\n", + "0 AAAI Document 16.02K \n", + "1 ACL Anthology Document 55.28K \n", + "2 Annual Reports Document 107.38K \n", + "3 arXiv abstracts Document 2.37M \n", + "4 arXiv category taxonomy Record 155 \n", + "5 arXiv full documents Document 2.29M \n", + "6 BioRxiv Document 357.76K \n", + "7 Brenda Record 7.12K \n", + "8 ChEMBL Record 2.11M \n", + "9 ChemRxiv Document 8.82K \n", + "10 ClinicalTrials Document 426.42K \n", + "11 COD Record 503.78K \n", + "12 Cord19 Document 655.45K \n", + "13 Crossref Document 131.86M \n", + "14 Crossref journal list Reference 100.52K \n", + "15 D&B Hoovers Record 10K \n", + "16 DeepSearch materials Record 360.54K \n", + "17 DOAB Document 8.8K \n", + "18 DrugBank Record 4.44K \n", + "19 engrXiv Document 1.84K \n", + "20 ESG Reports Document 17.36K \n", + "21 FDA Adverse Event Reporting System (FAERS) Document 435.62K \n", + "22 GenBank Record 260.36M \n", + "23 HBCP Open Access Corpus Document 90 \n", + "24 IBM Redbooks Document 2.75K \n", + "25 IEEE Document 61.95K \n", + "26 International Patent Classification (IPC) Reference 78.52K \n", + "27 IPCC Document 819 \n", + "28 Legal Entity Identifier Record 2.1M \n", + "29 Material Components Document 16.32K \n", + "30 MedRxiv Document 69.18K \n", + "31 NeurIPS Document 16.9K \n", + "32 News Document 9.82M \n", + "33 NMRShift Record 44.33K \n", + "34 OpenCVF Document 26.94K \n", + "35 OpenStax Document 76 \n", + "36 OpenStreetMap Generic 296.31M \n", + "37 PatCID Record 13.03M \n", + "38 Patent SMILES Document 2.84M \n", + "39 Patents from CNIPR Document 2 \n", + "40 Patents from EPO Document 7.09M \n", + "41 Patents from JPO Document 2.54M \n", + "42 Patents from KIPO Document 1.8M \n", + "43 Patents from USPTO Document 16.16M \n", + "44 Patents from USPTO (TEST) Document 6.81K \n", + "45 PLOS Document 340.28K \n", + "46 PubChem Record 118.24M \n", + "47 PubMed Central Document 5.08M \n", + "48 PubMed Central (PDF) Document 27.66K \n", 
+ "49 Red Hat Document 7.17K \n", + "50 RxNorm Record 374.18K \n", + "51 SEC Edgar CIK Lookup Reference 786K \n", + "52 SEC Edgar filings Document 56.38K \n", + "53 Semantic Scholar Academic Graph Document 216.85M \n", + "54 SMILES from USPTO Record 116.48M \n", + "55 SMILES from USPTO (fingerprints) Record 85.81M \n", + "56 UMLS Record 2.69M \n", + "57 UniProt Record 567.48K \n", + "58 USPTO patents for NER Document 2.64K \n", + "59 Wikipedia Document 6.45M \n", + "\n", + " Date Coords \n", + "0 2023-08-29 default/aaai \n", + "1 2023-08-22 default/acl \n", + "2 2024-01-12 default/annual-report \n", + "3 2023-12-07 default/arxiv-abstract \n", + "4 2023-12-05 default/arxiv-category \n", + "5 2023-10-29 default/arxiv \n", + "6 2023-11-09 default/biorxiv \n", + "7 2023-01-03 default/brenda \n", + "8 2023-01-03 default/chembl \n", + "9 2023-11-23 default/chemrxiv \n", + "10 2023-01-03 default/clinical-trials \n", + "11 2023-07-24 default/cod \n", + "12 2022-11-17 default/cord19 \n", + "13 2023-02-22 default/crossref \n", + "14 2022-02-22 default/crossref-journal \n", + "15 2021-04-16 default/swot-report \n", + "16 2023-01-03 default/ds4sd-material \n", + "17 2023-12-04 default/doab \n", + "18 2022-11-03 default/drugbank \n", + "19 2023-01-03 default/engrxiv \n", + "20 2024-01-08 default/esg-report \n", + "21 2023-01-03 default/faers \n", + "22 2023-01-24 default/genbank \n", + "23 2023-01-03 default/hbcp \n", + "24 2023-06-08 default/ibm-redbooks \n", + "25 2024-01-16 default/ieee \n", + "26 2022-02-22 default/wipo-ipc \n", + "27 2023-06-14 default/ipcc \n", + "28 2023-08-16 default/lei \n", + "29 2023-01-30 default/experiment \n", + "30 2023-11-02 default/medrxiv \n", + "31 2023-09-24 default/neurips \n", + "32 2023-09-10 default/news \n", + "33 2023-01-03 default/nmrshift \n", + "34 2023-10-04 default/opencvf \n", + "35 2024-02-01 default/openstax \n", + "36 2023-03-12 default/osm \n", + "37 2023-09-15 default/patcid \n", + "38 2023-10-11 default/patent-smiles \n", + 
"39 2022-12-19 default/patent-cnipr \n", + "40 2023-07-06 default/patent-epo \n", + "41 2024-01-08 default/patent-jpo \n", + "42 2022-12-19 default/patent-kipo \n", + "43 2024-02-09 default/patent-uspto \n", + "44 2024-03-13 default/patent-uspto-test \n", + "45 2024-01-10 default/plos \n", + "46 2023-07-06 default/pubchem \n", + "47 2023-03-01 default/pubmed \n", + "48 2024-01-22 default/pmc-pdf \n", + "49 2024-01-23 default/redhat \n", + "50 2023-01-03 default/rxnorm \n", + "51 2022-02-22 default/sec-cik \n", + "52 2021-07-06 default/sec-filing \n", + "53 2024-03-11 default/semantic-scholar \n", + "54 2022-12-25 default/patent-uspto-smiles \n", + "55 2023-02-23 default/patent-uspto-smiles-fp \n", + "56 2023-01-03 default/umls \n", + "57 2023-01-03 default/uniprot \n", + "58 2023-03-20 default/uspto-for-ner \n", + "59 2024-02-26 default/wikipedia " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Visualize summary table\n", + "results = [\n", + " {\n", + " \"Name\": c.name,\n", + " \"Type\": c.metadata.type,\n", + " \"Num entries\": numerize(c.documents),\n", + " \"Date\": c.metadata.created.strftime(\"%Y-%m-%d\"),\n", + " \"Coords\": f\"{c.source.elastic_id}/{c.source.index_key}\",\n", + " }\n", + " for c in collections\n", + "]\n", + "display(pd.DataFrame(results))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "357340cc-97e3-44bc-aa28-41a1be1e9a20", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cb1357fa9d50499e929a520811253c24", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/60 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namematches
0arXiv full documents165
1Semantic Scholar Academic Graph40
2OpenCVF31
3arXiv abstracts24
4ACL Anthology16
\n", + "" + ], + "text/plain": [ + " name matches\n", + "0 arXiv full documents 165\n", + "1 Semantic Scholar Academic Graph 40\n", + "2 OpenCVF 31\n", + "3 arXiv abstracts 24\n", + "4 ACL Anthology 16" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Input query\n", + "search_query = \"main-text.text:(\\\"DocLayNet\\\" OR \\\"PubLayNet\\\")\"\n", + "\n", + "# Iterate through the data collections\n", + "results = []\n", + "for c in (pbar := tqdm(collections)):\n", + " pbar.set_description(f\"Querying {c.name}\")\n", + "\n", + " # Search only on document collections\n", + " if c.metadata.type != \"Document\":\n", + " continue\n", + "\n", + " # Execute the query\n", + " query = DataQuery(search_query, source=[], limit=0, coordinates=c.source)\n", + " query_results = api.queries.run(query)\n", + " results.append({\n", + " \"name\": c.name,\n", + " \"matches\": query_results.outputs[\"data_count\"]\n", + " })\n", + "\n", + "# Sort and display results\n", + "results.sort(reverse=True, key=lambda r: r[\"matches\"])\n", + "display(pd.DataFrame(results[0:5]))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "90f84882-1c85-4b0a-b0eb-ea5bf0b41e32", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "21d681aa843745bca7374749f46e23e7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/33 [00:00=3.8" files = [ - {file = "cibuildwheel-2.16.5-py3-none-any.whl", hash = "sha256:e9a0b743a57cf2a2861e1e580765fd70237689aeda6c79db4870ee688ee24d1f"}, - {file = "cibuildwheel-2.16.5.tar.gz", hash = "sha256:9fe763405afac4aef33eb8641891dda83312848ec18cd44e30daac34dfa9336d"}, + {file = "cibuildwheel-2.17.0-py3-none-any.whl", hash = "sha256:62ddd06179269b9da111bf9e97aca8ecb7b9642e1151a0bac702dd46429b52bf"}, + {file = "cibuildwheel-2.17.0.tar.gz", hash = "sha256:889510a7d974da855a8b793f8dbe718ce18189a42c2560741868e68900e02be2"}, ] 
[package.dependencies] @@ -985,27 +985,27 @@ files = [ [[package]] name = "deepsearch-glm" -version = "0.12.2" +version = "0.17.2" description = "Graph Language Models" optional = false -python-versions = ">=3.8,<4.0" +python-versions = "<4.0,>=3.8" files = [ - {file = "deepsearch_glm-0.12.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c815a9c6c6179047fde28f979783f80b62b817d2bdf7735d6e75554f518415af"}, - {file = "deepsearch_glm-0.12.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:f0f26a8be5795c750c246ed5fa9381ee0b2deeac3c0d4e37a37f5b76430cb567"}, - {file = "deepsearch_glm-0.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f39a5f2032f00a0474a38d67c3c462a2fd9af6ebb2dce00489d83c51ef740aea"}, - {file = "deepsearch_glm-0.12.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:0695723b85fb81f65e9193549940e72788eb6a9369fa3a472a82c84ed55ca1cb"}, - {file = "deepsearch_glm-0.12.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6c5c1dffea9c2118dad31762fa2c437d7f90954b9c0b744f5754dbf819be5462"}, - {file = "deepsearch_glm-0.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae926bdbec45e8ed9585a0cb9e3e124796522a18707310bee6aea44084616f12"}, - {file = "deepsearch_glm-0.12.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:4477879a8bd52f04dacfcf41a603a44d6839df46e2bf1f73abf75657e282129d"}, - {file = "deepsearch_glm-0.12.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:df3277a64208ba4c93b126f5cd4dc29ac09e81ae485364b63a1f49625b596e32"}, - {file = "deepsearch_glm-0.12.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609908f148bc3a7f975acf83469c54f9dd807821535517b9ddf5a46c27a35641"}, - {file = "deepsearch_glm-0.12.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:f518e5a11773fd9a8cf8c299237663bfd7533fe2d52bc6bc729862415955ab99"}, - {file = "deepsearch_glm-0.12.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = 
"sha256:bf36f7c40d880bdfc4b5548c32ef4bc244f1d0a858992b0dc1c56c4962e8381f"}, - {file = "deepsearch_glm-0.12.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79e0ff50ee432e7f0a3ab8a95ba68f58b8867b5361c8489f9465da3f4f73b7c0"}, + {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6234fb2fa6755ff1bb7000d21e4574eea68a29557d8f16ba179f5f5713766d9b"}, + {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:af97145ebb2f21b074ef6385c45d60a2d2553b68254c30aa66b7ddd9206b7f7b"}, + {file = "deepsearch_glm-0.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fe2669ac7f8567383e0818fe9b3b73979978fb5e65f36db3b7626bf3af6206d"}, + {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:95cbe8e8264c675a128f520f33afa3fd34295c64b00d282c015fe13c7cc2bf3b"}, + {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b9203be399dd4f026769998cca25a5691ff79791ead2dfa05385af8467f4bd8"}, + {file = "deepsearch_glm-0.17.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f512fede5fd062ff005f51073ef5660e2e963e0013251176b69bd7ab9e45faa"}, + {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:be15a98b0cbf36e5141e5dd8e22ba29b0e0d92604fc58e53e8fa6c837b29a40f"}, + {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e3ddcaf73dd5578786db3333c238790a45d0fa0af4b1df9a41a4b9dd234c2401"}, + {file = "deepsearch_glm-0.17.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e7b819d58df057bd1826fea8cd3e6d0ed4cac3fe819795ee5205360fa77fee"}, + {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0f8405524b000669b82098b1989e8c4ef4da0f93407477c1d807533d9f427867"}, + {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:da459f79913b0f967f5802766b2b964bc997d7f5259663901d84c2780961dfa8"}, + {file 
= "deepsearch_glm-0.17.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50757b64a607104882a683b2a570f96f450faaf0f1047125df043d01406f2f16"}, ] [package.dependencies] -cibuildwheel = ">=2.16.5,<3.0.0" +cibuildwheel = ">=2.17.0,<3.0.0" deepsearch-toolkit = ">=0.31.0" matplotlib = ">=3.7.1,<4.0.0" networkx = ">=3.1,<4.0" @@ -3492,6 +3492,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -5179,4 +5180,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">= 3.8, <3.11" -content-hash = "af9c6fc06f4f0ef24bfb94fbae7f8d740af54d14c1edca64ffef6f615712a3f2" +content-hash = "928e878a81836c1528c27876675534238f9bc5c350965694061acb31ef559c3b" diff --git a/pyproject.toml b/pyproject.toml index 59cac7c..0b53572 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ python-dotenv = "^1.0.0" nbclient = "^0.9.0" pandas = "^1.5.1" argilla = "^1.24.0" -deepsearch-glm = ">=0.12.2" 
+deepsearch-glm = "v0.17.2" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^22.1.0"} From 14d6d456ceb2d9ecb74e4f7d17a264fddb4ff75e Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 8 May 2024 08:55:07 +0200 Subject: [PATCH 2/3] updated the metadata notebook for private document collections Signed-off-by: Peter Staar --- .../nlp_for_metadata/nlp_for_metadata.ipynb | 5553 ++++++++++------- poetry.lock | 28 +- pyproject.toml | 2 +- 3 files changed, 3443 insertions(+), 2140 deletions(-) diff --git a/examples/nlp_for_metadata/nlp_for_metadata.ipynb b/examples/nlp_for_metadata/nlp_for_metadata.ipynb index 69f039d..52e9903 100644 --- a/examples/nlp_for_metadata/nlp_for_metadata.ipynb +++ b/examples/nlp_for_metadata/nlp_for_metadata.ipynb @@ -79,6 +79,7 @@ " -> already downloaded language\n", " -> already downloaded name\n", " -> already downloaded semantic\n", + " -> already downloaded metadata\n", " -> already downloaded geoloc\n" ] } @@ -109,7 +110,7 @@ "\n", "from tabulate import tabulate\n", "\n", - "models = load_pretrained_nlp_models(verbose=True)" + "models = load_pretrained_nlp_models(force=False, verbose=True)" ] }, { @@ -122,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "f44fbf08", "metadata": {}, "outputs": [], @@ -153,9 +154,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processing input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 78.98it/s]\u001b[38;2;15;98;254m \u001b[0m\n", - "Submitting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:03<00:00, 3.27s/it]\u001b[38;2;15;98;254m \u001b[0m\n", - "Converting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:27<00:00, 27.58s/it]\u001b[38;2;15;98;254m \u001b[0m\n" + "Processing input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 124.39it/s]\u001b[38;2;15;98;254m 
\u001b[0m\n", + "Submitting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:06<00:00, 6.66s/it]\u001b[38;2;15;98;254m \u001b[0m\n", + "Converting input: : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:26<00:00, 26.56s/it]\u001b[38;2;15;98;254m \u001b[0m\n" ] }, { @@ -226,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 6, "id": "b19f7678-b650-484b-a994-150d0c4ec3a2", "metadata": {}, "outputs": [], @@ -245,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "710cc200-e2ba-46f3-9ca0-efd2baab7ee1", "metadata": {}, "outputs": [], @@ -276,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "ed3612b4-bbd2-42d0-ba2d-f8f994565380", "metadata": {}, "outputs": [], @@ -295,7 +296,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "6d98745c-e0f3-41d2-8261-b7953d835dec", "metadata": {}, "outputs": [ @@ -303,21 +304,78 @@ "name": "stdout", "output_type": "stream", "text": [ - " subtype subj_path name\n", - "0 abstract # Abstract-Document understanding is a key busin...\n", - "1 title #/texts/1 Delivering Document Conversion as a Cloud Serv...\n", - "2 author #/texts/1 Christoph Auer\n", - "3 author #/texts/1 Research Ruschlikon\n", - "4 author #/texts/2 Research Ruschlikon\n", - "5 author #/texts/3 Michele Dolfi\n", - "6 author #/texts/3 Research Ruschlikon\n", - "7 author #/texts/4 J Staar\n", - "8 author #/texts/4 Research Ruschlikon\n" + "TITLE\n", + "Delivering Document Conversion as a Cloud Service with High Throughput and Responsiveness\n", + "ABSTRACT\n", + "['Abstract-Document understanding is a key business process in the data-driven economy since documents are central to knowledge discovery and business insights. 
Converting documents into a machine-processable format is a particular challenge here due to their huge variability in formats and complex structure. Accordingly, many algorithms and machine-learning methods emerged to solve particular tasks such as Optical Character Recognition (OCR), layout analysis, table-structure recovery, figure understanding, etc. We observe the adoption of such methods in document understanding solutions offered by all major cloud providers. Yet, publications outlining how such services are designed and optimized to scale in the cloud are scarce. In this paper, we focus on the case of document conversion to illustrate the particular challenges of scaling a complex data processing pipeline with a strong reliance on machine-learning methods on cloud infrastructure. Our key objective is to achieve high scalability and responsiveness for different workload profiles in a well-defined resource budget. We outline the requirements, design, and implementation choices of our document conversion service and reflect on the challenges we faced. Evidence for the scaling behavior and resource efficiency is provided for two alternative workload distribution strategies and deployment configurations. 
Our best-performing method achieves sustained throughput of over one million PDF pages per hour on 3072 CPU cores across 192 nodes.', 'Index Terms-cloud applications, document understanding, distributed computing, artificial intelligence']\n", + " type subj_hash subj_name subj_path label \\\n", + "5 semantic 8967552455475999131 TEXT #/texts/0 header \n", + "7 semantic 384749972256050104 TEXT #/texts/1 meta-data \n", + "9 semantic 15891517341344374830 TEXT #/texts/2 meta-data \n", + "11 semantic 10276496618786154295 TEXT #/texts/3 meta-data \n", + "13 semantic 5624406992563222356 TEXT #/texts/4 meta-data \n", + "15 semantic 15035726207261556942 TEXT #/texts/5 text \n", + "17 semantic 4662798960261328447 TEXT #/texts/6 text \n", + "19 semantic 15072469540570473164 TEXT #/texts/7 header \n", + "21 semantic 8600142426167835349 TEXT #/texts/8 text \n", + "23 semantic 3072624984713661043 TEXT #/texts/9 text \n", + "25 semantic 14339411138813898476 TEXT #/texts/10 text \n", + "27 semantic 17407436599861342415 TEXT #/texts/11 meta-data \n", + "29 semantic 4004878754391976765 TEXT #/texts/12 text \n", + "31 semantic 15578236054977031520 TEXT #/texts/13 text \n", + "33 semantic 1317828445053500670 TEXT #/texts/14 text \n", + "35 semantic 3501395332085509922 TEXT #/texts/15 text \n", + "37 semantic 14716706603701707953 TEXT #/texts/16 text \n", + "39 semantic 2277014394919988861 TEXT #/texts/17 text \n", + "41 semantic 18364912209191405749 TEXT #/texts/18 header \n", + "43 semantic 487083125877341825 TEXT #/texts/19 text \n", + "\n", + " confidence \n", + "5 0.73 \n", + "7 1.00 \n", + "9 0.99 \n", + "11 0.99 \n", + "13 0.99 \n", + "15 0.99 \n", + "17 0.93 \n", + "19 1.00 \n", + "21 0.99 \n", + "23 0.99 \n", + "25 1.00 \n", + "27 0.99 \n", + "29 1.00 \n", + "31 1.00 \n", + "33 0.97 \n", + "35 0.95 \n", + "37 1.00 \n", + "39 0.95 \n", + "41 0.97 \n", + "43 1.00 \n", + " type subj_hash subj_name subj_path label confidence\n", + "1 metadata 8967552455475999131 DOCUMENT #/texts/0 
title 1.0\n", + "2 metadata 15035726207261556942 DOCUMENT #/texts/5 abstract 1.0\n", + "3 metadata 4662798960261328447 DOCUMENT #/texts/6 abstract 1.0\n", + " subtype subj_path name\n", + "0 author #/texts/1 Christoph Auer\n", + "1 author #/texts/3 Michele Dolfi\n", + "2 author #/texts/4 J Staar\n" ] } ], "source": [ - "#print(res[\"instances\"][\"headers\"])\n", + "if \"title\" in res[\"description\"]:\n", + " print(\"TITLE\")\n", + " print(res[\"description\"][\"title\"])\n", + "\n", + "if \"abstract\" in res[\"description\"]:\n", + " print(\"ABSTRACT\")\n", + " print(res[\"description\"][\"abstract\"])\n", + "\n", + "doc_props = props[props[\"type\"]==\"semantic\"]\n", + "print(doc_props[0:20])\n", + "\n", + "doc_props = props[props[\"type\"]==\"metadata\"]\n", + "print(doc_props)\n", "\n", "doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n", "print(doc_insts)" @@ -325,171 +383,23 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "01771757-70c3-44cb-824c-1fd9b716a99f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R.\n", - "Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, 'IDC\n", - "FutureScape: Worldwide future of intelligence 2022 predictions,'\n", - "International Data Group, Inc., Needham, MA, Research Report\n", - "US47913321, Oct. 2021. [Online]. Available:\n", - "https://www.idc.com/getdoc.jsp?containerId=US47913321\n", - "\n", - "entities:\n", - " subtype name\n", - "214 reference-number 1\n", - "215 authors C. Gopal, C. L. Marshall, D. Vesset, N. 
Ward-D...\n", - "216 person-name C Gopal\n", - "217 person-name C L Marshall\n", - "218 person-name D Vesset\n", - "219 person-name N Ward\n", - "220 person-name J Hamel\n", - "221 person-name R Jyoti\n", - "222 person-name P Rutten\n", - "223 person-name C W Olofson\n", - "224 person-name J Rydning\n", - "225 person-name S Rau\n", - "226 person-name J Duke\n", - "227 title 'IDC FutureScape: Worldwide future of intellig...\n", - "228 journal International Data Group, Inc., Needham, MA, R...\n", - "229 person-name International Data Group\n", - "230 person-name Research Report\n", - "231 volume 47913321\n", - "232 date Oct. 2021\n", - "233 note Online]. Available:\n", - "234 url https://www.idc.com/getdoc.jsp?containerId=US4... \n", - "\n", - "\n", - "[2] D. Vile, 'The road to becoming a data driven business,' Freeform\n", - "Dynamics Ltd., New Milton, United Kingdom, Research Report US47913321,\n", - "Nov. 2020. [Online]. Available: https://www.freeformdynamics.com/wp-\n", - "content/uploads/2020/11/ 2020-The road to becoming a data driven\n", - "business.pdf\n", - "\n", - "entities:\n", - " subtype name\n", - "235 reference-number 2\n", - "236 authors D. Vile\n", - "237 person-name D Vile\n", - "238 title 'The road to becoming a data driven business,'\n", - "239 journal Freeform Dynamics Ltd., New Milton, United Kin...\n", - "240 person-name Freeform Dynamics Ltd\n", - "241 country United Kingdom\n", - "242 person-name United Kingdom\n", - "243 person-name Research Report\n", - "244 volume 47913321\n", - "245 date Nov. 2020\n", - "246 note Online]. Available:\n", - "247 url https://www.freeformdynamics.com/wp-content/up...\n", - "248 title data driven business.pdf \n", - "\n", - "\n", - "[3] M. Aslett and N. Patience, 'Data platforms market map 2021,' S&P\n", - "Global Market Intelligence, Tech. Rep., Sep. 2021.\n", - "\n", - "entities:\n", - " subtype name\n", - "249 reference-number 3\n", - "250 authors M. Aslett and N. 
Patience\n", - "251 person-name M Aslett\n", - "252 person-name N Patience\n", - "253 title 'Data platforms market map 2021,'\n", - "254 journal S&P Global Market Intelligence, Tech. Rep., Sep\n", - "255 person-name Global Market Intelligence\n", - "256 abbreviation-name Tech Rep\n", - "257 date 2021 \n", - "\n", - "\n", - "[4] G. Aggarwal. (2021, Jan.) How the pandemic has accelerated cloud\n", - "adoption. Forbes. Jersey City, NJ. [Online]. Available:\n", - "https://www.forbes.com/sites/forbestechcouncil/2021/01/15/ how-the-\n", - "pandemic-has-accelerated-cloud-adoption\n", - "\n", - "entities:\n", - " subtype name\n", - "258 reference-number 4\n", - "259 authors G. Aggarwal\n", - "260 person-name G Aggarwal\n", - "261 date 2021, Jan\n", - "262 title How the pandemic has accelerated cloud adoptio...\n", - "263 person-name Jersey City\n", - "264 note Available:\n", - "265 url https://www.forbes.com/sites/forbestechcouncil... \n", - "\n", - "\n", - "[5] 'Enterprise survey series: DevOps and the cloud,' Evans Data\n", - "Corporation, Santa Cruz, CA, Research Report, Aug. 2021. [Online].\n", - "Available: https://evansdata.com/reports/viewRelease.php?reportID=45\n", - "\n", - "entities:\n", - " subtype name\n", - "266 reference-number 5\n", - "267 title 'Enterprise survey series: DevOps and the cloud,'\n", - "268 journal Evans Data Corporation, Santa Cruz, CA, Resear...\n", - "269 person-name Evans Data Corporation\n", - "270 person-name Santa Cruz\n", - "271 person-name Research Report\n", - "272 date 2021\n", - "273 note Online]. Available:\n", - "274 url https://evansdata.com/reports/viewRelease.php?... \n", - "\n", - "\n", - "[6] J. Arundel and J. Domingus, Cloud Native DevOps with Kubernetes:\n", - "Building, Deploying, and Scaling Modern Applications in the Cloud.\n", - "Sebastopol, CA: O'Reilly Media, Apr. 2019.\n", - "\n", - "entities:\n", - " subtype name\n", - "275 reference-number 6\n", - "276 authors J. Arundel and J. 
Domingus\n", - "277 person-name J Arundel\n", - "278 person-name J Domingus\n", - "279 title Cloud Native DevOps with Kubernetes: Building,...\n", - "280 person-name Cloud Native DevOps\n", - "281 journal Reilly Media, Apr\n", - "282 date 2019 \n", - "\n", - "\n" - ] - } - ], - "source": [ - "\n", - "\n", - "refs = props[(props[\"label\"]==\"reference\") & (props[\"confidence\"]>0.8)]\n", - "\n", - "cnt = 0\n", - "for i,ref in refs.iterrows():\n", - " #print(ref)\n", - "\n", - " item = resolve(ref[\"subj_path\"].split(\"/\"), res)\n", - " print(\"\\n\".join(textwrap.wrap(item[\"text\"], 70)))\n", - "\n", - " ents = insts[insts[\"subj_hash\"]==item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n", - " print(\"\\nentities:\\n\", ents, \"\\n\\n\")\n", - "\n", - " \n", - " cnt+=1\n", - " if cnt>5:\n", - " break\n" - ] + "outputs": [], + "source": [] }, { "cell_type": "markdown", "id": "cfeca54d-bbc1-4022-851d-0b29027de761", "metadata": {}, "source": [ - "## Extract Matedata from ingested documents" + "## Extract MetaData from public documents" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "id": "8bb459a8-7b26-4dc3-98da-b1b4a1b59fcc", "metadata": {}, "outputs": [], @@ -513,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "id": "d064166b-7578-437c-b3a6-b16eb3d95c1f", "metadata": {}, "outputs": [], @@ -525,7 +435,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "id": "db9da464-23db-4562-a1ce-259a717f404a", "metadata": {}, "outputs": [ @@ -579,7 +489,7 @@ " Annual Reports\n", " Document\n", " 107.38K\n", - " 2024-01-12\n", + " 2024-04-15\n", " default/annual-report\n", " \n", " \n", @@ -599,412 +509,12 @@ " default/arxiv-category\n", " \n", " \n", - " 5\n", - " arXiv full documents\n", - " Document\n", - " 2.29M\n", - " 2023-10-29\n", - " default/arxiv\n", - " \n", - " \n", - " 6\n", - " BioRxiv\n", - " Document\n", - " 357.76K\n", - " 2023-11-09\n", - " default/biorxiv\n", - " 
\n", - " \n", - " 7\n", - " Brenda\n", - " Record\n", - " 7.12K\n", - " 2023-01-03\n", - " default/brenda\n", - " \n", - " \n", - " 8\n", - " ChEMBL\n", - " Record\n", - " 2.11M\n", - " 2023-01-03\n", - " default/chembl\n", - " \n", - " \n", - " 9\n", - " ChemRxiv\n", - " Document\n", - " 8.82K\n", - " 2023-11-23\n", - " default/chemrxiv\n", - " \n", - " \n", - " 10\n", - " ClinicalTrials\n", - " Document\n", - " 426.42K\n", - " 2023-01-03\n", - " default/clinical-trials\n", - " \n", - " \n", - " 11\n", - " COD\n", - " Record\n", - " 503.78K\n", - " 2023-07-24\n", - " default/cod\n", - " \n", - " \n", - " 12\n", - " Cord19\n", - " Document\n", - " 655.45K\n", - " 2022-11-17\n", - " default/cord19\n", - " \n", - " \n", - " 13\n", - " Crossref\n", - " Document\n", - " 131.86M\n", - " 2023-02-22\n", - " default/crossref\n", - " \n", - " \n", - " 14\n", - " Crossref journal list\n", - " Reference\n", - " 100.52K\n", - " 2022-02-22\n", - " default/crossref-journal\n", - " \n", - " \n", - " 15\n", - " D&B Hoovers\n", - " Record\n", - " 10K\n", - " 2021-04-16\n", - " default/swot-report\n", - " \n", - " \n", - " 16\n", - " DeepSearch materials\n", - " Record\n", - " 360.54K\n", - " 2023-01-03\n", - " default/ds4sd-material\n", - " \n", - " \n", - " 17\n", - " DOAB\n", - " Document\n", - " 8.8K\n", - " 2023-12-04\n", - " default/doab\n", - " \n", - " \n", - " 18\n", - " DrugBank\n", - " Record\n", - " 4.44K\n", - " 2022-11-03\n", - " default/drugbank\n", - " \n", - " \n", - " 19\n", - " engrXiv\n", - " Document\n", - " 1.84K\n", - " 2023-01-03\n", - " default/engrxiv\n", - " \n", - " \n", - " 20\n", - " ESG Reports\n", - " Document\n", - " 17.36K\n", - " 2024-01-08\n", - " default/esg-report\n", - " \n", - " \n", - " 21\n", - " FDA Adverse Event Reporting System (FAERS)\n", - " Document\n", - " 435.62K\n", - " 2023-01-03\n", - " default/faers\n", - " \n", - " \n", - " 22\n", - " GenBank\n", - " Record\n", - " 260.36M\n", - " 2023-01-24\n", - " default/genbank\n", - " \n", 
- " \n", - " 23\n", - " HBCP Open Access Corpus\n", - " Document\n", - " 90\n", - " 2023-01-03\n", - " default/hbcp\n", - " \n", - " \n", - " 24\n", - " IBM Redbooks\n", - " Document\n", - " 2.75K\n", - " 2023-06-08\n", - " default/ibm-redbooks\n", - " \n", - " \n", - " 25\n", - " IEEE\n", - " Document\n", - " 61.95K\n", - " 2024-01-16\n", - " default/ieee\n", - " \n", - " \n", - " 26\n", - " International Patent Classification (IPC)\n", - " Reference\n", - " 78.52K\n", - " 2022-02-22\n", - " default/wipo-ipc\n", - " \n", - " \n", - " 27\n", - " IPCC\n", - " Document\n", - " 819\n", - " 2023-06-14\n", - " default/ipcc\n", - " \n", - " \n", - " 28\n", - " Legal Entity Identifier\n", - " Record\n", - " 2.1M\n", - " 2023-08-16\n", - " default/lei\n", - " \n", - " \n", - " 29\n", - " Material Components\n", - " Document\n", - " 16.32K\n", - " 2023-01-30\n", - " default/experiment\n", - " \n", - " \n", - " 30\n", - " MedRxiv\n", - " Document\n", - " 69.18K\n", - " 2023-11-02\n", - " default/medrxiv\n", - " \n", - " \n", - " 31\n", - " NeurIPS\n", - " Document\n", - " 16.9K\n", - " 2023-09-24\n", - " default/neurips\n", - " \n", - " \n", - " 32\n", - " News\n", - " Document\n", - " 9.82M\n", - " 2023-09-10\n", - " default/news\n", - " \n", - " \n", - " 33\n", - " NMRShift\n", - " Record\n", - " 44.33K\n", - " 2023-01-03\n", - " default/nmrshift\n", - " \n", - " \n", - " 34\n", - " OpenCVF\n", - " Document\n", - " 26.94K\n", - " 2023-10-04\n", - " default/opencvf\n", - " \n", - " \n", - " 35\n", - " OpenStax\n", - " Document\n", - " 76\n", - " 2024-02-01\n", - " default/openstax\n", - " \n", - " \n", - " 36\n", - " OpenStreetMap\n", - " Generic\n", - " 296.31M\n", - " 2023-03-12\n", - " default/osm\n", - " \n", - " \n", - " 37\n", - " PatCID\n", - " Record\n", - " 13.03M\n", - " 2023-09-15\n", - " default/patcid\n", - " \n", - " \n", - " 38\n", - " Patent SMILES\n", - " Document\n", - " 2.84M\n", - " 2023-10-11\n", - " default/patent-smiles\n", - " \n", - " \n", - " 
39\n", - " Patents from CNIPR\n", - " Document\n", - " 2\n", - " 2022-12-19\n", - " default/patent-cnipr\n", - " \n", - " \n", - " 40\n", - " Patents from EPO\n", - " Document\n", - " 7.09M\n", - " 2023-07-06\n", - " default/patent-epo\n", - " \n", - " \n", - " 41\n", - " Patents from JPO\n", - " Document\n", - " 2.54M\n", - " 2024-01-08\n", - " default/patent-jpo\n", - " \n", - " \n", - " 42\n", - " Patents from KIPO\n", - " Document\n", - " 1.8M\n", - " 2022-12-19\n", - " default/patent-kipo\n", - " \n", - " \n", - " 43\n", - " Patents from USPTO\n", - " Document\n", - " 16.16M\n", - " 2024-02-09\n", - " default/patent-uspto\n", - " \n", - " \n", - " 44\n", - " Patents from USPTO (TEST)\n", - " Document\n", - " 6.81K\n", - " 2024-03-13\n", - " default/patent-uspto-test\n", - " \n", - " \n", - " 45\n", - " PLOS\n", - " Document\n", - " 340.28K\n", - " 2024-01-10\n", - " default/plos\n", - " \n", - " \n", - " 46\n", - " PubChem\n", - " Record\n", - " 118.24M\n", - " 2023-07-06\n", - " default/pubchem\n", - " \n", - " \n", - " 47\n", - " PubMed Central\n", - " Document\n", - " 5.08M\n", - " 2023-03-01\n", - " default/pubmed\n", - " \n", - " \n", - " 48\n", - " PubMed Central (PDF)\n", - " Document\n", - " 27.66K\n", - " 2024-01-22\n", - " default/pmc-pdf\n", - " \n", - " \n", - " 49\n", - " Red Hat\n", - " Document\n", - " 7.17K\n", - " 2024-01-23\n", - " default/redhat\n", - " \n", - " \n", - " 50\n", - " RxNorm\n", - " Record\n", - " 374.18K\n", - " 2023-01-03\n", - " default/rxnorm\n", - " \n", - " \n", - " 51\n", - " SEC Edgar CIK Lookup\n", - " Reference\n", - " 786K\n", - " 2022-02-22\n", - " default/sec-cik\n", - " \n", - " \n", - " 52\n", - " SEC Edgar filings\n", - " Document\n", - " 56.38K\n", - " 2021-07-06\n", - " default/sec-filing\n", - " \n", - " \n", - " 53\n", - " Semantic Scholar Academic Graph\n", - " Document\n", - " 216.85M\n", - " 2024-03-11\n", - " default/semantic-scholar\n", - " \n", - " \n", - " 54\n", - " SMILES from USPTO\n", - " 
Record\n", - " 116.48M\n", - " 2022-12-25\n", - " default/patent-uspto-smiles\n", - " \n", - " \n", - " 55\n", - " SMILES from USPTO (fingerprints)\n", - " Record\n", - " 85.81M\n", - " 2023-02-23\n", - " default/patent-uspto-smiles-fp\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 56\n", @@ -1032,6 +542,14 @@ " \n", " \n", " 59\n", + " VHDL articles\n", + " Document\n", + " 215\n", + " 2024-04-23\n", + " default/vhdl\n", + " \n", + " \n", + " 60\n", " Wikipedia\n", " Document\n", " 6.45M\n", @@ -1040,132 +558,37 @@ " \n", " \n", "\n", + "

61 rows × 5 columns

\n", "" ], "text/plain": [ - " Name Type Num entries \\\n", - "0 AAAI Document 16.02K \n", - "1 ACL Anthology Document 55.28K \n", - "2 Annual Reports Document 107.38K \n", - "3 arXiv abstracts Document 2.37M \n", - "4 arXiv category taxonomy Record 155 \n", - "5 arXiv full documents Document 2.29M \n", - "6 BioRxiv Document 357.76K \n", - "7 Brenda Record 7.12K \n", - "8 ChEMBL Record 2.11M \n", - "9 ChemRxiv Document 8.82K \n", - "10 ClinicalTrials Document 426.42K \n", - "11 COD Record 503.78K \n", - "12 Cord19 Document 655.45K \n", - "13 Crossref Document 131.86M \n", - "14 Crossref journal list Reference 100.52K \n", - "15 D&B Hoovers Record 10K \n", - "16 DeepSearch materials Record 360.54K \n", - "17 DOAB Document 8.8K \n", - "18 DrugBank Record 4.44K \n", - "19 engrXiv Document 1.84K \n", - "20 ESG Reports Document 17.36K \n", - "21 FDA Adverse Event Reporting System (FAERS) Document 435.62K \n", - "22 GenBank Record 260.36M \n", - "23 HBCP Open Access Corpus Document 90 \n", - "24 IBM Redbooks Document 2.75K \n", - "25 IEEE Document 61.95K \n", - "26 International Patent Classification (IPC) Reference 78.52K \n", - "27 IPCC Document 819 \n", - "28 Legal Entity Identifier Record 2.1M \n", - "29 Material Components Document 16.32K \n", - "30 MedRxiv Document 69.18K \n", - "31 NeurIPS Document 16.9K \n", - "32 News Document 9.82M \n", - "33 NMRShift Record 44.33K \n", - "34 OpenCVF Document 26.94K \n", - "35 OpenStax Document 76 \n", - "36 OpenStreetMap Generic 296.31M \n", - "37 PatCID Record 13.03M \n", - "38 Patent SMILES Document 2.84M \n", - "39 Patents from CNIPR Document 2 \n", - "40 Patents from EPO Document 7.09M \n", - "41 Patents from JPO Document 2.54M \n", - "42 Patents from KIPO Document 1.8M \n", - "43 Patents from USPTO Document 16.16M \n", - "44 Patents from USPTO (TEST) Document 6.81K \n", - "45 PLOS Document 340.28K \n", - "46 PubChem Record 118.24M \n", - "47 PubMed Central Document 5.08M \n", - "48 PubMed Central (PDF) Document 27.66K 
\n", - "49 Red Hat Document 7.17K \n", - "50 RxNorm Record 374.18K \n", - "51 SEC Edgar CIK Lookup Reference 786K \n", - "52 SEC Edgar filings Document 56.38K \n", - "53 Semantic Scholar Academic Graph Document 216.85M \n", - "54 SMILES from USPTO Record 116.48M \n", - "55 SMILES from USPTO (fingerprints) Record 85.81M \n", - "56 UMLS Record 2.69M \n", - "57 UniProt Record 567.48K \n", - "58 USPTO patents for NER Document 2.64K \n", - "59 Wikipedia Document 6.45M \n", + " Name Type Num entries Date \\\n", + "0 AAAI Document 16.02K 2023-08-29 \n", + "1 ACL Anthology Document 55.28K 2023-08-22 \n", + "2 Annual Reports Document 107.38K 2024-04-15 \n", + "3 arXiv abstracts Document 2.37M 2023-12-07 \n", + "4 arXiv category taxonomy Record 155 2023-12-05 \n", + ".. ... ... ... ... \n", + "56 UMLS Record 2.69M 2023-01-03 \n", + "57 UniProt Record 567.48K 2023-01-03 \n", + "58 USPTO patents for NER Document 2.64K 2023-03-20 \n", + "59 VHDL articles Document 215 2024-04-23 \n", + "60 Wikipedia Document 6.45M 2024-02-26 \n", + "\n", + " Coords \n", + "0 default/aaai \n", + "1 default/acl \n", + "2 default/annual-report \n", + "3 default/arxiv-abstract \n", + "4 default/arxiv-category \n", + ".. ... 
\n", + "56 default/umls \n", + "57 default/uniprot \n", + "58 default/uspto-for-ner \n", + "59 default/vhdl \n", + "60 default/wikipedia \n", "\n", - " Date Coords \n", - "0 2023-08-29 default/aaai \n", - "1 2023-08-22 default/acl \n", - "2 2024-01-12 default/annual-report \n", - "3 2023-12-07 default/arxiv-abstract \n", - "4 2023-12-05 default/arxiv-category \n", - "5 2023-10-29 default/arxiv \n", - "6 2023-11-09 default/biorxiv \n", - "7 2023-01-03 default/brenda \n", - "8 2023-01-03 default/chembl \n", - "9 2023-11-23 default/chemrxiv \n", - "10 2023-01-03 default/clinical-trials \n", - "11 2023-07-24 default/cod \n", - "12 2022-11-17 default/cord19 \n", - "13 2023-02-22 default/crossref \n", - "14 2022-02-22 default/crossref-journal \n", - "15 2021-04-16 default/swot-report \n", - "16 2023-01-03 default/ds4sd-material \n", - "17 2023-12-04 default/doab \n", - "18 2022-11-03 default/drugbank \n", - "19 2023-01-03 default/engrxiv \n", - "20 2024-01-08 default/esg-report \n", - "21 2023-01-03 default/faers \n", - "22 2023-01-24 default/genbank \n", - "23 2023-01-03 default/hbcp \n", - "24 2023-06-08 default/ibm-redbooks \n", - "25 2024-01-16 default/ieee \n", - "26 2022-02-22 default/wipo-ipc \n", - "27 2023-06-14 default/ipcc \n", - "28 2023-08-16 default/lei \n", - "29 2023-01-30 default/experiment \n", - "30 2023-11-02 default/medrxiv \n", - "31 2023-09-24 default/neurips \n", - "32 2023-09-10 default/news \n", - "33 2023-01-03 default/nmrshift \n", - "34 2023-10-04 default/opencvf \n", - "35 2024-02-01 default/openstax \n", - "36 2023-03-12 default/osm \n", - "37 2023-09-15 default/patcid \n", - "38 2023-10-11 default/patent-smiles \n", - "39 2022-12-19 default/patent-cnipr \n", - "40 2023-07-06 default/patent-epo \n", - "41 2024-01-08 default/patent-jpo \n", - "42 2022-12-19 default/patent-kipo \n", - "43 2024-02-09 default/patent-uspto \n", - "44 2024-03-13 default/patent-uspto-test \n", - "45 2024-01-10 default/plos \n", - "46 2023-07-06 default/pubchem 
\n", - "47 2023-03-01 default/pubmed \n", - "48 2024-01-22 default/pmc-pdf \n", - "49 2024-01-23 default/redhat \n", - "50 2023-01-03 default/rxnorm \n", - "51 2022-02-22 default/sec-cik \n", - "52 2021-07-06 default/sec-filing \n", - "53 2024-03-11 default/semantic-scholar \n", - "54 2022-12-25 default/patent-uspto-smiles \n", - "55 2023-02-23 default/patent-uspto-smiles-fp \n", - "56 2023-01-03 default/umls \n", - "57 2023-01-03 default/uniprot \n", - "58 2023-03-20 default/uspto-for-ner \n", - "59 2024-02-26 default/wikipedia " + "[61 rows x 5 columns]" ] }, "metadata": {}, @@ -1189,19 +612,19 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "id": "357340cc-97e3-44bc-aa28-41a1be1e9a20", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cb1357fa9d50499e929a520811253c24", + "model_id": "731a7106c87f46fb97ed8d94c8ce883b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - " 0%| | 0/60 [00:00 collection: \", index)\n", + " \n", + " return None\n", + "\n", + "def search_documents(api, proj_key, coll_name, query, max_docs=100, page_size=1):\n", + "\n", + " index = get_indices_in_project(api, coll_name=coll_name,\n", + " proj_key=proj_key)\n", + "\n", + " if index==None:\n", + " return\n", + "\n", + " try:\n", + " data_query = DataQuery(query, coordinates=index.source, limit=page_size) # The size of each request page)\n", + " cursor = api.queries.run_paginated_query(data_query)\n", + "\n", + " # [Optional] Compute the number of total results matched. 
This can be used to monitor the pagination progress.\n", + " count_query = deepcopy(data_query)\n", + " count_query.paginated_task.parameters[\"limit\"] = 0\n", + " count_results = api.queries.run(count_query)\n", + " expected_total = count_results.outputs[\"data_count\"]\n", + " expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n", + "\n", + " print(\"#-documents: \", expected_total)\n", + "\n", + " cur_docs = 0\n", + " for result_page in tqdm(cursor):\n", + "\n", + " if cur_docs>max_docs:\n", + " break\n", + "\n", + " for row in result_page.outputs[\"data_outputs\"]:\n", + "\n", + " #print(cur_docs, max_docs)\n", + " if cur_docs>max_docs:\n", + " break\n", + "\n", + "\n", + " yield row[\"_source\"]\n", + " cur_docs += 1\n", + "\n", + " except Exception as e:\n", + " print(\" => \", e)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2e2d12e8-00ba-49d3-bc8e-19a257eb85df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#-documents: 9\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1it [00:01, 1.12s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document-hash: 0f43aba61158df5f5a00d91434bee8dd47e9dad2a6252ab7607408e2e6057b7d\n", + "title: \n", + " Source area and tectonic provenance of Paleocene-Eocene red bed\n", + "clastics from the Kurdistan area NE Iraq: Bulk-rock geochemistry\n", + "constraints \n", + "\n", + "authors: [\n", + " {\n", + " \"name\": \"Brian G Jones\"\n", + " },\n", + " {\n", + " \"name\": \"Muatasam Mahmood Hassan\"\n", + " },\n", + " {\n", + " \"name\": \"Solomon Buckman\"\n", + " },\n", + " {\n", + " \"name\": \"Ali Ismail Al Jubory\"\n", + " },\n", + " {\n", + " \"name\": \"Sabah Ahmed Ismail\"\n", + " }\n", + "]\n", + "affiliations: [\n", + " {\n", + " \"name\": \"School of Earth and Environmental Sciences\"\n", + " },\n", + " {\n", + " \"name\": \"University 
of Wollongong\"\n", + " },\n", + " {\n", + " \"name\": \"School of Earth Science\"\n", + " },\n", + " {\n", + " \"name\": \"School of Earth Science\"\n", + " },\n", + " {\n", + " \"name\": \"University of Kirkuk\"\n", + " }\n", + "]\n", + "abstract: \n", + "\n", + "abstract \n", + "\n", + "Paleocene-Eocene Red Beds exist along a narrow belt in the NW-SE\n", + "oriented imbricate zone in northeastern Iraq and are composed of\n", + "clastic rocks including conglomerate, sandstone and mudstone. \n", + "\n", + "Trace elements show that the lower part of the Red Beds (unit one) was\n", + "derived mainly from mafic and ultramafic rocks. A decrease in mafic\n", + "and ultramafic components in the upper part of the Red Beds is\n", + "accompanied by an increase in felsic components indicating the\n", + "exposure of both felsic and intermediate igneous bodies in the source\n", + "areas. \n", + "\n", + "Trace elements normalized to upper continental crust confirmed the\n", + "mafic and ultramafic source for the lower part of the Red Beds. Unit\n", + "two and the overlying unit four reflect a style showing felsic and\n", + "mafic trends with transition elements being depleted in these parts.\n", + "The intervening unit three shows various patterns partly similar to\n", + "units one and two depending on clast abundance. \n", + "\n", + "The concentrations of rare earth elements in the mudstone reaches up\n", + "to 60% of the main chemical elements, therefore it is useful to\n", + "concentrate on this facies for geochemical studies. 
\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2it [00:01, 1.18it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document-hash: 44cd3953cb824628f2d7fe8976afc9beb2ed07c26ae83f0c79ca357af85af9d4\n", + "title: \n", + " Facies analysis and diagenetic features of the Aptian Dariyan\n", + "Formation in Zagros Fold-Thrust Belt, SW Iran \n", + "\n", + "authors: [\n", + " {\n", + " \"name\": \"Arash Shaabanpour Haghighi\"\n", + " },\n", + " {\n", + " \"name\": \"Mohammad Sahraeyan\"\n", + " }\n", + "]\n", + "affiliations: []\n", + "abstract: \n", + "\n", + "abstract \n", + "\n", + "The Aptian Dariyan Formation (upper part of the Khami Group), is one\n", + "of the important reservoir rocks in the Zagros Fold-Thrust Belt. The\n", + "Zagros Fold-Thrust Belt is located on the boundary between the Arabian\n", + "and Eurasian lithospheric plates and formed from collision between\n", + "Eurasia and advancing Arabia during the Cenozoic. In these studied\n", + "area, the Dariyan Formation with a thickness of 136 meters (Fahliyan\n", + "section) and 100 meters (Kuh-e-Rahmat section), consists of carbonate\n", + "rocks. Based on the facies analysis and sedimentological data, 16\n", + "microfacies were identified. The microfacies are attributed to five\n", + "facies belts: tidal flat (lime mudstone, dolomitic mudstone and\n", + "stromatolitic boundstone), lagoon (bioclastic packstone, orbitolinids\n", + "bioclastic packstone and orbitolinids peloidal packstone), shoal\n", + "(orbitolinids grainstone and peloidal grainstone), restricted\n", + "(peloidal packstone, rudist floatstone/rudstone and orbitolinid\n", + "wackestone), and open marine (orbitolinid floatstone, dasycladacean\n", + "algae floatstone, bioclast pelagic foraminiferal wackestone/packstone,\n", + "pelagic foraminiferal mudstone/wackestone, and calcispere\n", + "packstone/wackestone). The depositional model relates to the carbonate\n", + "ramp. 
The allochems of the Dariyan Formation are dominated by\n", + "foraminifera, bioclasts and green algae. Peloids, and intraclasts are\n", + "less abundant in this formation. Due to the great diversity and\n", + "abundance of the foraminifera, this carbonate ramp is referred to as a\n", + "''foraminifera-dominated carbonate ramp system''. This carbonate\n", + "system reflects a local regression in the Fahliyan section which can\n", + "be related to the vertical movement of the Kazeroon Fault. The\n", + "carbonates of Dariyan Formation have been affected by a variety of\n", + "diagenetic processes such as compaction, dissolution, cementation,\n", + "neomorphism, and dolomitization. \n", + "\n", + "Ó 2014 Elsevier Ltd. All rights reserved. \n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "3it [00:02, 1.26it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document-hash: 45319f285bb4544209fb74269a72a17c3a3525246945441aec927928a105bf04\n", + "title: \n", + " Integrated provenance analysis of Zakeen (Devonian) and Faraghan\n", + "(early Permian) sandstones in the Zagros belt, SW Iran \n", + "\n", + "authors: [\n", + " {\n", + " \"name\": \"S Mohammad Zamanzadeh\"\n", + " },\n", + " {\n", + " \"name\": \"Yousef Zoleikhaei\"\n", + " },\n", + " {\n", + " \"name\": \"Abdolhossein Amini\"\n", + " }\n", + "]\n", + "affiliations: [\n", + " {\n", + " \"name\": \"College of Science\"\n", + " },\n", + " {\n", + " \"name\": \"University of Tehran\"\n", + " },\n", + " {\n", + " \"name\": \"Faculty of Geography\"\n", + " },\n", + " {\n", + " \"name\": \"University of Tehran\"\n", + " }\n", + "]\n", + "abstract: \n", + "\n", + "abstract \n", + "\n", + "Successions of a controversial period of time in the Zagros and\n", + "Arabian Plate stratigraphic column, including Zakeen (Devonian) and\n", + "Faraghan (early Permian) formations are investigated for their\n", + "provenance characteristics. 
Nearly similar depositional environments\n", + "of the formations, regardless of 70-80 My hiatus between them, is the\n", + "main motivation for this study. Evidence from various methods are put\n", + "together to reconstruct a comprehensive image of their provenance.\n", + "Results from petrographic and detrital mode analysis indicate a\n", + "continental block provenance for of the sandstones of both formations.\n", + "In addition, evidence of recycling is evident from some rock fragments\n", + "in the conglomeratic facies. Heavy mineral diversities are limited to\n", + "the ultra-stable species which represent consistent morphological\n", + "characteristics in both formations. However, the values of rutile:\n", + "zircon index (RZi) showed intermittent changes from low RZi to high\n", + "RZi intervals in both formations. Detrital zircon age data in previous\n", + "studies represented the same source for these two formations, which\n", + "also remained unchanged from Neo-Proterozoic to late Paleozoic\n", + "successions. Zircon grains' morphology, however, showed remarkable\n", + "difference between the Zakeen and Faraghan formations on the one hand\n", + "and successions deposited in the basin prior to the tectonic movements\n", + "of mid-Paleozoic time on the other. Outcomes of this study show that,\n", + "although each single technique may shed light on a particular aspect\n", + "of the greater provenance problem, by integration of all the data,\n", + "important evidence of recycled nature of these successions could be\n", + "confirmed. Changes in the thickness of the Paleozoic units, the nature\n", + "of their stratal surfaces, along with the information from magmatic\n", + "events in the area provide a tectonostratigraphic framework for\n", + "northern margin of Gondwana in which the recycled nature of these\n", + "successions is justifiable. 
The recycled nature of the studied\n", + "formations on the one hand, and their identical provenance on the\n", + "other, raise a challenge for the timing proposed for two tectonic\n", + "activities of middle Paleozoic and mid-Carboniferous. \n", + "\n", + "Ó 2014 Elsevier Ltd. All rights reserved. \n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "4it [00:03, 1.42it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document-hash: 71b9d4a7505055da7d78886e41abc80602eecdaed0863a0f51add493f38968ba\n", + "title: \n", + " Multi-phase inversion tectonics related to the Hendijan e Nowrooz e\n", + "Khafji Fault activity, Zagros Mountains, SW Iran \n", + "\n", + "authors: [\n", + " {\n", + " \"name\": \"Sadjad Kazem Shiroodi\"\n", + " },\n", + " {\n", + " \"name\": \"Mohammad Ghafoori\"\n", + " },\n", + " {\n", + " \"name\": \"Ali Faghih\"\n", + " },\n", + " {\n", + " \"name\": \"Mostafa Ghanadian\"\n", + " },\n", + " {\n", + " \"name\": \"Gholamreza Lashkaripour\"\n", + " },\n", + " {\n", + " \"name\": \"Naser Hafezi Moghadas\"\n", + " }\n", + "]\n", + "affiliations: [\n", + " {\n", + " \"name\": \"Department of Geology\"\n", + " },\n", + " {\n", + " \"name\": \"Faculty of Sciences\"\n", + " },\n", + " {\n", + " \"name\": \"Ferdowsi University of Mashhad\"\n", + " }\n", + "]\n", + "abstract: \n", + "\n", + "abstract \n", + "\n", + "Distinctive characteristics of inverted structures make them important\n", + "criteria for the identification of certain structural styles of folded\n", + "belts. 
The interpretation of 3D seismic reflection and well data sheds\n", + "new light on the structural evolution and age of inverted structures\n", + "associated to the Hendijan$_{e}$Nowrooz $_{e}$Khafji Fault within the\n", + "Persian Gulf Basin and northeastern margin of Afro-Arabian plate.\n", + "Analysis of thickness variations of growth strata using $_{'}$T-Z\n", + "plot$_{'}$ (thickness versus throw plot) method revealed the\n", + "kinematics of the fault. Obtained results show that the fault has\n", + "experienced a multi-phase evolutionary history over six different\n", + "extension and compression deformation events (i.e. positive and\n", + "negative inversion) between 252.2 and 11.62 Ma. This cyclic activity\n", + "of the growth fault was resulted from alteration of sedimentary\n", + "processes during continuous fault slip. The structural development of\n", + "the study area both during positive and negative inversion geometry\n", + "styles was ultimately controlled by the relative motion between the\n", + "Afro-Arabian and Central-Iranian plates. \n", + "\n", + "© 2015 Elsevier Ltd. All rights reserved. 
\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5it [00:03, 1.57it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document-hash: 7594495bb7872d4aa3bfa7bacbc4f598fa8c84fddc6c553effaf4f1b101935c0\n", + "title: \n", + " Lithofacies, architectural elements and tectonic provenance of the\n", + "siliciclastic rocks of the Lower Permian Dorud Formation in the Alborz\n", + "Mountain Range, Northern Iran \n", + "\n", + "authors: [\n", + " {\n", + " \"name\": \"Mojtaba Javidan\"\n", + " },\n", + " {\n", + " \"name\": \"Hosseinali Mokhtarpour\"\n", + " },\n", + " {\n", + " \"name\": \"Mohammad Sahraeyan\"\n", + " },\n", + " {\n", + " \"name\": \"Hojatollah Kheyrandish\"\n", + " }\n", + "]\n", + "affiliations: [\n", + " {\n", + " \"name\": \"Department of Geology\"\n", + " },\n", + " {\n", + " \"name\": \"College of Basic Sciences\"\n", + " },\n", + " {\n", + " \"name\": \"Department of Geology\"\n", + " },\n", + " {\n", + " \"name\": \"Department of Geology\"\n", + " }\n", + "]\n", + "abstract: \n", + "\n", + "abstract \n", + "\n", + "The siliciclastic deposits of the Lower Permian Dorud Formation widely\n", + "crop out in the eastern part of the Alborz Mountain Range (northern\n", + "Iran). In order to interpret the sedimentary environments and tectonic\n", + "provenance of these deposits, two sections in the Kiyasar and\n", + "Talmadareh with 112 and 122 m thickness, respectively; have been\n", + "studied. The analysis of lithofacies and architectural elements, leads\n", + "to recognition of seven lithofacies (Gmm, Sr, Sl, Sh, Sp, Fl, and Fm),\n", + "and four architectural elements (FF, LA, CH, and CR). Based on these\n", + "results, the sedimentary environment of these deposits has been\n", + "identified as a sandy meandering river. The petrographical analysis\n", + "indicates that these sediments were deposited under humid weather in\n", + "the craton interior and recycled orogeny tectonic provenance. 
\n", + "\n", + "Ó 2015 Elsevier Ltd. All rights reserved. \n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "6it [00:04, 1.64it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "document-hash: 7c5d4947280cec27fbb01892eea145933df0813be615ccd5fb5bb5503254d0f1\n", + "title: \n", + " Stratigraphy, mineralogy and depositional environment of the evaporite\n", + "unit in the As ¸ kale (Erzurum) sub-basin, Eastern Anatolia (Turkey) \n", + "\n", + "authors: [\n", + " {\n", + " \"name\": \"Emel Abdio\"\n", + " },\n", + " {\n", + " \"name\": \"Mehmet Arslan\"\n", + " },\n", + " {\n", + " \"name\": \"Cahit Helvac\"\n", + " }\n", + "]\n", + "affiliations: []\n", + "abstract: \n", + "\n", + "abstract \n", + "\n", + "The study area is situated in the As¸ kale sub-basin where the Early-\n", + "Middle Miocene aged As¸ kale Formation was deposited in a shallow\n", + "marine to lagoonal environment, and consists of interstratifications\n", + "of clastic sediments, carbonates and evaporites. The successions of\n", + "the As¸ kale Formation can be divided into four main members\n", + "interfingering with one another both vertically and laterally, and\n", + "composed of the sandstone-mudstone-limestone member, the evaporite\n", + "member, the gravelstone-sandstone-mudstone intercalations and the\n", + "limestone member. The evaporite unit comprises of secondary gypsum\n", + "lithofacies formed by hydration of precursor anhydrite, anhydrite,\n", + "gypsum-bearing limestone and claystone in the form of wedges and\n", + "lenses. Massive, nodular, nodular-banded, laminated and laminated-\n", + "banded gpysum lithofacieses in addition to chicken-wire and rare\n", + "entrolithic structures were described, indicating a sabhka or a\n", + "shallow water depositional environment. Alabastrine and porphyblastic\n", + "textures of gypsum were identified within the all lithofacieses with\n", + "abundant amount of anhydrite relics. 
Additionally, saponite and\n", + "illite/smectite, calcite and dolomite, celestite, epsomite were also\n", + "observed. Successions of the As¸ kale Formation were deposited in\n", + "stable subtropical climatic conditions within rapidly subsiding sub-\n", + "basin resulted in conversion of sub-basin to shallow platform and even\n", + "in lagoon environment. \n", + "\n", + "© 2015 Elsevier Ltd. All rights reserved. \n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "6it [00:04, 1.32it/s]\n" + ] + } + ], + "source": [ + "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)\n", + "\n", + "model = init_nlp_model(\"language;reference;metadata\")\n", + "\n", + "proj_key = \"c4ae6545156c5f99770fdfd161102a01567d8ecd\"\n", + "#coll_name = \"GeoArabia\"\n", + "#coll_name = \"BasinResearch1\"\n", + "coll_name = \"African_ES\"\n", + "\n", + "query = \"*\"\n", + "\n", + "for doc in search_documents(api, proj_key, coll_name, query, max_docs=5, page_size=1):\n", + " \n", + " print(\"document-hash: \", doc[\"file-info\"][\"document-hash\"])\n", + " \n", + " res = model.apply_on_doc(doc)\n", + " #print(res[\"description\"].keys())\n", + " \n", + " if \"title\" in res[\"description\"]:\n", + " text = res[\"description\"][\"title\"]\n", + " text = \"\\n\".join(textwrap.wrap(text, width=70))\n", + "\n", + " print(\"title: \\n\", text, \"\\n\")\n", + "\n", + " if \"authors\" in res[\"description\"]:\n", + " print(\"authors: \", json.dumps(res[\"description\"][\"authors\"], indent=2))\n", + "\n", + " if \"affiliations\" in res[\"description\"]:\n", + " print(\"affiliations: \", json.dumps(res[\"description\"][\"affiliations\"], indent=2))\n", + " \n", + " if \"abstract\" in res[\"description\"]:\n", + "\n", + " print(\"abstract: \\n\")\n", + " for _ in res[\"description\"][\"abstract\"]:\n", + " text = \"\\n\".join(textwrap.wrap(_, width=70))\n", + " print(text, \"\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"7207ed71-a8e3-4e4f-88c1-5fbbb4f82ac1", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, - "id": "4cf42286-b0a1-438a-8c7f-d852e61c260f", + "id": "9b8d13fa-d23d-46c6-9b05-a07b27e4d6c7", "metadata": {}, "outputs": [], "source": [] diff --git a/poetry.lock b/poetry.lock index 6b99c52..1a6e33e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -985,23 +985,23 @@ files = [ [[package]] name = "deepsearch-glm" -version = "0.17.2" +version = "0.18.4" description = "Graph Language Models" optional = false python-versions = "<4.0,>=3.8" files = [ - {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6234fb2fa6755ff1bb7000d21e4574eea68a29557d8f16ba179f5f5713766d9b"}, - {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:af97145ebb2f21b074ef6385c45d60a2d2553b68254c30aa66b7ddd9206b7f7b"}, - {file = "deepsearch_glm-0.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fe2669ac7f8567383e0818fe9b3b73979978fb5e65f36db3b7626bf3af6206d"}, - {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:95cbe8e8264c675a128f520f33afa3fd34295c64b00d282c015fe13c7cc2bf3b"}, - {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b9203be399dd4f026769998cca25a5691ff79791ead2dfa05385af8467f4bd8"}, - {file = "deepsearch_glm-0.17.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f512fede5fd062ff005f51073ef5660e2e963e0013251176b69bd7ab9e45faa"}, - {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:be15a98b0cbf36e5141e5dd8e22ba29b0e0d92604fc58e53e8fa6c837b29a40f"}, - {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e3ddcaf73dd5578786db3333c238790a45d0fa0af4b1df9a41a4b9dd234c2401"}, - {file = "deepsearch_glm-0.17.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:20e7b819d58df057bd1826fea8cd3e6d0ed4cac3fe819795ee5205360fa77fee"}, - {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0f8405524b000669b82098b1989e8c4ef4da0f93407477c1d807533d9f427867"}, - {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:da459f79913b0f967f5802766b2b964bc997d7f5259663901d84c2780961dfa8"}, - {file = "deepsearch_glm-0.17.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50757b64a607104882a683b2a570f96f450faaf0f1047125df043d01406f2f16"}, + {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ad88c5bf3c203174ef81e0699405aec0f5386130cbc6a975b165f81887bc1a52"}, + {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:21d51a0671f0713d23be57030287a0f907f4a5f0627a45ea07e2caf54129a71a"}, + {file = "deepsearch_glm-0.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fc853941ea751a15f65e83f9bee9f988d0ecac4b28fac067b2aab49e15edb74"}, + {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cab5e577cf724343f2a5987ff4488c69e86a2dbca8cb0359c9243a07c6cd7d69"}, + {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:dda02391306d657a884b12f21cc3d1228663f940ec6001c833893dd2844bcc25"}, + {file = "deepsearch_glm-0.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dccd4286a93ee1a216acba27e1fc76f5d14e280d968998cfeae11a00ad1b6cb"}, + {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:cf38368bc72eab673459ea0fc96c02b1f3ae120df2d9443e1a63e010764ac1e9"}, + {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:d3fd83ea3b2bce11bac1d710f12547728f4dd48bfaa8bd472366ef144469d52c"}, + {file = "deepsearch_glm-0.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fb4bfd43ac3b996cdd151c35e94fa399953ee3952d7e86390a825880ece95f3"}, + {file 
= "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:57cb67e435cacb6c4a6b6a9109d943267c493ebbba252a88ca40909976f60225"}, + {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:edc399939b6464f96600d2f23796ae2641d668fb794b77199e87abdef77f8853"}, + {file = "deepsearch_glm-0.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00ad8d932e7f0d1be4fd99fc0d4c8d50cb1ff10764f146b6ecb310a1379123d4"}, ] [package.dependencies] @@ -5279,4 +5279,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">= 3.8, <3.11" -content-hash = "928e878a81836c1528c27876675534238f9bc5c350965694061acb31ef559c3b" +content-hash = "73f1b0b84cdeb292efcf668a0a3deeb2ac3f76ebcc3ced6791f681792f17d3b8" diff --git a/pyproject.toml b/pyproject.toml index 0b53572..77fc873 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ python-dotenv = "^1.0.0" nbclient = "^0.9.0" pandas = "^1.5.1" argilla = "^1.24.0" -deepsearch-glm = "v0.17.2" +deepsearch-glm = "v0.18.4" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^22.1.0"} From d76a64fae21543a6c678ecf0e60a732d0da52c03 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 8 May 2024 09:16:31 +0200 Subject: [PATCH 3/3] reverted other notebook to original Signed-off-by: Peter Staar --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 948a82b..022eed8 100644 --- a/README.md +++ b/README.md @@ -41,8 +41,9 @@ is based on [Profiles][profiles]. Unless otherwise configured, the profile used | | Name | Description | | -- | ----------------- | ----------- | | 1. | [NLP on documents](examples/nlp_on_documents/nlp_on_documents.ipynb) | A few quick examples on how to apply NLP models on documents (eg extracting key-terms) | -| 2. 
| [Reference Parsing](examples/nlp_for_references/nlp_for_references.ipynb) | Examples on how to parse references from Documents | -| 3. | [Material Extraction](examples/nlp_for_materials/nlp_for_materials.ipynb) | Examples on how to extract materials from Documents | +| 2. | [MetaData Extraction](examples/nlp_for_metadata/nlp_for_metadata.ipynb) | Examples on how to detect the metadata of a Document | +| 3. | [Reference Parsing](examples/nlp_for_references/nlp_for_references.ipynb) | Examples on how to parse references from Documents | +| 4. | [Material Extraction](examples/nlp_for_materials/nlp_for_materials.ipynb) | Examples on how to extract materials from Documents | ### Data queries