From 0835ff0a5013d199b6a67665e7dac2cbc331c836 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 2 Apr 2024 07:20:12 +0200
Subject: [PATCH 1/3] added the nlp-for-metadata

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../data_query_quick_start/quick_start.ipynb  |   10 +-
 examples/nlp_for_metadata/README.md           |    5 +
 .../nlp_for_metadata/nlp_for_metadata.ipynb   | 2995 +++++++++++++++++
 poetry.lock                                   |   41 +-
 pyproject.toml                                |    2 +-
 5 files changed, 3025 insertions(+), 28 deletions(-)
 create mode 100644 examples/nlp_for_metadata/README.md
 create mode 100644 examples/nlp_for_metadata/nlp_for_metadata.ipynb

diff --git a/examples/data_query_quick_start/quick_start.ipynb b/examples/data_query_quick_start/quick_start.ipynb
index 6f3e334..bfecac8 100644
--- a/examples/data_query_quick_start/quick_start.ipynb
+++ b/examples/data_query_quick_start/quick_start.ipynb
@@ -132,9 +132,7 @@
    "cell_type": "code",
    "execution_count": 5,
    "id": "f915761b",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -370,9 +368,7 @@
    "cell_type": "code",
    "execution_count": 6,
    "id": "81df4c38",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -786,7 +782,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,
diff --git a/examples/nlp_for_metadata/README.md b/examples/nlp_for_metadata/README.md
new file mode 100644
index 0000000..1504c62
--- /dev/null
+++ b/examples/nlp_for_metadata/README.md
@@ -0,0 +1,5 @@
+# NLP on Documents - Extracting metadata
+
+:point_right: Run the [nlp_for_metadata.ipynb](./nlp_for_metadata.ipynb)
+notebook to extract metadata from (scientific) reports
+
diff --git a/examples/nlp_for_metadata/nlp_for_metadata.ipynb b/examples/nlp_for_metadata/nlp_for_metadata.ipynb
new file mode 100644
index 0000000..69f039d
--- /dev/null
+++ b/examples/nlp_for_metadata/nlp_for_metadata.ipynb
@@ -0,0 +1,2995 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "37d96e78",
+   "metadata": {},
+   "source": [
+    "# Document Metadata Extraction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4edb626f",
+   "metadata": {},
+   "source": [
+    "## Getting started\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8f9c441",
+   "metadata": {},
+   "source": [
+    "### Set notebook parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b01a4fd1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Project key:  1234567890abcdefghijklmnopqrstvwyz123456\n"
+     ]
+    }
+   ],
+   "source": [
+    "from dsnotebooks.settings import ProjectNotebookSettings\n",
+    "\n",
+    "# notebook settings auto-loaded from .env / env vars\n",
+    "notebook_settings = ProjectNotebookSettings()\n",
+    "\n",
+    "PROFILE_NAME = notebook_settings.profile  # the profile to use\n",
+    "PROJ_KEY = notebook_settings.proj_key     # the project to use\n",
+    "\n",
+    "# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "239dc0f1",
+   "metadata": {},
+   "source": [
+    "### Import example dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "502cdef8",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-08-02T12:14:25.377422Z",
+     "start_time": "2022-08-02T12:14:25.152485Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " -> already downloaded part-of-speech\n",
+      " -> already downloaded reference\n",
+      " -> already downloaded material\n",
+      " -> already downloaded language\n",
+      " -> already downloaded name\n",
+      " -> already downloaded semantic\n",
+      " -> already downloaded geoloc\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "\n",
+    "import textwrap\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "import deepsearch as ds\n",
+    "\n",
+    "from pathlib import Path\n",
+    "from zipfile import ZipFile\n",
+    "\n",
+    "from deepsearch.documents.core.export import export_to_markdown\n",
+    "from IPython.display import display, Markdown, HTML, display_html\n",
+    "\n",
+    "from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models\n",
+    "\n",
+    "from deepsearch_glm.nlp_utils import (\n",
+    "    extract_references_from_doc,\n",
+    "    init_nlp_model,\n",
+    "    list_nlp_model_configs,\n",
+    ")\n",
+    "\n",
+    "from tabulate import tabulate\n",
+    "\n",
+    "models = load_pretrained_nlp_models(verbose=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6e4dcda",
+   "metadata": {},
+   "source": [
+    "### Connect to Deep Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "f44fbf08",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6f1200c5-1138-4491-bc33-3b2d5aabe949",
+   "metadata": {},
+   "source": [
+    "## Convert Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ec83eb0b",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-08-02T12:14:49.216045Z",
+     "start_time": "2022-08-02T12:14:25.380757Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 78.98it/s]\u001b[38;2;15;98;254m                                                                                                                                                              \u001b[0m\n",
+      "Submitting input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:03<00:00,  3.27s/it]\u001b[38;2;15;98;254m                                                                                                                                                              \u001b[0m\n",
+      "Converting input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:27<00:00, 27.58s/it]\u001b[38;2;15;98;254m                                                                                                                                                              \u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'Total documents': 1, 'Successfully converted documents': 1}\n"
+     ]
+    }
+   ],
+   "source": [
+    "output_dir = Path(\"./converted_docs\")\n",
+    "\n",
+    "fname = \"2206.00785.pdf\"\n",
+    "\n",
+    "documents = ds.convert_documents(\n",
+    "    api=api,\n",
+    "    proj_key=PROJ_KEY,\n",
+    "    source_path=f\"../../data/samples/{fname}\",\n",
+    "    progress_bar=True\n",
+    ")           \n",
+    "documents.download_all(result_dir=output_dir)\n",
+    "info = documents.generate_report(result_dir=output_dir)\n",
+    "print(info) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "382c4869-cca9-43fc-8052-c0ab7e9c175d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "writing converted_docs/2206.00785.json\n",
+      "writing converted_docs/2206.00785.md\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Iterate over the output files and write the JSON and Markdown exports\n",
+    "for output_file in output_dir.rglob(\"json*.zip\"):\n",
+    "    with ZipFile(output_file) as archive:\n",
+    "        all_files = archive.namelist()\n",
+    "        for name in all_files:\n",
+    "            if not name.endswith(\".json\"):\n",
+    "                continue\n",
+    "            \n",
+    "            #basename = name.rstrip('.json')\n",
+    "            doc_json = json.loads(archive.read(name))\n",
+    "            \n",
+    "            ofile = output_dir / name\n",
+    "            print(f\"writing {ofile}\")\n",
+    "            with ofile.open(\"w\") as fw:\n",
+    "                fw.write(json.dumps(doc_json, indent=2))\n",
+    "                \n",
+    "            doc_md = export_to_markdown(doc_json)\n",
+    "\n",
+    "            ofile = output_dir / name.replace(\".json\", \".md\")\n",
+    "            print(f\"writing {ofile}\")\n",
+    "            with ofile.open(\"w\") as fw:\n",
+    "                fw.write(doc_md)\n",
+    "\n",
+    "            "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "b19f7678-b650-484b-a994-150d0c4ec3a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# display last document\n",
+    "# display(Markdown(doc_md))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6784c8a9-4b96-4385-a04e-40ddbf6c613f",
+   "metadata": {},
+   "source": [
+    "## Extract references from converted Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "710cc200-e2ba-46f3-9ca0-efd2baab7ee1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def resolve(path, doc):\n",
+    "\n",
+    "    if len(path)>1 and path[0]==\"#\":\n",
+    "        return resolve(path[1:], doc)\n",
+    "        \n",
+    "    if len(path)==1 and isinstance(doc, dict):\n",
+    "        return doc[path[0]]\n",
+    "\n",
+    "    elif len(path)==1 and isinstance(doc, list):\n",
+    "        ind = int(path[0])\n",
+    "        return doc[ind]\n",
+    "    \n",
+    "    elif len(path)>1 and isinstance(doc, dict):\n",
+    "        return resolve(path[1:], doc[path[0]])\n",
+    "\n",
+    "    elif len(path)>1 and isinstance(doc, list):\n",
+    "        ind = int(path[0])\n",
+    "        return resolve(path[1:], doc[ind])\n",
+    "\n",
+    "    else:\n",
+    "        return None\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ed3612b4-bbd2-42d0-ba2d-f8f994565380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ifile = \"converted_docs/2206.00785.json\"\n",
+    "\n",
+    "with open(ifile) as fr:\n",
+    "    doc = json.load(fr)\n",
+    "\n",
+    "model = init_nlp_model(\"language;reference;metadata\")\n",
+    "res = model.apply_on_doc(doc)\n",
+    "\n",
+    "props = pd.DataFrame(res[\"properties\"][\"data\"], columns=res[\"properties\"][\"headers\"])\n",
+    "insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6d98745c-e0f3-41d2-8261-b7953d835dec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Document understanding is a key busin...\n",
+      "1     title  #/texts/1  Delivering Document Conversion as a Cloud Serv...\n",
+      "2    author  #/texts/1                                     Christoph Auer\n",
+      "3    author  #/texts/1                                Research Ruschlikon\n",
+      "4    author  #/texts/2                                Research Ruschlikon\n",
+      "5    author  #/texts/3                                      Michele Dolfi\n",
+      "6    author  #/texts/3                                Research Ruschlikon\n",
+      "7    author  #/texts/4                                            J Staar\n",
+      "8    author  #/texts/4                                Research Ruschlikon\n"
+     ]
+    }
+   ],
+   "source": [
+    "#print(res[\"instances\"][\"headers\"])\n",
+    "\n",
+    "doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n",
+    "print(doc_insts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "01771757-70c3-44cb-824c-1fd9b716a99f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R.\n",
+      "Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, 'IDC\n",
+      "FutureScape: Worldwide future of intelligence 2022 predictions,'\n",
+      "International Data Group, Inc., Needham, MA, Research Report\n",
+      "US47913321, Oct. 2021. [Online]. Available:\n",
+      "https://www.idc.com/getdoc.jsp?containerId=US47913321\n",
+      "\n",
+      "entities:\n",
+      "               subtype                                               name\n",
+      "214  reference-number                                                  1\n",
+      "215           authors  C. Gopal, C. L. Marshall, D. Vesset, N. Ward-D...\n",
+      "216       person-name                                            C Gopal\n",
+      "217       person-name                                       C L Marshall\n",
+      "218       person-name                                           D Vesset\n",
+      "219       person-name                                             N Ward\n",
+      "220       person-name                                            J Hamel\n",
+      "221       person-name                                            R Jyoti\n",
+      "222       person-name                                           P Rutten\n",
+      "223       person-name                                        C W Olofson\n",
+      "224       person-name                                          J Rydning\n",
+      "225       person-name                                              S Rau\n",
+      "226       person-name                                             J Duke\n",
+      "227             title  'IDC FutureScape: Worldwide future of intellig...\n",
+      "228           journal  International Data Group, Inc., Needham, MA, R...\n",
+      "229       person-name                           International Data Group\n",
+      "230       person-name                                    Research Report\n",
+      "231            volume                                           47913321\n",
+      "232              date                                          Oct. 2021\n",
+      "233              note                                Online]. Available:\n",
+      "234               url  https://www.idc.com/getdoc.jsp?containerId=US4... \n",
+      "\n",
+      "\n",
+      "[2] D. Vile, 'The road to becoming a data driven business,' Freeform\n",
+      "Dynamics Ltd., New Milton, United Kingdom, Research Report US47913321,\n",
+      "Nov. 2020. [Online]. Available: https://www.freeformdynamics.com/wp-\n",
+      "content/uploads/2020/11/ 2020-The road to becoming a data driven\n",
+      "business.pdf\n",
+      "\n",
+      "entities:\n",
+      "               subtype                                               name\n",
+      "235  reference-number                                                  2\n",
+      "236           authors                                            D. Vile\n",
+      "237       person-name                                             D Vile\n",
+      "238             title     'The road to becoming a data driven business,'\n",
+      "239           journal  Freeform Dynamics Ltd., New Milton, United Kin...\n",
+      "240       person-name                              Freeform Dynamics Ltd\n",
+      "241           country                                     United Kingdom\n",
+      "242       person-name                                     United Kingdom\n",
+      "243       person-name                                    Research Report\n",
+      "244            volume                                           47913321\n",
+      "245              date                                          Nov. 2020\n",
+      "246              note                                Online]. Available:\n",
+      "247               url  https://www.freeformdynamics.com/wp-content/up...\n",
+      "248             title                           data driven business.pdf \n",
+      "\n",
+      "\n",
+      "[3] M. Aslett and N. Patience, 'Data platforms market map 2021,' S&P\n",
+      "Global Market Intelligence, Tech. Rep., Sep. 2021.\n",
+      "\n",
+      "entities:\n",
+      "                subtype                                             name\n",
+      "249   reference-number                                                3\n",
+      "250            authors                        M. Aslett and N. Patience\n",
+      "251        person-name                                         M Aslett\n",
+      "252        person-name                                       N Patience\n",
+      "253              title                'Data platforms market map 2021,'\n",
+      "254            journal  S&P Global Market Intelligence, Tech. Rep., Sep\n",
+      "255        person-name                       Global Market Intelligence\n",
+      "256  abbreviation-name                                         Tech Rep\n",
+      "257               date                                             2021 \n",
+      "\n",
+      "\n",
+      "[4] G. Aggarwal. (2021, Jan.) How the pandemic has accelerated cloud\n",
+      "adoption. Forbes. Jersey City, NJ. [Online]. Available:\n",
+      "https://www.forbes.com/sites/forbestechcouncil/2021/01/15/ how-the-\n",
+      "pandemic-has-accelerated-cloud-adoption\n",
+      "\n",
+      "entities:\n",
+      "               subtype                                               name\n",
+      "258  reference-number                                                  4\n",
+      "259           authors                                        G. Aggarwal\n",
+      "260       person-name                                         G Aggarwal\n",
+      "261              date                                          2021, Jan\n",
+      "262             title  How the pandemic has accelerated cloud adoptio...\n",
+      "263       person-name                                        Jersey City\n",
+      "264              note                                         Available:\n",
+      "265               url  https://www.forbes.com/sites/forbestechcouncil... \n",
+      "\n",
+      "\n",
+      "[5] 'Enterprise survey series: DevOps and the cloud,' Evans Data\n",
+      "Corporation, Santa Cruz, CA, Research Report, Aug. 2021. [Online].\n",
+      "Available: https://evansdata.com/reports/viewRelease.php?reportID=45\n",
+      "\n",
+      "entities:\n",
+      "               subtype                                               name\n",
+      "266  reference-number                                                  5\n",
+      "267             title  'Enterprise survey series: DevOps and the cloud,'\n",
+      "268           journal  Evans Data Corporation, Santa Cruz, CA, Resear...\n",
+      "269       person-name                             Evans Data Corporation\n",
+      "270       person-name                                         Santa Cruz\n",
+      "271       person-name                                    Research Report\n",
+      "272              date                                               2021\n",
+      "273              note                                Online]. Available:\n",
+      "274               url  https://evansdata.com/reports/viewRelease.php?... \n",
+      "\n",
+      "\n",
+      "[6] J. Arundel and J. Domingus, Cloud Native DevOps with Kubernetes:\n",
+      "Building, Deploying, and Scaling Modern Applications in the Cloud.\n",
+      "Sebastopol, CA: O'Reilly Media, Apr. 2019.\n",
+      "\n",
+      "entities:\n",
+      "               subtype                                               name\n",
+      "275  reference-number                                                  6\n",
+      "276           authors                         J. Arundel and J. Domingus\n",
+      "277       person-name                                          J Arundel\n",
+      "278       person-name                                         J Domingus\n",
+      "279             title  Cloud Native DevOps with Kubernetes: Building,...\n",
+      "280       person-name                                Cloud Native DevOps\n",
+      "281           journal                                  Reilly Media, Apr\n",
+      "282              date                                               2019 \n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "refs = props[(props[\"label\"]==\"reference\") & (props[\"confidence\"]>0.8)]\n",
+    "\n",
+    "cnt = 0\n",
+    "for i,ref in refs.iterrows():\n",
+    "    #print(ref)\n",
+    "\n",
+    "    item = resolve(ref[\"subj_path\"].split(\"/\"), res)\n",
+    "    print(\"\\n\".join(textwrap.wrap(item[\"text\"], 70)))\n",
+    "\n",
+    "    ents = insts[insts[\"subj_hash\"]==item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n",
+    "    print(\"\\nentities:\\n\", ents, \"\\n\\n\")\n",
+    "\n",
+    "    \n",
+    "    cnt+=1\n",
+    "    if cnt>5:\n",
+    "        break\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cfeca54d-bbc1-4022-851d-0b29027de761",
+   "metadata": {},
+   "source": [
+    "## Extract Metadata from ingested documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "8bb459a8-7b26-4dc3-98da-b1b4a1b59fcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import standard dependencies\n",
+    "from copy import deepcopy\n",
+    "import pandas as pd\n",
+    "from numerize.numerize import numerize\n",
+    "from tqdm.notebook import tqdm\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# IPython utilities\n",
+    "from IPython.display import display, HTML\n",
+    "\n",
+    "# Import the deepsearch-toolkit\n",
+    "import deepsearch as ds\n",
+    "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n",
+    "from deepsearch.cps.queries import DataQuery"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "d064166b-7578-437c-b3a6-b16eb3d95c1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch list of all data collections\n",
+    "collections = api.elastic.list()\n",
+    "collections.sort(key=lambda c: c.name.lower())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "db9da464-23db-4562-a1ce-259a717f404a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Type</th>\n",
+       "      <th>Num entries</th>\n",
+       "      <th>Date</th>\n",
+       "      <th>Coords</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>AAAI</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>16.02K</td>\n",
+       "      <td>2023-08-29</td>\n",
+       "      <td>default/aaai</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ACL Anthology</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>55.28K</td>\n",
+       "      <td>2023-08-22</td>\n",
+       "      <td>default/acl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Annual Reports</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>107.38K</td>\n",
+       "      <td>2024-01-12</td>\n",
+       "      <td>default/annual-report</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>arXiv abstracts</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2.37M</td>\n",
+       "      <td>2023-12-07</td>\n",
+       "      <td>default/arxiv-abstract</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>arXiv category taxonomy</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>155</td>\n",
+       "      <td>2023-12-05</td>\n",
+       "      <td>default/arxiv-category</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>arXiv full documents</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2.29M</td>\n",
+       "      <td>2023-10-29</td>\n",
+       "      <td>default/arxiv</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>BioRxiv</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>357.76K</td>\n",
+       "      <td>2023-11-09</td>\n",
+       "      <td>default/biorxiv</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Brenda</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>7.12K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/brenda</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>ChEMBL</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>2.11M</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/chembl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>ChemRxiv</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>8.82K</td>\n",
+       "      <td>2023-11-23</td>\n",
+       "      <td>default/chemrxiv</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>ClinicalTrials</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>426.42K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/clinical-trials</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>COD</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>503.78K</td>\n",
+       "      <td>2023-07-24</td>\n",
+       "      <td>default/cod</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Cord19</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>655.45K</td>\n",
+       "      <td>2022-11-17</td>\n",
+       "      <td>default/cord19</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>Crossref</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>131.86M</td>\n",
+       "      <td>2023-02-22</td>\n",
+       "      <td>default/crossref</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Crossref journal list</td>\n",
+       "      <td>Reference</td>\n",
+       "      <td>100.52K</td>\n",
+       "      <td>2022-02-22</td>\n",
+       "      <td>default/crossref-journal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>D&amp;B Hoovers</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>10K</td>\n",
+       "      <td>2021-04-16</td>\n",
+       "      <td>default/swot-report</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>DeepSearch materials</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>360.54K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/ds4sd-material</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>DOAB</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>8.8K</td>\n",
+       "      <td>2023-12-04</td>\n",
+       "      <td>default/doab</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>DrugBank</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>4.44K</td>\n",
+       "      <td>2022-11-03</td>\n",
+       "      <td>default/drugbank</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>engrXiv</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>1.84K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/engrxiv</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>ESG Reports</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>17.36K</td>\n",
+       "      <td>2024-01-08</td>\n",
+       "      <td>default/esg-report</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>FDA Adverse Event Reporting System (FAERS)</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>435.62K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/faers</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>GenBank</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>260.36M</td>\n",
+       "      <td>2023-01-24</td>\n",
+       "      <td>default/genbank</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>HBCP Open Access Corpus</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>90</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/hbcp</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>IBM Redbooks</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2.75K</td>\n",
+       "      <td>2023-06-08</td>\n",
+       "      <td>default/ibm-redbooks</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>IEEE</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>61.95K</td>\n",
+       "      <td>2024-01-16</td>\n",
+       "      <td>default/ieee</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>International Patent Classification (IPC)</td>\n",
+       "      <td>Reference</td>\n",
+       "      <td>78.52K</td>\n",
+       "      <td>2022-02-22</td>\n",
+       "      <td>default/wipo-ipc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>IPCC</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>819</td>\n",
+       "      <td>2023-06-14</td>\n",
+       "      <td>default/ipcc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>Legal Entity Identifier</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>2.1M</td>\n",
+       "      <td>2023-08-16</td>\n",
+       "      <td>default/lei</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>Material Components</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>16.32K</td>\n",
+       "      <td>2023-01-30</td>\n",
+       "      <td>default/experiment</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>MedRxiv</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>69.18K</td>\n",
+       "      <td>2023-11-02</td>\n",
+       "      <td>default/medrxiv</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>NeurIPS</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>16.9K</td>\n",
+       "      <td>2023-09-24</td>\n",
+       "      <td>default/neurips</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>News</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>9.82M</td>\n",
+       "      <td>2023-09-10</td>\n",
+       "      <td>default/news</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>NMRShift</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>44.33K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/nmrshift</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>OpenCVF</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>26.94K</td>\n",
+       "      <td>2023-10-04</td>\n",
+       "      <td>default/opencvf</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>OpenStax</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>76</td>\n",
+       "      <td>2024-02-01</td>\n",
+       "      <td>default/openstax</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>36</th>\n",
+       "      <td>OpenStreetMap</td>\n",
+       "      <td>Generic</td>\n",
+       "      <td>296.31M</td>\n",
+       "      <td>2023-03-12</td>\n",
+       "      <td>default/osm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>PatCID</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>13.03M</td>\n",
+       "      <td>2023-09-15</td>\n",
+       "      <td>default/patcid</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38</th>\n",
+       "      <td>Patent SMILES</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2.84M</td>\n",
+       "      <td>2023-10-11</td>\n",
+       "      <td>default/patent-smiles</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>Patents from CNIPR</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2022-12-19</td>\n",
+       "      <td>default/patent-cnipr</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>Patents from EPO</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>7.09M</td>\n",
+       "      <td>2023-07-06</td>\n",
+       "      <td>default/patent-epo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>41</th>\n",
+       "      <td>Patents from JPO</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2.54M</td>\n",
+       "      <td>2024-01-08</td>\n",
+       "      <td>default/patent-jpo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>42</th>\n",
+       "      <td>Patents from KIPO</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>1.8M</td>\n",
+       "      <td>2022-12-19</td>\n",
+       "      <td>default/patent-kipo</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43</th>\n",
+       "      <td>Patents from USPTO</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>16.16M</td>\n",
+       "      <td>2024-02-09</td>\n",
+       "      <td>default/patent-uspto</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>44</th>\n",
+       "      <td>Patents from USPTO (TEST)</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>6.81K</td>\n",
+       "      <td>2024-03-13</td>\n",
+       "      <td>default/patent-uspto-test</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45</th>\n",
+       "      <td>PLOS</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>340.28K</td>\n",
+       "      <td>2024-01-10</td>\n",
+       "      <td>default/plos</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>46</th>\n",
+       "      <td>PubChem</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>118.24M</td>\n",
+       "      <td>2023-07-06</td>\n",
+       "      <td>default/pubchem</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>47</th>\n",
+       "      <td>PubMed Central</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>5.08M</td>\n",
+       "      <td>2023-03-01</td>\n",
+       "      <td>default/pubmed</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>48</th>\n",
+       "      <td>PubMed Central (PDF)</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>27.66K</td>\n",
+       "      <td>2024-01-22</td>\n",
+       "      <td>default/pmc-pdf</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>49</th>\n",
+       "      <td>Red Hat</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>7.17K</td>\n",
+       "      <td>2024-01-23</td>\n",
+       "      <td>default/redhat</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50</th>\n",
+       "      <td>RxNorm</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>374.18K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/rxnorm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>51</th>\n",
+       "      <td>SEC Edgar CIK Lookup</td>\n",
+       "      <td>Reference</td>\n",
+       "      <td>786K</td>\n",
+       "      <td>2022-02-22</td>\n",
+       "      <td>default/sec-cik</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>52</th>\n",
+       "      <td>SEC Edgar filings</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>56.38K</td>\n",
+       "      <td>2021-07-06</td>\n",
+       "      <td>default/sec-filing</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>53</th>\n",
+       "      <td>Semantic Scholar Academic Graph</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>216.85M</td>\n",
+       "      <td>2024-03-11</td>\n",
+       "      <td>default/semantic-scholar</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>54</th>\n",
+       "      <td>SMILES from USPTO</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>116.48M</td>\n",
+       "      <td>2022-12-25</td>\n",
+       "      <td>default/patent-uspto-smiles</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>55</th>\n",
+       "      <td>SMILES from USPTO (fingerprints)</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>85.81M</td>\n",
+       "      <td>2023-02-23</td>\n",
+       "      <td>default/patent-uspto-smiles-fp</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>56</th>\n",
+       "      <td>UMLS</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>2.69M</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/umls</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>57</th>\n",
+       "      <td>UniProt</td>\n",
+       "      <td>Record</td>\n",
+       "      <td>567.48K</td>\n",
+       "      <td>2023-01-03</td>\n",
+       "      <td>default/uniprot</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>58</th>\n",
+       "      <td>USPTO patents for NER</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>2.64K</td>\n",
+       "      <td>2023-03-20</td>\n",
+       "      <td>default/uspto-for-ner</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>59</th>\n",
+       "      <td>Wikipedia</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>6.45M</td>\n",
+       "      <td>2024-02-26</td>\n",
+       "      <td>default/wikipedia</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                          Name       Type Num entries  \\\n",
+       "0                                         AAAI   Document      16.02K   \n",
+       "1                                ACL Anthology   Document      55.28K   \n",
+       "2                               Annual Reports   Document     107.38K   \n",
+       "3                              arXiv abstracts   Document       2.37M   \n",
+       "4                      arXiv category taxonomy     Record         155   \n",
+       "5                         arXiv full documents   Document       2.29M   \n",
+       "6                                      BioRxiv   Document     357.76K   \n",
+       "7                                       Brenda     Record       7.12K   \n",
+       "8                                       ChEMBL     Record       2.11M   \n",
+       "9                                     ChemRxiv   Document       8.82K   \n",
+       "10                              ClinicalTrials   Document     426.42K   \n",
+       "11                                         COD     Record     503.78K   \n",
+       "12                                      Cord19   Document     655.45K   \n",
+       "13                                    Crossref   Document     131.86M   \n",
+       "14                       Crossref journal list  Reference     100.52K   \n",
+       "15                                 D&B Hoovers     Record         10K   \n",
+       "16                        DeepSearch materials     Record     360.54K   \n",
+       "17                                        DOAB   Document        8.8K   \n",
+       "18                                    DrugBank     Record       4.44K   \n",
+       "19                                     engrXiv   Document       1.84K   \n",
+       "20                                 ESG Reports   Document      17.36K   \n",
+       "21  FDA Adverse Event Reporting System (FAERS)   Document     435.62K   \n",
+       "22                                     GenBank     Record     260.36M   \n",
+       "23                     HBCP Open Access Corpus   Document          90   \n",
+       "24                                IBM Redbooks   Document       2.75K   \n",
+       "25                                        IEEE   Document      61.95K   \n",
+       "26   International Patent Classification (IPC)  Reference      78.52K   \n",
+       "27                                        IPCC   Document         819   \n",
+       "28                     Legal Entity Identifier     Record        2.1M   \n",
+       "29                         Material Components   Document      16.32K   \n",
+       "30                                     MedRxiv   Document      69.18K   \n",
+       "31                                     NeurIPS   Document       16.9K   \n",
+       "32                                        News   Document       9.82M   \n",
+       "33                                    NMRShift     Record      44.33K   \n",
+       "34                                     OpenCVF   Document      26.94K   \n",
+       "35                                    OpenStax   Document          76   \n",
+       "36                               OpenStreetMap    Generic     296.31M   \n",
+       "37                                      PatCID     Record      13.03M   \n",
+       "38                               Patent SMILES   Document       2.84M   \n",
+       "39                          Patents from CNIPR   Document           2   \n",
+       "40                            Patents from EPO   Document       7.09M   \n",
+       "41                            Patents from JPO   Document       2.54M   \n",
+       "42                           Patents from KIPO   Document        1.8M   \n",
+       "43                          Patents from USPTO   Document      16.16M   \n",
+       "44                   Patents from USPTO (TEST)   Document       6.81K   \n",
+       "45                                        PLOS   Document     340.28K   \n",
+       "46                                     PubChem     Record     118.24M   \n",
+       "47                              PubMed Central   Document       5.08M   \n",
+       "48                        PubMed Central (PDF)   Document      27.66K   \n",
+       "49                                     Red Hat   Document       7.17K   \n",
+       "50                                      RxNorm     Record     374.18K   \n",
+       "51                        SEC Edgar CIK Lookup  Reference        786K   \n",
+       "52                           SEC Edgar filings   Document      56.38K   \n",
+       "53             Semantic Scholar Academic Graph   Document     216.85M   \n",
+       "54                           SMILES from USPTO     Record     116.48M   \n",
+       "55            SMILES from USPTO (fingerprints)     Record      85.81M   \n",
+       "56                                        UMLS     Record       2.69M   \n",
+       "57                                     UniProt     Record     567.48K   \n",
+       "58                       USPTO patents for NER   Document       2.64K   \n",
+       "59                                   Wikipedia   Document       6.45M   \n",
+       "\n",
+       "          Date                          Coords  \n",
+       "0   2023-08-29                    default/aaai  \n",
+       "1   2023-08-22                     default/acl  \n",
+       "2   2024-01-12           default/annual-report  \n",
+       "3   2023-12-07          default/arxiv-abstract  \n",
+       "4   2023-12-05          default/arxiv-category  \n",
+       "5   2023-10-29                   default/arxiv  \n",
+       "6   2023-11-09                 default/biorxiv  \n",
+       "7   2023-01-03                  default/brenda  \n",
+       "8   2023-01-03                  default/chembl  \n",
+       "9   2023-11-23                default/chemrxiv  \n",
+       "10  2023-01-03         default/clinical-trials  \n",
+       "11  2023-07-24                     default/cod  \n",
+       "12  2022-11-17                  default/cord19  \n",
+       "13  2023-02-22                default/crossref  \n",
+       "14  2022-02-22        default/crossref-journal  \n",
+       "15  2021-04-16             default/swot-report  \n",
+       "16  2023-01-03          default/ds4sd-material  \n",
+       "17  2023-12-04                    default/doab  \n",
+       "18  2022-11-03                default/drugbank  \n",
+       "19  2023-01-03                 default/engrxiv  \n",
+       "20  2024-01-08              default/esg-report  \n",
+       "21  2023-01-03                   default/faers  \n",
+       "22  2023-01-24                 default/genbank  \n",
+       "23  2023-01-03                    default/hbcp  \n",
+       "24  2023-06-08            default/ibm-redbooks  \n",
+       "25  2024-01-16                    default/ieee  \n",
+       "26  2022-02-22                default/wipo-ipc  \n",
+       "27  2023-06-14                    default/ipcc  \n",
+       "28  2023-08-16                     default/lei  \n",
+       "29  2023-01-30              default/experiment  \n",
+       "30  2023-11-02                 default/medrxiv  \n",
+       "31  2023-09-24                 default/neurips  \n",
+       "32  2023-09-10                    default/news  \n",
+       "33  2023-01-03                default/nmrshift  \n",
+       "34  2023-10-04                 default/opencvf  \n",
+       "35  2024-02-01                default/openstax  \n",
+       "36  2023-03-12                     default/osm  \n",
+       "37  2023-09-15                  default/patcid  \n",
+       "38  2023-10-11           default/patent-smiles  \n",
+       "39  2022-12-19            default/patent-cnipr  \n",
+       "40  2023-07-06              default/patent-epo  \n",
+       "41  2024-01-08              default/patent-jpo  \n",
+       "42  2022-12-19             default/patent-kipo  \n",
+       "43  2024-02-09            default/patent-uspto  \n",
+       "44  2024-03-13       default/patent-uspto-test  \n",
+       "45  2024-01-10                    default/plos  \n",
+       "46  2023-07-06                 default/pubchem  \n",
+       "47  2023-03-01                  default/pubmed  \n",
+       "48  2024-01-22                 default/pmc-pdf  \n",
+       "49  2024-01-23                  default/redhat  \n",
+       "50  2023-01-03                  default/rxnorm  \n",
+       "51  2022-02-22                 default/sec-cik  \n",
+       "52  2021-07-06              default/sec-filing  \n",
+       "53  2024-03-11        default/semantic-scholar  \n",
+       "54  2022-12-25     default/patent-uspto-smiles  \n",
+       "55  2023-02-23  default/patent-uspto-smiles-fp  \n",
+       "56  2023-01-03                    default/umls  \n",
+       "57  2023-01-03                 default/uniprot  \n",
+       "58  2023-03-20           default/uspto-for-ner  \n",
+       "59  2024-02-26               default/wikipedia  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Visualize summary table\n",
+    "results = [\n",
+    "    {\n",
+    "        \"Name\": c.name,\n",
+    "        \"Type\": c.metadata.type,\n",
+    "        \"Num entries\": numerize(c.documents),\n",
+    "        \"Date\": c.metadata.created.strftime(\"%Y-%m-%d\"),\n",
+    "        \"Coords\": f\"{c.source.elastic_id}/{c.source.index_key}\",\n",
+    "    }\n",
+    "    for c in collections\n",
+    "]\n",
+    "display(pd.DataFrame(results))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "357340cc-97e3-44bc-aa28-41a1be1e9a20",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "cb1357fa9d50499e929a520811253c24",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/60 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>matches</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>arXiv full documents</td>\n",
+       "      <td>165</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Semantic Scholar Academic Graph</td>\n",
+       "      <td>40</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>OpenCVF</td>\n",
+       "      <td>31</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>arXiv abstracts</td>\n",
+       "      <td>24</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ACL Anthology</td>\n",
+       "      <td>16</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              name  matches\n",
+       "0             arXiv full documents      165\n",
+       "1  Semantic Scholar Academic Graph       40\n",
+       "2                          OpenCVF       31\n",
+       "3                  arXiv abstracts       24\n",
+       "4                    ACL Anthology       16"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Input query: match documents whose main text mentions DocLayNet or PubLayNet\n",
+    "search_query = \"main-text.text:(\\\"DocLayNet\\\" OR \\\"PubLayNet\\\")\"\n",
+    "\n",
+    "# Iterate through the data collections\n",
+    "results = []\n",
+    "for c in (pbar := tqdm(collections)):\n",
+    "    pbar.set_description(f\"Querying {c.name}\")\n",
+    "\n",
+    "    # Search only on document collections\n",
+    "    if c.metadata.type != \"Document\":\n",
+    "        continue\n",
+    "\n",
+    "    # Execute the query with limit=0: only the match count (data_count) is read below\n",
+    "    query = DataQuery(search_query, source=[], limit=0, coordinates=c.source)\n",
+    "    query_results = api.queries.run(query)\n",
+    "    results.append({\n",
+    "        \"name\": c.name,\n",
+    "        \"matches\": query_results.outputs[\"data_count\"]\n",
+    "    })\n",
+    "\n",
+    "# Sort by number of matches (descending) and display the top 5 collections\n",
+    "results.sort(reverse=True, key=lambda r: r[\"matches\"])\n",
+    "display(pd.DataFrame(results[0:5]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "90f84882-1c85-4b0a-b0eb-ea5bf0b41e32",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "21d681aa843745bca7374749f46e23e7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/33 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2007.12238.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  MiniConf is a framework for hosting virtual ac...\n",
+      "1     title  #/texts/2            MiniConf-A Virtual Conference Framework\n",
+      "2    author  #/texts/2                                   Alexander M Rush\n",
+      "3    author  #/texts/2                                   Hendrik Strobelt\n",
+      "2111.06016.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Analyzing the layout of a document to identify...\n",
+      "1     title  #/texts/2  SYNTHETIC DOCUMENT GENERATOR FOR ANNOTATION-FR...\n",
+      "2    author  #/texts/2                                       Natraj Raman\n",
+      "3    author  #/texts/2                                       Sameena Shah\n",
+      "4    author  #/texts/2                                     Manuela Veloso\n",
+      "2105.14931.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We present d ocument d omain r andom...\n",
+      "1     title  #/texts/2  Document Domain Randomization for Deep Learnin...\n",
+      "2    author  #/texts/2                                          Meng Ling\n",
+      "3    author  #/texts/2                                          Jian Chen\n",
+      "4    author  #/texts/2                                   Michael Sedlmair\n",
+      "5    author  #/texts/2                                   Robert S Laramee\n",
+      "6    author  #/texts/2                                            Jian Wu\n",
+      "7    author  #/texts/2                                              C Lee\n",
+      "8    author  #/texts/9                            Old Dominion University\n",
+      "2102.02971.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-The triple-based knowledge in large-s...\n",
+      "1     title  #/texts/2  Metaknowledge Extraction Based on Multi-Modal ...\n",
+      "2    author  #/texts/2                                         Shukan Liu\n",
+      "3    author  #/texts/2                                          Ruilin Xu\n",
+      "4    author  #/texts/2                                        Boying Geng\n",
+      "5    author  #/texts/2                                           Qiao Sun\n",
+      "6    author  #/texts/2                                            Li Duan\n",
+      "7    author  #/texts/2                                         Yiming Liu\n",
+      "2003.13197.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Decomposing images of document pages into high...\n",
+      "1      title  #/texts/2  Cross-Domain Document Object Detection: Benchm...\n",
+      "2     author  #/texts/2                                             Kai Li\n",
+      "3     author  #/texts/2                                    Chris Tensmeyer\n",
+      "4     author  #/texts/2                                       Handong Zhao\n",
+      "5     author  #/texts/3                                Nikolaos Barmpalios\n",
+      "6     author  #/texts/3                                     Vlad I Morariu\n",
+      "7     author  #/texts/3                                   Varun Manjunatha\n",
+      "8     author  #/texts/3                                           Tong Sun\n",
+      "9     author  #/texts/3                                             Yun Fu\n",
+      "10    author  #/texts/4                            Northeastern University\n",
+      "11    author  #/texts/4                                     Adobe Research\n",
+      "12    author  #/texts/4                               Adobe Document Cloud\n",
+      "2111.08609.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document AI, or Document Intelligence, is a re...\n",
+      "1     title  #/texts/3          Lei Cui, Yiheng Xu, Tengchao Lv, Furu Wei\n",
+      "2    author  #/texts/3                            Microsoft Research Asia\n",
+      "2209.00852.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Layout generation is a novel task in computer ...\n",
+      "1     title  #/texts/2  Geometry Aligned Variational Transformer for I...\n",
+      "2    author  #/texts/2                                        Yunning Cao\n",
+      "3    author  #/texts/2                                              Ye Ma\n",
+      "4    author  #/texts/2                                           Min Zhou\n",
+      "5    author  #/texts/2                                       Chuanbin Liu\n",
+      "6    author  #/texts/2                                        Hongtao Xie\n",
+      "7    author  #/texts/2                                        Tiezheng Ge\n",
+      "8    author  #/texts/2                                       Yuning Jiang\n",
+      "2203.09056.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We introduce a new table detection and structu...\n",
+      "1     title  #/texts/2  Robust Table Detection and Structure Recogniti...\n",
+      "2    author  #/texts/2                                        Chixiang Ma\n",
+      "3    author  #/texts/2                                        Weihong Lin\n",
+      "4    author  #/texts/2                                            Lei Sun\n",
+      "5    author  #/texts/2                                         Qiang Huob\n",
+      "6    author  #/texts/3                            Microsoft Research Asia\n",
+      "2203.09638.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We formulate the task of detecting l...\n",
+      "1     title  #/texts/2  Unified Line and Paragraph Detection by Graph ...\n",
+      "2    author  #/texts/2                                         Shuang Liu\n",
+      "3    author  #/texts/2                                       Renshen Wang\n",
+      "4    author  #/texts/2                                    Michalis Raptis\n",
+      "5    author  #/texts/2                                     Yasuhisa Fujii\n",
+      "6    author  #/texts/3                                          San Diego\n",
+      "7    author  #/texts/5                                    Google Research\n",
+      "2305.05836.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Accurate Named Entity Recognition (NER) is cru...\n",
+      "1     title  #/texts/2  Extracting Complex Named Entities in Legal Doc...\n",
+      "2    author  #/texts/3                                    Abhinav Agrawal\n",
+      "2305.02567.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Automatic layout generation that can synthesiz...\n",
+      "1     title  #/texts/2  LayoutDM: Transformer-based Diffusion Model fo...\n",
+      "2    author  #/texts/2                                         Shang Chai\n",
+      "2205.12840.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We present a novel method, DistillAd...\n",
+      "1     title  #/texts/1  DistillAdapt: Source-Free Active Visual Domain...\n",
+      "2    author  #/texts/1                                Divya Kothandaraman\n",
+      "3    author  #/texts/1                                      Sumit Shekhar\n",
+      "4    author  #/texts/1                                 Abhilasha Sancheti\n",
+      "5    author  #/texts/1                                       Manoj Ghuhan\n",
+      "6    author  #/texts/1                                      Tripti Shukla\n",
+      "7    author  #/texts/1                                     Dinesh Manocha\n",
+      "8    author  #/texts/2                                     Adobe Research\n",
+      "2105.06400.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Information Extraction (IE) from the...\n",
+      "1     title  #/texts/2  TabLeX: A Benchmark Dataset for Structure and ...\n",
+      "2    author  #/texts/2                                        Harsh Desai\n",
+      "3    author  #/texts/2                                       Pratik Kayal\n",
+      "4    author  #/texts/2                                       Mayank Singh\n",
+      "2103.15348.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Abstract. Recent advances in document image an...\n",
+      "1      title  #/texts/2  LayoutParser : A Unified Toolkit for Deep Lear...\n",
+      "2     author  #/texts/2                                       Zejiang Shen\n",
+      "3     author  #/texts/2                                      Ruochen Zhang\n",
+      "4     author  #/texts/2                                       Melissa Dell\n",
+      "5     author  #/texts/2                       Benjamin Charles Germain Lee\n",
+      "6     author  #/texts/3                                      Jacob Carlson\n",
+      "7     author  #/texts/3                                         Weining Li\n",
+      "8     author  #/texts/4                                    Allen Institute\n",
+      "9     author  #/texts/5                                   Brown University\n",
+      "10    author  #/texts/6                                 Harvard University\n",
+      "2110.09915.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Previous works on key information extraction f...\n",
+      "1     title  #/texts/2  Entity Relation Extraction as Dependency Parsi...\n",
+      "2    author  #/texts/2                                          Yue Zhang\n",
+      "3    author  #/texts/2                                           Bo Zhang\n",
+      "4    author  #/texts/2                                           Rui Wang\n",
+      "5    author  #/texts/2                                         Junjie Cao\n",
+      "6    author  #/texts/2                                            Chen Li\n",
+      "7    author  #/texts/2                                           Zuyi Bao\n",
+      "2112.12353.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract: The volume of academic literature, s...\n",
+      "1     title  #/texts/1  LAME: Layout-Aware Metadata Extraction Approac...\n",
+      "2    author  #/texts/1                                        South Korea\n",
+      "3    author  #/texts/2                                        South Korea\n",
+      "4    author  #/texts/4                                        South Korea\n",
+      "2201.09745.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Since a vast number of tables can be easily co...\n",
+      "1     title  #/texts/2  Table Pre-training: A Survey on Model Architec...\n",
+      "2    author  #/texts/2                                         Haoyu Dong\n",
+      "3    author  #/texts/3                                 Microsoft Research\n",
+      "4    author  #/texts/4                                 Microsoft Research\n",
+      "5    author  #/texts/5                                             Ao Liu\n",
+      "6    author  #/texts/6                         Shi Han Microsoft Research\n",
+      "7    author  #/texts/7                   Dongmei Zhang Microsoft Research\n",
+      "2205.00347.pdf\n",
+      "Empty DataFrame\n",
+      "Columns: [subtype, subj_path, name]\n",
+      "Index: []\n",
+      "1911.10683.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Important information that relates to...\n",
+      "1     title  #/texts/2  Image-based table recognition: data, model, an...\n",
+      "2    author  #/texts/2                                           Xu Zhong\n",
+      "3    author  #/texts/2                                 Research Australia\n",
+      "4    author  #/texts/2                                          City Road\n",
+      "5    author  #/texts/3                               Elaheh ShafieiBavani\n",
+      "6    author  #/texts/3                                 Research Australia\n",
+      "7    author  #/texts/3                                          City Road\n",
+      "8    author  #/texts/4                                 Research Australia\n",
+      "9    author  #/texts/4                                          City Road\n",
+      "2305.06553.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. In this paper, we introduce WeLayout...\n",
+      "1     title  #/texts/2  WeLayout: WeChat Layout Analysis System for th...\n",
+      "2    author  #/texts/2                                    Mingliang Zhang\n",
+      "3    author  #/texts/2                                           Zhen Cao\n",
+      "4    author  #/texts/2                                         Juntao Liu\n",
+      "5    author  #/texts/2                                        Liqiang Niu\n",
+      "6    author  #/texts/2                                       Fandong Meng\n",
+      "7    author  #/texts/2                                           Jie Zhou\n",
+      "8    author  #/texts/3                                        Tencent Inc\n",
+      "2209.04460.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Scientific articles published prior ...\n",
+      "1     title  #/texts/2  Figure and Figure Caption Extraction for Mixed...\n",
+      "2    author  #/texts/2                                         J P Naiman\n",
+      "3    author  #/texts/2                                 Peter K G Williams\n",
+      "4    author  #/texts/4                                     Alyssa Goodman\n",
+      "2207.06695.pdf\n",
+      "     subtype   subj_path                                               name\n",
+      "0   abstract           #  This paper presents DavarOCR, an open-source t...\n",
+      "1      title   #/texts/2  DavarOCR: A Toolbox for OCR and Multi-Modal Do...\n",
+      "2     author   #/texts/2   Liang Qiao Hikvision Research Institute Hangzhou\n",
+      "3     author   #/texts/3    Hui Jiang Hikvision Research Institute Hangzhou\n",
+      "4     author   #/texts/4    Ying Chen Hikvision Research Institute Hangzhou\n",
+      "5     author   #/texts/6   Pengfei Li Hikvision Research Institute Hangzhou\n",
+      "6     author   #/texts/7  Zaisheng Li Hikvision Research Institute Hangzhou\n",
+      "7     author   #/texts/8   Baorui Zou Hikvision Research Institute Hangzhou\n",
+      "8     author   #/texts/9   Dashan Guo Hikvision Research Institute Hangzhou\n",
+      "9     author  #/texts/10    Yingda Xu Hikvision Research Institute Hangzhou\n",
+      "10    author  #/texts/11     Yunlu Xu Hikvision Research Institute Hangzhou\n",
+      "11    author  #/texts/12                                     Zhanzhan Cheng\n",
+      "12    author  #/texts/12              Hikvision Research Institute Hangzhou\n",
+      "13    author  #/texts/13       Yi Niu Hikvision Research Institute Hangzhou\n",
+      "2204.12974.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Existing image captioning systems are dedicate...\n",
+      "1     title  #/texts/2  CapOnImage: Context-driven Dense-Captioning on...\n",
+      "2    author  #/texts/2                                           Yiqi Gao\n",
+      "3    author  #/texts/3                                        Xinglin Hou\n",
+      "4    author  #/texts/4                                     Yuanmeng Zhang\n",
+      "5    author  #/texts/5                                        Tiezheng Ge\n",
+      "6    author  #/texts/6                                       Yuning Jiang\n",
+      "7    author  #/texts/7                                          Peng Wang\n",
+      "2305.10825.pdf\n",
+      "   subtype   subj_path                                          name\n",
+      "0   author   #/texts/3  Haoxing Chen Nanjing University Tiansuan Lab\n",
+      "1    title   #/texts/3                                             B\n",
+      "2   author   #/texts/4                                  Zhangxuan Gu\n",
+      "3   author   #/texts/6                                    Xing Zheng\n",
+      "4   author   #/texts/8                                 Changhua Meng\n",
+      "5   author  #/texts/10                                     Zhuoer Xu\n",
+      "6   author  #/texts/12                                       Jun Lan\n",
+      "7   author  #/texts/14                                     Yaohui Li\n",
+      "8   author  #/texts/15                                Research Group\n",
+      "9   author  #/texts/15                    Nanjing University Nanjing\n",
+      "10  author  #/texts/16                                    Huijia Zhu\n",
+      "11  author  #/texts/18                                 Weiqiang Wang\n",
+      "2302.11583.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Scientific articles published prior to the 'ag...\n",
+      "1     title  #/texts/2  The Digitization of Historical Astrophysical L...\n",
+      "2    author  #/texts/2                                      Jill P Naiman\n",
+      "3    author  #/texts/2                                 Peter K G Williams\n",
+      "4    author  #/texts/2                                     Alyssa Goodman\n",
+      "2203.04814.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Abstract. In this work, we propose Text-Degrad...\n",
+      "1      title  #/texts/1  Text-DIAE: Degradation Invariant Autoencoders ...\n",
+      "2     author  #/texts/1                               Mohamed Ali Souibgui\n",
+      "3     author  #/texts/1                                      Sanket Biswas\n",
+      "4     author  #/texts/1                                       Andres Mafla\n",
+      "5     author  #/texts/2                                   Ali Furkan Biten\n",
+      "6     author  #/texts/2                                        Alicia Forn\n",
+      "7     author  #/texts/2                                  Yousri Kessentini\n",
+      "8     author  #/texts/2                                         Josep Llad\n",
+      "9     author  #/texts/2                                        Lluis Gomez\n",
+      "10    author  #/texts/2                               Dimosthenis Karatzas\n",
+      "11    author  #/texts/3                             Computer Vision Center\n",
+      "12    author  #/texts/3                                    Universitat Aut\n",
+      "2110.02069.pdf\n",
+      "  subtype  subj_path                                               name\n",
+      "0   title  #/texts/2  OPAD: An Optimized Policy-based Active Learnin...\n",
+      "1  author  #/texts/2                                      Sumit Shekhar\n",
+      "2  author  #/texts/2                               Adobe Research India\n",
+      "3  author  #/texts/4                               Adobe Research India\n",
+      "4  author  #/texts/5                                       Ishan Jindal\n",
+      "5  author  #/texts/5                                      Roorkee India\n",
+      "6  author  #/texts/6                                        Avneet Jain\n",
+      "7  author  #/texts/6                                      Roorkee India\n",
+      "2106.07359.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Extracting metadata from scientific p...\n",
+      "1     title  #/texts/2  MexPub: Deep Transfer Learning for Metadata Ex...\n",
+      "2    author  #/texts/2  Zeyd Boukhers Nada Beili Timo Hartmann Prantik...\n",
+      "3    author  #/texts/3                                     Landau Koblenz\n",
+      "2308.14397.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-In this research paper, we introduce ...\n",
+      "1     title  #/texts/2  Ensemble of Anchor-Free Models for Robust Bang...\n",
+      "2    author  #/texts/2                                Mong Sain Chak Dept\n",
+      "3    author  #/texts/3                                   Asib Rahman Dept\n",
+      "2301.06629.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Designing visually appealing layouts for multi...\n",
+      "1     title  #/texts/2  Diverse Multimedia Layout Generation with Mult...\n",
+      "2    author  #/texts/2                                     David D Nguyen\n",
+      "3    author  #/texts/2                                   Sydney Australia\n",
+      "4    author  #/texts/3                                        Surya Nepal\n",
+      "5    author  #/texts/4                                    Salil S Kanhere\n",
+      "6    author  #/texts/4                                   Sydney Australia\n",
+      "2205.13724.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We propose V-Doc, a question-answering tool us...\n",
+      "1     title  #/texts/1    V-Doc : Visual questions answers with Documents\n",
+      "2    author  #/texts/1                                         Yihao Ding\n",
+      "3    author  #/texts/1                                          Zhe Huang\n",
+      "4    author  #/texts/1                                        Runlin Wang\n",
+      "5    author  #/texts/1                                         Hang Zhang\n",
+      "6    author  #/texts/1                                        Xianru Chen\n",
+      "7    author  #/texts/1                                         Yuzhong Ma\n",
+      "8    author  #/texts/1                                      Hyunsuk Chung\n",
+      "9    author  #/texts/1                                   Soyeon Caren Han\n",
+      "2210.05391.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  A large amount of document data exists in unst...\n",
+      "1     title  #/texts/3  Chenxia Li, Ruoyu Guo, Jun Zhou, Mengtao An, Y...\n",
+      "2    author  #/texts/3                                          Baidu Inc\n",
+      "2306.02815.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  The extraction of text in high quality is esse...\n",
+      "1     title  #/texts/2  Transformer-Based UNet with Multi-Headed Cross...\n",
+      "2211.04934.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Business documents come in a variety of struct...\n",
+      "1     title  #/texts/2  DoSA : A System to Accelerate Annotations on B...\n",
+      "2    author  #/texts/2                                   Neelesh K Shukla\n",
+      "3    author  #/texts/2                                          Amit Vaid\n",
+      "4    author  #/texts/3                 State Street Corporation Bengaluru\n",
+      "2303.08137.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Controllable layout generation aims at synthes...\n",
+      "1     title  #/texts/2  LayoutDM: Discrete Diffusion Model for Control...\n",
+      "2    author  #/texts/2                                     Kotaro Kikuchi\n",
+      "3    author  #/texts/2                                         Mayu Otani\n",
+      "4    author  #/texts/3                                  Waseda University\n",
+      "2203.06947.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Recently, various multimodal networks for Visu...\n",
+      "1     title  #/texts/1  XYLayoutLM: Towards Layout-Aware Multimodal Ne...\n",
+      "2    author  #/texts/1                                       Zhangxuan Gu\n",
+      "3    author  #/texts/1                                      Changhua Meng\n",
+      "4    author  #/texts/1                                            Ke Wang\n",
+      "5    author  #/texts/1                                            Jun Lan\n",
+      "6    author  #/texts/1                                      Weiqiang Wang\n",
+      "7    author  #/texts/1                                            Ming Gu\n",
+      "8    author  #/texts/1                                       Liqing Zhang\n",
+      "9    author  #/texts/1                      Shanghai Jiao Tong University\n",
+      "2106.00676.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Classifying the core textual components of a s...\n",
+      "1     title  #/texts/2  Incorporating Visual Layout Structures for Sci...\n",
+      "2    author  #/texts/2                                       Zejiang Shen\n",
+      "3    author  #/texts/2                                            Kyle Lo\n",
+      "4    author  #/texts/2                                       Lucy Lu Wang\n",
+      "5    author  #/texts/2                                       Bailey Kuehl\n",
+      "6    author  #/texts/2                                      Daniel S Weld\n",
+      "7    author  #/texts/2                                        Doug Downey\n",
+      "8    author  #/texts/3                                    Allen Institute\n",
+      "2305.00795.pdf\n",
+      "   subtype  subj_path                                               name\n",
+      "0    title  #/texts/2  SelfDocSeg: A Self-Supervised vision-based App...\n",
+      "1   author  #/texts/2                                     Subhajit Maity\n",
+      "2   author  #/texts/2                                      Sanket Biswas\n",
+      "3   author  #/texts/3                                   Siladittya Manna\n",
+      "4   author  #/texts/3                                      Ayan Banerjee\n",
+      "5   author  #/texts/4                                         Josep Llad\n",
+      "6   author  #/texts/4                                Saumik Bhattacharya\n",
+      "7   author  #/texts/6                          Technology Innovation Hub\n",
+      "8   author  #/texts/7                             Computer Vision Center\n",
+      "9   author  #/texts/7                        Computer Science Department\n",
+      "10  author  #/texts/8                                    Universitat Aut\n",
+      "2202.01414.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Digitization of newspapers is of interest for ...\n",
+      "1     title  #/texts/2  DocBed: A Multi-Stage OCR Solution for Documen...\n",
+      "2    author  #/texts/2                                        Wenzhen Zhu\n",
+      "3    author  #/texts/2                                    Negin Sokhandan\n",
+      "4    author  #/texts/2                                         Guang Yang\n",
+      "5    author  #/texts/2                                     Sujitha Martin\n",
+      "6    author  #/texts/2                            Suchitra Sathyanarayana\n",
+      "2101.09465.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Web search is an essential way for human to ob...\n",
+      "1     title  #/texts/2  WebSRC: A Dataset for Web-Based Structural Rea...\n",
+      "2    author  #/texts/2                                            Lu Chen\n",
+      "3    author  #/texts/2                                        Xingyu Chen\n",
+      "4    author  #/texts/2                                         Zihan Zhao\n",
+      "5    author  #/texts/2                            Danyang Zhang Jiabao Ji\n",
+      "6    author  #/texts/2                                             Ao Luo\n",
+      "7    author  #/texts/2                                       Yuxuan Xiong\n",
+      "8    author  #/texts/2                                             Kai Yu\n",
+      "9    author  #/texts/3                Shanghai Jiao Tong University Cross\n",
+      "2009.14457.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  In this paper, we propose a multi-task learnin...\n",
+      "1     title  #/texts/2  Towards a Multi-modal, Multi-task Learning bas...\n",
+      "2    author  #/texts/2                                 Subhojeet Pramanik\n",
+      "3    author  #/texts/2                                  Shashank Mujumdar\n",
+      "4    author  #/texts/2                                         Hima Patel\n",
+      "2206.13155.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Multi-modal document pre-trained mode...\n",
+      "1     title  #/texts/2  Bi-VLDoc: Bidirectional Vision-Language Modeli...\n",
+      "2    author  #/texts/2                                         Chuwei Luo\n",
+      "3    author  #/texts/2                                        Guozhi Tang\n",
+      "4    author  #/texts/2                                           Qi Zheng\n",
+      "5    author  #/texts/2                                           Cong Yao\n",
+      "6    author  #/texts/2                                        Lianwen Jin\n",
+      "7    author  #/texts/2                                       Chenliang Li\n",
+      "8    author  #/texts/2                                           Yang Xue\n",
+      "9    author  #/texts/2                                             Luo Si\n",
+      "2305.14962.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Transforming documents into machine-...\n",
+      "1     title  #/texts/2  ICDAR 2023 Competition on Robust Layout Segmen...\n",
+      "2    author  #/texts/2                                     Christoph Auer\n",
+      "3    author  #/texts/2                                       Ahmed Nassar\n",
+      "4    author  #/texts/2                                       Maksym Lysak\n",
+      "5    author  #/texts/2                                      Michele Dolfi\n",
+      "6    author  #/texts/3                                Nikolaos Livathinos\n",
+      "7    author  #/texts/3                                        Peter Staar\n",
+      "2201.09407.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  The document layout analysis (DLA) aims to dec...\n",
+      "1     title  #/texts/2  CROSS-DOMAIN DOCUMENT LAYOUT ANALYSIS VIA UNSU...\n",
+      "2    author  #/texts/2                                        Xingjiao Wu\n",
+      "3    author  #/texts/2                                         Luwei Xiao\n",
+      "4    author  #/texts/2                                      Xiangcheng Du\n",
+      "5    author  #/texts/2                                      Yingbin Zheng\n",
+      "6    author  #/texts/2                                             Xin Li\n",
+      "7    author  #/texts/2                                        Tianlong Ma\n",
+      "8    author  #/texts/3                       East China Normal University\n",
+      "2106.13802.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document image classification remains a popula...\n",
+      "1     title  #/texts/4  American Family Insurance, Machine Learning Re...\n",
+      "2101.10281.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Adobe's Portable Document Format (PDF) is a po...\n",
+      "1     title  #/texts/2   PAWLS : PDF Annotation With Labels and Structure\n",
+      "2    author  #/texts/2                                       Mark Neumann\n",
+      "3    author  #/texts/3                                       Zejiang Shen\n",
+      "4    author  #/texts/4                                     Sam Skjonsberg\n",
+      "2006.01038.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Document layout analysis usually relies on com...\n",
+      "1      title  #/texts/1              arXiv:2006.01038v1 [cs.CL] 1 Jun 2020\n",
+      "2     author  #/texts/1                                         Minghao Li\n",
+      "3     author  #/texts/1                                          Yiheng Xu\n",
+      "4     author  #/texts/1                                            Lei Cui\n",
+      "5     author  #/texts/1                                      Shaohan Huang\n",
+      "6     author  #/texts/1                                           Furu Wei\n",
+      "7     author  #/texts/1                                         Zhoujun Li\n",
+      "8     author  #/texts/1                                          Ming Zhou\n",
+      "9     author  #/texts/2                                 Beihang University\n",
+      "10    author  #/texts/4                            Microsoft Research Asia\n",
+      "2303.05049.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Layout generation aims to synthesize realistic...\n",
+      "1     title  #/texts/2  Unifying Lay out Generation with a Decoupled D...\n",
+      "2    author  #/texts/2                                     Zhizheng Zhang\n",
+      "3    author  #/texts/2                                       Xiaoyi Zhang\n",
+      "4    author  #/texts/2                                        Wenxuan Xie\n",
+      "5    author  #/texts/2                                        Yuwang Wang\n",
+      "6    author  #/texts/2                                             Yan Lu\n",
+      "7    author  #/texts/3                                Jiaotong University\n",
+      "8    author  #/texts/3                            Microsoft Research Asia\n",
+      "9    author  #/texts/3                                Tsinghua University\n",
+      "2105.06220.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Document layout analysis is crucial ...\n",
+      "1     title  #/texts/2  VSR: A Unified Framework for Document Layout A...\n",
+      "2    author  #/texts/2                                         Peng Zhang\n",
+      "3    author  #/texts/2                                         Liang Qiao\n",
+      "4    author  #/texts/2                                     Zhanzhan Cheng\n",
+      "5    author  #/texts/2                                             Yi Niu\n",
+      "6    author  #/texts/2                                             Fei Wu\n",
+      "7    author  #/texts/3                       Hikvision Research Institute\n",
+      "8    author  #/texts/6                                Zhejiang University\n",
+      "2106.11797.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Automatically recognizing the layout of handwr...\n",
+      "1     title  #/texts/2  Evaluation of a Region Proposal Architecture f...\n",
+      "2    author  #/texts/2                                       Lorenzo Quir\n",
+      "3    author  #/texts/2                                      Enrique Vidal\n",
+      "4    author  #/texts/3                                    Research Center\n",
+      "5    author  #/texts/3                                  Universitat Polit\n",
+      "2304.11810.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document layout analysis has a wide range of r...\n",
+      "1     title  #/texts/2  PARAGRAPH2GRAPH: A GNN-BASED FRAMEWORK FOR LAY...\n",
+      "2    author  #/texts/2                                            Shu Wei\n",
+      "3    author  #/texts/3                                 Datagrand Tech Inc\n",
+      "4    author  #/texts/4                          Nuo Xu Datagrand Tech Inc\n",
+      "5    author  #/texts/5                                         Deng Huang\n",
+      "6    author  #/texts/6                                 Datagrand Tech Inc\n",
+      "7    author  #/texts/7                       Xiang Gao Datagrand Tech Inc\n",
+      "2203.16850.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  This paper addresses the problem of document i...\n",
+      "1     title  #/texts/1  Revisiting Document Image Dewarping by Grid Re...\n",
+      "2    author  #/texts/1                                     Xiangwei Jiang\n",
+      "3    author  #/texts/1                                        Rujiao Long\n",
+      "4    author  #/texts/1                                            Nan Xue\n",
+      "5    author  #/texts/1                                         Zhibo Yang\n",
+      "6    author  #/texts/1                                           Cong Yao\n",
+      "7    author  #/texts/1                                   Wuhan University\n",
+      "8    author  #/texts/1                                   Wuhan University\n",
+      "2303.03755.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Generating visual layouts is an essential ingr...\n",
+      "1     title  #/texts/2  DLT: Conditioned layout generation with Joint ...\n",
+      "2    author  #/texts/2                                          Elad Levi\n",
+      "3    author  #/texts/2                                          Eli Brosh\n",
+      "4    author  #/texts/2                                  Mykola Mykhailych\n",
+      "5    author  #/texts/2                                     Meir Perez Wix\n",
+      "2110.08164.pdf\n",
+      "   subtype   subj_path                                          name\n",
+      "0   author   #/texts/3                                  Penghai Zhao\n",
+      "1    title   #/texts/3                                    A PREPRINT\n",
+      "2   author   #/texts/4                              Ethnic Languages\n",
+      "3   author   #/texts/4  Education Northwest Minzu University Lanzhou\n",
+      "4   author   #/texts/5                                   Zhengqi Cai\n",
+      "5   author   #/texts/7                                   Weilan Wang\n",
+      "6   author   #/texts/8                              Ethnic Languages\n",
+      "7   author   #/texts/8  Education Northwest Minzu University Lanzhou\n",
+      "8   author   #/texts/9                                  Guowei Zhang\n",
+      "9   author  #/texts/10                              Ethnic Languages\n",
+      "10  author  #/texts/10  Education Northwest Minzu University Lanzhou\n",
+      "11  author  #/texts/11                                       Yuqi Lu\n",
+      "12  author  #/texts/12                              Ethnic Languages\n",
+      "13  author  #/texts/12  Education Northwest Minzu University Lanzhou\n",
+      "2209.06584.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Active consumption of digital documents has yi...\n",
+      "1     title  #/texts/2  One-Shot Doc Snippet Detection: Powering Searc...\n",
+      "2    author  #/texts/2                                       Abhinav Java\n",
+      "3    author  #/texts/2                                     Milan Aggarwal\n",
+      "4    author  #/texts/2                                     Surgan Jandial\n",
+      "5    author  #/texts/2                                     Mausoom Sarkar\n",
+      "6    author  #/texts/2                               Balaji Krishnamurthy\n",
+      "7    author  #/texts/3                         Data Science Research Labs\n",
+      "2003.07560.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Tabular data is a crucial form of inf...\n",
+      "1    author  #/texts/3                                           Yiren Li\n",
+      "2     title  #/texts/3                                                  ‖\n",
+      "3    author  #/texts/3                                        Zheng Huang\n",
+      "4    author  #/texts/3                                         Junchi Yan\n",
+      "5    author  #/texts/3                                            Yi Zhou\n",
+      "6    author  #/texts/3                                             Fan Ye\n",
+      "7    author  #/texts/3                                        Xianhui Liu\n",
+      "8    author  #/texts/6                      Shanghai Jiao Tong University\n",
+      "9    author  #/texts/7                      Shanghai Jiao Tong University\n",
+      "2203.01017.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #                            a. Picture of a table: \n",
+      "1     title  #/texts/2  TableFormer: Table Structure Understanding wit...\n",
+      "2    author  #/texts/2                                       Ahmed Nassar\n",
+      "3    author  #/texts/2                                Nikolaos Livathinos\n",
+      "4    author  #/texts/2                                       Maksym Lysak\n",
+      "5    author  #/texts/2                                        Peter Staar\n",
+      "2208.08037.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  To satisfy various user needs, different subta...\n",
+      "1      title  #/texts/2  UniLayout: Taming Unified Sequence-to-Sequence...\n",
+      "2     author  #/texts/2                                      Zhaoyun Jiang\n",
+      "3     author  #/texts/2                                         Huayu Deng\n",
+      "4     author  #/texts/2                                        Zhongkai Wu\n",
+      "5     author  #/texts/2                                          Jiaqi Guo\n",
+      "6     author  #/texts/2                                        Shizhao Sun\n",
+      "7     author  #/texts/2                                     Vuksan Mijovic\n",
+      "8     author  #/texts/2                                       Zijiang Yang\n",
+      "9     author  #/texts/2                                      Dongmei Zhang\n",
+      "10    author  #/texts/3                                Jiaotong University\n",
+      "11    author  #/texts/3                       Shanghai Jiaotong University\n",
+      "12    author  #/texts/4                                 Beihang University\n",
+      "13    author  #/texts/4                            Microsoft Research Asia\n",
+      "2207.12955.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Most existing scene text detectors f...\n",
+      "1     title  #/texts/2  Contextual Text Block Detection towards Scene ...\n",
+      "2    author  #/texts/2                                         Chuhui Xue\n",
+      "3    author  #/texts/2                                      Jiaxing Huang\n",
+      "4    author  #/texts/2                                         Shijian Lu\n",
+      "5    author  #/texts/2                                       Changhu Wang\n",
+      "6    author  #/texts/2                                           Song Bai\n",
+      "7    author  #/texts/3                   Nanyang Technological University\n",
+      "8    author  #/texts/5                                          Dance Inc\n",
+      "2305.02769.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Table detection is the task of class...\n",
+      "1     title  #/texts/2  Towards End-to-End Semi-Supervised Table Detec...\n",
+      "2    author  #/texts/2                                    Tahira Shehzadi\n",
+      "3    author  #/texts/2                               Khurram Azeem Hashmi\n",
+      "4    author  #/texts/2                                    Didier Stricker\n",
+      "5    author  #/texts/2                                     Marcus Liwicki\n",
+      "6    author  #/texts/2                              Muhammad Zeshan Afzal\n",
+      "2303.05325.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Abstract. While strides have been made in deep...\n",
+      "1      title  #/texts/2  BaDLAD: A Large Multi-Domain Bengali Document ...\n",
+      "2     author  #/texts/2                              Istiak Hossain Shihab\n",
+      "3     author  #/texts/2                                      Rakibul Hasan\n",
+      "4     author  #/texts/2                               Mahfuzur Rahman Emon\n",
+      "5     author  #/texts/3                               Syed Mobassir Hossen\n",
+      "6     author  #/texts/3                                  Nazmuddoha Ansary\n",
+      "7     author  #/texts/3                                      Intesur Ahmed\n",
+      "8     author  #/texts/3                                        Fazle Rabbi\n",
+      "9     author  #/texts/4                              Shahriar Elahi Dhruvo\n",
+      "10    author  #/texts/4                                   Akib Hasan Pavel\n",
+      "11    author  #/texts/5                                Marsia Haque Meghla\n",
+      "12    author  #/texts/5                                     Rezwanul Haque\n",
+      "13    author  #/texts/5                            Sayma Sultana Chowdhury\n",
+      "14    author  #/texts/6                                      Tahsin Reasat\n",
+      "15    author  #/texts/6                               Ahmed Imtiaz Humayun\n",
+      "16    author  #/texts/6                                       Asif Sushmit\n",
+      "17    author  #/texts/8                              Vanderbilt University\n",
+      "2011.13534.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Documents are a core part of many businesses i...\n",
+      "1     title  #/texts/2  A Survey of Deep Learning Approaches for OCR a...\n",
+      "2    author  #/texts/2                            Nishant Subramani Scale\n",
+      "3    author  #/texts/3                             Alexandre Matton Scale\n",
+      "4    author  #/texts/6                                         Adrian Lam\n",
+      "2308.14978.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document pre-trained models and grid-based mod...\n",
+      "1     title  #/texts/2  Vision Grid Transformer for Document Layout An...\n",
+      "2    author  #/texts/2                                           Cheng Da\n",
+      "3    author  #/texts/2                                         Chuwei Luo\n",
+      "4    author  #/texts/2                                           Qi Zheng\n",
+      "5    author  #/texts/2                                           Cong Yao\n",
+      "2207.11871.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document Visual Question Answering (VQA) aims ...\n",
+      "1     title  #/texts/2  Towards Complex Document Understanding By Disc...\n",
+      "2    author  #/texts/2                                        Fengbin Zhu\n",
+      "3    author  #/texts/2                                       Wenqiang Lei\n",
+      "4    author  #/texts/2                                          Fuli Feng\n",
+      "5    author  #/texts/2                                          Chao Wang\n",
+      "6    author  #/texts/2                                      Haozhou Zhang\n",
+      "7    author  #/texts/4                                    Estates Pte Ltd\n",
+      "8    author  #/texts/5                                 Sichuan University\n",
+      "2308.11788.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We present an extensible method for ...\n",
+      "1     title  #/texts/2  An extensible point-based method for data char...\n",
+      "2    author  #/texts/2                                        Carlos Soto\n",
+      "3    author  #/texts/2                                        Shinjae Yoo\n",
+      "4    author  #/texts/3                     Brookhaven National Laboratory\n",
+      "1912.13318.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Pre-training techniques have been verified suc...\n",
+      "1     title  #/texts/2  LayoutLM: Pre-training of Text and Layout for ...\n",
+      "2    author  #/texts/2                                          Yiheng Xu\n",
+      "3    author  #/texts/4                                      Shaohan Huang\n",
+      "4    author  #/texts/4                            Microsoft Research Asia\n",
+      "5    author  #/texts/5                                           Furu Wei\n",
+      "6    author  #/texts/5                            Microsoft Research Asia\n",
+      "7    author  #/texts/6                                          Ming Zhou\n",
+      "8    author  #/texts/6                            Microsoft Research Asia\n",
+      "2208.11203.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Tables are widely used in several typ...\n",
+      "1     title  #/texts/2  Graph Neural Networks and Representation Embed...\n",
+      "2    author  #/texts/2                                     Andrea Gemelli\n",
+      "3    author  #/texts/3                                    Emanuele Vivoli\n",
+      "2305.02549.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  The recent advent of self-supervised pretraini...\n",
+      "1      title  #/texts/2  FormNetV2: Multimodal Graph Contrastive Learni...\n",
+      "2     author  #/texts/2                                          Hao Zhang\n",
+      "3     author  #/texts/2                                      Timothy Dozat\n",
+      "4     author  #/texts/2                                      Vincent Perot\n",
+      "5     author  #/texts/2                                         Guolong Su\n",
+      "6     author  #/texts/2                                        Xiang Zhang\n",
+      "7     author  #/texts/2                                   Nikolai Glushnev\n",
+      "8     author  #/texts/2                                       Renshen Wang\n",
+      "9     author  #/texts/2                                     Joshua Ainslie\n",
+      "10    author  #/texts/2                                     Shangbang Long\n",
+      "11    author  #/texts/2                                         Siyang Qin\n",
+      "12    author  #/texts/2                                     Yasuhisa Fujii\n",
+      "13    author  #/texts/2                                            Nan Hua\n",
+      "14    author  #/texts/2                                       Google Cloud\n",
+      "15    author  #/texts/2                                    Google Research\n",
+      "16    author  #/texts/2                                       Google Cloud\n",
+      "2104.02416.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Generative models able to synthesize layouts o...\n",
+      "1     title  #/texts/2  Variational Transformer Networks for Layout Ge...\n",
+      "2    author  #/texts/2                                Diego Martin Arroyo\n",
+      "3    author  #/texts/4                                      Janis Postels\n",
+      "2206.01062.pdf\n",
+      "  subtype  subj_path                                               name\n",
+      "0   title  #/texts/2  DocLayNet: A Large Human-Annotated Dataset for...\n",
+      "1  author  #/texts/2                                   Birgit Pfitzmann\n",
+      "2  author  #/texts/2                               Research Rueschlikon\n",
+      "3  author  #/texts/3                                     Christoph Auer\n",
+      "4  author  #/texts/4                               Research Rueschlikon\n",
+      "5  author  #/texts/5                               Research Rueschlikon\n",
+      "6  author  #/texts/6                                        Peter Staar\n",
+      "7  author  #/texts/6                               Research Rueschlikon\n",
+      "2212.09621.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Unsupervised pre-training on millions of digit...\n",
+      "1      title  #/texts/2  WUKONG-READER: Multi-modal Pre-training for Fi...\n",
+      "2     author  #/texts/2                                          Haoli Bai\n",
+      "3     author  #/texts/2                                       Zhiguang Liu\n",
+      "4     author  #/texts/2                                       Xiaojun Meng\n",
+      "5     author  #/texts/2                                          Wentao Li\n",
+      "6     author  #/texts/2                                         Shuang Liu\n",
+      "7     author  #/texts/2                                           Nian Xie\n",
+      "8     author  #/texts/2                                       Rongfu Zheng\n",
+      "9     author  #/texts/2                                      Liangwei Wang\n",
+      "10    author  #/texts/2                                             Lu Hou\n",
+      "11    author  #/texts/2                                      Jiansheng Wei\n",
+      "12    author  #/texts/2                                          Xin Jiang\n",
+      "13    author  #/texts/2                                Qun Liu Huawei Noah\n",
+      "14    author  #/texts/2                                            Ark Lab\n",
+      "2309.09506.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Graphic layout generation, a growing research ...\n",
+      "1     title  #/texts/2  LAYOUTNUWA: REVEALING THE HIDDEN LAYOUT EXPERT...\n",
+      "2    author  #/texts/2                                       Zecheng Tang\n",
+      "3    author  #/texts/2                                         Chenfei Wu\n",
+      "4    author  #/texts/2                                          Juntao Li\n",
+      "5    author  #/texts/2                                           Nan Duan\n",
+      "6    author  #/texts/2                                 Soochow University\n",
+      "7    author  #/texts/2                            Microsoft Research Asia\n",
+      "2305.03393.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Extracting tables from documents is ...\n",
+      "1     title  #/texts/2  Optimized Table Tokenization for Table Structu...\n",
+      "2    author  #/texts/2                                       Maksym Lysak\n",
+      "3    author  #/texts/2                                       Ahmed Nassar\n",
+      "4    author  #/texts/3                                Nikolaos Livathinos\n",
+      "5    author  #/texts/3                                     Christoph Auer\n",
+      "6    author  #/texts/4                                        Peter Staar\n",
+      "2206.00785.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Document understanding is a key busin...\n",
+      "1     title  #/texts/2  Delivering Document Conversion as a Cloud Serv...\n",
+      "2    author  #/texts/2                                     Christoph Auer\n",
+      "3    author  #/texts/2                                Research Ruschlikon\n",
+      "4    author  #/texts/3                                Research Ruschlikon\n",
+      "5    author  #/texts/4                                      Michele Dolfi\n",
+      "6    author  #/texts/4                                Research Ruschlikon\n",
+      "7    author  #/texts/5                                            J Staar\n",
+      "8    author  #/texts/5                                Research Ruschlikon\n",
+      "2108.01249.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  unified manner, 2) how to formulate the learni...\n",
+      "1     title  #/texts/2                          Kota Yamaguchi CyberAgent\n",
+      "2106.15117.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We present a novel data generation tool for do...\n",
+      "1     title  #/texts/2  SDL: NEW DATA GENERATION TOOLS FOR FULL-LEVEL ...\n",
+      "2    author  #/texts/2                                  Nguyen Truong Son\n",
+      "2303.13839.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  The problem of document structure reconstructi...\n",
+      "1     title  #/texts/2  HRDoc: Dataset and Baseline Method toward Hier...\n",
+      "2    author  #/texts/2                                         Jiefeng Ma\n",
+      "3    author  #/texts/2                                             Jun Du\n",
+      "4    author  #/texts/2                                         Pengfei Hu\n",
+      "5    author  #/texts/2                                     Zhenrong Zhang\n",
+      "6    author  #/texts/2                                      Jianshu Zhang\n",
+      "7    author  #/texts/2                                         Huihui Zhu\n",
+      "8    author  #/texts/2                                           Cong Liu\n",
+      "2004.08686.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Deep learning-based approaches for automatic d...\n",
+      "1     title  #/texts/2  A Large Dataset of Historical Japanese Documen...\n",
+      "2    author  #/texts/2            Zejiang Shen Kaixuan Zhang Melissa Dell\n",
+      "3    author  #/texts/3                                 Harvard University\n",
+      "2305.10448.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  This paper presents GenDoc, a general sequence...\n",
+      "1     title  #/texts/2  Sequence-to-Sequence Pre-training with Unified...\n",
+      "2    author  #/texts/2                                        Shuwei Feng\n",
+      "3    author  #/texts/2                                      Tianyang Zhan\n",
+      "4    author  #/texts/2          Zhanming Jie Trung Quoc Luong Xiaoran Jin\n",
+      "2108.09433.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Precise boundary annotations of imag...\n",
+      "1     title  #/texts/5                                                  )\n",
+      "2203.15143.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Scene text detection and document layout analy...\n",
+      "1     title  #/texts/2  Towards End-to-End Unified Scene Text Detectio...\n",
+      "2    author  #/texts/2                                     Shangbang Long\n",
+      "3    author  #/texts/2                                         Siyang Qin\n",
+      "4    author  #/texts/2                                   Dmitry Panteleev\n",
+      "5    author  #/texts/2                                Alessandro Bissacco\n",
+      "6    author  #/texts/2                                     Yasuhisa Fujii\n",
+      "7    author  #/texts/2                    Michalis Raptis Google Research\n",
+      "2212.09877.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Graphic layout designs play an essential role ...\n",
+      "1      title  #/texts/2  LayoutDETR: Detection Transformer Is a Good Mu...\n",
+      "2     author  #/texts/2                                            Ning Yu\n",
+      "3     author  #/texts/2                                        Zeyuan Chen\n",
+      "4     author  #/texts/2                                           Rui Meng\n",
+      "5     author  #/texts/2                                            Gang Wu\n",
+      "6     author  #/texts/3                                         Paul Josel\n",
+      "7     author  #/texts/3                                Juan Carlos Niebles\n",
+      "8     author  #/texts/3                                      Caiming Xiong\n",
+      "9     author  #/texts/3                                             Ran Xu\n",
+      "10    author  #/texts/4                                Salesforce Research\n",
+      "2303.10787.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We develop a diffusion-based approac...\n",
+      "1     title  #/texts/2         Diffusion-based Document Layout Generation\n",
+      "2    author  #/texts/2                                          Yijuan Lu\n",
+      "3    author  #/texts/2                                       John Corring\n",
+      "4    author  #/texts/2                                    Dinei Florencio\n",
+      "5    author  #/texts/2                                          Cha Zhang\n",
+      "6    author  #/texts/3                                  Purdue University\n",
+      "7    author  #/texts/3                                     West Lafayette\n",
+      "8    author  #/texts/4                                    Microsoft Cloud\n",
+      "2201.11438.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Understanding documents with rich la...\n",
+      "1     title  #/texts/2  DocSegTr: An Instance-Level End-to-End Documen...\n",
+      "2    author  #/texts/2                                      Sanket Biswas\n",
+      "3    author  #/texts/2                                      Ayan Banerjee\n",
+      "4    author  #/texts/2                                         Josep Llad\n",
+      "5    author  #/texts/3                             Computer Vision Center\n",
+      "6    author  #/texts/3                        Computer Science Department\n",
+      "7    author  #/texts/4                                    Universitat Aut\n",
+      "2211.08863.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Infographics are often an integral component o...\n",
+      "1     title  #/texts/2  ChartParser: Automatic Chart Parsing for Print...\n",
+      "2    author  #/texts/3                                     Anukriti Kumar\n",
+      "3    author  #/texts/3                                        Tanuja Ganu\n",
+      "4    author  #/texts/3                                        Saikat Guha\n",
+      "5    author  #/texts/4                                 Microsoft Research\n",
+      "2111.05736.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Nowadays, metadata information is often given ...\n",
+      "1     title  #/texts/2  Multimodal Approach for Metadata Extraction fr...\n",
+      "2    author  #/texts/2                               Azeddine Bouabdallah\n",
+      "3    author  #/texts/3                                      Jorge Gavilan\n",
+      "4    author  #/texts/4                                     Jennifer Gerbl\n",
+      "5    author  #/texts/5                            Prayuth Patumcharoenpol\n",
+      "2206.10253.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Printed documents continue to be a challenge f...\n",
+      "1    author  #/texts/2  Anukriti Kumar Tanuja Ganu Saikat Guha Microso...\n",
+      "2     title  #/texts/2   Document Navigability: A Need for Print-Impaired\n",
+      "2306.01058.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Recent work has shown that infusing layout fea...\n",
+      "1     title  #/texts/2  Are Layout-Infused Language Models Robust to L...\n",
+      "2    author  #/texts/2                                     Catherine Chen\n",
+      "3    author  #/texts/2                                       Zejiang Shen\n",
+      "4    author  #/texts/2                                          Dan Klein\n",
+      "5    author  #/texts/3                                  Gabriel Stanovsky\n",
+      "6    author  #/texts/3                                        Doug Downey\n",
+      "7    author  #/texts/3                                            Kyle Lo\n",
+      "8    author  #/texts/4                                    Allen Institute\n",
+      "9    author  #/texts/4                            Northwestern University\n",
+      "2112.05112.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Creating visual layouts is an important step i...\n",
+      "1     title  #/texts/2  BLT: Bidirectional Layout Transformer for Cont...\n",
+      "2    author  #/texts/2                                         Xiang Kong\n",
+      "3    author  #/texts/2                                           Lu Jiang\n",
+      "4    author  #/texts/2                                       Huiwen Chang\n",
+      "5    author  #/texts/2                                          Han Zhang\n",
+      "6    author  #/texts/2                                           Yuan Hao\n",
+      "7    author  #/texts/2                                       Haifeng Gong\n",
+      "8    author  #/texts/2                                         Irfan Essa\n",
+      "2303.00289.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  In this paper, we present StrucTexTv2, an effe...\n",
+      "1      title  #/texts/2  STRUCTEXTV2: MASKED VISUAL-TEXTUAL PREDIC-TION...\n",
+      "2     author  #/texts/2                                         Yuechen Yu\n",
+      "3     author  #/texts/2                                           Yulin Li\n",
+      "4     author  #/texts/2                                    Chengquan Zhang\n",
+      "5     author  #/texts/2                                    Xiaoqiang Zhang\n",
+      "6     author  #/texts/2                                       Zengyuan Guo\n",
+      "7     author  #/texts/3                                        Xiameng Qin\n",
+      "8     author  #/texts/3                                            Kun Yao\n",
+      "9     author  #/texts/3                                          Junyu Han\n",
+      "10    author  #/texts/3                                         Errui Ding\n",
+      "11    author  #/texts/3                                      Jingdong Wang\n",
+      "12    author  #/texts/4                                          Baidu Inc\n",
+      "2106.03331.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  We propose SelfDoc, a task-agnostic pre-traini...\n",
+      "1      title  #/texts/2  SelfDoc: Self-Supervised Document Representati...\n",
+      "2     author  #/texts/2                                         Peizhao Li\n",
+      "3     author  #/texts/2                                        Jiuxiang Gu\n",
+      "4     author  #/texts/2                                     Vlad I Morariu\n",
+      "5     author  #/texts/2                                       Handong Zhao\n",
+      "6     author  #/texts/2                                         Rajiv Jain\n",
+      "7     author  #/texts/2                                   Varun Manjunatha\n",
+      "8     author  #/texts/2                                         Hongfu Liu\n",
+      "9     author  #/texts/2                                Brandeis University\n",
+      "10    author  #/texts/2                                     Adobe Research\n",
+      "2210.17246.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract Scientific documents contain tables t...\n",
+      "1     title  #/texts/2  Tables to LaTeX: Structure and Content Extract...\n",
+      "2    author  #/texts/2                                       Pratik Kayal\n",
+      "3    author  #/texts/2                                        Harsh Desai\n",
+      "4    author  #/texts/2                                       Mayank Singh\n",
+      "2211.15504.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  This paper presents an application of the Layo...\n",
+      "1     title  #/texts/2           Semantic Table Detection with LayoutLMv3\n",
+      "2    author  #/texts/2                                       Ivan Silajev\n",
+      "3    author  #/texts/3                                       Niels Victor\n",
+      "2304.01577.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Compared to general document analysis tasks, f...\n",
+      "1     title  #/texts/2  Form-NLU: Dataset for the Form Language Unders...\n",
+      "2    author  #/texts/7                          Hyunsuk Chung FortifyEdge\n",
+      "2107.02638.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Despite significant progress on curr...\n",
+      "1     title  #/texts/2  DocSynth: A Layout Guided Approach for Control...\n",
+      "2    author  #/texts/2                                      Sanket Biswas\n",
+      "3    author  #/texts/2                                           Pau Riba\n",
+      "4    author  #/texts/4                             Computer Vision Center\n",
+      "5    author  #/texts/4                        Computer Science Department\n",
+      "6    author  #/texts/5                                    Universitat Aut\n",
+      "2307.16369.pdf\n",
+      "    subtype subj_path                                               name\n",
+      "0  abstract         #  Document understanding and information extract...\n",
+      "2204.08387.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Self-supervised pre-training techniques have a...\n",
+      "1     title  #/texts/2  LayoutLMv3: Pre-training for Document AI with ...\n",
+      "2    author  #/texts/2                                        Yupan Huang\n",
+      "3    author  #/texts/2                                            Lei Cui\n",
+      "4    author  #/texts/2                                          Yutong Lu\n",
+      "5    author  #/texts/2                                           Furu Wei\n",
+      "6    author  #/texts/3                                            Sun Yat\n",
+      "7    author  #/texts/4                                 Microsoft Research\n",
+      "2206.11229.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Information extraction from semi-str...\n",
+      "1     title  #/texts/3  Maty' aˇs Skalick' y, ˇ Stˇ ep' an ˇ Simsa, Mi...\n",
+      "2305.04609.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Instance-level segmentation of docum...\n",
+      "1     title  #/texts/2  SwinDocSegmenter: An End-to-End Unified Domain...\n",
+      "2    author  #/texts/2                                      Ayan Banerjee\n",
+      "3    author  #/texts/2                                      Sanket Biswas\n",
+      "4    author  #/texts/3                                         Josep Llad\n",
+      "5    author  #/texts/4                             Computer Vision Center\n",
+      "6    author  #/texts/4        Computer Science Department Universitat Aut\n",
+      "2008.02569.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We introduce a new dataset for graph...\n",
+      "1     title  #/texts/2  IIIT-AR-13K: A New Dataset for Graphical Objec...\n",
+      "2    author  #/texts/2                                        Peter Lipps\n",
+      "3    author  #/texts/6                             Open Text Software Gmb\n",
+      "2203.02378.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Image Transformer has recently achieved signif...\n",
+      "1     title  #/texts/2  DIT: SELF-SUPERVISED PRE-TRAINING FOR DOCUMENT...\n",
+      "2    author  #/texts/2                                         Junlong Li\n",
+      "3    author  #/texts/2                                          Yiheng Xu\n",
+      "4    author  #/texts/2                                            Lei Cui\n",
+      "5    author  #/texts/2                                          Cha Zhang\n",
+      "6    author  #/texts/2                                           Furu Wei\n",
+      "7    author  #/texts/3                      Shanghai Jiao Tong University\n",
+      "8    author  #/texts/4                                 Microsoft Research\n",
+      "9    author  #/texts/5                                    Microsoft Azure\n",
+      "2203.13530.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Document intelligence as a relatively...\n",
+      "1     title  #/texts/2  Multimodal Pre-training Based on Graph Attenti...\n",
+      "2    author  #/texts/2                                     Zhenrong Zhang\n",
+      "3    author  #/texts/2                                         Jiefeng Ma\n",
+      "4    author  #/texts/2                                             Jun Du\n",
+      "5    author  #/texts/2                                       Licheng Wang\n",
+      "6    author  #/texts/2                                      Jianshu Zhang\n",
+      "2302.08575.pdf\n",
+      "Empty DataFrame\n",
+      "Columns: [subtype, subj_path, name]\n",
+      "Index: []\n",
+      "2205.02411.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Relational understanding is critical for a num...\n",
+      "1     title  #/texts/2  Relational Representation Learning in Visually...\n",
+      "2    author  #/texts/2                                             Xin Li\n",
+      "3    author  #/texts/2                                          Yunfei Wu\n",
+      "4    author  #/texts/3                                          Yan Zheng\n",
+      "5    author  #/texts/3                                          Yiqing Hu\n",
+      "6    author  #/texts/3         Haoyu Cao Deqiang Jiang Yinsong Liu Bo Ren\n",
+      "7    author  #/texts/4                                      Tencent YouTu\n",
+      "2305.08719.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Document layout analysis is a crucial prerequi...\n",
+      "1      title  #/texts/2  M $^{6}$Doc: A Large-Scale Multi-Format, Multi...\n",
+      "2     author  #/texts/2                                        Hiuyi Cheng\n",
+      "3     author  #/texts/2                                      Peirong Zhang\n",
+      "4     author  #/texts/2                                          Sihang Wu\n",
+      "5     author  #/texts/2                                       Jiaxin Zhang\n",
+      "6     author  #/texts/3                                         Qiyuan Zhu\n",
+      "7     author  #/texts/3                                        Zecheng Xie\n",
+      "8     author  #/texts/3                                            Jing Li\n",
+      "9     author  #/texts/3                                           Kai Ding\n",
+      "10    author  #/texts/3                                        Lianwen Jin\n",
+      "11    author  #/texts/5             Huawei Cloud Computing Technologies Co\n",
+      "12    author  #/texts/6                                 Sig Information Co\n",
+      "2305.04833.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Table Detection has become a fundamental task ...\n",
+      "1     title  #/texts/1  Revisiting Table Detection Datasets for Visual...\n",
+      "2    author  #/texts/1                                           Bin Xiao\n",
+      "3    author  #/texts/1                                       Murat Simsek\n",
+      "4    author  #/texts/1                                     Burak Kantarci\n",
+      "5    author  #/texts/3                                         Lytica Inc\n",
+      "6    author  #/texts/3                                       Legget Drive\n",
+      "2108.00871.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  It is common in graphic design humans visually...\n",
+      "1     title  #/texts/2  Constrained Graphic Layout Generation via Late...\n",
+      "2    author  #/texts/2                                     Kotaro Kikuchi\n",
+      "3    author  #/texts/3                         Waseda University Shinjuku\n",
+      "4    author  #/texts/4                                         Mayu Otani\n",
+      "5    author  #/texts/5                                      Agent Shibuya\n",
+      "6    author  #/texts/6        Edgar Simo-Serra Waseda University Shinjuku\n",
+      "2306.08937.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Visually-Rich Document Entity Retrieval (VDER)...\n",
+      "1     title  #/texts/2  Document Entity Retrieval with Massive and Noi...\n",
+      "2    author  #/texts/2                                           Lijun Yu\n",
+      "3    author  #/texts/2                                           Jin Miao\n",
+      "4    author  #/texts/2                                         Xiaoyu Sun\n",
+      "5    author  #/texts/2                                         Jiayi Chen\n",
+      "6    author  #/texts/2                              Alexander G Hauptmann\n",
+      "7    author  #/texts/2                                         Hanjun Dai\n",
+      "8    author  #/texts/2                                            Wei Wei\n",
+      "2103.05908.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We combine deep learning and Conditional Proba...\n",
+      "1     title  #/texts/2  DeepCPCFG: Deep Learning and Context Free Gram...\n",
+      "2    author  #/texts/2                                      Freddy C Chua\n",
+      "3    author  #/texts/2                                  High Street Suite\n",
+      "4    author  #/texts/2                                          Palo Alto\n",
+      "5    author  #/texts/3                                      Nigel P Duffy\n",
+      "6    author  #/texts/4                                  High Street Suite\n",
+      "7    author  #/texts/4                                          Palo Alto\n",
+      "2109.01078.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Transformer-based pre-training techniques of t...\n",
+      "1     title  #/texts/2  Skim-Attention: Learning to Focus via Document...\n",
+      "2    author  #/texts/2                                       Laura Nguyen\n",
+      "3    author  #/texts/2                                     Thomas Scialom\n",
+      "4    author  #/texts/2                                     Jacopo Staiano\n",
+      "5    author  #/texts/2                                Benjamin Piwowarski\n",
+      "6    author  #/texts/3                                 Sorbonne Universit\n",
+      "2108.09436.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Handwritten documents are often char...\n",
+      "1     title  #/texts/2  Palmira: A Deep Deformable Network for Instanc...\n",
+      "2    author  #/texts/2                                       Sowmya Aitha\n",
+      "3    author  #/texts/3                                   Abhishek Trivedi\n",
+      "4    author  #/texts/4                         Ravi Kiran Sarvadevabhatla\n",
+      "2308.01971.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We introduce a novel bottom-up appro...\n",
+      "1     title  #/texts/2  SpaDen : Sparse and Dense Keypoint Estimation ...\n",
+      "2    author  #/texts/2                                       Saleem Ahmed\n",
+      "3    author  #/texts/2                                         Pengyu Yan\n",
+      "4    author  #/texts/3                                     David Doermann\n",
+      "5    author  #/texts/3                                 Srirangaraj Setlur\n",
+      "6    author  #/texts/4                                   Venu Govindaraju\n",
+      "2203.08504.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  This paper presents a systematic literature re...\n",
+      "1     title  #/texts/2     A Survey of Historical Document Image Datasets\n",
+      "2    author  #/texts/2                             Konstantina Nikolaidou\n",
+      "3    author  #/texts/2                                      Hamam Mokayed\n",
+      "4    author  #/texts/2                                     Marcus Liwicki\n",
+      "5    author  #/texts/3                             Machine Learning Group\n",
+      "2304.14953.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. In recent years, the field of docume...\n",
+      "1     title  #/texts/2  CCpdf: Building a High Quality Corpus for Visu...\n",
+      "2    author  #/texts/2                                    Karol Kaczmarek\n",
+      "3    author  #/texts/4                         Adam Mickiewicz University\n",
+      "2307.12571.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document dewarping from a distorted camera-cap...\n",
+      "1     title  #/texts/2  MataDoc: Margin and Text Aware Document Dewarp...\n",
+      "2    author  #/texts/2                                          Beiya Dai\n",
+      "3    author  #/texts/2                                          Qunyi Xie\n",
+      "4    author  #/texts/2                                           Yulin Li\n",
+      "5    author  #/texts/2                                        Xiameng Qin\n",
+      "6    author  #/texts/2                                    Chengquan Zhang\n",
+      "7    author  #/texts/2                                            Kun Yao\n",
+      "8    author  #/texts/2                                          Junyu Han\n",
+      "9    author  #/texts/2                                          Baidu Inc\n",
+      "2204.10939.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Document intelligence automates the extraction...\n",
+      "1      title  #/texts/2  Unified Pretraining Framework for Document Und...\n",
+      "2     author  #/texts/2                                        Jiuxiang Gu\n",
+      "3     author  #/texts/2                                     Vlad I Morariu\n",
+      "4     author  #/texts/2                                       Handong Zhao\n",
+      "5     author  #/texts/2                                Nikolaos Barmpalios\n",
+      "6     author  #/texts/2                                         Rajiv Jain\n",
+      "7     author  #/texts/2                                        Ani Nenkova\n",
+      "8     author  #/texts/2                                           Tong Sun\n",
+      "9     author  #/texts/3                                     Adobe Research\n",
+      "10    author  #/texts/3                               Adobe Document Cloud\n",
+      "2308.10511.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Computer Science and Engineering Shahjalal Uni...\n",
+      "1    author  #/texts/3                                   Computer Science\n",
+      "2     title  #/texts/3                                     Shrestha Datta\n",
+      "3    author  #/texts/4                                      Raisa Fairooz\n",
+      "2205.08094.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We present MATrIX-a Modality-Aware Transformer...\n",
+      "1     title  #/texts/2  MATrIX-Modality-Aware Transformer for Informat...\n",
+      "2    author  #/texts/2                                     Thomas Delteil\n",
+      "3    author  #/texts/4                                     Edouard Belval\n",
+      "4    author  #/texts/6                                           Lei Chen\n",
+      "2010.01762.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  In layout object detection problems, the groun...\n",
+      "1     title  #/texts/2  OLALA : Object-Level Active Learning based Lay...\n",
+      "2    author  #/texts/2                                       Zejiang Shen\n",
+      "3    author  #/texts/2                                          Jian Zhao\n",
+      "4    author  #/texts/2                                       Melissa Dell\n",
+      "5    author  #/texts/2                                        Yaoliang Yu\n",
+      "6    author  #/texts/2                                         Weining Li\n",
+      "7    author  #/texts/3                                 Harvard University\n",
+      "2106.14616.pdf\n",
+      "  subtype  subj_path                                               name\n",
+      "0   title  #/texts/2  ICDAR 2021 Competition on Scientific Literatur...\n",
+      "1  author  #/texts/2                                        Peter Zhong\n",
+      "2  author  #/texts/2                                    Douglas Burdick\n",
+      "3  author  #/texts/7                                   Research Almaden\n",
+      "2308.01979.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We present a comprehensive study of ...\n",
+      "1     title  #/texts/3              Saleem Ahmed() [0000-0001-8648-9625],\n",
+      "2    author  #/texts/3                                      Bhavin Jawade\n",
+      "3    author  #/texts/3                                     Shubham Pandey\n",
+      "4    author  #/texts/3                                 Srirangaraj Setlur\n",
+      "5    author  #/texts/3                                   Venu Govindaraju\n",
+      "2308.15517.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Document AI aims to automatically analyze docu...\n",
+      "1     title  #/texts/1  Document AI: A Comparative Study of Transforme...\n",
+      "2    author  #/texts/2                                      Shaomu Tan Uv\n",
+      "2012.14740.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Pre-training of text and layout has proved eff...\n",
+      "1      title  #/texts/2  LAYOUTLMV2: MULTI-MODAL PRE-TRAINING FOR VISUA...\n",
+      "2     author  #/texts/2                                            Yang Xu\n",
+      "3     author  #/texts/2                                          Yiheng Xu\n",
+      "4     author  #/texts/2                                            Lei Cui\n",
+      "5     author  #/texts/2                                           Furu Wei\n",
+      "6     author  #/texts/2                                        Guoxin Wang\n",
+      "7     author  #/texts/2                                          Yijuan Lu\n",
+      "8     author  #/texts/3                                    Dinei Florencio\n",
+      "9     author  #/texts/3                                          Cha Zhang\n",
+      "10    author  #/texts/3                                       Wanxiang Che\n",
+      "11    author  #/texts/3                                          Min Zhang\n",
+      "12    author  #/texts/3                                        Lidong Zhou\n",
+      "13    author  #/texts/5                            Microsoft Research Asia\n",
+      "14    author  #/texts/6                                    Microsoft Cloud\n",
+      "15    author  #/texts/7                                 Soochow University\n",
+      "2304.12506.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Online learning and academic conferen...\n",
+      "1     title  #/texts/2  DualSlide: Global-to-Local Sketching Interface...\n",
+      "2    author  #/texts/2                                         Xusheng Du\n",
+      "3    author  #/texts/2                                         Haoran Xie\n",
+      "2008.10831.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Localizing page elements/objects such...\n",
+      "1     title  #/texts/2  CDeC-Net: Composite Deformable Cascade Network...\n",
+      "2    author  #/texts/2                                     Madhav Agarwal\n",
+      "3    author  #/texts/4                                        C V Jawahar\n",
+      "2301.11529.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Layout design is an important task in various ...\n",
+      "1     title  #/texts/2  PLay: Parametrically Conditioned Layout Genera...\n",
+      "2    author  #/texts/2                                      Forrest Huang\n",
+      "3    author  #/texts/2                                            Gang Li\n",
+      "4    author  #/texts/2                                            Yang Li\n",
+      "2308.13769.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Document digitization is vital for pr...\n",
+      "1     title  #/texts/2   Bengali Document Layout Analysis with Detectron2\n",
+      "2    author  #/texts/2                                        Md Ataullha\n",
+      "3    author  #/texts/2                                Mahedi Hassan Rabby\n",
+      "4    author  #/texts/2                                   Mushfiqur Rahman\n",
+      "5    author  #/texts/2                                Tahsina Bintay Azam\n",
+      "2112.12703.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Archivists, textual scholars, and hi...\n",
+      "1     title  #/texts/2  Digital Editions as Distant Supervision for La...\n",
+      "2    author  #/texts/2                                Alejandro H Toselli\n",
+      "3    author  #/texts/2                                              Si Wu\n",
+      "4    author  #/texts/2                                      David A Smith\n",
+      "5    author  #/texts/4                            Northeastern University\n",
+      "2305.08455.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  We call on the Document AI (DocAI) community t...\n",
+      "1      title  #/texts/2  Document Understanding Dataset and Evaluation ...\n",
+      "2     author  #/texts/2                                Jordy Van Landeghem\n",
+      "3     author  #/texts/2                                   Dawid Jurkiewicz\n",
+      "4     author  #/texts/2                                   Bertrand Ackaert\n",
+      "5     author  #/texts/2                                     Ernest Valveny\n",
+      "6     author  #/texts/2                                   Matthew Blaschko\n",
+      "7     author  #/texts/2                                         Sien Moens\n",
+      "8     author  #/texts/2                             Computer Vision Center\n",
+      "9     author  #/texts/2                                    Universitat Aut\n",
+      "10    author  #/texts/2                            Jagiellonian University\n",
+      "11    author  #/texts/2                         Adam Mickiewicz University\n",
+      "2212.12975.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  It is difficult to design a visually appealing...\n",
+      "1     title  #/texts/2  Interactive Layout Drawing Interface with Shad...\n",
+      "2    author  #/texts/2                                         Haoran Xie\n",
+      "2306.05749.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Recently, there has been a growing interest in...\n",
+      "1     title  #/texts/2  DocAligner: Annotating Real-world Photographic...\n",
+      "2    author  #/texts/2                                       Jiaxin Zhang\n",
+      "3    author  #/texts/2                                      Bangdong Chen\n",
+      "4    author  #/texts/2                                        Hiuyi Cheng\n",
+      "5    author  #/texts/2                                        Fengjun Guo\n",
+      "6    author  #/texts/2                                           Kai Ding\n",
+      "7    author  #/texts/2                                        Lianwen Jin\n",
+      "8    author  #/texts/3                                 Sig Information Co\n",
+      "2303.11589.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Creating graphic layouts is a fundamental step...\n",
+      "1     title  #/texts/2  LayoutDiffusion: Improving Graphic Layout Gene...\n",
+      "2    author  #/texts/2                                        Junyi Zhang\n",
+      "3    author  #/texts/2                                          Jiaqi Guo\n",
+      "4    author  #/texts/2                                        Shizhao Sun\n",
+      "5    author  #/texts/2                                      Dongmei Zhang\n",
+      "6    author  #/texts/2                      Shanghai Jiao Tong University\n",
+      "7    author  #/texts/2                            Microsoft Research Asia\n",
+      "2012.08191.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-We present docExtractor, a generic ap...\n",
+      "1     title  #/texts/2  docExtractor: An off-the-shelf historical docu...\n",
+      "2    author  #/texts/2                                        Tom Monnier\n",
+      "3    author  #/texts/2                                      Mathieu Aubry\n",
+      "4    author  #/texts/3                                Univ Gustave Eiffel\n",
+      "2106.11539.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We present DocFormer-a multi-modal transformer...\n",
+      "1     title  #/texts/2  DocFormer: End-to-End Transformer for Document...\n",
+      "2    author  #/texts/2                                  Srikar Appalaraju\n",
+      "3    author  #/texts/4                                      Bhavan Jasani\n",
+      "4    author  #/texts/8                                         R Manmatha\n",
+      "2308.12896.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  This paper highlights the need to bring docume...\n",
+      "1     title  #/texts/2  Beyond Document Page Classification: Design, D...\n",
+      "2    author  #/texts/2                                Jordy Van Landeghem\n",
+      "3    author  #/texts/2                                      Sanket Biswas\n",
+      "4    author  #/texts/2                                   Matthew Blaschko\n",
+      "5    author  #/texts/3                             Computer Vision Center\n",
+      "6    author  #/texts/3                                    Universitat Aut\n",
+      "2301.10140.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  The volume of scientific output is creating an...\n",
+      "1      title  #/texts/2            The Semantic Scholar Open Data Platform\n",
+      "2     author  #/texts/2                                 Chloe Anastasiades\n",
+      "3     author  #/texts/2                                     Russell Authur\n",
+      "4     author  #/texts/2                                         Iz Beltagy\n",
+      "5     author  #/texts/2                                     Jonathan Bragg\n",
+      "6     author  #/texts/2                              Alexandra Buraczynski\n",
+      "7     author  #/texts/2                                     Isabel Cachola\n",
+      "8     author  #/texts/2                                      Stefan Candra\n",
+      "9     author  #/texts/2                             Yoganand Chandrasekhar\n",
+      "10    author  #/texts/2                                        Arman Cohan\n",
+      "11    author  #/texts/2                                        Doug Downey\n",
+      "12    author  #/texts/2                                          Rob Evans\n",
+      "13    author  #/texts/2                                     Sergey Feldman\n",
+      "14    author  #/texts/2                                      Joseph Gorney\n",
+      "15    author  #/texts/2                                       David Graham\n",
+      "16    author  #/texts/2                                        Fangzhou Hu\n",
+      "17    author  #/texts/2                                         Regan Huff\n",
+      "18    author  #/texts/2                                        Daniel King\n",
+      "19    author  #/texts/2                                       Bailey Kuehl\n",
+      "20    author  #/texts/2                                     Michael Langan\n",
+      "21    author  #/texts/2                                         Daniel Lin\n",
+      "22    author  #/texts/2                                         Haokun Liu\n",
+      "23    author  #/texts/2                                            Kyle Lo\n",
+      "24    author  #/texts/2                                   Kelsey MacMillan\n",
+      "25    author  #/texts/2                                       Tyler Murray\n",
+      "26    author  #/texts/2                                       Chris Newell\n",
+      "27    author  #/texts/2                                          Smita Rao\n",
+      "28    author  #/texts/2                                    Shaurya Rohatgi\n",
+      "29    author  #/texts/2                                         Paul Sayre\n",
+      "30    author  #/texts/2                                       Zejiang Shen\n",
+      "31    author  #/texts/2                                    Amanpreet Singh\n",
+      "32    author  #/texts/2                                      Luca Soldaini\n",
+      "33    author  #/texts/2                           Shivashankar Subramanian\n",
+      "34    author  #/texts/2                                       Amber Tanaka\n",
+      "35    author  #/texts/2                                        Alex D Wade\n",
+      "36    author  #/texts/2                                       Linda Wagner\n",
+      "37    author  #/texts/2                                       Lucy Lu Wang\n",
+      "38    author  #/texts/2                                        Caroline Wu\n",
+      "39    author  #/texts/2                                    Jiangjiang Yang\n",
+      "40    author  #/texts/2                                    Angele Zamarron\n",
+      "41    author  #/texts/2                               Madeleine Van Zuylen\n",
+      "42    author  #/texts/2                                      Daniel S Weld\n",
+      "2212.02896.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Table of contents (ToC) extraction ai...\n",
+      "1     title  #/texts/2  Multimodal Tree Decoder for Table of Contents ...\n",
+      "2    author  #/texts/2                                         Pengfei Hu\n",
+      "3    author  #/texts/2                                     Zhenrong Zhang\n",
+      "4    author  #/texts/2                                      Jianshu Zhang\n",
+      "5    author  #/texts/2                                             Jun Du\n",
+      "6    author  #/texts/2                                          Jiajia Wu\n",
+      "7    author  #/texts/4                                          P R China\n",
+      "2302.05658.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. This paper introduces the DocILE ben...\n",
+      "1     title  #/texts/2  DocILE Benchmark for Document Information Loca...\n",
+      "2    author  #/texts/2                                         Yash Patel\n",
+      "3    author  #/texts/3                                        Ahmed Hamdi\n",
+      "4    author  #/texts/4                                   Mickael Coustaty\n",
+      "5    author  #/texts/4                               Dimosthenis Karatzas\n",
+      "6    author  #/texts/7                         Czech Technical University\n",
+      "7    author  #/texts/8                             Computer Vision Center\n",
+      "8    author  #/texts/8                                    Universitat Aut\n",
+      "2304.13240.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Accurately extracting structured dat...\n",
+      "1     title  #/texts/2  Structure Diagram Recognition in Financial Ann...\n",
+      "2    author  #/texts/2                                       Meixuan Qiao\n",
+      "3    author  #/texts/2                                           Jun Wang\n",
+      "4    author  #/texts/2                                        Junfu Xiang\n",
+      "5    author  #/texts/2                                           Qiyu Hou\n",
+      "6    author  #/texts/2                                         Ruixuan Li\n",
+      "7    author  #/texts/5                                         Wudao Tech\n",
+      "2102.09395.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  The number of published PDF documents in both ...\n",
+      "1      title  #/texts/2  Robust PDF Document Conversion Using Recurrent...\n",
+      "2     author  #/texts/2                                Nikolaos Livathinos\n",
+      "3     author  #/texts/2                                     Cesar Berrospi\n",
+      "4     author  #/texts/2                                       Maksym Lysak\n",
+      "5     author  #/texts/2                                 Viktor Kuropiatnyk\n",
+      "6     author  #/texts/2                                       Ahmed Nassar\n",
+      "7     author  #/texts/2                                      Michele Dolfi\n",
+      "8     author  #/texts/2                                     Christoph Auer\n",
+      "9     author  #/texts/2                                      Kasper Dinkla\n",
+      "10    author  #/texts/2                                        Peter Staar\n",
+      "2104.12756.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Infographics are documents designed to effecti...\n",
+      "1     title  #/texts/2                                     InfographicVQA\n",
+      "2    author  #/texts/2                                      Minesh Mathew\n",
+      "3    author  #/texts/2                                        Viraj Bagal\n",
+      "4    author  #/texts/2                               Dimosthenis Karatzas\n",
+      "5    author  #/texts/2                                     Ernest Valveny\n",
+      "6    author  #/texts/2                                          V Jawahar\n",
+      "7    author  #/texts/2                                    Universitat Aut\n",
+      "2212.02623.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  We propose Universal Document Processing (UDOP...\n",
+      "1      title  #/texts/2  Unifying Vision, Text, and Layout for Universa...\n",
+      "2     author  #/texts/2                                        Zineng Tang\n",
+      "3     author  #/texts/2                                          Ziyi Yang\n",
+      "4     author  #/texts/2                                        Guoxin Wang\n",
+      "5     author  #/texts/2                                         Yuwei Fang\n",
+      "6     author  #/texts/2                                           Yang Liu\n",
+      "7     author  #/texts/3                                      Chenguang Zhu\n",
+      "8     author  #/texts/3                                       Michael Zeng\n",
+      "9     author  #/texts/3                                          Cha Zhang\n",
+      "10    author  #/texts/3                                       Mohit Bansal\n",
+      "11    author  #/texts/4                                        Chapel Hill\n",
+      "12    author  #/texts/5        Microsoft Azure Cognitive Services Research\n",
+      "2111.13809.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Document layout analysis (DLA) plays an import...\n",
+      "1      title  #/texts/2  DOCUMENT LAYOUT ANALYSIS WITH AESTHETIC-GUIDED...\n",
+      "2     author  #/texts/2                                        Tianlong Ma\n",
+      "3     author  #/texts/2                                        Xingjiao Wu\n",
+      "4     author  #/texts/2                                             Xin Li\n",
+      "5     author  #/texts/2                                      Xiangcheng Du\n",
+      "6     author  #/texts/2                                          Zhao Zhou\n",
+      "7     author  #/texts/2                                          Liang Xue\n",
+      "8     author  #/texts/2                                          Cheng Jin\n",
+      "9     author  #/texts/3                                   Fudan University\n",
+      "10    author  #/texts/3                       East China Normal University\n",
+      "11    author  #/texts/4                                 Zhongshan Hospital\n",
+      "12    author  #/texts/4                                   Fudan University\n",
+      "2309.09742.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. The reliability of supervised machin...\n",
+      "1     title  #/texts/3  David Tschirschwitz r 0000 ' 0001 ' 5344 ' 417...\n",
+      "2    author  #/texts/3                                     Christian Benz\n",
+      "3    author  #/texts/4                                      Morris Florek\n",
+      "4    author  #/texts/5                                   Henrik Norderhus\n",
+      "5    author  #/texts/6                                        Benno Stein\n",
+      "6    author  #/texts/7                                   Volker Rodehorst\n",
+      "2208.10970.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Recognizing the layout of unstructured digital...\n",
+      "1     title  #/texts/2  Doc-GCN: Heterogeneous Graph Convolutional Net...\n",
+      "2    author  #/texts/2                                          Siwen Luo\n",
+      "3    author  #/texts/2                                         Yihao Ding\n",
+      "4    author  #/texts/2                                          Siqu Long\n",
+      "5    author  #/texts/2                                   Soyeon Caren Han\n",
+      "6    author  #/texts/2                                        Josiah Poon\n",
+      "2302.01451.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Relevant information in documents is often sum...\n",
+      "1     title  #/texts/2  CTE: A Dataset for Contextualized Table Extrac...\n",
+      "2    author  #/texts/2                                     Andrea Gemelli\n",
+      "3    author  #/texts/2                                    Emanuele Vivoli\n",
+      "4    author  #/texts/2                                     Simone Marinai\n",
+      "2301.10781.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  The lack of generalizability-in which a model ...\n",
+      "1     title  #/texts/2  Generalizability in Document Layout Analysis f...\n",
+      "2    author  #/texts/2                                        Jill Naiman\n",
+      "2102.05533.pdf\n",
+      "Empty DataFrame\n",
+      "Columns: [subtype, subj_path, name]\n",
+      "Index: []\n",
+      "2108.11591.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Reading order detection is the cornerstone to ...\n",
+      "1     title  #/texts/2  LayoutReader: Pre-training of Text and Layout ...\n",
+      "2    author  #/texts/2                                        Zilong Wang\n",
+      "3    author  #/texts/2                                          Yiheng Xu\n",
+      "4    author  #/texts/2                                            Lei Cui\n",
+      "5    author  #/texts/2                                       Jingbo Shang\n",
+      "6    author  #/texts/2                                           Furu Wei\n",
+      "7    author  #/texts/3                                          San Diego\n",
+      "8    author  #/texts/4                            Microsoft Research Asia\n",
+      "2104.08836.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Multimodal pre-training with text, layout, and...\n",
+      "1      title  #/texts/2  LayoutXLM: Multimodal Pre-training for Multili...\n",
+      "2     author  #/texts/2                                          Yiheng Xu\n",
+      "3     author  #/texts/2                                            Lei Cui\n",
+      "4     author  #/texts/2                                        Guoxin Wang\n",
+      "5     author  #/texts/2                                          Yijuan Lu\n",
+      "6     author  #/texts/2                                    Dinei Florencio\n",
+      "7     author  #/texts/2                                          Cha Zhang\n",
+      "8     author  #/texts/2                                           Furu Wei\n",
+      "9     author  #/texts/3                            Microsoft Research Asia\n",
+      "10    author  #/texts/4                                    Microsoft Azure\n",
+      "2305.15393.pdf\n",
+      "     subtype  subj_path                                               name\n",
+      "0   abstract          #  Attaining a high degree of user controllabilit...\n",
+      "1      title  #/texts/2  LayoutGPT: Compositional Visual Planning and G...\n",
+      "2     author  #/texts/2                                         Weixi Feng\n",
+      "3     author  #/texts/2                                        Wanrong Zhu\n",
+      "4     author  #/texts/2                                      Varun Jampani\n",
+      "5     author  #/texts/2                                        Arjun Akula\n",
+      "6     author  #/texts/2                                        Sugato Basu\n",
+      "7     author  #/texts/2                                      Xin Eric Wang\n",
+      "8     author  #/texts/2                                  William Yang Wang\n",
+      "9     author  #/texts/2                                      Santa Barbara\n",
+      "10    author  #/texts/2                                         Santa Cruz\n",
+      "2202.12985.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Pretraining has proven successful in...\n",
+      "1     title  #/texts/2  OCR-IDL: OCR Annotations for Industry Document...\n",
+      "2    author  #/texts/2                                   Ali Furkan Biten\n",
+      "3    author  #/texts/2                                        Lluis Gomez\n",
+      "4    author  #/texts/2                                     Ernest Valveny\n",
+      "5    author  #/texts/2                               Dimosthenis Karatzas\n",
+      "6    author  #/texts/3                             Computer Vision Center\n",
+      "2303.14884.pdf\n",
+      "  subtype  subj_path                                               name\n",
+      "0   title  #/texts/1  A large-scale dataset for end-to-end table rec...\n",
+      "1  author  #/texts/1                                           Fan Yang\n",
+      "2  author  #/texts/1                                             Lei Hu\n",
+      "3  author  #/texts/1                                          Xinwu Liu\n",
+      "4  author  #/texts/1                                   Shuangping Huang\n",
+      "5  author  #/texts/1                                        Zhenghui Gu\n",
+      "6  author  #/texts/3                                  Times Electric Co\n",
+      "7  author  #/texts/7                                   Shuangping Huang\n",
+      "2101.06573.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Current Artificial Intelligence (AI) methods, ...\n",
+      "1     title  #/texts/2           Understanding in Artificial Intelligence\n",
+      "2    author  #/texts/2                                   Stefan Maetschke\n",
+      "3    author  #/texts/2                              David Martinez Iraola\n",
+      "4    author  #/texts/2                               Elaheh ShafieiBavani\n",
+      "5    author  #/texts/2                                        Peter Zhong\n",
+      "6    author  #/texts/2                                            Ying Xu\n",
+      "7    author  #/texts/3                                 Research Australia\n",
+      "2304.06447.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Document-based Visual Question Answe...\n",
+      "1     title  #/texts/2  PDF-VQA: A New Dataset for Real-World VQA on P...\n",
+      "2    author  #/texts/2                                         Yihao Ding\n",
+      "3    author  #/texts/2                                          Siwen Luo\n",
+      "4    author  #/texts/2                                      Hyunsuk Chung\n",
+      "5    author  #/texts/2                                   Soyeon Caren Han\n",
+      "2101.12741.pdf\n",
+      "Empty DataFrame\n",
+      "Columns: [subtype, subj_path, name]\n",
+      "Index: []\n",
+      "2202.08125.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Background. In recent years, libraries and arc...\n",
+      "1     title  #/texts/2  Processing the structure of documents: Logical...\n",
+      "2    author  #/texts/2                                    Nicolas Gutehrl\n",
+      "3    author  #/texts/3                     Recherches Interdisciplinaires\n",
+      "4    author  #/texts/4                             Iana Atanassova Centre\n",
+      "5    author  #/texts/4                     Recherches Interdisciplinaires\n",
+      "6    author  #/texts/4                             Institut Universitaire\n",
+      "2308.02051.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Document layout analysis (DLA) is th...\n",
+      "1     title  #/texts/2   A Graphical Approach to Document Layout Analysis\n",
+      "2    author  #/texts/2                                         Jilin Wang\n",
+      "3    author  #/texts/2                                   Michael Krumdick\n",
+      "4    author  #/texts/2                                        Baojia Tong\n",
+      "5    author  #/texts/2                                       Hamima Halim\n",
+      "6    author  #/texts/3                                        Vadym Barda\n",
+      "7    author  #/texts/3                                  Delphine Vendryes\n",
+      "8    author  #/texts/4                                Kensho Technologies\n",
+      "9    author  #/texts/8                                        Los Angeles\n",
+      "2006.14615.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. We address the problem of layout gen...\n",
+      "1     title  #/texts/2  Layout Generation and Completion with Self-att...\n",
+      "2    author  #/texts/2                                        Kamal Gupta\n",
+      "3    author  #/texts/2                                 Alessandro Achille\n",
+      "4    author  #/texts/2                                        Larry Davis\n",
+      "5    author  #/texts/2                                Abhinav Shrivastava\n",
+      "6    author  #/texts/3                                       College Park\n",
+      "2012.06547.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  We present a deep neural network to predict st...\n",
+      "1     title  #/texts/2  LayoutGMN: Neural Graph Matching for Structura...\n",
+      "2    author  #/texts/2                                  Akshay Gadi Patil\n",
+      "3    author  #/texts/2                                           Manyi Li\n",
+      "4    author  #/texts/2                                     Matthew Fisher\n",
+      "5    author  #/texts/2                                      Manolis Savva\n",
+      "6    author  #/texts/2                                          Hao Zhang\n",
+      "7    author  #/texts/2                            Simon Fraser University\n",
+      "8    author  #/texts/2                                     Adobe Research\n",
+      "1908.07836.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract-Recognizing the layout of unstructure...\n",
+      "1     title  #/texts/2  PubLayNet: largest dataset ever for document l...\n",
+      "2    author  #/texts/2                                           Xu Zhong\n",
+      "3    author  #/texts/2                                 Research Australia\n",
+      "4    author  #/texts/2                                          City Road\n",
+      "5    author  #/texts/3                                       Jianbin Tang\n",
+      "6    author  #/texts/3                                 Research Australia\n",
+      "7    author  #/texts/3                                          City Road\n",
+      "8    author  #/texts/4                                 Research Australia\n",
+      "9    author  #/texts/4                                          City Road\n",
+      "2108.13297.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Documents often contain complex phys...\n",
+      "1     title  #/texts/2  VTLayout: Fusion of Visual and Text Features f...\n",
+      "2    author  #/texts/2                                         Shoubin Li\n",
+      "3    author  #/texts/2                                           Xuyan Ma\n",
+      "4    author  #/texts/2                                       Shuaiqun Pan\n",
+      "5    author  #/texts/2                                             Jun Hu\n",
+      "6    author  #/texts/2                                            Lin Shi\n",
+      "7    author  #/texts/2                                          Qing Wang\n",
+      "2305.02577.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Abstract. Text reading order is a crucial aspe...\n",
+      "1     title  #/texts/2  Text Reading Order in Uncontrolled Conditions ...\n",
+      "2    author  #/texts/2                                       Renshen Wang\n",
+      "3    author  #/texts/2                                     Yasuhisa Fujii\n",
+      "4    author  #/texts/2                                Alessandro Bissacco\n",
+      "5    author  #/texts/3                                    Google Research\n",
+      "2102.08445.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Additional Key Words and Phrases: Table extrac...\n",
+      "1     title  #/texts/2  TableLab: An Interactive Table Extraction Syst...\n",
+      "2201.01654.pdf\n",
+      "    subtype  subj_path                                               name\n",
+      "0  abstract          #  Tables have been an ever-existing structure to...\n",
+      "1     title  #/texts/2  TableParser: Automatic Table Parsing with Weak...\n",
+      "2    author  #/texts/2                                       Susie Xi Rao\n",
+      "3    author  #/texts/2                                    Johannes Rausch\n",
+      "4    author  #/texts/2                                        Peter Egger\n",
+      "5    author  #/texts/2                                           Ce Zhang\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_collection = ElasticDataCollectionSource(elastic_id=\"default\", index_key=\"arxiv\")\n",
+    "page_size = 5\n",
+    "\n",
+    "# Prepare the data query\n",
+    "query = DataQuery(\n",
+    "    search_query, # The search query to be executed\n",
+    "    #source=[\"description.title\", \"description.authors\", \"identifiers\"], # Which fields of documents we want to fetch\n",
+    "    limit=page_size, # The size of each request page\n",
+    "    coordinates=data_collection # The data collection to be queried\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# [Optional] Compute the number of total results matched. This can be used to monitor the pagination progress.\n",
+    "count_query = deepcopy(query)\n",
+    "count_query.paginated_task.parameters[\"limit\"] = 0\n",
+    "count_results = api.queries.run(count_query)\n",
+    "expected_total = count_results.outputs[\"data_count\"]\n",
+    "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n",
+    "\n",
+    "\n",
+    "# Iterate through all results by fetching `page_size` results at a time\n",
+    "all_results = []\n",
+    "cursor = api.queries.run_paginated_query(query)\n",
+    "for result_page in tqdm(cursor, total=expected_pages):\n",
+    "    # Iterate through the results of a single page, and apply the NLP model to each document\n",
+    "    for row in result_page.outputs[\"data_outputs\"]:\n",
+    "        doc = row[\"_source\"]\n",
+    "        print(doc[\"file-info\"][\"filename\"])\n",
+    "\n",
+    "        res = model.apply_on_doc(doc)\n",
+    "\n",
+    "        props = pd.DataFrame(res[\"properties\"][\"data\"], columns=res[\"properties\"][\"headers\"])\n",
+    "        insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n",
+    "\n",
+    "        doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n",
+    "        print(doc_insts)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cf42286-b0a1-438a-8c7f-d852e61c260f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "23663f76e1e243f0a6319b8ef58f504b6b45c83666dfefd3138ba8cf69ab01fa"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/poetry.lock b/poetry.lock
index 668aa53..5154b15 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -711,13 +711,13 @@ files = [
 
 [[package]]
 name = "cibuildwheel"
-version = "2.16.5"
+version = "2.17.0"
 description = "Build Python wheels on CI with minimal configuration."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "cibuildwheel-2.16.5-py3-none-any.whl", hash = "sha256:e9a0b743a57cf2a2861e1e580765fd70237689aeda6c79db4870ee688ee24d1f"},
-    {file = "cibuildwheel-2.16.5.tar.gz", hash = "sha256:9fe763405afac4aef33eb8641891dda83312848ec18cd44e30daac34dfa9336d"},
+    {file = "cibuildwheel-2.17.0-py3-none-any.whl", hash = "sha256:62ddd06179269b9da111bf9e97aca8ecb7b9642e1151a0bac702dd46429b52bf"},
+    {file = "cibuildwheel-2.17.0.tar.gz", hash = "sha256:889510a7d974da855a8b793f8dbe718ce18189a42c2560741868e68900e02be2"},
 ]
 
 [package.dependencies]
@@ -985,27 +985,27 @@ files = [
 
 [[package]]
 name = "deepsearch-glm"
-version = "0.12.2"
+version = "0.17.2"
 description = "Graph Language Models"
 optional = false
-python-versions = ">=3.8,<4.0"
+python-versions = "<4.0,>=3.8"
 files = [
-    {file = "deepsearch_glm-0.12.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c815a9c6c6179047fde28f979783f80b62b817d2bdf7735d6e75554f518415af"},
-    {file = "deepsearch_glm-0.12.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:f0f26a8be5795c750c246ed5fa9381ee0b2deeac3c0d4e37a37f5b76430cb567"},
-    {file = "deepsearch_glm-0.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f39a5f2032f00a0474a38d67c3c462a2fd9af6ebb2dce00489d83c51ef740aea"},
-    {file = "deepsearch_glm-0.12.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:0695723b85fb81f65e9193549940e72788eb6a9369fa3a472a82c84ed55ca1cb"},
-    {file = "deepsearch_glm-0.12.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:6c5c1dffea9c2118dad31762fa2c437d7f90954b9c0b744f5754dbf819be5462"},
-    {file = "deepsearch_glm-0.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae926bdbec45e8ed9585a0cb9e3e124796522a18707310bee6aea44084616f12"},
-    {file = "deepsearch_glm-0.12.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:4477879a8bd52f04dacfcf41a603a44d6839df46e2bf1f73abf75657e282129d"},
-    {file = "deepsearch_glm-0.12.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:df3277a64208ba4c93b126f5cd4dc29ac09e81ae485364b63a1f49625b596e32"},
-    {file = "deepsearch_glm-0.12.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609908f148bc3a7f975acf83469c54f9dd807821535517b9ddf5a46c27a35641"},
-    {file = "deepsearch_glm-0.12.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:f518e5a11773fd9a8cf8c299237663bfd7533fe2d52bc6bc729862415955ab99"},
-    {file = "deepsearch_glm-0.12.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bf36f7c40d880bdfc4b5548c32ef4bc244f1d0a858992b0dc1c56c4962e8381f"},
-    {file = "deepsearch_glm-0.12.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:79e0ff50ee432e7f0a3ab8a95ba68f58b8867b5361c8489f9465da3f4f73b7c0"},
+    {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6234fb2fa6755ff1bb7000d21e4574eea68a29557d8f16ba179f5f5713766d9b"},
+    {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:af97145ebb2f21b074ef6385c45d60a2d2553b68254c30aa66b7ddd9206b7f7b"},
+    {file = "deepsearch_glm-0.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fe2669ac7f8567383e0818fe9b3b73979978fb5e65f36db3b7626bf3af6206d"},
+    {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:95cbe8e8264c675a128f520f33afa3fd34295c64b00d282c015fe13c7cc2bf3b"},
+    {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b9203be399dd4f026769998cca25a5691ff79791ead2dfa05385af8467f4bd8"},
+    {file = "deepsearch_glm-0.17.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f512fede5fd062ff005f51073ef5660e2e963e0013251176b69bd7ab9e45faa"},
+    {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:be15a98b0cbf36e5141e5dd8e22ba29b0e0d92604fc58e53e8fa6c837b29a40f"},
+    {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e3ddcaf73dd5578786db3333c238790a45d0fa0af4b1df9a41a4b9dd234c2401"},
+    {file = "deepsearch_glm-0.17.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e7b819d58df057bd1826fea8cd3e6d0ed4cac3fe819795ee5205360fa77fee"},
+    {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0f8405524b000669b82098b1989e8c4ef4da0f93407477c1d807533d9f427867"},
+    {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:da459f79913b0f967f5802766b2b964bc997d7f5259663901d84c2780961dfa8"},
+    {file = "deepsearch_glm-0.17.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50757b64a607104882a683b2a570f96f450faaf0f1047125df043d01406f2f16"},
 ]
 
 [package.dependencies]
-cibuildwheel = ">=2.16.5,<3.0.0"
+cibuildwheel = ">=2.17.0,<3.0.0"
 deepsearch-toolkit = ">=0.31.0"
 matplotlib = ">=3.7.1,<4.0.0"
 networkx = ">=3.1,<4.0"
@@ -3492,6 +3492,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -5179,4 +5180,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">= 3.8, <3.11"
-content-hash = "af9c6fc06f4f0ef24bfb94fbae7f8d740af54d14c1edca64ffef6f615712a3f2"
+content-hash = "928e878a81836c1528c27876675534238f9bc5c350965694061acb31ef559c3b"
diff --git a/pyproject.toml b/pyproject.toml
index 59cac7c..0b53572 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ python-dotenv = "^1.0.0"
 nbclient = "^0.9.0"
 pandas = "^1.5.1"
 argilla = "^1.24.0"
-deepsearch-glm = ">=0.12.2"
+deepsearch-glm = "v0.17.2"
 
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^22.1.0"}

From 14d6d456ceb2d9ecb74e4f7d17a264fddb4ff75e Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 8 May 2024 08:55:07 +0200
Subject: [PATCH 2/3] updated the metadata notebook for private document
 collections

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 .../nlp_for_metadata/nlp_for_metadata.ipynb   | 5553 ++++++++++-------
 poetry.lock                                   |   28 +-
 pyproject.toml                                |    2 +-
 3 files changed, 3443 insertions(+), 2140 deletions(-)

diff --git a/examples/nlp_for_metadata/nlp_for_metadata.ipynb b/examples/nlp_for_metadata/nlp_for_metadata.ipynb
index 69f039d..52e9903 100644
--- a/examples/nlp_for_metadata/nlp_for_metadata.ipynb
+++ b/examples/nlp_for_metadata/nlp_for_metadata.ipynb
@@ -79,6 +79,7 @@
       " -> already downloaded language\n",
       " -> already downloaded name\n",
       " -> already downloaded semantic\n",
+      " -> already downloaded metadata\n",
       " -> already downloaded geoloc\n"
      ]
     }
@@ -109,7 +110,7 @@
     "\n",
     "from tabulate import tabulate\n",
     "\n",
-    "models = load_pretrained_nlp_models(verbose=True)"
+    "models = load_pretrained_nlp_models(force=False, verbose=True)"
    ]
   },
   {
@@ -122,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "f44fbf08",
    "metadata": {},
    "outputs": [],
@@ -153,9 +154,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Processing input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 78.98it/s]\u001b[38;2;15;98;254m                                                                                                                                                              \u001b[0m\n",
-      "Submitting input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:03<00:00,  3.27s/it]\u001b[38;2;15;98;254m                                                                                                                                                              \u001b[0m\n",
-      "Converting input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:27<00:00, 27.58s/it]\u001b[38;2;15;98;254m                                                                                                                                                              \u001b[0m\n"
+      "Processing input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:00<00:00, 124.39it/s]\u001b[38;2;15;98;254m                                                                                                                                                         \u001b[0m\n",
+      "Submitting input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:06<00:00,  6.66s/it]\u001b[38;2;15;98;254m                                                                                                                                                          \u001b[0m\n",
+      "Converting input:     : 100%|\u001b[38;2;15;98;254m██████████████████████████████\u001b[0m| 1/1 [00:26<00:00, 26.56s/it]\u001b[38;2;15;98;254m                                                                                                                                                          \u001b[0m\n"
      ]
     },
     {
@@ -226,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 6,
    "id": "b19f7678-b650-484b-a994-150d0c4ec3a2",
    "metadata": {},
    "outputs": [],
@@ -245,7 +246,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "710cc200-e2ba-46f3-9ca0-efd2baab7ee1",
    "metadata": {},
    "outputs": [],
@@ -276,7 +277,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
    "id": "ed3612b4-bbd2-42d0-ba2d-f8f994565380",
    "metadata": {},
    "outputs": [],
@@ -295,7 +296,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "id": "6d98745c-e0f3-41d2-8261-b7953d835dec",
    "metadata": {},
    "outputs": [
@@ -303,21 +304,78 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Document understanding is a key busin...\n",
-      "1     title  #/texts/1  Delivering Document Conversion as a Cloud Serv...\n",
-      "2    author  #/texts/1                                     Christoph Auer\n",
-      "3    author  #/texts/1                                Research Ruschlikon\n",
-      "4    author  #/texts/2                                Research Ruschlikon\n",
-      "5    author  #/texts/3                                      Michele Dolfi\n",
-      "6    author  #/texts/3                                Research Ruschlikon\n",
-      "7    author  #/texts/4                                            J Staar\n",
-      "8    author  #/texts/4                                Research Ruschlikon\n"
+      "TITLE\n",
+      "Delivering Document Conversion as a Cloud Service with High Throughput and Responsiveness\n",
+      "ABSTRACT\n",
+      "['Abstract-Document understanding is a key business process in the data-driven economy since documents are central to knowledge discovery and business insights. Converting documents into a machine-processable format is a particular challenge here due to their huge variability in formats and complex structure. Accordingly, many algorithms and machine-learning methods emerged to solve particular tasks such as Optical Character Recognition (OCR), layout analysis, table-structure recovery, figure understanding, etc. We observe the adoption of such methods in document understanding solutions offered by all major cloud providers. Yet, publications outlining how such services are designed and optimized to scale in the cloud are scarce. In this paper, we focus on the case of document conversion to illustrate the particular challenges of scaling a complex data processing pipeline with a strong reliance on machine-learning methods on cloud infrastructure. Our key objective is to achieve high scalability and responsiveness for different workload profiles in a well-defined resource budget. We outline the requirements, design, and implementation choices of our document conversion service and reflect on the challenges we faced. Evidence for the scaling behavior and resource efficiency is provided for two alternative workload distribution strategies and deployment configurations. Our best-performing method achieves sustained throughput of over one million PDF pages per hour on 3072 CPU cores across 192 nodes.', 'Index Terms-cloud applications, document understanding, distributed computing, artificial intelligence']\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "5   semantic   8967552455475999131      TEXT   #/texts/0     header   \n",
+      "7   semantic    384749972256050104      TEXT   #/texts/1  meta-data   \n",
+      "9   semantic  15891517341344374830      TEXT   #/texts/2  meta-data   \n",
+      "11  semantic  10276496618786154295      TEXT   #/texts/3  meta-data   \n",
+      "13  semantic   5624406992563222356      TEXT   #/texts/4  meta-data   \n",
+      "15  semantic  15035726207261556942      TEXT   #/texts/5       text   \n",
+      "17  semantic   4662798960261328447      TEXT   #/texts/6       text   \n",
+      "19  semantic  15072469540570473164      TEXT   #/texts/7     header   \n",
+      "21  semantic   8600142426167835349      TEXT   #/texts/8       text   \n",
+      "23  semantic   3072624984713661043      TEXT   #/texts/9       text   \n",
+      "25  semantic  14339411138813898476      TEXT  #/texts/10       text   \n",
+      "27  semantic  17407436599861342415      TEXT  #/texts/11  meta-data   \n",
+      "29  semantic   4004878754391976765      TEXT  #/texts/12       text   \n",
+      "31  semantic  15578236054977031520      TEXT  #/texts/13       text   \n",
+      "33  semantic   1317828445053500670      TEXT  #/texts/14       text   \n",
+      "35  semantic   3501395332085509922      TEXT  #/texts/15       text   \n",
+      "37  semantic  14716706603701707953      TEXT  #/texts/16       text   \n",
+      "39  semantic   2277014394919988861      TEXT  #/texts/17       text   \n",
+      "41  semantic  18364912209191405749      TEXT  #/texts/18     header   \n",
+      "43  semantic    487083125877341825      TEXT  #/texts/19       text   \n",
+      "\n",
+      "    confidence  \n",
+      "5         0.73  \n",
+      "7         1.00  \n",
+      "9         0.99  \n",
+      "11        0.99  \n",
+      "13        0.99  \n",
+      "15        0.99  \n",
+      "17        0.93  \n",
+      "19        1.00  \n",
+      "21        0.99  \n",
+      "23        0.99  \n",
+      "25        1.00  \n",
+      "27        0.99  \n",
+      "29        1.00  \n",
+      "31        1.00  \n",
+      "33        0.97  \n",
+      "35        0.95  \n",
+      "37        1.00  \n",
+      "39        0.95  \n",
+      "41        0.97  \n",
+      "43        1.00  \n",
+      "       type             subj_hash subj_name  subj_path     label  confidence\n",
+      "1  metadata   8967552455475999131  DOCUMENT  #/texts/0     title         1.0\n",
+      "2  metadata  15035726207261556942  DOCUMENT  #/texts/5  abstract         1.0\n",
+      "3  metadata   4662798960261328447  DOCUMENT  #/texts/6  abstract         1.0\n",
+      "  subtype  subj_path            name\n",
+      "0  author  #/texts/1  Christoph Auer\n",
+      "1  author  #/texts/3   Michele Dolfi\n",
+      "2  author  #/texts/4         J Staar\n"
      ]
     }
    ],
    "source": [
-    "#print(res[\"instances\"][\"headers\"])\n",
+    "if \"title\" in res[\"description\"]:\n",
+    "    print(\"TITLE\")\n",
+    "    print(res[\"description\"][\"title\"])\n",
+    "\n",
+    "if \"abstract\" in res[\"description\"]:\n",
+    "    print(\"ABSTRACT\")\n",
+    "    print(res[\"description\"][\"abstract\"])\n",
+    "\n",
+    "doc_props = props[props[\"type\"]==\"semantic\"]\n",
+    "print(doc_props[0:20])\n",
+    "\n",
+    "doc_props = props[props[\"type\"]==\"metadata\"]\n",
+    "print(doc_props)\n",
     "\n",
     "doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n",
     "print(doc_insts)"
@@ -325,171 +383,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "01771757-70c3-44cb-824c-1fd9b716a99f",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R.\n",
-      "Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, 'IDC\n",
-      "FutureScape: Worldwide future of intelligence 2022 predictions,'\n",
-      "International Data Group, Inc., Needham, MA, Research Report\n",
-      "US47913321, Oct. 2021. [Online]. Available:\n",
-      "https://www.idc.com/getdoc.jsp?containerId=US47913321\n",
-      "\n",
-      "entities:\n",
-      "               subtype                                               name\n",
-      "214  reference-number                                                  1\n",
-      "215           authors  C. Gopal, C. L. Marshall, D. Vesset, N. Ward-D...\n",
-      "216       person-name                                            C Gopal\n",
-      "217       person-name                                       C L Marshall\n",
-      "218       person-name                                           D Vesset\n",
-      "219       person-name                                             N Ward\n",
-      "220       person-name                                            J Hamel\n",
-      "221       person-name                                            R Jyoti\n",
-      "222       person-name                                           P Rutten\n",
-      "223       person-name                                        C W Olofson\n",
-      "224       person-name                                          J Rydning\n",
-      "225       person-name                                              S Rau\n",
-      "226       person-name                                             J Duke\n",
-      "227             title  'IDC FutureScape: Worldwide future of intellig...\n",
-      "228           journal  International Data Group, Inc., Needham, MA, R...\n",
-      "229       person-name                           International Data Group\n",
-      "230       person-name                                    Research Report\n",
-      "231            volume                                           47913321\n",
-      "232              date                                          Oct. 2021\n",
-      "233              note                                Online]. Available:\n",
-      "234               url  https://www.idc.com/getdoc.jsp?containerId=US4... \n",
-      "\n",
-      "\n",
-      "[2] D. Vile, 'The road to becoming a data driven business,' Freeform\n",
-      "Dynamics Ltd., New Milton, United Kingdom, Research Report US47913321,\n",
-      "Nov. 2020. [Online]. Available: https://www.freeformdynamics.com/wp-\n",
-      "content/uploads/2020/11/ 2020-The road to becoming a data driven\n",
-      "business.pdf\n",
-      "\n",
-      "entities:\n",
-      "               subtype                                               name\n",
-      "235  reference-number                                                  2\n",
-      "236           authors                                            D. Vile\n",
-      "237       person-name                                             D Vile\n",
-      "238             title     'The road to becoming a data driven business,'\n",
-      "239           journal  Freeform Dynamics Ltd., New Milton, United Kin...\n",
-      "240       person-name                              Freeform Dynamics Ltd\n",
-      "241           country                                     United Kingdom\n",
-      "242       person-name                                     United Kingdom\n",
-      "243       person-name                                    Research Report\n",
-      "244            volume                                           47913321\n",
-      "245              date                                          Nov. 2020\n",
-      "246              note                                Online]. Available:\n",
-      "247               url  https://www.freeformdynamics.com/wp-content/up...\n",
-      "248             title                           data driven business.pdf \n",
-      "\n",
-      "\n",
-      "[3] M. Aslett and N. Patience, 'Data platforms market map 2021,' S&P\n",
-      "Global Market Intelligence, Tech. Rep., Sep. 2021.\n",
-      "\n",
-      "entities:\n",
-      "                subtype                                             name\n",
-      "249   reference-number                                                3\n",
-      "250            authors                        M. Aslett and N. Patience\n",
-      "251        person-name                                         M Aslett\n",
-      "252        person-name                                       N Patience\n",
-      "253              title                'Data platforms market map 2021,'\n",
-      "254            journal  S&P Global Market Intelligence, Tech. Rep., Sep\n",
-      "255        person-name                       Global Market Intelligence\n",
-      "256  abbreviation-name                                         Tech Rep\n",
-      "257               date                                             2021 \n",
-      "\n",
-      "\n",
-      "[4] G. Aggarwal. (2021, Jan.) How the pandemic has accelerated cloud\n",
-      "adoption. Forbes. Jersey City, NJ. [Online]. Available:\n",
-      "https://www.forbes.com/sites/forbestechcouncil/2021/01/15/ how-the-\n",
-      "pandemic-has-accelerated-cloud-adoption\n",
-      "\n",
-      "entities:\n",
-      "               subtype                                               name\n",
-      "258  reference-number                                                  4\n",
-      "259           authors                                        G. Aggarwal\n",
-      "260       person-name                                         G Aggarwal\n",
-      "261              date                                          2021, Jan\n",
-      "262             title  How the pandemic has accelerated cloud adoptio...\n",
-      "263       person-name                                        Jersey City\n",
-      "264              note                                         Available:\n",
-      "265               url  https://www.forbes.com/sites/forbestechcouncil... \n",
-      "\n",
-      "\n",
-      "[5] 'Enterprise survey series: DevOps and the cloud,' Evans Data\n",
-      "Corporation, Santa Cruz, CA, Research Report, Aug. 2021. [Online].\n",
-      "Available: https://evansdata.com/reports/viewRelease.php?reportID=45\n",
-      "\n",
-      "entities:\n",
-      "               subtype                                               name\n",
-      "266  reference-number                                                  5\n",
-      "267             title  'Enterprise survey series: DevOps and the cloud,'\n",
-      "268           journal  Evans Data Corporation, Santa Cruz, CA, Resear...\n",
-      "269       person-name                             Evans Data Corporation\n",
-      "270       person-name                                         Santa Cruz\n",
-      "271       person-name                                    Research Report\n",
-      "272              date                                               2021\n",
-      "273              note                                Online]. Available:\n",
-      "274               url  https://evansdata.com/reports/viewRelease.php?... \n",
-      "\n",
-      "\n",
-      "[6] J. Arundel and J. Domingus, Cloud Native DevOps with Kubernetes:\n",
-      "Building, Deploying, and Scaling Modern Applications in the Cloud.\n",
-      "Sebastopol, CA: O'Reilly Media, Apr. 2019.\n",
-      "\n",
-      "entities:\n",
-      "               subtype                                               name\n",
-      "275  reference-number                                                  6\n",
-      "276           authors                         J. Arundel and J. Domingus\n",
-      "277       person-name                                          J Arundel\n",
-      "278       person-name                                         J Domingus\n",
-      "279             title  Cloud Native DevOps with Kubernetes: Building,...\n",
-      "280       person-name                                Cloud Native DevOps\n",
-      "281           journal                                  Reilly Media, Apr\n",
-      "282              date                                               2019 \n",
-      "\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "\n",
-    "\n",
-    "refs = props[(props[\"label\"]==\"reference\") & (props[\"confidence\"]>0.8)]\n",
-    "\n",
-    "cnt = 0\n",
-    "for i,ref in refs.iterrows():\n",
-    "    #print(ref)\n",
-    "\n",
-    "    item = resolve(ref[\"subj_path\"].split(\"/\"), res)\n",
-    "    print(\"\\n\".join(textwrap.wrap(item[\"text\"], 70)))\n",
-    "\n",
-    "    ents = insts[insts[\"subj_hash\"]==item[\"subj_hash\"]][[\"subtype\", \"name\"]]\n",
-    "    print(\"\\nentities:\\n\", ents, \"\\n\\n\")\n",
-    "\n",
-    "    \n",
-    "    cnt+=1\n",
-    "    if cnt>5:\n",
-    "        break\n"
-   ]
+   "outputs": [],
+   "source": []
   },
   {
    "cell_type": "markdown",
    "id": "cfeca54d-bbc1-4022-851d-0b29027de761",
    "metadata": {},
    "source": [
-    "## Extract Matedata from ingested documents"
+    "## Extract MetaData from public documents"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 4,
    "id": "8bb459a8-7b26-4dc3-98da-b1b4a1b59fcc",
    "metadata": {},
    "outputs": [],
@@ -513,7 +423,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 5,
    "id": "d064166b-7578-437c-b3a6-b16eb3d95c1f",
    "metadata": {},
    "outputs": [],
@@ -525,7 +435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 7,
    "id": "db9da464-23db-4562-a1ce-259a717f404a",
    "metadata": {},
    "outputs": [
@@ -579,7 +489,7 @@
        "      <td>Annual Reports</td>\n",
        "      <td>Document</td>\n",
        "      <td>107.38K</td>\n",
-       "      <td>2024-01-12</td>\n",
+       "      <td>2024-04-15</td>\n",
        "      <td>default/annual-report</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -599,412 +509,12 @@
        "      <td>default/arxiv-category</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>arXiv full documents</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>2.29M</td>\n",
-       "      <td>2023-10-29</td>\n",
-       "      <td>default/arxiv</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>BioRxiv</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>357.76K</td>\n",
-       "      <td>2023-11-09</td>\n",
-       "      <td>default/biorxiv</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>7</th>\n",
-       "      <td>Brenda</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>7.12K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/brenda</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8</th>\n",
-       "      <td>ChEMBL</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>2.11M</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/chembl</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>ChemRxiv</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>8.82K</td>\n",
-       "      <td>2023-11-23</td>\n",
-       "      <td>default/chemrxiv</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>ClinicalTrials</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>426.42K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/clinical-trials</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>COD</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>503.78K</td>\n",
-       "      <td>2023-07-24</td>\n",
-       "      <td>default/cod</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>Cord19</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>655.45K</td>\n",
-       "      <td>2022-11-17</td>\n",
-       "      <td>default/cord19</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>Crossref</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>131.86M</td>\n",
-       "      <td>2023-02-22</td>\n",
-       "      <td>default/crossref</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>Crossref journal list</td>\n",
-       "      <td>Reference</td>\n",
-       "      <td>100.52K</td>\n",
-       "      <td>2022-02-22</td>\n",
-       "      <td>default/crossref-journal</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>D&amp;B Hoovers</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>10K</td>\n",
-       "      <td>2021-04-16</td>\n",
-       "      <td>default/swot-report</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>DeepSearch materials</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>360.54K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/ds4sd-material</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>DOAB</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>8.8K</td>\n",
-       "      <td>2023-12-04</td>\n",
-       "      <td>default/doab</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>DrugBank</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>4.44K</td>\n",
-       "      <td>2022-11-03</td>\n",
-       "      <td>default/drugbank</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19</th>\n",
-       "      <td>engrXiv</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>1.84K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/engrxiv</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>20</th>\n",
-       "      <td>ESG Reports</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>17.36K</td>\n",
-       "      <td>2024-01-08</td>\n",
-       "      <td>default/esg-report</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21</th>\n",
-       "      <td>FDA Adverse Event Reporting System (FAERS)</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>435.62K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/faers</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>22</th>\n",
-       "      <td>GenBank</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>260.36M</td>\n",
-       "      <td>2023-01-24</td>\n",
-       "      <td>default/genbank</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>HBCP Open Access Corpus</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>90</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/hbcp</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>24</th>\n",
-       "      <td>IBM Redbooks</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>2.75K</td>\n",
-       "      <td>2023-06-08</td>\n",
-       "      <td>default/ibm-redbooks</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25</th>\n",
-       "      <td>IEEE</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>61.95K</td>\n",
-       "      <td>2024-01-16</td>\n",
-       "      <td>default/ieee</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>26</th>\n",
-       "      <td>International Patent Classification (IPC)</td>\n",
-       "      <td>Reference</td>\n",
-       "      <td>78.52K</td>\n",
-       "      <td>2022-02-22</td>\n",
-       "      <td>default/wipo-ipc</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>27</th>\n",
-       "      <td>IPCC</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>819</td>\n",
-       "      <td>2023-06-14</td>\n",
-       "      <td>default/ipcc</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>28</th>\n",
-       "      <td>Legal Entity Identifier</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>2.1M</td>\n",
-       "      <td>2023-08-16</td>\n",
-       "      <td>default/lei</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29</th>\n",
-       "      <td>Material Components</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>16.32K</td>\n",
-       "      <td>2023-01-30</td>\n",
-       "      <td>default/experiment</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>30</th>\n",
-       "      <td>MedRxiv</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>69.18K</td>\n",
-       "      <td>2023-11-02</td>\n",
-       "      <td>default/medrxiv</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31</th>\n",
-       "      <td>NeurIPS</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>16.9K</td>\n",
-       "      <td>2023-09-24</td>\n",
-       "      <td>default/neurips</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>32</th>\n",
-       "      <td>News</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>9.82M</td>\n",
-       "      <td>2023-09-10</td>\n",
-       "      <td>default/news</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>33</th>\n",
-       "      <td>NMRShift</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>44.33K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/nmrshift</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>34</th>\n",
-       "      <td>OpenCVF</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>26.94K</td>\n",
-       "      <td>2023-10-04</td>\n",
-       "      <td>default/opencvf</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35</th>\n",
-       "      <td>OpenStax</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>76</td>\n",
-       "      <td>2024-02-01</td>\n",
-       "      <td>default/openstax</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>36</th>\n",
-       "      <td>OpenStreetMap</td>\n",
-       "      <td>Generic</td>\n",
-       "      <td>296.31M</td>\n",
-       "      <td>2023-03-12</td>\n",
-       "      <td>default/osm</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37</th>\n",
-       "      <td>PatCID</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>13.03M</td>\n",
-       "      <td>2023-09-15</td>\n",
-       "      <td>default/patcid</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>38</th>\n",
-       "      <td>Patent SMILES</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>2.84M</td>\n",
-       "      <td>2023-10-11</td>\n",
-       "      <td>default/patent-smiles</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>39</th>\n",
-       "      <td>Patents from CNIPR</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>2</td>\n",
-       "      <td>2022-12-19</td>\n",
-       "      <td>default/patent-cnipr</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>40</th>\n",
-       "      <td>Patents from EPO</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>7.09M</td>\n",
-       "      <td>2023-07-06</td>\n",
-       "      <td>default/patent-epo</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>41</th>\n",
-       "      <td>Patents from JPO</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>2.54M</td>\n",
-       "      <td>2024-01-08</td>\n",
-       "      <td>default/patent-jpo</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>42</th>\n",
-       "      <td>Patents from KIPO</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>1.8M</td>\n",
-       "      <td>2022-12-19</td>\n",
-       "      <td>default/patent-kipo</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>43</th>\n",
-       "      <td>Patents from USPTO</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>16.16M</td>\n",
-       "      <td>2024-02-09</td>\n",
-       "      <td>default/patent-uspto</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>44</th>\n",
-       "      <td>Patents from USPTO (TEST)</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>6.81K</td>\n",
-       "      <td>2024-03-13</td>\n",
-       "      <td>default/patent-uspto-test</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>45</th>\n",
-       "      <td>PLOS</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>340.28K</td>\n",
-       "      <td>2024-01-10</td>\n",
-       "      <td>default/plos</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>46</th>\n",
-       "      <td>PubChem</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>118.24M</td>\n",
-       "      <td>2023-07-06</td>\n",
-       "      <td>default/pubchem</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>47</th>\n",
-       "      <td>PubMed Central</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>5.08M</td>\n",
-       "      <td>2023-03-01</td>\n",
-       "      <td>default/pubmed</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>48</th>\n",
-       "      <td>PubMed Central (PDF)</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>27.66K</td>\n",
-       "      <td>2024-01-22</td>\n",
-       "      <td>default/pmc-pdf</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>49</th>\n",
-       "      <td>Red Hat</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>7.17K</td>\n",
-       "      <td>2024-01-23</td>\n",
-       "      <td>default/redhat</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>50</th>\n",
-       "      <td>RxNorm</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>374.18K</td>\n",
-       "      <td>2023-01-03</td>\n",
-       "      <td>default/rxnorm</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>51</th>\n",
-       "      <td>SEC Edgar CIK Lookup</td>\n",
-       "      <td>Reference</td>\n",
-       "      <td>786K</td>\n",
-       "      <td>2022-02-22</td>\n",
-       "      <td>default/sec-cik</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>52</th>\n",
-       "      <td>SEC Edgar filings</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>56.38K</td>\n",
-       "      <td>2021-07-06</td>\n",
-       "      <td>default/sec-filing</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>53</th>\n",
-       "      <td>Semantic Scholar Academic Graph</td>\n",
-       "      <td>Document</td>\n",
-       "      <td>216.85M</td>\n",
-       "      <td>2024-03-11</td>\n",
-       "      <td>default/semantic-scholar</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>54</th>\n",
-       "      <td>SMILES from USPTO</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>116.48M</td>\n",
-       "      <td>2022-12-25</td>\n",
-       "      <td>default/patent-uspto-smiles</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>55</th>\n",
-       "      <td>SMILES from USPTO (fingerprints)</td>\n",
-       "      <td>Record</td>\n",
-       "      <td>85.81M</td>\n",
-       "      <td>2023-02-23</td>\n",
-       "      <td>default/patent-uspto-smiles-fp</td>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>56</th>\n",
@@ -1032,6 +542,14 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>59</th>\n",
+       "      <td>VHDL articles</td>\n",
+       "      <td>Document</td>\n",
+       "      <td>215</td>\n",
+       "      <td>2024-04-23</td>\n",
+       "      <td>default/vhdl</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>60</th>\n",
        "      <td>Wikipedia</td>\n",
        "      <td>Document</td>\n",
        "      <td>6.45M</td>\n",
@@ -1040,132 +558,37 @@
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
+       "<p>61 rows × 5 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                          Name       Type Num entries  \\\n",
-       "0                                         AAAI   Document      16.02K   \n",
-       "1                                ACL Anthology   Document      55.28K   \n",
-       "2                               Annual Reports   Document     107.38K   \n",
-       "3                              arXiv abstracts   Document       2.37M   \n",
-       "4                      arXiv category taxonomy     Record         155   \n",
-       "5                         arXiv full documents   Document       2.29M   \n",
-       "6                                      BioRxiv   Document     357.76K   \n",
-       "7                                       Brenda     Record       7.12K   \n",
-       "8                                       ChEMBL     Record       2.11M   \n",
-       "9                                     ChemRxiv   Document       8.82K   \n",
-       "10                              ClinicalTrials   Document     426.42K   \n",
-       "11                                         COD     Record     503.78K   \n",
-       "12                                      Cord19   Document     655.45K   \n",
-       "13                                    Crossref   Document     131.86M   \n",
-       "14                       Crossref journal list  Reference     100.52K   \n",
-       "15                                 D&B Hoovers     Record         10K   \n",
-       "16                        DeepSearch materials     Record     360.54K   \n",
-       "17                                        DOAB   Document        8.8K   \n",
-       "18                                    DrugBank     Record       4.44K   \n",
-       "19                                     engrXiv   Document       1.84K   \n",
-       "20                                 ESG Reports   Document      17.36K   \n",
-       "21  FDA Adverse Event Reporting System (FAERS)   Document     435.62K   \n",
-       "22                                     GenBank     Record     260.36M   \n",
-       "23                     HBCP Open Access Corpus   Document          90   \n",
-       "24                                IBM Redbooks   Document       2.75K   \n",
-       "25                                        IEEE   Document      61.95K   \n",
-       "26   International Patent Classification (IPC)  Reference      78.52K   \n",
-       "27                                        IPCC   Document         819   \n",
-       "28                     Legal Entity Identifier     Record        2.1M   \n",
-       "29                         Material Components   Document      16.32K   \n",
-       "30                                     MedRxiv   Document      69.18K   \n",
-       "31                                     NeurIPS   Document       16.9K   \n",
-       "32                                        News   Document       9.82M   \n",
-       "33                                    NMRShift     Record      44.33K   \n",
-       "34                                     OpenCVF   Document      26.94K   \n",
-       "35                                    OpenStax   Document          76   \n",
-       "36                               OpenStreetMap    Generic     296.31M   \n",
-       "37                                      PatCID     Record      13.03M   \n",
-       "38                               Patent SMILES   Document       2.84M   \n",
-       "39                          Patents from CNIPR   Document           2   \n",
-       "40                            Patents from EPO   Document       7.09M   \n",
-       "41                            Patents from JPO   Document       2.54M   \n",
-       "42                           Patents from KIPO   Document        1.8M   \n",
-       "43                          Patents from USPTO   Document      16.16M   \n",
-       "44                   Patents from USPTO (TEST)   Document       6.81K   \n",
-       "45                                        PLOS   Document     340.28K   \n",
-       "46                                     PubChem     Record     118.24M   \n",
-       "47                              PubMed Central   Document       5.08M   \n",
-       "48                        PubMed Central (PDF)   Document      27.66K   \n",
-       "49                                     Red Hat   Document       7.17K   \n",
-       "50                                      RxNorm     Record     374.18K   \n",
-       "51                        SEC Edgar CIK Lookup  Reference        786K   \n",
-       "52                           SEC Edgar filings   Document      56.38K   \n",
-       "53             Semantic Scholar Academic Graph   Document     216.85M   \n",
-       "54                           SMILES from USPTO     Record     116.48M   \n",
-       "55            SMILES from USPTO (fingerprints)     Record      85.81M   \n",
-       "56                                        UMLS     Record       2.69M   \n",
-       "57                                     UniProt     Record     567.48K   \n",
-       "58                       USPTO patents for NER   Document       2.64K   \n",
-       "59                                   Wikipedia   Document       6.45M   \n",
+       "                       Name      Type Num entries        Date  \\\n",
+       "0                      AAAI  Document      16.02K  2023-08-29   \n",
+       "1             ACL Anthology  Document      55.28K  2023-08-22   \n",
+       "2            Annual Reports  Document     107.38K  2024-04-15   \n",
+       "3           arXiv abstracts  Document       2.37M  2023-12-07   \n",
+       "4   arXiv category taxonomy    Record         155  2023-12-05   \n",
+       "..                      ...       ...         ...         ...   \n",
+       "56                     UMLS    Record       2.69M  2023-01-03   \n",
+       "57                  UniProt    Record     567.48K  2023-01-03   \n",
+       "58    USPTO patents for NER  Document       2.64K  2023-03-20   \n",
+       "59            VHDL articles  Document         215  2024-04-23   \n",
+       "60                Wikipedia  Document       6.45M  2024-02-26   \n",
+       "\n",
+       "                    Coords  \n",
+       "0             default/aaai  \n",
+       "1              default/acl  \n",
+       "2    default/annual-report  \n",
+       "3   default/arxiv-abstract  \n",
+       "4   default/arxiv-category  \n",
+       "..                     ...  \n",
+       "56            default/umls  \n",
+       "57         default/uniprot  \n",
+       "58   default/uspto-for-ner  \n",
+       "59            default/vhdl  \n",
+       "60       default/wikipedia  \n",
        "\n",
-       "          Date                          Coords  \n",
-       "0   2023-08-29                    default/aaai  \n",
-       "1   2023-08-22                     default/acl  \n",
-       "2   2024-01-12           default/annual-report  \n",
-       "3   2023-12-07          default/arxiv-abstract  \n",
-       "4   2023-12-05          default/arxiv-category  \n",
-       "5   2023-10-29                   default/arxiv  \n",
-       "6   2023-11-09                 default/biorxiv  \n",
-       "7   2023-01-03                  default/brenda  \n",
-       "8   2023-01-03                  default/chembl  \n",
-       "9   2023-11-23                default/chemrxiv  \n",
-       "10  2023-01-03         default/clinical-trials  \n",
-       "11  2023-07-24                     default/cod  \n",
-       "12  2022-11-17                  default/cord19  \n",
-       "13  2023-02-22                default/crossref  \n",
-       "14  2022-02-22        default/crossref-journal  \n",
-       "15  2021-04-16             default/swot-report  \n",
-       "16  2023-01-03          default/ds4sd-material  \n",
-       "17  2023-12-04                    default/doab  \n",
-       "18  2022-11-03                default/drugbank  \n",
-       "19  2023-01-03                 default/engrxiv  \n",
-       "20  2024-01-08              default/esg-report  \n",
-       "21  2023-01-03                   default/faers  \n",
-       "22  2023-01-24                 default/genbank  \n",
-       "23  2023-01-03                    default/hbcp  \n",
-       "24  2023-06-08            default/ibm-redbooks  \n",
-       "25  2024-01-16                    default/ieee  \n",
-       "26  2022-02-22                default/wipo-ipc  \n",
-       "27  2023-06-14                    default/ipcc  \n",
-       "28  2023-08-16                     default/lei  \n",
-       "29  2023-01-30              default/experiment  \n",
-       "30  2023-11-02                 default/medrxiv  \n",
-       "31  2023-09-24                 default/neurips  \n",
-       "32  2023-09-10                    default/news  \n",
-       "33  2023-01-03                default/nmrshift  \n",
-       "34  2023-10-04                 default/opencvf  \n",
-       "35  2024-02-01                default/openstax  \n",
-       "36  2023-03-12                     default/osm  \n",
-       "37  2023-09-15                  default/patcid  \n",
-       "38  2023-10-11           default/patent-smiles  \n",
-       "39  2022-12-19            default/patent-cnipr  \n",
-       "40  2023-07-06              default/patent-epo  \n",
-       "41  2024-01-08              default/patent-jpo  \n",
-       "42  2022-12-19             default/patent-kipo  \n",
-       "43  2024-02-09            default/patent-uspto  \n",
-       "44  2024-03-13       default/patent-uspto-test  \n",
-       "45  2024-01-10                    default/plos  \n",
-       "46  2023-07-06                 default/pubchem  \n",
-       "47  2023-03-01                  default/pubmed  \n",
-       "48  2024-01-22                 default/pmc-pdf  \n",
-       "49  2024-01-23                  default/redhat  \n",
-       "50  2023-01-03                  default/rxnorm  \n",
-       "51  2022-02-22                 default/sec-cik  \n",
-       "52  2021-07-06              default/sec-filing  \n",
-       "53  2024-03-11        default/semantic-scholar  \n",
-       "54  2022-12-25     default/patent-uspto-smiles  \n",
-       "55  2023-02-23  default/patent-uspto-smiles-fp  \n",
-       "56  2023-01-03                    default/umls  \n",
-       "57  2023-01-03                 default/uniprot  \n",
-       "58  2023-03-20           default/uspto-for-ner  \n",
-       "59  2024-02-26               default/wikipedia  "
+       "[61 rows x 5 columns]"
       ]
      },
      "metadata": {},
@@ -1189,19 +612,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 8,
    "id": "357340cc-97e3-44bc-aa28-41a1be1e9a20",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cb1357fa9d50499e929a520811253c24",
+       "model_id": "731a7106c87f46fb97ed8d94c8ce883b",
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-       "  0%|          | 0/60 [00:00<?, ?it/s]"
+       "  0%|          | 0/61 [00:00<?, ?it/s]"
       ]
      },
      "metadata": {},
@@ -1276,7 +699,7 @@
     }
    ],
    "source": [
-    "# Input query\n",
+    "# Input query: search for papers which mention `DocLayNet` or `PubLayNet` in the main-text\n",
     "search_query = \"main-text.text:(\\\"DocLayNet\\\" OR \\\"PubLayNet\\\")\"\n",
     "\n",
     "# Iterate through the data collections\n",
@@ -1303,14 +726,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 11,
    "id": "90f84882-1c85-4b0a-b0eb-ea5bf0b41e32",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "21d681aa843745bca7374749f46e23e7",
+       "model_id": "e61ea71924bd4746961398551b4955bd",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1326,1581 +749,2869 @@
      "output_type": "stream",
      "text": [
       "2007.12238.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  MiniConf is a framework for hosting virtual ac...\n",
-      "1     title  #/texts/2            MiniConf-A Virtual Conference Framework\n",
-      "2    author  #/texts/2                                   Alexander M Rush\n",
-      "3    author  #/texts/2                                   Hendrik Strobelt\n",
+      "title:  MiniConf-A Virtual Conference Framework\n",
+      "abstract:  Abstract MiniConf is a framework for hosting virtual academic conferences motivated by the sudden inability for these events to be hosted globally. The framework is designed to be global and asynchronous, interactive, and to promote browsing and discovery. We developed the system to be sustainable and maintainable, in particular ensuring that it is open-source, easy to setup, and scalable on minimal hardware. In this technical report, we discuss design decisions, provide technical detail, and show examples of a case study deployment. Keywords Conference Management-Academic Communication-Software Development $^{1}$CS+Cornell Tech, Cornell University, New York NY, USA $^{2}$MIT-IBM Watson AI Lab, IBM Research, Cambridge MA, USA Correspondence : info@mini-conf.org\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   2265028778467379955  DOCUMENT          #         en        1.00\n",
+      "1   metadata   7284302905140581098  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   4436866271436177692  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   3600736756989613306  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   metadata   4437602680284425626  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "5   metadata   4167859403464433494  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "6   metadata  16515177709963531263  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "7   metadata  12555384452004545401  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "8   metadata   6728476374858842747  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "9   language   7357703022170425089      TEXT  #/texts/0         en        0.52\n",
+      "10  semantic   7357703022170425089      TEXT  #/texts/0  reference        0.97\n",
+      "11  language   7284302905140581098      TEXT  #/texts/1         en        0.67\n",
       "2111.06016.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Analyzing the layout of a document to identify...\n",
-      "1     title  #/texts/2  SYNTHETIC DOCUMENT GENERATOR FOR ANNOTATION-FR...\n",
-      "2    author  #/texts/2                                       Natraj Raman\n",
-      "3    author  #/texts/2                                       Sameena Shah\n",
-      "4    author  #/texts/2                                     Manuela Veloso\n",
+      "title:  SYNTHETIC DOCUMENT GENERATOR FOR ANNOTATION-FREE LAYOUT RECOGNITION\n",
+      "abstract:  ABSTRACT Analyzing the layout of a document to identify headers, sections, tables, figures etc. is critical to understanding its content. Deep learning based approaches for detecting the layout structure of document images have been promising. However, these methods require a large number of annotated examples during training, which are both expensive and time consuming to obtain. We describe here a synthetic document generator that automatically produces realistic documents with labels for spatial positions, extents and categories of the layout elements. The proposed generative process treats every physical component of a document as a random variable and models their intrinsic dependencies using a Bayesian Network graph. Our hierarchical formulation using stochastic templates allow parameter sharing between documents for retaining broad themes and yet the distributional characteristics produces visually unique samples, thereby capturing complex and diverse layouts. We empirically illustrate that a deep layout detection model trained purely on the synthetic documents can match the performance of a model that uses real documents. K eywords Synthetic Image Generation · Bayesian Network · Layout Analysis\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15863822848670978642  DOCUMENT          #         en        1.00\n",
+      "1   metadata    849366496391973042  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6562169947754304198  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  13702001854642761223  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   metadata  12685056807055253008  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "5   language  10149061364101562486      TEXT  #/texts/0         en        0.50\n",
+      "6   semantic  10149061364101562486      TEXT  #/texts/0  reference        0.91\n",
+      "7   language    849366496391973042      TEXT  #/texts/1         en        0.35\n",
+      "8   semantic    849366496391973042      TEXT  #/texts/1     header        0.97\n",
+      "9   language   9139753227411297604      TEXT  #/texts/2         en        0.21\n",
+      "10  semantic   9139753227411297604      TEXT  #/texts/2  meta-data        0.99\n",
+      "11  language   6562169947754304198      TEXT  #/texts/3         en        0.64\n",
       "2105.14931.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We present d ocument d omain r andom...\n",
-      "1     title  #/texts/2  Document Domain Randomization for Deep Learnin...\n",
-      "2    author  #/texts/2                                          Meng Ling\n",
-      "3    author  #/texts/2                                          Jian Chen\n",
-      "4    author  #/texts/2                                   Michael Sedlmair\n",
-      "5    author  #/texts/2                                   Robert S Laramee\n",
-      "6    author  #/texts/2                                            Jian Wu\n",
-      "7    author  #/texts/2                                              C Lee\n",
-      "8    author  #/texts/9                            Old Dominion University\n",
+      "title:  Document Domain Randomization for Deep Learning Document Layout Extraction\n",
+      "abstract:  Abstract. We present d ocument d omain r andomization (DDR), the first successful transfer of CNNs trained only on graphically rendered pseudo-paper pages to real-world document segmentation. DDR renders pseudo-document pages by modeling randomized textual and non-textual contents of interest, with userdefined layout and font styles to support joint learning of fine-grained classes. We demonstrate competitive results using our DDR approach to extract nine document classes from the benchmark CS-150 and papers published in two domains, namely annual meetings of Association for Computational Linguistics (ACL) and IEEE Visualization (VIS). We compare DDR to conditions of style mismatch, fewer or more noisy samples that are more easily obtained in the real world. We show that high-fidelity semantic information is not necessary to label semantic classes but style mismatch between train and test can lower model accuracy. Using smaller training samples had a slightly detrimental effect. Finally, network models still achieved high test accuracy when correct labels are diluted towards confusing labels; this behavior hold across several classes. Keywords: Document domain randomization · Document layout · Deep neural network · behavior analysis · evaluation.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   7327671452024217603  DOCUMENT           #         en   \n",
+      "1   metadata  14069245462061859532  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  12775108533738256421  DOCUMENT  #/texts/12   abstract   \n",
+      "3   metadata  16989119287644322975  DOCUMENT  #/texts/13   abstract   \n",
+      "4   language   1657309244825806266      TEXT   #/texts/0         en   \n",
+      "5   semantic   1657309244825806266      TEXT   #/texts/0  reference   \n",
+      "6   language  14069245462061859532      TEXT   #/texts/1         en   \n",
+      "7   semantic  14069245462061859532      TEXT   #/texts/1     header   \n",
+      "8   language   8299103160765612598      TEXT   #/texts/2         en   \n",
+      "9   semantic   8299103160765612598      TEXT   #/texts/2  meta-data   \n",
+      "10  language   3210927963314597787      TEXT   #/texts/3         en   \n",
+      "11  semantic   3210927963314597787      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.96  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.64  \n",
+      "5         0.66  \n",
+      "6         0.51  \n",
+      "7         0.88  \n",
+      "8         0.24  \n",
+      "9         0.99  \n",
+      "10        0.44  \n",
+      "11        0.72  \n",
       "2102.02971.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-The triple-based knowledge in large-s...\n",
-      "1     title  #/texts/2  Metaknowledge Extraction Based on Multi-Modal ...\n",
-      "2    author  #/texts/2                                         Shukan Liu\n",
-      "3    author  #/texts/2                                          Ruilin Xu\n",
-      "4    author  #/texts/2                                        Boying Geng\n",
-      "5    author  #/texts/2                                           Qiao Sun\n",
-      "6    author  #/texts/2                                            Li Duan\n",
-      "7    author  #/texts/2                                         Yiming Liu\n",
+      "title:  Metaknowledge Extraction Based on Multi-Modal Documents\n",
+      "abstract:  Abstract-The triple-based knowledge in large-scale knowledge bases is most likely lacking in structural logic and problematic of conducting knowledge hierarchy. In this paper, we introduce the concept of metaknowledge to knowledge engineering research for the purpose of structural knowledge construction. Therefore, the Metaknowledge Extraction Framework and Document Structure Tree model are presented to extract and organize metaknowledge elements (titles, authors, abstracts, sections, paragraphs, etc.), so that it is feasible to extract the structural knowledge from multi-modal documents. Experiment results have proved the effectiveness of metaknowledge elements extraction by our framework. Meanwhile, detailed examples are given to demonstrate what exactly metaknowledge is and how to generate it. At the end of this paper, we propose and analyze the task flow of metaknowledge applications and the associations between knowledge and metaknowledge. Index Terms-Metaknowledge, Multi-Modal, Document Layout Analysis, Knowledge Graph.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9495012996734215687  DOCUMENT          #         en        0.97\n",
+      "1   metadata  10742098332968591246  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2370874436100491633  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  11616931947318553305  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language  12404813074137060112      TEXT  #/texts/0         en        0.49\n",
+      "5   semantic  12404813074137060112      TEXT  #/texts/0       text        0.90\n",
+      "6   language  10742098332968591246      TEXT  #/texts/1         en        0.62\n",
+      "7   semantic  10742098332968591246      TEXT  #/texts/1     header        0.55\n",
+      "8   language   4948844491635507699      TEXT  #/texts/2         en        0.56\n",
+      "9   semantic   4948844491635507699      TEXT  #/texts/2  meta-data        0.88\n",
+      "10  language   2370874436100491633      TEXT  #/texts/3         en        0.89\n",
+      "11  semantic   2370874436100491633      TEXT  #/texts/3       text        0.99\n",
       "2003.13197.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Decomposing images of document pages into high...\n",
-      "1      title  #/texts/2  Cross-Domain Document Object Detection: Benchm...\n",
-      "2     author  #/texts/2                                             Kai Li\n",
-      "3     author  #/texts/2                                    Chris Tensmeyer\n",
-      "4     author  #/texts/2                                       Handong Zhao\n",
-      "5     author  #/texts/3                                Nikolaos Barmpalios\n",
-      "6     author  #/texts/3                                     Vlad I Morariu\n",
-      "7     author  #/texts/3                                   Varun Manjunatha\n",
-      "8     author  #/texts/3                                           Tong Sun\n",
-      "9     author  #/texts/3                                             Yun Fu\n",
-      "10    author  #/texts/4                            Northeastern University\n",
-      "11    author  #/texts/4                                     Adobe Research\n",
-      "12    author  #/texts/4                               Adobe Document Cloud\n",
+      "title:  Cross-Domain Document Object Detection: Benchmark Suite and Method\n",
+      "abstract:  Abstract Decomposing images of document pages into high-level semantic regions (e.g., figures, tables, paragraphs), document object detection (DOD) is fundamental for downstream tasks like intelligent document editing and understanding. DOD remains a challenging problem as document objects vary significantly in layout, size, aspect ratio, texture, etc. An additional challenge arises in practice because large labeled training datasets are only available for domains that differ from the target domain. We investigate cross-domain DOD, where the goal is to learn a detector for the target domain using labeled data from the source domain and only unlabeled data from the target domain. Documents from the two domains may vary significantly in layout, language, and genre. We establish a benchmark suite consisting of different types of PDF document datasets that can be utilized for cross-domain DOD model training and evaluation. For each dataset, we provide the page images, bounding box annotations, PDF files, and the rendering layers extracted from the PDF files. Moreover, we propose a novel cross-domain DOD model which builds upon the standard detection model and addresses domain shifts by incorporating three novel alignment modules: Feature Pyramid Alignment (FPA) module, Region Alignment (RA) module and Rendering Layer alignment (RLA) module. Extensive experiments on the benchmark suite substantiate the efficacy of the three proposed modules and the proposed method significantly outperforms the baseline methods. The project page is at https://github.com/kailigo/cddod.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   2954750465553139966  DOCUMENT          #         en        1.00\n",
+      "1   metadata   9382351260204097292  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  12550748484969917940  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  16113821817977818841  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   9754266243859282074      TEXT  #/texts/0         en        0.23\n",
+      "5   semantic   9754266243859282074      TEXT  #/texts/0  reference        0.78\n",
+      "6   language   9382351260204097292      TEXT  #/texts/1         en        0.51\n",
+      "7   semantic   9382351260204097292      TEXT  #/texts/1     header        0.70\n",
+      "8   language    202484138750054472      TEXT  #/texts/2         en        0.25\n",
+      "9   semantic    202484138750054472      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  14449744740596420817      TEXT  #/texts/3         en        0.20\n",
+      "11  semantic  14449744740596420817      TEXT  #/texts/3  meta-data        1.00\n",
       "2111.08609.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document AI, or Document Intelligence, is a re...\n",
-      "1     title  #/texts/3          Lei Cui, Yiheng Xu, Tengchao Lv, Furu Wei\n",
-      "2    author  #/texts/3                            Microsoft Research Asia\n",
+      "title:  Lei Cui, Yiheng Xu, Tengchao Lv, Furu Wei\n",
+      "abstract:  ABSTRACT Document AI, or Document Intelligence, is a relatively new research topic that refers to the techniques for automatically reading, understanding, and analyzing business documents. It is an important research direction for natural language processing and computer vision. In recent years, the popularity of deep learning technology has greatly advanced the development of Document AI, such as document layout analysis, visual information extraction, document visual question answering, document image classification, etc. This paper briefly reviews some of the representative models, tasks, and benchmark datasets. Furthermore, we also introduce early-stage heuristic rule-based document analysis, statistical machine learning algorithms, and deep learning approaches especially pre-training methods. Finally, we look into future directions for Document AI research. 1 DOCUMENT AI Document AI, or Document Intelligence, is a booming research topic with increased industrial demand in recent years. It mainly refers to the process of automated understanding, classifying and extracting information with rich typesetting formats from webpages, digital-born documents or scanned documents through AI technology. Due to the diversity of layouts and formats, low-quality scanned document images, and the complexity of the template structure, Document AI is a very challenging task and has attracted widespread attention in related research areas. With the acceleration of digitization, the structured analysis and content extraction of documents, images and others has become a key part of the success of digital transformation. Meanwhile automatic, accurate, and rapid information processing is crucial to improving productivity. Taking business documents as an example, they not only contain the processing details and knowledge accumulation of a company's internal and external affairs, but also a large number of industry-related entities and digital information. 
Manually extracting information is time-consuming and labor-intensive with low accuracy and low reusability. Document AI deeply combines artificial intelligence and human intelligence, and has different types of applications in multiple industries such as finance, healthcare, insurance, energy and logistics. For instance, in the finance field, it can conduct financial report analysis and intelligent decision analysis, and provide scientific and systematic data support for the formulation of corporate strategies and investment decisions. In healthcare, it can improve the digitization of medical cases and enhance diagnosis accuracy. By analyzing the correlation between medical literature and cases, people can locate potential treatment options. In the accounting field, it can achieve automatic information extraction of invoices and purchase orders, automatically analyze a large number of unstructured documents, and support different downstream business scenarios, saving a lot of manual processing time.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  18024887739747733016  DOCUMENT          #         en        1.00\n",
+      "1   metadata   2823524375188962888  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   metadata  17737597528573843477  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   4278000990015673025  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata  14014369534978377579  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   metadata   7722544801970925360  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "6   language    326120714262372444      TEXT  #/texts/0         en        0.40\n",
+      "7   semantic    326120714262372444      TEXT  #/texts/0  reference        0.51\n",
+      "8   language   7667149918016322326      TEXT  #/texts/1         en        0.44\n",
+      "9   semantic   7667149918016322326      TEXT  #/texts/1     header        0.72\n",
+      "10  language   2823524375188962888      TEXT  #/texts/2         en        0.71\n",
+      "11  semantic   2823524375188962888      TEXT  #/texts/2  reference        0.90\n",
       "2209.00852.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Layout generation is a novel task in computer ...\n",
-      "1     title  #/texts/2  Geometry Aligned Variational Transformer for I...\n",
-      "2    author  #/texts/2                                        Yunning Cao\n",
-      "3    author  #/texts/2                                              Ye Ma\n",
-      "4    author  #/texts/2                                           Min Zhou\n",
-      "5    author  #/texts/2                                       Chuanbin Liu\n",
-      "6    author  #/texts/2                                        Hongtao Xie\n",
-      "7    author  #/texts/2                                        Tiezheng Ge\n",
-      "8    author  #/texts/2                                       Yuning Jiang\n",
+      "title:  Geometry Aligned Variational Transformer for Image-conditioned Layout Generation\n",
+      "abstract:  ABSTRACT Layout generation is a novel task in computer vision, which combines the challenges in both object localization and aesthetic appraisal, widely used in advertisements, posters, and slides design. An accurate and pleasant layout should consider both the intradomain relationship within layout elements and the inter-domain relationship between layout elements and the image. However, most previous methods simply focus on image-content-agnostic layout generation, without leveraging the complex visual information from the image. To this end, we explore a novel paradigm entitled image-conditioned layout generation, which aims to add text overlays to an image in a semantically coherent manner. Specifically, we propose an Image-Conditioned Variational Transformer (ICVT) that autoregressively generates various layouts in an image. First, self-attention mechanism is adopted to model the contextual relationship within layout elements, while cross-attention mechanism is used to fuse the visual information of conditional images. Subsequently, we take them as building blocks of conditional variational autoencoder (CVAE), which demonstrates appealing diversity. Second, in order to alleviate the gap between layout elements domain and visual domain, we design a Geometry Alignment module, in which the geometric information of the image is aligned with the layout representation. In addition, we construct a large-scale advertisement poster layout designing dataset with delicate layout and saliency map annotations. Experimental results show that our model can adaptively generate layouts in the non-intrusive area of the image, resulting in a harmonious layout design. KEYWORDS image-conditioned layout generation, conditional variational autoencoder, Transformer, cross attention\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   1043133886708999723  DOCUMENT          #         en        0.99\n",
+      "1   metadata  16310176366628644108  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7923403198268402625  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   6638340325318454439  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   metadata   7923294752006355278  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "5   metadata   4688329289695183139  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "6   language   8377923071847443401      TEXT  #/texts/0         en        0.56\n",
+      "7   semantic   8377923071847443401      TEXT  #/texts/0  reference        0.95\n",
+      "8   language  16310176366628644108      TEXT  #/texts/1         en        0.74\n",
+      "9   semantic  16310176366628644108      TEXT  #/texts/1     header        0.86\n",
+      "10  language  12909032936984420733      TEXT  #/texts/2         en        0.35\n",
+      "11  semantic  12909032936984420733      TEXT  #/texts/2  meta-data        0.99\n",
       "2203.09056.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We introduce a new table detection and structu...\n",
-      "1     title  #/texts/2  Robust Table Detection and Structure Recogniti...\n",
-      "2    author  #/texts/2                                        Chixiang Ma\n",
-      "3    author  #/texts/2                                        Weihong Lin\n",
-      "4    author  #/texts/2                                            Lei Sun\n",
-      "5    author  #/texts/2                                         Qiang Huob\n",
-      "6    author  #/texts/3                            Microsoft Research Asia\n",
+      "title:  Robust Table Detection and Structure Recognition from Heterogeneous Document Images\n",
+      "abstract:  Abstract We introduce a new table detection and structure recognition approach named RobusTabNet to detect the boundaries of tables and reconstruct the cellular structure of the table from heterogeneous document images. For table detection, we propose to use CornerNet as a new region proposal network to generate higher quality table proposals for Faster R-CNN, which has significantly improved the localization accuracy of Faster R-CNN for table detection. Consequently, our table detection approach achieves state-of-the-art performance on three public table detection benchmarks, namely cTDaR TrackA, PubLayNet and IIIT-AR-13K, by only using a lightweight ResNet-18 backbone network. Furthermore, we propose a new split-and-merge based table structure recognition approach, in which a novel spatial CNN based separation line prediction module is proposed to split each detected table into a grid of cells, and a Grid CNN based cell merging module is applied to recover the spanning cells. As the spatial CNN module can e ectively propagate contextual information across the whole table image, our table structure recognizer can robustly recognize tables with large blank spaces and geometrically distorted (even curved) tables. Thanks to these two techniques, our table structure recognition approach achieves state-of-the-art performance on three public benchmarks, including SciTSR, PubTabNet and cTDaR TrackB. Moreover, we have further demonstrated the advantages of our approach in recognizing tables with complex structures, large blank spaces, empty or spanning cells as well as geometrically distorted or even curved tables on a more challenging in-house dataset. Keywords: Table detection, Table structure recognition, Corner detection, Spatial CNN, Grid CNN, Split-and-merge\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    889660866869193011  DOCUMENT          #         en        0.99\n",
+      "1   metadata   4207068466148295970  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15138944633239382092  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   5174676295578023103  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata   5104988531686271285  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   language  12654551183495324521      TEXT  #/texts/0         en        0.42\n",
+      "6   semantic  12654551183495324521      TEXT  #/texts/0       text        0.89\n",
+      "7   language   4207068466148295970      TEXT  #/texts/1         en        0.67\n",
+      "8   semantic   4207068466148295970      TEXT  #/texts/1     header        0.66\n",
+      "9   language   8283687367011430155      TEXT  #/texts/2         it        0.31\n",
+      "10  semantic   8283687367011430155      TEXT  #/texts/2  meta-data        0.99\n",
+      "11  language   2638908518268155278      TEXT  #/texts/3         en        0.64\n",
       "2203.09638.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We formulate the task of detecting l...\n",
-      "1     title  #/texts/2  Unified Line and Paragraph Detection by Graph ...\n",
-      "2    author  #/texts/2                                         Shuang Liu\n",
-      "3    author  #/texts/2                                       Renshen Wang\n",
-      "4    author  #/texts/2                                    Michalis Raptis\n",
-      "5    author  #/texts/2                                     Yasuhisa Fujii\n",
-      "6    author  #/texts/3                                          San Diego\n",
-      "7    author  #/texts/5                                    Google Research\n",
+      "title:  Unified Line and Paragraph Detection by Graph Convolutional Networks ⋆\n",
+      "abstract:  Abstract. We formulate the task of detecting lines and paragraphs in a document into a unified two-level clustering problem. Given a set of text detection boxes that roughly correspond to words, a text line is a cluster of boxes and a paragraph is a cluster of lines. These clusters form a two-level tree that represents a major part of the layout of a document. We use a graph convolutional network to predict the relations between text detection boxes and then build both levels of clusters from these predictions. Experimentally, we demonstrate that the unified approach can be highly efficient while still achieving state-of-the-art quality for detecting paragraphs in public benchmarks and real-world images. Keywords: Text detection, document layout, graph convolutional network.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   2831576719369875742  DOCUMENT          #         en        1.00\n",
+      "1   metadata  10579568759084702273  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1968444455801408936  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  13330257735040533249  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   9315017878854505401      TEXT  #/texts/0         en        0.33\n",
+      "5   semantic   9315017878854505401      TEXT  #/texts/0       text        0.89\n",
+      "6   language  10579568759084702273      TEXT  #/texts/1         en        0.90\n",
+      "7   semantic  10579568759084702273      TEXT  #/texts/1     header        0.63\n",
+      "8   language    618605864483312106      TEXT  #/texts/2         en        0.24\n",
+      "9   semantic    618605864483312106      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  18349261345975562107      TEXT  #/texts/3         en        0.75\n",
+      "11  semantic  18349261345975562107      TEXT  #/texts/3  meta-data        0.97\n",
       "2305.05836.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Accurate Named Entity Recognition (NER) is cru...\n",
-      "1     title  #/texts/2  Extracting Complex Named Entities in Legal Doc...\n",
-      "2    author  #/texts/3                                    Abhinav Agrawal\n",
+      "title:  Extracting Complex Named Entities in Legal Documents via Weakly Supervised Object Detection\n",
+      "abstract:  ABSTRACT Accurate Named Entity Recognition (NER) is crucial for various information retrieval tasks in industry. However, despite significant progress in traditional NER methods, the extraction of Complex Named Entities remains a relatively unexplored area. In this paper, we propose a novel system that combines object detection for Document Layout Analysis (DLA) with weakly supervised learning to address the challenge of extracting discontinuous complex named entities in legal documents. Notably, to the best of our knowledge, this is the first work to apply weak supervision to DLA. Our experimental results show that the model trained solely on pseudo labels outperforms the supervised baseline when gold-standard data is limited, highlighting the effectiveness of our proposed approach in reducing the dependency on annotated data. CCS CONCEPTS · Applied computing → Law; · Information systems → Document structure; · Computing methodologies → Information extraction. KEYWORDS complex named entity recognition, weakly supervised object detection, document understanding, law, information extraction ACM Reference Format: Hsiu-Wei Yang and Abhinav Agrawal. 2023. Extracting Complex Named Entities in Legal Documents via Weakly Supervised Object Detection. In Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR '23), July 23-27, 2023, Taipei, Taiwan. ACM, New York, NY, USA, 5 pages. https://doi.org/10.1145/3539618. 3591852\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language  11794573301687210414  DOCUMENT           #        en        1.00\n",
+      "1   metadata  17466521243985813516  DOCUMENT   #/texts/1     title        1.00\n",
+      "2   metadata   6886915531134010569  DOCUMENT   #/texts/4  abstract        1.00\n",
+      "3   metadata  11297975182741860390  DOCUMENT   #/texts/5  abstract        1.00\n",
+      "4   metadata  10877363875417754656  DOCUMENT   #/texts/6  abstract        1.00\n",
+      "5   metadata  13673276854222443119  DOCUMENT   #/texts/7  abstract        1.00\n",
+      "6   metadata   6886659627785775936  DOCUMENT   #/texts/8  abstract        1.00\n",
+      "7   metadata   6196452448967347559  DOCUMENT   #/texts/9  abstract        1.00\n",
+      "8   metadata  18033064798229328065  DOCUMENT  #/texts/10  abstract        1.00\n",
+      "9   metadata   8842802019638608357  DOCUMENT  #/texts/11  abstract        1.00\n",
+      "10  language  11740347048559880097      TEXT   #/texts/0        en        0.62\n",
+      "11  semantic  11740347048559880097      TEXT   #/texts/0      text        0.83\n",
       "2305.02567.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Automatic layout generation that can synthesiz...\n",
-      "1     title  #/texts/2  LayoutDM: Transformer-based Diffusion Model fo...\n",
-      "2    author  #/texts/2                                         Shang Chai\n",
+      "title:  LayoutDM: Transformer-based Diffusion Model for Layout Generation\n",
+      "abstract:  Abstract Automatic layout generation that can synthesize highquality layouts is an important tool for graphic design in many applications. Though existing methods based on generative models such as Generative Adversarial Networks (GANs) and Variational Auto-Encoders (VAEs) have progressed, they still leave much room for improving the quality and diversity of the results. Inspired by the recent success of diffusion models in generating high-quality images, this paper explores their potential for conditional layout generation and proposes Transformer-based Layout Diffusion Model (LayoutDM) by instantiating the conditional denoising diffusion probabilistic model (DDPM) with a purely transformer-based architecture. Instead of using convolutional neural networks, a transformer-based conditional Layout Denoiser is proposed to learn the reverse diffusion process to generate samples from noised layout data. Benefitting from both transformer and DDPM, our LayoutDM is of desired properties such as high-quality generation, strong sample diversity, faithful distribution coverage, and stationary training in comparison to GANs and VAEs. Quantitative and qualitative experimental results show that our method outperforms state-of-the-art generative models in terms of quality and diversity.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10181470485652924664  DOCUMENT          #         en        0.99\n",
+      "1   metadata   9434206477714495822  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3785748435451180652  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   3680119538929946904  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   9912909017476921615      TEXT  #/texts/0         en        0.54\n",
+      "5   semantic   9912909017476921615      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   9434206477714495822      TEXT  #/texts/1         en        0.61\n",
+      "7   semantic   9434206477714495822      TEXT  #/texts/1     header        0.86\n",
+      "8   language  10066165997310264035      TEXT  #/texts/2         en        0.77\n",
+      "9   semantic  10066165997310264035      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  11841021682582949258      TEXT  #/texts/3         zh        0.43\n",
+      "11  semantic  11841021682582949258      TEXT  #/texts/3  meta-data        0.99\n",
       "2205.12840.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We present a novel method, DistillAd...\n",
-      "1     title  #/texts/1  DistillAdapt: Source-Free Active Visual Domain...\n",
-      "2    author  #/texts/1                                Divya Kothandaraman\n",
-      "3    author  #/texts/1                                      Sumit Shekhar\n",
-      "4    author  #/texts/1                                 Abhilasha Sancheti\n",
-      "5    author  #/texts/1                                       Manoj Ghuhan\n",
-      "6    author  #/texts/1                                      Tripti Shukla\n",
-      "7    author  #/texts/1                                     Dinesh Manocha\n",
-      "8    author  #/texts/2                                     Adobe Research\n",
+      "title:  DistillAdapt: Source-Free Active Visual Domain Adaptation\n",
+      "abstract:  Abstract. We present a novel method, DistillAdapt, for the challenging problem of Source-free Active Domain Adaptation (SF-ADA). The problem requires adapting a pre-trained 'source' domain network to a 'target' domain, within a provided budget for acquiring labels in the 'target' domain, while assuming that the source data is not available for adaptation, due to privacy concerns or otherwise. DistillAdapt is one of the first approaches for SF-ADA, and holistically addresses the challenges of SF-ADA via a novel Guided Attention Transfer Network (GATN) and an active learning function, H$_{AL}$. The GATN enables selective distillation of features from the pre-trained network to the target network using a small subset of annotated target samples mined by H$_{AL}$. H$_{AL}$ acquires samples at batch-level and balances transfer-ability from the pre-trained network and uncertainty of the target network. DistillAdapt is task-agnostic, and can be applied across visual tasks such as classification, segmentation and detection. Moreover, DistillAdapt can handle shifts in output label space. We conduct experiments and extensive ablation studies across 3 visual tasks, viz. digits classification (MNIST, SVHN), synthetic (GTA5) to real (CityScapes) image segmentation, and document layout detection (PubLayNet to DSSE). We show that our source-free approach, DistillAdapt, results in an improvement of 0. 5%-31. 3% (across datasets and tasks) over prior adaptation methods that assume access to large amounts of annotated source data for adaptation.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15881966247098943967  DOCUMENT          #         en        1.00\n",
+      "1   metadata  14946664510960877857  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata   1143708917984711296  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   language  14946664510960877857      TEXT  #/texts/0         en        0.32\n",
+      "4   semantic  14946664510960877857      TEXT  #/texts/0     header        0.76\n",
+      "5   language   4787167108441673167      TEXT  #/texts/1         en        0.49\n",
+      "6   semantic   4787167108441673167      TEXT  #/texts/1  meta-data        0.96\n",
+      "7   language   5395310715898161319      TEXT  #/texts/2         en        0.74\n",
+      "8   semantic   5395310715898161319      TEXT  #/texts/2  meta-data        0.90\n",
+      "9   language   1143708917984711296      TEXT  #/texts/3         en        0.85\n",
+      "10  semantic   1143708917984711296      TEXT  #/texts/3       text        0.98\n",
+      "11  language   8083078873212012775      TEXT  #/texts/4         en        0.12\n",
       "2105.06400.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Information Extraction (IE) from the...\n",
-      "1     title  #/texts/2  TabLeX: A Benchmark Dataset for Structure and ...\n",
-      "2    author  #/texts/2                                        Harsh Desai\n",
-      "3    author  #/texts/2                                       Pratik Kayal\n",
-      "4    author  #/texts/2                                       Mayank Singh\n",
+      "title:  TabLeX: A Benchmark Dataset for Structure and Content Information Extraction from Scientific Tables\n",
+      "abstract:  Abstract. Information Extraction (IE) from the tables present in scientific articles is challenging due to complicated tabular representations and complex embedded text. This paper presents$_{TabLeX}$, a large-scale benchmark dataset comprising table images generated from scientific articles. TabLeX consists of two subsets, one for table structure extraction and the other for table content extraction. Each table image is accompanied by its corresponding L A T E X source code. To facilitate the development of robust table IE tools, TabLeX contains images in different aspect ratios and in a variety of fonts. Our analysis sheds light on the shortcomings of current state-of-the-art table extraction models and shows that they fail on even simple table images. Towards the end, we experiment with a transformer-based existing baseline to report performance scores. In contrast to the static benchmarks, we plan to augment this dataset with more complex and diverse tables at regular intervals. Keywords: Information Extraction · L A T E X · Scientific Articles.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13137329850442248461  DOCUMENT          #         en        0.99\n",
+      "1   metadata  11472458349144611068  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   4588761031433719882  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   8958995113943448416  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  12599017931824777467      TEXT  #/texts/0         en        0.63\n",
+      "5   semantic  12599017931824777467      TEXT  #/texts/0       text        0.83\n",
+      "6   language  11472458349144611068      TEXT  #/texts/1         en        0.59\n",
+      "7   semantic  11472458349144611068      TEXT  #/texts/1     header        0.86\n",
+      "8   language  11111553338144474380      TEXT  #/texts/2         en        0.42\n",
+      "9   semantic  11111553338144474380      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   1415660526087454217      TEXT  #/texts/3         en        0.56\n",
+      "11  semantic   1415660526087454217      TEXT  #/texts/3  meta-data        0.95\n",
       "2103.15348.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Abstract. Recent advances in document image an...\n",
-      "1      title  #/texts/2  LayoutParser : A Unified Toolkit for Deep Lear...\n",
-      "2     author  #/texts/2                                       Zejiang Shen\n",
-      "3     author  #/texts/2                                      Ruochen Zhang\n",
-      "4     author  #/texts/2                                       Melissa Dell\n",
-      "5     author  #/texts/2                       Benjamin Charles Germain Lee\n",
-      "6     author  #/texts/3                                      Jacob Carlson\n",
-      "7     author  #/texts/3                                         Weining Li\n",
-      "8     author  #/texts/4                                    Allen Institute\n",
-      "9     author  #/texts/5                                   Brown University\n",
-      "10    author  #/texts/6                                 Harvard University\n",
+      "title:  LayoutParser : A Unified Toolkit for Deep Learning Based Document Image Analysis\n",
+      "abstract:  Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of important innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applications. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout detection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digitization pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io. Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  12548874647383832801  DOCUMENT           #         en   \n",
+      "1   metadata   2802584575013519106  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   2179018036072283553  DOCUMENT  #/texts/11   abstract   \n",
+      "3   metadata   8986516417526936006  DOCUMENT  #/texts/12   abstract   \n",
+      "4   language   1737321237888584465      TEXT   #/texts/0         en   \n",
+      "5   semantic   1737321237888584465      TEXT   #/texts/0       text   \n",
+      "6   language   2802584575013519106      TEXT   #/texts/1         en   \n",
+      "7   semantic   2802584575013519106      TEXT   #/texts/1     header   \n",
+      "8   language   7564785066304785170      TEXT   #/texts/2         en   \n",
+      "9   semantic   7564785066304785170      TEXT   #/texts/2  meta-data   \n",
+      "10  language   9158019064682925600      TEXT   #/texts/3         en   \n",
+      "11  semantic   9158019064682925600      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         1.00  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.34  \n",
+      "5         0.89  \n",
+      "6         0.52  \n",
+      "7         0.81  \n",
+      "8         0.24  \n",
+      "9         1.00  \n",
+      "10        0.52  \n",
+      "11        0.99  \n",
       "2110.09915.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Previous works on key information extraction f...\n",
-      "1     title  #/texts/2  Entity Relation Extraction as Dependency Parsi...\n",
-      "2    author  #/texts/2                                          Yue Zhang\n",
-      "3    author  #/texts/2                                           Bo Zhang\n",
-      "4    author  #/texts/2                                           Rui Wang\n",
-      "5    author  #/texts/2                                         Junjie Cao\n",
-      "6    author  #/texts/2                                            Chen Li\n",
-      "7    author  #/texts/2                                           Zuyi Bao\n",
+      "title:  Entity Relation Extraction as Dependency Parsing in Visually Rich Documents\n",
+      "abstract:  Abstract Previous works on key information extraction from visually rich documents (VRDs) mainly focus on labeling the text within each bounding box (i.e., semantic entity), while the relations in-between are largely unexplored. In this paper, we adapt the popular dependency parsing model, the biaffine parser, to this entity relation extraction task. Being different from the original dependency parsing model which recognizes dependency relations between words, we identify relations between groups of words with layout information instead. We have compared different representations of the semantic entity, different VRD encoders, and different relation decoders. For the model training, we explore multi-task learning to combine entity labeling and relation extraction tasks; and for the evaluation, we conduct experiments on different datasets with filtering and augmentation. The results demonstrate that our proposed model achieves 65.96% F1 score on the FUNSD dataset. As for the realworld application, our model has been applied to the in-house customs data, achieving reliable performance in the production setting.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7256593462777767795  DOCUMENT          #         en        0.99\n",
+      "1   metadata  11039456751490139420  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3437509844966890620  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata   3351640345825674097  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   4359367855478211315      TEXT  #/texts/0         en        0.36\n",
+      "5   semantic   4359367855478211315      TEXT  #/texts/0       text        0.70\n",
+      "6   language  11039456751490139420      TEXT  #/texts/1         en        0.85\n",
+      "7   semantic  11039456751490139420      TEXT  #/texts/1     header        0.86\n",
+      "8   language   2331012439618467779      TEXT  #/texts/2         en        0.17\n",
+      "9   semantic   2331012439618467779      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   7382564455872733900      TEXT  #/texts/3         en        0.65\n",
+      "11  semantic   7382564455872733900      TEXT  #/texts/3  meta-data        1.00\n",
       "2112.12353.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract: The volume of academic literature, s...\n",
-      "1     title  #/texts/1  LAME: Layout-Aware Metadata Extraction Approac...\n",
-      "2    author  #/texts/1                                        South Korea\n",
-      "3    author  #/texts/2                                        South Korea\n",
-      "4    author  #/texts/4                                        South Korea\n",
+      "title:  LAME: Layout-Aware Metadata Extraction Approach for Research Articles JONGYUN CHOI$^{1}$, HYESOO KONG$^{2}$, HWAMOOK YOON$^{2}$, HEUNG-SEON OH$^{3}$, and YUCHUL JUNG$^{1*}$\n",
+      "abstract:  Abstract: The volume of academic literature, such as academic conference papers and journals, has increased rapidly worldwide, and research on metadata extraction is ongoing. However, high-performing metadata extraction is still challenging due to diverse layout formats according to journal publishers. To accommodate the diversity of the layouts of academic journals, we propose a novel LAyout-aware Metadata Extraction (LAME) framework equipped with the three characteristics (e.g., design of an automatic layout analysis, construction of a large meta-data training set, and construction of Layout-MetaBERT). We designed an automatic layout analysis using PDFMiner. Based on the layout analysis, a large volume of metadata-separated training data, including the title, abstract, author name, author affiliated organization, and keywords, were automatically extracted. Moreover, we constructed Layout-MetaBERT to extract the metadata from academic journals with varying layout formats. The experimental results with Layout-MetaBERT exhibited robust performance (Macro-F1, 93.27%) in metadata extraction for unseen journals with different layout formats. Keywords: Automatic layout analysis, Layout-MetaBERT, Metadata extraction, Research article\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6012573637307908533  DOCUMENT          #         en        1.00\n",
+      "1   metadata    464228663354183117  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata  15858266536491636911  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   9152689721526878606  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language    464228663354183117      TEXT  #/texts/0         en        0.39\n",
+      "5   semantic    464228663354183117      TEXT  #/texts/0  meta-data        0.65\n",
+      "6   language  14433534267320267039      TEXT  #/texts/1         en        0.74\n",
+      "7   semantic  14433534267320267039      TEXT  #/texts/1  meta-data        0.99\n",
+      "8   language   4203210264018792995      TEXT  #/texts/2         en        0.64\n",
+      "9   semantic   4203210264018792995      TEXT  #/texts/2  meta-data        0.97\n",
+      "10  language  15854392013436684226      TEXT  #/texts/3         en        0.79\n",
+      "11  semantic  15854392013436684226      TEXT  #/texts/3  meta-data        0.94\n",
       "2201.09745.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Since a vast number of tables can be easily co...\n",
-      "1     title  #/texts/2  Table Pre-training: A Survey on Model Architec...\n",
-      "2    author  #/texts/2                                         Haoyu Dong\n",
-      "3    author  #/texts/3                                 Microsoft Research\n",
-      "4    author  #/texts/4                                 Microsoft Research\n",
-      "5    author  #/texts/5                                             Ao Liu\n",
-      "6    author  #/texts/6                         Shi Han Microsoft Research\n",
-      "7    author  #/texts/7                   Dongmei Zhang Microsoft Research\n",
+      "title:  Table Pre-training: A Survey on Model Architectures, Pre-training Objectives, and Downstream Tasks\n",
+      "abstract:  ABSTRACT Since a vast number of tables can be easily collected from web pages, spreadsheets, PDFs, and various other document types, a flurry of table pre-training frameworks have been proposed following the success of text and images, and they have achieved new state-of-thearts on various tasks such as table question answering, table type recognition, column relation classification, table search, formula prediction, etc. To fully use the supervision signals in unlabeled tables, a variety of pre-training objectives have been designed and evaluated, for example, denoising cell values, predicting numerical relationships, and implicitly executing SQLs. And to best leverage the characteristics of (semi-)structured tables, various tabular language models, particularly with specially-designed attention mechanisms, have been explored. Since tables usually appear and interact with free-form text, table pre-training usually takes the form of table-text joint pre-training, which attracts significant research interests from multiple domains. This survey aims to provide a comprehensive review of different model designs, pre-training objectives, and downstream tasks for table pre-training, and we further share our thoughts and vision on existing challenges and future opportunities. CCS CONCEPTS · Information systems → Information retrieval. KEYWORDS Semi-structured table; Representation learning; pre-training ACM Reference Format: Haoyu Dong, Zhoujun Cheng, Xinyi He, Mengyu Zhou, Anda Zhou, Fan Zhou, Ao Liu, Shi Han, and Dongmei Zhang. 2022. Table Pre-training: A Survey on Model Architectures, Pre-training Objectives, and Downstream Tasks. In. ACM, New York, NY, USA, 14 pages.\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language  14097143220837798334  DOCUMENT           #        en        1.00\n",
+      "1   metadata   1418740895044524822  DOCUMENT   #/texts/1     title        1.00\n",
+      "2   metadata  11144305653262557381  DOCUMENT   #/texts/8  abstract        1.00\n",
+      "3   metadata   8767759704357325899  DOCUMENT   #/texts/9  abstract        1.00\n",
+      "4   metadata   5892269567500855356  DOCUMENT  #/texts/10  abstract        1.00\n",
+      "5   metadata   1349658109544879525  DOCUMENT  #/texts/11  abstract        1.00\n",
+      "6   metadata  11144279514693801820  DOCUMENT  #/texts/12  abstract        1.00\n",
+      "7   metadata   1144729129087448562  DOCUMENT  #/texts/13  abstract        1.00\n",
+      "8   metadata    394174642402172125  DOCUMENT  #/texts/14  abstract        1.00\n",
+      "9   metadata  17058277238831647723  DOCUMENT  #/texts/15  abstract        1.00\n",
+      "10  language   6717765264948266926      TEXT   #/texts/0        en        0.27\n",
+      "11  semantic   6717765264948266926      TEXT   #/texts/0      text        0.98\n",
       "2205.00347.pdf\n",
-      "Empty DataFrame\n",
-      "Columns: [subtype, subj_path, name]\n",
-      "Index: []\n",
+      "title:  LayoutBERT: Masked Language Layout Model for Object Insertion\n",
+      "abstract:  ['Image compositing is one of the most fundamental steps in creative workflows. It involves taking objects/parts of several images to create a new image, called a composite. Currently, this process is done manually by creating accurate masks of objects to be inserted and carefully blending them with the target scene or images, usually with the help of tools such as Photoshop or GIMP. While there have been several works on automatic selection of objects for creating masks, the problem of object placement within an image with the correct position, scale, and harmony remains a difficult problem with limited exploration. Automatic object insertion in images or designs is a difficult problem as it requires understanding of the scene geometry and the color harmony between objects. We propose LayoutBERT for the object insertion task. It uses a novel self-supervised masked language model objective and bidirectional multi-head self-attention. It outperforms previous layout-based likelihood models and shows favorable properties in terms of model capacity. We demonstrate the effectiveness of our approach for object insertion in the image compositing setting and other settings like documents and design templates. We further demonstrate the usefulness of the learned representations for layout-based retrieval tasks. We provide both qualitative and quantitative evaluations on datasets from diverse domains like COCO, PublayNet, and two new datasets which we call Image Layouts and Template Layouts. Image Layouts which consists of 5.8 million images with layout annotations is the largest image layout dataset to our knowledge. We also share ablation study results on the effect of dataset size, model size and class sample size for this task.']\n",
+      "        type             subj_hash subj_name  subj_path   label  confidence\n",
+      "0   language  16542934806182997069  DOCUMENT          #      en        0.99\n",
+      "1   language  12311640805300642491      TEXT  #/texts/0      en        0.87\n",
+      "2   semantic  12311640805300642491      TEXT  #/texts/0    text        1.00\n",
+      "3   language  16817937696810446594      TEXT  #/texts/1      en        0.91\n",
+      "4   semantic  16817937696810446594      TEXT  #/texts/1    text        1.00\n",
+      "5   language   4922519091516066157      TEXT  #/texts/2      en        0.16\n",
+      "6   semantic   4922519091516066157      TEXT  #/texts/2  header        0.94\n",
+      "7   language  11724433687639709379      TEXT  #/texts/3      en        0.90\n",
+      "8   semantic  11724433687639709379      TEXT  #/texts/3    text        1.00\n",
+      "9   language  15566448323773711965      TEXT  #/texts/4      en        0.57\n",
+      "10  semantic  15566448323773711965      TEXT  #/texts/4  header        1.00\n",
+      "11  language  10768863974302381558      TEXT  #/texts/5      en        0.77\n",
       "1911.10683.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Important information that relates to...\n",
-      "1     title  #/texts/2  Image-based table recognition: data, model, an...\n",
-      "2    author  #/texts/2                                           Xu Zhong\n",
-      "3    author  #/texts/2                                 Research Australia\n",
-      "4    author  #/texts/2                                          City Road\n",
-      "5    author  #/texts/3                               Elaheh ShafieiBavani\n",
-      "6    author  #/texts/3                                 Research Australia\n",
-      "7    author  #/texts/3                                          City Road\n",
-      "8    author  #/texts/4                                 Research Australia\n",
-      "9    author  #/texts/4                                          City Road\n",
+      "title:  Image-based table recognition: data, model, and evaluation\n",
+      "abstract:  Abstract-Important information that relates to a specific topic in a document is often organized in tabular format to assist readers with information retrieval and comparison, which may be difficult to provide in natural language. However, tabular data in unstructured digital documents, e.g. Portable Document Format (PDF) and images, are difficult to parse into structured machine-readable format, due to complexity and diversity in their structure and style. To facilitate image-based table recognition with deep learning, we develop and release the largest publicly available table recognition dataset PubTabNet $^{1}$, containing 568k table images with corresponding structured HTML representation. PubTabNet is automatically generated by matching the XML and PDF representations of the scientific articles in PubMed Central TM Open Access Subset (PMCOA). We also propose a novel attention-based encoder-dual-decoder (EDD) architecture that converts images of tables into HTML code. The model has a structure decoder which reconstructs the table structure and helps the cell decoder to recognize cell content. In addition, we propose a new Tree-Edit-Distance-based Similarity (TEDS) metric for table recognition, which more appropriately captures multi-hop cell misalignment and OCR errors than the pre-established metric. The experiments demonstrate that the EDD model can accurately recognize complex tables solely relying on the image representation, outperforming the state-of-the-art by 9.7% absolute TEDS score.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4080901632344816715  DOCUMENT          #         en        1.00\n",
+      "1   metadata  10644900836417579648  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14242290163871738915  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   language   3067541903536160771      TEXT  #/texts/0         en        0.41\n",
+      "4   semantic   3067541903536160771      TEXT  #/texts/0  reference        0.78\n",
+      "5   language  10644900836417579648      TEXT  #/texts/1         en        0.79\n",
+      "6   semantic  10644900836417579648      TEXT  #/texts/1  reference        0.62\n",
+      "7   language  15474928253492353043      TEXT  #/texts/2         en        0.58\n",
+      "8   semantic  15474928253492353043      TEXT  #/texts/2  meta-data        1.00\n",
+      "9   language   9635025618824704809      TEXT  #/texts/3         en        0.46\n",
+      "10  semantic   9635025618824704809      TEXT  #/texts/3  meta-data        1.00\n",
+      "11  language  17849197603200806688      TEXT  #/texts/4         en        0.49\n",
       "2305.06553.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. In this paper, we introduce WeLayout...\n",
-      "1     title  #/texts/2  WeLayout: WeChat Layout Analysis System for th...\n",
-      "2    author  #/texts/2                                    Mingliang Zhang\n",
-      "3    author  #/texts/2                                           Zhen Cao\n",
-      "4    author  #/texts/2                                         Juntao Liu\n",
-      "5    author  #/texts/2                                        Liqiang Niu\n",
-      "6    author  #/texts/2                                       Fandong Meng\n",
-      "7    author  #/texts/2                                           Jie Zhou\n",
-      "8    author  #/texts/3                                        Tencent Inc\n",
+      "title:  WeLayout: WeChat Layout Analysis System for the ICDAR 2023 Competition on Robust Layout Segmentation in Corporate Documents\n",
+      "abstract:  Abstract. In this paper, we introduce WeLayout, a novel system for segmenting the layout of corporate documents, which stands for We Chat Layout Analysis System. Our approach utilizes a sophisticated ensemble of DINO and YOLO models, specifically developed for the ICDAR 2023 Competition on Robust Layout Segmentation. Our method significantly surpasses the baseline, securing a top position 1 on the leaderboard with a mAP of 70.0. To achieve this performance, we concentrated on enhancing various aspects of the task, such as dataset augmentation, model architecture, bounding box refinement, and model ensemble techniques. Additionally, we trained the data separately for each document category to ensure a higher mean submission score. We also developed an algorithm for cell matching to further improve our performance. To identify the optimal weights and IoU thresholds for our model ensemble, we employed a Bayesian optimization algorithm called the Tree-Structured Parzen Estimator. Our approach effectively demonstrates the benefits of combining query-based and anchor-free models for achieving robust layout segmentation in corporate documents.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14956064378562553280  DOCUMENT          #         en        0.98\n",
+      "1   metadata  14484922385533454757  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1897918619985061747  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   language  15158612243538917595      TEXT  #/texts/0         en        0.68\n",
+      "4   semantic  15158612243538917595      TEXT  #/texts/0  reference        0.66\n",
+      "5   language  14484922385533454757      TEXT  #/texts/1         en        0.68\n",
+      "6   semantic  14484922385533454757      TEXT  #/texts/1     header        0.59\n",
+      "7   language  13617141398818891398      TEXT  #/texts/2         en        0.23\n",
+      "8   semantic  13617141398818891398      TEXT  #/texts/2  meta-data        0.99\n",
+      "9   language  17025208375146063947      TEXT  #/texts/3         en        0.57\n",
+      "10  semantic  17025208375146063947      TEXT  #/texts/3  meta-data        0.89\n",
+      "11  language   7171695325311829417      TEXT  #/texts/4         en        0.26\n",
       "2209.04460.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Scientific articles published prior ...\n",
-      "1     title  #/texts/2  Figure and Figure Caption Extraction for Mixed...\n",
-      "2    author  #/texts/2                                         J P Naiman\n",
-      "3    author  #/texts/2                                 Peter K G Williams\n",
-      "4    author  #/texts/4                                     Alyssa Goodman\n",
+      "title:  Figure and Figure Caption Extraction for Mixed Raster and Vector PDFs: Digitization of Astronomical Literature with OCR Features\n",
+      "abstract:  Abstract. Scientific articles published prior to the 'age of digitization' in the late 1990s contain figures which are 'trapped' within their scanned pages. While progress to extract figures and their captions has been made, there is currently no robust method for this process. We present a YOLO-based method for use on scanned pages, post-Optical Character Recognition (OCR), which uses both grayscale and OCR-features. When applied to the astrophysics literature holdings of the Astrophysics Data System (ADS), we find F1 scores of 90.9% (92.2%) for figures (figure captions) with the intersection-over-union (IOU) cut-off of 0.9 which is a significant improvement over other state-of-the-art methods. Keywords: scholarly document processing · document layout analysis · astronomy.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  15034260135785917006  DOCUMENT           #         en   \n",
+      "1   metadata   3433926981744971318  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   9462471546895446962  DOCUMENT   #/texts/9   abstract   \n",
+      "3   metadata  10821452764085231010  DOCUMENT  #/texts/10   abstract   \n",
+      "4   language  10280969898410321041      TEXT   #/texts/0         en   \n",
+      "5   semantic  10280969898410321041      TEXT   #/texts/0  reference   \n",
+      "6   language   3433926981744971318      TEXT   #/texts/1         en   \n",
+      "7   semantic   3433926981744971318      TEXT   #/texts/1     header   \n",
+      "8   language   2453516942315491092      TEXT   #/texts/2         en   \n",
+      "9   semantic   2453516942315491092      TEXT   #/texts/2  meta-data   \n",
+      "10  language   4178901503750185596      TEXT   #/texts/3         en   \n",
+      "11  semantic   4178901503750185596      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         1.00  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.68  \n",
+      "5         0.82  \n",
+      "6         0.82  \n",
+      "7         0.54  \n",
+      "8         0.24  \n",
+      "9         0.93  \n",
+      "10        0.20  \n",
+      "11        1.00  \n",
       "2207.06695.pdf\n",
-      "     subtype   subj_path                                               name\n",
-      "0   abstract           #  This paper presents DavarOCR, an open-source t...\n",
-      "1      title   #/texts/2  DavarOCR: A Toolbox for OCR and Multi-Modal Do...\n",
-      "2     author   #/texts/2   Liang Qiao Hikvision Research Institute Hangzhou\n",
-      "3     author   #/texts/3    Hui Jiang Hikvision Research Institute Hangzhou\n",
-      "4     author   #/texts/4    Ying Chen Hikvision Research Institute Hangzhou\n",
-      "5     author   #/texts/6   Pengfei Li Hikvision Research Institute Hangzhou\n",
-      "6     author   #/texts/7  Zaisheng Li Hikvision Research Institute Hangzhou\n",
-      "7     author   #/texts/8   Baorui Zou Hikvision Research Institute Hangzhou\n",
-      "8     author   #/texts/9   Dashan Guo Hikvision Research Institute Hangzhou\n",
-      "9     author  #/texts/10    Yingda Xu Hikvision Research Institute Hangzhou\n",
-      "10    author  #/texts/11     Yunlu Xu Hikvision Research Institute Hangzhou\n",
-      "11    author  #/texts/12                                     Zhanzhan Cheng\n",
-      "12    author  #/texts/12              Hikvision Research Institute Hangzhou\n",
-      "13    author  #/texts/13       Yi Niu Hikvision Research Institute Hangzhou\n",
+      "title:  DavarOCR: A Toolbox for OCR and Multi-Modal Document Understanding\n",
+      "abstract:  ABSTRACT This paper presents DavarOCR, an open-source toolbox for OCR and document understanding tasks. DavarOCR currently implements 19 advanced algorithms, covering 9 different task forms. DavarOCR provides detailed usage instructions and the trained models for each algorithm. Compared with the previous opensource OCR toolbox, DavarOCR has relatively more complete support for the sub-tasks of the cutting-edge technology of document understanding. In order to promote the development and application of OCR technology in academia and industry, we pay more attention to the use of modules that different sub-domains of technology can share. DavarOCR is publicly released at https: //github.com/hikopensource/Davar-Lab-OCR. CCS CONCEPTS · Computing methodologies → Computer vision problems. KEYWORDS Open-source, OCR, Document Understanding ACM Reference Format: Liang Qiao, Hui Jiang, Ying Chen, Can Li, Pengfei Li, Zaisheng Li, Baorui Zou, Dashan Guo, Yingda Xu, Yunlu Xu, Zhanzhan Cheng $^{∗}$, and Yi Niu. 2022. DavarOCR: A Toolbox for OCR and Multi-Modal Document Understanding. In Proceedings of the 30th ACM International Conference on Multimedia (MM '22), October 10-14, 2022, Lisboa, Portugal. ACM, New York, NY, USA, 4 pages. https://doi.org/10.1145/3503161.3548547 ACM ISBN 978-1-4503-9203-7/22/10...$15.00\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language  10578670118810806432  DOCUMENT           #        en        1.00\n",
+      "1   metadata  14078910730148120710  DOCUMENT   #/texts/1     title        1.00\n",
+      "2   metadata   9804854515977808943  DOCUMENT  #/texts/14  abstract        1.00\n",
+      "3   metadata  12499147061936475091  DOCUMENT  #/texts/15  abstract        1.00\n",
+      "4   metadata  13885398359714424134  DOCUMENT  #/texts/16  abstract        1.00\n",
+      "5   metadata   7460045827015547000  DOCUMENT  #/texts/17  abstract        1.00\n",
+      "6   metadata   9804754606106263206  DOCUMENT  #/texts/18  abstract        1.00\n",
+      "7   metadata  13984942241077387243  DOCUMENT  #/texts/19  abstract        1.00\n",
+      "8   metadata   1459573036359563303  DOCUMENT  #/texts/20  abstract        1.00\n",
+      "9   metadata  10320331568267550096  DOCUMENT  #/texts/21  abstract        1.00\n",
+      "10  metadata   9842410885214878497  DOCUMENT  #/texts/22  abstract        1.00\n",
+      "11  language  17753305227894791928      TEXT   #/texts/0        en        0.55\n",
       "2204.12974.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Existing image captioning systems are dedicate...\n",
-      "1     title  #/texts/2  CapOnImage: Context-driven Dense-Captioning on...\n",
-      "2    author  #/texts/2                                           Yiqi Gao\n",
-      "3    author  #/texts/3                                        Xinglin Hou\n",
-      "4    author  #/texts/4                                     Yuanmeng Zhang\n",
-      "5    author  #/texts/5                                        Tiezheng Ge\n",
-      "6    author  #/texts/6                                       Yuning Jiang\n",
-      "7    author  #/texts/7                                          Peng Wang\n",
+      "title:  CapOnImage: Context-driven Dense-Captioning on Image\n",
+      "abstract:  ABSTRACT Existing image captioning systems are dedicated to generating narrative captions for images, which are spatially detached from the image in presentation. However, texts can also be used as decorations on the image to highlight the key points and increase the attractiveness of images. In this work, we introduce a new task called captioning on image (CapOnImage), which aims to generate dense captions at different locations of the image based on contextual information. To fully exploit the surrounding visual context to generate the most suitable caption for each location, we propose a multi-modal pre-training model with multi-level pre-training tasks that progressively learn the correspondence between texts and image locations from easy to difficult. Since the model may generate redundant captions for nearby locations, we further enhance the location embedding with neighbor locations as context. For this new task, we also introduce a large-scale benchmark called CapOn-Image2M, which contains 2.1 million product images, each with an average of 4.8 spatially localized captions. Compared with other image captioning model variants, our model achieves the best results in both captioning accuracy and diversity aspects. We will make code and datasets public to facilitate future research.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14357711412048741219  DOCUMENT          #         en        0.99\n",
+      "1   metadata  13423398989423713140  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata    222532995322659675  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "3   metadata   6373866757109498540  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "4   language   3594251336999786412      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic   3594251336999786412      TEXT  #/texts/0       text        0.69\n",
+      "6   language  13423398989423713140      TEXT  #/texts/1         en        0.48\n",
+      "7   semantic  13423398989423713140      TEXT  #/texts/1     header        0.75\n",
+      "8   language   6919689940402428871      TEXT  #/texts/2         en        0.40\n",
+      "9   semantic   6919689940402428871      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   7331669175832383025      TEXT  #/texts/3         nl        0.24\n",
+      "11  semantic   7331669175832383025      TEXT  #/texts/3  meta-data        0.99\n",
       "2305.10825.pdf\n",
-      "   subtype   subj_path                                          name\n",
-      "0   author   #/texts/3  Haoxing Chen Nanjing University Tiansuan Lab\n",
-      "1    title   #/texts/3                                             B\n",
-      "2   author   #/texts/4                                  Zhangxuan Gu\n",
-      "3   author   #/texts/6                                    Xing Zheng\n",
-      "4   author   #/texts/8                                 Changhua Meng\n",
-      "5   author  #/texts/10                                     Zhuoer Xu\n",
-      "6   author  #/texts/12                                       Jun Lan\n",
-      "7   author  #/texts/14                                     Yaohui Li\n",
-      "8   author  #/texts/15                                Research Group\n",
-      "9   author  #/texts/15                    Nanjing University Nanjing\n",
-      "10  author  #/texts/16                                    Huijia Zhu\n",
-      "11  author  #/texts/18                                 Weiqiang Wang\n",
+      "title:  B\n",
+      "abstract:  ['Diffusion model based language-guided image editing has achieved great success recently. However, existing state-of-the-art diffusion models struggle with rendering correct text and text style during generation. To tackle this problem, we propose a universal self-supervised text editing diffusion model (DiffUTE), which aims to replace or modify words in the source image with another one while maintaining its realistic appearance. Specifically, we build our model on a diffusion model and carefully modify the network structure to enable the model for drawing multilingual characters with the help of glyph and position information. Moreover, we design a self-supervised learning framework to leverage large amounts of web data to improve the representation ability of the model. Experimental results show that our method achieves an impressive performance and enables controllable editing on in-the-wild images with high fidelity. Our code will be avaliable in \\\\url{https://github.com/chenhaoxing/DiffUTE}.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6514666095637483066  DOCUMENT          #         en        0.99\n",
+      "1   metadata  12998565238298329342  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   language   8636953316538300383      TEXT  #/texts/0         en        0.71\n",
+      "3   semantic   8636953316538300383      TEXT  #/texts/0  reference        0.66\n",
+      "4   language  13544356125384020866      TEXT  #/texts/1         en        0.35\n",
+      "5   semantic  13544356125384020866      TEXT  #/texts/1     header        0.92\n",
+      "6   language  12998565238298329342      TEXT  #/texts/2         de        1.00\n",
+      "7   semantic  12998565238298329342      TEXT  #/texts/2     header        1.00\n",
+      "8   language   3876166548510224187      TEXT  #/texts/3         en        0.56\n",
+      "9   semantic   3876166548510224187      TEXT  #/texts/3  meta-data        0.99\n",
+      "10  language   2124254991731670016      TEXT  #/texts/4         en        0.59\n",
+      "11  semantic   2124254991731670016      TEXT  #/texts/4  meta-data        0.90\n",
       "2302.11583.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Scientific articles published prior to the 'ag...\n",
-      "1     title  #/texts/2  The Digitization of Historical Astrophysical L...\n",
-      "2    author  #/texts/2                                      Jill P Naiman\n",
-      "3    author  #/texts/2                                 Peter K G Williams\n",
-      "4    author  #/texts/2                                     Alyssa Goodman\n",
+      "title:  The Digitization of Historical Astrophysical Literature with Highly-Localized Figures and Figure Captions\n",
+      "abstract:  Abstract Scientific articles published prior to the 'age of digitization' in the late 1990s contain figures which are 'trapped' within their scanned pages. While progress to extract figures and their captions has been made, there is currently no robust method for this process. We present a YOLO-based method for use on scanned pages, after they have been processed with Optical Character Recognition (OCR), which uses both grayscale and OCR-features. We focus our efforts on translating the intersection-overunion (IOU) metric from the field of object detection to document layout analysis and quantify 'high localization' levels as an IOU of 0.9. When applied to the astrophysics literature holdings of the NASA Astrophysics Data System (ADS), we find F1 scores of 90.9% (92.2%) for figures (figure captions) with the IOU cut-off of 0.9 which is a significant improvement over other state-of-the-art methods. Keywords: scholarly document processing, document layout analysis, astronomy.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  16161331326838422230  DOCUMENT           #         en   \n",
+      "1   metadata    380514435218640980  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  13231021132440135750  DOCUMENT   #/texts/9   abstract   \n",
+      "3   metadata   2570868380428784219  DOCUMENT  #/texts/10   abstract   \n",
+      "4   metadata   8293005220653308550  DOCUMENT  #/texts/11   abstract   \n",
+      "5   language  11001573406615304101      TEXT   #/texts/0         en   \n",
+      "6   semantic  11001573406615304101      TEXT   #/texts/0       text   \n",
+      "7   language    380514435218640980      TEXT   #/texts/1         en   \n",
+      "8   semantic    380514435218640980      TEXT   #/texts/1     header   \n",
+      "9   language   2659903811506978490      TEXT   #/texts/2         en   \n",
+      "10  semantic   2659903811506978490      TEXT   #/texts/2  meta-data   \n",
+      "11  language  14358587209864171010      TEXT   #/texts/3         en   \n",
+      "\n",
+      "    confidence  \n",
+      "0         1.00  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         1.00  \n",
+      "5         0.33  \n",
+      "6         0.96  \n",
+      "7         0.84  \n",
+      "8         0.66  \n",
+      "9         0.47  \n",
+      "10        0.98  \n",
+      "11        0.68  \n",
       "2203.04814.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Abstract. In this work, we propose Text-Degrad...\n",
-      "1      title  #/texts/1  Text-DIAE: Degradation Invariant Autoencoders ...\n",
-      "2     author  #/texts/1                               Mohamed Ali Souibgui\n",
-      "3     author  #/texts/1                                      Sanket Biswas\n",
-      "4     author  #/texts/1                                       Andres Mafla\n",
-      "5     author  #/texts/2                                   Ali Furkan Biten\n",
-      "6     author  #/texts/2                                        Alicia Forn\n",
-      "7     author  #/texts/2                                  Yousri Kessentini\n",
-      "8     author  #/texts/2                                         Josep Llad\n",
-      "9     author  #/texts/2                                        Lluis Gomez\n",
-      "10    author  #/texts/2                               Dimosthenis Karatzas\n",
-      "11    author  #/texts/3                             Computer Vision Center\n",
-      "12    author  #/texts/3                                    Universitat Aut\n",
+      "title:  Text-DIAE: Degradation Invariant Autoencoders for Text Recognition and Document Enhancement\n",
+      "abstract:  Abstract. In this work, we propose Text-Degradation Invariant Auto Encoder (Text-DIAE) aimed to solve two tasks, text recognition (handwritten or scene-text) and document image enhancement. We define three pretext tasks as learning objectives to be optimized during pretraining without the usage of labelled data. Each of the pre-text objectives is specifically tailored for the final downstream tasks. We conduct several ablation experiments that show the importance of each degradation for a specific domain. Exhaustive experimentation shows that our method does not have limitations of previous state-of-the-art based on contrastive losses while at the same time requiring essentially fewer data samples to converge. Finally, we demonstrate that our method surpasses the state-of-the-art significantly in existing supervised and selfsupervised settings in handwritten and scene text recognition and document image enhancement. Our code and trained models will be made publicly available at http://Upon_Acceptance. Keywords: Self-Supervised Learning, Handwritten Text Recognition, Scene-Text Recognition, Document Image Enhancement.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14746165682703908176  DOCUMENT          #         en        0.99\n",
+      "1   metadata   1239120697244612174  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata  12077914615450139164  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   4888985162782367430  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   1239120697244612174      TEXT  #/texts/0         en        0.36\n",
+      "5   semantic   1239120697244612174      TEXT  #/texts/0     header        0.91\n",
+      "6   language   7357589549043404236      TEXT  #/texts/1         es        0.28\n",
+      "7   semantic   7357589549043404236      TEXT  #/texts/1  meta-data        0.99\n",
+      "8   language   3561303740234063321      TEXT  #/texts/2         es        0.22\n",
+      "9   semantic   3561303740234063321      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  13046124633699204353      TEXT  #/texts/3         en        0.28\n",
+      "11  semantic  13046124633699204353      TEXT  #/texts/3  meta-data        0.95\n",
       "2110.02069.pdf\n",
-      "  subtype  subj_path                                               name\n",
-      "0   title  #/texts/2  OPAD: An Optimized Policy-based Active Learnin...\n",
-      "1  author  #/texts/2                                      Sumit Shekhar\n",
-      "2  author  #/texts/2                               Adobe Research India\n",
-      "3  author  #/texts/4                               Adobe Research India\n",
-      "4  author  #/texts/5                                       Ishan Jindal\n",
-      "5  author  #/texts/5                                      Roorkee India\n",
-      "6  author  #/texts/6                                        Avneet Jain\n",
-      "7  author  #/texts/6                                      Roorkee India\n",
+      "title:  OPAD: An Optimized Policy-based Active Learning Framework for Document Content Analysis\n",
+      "abstract:  ABSTRACT\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16825692768564201021  DOCUMENT          #         en        0.94\n",
+      "1   metadata   4532215096900581724  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  16063359772842428263  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   language  13727066778969946399      TEXT  #/texts/0         en        0.43\n",
+      "4   semantic  13727066778969946399      TEXT  #/texts/0       text        0.69\n",
+      "5   language   4532215096900581724      TEXT  #/texts/1         en        0.57\n",
+      "6   semantic   4532215096900581724      TEXT  #/texts/1     header        0.84\n",
+      "7   language  14339003219097135650      TEXT  #/texts/2         en        0.41\n",
+      "8   semantic  14339003219097135650      TEXT  #/texts/2  meta-data        0.98\n",
+      "9   language   5884331416575885239      TEXT  #/texts/3         en        0.66\n",
+      "10  semantic   5884331416575885239      TEXT  #/texts/3  meta-data        1.00\n",
+      "11  language  14652148498818357478      TEXT  #/texts/4         en        0.16\n",
       "2106.07359.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Extracting metadata from scientific p...\n",
-      "1     title  #/texts/2  MexPub: Deep Transfer Learning for Metadata Ex...\n",
-      "2    author  #/texts/2  Zeyd Boukhers Nada Beili Timo Hartmann Prantik...\n",
-      "3    author  #/texts/3                                     Landau Koblenz\n",
+      "title:  MexPub: Deep Transfer Learning for Metadata Extraction from German Publications\n",
+      "abstract:  Abstract-Extracting metadata from scientific papers can be considered as a solved problem in NLP due to the high accuracy of state-of-the-art methods. However, this does not apply to German scientific publications, which have a variety of styles and layouts. In contrast to most of the English scientific publications that follow standard and simple layouts, the order, content, position and size of metadata in German publications vary greatly among publications. This variety makes traditional NLP methods fail to accurately extract metadata from these publications. In this paper, we present a method that extracts metadata from PDF documents with different layouts and styles by viewing the document as an image. We used Mask R-CNN that is trained on COCO dataset and finetuned with PubLayNet dataset that consists of 200K PDF snapshots with five basic classes (e.g. text, figure, etc). We refine-tuned the model on our proposed synthetic dataset consisting of 30K article snapshots to extract nine patterns (i.e. author, title, etc). Our synthetic dataset is generated using contents in both languages German and English and a finite set of challenging templates obtained from German publications. Our method achieved an average accuracy of around 90% which validates its capability to accurately extract metadata from a variety of PDF documents with challenging templates. Index Terms-author name disambiguation, entity linkage, bibliographic data, neural networks, classification\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17314565271857111086  DOCUMENT          #         en        1.00\n",
+      "1   metadata  16695866139959375602  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3770385164590787878  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  11106268346932843180  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   4288696785669740737      TEXT  #/texts/0         en        0.38\n",
+      "5   semantic   4288696785669740737      TEXT  #/texts/0  reference        0.67\n",
+      "6   language  16695866139959375602      TEXT  #/texts/1         en        0.80\n",
+      "7   semantic  16695866139959375602      TEXT  #/texts/1     header        0.52\n",
+      "8   language   2881893159174956500      TEXT  #/texts/2         en        0.40\n",
+      "9   semantic   2881893159174956500      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   8155322520631945055      TEXT  #/texts/3         en        0.49\n",
+      "11  semantic   8155322520631945055      TEXT  #/texts/3  meta-data        0.89\n",
       "2308.14397.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-In this research paper, we introduce ...\n",
-      "1     title  #/texts/2  Ensemble of Anchor-Free Models for Robust Bang...\n",
-      "2    author  #/texts/2                                Mong Sain Chak Dept\n",
-      "3    author  #/texts/3                                   Asib Rahman Dept\n",
+      "title:  Ensemble of Anchor-Free Models for Robust Bangla Document Layout Segmentation\n",
+      "abstract:  Abstract-In this research paper, we introduce a novel approach designed for the purpose of segmenting the layout of Bangla documents. Our methodology involves the utilization of a sophisticated ensemble of YOLOv8 models, which were trained for the DL Sprint 2.0-BUET CSE Fest 2023 Competition focused on Bangla document layout segmentation. Our primary emphasis lies in enhancing various aspects of the task, including techniques such as image augmentation, model architecture, and the incorporation of model ensembles. We deliberately reduce the quality of a subset of document images to enhance the resilience of model training, thereby resulting in an improvement in our cross-validation score. By employing Bayesian optimization, we determine the optimal confidence and Intersection over Union (IoU) thresholds for our model ensemble. Through our approach, we successfully demonstrate the effectiveness of anchor-free models in achieving robust layout segmentation in Bangla documents.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4186796933444337650  DOCUMENT          #         en        1.00\n",
+      "1   metadata  15963578782981549677  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7869003996707617300  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   language   1586940549653234579      TEXT  #/texts/0         en        0.27\n",
+      "4   semantic   1586940549653234579      TEXT  #/texts/0  reference        0.86\n",
+      "5   language  15963578782981549677      TEXT  #/texts/1         en        0.51\n",
+      "6   semantic  15963578782981549677      TEXT  #/texts/1     header        0.93\n",
+      "7   language  16607079952244636778      TEXT  #/texts/2         en        0.70\n",
+      "8   semantic  16607079952244636778      TEXT  #/texts/2  meta-data        1.00\n",
+      "9   language   3645630920813888449      TEXT  #/texts/3         en        0.72\n",
+      "10  semantic   3645630920813888449      TEXT  #/texts/3  meta-data        1.00\n",
+      "11  language   7869003996707617300      TEXT  #/texts/4         en        0.90\n",
       "2301.06629.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Designing visually appealing layouts for multi...\n",
-      "1     title  #/texts/2  Diverse Multimedia Layout Generation with Mult...\n",
-      "2    author  #/texts/2                                     David D Nguyen\n",
-      "3    author  #/texts/2                                   Sydney Australia\n",
-      "4    author  #/texts/3                                        Surya Nepal\n",
-      "5    author  #/texts/4                                    Salil S Kanhere\n",
-      "6    author  #/texts/4                                   Sydney Australia\n",
+      "title:  Diverse Multimedia Layout Generation with Multi Choice Learning\n",
+      "abstract:  ABSTRACT Designing visually appealing layouts for multimedia documents containing text, graphs and images requires a form of creative intelligence. Modelling the generation of layouts has recently gained attention due to its importance in aesthetics and communication style. In contrast to standard prediction tasks, there are a range of acceptable layouts which depend on user preferences. For example, a poster designer may prefer logos on the top-left while another prefers logos on the bottom-right. Both are correct choices yet existing machine learning models treat layouts as a single choice prediction problem. In such situations, these models would simply average over all possible choices given the same input forming a degenerate sample. In the above example, this would form an unacceptable layout with a logo in the centre. In this paper, we present an auto-regressive neural network architecture, called LayoutMCL, that uses multi-choice prediction and winner-takes-all loss to effectively stabilise layout generation. LayoutMCL avoids the averaging problem by using multiple predictors to learn a range of possible options for each layout object. This enables LayoutMCL to generate multiple and diverse layouts from a single input which is in contrast with existing approaches © 2021 Association for Computing Machinery. https://doi.org/10.1145/3474085.3475525 which yield similar layouts with minor variations. Through quantitative benchmarks on real data (magazine, document and mobile app layouts), we demonstrate that LayoutMCL reduces Fréchet Inception Distance (FID) by 83-98% and generates significantly more diversity in comparison to existing approaches. CCS CONCEPTS · Computing methodologies → Neural networks; Mixture modeling; · Applied computing → Multi / mixed media creation. KEYWORDS multimedia applications, neural networks, generative models, creative intelligence, layouts, multi-choice learning, mixture models ACM Reference Format: David D. 
Nguyen, Surya Nepal, and Salil S. Kanhere. 2021. Diverse Multimedia Layout Generation with Multi Choice Learning. In Proceedings of the 29th ACM International Conference on Multimedia (MM '21), October 20-24, 2021, Virtual Event, China. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/3474085.3475525\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language  13531002653667033373  DOCUMENT           #        en        0.99\n",
+      "1   metadata   2484448217154416415  DOCUMENT   #/texts/1     title        1.00\n",
+      "2   metadata   6494237988358185759  DOCUMENT   #/texts/5  abstract        1.00\n",
+      "3   metadata   8217380235620815139  DOCUMENT   #/texts/6  abstract        1.00\n",
+      "4   metadata   5607695864583266366  DOCUMENT   #/texts/7  abstract        1.00\n",
+      "5   metadata  17573229298095673916  DOCUMENT   #/texts/8  abstract        1.00\n",
+      "6   metadata   3607335504213949949  DOCUMENT   #/texts/9  abstract        1.00\n",
+      "7   metadata   3232040026906145710  DOCUMENT  #/texts/10  abstract        1.00\n",
+      "8   metadata  11729361395974924344  DOCUMENT  #/texts/11  abstract        1.00\n",
+      "9   metadata  10132473083317424389  DOCUMENT  #/texts/12  abstract        1.00\n",
+      "10  metadata   6494669723362407576  DOCUMENT  #/texts/13  abstract        1.00\n",
+      "11  metadata   9741072581454175100  DOCUMENT  #/texts/14  abstract        1.00\n",
       "2205.13724.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We propose V-Doc, a question-answering tool us...\n",
-      "1     title  #/texts/1    V-Doc : Visual questions answers with Documents\n",
-      "2    author  #/texts/1                                         Yihao Ding\n",
-      "3    author  #/texts/1                                          Zhe Huang\n",
-      "4    author  #/texts/1                                        Runlin Wang\n",
-      "5    author  #/texts/1                                         Hang Zhang\n",
-      "6    author  #/texts/1                                        Xianru Chen\n",
-      "7    author  #/texts/1                                         Yuzhong Ma\n",
-      "8    author  #/texts/1                                      Hyunsuk Chung\n",
-      "9    author  #/texts/1                                   Soyeon Caren Han\n",
+      "title:  V-Doc : Visual questions answers with Documents\n",
+      "abstract:  Abstract We propose V-Doc, a question-answering tool using document images and PDF, mainly for researchers and general non-deep learning experts looking to generate, process, and understand the document visual question answering tasks. The V-Doc supports generating and using both extractive and abstractive question-answer pairs using documents images. The extractive QA selects a subset of tokens or phrases from the document contents to predict the answers, while the abstractive QA recognises the language in the content and generates the answer based on the trained model. Both aspects are crucial to understanding the documents, especially in an image format. We include a detailed scenario of question generation for the abstractive QA task. V-Doc supports a wide range of datasets and models, and is highly extensible through a declarative, framework-agnostic platform. 1\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9416824584998558997  DOCUMENT          #         en        1.00\n",
+      "1   metadata   4138600041242086037  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata  13003046497338185974  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   1338064323689945546  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   4138600041242086037      TEXT  #/texts/0         en        0.71\n",
+      "5   semantic   4138600041242086037      TEXT  #/texts/0       text        0.79\n",
+      "6   language  12653076769892651762      TEXT  #/texts/1         en        0.28\n",
+      "7   semantic  12653076769892651762      TEXT  #/texts/1  meta-data        1.00\n",
+      "8   language  14589044734228421915      TEXT  #/texts/2         en        0.58\n",
+      "9   semantic  14589044734228421915      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  13003046497338185974      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic  13003046497338185974      TEXT  #/texts/3     header        0.93\n",
       "2210.05391.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  A large amount of document data exists in unst...\n",
-      "1     title  #/texts/3  Chenxia Li, Ruoyu Guo, Jun Zhou, Mengtao An, Y...\n",
-      "2    author  #/texts/3                                          Baidu Inc\n",
+      "title:  Chenxia Li, Ruoyu Guo, Jun Zhou, Mengtao An, Yuning Du, Lingfeng Zhu, Yi Liu, Xiaoguang Hu, Dianhai Yu\n",
+      "abstract:  Abstract A large amount of document data exists in unstructured form such as raw images without any text information. Designing a practical document image analysis system is a meaningful but challenging task. In previous work, we proposed an intelligent document analysis system PP-Structure. In order to further upgrade the function and performance of PP-Structure, we propose PP-StructureV2 in this work, which contains two subsystems: Layout Information Extraction and Key Information Extraction. Firstly, we integrate Image Direction Correction module and Layout Restoration module to enhance the functionality of the system. Secondly, 8 practical strategies are utilized in PP-StructureV2 for better performance. For Layout Analysis model, we introduce ultra lightweight detector PP-PicoDet and knowledge distillation algorithm FGD for model lightweighting, which increased the inference speed by 11 times with comparable mAP. For Table Recognition model, we utilize PP-LCNet, CSP-PAN and SLAHead to optimize the backbone module, feature fusion module and decoding module, respectively, which improved the table structure accuracy by 6% with comparable inference speed. For Key Information Extraction model, we introduce VI-LayoutXLM which is a visual-feature independent LayoutXLM architecture, TB-YX sorting algorithm and U-DML knowledge distillation algorithm, which brought 2.8% and 9.1% improvement respectively on the Hmean of Semantic Entity Recognition and Relation Extraction tasks. All the above mentioned models and code are open-sourced in the GitHub repository PaddleOCR $^{1}$.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10369181880053115369  DOCUMENT          #         en        1.00\n",
+      "1   metadata  12408328708422733217  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   metadata   6111303043411909073  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   9351761072530796738  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   8875645199832918788      TEXT  #/texts/0         en        0.48\n",
+      "5   semantic   8875645199832918788      TEXT  #/texts/0  reference        0.81\n",
+      "6   language   5481805245648485323      TEXT  #/texts/1         en        0.31\n",
+      "7   semantic   5481805245648485323      TEXT  #/texts/1     header        0.61\n",
+      "8   language  12408328708422733217      TEXT  #/texts/2         en        0.67\n",
+      "9   semantic  12408328708422733217      TEXT  #/texts/2  reference        0.69\n",
+      "10  language   1681836491796280974      TEXT  #/texts/3         en        0.78\n",
+      "11  semantic   1681836491796280974      TEXT  #/texts/3  meta-data        0.99\n",
       "2306.02815.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  The extraction of text in high quality is esse...\n",
-      "1     title  #/texts/2  Transformer-Based UNet with Multi-Headed Cross...\n",
+      "title:  Transformer-Based UNet with Multi-Headed Cross-Attention Skip Connections to Eliminate Artifacts in Scanned Documents\n",
+      "abstract:  Abstract The extraction of text in high quality is essential for textbased document analysis tasks like Document Classification or Named Entity Recognition. Unfortunately, this is not always ensured, as poor scan quality and the resulting artifacts lead to errors in the Optical Character Recognition (OCR) process. Current approaches using Convolutional Neural Networks show promising results for background removal tasks but fail correcting artifacts like pixelation or compression errors. For general images, Transformer backbones are getting integrated more frequently in wellknown neural network structures for denoising tasks. In this work, a modified UNet structure using a Swin Transformer backbone is presented to remove typical artifacts in scanned documents. Multi-headed cross-attention skip connections are used to more selectively learn features in respective levels of abstraction. The performance of this approach is examined regarding compression errors, pixelation and random noise. An improvement in text extraction quality with a reduced error rate of up to 53.9% on the synthetic data is archived. The pretrained base-model can be easily adopted to new artifacts. The cross-attention skip connections allow to integrate textual information extracted from the encoder or in form of commands to more selectively control the models outcome. The latter is shown by means of an example application.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14009397682830537636  DOCUMENT          #         en        1.00\n",
+      "1   metadata   1856000564203785991  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3087064519284633291  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   3447213099923161850  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language  14896720172907815422      TEXT  #/texts/0         en        0.32\n",
+      "5   semantic  14896720172907815422      TEXT  #/texts/0  reference        0.95\n",
+      "6   language   1856000564203785991      TEXT  #/texts/1         en        0.83\n",
+      "7   semantic   1856000564203785991      TEXT  #/texts/1     header        0.82\n",
+      "8   language   8235066263506277257      TEXT  #/texts/2         en        0.39\n",
+      "9   semantic   8235066263506277257      TEXT  #/texts/2  meta-data        0.97\n",
+      "10  language   4742752365505202138      TEXT  #/texts/3         en        0.44\n",
+      "11  semantic   4742752365505202138      TEXT  #/texts/3  meta-data        0.95\n",
       "2211.04934.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Business documents come in a variety of struct...\n",
-      "1     title  #/texts/2  DoSA : A System to Accelerate Annotations on B...\n",
-      "2    author  #/texts/2                                   Neelesh K Shukla\n",
-      "3    author  #/texts/2                                          Amit Vaid\n",
-      "4    author  #/texts/3                 State Street Corporation Bengaluru\n",
+      "title:  DoSA : A System to Accelerate Annotations on Business Documents with Human-in-the-Loop\n",
+      "abstract:  Abstract Business documents come in a variety of structures, formats and information needs which makes information extraction a challenging task. Due to these variations, having a document generic model which can work well across all types of documents and for all the use cases seems far-fetched. For documentspecific models, we would need customized document-specific labels. We introduce DoSA (Do cument S pecific A utomated Annotations), which helps annotators in generating initial annotations automatically using our novel bootstrap approach by leveraging document generic datasets and models. These initial annotations can further be reviewed by a human for correctness. An initial document-specific model can be trained and its inference can be used as feedback for generating more automated annotations. These automated annotations can be reviewed by human-in-the-loop for the correctness and a new improved model can be trained using the current model as pretrained model before going for the next iteration. In this paper, our scope is limited to Form like documents due to limited availability of generic annotated datasets, but this idea can be extended to a variety of other documents as more datasets are built. An open-source ready-to-use implementation is made available on GitHub. 1\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7687356650631646950  DOCUMENT          #         en        1.00\n",
+      "1   metadata     28198017736497714  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11981348536210967890  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  10011946881393212937  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   9783302183418439591      TEXT  #/texts/0         en        0.48\n",
+      "5   semantic   9783302183418439591      TEXT  #/texts/0  reference        0.51\n",
+      "6   language     28198017736497714      TEXT  #/texts/1         en        0.69\n",
+      "7   semantic     28198017736497714      TEXT  #/texts/1     header        0.56\n",
+      "8   language  11165351352510034148      TEXT  #/texts/2         en        0.65\n",
+      "9   semantic  11165351352510034148      TEXT  #/texts/2  meta-data        0.93\n",
+      "10  language   4442204473943079428      TEXT  #/texts/3         en        0.53\n",
+      "11  semantic   4442204473943079428      TEXT  #/texts/3  meta-data        0.68\n",
       "2303.08137.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Controllable layout generation aims at synthes...\n",
-      "1     title  #/texts/2  LayoutDM: Discrete Diffusion Model for Control...\n",
-      "2    author  #/texts/2                                     Kotaro Kikuchi\n",
-      "3    author  #/texts/2                                         Mayu Otani\n",
-      "4    author  #/texts/3                                  Waseda University\n",
+      "title:  LayoutDM: Discrete Diffusion Model for Controllable Layout Generation\n",
+      "abstract:  Abstract Controllable layout generation aims at synthesizing plausible arrangement of element bounding boxes with optional constraints, such as type or position of a specific element. In this work, we try to solve a broad range of layout generation tasks in a single model that is based on discrete state-space diffusion models. Our model, named LayoutDM, naturally handles the structured layout data in the discrete representation and learns to progressively infer a noiseless layout from the initial input, where we model the layout corruption process by modality-wise discrete diffusion. For conditional generation, we propose to inject layout constraints in the form of masking or logit adjustment during inference. We show in the experiments that our LayoutDM successfully generates high-quality layouts and outperforms both task-specific and task-agnostic baselines on several layout tasks. 1\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5274670548434171366  DOCUMENT          #         en        0.98\n",
+      "1   metadata   7423050627833765715  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2463036394959683986  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  12872335772674645887  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   7128723719619351585      TEXT  #/texts/0         en        0.26\n",
+      "5   semantic   7128723719619351585      TEXT  #/texts/0       text        0.89\n",
+      "6   language   7423050627833765715      TEXT  #/texts/1         en        0.56\n",
+      "7   semantic   7423050627833765715      TEXT  #/texts/1     header        0.85\n",
+      "8   language   7379740030068026814      TEXT  #/texts/2         en        0.29\n",
+      "9   semantic   7379740030068026814      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   4654927673697807378      TEXT  #/texts/3         en        0.55\n",
+      "11  semantic   4654927673697807378      TEXT  #/texts/3  meta-data        0.98\n",
       "2203.06947.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Recently, various multimodal networks for Visu...\n",
-      "1     title  #/texts/1  XYLayoutLM: Towards Layout-Aware Multimodal Ne...\n",
-      "2    author  #/texts/1                                       Zhangxuan Gu\n",
-      "3    author  #/texts/1                                      Changhua Meng\n",
-      "4    author  #/texts/1                                            Ke Wang\n",
-      "5    author  #/texts/1                                            Jun Lan\n",
-      "6    author  #/texts/1                                      Weiqiang Wang\n",
-      "7    author  #/texts/1                                            Ming Gu\n",
-      "8    author  #/texts/1                                       Liqing Zhang\n",
-      "9    author  #/texts/1                      Shanghai Jiao Tong University\n",
+      "title:  XYLayoutLM: Towards Layout-Aware Multimodal Networks For Visually-Rich Document Understanding\n",
+      "abstract:  Abstract Recently, various multimodal networks for Visually-Rich Document Understanding(VRDU) have been proposed, showing the promotion of transformers by integrating visual and layout information with the text embeddings. However, most existing approaches utilize the position embeddings to incorporate the sequence information, neglecting the noisy improper reading order obtained by OCR tools. In this paper, we propose a robust layout-aware multimodal network named XYLayoutLM to capture and leverage rich layout information from proper reading orders produced by our Augmented XY Cut. Moreover, a Dilated Conditional Position Encoding module is proposed to deal with the input sequence of variable lengths, and it additionally extracts local layout information from both textual and visual modalities while generating position embeddings. Experiment results show that our XYLayoutLM achieves competitive results on document understanding tasks.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13644024371973421720  DOCUMENT          #         en        1.00\n",
+      "1   metadata   6080215795318562627  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata   8620641516224866452  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  16810561801514819585  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   6080215795318562627      TEXT  #/texts/0         en        0.49\n",
+      "5   semantic   6080215795318562627      TEXT  #/texts/0     header        0.68\n",
+      "6   language   8343217181628732407      TEXT  #/texts/1         en        0.50\n",
+      "7   semantic   8343217181628732407      TEXT  #/texts/1  meta-data        0.98\n",
+      "8   language   4372809223159679403      TEXT  #/texts/2         en        0.42\n",
+      "9   semantic   4372809223159679403      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   9574469604736173385      TEXT  #/texts/3         en        0.14\n",
+      "11  semantic   9574469604736173385      TEXT  #/texts/3  meta-data        0.83\n",
       "2106.00676.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Classifying the core textual components of a s...\n",
-      "1     title  #/texts/2  Incorporating Visual Layout Structures for Sci...\n",
-      "2    author  #/texts/2                                       Zejiang Shen\n",
-      "3    author  #/texts/2                                            Kyle Lo\n",
-      "4    author  #/texts/2                                       Lucy Lu Wang\n",
-      "5    author  #/texts/2                                       Bailey Kuehl\n",
-      "6    author  #/texts/2                                      Daniel S Weld\n",
-      "7    author  #/texts/2                                        Doug Downey\n",
-      "8    author  #/texts/3                                    Allen Institute\n",
+      "title:  Incorporating Visual Layout Structures for Scientific Text Classification\n",
+      "abstract:  Abstract Classifying the core textual components of a scientific paper-title, author, body text, etc.is a critical first step in automated scientific document understanding. Previous work has shown how using elementary layout information, i.e., each token's 2D position on the page, leads to more accurate classification. We introduce new methods for incorporating VIsual LAyout (VILA) structures, e.g., the grouping of page texts into text lines or text blocks, into language models to further improve performance. We show that the I-VILA approach, which simply adds special tokens denoting the boundaries of layout structures into model inputs, can lead to 1.9% Macro F1 improvements for token classification. Moreover, we design a hierarchical model, H-VILA, that encodes the text based on layout structures and record an up-to 47% inference time reduction with less than 1.5% Macro F1 loss for the text classification models. Experiments are conducted on a newly curated evaluation suite, S2-VLUE, with a novel metric measuring classification uniformity within visual groups and a new dataset of gold annotations covering papers from 19 scientific disciplines. Pre-trained weights, benchmark datasets, and source code will be available at https://github.com/allenai/VILA.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8906765692817121331  DOCUMENT          #         en        1.00\n",
+      "1   metadata   6464735184806538810  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  17766533753665278732  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   5357052966551336360  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   8324885902404712044      TEXT  #/texts/0         en        0.41\n",
+      "5   semantic   8324885902404712044      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   6464735184806538810      TEXT  #/texts/1         en        0.49\n",
+      "7   semantic   6464735184806538810      TEXT  #/texts/1     header        0.90\n",
+      "8   language   9600035908089748960      TEXT  #/texts/2         en        0.49\n",
+      "9   semantic   9600035908089748960      TEXT  #/texts/2  meta-data        0.94\n",
+      "10  language  12045148224929084550      TEXT  #/texts/3         en        0.60\n",
+      "11  semantic  12045148224929084550      TEXT  #/texts/3  meta-data        0.99\n",
       "2305.00795.pdf\n",
-      "   subtype  subj_path                                               name\n",
-      "0    title  #/texts/2  SelfDocSeg: A Self-Supervised vision-based App...\n",
-      "1   author  #/texts/2                                     Subhajit Maity\n",
-      "2   author  #/texts/2                                      Sanket Biswas\n",
-      "3   author  #/texts/3                                   Siladittya Manna\n",
-      "4   author  #/texts/3                                      Ayan Banerjee\n",
-      "5   author  #/texts/4                                         Josep Llad\n",
-      "6   author  #/texts/4                                Saumik Bhattacharya\n",
-      "7   author  #/texts/6                          Technology Innovation Hub\n",
-      "8   author  #/texts/7                             Computer Vision Center\n",
-      "9   author  #/texts/7                        Computer Science Department\n",
-      "10  author  #/texts/8                                    Universitat Aut\n",
+      "title:  SelfDocSeg: A Self-Supervised vision-based Approach towards Document Segmentation\n",
+      "abstract:  ['Document layout analysis is a known problem to the documents research community and has been vastly explored yielding a multitude of solutions ranging from text mining, and recognition to graph-based representation, visual feature extraction, etc. However, most of the existing works have ignored the crucial fact regarding the scarcity of labeled data. With growing internet connectivity to personal life, an enormous amount of documents had been available in the public domain and thus making data annotation a tedious task. We address this challenge using self-supervision and unlike, the few existing self-supervised document segmentation approaches which use text mining and textual labels, we use a complete vision-based approach in pre-training without any ground-truth label or its derivative. Instead, we generate pseudo-layouts from the document images to pre-train an image encoder to learn the document object representation and localization in a self-supervised framework before fine-tuning it with an object detection model. We show that our pipeline sets a new benchmark in this context and performs at par with the existing methods and the supervised counterparts, if not outperforms. The code is made publicly available at: https://github.com/MaitySubhajit/SelfDocSeg']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13678012789189708476  DOCUMENT          #         en        0.99\n",
+      "1   metadata  12907604576135823634  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   language   7874936498254575304      TEXT  #/texts/0         en        0.72\n",
+      "3   semantic   7874936498254575304      TEXT  #/texts/0  reference        0.66\n",
+      "4   language  12907604576135823634      TEXT  #/texts/1         en        0.89\n",
+      "5   semantic  12907604576135823634      TEXT  #/texts/1     header        0.84\n",
+      "6   language   5455095828293575748      TEXT  #/texts/2         en        0.53\n",
+      "7   semantic   5455095828293575748      TEXT  #/texts/2  meta-data        0.98\n",
+      "8   language   8702834490821239030      TEXT  #/texts/3         en        0.16\n",
+      "9   semantic   8702834490821239030      TEXT  #/texts/3  meta-data        1.00\n",
+      "10  language   2784524357674996804      TEXT  #/texts/4         en        0.22\n",
+      "11  semantic   2784524357674996804      TEXT  #/texts/4  meta-data        0.99\n",
       "2202.01414.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Digitization of newspapers is of interest for ...\n",
-      "1     title  #/texts/2  DocBed: A Multi-Stage OCR Solution for Documen...\n",
-      "2    author  #/texts/2                                        Wenzhen Zhu\n",
-      "3    author  #/texts/2                                    Negin Sokhandan\n",
-      "4    author  #/texts/2                                         Guang Yang\n",
-      "5    author  #/texts/2                                     Sujitha Martin\n",
-      "6    author  #/texts/2                            Suchitra Sathyanarayana\n",
+      "title:  DocBed: A Multi-Stage OCR Solution for Documents with Complex Layouts\n",
+      "abstract:  Abstract Digitization of newspapers is of interest for many reasons including preservation of history, accessibility and search ability, etc. While digitization of documents such as scientific articles and magazines is prevalent in literature, one of the main challenges for digitization of newspaper lies in its complex layout (e.g. articles spanning multiple columns, text interrupted by images) analysis, which is necessary to preserve human read-order. This work provides a major breakthrough in the digitization of newspapers on three fronts: first, releasing a dataset of 3000 fully-annotated, real-world newspaper images from 21 different U.S. states representing an extensive variety of complex layouts for document layout analysis; second, proposing layout segmentation as a precursor to existing optical character recognition (OCR) engines, where multiple state-of-the-art image segmentation models and several post-processing methods are explored for document layout segmentation; third, providing a thorough and structured evaluation protocol for isolated layout segmentation and endto-end OCR.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   1254548184009159520  DOCUMENT          #         en        1.00\n",
+      "1   metadata   8978338783002004372  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  12292879104215867518  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   1120897889773198344  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  14113261502938670554      TEXT  #/texts/0         en        0.41\n",
+      "5   semantic  14113261502938670554      TEXT  #/texts/0       text        0.90\n",
+      "6   language   8978338783002004372      TEXT  #/texts/1         en        0.78\n",
+      "7   semantic   8978338783002004372      TEXT  #/texts/1     header        0.79\n",
+      "8   language  10271475077874645195      TEXT  #/texts/2         en        0.50\n",
+      "9   semantic  10271475077874645195      TEXT  #/texts/2  meta-data        0.95\n",
+      "10  language   7339311064102247317      TEXT  #/texts/3         en        0.52\n",
+      "11  semantic   7339311064102247317      TEXT  #/texts/3  meta-data        0.92\n",
       "2101.09465.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Web search is an essential way for human to ob...\n",
-      "1     title  #/texts/2  WebSRC: A Dataset for Web-Based Structural Rea...\n",
-      "2    author  #/texts/2                                            Lu Chen\n",
-      "3    author  #/texts/2                                        Xingyu Chen\n",
-      "4    author  #/texts/2                                         Zihan Zhao\n",
-      "5    author  #/texts/2                            Danyang Zhang Jiabao Ji\n",
-      "6    author  #/texts/2                                             Ao Luo\n",
-      "7    author  #/texts/2                                       Yuxuan Xiong\n",
-      "8    author  #/texts/2                                             Kai Yu\n",
-      "9    author  #/texts/3                Shanghai Jiao Tong University Cross\n",
+      "title:  WebSRC: A Dataset for Web-Based Structural Reading Comprehension\n",
+      "abstract:  Abstract Web search is an essential way for human to obtain information, but it's still a great challenge for machines to understand the contents of web pages. In this paper, we introduce the task of web-based structural reading comprehension. Given a web page and a question about it, the task is to find an answer from the web page. This task requires a system not only to understand the semantics of texts but also the structure of the web page. Moreover, we proposed WebSRC, a novel Web-based S tructural R eading C omprehension dataset. WebSRC consists of 0.44M question-answer pairs, which are collected from 6.5K web pages with corresponding HTML source code, screenshots, and metadata. Each question in WebSRC requires a certain structural understanding of a web page to answer, and the answer is either a text span on the web page or yes/no. We evaluate various strong baselines on our dataset to show the difficulty of our task. We also investigate the usefulness of structural information and visual features. Our dataset and task are publicly available at https://speechlab-sjtu. github.io/WebSRC/.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11501099291707251985  DOCUMENT          #         en        1.00\n",
+      "1   metadata   2324440648085679014  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7666771146189822451  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   9356709581928391389  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language  11875601693668658903      TEXT  #/texts/0         en        0.25\n",
+      "5   semantic  11875601693668658903      TEXT  #/texts/0       text        0.98\n",
+      "6   language   2324440648085679014      TEXT  #/texts/1         en        0.53\n",
+      "7   semantic   2324440648085679014      TEXT  #/texts/1     header        0.77\n",
+      "8   language  15969453667391098982      TEXT  #/texts/2         en        0.50\n",
+      "9   semantic  15969453667391098982      TEXT  #/texts/2  meta-data        0.80\n",
+      "10  language  14477852969093793837      TEXT  #/texts/3         en        0.70\n",
+      "11  semantic  14477852969093793837      TEXT  #/texts/3  meta-data        0.88\n",
       "2009.14457.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  In this paper, we propose a multi-task learnin...\n",
-      "1     title  #/texts/2  Towards a Multi-modal, Multi-task Learning bas...\n",
-      "2    author  #/texts/2                                 Subhojeet Pramanik\n",
-      "3    author  #/texts/2                                  Shashank Mujumdar\n",
-      "4    author  #/texts/2                                         Hima Patel\n",
+      "title:  Towards a Multi-modal, Multi-task Learning based Pre-training Framework for Document Representation Learning\n",
+      "abstract:  Abstract In this paper, we propose a multi-task learning-based framework that utilizes a combination of self-supervised and supervised pre-training tasks to learn a generic document representation. We design the network architecture and the pretraining tasks to incorporate the multi-modal document information across text, layout, and image dimensions and allow the network to work with multi-page documents. We showcase the applicability of our pre-training framework on a variety of different real-world document tasks such as document classification, document information extraction, and document retrieval. We conduct exhaustive experiments to compare performance against different ablations of our framework and state-of-the-art baselines. We discuss the current limitations and next steps for our work and make the code available to promote future research in this direction.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15695631869970857141  DOCUMENT          #         en        1.00\n",
+      "1   metadata   9405462448796430699  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1309713858244130926  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  15701970662693362424  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  16793184873001030572      TEXT  #/texts/0         en        0.29\n",
+      "5   semantic  16793184873001030572      TEXT  #/texts/0  reference        0.97\n",
+      "6   language   9405462448796430699      TEXT  #/texts/1         en        0.87\n",
+      "7   semantic   9405462448796430699      TEXT  #/texts/1     header        0.89\n",
+      "8   language  13576085455555947094      TEXT  #/texts/2         en        0.50\n",
+      "9   semantic  13576085455555947094      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  14968621602377449348      TEXT  #/texts/3         en        0.27\n",
+      "11  semantic  14968621602377449348      TEXT  #/texts/3  meta-data        1.00\n",
       "2206.13155.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Multi-modal document pre-trained mode...\n",
-      "1     title  #/texts/2  Bi-VLDoc: Bidirectional Vision-Language Modeli...\n",
-      "2    author  #/texts/2                                         Chuwei Luo\n",
-      "3    author  #/texts/2                                        Guozhi Tang\n",
-      "4    author  #/texts/2                                           Qi Zheng\n",
-      "5    author  #/texts/2                                           Cong Yao\n",
-      "6    author  #/texts/2                                        Lianwen Jin\n",
-      "7    author  #/texts/2                                       Chenliang Li\n",
-      "8    author  #/texts/2                                           Yang Xue\n",
-      "9    author  #/texts/2                                             Luo Si\n",
+      "title:  Bi-VLDoc: Bidirectional Vision-Language Modeling for Visually-Rich Document Understanding\n",
+      "abstract:  Abstract-Multi-modal document pre-trained models have proven to be very effective in a variety of visually-rich document understanding (VrDU) tasks. Though existing document pretrained models have achieved excellent performance on standard benchmarks for VrDU, the way they model and exploit the interactions between vision and language on documents has hindered them from better generalization ability and higher accuracy. In this work, we investigate the problem of vision-language joint representation learning for VrDU mainly from the perspective of supervisory signals. Specifically, a pre-training paradigm called Bi-VLDoc is proposed, in which a bidirectional visionlanguage supervision strategy and a vision-language hybridattention mechanism are devised to fully explore and utilize the interactions between these two modalities, to learn stronger cross-modal document representations with richer semantics. Benefiting from the learned informative cross-modal document representations, Bi-VLDoc significantly advances the state-of-theart performance on three widely-used document understanding benchmarks, including Form Understanding (from 85.14% to 93.44%), Receipt Information Extraction (from 96.01% to 97.84%), and Document Classification (from 96.08% to 97.12%). On Document Visual QA, Bi-VLDoc achieves the state-of-the-art performance compared to previous single model methods. Index Terms-Visually-rich Document Understanding, Document Pre-trained Models, Cross-modal Document Representations.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11596065795325662056  DOCUMENT          #         en        0.99\n",
+      "1   metadata  12526893973429135138  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15409526057872574964  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  17336839330116895928  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language  16857606035249792126      TEXT  #/texts/0         en        0.40\n",
+      "5   semantic  16857606035249792126      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  12526893973429135138      TEXT  #/texts/1         en        0.52\n",
+      "7   semantic  12526893973429135138      TEXT  #/texts/1     header        0.67\n",
+      "8   language  16728676851862945303      TEXT  #/texts/2         en        0.26\n",
+      "9   semantic  16728676851862945303      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   2650850847520354234      TEXT  #/texts/3         en        0.54\n",
+      "11  semantic   2650850847520354234      TEXT  #/texts/3  meta-data        0.98\n",
       "2305.14962.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Transforming documents into machine-...\n",
-      "1     title  #/texts/2  ICDAR 2023 Competition on Robust Layout Segmen...\n",
-      "2    author  #/texts/2                                     Christoph Auer\n",
-      "3    author  #/texts/2                                       Ahmed Nassar\n",
-      "4    author  #/texts/2                                       Maksym Lysak\n",
-      "5    author  #/texts/2                                      Michele Dolfi\n",
-      "6    author  #/texts/3                                Nikolaos Livathinos\n",
-      "7    author  #/texts/3                                        Peter Staar\n",
+      "title:  ICDAR 2023 Competition on Robust Layout Segmentation in Corporate Documents\n",
+      "abstract:  Abstract. Transforming documents into machine-processable representations is a challenging task due to their complex structures and variability in formats. Recovering the layout structure and content from PDF files or scanned material has remained a key problem for decades. IC-DAR has a long tradition in hosting competitions to benchmark the state-of-the-art and encourage the development of novel solutions to document layout understanding. In this report, we present the results of our ICDAR 2023 Competition on Robust Layout Segmentation in Corporate Documents, which posed the challenge to accurately segment the page layout in a broad range of document styles and domains, including corporate reports, technical literature and patents. To raise the bar over previous competitions, we engineered a hard competition dataset and proposed the recent DocLayNet dataset for training. We recorded 45 team registrations and received official submissions from 21 teams. In the presented solutions, we recognize interesting combinations of recent computer vision models, data augmentation strategies and ensemble methods to achieve remarkable accuracy in the task we posed. A clear trend towards adoption of vision-transformer based methods is evident. The results demonstrate substantial progress towards achieving robust and highly generalizing methods for document layout understanding. Keywords: Document Layout Analysis · Machine Learning · Computer Vision · Object Detection · ICDAR Competition\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16179804872697398409  DOCUMENT          #         en        0.99\n",
+      "1   metadata   2549771196302855966  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3783859718727443955  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  14449809084629610892  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   7764449527618403650      TEXT  #/texts/0         en        0.66\n",
+      "5   semantic   7764449527618403650      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   2549771196302855966      TEXT  #/texts/1         en        0.67\n",
+      "7   semantic   2549771196302855966      TEXT  #/texts/1     header        0.51\n",
+      "8   language  17714906341415131143      TEXT  #/texts/2         en        0.21\n",
+      "9   semantic  17714906341415131143      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   7644758239577654701      TEXT  #/texts/3         en        0.44\n",
+      "11  semantic   7644758239577654701      TEXT  #/texts/3  meta-data        0.99\n",
       "2201.09407.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  The document layout analysis (DLA) aims to dec...\n",
-      "1     title  #/texts/2  CROSS-DOMAIN DOCUMENT LAYOUT ANALYSIS VIA UNSU...\n",
-      "2    author  #/texts/2                                        Xingjiao Wu\n",
-      "3    author  #/texts/2                                         Luwei Xiao\n",
-      "4    author  #/texts/2                                      Xiangcheng Du\n",
-      "5    author  #/texts/2                                      Yingbin Zheng\n",
-      "6    author  #/texts/2                                             Xin Li\n",
-      "7    author  #/texts/2                                        Tianlong Ma\n",
-      "8    author  #/texts/3                       East China Normal University\n",
+      "title:  CROSS-DOMAIN DOCUMENT LAYOUT ANALYSIS VIA UNSUPERVISED DOCUMENT STYLE GUIDE\n",
+      "abstract:  ABSTRACT The document layout analysis (DLA) aims to decompose document images into high-level semantic areas (i.e., figures, tables, texts, and background). Creating a DLA framework with strong generalization capabilities is a challenge due to document objects are diversity in layout, size, aspect ratio, texture, etc. Many researchers devoted this challenge by synthesizing data to build large training sets. However, the synthetic training data has different styles and erratic quality. Besides, there is a large gap between the source data and the target data. In this paper, we propose an unsupervised cross-domain DLA framework based on document style guidance. We integrated the document quality assessment and the document cross-domain analysis into a unified framework. Our framework is composed of three components, Document Layout Generator (GLD), Document Elements Decorator(GED), and Document Style Discriminator(DSD). The GLD is used to document layout generates, the GED is used to document layout elements fill, and the DSD is used to document quality assessment and cross-domain guidance. First, we apply GLD to predict the positions of the generated document. Then, we design a novel algorithm based on aesthetic guidance to fill the document positions. Finally, we use contrastive learning to evaluate the quality assessment of the document. Besides, we design a new strategy to change the document quality assessment component into a document cross-domain style guide component. Our framework is an unsupervised document layout analysis framework. We have proved through numerous experiments that our proposed method has achieved remarkable performance. Index Terms-Semantic Segmentation, Docuemnt Layout Analysis, Deep Learning\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13068987446936029579  DOCUMENT          #         en        0.98\n",
+      "1   metadata  10264502586869452829  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   9148518800528223737  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   8168549048811471983  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata  14148857376607305658  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   language  16812019770818315157      TEXT  #/texts/0         en        0.13\n",
+      "6   semantic  16812019770818315157      TEXT  #/texts/0       text        0.88\n",
+      "7   language  10264502586869452829      TEXT  #/texts/1         en        0.36\n",
+      "8   semantic  10264502586869452829      TEXT  #/texts/1     header        0.97\n",
+      "9   language   8747017711417260965      TEXT  #/texts/2         en        0.20\n",
+      "10  semantic   8747017711417260965      TEXT  #/texts/2  meta-data        1.00\n",
+      "11  language  11489218395994698375      TEXT  #/texts/3         en        0.71\n",
       "2106.13802.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document image classification remains a popula...\n",
-      "1     title  #/texts/4  American Family Insurance, Machine Learning Re...\n",
+      "title:  American Family Insurance, Machine Learning Research Group\n",
+      "abstract:  Abstract Document image classification remains a popular research area because it can be commercialized in many enterprise applications across different industries. Recent advancements in large pre-trained computer vision and language models and graph neural networks has lent document image classification many tools. However using large pre-trained models usually requires substantial computing resources which could defeat the costsaving advantages of automatic document image classification. In the paper we propose an efficient document image classification framework that uses graph convolution neural networks and incorporates textual, visual and layout information of the document. Empirical results on both publicly available and real-world data show that our methods achieve near SOTA performance yet require much less computing resources and time for model training and inference. This results in solutions than offer better cost advantages, especially in scalable deployment for enterprise applications.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16898192775396121132  DOCUMENT          #         en        1.00\n",
+      "1   metadata  18047577149978264333  DOCUMENT  #/texts/3      title        1.00\n",
+      "2   metadata   1342019013639942885  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   5364596895525468982  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   2030268891848045074      TEXT  #/texts/0         en        0.36\n",
+      "5   semantic   2030268891848045074      TEXT  #/texts/0  reference        0.95\n",
+      "6   language   7730456910860267844      TEXT  #/texts/1         en        0.56\n",
+      "7   semantic   7730456910860267844      TEXT  #/texts/1     header        0.81\n",
+      "8   language   3576015150755710974      TEXT  #/texts/2         en        0.67\n",
+      "9   semantic   3576015150755710974      TEXT  #/texts/2  reference        0.72\n",
+      "10  language  18047577149978264333      TEXT  #/texts/3         en        0.82\n",
+      "11  semantic  18047577149978264333      TEXT  #/texts/3  reference        0.48\n",
       "2101.10281.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Adobe's Portable Document Format (PDF) is a po...\n",
-      "1     title  #/texts/2   PAWLS : PDF Annotation With Labels and Structure\n",
-      "2    author  #/texts/2                                       Mark Neumann\n",
-      "3    author  #/texts/3                                       Zejiang Shen\n",
-      "4    author  #/texts/4                                     Sam Skjonsberg\n",
+      "title:  PAWLS : PDF Annotation With Labels and Structure\n",
+      "abstract:  Abstract Adobe's Portable Document Format (PDF) is a popular way of distributing view-only documents with a rich visual markup. This presents a challenge to NLP practitioners who wish to use the information contained within PDF documents for training models or data analysis, because annotating these documents is difficult. In this paper, we present PDF Annotation with Labels and Structure (PAWLS), a new annotation tool designed specifically for the PDF document format. PAWLS is particularly suited for mixed-mode annotation and scenarios in which annotators require extended context to annotate accurately. PAWLS supports span-based textual annotation, N-ary relations and freeform, non-textual bounding boxes, all of which can be exported in convenient formats for training multi-modal machine learning models. A read-only PAWLS server is available at https://pawls. apps.allenai.org/ 1 and the source code is available at https://github. com/allenai/pawls.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14568006567697038323  DOCUMENT          #         en        1.00\n",
+      "1   metadata   4039357198175436344  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14712112053459110108  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  16316434293491587175  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language    560399852479874905      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic    560399852479874905      TEXT  #/texts/0       text        0.98\n",
+      "6   language   4039357198175436344      TEXT  #/texts/1         en        0.37\n",
+      "7   semantic   4039357198175436344      TEXT  #/texts/1     header        0.97\n",
+      "8   language   5624206597017791865      TEXT  #/texts/2         en        0.31\n",
+      "9   semantic   5624206597017791865      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  15153810220268008204      TEXT  #/texts/3         en        0.58\n",
+      "11  semantic  15153810220268008204      TEXT  #/texts/3  meta-data        0.99\n",
       "2006.01038.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Document layout analysis usually relies on com...\n",
-      "1      title  #/texts/1              arXiv:2006.01038v1 [cs.CL] 1 Jun 2020\n",
-      "2     author  #/texts/1                                         Minghao Li\n",
-      "3     author  #/texts/1                                          Yiheng Xu\n",
-      "4     author  #/texts/1                                            Lei Cui\n",
-      "5     author  #/texts/1                                      Shaohan Huang\n",
-      "6     author  #/texts/1                                           Furu Wei\n",
-      "7     author  #/texts/1                                         Zhoujun Li\n",
-      "8     author  #/texts/1                                          Ming Zhou\n",
-      "9     author  #/texts/2                                 Beihang University\n",
-      "10    author  #/texts/4                            Microsoft Research Asia\n",
+      "title:  arXiv:2006.01038v1 [cs.CL] 1 Jun 2020\n",
+      "abstract:  Abstract Document layout analysis usually relies on computer vision models to understand documents while ignoring textual information that is vital to capture. Meanwhile, high quality labeled datasets with both visual and textual information are still insufficient. In this paper, we present DocBank, a benchmark dataset with fine-grained token-level annotations for document layout analysis. DocBank is constructed using a simple yet effective way with weak supervision from the L A T E X documents available on the arXiv.com. With DocBank, models from different modalities can be compared fairly and multi-modal approaches will be further investigated and boost the performance of document layout analysis. We build several strong baselines and manually split train/dev/test sets for evaluation. Experiment results show that models trained on DocBank accurately recognize the layout information for a variety of documents. The DocBank dataset will be publicly available at https://github.com/doc-analysis/DocBank.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   2465515363197577403  DOCUMENT          #         en        0.98\n",
+      "1   metadata   9465834195761580083  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata   6884525327918319354  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  12088364625193045527  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   9465834195761580083      TEXT  #/texts/0         en        0.29\n",
+      "5   semantic   9465834195761580083      TEXT  #/texts/0  reference        0.97\n",
+      "6   language   6395967760918650982      TEXT  #/texts/1         en        0.31\n",
+      "7   semantic   6395967760918650982      TEXT  #/texts/1  meta-data        0.90\n",
+      "8   language  14211229016076119859      TEXT  #/texts/2         en        0.38\n",
+      "9   semantic  14211229016076119859      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   1286919575614411010      TEXT  #/texts/3         en        0.73\n",
+      "11  semantic   1286919575614411010      TEXT  #/texts/3  meta-data        1.00\n",
       "2303.05049.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Layout generation aims to synthesize realistic...\n",
-      "1     title  #/texts/2  Unifying Lay out Generation with a Decoupled D...\n",
-      "2    author  #/texts/2                                     Zhizheng Zhang\n",
-      "3    author  #/texts/2                                       Xiaoyi Zhang\n",
-      "4    author  #/texts/2                                        Wenxuan Xie\n",
-      "5    author  #/texts/2                                        Yuwang Wang\n",
-      "6    author  #/texts/2                                             Yan Lu\n",
-      "7    author  #/texts/3                                Jiaotong University\n",
-      "8    author  #/texts/3                            Microsoft Research Asia\n",
-      "9    author  #/texts/3                                Tsinghua University\n",
+      "title:  Unifying Lay out Generation with a Decoupled Diffusion Model\n",
+      "abstract:  Abstract Layout generation aims to synthesize realistic graphic scenes consisting of elements with different attributes including category, size, position, and between-element relation. It is a crucial task for r educing the burden on heavyduty graphic design works for formatted scenes, e.g., publications, documents, and user interfaces (UIs). Diverse application scenarios impose a big c hallenge in unifying various layout generation subtasks, including conditional and unconditional generation. In this paper, we propose a Layout Diffusion Generative Model (LDGM) to achieve such unification with a single decoupled diffusion model. LDGM views a layout of arbitrary missing or coarse element attributes as an intermediate dif fusion status from a completed layout. Since different attrib utes have their individual semantics and characteristics, we propose to decouple the diffusion processes for them to improve the diversity of training samples and learn the rever se process jointly to exploit global-scope contexts for facilitating generation. As a result, our LDGM can generate layouts either from scratch or conditional on arbitrary available attributes. Extensive qualitative and quantitative e xperiments demonstrate our proposed LDGM outperforms existing layout generation models in both functionality and performance.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11469147180815824661  DOCUMENT          #         en        0.99\n",
+      "1   metadata   7345684647555911193  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14633266648332875894  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  12461356387847223939  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  13734010725237739732      TEXT  #/texts/0         en        0.24\n",
+      "5   semantic  13734010725237739732      TEXT  #/texts/0       text        0.89\n",
+      "6   language   7345684647555911193      TEXT  #/texts/1         en        0.67\n",
+      "7   semantic   7345684647555911193      TEXT  #/texts/1     header        0.83\n",
+      "8   language   4192388650002618545      TEXT  #/texts/2         en        0.48\n",
+      "9   semantic   4192388650002618545      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  14722593531256542654      TEXT  #/texts/3         en        0.39\n",
+      "11  semantic  14722593531256542654      TEXT  #/texts/3  meta-data        0.99\n",
       "2105.06220.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Document layout analysis is crucial ...\n",
-      "1     title  #/texts/2  VSR: A Unified Framework for Document Layout A...\n",
-      "2    author  #/texts/2                                         Peng Zhang\n",
-      "3    author  #/texts/2                                         Liang Qiao\n",
-      "4    author  #/texts/2                                     Zhanzhan Cheng\n",
-      "5    author  #/texts/2                                             Yi Niu\n",
-      "6    author  #/texts/2                                             Fei Wu\n",
-      "7    author  #/texts/3                       Hikvision Research Institute\n",
-      "8    author  #/texts/6                                Zhejiang University\n",
+      "title:  VSR: A Unified Framework for Document Layout Analysis combining Vision, Semantics and Relations\n",
+      "abstract:  Abstract. Document layout analysis is crucial for understanding document structures. On this task, vision and semantics of documents, and relations between layout components contribute to the understanding process. Though many works have been proposed to exploit the above information, they show unsatisfactory results. NLP-based methods model layout analysis as a sequence labeling task and show insufficient capabilities in layout modeling. CV-based methods model layout analysis as a detection or segmentation task, but bear limitations of inefficient modality fusion and lack of relation modeling between layout components. To address the above limitations, we propose a unified framework VSR for document layout analysis, combining vision, semantics and relations. VSR supports both NLP-based and CV-based methods. Specifically, we first introduce vision through document image and semantics through text embedding maps. Then, modality-specific visual and semantic features are extracted using a two-stream network, which are adaptively fused to make full use of complementary information. Finally, given component candidates, a relation module based on graph neural network is incorported to model relations between components and output final results. On three popular benchmarks, VSR outperforms previous models by large margins. Code will be released soon. Keywords: Vision · Semantics · Relations · Document layout analysis.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11491769011364032964  DOCUMENT          #         en        0.95\n",
+      "1   metadata   1183044382250210687  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  17385183871704569381  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  17289783423793469794  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   8063734862441376055      TEXT  #/texts/0         en        0.55\n",
+      "5   semantic   8063734862441376055      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   1183044382250210687      TEXT  #/texts/1         en        0.66\n",
+      "7   semantic   1183044382250210687      TEXT  #/texts/1     header        0.80\n",
+      "8   language  10571720774666920474      TEXT  #/texts/2         en        0.31\n",
+      "9   semantic  10571720774666920474      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   7775955189029927588      TEXT  #/texts/3         en        0.56\n",
+      "11  semantic   7775955189029927588      TEXT  #/texts/3  meta-data        1.00\n",
       "2106.11797.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Automatically recognizing the layout of handwr...\n",
-      "1     title  #/texts/2  Evaluation of a Region Proposal Architecture f...\n",
-      "2    author  #/texts/2                                       Lorenzo Quir\n",
-      "3    author  #/texts/2                                      Enrique Vidal\n",
-      "4    author  #/texts/3                                    Research Center\n",
-      "5    author  #/texts/3                                  Universitat Polit\n",
+      "title:  Evaluation of a Region Proposal Architecture for Multi-task Document Layout Analysis\n",
+      "abstract:  Abstract Automatically recognizing the layout of handwritten documents is an important step towards useful extraction of information from those documents. The most common application is to feed downstream applications such as automatic text recognition and keyword spotting; however, the recognition of the layout also helps to establish relationships between elements in the document which allows to enrich the information that can be extracted. Most of the modern document layout analysis systems are designed to address only one part of the document layout problem, namely: baseline detection or region segmentation. In contrast, we evaluate the effectiveness of the Mask-RCNN architecture to address the problem of baseline detection and region segmentation in an integrated manner. We present experimental results on two handwritten text datasets and one handwritten music dataset. The analyzed architecture yields promising results, outperforming state-of-theart techniques in all three datasets. Keywordsdocument layout analysis, region proposal network, baseline detection, region segmentation.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   3713940944030150691  DOCUMENT          #         en        0.98\n",
+      "1   metadata  16570263275213596473  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  16485599656984523560  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   9852063695928942848  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata  17889680592944608058  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   language  11004961425232215947      TEXT  #/texts/0         en        0.23\n",
+      "6   semantic  11004961425232215947      TEXT  #/texts/0  reference        0.95\n",
+      "7   language  16570263275213596473      TEXT  #/texts/1         en        0.61\n",
+      "8   semantic  16570263275213596473      TEXT  #/texts/1     header        0.93\n",
+      "9   language  17652260495556794451      TEXT  #/texts/2         en        0.35\n",
+      "10  semantic  17652260495556794451      TEXT  #/texts/2  meta-data        0.99\n",
+      "11  language   1051774319299939846      TEXT  #/texts/3         es        0.35\n",
       "2304.11810.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document layout analysis has a wide range of r...\n",
-      "1     title  #/texts/2  PARAGRAPH2GRAPH: A GNN-BASED FRAMEWORK FOR LAY...\n",
-      "2    author  #/texts/2                                            Shu Wei\n",
-      "3    author  #/texts/3                                 Datagrand Tech Inc\n",
-      "4    author  #/texts/4                          Nuo Xu Datagrand Tech Inc\n",
-      "5    author  #/texts/5                                         Deng Huang\n",
-      "6    author  #/texts/6                                 Datagrand Tech Inc\n",
-      "7    author  #/texts/7                       Xiang Gao Datagrand Tech Inc\n",
+      "title:  PARAGRAPH2GRAPH: A GNN-BASED FRAMEWORK FOR LAYOUT PARAGRAPH ANALYSIS\n",
+      "abstract:  ABSTRACT Document layout analysis has a wide range of requirements across various domains, languages, and business scenarios. However, most current state-of-the-art algorithms are language-dependent, with architectures that rely on transformer encoders or language-specific text encoders, such as BERT, for feature extraction. These approaches are limited in their ability to handle very long documents due to input sequence length constraints and are closely tied to language-specific tokenizers. Additionally, training a cross-language text encoder can be challenging due to the lack of labeled multilingual document datasets that consider privacy. Furthermore, some layout tasks require a clean separation between different layout components without overlap, which can be difficult for image segmentationbased algorithms to achieve. In this paper, we present Paragraph2Graph, a language-independent graph neural network (GNN)-based model that achieves competitive results on common document layout datasets while being adaptable to business scenarios with strict separation. With only 19.95 million parameters, our model is suitable for industrial applications, particularly in multi-language scenarios. We are releasing all of our code and pretrained models at this repo. K eywords GNN · Language-independent · Document Layout · Layout Paragraph · Generalization\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   3100341686165584999  DOCUMENT           #         en   \n",
+      "1   metadata  15403989920679912416  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata    227750139215915282  DOCUMENT   #/texts/8   abstract   \n",
+      "3   metadata  18323991934847197201  DOCUMENT   #/texts/9   abstract   \n",
+      "4   metadata    882870464380862174  DOCUMENT  #/texts/10   abstract   \n",
+      "5   language  14975926239625135018      TEXT   #/texts/0         en   \n",
+      "6   semantic  14975926239625135018      TEXT   #/texts/0       text   \n",
+      "7   language  15403989920679912416      TEXT   #/texts/1         en   \n",
+      "8   semantic  15403989920679912416      TEXT   #/texts/1     header   \n",
+      "9   language  11996006598348841766      TEXT   #/texts/2         en   \n",
+      "10  semantic  11996006598348841766      TEXT   #/texts/2  meta-data   \n",
+      "11  language  16098072656615255554      TEXT   #/texts/3         en   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.99  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         1.00  \n",
+      "5         0.28  \n",
+      "6         0.69  \n",
+      "7         0.32  \n",
+      "8         0.96  \n",
+      "9         0.70  \n",
+      "10        0.77  \n",
+      "11        0.20  \n",
       "2203.16850.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  This paper addresses the problem of document i...\n",
-      "1     title  #/texts/1  Revisiting Document Image Dewarping by Grid Re...\n",
-      "2    author  #/texts/1                                     Xiangwei Jiang\n",
-      "3    author  #/texts/1                                        Rujiao Long\n",
-      "4    author  #/texts/1                                            Nan Xue\n",
-      "5    author  #/texts/1                                         Zhibo Yang\n",
-      "6    author  #/texts/1                                           Cong Yao\n",
-      "7    author  #/texts/1                                   Wuhan University\n",
-      "8    author  #/texts/1                                   Wuhan University\n",
+      "title:  Revisiting Document Image Dewarping by Grid Regularization\n",
+      "abstract:  Abstract This paper addresses the problem of document image dewarping, which aims at eliminating the geometric distortion in document images for document digitization. Instead of designing a better neural network to approximate the optical flow fields between the inputs and outputs, we pursue the best readability by taking the text lines and the document boundaries into account from a constrained optimization perspective. Specifically, our proposed method first learns the boundary points and the pixels in the text lines and then follows the most simple observation that the boundaries and text lines in both horizontal and vertical directions should be kept after dewarping to introduce a novel grid regularization scheme. To obtain the final forward mapping for dewarping, we solve an optimization problem with our proposed grid regularization. The experiments comprehensively demonstrate that our proposed approach outperforms the prior arts by large margins in terms of readability (with the metrics of Character Errors Rate and the Edit Distance) while maintaining the best image quality on the publiclyavailable DocUNet benchmark.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    978603162285402020  DOCUMENT          #         en        0.99\n",
+      "1   metadata  12429620403640438226  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata   4879793979395431883  DOCUMENT  #/texts/2   abstract        1.00\n",
+      "3   metadata    175822038141138086  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "4   language  12429620403640438226      TEXT  #/texts/0         en        0.75\n",
+      "5   semantic  12429620403640438226      TEXT  #/texts/0     header        0.88\n",
+      "6   language  15235721918946238063      TEXT  #/texts/1         en        0.55\n",
+      "7   semantic  15235721918946238063      TEXT  #/texts/1  meta-data        0.99\n",
+      "8   language   4879793979395431883      TEXT  #/texts/2         en        0.32\n",
+      "9   semantic   4879793979395431883      TEXT  #/texts/2     header        0.93\n",
+      "10  language    175822038141138086      TEXT  #/texts/3         en        0.91\n",
+      "11  semantic    175822038141138086      TEXT  #/texts/3       text        0.95\n",
       "2303.03755.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Generating visual layouts is an essential ingr...\n",
-      "1     title  #/texts/2  DLT: Conditioned layout generation with Joint ...\n",
-      "2    author  #/texts/2                                          Elad Levi\n",
-      "3    author  #/texts/2                                          Eli Brosh\n",
-      "4    author  #/texts/2                                  Mykola Mykhailych\n",
-      "5    author  #/texts/2                                     Meir Perez Wix\n",
+      "title:  DLT: Conditioned layout generation with Joint Discrete-Continuous Diffusion Layout Transformer\n",
+      "abstract:  Abstract Generating visual layouts is an essential ingredient of graphic design. The ability to condition layout generation on a partial subset of component attributes is critical to real-world applications that involve user interaction. Recently, diffusion models have demonstrated high-quality generative performances in various domains. However, it is unclear how to apply diffusion models to the natural representation of layouts which consists of a mix of discrete (class) and continuous (location, size) attributes. To address the conditioning layout generation problem, we introduce DLT, a joint discrete-continuous diffusion model. DLT is a transformer-based model which has a flexible conditioning mechanism that allows for conditioning on any given subset of all the layout component classes, locations, and sizes. Our method outperforms state-of-the-art generative models on various layout generation datasets with respect to different metrics and conditioning settings. Additionally, we validate the effectiveness of our proposed conditioning mechanism and the joint continuous-diffusion process. This joint process can be incorporated into a wide range of mixed discrete-continuous generative tasks.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11659927563608853753  DOCUMENT          #         en        0.98\n",
+      "1   metadata    669239715039352587  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14231489606976721709  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata    537144397678098445  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   1378232785427175996      TEXT  #/texts/0         en        0.28\n",
+      "5   semantic   1378232785427175996      TEXT  #/texts/0       text        0.89\n",
+      "6   language    669239715039352587      TEXT  #/texts/1         en        0.72\n",
+      "7   semantic    669239715039352587      TEXT  #/texts/1     header        0.84\n",
+      "8   language  10345949929986003176      TEXT  #/texts/2         en        0.44\n",
+      "9   semantic  10345949929986003176      TEXT  #/texts/2  meta-data        0.97\n",
+      "10  language  14231489606976721709      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic  14231489606976721709      TEXT  #/texts/3     header        0.93\n",
       "2110.08164.pdf\n",
-      "   subtype   subj_path                                          name\n",
-      "0   author   #/texts/3                                  Penghai Zhao\n",
-      "1    title   #/texts/3                                    A PREPRINT\n",
-      "2   author   #/texts/4                              Ethnic Languages\n",
-      "3   author   #/texts/4  Education Northwest Minzu University Lanzhou\n",
-      "4   author   #/texts/5                                   Zhengqi Cai\n",
-      "5   author   #/texts/7                                   Weilan Wang\n",
-      "6   author   #/texts/8                              Ethnic Languages\n",
-      "7   author   #/texts/8  Education Northwest Minzu University Lanzhou\n",
-      "8   author   #/texts/9                                  Guowei Zhang\n",
-      "9   author  #/texts/10                              Ethnic Languages\n",
-      "10  author  #/texts/10  Education Northwest Minzu University Lanzhou\n",
-      "11  author  #/texts/11                                       Yuqi Lu\n",
-      "12  author  #/texts/12                              Ethnic Languages\n",
-      "13  author  #/texts/12  Education Northwest Minzu University Lanzhou\n",
+      "title:  A PREPRINT\n",
+      "abstract:  ['Accurate layout analysis without subsequent text-line segmentation remains an ongoing challenge, especially when facing the Kangyur, a kind of historical Tibetan document featuring considerable touching components and mottled background. Aiming at identifying different regions in document images, layout analysis is indispensable for subsequent procedures such as character recognition. However, there was only a little research being carried out to perform line-level layout analysis which failed to deal with the Kangyur. To obtain the optimal results, a fine-grained sub-line level layout analysis approach is presented. Firstly, we introduced an accelerated method to build the dataset which is dynamic and reliable. Secondly, enhancement had been made to the SOLOv2 according to the characteristics of the Kangyur. Then, we fed the enhanced SOLOv2 with the prepared annotation file during the training phase. Once the network is trained, instances of the text line, sentence, and titles can be segmented and identified during the inference stage. The experimental results show that the proposed method delivers a decent 72.7% average precision on our dataset. In general, this preliminary research provides insights into the fine-grained sub-line level layout analysis and testifies the SOLOv2-based approaches. We also believe that the proposed methods can be adopted on other language documents with various layouts.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17785348499810975599  DOCUMENT          #         en        1.00\n",
+      "1   metadata    460898704592932350  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   language  10001330042032416266      TEXT  #/texts/0         en        0.42\n",
+      "3   semantic  10001330042032416266      TEXT  #/texts/0  reference        0.91\n",
+      "4   language  10494471504194919372      TEXT  #/texts/1         ja        0.34\n",
+      "5   semantic  10494471504194919372      TEXT  #/texts/1     header        0.84\n",
+      "6   language    460898704592932350      TEXT  #/texts/2         en        0.13\n",
+      "7   semantic    460898704592932350      TEXT  #/texts/2  reference        0.91\n",
+      "8   language  16346695938433635803      TEXT  #/texts/3         id        0.46\n",
+      "9   semantic  16346695938433635803      TEXT  #/texts/3  meta-data        1.00\n",
+      "10  language   6531628207994599756      TEXT  #/texts/4         en        0.77\n",
+      "11  semantic   6531628207994599756      TEXT  #/texts/4  meta-data        0.92\n",
       "2209.06584.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Active consumption of digital documents has yi...\n",
-      "1     title  #/texts/2  One-Shot Doc Snippet Detection: Powering Searc...\n",
-      "2    author  #/texts/2                                       Abhinav Java\n",
-      "3    author  #/texts/2                                     Milan Aggarwal\n",
-      "4    author  #/texts/2                                     Surgan Jandial\n",
-      "5    author  #/texts/2                                     Mausoom Sarkar\n",
-      "6    author  #/texts/2                               Balaji Krishnamurthy\n",
-      "7    author  #/texts/3                         Data Science Research Labs\n",
+      "title:  One-Shot Doc Snippet Detection: Powering Search in Document Beyond Text\n",
+      "abstract:  Abstract Active consumption of digital documents has yielded scope for research in various applications, including search. Traditionally, searching within a document has been cast as a text matching problem ignoring the rich layout and visual cues commonly present in structured documents, forms, etc. To that end, we ask a mostly unexplored question: 'Can we search for other similar snippets present in a target document page given a single query instance of a document snippet?'. We propose MONOMER to solve this as a one-shot snippet detection task. MONOMER fuses context from visual, textual, and spatial modalities of snippets and documents to find query snippet in target documents. We conduct extensive ablations and experiments showing MONOMER outperforms several baselines from one-shot object detection (BHRL), template matching, and document understanding (LayoutLMv3). Due to the scarcity of relevant data for the task at hand, we train MONOMER on programmatically generated data having many visually similar query snippets and target document pairs from two datasets-Flamingo Forms and PubLayNet. We also do a human study to validate the generated data.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11192906298435654347  DOCUMENT          #         en        0.99\n",
+      "1   metadata  18233337473379706613  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11756603228534403438  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   8440703618720666577  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  10436243055566423792      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic  10436243055566423792      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  18233337473379706613      TEXT  #/texts/1         en        0.50\n",
+      "7   semantic  18233337473379706613      TEXT  #/texts/1     header        0.77\n",
+      "8   language  18446113455465465647      TEXT  #/texts/2         en        0.65\n",
+      "9   semantic  18446113455465465647      TEXT  #/texts/2  meta-data        0.88\n",
+      "10  language  14998310295267340112      TEXT  #/texts/3         en        0.51\n",
+      "11  semantic  14998310295267340112      TEXT  #/texts/3  meta-data        0.69\n",
       "2003.07560.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Tabular data is a crucial form of inf...\n",
-      "1    author  #/texts/3                                           Yiren Li\n",
-      "2     title  #/texts/3                                                  ‖\n",
-      "3    author  #/texts/3                                        Zheng Huang\n",
-      "4    author  #/texts/3                                         Junchi Yan\n",
-      "5    author  #/texts/3                                            Yi Zhou\n",
-      "6    author  #/texts/3                                             Fan Ye\n",
-      "7    author  #/texts/3                                        Xianhui Liu\n",
-      "8    author  #/texts/6                      Shanghai Jiao Tong University\n",
-      "9    author  #/texts/7                      Shanghai Jiao Tong University\n",
+      "title:  ‖\n",
+      "abstract:  Abstract-Tabular data is a crucial form of information expression, which can organize data in a standard structure for easy information retrieval and comparison. However, in financial industry and many other fields tables are often disclosed in unstructured digital files, e.g. Portable Document Format (PDF) and images, which are difficult to be extracted directly. In this paper, to facilitate deep learning based table extraction from unstructured digital files, we publish a standard Chinese dataset named FinTab, which contains more than 1,600 financial tables of diverse kinds and their corresponding structure representation in JSON. In addition, we propose a novel graph-based convolutional neural network model named GFTE as a baseline for future comparison. GFTE integrates image feature, position feature and textual feature together for precise edge prediction and reaches overall good results $^{1}$.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17365030269464102181  DOCUMENT          #         en        0.99\n",
+      "1   metadata   2005836100606620539  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   metadata   1624559958454587795  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "3   language  12864712276516214286      TEXT  #/texts/0         en        0.30\n",
+      "4   semantic  12864712276516214286      TEXT  #/texts/0  reference        0.78\n",
+      "5   language  14832218684155059358      TEXT  #/texts/1         en        0.88\n",
+      "6   semantic  14832218684155059358      TEXT  #/texts/1       text        0.47\n",
+      "7   language   2005836100606620539      TEXT  #/texts/2         vi        0.52\n",
+      "8   semantic   2005836100606620539      TEXT  #/texts/2       text        1.00\n",
+      "9   language  11681392060471928309      TEXT  #/texts/3         en        0.34\n",
+      "10  semantic  11681392060471928309      TEXT  #/texts/3  meta-data        1.00\n",
+      "11  language  14497389118296410556      TEXT  #/texts/4         en        0.78\n",
       "2203.01017.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #                            a. Picture of a table: \n",
-      "1     title  #/texts/2  TableFormer: Table Structure Understanding wit...\n",
-      "2    author  #/texts/2                                       Ahmed Nassar\n",
-      "3    author  #/texts/2                                Nikolaos Livathinos\n",
-      "4    author  #/texts/2                                       Maksym Lysak\n",
-      "5    author  #/texts/2                                        Peter Staar\n",
+      "title:  TableFormer: Table Structure Understanding with Transformers.\n",
+      "abstract:  Abstract a. Picture of a table:\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   1450589212631209931  DOCUMENT          #         en        0.99\n",
+      "1   metadata  16933509326206698184  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  16623877941696432046  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   7722591067936378833  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language  12349202100122108811      TEXT  #/texts/0         en        0.38\n",
+      "5   semantic  12349202100122108811      TEXT  #/texts/0       text        0.89\n",
+      "6   language  16933509326206698184      TEXT  #/texts/1         en        0.76\n",
+      "7   semantic  16933509326206698184      TEXT  #/texts/1  reference        0.87\n",
+      "8   language   3262318516024354863      TEXT  #/texts/2         en        0.41\n",
+      "9   semantic   3262318516024354863      TEXT  #/texts/2  meta-data        0.92\n",
+      "10  language   8760086191836158497      TEXT  #/texts/3         de        0.11\n",
+      "11  semantic   8760086191836158497      TEXT  #/texts/3       text        0.60\n",
       "2208.08037.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  To satisfy various user needs, different subta...\n",
-      "1      title  #/texts/2  UniLayout: Taming Unified Sequence-to-Sequence...\n",
-      "2     author  #/texts/2                                      Zhaoyun Jiang\n",
-      "3     author  #/texts/2                                         Huayu Deng\n",
-      "4     author  #/texts/2                                        Zhongkai Wu\n",
-      "5     author  #/texts/2                                          Jiaqi Guo\n",
-      "6     author  #/texts/2                                        Shizhao Sun\n",
-      "7     author  #/texts/2                                     Vuksan Mijovic\n",
-      "8     author  #/texts/2                                       Zijiang Yang\n",
-      "9     author  #/texts/2                                      Dongmei Zhang\n",
-      "10    author  #/texts/3                                Jiaotong University\n",
-      "11    author  #/texts/3                       Shanghai Jiaotong University\n",
-      "12    author  #/texts/4                                 Beihang University\n",
-      "13    author  #/texts/4                            Microsoft Research Asia\n",
+      "title:  UniLayout: Taming Unified Sequence-to-Sequence Transformers for Graphic Layout Generation\n",
+      "abstract:  Abstract To satisfy various user needs, different subtasks of graphic layout generation have been explored intensively in recent years. Existing studies usually propose taskspecific methods with diverse input-output formats, dedicated model architectures, and different learning methods. However, those specialized approaches make the adaption to unseen subtasks difficult, hinder the knowledge sharing between different subtasks, and are contrary to the trend of devising general-purpose models. In this work, we propose UniLayout, which handles different subtasks for graphic layout generation in a unified manner. First, we uniformly represent diverse inputs and outputs of subtasks as the sequences of tokens. Then, based on the unified sequence format, we naturally leverage an identical encoder-decoder architecture with Transformers for different subtasks. Moreover, based on the above two kinds of unification, we further develop a single model that supports all subtasks concurrently. Experiments on two public datasets demonstrate that while simple, UniLayout significantly outperforms the previous task-specific methods.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8182807638037382561  DOCUMENT          #         en        1.00\n",
+      "1   metadata  16989829520099866629  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   8153202450442895503  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  15212054895134725433  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   3988459388858141865      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic   3988459388858141865      TEXT  #/texts/0  reference        0.86\n",
+      "6   language  16989829520099866629      TEXT  #/texts/1         en        0.63\n",
+      "7   semantic  16989829520099866629      TEXT  #/texts/1     header        0.81\n",
+      "8   language   1229170802030660337      TEXT  #/texts/2         en        0.28\n",
+      "9   semantic   1229170802030660337      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   9540420743311235070      TEXT  #/texts/3         en        0.60\n",
+      "11  semantic   9540420743311235070      TEXT  #/texts/3  meta-data        0.91\n",
       "2207.12955.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Most existing scene text detectors f...\n",
-      "1     title  #/texts/2  Contextual Text Block Detection towards Scene ...\n",
-      "2    author  #/texts/2                                         Chuhui Xue\n",
-      "3    author  #/texts/2                                      Jiaxing Huang\n",
-      "4    author  #/texts/2                                         Shijian Lu\n",
-      "5    author  #/texts/2                                       Changhu Wang\n",
-      "6    author  #/texts/2                                           Song Bai\n",
-      "7    author  #/texts/3                   Nanyang Technological University\n",
-      "8    author  #/texts/5                                          Dance Inc\n",
+      "title:  Contextual Text Block Detection towards Scene Text Understanding\n",
+      "abstract:  Abstract. Most existing scene text detectors focus on detecting characters or words that only capture partial text messages due to missing contextual information. For a better understanding of text in scenes, it is more desired to detect contextual text blocks (CTBs) which consist of one or multiple integral text units (e.g., characters, words, or phrases) in natural reading order and transmit certain complete text messages. This paper presents contextual text detection, a new setup that detects CTBs for better understanding of texts in scenes. We formulate the new setup by a dual detection task which first detects integral text units and then groups them into a CTB. To this end, we design a novel scene text clustering technique that treats integral text units as tokens and groups them (belonging to the same CTB) into an ordered token sequence. In addition, we create two datasets SCUT-CTW-Context and ReCTS-Context to facilitate future research, where each CTB is well annotated by an ordered sequence of integral text units. Further, we introduce three metrics that measure contextual text detection in local accuracy, continuity, and global accuracy. Extensive experiments show that our method accurately detects CTBs which effectively facilitates downstream tasks such as text classification and translation. The project is available at https://sg-vilab.github.io/publication/xue2022contextual/. Keywords: Scene Text Detection\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15828830270858536421  DOCUMENT          #         en        0.99\n",
+      "1   metadata   9070712247046693948  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14369142600913373072  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata   9054809053710816760  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   3136319967504727252      TEXT  #/texts/0         en        0.54\n",
+      "5   semantic   3136319967504727252      TEXT  #/texts/0  reference        0.95\n",
+      "6   language   9070712247046693948      TEXT  #/texts/1         en        0.63\n",
+      "7   semantic   9070712247046693948      TEXT  #/texts/1     header        0.75\n",
+      "8   language  13251585860532160945      TEXT  #/texts/2         en        0.36\n",
+      "9   semantic  13251585860532160945      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  10777370744359634488      TEXT  #/texts/3         en        0.60\n",
+      "11  semantic  10777370744359634488      TEXT  #/texts/3  meta-data        0.88\n",
       "2305.02769.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Table detection is the task of class...\n",
-      "1     title  #/texts/2  Towards End-to-End Semi-Supervised Table Detec...\n",
-      "2    author  #/texts/2                                    Tahira Shehzadi\n",
-      "3    author  #/texts/2                               Khurram Azeem Hashmi\n",
-      "4    author  #/texts/2                                    Didier Stricker\n",
-      "5    author  #/texts/2                                     Marcus Liwicki\n",
-      "6    author  #/texts/2                              Muhammad Zeshan Afzal\n",
+      "title:  Towards End-to-End Semi-Supervised Table Detection with Deformable Transformer\n",
+      "abstract:  Abstract. Table detection is the task of classifying and localizing table objects within document images. With the recent development in deep learning methods, we observe remarkable success in table detection. However, a significant amount of labeled data is required to train these models effectively. Many semi-supervised approaches are introduced to mitigate the need for a substantial amount of label data. These approaches use CNN-based detectors that rely on anchor proposals and post-processing stages such as NMS. To tackle these limitations, this paper presents a novel end-to-end semi-supervised table detection method that employs the deformable transformer for detecting table objects. We evaluate our semi-supervised method on PubLayNet, DocBank, ICADR-19 and TableBank datasets, and it achieves superior performance compared to previous methods. It outperforms the fully supervised method (Deformable transformer) by +3.4 points on 10% labels of TableBank-both dataset and the previous CNN-based semi-supervised approach (Soft Teacher) by +1.8 points on 10% labels of PubLayNet dataset. We hope this work opens new possibilities towards semi-supervised and unsupervised table detection methods. Keywords: Semi-Supervised Learning · Deformable Transformer · Table Analysis · Table Detection.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   2339371309351147950  DOCUMENT           #         en   \n",
+      "1   metadata   2441605672578681191  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  10072912660304435314  DOCUMENT  #/texts/10   abstract   \n",
+      "3   metadata  11963684253056995713  DOCUMENT  #/texts/11   abstract   \n",
+      "4   language   5123431033643498349      TEXT   #/texts/0         en   \n",
+      "5   semantic   5123431033643498349      TEXT   #/texts/0  reference   \n",
+      "6   language   2441605672578681191      TEXT   #/texts/1         en   \n",
+      "7   semantic   2441605672578681191      TEXT   #/texts/1     header   \n",
+      "8   language  16577848168059002784      TEXT   #/texts/2         en   \n",
+      "9   semantic  16577848168059002784      TEXT   #/texts/2  meta-data   \n",
+      "10  language  12065750236253281860      TEXT   #/texts/3         en   \n",
+      "11  semantic  12065750236253281860      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         1.00  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.62  \n",
+      "5         0.66  \n",
+      "6         0.76  \n",
+      "7         0.65  \n",
+      "8         0.40  \n",
+      "9         0.94  \n",
+      "10        0.20  \n",
+      "11        1.00  \n",
       "2303.05325.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Abstract. While strides have been made in deep...\n",
-      "1      title  #/texts/2  BaDLAD: A Large Multi-Domain Bengali Document ...\n",
-      "2     author  #/texts/2                              Istiak Hossain Shihab\n",
-      "3     author  #/texts/2                                      Rakibul Hasan\n",
-      "4     author  #/texts/2                               Mahfuzur Rahman Emon\n",
-      "5     author  #/texts/3                               Syed Mobassir Hossen\n",
-      "6     author  #/texts/3                                  Nazmuddoha Ansary\n",
-      "7     author  #/texts/3                                      Intesur Ahmed\n",
-      "8     author  #/texts/3                                        Fazle Rabbi\n",
-      "9     author  #/texts/4                              Shahriar Elahi Dhruvo\n",
-      "10    author  #/texts/4                                   Akib Hasan Pavel\n",
-      "11    author  #/texts/5                                Marsia Haque Meghla\n",
-      "12    author  #/texts/5                                     Rezwanul Haque\n",
-      "13    author  #/texts/5                            Sayma Sultana Chowdhury\n",
-      "14    author  #/texts/6                                      Tahsin Reasat\n",
-      "15    author  #/texts/6                               Ahmed Imtiaz Humayun\n",
-      "16    author  #/texts/6                                       Asif Sushmit\n",
-      "17    author  #/texts/8                              Vanderbilt University\n",
+      "title:  BaDLAD: A Large Multi-Domain Bengali Document Layout Analysis Dataset\n",
+      "abstract:  Abstract. While strides have been made in deep learning based Bengali Optical Character Recognition (OCR) in the past decade, absence of large Document Layout Analysis (DLA) datasets has hindered the application of OCR in document transcription, e.g., transcribing historical documents and newspapers. Moreover, rule-based DLA systems that are currently being employed in practice are not robust to domain variations and out-of-distribution layouts. To this end, we present the first multidomain large B eng a li D ocument L ayout A nalysis D ataset: BaDLAD. This dataset contains 33, 695 human annotated document samples from six domains-i) books and magazines ii) public domain govt. documents iii) liberation war documents iv) new newspapers v) historical newspapers and vi) property deeds; with 710 K polygon annotations for four unit types: text-box, paragraph, image, and table. Through preliminary experiments benchmarking the performance of existing state-of-the-art deep learning architectures for English DLA, we demonstrate the efficacy of our dataset in training deep learning based Bengali document digitization models. Keywords: Handwritten Document Images · Layout Analysis (Physical and Logical) · Mobile/Camera-Based · Other Domains · Typeset Document Images\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   8865613824093398131  DOCUMENT           #         en   \n",
+      "1   metadata  10095296672555375400  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  16411799519606679859  DOCUMENT   #/texts/9   abstract   \n",
+      "3   metadata  10800146418327418135  DOCUMENT  #/texts/10   abstract   \n",
+      "4   language  12112214876466467582      TEXT   #/texts/0         en   \n",
+      "5   semantic  12112214876466467582      TEXT   #/texts/0  reference   \n",
+      "6   language  10095296672555375400      TEXT   #/texts/1         en   \n",
+      "7   semantic  10095296672555375400      TEXT   #/texts/1     header   \n",
+      "8   language  17408826844111608210      TEXT   #/texts/2         en   \n",
+      "9   semantic  17408826844111608210      TEXT   #/texts/2  meta-data   \n",
+      "10  language   3359000651769550631      TEXT   #/texts/3         en   \n",
+      "11  semantic   3359000651769550631      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.99  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.60  \n",
+      "5         0.66  \n",
+      "6         0.37  \n",
+      "7         0.94  \n",
+      "8         0.19  \n",
+      "9         0.99  \n",
+      "10        0.27  \n",
+      "11        0.98  \n",
       "2011.13534.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Documents are a core part of many businesses i...\n",
-      "1     title  #/texts/2  A Survey of Deep Learning Approaches for OCR a...\n",
-      "2    author  #/texts/2                            Nishant Subramani Scale\n",
-      "3    author  #/texts/3                             Alexandre Matton Scale\n",
-      "4    author  #/texts/6                                         Adrian Lam\n",
+      "title:  A Survey of Deep Learning Approaches for OCR and Document Understanding\n",
+      "abstract:  Abstract Documents are a core part of many businesses in many fields such as law, finance, and technology among others. Automatic understanding of documents such as invoices, contracts, and resumes is lucrative, opening up many new avenues of business. The fields of natural language processing and computer vision have seen tremendous progress through the development of deep learning such that these methods have started to become infused in contemporary document understanding systems. In this survey paper, we review different techniques for document understanding for documents written in English and consolidate methodologies present in literature to act as a jumping-off point for researchers exploring this area.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   3322178716407991214  DOCUMENT          #         en        1.00\n",
+      "1   metadata   3082305603685610193  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6720404006088754024  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "3   metadata   2132452585218772003  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "4   language   5865682553062852405      TEXT  #/texts/0         en        0.39\n",
+      "5   semantic   5865682553062852405      TEXT  #/texts/0  reference        0.95\n",
+      "6   language   3082305603685610193      TEXT  #/texts/1         en        0.72\n",
+      "7   semantic   3082305603685610193      TEXT  #/texts/1     header        0.88\n",
+      "8   language   5246139715348675309      TEXT  #/texts/2         en        0.45\n",
+      "9   semantic   5246139715348675309      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  13584582387870540825      TEXT  #/texts/3         en        0.31\n",
+      "11  semantic  13584582387870540825      TEXT  #/texts/3  meta-data        0.95\n",
       "2308.14978.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document pre-trained models and grid-based mod...\n",
-      "1     title  #/texts/2  Vision Grid Transformer for Document Layout An...\n",
-      "2    author  #/texts/2                                           Cheng Da\n",
-      "3    author  #/texts/2                                         Chuwei Luo\n",
-      "4    author  #/texts/2                                           Qi Zheng\n",
-      "5    author  #/texts/2                                           Cong Yao\n",
+      "title:  Vision Grid Transformer for Document Layout Analysis\n",
+      "abstract:  Abstract Document pre-trained models and grid-based models have proven to be very effective on various tasks in Document AI. However, for the document layout analysis (DLA) task, existing document pre-trained models, even those pretrained in a multi-modal fashion, usually rely on either textual features or visual features. Grid-based models for DLA are multi-modality but largely neglect the effect of pre-training. To fully leverage multi-modal information and exploit pre-training techniques to learn better representation for DLA, in this paper, we present VGT, a twostream Vision Grid Transformer, in which Grid Transformer (GiT) is proposed and pre-trained for 2D token-level and segment-level semantic understanding. Furthermore, a new dataset named D $^{4}$LA, which is so far the most diverse and detailed manually-annotated benchmark for document layout analysis, is curated and released. Experiment results have illustrated that the proposed VGT model achieves new state-of-the-art results on DLA tasks, e.g. PubLayNet (95. 7% → 96. 2%), DocBank (79. 6% → 84. 1%), and D $^{4}$LA (67. 7% → 68. 8%). The code and models as well as the D $^{4}$LA dataset will be made publicly available $^{1}$.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16592063621056527461  DOCUMENT          #         en        1.00\n",
+      "1   metadata   4640372460409057753  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  17100450427175069178  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  17593631429546340130  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   9727257513907321483      TEXT  #/texts/0         en        0.41\n",
+      "5   semantic   9727257513907321483      TEXT  #/texts/0  reference        0.86\n",
+      "6   language   4640372460409057753      TEXT  #/texts/1         en        0.53\n",
+      "7   semantic   4640372460409057753      TEXT  #/texts/1     header        0.87\n",
+      "8   language  17143522046456530342      TEXT  #/texts/2         en        0.74\n",
+      "9   semantic  17143522046456530342      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  13989699358317696966      TEXT  #/texts/3         en        0.11\n",
+      "11  semantic  13989699358317696966      TEXT  #/texts/3  meta-data        0.50\n",
       "2207.11871.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document Visual Question Answering (VQA) aims ...\n",
-      "1     title  #/texts/2  Towards Complex Document Understanding By Disc...\n",
-      "2    author  #/texts/2                                        Fengbin Zhu\n",
-      "3    author  #/texts/2                                       Wenqiang Lei\n",
-      "4    author  #/texts/2                                          Fuli Feng\n",
-      "5    author  #/texts/2                                          Chao Wang\n",
-      "6    author  #/texts/2                                      Haozhou Zhang\n",
-      "7    author  #/texts/4                                    Estates Pte Ltd\n",
-      "8    author  #/texts/5                                 Sichuan University\n",
+      "title:  Towards Complex Document Understanding By Discrete Reasoning\n",
+      "abstract:  ABSTRACT Document Visual Question Answering (VQA) aims to answer questions over visually-rich documents. In this work, we introduce a new Document VQA dataset, named TAT-DQA, which consists of 3,067 document pages comprising semi-structured table(s) and unstructured text as well as 16,558 question-answer pairs. The documents are sampled from financial reports and contain lots of numbers, which means discrete reasoning capability is demanded to answer the questions. Based on TAT-DQA, we further develop a novel model named MHST that takes into account the information in multi-modalities to intelligently address different types of questions with corresponding strategies, i.e., extraction or reasoning. The experiments show that MHST model significantly outperforms the baseline methods, demonstrating its effectiveness. However, the performance still lags far behind that of expert humans. We expect that our TAT-DQA dataset would facilitate the research on understanding of visually-rich documents, especially for scenarios that require discrete reasoning. Also, we hope the proposed model would inspire researchers to design more advanced Document VQA models in future. CCS CONCEPTS · Computing methodologies → Natural language processing; · Information systems → Question answering. KEYWORDS Question Answering, Visually-rich Document Understanding, Discrete Reasoning ACM Reference Format: Fengbin Zhu 1, $^{2}$, Wenqiang Lei3 $^{∗}$, Fuli Feng $^{4}$, Chao Wang$^{2}$, Haozhou Zhang$^{3}$, Tat-Seng Chua$^{1}$. 2022. Towards Complex Document Understanding By Discrete Reasoning. In Proceedings of the 30th ACM International Conference on Multimedia (MM '22), October 10-14, 2022, Lisboa, Portugal. ACM, New York, NY, USA, 10 pages. https://doi.org/10.1145/3503161.3548422 $^{∗}$Corresponding author. ACM ISBN 978-1-4503-9203-7/22/10...$15.00 Question : What was the total cost in Wireless including spectrum license fee in 2019? 
Derivation : 1,320 + 1,731 = 3,051 Scale : Millions Answer : 3,051,000,000 Figure 1: An example of TAT-DQA dataset. Given a question and a visually-rich document that contains both tabular and textual data, the machine is expected to derive the answer.\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language   5727170198881628579  DOCUMENT           #        en         1.0\n",
+      "1   metadata   8475542795139585408  DOCUMENT   #/texts/1     title         1.0\n",
+      "2   metadata  11279303476920735275  DOCUMENT   #/texts/8  abstract         1.0\n",
+      "3   metadata  16821095994808609750  DOCUMENT   #/texts/9  abstract         1.0\n",
+      "4   metadata   4923876684724984128  DOCUMENT  #/texts/10  abstract         1.0\n",
+      "5   metadata   1220977146596289779  DOCUMENT  #/texts/11  abstract         1.0\n",
+      "6   metadata   2661334982718065771  DOCUMENT  #/texts/12  abstract         1.0\n",
+      "7   metadata  11280315739843364000  DOCUMENT  #/texts/13  abstract         1.0\n",
+      "8   metadata   3816002088806916798  DOCUMENT  #/texts/14  abstract         1.0\n",
+      "9   metadata   8132312948183007779  DOCUMENT  #/texts/15  abstract         1.0\n",
+      "10  metadata   9622509244035404651  DOCUMENT  #/texts/16  abstract         1.0\n",
+      "11  metadata   6521211824532126123  DOCUMENT  #/texts/17  abstract         1.0\n",
       "2308.11788.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We present an extensible method for ...\n",
-      "1     title  #/texts/2  An extensible point-based method for data char...\n",
-      "2    author  #/texts/2                                        Carlos Soto\n",
-      "3    author  #/texts/2                                        Shinjae Yoo\n",
-      "4    author  #/texts/3                     Brookhaven National Laboratory\n",
+      "title:  An extensible point-based method for data chart value detection\n",
+      "abstract:  Abstract. We present an extensible method for identifying semantic points to reverse engineer (i.e. extract the values of) data charts, particularly those in scientific articles. Our method uses a point proposal network (akin to region proposal networks for object detection) to directly predict the position of points of interest in a chart, and it is readily extensible to multiple chart types and chart elements. We focus on complex bar charts in the scientific literature, on which our model is able to detect salient points with an accuracy of 0.8705 F1 (@1.5-cell max deviation); it achieves 0.9810 F1 on synthetically-generated charts similar to those used in prior works. We also explore training exclusively on synthetic data with novel augmentations, reaching surprisingly competent performance in this way (0.6621 F1) on real charts with widely varying appearance, and we further demonstrate our unchanged method applied directly to synthetic pie charts (0.8343 F1). Datasets, trained models, and evaluation code are available at https://github.com/BNLNLP/PPN_model. Keywords: document analysis, chart extraction, value detection\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15730228459936273311  DOCUMENT          #         en        1.00\n",
+      "1   metadata   4784013526395200985  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   4060338734311582032  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  16636000959355062064  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   5550430566041753073      TEXT  #/texts/0         en        0.43\n",
+      "5   semantic   5550430566041753073      TEXT  #/texts/0  reference        0.86\n",
+      "6   language   4784013526395200985      TEXT  #/texts/1         en        0.70\n",
+      "7   semantic   4784013526395200985      TEXT  #/texts/1     header        0.74\n",
+      "8   language    911163436536595014      TEXT  #/texts/2         en        0.70\n",
+      "9   semantic    911163436536595014      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   9723793807474243122      TEXT  #/texts/3         en        0.18\n",
+      "11  semantic   9723793807474243122      TEXT  #/texts/3  meta-data        0.98\n",
       "1912.13318.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Pre-training techniques have been verified suc...\n",
-      "1     title  #/texts/2  LayoutLM: Pre-training of Text and Layout for ...\n",
-      "2    author  #/texts/2                                          Yiheng Xu\n",
-      "3    author  #/texts/4                                      Shaohan Huang\n",
-      "4    author  #/texts/4                            Microsoft Research Asia\n",
-      "5    author  #/texts/5                                           Furu Wei\n",
-      "6    author  #/texts/5                            Microsoft Research Asia\n",
-      "7    author  #/texts/6                                          Ming Zhou\n",
-      "8    author  #/texts/6                            Microsoft Research Asia\n",
+      "title:  LayoutLM: Pre-training of Text and Layout for Document Image Understanding\n",
+      "abstract:  ABSTRACT Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation, while neglecting layout and style information that is vital for document image understanding. In this paper, we propose the LayoutLM to jointly model interactions between text and layout information across scanned document images, which is beneficial for a great number of real-world document image understanding tasks such as information extraction from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for documentlevel pre-training. It achieves new state-of-the-art results in several downstream tasks, including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification (from 93.07 to 94.42). The code and pre-trained LayoutLM models are publicly available at https://aka.ms/layoutlm. CCS CONCEPTS · Informationsystems → Businessintelligence; · Computing methodologies → Informationextraction; Transferlearning; · Applied computing → Document analysis. KEYWORDS LayoutLM; pre-trained models; document image understanding ACM Reference Format: Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. 2020. LayoutLM: Pre-training of Text and Layout for Document Image Understanding. In Proceedings of the 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '20), August 23-27, 2020, Virtual Event, CA, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/ 10.1145/3394486.3403172 $^{∗}$Equal contributions during internship at Microsoft Research Asia. ACM ISBN 978-1-4503-7998-4/20/08...$15.00\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language   7610895368039716585  DOCUMENT           #        en         1.0\n",
+      "1   metadata  17654429621247340035  DOCUMENT   #/texts/1     title         1.0\n",
+      "2   metadata   7726871037747607472  DOCUMENT   #/texts/7  abstract         1.0\n",
+      "3   metadata  10978540521033865848  DOCUMENT   #/texts/8  abstract         1.0\n",
+      "4   metadata   5934707724399873881  DOCUMENT   #/texts/9  abstract         1.0\n",
+      "5   metadata   2378094128453498823  DOCUMENT  #/texts/10  abstract         1.0\n",
+      "6   metadata   7726968154032694329  DOCUMENT  #/texts/11  abstract         1.0\n",
+      "7   metadata   9608013805800255202  DOCUMENT  #/texts/12  abstract         1.0\n",
+      "8   metadata    561746052722555832  DOCUMENT  #/texts/13  abstract         1.0\n",
+      "9   metadata   4105288624001774438  DOCUMENT  #/texts/14  abstract         1.0\n",
+      "10  metadata   1007175453030828339  DOCUMENT  #/texts/15  abstract         1.0\n",
+      "11  metadata   6999006171723600633  DOCUMENT  #/texts/16  abstract         1.0\n",
       "2208.11203.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Tables are widely used in several typ...\n",
-      "1     title  #/texts/2  Graph Neural Networks and Representation Embed...\n",
-      "2    author  #/texts/2                                     Andrea Gemelli\n",
-      "3    author  #/texts/3                                    Emanuele Vivoli\n",
+      "title:  Graph Neural Networks and Representation Embedding for Table Extraction in PDF Documents\n",
+      "abstract:  Abstract-Tables are widely used in several types of documents since they can bring important information in a structured way. In scientific papers, tables can sum up novel discoveries and summarize experimental results, making the research comparable and easily understandable by scholars. Several methods perform table analysis working on document images, losing useful information during the conversion from the PDF files since OCR tools can be prone to recognition errors, in particular for text inside tables. The main contribution of this work is to tackle the problem of table extraction, exploiting Graph Neural Networks. Node features are enriched with suitably designed representation embeddings. These representations help to better distinguish not only tables from the other parts of the paper, but also table cells from table headers. We experimentally evaluated the proposed approach on a new dataset obtained by merging the information provided in the PubLayNet and PubTables-1M datasets.\n",
+      "        type            subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  6493239332379965773  DOCUMENT          #         en        0.99\n",
+      "1   metadata  2882203704718404424  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  3310489291270722526  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   language  8641064745316605644      TEXT  #/texts/0         en        0.57\n",
+      "4   semantic  8641064745316605644      TEXT  #/texts/0  reference        0.86\n",
+      "5   language  2882203704718404424      TEXT  #/texts/1         en        0.75\n",
+      "6   semantic  2882203704718404424      TEXT  #/texts/1     header        0.87\n",
+      "7   language   776884144777806374      TEXT  #/texts/2         en        0.45\n",
+      "8   semantic   776884144777806374      TEXT  #/texts/2  meta-data        1.00\n",
+      "9   language   520288979263302990      TEXT  #/texts/3         en        0.32\n",
+      "10  semantic   520288979263302990      TEXT  #/texts/3  meta-data        0.99\n",
+      "11  language  3310489291270722526      TEXT  #/texts/4         en        0.92\n",
       "2305.02549.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  The recent advent of self-supervised pretraini...\n",
-      "1      title  #/texts/2  FormNetV2: Multimodal Graph Contrastive Learni...\n",
-      "2     author  #/texts/2                                          Hao Zhang\n",
-      "3     author  #/texts/2                                      Timothy Dozat\n",
-      "4     author  #/texts/2                                      Vincent Perot\n",
-      "5     author  #/texts/2                                         Guolong Su\n",
-      "6     author  #/texts/2                                        Xiang Zhang\n",
-      "7     author  #/texts/2                                   Nikolai Glushnev\n",
-      "8     author  #/texts/2                                       Renshen Wang\n",
-      "9     author  #/texts/2                                     Joshua Ainslie\n",
-      "10    author  #/texts/2                                     Shangbang Long\n",
-      "11    author  #/texts/2                                         Siyang Qin\n",
-      "12    author  #/texts/2                                     Yasuhisa Fujii\n",
-      "13    author  #/texts/2                                            Nan Hua\n",
-      "14    author  #/texts/2                                       Google Cloud\n",
-      "15    author  #/texts/2                                    Google Research\n",
-      "16    author  #/texts/2                                       Google Cloud\n",
+      "title:  FormNetV2: Multimodal Graph Contrastive Learning for Form Document Information Extraction\n",
+      "abstract:  Abstract The recent advent of self-supervised pretraining techniques has led to a surge in the use of multimodal learning in form document understanding. However, existing approaches that extend the mask language modeling to other modalities require careful multitask tuning, complex reconstruction target designs, or additional pre-training data. In Form-NetV2, we introduce a centralized multimodal graph contrastive learning strategy to unify self-supervised pre-training for all modalities in one loss. The graph contrastive objective maximizes the agreement of multimodal representations, providing a natural interplay for all modalities without special customization. In addition, we extract image features within the bounding box that joins a pair of tokens connected by a graph edge, capturing more targeted visual cues without loading a sophisticated and separately pre-trained image embedder. FormNetV2 establishes new state-of-theart performance on FUNSD, CORD, SROIE and Payment benchmarks with a more compact model size.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6435374954406078525  DOCUMENT          #         en        0.99\n",
+      "1   metadata   5772810757726285982  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1818879276920072344  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  16352104498896647424  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   8132802399624216254      TEXT  #/texts/0         en        0.31\n",
+      "5   semantic   8132802399624216254      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   5772810757726285982      TEXT  #/texts/1         en        0.62\n",
+      "7   semantic   5772810757726285982      TEXT  #/texts/1     header        0.68\n",
+      "8   language   2314371849884569015      TEXT  #/texts/2         en        0.26\n",
+      "9   semantic   2314371849884569015      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   1818879276920072344      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic   1818879276920072344      TEXT  #/texts/3     header        0.93\n",
       "2104.02416.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Generative models able to synthesize layouts o...\n",
-      "1     title  #/texts/2  Variational Transformer Networks for Layout Ge...\n",
-      "2    author  #/texts/2                                Diego Martin Arroyo\n",
-      "3    author  #/texts/4                                      Janis Postels\n",
+      "title:  Variational Transformer Networks for Layout Generation\n",
+      "abstract:  Abstract Generative models able to synthesize layouts of different kinds (e.g. documents, user interfaces or furniture arrangements) are a useful tool to aid design processes and as a first step in the generation of synthetic data, among other tasks. We exploit the properties of self-attention layers to capture high level relationships between elements in a layout, and use these as the building blocks of the well-known Variational Autoencoder (VAE) formulation. Our proposed Variational Transformer Network (VTN) is capable of learning margins, alignments and other global design rules without explicit supervision. Layouts sampled from our model have a high degree of resemblance to the training data, while demonstrating appealing diversity. In an extensive evaluation on publicly available benchmarks for different layout types VTNs achieve state-of-the-art diversity and perceptual quality. Additionally, we show the capabilities of this method as part of a document layout detection pipeline.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4026452083066056880  DOCUMENT          #         en        1.00\n",
+      "1   metadata   8211240174221100152  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  18116077653495279146  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  12556626668984946601  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   6185815998567946240      TEXT  #/texts/0         en        0.55\n",
+      "5   semantic   6185815998567946240      TEXT  #/texts/0       text        0.69\n",
+      "6   language   8211240174221100152      TEXT  #/texts/1         en        0.60\n",
+      "7   semantic   8211240174221100152      TEXT  #/texts/1     header        0.95\n",
+      "8   language   7735061465405094816      TEXT  #/texts/2         en        0.29\n",
+      "9   semantic   7735061465405094816      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   9235665801254691886      TEXT  #/texts/3         fr        0.11\n",
+      "11  semantic   9235665801254691886      TEXT  #/texts/3  meta-data        0.99\n",
       "2206.01062.pdf\n",
-      "  subtype  subj_path                                               name\n",
-      "0   title  #/texts/2  DocLayNet: A Large Human-Annotated Dataset for...\n",
-      "1  author  #/texts/2                                   Birgit Pfitzmann\n",
-      "2  author  #/texts/2                               Research Rueschlikon\n",
-      "3  author  #/texts/3                                     Christoph Auer\n",
-      "4  author  #/texts/4                               Research Rueschlikon\n",
-      "5  author  #/texts/5                               Research Rueschlikon\n",
-      "6  author  #/texts/6                                        Peter Staar\n",
-      "7  author  #/texts/6                               Research Rueschlikon\n",
+      "title:  DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis\n",
+      "abstract:  ABSTRACT\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10617426972143493900  DOCUMENT          #         en        1.00\n",
+      "1   metadata   6692027461717503948  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  12275254655398075866  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   language   9792106968482800026      TEXT  #/texts/0         en        0.45\n",
+      "4   semantic   9792106968482800026      TEXT  #/texts/0  reference        0.95\n",
+      "5   language   6692027461717503948      TEXT  #/texts/1         en        0.52\n",
+      "6   semantic   6692027461717503948      TEXT  #/texts/1     header        0.90\n",
+      "7   language  16365913279299628941      TEXT  #/texts/2         de        0.28\n",
+      "8   semantic  16365913279299628941      TEXT  #/texts/2  meta-data        0.93\n",
+      "9   language   1613519574664415958      TEXT  #/texts/3         de        0.56\n",
+      "10  semantic   1613519574664415958      TEXT  #/texts/3  meta-data        0.97\n",
+      "11  language  17669899987897193800      TEXT  #/texts/4         en        0.37\n",
       "2212.09621.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Unsupervised pre-training on millions of digit...\n",
-      "1      title  #/texts/2  WUKONG-READER: Multi-modal Pre-training for Fi...\n",
-      "2     author  #/texts/2                                          Haoli Bai\n",
-      "3     author  #/texts/2                                       Zhiguang Liu\n",
-      "4     author  #/texts/2                                       Xiaojun Meng\n",
-      "5     author  #/texts/2                                          Wentao Li\n",
-      "6     author  #/texts/2                                         Shuang Liu\n",
-      "7     author  #/texts/2                                           Nian Xie\n",
-      "8     author  #/texts/2                                       Rongfu Zheng\n",
-      "9     author  #/texts/2                                      Liangwei Wang\n",
-      "10    author  #/texts/2                                             Lu Hou\n",
-      "11    author  #/texts/2                                      Jiansheng Wei\n",
-      "12    author  #/texts/2                                          Xin Jiang\n",
-      "13    author  #/texts/2                                Qun Liu Huawei Noah\n",
-      "14    author  #/texts/2                                            Ark Lab\n",
+      "title:  WUKONG-READER: Multi-modal Pre-training for Fine-grained Visual Document Understanding\n",
+      "abstract:  Abstract Unsupervised pre-training on millions of digital-born or scanned documents has shown promising advances in visual document understanding (VDU). While various visionlanguage pre-training objectives are studied in existing solutions, the document textline, as an intrinsic granularity in VDU, has seldom been explored so far. A document textline usually contains words that are spatially and semantically correlated, which can be easily obtained from OCR engines. In this paper, we propose WUKONG-READER, trained with new pre-training objectives to leverage the structural knowledge nested in document textlines. We introduce textline-region contrastive learning to achieve fine-grained alignment between the visual regions and texts of document textlines. Furthermore, masked region modeling and textline-grid matching are also designed to enhance the visual and layout representations of textlines. Experiments show that our WUKONG-READER has superior performance on various VDU tasks such as information extraction. The fine-grained alignment over textlines also empowers WUKONG-READER with promising localization ability.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    622493043892949961  DOCUMENT          #         en        0.99\n",
+      "1   metadata   7134492898741291181  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  10292335078860401705  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata     30206745668943456  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language  14162084248726860593      TEXT  #/texts/0         en        0.49\n",
+      "5   semantic  14162084248726860593      TEXT  #/texts/0       text        0.56\n",
+      "6   language   7134492898741291181      TEXT  #/texts/1         en        0.68\n",
+      "7   semantic   7134492898741291181      TEXT  #/texts/1     header        0.92\n",
+      "8   language   9121673767507891652      TEXT  #/texts/2         en        0.30\n",
+      "9   semantic   9121673767507891652      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language  10292335078860401705      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic  10292335078860401705      TEXT  #/texts/3     header        0.93\n",
       "2309.09506.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Graphic layout generation, a growing research ...\n",
-      "1     title  #/texts/2  LAYOUTNUWA: REVEALING THE HIDDEN LAYOUT EXPERT...\n",
-      "2    author  #/texts/2                                       Zecheng Tang\n",
-      "3    author  #/texts/2                                         Chenfei Wu\n",
-      "4    author  #/texts/2                                          Juntao Li\n",
-      "5    author  #/texts/2                                           Nan Duan\n",
-      "6    author  #/texts/2                                 Soochow University\n",
-      "7    author  #/texts/2                            Microsoft Research Asia\n",
+      "title:  LAYOUTNUWA: REVEALING THE HIDDEN LAYOUT EXPERTISE OF LARGE LANGUAGE MODELS\n",
+      "abstract:  ABSTRACT Graphic layout generation, a growing research field, plays a significant role in user engagement and information perception. Existing methods primarily treat layout generation as a numerical optimization task, focusing on quantitative aspects while overlooking the semantic information of layout, such as the relationship between each layout element. In this paper, we propose LayoutNUWA, the first model that treats layout generation as a code generation task to enhance semantic information and harnesses the hidden layout expertise of large language models (LLMs). More concretely, we develop a Code Instruct Tuning (CIT) approach comprising three interconnected modules: 1) the Code Initialization (CI) module quantifies the numerical conditions and initializes them as HTML code with strategically placed masks; 2) the Code Completion (CC) module employs the formatting knowledge of LLMs to fill in the masked portions within the HTML code; 3) the Code Rendering (CR) module transforms the completed code into the final layout output, ensuring a highly interpretable and transparent layout generation procedure that directly maps code to a visualized layout. We attain significant state-of-the-art performance (even over 50% improvements) on multiple datasets, showcasing the strong capabilities of LayoutNUWA. Our code is available at https://github.com/ProjectNUWA/LayoutNUWA.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   1102428998215578833  DOCUMENT          #         en        1.00\n",
+      "1   metadata   2952583967065414989  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1214126413230308130  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  17592137747910693327  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   7724916533818732031      TEXT  #/texts/0         en        0.35\n",
+      "5   semantic   7724916533818732031      TEXT  #/texts/0  reference        0.95\n",
+      "6   language   2952583967065414989      TEXT  #/texts/1         en        0.44\n",
+      "7   semantic   2952583967065414989      TEXT  #/texts/1     header        0.86\n",
+      "8   language   4814086422136468295      TEXT  #/texts/2         en        0.21\n",
+      "9   semantic   4814086422136468295      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   1214126413230308130      TEXT  #/texts/3         en        0.64\n",
+      "11  semantic   1214126413230308130      TEXT  #/texts/3     header        1.00\n",
       "2305.03393.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Extracting tables from documents is ...\n",
-      "1     title  #/texts/2  Optimized Table Tokenization for Table Structu...\n",
-      "2    author  #/texts/2                                       Maksym Lysak\n",
-      "3    author  #/texts/2                                       Ahmed Nassar\n",
-      "4    author  #/texts/3                                Nikolaos Livathinos\n",
-      "5    author  #/texts/3                                     Christoph Auer\n",
-      "6    author  #/texts/4                                        Peter Staar\n",
+      "title:  Optimized Table Tokenization for Table Structure Recognition\n",
+      "abstract:  Abstract. Extracting tables from documents is a crucial task in any document conversion pipeline. Recently, transformer-based models have demonstrated that table-structure can be recognized with impressive accuracy using Image-to-Markup-Sequence (Im2Seq) approaches. Taking only the image of a table, such models predict a sequence of tokens (e.g. in HTML, LaTeX) which represent the structure of the table. Since the token representation of the table structure has a significant impact on the accuracy and run-time performance of any Im2Seq model, we investigate in this paper how table-structure representation can be optimised. We propose a new, optimised table-structure language (OTSL) with a minimized vocabulary and specific rules. The benefits of OTSL are that it reduces the number of tokens to 5 (HTML needs 28+) and shortens the sequence length to half of HTML on average. Consequently, model accuracy improves significantly, inference time is halved compared to HTML-based models, and the predicted table structures are always syntactically correct. This in turn eliminates most post-processing needs. Popular table structure data-sets will be published in OTSL format to the community. Keywords: Table Structure Recognition · Data Representation · Transformers · Optimization.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5374198401597601690  DOCUMENT          #         en        1.00\n",
+      "1   metadata   5888487835083259627  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1264022019336188589  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata   5349896953270355381  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language  15896613119077796986      TEXT  #/texts/0         en        0.62\n",
+      "5   semantic  15896613119077796986      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   5888487835083259627      TEXT  #/texts/1         en        0.74\n",
+      "7   semantic   5888487835083259627      TEXT  #/texts/1       text        0.58\n",
+      "8   language  13588719527350604669      TEXT  #/texts/2         en        0.31\n",
+      "9   semantic  13588719527350604669      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language     51599421743934370      TEXT  #/texts/3         en        0.24\n",
+      "11  semantic     51599421743934370      TEXT  #/texts/3  meta-data        1.00\n",
       "2206.00785.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Document understanding is a key busin...\n",
-      "1     title  #/texts/2  Delivering Document Conversion as a Cloud Serv...\n",
-      "2    author  #/texts/2                                     Christoph Auer\n",
-      "3    author  #/texts/2                                Research Ruschlikon\n",
-      "4    author  #/texts/3                                Research Ruschlikon\n",
-      "5    author  #/texts/4                                      Michele Dolfi\n",
-      "6    author  #/texts/4                                Research Ruschlikon\n",
-      "7    author  #/texts/5                                            J Staar\n",
-      "8    author  #/texts/5                                Research Ruschlikon\n",
+      "title:  Delivering Document Conversion as a Cloud Service with High Throughput and Responsiveness\n",
+      "abstract:  Abstract-Document understanding is a key business process in the data-driven economy since documents are central to knowledge discovery and business insights. Converting documents into a machine-processable format is a particular challenge here due to their huge variability in formats and complex structure. Accordingly, many algorithms and machine-learning methods emerged to solve particular tasks such as Optical Character Recognition (OCR), layout analysis, table-structure recovery, figure understanding, etc. We observe the adoption of such methods in document understanding solutions offered by all major cloud providers. Yet, publications outlining how such services are designed and optimized to scale in the cloud are scarce. In this paper, we focus on the case of document conversion to illustrate the particular challenges of scaling a complex data processing pipeline with a strong reliance on machine-learning methods on cloud infrastructure. Our key objective is to achieve high scalability and responsiveness for different workload profiles in a well-defined resource budget. We outline the requirements, design, and implementation choices of our document conversion service and reflect on the challenges we faced. Evidence for the scaling behavior and resource efficiency is provided for two alternative workload distribution strategies and deployment configurations. Our best-performing method achieves sustained throughput of over one million PDF pages per hour on 3072 CPU cores across 192 nodes. Index Terms-cloud applications, document understanding, distributed computing, artificial intelligence\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13152862009447693765  DOCUMENT          #         en        1.00\n",
+      "1   metadata   8967552455475999131  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15035726207261556942  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   4662798960261328447  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  17724327985223046044      TEXT  #/texts/0         en        0.37\n",
+      "5   semantic  17724327985223046044      TEXT  #/texts/0  reference        0.86\n",
+      "6   language   8967552455475999131      TEXT  #/texts/1         en        0.91\n",
+      "7   semantic   8967552455475999131      TEXT  #/texts/1     header        0.73\n",
+      "8   language  10556124696351850413      TEXT  #/texts/2         en        0.36\n",
+      "9   semantic  10556124696351850413      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  18140503323381183580      TEXT  #/texts/3         en        0.31\n",
+      "11  semantic  18140503323381183580      TEXT  #/texts/3  meta-data        0.99\n",
       "2108.01249.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  unified manner, 2) how to formulate the learni...\n",
-      "1     title  #/texts/2                          Kota Yamaguchi CyberAgent\n",
+      "title:  Kota Yamaguchi CyberAgent\n",
+      "abstract:  Abstract unified manner, 2) how to formulate the learning problem, and 3) how to evaluate the quality of documents. Vector graphic documents present visual elements in a resolution free, compact format and are often seen in creative applications. In this work, we attempt to learn a generative model of vector graphic documents. We define vector graphic documents by a multi-modal set of attributes associated to a canvas and a sequence of visual elements such as shapes, images, or texts, and train variational autoencoders to learn the representation of the documents. We collect a new dataset of design templates from an online service that features complete document structure including occluded elements. In experiments, we show that our model, named CanvasVAE, constitutes a strong baseline for generative modeling of vector graphic documents.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11544672061719147914  DOCUMENT          #         en        0.99\n",
+      "1   metadata   1865621952677141032  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   9499241547212596575  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  17505636629490201581  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   metadata   5018213406211262976  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "5   language  14555891895603680624      TEXT  #/texts/0         en        0.64\n",
+      "6   semantic  14555891895603680624      TEXT  #/texts/0     header        0.67\n",
+      "7   language   1865621952677141032      TEXT  #/texts/1         fr        0.35\n",
+      "8   semantic   1865621952677141032      TEXT  #/texts/1  reference        0.58\n",
+      "9   language   1076108524886774303      TEXT  #/texts/2         en        0.11\n",
+      "10  semantic   1076108524886774303      TEXT  #/texts/2  meta-data        0.95\n",
+      "11  language   9499241547212596575      TEXT  #/texts/3         en        0.32\n",
       "2106.15117.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We present a novel data generation tool for do...\n",
-      "1     title  #/texts/2  SDL: NEW DATA GENERATION TOOLS FOR FULL-LEVEL ...\n",
-      "2    author  #/texts/2                                  Nguyen Truong Son\n",
+      "title:  SDL: NEW DATA GENERATION TOOLS FOR FULL-LEVEL ANNOTATED DOCUMENT LAYOUT\n",
+      "abstract:  ABSTRACT We present a novel data generation tool for document processing. The tool focuses on providing maximal level of visual information in a normal type document, ranging from character position to paragraph-level position. It also enables working with a large dataset on low-resource languages as well as providing a mean of processing thorough full-level information of documented text. The data generation tools come with a dataset of 320000 Vietnamese synthetic document images and an instruction to generate a dataset of similar size on other languages. The repository can be found at: https://github.com/tson1997/SDL-Document-Image-Generation\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  12852606380513963941  DOCUMENT          #         en        1.00\n",
+      "1   metadata  16022826563161339384  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3623797934402135081  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  11818793719691083935  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  13076215469834646960      TEXT  #/texts/0         en        0.31\n",
+      "5   semantic  13076215469834646960      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  16022826563161339384      TEXT  #/texts/1         en        0.56\n",
+      "7   semantic  16022826563161339384      TEXT  #/texts/1     header        0.94\n",
+      "8   language   8551711514533328919      TEXT  #/texts/2         en        0.27\n",
+      "9   semantic   8551711514533328919      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  16028837443172790694      TEXT  #/texts/3         en        0.64\n",
+      "11  semantic  16028837443172790694      TEXT  #/texts/3  meta-data        0.97\n",
       "2303.13839.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  The problem of document structure reconstructi...\n",
-      "1     title  #/texts/2  HRDoc: Dataset and Baseline Method toward Hier...\n",
-      "2    author  #/texts/2                                         Jiefeng Ma\n",
-      "3    author  #/texts/2                                             Jun Du\n",
-      "4    author  #/texts/2                                         Pengfei Hu\n",
-      "5    author  #/texts/2                                     Zhenrong Zhang\n",
-      "6    author  #/texts/2                                      Jianshu Zhang\n",
-      "7    author  #/texts/2                                         Huihui Zhu\n",
-      "8    author  #/texts/2                                           Cong Liu\n",
+      "title:  HRDoc: Dataset and Baseline Method toward Hierarchical Reconstruction of Document Structures\n",
+      "abstract:  Abstract The problem of document structure reconstruction refers to converting digital or scanned documents into corresponding semantic structures. Most existing works mainly focus on splitting the boundary of each element in a single document page, neglecting the reconstruction of semantic structure in multi-page documents. This paper introduces hierarchical reconstruction of document structures as a novel task suitable for NLP and CV fields. To better evaluate the system performance on the new task, we built a large-scale dataset named HRDoc, which consists of 2,500 multi-page documents with nearly 2 million semantic units. Every document in HRDoc has line-level annotations including categories and relations obtained from rule-based extractors and human annotators. Moreover, we proposed an encoder-decoder-based hierarchical document structure parsing system (DSPS) to tackle this problem. By adopting a multi-modal bidirectional encoder and a structure-aware GRU decoder with soft-mask operation, the DSPS model surpass the baseline method by a large margin. All scripts and datasets will be made publicly available at https://github.com/jfma-USTC/HRDoc.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17037468589192894036  DOCUMENT          #         en        0.98\n",
+      "1   metadata   1104942195736734785  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  10718753664355590567  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   7728974788577724069  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  14970080583824688175      TEXT  #/texts/0         en        0.19\n",
+      "5   semantic  14970080583824688175      TEXT  #/texts/0       text        0.99\n",
+      "6   language   1104942195736734785      TEXT  #/texts/1         en        0.57\n",
+      "7   semantic   1104942195736734785      TEXT  #/texts/1     header        0.81\n",
+      "8   language   4710658602481414826      TEXT  #/texts/2         en        0.33\n",
+      "9   semantic   4710658602481414826      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   6306836046963688445      TEXT  #/texts/3         en        0.71\n",
+      "11  semantic   6306836046963688445      TEXT  #/texts/3  meta-data        0.93\n",
       "2004.08686.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Deep learning-based approaches for automatic d...\n",
-      "1     title  #/texts/2  A Large Dataset of Historical Japanese Documen...\n",
-      "2    author  #/texts/2            Zejiang Shen Kaixuan Zhang Melissa Dell\n",
-      "3    author  #/texts/3                                 Harvard University\n",
+      "title:  A Large Dataset of Historical Japanese Documents with Complex Layouts\n",
+      "abstract:  Abstract Deep learning-based approaches for automatic document layout analysis and content extraction have the potential to unlock rich information trapped in historical documents on a large scale. One major hurdle is the lack of large datasets for training robust models. In particular, little training data exist for Asian languages. To this end, we present HJDataset, a Large Dataset of H istorical J apanese Documents with Complex Layouts. It contains over 250,000 layout element annotations of seven types. In addition to bounding boxes and masks of the content regions, it also includes the hierarchical structures and reading orders for layout elements. The dataset is constructed using a combination of human and machine efforts. A semi-rule based method is developed to extract the layout elements, and the results are checked by human inspectors. The resulting large-scale dataset is used to provide baseline performance analyses for text region detection using state-of-the-art deep learning models. And we demonstrate the usefulness of the dataset on real-world document digitization tasks. The dataset is available at https://dell-research-harvard. github.io/HJDataset/.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13930001176231161878  DOCUMENT          #         en        1.00\n",
+      "1   metadata   7793786044407703107  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   8981345721148683510  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata    849097712286451589  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata   6803870968619498453  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   language  13458486629839261648      TEXT  #/texts/0         en        0.39\n",
+      "6   semantic  13458486629839261648      TEXT  #/texts/0  reference        0.91\n",
+      "7   language   7793786044407703107      TEXT  #/texts/1         en        0.84\n",
+      "8   semantic   7793786044407703107      TEXT  #/texts/1     header        0.84\n",
+      "9   language  16464535545893033410      TEXT  #/texts/2         ca        0.24\n",
+      "10  semantic  16464535545893033410      TEXT  #/texts/2  meta-data        1.00\n",
+      "11  language  12942928698614636971      TEXT  #/texts/3         en        0.44\n",
       "2305.10448.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  This paper presents GenDoc, a general sequence...\n",
-      "1     title  #/texts/2  Sequence-to-Sequence Pre-training with Unified...\n",
-      "2    author  #/texts/2                                        Shuwei Feng\n",
-      "3    author  #/texts/2                                      Tianyang Zhan\n",
-      "4    author  #/texts/2          Zhanming Jie Trung Quoc Luong Xiaoran Jin\n",
+      "title:  Sequence-to-Sequence Pre-training with Unified Modality Masking for Visual Document Understanding\n",
+      "abstract:  Abstract This paper presents GenDoc, a general sequence-to-sequence document understanding model pre-trained with unified masking across three modalities: text, image, and layout. The proposed model utilizes an encoderdecoder architecture, which allows for increased adaptability to a wide range of downstream tasks with diverse output formats, in contrast to the encoder-only models commonly employed in document understanding. In addition to the traditional text infilling task used in previous encoder-decoder models, our pre-training extends to include tasks of masked image token prediction and masked layout prediction. We also design modalityspecific instruction and adopt both disentangled attention and the mixture-of-modalityexperts strategy to effectively capture the information leveraged by each modality. Evaluation of the proposed model through extensive experiments on several downstream tasks in document understanding demonstrates its ability to achieve superior or competitive performance compared to state-of-the-art approaches. Our analysis further suggests that GenDoc is more robust than the encoder-only models in scenarios where the OCR quality is imperfect.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6647708032236096662  DOCUMENT          #         en        1.00\n",
+      "1   metadata   7053189837528699617  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  17614263210074065046  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  14376828063070828111  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   1081297291117528818      TEXT  #/texts/0         en        0.68\n",
+      "5   semantic   1081297291117528818      TEXT  #/texts/0       text        0.83\n",
+      "6   language   7053189837528699617      TEXT  #/texts/1         en        0.79\n",
+      "7   semantic   7053189837528699617      TEXT  #/texts/1     header        0.91\n",
+      "8   language  15653723734524768695      TEXT  #/texts/2         en        0.42\n",
+      "9   semantic  15653723734524768695      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   3724175546549476443      TEXT  #/texts/3         de        0.47\n",
+      "11  semantic   3724175546549476443      TEXT  #/texts/3  meta-data        1.00\n",
       "2108.09433.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Precise boundary annotations of imag...\n",
-      "1     title  #/texts/5                                                  )\n",
+      "title:  )\n",
+      "abstract:  Abstract. Precise boundary annotations of image regions can be crucial for downstream applications which rely on region-class semantics. Some document collections contain densely laid out, highly irregular and overlapping multi-class region instances with large range in aspect ratio. Fully automatic boundary estimation approaches tend to be data intensive, cannot handle variable-sized images and produce sub-optimal results for aforementioned images. To address these issues, we propose BoundaryNet, a novel resizing-free approach for high-precision semi-automatic layout annotation. The variable-sized user selected region of interest is first processed by an attention-guided skip network. The network optimization is guided via Fast Marching distance maps to obtain a good quality initial boundary estimate and an associated feature representation. These outputs are processed by a Residual Graph Convolution Network optimized using Hausdorff loss to obtain the final region boundary. Results on a challenging image manuscript dataset demonstrate that BoundaryNet outperforms strong baselines and produces high-quality semantic region boundaries. Qualitatively, our approach generalizes across multiple document image datasets containing different script systems and layouts, all without additional fine-tuning. We integrate BoundaryNet into a document annotation system and show that it provides high annotation throughput compared to manual and fully automatic alternatives. Keywords: document layout analysis · interactive · deep learning\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7286648885983317681  DOCUMENT          #         en        0.97\n",
+      "1   metadata   6891813752407437231  DOCUMENT  #/texts/4      title        1.00\n",
+      "2   metadata   1285909825609735560  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   1594751657300697312  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  17989209837640758744      TEXT  #/texts/0         en        0.45\n",
+      "5   semantic  17989209837640758744      TEXT  #/texts/0  reference        0.86\n",
+      "6   language  18061687588185802103      TEXT  #/texts/1         en        0.60\n",
+      "7   semantic  18061687588185802103      TEXT  #/texts/1     header        0.97\n",
+      "8   language  16389516399927816228      TEXT  #/texts/2         en        0.53\n",
+      "9   semantic  16389516399927816228      TEXT  #/texts/2  reference        0.98\n",
+      "10  language   6891813752407437230      TEXT  #/texts/3         en        0.99\n",
+      "11  semantic   6891813752407437230      TEXT  #/texts/3  reference        1.00\n",
       "2203.15143.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Scene text detection and document layout analy...\n",
-      "1     title  #/texts/2  Towards End-to-End Unified Scene Text Detectio...\n",
-      "2    author  #/texts/2                                     Shangbang Long\n",
-      "3    author  #/texts/2                                         Siyang Qin\n",
-      "4    author  #/texts/2                                   Dmitry Panteleev\n",
-      "5    author  #/texts/2                                Alessandro Bissacco\n",
-      "6    author  #/texts/2                                     Yasuhisa Fujii\n",
-      "7    author  #/texts/2                    Michalis Raptis Google Research\n",
+      "title:  Towards End-to-End Unified Scene Text Detection and Layout Analysis\n",
+      "abstract:  Abstract Scene text detection and document layout analysis have long been treated as two separate tasks in different image domains. In this paper, we bring them together and introduce the task of unified scene text detection and layout analysis. The first hierarchical scene text dataset is introduced to enable this novel research task. We also propose a novel method that is able to simultaneously detect scene text and form text clusters in a unified way. Comprehensive experiments show that our unified model achieves better performance than multiple well-designed baseline methods. Additionally, this model achieves stateof-the-art results on multiple scene text detection datasets without the need of complex post-processing. Dataset and code: https://github.com/google-research- datasets/hiertext.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16210526061993160862  DOCUMENT          #         en        1.00\n",
+      "1   metadata  12075476996982694064  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14820214027740521356  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   5609833535106633893  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata  18311416795872171667  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   language   2157359505453830444      TEXT  #/texts/0         en        0.30\n",
+      "6   semantic   2157359505453830444      TEXT  #/texts/0       text        0.89\n",
+      "7   language  12075476996982694064      TEXT  #/texts/1         en        0.72\n",
+      "8   semantic  12075476996982694064      TEXT  #/texts/1     header        0.92\n",
+      "9   language   6712749907627824000      TEXT  #/texts/2         en        0.29\n",
+      "10  semantic   6712749907627824000      TEXT  #/texts/2  meta-data        0.99\n",
+      "11  language   6560726583765693241      TEXT  #/texts/3         en        0.09\n",
       "2212.09877.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Graphic layout designs play an essential role ...\n",
-      "1      title  #/texts/2  LayoutDETR: Detection Transformer Is a Good Mu...\n",
-      "2     author  #/texts/2                                            Ning Yu\n",
-      "3     author  #/texts/2                                        Zeyuan Chen\n",
-      "4     author  #/texts/2                                           Rui Meng\n",
-      "5     author  #/texts/2                                            Gang Wu\n",
-      "6     author  #/texts/3                                         Paul Josel\n",
-      "7     author  #/texts/3                                Juan Carlos Niebles\n",
-      "8     author  #/texts/3                                      Caiming Xiong\n",
-      "9     author  #/texts/3                                             Ran Xu\n",
-      "10    author  #/texts/4                                Salesforce Research\n",
+      "title:  LayoutDETR: Detection Transformer Is a Good Multimodal Layout Designer\n",
+      "abstract:  Abstract Graphic layout designs play an essential role in visual communication. Yet handcrafting layout designs are skilldemanding, time-consuming, and non-scalable to batch production. Although generative models emerge to make design automation no longer utopian, it remains non-trivial to customize designs that comply with designers' multimodal desires, i.e., constrained by background images and driven by foreground contents. In this study, we propose LayoutDETR that inherits the high quality and realism from generative modeling, in the meanwhile reformulating content-aware requirements as a detection problem: we learn to detect in a background image the reasonable locations, scales, and spatial relations for multimodal elements in a layout. Experiments validate that our solution yields new state-of-the-art performance for layout generation on public benchmarks and on our newly-curated ads banner dataset. For practical usage, we build our solution into a graphical system that facilitates user studies. We demonstrate that our designs attract more subjective preference than baselines by significant margins. Our code, models, dataset, graphical system, and demos are available at GitHub.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10084390588055568504  DOCUMENT          #         en        1.00\n",
+      "1   metadata  11994464921092174700  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7850596257863206924  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  17827466626641903476  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  10295852222025669617      TEXT  #/texts/0         en        0.53\n",
+      "5   semantic  10295852222025669617      TEXT  #/texts/0  reference        0.89\n",
+      "6   language  11994464921092174700      TEXT  #/texts/1         en        0.54\n",
+      "7   semantic  11994464921092174700      TEXT  #/texts/1     header        0.80\n",
+      "8   language  16518229938452596987      TEXT  #/texts/2         en        0.51\n",
+      "9   semantic  16518229938452596987      TEXT  #/texts/2  meta-data        0.97\n",
+      "10  language   8884254896872574636      TEXT  #/texts/3         en        0.21\n",
+      "11  semantic   8884254896872574636      TEXT  #/texts/3  meta-data        1.00\n",
       "2303.10787.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We develop a diffusion-based approac...\n",
-      "1     title  #/texts/2         Diffusion-based Document Layout Generation\n",
-      "2    author  #/texts/2                                          Yijuan Lu\n",
-      "3    author  #/texts/2                                       John Corring\n",
-      "4    author  #/texts/2                                    Dinei Florencio\n",
-      "5    author  #/texts/2                                          Cha Zhang\n",
-      "6    author  #/texts/3                                  Purdue University\n",
-      "7    author  #/texts/3                                     West Lafayette\n",
-      "8    author  #/texts/4                                    Microsoft Cloud\n",
+      "title:  Diffusion-based Document Layout Generation\n",
+      "abstract:  Abstract. We develop a diffusion-based approach for various document layout sequence generation. Layout sequences specify the contents of a document design in an explicit format. Our novel diffusion-based approach works in the sequence domain rather than the image domain in order to permit more complex and realistic layouts. We also introduce a new metric, Document Earth Mover's Distance (Doc-EMD). By considering similarity between heterogeneous categories document designs, we handle the shortcomings of prior document metrics that only evaluate the same category of layouts. Our empirical analysis shows that our diffusion-based approach is comparable to or outperforming other previous methods for layout generation across various document datasets. Moreover, our metric is capable of differentiating documents better than previous metrics for specific cases. Keywords: Structured document generation · Document layout · Diffusion methods · Generative models.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9758273175405515723  DOCUMENT          #         en        0.98\n",
+      "1   metadata  16052403849674172506  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2448372797569194825  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   9845471528146357593  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   7017913058660137428      TEXT  #/texts/0         en        0.24\n",
+      "5   semantic   7017913058660137428      TEXT  #/texts/0       text        0.89\n",
+      "6   language  16052403849674172506      TEXT  #/texts/1         en        0.37\n",
+      "7   semantic  16052403849674172506      TEXT  #/texts/1     header        0.96\n",
+      "8   language   6476430098925505246      TEXT  #/texts/2         en        0.49\n",
+      "9   semantic   6476430098925505246      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  12218180187596295929      TEXT  #/texts/3         en        0.56\n",
+      "11  semantic  12218180187596295929      TEXT  #/texts/3  meta-data        0.97\n",
       "2201.11438.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Understanding documents with rich la...\n",
-      "1     title  #/texts/2  DocSegTr: An Instance-Level End-to-End Documen...\n",
-      "2    author  #/texts/2                                      Sanket Biswas\n",
-      "3    author  #/texts/2                                      Ayan Banerjee\n",
-      "4    author  #/texts/2                                         Josep Llad\n",
-      "5    author  #/texts/3                             Computer Vision Center\n",
-      "6    author  #/texts/3                        Computer Science Department\n",
-      "7    author  #/texts/4                                    Universitat Aut\n",
+      "title:  DocSegTr: An Instance-Level End-to-End Document Image Segmentation Transformer\n",
+      "abstract:  Abstract. Understanding documents with rich layouts is an essential step towards information extraction. Business intelligence processes often require the extraction of useful semantic content from documents at a large scale for subsequent decision-making tasks. In this context, instance-level segmentation of different document objects(title, sections, figures, tables and so on) has emerged as an interesting problem for the document layout analysis community. To advance the research in this direction, we present a transformer-based model for end-to-end segmentation of complex layouts in document images. To our knowledge, this is the first work on transformer-based document segmentation. Extensive experimentation on the PubLayNet dataset shows that our model achieved comparable or better segmentation performance than the existing state-of-the-art approaches. We hope our simple and flexible framework could serve as a promising baseline for instance-level recognition tasks in document images. Keywords: Document Layout Analysis · Instance-Level Segmentation · Transformers · Information extraction\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    208221823398909622  DOCUMENT          #         en        0.97\n",
+      "1   metadata  14220025198384659792  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata    820982994215739284  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  15767495618934522204  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language  11806661338889905799      TEXT  #/texts/0         en        0.17\n",
+      "5   semantic  11806661338889905799      TEXT  #/texts/0       text        0.88\n",
+      "6   language  14220025198384659792      TEXT  #/texts/1         en        0.51\n",
+      "7   semantic  14220025198384659792      TEXT  #/texts/1     header        0.89\n",
+      "8   language   8018994052576249341      TEXT  #/texts/2         en        0.36\n",
+      "9   semantic   8018994052576249341      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   3272490091820520194      TEXT  #/texts/3         en        0.68\n",
+      "11  semantic   3272490091820520194      TEXT  #/texts/3  meta-data        0.99\n",
       "2211.08863.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Infographics are often an integral component o...\n",
-      "1     title  #/texts/2  ChartParser: Automatic Chart Parsing for Print...\n",
-      "2    author  #/texts/3                                     Anukriti Kumar\n",
-      "3    author  #/texts/3                                        Tanuja Ganu\n",
-      "4    author  #/texts/3                                        Saikat Guha\n",
-      "5    author  #/texts/4                                 Microsoft Research\n",
+      "title:  ChartParser: Automatic Chart Parsing for Print-Impaired\n",
+      "abstract:  Abstract Infographics are often an integral component of scientific documents for reporting qualitative or quantitative findings as they make it much simpler to comprehend the underlying complex information. However, their interpretation continues to be a challenge for the blind, low-vision, and other print-impaired (BLV) individuals. In this paper, we propose ChartParser, a fully automated pipeline that leverages deep learning, OCR, and image processing techniques to extract all figures from a research paper, classify them into various chart categories (bar chart, line chart, etc.) and obtain relevant information from them, specifically bar charts (including horizontal, vertical, stacked horizontal and stacked vertical charts) which already have several exciting challenges. Finally, we present the retrieved content in a tabular format that is screen-reader friendly and accessible to the BLV users. We present a thorough evaluation of our approach by applying our pipeline to sample real-world annotated bar charts from research papers.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9575898009567312608  DOCUMENT          #         en        1.00\n",
+      "1   metadata  13385598999438028549  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2004079478680832734  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   7547314663753897690  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  15463886055130069472      TEXT  #/texts/0         en        0.37\n",
+      "5   semantic  15463886055130069472      TEXT  #/texts/0  reference        0.91\n",
+      "6   language  13385598999438028549      TEXT  #/texts/1         en        0.58\n",
+      "7   semantic  13385598999438028549      TEXT  #/texts/1  reference        0.51\n",
+      "8   language  17622143137064393226      TEXT  #/texts/2         en        0.20\n",
+      "9   semantic  17622143137064393226      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   1447479644763299756      TEXT  #/texts/3         en        0.47\n",
+      "11  semantic   1447479644763299756      TEXT  #/texts/3  meta-data        0.58\n",
       "2111.05736.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Nowadays, metadata information is often given ...\n",
-      "1     title  #/texts/2  Multimodal Approach for Metadata Extraction fr...\n",
-      "2    author  #/texts/2                               Azeddine Bouabdallah\n",
-      "3    author  #/texts/3                                      Jorge Gavilan\n",
-      "4    author  #/texts/4                                     Jennifer Gerbl\n",
-      "5    author  #/texts/5                            Prayuth Patumcharoenpol\n",
+      "title:  Multimodal Approach for Metadata Extraction from German Scientific Publications\n",
+      "abstract:  Abstract Nowadays, metadata information is often given by the authors themselves upon submission. However, a significant part of already existing research papers have missing or incomplete metadata information. German scientific papers come in a large variety of layouts which makes the extraction of metadata a non-trivial task that requires a precise way to classify the metadata extracted from the documents. In this paper, we propose a multimodal deep learning approach for metadata extraction from scientific papers in the German language. We consider multiple types of input data by combining natural language processing and image vision processing. This model aims to increase the overall accuracy of metadata extraction compared to other state-of-the-art approaches. It enables the utilization of both spatial and contextual features in order to achieve a more reliable extraction. Our model for this approach was trained on a dataset consisting of around 8800 documents and is able to obtain an overall F1-score of 0.923. CCS Concepts: · Computing methodologies → Information extraction; Computer vision representations; Natural language processing; Supervised learning by classification; · Applied computing → Document metadata. Keywords: natural language processing, computer vision, metadata extraction, deep learning, biLSTM, classification, multimodality\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6214813008397517794  DOCUMENT          #         en        1.00\n",
+      "1   metadata   9336427579926614669  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   8542053257928421913  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  10087057927913462578  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   metadata  13952774756382584407  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "5   metadata   3716015961100721764  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "6   language   9086007843929612395      TEXT  #/texts/0         en        0.54\n",
+      "7   semantic   9086007843929612395      TEXT  #/texts/0  reference        0.51\n",
+      "8   language   9336427579926614669      TEXT  #/texts/1         en        0.81\n",
+      "9   semantic   9336427579926614669      TEXT  #/texts/1     header        0.52\n",
+      "10  language   6158805096068987949      TEXT  #/texts/2         fr        0.66\n",
+      "11  semantic   6158805096068987949      TEXT  #/texts/2  meta-data        1.00\n",
       "2206.10253.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Printed documents continue to be a challenge f...\n",
-      "1    author  #/texts/2  Anukriti Kumar Tanuja Ganu Saikat Guha Microso...\n",
-      "2     title  #/texts/2   Document Navigability: A Need for Print-Impaired\n",
+      "title:  Document Navigability: A Need for Print-Impaired\n",
+      "abstract:  Abstract Printed documents continue to be a challenge for blind, low-vision, and other print-disabled (BLV) individuals. In this paper, we focus on the specific problem of (in-)accessibility of internal references to citations, footnotes, figures, tables and equations. While sighted users can flip to the referenced content and flip back in seconds, linear audio narration that BLV individuals rely on makes following these references extremely hard. We propose a vision based technique to locate the referenced content and extract metadata needed to (in subsequent work) inline a content summary into the audio narration. We apply our technique to citations in scientific documents and find it works well both on born-digital as well as scanned documents.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8367168009336343541  DOCUMENT          #         en        1.00\n",
+      "1   metadata  10890231688151233893  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  16071323955613293918  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  16575161736546407260  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language  12424868080524350218      TEXT  #/texts/0         en        0.41\n",
+      "5   semantic  12424868080524350218      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  10890231688151233893      TEXT  #/texts/1         en        0.53\n",
+      "7   semantic  10890231688151233893      TEXT  #/texts/1  reference        0.49\n",
+      "8   language  15018036349796390670      TEXT  #/texts/2         en        0.55\n",
+      "9   semantic  15018036349796390670      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   2095024526711489732      TEXT  #/texts/3         sr        0.07\n",
+      "11  semantic   2095024526711489732      TEXT  #/texts/3  meta-data        1.00\n",
       "2306.01058.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Recent work has shown that infusing layout fea...\n",
-      "1     title  #/texts/2  Are Layout-Infused Language Models Robust to L...\n",
-      "2    author  #/texts/2                                     Catherine Chen\n",
-      "3    author  #/texts/2                                       Zejiang Shen\n",
-      "4    author  #/texts/2                                          Dan Klein\n",
-      "5    author  #/texts/3                                  Gabriel Stanovsky\n",
-      "6    author  #/texts/3                                        Doug Downey\n",
-      "7    author  #/texts/3                                            Kyle Lo\n",
-      "8    author  #/texts/4                                    Allen Institute\n",
-      "9    author  #/texts/4                            Northwestern University\n",
+      "title:  Are Layout-Infused Language Models Robust to Layout Distribution Shifts? A Case Study with Scientific Documents\n",
+      "abstract:  Abstract Recent work has shown that infusing layout features into language models (LMs) improves processing of visually-rich documents such as scientific papers. Layout-infused LMs are often evaluated on documents with familiar layout features (e.g., papers from the same publisher), but in practice models encounter documents with unfamiliar distributions of layout features, such as new combinations of text sizes and styles, or new spatial configurations of textual elements. In this work, we test whether layoutinfused LMs are robust to layout distribution shifts. As a case study, we use the task of scientific document structure recovery, segmenting a scientific paper into its structural categories (e.g., TITLE, CAPTION, REFERENCE). To emulate distribution shifts that occur in practice, we re-partition the GROTOAP2 dataset. We find that under layout distribution shifts model performance degrades by up to 20 F1. Simple training strategies, such as increasing training diversity, can reduce this degradation by over 35% relative F1; however, models fail to reach in-distribution performance in any tested out-of-distribution conditions. This work highlights the need to consider layout distribution shifts during model evaluation, and presents a methodology for conducting such evaluations. 1\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  12891525512286936503  DOCUMENT          #         en        1.00\n",
+      "1   metadata  17168748629924448588  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11408116613210739775  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata    720193478378176240  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   4461508289038516592      TEXT  #/texts/0         en        0.37\n",
+      "5   semantic   4461508289038516592      TEXT  #/texts/0  reference        0.66\n",
+      "6   language  17168748629924448588      TEXT  #/texts/1         en        0.56\n",
+      "7   semantic  17168748629924448588      TEXT  #/texts/1     header        0.94\n",
+      "8   language  14393954923057164949      TEXT  #/texts/2         en        0.43\n",
+      "9   semantic  14393954923057164949      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  13150667996460248282      TEXT  #/texts/3         en        0.56\n",
+      "11  semantic  13150667996460248282      TEXT  #/texts/3  meta-data        0.98\n",
       "2112.05112.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Creating visual layouts is an important step i...\n",
-      "1     title  #/texts/2  BLT: Bidirectional Layout Transformer for Cont...\n",
-      "2    author  #/texts/2                                         Xiang Kong\n",
-      "3    author  #/texts/2                                           Lu Jiang\n",
-      "4    author  #/texts/2                                       Huiwen Chang\n",
-      "5    author  #/texts/2                                          Han Zhang\n",
-      "6    author  #/texts/2                                           Yuan Hao\n",
-      "7    author  #/texts/2                                       Haifeng Gong\n",
-      "8    author  #/texts/2                                         Irfan Essa\n",
+      "title:  BLT: Bidirectional Layout Transformer for Controllable Layout Generation\n",
+      "abstract:  Abstract Creating visual layouts is an important step in graphic design. Automatic generation of such layouts is important as we seek scale-able and diverse visual designs. Prior works on automatic layout generation focus on unconditional generation, in which the models generate layouts while neglecting user needs for specific problems. To advance conditional layout generation, we introduce BLT, a bidirectional layout transformer. BLT differs from autoregressive decoding as it first generates a draft layout that satisfies the user inputs and then refines the layout iteratively. We verify the proposed model on multiple benchmarks with various fidelity metrics. Our results demonstrate two key advances to the state-of-the-art layout transformer models. First, our model empowers layout transformers to fulfill controllable layout generation. Second, our model slashes the linear inference time in autoregressive decoding into a constant complexity, thereby achieving 4 x-10 x speedups in generating a layout at inference time.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   3487215798834057801  DOCUMENT          #         en        1.00\n",
+      "1   metadata   9988876189333205714  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14707720847299285769  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  10029200523167255709  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language  11091681416119290226      TEXT  #/texts/0         en        0.43\n",
+      "5   semantic  11091681416119290226      TEXT  #/texts/0  reference        0.89\n",
+      "6   language   9988876189333205714      TEXT  #/texts/1         en        0.72\n",
+      "7   semantic   9988876189333205714      TEXT  #/texts/1     header        0.90\n",
+      "8   language  10607460801012111610      TEXT  #/texts/2         en        0.21\n",
+      "9   semantic  10607460801012111610      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  14765350902884342761      TEXT  #/texts/3         ru        0.12\n",
+      "11  semantic  14765350902884342761      TEXT  #/texts/3  meta-data        1.00\n",
       "2303.00289.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  In this paper, we present StrucTexTv2, an effe...\n",
-      "1      title  #/texts/2  STRUCTEXTV2: MASKED VISUAL-TEXTUAL PREDIC-TION...\n",
-      "2     author  #/texts/2                                         Yuechen Yu\n",
-      "3     author  #/texts/2                                           Yulin Li\n",
-      "4     author  #/texts/2                                    Chengquan Zhang\n",
-      "5     author  #/texts/2                                    Xiaoqiang Zhang\n",
-      "6     author  #/texts/2                                       Zengyuan Guo\n",
-      "7     author  #/texts/3                                        Xiameng Qin\n",
-      "8     author  #/texts/3                                            Kun Yao\n",
-      "9     author  #/texts/3                                          Junyu Han\n",
-      "10    author  #/texts/3                                         Errui Ding\n",
-      "11    author  #/texts/3                                      Jingdong Wang\n",
-      "12    author  #/texts/4                                          Baidu Inc\n",
+      "title:  STRUCTEXTV2: MASKED VISUAL-TEXTUAL PREDIC-TION FOR DOCUMENT IMAGE PRE-TRAINING\n",
+      "abstract:  ABSTRACT In this paper, we present StrucTexTv2, an effective document image pre-training framework, by performing masked visual-textual prediction. It consists of two self-supervised pre-training tasks: masked image modeling and masked language modeling, based on text region-level image masking. The proposed method randomly masks some image regions according to the bounding box coordinates of text words. The objectives of our pre-training tasks are reconstructing the pixels of masked image regions and the corresponding masked tokens simultaneously. Hence the pre-trained encoder can capture more textual semantics in comparison to the masked image modeling that usually predicts the masked image patches. Compared to the masked multi-modal modeling methods for document image understanding that rely on both the image and text modalities, StrucTexTv2 models image-only input and potentially deals with more application scenarios free from OCR pre-processing. Extensive experiments on mainstream benchmarks of document image understanding demonstrate the effectiveness of StrucTexTv2. It achieves competitive or even new state-of-the-art performance in various downstream tasks such as image classification, layout analysis, table structure recognition, document OCR, and information extraction under the end-to-end scenario.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5501277416369373170  DOCUMENT          #         en        1.00\n",
+      "1   metadata   2625417652974853981  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14017668860031900670  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   2394508152047310417  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  17276933865135358730      TEXT  #/texts/0         en        0.17\n",
+      "5   semantic  17276933865135358730      TEXT  #/texts/0       text        0.89\n",
+      "6   language   2625417652974853981      TEXT  #/texts/1         en        0.32\n",
+      "7   semantic   2625417652974853981      TEXT  #/texts/1     header        0.91\n",
+      "8   language  16763168585780875790      TEXT  #/texts/2         en        0.30\n",
+      "9   semantic  16763168585780875790      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  10481712215930735018      TEXT  #/texts/3         en        0.69\n",
+      "11  semantic  10481712215930735018      TEXT  #/texts/3  meta-data        0.91\n",
       "2106.03331.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  We propose SelfDoc, a task-agnostic pre-traini...\n",
-      "1      title  #/texts/2  SelfDoc: Self-Supervised Document Representati...\n",
-      "2     author  #/texts/2                                         Peizhao Li\n",
-      "3     author  #/texts/2                                        Jiuxiang Gu\n",
-      "4     author  #/texts/2                                     Vlad I Morariu\n",
-      "5     author  #/texts/2                                       Handong Zhao\n",
-      "6     author  #/texts/2                                         Rajiv Jain\n",
-      "7     author  #/texts/2                                   Varun Manjunatha\n",
-      "8     author  #/texts/2                                         Hongfu Liu\n",
-      "9     author  #/texts/2                                Brandeis University\n",
-      "10    author  #/texts/2                                     Adobe Research\n",
+      "title:  SelfDoc: Self-Supervised Document Representation Learning\n",
+      "abstract:  Abstract We propose SelfDoc, a task-agnostic pre-training framework for document image understanding. Because documents are multimodal and are intended for sequential reading, our framework exploits the positional, textual, and visual information of every semantically meaningful component in a document, and it models the contextualization between each block of content. Unlike existing document pre-training models, our model is coarse-grained instead of treating individual words as input, therefore avoiding an overly fine-grained with excessive contextualization. Beyond that, we introduce cross-modal learning in the model pre-training phase to fully leverage multimodal information from unlabeled documents. For downstream usage, we propose a novel modality-adaptive attention mechanism for multimodal feature fusion by adaptively emphasizing language and vision signals. Our framework benefits from self-supervised pre-training on documents without requiring annotations by a feature masking training strategy. It achieves superior performance on multiple downstream tasks with significantly fewer document images used in the pre-training stage compared to previous works.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5907688169914557708  DOCUMENT          #         en        1.00\n",
+      "1   metadata  15934776123702835385  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11722497451823681661  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  10620501522660942751  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  17609727343114496377      TEXT  #/texts/0         en        0.40\n",
+      "5   semantic  17609727343114496377      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  15934776123702835385      TEXT  #/texts/1         en        0.80\n",
+      "7   semantic  15934776123702835385      TEXT  #/texts/1     header        0.86\n",
+      "8   language  16172421238534189085      TEXT  #/texts/2         en        0.30\n",
+      "9   semantic  16172421238534189085      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   6934219798032987944      TEXT  #/texts/3         en        0.13\n",
+      "11  semantic   6934219798032987944      TEXT  #/texts/3  meta-data        1.00\n",
       "2210.17246.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract Scientific documents contain tables t...\n",
-      "1     title  #/texts/2  Tables to LaTeX: Structure and Content Extract...\n",
-      "2    author  #/texts/2                                       Pratik Kayal\n",
-      "3    author  #/texts/2                                        Harsh Desai\n",
-      "4    author  #/texts/2                                       Mayank Singh\n",
+      "title:  Tables to LaTeX: Structure and Content Extraction from Scientific Tables\n",
+      "abstract:  Abstract Scientific documents contain tables that list important information in a concise fashion. Structure and content extraction from tables embedded within PDF research documents is a very challenging task due to the existence of visual features like spanning cells and content features like mathematical symbols and equations. Most existing table structure identification methods tend to ignore these academic writing features. In this paper, we adapt the transformer-based language modeling paradigm for scientific table structure and content extraction. Specifically, the proposed model converts a tabular image to its corresponding L A T E X source code. Overall, we outperform the current state-of-the-art baselines and achieve an exact match accuracy of 70.35% and 49.69% on table structure and content extraction, respectively. Further analysis demonstrates that the proposed models efficiently identify the number of rows and columns, the alphanumeric characters, the L A T E X tokens, and symbols. Keywords Scientific documents · Transformer · L A T E X · Tabular information · Information Extraction\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6910941384560644633  DOCUMENT          #         en        0.99\n",
+      "1   metadata  16977338439878864218  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6955543981316047167  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   7723503127079648564  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   4431987692703606554      TEXT  #/texts/0         en        0.34\n",
+      "5   semantic   4431987692703606554      TEXT  #/texts/0       text        0.69\n",
+      "6   language  16977338439878864218      TEXT  #/texts/1         en        0.73\n",
+      "7   semantic  16977338439878864218      TEXT  #/texts/1     header        0.89\n",
+      "8   language   3857753522884437254      TEXT  #/texts/2         en        0.35\n",
+      "9   semantic   3857753522884437254      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   6955543981316047167      TEXT  #/texts/3         en        0.85\n",
+      "11  semantic   6955543981316047167      TEXT  #/texts/3       text        0.99\n",
       "2211.15504.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  This paper presents an application of the Layo...\n",
-      "1     title  #/texts/2           Semantic Table Detection with LayoutLMv3\n",
-      "2    author  #/texts/2                                       Ivan Silajev\n",
-      "3    author  #/texts/3                                       Niels Victor\n",
+      "title:  Semantic Table Detection with LayoutLMv3\n",
+      "abstract:  Abstract This paper presents an application of the LayoutLMv3 model for semantic table detection on financial documents from the IIIT-AR-13K dataset. The motivation behind this paper's experiment was that LayoutLMv3's official paper had no results for table detection using semantic information. We concluded that our approach did not improve the model's table detection capabilities, for which we can give several possible reasons. Either the model's weights were unsuitable for our purpose, or we needed to invest more time in optimising the model's hyperparameters. It is also possible that semantic information does not improve a model's table detection accuracy.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   3220804023042761428  DOCUMENT          #         en        0.97\n",
+      "1   metadata   8520412851891641044  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13057559607760445383  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  15600106556577833633  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  14852017143538016115      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic  14852017143538016115      TEXT  #/texts/0  reference        0.91\n",
+      "6   language   8520412851891641044      TEXT  #/texts/1         en        0.78\n",
+      "7   semantic   8520412851891641044      TEXT  #/texts/1       text        0.77\n",
+      "8   language    997834807154655310      TEXT  #/texts/2         fi        0.26\n",
+      "9   semantic    997834807154655310      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   2747418767922277996      TEXT  #/texts/3         fr        0.25\n",
+      "11  semantic   2747418767922277996      TEXT  #/texts/3  meta-data        0.98\n",
       "2304.01577.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Compared to general document analysis tasks, f...\n",
-      "1     title  #/texts/2  Form-NLU: Dataset for the Form Language Unders...\n",
-      "2    author  #/texts/7                          Hyunsuk Chung FortifyEdge\n",
+      "title:  Form-NLU: Dataset for the Form Language Understanding\n",
+      "abstract:  ABSTRACT Compared to general document analysis tasks, form document structure understanding and retrieval are challenging. Form documents are typically made by two types of authors; A form designer, who develops the form structure and keys, and a form user, who fills out form values based on the provided keys. Hence, the form values may not be aligned with the form designer's intention (structure and keys) if a form user gets confused. In this paper, we introduce Form-NLU, the first novel dataset for form structure understanding and its key and value information extraction, interpreting the form designer's intent and the alignment of user-written value on it. It consists of 857 form images, 6k form keys and values, and 4k table keys and values. Our dataset also includes three form types: digital, printed, and handwritten, which cover diverse form appearances and layouts. We propose a robust positional and logical relationbased form key-value information extraction framework. Using this dataset, Form-NLU, we first examine strong object detection models for the form layout understanding, then evaluate the key information extraction task on the dataset, providing fine-grained results for different types of forms and keys. Furthermore, we examine it with the off-the-shelf pdf layout extraction tool and prove its feasibility in real-world cases. CCS CONCEPTS · Information systems → Information retrieval. KEYWORDS Datasets, Form understanding, Natural language understanding ACM Reference Format: Yihao Ding, Siqu Long, Jiabin Huang, Kaixuan Ren, Xingxiang Luo, Hyunsuk Chung, and Soyeon Caren Han. 2023. Form-NLU: Dataset for the Form Language Understanding. In Proceedings of The 46th International ACM ACM ISBN 978-1-4503-XXXX-X/18/06...$15.00 Soyeon Caren Han The University of Sydney Sydney, NSW, Australia SIGIR Conference on Research and Development in Information Retrieval (SIGIR '23). ACM, New York, NY, USA, 10 pages. https://doi.org/XXX\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language   7248722372843658536  DOCUMENT           #        en         1.0\n",
+      "1   metadata  11505020220111782961  DOCUMENT   #/texts/1     title         1.0\n",
+      "2   metadata   2637372515777053633  DOCUMENT   #/texts/8  abstract         1.0\n",
+      "3   metadata  11448376779154117332  DOCUMENT   #/texts/9  abstract         1.0\n",
+      "4   metadata   2106780096098535656  DOCUMENT  #/texts/10  abstract         1.0\n",
+      "5   metadata  15919134484492398433  DOCUMENT  #/texts/11  abstract         1.0\n",
+      "6   metadata   2638737412827573576  DOCUMENT  #/texts/12  abstract         1.0\n",
+      "7   metadata   7791884911297709895  DOCUMENT  #/texts/13  abstract         1.0\n",
+      "8   metadata  13884229375021504457  DOCUMENT  #/texts/14  abstract         1.0\n",
+      "9   metadata   4071022534915414281  DOCUMENT  #/texts/15  abstract         1.0\n",
+      "10  metadata  15904124921512099718  DOCUMENT  #/texts/16  abstract         1.0\n",
+      "11  metadata   5875658572963855080  DOCUMENT  #/texts/17  abstract         1.0\n",
       "2107.02638.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Despite significant progress on curr...\n",
-      "1     title  #/texts/2  DocSynth: A Layout Guided Approach for Control...\n",
-      "2    author  #/texts/2                                      Sanket Biswas\n",
-      "3    author  #/texts/2                                           Pau Riba\n",
-      "4    author  #/texts/4                             Computer Vision Center\n",
-      "5    author  #/texts/4                        Computer Science Department\n",
-      "6    author  #/texts/5                                    Universitat Aut\n",
+      "title:  DocSynth: A Layout Guided Approach for Controllable Document Image Synthesis\n",
+      "abstract:  Abstract. Despite significant progress on current state-of-the-art image generation models, synthesis of document images containing multiple and complex object layouts is a challenging task. This paper presents a novel approach, called DocSynth, to automatically synthesize document images based on a given layout. In this work, given a spatial layout (bounding boxes with object categories) as a reference by the user, our proposed DocSynth model learns to generate a set of realistic document images consistent with the defined layout. Also, this framework has been adapted to this work as a superior baseline model for creating synthetic document image datasets for augmenting real data during training for document layout analysis tasks. Different sets of learning objectives have been also used to improve the model performance. Quantitatively, we also compare the generated results of our model with real data using standard evaluation metrics. The results highlight that our model can successfully generate realistic and diverse document images with multiple objects. We also present a comprehensive qualitative analysis summary of the different scopes of synthetic image generation tasks. Lastly, to our knowledge this is the first work of its kind. Keywords: Document Synthesis · Generative Adversarial Networks · Layout Generation.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9684124591752018285  DOCUMENT          #         en        0.99\n",
+      "1   metadata  16022043740431148641  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13234250752867375615  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  11540617821912263900  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   1514611656878267735      TEXT  #/texts/0         en        0.39\n",
+      "5   semantic   1514611656878267735      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  16022043740431148641      TEXT  #/texts/1         en        0.83\n",
+      "7   semantic  16022043740431148641      TEXT  #/texts/1     header        0.87\n",
+      "8   language   6400617401789590231      TEXT  #/texts/2         en        0.32\n",
+      "9   semantic   6400617401789590231      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  15927436114134815536      TEXT  #/texts/3         en        0.25\n",
+      "11  semantic  15927436114134815536      TEXT  #/texts/3  meta-data        0.48\n",
       "2307.16369.pdf\n",
-      "    subtype subj_path                                               name\n",
-      "0  abstract         #  Document understanding and information extract...\n",
+      "title:  Workshop on Document Intelligence Understanding\n",
+      "abstract:  ABSTRACT Document understanding and information extraction include different tasks to understand a document and extract valuable information automatically. Recently, there has been a rising demand for developing document understanding among different domains, including business, law, and medicine, to boost the efficiency of work that is associated with a large number of documents. This workshop aims to bring together researchers and industry developers in the field of document intelligence and understanding diverse document types to boost automatic document processing and understanding techniques. We also release a data challenge on the recently introduced document-level VQA dataset, PDFVQA$^{1}$. The PDFVQA challenge examines the model's structural and contextual understandings on the natural full document level of multiple consecutive document pages by including questions with a sequence of answers extracted from multi-pages of the full document. This task helps to boost the document understanding step from the single-page level to the full document level understanding. KEYWORDS Document Understanding, Information Extraction, Layout Analyzing, Visual Question Answering ACM Reference Format: Workshop on Document Intelligence Understanding https://doc-iu.github. io/. In Proceedings of Make sure to enter the correct conference title from your rights confirmation emai (Conference acronym 'XX). ACM, New York, NY, USA, 4 pages. https://doi.org/XXXXXXX.XXXXXXX\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6209530062408105643  DOCUMENT          #         en        0.99\n",
+      "1   metadata   8284828600829237345  DOCUMENT  #/texts/2   abstract        1.00\n",
+      "2   metadata   1732992016307038206  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   2717719593872793157  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   metadata   8284713813064902382  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "5   metadata  11386063114585799801  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "6   metadata  15467961449563024489  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "7   metadata   5588889559182754493  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "8   language   8405150555766025679      TEXT  #/texts/0         en        0.30\n",
+      "9   semantic   8405150555766025679      TEXT  #/texts/0  reference        0.67\n",
+      "10  language  11183264259715455003      TEXT  #/texts/1         en        0.51\n",
+      "11  semantic  11183264259715455003      TEXT  #/texts/1  reference        0.84\n",
       "2204.08387.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Self-supervised pre-training techniques have a...\n",
-      "1     title  #/texts/2  LayoutLMv3: Pre-training for Document AI with ...\n",
-      "2    author  #/texts/2                                        Yupan Huang\n",
-      "3    author  #/texts/2                                            Lei Cui\n",
-      "4    author  #/texts/2                                          Yutong Lu\n",
-      "5    author  #/texts/2                                           Furu Wei\n",
-      "6    author  #/texts/3                                            Sun Yat\n",
-      "7    author  #/texts/4                                 Microsoft Research\n",
+      "title:  LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking\n",
+      "abstract:  Abstract Self-supervised pre-training techniques have achieved remarkable progress in Document AI. Most multimodal pretrained models use a masked language modeling objective to learn bidirectional representations on the text modality, but they differ in pre-training objectives for the image modality. This discrepancy adds difficulty to multimodal representation learning. In this paper, we propose LayoutLMv3 to pre-train multimodal Transformers for Document AI with unified text and image masking. Additionally, LayoutLMv3 is pre-trained with a word-patch alignment objective to learn cross-modal alignment by predicting whether the corresponding image patch of a text word is masked. The simple unified architecture and training objectives make LayoutLMv3 a general-purpose pre-trained model for both text-centric and image-centric Document AI tasks. Experimental results show that LayoutLMv3 achieves state-of-the-art performance not only in text-centric tasks, including form understanding, receipt understanding, and document visual question answering, but also in imagecentric tasks such as document image classification and document layout analysis. The code and models are publicly available at https://aka.ms/layoutlmv3.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5613557634612087368  DOCUMENT          #         en        1.00\n",
+      "1   metadata  10055515728685339539  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11995162111768406472  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  11875175660407515804  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   1853412802335269668      TEXT  #/texts/0         en        0.37\n",
+      "5   semantic   1853412802335269668      TEXT  #/texts/0       text        0.96\n",
+      "6   language  10055515728685339539      TEXT  #/texts/1         en        0.86\n",
+      "7   semantic  10055515728685339539      TEXT  #/texts/1     header        0.79\n",
+      "8   language   4700528860860957044      TEXT  #/texts/2         en        0.29\n",
+      "9   semantic   4700528860860957044      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  15999884248560655279      TEXT  #/texts/3         en        0.39\n",
+      "11  semantic  15999884248560655279      TEXT  #/texts/3  meta-data        0.97\n",
       "2206.11229.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Information extraction from semi-str...\n",
-      "1     title  #/texts/3  Maty' aˇs Skalick' y, ˇ Stˇ ep' an ˇ Simsa, Mi...\n",
+      "title:  Maty' aˇs Skalick' y, ˇ Stˇ ep' an ˇ Simsa, Michal Uˇriˇ c' aˇr, and Milan ˇ Sulc\n",
+      "abstract:  Abstract. Information extraction from semi-structured documents is crucial for frictionless business-to-business (B2B) communication. While machine learning problems related to Document Information Extraction (IE) have been studied for decades, many common problem definitions and benchmarks do not reflect domain-specific aspects and practical needs for automating B2B document communication. We review the landscape of Document IE problems, datasets and benchmarks. We highlight the practical aspects missing in the common definitions and define the Key Information Localization and Extraction (KILE) and Line Item Recognition (LIR) problems. There is a lack of relevant datasets and benchmarks for Document IE on semi-structured business documents as their content is typically legally protected or sensitive. We discuss potential sources of available documents including synthetic data. Keywords: Document Understanding · Survey · Benchmarks · Datasets\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8967976425495197562  DOCUMENT          #         en        0.96\n",
+      "1   metadata   4509603375389757621  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   metadata   3735705173397610820  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  12757473539277897783  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   4282766078654614157      TEXT  #/texts/0         en        0.23\n",
+      "5   semantic   4282766078654614157      TEXT  #/texts/0  reference        0.67\n",
+      "6   language    777241034591429017      TEXT  #/texts/1         en        0.56\n",
+      "7   semantic    777241034591429017      TEXT  #/texts/1     header        0.50\n",
+      "8   language   4509603375389757621      TEXT  #/texts/2         en        0.15\n",
+      "9   semantic   4509603375389757621      TEXT  #/texts/2  reference        0.73\n",
+      "10  language   7449385233145878399      TEXT  #/texts/3         es        0.39\n",
+      "11  semantic   7449385233145878399      TEXT  #/texts/3  meta-data        0.99\n",
       "2305.04609.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Instance-level segmentation of docum...\n",
-      "1     title  #/texts/2  SwinDocSegmenter: An End-to-End Unified Domain...\n",
-      "2    author  #/texts/2                                      Ayan Banerjee\n",
-      "3    author  #/texts/2                                      Sanket Biswas\n",
-      "4    author  #/texts/3                                         Josep Llad\n",
-      "5    author  #/texts/4                             Computer Vision Center\n",
-      "6    author  #/texts/4        Computer Science Department Universitat Aut\n",
+      "title:  SwinDocSegmenter: An End-to-End Unified Domain Adaptive Transformer for Document Instance Segmentation\n",
+      "abstract:  Abstract. Instance-level segmentation of documents consists in assigning a class-aware and instance-aware label to each pixel of the image. It is a key step in document parsing for their understanding. In this paper, we present a unified transformer encoder-decoder architecture for en-toend instance segmentation of complex layouts in document images. The method adapts a contrastive training with a mixed query selection for anchor initialization in the decoder. Later on, it performs a dot product between the obtained query embeddings and the pixel embedding map (coming from the encoder) for semantic reasoning. Extensive experimentation on competitive benchmarks like PubLayNet, PRIMA, Historical Japanese (HJ), and TableBank demonstrate that our model with SwinL backbone achieves better segmentation performance than the existing state-of-the-art approaches with the average precision of 93.72, 54.39, 84.65 and 98.04 respectively under one billion parameters. The code is made publicly available at: github.com/ayanban011/SwinDocSegmenter Keywords: Document Layout Analysis · Instance-Level Segmentation · Swin Transformer · Contrastive Learning.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   1693492343147334485  DOCUMENT          #         en        0.99\n",
+      "1   metadata   6160941610122377495  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6639197326931776497  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  13483731373294669948  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  15950250016772815960      TEXT  #/texts/0         en        0.58\n",
+      "5   semantic  15950250016772815960      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   6160941610122377495      TEXT  #/texts/1         en        0.60\n",
+      "7   semantic   6160941610122377495      TEXT  #/texts/1     header        0.72\n",
+      "8   language   7027868792384148107      TEXT  #/texts/2         en        0.35\n",
+      "9   semantic   7027868792384148107      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  12268483507213020384      TEXT  #/texts/3         en        0.19\n",
+      "11  semantic  12268483507213020384      TEXT  #/texts/3  meta-data        0.54\n",
       "2008.02569.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We introduce a new dataset for graph...\n",
-      "1     title  #/texts/2  IIIT-AR-13K: A New Dataset for Graphical Objec...\n",
-      "2    author  #/texts/2                                        Peter Lipps\n",
-      "3    author  #/texts/6                             Open Text Software Gmb\n",
+      "title:  IIIT-AR-13K: A New Dataset for Graphical Object Detection in Documents\n",
+      "abstract:  Abstract. We introduce a new dataset for graphical object detection in business documents, more specifically annual reports. This dataset, iiit-$_{ar}$-13$_{k}$, is created by manually annotating the bounding boxes of graphical or page objects in publicly available annual reports. This dataset contains a total of 13$_{k}$ annotated page images with objects in five different popular categories-table, figure, natural image, logo, and signature. It is the largest manually annotated dataset for graphical object detection. Annual reports created in multiple languages for several years from various companies bring high diversity into this dataset. We benchmark $_{iiit-ar}$-13$_{k}$ dataset with two state of the art graphical object detection techniques using $_{f}$aster r-cnn [20] and $_{m}$ask r-cnn [11] and establish high baselines for further research. Our dataset is highly effective as training data for developing practical solutions for graphical object detection in both business documents and technical articles. By training with $_{iiit-ar}$-13$_{k}$, we demonstrate the feasibility of a single solution that can report superior performance compared to the equivalent ones trained with a much larger amount of data, for table detection. We hope that our dataset helps in advancing the research for detecting various types of graphical objects in business documents $^{1}$. Keywords: graphical object detection · annual reports · business documents · $_{f}$aster r-cnn · $_{m}$ask $_{r-cnn}$.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7134740071165908334  DOCUMENT          #         en        0.95\n",
+      "1   metadata  13822516791368960623  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13448690121196485778  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "3   metadata   6100566510081120729  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "4   language  11381059642053378033      TEXT  #/texts/0         en        0.29\n",
+      "5   semantic  11381059642053378033      TEXT  #/texts/0  reference        0.99\n",
+      "6   language  13822516791368960623      TEXT  #/texts/1         en        0.51\n",
+      "7   semantic  13822516791368960623      TEXT  #/texts/1     header        0.77\n",
+      "8   language  13657362457735794412      TEXT  #/texts/2         en        0.43\n",
+      "9   semantic  13657362457735794412      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  12503024925615714599      TEXT  #/texts/3         en        0.57\n",
+      "11  semantic  12503024925615714599      TEXT  #/texts/3  meta-data        0.96\n",
       "2203.02378.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Image Transformer has recently achieved signif...\n",
-      "1     title  #/texts/2  DIT: SELF-SUPERVISED PRE-TRAINING FOR DOCUMENT...\n",
-      "2    author  #/texts/2                                         Junlong Li\n",
-      "3    author  #/texts/2                                          Yiheng Xu\n",
-      "4    author  #/texts/2                                            Lei Cui\n",
-      "5    author  #/texts/2                                          Cha Zhang\n",
-      "6    author  #/texts/2                                           Furu Wei\n",
-      "7    author  #/texts/3                      Shanghai Jiao Tong University\n",
-      "8    author  #/texts/4                                 Microsoft Research\n",
-      "9    author  #/texts/5                                    Microsoft Azure\n",
+      "title:  DIT: SELF-SUPERVISED PRE-TRAINING FOR DOCUMENT IMAGE TRANSFORMER\n",
+      "abstract:  ABSTRACT Image Transformer has recently achieved significant progress for natural image understanding, either using supervised (ViT, DeiT, etc.) or self-supervised (BEiT, MAE, etc.) pre-training techniques. In this paper, we propose DiT, a selfsupervised pre-trained D ocument I mage T ransformer model using large-scale unlabeled text images for Document AI tasks, which is essential since no supervised counterparts ever exist due to the lack of human labeled document images. We leverage DiT as the backbone network in a variety of vision-based Document AI tasks, including document image classification, document layout analysis, table detection as well as text detection for OCR. Experiment results have illustrated that the self-supervised pre-trained DiT model achieves new state-of-the-art results on these downstream tasks, e.g. document image classification (91.11 → 92.69), document layout analysis (91.0 → 94.9), table detection (94.23 → 96.55) and text detection for OCR (93.07 → 94.29). The code and pre-trained models are publicly available at https://aka.ms/msdit.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16626975744466566992  DOCUMENT          #         en        0.99\n",
+      "1   metadata  11648296819527699848  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   5421231543418135299  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "3   metadata   3697732579581766426  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "4   language  13200780365894350615      TEXT  #/texts/0         en        0.54\n",
+      "5   semantic  13200780365894350615      TEXT  #/texts/0       text        0.69\n",
+      "6   language  11648296819527699848      TEXT  #/texts/1         en        0.26\n",
+      "7   semantic  11648296819527699848      TEXT  #/texts/1     header        0.86\n",
+      "8   language  10095158915153554708      TEXT  #/texts/2         en        0.29\n",
+      "9   semantic  10095158915153554708      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  12542425350952447845      TEXT  #/texts/3         en        0.41\n",
+      "11  semantic  12542425350952447845      TEXT  #/texts/3  meta-data        1.00\n",
       "2203.13530.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Document intelligence as a relatively...\n",
-      "1     title  #/texts/2  Multimodal Pre-training Based on Graph Attenti...\n",
-      "2    author  #/texts/2                                     Zhenrong Zhang\n",
-      "3    author  #/texts/2                                         Jiefeng Ma\n",
-      "4    author  #/texts/2                                             Jun Du\n",
-      "5    author  #/texts/2                                       Licheng Wang\n",
-      "6    author  #/texts/2                                      Jianshu Zhang\n",
+      "title:  Multimodal Pre-training Based on Graph Attention Network for Document Understanding\n",
+      "abstract:  Abstract-Document intelligence as a relatively new research topic supports many business applications. Its main task is to automatically read, understand, and analyze documents. However, due to the diversity of formats (invoices, reports, forms, etc.) and layouts in documents, it is difficult to make machines understand documents. In this paper, we present the GraphDoc, a multimodal graph attention-based model for various document understanding tasks. GraphDoc is pre-trained in a multimodal framework by utilizing text, layout, and image information simultaneously. In a document, a text block relies heavily on its surrounding contexts, so we inject the graph structure into the attention mechanism to form a graph attention layer so that each input node can only attend to its neighborhoods. The input nodes of each graph attention layer are composed of textual, visual, and positional features from semantically meaningful regions in a document image. We do the multimodal feature fusion of each node by the gate fusion layer. The contextualization between each node is modeled by the graph attention layer. GraphDoc learns a generic representation from only 320k unlabeled documents via the Masked Sentence Modeling task. Extensive experimental results on the publicly available datasets show that GraphDoc achieves state-of-the-art performance, which demonstrates the effectiveness of our proposed method. Index Terms-Document understanding, Pre-training, Multimodal, Graph attention layer.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11814168582946535945  DOCUMENT          #         en        0.98\n",
+      "1   metadata  18239112013650998523  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   4944397510388150405  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   1396660194561995515  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   1417991811488983492      TEXT  #/texts/0         en        0.32\n",
+      "5   semantic   1417991811488983492      TEXT  #/texts/0       text        0.89\n",
+      "6   language  18239112013650998523      TEXT  #/texts/1         en        0.78\n",
+      "7   semantic  18239112013650998523      TEXT  #/texts/1     header        0.85\n",
+      "8   language   9021015283574159893      TEXT  #/texts/2         en        0.67\n",
+      "9   semantic   9021015283574159893      TEXT  #/texts/2  meta-data        0.97\n",
+      "10  language   4944397510388150405      TEXT  #/texts/3         en        0.91\n",
+      "11  semantic   4944397510388150405      TEXT  #/texts/3       text        1.00\n",
       "2302.08575.pdf\n",
-      "Empty DataFrame\n",
-      "Columns: [subtype, subj_path, name]\n",
-      "Index: []\n",
+      "title:  Foundation Models for Natural Language Processing -- Pre-trained Language Models Integrating Media\n",
+      "abstract:  ['This open access book provides a comprehensive overview of the state of the art in research and applications of Foundation Models and is intended for readers familiar with basic Natural Language Processing (NLP) concepts. Over the recent years, a revolutionary new paradigm has been developed for training models for NLP. These models are first pre-trained on large collections of text documents to acquire general syntactic knowledge and semantic information. Then, they are fine-tuned for specific tasks, which they can often solve with superhuman accuracy. When the models are large enough, they can be instructed by prompts to solve new tasks without any fine-tuning. Moreover, they can be applied to a wide range of different media and problem domains, ranging from image and video processing to robot control learning. Because they provide a blueprint for solving many tasks in artificial intelligence, they have been called Foundation Models. After a brief introduction to basic NLP models the main pre-trained language models BERT, GPT and sequence-to-sequence transformer are described, as well as the concepts of self-attention and context-sensitive embedding. Then, different approaches to improving these models are discussed, such as expanding the pre-training criteria, increasing the length of input texts, or including extra knowledge. An overview of the best-performing models for about twenty application areas is then presented, e.g., question answering, translation, story generation, dialog systems, generating images from text, etc. For each application area, the strengths and weaknesses of current models are discussed, and an outlook on further developments is given. In addition, links are provided to freely available program code. A concluding chapter summarizes the economic opportunities, mitigation of risks, and potential developments of AI.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8421627772241711435  DOCUMENT          #         en        0.97\n",
+      "1   language  15144751674124179519      TEXT  #/texts/0         en        0.35\n",
+      "2   semantic  15144751674124179519      TEXT  #/texts/0       text        0.99\n",
+      "3   language  13543005973877344845      TEXT  #/texts/1         de        0.37\n",
+      "4   semantic  13543005973877344845      TEXT  #/texts/1  meta-data        0.99\n",
+      "5   language  13566690406347038172      TEXT  #/texts/2         en        0.59\n",
+      "6   semantic  13566690406347038172      TEXT  #/texts/2     header        0.47\n",
+      "7   language   8314110134380907026      TEXT  #/texts/3         en        0.73\n",
+      "8   semantic   8314110134380907026      TEXT  #/texts/3     header        0.81\n",
+      "9   language   4744613495211267368      TEXT  #/texts/4         en        0.99\n",
+      "10  semantic   4744613495211267368      TEXT  #/texts/4       text        0.91\n",
+      "11  language   2692523668970354209      TEXT  #/texts/5         en        0.92\n",
       "2205.02411.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Relational understanding is critical for a num...\n",
-      "1     title  #/texts/2  Relational Representation Learning in Visually...\n",
-      "2    author  #/texts/2                                             Xin Li\n",
-      "3    author  #/texts/2                                          Yunfei Wu\n",
-      "4    author  #/texts/3                                          Yan Zheng\n",
-      "5    author  #/texts/3                                          Yiqing Hu\n",
-      "6    author  #/texts/3         Haoyu Cao Deqiang Jiang Yinsong Liu Bo Ren\n",
-      "7    author  #/texts/4                                      Tencent YouTu\n",
+      "title:  Relational Representation Learning in Visually-Rich Documents\n",
+      "abstract:  Abstract Relational understanding is critical for a number of visually-rich documents (VRDs) understanding tasks. Through multi-modal pre-training, recent studies provide comprehensive contextual representations and exploit them as prior knowledge for downstream tasks. In spite of their impressive results, we observe that the widespread relational hints (e.g., relation of key/value fields on receipts) built upon contextual knowledge are not excavated yet. To mitigate this gap, we propose DocReL, a Doc ument Re lational Representation L earning framework. The major challenge of DocReL roots in the variety of relations. From the simplest pairwise relation to the complex global structure, it is infeasible to conduct supervised training due to the definition of relation varies and even conflicts in different tasks. To deal with the unpredictable definition of relations, we propose a novel contrastive learning task named Relational Consistency Modeling (RCM), which harnesses the fact that existing relations should be consistent in differently augmented positive views. RCM provides relational representations which are more compatible to the urgent need of downstream tasks, even without any knowledge about the exact definition of relation. DocReL achieves better performance on a wide variety of VRD relational understanding tasks, including table structure recognition, key information extraction and reading order detection.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14685796338317814159  DOCUMENT          #         en        1.00\n",
+      "1   metadata  14560507360067576989  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   9619244236012610321  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata    402873043014886069  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   4466021334064041913      TEXT  #/texts/0         en        0.68\n",
+      "5   semantic   4466021334064041913      TEXT  #/texts/0       text        0.83\n",
+      "6   language  14560507360067576989      TEXT  #/texts/1         en        0.72\n",
+      "7   semantic  14560507360067576989      TEXT  #/texts/1     header        0.73\n",
+      "8   language  10588574520574666354      TEXT  #/texts/2         eo        0.42\n",
+      "9   semantic  10588574520574666354      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   4943200990914226626      TEXT  #/texts/3         en        0.47\n",
+      "11  semantic   4943200990914226626      TEXT  #/texts/3  meta-data        1.00\n",
       "2305.08719.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Document layout analysis is a crucial prerequi...\n",
-      "1      title  #/texts/2  M $^{6}$Doc: A Large-Scale Multi-Format, Multi...\n",
-      "2     author  #/texts/2                                        Hiuyi Cheng\n",
-      "3     author  #/texts/2                                      Peirong Zhang\n",
-      "4     author  #/texts/2                                          Sihang Wu\n",
-      "5     author  #/texts/2                                       Jiaxin Zhang\n",
-      "6     author  #/texts/3                                         Qiyuan Zhu\n",
-      "7     author  #/texts/3                                        Zecheng Xie\n",
-      "8     author  #/texts/3                                            Jing Li\n",
-      "9     author  #/texts/3                                           Kai Ding\n",
-      "10    author  #/texts/3                                        Lianwen Jin\n",
-      "11    author  #/texts/5             Huawei Cloud Computing Technologies Co\n",
-      "12    author  #/texts/6                                 Sig Information Co\n",
+      "title:  M $^{6}$Doc: A Large-Scale Multi-Format, Multi-Type, Multi-Layout, Multi-Language, Multi-Annotation Category Dataset for Modern Document Layout Analysis\n",
+      "abstract:  Abstract Document layout analysis is a crucial prerequisite for document under standing, including document retrieval and conversion. Most public datasets currently contain only PDF documents and lack realistic documents. Models trained on these datasets may not generalize well to real-world scenarios. Therefore, this paper introduces a large and diverse document layout analysis dataset called M $^{6}$Doc. The M 6 designation represents six properties: (1) Multi-Format (including scanned, photographed, and PDF documents); (2) Multi-Type (such as scientific articles, textbooks, books, test papers, magazines, newspapers, and notes); (3) Multi-Layout (rectangular, Manhattan, non-Manhattan, and multi-column Manhattan); (4) Multi-Language (Chinese and English); (5) Multi-Annotation Category (74 types of annotation labels with 237,116 annotation instances in 9,080 manually annotated pages); and (6) Modern documents. Additionally, we propose a transformer-based document layout analysis method called TransDLANet, which leverages an adaptive element matching mechanism that enables query embedding to better match ground truth to improve recall, and constructs a segmentation branch for more precise document image instance segmentation. We conduct a comprehensive evaluation of M $^{6}$Doc with various layout analysis methods and demonstrate its ef fectiveness. TransDLANet achieves stateof-the-art performance on M $^{6}$Doc with 64.5% mAP. The M $^{6}$Doc dataset will be available at https://github. com/HCIILAB/ M6Doc.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   1876173860373953341  DOCUMENT           #         en   \n",
+      "1   metadata  13256736339476695081  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   8152406454759716696  DOCUMENT   #/texts/8   abstract   \n",
+      "3   metadata   9004593750770180351  DOCUMENT   #/texts/9   abstract   \n",
+      "4   metadata   1940889033656737863  DOCUMENT  #/texts/10   abstract   \n",
+      "5   language  10861081459604961628      TEXT   #/texts/0         en   \n",
+      "6   semantic  10861081459604961628      TEXT   #/texts/0  reference   \n",
+      "7   language  13256736339476695081      TEXT   #/texts/1         en   \n",
+      "8   semantic  13256736339476695081      TEXT   #/texts/1     header   \n",
+      "9   language   1638358623752885653      TEXT   #/texts/2         en   \n",
+      "10  semantic   1638358623752885653      TEXT   #/texts/2  meta-data   \n",
+      "11  language  11362042950132522047      TEXT   #/texts/3         en   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.98  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         1.00  \n",
+      "5         0.58  \n",
+      "6         0.66  \n",
+      "7         0.52  \n",
+      "8         0.67  \n",
+      "9         0.23  \n",
+      "10        0.95  \n",
+      "11        0.40  \n",
       "2305.04833.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Table Detection has become a fundamental task ...\n",
-      "1     title  #/texts/1  Revisiting Table Detection Datasets for Visual...\n",
-      "2    author  #/texts/1                                           Bin Xiao\n",
-      "3    author  #/texts/1                                       Murat Simsek\n",
-      "4    author  #/texts/1                                     Burak Kantarci\n",
-      "5    author  #/texts/3                                         Lytica Inc\n",
-      "6    author  #/texts/3                                       Legget Drive\n",
+      "title:  Revisiting Table Detection Datasets for Visually Rich Documents\n",
+      "abstract:  Abstract Table Detection has become a fundamental task for visually rich document understanding with the surging number of electronic documents. There have been some open datasets widely used in many studies. However, popular available datasets have some inherent limitations, including the noisy and inconsistent samples, and the limit number of training samples, and the limit number of data-sources. These limitations make these datasets unreliable to evaluate the model performance and cannot reflect the actual capacity of models. Therefore, in this paper, we revisit some open datasets with high quality of annotations, identify and clean the noise, and align the annotation definitions of these datasets to merge a larger dataset, termed with Open-Tables. Moreover, to enrich the data sources, we propose a new dataset, termed with ICT-TD, using the PDF files of Information and communication technologies (ICT) commodities which is a di erent domain containing unique samples that hardly appear in open datasets. To ensure the label quality of the dataset, we annotated the dataset manually following the guidance of a domain expert. The proposed dataset has a larger intra-variance and smaller inter-variance, making it more challenging and can be a sample of actual cases in the business context. We built strong baselines using various state-of-the-art object detection models and also built the baselines in the cross-domain setting. Our experimental results show that the domain di erence among existing open datasets are small, even they have di erent data-sources. Our proposed Open-tables and ICT-TD are more suitable for the cross domain setting, and can provide more reliable evaluation for model because of their high quality and consistent annotations. We conduct experiments to discuss the side e ects of noise in the open source datasets. 
Our experimental results show that in the cross-domain setting, benchmark models trained with cleaned Open-Tables dataset can achieve 0.6%-2.6% higher weighted average F1 than the corresponding ones trained with the noisy version of Open-Tables, demonstrating the reliability of the proposed datasets. The datasets are public available at http://ieee-dataport. org/documents/table-detection-dataset-visually-rich-documents Keywords: Object Detection, Table Detection Dataset, ICT Supply Chain, Table Detection 1. Introduction Tables or tabular data have been widely used in electronic documents to summarize critical information so that the information can be presented e ciently to human readers. However, electronic documents, such as Portable Document Format (PDF) files, cannot provide enough meta-data to describe the location and the structure of these tables, meaning that these tables are unstructured and cannot be quickly processed and interpreted automatically. With the surging amount of electronic documents, Table Detection (TD) becomes a fundamental task for downstream document understanding tasks, such as Key Information Extraction and Table Structure Recognition [1]. With the development of deep learning, transforming electronic documents into visually rich document images and formulating the problem as an object detection problem became the dominant solutions. There have been some public datasets for the TD problem, such as ICDAR2013 [2], ICDAR2017 [3], ICDAR2019 [4] and TableBank [5]. Some of these datasets are manually labeled, which means that the annotations are more reliable and consistent, but the number of training sample in these datasets are usually limited. Besides, the annotation definitions across these datasets are often di erent, which means we cannot simply merge these datasets together to form larger datasets. 
In contrast, datasets such as TableBank [5] and PubLayNet [6] are annotated by parsing meta-data of electronic documents, making these annotations are noisy and inconsistent, even though these datasets are much larger. Figure 1 shows two samples from the TableBank test set. One typical issue of these meta-data generated datasets is that the bounding box can be larger than an ideal bounding box, as shown in Figure 1 (a), which can make the evaluation unreliable when the Intersection over Union (IoU) threshold is high. Another issue is that some tables are missed or the bounding box is not large enough to cover the whole table, as shown in Figure 1 (b). The quality of a table detection set is critical for the TD problem because a successful TD application should avoid losing information presented in the tables. And the issues of noisy labels in the test set can influence the model evaluation, especially for widely used evaluation metrics threshold by IoU scores. It is worth mentioning that even though manually annotated datasets have higher quality of annotations, there are still many noisy samples in both their training and testing sets. Therefore, in this study, we revisit several well-annotated datasets, including ICDAR2013, ICDAR2017, ICDAR2019, Marmot and TNCR, align the labeling definition of these datasets, clean the noisy samples and merge them together to form a larger dataset, termed with Open-Tables. The new Open-Tables dataset can minimize the side e ects of noisy samples to the model evaluation and provide more reliable results. We include more details regarding Open-Tables dataset in section 3.1.\n",
+      "        type             subj_hash subj_name   subj_path     label  confidence\n",
+      "0   language   7304776487939613810  DOCUMENT           #        en        1.00\n",
+      "1   metadata   8558559027627596631  DOCUMENT   #/texts/0     title        1.00\n",
+      "2   metadata   3716338067735375485  DOCUMENT   #/texts/4  abstract        1.00\n",
+      "3   metadata   8636239885669143883  DOCUMENT   #/texts/5  abstract        1.00\n",
+      "4   metadata  16225242767116965410  DOCUMENT   #/texts/6  abstract        1.00\n",
+      "5   metadata   4785204321927401322  DOCUMENT   #/texts/7  abstract        1.00\n",
+      "6   metadata   1493289078835811758  DOCUMENT   #/texts/8  abstract        1.00\n",
+      "7   metadata  11541983487153591499  DOCUMENT   #/texts/9  abstract        1.00\n",
+      "8   metadata  10011220560812929426  DOCUMENT  #/texts/10  abstract        1.00\n",
+      "9   language   8558559027627596631      TEXT   #/texts/0        en        0.60\n",
+      "10  semantic   8558559027627596631      TEXT   #/texts/0      text        0.48\n",
+      "11  language  13696316175992901111      TEXT   #/texts/1        en        0.18\n",
       "2108.00871.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  It is common in graphic design humans visually...\n",
-      "1     title  #/texts/2  Constrained Graphic Layout Generation via Late...\n",
-      "2    author  #/texts/2                                     Kotaro Kikuchi\n",
-      "3    author  #/texts/3                         Waseda University Shinjuku\n",
-      "4    author  #/texts/4                                         Mayu Otani\n",
-      "5    author  #/texts/5                                      Agent Shibuya\n",
-      "6    author  #/texts/6        Edgar Simo-Serra Waseda University Shinjuku\n",
+      "title:  Constrained Graphic Layout Generation via Latent Optimization\n",
+      "abstract:  ABSTRACT It is common in graphic design humans visually arrange various elements according to their design intent and semantics. For example, a title text almost always appears on top of other elements in a document. In this work, we generate graphic layouts that can flexibly incorporate such design semantics, either specified implicitly or explicitly by a user. We optimize using the latent space of an off-the-shelf layout generation model, allowing our approach to be complementary to and used with existing layout generation models. Our approach builds on a generative layout model based on a Transformer architecture, and formulates the layout generation as a constrained optimization problem where design constraints are used for element alignment, overlap avoidance, or any other user-specified relationship. We show in the experiments that our approach is capable of generating realistic layouts in both constrained and unconstrained generation tasks with a single model. The code is available at https://github.com/ktrk115/const_layout. CCS CONCEPTS · Human-centered computing → Interaction design process and methods; · Applied computing → Computer-aided design. KEYWORDS layout generation, generative adversarial network, constrained optimization, latent space exploration ACM Reference Format: Kotaro Kikuchi, Edgar Simo-Serra, Mayu Otani, and Kota Yamaguchi. 2021. Constrained Graphic Layout Generation via Latent Optimization. In Proceedings of the 29th ACM International Conference on Multimedia (MM '21), October 20-24, 2021, Virtual Event, China. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/3474085.3475497\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  12199915035636768252  DOCUMENT           #         en   \n",
+      "1   metadata   8384071914771172394  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   2109654227642717686  DOCUMENT   #/texts/8   abstract   \n",
+      "3   metadata   3211312228955143655  DOCUMENT   #/texts/9   abstract   \n",
+      "4   metadata   6555000622341922001  DOCUMENT  #/texts/10   abstract   \n",
+      "5   metadata  16983468812637625951  DOCUMENT  #/texts/11   abstract   \n",
+      "6   metadata   2109469912817786993  DOCUMENT  #/texts/12   abstract   \n",
+      "7   metadata   3683188399811590448  DOCUMENT  #/texts/13   abstract   \n",
+      "8   metadata   9328733848143751678  DOCUMENT  #/texts/14   abstract   \n",
+      "9   metadata  13989158213906669862  DOCUMENT  #/texts/15   abstract   \n",
+      "10  language   1239214986904154463      TEXT   #/texts/0         en   \n",
+      "11  semantic   1239214986904154463      TEXT   #/texts/0  reference   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.98  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         1.00  \n",
+      "5         1.00  \n",
+      "6         1.00  \n",
+      "7         1.00  \n",
+      "8         1.00  \n",
+      "9         1.00  \n",
+      "10        0.41  \n",
+      "11        0.86  \n",
       "2306.08937.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Visually-Rich Document Entity Retrieval (VDER)...\n",
-      "1     title  #/texts/2  Document Entity Retrieval with Massive and Noi...\n",
-      "2    author  #/texts/2                                           Lijun Yu\n",
-      "3    author  #/texts/2                                           Jin Miao\n",
-      "4    author  #/texts/2                                         Xiaoyu Sun\n",
-      "5    author  #/texts/2                                         Jiayi Chen\n",
-      "6    author  #/texts/2                              Alexander G Hauptmann\n",
-      "7    author  #/texts/2                                         Hanjun Dai\n",
-      "8    author  #/texts/2                                            Wei Wei\n",
+      "title:  Document Entity Retrieval with Massive and Noisy Pre-training\n",
+      "abstract:  Abstract Visually-Rich Document Entity Retrieval (VDER) is a type of machine learning task that aims at recovering text spans in the documents for each of the entities in question. VDER has gained significant attention in recent years thanks to its broad applications in enterprise AI. Unfortunately, as document images often contain personally identifiable information (PII), publicly available data have been scarce, not only because of privacy constraints but also the costs of acquiring annotations. To make things worse, each dataset would often define its own sets of entities, and the non-overlapping entity spaces between datasets make it difficult to transfer knowledge between documents. In this paper, we propose a method to collect massive-scale, noisy, and weakly labeled data from the web to benefit the training of VDER models. Such a method will generate a huge amount of document image data to compensate for the lack of training data in many VDER settings. Moreover, the collected dataset named DocuNet would not need to be dependent on specific document types or entity sets, making it universally applicable to all VDER tasks. Empowered by DocuNet, we present a lightweight multimodal architecture named UniFormer, which can learn a unified representation from text, layout, and image crops without needing extra visual pretraining. We experiment with our methods on popular VDER models in various settings and show the improvements when this massive dataset is incorporated with UniFormer on both classic entity retrieval and few-shot learning settings.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15142591603334469240  DOCUMENT          #         en        0.98\n",
+      "1   metadata  10066368879651874202  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15287023683645999756  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   9371627777753320426  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   metadata  15691518741437309238  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "5   language  13913708657465956901      TEXT  #/texts/0         en        0.20\n",
+      "6   semantic  13913708657465956901      TEXT  #/texts/0  reference        0.66\n",
+      "7   language  10066368879651874202      TEXT  #/texts/1         en        0.75\n",
+      "8   semantic  10066368879651874202      TEXT  #/texts/1     header        0.98\n",
+      "9   language   8221334556122986640      TEXT  #/texts/2         en        0.32\n",
+      "10  semantic   8221334556122986640      TEXT  #/texts/2  meta-data        0.99\n",
+      "11  language  15287023683645999756      TEXT  #/texts/3         en        0.32\n",
       "2103.05908.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We combine deep learning and Conditional Proba...\n",
-      "1     title  #/texts/2  DeepCPCFG: Deep Learning and Context Free Gram...\n",
-      "2    author  #/texts/2                                      Freddy C Chua\n",
-      "3    author  #/texts/2                                  High Street Suite\n",
-      "4    author  #/texts/2                                          Palo Alto\n",
-      "5    author  #/texts/3                                      Nigel P Duffy\n",
-      "6    author  #/texts/4                                  High Street Suite\n",
-      "7    author  #/texts/4                                          Palo Alto\n",
+      "title:  DeepCPCFG: Deep Learning and Context Free Grammars for End-to-End Information Extraction\n",
+      "abstract:  Abstract We combine deep learning and Conditional Probabilistic Context Free Grammars (CPCFG) to create an end-to-end system for extracting structured information from complex documents. For each class of documents, we create a CPCFG that describes the structure of the information to be extracted. Conditional probabilities are modeled by deep neural networks. We use this grammar to parse 2-D documents to directly produce structured records containing the extracted information. This system is trained end-to-end with (Document, Record) pairs. We apply this approach to extract information from scanned invoices achieving state-of-the-art results.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5782007314389811631  DOCUMENT          #         en        1.00\n",
+      "1   metadata  13526205582851115312  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  17155324988894173481  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   3790138250182757054  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  15035037928597917938      TEXT  #/texts/0         en        0.32\n",
+      "5   semantic  15035037928597917938      TEXT  #/texts/0       text        0.99\n",
+      "6   language  13526205582851115312      TEXT  #/texts/1         en        0.59\n",
+      "7   semantic  13526205582851115312      TEXT  #/texts/1     header        0.86\n",
+      "8   language  13070915798935976283      TEXT  #/texts/2         en        0.37\n",
+      "9   semantic  13070915798935976283      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language  11881341791620129678      TEXT  #/texts/3         en        0.38\n",
+      "11  semantic  11881341791620129678      TEXT  #/texts/3  meta-data        1.00\n",
       "2109.01078.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Transformer-based pre-training techniques of t...\n",
-      "1     title  #/texts/2  Skim-Attention: Learning to Focus via Document...\n",
-      "2    author  #/texts/2                                       Laura Nguyen\n",
-      "3    author  #/texts/2                                     Thomas Scialom\n",
-      "4    author  #/texts/2                                     Jacopo Staiano\n",
-      "5    author  #/texts/2                                Benjamin Piwowarski\n",
-      "6    author  #/texts/3                                 Sorbonne Universit\n",
+      "title:  Skim-Attention: Learning to Focus via Document Layout\n",
+      "abstract:  Abstract Transformer-based pre-training techniques of text and layout have proven effective in a number of document understanding tasks. Despite this success, multimodal pre-training models suffer from very high computational and memory costs. Motivated by human reading strategies, this paper presents Skim-Attention, a new attention mechanism that takes advantage of the structure of the document and its layout. Skim-Attention only attends to the 2dimensional position of the words in a document. Our experiments show that Skim-Attention obtains a lower perplexity than prior works, while being more computationally efficient. Skim-Attention can be further combined with long-range Transformers to efficiently process long documents. We also show how Skim-Attention can be used off-the-shelf as a mask for any Pre-trained Language Model, allowing to improve their performance while restricting attention. Finally, we show the emergence of a document structure representation in Skim-Attention.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9883551471125205979  DOCUMENT          #         en        0.99\n",
+      "1   metadata  10129463260375798652  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   5534460045214421186  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   3636955013674087094  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   4755179979580467845      TEXT  #/texts/0         en        0.48\n",
+      "5   semantic   4755179979580467845      TEXT  #/texts/0  reference        0.66\n",
+      "6   language  10129463260375798652      TEXT  #/texts/1         en        0.71\n",
+      "7   semantic  10129463260375798652      TEXT  #/texts/1     header        0.74\n",
+      "8   language    572329452412399190      TEXT  #/texts/2         en        0.28\n",
+      "9   semantic    572329452412399190      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   4645504454607610911      TEXT  #/texts/3         fr        0.41\n",
+      "11  semantic   4645504454607610911      TEXT  #/texts/3  meta-data        0.98\n",
       "2108.09436.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Handwritten documents are often char...\n",
-      "1     title  #/texts/2  Palmira: A Deep Deformable Network for Instanc...\n",
-      "2    author  #/texts/2                                       Sowmya Aitha\n",
-      "3    author  #/texts/3                                   Abhishek Trivedi\n",
-      "4    author  #/texts/4                         Ravi Kiran Sarvadevabhatla\n",
+      "title:  Palmira: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts\n",
+      "abstract:  Abstract. Handwritten documents are often characterized by dense and uneven layout. Despite advances, standard deep network based approaches for semantic layout segmentation are not robust to complex deformations seen across semantic regions. This phenomenon is especially pronounced for the low-resource Indic palm-leaf manuscript domain. To address the issue, we first introduce Indiscapes2, a new large-scale diverse dataset of Indic manuscripts with semantic layout annotations. Indiscapes2 contains documents from four different historical collections and is 150% larger than its predecessor, Indiscapes. We also propose a novel deep network Palmira for robust, deformation-aware instance segmentation of regions in handwritten manuscripts. We also report Hausdorff distance and its variants as a boundary-aware performance measure. Our experiments demonstrate that$_{Palmira}$ provides robust layouts, outperforms strong baseline approaches and ablative variants. We also include qualitative results on Arabic, South-East Asian and Hebrew historical manuscripts to showcase the generalization capability of $_{Palmira}$. Keywords: instance segmentation · deformable convolutional network · historical document analysis · document image segmentation · dataset\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9626559153965122002  DOCUMENT          #         en        0.99\n",
+      "1   metadata   6342588646724752712  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   9076307976746177469  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  18076662708448135472  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   1894792420845750180      TEXT  #/texts/0         en        0.47\n",
+      "5   semantic   1894792420845750180      TEXT  #/texts/0  reference        0.86\n",
+      "6   language   6342588646724752712      TEXT  #/texts/1         en        0.60\n",
+      "7   semantic   6342588646724752712      TEXT  #/texts/1     header        0.73\n",
+      "8   language   2763018529891900900      TEXT  #/texts/2         en        0.41\n",
+      "9   semantic   2763018529891900900      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  18273145714788509308      TEXT  #/texts/3         en        0.45\n",
+      "11  semantic  18273145714788509308      TEXT  #/texts/3  meta-data        1.00\n",
       "2308.01971.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We introduce a novel bottom-up appro...\n",
-      "1     title  #/texts/2  SpaDen : Sparse and Dense Keypoint Estimation ...\n",
-      "2    author  #/texts/2                                       Saleem Ahmed\n",
-      "3    author  #/texts/2                                         Pengyu Yan\n",
-      "4    author  #/texts/3                                     David Doermann\n",
-      "5    author  #/texts/3                                 Srirangaraj Setlur\n",
-      "6    author  #/texts/4                                   Venu Govindaraju\n",
+      "title:  SpaDen : Sparse and Dense Keypoint Estimation for Real-World Chart Understanding\n",
+      "abstract:  Abstract. We introduce a novel bottom-up approach for the extraction of chart data. Our model utilizes images of charts as inputs and learns to detect keypoints (KP), which are used to reconstruct the components within the plot area. Our novelty lies in detecting a fusion of continuous and discrete KP as predicted heatmaps. A combination of sparse and dense per-pixel objectives coupled with a uni-modal self-attentionbased feature-fusion layer is applied to learn KP embeddings. Further leveraging deep metric learning for unsupervised clustering, allows us to segment the chart plot area into various objects. By further matching the chart components to the legend, we are able to obtain the data series names. A post-processing threshold is applied to the KP embeddings to refine the object reconstructions and improve accuracy. Our extensive experiments include an evaluation of different modules for KP estimation and the combination of deep layer aggregation and corner pooling approaches. The results of our experiments provide extensive evaluation for the task of real-world chart data extraction. $^{1}$. Keywords: Charts and Document Understanding and Reasoning\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13367340984838580806  DOCUMENT          #         en        1.00\n",
+      "1   metadata   1903187599367179918  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  12538701134398444025  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "3   metadata  10579837932229061513  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "4   language    341948888729253223      TEXT  #/texts/0         en        0.36\n",
+      "5   semantic    341948888729253223      TEXT  #/texts/0  reference        0.86\n",
+      "6   language   1903187599367179918      TEXT  #/texts/1         en        0.45\n",
+      "7   semantic   1903187599367179918      TEXT  #/texts/1     header        0.87\n",
+      "8   language  17616124111452301542      TEXT  #/texts/2         en        0.16\n",
+      "9   semantic  17616124111452301542      TEXT  #/texts/2  meta-data        0.89\n",
+      "10  language  17428461424590529908      TEXT  #/texts/3         en        0.22\n",
+      "11  semantic  17428461424590529908      TEXT  #/texts/3  meta-data        1.00\n",
       "2203.08504.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  This paper presents a systematic literature re...\n",
-      "1     title  #/texts/2     A Survey of Historical Document Image Datasets\n",
-      "2    author  #/texts/2                             Konstantina Nikolaidou\n",
-      "3    author  #/texts/2                                      Hamam Mokayed\n",
-      "4    author  #/texts/2                                     Marcus Liwicki\n",
-      "5    author  #/texts/3                             Machine Learning Group\n",
+      "title:  A Survey of Historical Document Image Datasets\n",
+      "abstract:  Abstract This paper presents a systematic literature review of image datasets for document image analysis, focusing on historical documents, such as handwritten manuscripts and early prints. Finding appropriate datasets for historical document analysis is a crucial prerequisite to facilitate research using different machine learning algorithms. However, because of the very large variety of the actual data (e.g., scripts, tasks, dates, support systems, and amount of deterioration), the different formats for data and label representation, and the different evaluation processes and benchmarks, finding appropriate datasets is a difficult task. This work fills this gap, presenting a meta-study on existing datasets. After a systematic selection process (according to PRISMA guidelines), we select 56 studies that are chosen based on different factors, such as the year of publication, number of methods implemented in the article, reliability of the chosen algorithms, dataset size, and journal outlet. We summarize each study by assigning it to one of three pre-defined tasks: document classification, layout structure, or semantic analysis. We present the statistics, document type, language, tasks, input visual aspects, and ground truth information for every dataset. In addition, we provide the benchmark tasks and results from these papers or recent competitions. We further discuss gaps and challenges in this domain. We advocate for providing conversion tools to common formats (e.g., COCO format for computer vision tasks) and always providing a set of evaluation metrics, instead of just one, to make results comparable across studies. Keywords: Historical Documents, Image Datasets, Document Image Analysis, Machine Learning\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   4007586703670525870  DOCUMENT           #         en   \n",
+      "1   metadata   7274577276839794671  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  16472628404673665640  DOCUMENT  #/texts/10   abstract   \n",
+      "3   metadata  17605835865519781365  DOCUMENT  #/texts/11   abstract   \n",
+      "4   metadata  15932023376733108084  DOCUMENT  #/texts/12   abstract   \n",
+      "5   language   9758151282582610634      TEXT   #/texts/0         en   \n",
+      "6   semantic   9758151282582610634      TEXT   #/texts/0       text   \n",
+      "7   language   7274577276839794671      TEXT   #/texts/1         en   \n",
+      "8   semantic   7274577276839794671      TEXT   #/texts/1     header   \n",
+      "9   language   5009320327279888416      TEXT   #/texts/2         en   \n",
+      "10  semantic   5009320327279888416      TEXT   #/texts/2  meta-data   \n",
+      "11  language  14203392011151395431      TEXT   #/texts/3         en   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.99  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         1.00  \n",
+      "5         0.33  \n",
+      "6         0.89  \n",
+      "7         0.58  \n",
+      "8         0.93  \n",
+      "9         0.32  \n",
+      "10        0.99  \n",
+      "11        0.47  \n",
       "2304.14953.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. In recent years, the field of docume...\n",
-      "1     title  #/texts/2  CCpdf: Building a High Quality Corpus for Visu...\n",
-      "2    author  #/texts/2                                    Karol Kaczmarek\n",
-      "3    author  #/texts/4                         Adam Mickiewicz University\n",
+      "title:  CCpdf: Building a High Quality Corpus for Visually Rich Documents from Web Crawl Data\n",
+      "abstract:  Abstract. In recent years, the field of document understanding has progressed a lot. A significant part of this progress has been possible thanks to the use of language models pretrained on large amounts of documents. However, pretraining corpora used in the domain of document understanding are single domain, monolingual, or nonpublic. Our goal in this paper is to propose an efficient pipeline for creating a big-scale, diverse, multilingual corpus of PDF files from all over the Internet using Common Crawl, as PDF files are the most canonical types of documents as considered in document understanding. We analyzed extensively all of the steps of the pipeline and proposed a solution which is a trade-off between data quality and processing time. We also share a CCpdf corpus in a form or an index of PDF files along with a script for downloading them, which produces a collection useful for language model pretraining. The dataset and tools published with this paper offer researchers the opportunity to develop even better multilingual language models. Keywords: Natural Language Processing, language models, dataset construction, document understanding.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10771338546968252461  DOCUMENT          #         en        0.98\n",
+      "1   metadata   3724028993523053831  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1609918121696388518  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   8094359134579896324  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  10595408906284717089      TEXT  #/texts/0         en        0.32\n",
+      "5   semantic  10595408906284717089      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   3724028993523053831      TEXT  #/texts/1         en        0.76\n",
+      "7   semantic   3724028993523053831      TEXT  #/texts/1     header        0.57\n",
+      "8   language   8104816477335951616      TEXT  #/texts/2         pl        0.49\n",
+      "9   semantic   8104816477335951616      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   8883813848999863888      TEXT  #/texts/3         en        0.49\n",
+      "11  semantic   8883813848999863888      TEXT  #/texts/3  meta-data        0.81\n",
       "2307.12571.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document dewarping from a distorted camera-cap...\n",
-      "1     title  #/texts/2  MataDoc: Margin and Text Aware Document Dewarp...\n",
-      "2    author  #/texts/2                                          Beiya Dai\n",
-      "3    author  #/texts/2                                          Qunyi Xie\n",
-      "4    author  #/texts/2                                           Yulin Li\n",
-      "5    author  #/texts/2                                        Xiameng Qin\n",
-      "6    author  #/texts/2                                    Chengquan Zhang\n",
-      "7    author  #/texts/2                                            Kun Yao\n",
-      "8    author  #/texts/2                                          Junyu Han\n",
-      "9    author  #/texts/2                                          Baidu Inc\n",
+      "title:  MataDoc: Margin and Text Aware Document Dewarping for Arbitrary Boundary\n",
+      "abstract:  Abstract Document dewarping from a distorted camera-captured image is of great value for OCR and document understanding. The document boundary plays an important role which is more evident than the inner region in document dewarping. Current learning-based methods mainly focus on complete boundary cases, leading to poor document correction performance of documents with incomplete boundaries. In contrast to these methods, this paper proposes MataDoc, the first method focusing on arbitrary boundary document dewarping with margin and text aware regularizations. Specifically, we design the margin regularization by explicitly considering background consistency to enhance boundary perception. Moreover, we introduce word position consistency to keep text lines straight in rectified document images. To produce a comprehensive evaluation of MataDoc, we propose a novel benchmark ArbDoc, mainly consisting of document images with arbitrary boundaries in four typical scenarios. Extensive experiments confirm the superiority of MataDoc with consideration for the incomplete boundary on ArbDoc and also demonstrate the effectiveness of the proposed method on DocUNet, DIR300, and Warp-Doc datasets.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    742822728802086802  DOCUMENT          #         en        0.99\n",
+      "1   metadata  14673947180037347255  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6340112537169098581  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   9458629370508501719  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   6284796105560134854      TEXT  #/texts/0         en        0.46\n",
+      "5   semantic   6284796105560134854      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  14673947180037347255      TEXT  #/texts/1         en        0.60\n",
+      "7   semantic  14673947180037347255      TEXT  #/texts/1     header        0.81\n",
+      "8   language   4014942427634426251      TEXT  #/texts/2         en        0.42\n",
+      "9   semantic   4014942427634426251      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   2614023099649271751      TEXT  #/texts/3         en        0.44\n",
+      "11  semantic   2614023099649271751      TEXT  #/texts/3  meta-data        1.00\n",
       "2204.10939.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Document intelligence automates the extraction...\n",
-      "1      title  #/texts/2  Unified Pretraining Framework for Document Und...\n",
-      "2     author  #/texts/2                                        Jiuxiang Gu\n",
-      "3     author  #/texts/2                                     Vlad I Morariu\n",
-      "4     author  #/texts/2                                       Handong Zhao\n",
-      "5     author  #/texts/2                                Nikolaos Barmpalios\n",
-      "6     author  #/texts/2                                         Rajiv Jain\n",
-      "7     author  #/texts/2                                        Ani Nenkova\n",
-      "8     author  #/texts/2                                           Tong Sun\n",
-      "9     author  #/texts/3                                     Adobe Research\n",
-      "10    author  #/texts/3                               Adobe Document Cloud\n",
+      "title:  Unified Pretraining Framework for Document Understanding\n",
+      "abstract:  Abstract Document intelligence automates the extraction of information from documents and supports many business applications. Recent self-supervised learning methods on large-scale unlabeled document datasets have opened up promising directions towards reducing annotation efforts by training models with self-supervised objectives. However, most of the existing document pretraining methods are still language-dominated. We present UDoc, a new unified pretraining framework for document understanding. UDoc is designed to support most document understanding tasks, extending the Transformer to take multimodal embeddings as input. Each input element is composed of words and visual features from a semantic region of the input document image. An important feature of UDoc is that it learns a generic representation by making use of three self-supervised losses, encouraging the representation to model sentences, learn similarities, and align modalities. Extensive empirical analysis demonstrates that the pretraining procedure learns better joint representations and leads to improvements in downstream tasks.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  14399465253152604874  DOCUMENT          #         en        1.00\n",
+      "1   metadata   4747122830571822511  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   9967082317227599151  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   3123119904347243024  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   2608806305983470192      TEXT  #/texts/0         en        0.35\n",
+      "5   semantic   2608806305983470192      TEXT  #/texts/0       text        0.96\n",
+      "6   language   4747122830571822511      TEXT  #/texts/1         en        0.68\n",
+      "7   semantic   4747122830571822511      TEXT  #/texts/1     header        0.86\n",
+      "8   language   4979022925906638436      TEXT  #/texts/2         en        0.19\n",
+      "9   semantic   4979022925906638436      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   6628659997364410072      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic   6628659997364410072      TEXT  #/texts/3  meta-data        1.00\n",
       "2308.10511.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Computer Science and Engineering Shahjalal Uni...\n",
-      "1    author  #/texts/3                                   Computer Science\n",
-      "2     title  #/texts/3                                     Shrestha Datta\n",
-      "3    author  #/texts/4                                      Raisa Fairooz\n",
+      "title:  Shrestha Datta\n",
+      "abstract:  Computer Science and Engineering Shahjalal University of Science and Technology Sylhet, Bangladesh raisafairoozshafa@gmail.com Tariful Islam Fahim Computer Science and Engineering Shahjalal University of Science and Technology Sylhet, Bangladesh tarifulislamfahim12@gmail.com $^{Abstract}$-Understanding digital documents is like solving a puzzle, especially historical ones. Document Layout Analysis (DLA) helps with this puzzle by dividing documents into sections like paragraphs, images, and tables. This is crucial for machines to read and understand these documents. In the DL Sprint 2.0 competition, we worked on understanding Bangla documents. We used a dataset called BaDLAD with lots of examples. We trained a special model called Mask R-CNN to help with this understanding. We made this model better by step-by-step hyperparameter tuning, and we achieved a good dice score of 0.889. However, not everything went perfectly. We tried using a model trained for English documents, but it didn't fit well with Bangla. This showed us that each language has its own challenges. Our solution for the DL Sprint 2.0 is publicly available at https://www.kaggle.com/competitions/dlsprint2/discussion/432201 along with notebooks, weights, and inference notebook. $^{Index Terms}$-Instant Segmentation, Mask-RCNN, DLA\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    260975602765610135  DOCUMENT          #         en        0.97\n",
+      "1   metadata  17590598390767860458  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   metadata   9809662086485383601  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  14265448797484868543  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata  16405199266218820589  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   metadata  18136302189533099052  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "6   language  11994155766495304257      TEXT  #/texts/0         en        0.34\n",
+      "7   semantic  11994155766495304257      TEXT  #/texts/0  reference        0.86\n",
+      "8   language   1717063881973427376      TEXT  #/texts/1         en        0.52\n",
+      "9   semantic   1717063881973427376      TEXT  #/texts/1     header        0.95\n",
+      "10  language  17590598390767860458      TEXT  #/texts/2         en        0.72\n",
+      "11  semantic  17590598390767860458      TEXT  #/texts/2  reference        0.78\n",
       "2205.08094.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We present MATrIX-a Modality-Aware Transformer...\n",
-      "1     title  #/texts/2  MATrIX-Modality-Aware Transformer for Informat...\n",
-      "2    author  #/texts/2                                     Thomas Delteil\n",
-      "3    author  #/texts/4                                     Edouard Belval\n",
-      "4    author  #/texts/6                                           Lei Chen\n",
+      "title:  MATrIX-Modality-Aware Transformer for Information eXtraction\n",
+      "abstract:  Abstract We present MATrIX-a Modality-Aware Transformer for Information eXtraction in the Visual Document Understanding (VDU) domain. VDU covers information extraction from visually rich documents such as forms, invoices, receipts, tables, graphs, presentations, or advertisements. In these, text semantics and visual information supplement each other to provide a global understanding of the document. MATrIX is pre-trained in an unsupervised way with specifically designed tasks that require the use of multimodal information (spatial, visual, or textual). We consider the spatial and text modalities all at once in a single token set. To make the attention more flexible, we use a learned modality-aware relative bias in the attention mechanism to modulate the attention between the tokens of different modalities. We evaluate MATrIX on 3 different datasets each with strong baselines.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  18149103541339865680  DOCUMENT           #         en   \n",
+      "1   metadata  12704912113251550677  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  11932713160379830690  DOCUMENT  #/texts/10   abstract   \n",
+      "3   metadata   2977944849444182431  DOCUMENT  #/texts/11   abstract   \n",
+      "4   language  15328238962955998856      TEXT   #/texts/0         en   \n",
+      "5   semantic  15328238962955998856      TEXT   #/texts/0  reference   \n",
+      "6   language  12704912113251550677      TEXT   #/texts/1         en   \n",
+      "7   semantic  12704912113251550677      TEXT   #/texts/1     header   \n",
+      "8   language   3511161955042061787      TEXT   #/texts/2         de   \n",
+      "9   semantic   3511161955042061787      TEXT   #/texts/2  meta-data   \n",
+      "10  language    351589254607964310      TEXT   #/texts/3         de   \n",
+      "11  semantic    351589254607964310      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.98  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.60  \n",
+      "5         0.66  \n",
+      "6         0.45  \n",
+      "7         0.83  \n",
+      "8         0.50  \n",
+      "9         0.99  \n",
+      "10        0.56  \n",
+      "11        0.93  \n",
       "2010.01762.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  In layout object detection problems, the groun...\n",
-      "1     title  #/texts/2  OLALA : Object-Level Active Learning based Lay...\n",
-      "2    author  #/texts/2                                       Zejiang Shen\n",
-      "3    author  #/texts/2                                          Jian Zhao\n",
-      "4    author  #/texts/2                                       Melissa Dell\n",
-      "5    author  #/texts/2                                        Yaoliang Yu\n",
-      "6    author  #/texts/2                                         Weining Li\n",
-      "7    author  #/texts/3                                 Harvard University\n",
+      "title:  OLALA : Object-Level Active Learning based Layout Annotation\n",
+      "abstract:  Abstract In layout object detection problems, the ground-truth datasets are constructed by annotating object instances individually. Yet active learning for object detection is typically conducted at the image level, not at the object level. Because objects appear with different frequencies across images, image-level active learning may be subject to over-exposure to common objects. This reduces the efficiency of human labeling. This work introduces an Object-Level Active Learning Layout Annotation framework, OLALA, which includes an object scoring method and a prediction correction algorithm. The object scoring method estimates the object prediction informativeness considering both the object category and the location. It selects only the most ambiguous object prediction regions within an image for annotators to label, optimizing the use of the annotation budget. For the unselected model predictions, we propose a correction algorithm to rectify two types of potential errors with minor supervision from ground-truths. The human annotated and model predicted objects are then merged as new image annotations for training the object detection models. In simulated labeling experiments, we show that OLALA helps to create the dataset more efficiently and report strong accuracy improvements of the trained models compared to image-level active learning baselines. The code is available at https://github.com/ lolipopshock/Detectron2 AL.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10724987884045795509  DOCUMENT          #         en        1.00\n",
+      "1   metadata   2022203075097451439  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata    621634545155468654  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   6076357810235503844  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  18121462109731020905      TEXT  #/texts/0         en        0.43\n",
+      "5   semantic  18121462109731020905      TEXT  #/texts/0  reference        0.94\n",
+      "6   language   2022203075097451439      TEXT  #/texts/1         en        0.51\n",
+      "7   semantic   2022203075097451439      TEXT  #/texts/1     header        0.93\n",
+      "8   language  10515800068930002976      TEXT  #/texts/2         en        0.27\n",
+      "9   semantic  10515800068930002976      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  16070955043266230473      TEXT  #/texts/3         en        0.65\n",
+      "11  semantic  16070955043266230473      TEXT  #/texts/3  meta-data        0.99\n",
       "2106.14616.pdf\n",
-      "  subtype  subj_path                                               name\n",
-      "0   title  #/texts/2  ICDAR 2021 Competition on Scientific Literatur...\n",
-      "1  author  #/texts/2                                        Peter Zhong\n",
-      "2  author  #/texts/2                                    Douglas Burdick\n",
-      "3  author  #/texts/7                                   Research Almaden\n",
+      "title:  ICDAR 2021 Competition on Scientific Literature Parsing\n",
+      "abstract:  ['contain important information related to cutting-edge innovations in diverse domains. Advances in natural language processing have been driving the fast development in automated information extraction from scientific literature. However, scientific literature is often available in unstructured PDF format. While PDF is great for preserving basic visual elements, such as characters, lines, shapes, etc., on a canvas for presentation to humans, automatic processing of the PDF format by machines presents many challenges. With over 2.5 trillion PDF documents in existence, these issues are prevalent in many other important application domains as well. Our ICDAR 2021 Scientific Literature Parsing Competition (ICDAR2021-SLP) aims to drive the advances specifically in document understanding. ICDAR2021-SLP leverages the PubLayNet and PubTabNet datasets, which provide hundreds of thousands of training and evaluation examples. In Task A, Document Layout Recognition, submissions with the highest performance combine object detection and specialised solutions for the different categories. In Task B, Table Recognition, top submissions rely on methods to identify table components and post-processing methods to generate the table structure and content. Results from both tasks show an impressive performance and opens the possibility for high performance practical applications.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17927538766348339794  DOCUMENT          #         en        1.00\n",
+      "1   metadata     90058776505785949  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   language  13303669633711505885      TEXT  #/texts/0         en        0.51\n",
+      "3   semantic  13303669633711505885      TEXT  #/texts/0  reference        0.67\n",
+      "4   language     90058776505785949      TEXT  #/texts/1         en        0.72\n",
+      "5   semantic     90058776505785949      TEXT  #/texts/1     header        0.69\n",
+      "6   language  13081534621825993538      TEXT  #/texts/2         en        0.42\n",
+      "7   semantic  13081534621825993538      TEXT  #/texts/2  meta-data        0.98\n",
+      "8   language   1534861440285625209      TEXT  #/texts/3         en        0.20\n",
+      "9   semantic   1534861440285625209      TEXT  #/texts/3  meta-data        1.00\n",
+      "10  language  17359734174191143350      TEXT  #/texts/4         en        0.87\n",
+      "11  semantic  17359734174191143350      TEXT  #/texts/4  meta-data        1.00\n",
       "2308.01979.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We present a comprehensive study of ...\n",
-      "1     title  #/texts/3              Saleem Ahmed() [0000-0001-8648-9625],\n",
-      "2    author  #/texts/3                                      Bhavin Jawade\n",
-      "3    author  #/texts/3                                     Shubham Pandey\n",
-      "4    author  #/texts/3                                 Srirangaraj Setlur\n",
-      "5    author  #/texts/3                                   Venu Govindaraju\n",
+      "title:  Saleem Ahmed() [0000-0001-8648-9625],\n",
+      "abstract:  Abstract. We present a comprehensive study of chart visual questionanswering(QA) task, to address the challenges faced in comprehending and extracting data from chart visualizations within documents. Despite efforts to tackle this problem using synthetic charts, solutions are limited by the shortage of annotated real-world data. To fill this gap, we introduce a benchmark and dataset for chart visual QA on real-world charts, offering a systematic analysis of the task and a novel taxonomy for template-based chart question creation. Our contribution includes the introduction of a new answer type, 'list', with both ranked and unranked variations. Our study is conducted on a real-world chart dataset from scientific literature, showcasing higher visual complexity compared to other works. Our focus is on template-based QA and how it can serve as a standard for evaluating the first-order logic capabilities of models. The results of our experiments, conducted on a real-world out-of-distribution dataset, provide a robust evaluation of large-scale pre-trained models and advance the field of chart visual QA and formal logic verification for neural networks in general. Our code and dataset is publicly available $^{1}$. Keywords: Charts and Document Understanding and Reasoning\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   3808109563705575166  DOCUMENT          #         en        0.99\n",
+      "1   metadata   4019322271052231808  DOCUMENT  #/texts/2      title        1.00\n",
+      "2   metadata  11773917095604246068  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  16220849979362681651  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  17985728316496339607      TEXT  #/texts/0         en        0.29\n",
+      "5   semantic  17985728316496339607      TEXT  #/texts/0  reference        0.86\n",
+      "6   language   9923502157441382971      TEXT  #/texts/1         en        0.79\n",
+      "7   semantic   9923502157441382971      TEXT  #/texts/1     header        0.70\n",
+      "8   language   4019322271052231808      TEXT  #/texts/2         en        0.26\n",
+      "9   semantic   4019322271052231808      TEXT  #/texts/2  reference        0.64\n",
+      "10  language   5576611028079447637      TEXT  #/texts/3         en        0.46\n",
+      "11  semantic   5576611028079447637      TEXT  #/texts/3  meta-data        0.93\n",
       "2308.15517.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Document AI aims to automatically analyze docu...\n",
-      "1     title  #/texts/1  Document AI: A Comparative Study of Transforme...\n",
-      "2    author  #/texts/2                                      Shaomu Tan Uv\n",
+      "title:  Document AI: A Comparative Study of Transformer-Based, Graph-Based Models, and Convolutional Neural Networks For Document Layout Analysis\n",
+      "abstract:  ABSTRACT Document AI aims to automatically analyze documents by leveraging natural language processing and computer vision techniques. One of the major tasks of Document AI is document layout analysis, which structures document pages by interpreting the content and spatial relationships of layout, image, and text. This task can be image-centric, wherein the aim is to identify and label various regions such as authors and paragraphs, or text-centric, where the focus is on classifying individual words in a document. Although there are increasingly sophisticated methods for improving layout analysis, doubts remain about the extent to which their findings can be generalized to a broader context. Specifically, prior work developed systems based on very different architectures, such as transformer-based, graph-based, and CNNs. However, no work has mentioned the effectiveness of these models in a comparative analysis. Moreover, while language-independent Document AI models capable of knowledge transfer have been developed, it remains to be investigated to what degree they can effectively transfer knowledge. In this study, we aim to fill these gaps by conducting a comparative evaluation of state-of-the-art models in document layout analysis and investigating the potential of cross-lingual layout analysis by utilizing machine translation techniques. KEYWORDS Document AI, Document Layout Analysis, Vision and Language, Multilingual Document Understanding, Machine Translation\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13637588366794903612  DOCUMENT          #         en        0.99\n",
+      "1   metadata  18194790379355975476  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   metadata   6741446958383146662  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  15052902537839731467  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata   6741617251885352993  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   metadata  12872026505660432034  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "6   language  18194790379355975476      TEXT  #/texts/0         en        0.60\n",
+      "7   semantic  18194790379355975476      TEXT  #/texts/0     header        0.70\n",
+      "8   language  11017632467139747605      TEXT  #/texts/1         en        0.29\n",
+      "9   semantic  11017632467139747605      TEXT  #/texts/1  meta-data        1.00\n",
+      "10  language   1435601097181785708      TEXT  #/texts/2         nl        0.10\n",
+      "11  semantic   1435601097181785708      TEXT  #/texts/2  meta-data        1.00\n",
       "2012.14740.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Pre-training of text and layout has proved eff...\n",
-      "1      title  #/texts/2  LAYOUTLMV2: MULTI-MODAL PRE-TRAINING FOR VISUA...\n",
-      "2     author  #/texts/2                                            Yang Xu\n",
-      "3     author  #/texts/2                                          Yiheng Xu\n",
-      "4     author  #/texts/2                                            Lei Cui\n",
-      "5     author  #/texts/2                                           Furu Wei\n",
-      "6     author  #/texts/2                                        Guoxin Wang\n",
-      "7     author  #/texts/2                                          Yijuan Lu\n",
-      "8     author  #/texts/3                                    Dinei Florencio\n",
-      "9     author  #/texts/3                                          Cha Zhang\n",
-      "10    author  #/texts/3                                       Wanxiang Che\n",
-      "11    author  #/texts/3                                          Min Zhang\n",
-      "12    author  #/texts/3                                        Lidong Zhou\n",
-      "13    author  #/texts/5                            Microsoft Research Asia\n",
-      "14    author  #/texts/6                                    Microsoft Cloud\n",
-      "15    author  #/texts/7                                 Soochow University\n",
+      "title:  LAYOUTLMV2: MULTI-MODAL PRE-TRAINING FOR VISUALLY-RICH DOCUMENT UNDERSTANDING\n",
+      "abstract:  ABSTRACT Pre-training of text and layout has proved effective in a variety of visuallyrich document understanding tasks due to its effective model architecture and the advantage of large-scale unlabeled scanned/digital-born documents. In this paper, we present LayoutLMv2 by pre-training text, layout and image in a multi-modal framework, where new model architectures and pre-training tasks are leveraged. Specifically, LayoutLMv2 not only uses the existing masked visual-language modeling task but also the new text-image alignment and textimage matching tasks in the pre-training stage, where cross-modality interaction is better learned. Meanwhile, it also integrates a spatial-aware selfattention mechanism into the Transformer architecture, so that the model can fully understand the relative positional relationship among different text blocks. Experiment results show that LayoutLMv2 outperforms strong baselines and achieves new state-of-the-art results on a wide variety of downstream visuallyrich document understanding tasks, including FUNSD (0.7895 → 0.8420), CORD (0.9493 → 0.9601), SROIE (0.9524 → 0.9781), Kleister-NDA (0.834 → 0.852), RVL-CDIP (0.9443 → 0.9564), and DocVQA (0.7295 → 0.8672).\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  14425372393029709547  DOCUMENT           #         en   \n",
+      "1   metadata  11474741870108700040  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   4857877928660915249  DOCUMENT  #/texts/12   abstract   \n",
+      "3   metadata  11909429981990900791  DOCUMENT  #/texts/13   abstract   \n",
+      "4   language   9433559876163775546      TEXT   #/texts/0         en   \n",
+      "5   semantic   9433559876163775546      TEXT   #/texts/0  reference   \n",
+      "6   language  11474741870108700040      TEXT   #/texts/1         en   \n",
+      "7   semantic  11474741870108700040      TEXT   #/texts/1     header   \n",
+      "8   language  17478075665133754721      TEXT   #/texts/2         en   \n",
+      "9   semantic  17478075665133754721      TEXT   #/texts/2  meta-data   \n",
+      "10  language  11558241638393646319      TEXT   #/texts/3         en   \n",
+      "11  semantic  11558241638393646319      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.98  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.37  \n",
+      "5         0.94  \n",
+      "6         0.37  \n",
+      "7         0.85  \n",
+      "8         0.40  \n",
+      "9         1.00  \n",
+      "10        0.13  \n",
+      "11        1.00  \n",
       "2304.12506.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Online learning and academic conferen...\n",
-      "1     title  #/texts/2  DualSlide: Global-to-Local Sketching Interface...\n",
-      "2    author  #/texts/2                                         Xusheng Du\n",
-      "3    author  #/texts/2                                         Haoran Xie\n",
+      "title:  DualSlide: Global-to-Local Sketching Interface for Slide Content and Layout Design\n",
+      "abstract:  Abstract-Online learning and academic conferences have become pervasive and essential for education and professional development, especially since the onset of pandemics. Academic presentations usually require well-designed slides that are easily understood. Sketches that visually represent design intentions and are readily accessible to the average users. To assist non-expert users in creating visually appealing academic slides, we propose DualSlide, a global and local two-stage sketching interface system that provides image retrieval and user guidance. At the global stage, DualSlide provides a heat map canvas to display the distribution of all slide layouts in a dataset, allowing users to explore the reference slides efficiently. At the local stage of the system, detailed references and guidance for designing slide content, such as diagrams and fonts, can be provided. We further propose a sketch-matching algorithm to compare the user's input sketch and similar diagrams. All user guidance can be adapted in real-time editing, and users can design slides with a high degree of freedom. We conducted a user study to verify the effectiveness and usability of the proposed DualSlide system confirming that DualSlide provides high retrieval accuracy and satisfactory design results with a good user experience. Index Terms-two-stage design, sketching interface, slides, layout design\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   5705758540542284734  DOCUMENT          #         en        1.00\n",
+      "1   metadata   5519579206395144526  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  16967503306961863663  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   6035262646238865268  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   1595675669711621451      TEXT  #/texts/0         en        0.45\n",
+      "5   semantic   1595675669711621451      TEXT  #/texts/0       text        0.96\n",
+      "6   language   5519579206395144526      TEXT  #/texts/1         en        0.63\n",
+      "7   semantic   5519579206395144526      TEXT  #/texts/1     header        0.92\n",
+      "8   language   7737451635683353484      TEXT  #/texts/2         en        0.45\n",
+      "9   semantic   7737451635683353484      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   4795755608797352799      TEXT  #/texts/3         en        0.20\n",
+      "11  semantic   4795755608797352799      TEXT  #/texts/3  meta-data        0.94\n",
       "2008.10831.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Localizing page elements/objects such...\n",
-      "1     title  #/texts/2  CDeC-Net: Composite Deformable Cascade Network...\n",
-      "2    author  #/texts/2                                     Madhav Agarwal\n",
-      "3    author  #/texts/4                                        C V Jawahar\n",
+      "title:  CDeC-Net: Composite Deformable Cascade Network for Table Detection in Document Images\n",
+      "abstract:  Abstract-Localizing page elements/objects such as tables, figures, equations, etc. is the primary step in extracting information from document images. We propose a novel endto-end trainable deep network, (CDeC-Net) for detecting tables present in the documents. The proposed network consists of a multistage extension of Mask R-CNN with a dual backbone having deformable convolution for detecting tables varying in scale with high detection accuracy at higher IoU threshold. We empirically evaluate CDeC-Net on all the publicly available benchmark datasets-ICDAR-2013, ICDAR-2017, ICDAR-2019, UNLV, Marmot, PubLayNet, and TableBank-with extensive experiments. Our solution has three important properties: (i) a single trained model CDeC-Net ‡ performs well across all the popular benchmark datasets; (ii) we report excellent performances across multiple, including higher, thresholds of IoU; (iii) by following the same protocol of the recent papers for each of the benchmarks, we consistently demonstrate the superior quantitative performance. Our code and models will be publicly released for enabling the reproducibility of the results. Keywords-Page object, table detection, Cascade Mask R-CNN, deformable convolution, single model.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7500632956678550284  DOCUMENT          #         en        1.00\n",
+      "1   metadata   3886870210686438049  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7844835822484761539  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  16959079097357691233  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata  10826267580776457888  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   language   8245647462462407059      TEXT  #/texts/0         en        0.42\n",
+      "6   semantic   8245647462462407059      TEXT  #/texts/0  reference        0.99\n",
+      "7   language   3886870210686438049      TEXT  #/texts/1         en        0.45\n",
+      "8   semantic   3886870210686438049      TEXT  #/texts/1     header        0.72\n",
+      "9   language  15487035232066340389      TEXT  #/texts/2         en        0.56\n",
+      "10  semantic  15487035232066340389      TEXT  #/texts/2  meta-data        0.92\n",
+      "11  language  17396645985426887891      TEXT  #/texts/3         en        0.29\n",
       "2301.11529.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Layout design is an important task in various ...\n",
-      "1     title  #/texts/2  PLay: Parametrically Conditioned Layout Genera...\n",
-      "2    author  #/texts/2                                      Forrest Huang\n",
-      "3    author  #/texts/2                                            Gang Li\n",
-      "4    author  #/texts/2                                            Yang Li\n",
+      "title:  PLay: Parametrically Conditioned Layout Generation using Latent Diffusion\n",
+      "abstract:  Abstract Layout design is an important task in various design fields, including user interfaces, document, and graphic design. As this task requires tedious manual effort by designers, prior works have attempted to automate this process using generative models, but commonly fell short of providing intuitive user controls and achieving design objectives. In this paper, we build a conditional latent diffusion model, PLay, that generates parametrically conditioned layouts in vector graphic space from user-specified guidelines, which are commonly used by designers for representing their design intents in current practices. Our method outperforms prior works across three datasets on metrics including FID and FD-VG, and in user test. Moreover, it brings a novel and interactive experience to professional layout design processes.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   2466707221960475363  DOCUMENT          #         en        1.00\n",
+      "1   metadata   2108956444558052725  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6816454215830934296  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  14315677914332171523  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   7964423042393186425      TEXT  #/texts/0         en        0.35\n",
+      "5   semantic   7964423042393186425      TEXT  #/texts/0       text        0.97\n",
+      "6   language   2108956444558052725      TEXT  #/texts/1         en        0.32\n",
+      "7   semantic   2108956444558052725      TEXT  #/texts/1     header        0.76\n",
+      "8   language   1180747206188371815      TEXT  #/texts/2         en        0.37\n",
+      "9   semantic   1180747206188371815      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   6816454215830934296      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic   6816454215830934296      TEXT  #/texts/3     header        0.93\n",
       "2308.13769.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Document digitization is vital for pr...\n",
-      "1     title  #/texts/2   Bengali Document Layout Analysis with Detectron2\n",
-      "2    author  #/texts/2                                        Md Ataullha\n",
-      "3    author  #/texts/2                                Mahedi Hassan Rabby\n",
-      "4    author  #/texts/2                                   Mushfiqur Rahman\n",
-      "5    author  #/texts/2                                Tahsina Bintay Azam\n",
+      "title:  Bengali Document Layout Analysis with Detectron2\n",
+      "abstract:  Abstract-Document digitization is vital for preserving historical records, efficient document management, and advancing OCR (Optical Character Recognition) research. Document Layout Analysis (DLA) involves segmenting documents into meaningful units like text boxes, paragraphs, images, and tables. Challenges arise when dealing with diverse layouts, historical documents, and unique scripts like Bengali, hindered by the lack of comprehensive Bengali DLA datasets. We improved the accuracy of the DLA model for Bengali documents by utilizing advanced Mask R-CNN models available in the Detectron2 library. Our evaluation involved three variants: Mask R-CNN R-50, R-101, and X-101, both with and without pretrained weights from PubLayNet, on the BaDLAD dataset, which contains human-annotated Bengali documents in four categories: text boxes, paragraphs, images, and tables. Results show the effectiveness of these models in accurately segmenting Bengali documents. We discuss speed-accuracy tradeoffs and underscore the significance of pretrained weights. Our findings expand the applicability of Mask R-CNN in document layout analysis, efficient document management, and OCR research while suggesting future avenues for fine-tuning and data augmentation. Index Terms-Mask R-CNN, Instance Segmentation, Transfer Learning\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17548983273677985646  DOCUMENT          #         en        1.00\n",
+      "1   metadata    559063730110845019  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11324618554899354160  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata  10644090233689937859  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   metadata  11368196014538905625  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "5   metadata   4911548022119256481  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "6   language   3042703310976267912      TEXT  #/texts/0         en        0.31\n",
+      "7   semantic   3042703310976267912      TEXT  #/texts/0  reference        0.86\n",
+      "8   language    559063730110845019      TEXT  #/texts/1         en        0.48\n",
+      "9   semantic    559063730110845019      TEXT  #/texts/1     header        0.82\n",
+      "10  language  14447214234305273736      TEXT  #/texts/2         en        0.23\n",
+      "11  semantic  14447214234305273736      TEXT  #/texts/2  meta-data        0.99\n",
       "2112.12703.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Archivists, textual scholars, and hi...\n",
-      "1     title  #/texts/2  Digital Editions as Distant Supervision for La...\n",
-      "2    author  #/texts/2                                Alejandro H Toselli\n",
-      "3    author  #/texts/2                                              Si Wu\n",
-      "4    author  #/texts/2                                      David A Smith\n",
-      "5    author  #/texts/4                            Northeastern University\n",
+      "title:  Digital Editions as Distant Supervision for Layout Analysis of Printed Books ⋆\n",
+      "abstract:  Abstract. Archivists, textual scholars, and historians often produce digital editions of historical documents. Using markup schemes such as those of the Text Encoding Initiative and EpiDoc, these digital editions often record documents' semantic regions (such as notes and figures) and physical features (such as page and line breaks) as well as transcribing their textual content. We describe methods for exploiting this semantic markup as distant supervision for training and evaluating layout analysis models. In experiments with several model architectures on the half-million pages of the Deutsches Textarchiv (DTA), we find a high correlation of these region-level evaluation methods with pixel-level and word-level metrics. We discuss the possibilities for improving accuracy with self-training and the ability of models trained on the DTA to generalize to other historical printed books. Keywords: Layout analysis · Distant supervision · Evaluation.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15802633760579864106  DOCUMENT          #         en        0.99\n",
+      "1   metadata   6470581748295618029  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13796643975352810616  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   3726832405081047624  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  11473093061238217778      TEXT  #/texts/0         en        0.59\n",
+      "5   semantic  11473093061238217778      TEXT  #/texts/0  reference        0.89\n",
+      "6   language   6470581748295618029      TEXT  #/texts/1         en        0.91\n",
+      "7   semantic   6470581748295618029      TEXT  #/texts/1     header        0.88\n",
+      "8   language  16387431572513897996      TEXT  #/texts/2         en        0.31\n",
+      "9   semantic  16387431572513897996      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   7570883125013524308      TEXT  #/texts/3         en        0.75\n",
+      "11  semantic   7570883125013524308      TEXT  #/texts/3  meta-data        0.96\n",
       "2305.08455.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  We call on the Document AI (DocAI) community t...\n",
-      "1      title  #/texts/2  Document Understanding Dataset and Evaluation ...\n",
-      "2     author  #/texts/2                                Jordy Van Landeghem\n",
-      "3     author  #/texts/2                                   Dawid Jurkiewicz\n",
-      "4     author  #/texts/2                                   Bertrand Ackaert\n",
-      "5     author  #/texts/2                                     Ernest Valveny\n",
-      "6     author  #/texts/2                                   Matthew Blaschko\n",
-      "7     author  #/texts/2                                         Sien Moens\n",
-      "8     author  #/texts/2                             Computer Vision Center\n",
-      "9     author  #/texts/2                                    Universitat Aut\n",
-      "10    author  #/texts/2                            Jagiellonian University\n",
-      "11    author  #/texts/2                         Adam Mickiewicz University\n",
+      "title:  Document Understanding Dataset and Evaluation (DUDE)\n",
+      "abstract:  Abstract We call on the Document AI (DocAI) community to reevaluate current methodologies and embrace the challenge of creating more practically-oriented benchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to remediate the halted research progress in understanding visually-rich documents (VRDs). We present a new dataset with novelties related to types of questions, answers, and document layouts based on multi-industry, multi-domain, and multi-page VRDs of various origins, and dates. Moreover, we are pushing the boundaries of current methods by creating multi-task and multi-domain evaluation setups that more accurately simulate real-world situations where powerful generalization and adaptation under low-resource settings are desired. DUDE aims to set a new standard as a more practical, long-standing benchmark for the community, and we hope that it will lead to future extensions and contributions that address real-world challenges. Finally, our work illustrates the importance of finding more efficient ways to model language, images, and layout in DocAI.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    101647772389983397  DOCUMENT          #         en        0.99\n",
+      "1   metadata  10331356580876402369  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14331103659799847498  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata   4393248028108810465  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   7099592825139238992      TEXT  #/texts/0         en        0.61\n",
+      "5   semantic   7099592825139238992      TEXT  #/texts/0  reference        0.66\n",
+      "6   language  10331356580876402369      TEXT  #/texts/1         en        0.48\n",
+      "7   semantic  10331356580876402369      TEXT  #/texts/1     header        0.88\n",
+      "8   language   9476829149127177963      TEXT  #/texts/2         pl        0.18\n",
+      "9   semantic   9476829149127177963      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  14331103659799847498      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic  14331103659799847498      TEXT  #/texts/3     header        0.93\n",
       "2212.12975.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  It is difficult to design a visually appealing...\n",
-      "1     title  #/texts/2  Interactive Layout Drawing Interface with Shad...\n",
-      "2    author  #/texts/2                                         Haoran Xie\n",
+      "title:  Interactive Layout Drawing Interface with Shadow Guidance\n",
+      "abstract:  ABSTRACT It is difficult to design a visually appealing layout for common users, which takes time even for professional designers. In this paper, we present an interactive layout design system with shadow guidance and layout retrieval to help users obtain satisfactory design results. This study focuses in particular on the design of academic presentation slides. The user may refer to the shadow guidance as a heat map, which is the layout distribution of our gathered data set, using the suggested shadow guidance. The suggested system is datadriven, allowing users to analyze the design data naturally. The layout may then be edited by the user to finalize the layout design. We validated the suggested interface in our user study by comparing it with common design interfaces. The findings show that the suggested interface may achieve high retrieval accuracy while simultaneously offering a pleasant user experience. Keywords: Educational video, slide-based video, user interface, layout design\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17236209161277235289  DOCUMENT          #         en        1.00\n",
+      "1   metadata   9715825528859916993  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2496493818778864004  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   8708013986464472762  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata  16728472205694020197  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   language   7593916797503527452      TEXT  #/texts/0         en        0.67\n",
+      "6   semantic   7593916797503527452      TEXT  #/texts/0       text        0.57\n",
+      "7   language   9715825528859916993      TEXT  #/texts/1         en        0.64\n",
+      "8   semantic   9715825528859916993      TEXT  #/texts/1     header        0.87\n",
+      "9   language   3488045698984828010      TEXT  #/texts/2         en        0.84\n",
+      "10  semantic   3488045698984828010      TEXT  #/texts/2  meta-data        1.00\n",
+      "11  language   8233996923867261655      TEXT  #/texts/3         en        0.65\n",
       "2306.05749.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Recently, there has been a growing interest in...\n",
-      "1     title  #/texts/2  DocAligner: Annotating Real-world Photographic...\n",
-      "2    author  #/texts/2                                       Jiaxin Zhang\n",
-      "3    author  #/texts/2                                      Bangdong Chen\n",
-      "4    author  #/texts/2                                        Hiuyi Cheng\n",
-      "5    author  #/texts/2                                        Fengjun Guo\n",
-      "6    author  #/texts/2                                           Kai Ding\n",
-      "7    author  #/texts/2                                        Lianwen Jin\n",
-      "8    author  #/texts/3                                 Sig Information Co\n",
+      "title:  DocAligner: Annotating Real-world Photographic Document Images by Simply Taking Pictures\n",
+      "abstract:  Abstract Recently, there has been a growing interest in research concerning document image analysis and recognition in photographic scenarios. However, the lack of labeled datasets for this emerging challenge poses a significant obstacle, as manual annotation can be time-consuming and impractical. To tackle this issue, we present DocAligner, a novel method that streamlines the manual annotation process to a simple step of taking pictures. DocAligner achieves this by establishing dense correspondence between photographic document images and their clean counterparts. It enables the automatic transfer of existing annotations in clean document images to photographic ones and helps to automatically acquire labels that are unavailable through manual labeling. Considering the distinctive characteristics of document images, DocAligner incorporates several innovative features. First, we propose a non-rigid pre-alignment technique based on the document's edges, which effectively eliminates interference caused by significant global shifts and repetitive patterns present in document images. Second, to handle large shifts and ensure high accuracy, we introduce a hierarchical aligning approach that combines global and local correlation layers. Furthermore, considering the importance of fine-grained elements in document images, we present a details recurrent refinement module to enhance the output in a high-resolution space. To train DocAligner, we construct a synthetic dataset and introduce a self-supervised learning approach to enhance its robustness for real-world data. Through extensive experiments, we demonstrate the effectiveness of DocAligner and the acquired dataset. Datasets and codes will be publicly available.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16350862388529045349  DOCUMENT          #         en        1.00\n",
+      "1   metadata   1693234739285510286  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   8284927886365265466  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  12966088567558872775  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   4212173990135466214      TEXT  #/texts/0         en        0.32\n",
+      "5   semantic   4212173990135466214      TEXT  #/texts/0  reference        0.95\n",
+      "6   language   1693234739285510286      TEXT  #/texts/1         en        0.70\n",
+      "7   semantic   1693234739285510286      TEXT  #/texts/1     header        0.46\n",
+      "8   language   4770053369267445470      TEXT  #/texts/2         en        0.29\n",
+      "9   semantic   4770053369267445470      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   1088813220980863929      TEXT  #/texts/3         en        0.48\n",
+      "11  semantic   1088813220980863929      TEXT  #/texts/3  meta-data        0.99\n",
       "2303.11589.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Creating graphic layouts is a fundamental step...\n",
-      "1     title  #/texts/2  LayoutDiffusion: Improving Graphic Layout Gene...\n",
-      "2    author  #/texts/2                                        Junyi Zhang\n",
-      "3    author  #/texts/2                                          Jiaqi Guo\n",
-      "4    author  #/texts/2                                        Shizhao Sun\n",
-      "5    author  #/texts/2                                      Dongmei Zhang\n",
-      "6    author  #/texts/2                      Shanghai Jiao Tong University\n",
-      "7    author  #/texts/2                            Microsoft Research Asia\n",
+      "title:  LayoutDiffusion: Improving Graphic Layout Generation by Discrete Diffusion Probabilistic Models\n",
+      "abstract:  Abstract Creating graphic layouts is a fundamental step in graphic designs. In this work, we present a novel generative model named LayoutDiffusion for automatic layout generation. As layout is typically represented as a sequence of discrete tokens, LayoutDiffusion models layout generation as a discrete denoising diffusion process. It learns to reverse a mild forward process, in which layouts become increasingly chaotic with the growth of forward steps and layouts in the neighboring steps do not differ too much. Designing such a mild forward process is however very challenging as layout has both categorical attributes and ordinal attributes. To tackle the challenge, we summarize three critical factors for achieving a mild forward process for the layout, i.e., legality, coordinate proximity and type disruption. Based on the factors, we propose a block-wise transition matrix coupled with a piece-wise linear noise schedule. Experiments on RICO and PubLayNet datasets show that Layout-Diffusion outperforms state-of-the-art approaches significantly. Moreover, it enables two conditional layout generation tasks in a plug-and-play manner without re-training and achieves better performance than existing methods.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9641950465761553286  DOCUMENT          #         en        0.99\n",
+      "1   metadata   3540765956142751839  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15449882581325865178  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  11577166392640436701  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language   6096154047150880050      TEXT  #/texts/0         en        0.30\n",
+      "5   semantic   6096154047150880050      TEXT  #/texts/0       text        0.89\n",
+      "6   language   3540765956142751839      TEXT  #/texts/1         en        0.53\n",
+      "7   semantic   3540765956142751839      TEXT  #/texts/1     header        0.83\n",
+      "8   language  16684261228906829665      TEXT  #/texts/2         en        0.49\n",
+      "9   semantic  16684261228906829665      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   7858426494994361717      TEXT  #/texts/3         ro        0.11\n",
+      "11  semantic   7858426494994361717      TEXT  #/texts/3  meta-data        0.98\n",
       "2012.08191.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-We present docExtractor, a generic ap...\n",
-      "1     title  #/texts/2  docExtractor: An off-the-shelf historical docu...\n",
-      "2    author  #/texts/2                                        Tom Monnier\n",
-      "3    author  #/texts/2                                      Mathieu Aubry\n",
-      "4    author  #/texts/3                                Univ Gustave Eiffel\n",
+      "title:  docExtractor: An off-the-shelf historical document element extraction\n",
+      "abstract:  Abstract-We present docExtractor, a generic approach for extracting visual elements such as text lines or illustrations from historical documents without requiring any real data annotation. We demonstrate it provides high-quality performances as an offthe-shelf system across a wide variety of datasets and leads to results on par with state-of-the-art when fine-tuned. We argue that the performance obtained without fine-tuning on a specific dataset is critical for applications, in particular in digital humanities, and that the line-level page segmentation we address is the most relevant for a general purpose element extraction engine. We rely on a fast generator of rich synthetic documents and design a fully convolutional network, which we show to generalize better than a detection-based approach. Furthermore, we introduce a new public dataset dubbed IlluHisDoc dedicated to the fine evaluation of illustration segmentation in historical documents. Index Terms-deep learning, document layout analysis, historical document, page segmentation, text line detection, synthetic data\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  12746519129587114900  DOCUMENT          #         en        0.99\n",
+      "1   metadata   6487608977047476565  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   5025046227996738281  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata    336290989734687421  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language  16017635334618188321      TEXT  #/texts/0         en        0.50\n",
+      "5   semantic  16017635334618188321      TEXT  #/texts/0  reference        0.99\n",
+      "6   language   6487608977047476565      TEXT  #/texts/1         en        0.74\n",
+      "7   semantic   6487608977047476565      TEXT  #/texts/1       text        0.49\n",
+      "8   language   5569259395429831475      TEXT  #/texts/2         en        0.58\n",
+      "9   semantic   5569259395429831475      TEXT  #/texts/2  meta-data        0.98\n",
+      "10  language   1878354837183421307      TEXT  #/texts/3         fr        0.23\n",
+      "11  semantic   1878354837183421307      TEXT  #/texts/3  meta-data        0.95\n",
       "2106.11539.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We present DocFormer-a multi-modal transformer...\n",
-      "1     title  #/texts/2  DocFormer: End-to-End Transformer for Document...\n",
-      "2    author  #/texts/2                                  Srikar Appalaraju\n",
-      "3    author  #/texts/4                                      Bhavan Jasani\n",
-      "4    author  #/texts/8                                         R Manmatha\n",
+      "title:  DocFormer: End-to-End Transformer for Document Understanding\n",
+      "abstract:  Abstract We present DocFormer-a multi-modal transformer based architecture for the task of Visual Document Understanding (VDU). VDU is a challenging problem which aims to understand documents in their varied formats (forms, receipts etc.) and layouts. In addition, DocFormer is pre-trained in an unsupervised fashion using carefully designed tasks which encourage multi-modal interaction. DocFormer uses text, vision and spatial features and combines them using a novel multi-modal self-attention layer. DocFormer also shares learned spatial embeddings across modalities which makes it easy for the model to correlate text to visual tokens and vice versa. DocFormer is evaluated on 4 different datasets each with strong baselines. DocFormer achieves state-of-the-art results on all of them, sometimes beating models 4x its size (in no. of parameters).\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   9235921951217316380  DOCUMENT           #         en   \n",
+      "1   metadata   9729555286773671600  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata  15482861471896913137  DOCUMENT  #/texts/10   abstract   \n",
+      "3   metadata   2622808252535089031  DOCUMENT  #/texts/11   abstract   \n",
+      "4   language   5106629619373635080      TEXT   #/texts/0         en   \n",
+      "5   semantic   5106629619373635080      TEXT   #/texts/0  reference   \n",
+      "6   language   9729555286773671600      TEXT   #/texts/1         en   \n",
+      "7   semantic   9729555286773671600      TEXT   #/texts/1     header   \n",
+      "8   language   5467895438401112969      TEXT   #/texts/2         en   \n",
+      "9   semantic   5467895438401112969      TEXT   #/texts/2  meta-data   \n",
+      "10  language   9961559843998000328      TEXT   #/texts/3         en   \n",
+      "11  semantic   9961559843998000328      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.99  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.33  \n",
+      "5         0.95  \n",
+      "6         0.68  \n",
+      "7         0.57  \n",
+      "8         0.62  \n",
+      "9         0.97  \n",
+      "10        0.27  \n",
+      "11        1.00  \n",
       "2308.12896.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  This paper highlights the need to bring docume...\n",
-      "1     title  #/texts/2  Beyond Document Page Classification: Design, D...\n",
-      "2    author  #/texts/2                                Jordy Van Landeghem\n",
-      "3    author  #/texts/2                                      Sanket Biswas\n",
-      "4    author  #/texts/2                                   Matthew Blaschko\n",
-      "5    author  #/texts/3                             Computer Vision Center\n",
-      "6    author  #/texts/3                                    Universitat Aut\n",
+      "title:  Beyond Document Page Classification: Design, Datasets, and Challenges\n",
+      "abstract:  Abstract This paper highlights the need to bring document classification benchmarking closer to real-world applications, both in the nature of data tested (X : multi-channel, multipaged, multi-industry; Y : class distributions and label set variety) and in classification tasks considered (f : multipage document, page stream, and document bundle classification,...). We identify the lack of public multi-page document classification datasets, formalize different classification tasks arising in application scenarios, and motivate the value of targeting efficient multi-page document representations. An experimental study on proposed multi-page document classification datasets demonstrates that current benchmarks have become irrelevant and need to be updated to evaluate complete documents, as they naturally occur in practice. This reality check also calls for more mature evaluation methodologies, covering calibration evaluation, inference complexity (time-memory), and a range of realistic distribution shifts (e.g., born-digital vs. scanning noise, shifting page order). Our study ends on a hopeful note by recommending concrete avenues for future improvements.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4395650766487548677  DOCUMENT          #         en        0.99\n",
+      "1   metadata  14640984598143256974  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  12465491340188692402  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  10875145229315404845  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  15855267480109489419      TEXT  #/texts/0         en        0.35\n",
+      "5   semantic  15855267480109489419      TEXT  #/texts/0  reference        0.86\n",
+      "6   language  14640984598143256974      TEXT  #/texts/1         en        0.77\n",
+      "7   semantic  14640984598143256974      TEXT  #/texts/1  reference        0.56\n",
+      "8   language   6853249753038199094      TEXT  #/texts/2         en        0.28\n",
+      "9   semantic   6853249753038199094      TEXT  #/texts/2  meta-data        0.97\n",
+      "10  language   6836684947553125863      TEXT  #/texts/3         ca        0.14\n",
+      "11  semantic   6836684947553125863      TEXT  #/texts/3  meta-data        0.95\n",
       "2301.10140.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  The volume of scientific output is creating an...\n",
-      "1      title  #/texts/2            The Semantic Scholar Open Data Platform\n",
-      "2     author  #/texts/2                                 Chloe Anastasiades\n",
-      "3     author  #/texts/2                                     Russell Authur\n",
-      "4     author  #/texts/2                                         Iz Beltagy\n",
-      "5     author  #/texts/2                                     Jonathan Bragg\n",
-      "6     author  #/texts/2                              Alexandra Buraczynski\n",
-      "7     author  #/texts/2                                     Isabel Cachola\n",
-      "8     author  #/texts/2                                      Stefan Candra\n",
-      "9     author  #/texts/2                             Yoganand Chandrasekhar\n",
-      "10    author  #/texts/2                                        Arman Cohan\n",
-      "11    author  #/texts/2                                        Doug Downey\n",
-      "12    author  #/texts/2                                          Rob Evans\n",
-      "13    author  #/texts/2                                     Sergey Feldman\n",
-      "14    author  #/texts/2                                      Joseph Gorney\n",
-      "15    author  #/texts/2                                       David Graham\n",
-      "16    author  #/texts/2                                        Fangzhou Hu\n",
-      "17    author  #/texts/2                                         Regan Huff\n",
-      "18    author  #/texts/2                                        Daniel King\n",
-      "19    author  #/texts/2                                       Bailey Kuehl\n",
-      "20    author  #/texts/2                                     Michael Langan\n",
-      "21    author  #/texts/2                                         Daniel Lin\n",
-      "22    author  #/texts/2                                         Haokun Liu\n",
-      "23    author  #/texts/2                                            Kyle Lo\n",
-      "24    author  #/texts/2                                   Kelsey MacMillan\n",
-      "25    author  #/texts/2                                       Tyler Murray\n",
-      "26    author  #/texts/2                                       Chris Newell\n",
-      "27    author  #/texts/2                                          Smita Rao\n",
-      "28    author  #/texts/2                                    Shaurya Rohatgi\n",
-      "29    author  #/texts/2                                         Paul Sayre\n",
-      "30    author  #/texts/2                                       Zejiang Shen\n",
-      "31    author  #/texts/2                                    Amanpreet Singh\n",
-      "32    author  #/texts/2                                      Luca Soldaini\n",
-      "33    author  #/texts/2                           Shivashankar Subramanian\n",
-      "34    author  #/texts/2                                       Amber Tanaka\n",
-      "35    author  #/texts/2                                        Alex D Wade\n",
-      "36    author  #/texts/2                                       Linda Wagner\n",
-      "37    author  #/texts/2                                       Lucy Lu Wang\n",
-      "38    author  #/texts/2                                        Caroline Wu\n",
-      "39    author  #/texts/2                                    Jiangjiang Yang\n",
-      "40    author  #/texts/2                                    Angele Zamarron\n",
-      "41    author  #/texts/2                               Madeleine Van Zuylen\n",
-      "42    author  #/texts/2                                      Daniel S Weld\n",
+      "title:  The Semantic Scholar Open Data Platform\n",
+      "abstract:  Abstract The volume of scientific output is creating an urgent need for automated tools to help scientists keep up with developments in their field. Semantic Scholar (S2) is an open data platform and website aimed at accelerating science by helping scholars discover and understand scientific literature. We combine public and proprietary data sources using state-of-theart techniques for scholarly PDF content extraction and automatic knowledge graph construction to build the Semantic Scholar Academic Graph, the largest open scientific literature graph to-date, with 200M+ papers, 80M+ authors, 550M+ paper-authorship edges, and 2.4B+ citation edges. The graph includes advanced semantic features such as structurally parsed text, natural language summaries, and vector embeddings. In this paper, we describe the components of the S2 data processing pipeline and the associated APIs offered by the platform. We will update this living document to reflect changes as we add new data offerings and improve existing services.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17344644664439194234  DOCUMENT          #         en        1.00\n",
+      "1   metadata   4896217369367338170  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13887372868498991883  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   4492224273364446239  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  10473876935274060974      TEXT  #/texts/0         en        0.31\n",
+      "5   semantic  10473876935274060974      TEXT  #/texts/0       text        0.95\n",
+      "6   language   4896217369367338170      TEXT  #/texts/1         en        0.61\n",
+      "7   semantic   4896217369367338170      TEXT  #/texts/1     header        0.90\n",
+      "8   language  11464820173648324489      TEXT  #/texts/2         en        0.47\n",
+      "9   semantic  11464820173648324489      TEXT  #/texts/2  meta-data        0.85\n",
+      "10  language  14822167066882793830      TEXT  #/texts/3         en        0.69\n",
+      "11  semantic  14822167066882793830      TEXT  #/texts/3  meta-data        0.69\n",
       "2212.02896.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Table of contents (ToC) extraction ai...\n",
-      "1     title  #/texts/2  Multimodal Tree Decoder for Table of Contents ...\n",
-      "2    author  #/texts/2                                         Pengfei Hu\n",
-      "3    author  #/texts/2                                     Zhenrong Zhang\n",
-      "4    author  #/texts/2                                      Jianshu Zhang\n",
-      "5    author  #/texts/2                                             Jun Du\n",
-      "6    author  #/texts/2                                          Jiajia Wu\n",
-      "7    author  #/texts/4                                          P R China\n",
+      "title:  Multimodal Tree Decoder for Table of Contents Extraction in Document Images\n",
+      "abstract:  Abstract-Table of contents (ToC) extraction aims to extract headings of different levels in documents to better understand the outline of the contents, which can be widely used for document understanding and information retrieval. Existing works often use hand-crafted features and predefined rule-based functions to detect headings and resolve the hierarchical relationship between headings. Both the benchmark and research based on deep learning are still limited. Accordingly, in this paper, we first introduce a standard dataset, HierDoc, including image samples from 650 documents of scientific papers with their content labels. Then we propose a novel end-to-end model by using the multimodal tree decoder (MTD) for ToC as a benchmark for HierDoc. The MTD model is mainly composed of three parts, namely encoder, classifier, and decoder. The encoder fuses the multimodality features of vision, text, and layout information for each entity of the document. Then the classifier recognizes and selects the heading entities. Next, to parse the hierarchical relationship between the heading entities, a tree-structured decoder is designed. To evaluate the performance, both the metric of tree-edit-distance similarity (TEDS) and F1-Measure are adopted. Finally, our MTD approach achieves an average TEDS of 87.2% and an average F1-Measure of 88.1% on the test set of HierDoc. The code and dataset will be released at: https://github.com/Pengfei-Hu/MTD.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4520737201345173879  DOCUMENT          #         en        0.97\n",
+      "1   metadata   2294829083606373586  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   5961811913560082829  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   language  18342393307934275562      TEXT  #/texts/0         en        0.49\n",
+      "4   semantic  18342393307934275562      TEXT  #/texts/0  reference        0.89\n",
+      "5   language   2294829083606373586      TEXT  #/texts/1         en        0.70\n",
+      "6   semantic   2294829083606373586      TEXT  #/texts/1     header        0.49\n",
+      "7   language   6688015588212255585      TEXT  #/texts/2         en        0.36\n",
+      "8   semantic   6688015588212255585      TEXT  #/texts/2  meta-data        0.99\n",
+      "9   language   8322937833169004557      TEXT  #/texts/3         en        0.64\n",
+      "10  semantic   8322937833169004557      TEXT  #/texts/3  meta-data        0.95\n",
+      "11  language  16705250730596284103      TEXT  #/texts/4         en        0.80\n",
       "2302.05658.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. This paper introduces the DocILE ben...\n",
-      "1     title  #/texts/2  DocILE Benchmark for Document Information Loca...\n",
-      "2    author  #/texts/2                                         Yash Patel\n",
-      "3    author  #/texts/3                                        Ahmed Hamdi\n",
-      "4    author  #/texts/4                                   Mickael Coustaty\n",
-      "5    author  #/texts/4                               Dimosthenis Karatzas\n",
-      "6    author  #/texts/7                         Czech Technical University\n",
-      "7    author  #/texts/8                             Computer Vision Center\n",
-      "8    author  #/texts/8                                    Universitat Aut\n",
+      "title:  DocILE Benchmark for Document Information Localization and Extraction\n",
+      "abstract:  Abstract. This paper introduces the DocILE benchmark with the largest dataset of business documents for the tasks of Key Information Localization and Extraction and Line Item Recognition. It contains 6. 7k annotated business documents, 100k synthetically generated documents, and nearly 1M unlabeled documents for unsupervised pre-training. The dataset has been built with knowledge of domain-and task-specific aspects, resulting in the following key features: (i) annotations in 55 classes, which surpasses the granularity of previously published key information extraction datasets by a large margin; (ii) Line Item Recognition represents a highly practical information extraction task, where key information has to be assigned to items in a table; (iii) documents come from numerous layouts and the test set includes zero-and few-shot cases as well as layouts commonly seen in the training set. The benchmark comes with several baselines, including RoBERTa, LayoutLMv3 and DETRbased Table Transformer; applied to both tasks of the DocILE benchmark, with results shared in this paper, offering a quick starting point for future work. The dataset, baselines and supplementary material are available at https://github.com/rossumai/docile. Keywords: Document AI · Information Extraction · Line Item Recognition · Business Documents · Intelligent Document Processing\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language   7009734128893118738  DOCUMENT           #         en   \n",
+      "1   metadata  13493428666814362393  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   1681750115897753256  DOCUMENT   #/texts/9   abstract   \n",
+      "3   metadata   5646429187594938390  DOCUMENT  #/texts/10   abstract   \n",
+      "4   language   5503286060854998078      TEXT   #/texts/0         en   \n",
+      "5   semantic   5503286060854998078      TEXT   #/texts/0       text   \n",
+      "6   language  13493428666814362393      TEXT   #/texts/1         en   \n",
+      "7   semantic  13493428666814362393      TEXT   #/texts/1     header   \n",
+      "8   language    172976763706734726      TEXT   #/texts/2         en   \n",
+      "9   semantic    172976763706734726      TEXT   #/texts/2  meta-data   \n",
+      "10  language  12947117675211018377      TEXT   #/texts/3         en   \n",
+      "11  semantic  12947117675211018377      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.94  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.58  \n",
+      "5         0.83  \n",
+      "6         0.57  \n",
+      "7         0.89  \n",
+      "8         0.15  \n",
+      "9         0.92  \n",
+      "10        0.23  \n",
+      "11        0.73  \n",
       "2304.13240.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Accurately extracting structured dat...\n",
-      "1     title  #/texts/2  Structure Diagram Recognition in Financial Ann...\n",
-      "2    author  #/texts/2                                       Meixuan Qiao\n",
-      "3    author  #/texts/2                                           Jun Wang\n",
-      "4    author  #/texts/2                                        Junfu Xiang\n",
-      "5    author  #/texts/2                                           Qiyu Hou\n",
-      "6    author  #/texts/2                                         Ruixuan Li\n",
-      "7    author  #/texts/5                                         Wudao Tech\n",
+      "title:  Structure Diagram Recognition in Financial Announcements\n",
+      "abstract:  Abstract. Accurately extracting structured data from structure diagrams in financial announcements is of great practical importance for building financial knowledge graphs and further improving the efficiency of various financial applications. First, we proposed a new method for recognizing structure diagrams in financial announcements, which can better detect and extract different types of connecting lines, including straight lines, curves, and polylines of different orientations and angles. Second, we developed a semi-automated, two-stage method to efficiently generate the industry's first benchmark of structure diagrams from Chinese financial announcements, where a large number of diagrams were synthesized and annotated using an automated tool to train a preliminary recognition model with fairly good performance, and then a highquality benchmark can be obtained by automatically annotating the realworld structure diagrams using the preliminary model and then making few manual corrections. Finally, we experimentally verified the significant performance advantage of our structure diagram recognition method over previous methods. Keywords: Structure Diagram Recognition · Document AI · Financial Announcements\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15950461597083458226  DOCUMENT          #         en        1.00\n",
+      "1   metadata    403958489487654933  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  14621555780754220160  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  14253956465128531592  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language  10864193995400050123      TEXT  #/texts/0         en        0.53\n",
+      "5   semantic  10864193995400050123      TEXT  #/texts/0  reference        0.66\n",
+      "6   language    403958489487654933      TEXT  #/texts/1         en        0.69\n",
+      "7   semantic    403958489487654933      TEXT  #/texts/1     header        0.85\n",
+      "8   language   3492372939238010805      TEXT  #/texts/2         en        0.17\n",
+      "9   semantic   3492372939238010805      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language  16586331086933136302      TEXT  #/texts/3         en        0.81\n",
+      "11  semantic  16586331086933136302      TEXT  #/texts/3  meta-data        0.99\n",
       "2102.09395.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  The number of published PDF documents in both ...\n",
-      "1      title  #/texts/2  Robust PDF Document Conversion Using Recurrent...\n",
-      "2     author  #/texts/2                                Nikolaos Livathinos\n",
-      "3     author  #/texts/2                                     Cesar Berrospi\n",
-      "4     author  #/texts/2                                       Maksym Lysak\n",
-      "5     author  #/texts/2                                 Viktor Kuropiatnyk\n",
-      "6     author  #/texts/2                                       Ahmed Nassar\n",
-      "7     author  #/texts/2                                      Michele Dolfi\n",
-      "8     author  #/texts/2                                     Christoph Auer\n",
-      "9     author  #/texts/2                                      Kasper Dinkla\n",
-      "10    author  #/texts/2                                        Peter Staar\n",
+      "title:  Robust PDF Document Conversion Using Recurrent Neural Networks\n",
+      "abstract:  Abstract The number of published PDF documents in both the academic and commercial world has increased exponentially in recent decades. There is a growing need to make their rich content discoverable to information retrieval tools. Achieving high-quality semantic searches demands that a document's structural components such as title, section headers, paragraphs, (nested) lists, tables and figures (including their captions) are properly identified. Unfortunately, the PDF format is known to not conserve such structural information because it simply represents a document as a stream of low-level printing commands, in which one or more characters are placed in a bounding box with a particular styling. In this paper, we present a novel approach to document structure recovery in PDF using recurrent neural networks to process the low-level PDF data representation directly, instead of relying on a visual re-interpretation of the rendered PDF page, as has been proposed in previous literature. We demonstrate how a sequence of PDF printing commands can be used as input into a neural network and how the network can learn to classify each printing command according to its structural function in the page. This approach has three advantages: First, it can distinguish among more fine-grained labels (typically 10-20 labels as opposed to 1-5 with visual methods), which results in a more accurate and detailed document structure resolution. Second, it can take into account the text flow across pages more naturally compared to visual methods because it can concatenate the printing commands of sequential pages. Last, our proposed method needs less memory and it is computationally less expensive than visual methods. This allows us to deploy such models in production environments at a much lower cost. 
Through extensive architectural search in combination with advanced feature engineering, we were able to implement a model that yields a weighted average F$_{1}$ score of 97% across 17 distinct structural labels. The best model we achieved is currently served in production environments on our Corpus Conversion Service (CCS), which was presented at KDD18. This model enhances the capabilities of CCS significantly, as it eliminates the need for human annotated label ground-truth for every unseen document layout. This proved particularly useful when applied to a huge corpus of PDF articles related to COVID-19.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10561561617789153455  DOCUMENT          #         en        1.00\n",
+      "1   metadata   7135050673999108316  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13302319754357243881  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   7392025859030204500  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language  11051377048051759528      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic  11051377048051759528      TEXT  #/texts/0       text        0.97\n",
+      "6   language   7135050673999108316      TEXT  #/texts/1         en        0.37\n",
+      "7   semantic   7135050673999108316      TEXT  #/texts/1     header        0.94\n",
+      "8   language  15708761523827467165      TEXT  #/texts/2         en        0.28\n",
+      "9   semantic  15708761523827467165      TEXT  #/texts/2  meta-data        0.66\n",
+      "10  language  10144578681078525297      TEXT  #/texts/3         en        0.52\n",
+      "11  semantic  10144578681078525297      TEXT  #/texts/3  meta-data        1.00\n",
       "2104.12756.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Infographics are documents designed to effecti...\n",
-      "1     title  #/texts/2                                     InfographicVQA\n",
-      "2    author  #/texts/2                                      Minesh Mathew\n",
-      "3    author  #/texts/2                                        Viraj Bagal\n",
-      "4    author  #/texts/2                               Dimosthenis Karatzas\n",
-      "5    author  #/texts/2                                     Ernest Valveny\n",
-      "6    author  #/texts/2                                          V Jawahar\n",
-      "7    author  #/texts/2                                    Universitat Aut\n",
+      "title:  InfographicVQA\n",
+      "abstract:  Abstract Infographics are documents designed to effectively communicate information using a combination of textual, graphical and visual elements. In this work, we explore the automatic understanding of infographic images by using Visual Question Answering technique. To this end, we present InfographicVQA, a new dataset that comprises a diverse collection of infographics along with natural language questions and answers annotations. The collected questions require methods to jointly reason over the document layout, textual content, graphical elements, and data visualizations. We curate the dataset with emphasis on questions that require elementary reasoning and basic arithmetic skills. Finally, we evaluate two strong baselines based on state of the art multi-modal VQA models, and establish baseline performance for the new task. The dataset, code and leaderboard will be made available at docvqa.org\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7013058411310364900  DOCUMENT          #         en        1.00\n",
+      "1   metadata  16914697807876545820  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   1451867982113835867  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata   5889522086521579528  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   3037774237887954095      TEXT  #/texts/0         en        0.52\n",
+      "5   semantic   3037774237887954095      TEXT  #/texts/0       text        0.69\n",
+      "6   language  16914697807876545820      TEXT  #/texts/1         en        0.44\n",
+      "7   semantic  16914697807876545820      TEXT  #/texts/1     header        0.66\n",
+      "8   language  10601015517049298818      TEXT  #/texts/2         en        0.24\n",
+      "9   semantic  10601015517049298818      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   9677905795005395855      TEXT  #/texts/3         en        0.19\n",
+      "11  semantic   9677905795005395855      TEXT  #/texts/3  meta-data        0.93\n",
       "2212.02623.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  We propose Universal Document Processing (UDOP...\n",
-      "1      title  #/texts/2  Unifying Vision, Text, and Layout for Universa...\n",
-      "2     author  #/texts/2                                        Zineng Tang\n",
-      "3     author  #/texts/2                                          Ziyi Yang\n",
-      "4     author  #/texts/2                                        Guoxin Wang\n",
-      "5     author  #/texts/2                                         Yuwei Fang\n",
-      "6     author  #/texts/2                                           Yang Liu\n",
-      "7     author  #/texts/3                                      Chenguang Zhu\n",
-      "8     author  #/texts/3                                       Michael Zeng\n",
-      "9     author  #/texts/3                                          Cha Zhang\n",
-      "10    author  #/texts/3                                       Mohit Bansal\n",
-      "11    author  #/texts/4                                        Chapel Hill\n",
-      "12    author  #/texts/5        Microsoft Azure Cognitive Services Research\n",
+      "title:  Unifying Vision, Text, and Layout for Universal Document Processing\n",
+      "abstract:  Abstract We propose Universal Document Processing (UDOP), a foundation Document AI model which unifies text, image, and layout modalities together with varied task formats, including document understanding and generation. UDOP leverages the spatial correlation between textual content and document image to model image, text, and layout modalities with one uniform representation. With a novel Vision-Text-Layout Transformer, UDOP unifies pretraining and multi-domain downstream tasks into a prompt-based sequence generation scheme. UDOP is pretrained on both large-scale unlabeled document corpora using innovative self-supervised objectives and diverse labeled data. UDOP also learns to generate document images from text and layout modalities via masked image reconstruction. To the best of our knowledge, this is the first time in the field of document AI that one model simultaneously achieves highquality neural document editing and content customization. Our method sets the state-of-the-art on 9 Document AI tasks, e.g., document understanding and QA, across diverse data domains like finance reports, academic papers, and websites. UDOP ranks first on the leaderboard of the Document Understanding Benchmark (DUE). 1\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15492257190033828791  DOCUMENT          #         en        1.00\n",
+      "1   metadata  17694378913530663894  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   5130702399638276543  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata   1750046879787500780  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   6822356808946763591      TEXT  #/texts/0         en        0.55\n",
+      "5   semantic   6822356808946763591      TEXT  #/texts/0  reference        0.89\n",
+      "6   language  17694378913530663894      TEXT  #/texts/1         en        0.61\n",
+      "7   semantic  17694378913530663894      TEXT  #/texts/1  reference        0.77\n",
+      "8   language   7225642215376316190      TEXT  #/texts/2         en        0.46\n",
+      "9   semantic   7225642215376316190      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   3758912565052507091      TEXT  #/texts/3         en        0.35\n",
+      "11  semantic   3758912565052507091      TEXT  #/texts/3  meta-data        0.96\n",
       "2111.13809.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Document layout analysis (DLA) plays an import...\n",
-      "1      title  #/texts/2  DOCUMENT LAYOUT ANALYSIS WITH AESTHETIC-GUIDED...\n",
-      "2     author  #/texts/2                                        Tianlong Ma\n",
-      "3     author  #/texts/2                                        Xingjiao Wu\n",
-      "4     author  #/texts/2                                             Xin Li\n",
-      "5     author  #/texts/2                                      Xiangcheng Du\n",
-      "6     author  #/texts/2                                          Zhao Zhou\n",
-      "7     author  #/texts/2                                          Liang Xue\n",
-      "8     author  #/texts/2                                          Cheng Jin\n",
-      "9     author  #/texts/3                                   Fudan University\n",
-      "10    author  #/texts/3                       East China Normal University\n",
-      "11    author  #/texts/4                                 Zhongshan Hospital\n",
-      "12    author  #/texts/4                                   Fudan University\n",
+      "title:  DOCUMENT LAYOUT ANALYSIS WITH AESTHETIC-GUIDED IMAGE AUGMENTATION\n",
+      "abstract:  ABSTRACT Document layout analysis (DLA) plays an important role in information extraction and document understanding. At present, document layout analysis has reached a milestone achievement, however, document layout analysis of non-Manhattan is still a challenge. In this paper, we propose an image layer modeling method to tackle this challenge. To measure the proposed image layer modeling method, we propose a manually-labeled non-Manhattan layout fine-grained segmentation dataset named FPD. As far as we know, FPD is the first manually-labeled non-Manhattan layout fine-grained segmentation dataset. To effectively extract fine-grained features of documents, we propose an edge embedding network named L-E $^{3}$Net. Experimental results prove that our proposed image layer modeling method can better deal with the fine-grained segmented document of the non-Manhattan layout. Index Terms docuemnt layout analysis, data augmentation, deep learning, non-Manhattan layout\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   7685565125117959574  DOCUMENT          #         en        1.00\n",
+      "1   metadata  17663094003299423450  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11832164313368010743  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  14954915565413020412  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata   1448756563310952220  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   language   7446772614954564842      TEXT  #/texts/0         en        0.39\n",
+      "6   semantic   7446772614954564842      TEXT  #/texts/0  reference        0.91\n",
+      "7   language  17663094003299423450      TEXT  #/texts/1         en        0.42\n",
+      "8   semantic  17663094003299423450      TEXT  #/texts/1     header        0.95\n",
+      "9   language   2411124507804902582      TEXT  #/texts/2         en        0.27\n",
+      "10  semantic   2411124507804902582      TEXT  #/texts/2  meta-data        1.00\n",
+      "11  language  13847099027065667571      TEXT  #/texts/3         en        0.68\n",
       "2309.09742.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. The reliability of supervised machin...\n",
-      "1     title  #/texts/3  David Tschirschwitz r 0000 ' 0001 ' 5344 ' 417...\n",
-      "2    author  #/texts/3                                     Christian Benz\n",
-      "3    author  #/texts/4                                      Morris Florek\n",
-      "4    author  #/texts/5                                   Henrik Norderhus\n",
-      "5    author  #/texts/6                                        Benno Stein\n",
-      "6    author  #/texts/7                                   Volker Rodehorst\n",
+      "title:  David Tschirschwitz r 0000 ' 0001 ' 5344 ' 4172 $^{s}$,\n",
+      "abstract:  Abstract. The reliability of supervised machine learning systems depends on the accuracy and availability of ground truth labels. However, the process of human annotation, being prone to error, introduces the potential for noisy labels, which can impede the practicality of these systems. While training with noisy labels is a significant consideration, the reliability of test data is also crucial to ascertain the dependability of the results. A common approach to addressing this issue is repeated labeling, where multiple annotators label the same example, and their labels are combined to provide a better estimate of the true label. In this paper, we propose a novel localization algorithm that adapts wellestablished ground truth estimation methods for object detection and instance segmentation tasks. The key innovation of our method lies in its ability to transform combined localization and classification tasks into classification-only problems, thus enabling the application of techniques such as Expectation-Maximization (EM) or Majority Voting (MJV). Although our main focus is the aggregation of unique ground truth for test data, our algorithm also shows superior performance during training on the TexBiG dataset, surpassing both noisy label training and label aggregation using Weighted Boxes Fusion (WBF). Our experiments indicate that the benefits of repeated labels emerge under specific dataset and annotation configurations. The key factors appear to be (1) dataset complexity, the (2) annotator consistency, and (3) the given annotation budget constraints. Keywords: Object Detection · Instance Segmentation · Robust Learning.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language    992249970351541192  DOCUMENT           #         en   \n",
+      "1   metadata   9276283848181217609  DOCUMENT   #/texts/2      title   \n",
+      "2   metadata  16381755428115428238  DOCUMENT   #/texts/9   abstract   \n",
+      "3   metadata   1019027222051697096  DOCUMENT  #/texts/10   abstract   \n",
+      "4   language  16338564269424971201      TEXT   #/texts/0         en   \n",
+      "5   semantic  16338564269424971201      TEXT   #/texts/0  reference   \n",
+      "6   language  10996720205158029901      TEXT   #/texts/1         en   \n",
+      "7   semantic  10996720205158029901      TEXT   #/texts/1     header   \n",
+      "8   language   9276283848181217609      TEXT   #/texts/2         de   \n",
+      "9   semantic   9276283848181217609      TEXT   #/texts/2  reference   \n",
+      "10  language   3259182652263725180      TEXT   #/texts/3         de   \n",
+      "11  semantic   3259182652263725180      TEXT   #/texts/3  meta-data   \n",
+      "\n",
+      "    confidence  \n",
+      "0         0.96  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         0.30  \n",
+      "5         0.95  \n",
+      "6         0.74  \n",
+      "7         0.96  \n",
+      "8         0.34  \n",
+      "9         0.58  \n",
+      "10        0.15  \n",
+      "11        0.46  \n",
       "2208.10970.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Recognizing the layout of unstructured digital...\n",
-      "1     title  #/texts/2  Doc-GCN: Heterogeneous Graph Convolutional Net...\n",
-      "2    author  #/texts/2                                          Siwen Luo\n",
-      "3    author  #/texts/2                                         Yihao Ding\n",
-      "4    author  #/texts/2                                          Siqu Long\n",
-      "5    author  #/texts/2                                   Soyeon Caren Han\n",
-      "6    author  #/texts/2                                        Josiah Poon\n",
+      "title:  Doc-GCN: Heterogeneous Graph Convolutional Networks for Document Layout Analysis\n",
+      "abstract:  Abstract Recognizing the layout of unstructured digital documents is crucial when parsing the documents into the structured, machine-readable format for downstream applications. Recent studies in Document Layout Analysis usually rely on computer vision models to understand documents while ignoring other information, such as context information or relation of document components, which are vital to capture. Our Doc-GCN presents an effective way to harmonize and integrate heterogeneous aspects for Document Layout Analysis. We first construct graphs to explicitly describe four main aspects, including syntactic, semantic, density, and appearance/visual information. Then, we apply graph convolutional networks for representing each aspect of information and use pooling to integrate them. Finally, we aggregate each aspect and feed them into 2-layer MLPs for document layout component classification. Our Doc-GCN achieves new state-of-the-art results in three widely used DLA datasets.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   2720927874437785770  DOCUMENT          #         en        1.00\n",
+      "1   metadata  17870169857378124289  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   9077524686609812103  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  14890249552540682664  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   8059319291858021328      TEXT  #/texts/0         en        0.51\n",
+      "5   semantic   8059319291858021328      TEXT  #/texts/0  reference        0.86\n",
+      "6   language  17870169857378124289      TEXT  #/texts/1         en        0.57\n",
+      "7   semantic  17870169857378124289      TEXT  #/texts/1     header        0.83\n",
+      "8   language   6152090793369172651      TEXT  #/texts/2         en        0.34\n",
+      "9   semantic   6152090793369172651      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  17053512559835824917      TEXT  #/texts/3         en        0.65\n",
+      "11  semantic  17053512559835824917      TEXT  #/texts/3  meta-data        0.82\n",
       "2302.01451.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Relevant information in documents is often sum...\n",
-      "1     title  #/texts/2  CTE: A Dataset for Contextualized Table Extrac...\n",
-      "2    author  #/texts/2                                     Andrea Gemelli\n",
-      "3    author  #/texts/2                                    Emanuele Vivoli\n",
-      "4    author  #/texts/2                                     Simone Marinai\n",
+      "title:  CTE: A Dataset for Contextualized Table Extraction\n",
+      "abstract:  Abstract Relevant information in documents is often summarized in tables, helping the reader to identify useful facts. Most benchmark datasets support either document layout analysis or table understanding, but lack in providing data to apply both tasks in a unified way. We define the task of Contextualized Table Extraction (CTE), which aims to extract and define the structure of tables considering the textual context of the document. The dataset comprises 75k fully annotated pages of scientific papers, including more than 35k tables. Data are gathered from PubMed Central, merging the information provided by annotations in the PubTables-1M and PubLayNet datasets. The dataset can support CTE and adds new classes to the original ones. The generated annotations can be used to develop end-to-end pipelines for various tasks, including document layout analysis, table detection, structure recognition, and functional analysis. We formally define CTE and evaluation metrics, showing which subtasks can be tackled, describing advantages, limitations, and future works of this collection of data. Annotations and code will be accessible at https://github.com/AILab-UniFI/cte-dataset. Keywords Dataset, Table Extraction, Scientific Paper Analysis, Document Layout Analysis, Benchmark\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10999014387843235765  DOCUMENT          #         en        0.98\n",
+      "1   metadata   6774018724173639210  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2501798605750979630  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  12534062044414101616  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata   2501901927569946792  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   metadata  13016416439003278616  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "6   language   2625281664540391426      TEXT  #/texts/0         en        0.31\n",
+      "7   semantic   2625281664540391426      TEXT  #/texts/0       text        0.99\n",
+      "8   language   6774018724173639210      TEXT  #/texts/1         en        0.75\n",
+      "9   semantic   6774018724173639210      TEXT  #/texts/1       text        0.57\n",
+      "10  language  14763756518753826937      TEXT  #/texts/2         it        0.26\n",
+      "11  semantic  14763756518753826937      TEXT  #/texts/2  meta-data        1.00\n",
       "2301.10781.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  The lack of generalizability-in which a model ...\n",
-      "1     title  #/texts/2  Generalizability in Document Layout Analysis f...\n",
-      "2    author  #/texts/2                                        Jill Naiman\n",
+      "title:  Generalizability in Document Layout Analysis for Scientific Article Figure & Caption Extraction\n",
+      "abstract:  Abstract The lack of generalizability-in which a model trained on one dataset cannot provide accurate results for a different dataset-is a known problem in the field of document layout analysis. Thus, when a model is used to locate important page objects in scientific literature such as figures, tables, captions, and math formulas, the model often cannot be applied successfully to new domains. While several solutions have been proposed, including newer and updated deep learning models, larger handannotated datasets, and the generation of large synthetic datasets, so far there is no 'magic bullet' for translating a model trained on a particular domain or historical time period to a new field. Here we present our ongoing work in translating our document layout analysis model from the historical astrophysical literature to the larger corpus of scientific documents within the HathiTrust U.S. Federal Documents collection. We use this example as an avenue to highlight some of the problems with generalizability in the document layout analysis community and discuss several challenges and possible solutions to address these issues. All code for this work is available on The Reading Time Machine GitHub repository, https://github.com/ReadingTimeMachine/htrc short conf. Keywords: scholarly document processing, document layout analysis, astronomy.\n",
+      "        type             subj_hash subj_name   subj_path      label  \\\n",
+      "0   language  15211011720834601151  DOCUMENT           #         en   \n",
+      "1   metadata   5738259683770902497  DOCUMENT   #/texts/1      title   \n",
+      "2   metadata   7687256829341479909  DOCUMENT   #/texts/8   abstract   \n",
+      "3   metadata  18316644352025785821  DOCUMENT   #/texts/9   abstract   \n",
+      "4   metadata   3325627418640212133  DOCUMENT  #/texts/10   abstract   \n",
+      "5   language   7989988473973578225      TEXT   #/texts/0         en   \n",
+      "6   semantic   7989988473973578225      TEXT   #/texts/0       text   \n",
+      "7   language   5738259683770902497      TEXT   #/texts/1         en   \n",
+      "8   semantic   5738259683770902497      TEXT   #/texts/1     header   \n",
+      "9   language  16699890999419839987      TEXT   #/texts/2         en   \n",
+      "10  semantic  16699890999419839987      TEXT   #/texts/2  meta-data   \n",
+      "11  language  13542574747711984673      TEXT   #/texts/3         en   \n",
+      "\n",
+      "    confidence  \n",
+      "0         1.00  \n",
+      "1         1.00  \n",
+      "2         1.00  \n",
+      "3         1.00  \n",
+      "4         1.00  \n",
+      "5         0.44  \n",
+      "6         0.95  \n",
+      "7         0.60  \n",
+      "8         0.82  \n",
+      "9         0.52  \n",
+      "10        0.99  \n",
+      "11        0.68  \n",
       "2102.05533.pdf\n",
-      "Empty DataFrame\n",
-      "Columns: [subtype, subj_path, name]\n",
-      "Index: []\n",
+      "title:  A view of computational models for image segmentation\n",
+      "abstract:  ['Image segmentation is a central topic in image processing and computer vision and a key issue in many applications, e.g., in medical imaging, microscopy, document analysis and remote sensing. According to the human perception, image segmentation is the process of dividing an image into non-overlapping regions. These regions, which may correspond, e.g., to different objects, are fundamental for the correct interpretation and classification of the scene represented by the image. The division into regions is not unique, but it depends on the application, i.e., it must be driven by the final goal of the segmentation and hence by the most significant features with respect to that goal. Thus, image segmentation can be regarded as a strongly ill-posed problem. A classical approach to deal with ill posedness consists in incorporating in the model a-priori information about the solution, e.g., in the form of penalty terms. In this work we provide a brief overview of basic computational models for image segmentation, focusing on edge-based and region-based variational models, as well as on statistical and machine-learning approaches. We also sketch numerical methods that are applied in computing solutions to these models. In our opinion, our view can help the readers identify suitable classes of methods for solving their specific problems.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8334122060317671079  DOCUMENT          #         en        0.92\n",
+      "1   language  10459924432452545932      TEXT  #/texts/0         en        0.82\n",
+      "2   semantic  10459924432452545932      TEXT  #/texts/0     header        0.62\n",
+      "3   language  13729361378862501881      TEXT  #/texts/1         en        0.84\n",
+      "4   semantic  13729361378862501881      TEXT  #/texts/1     header        0.93\n",
+      "5   language   1216235182387265925      TEXT  #/texts/2         en        0.19\n",
+      "6   semantic   1216235182387265925      TEXT  #/texts/2  meta-data        0.84\n",
+      "7   language   8013272106503804603      TEXT  #/texts/3         en        0.76\n",
+      "8   semantic   8013272106503804603      TEXT  #/texts/3  meta-data        0.67\n",
+      "9   language   3453923813147997148      TEXT  #/texts/4         en        0.35\n",
+      "10  semantic   3453923813147997148      TEXT  #/texts/4       text        0.99\n",
+      "11  language  11771141895094630316      TEXT  #/texts/5         en        0.58\n",
       "2108.11591.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Reading order detection is the cornerstone to ...\n",
-      "1     title  #/texts/2  LayoutReader: Pre-training of Text and Layout ...\n",
-      "2    author  #/texts/2                                        Zilong Wang\n",
-      "3    author  #/texts/2                                          Yiheng Xu\n",
-      "4    author  #/texts/2                                            Lei Cui\n",
-      "5    author  #/texts/2                                       Jingbo Shang\n",
-      "6    author  #/texts/2                                           Furu Wei\n",
-      "7    author  #/texts/3                                          San Diego\n",
-      "8    author  #/texts/4                            Microsoft Research Asia\n",
+      "title:  LayoutReader: Pre-training of Text and Layout for Reading Order Detection\n",
+      "abstract:  Abstract Reading order detection is the cornerstone to understanding visually-rich documents (e.g., receipts and forms). Unfortunately, no existing work took advantage of advanced deep learning models because it is too laborious to annotate a large enough dataset. We observe that the reading order of WORD documents is embedded in their XML metadata; meanwhile, it is easy to convert WORD documents to PDFs or images. Therefore, in an automated manner, we construct ReadingBank, a benchmark dataset that contains reading order, text, and layout information for 500,000 document images covering a wide spectrum of document types. This first-ever large-scale dataset unleashes the power of deep neural networks for reading order detection. Specifically, our proposed LayoutReader captures the text and layout information for reading order prediction using the seq2seq model. It performs almost perfectly in reading order detection and significantly improves both open-source and commercial OCR engines in ordering text lines in their results in our experiments. We will release the dataset and model at https:// aka.ms/layoutreader.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   9757730494358412914  DOCUMENT          #         en        0.99\n",
+      "1   metadata  13558474190134142955  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13643495312176067581  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata   5705766784987579239  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   4713674989249568416      TEXT  #/texts/0         en        0.44\n",
+      "5   semantic   4713674989249568416      TEXT  #/texts/0       text        0.61\n",
+      "6   language  13558474190134142955      TEXT  #/texts/1         en        0.76\n",
+      "7   semantic  13558474190134142955      TEXT  #/texts/1     header        0.90\n",
+      "8   language  17626226113250559526      TEXT  #/texts/2         en        0.27\n",
+      "9   semantic  17626226113250559526      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  13633992836670036397      TEXT  #/texts/3         en        0.61\n",
+      "11  semantic  13633992836670036397      TEXT  #/texts/3  meta-data        0.99\n",
       "2104.08836.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Multimodal pre-training with text, layout, and...\n",
-      "1      title  #/texts/2  LayoutXLM: Multimodal Pre-training for Multili...\n",
-      "2     author  #/texts/2                                          Yiheng Xu\n",
-      "3     author  #/texts/2                                            Lei Cui\n",
-      "4     author  #/texts/2                                        Guoxin Wang\n",
-      "5     author  #/texts/2                                          Yijuan Lu\n",
-      "6     author  #/texts/2                                    Dinei Florencio\n",
-      "7     author  #/texts/2                                          Cha Zhang\n",
-      "8     author  #/texts/2                                           Furu Wei\n",
-      "9     author  #/texts/3                            Microsoft Research Asia\n",
-      "10    author  #/texts/4                                    Microsoft Azure\n",
+      "title:  LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding ∗\n",
+      "abstract:  Abstract Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually-rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. In this paper, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually-rich document understanding. To accurately evaluate LayoutXLM, we also introduce a multilingual form understanding benchmark dataset named XFUN, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese), and key-value pairs are manually labeled for each language. Experiment results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUN dataset. The pre-trained LayoutXLM model and the XFUN dataset will be publicly available at https: //aka.ms/layoutxlm.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4883914572144313992  DOCUMENT          #         en        1.00\n",
+      "1   metadata  14541526077917844680  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   2439660219367115416  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "3   metadata   9547735864067414344  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "4   language   1421188079197686270      TEXT  #/texts/0         en        0.49\n",
+      "5   semantic   1421188079197686270      TEXT  #/texts/0       text        0.96\n",
+      "6   language  14541526077917844680      TEXT  #/texts/1         en        0.63\n",
+      "7   semantic  14541526077917844680      TEXT  #/texts/1     header        0.69\n",
+      "8   language   7935954898418490613      TEXT  #/texts/2         en        0.27\n",
+      "9   semantic   7935954898418490613      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  10743865134721424209      TEXT  #/texts/3         en        0.27\n",
+      "11  semantic  10743865134721424209      TEXT  #/texts/3  meta-data        1.00\n",
       "2305.15393.pdf\n",
-      "     subtype  subj_path                                               name\n",
-      "0   abstract          #  Attaining a high degree of user controllabilit...\n",
-      "1      title  #/texts/2  LayoutGPT: Compositional Visual Planning and G...\n",
-      "2     author  #/texts/2                                         Weixi Feng\n",
-      "3     author  #/texts/2                                        Wanrong Zhu\n",
-      "4     author  #/texts/2                                      Varun Jampani\n",
-      "5     author  #/texts/2                                        Arjun Akula\n",
-      "6     author  #/texts/2                                        Sugato Basu\n",
-      "7     author  #/texts/2                                      Xin Eric Wang\n",
-      "8     author  #/texts/2                                  William Yang Wang\n",
-      "9     author  #/texts/2                                      Santa Barbara\n",
-      "10    author  #/texts/2                                         Santa Cruz\n",
+      "title:  LayoutGPT: Compositional Visual Planning and Generation with Large Language Models\n",
+      "abstract:  Abstract Attaining a high degree of user controllability in visual generation often requires intricate, fine-grained inputs like layouts. However, such inputs impose a substantial burden on users when compared to simple text inputs. To address the issue, we study how Large Language Models (LLMs) can serve as visual planners by generating layouts from text conditions, and thus collaborate with visual generative models. We propose LayoutGPT, a method to compose in-context visual demonstrations in style sheet language to enhance the visual planning skills of LLMs. LayoutGPT can generate plausible layouts in multiple domains, ranging from 2D images to 3D indoor scenes. LayoutGPT also shows superior performance in converting challenging language concepts like numerical and spatial relations to layout arrangements for faithful text-to-image generation. When combined with a downstream image generation model, LayoutGPT outperforms text-to-image models/systems by 20-40% and achieves comparable performance as human users in designing visual layouts for numerical and spatial correctness. Lastly, Layout-GPT achieves comparable performance to supervised methods in 3D indoor scene synthesis, demonstrating its effectiveness and potential in multiple visual domains. 1 Introduction Can Large Language Models (LLMs) comprehend visual concepts and generate plausible arrangments in visual spaces? Recently, LLMs have shown significant advancement in various reasoning skills [50, 49] that remain challenging to existing visual generative models. For instance, text-to-image generation (T2I) models suffer from generating objects with specified counts, positions, and attributes [10]. 3D scene synthesis models face challenges in preserving furniture within pre-defined room sizes [30]. 
Addressing these issues necessitates the development of compositional skills that effectively arrange components in a coherent manner, accurately reflecting object specifications and interactions.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   6911408252689886783  DOCUMENT          #         en        1.00\n",
+      "1   metadata  16832561012421574481  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   3435677435934837381  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata    459968366665532336  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata  17723871280452781935  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   metadata  12745803599622234410  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "6   language  10264649267537693906      TEXT  #/texts/0         en        0.64\n",
+      "7   semantic  10264649267537693906      TEXT  #/texts/0  reference        0.66\n",
+      "8   language  16832561012421574481      TEXT  #/texts/1         en        0.62\n",
+      "9   semantic  16832561012421574481      TEXT  #/texts/1     header        0.86\n",
+      "10  language   5934779742741270059      TEXT  #/texts/2         en        0.53\n",
+      "11  semantic   5934779742741270059      TEXT  #/texts/2  meta-data        0.99\n",
       "2202.12985.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Pretraining has proven successful in...\n",
-      "1     title  #/texts/2  OCR-IDL: OCR Annotations for Industry Document...\n",
-      "2    author  #/texts/2                                   Ali Furkan Biten\n",
-      "3    author  #/texts/2                                        Lluis Gomez\n",
-      "4    author  #/texts/2                                     Ernest Valveny\n",
-      "5    author  #/texts/2                               Dimosthenis Karatzas\n",
-      "6    author  #/texts/3                             Computer Vision Center\n",
+      "title:  OCR-IDL: OCR Annotations for Industry Document Library Dataset\n",
+      "abstract:  Abstract. Pretraining has proven successful in Document Intelligence tasks where deluge of documents are used to pretrain the models only later to be finetuned on downstream tasks. One of the problems of the pretraining approaches is the inconsistent usage of pretraining data with different OCR engines leading to incomparable results between models. In other words, it is not obvious whether the performance gain is coming from diverse usage of amount of data and distinct OCR engines or from the proposed models. To remedy the problem, we make public the OCR annotations for IDL documents using commercial OCR engine given their superior performance over open source OCR models. The contributed dataset (OCR-IDL) has an estimated monetary value over 20K US $. It is our hope that OCR-IDL can be a starting point for future works on Document Intelligence. All of our data and its collection process with the annotations can be found in https://github.com/furkanbiten/idl_data.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10847684039601849357  DOCUMENT          #         en        0.99\n",
+      "1   metadata   7551407551839940680  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   6471885383062386798  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   language   7082209914402450907      TEXT  #/texts/0         en        0.38\n",
+      "4   semantic   7082209914402450907      TEXT  #/texts/0       text        0.90\n",
+      "5   language   7551407551839940680      TEXT  #/texts/1         en        0.34\n",
+      "6   semantic   7551407551839940680      TEXT  #/texts/1     header        0.88\n",
+      "7   language   5513468041440136666      TEXT  #/texts/2         es        0.15\n",
+      "8   semantic   5513468041440136666      TEXT  #/texts/2  meta-data        1.00\n",
+      "9   language  11627341239655523964      TEXT  #/texts/3         en        0.70\n",
+      "10  semantic  11627341239655523964      TEXT  #/texts/3  meta-data        0.98\n",
+      "11  language  11261052408275609312      TEXT  #/texts/4         uk        0.17\n",
       "2303.14884.pdf\n",
-      "  subtype  subj_path                                               name\n",
-      "0   title  #/texts/1  A large-scale dataset for end-to-end table rec...\n",
-      "1  author  #/texts/1                                           Fan Yang\n",
-      "2  author  #/texts/1                                             Lei Hu\n",
-      "3  author  #/texts/1                                          Xinwu Liu\n",
-      "4  author  #/texts/1                                   Shuangping Huang\n",
-      "5  author  #/texts/1                                        Zhenghui Gu\n",
-      "6  author  #/texts/3                                  Times Electric Co\n",
-      "7  author  #/texts/7                                   Shuangping Huang\n",
+      "title:  A large-scale dataset for end-to-end table recognition in the wild\n",
+      "abstract:  ['Table recognition (TR) is one of the research hotspots in pattern recognition, which aims to extract information from tables in an image. Common table recognition tasks include table detection (TD), table structure recognition (TSR) and table content recognition (TCR). TD is to locate tables in the image, TCR recognizes text content, and TSR recognizes spatial ogical structure. Currently, the end-to-end TR in real scenarios, accomplishing the three sub-tasks simultaneously, is yet an unexplored research area. One major factor that inhibits researchers is the lack of a benchmark dataset. To this end, we propose a new large-scale dataset named Table Recognition Set (TabRecSet) with diverse table forms sourcing from multiple scenarios in the wild, providing complete annotation dedicated to end-to-end TR research. It is the largest and first bi-lingual dataset for end-to-end TR, with 38.1K tables in which 20.4K are in English\\\\, and 17.7K are in Chinese. The samples have diverse forms, such as the border-complete and -incomplete table, regular and irregular table (rotated, distorted, etc.). The scenarios are multiple in the wild, varying from scanned to camera-taken images, documents to Excel tables, educational test papers to financial invoices. The annotations are complete, consisting of the table body spatial annotation, cell spatial logical annotation and text content for TD, TSR and TCR, respectively. The spatial annotation utilizes the polygon instead of the bounding box or quadrilateral adopted by most datasets. The polygon spatial annotation is more suitable for irregular tables that are common in wild scenarios. Additionally, we propose a visualized and interactive annotation tool named TableMe to improve the efficiency and quality of table annotation.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   8114940727583702668  DOCUMENT          #         en        1.00\n",
+      "1   metadata  11716041957983712877  DOCUMENT  #/texts/0      title        1.00\n",
+      "2   language  11716041957983712877      TEXT  #/texts/0         en        0.78\n",
+      "3   semantic  11716041957983712877      TEXT  #/texts/0       text        0.68\n",
+      "4   language   6084627799462099121      TEXT  #/texts/1         en        0.44\n",
+      "5   semantic   6084627799462099121      TEXT  #/texts/1  meta-data        0.99\n",
+      "6   language  17393083399206941044      TEXT  #/texts/2         en        0.77\n",
+      "7   semantic  17393083399206941044      TEXT  #/texts/2  meta-data        0.98\n",
+      "8   language   4093452139209713676      TEXT  #/texts/3         en        0.41\n",
+      "9   semantic   4093452139209713676      TEXT  #/texts/3  meta-data        0.93\n",
+      "10  language   7996752084309247071      TEXT  #/texts/4         en        0.26\n",
+      "11  semantic   7996752084309247071      TEXT  #/texts/4  meta-data        1.00\n",
       "2101.06573.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Current Artificial Intelligence (AI) methods, ...\n",
-      "1     title  #/texts/2           Understanding in Artificial Intelligence\n",
-      "2    author  #/texts/2                                   Stefan Maetschke\n",
-      "3    author  #/texts/2                              David Martinez Iraola\n",
-      "4    author  #/texts/2                               Elaheh ShafieiBavani\n",
-      "5    author  #/texts/2                                        Peter Zhong\n",
-      "6    author  #/texts/2                                            Ying Xu\n",
-      "7    author  #/texts/3                                 Research Australia\n",
+      "title:  Understanding in Artificial Intelligence\n",
+      "abstract:  Abstract Current Artificial Intelligence (AI) methods, most based on deep learning, have facilitated progress in several fields, including computer vision and natural language understanding. The progress of these AI methods is measured using benchmarks designed to solve challenging tasks, such as visual question answering. A question remains of how much understanding is leveraged by these methods and how appropriate are the current benchmarks to measure understanding capabilities. To answer these questions, we have analysed existing benchmarks and their understanding capabilities, defined by a set of understanding capabilities, and current research streams. We show how progress has been made in benchmark development to measure understanding capabilities of AI methods and we review as well how current methods develop understanding capabilities. Keywords: Artificial intelligence, Deep-learning, neuro-symbolic, reasoning, understanding, computer vision, natural language processing\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    679268346913823170  DOCUMENT          #         en        1.00\n",
+      "1   metadata  16418080794343938376  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  11352816888786938849  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata  15222897743183409902  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   metadata  15402767724637827274  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "5   language  16408750086547568363      TEXT  #/texts/0         en        0.32\n",
+      "6   semantic  16408750086547568363      TEXT  #/texts/0       text        0.95\n",
+      "7   language  16418080794343938376      TEXT  #/texts/1         en        0.75\n",
+      "8   semantic  16418080794343938376      TEXT  #/texts/1     header        0.56\n",
+      "9   language  13644035075325621441      TEXT  #/texts/2         en        0.23\n",
+      "10  semantic  13644035075325621441      TEXT  #/texts/2  meta-data        0.99\n",
+      "11  language    611290635858811423      TEXT  #/texts/3         en        0.58\n",
       "2304.06447.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Document-based Visual Question Answe...\n",
-      "1     title  #/texts/2  PDF-VQA: A New Dataset for Real-World VQA on P...\n",
-      "2    author  #/texts/2                                         Yihao Ding\n",
-      "3    author  #/texts/2                                          Siwen Luo\n",
-      "4    author  #/texts/2                                      Hyunsuk Chung\n",
-      "5    author  #/texts/2                                   Soyeon Caren Han\n",
+      "title:  PDF-VQA: A New Dataset for Real-World VQA on PDF Documents\n",
+      "abstract:  Abstract. Document-based Visual Question Answering examines the document understanding of document images in conditions of natural language questions. We proposed a new document-based VQA dataset, PDF-VQA, to comprehensively examine the document understanding from various aspects, including document element recognition, document layout structural understanding as well as contextual understanding and key information extraction. Our PDF-VQA dataset extends the current scale of document understanding that limits on the single document page to the new scale that asks questions over the full document of multiple pages. We also propose a new graph-based VQA model that explicitly integrates the spatial and hierarchically structural relationships between different document elements to boost the document structural understanding. The performances are compared with several baselines over different question types and tasks $^{4}$. Keywords: Document Understanding · Document Information Extraction · Visual Question Answering\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17739815372653637004  DOCUMENT          #         en        1.00\n",
+      "1   metadata  11359654040391113622  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  16398652216189860809  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  11512328541013711730  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language  10248123059142269704      TEXT  #/texts/0         en        0.34\n",
+      "5   semantic  10248123059142269704      TEXT  #/texts/0  reference        0.95\n",
+      "6   language  11359654040391113622      TEXT  #/texts/1         en        0.15\n",
+      "7   semantic  11359654040391113622      TEXT  #/texts/1     header        0.99\n",
+      "8   language   9242125684836779999      TEXT  #/texts/2         en        0.40\n",
+      "9   semantic   9242125684836779999      TEXT  #/texts/2  meta-data        0.99\n",
+      "10  language   4101268396283963173      TEXT  #/texts/3         en        0.74\n",
+      "11  semantic   4101268396283963173      TEXT  #/texts/3  meta-data        0.36\n",
       "2101.12741.pdf\n",
-      "Empty DataFrame\n",
-      "Columns: [subtype, subj_path, name]\n",
-      "Index: []\n",
+      "title:  Post-OCR Paragraph Recognition by Graph Convolutional Networks\n",
+      "abstract:  ['We propose a new approach for paragraph recognition in document images by spatial graph convolutional networks (GCN) applied on OCR text boxes. Two steps, namely line splitting and line clustering, are performed to extract paragraphs from the lines in OCR results. Each step uses a beta-skeleton graph constructed from bounding boxes, where the graph edges provide efficient support for graph convolution operations. With only pure layout input features, the GCN model size is 3~4 orders of magnitude smaller compared to R-CNN based models, while achieving comparable or better accuracies on PubLayNet and other datasets. Furthermore, the GCN models show good generalization from synthetic training data to real-world images, and good adaptivity for variable document styles.']\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10969933011407272340  DOCUMENT          #         en        1.00\n",
+      "1   language  11717495397265653643      TEXT  #/texts/0         en        0.48\n",
+      "2   semantic  11717495397265653643      TEXT  #/texts/0       text        0.90\n",
+      "3   language   1895072058650522271      TEXT  #/texts/1         en        0.94\n",
+      "4   semantic   1895072058650522271      TEXT  #/texts/1       text        0.99\n",
+      "5   language   6548777523668082231      TEXT  #/texts/2         en        0.80\n",
+      "6   semantic   6548777523668082231      TEXT  #/texts/2     header        0.84\n",
+      "7   language  13639445319858036725      TEXT  #/texts/3         en        0.61\n",
+      "8   semantic  13639445319858036725      TEXT  #/texts/3  meta-data        0.88\n",
+      "9   language  10013140086229577108      TEXT  #/texts/4         en        0.88\n",
+      "10  semantic  10013140086229577108      TEXT  #/texts/4       text        0.99\n",
+      "11  language   2420190195673594550      TEXT  #/texts/5         en        0.80\n",
       "2202.08125.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Background. In recent years, libraries and arc...\n",
-      "1     title  #/texts/2  Processing the structure of documents: Logical...\n",
-      "2    author  #/texts/2                                    Nicolas Gutehrl\n",
-      "3    author  #/texts/3                     Recherches Interdisciplinaires\n",
-      "4    author  #/texts/4                             Iana Atanassova Centre\n",
-      "5    author  #/texts/4                     Recherches Interdisciplinaires\n",
-      "6    author  #/texts/4                             Institut Universitaire\n",
+      "title:  Processing the structure of documents: Logical Layout Analysis of historical newspapers in French\n",
+      "abstract:  Abstract Background. In recent years, libraries and archives led important digitisation campaigns that opened the access to vast collections of historical documents. While such documents are often available as XML ALTO documents, they lack information about their logical structure. In this paper, we address the problem of Logical Layout Analysis applied to historical documents in French. We propose a rule-based method, that we evaluate and compare with two Machine-Learning models, namely RIPPER and Gradient Boosting. Our data set contains French newspapers, periodicals and magazines, published in the first half of the twentieth century in the Franche-Comté Region. Results. Our rule-based system outperforms the two other models in nearly all evaluations. It has especially better Recall results, indicating that our system covers more types of every logical label than the other two models. When comparing RIPPER with Gradient Boosting, we can observe that Gradient Boosting has better Precision scores but RIPPER has better Recall scores. Conclusions. The evaluation shows that our system outperforms the two Machine Learning models, and provides significantly higher Recall. It also confirms that our system can be used to produce annotated data sets that are large enough to envisage Machine Learning or Deep Learning approaches for the task of Logical Layout Analysis. Combining rules and Machine Learning models into hybrid systems could potentially provide even better performances. Furthermore, as the layout in historical documents evolves rapidly, one possible solution to overcome this problem would be to apply Rule Learning algorithms to bootstrap rule sets adapted to different publication periods.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  10694267316762447349  DOCUMENT          #         en        0.98\n",
+      "1   metadata  17453111564062200966  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  10719077243545767902  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata  11121295389236019161  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata  10458261237088064204  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   metadata  15014609299195796349  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "6   language   3140571036692500309      TEXT  #/texts/0         en        0.45\n",
+      "7   semantic   3140571036692500309      TEXT  #/texts/0       text        0.99\n",
+      "8   language  17453111564062200966      TEXT  #/texts/1         en        0.87\n",
+      "9   semantic  17453111564062200966      TEXT  #/texts/1       text        0.55\n",
+      "10  language  10183605818128324151      TEXT  #/texts/2         de        0.53\n",
+      "11  semantic  10183605818128324151      TEXT  #/texts/2  meta-data        0.98\n",
       "2308.02051.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Document layout analysis (DLA) is th...\n",
-      "1     title  #/texts/2   A Graphical Approach to Document Layout Analysis\n",
-      "2    author  #/texts/2                                         Jilin Wang\n",
-      "3    author  #/texts/2                                   Michael Krumdick\n",
-      "4    author  #/texts/2                                        Baojia Tong\n",
-      "5    author  #/texts/2                                       Hamima Halim\n",
-      "6    author  #/texts/3                                        Vadym Barda\n",
-      "7    author  #/texts/3                                  Delphine Vendryes\n",
-      "8    author  #/texts/4                                Kensho Technologies\n",
-      "9    author  #/texts/8                                        Los Angeles\n",
+      "title:  A Graphical Approach to Document Layout Analysis\n",
+      "abstract:  Abstract. Document layout analysis (DLA) is the task of detecting the distinct, semantic content within a document and correctly classifying these items into an appropriate category (e.g., text, title, figure). DLA pipelines enable users to convert documents into structured machinereadable formats that can then be used for many useful downstream tasks. Most existing state-of-the-art (SOTA) DLA models represent documents as images, discarding the rich metadata available in electronically generated PDFs. Directly leveraging this metadata, we represent each PDF page as a structured graph and frame the DLA problem as a graph segmentation and classification problem. We introduce the Graph-based Layout Analysis Model (GLAM), a lightweight graph neural network competitive with SOTA models on two challenging DLA datasets-while being an order of magnitude smaller than existing models. In particular, the 4-million parameter GLAM model outperforms the leading 140M+ parameter computer vision-based model on 5 of the 11 classes on the DocLayNet dataset. A simple ensemble of these two models achieves a new state-of-the-art on DocLayNet, increasing mAP from 76.8 to 80.8. Overall, GLAM is over 5 times more efficient than SOTA models, making GLAM a favorable engineering choice for DLA tasks.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   4516578811656693411  DOCUMENT          #         en        1.00\n",
+      "1   metadata   1547390354026499610  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7215812434060372258  DOCUMENT  #/texts/9   abstract        1.00\n",
+      "3   language  11271486628029795725      TEXT  #/texts/0         en        0.45\n",
+      "4   semantic  11271486628029795725      TEXT  #/texts/0  reference        0.54\n",
+      "5   language   1547390354026499610      TEXT  #/texts/1         en        0.56\n",
+      "6   semantic   1547390354026499610      TEXT  #/texts/1     header        0.96\n",
+      "7   language   6590265173522894960      TEXT  #/texts/2         en        0.24\n",
+      "8   semantic   6590265173522894960      TEXT  #/texts/2  meta-data        1.00\n",
+      "9   language  11511962287349343203      TEXT  #/texts/3         en        0.31\n",
+      "10  semantic  11511962287349343203      TEXT  #/texts/3  meta-data        1.00\n",
+      "11  language    331836956372778542      TEXT  #/texts/4         en        0.23\n",
       "2006.14615.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. We address the problem of layout gen...\n",
-      "1     title  #/texts/2  Layout Generation and Completion with Self-att...\n",
-      "2    author  #/texts/2                                        Kamal Gupta\n",
-      "3    author  #/texts/2                                 Alessandro Achille\n",
-      "4    author  #/texts/2                                        Larry Davis\n",
-      "5    author  #/texts/2                                Abhinav Shrivastava\n",
-      "6    author  #/texts/3                                       College Park\n",
+      "title:  Layout Generation and Completion with Self-attention\n",
+      "abstract:  Abstract. We address the problem of layout generation for diverse domains such as images, documents, and mobile applications. A layout is a set of graphical elements, belonging to one or more categories, placed together in a meaningful way. Generating a new layout or extending an existing layout requires understanding the relationships between these graphical elements. To do this, we propose a novel framework, Layout-Transformer, that leverages a self-attention based approach to learn contextual relationships between layout elements and generate layouts in a given domain. The proposed model improves upon the state-of-the-art approaches in layout generation in four ways. First, our model can generate a new layout either from an empty set or add more elements to a partial layout starting from an initial set of elements. Second, as the approach is attention-based, we can visualize which previous elements the model is attending to predict the next element, thereby providing an interpretable sequence of layout elements. Third, our model can easily scale to support both a large number of element categories and a large number of elements per layout. Finally, the model also produces an embedding for various element categories, which can be used to explore the relationships between the categories. We demonstrate with experiments that our model can produce meaningful layouts in diverse settings such as object bounding boxes in scenes (COCO bounding boxes), documents (PubLayNet), and mobile applications (RICO dataset). Keywords: Generative modeling, Self-attention, Layout generation\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  16078767997603633708  DOCUMENT          #         en        0.99\n",
+      "1   metadata   3448975359140491406  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15248218062120468612  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "3   metadata    942661215784899310  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "4   language   9584954673896534035      TEXT  #/texts/0         en        0.34\n",
+      "5   semantic   9584954673896534035      TEXT  #/texts/0  reference        1.00\n",
+      "6   language   3448975359140491406      TEXT  #/texts/1         en        0.88\n",
+      "7   semantic   3448975359140491406      TEXT  #/texts/1     header        0.86\n",
+      "8   language   5240041772907971174      TEXT  #/texts/2         en        0.36\n",
+      "9   semantic   5240041772907971174      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   8314509802714261312      TEXT  #/texts/3         en        0.63\n",
+      "11  semantic   8314509802714261312      TEXT  #/texts/3  meta-data        1.00\n",
       "2012.06547.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  We present a deep neural network to predict st...\n",
-      "1     title  #/texts/2  LayoutGMN: Neural Graph Matching for Structura...\n",
-      "2    author  #/texts/2                                  Akshay Gadi Patil\n",
-      "3    author  #/texts/2                                           Manyi Li\n",
-      "4    author  #/texts/2                                     Matthew Fisher\n",
-      "5    author  #/texts/2                                      Manolis Savva\n",
-      "6    author  #/texts/2                                          Hao Zhang\n",
-      "7    author  #/texts/2                            Simon Fraser University\n",
-      "8    author  #/texts/2                                     Adobe Research\n",
+      "title:  LayoutGMN: Neural Graph Matching for Structural Layout Similarity\n",
+      "abstract:  Abstract We present a deep neural network to predict structural similarity between 2D layouts by leveraging Graph Matching Networks (GMN). Our network, coined LayoutGMN, learns the layout metric via neural graph matching, using an attention-based GMN designed under a triplet network setting. To train our network, we utilize weak labels obtained by pixel-wise Intersection-over-Union (IoUs) to define the triplet loss. Importantly, LayoutGMN is built with a structural bias which can effectively compensate for the lack of structure awareness in IoUs. We demonstrate this on two prominent forms of layouts, viz., floorplans and UI designs, via retrieval experiments on large-scale datasets. In particular, retrieval results by our network better match human judgement of structural layout similarity compared to both IoUs and other baselines including a state-of-theart method based on graph neural networks and image convolution. In addition, LayoutGMN is the first deep model to offer both metric learning of structural layout similarity and structural matching between layout elements.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  11932898538783523936  DOCUMENT          #         en        0.99\n",
+      "1   metadata  12407577734524725751  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  15238942460965794110  DOCUMENT  #/texts/3   abstract        1.00\n",
+      "3   metadata  18057253837422795236  DOCUMENT  #/texts/4   abstract        1.00\n",
+      "4   language   5463348197003769018      TEXT  #/texts/0         en        0.39\n",
+      "5   semantic   5463348197003769018      TEXT  #/texts/0  reference        0.99\n",
+      "6   language  12407577734524725751      TEXT  #/texts/1         en        0.56\n",
+      "7   semantic  12407577734524725751      TEXT  #/texts/1     header        0.87\n",
+      "8   language  17514608994654715057      TEXT  #/texts/2         en        0.45\n",
+      "9   semantic  17514608994654715057      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  15238942460965794110      TEXT  #/texts/3         en        0.32\n",
+      "11  semantic  15238942460965794110      TEXT  #/texts/3     header        0.93\n",
       "1908.07836.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract-Recognizing the layout of unstructure...\n",
-      "1     title  #/texts/2  PubLayNet: largest dataset ever for document l...\n",
-      "2    author  #/texts/2                                           Xu Zhong\n",
-      "3    author  #/texts/2                                 Research Australia\n",
-      "4    author  #/texts/2                                          City Road\n",
-      "5    author  #/texts/3                                       Jianbin Tang\n",
-      "6    author  #/texts/3                                 Research Australia\n",
-      "7    author  #/texts/3                                          City Road\n",
-      "8    author  #/texts/4                                 Research Australia\n",
-      "9    author  #/texts/4                                          City Road\n",
+      "title:  PubLayNet: largest dataset ever for document layout analysis\n",
+      "abstract:  Abstract-Recognizing the layout of unstructured digital documents is an important step when parsing the documents into structured machine-readable format for downstream applications. Deep neural networks that are developed for computer vision have been proven to be an effective method to analyze layout of document images. However, document layout datasets that are currently publicly available are several magnitudes smaller than established computing vision datasets. Models have to be trained by transfer learning from a base model that is pre-trained on a traditional computer vision dataset. In this paper, we develop the PubLayNet dataset for document layout analysis by automatically matching the XML representations and the content of over 1 million PDF articles that are publicly available on PubMed Central$^{™}$. The size of the dataset is comparable to established computer vision datasets, containing over 360 thousand document images, where typical document layout elements are annotated. The experiments demonstrate that deep neural networks trained on PubLayNet accurately recognize the layout of scientific articles. The pre-trained models are also a more effective base mode for transfer learning on a different document domain. We release the dataset (https://github.com/ibm-aur-nlp/PubLayNet) to support development and evaluation of more advanced models for document layout analysis. Index Terms-automatic annotation, document layout, deep learning, transfer learning\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language    444664934372890831  DOCUMENT          #         en        0.99\n",
+      "1   metadata  11925642137111790531  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   8260381015807345190  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   4125432983079110592  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  13116259115036862914      TEXT  #/texts/0         en        0.46\n",
+      "5   semantic  13116259115036862914      TEXT  #/texts/0  reference        0.93\n",
+      "6   language  11925642137111790531      TEXT  #/texts/1         en        0.58\n",
+      "7   semantic  11925642137111790531      TEXT  #/texts/1     header        0.72\n",
+      "8   language  13473377044079016890      TEXT  #/texts/2         en        0.58\n",
+      "9   semantic  13473377044079016890      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language  13933898933532581931      TEXT  #/texts/3         en        0.47\n",
+      "11  semantic  13933898933532581931      TEXT  #/texts/3  meta-data        0.99\n",
       "2108.13297.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Documents often contain complex phys...\n",
-      "1     title  #/texts/2  VTLayout: Fusion of Visual and Text Features f...\n",
-      "2    author  #/texts/2                                         Shoubin Li\n",
-      "3    author  #/texts/2                                           Xuyan Ma\n",
-      "4    author  #/texts/2                                       Shuaiqun Pan\n",
-      "5    author  #/texts/2                                             Jun Hu\n",
-      "6    author  #/texts/2                                            Lin Shi\n",
-      "7    author  #/texts/2                                          Qing Wang\n",
+      "title:  VTLayout: Fusion of Visual and Text Features for Document Layout Analysis\n",
+      "abstract:  Abstract. Documents often contain complex physical structures, which make the Document Layout Analysis (DLA) task challenging. As a preprocessing step for content extraction, DLA has the potential to capture rich information in historical or scientific documents on a large scale. Although many deep-learning-based methods from computer vision have already achieved excellent performance in detecting Figure from documents, they are still unsatisfactory in recognizing the List, Table, Text and Title category blocks in DLA. This paper proposes a VTLayout model fusing the documents' deep visual, shallow visual, and text features to localize and identify different category blocks. The model mainly includes two stages, and the three feature extractors are built in the second stage. In the first stage, the Cascade Mask R-CNN model is applied directly to localize all category blocks of the documents. In the second stage, the deep visual, shallow visual, and text features are extracted for fusion to identify the category blocks of documents. As a result, we strengthen the classification power of different category blocks based on the existing localization technique. The experimental results show that the identification capability of the VTLayout is superior to the most advanced method of DLA based on the PubLayNet dataset, and the F1 score is as high as 0.9599. Keywords: Document Layout Analysis · Fusion of Visual and Text · VTLayout · PubLayNet\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  15662473300130382433  DOCUMENT          #         en        1.00\n",
+      "1   metadata   6721946344459692653  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  10147356995285361999  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata  15912456067780105091  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   7673201184713737662      TEXT  #/texts/0         en        0.40\n",
+      "5   semantic   7673201184713737662      TEXT  #/texts/0       text        0.61\n",
+      "6   language   6721946344459692653      TEXT  #/texts/1         en        0.58\n",
+      "7   semantic   6721946344459692653      TEXT  #/texts/1     header        0.89\n",
+      "8   language  18275165193923108925      TEXT  #/texts/2         en        0.40\n",
+      "9   semantic  18275165193923108925      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language   3124243469042107727      TEXT  #/texts/3         en        0.84\n",
+      "11  semantic   3124243469042107727      TEXT  #/texts/3  meta-data        0.96\n",
       "2305.02577.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Abstract. Text reading order is a crucial aspe...\n",
-      "1     title  #/texts/2  Text Reading Order in Uncontrolled Conditions ...\n",
-      "2    author  #/texts/2                                       Renshen Wang\n",
-      "3    author  #/texts/2                                     Yasuhisa Fujii\n",
-      "4    author  #/texts/2                                Alessandro Bissacco\n",
-      "5    author  #/texts/3                                    Google Research\n",
+      "title:  Text Reading Order in Uncontrolled Conditions by Sparse Graph Segmentation\n",
+      "abstract:  Abstract. Text reading order is a crucial aspect in the output of an OCR engine, with a large impact on downstream tasks. Its difficulty lies in the large variation of domain specific layout structures, and is further exacerbated by real-world image degradations such as perspective distortions. We propose a lightweight, scalable and generalizable approach to identify text reading order with a multi-modal, multi-task graph convolutional network (GCN) running on a sparse layout based graph. Predictions from the model provide hints of bidimensional relations among text lines and layout region structures, upon which a post-processing cluster-and-sort algorithm generates an ordered sequence of all the text lines. The model is language-agnostic and runs effectively across multilanguage datasets that contain various types of images taken in uncontrolled conditions, and it is small enough to be deployed on virtually any platform including mobile devices. Keywords: Multi-modality, bidimensional ordering relations, graph convolutional networks.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language   1972699006411133721  DOCUMENT          #         en        1.00\n",
+      "1   metadata   7732980062417155571  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata    571625923297984197  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   4908119920997196749  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   language  16757843187132766078      TEXT  #/texts/0         en        0.58\n",
+      "5   semantic  16757843187132766078      TEXT  #/texts/0  reference        0.66\n",
+      "6   language   7732980062417155571      TEXT  #/texts/1         en        0.87\n",
+      "7   semantic   7732980062417155571      TEXT  #/texts/1     header        0.90\n",
+      "8   language   3313787235241970061      TEXT  #/texts/2         en        0.43\n",
+      "9   semantic   3313787235241970061      TEXT  #/texts/2  meta-data        0.83\n",
+      "10  language  16094051868833861638      TEXT  #/texts/3         en        0.29\n",
+      "11  semantic  16094051868833861638      TEXT  #/texts/3  meta-data        0.99\n",
       "2102.08445.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Additional Key Words and Phrases: Table extrac...\n",
-      "1     title  #/texts/2  TableLab: An Interactive Table Extraction Syst...\n",
+      "title:  TableLab: An Interactive Table Extraction System with Adaptive Deep Learning\n",
+      "abstract:  Additional Key Words and Phrases: Table extraction, neural networks, Label correction ACM Reference Format: Nancy Xin Ru Wang, Douglas Burdick, and Yunyao Li. 2021. TableLab: An Interactive Table Extraction System with Adaptive Deep Learning. In 26th International Conference on Intelligent User Interfaces (IUI '21 Companion), April 14-17, 2021, College Station, TX, USA. ACM, New York, NY, USA, 5 pages. https://doi.org/10.1145/3397482.3450718\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  17938030701221560116  DOCUMENT          #         en        1.00\n",
+      "1   metadata  13161890226093865876  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata  13357791164955825560  DOCUMENT  #/texts/5   abstract        1.00\n",
+      "3   metadata   8235759605829279332  DOCUMENT  #/texts/6   abstract        1.00\n",
+      "4   metadata    562980825111325059  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "5   language   5128146051365197271      TEXT  #/texts/0         en        0.58\n",
+      "6   semantic   5128146051365197271      TEXT  #/texts/0       text        0.98\n",
+      "7   language  13161890226093865876      TEXT  #/texts/1         en        0.64\n",
+      "8   semantic  13161890226093865876      TEXT  #/texts/1  reference        0.55\n",
+      "9   language   8981668870332301618      TEXT  #/texts/2         en        0.50\n",
+      "10  semantic   8981668870332301618      TEXT  #/texts/2  meta-data        1.00\n",
+      "11  language   1596575228035912164      TEXT  #/texts/3         en        0.90\n",
       "2201.01654.pdf\n",
-      "    subtype  subj_path                                               name\n",
-      "0  abstract          #  Tables have been an ever-existing structure to...\n",
-      "1     title  #/texts/2  TableParser: Automatic Table Parsing with Weak...\n",
-      "2    author  #/texts/2                                       Susie Xi Rao\n",
-      "3    author  #/texts/2                                    Johannes Rausch\n",
-      "4    author  #/texts/2                                        Peter Egger\n",
-      "5    author  #/texts/2                                           Ce Zhang\n"
+      "title:  TableParser: Automatic Table Parsing with Weak Supervision from Spreadsheets\n",
+      "abstract:  Abstract Tables have been an ever-existing structure to store data. There exist now different approaches to store tabular data physically. PDFs, images, spreadsheets, and CSVs are leading examples. Being able to parse table structures and extract content bounded by these structures is of high importance in many applications. In this paper, we devise TableParser, a system capable of parsing tables in both native PDFs and scanned images with high precision. We have conducted extensive experiments to show the efficacy of domain adaptation in developing such a tool. Moreover, we create TableAnnotator and ExcelAnnotator, which constitute a spreadsheet-based weak supervision mechanism and a pipeline to enable table parsing. We share these resources with the research community to facilitate further research in this interesting direction.\n",
+      "        type             subj_hash subj_name  subj_path      label  confidence\n",
+      "0   language  13912777304373020997  DOCUMENT          #         en        0.99\n",
+      "1   metadata    901169889309037110  DOCUMENT  #/texts/1      title        1.00\n",
+      "2   metadata   7917292835367858114  DOCUMENT  #/texts/7   abstract        1.00\n",
+      "3   metadata   6679159056897124658  DOCUMENT  #/texts/8   abstract        1.00\n",
+      "4   language   6386611247138565482      TEXT  #/texts/0         en        0.18\n",
+      "5   semantic   6386611247138565482      TEXT  #/texts/0       text        0.88\n",
+      "6   language    901169889309037110      TEXT  #/texts/1         en        0.83\n",
+      "7   semantic    901169889309037110      TEXT  #/texts/1     header        0.39\n",
+      "8   language  14716675017513697862      TEXT  #/texts/2         en        0.24\n",
+      "9   semantic  14716675017513697862      TEXT  #/texts/2  meta-data        1.00\n",
+      "10  language    351740986866721621      TEXT  #/texts/3         en        0.70\n",
+      "11  semantic    351740986866721621      TEXT  #/texts/3  meta-data        0.99\n"
      ]
     }
    ],
@@ -2924,6 +3635,7 @@
     "expected_total = count_results.outputs[\"data_count\"]\n",
     "expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n",
     "\n",
+    "model = init_nlp_model(\"language;reference;metadata\")\n",
     "\n",
     "# Iterate through all results by fetching `page_size` results at the same time\n",
     "all_results = []\n",
@@ -2936,18 +3648,609 @@
     "\n",
     "        res = model.apply_on_doc(doc)\n",
     "\n",
+    "        if \"title\" in res[\"description\"]:\n",
+    "            print(\"title: \", res[\"description\"][\"title\"])\n",
+    "\n",
+    "        if \"abstract\" in res[\"description\"]:\n",
+    "            print(\"abstract: \", res[\"description\"][\"abstract\"])\n",
+    "\n",
     "        props = pd.DataFrame(res[\"properties\"][\"data\"], columns=res[\"properties\"][\"headers\"])\n",
-    "        insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n",
+    "        #print(props[0:12])\n",
     "\n",
-    "        doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n",
-    "        print(doc_insts)\n",
+    "        #insts = pd.DataFrame(res[\"instances\"][\"data\"], columns=res[\"instances\"][\"headers\"])\n",
+    "\n",
+    "        #doc_insts = insts[insts[\"subj_name\"]==\"DOCUMENT\"][[\"subtype\", \"subj_path\", \"name\"]]\n",
+    "        #print(doc_insts)\n",
     "\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5f49fced-9e94-40c2-82b6-e0f66331784e",
+   "metadata": {},
+   "source": [
+    "## Extract MetaData from private documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8da5b230-6b56-4d7d-aa40-262549963217",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "import argparse\n",
+    "\n",
+    "# Import standard dependencies\n",
+    "from copy import deepcopy\n",
+    "import pandas as pd\n",
+    "from numerize.numerize import numerize\n",
+    "from tqdm import tqdm\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Import the deepsearch-toolkit\n",
+    "import deepsearch as ds\n",
+    "from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource\n",
+    "from deepsearch.cps.queries import DataQuery"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "86670afe-5e93-48eb-90ab-895ea5796903",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_indices_in_project(api, proj_key, coll_name):\n",
+    "\n",
+    "    data_indices = api.data_indices.list(proj_key=proj_key)\n",
+    "\n",
+    "    for index in data_indices:\n",
+    "        if coll_name==index.name:\n",
+    "            return index\n",
+    "\n",
+    "    print(\"Could not find collection in project. Please select one of the following collections\")\n",
+    "    for index in data_indices:\n",
+    "        print(\" -> collection: \", index)\n",
+    "    \n",
+    "    return None\n",
+    "\n",
+    "def search_documents(api, proj_key, coll_name, query, max_docs=100, page_size=1):\n",
+    "\n",
+    "    index = get_indices_in_project(api, coll_name=coll_name,\n",
+    "                                   proj_key=proj_key)\n",
+    "\n",
+    "    if index==None:\n",
+    "        return\n",
+    "\n",
+    "    try:\n",
+    "        data_query = DataQuery(query, coordinates=index.source, limit=page_size) # The size of each request page\n",
+    "        cursor = api.queries.run_paginated_query(data_query)\n",
+    "\n",
+    "        # [Optional] Compute the number of total results matched. This can be used to monitor the pagination progress.\n",
+    "        count_query = deepcopy(data_query)\n",
+    "        count_query.paginated_task.parameters[\"limit\"] = 0\n",
+    "        count_results = api.queries.run(count_query)\n",
+    "        expected_total = count_results.outputs[\"data_count\"]\n",
+    "        expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula\n",
+    "\n",
+    "        print(\"#-documents: \", expected_total)\n",
+    "\n",
+    "        cur_docs = 0\n",
+    "        for result_page in tqdm(cursor):\n",
+    "\n",
+    "            if cur_docs>max_docs:\n",
+    "                break\n",
+    "\n",
+    "            for row in result_page.outputs[\"data_outputs\"]:\n",
+    "\n",
+    "                #print(cur_docs, max_docs)\n",
+    "                if cur_docs>max_docs:\n",
+    "                    break\n",
+    "\n",
+    "\n",
+    "                yield row[\"_source\"]\n",
+    "                cur_docs += 1\n",
+    "\n",
+    "    except Exception as e:\n",
+    "        print(\" => \", e)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "2e2d12e8-00ba-49d3-bc8e-19a257eb85df",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#-documents:  9\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "1it [00:01,  1.12s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "document-hash:  0f43aba61158df5f5a00d91434bee8dd47e9dad2a6252ab7607408e2e6057b7d\n",
+      "title: \n",
+      " Source area and tectonic provenance of Paleocene-Eocene red bed\n",
+      "clastics from the Kurdistan area NE Iraq: Bulk-rock geochemistry\n",
+      "constraints \n",
+      "\n",
+      "authors:  [\n",
+      "  {\n",
+      "    \"name\": \"Brian G Jones\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Muatasam Mahmood Hassan\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Solomon Buckman\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Ali Ismail Al Jubory\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Sabah Ahmed Ismail\"\n",
+      "  }\n",
+      "]\n",
+      "affiliations:  [\n",
+      "  {\n",
+      "    \"name\": \"School of Earth and Environmental Sciences\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"University of Wollongong\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"School of Earth Science\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"School of Earth Science\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"University of Kirkuk\"\n",
+      "  }\n",
+      "]\n",
+      "abstract: \n",
+      "\n",
+      "abstract \n",
+      "\n",
+      "Paleocene-Eocene Red Beds exist along a narrow belt in the NW-SE\n",
+      "oriented imbricate zone in northeastern Iraq and are composed of\n",
+      "clastic rocks including conglomerate, sandstone and mudstone. \n",
+      "\n",
+      "Trace elements show that the lower part of the Red Beds (unit one) was\n",
+      "derived mainly from mafic and ultramafic rocks. A decrease in mafic\n",
+      "and ultramafic components in the upper part of the Red Beds is\n",
+      "accompanied by an increase in felsic components indicating the\n",
+      "exposure of both felsic and intermediate igneous bodies in the source\n",
+      "areas. \n",
+      "\n",
+      "Trace elements normalized to upper continental crust confirmed the\n",
+      "mafic and ultramafic source for the lower part of the Red Beds. Unit\n",
+      "two and the overlying unit four reflect a style showing felsic and\n",
+      "mafic trends with transition elements being depleted in these parts.\n",
+      "The intervening unit three shows various patterns partly similar to\n",
+      "units one and two depending on clast abundance. \n",
+      "\n",
+      "The concentrations of rare earth elements in the mudstone reaches up\n",
+      "to 60% of the main chemical elements, therefore it is useful to\n",
+      "concentrate on this facies for geochemical studies. \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2it [00:01,  1.18it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "document-hash:  44cd3953cb824628f2d7fe8976afc9beb2ed07c26ae83f0c79ca357af85af9d4\n",
+      "title: \n",
+      " Facies analysis and diagenetic features of the Aptian Dariyan\n",
+      "Formation in Zagros Fold-Thrust Belt, SW Iran \n",
+      "\n",
+      "authors:  [\n",
+      "  {\n",
+      "    \"name\": \"Arash Shaabanpour Haghighi\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Mohammad Sahraeyan\"\n",
+      "  }\n",
+      "]\n",
+      "affiliations:  []\n",
+      "abstract: \n",
+      "\n",
+      "abstract \n",
+      "\n",
+      "The Aptian Dariyan Formation (upper part of the Khami Group), is one\n",
+      "of the important reservoir rocks in the Zagros Fold-Thrust Belt. The\n",
+      "Zagros Fold-Thrust Belt is located on the boundary between the Arabian\n",
+      "and Eurasian lithospheric plates and formed from collision between\n",
+      "Eurasia and advancing Arabia during the Cenozoic. In these studied\n",
+      "area, the Dariyan Formation with a thickness of 136 meters (Fahliyan\n",
+      "section) and 100 meters (Kuh-e-Rahmat section), consists of carbonate\n",
+      "rocks. Based on the facies analysis and sedimentological data, 16\n",
+      "microfacies were identified. The microfacies are attributed to five\n",
+      "facies belts: tidal flat (lime mudstone, dolomitic mudstone and\n",
+      "stromatolitic boundstone), lagoon (bioclastic packstone, orbitolinids\n",
+      "bioclastic packstone and orbitolinids peloidal packstone), shoal\n",
+      "(orbitolinids grainstone and peloidal grainstone), restricted\n",
+      "(peloidal packstone, rudist floatstone/rudstone and orbitolinid\n",
+      "wackestone), and open marine (orbitolinid floatstone, dasycladacean\n",
+      "algae floatstone, bioclast pelagic foraminiferal wackestone/packstone,\n",
+      "pelagic foraminiferal mudstone/wackestone, and calcispere\n",
+      "packstone/wackestone). The depositional model relates to the carbonate\n",
+      "ramp. The allochems of the Dariyan Formation are dominated by\n",
+      "foraminifera, bioclasts and green algae. Peloids, and intraclasts are\n",
+      "less abundant in this formation. Due to the great diversity and\n",
+      "abundance of the foraminifera, this carbonate ramp is referred to as a\n",
+      "''foraminifera-dominated carbonate ramp system''. This carbonate\n",
+      "system reflects a local regression in the Fahliyan section which can\n",
+      "be related to the vertical movement of the Kazeroon Fault. The\n",
+      "carbonates of Dariyan Formation have been affected by a variety of\n",
+      "diagenetic processes such as compaction, dissolution, cementation,\n",
+      "neomorphism, and dolomitization. \n",
+      "\n",
+      "Ó 2014 Elsevier Ltd. All rights reserved. \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "3it [00:02,  1.26it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "document-hash:  45319f285bb4544209fb74269a72a17c3a3525246945441aec927928a105bf04\n",
+      "title: \n",
+      " Integrated provenance analysis of Zakeen (Devonian) and Faraghan\n",
+      "(early Permian) sandstones in the Zagros belt, SW Iran \n",
+      "\n",
+      "authors:  [\n",
+      "  {\n",
+      "    \"name\": \"S Mohammad Zamanzadeh\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Yousef Zoleikhaei\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Abdolhossein Amini\"\n",
+      "  }\n",
+      "]\n",
+      "affiliations:  [\n",
+      "  {\n",
+      "    \"name\": \"College of Science\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"University of Tehran\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Faculty of Geography\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"University of Tehran\"\n",
+      "  }\n",
+      "]\n",
+      "abstract: \n",
+      "\n",
+      "abstract \n",
+      "\n",
+      "Successions of a controversial period of time in the Zagros and\n",
+      "Arabian Plate stratigraphic column, including Zakeen (Devonian) and\n",
+      "Faraghan (early Permian) formations are investigated for their\n",
+      "provenance characteristics. Nearly similar depositional environments\n",
+      "of the formations, regardless of 70-80 My hiatus between them, is the\n",
+      "main motivation for this study. Evidence from various methods are put\n",
+      "together to reconstruct a comprehensive image of their provenance.\n",
+      "Results from petrographic and detrital mode analysis indicate a\n",
+      "continental block provenance for of the sandstones of both formations.\n",
+      "In addition, evidence of recycling is evident from some rock fragments\n",
+      "in the conglomeratic facies. Heavy mineral diversities are limited to\n",
+      "the ultra-stable species which represent consistent morphological\n",
+      "characteristics in both formations. However, the values of rutile:\n",
+      "zircon index (RZi) showed intermittent changes from low RZi to high\n",
+      "RZi intervals in both formations. Detrital zircon age data in previous\n",
+      "studies represented the same source for these two formations, which\n",
+      "also remained unchanged from Neo-Proterozoic to late Paleozoic\n",
+      "successions. Zircon grains' morphology, however, showed remarkable\n",
+      "difference between the Zakeen and Faraghan formations on the one hand\n",
+      "and successions deposited in the basin prior to the tectonic movements\n",
+      "of mid-Paleozoic time on the other. Outcomes of this study show that,\n",
+      "although each single technique may shed light on a particular aspect\n",
+      "of the greater provenance problem, by integration of all the data,\n",
+      "important evidence of recycled nature of these successions could be\n",
+      "confirmed. Changes in the thickness of the Paleozoic units, the nature\n",
+      "of their stratal surfaces, along with the information from magmatic\n",
+      "events in the area provide a tectonostratigraphic framework for\n",
+      "northern margin of Gondwana in which the recycled nature of these\n",
+      "successions is justifiable. The recycled nature of the studied\n",
+      "formations on the one hand, and their identical provenance on the\n",
+      "other, raise a challenge for the timing proposed for two tectonic\n",
+      "activities of middle Paleozoic and mid-Carboniferous. \n",
+      "\n",
+      "Ó 2014 Elsevier Ltd. All rights reserved. \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "4it [00:03,  1.42it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "document-hash:  71b9d4a7505055da7d78886e41abc80602eecdaed0863a0f51add493f38968ba\n",
+      "title: \n",
+      " Multi-phase inversion tectonics related to the Hendijan e Nowrooz e\n",
+      "Khafji Fault activity, Zagros Mountains, SW Iran \n",
+      "\n",
+      "authors:  [\n",
+      "  {\n",
+      "    \"name\": \"Sadjad Kazem Shiroodi\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Mohammad Ghafoori\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Ali Faghih\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Mostafa Ghanadian\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Gholamreza Lashkaripour\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Naser Hafezi Moghadas\"\n",
+      "  }\n",
+      "]\n",
+      "affiliations:  [\n",
+      "  {\n",
+      "    \"name\": \"Department of Geology\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Faculty of Sciences\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Ferdowsi University of Mashhad\"\n",
+      "  }\n",
+      "]\n",
+      "abstract: \n",
+      "\n",
+      "abstract \n",
+      "\n",
+      "Distinctive characteristics of inverted structures make them important\n",
+      "criteria for the identification of certain structural styles of folded\n",
+      "belts. The interpretation of 3D seismic reflection and well data sheds\n",
+      "new light on the structural evolution and age of inverted structures\n",
+      "associated to the Hendijan$_{e}$Nowrooz $_{e}$Khafji Fault within the\n",
+      "Persian Gulf Basin and northeastern margin of Afro-Arabian plate.\n",
+      "Analysis of thickness variations of growth strata using $_{'}$T-Z\n",
+      "plot$_{'}$ (thickness versus throw plot) method revealed the\n",
+      "kinematics of the fault. Obtained results show that the fault has\n",
+      "experienced a multi-phase evolutionary history over six different\n",
+      "extension and compression deformation events (i.e. positive and\n",
+      "negative inversion) between 252.2 and 11.62 Ma. This cyclic activity\n",
+      "of the growth fault was resulted from alteration of sedimentary\n",
+      "processes during continuous fault slip. The structural development of\n",
+      "the study area both during positive and negative inversion geometry\n",
+      "styles was ultimately controlled by the relative motion between the\n",
+      "Afro-Arabian and Central-Iranian plates. \n",
+      "\n",
+      "© 2015 Elsevier Ltd. All rights reserved. \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "5it [00:03,  1.57it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "document-hash:  7594495bb7872d4aa3bfa7bacbc4f598fa8c84fddc6c553effaf4f1b101935c0\n",
+      "title: \n",
+      " Lithofacies, architectural elements and tectonic provenance of the\n",
+      "siliciclastic rocks of the Lower Permian Dorud Formation in the Alborz\n",
+      "Mountain Range, Northern Iran \n",
+      "\n",
+      "authors:  [\n",
+      "  {\n",
+      "    \"name\": \"Mojtaba Javidan\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Hosseinali Mokhtarpour\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Mohammad Sahraeyan\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Hojatollah Kheyrandish\"\n",
+      "  }\n",
+      "]\n",
+      "affiliations:  [\n",
+      "  {\n",
+      "    \"name\": \"Department of Geology\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"College of Basic Sciences\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Department of Geology\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Department of Geology\"\n",
+      "  }\n",
+      "]\n",
+      "abstract: \n",
+      "\n",
+      "abstract \n",
+      "\n",
+      "The siliciclastic deposits of the Lower Permian Dorud Formation widely\n",
+      "crop out in the eastern part of the Alborz Mountain Range (northern\n",
+      "Iran). In order to interpret the sedimentary environments and tectonic\n",
+      "provenance of these deposits, two sections in the Kiyasar and\n",
+      "Talmadareh with 112 and 122 m thickness, respectively; have been\n",
+      "studied. The analysis of lithofacies and architectural elements, leads\n",
+      "to recognition of seven lithofacies (Gmm, Sr, Sl, Sh, Sp, Fl, and Fm),\n",
+      "and four architectural elements (FF, LA, CH, and CR). Based on these\n",
+      "results, the sedimentary environment of these deposits has been\n",
+      "identified as a sandy meandering river. The petrographical analysis\n",
+      "indicates that these sediments were deposited under humid weather in\n",
+      "the craton interior and recycled orogeny tectonic provenance. \n",
+      "\n",
+      "Ó 2015 Elsevier Ltd. All rights reserved. \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "6it [00:04,  1.64it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "document-hash:  7c5d4947280cec27fbb01892eea145933df0813be615ccd5fb5bb5503254d0f1\n",
+      "title: \n",
+      " Stratigraphy, mineralogy and depositional environment of the evaporite\n",
+      "unit in the As ¸ kale (Erzurum) sub-basin, Eastern Anatolia (Turkey) \n",
+      "\n",
+      "authors:  [\n",
+      "  {\n",
+      "    \"name\": \"Emel Abdio\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Mehmet Arslan\"\n",
+      "  },\n",
+      "  {\n",
+      "    \"name\": \"Cahit Helvac\"\n",
+      "  }\n",
+      "]\n",
+      "affiliations:  []\n",
+      "abstract: \n",
+      "\n",
+      "abstract \n",
+      "\n",
+      "The study area is situated in the As¸ kale sub-basin where the Early-\n",
+      "Middle Miocene aged As¸ kale Formation was deposited in a shallow\n",
+      "marine to lagoonal environment, and consists of interstratifications\n",
+      "of clastic sediments, carbonates and evaporites. The successions of\n",
+      "the As¸ kale Formation can be divided into four main members\n",
+      "interfingering with one another both vertically and laterally, and\n",
+      "composed of the sandstone-mudstone-limestone member, the evaporite\n",
+      "member, the gravelstone-sandstone-mudstone intercalations and the\n",
+      "limestone member. The evaporite unit comprises of secondary gypsum\n",
+      "lithofacies formed by hydration of precursor anhydrite, anhydrite,\n",
+      "gypsum-bearing limestone and claystone in the form of wedges and\n",
+      "lenses. Massive, nodular, nodular-banded, laminated and laminated-\n",
+      "banded gpysum lithofacieses in addition to chicken-wire and rare\n",
+      "entrolithic structures were described, indicating a sabhka or a\n",
+      "shallow water depositional environment. Alabastrine and porphyblastic\n",
+      "textures of gypsum were identified within the all lithofacieses with\n",
+      "abundant amount of anhydrite relics. Additionally, saponite and\n",
+      "illite/smectite, calcite and dolomite, celestite, epsomite were also\n",
+      "observed. Successions of the As¸ kale Formation were deposited in\n",
+      "stable subtropical climatic conditions within rapidly subsiding sub-\n",
+      "basin resulted in conversion of sub-basin to shallow platform and even\n",
+      "in lagoon environment. \n",
+      "\n",
+      "© 2015 Elsevier Ltd. All rights reserved. \n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "6it [00:04,  1.32it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)\n",
+    "\n",
+    "model = init_nlp_model(\"language;reference;metadata\")\n",
+    "\n",
+    "proj_key = \"c4ae6545156c5f99770fdfd161102a01567d8ecd\"\n",
+    "#coll_name = \"GeoArabia\"\n",
+    "#coll_name = \"BasinResearch1\"\n",
+    "coll_name = \"African_ES\"\n",
+    "\n",
+    "query = \"*\"\n",
+    "\n",
+    "for doc in search_documents(api, proj_key, coll_name, query, max_docs=5, page_size=1):\n",
+    "    \n",
+    "    print(\"document-hash: \", doc[\"file-info\"][\"document-hash\"])\n",
+    "    \n",
+    "    res = model.apply_on_doc(doc)\n",
+    "    #print(res[\"description\"].keys())\n",
+    "  \n",
+    "    if \"title\" in res[\"description\"]:\n",
+    "        text = res[\"description\"][\"title\"]\n",
+    "        text = \"\\n\".join(textwrap.wrap(text, width=70))\n",
+    "\n",
+    "        print(\"title: \\n\", text, \"\\n\")\n",
+    "\n",
+    "    if \"authors\" in res[\"description\"]:\n",
+    "        print(\"authors: \", json.dumps(res[\"description\"][\"authors\"], indent=2))\n",
+    "\n",
+    "    if \"affiliations\" in res[\"description\"]:\n",
+    "        print(\"affiliations: \", json.dumps(res[\"description\"][\"affiliations\"], indent=2))\n",
+    "    \n",
+    "    if \"abstract\" in res[\"description\"]:\n",
+    "\n",
+    "        print(\"abstract: \\n\")\n",
+    "        for _ in res[\"description\"][\"abstract\"]:\n",
+    "            text = \"\\n\".join(textwrap.wrap(_, width=70))\n",
+    "            print(text, \"\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7207ed71-a8e3-4e4f-88c1-5fbbb4f82ac1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4cf42286-b0a1-438a-8c7f-d852e61c260f",
+   "id": "9b8d13fa-d23d-46c6-9b05-a07b27e4d6c7",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/poetry.lock b/poetry.lock
index 6b99c52..1a6e33e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -985,23 +985,23 @@ files = [
 
 [[package]]
 name = "deepsearch-glm"
-version = "0.17.2"
+version = "0.18.4"
 description = "Graph Language Models"
 optional = false
 python-versions = "<4.0,>=3.8"
 files = [
-    {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6234fb2fa6755ff1bb7000d21e4574eea68a29557d8f16ba179f5f5713766d9b"},
-    {file = "deepsearch_glm-0.17.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:af97145ebb2f21b074ef6385c45d60a2d2553b68254c30aa66b7ddd9206b7f7b"},
-    {file = "deepsearch_glm-0.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fe2669ac7f8567383e0818fe9b3b73979978fb5e65f36db3b7626bf3af6206d"},
-    {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:95cbe8e8264c675a128f520f33afa3fd34295c64b00d282c015fe13c7cc2bf3b"},
-    {file = "deepsearch_glm-0.17.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b9203be399dd4f026769998cca25a5691ff79791ead2dfa05385af8467f4bd8"},
-    {file = "deepsearch_glm-0.17.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f512fede5fd062ff005f51073ef5660e2e963e0013251176b69bd7ab9e45faa"},
-    {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:be15a98b0cbf36e5141e5dd8e22ba29b0e0d92604fc58e53e8fa6c837b29a40f"},
-    {file = "deepsearch_glm-0.17.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e3ddcaf73dd5578786db3333c238790a45d0fa0af4b1df9a41a4b9dd234c2401"},
-    {file = "deepsearch_glm-0.17.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e7b819d58df057bd1826fea8cd3e6d0ed4cac3fe819795ee5205360fa77fee"},
-    {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:0f8405524b000669b82098b1989e8c4ef4da0f93407477c1d807533d9f427867"},
-    {file = "deepsearch_glm-0.17.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:da459f79913b0f967f5802766b2b964bc997d7f5259663901d84c2780961dfa8"},
-    {file = "deepsearch_glm-0.17.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50757b64a607104882a683b2a570f96f450faaf0f1047125df043d01406f2f16"},
+    {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ad88c5bf3c203174ef81e0699405aec0f5386130cbc6a975b165f81887bc1a52"},
+    {file = "deepsearch_glm-0.18.4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:21d51a0671f0713d23be57030287a0f907f4a5f0627a45ea07e2caf54129a71a"},
+    {file = "deepsearch_glm-0.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fc853941ea751a15f65e83f9bee9f988d0ecac4b28fac067b2aab49e15edb74"},
+    {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cab5e577cf724343f2a5987ff4488c69e86a2dbca8cb0359c9243a07c6cd7d69"},
+    {file = "deepsearch_glm-0.18.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:dda02391306d657a884b12f21cc3d1228663f940ec6001c833893dd2844bcc25"},
+    {file = "deepsearch_glm-0.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dccd4286a93ee1a216acba27e1fc76f5d14e280d968998cfeae11a00ad1b6cb"},
+    {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:cf38368bc72eab673459ea0fc96c02b1f3ae120df2d9443e1a63e010764ac1e9"},
+    {file = "deepsearch_glm-0.18.4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:d3fd83ea3b2bce11bac1d710f12547728f4dd48bfaa8bd472366ef144469d52c"},
+    {file = "deepsearch_glm-0.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fb4bfd43ac3b996cdd151c35e94fa399953ee3952d7e86390a825880ece95f3"},
+    {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:57cb67e435cacb6c4a6b6a9109d943267c493ebbba252a88ca40909976f60225"},
+    {file = "deepsearch_glm-0.18.4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:edc399939b6464f96600d2f23796ae2641d668fb794b77199e87abdef77f8853"},
+    {file = "deepsearch_glm-0.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00ad8d932e7f0d1be4fd99fc0d4c8d50cb1ff10764f146b6ecb310a1379123d4"},
 ]
 
 [package.dependencies]
@@ -5279,4 +5279,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">= 3.8, <3.11"
-content-hash = "928e878a81836c1528c27876675534238f9bc5c350965694061acb31ef559c3b"
+content-hash = "73f1b0b84cdeb292efcf668a0a3deeb2ac3f76ebcc3ced6791f681792f17d3b8"
diff --git a/pyproject.toml b/pyproject.toml
index 0b53572..77fc873 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ python-dotenv = "^1.0.0"
 nbclient = "^0.9.0"
 pandas = "^1.5.1"
 argilla = "^1.24.0"
-deepsearch-glm = "v0.17.2"
+deepsearch-glm = "v0.18.4"
 
 [tool.poetry.group.dev.dependencies]
 black = {extras = ["jupyter"], version = "^22.1.0"}

From d76a64fae21543a6c678ecf0e60a732d0da52c03 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Wed, 8 May 2024 09:16:31 +0200
Subject: [PATCH 3/3] reverted other notebook to original

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 948a82b..022eed8 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,9 @@ is based on [Profiles][profiles]. Unless otherwise configured, the profile used
 |    | Name              | Description |
 | -- | ----------------- | ----------- |
 | 1. | [NLP on documents](examples/nlp_on_documents/nlp_on_documents.ipynb) | A few quick examples on how to apply NLP models on documents (eg extracting key-terms) |
-| 2. | [Reference Parsing](examples/nlp_for_references/nlp_for_references.ipynb) | Examples on how to parse references from Documents |
-| 3. | [Material Extraction](examples/nlp_for_materials/nlp_for_materials.ipynb) | Examples on how to extract materials from Documents |
+| 2. | [Metadata Extraction](examples/nlp_for_metadata/nlp_for_metadata.ipynb) | Examples on how to extract metadata from Documents |
+| 3. | [Reference Parsing](examples/nlp_for_references/nlp_for_references.ipynb) | Examples on how to parse references from Documents |
+| 4. | [Material Extraction](examples/nlp_for_materials/nlp_for_materials.ipynb) | Examples on how to extract materials from Documents |
 
 ### Data queries