Skip to content

Commit 1601bad

Browse files
PDF parsing using Azure AI Document Intelligence: fixed 2 code blocks. (1) Increased the Elasticsearch client request timeout from 60 to 300 seconds. (2) Fixed an issue when combining paragraph and table text.
1 parent 92ecb60 commit 1601bad

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

supporting-blog-content/pdf-azure-ai-document-intelligence/pdf-azure-ai-document-intelligence.ipynb

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,10 +265,16 @@
265265
" structured_data = []\n",
266266
"\n",
267267
" # Combine paragraph and table content\n",
268-
" for p_number, contents in {**paragraph_content, **table_content}.items():\n",
268+
" for p_number in set(paragraph_content.keys()).union(table_content.keys()):\n",
269269
" concatenated_text = \"\"\n",
270-
" for content in contents:\n",
271-
" concatenated_text += content[\"content_text\"] + \"\\n\"\n",
270+
"\n",
271+
" if p_number in paragraph_content:\n",
272+
" for content in paragraph_content[p_number]:\n",
273+
" concatenated_text += content[\"content_text\"] + \"\\n\"\n",
274+
"\n",
275+
" if p_number in table_content:\n",
276+
" for content in table_content[p_number]:\n",
277+
" concatenated_text += content[\"content_text\"] + \"\\n\"\n",
272278
"\n",
273279
" page_content_concatenated[p_number] = concatenated_text.strip()\n",
274280
"\n",
@@ -360,7 +366,7 @@
360366
"ES_URL = os.getenv(\"ES_URL\")\n",
361367
"ES_API_KEY = os.getenv(\"ES_API_KEY\")\n",
362368
"\n",
363-
"es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=60)"
369+
"es = Elasticsearch(hosts=ES_URL, api_key=ES_API_KEY, request_timeout=300)"
364370
]
365371
},
366372
{

0 commit comments

Comments
 (0)