|
431 | 431 | "print(f\"Fetched and exported {doc_num} documents.\")" |
432 | 432 | ] |
433 | 433 | }, |
434 | | - { |
435 | | - "cell_type": "markdown", |
436 | | - "metadata": {}, |
437 | | - "source": [ |
438 | | - "### Using the backend converter (optional)\n", |
439 | | - "\n", |
440 | | - "- The custom backend converters `JatsDocumentBackend` and `PatentUsptoDocumentBackend` handle the parsing of PMC articles and USPTO patents, respectively.\n", |
441 | | - "- As with any other backend, you can use the function `is_valid()` to check whether the input document is supported by this backend.\n", |
442 | | - "- Note that some XML sections in the original USPTO zip file, such as sequence listings, may not represent patents and will therefore be reported as invalid by the backend." |
443 | | - ] |
444 | | - }, |
445 | | - { |
446 | | - "cell_type": "code", |
447 | | - "execution_count": 11, |
448 | | - "metadata": {}, |
449 | | - "outputs": [ |
450 | | - { |
451 | | - "name": "stdout", |
452 | | - "output_type": "stream", |
453 | | - "text": [ |
454 | | - "Document nihpp-2024.12.26.630351v1.nxml is a valid PMC article? True\n", |
455 | | - "Document ipg241217-1.xml is a valid patent? True\n" |
456 | | - ] |
457 | | - }, |
458 | | - { |
459 | | - "data": { |
460 | | - "application/vnd.jupyter.widget-view+json": { |
461 | | - "model_id": "316241ca89a843bda3170f2a5c76c639", |
462 | | - "version_major": 2, |
463 | | - "version_minor": 0 |
464 | | - }, |
465 | | - "text/plain": [ |
466 | | - " 0%| | 0/4014 [00:00<?, ?it/s]" |
467 | | - ] |
468 | | - }, |
469 | | - "metadata": {}, |
470 | | - "output_type": "display_data" |
471 | | - }, |
472 | | - { |
473 | | - "name": "stdout", |
474 | | - "output_type": "stream", |
475 | | - "text": [ |
476 | | - "Found 3928 patents out of 4014 XML files.\n" |
477 | | - ] |
478 | | - } |
479 | | - ], |
480 | | - "source": [ |
481 | | - "from tqdm.notebook import tqdm\n", |
482 | | - "\n", |
483 | | - "from docling.backend.xml.jats_backend import JatsDocumentBackend\n", |
484 | | - "from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend\n", |
485 | | - "from docling.datamodel.base_models import InputFormat\n", |
486 | | - "from docling.datamodel.document import InputDocument\n", |
487 | | - "\n", |
488 | | - "# check PMC\n", |
489 | | - "in_doc = InputDocument(\n", |
490 | | - " path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\",\n", |
491 | | - " format=InputFormat.XML_JATS,\n", |
492 | | - " backend=JatsDocumentBackend,\n", |
493 | | - ")\n", |
494 | | - "backend = JatsDocumentBackend(\n", |
495 | | - " in_doc=in_doc, path_or_stream=TEMP_DIR / \"nihpp-2024.12.26.630351v1.nxml\"\n", |
496 | | - ")\n", |
497 | | - "print(f\"Document {in_doc.file.name} is a valid PMC article? {backend.is_valid()}\")\n", |
498 | | - "\n", |
499 | | - "# check USPTO\n", |
500 | | - "in_doc = InputDocument(\n", |
501 | | - " path_or_stream=TEMP_DIR / \"ipg241217-1.xml\",\n", |
502 | | - " format=InputFormat.XML_USPTO,\n", |
503 | | - " backend=PatentUsptoDocumentBackend,\n", |
504 | | - ")\n", |
505 | | - "backend = PatentUsptoDocumentBackend(\n", |
506 | | - " in_doc=in_doc, path_or_stream=TEMP_DIR / \"ipg241217-1.xml\"\n", |
507 | | - ")\n", |
508 | | - "print(f\"Document {in_doc.file.name} is a valid patent? {backend.is_valid()}\")\n", |
509 | | - "\n", |
510 | | - "patent_valid = 0\n", |
511 | | - "pbar = tqdm(TEMP_DIR.glob(\"*.xml\"), total=doc_num)\n", |
512 | | - "for in_path in pbar:\n", |
513 | | - " in_doc = InputDocument(\n", |
514 | | - " path_or_stream=in_path,\n", |
515 | | - " format=InputFormat.XML_USPTO,\n", |
516 | | - " backend=PatentUsptoDocumentBackend,\n", |
517 | | - " )\n", |
518 | | - " backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)\n", |
519 | | - " patent_valid += int(backend.is_valid())\n", |
520 | | - "\n", |
521 | | - "print(f\"Found {patent_valid} patents out of {doc_num} XML files.\")" |
522 | | - ] |
523 | | - }, |
524 | | - { |
525 | | - "cell_type": "markdown", |
526 | | - "metadata": {}, |
527 | | - "source": [ |
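528 | | - "Calling the function `convert()` on a backend converts its input document into a `DoclingDocument`. The cell below reuses the `backend` variable left over from the last iteration of the loop above." |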
529 | | - ] |
530 | | - }, |
531 | | - { |
532 | | - "cell_type": "code", |
533 | | - "execution_count": 12, |
534 | | - "metadata": {}, |
535 | | - "outputs": [ |
536 | | - { |
537 | | - "name": "stdout", |
538 | | - "output_type": "stream", |
539 | | - "text": [ |
540 | | - "Patent \"Semiconductor package\" has 19 claims\n" |
541 | | - ] |
542 | | - } |
543 | | - ], |
544 | | - "source": [ |
545 | | - "doc = backend.convert()\n", |
546 | | - "\n", |
547 | | - "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n", |
548 | | - "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')" |
549 | | - ] |
550 | | - }, |
551 | | - { |
552 | | - "cell_type": "markdown", |
553 | | - "metadata": {}, |
554 | | - "source": [ |
555 | | - "✏️ **Tip**: in general, there is no need to use the backend converters to parse USPTO or JATS (PubMed) XML files. The generic `DocumentConverter` object tries to guess the input document format and applies the corresponding backend parser. The conversion shown in [Simple Conversion](#simple-conversion) is the recommended usage for the supported XML files." |
556 | | - ] |
557 | | - }, |
558 | 434 | { |
559 | 435 | "cell_type": "markdown", |
560 | 436 | "metadata": {}, |
|
923 | 799 | "name": "python", |
924 | 800 | "nbconvert_exporter": "python", |
925 | 801 | "pygments_lexer": "ipython3", |
926 | | - "version": "3.12.8" |
| 802 | + "version": "3.12.10" |
927 | 803 | } |
928 | 804 | }, |
929 | 805 | "nbformat": 4, |
|
0 commit comments