Skip to content

Commit eab5b4a

Browse files
[FEATURE] Migrate kitodo#1712 to main branch (kitodo#1717)
Co-authored-by: Markus Weigelt <Markus.Weigelt@slub-dresden.de>
1 parent 98820f1 commit eab5b4a

File tree

7 files changed

+315
-6
lines changed

7 files changed

+315
-6
lines changed

Classes/Common/FullTextReader.php

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class FullTextReader
3333

3434
/**
3535
* Constructor
36-
*
36+
*
3737
* @param array $formats
3838
*/
3939
public function __construct(array $formats)
@@ -44,7 +44,7 @@ public function __construct(array $formats)
4444

4545
/**
4646
* This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an
47-
* XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have
47+
* XML full text representation. For IIIF manifests, ALTO documents have
4848
* to be given in the Canvas' / Manifest's "seeAlso" property.
4949
*
5050
* @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
@@ -83,7 +83,7 @@ public function getFromXml(string $id, array $fileLocations, $physicalStructureN
8383
if (!empty($fileContent) && !empty($this->formats[$textFormat])) {
8484
$textMiniOcr = '';
8585
if (!empty($this->formats[$textFormat]['class'])) {
86-
$textMiniOcr = $this->getRawTextFromClass($fileContent, $textFormat);
86+
$textMiniOcr = $this->getRawTextFromClass($id, $fileContent, $textFormat);
8787
}
8888
$fullText = $textMiniOcr;
8989
} else {
@@ -98,12 +98,14 @@ public function getFromXml(string $id, array $fileLocations, $physicalStructureN
9898
*
9999
* @access private
100100
*
101+
* @param string $id The "@ID" attribute of the physical structure node (METS) or the "@id" property
102+
* of the Manifest / Range (IIIF)
101103
* @param string $fileContent The content of the XML file
102104
* @param string $textFormat
103105
*
104106
* @return string
105107
*/
106-
private function getRawTextFromClass(string $fileContent, string $textFormat): string
108+
private function getRawTextFromClass(string $id, string $fileContent, string $textFormat): string
107109
{
108110
$textMiniOcr = '';
109111
$class = $this->formats[$textFormat]['class'];
@@ -113,6 +115,7 @@ private function getRawTextFromClass(string $fileContent, string $textFormat): s
113115
if ($obj instanceof FulltextInterface) {
114116
// Load XML from file.
115117
$ocrTextXml = Helper::getXmlFileAsString($fileContent);
118+
$obj->setPageId($id);
116119
$textMiniOcr = $obj->getTextAsMiniOcr($ocrTextXml);
117120
} else {
118121
$this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"');

Classes/Common/FulltextInterface.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,16 @@
2424
*/
2525
interface FulltextInterface
2626
{
27+
28+
/**
29+
* Set the page identifier.
30+
*
31+
* @access public
32+
*
33+
* @param string $pageId The page identifier of mets:div in the physical struct map of the METS.
34+
*/
35+
public function setPageId(string $pageId): void;
36+
2737
/**
2838
* This extracts raw fulltext data from XML
2939
*

Classes/Format/Alto.php

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
namespace Kitodo\Dlf\Format;
1414

15+
use Kitodo\Dlf\Common\FulltextInterface;
16+
1517
/**
1618
* Fulltext ALTO format class for the 'dlf' extension
1719
*
@@ -22,7 +24,7 @@
2224
*
2325
* @access public
2426
*/
25-
class Alto implements \Kitodo\Dlf\Common\FulltextInterface
27+
class Alto implements FulltextInterface
2628
{
2729
/**
2830
* This extracts the fulltext data from ALTO XML
@@ -159,4 +161,9 @@ private function registerAltoNamespace(\SimpleXMLElement &$xml)
159161
$xml->registerXPathNamespace('alto', 'http://www.loc.gov/standards/alto/ns-v4#');
160162
}
161163
}
164+
165+
/**
 * Set the page identifier.
 *
 * Required by the fulltext interface; this ALTO implementation ignores the value.
 *
 * @access public
 *
 * @param string $pageId The page identifier (unused by this format)
 *
 * @return void
 */
public function setPageId(string $pageId): void
166+
{
167+
// Nothing to do here: the identifier is intentionally discarded.
168+
}
162169
}

Classes/Format/Tei.php

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
<?php
2+
3+
/**
4+
* (c) Kitodo. Key to digital objects e.V. <contact@kitodo.org>
5+
*
6+
* This file is part of the Kitodo and TYPO3 projects.
7+
*
8+
* @license GNU General Public License version 3 or later.
9+
* For the full copyright and license information, please read the
10+
* LICENSE.txt file that was distributed with this source code.
11+
*/
12+
13+
namespace Kitodo\Dlf\Format;
14+
15+
use Kitodo\Dlf\Common\FulltextInterface;
16+
use Psr\Log\LoggerAwareInterface;
17+
use Psr\Log\LoggerAwareTrait;
18+
use SimpleXMLElement;
19+
20+
/**
21+
 * Fulltext TEI format class for the 'dlf' extension
22+
*
23+
 * ** This currently supports TEI documents in the namespace http://www.tei-c.org/ns/1.0 **
24+
*
25+
* @package TYPO3
26+
* @subpackage dlf
27+
*
28+
* @access public
29+
*/
30+
class Tei implements FulltextInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * Namespace URI a document has to declare to be processed as TEI.
     */
    private const TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0';

    /**
     * Identifier of the page whose text is extracted. It is compared with the
     * "facs" attribute of the page breaks (<pb />) after stripping a leading "#".
     * Initialised to an empty string so that it can be read before setPageId() was called.
     */
    private string $pageId = '';

    /**
     * Set the page identifier.
     *
     * @access public
     *
     * @param string $pageId The page identifier to look up in the "facs" attributes of the page breaks
     *
     * @return void
     */
    public function setPageId(string $pageId): void
    {
        $this->pageId = $pageId;
    }

    /**
     * This extracts the fulltext data of the page set via setPageId() from TEI XML
     *
     * An empty string is returned (and a message is logged) if no page ID is set, the
     * document does not declare the TEI namespace, it has no "text" element or no
     * page break references the page ID.
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The raw unformatted fulltext
     */
    public function getRawText(\SimpleXMLElement $xml): string
    {
        if ($this->pageId === '') {
            $this->log('warning', 'Text could not be retrieved from TEI because the page ID is empty.');
            return '';
        }

        // Without a registered prefix the XPath below would fail instead of returning nodes.
        if (!$this->registerTeiNamespace($xml)) {
            $this->log('warning', 'Text could not be retrieved because the document does not declare the TEI namespace.');
            return '';
        }

        // xpath() yields false/null on error and an empty array if nothing matches.
        $textNodes = $xml->xpath('./TEI:text');
        if (empty($textNodes)) {
            $this->log('warning', 'Text could not be retrieved because the TEI document has no "text" element.');
            return '';
        }

        $contentXml = $textNodes[0]->asXML();
        if (!\is_string($contentXml)) {
            return '';
        }

        // The patterns are applied in order:
        // 1. remove structural tags but keep their content,
        // 2. drop line breaks (<lb />),
        // 3. collapse whitespace runs to a single space.
        $contentXml = preg_replace(
            [
                '/<\/?(?:body|front|div|head|titlePage)[^>]*>/u',
                '/<lb(?:\s[^>]*)?\/>/u',
                '/\s+/',
            ],
            ['', '', ' '],
            $contentXml
        );
        if (!\is_string($contentXml)) {
            // preg_replace() returns null on failure (e.g. invalid UTF-8 with the "u" modifier).
            $this->log('warning', 'Text could not be retrieved from TEI because the document could not be processed.');
            return '';
        }

        // Extract content between each <pb /> and the next <pb /> or end of string
        $pattern = '/<pb[^>]*facs="([^"]+)"[^>]*\/>([\s\S]*?)(?=<pb[^>]*\/>|$)/u';
        $facs = [];

        if (preg_match_all($pattern, $contentXml, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                $facsMatch = trim($match[1]);
                $facsId = str_starts_with($facsMatch, "#") ? substr($facsMatch, 1) : $facsMatch;
                // The content is serialized XML: strip the remaining tags first, then decode the
                // entities so that the result is plain text ("&amp;" becomes "&").
                $facs[$facsId] = trim(html_entity_decode(strip_tags($match[2]), ENT_QUOTES | ENT_XML1, 'UTF-8'));
            }
        }

        if (!array_key_exists($this->pageId, $facs)) {
            $this->log('debug', 'The page break attribute "facs" with the page identifier postfix "' . $this->pageId . '" could not be found in the TEI document');
            return '';
        }

        return $facs[$this->pageId];
    }

    /**
     * This extracts the fulltext data from TEI XML and returns it in MiniOCR format
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The unformatted fulltext in MiniOCR format or an empty string if no text was found
     */
    public function getTextAsMiniOcr(\SimpleXMLElement $xml): string
    {
        $rawText = $this->getRawText($xml);

        if ($rawText === '') {
            return '';
        }

        $miniOcr = new SimpleXMLElement("<ocr></ocr>");
        // addChild() does not escape ampersands, so the plain text is escaped explicitly.
        $miniOcr->addChild('b', htmlspecialchars($rawText, ENT_XML1 | ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8'));
        $miniOcrXml = $miniOcr->asXML();

        return \is_string($miniOcrXml) ? $miniOcrXml : '';
    }

    /**
     * This registers the TEI namespace prefix "TEI" for XPath queries on the given XML
     *
     * @access private
     *
     * @param \SimpleXMLElement $xml The XML to register the namespace for
     *
     * @return bool True if the document declares the TEI namespace and the prefix was registered
     */
    private function registerTeiNamespace(\SimpleXMLElement $xml): bool
    {
        if (!in_array(self::TEI_NAMESPACE, $xml->getDocNamespaces(), true)) {
            return false;
        }

        return $xml->registerXPathNamespace('TEI', self::TEI_NAMESPACE);
    }

    /**
     * Write a log message if a logger has been injected
     *
     * @access private
     *
     * @param string $level The PSR-3 log level, e.g. "warning" or "debug"
     * @param string $message The message to log
     *
     * @return void
     */
    private function log(string $level, string $message): void
    {
        // The logger is set from outside via setLogger(); it may be missing, e.g. in unit tests.
        if ($this->logger !== null) {
            $this->logger->log($level, $message);
        }
    }
}

Documentation/User/Index.rst

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ User Manual
1616
:local:
1717
:depth: 2
1818

19-
2019
.. _indexing_documents:
2120

2221
Indexing Documents
@@ -545,3 +544,76 @@ With the command `kitodo:optimize` it is possible to hard commit documents to an
545544
Show each processed documents uid and location with timestamp and
546545
amount of processed/all documents.
547546
:Example:
547+
548+
549+
.. _indexing_fulltexts:
550+
551+
Indexing full texts
552+
===================
553+
554+
Full texts must be provided in the ``FULLTEXT`` file group within the METS. Kitodo.Presentation supports the ALTO and TEI formats for indexing full texts.
555+
556+
**ALTO**
557+
558+
Each ALTO file contains the full text of a single page of the document.
559+
560+
.. code-block:: xml
561+
<mets:fileGrp USE="FULLTEXT">
562+
<mets:file ID="..." MIMETYPE="text/xml">
563+
<mets:FLocat LOCTYPE="URL" xlink:href="https://www.example.com/example-alto-page-1.xml"/>
564+
</mets:file>
565+
<mets:file ID="..." MIMETYPE="text/xml">
566+
<mets:FLocat LOCTYPE="URL" xlink:href="https://www.example.com/example-alto-page-2.xml"/>
567+
</mets:file>
568+
...
569+
</mets:fileGrp>
570+
571+
**TEI**
572+
573+
A single TEI file contains the full text of the entire document.
574+
575+
.. code-block:: xml
576+
<mets:fileGrp USE="FULLTEXT">
577+
<mets:file ID="..." MIMETYPE="application/tei+xml">
578+
<mets:FLocat LOCTYPE="URL" xlink:href="https://www.example.com/example-tei.xml"/>
579+
</mets:file>
580+
</mets:fileGrp>
581+
582+
.. note::
583+
584+
The identifier of the ``facsimile`` tag (and thus the ``pb`` tag (page break) references) in the TEI must match the ``ID`` attribute of the ``mets:div`` with type ``page`` in the physical structMap of the METS. Otherwise, the pages cannot be mapped and will not be indexed.
585+
586+
587+
For indexing full texts, the formats need to be defined in the Data Formats or in the table ``tx_dlf_formats`` with the following settings.
588+
589+
.. t3-field-list-table::
590+
:header-rows: 1
591+
592+
- :Type:
593+
Format Name (e.g. in METS)
594+
:Root:
595+
Root Element
596+
:Namespace:
597+
Namespace URI
598+
:Class:
599+
Class Name
600+
601+
- :Type:
602+
ALTO
603+
:Root:
604+
alto
605+
:Namespace:
606+
http://www.loc.gov/standards/alto/ns-v2#
607+
:Class:
608+
``Kitodo\Dlf\Format\Alto``
609+
610+
- :Type:
611+
TEI
612+
:Root:
613+
TEI
614+
:Namespace:
615+
http://www.tei-c.org/ns/1.0
616+
:Class:
617+
``Kitodo\Dlf\Format\Tei``
618+
619+
After configuration, all full texts will be indexed when executing the commands of :ref:`indexing_documents`.

Tests/Fixtures/Format/tei.xml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<TEI xmlns="http://www.tei-c.org/ns/1.0">
3+
<teiHeader>
4+
</teiHeader>
5+
<facsimile>
6+
<graphic mimeType="image/jpeg" url="https://www.example.com/00000001.tif.original.jpg" id="f0001"/>
7+
<graphic mimeType="image/jpeg" url="https://www.example.com/00000002.tif.original.jpg" id="f0002"/>
8+
</facsimile>
9+
<text>
10+
<front>
11+
<titlePage id="uuid-82add175-7012-4a6d-bc13-a1a666acb769">
12+
<pb facs="#f0001" n=" - " corresp="https://www.example.com/0001"/>
13+
<p>
14+
<lb/>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
15+
16+
</p>
17+
</titlePage>
18+
</front>
19+
<body>
20+
<div id="uuid-cf72f6ba-61a0-41b3-ba9b-a6331b7a504b" n="1" rend="Content">
21+
<pb facs="#f0002" n=" - " corresp="https://www.example.com/0002"/>
22+
</div>
23+
<div id="uuid-45e92103-ecd2-46ab-aabd-ddc589a548d2" n="1" rend="Aenean commodo ligula eget dolor">
24+
<head>
25+
<lb/>
26+
Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim.
27+
</head>
28+
</div>
29+
</body>
30+
<back/>
31+
</text>
32+
</TEI>

0 commit comments

Comments
 (0)