|
| 1 | +<?php |
| 2 | + |
| 3 | +/** |
| 4 | + * (c) Kitodo. Key to digital objects e.V. <contact@kitodo.org> |
| 5 | + * |
| 6 | + * This file is part of the Kitodo and TYPO3 projects. |
| 7 | + * |
| 8 | + * @license GNU General Public License version 3 or later. |
| 9 | + * For the full copyright and license information, please read the |
| 10 | + * LICENSE.txt file that was distributed with this source code. |
| 11 | + */ |
| 12 | + |
| 13 | +namespace Kitodo\Dlf\Format; |
| 14 | + |
| 15 | +use Kitodo\Dlf\Common\FulltextInterface; |
| 16 | +use Psr\Log\LoggerAwareInterface; |
| 17 | +use Psr\Log\LoggerAwareTrait; |
| 18 | +use SimpleXMLElement; |
| 19 | + |
/**
 * Fulltext TEI format class for the 'dlf' extension
 *
 * Extracts the text of a single page from a TEI document. Pages are delimited
 * by <pb/> (page beginning) milestones; the page to extract is selected by
 * comparing the configured page ID with the "facs" attribute of these
 * milestones (a single leading "#" in the attribute value is ignored).
 *
 * @package TYPO3
 * @subpackage dlf
 *
 * @access public
 */
class Tei implements FulltextInterface, LoggerAwareInterface
{
    use LoggerAwareTrait;

    /**
     * Namespace URI of TEI documents.
     */
    private const TEI_NAMESPACE = 'http://www.tei-c.org/ns/1.0';

    /**
     * Identifier of the page whose text should be extracted.
     */
    private string $pageId = '';

    /**
     * Sets the identifier of the page whose text should be extracted.
     *
     * @access public
     *
     * @param string $pageId The identifier as used in the "facs" attribute of <pb/> (without leading "#")
     *
     * @return void
     */
    public function setPageId(string $pageId): void
    {
        $this->pageId = $pageId;
    }

    /**
     * This extracts the fulltext data of the configured page from TEI XML
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The raw unformatted fulltext of the page, or an empty string
     *                if no page ID is set, the document has no TEI <text> element
     *                or the page could not be found
     */
    public function getRawText(\SimpleXMLElement $xml): string
    {
        // Compare against '' explicitly: empty() would also reject the valid ID "0".
        if ($this->pageId === '') {
            $this->logger?->warning('Text could not be retrieved from TEI because the page ID is empty.');
            return '';
        }

        // Make the "TEI" prefix available to the XPath query below.
        $this->registerTeiNamespace($xml);

        // xpath() yields false/null on error and an empty array if nothing matches;
        // all of these must be caught before dereferencing the first result.
        $textElements = $xml->xpath('./TEI:text');
        if (empty($textElements)) {
            $this->logger?->warning('Text could not be retrieved from TEI because the document has no "text" element in the TEI namespace.');
            return '';
        }

        $contentXml = $textElements[0]->asXML();
        if (!\is_string($contentXml)) {
            return '';
        }

        $pages = $this->extractPageTexts($contentXml);

        if (!array_key_exists($this->pageId, $pages)) {
            $this->logger?->debug('The page break attribute "facs" with the page identifier postfix "' . $this->pageId . '" could not be found in the TEI document');
            return '';
        }

        return $pages[$this->pageId];
    }

    /**
     * This extracts the fulltext data from TEI XML and returns it in MiniOCR format
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The unformatted fulltext in MiniOCR format, or an empty string if there is no text
     */
    public function getTextAsMiniOcr(\SimpleXMLElement $xml): string
    {
        $rawText = $this->getRawText($xml);

        if ($rawText === '') {
            return '';
        }

        $miniOcr = new \SimpleXMLElement('<ocr></ocr>');
        // addChild() does not escape "&" in the value, so the plain text
        // has to be escaped before it is handed over.
        $miniOcr->addChild('b', htmlspecialchars($rawText, ENT_XML1 | ENT_NOQUOTES, 'UTF-8'));
        $miniOcrXml = $miniOcr->asXML();

        return \is_string($miniOcrXml) ? $miniOcrXml : '';
    }

    /**
     * Splits the serialized TEI <text> element at its <pb/> milestones.
     *
     * If several <pb/> share the same "facs" value, the last one wins.
     *
     * @access private
     *
     * @param string $contentXml The serialized <text> element
     *
     * @return array<string, string> The plain text of each page, keyed by the "facs" value without leading "#"
     */
    private function extractPageTexts(string $contentXml): array
    {
        // Patterns are applied in order: drop structural wrapper tags (keeping
        // their content), drop line breaks, then collapse runs of whitespace.
        $normalizedXml = preg_replace(
            [
                '/<\/?(?:body|front|div|head|titlePage)[^>]*>/u',
                '/<lb(?:\s[^>]*)?\/>/u',
                '/\s+/',
            ],
            ['', '', ' '],
            $contentXml
        );

        if (!\is_string($normalizedXml)) {
            // preg_replace() returns null on PCRE errors such as invalid UTF-8.
            $this->logger?->warning('Text could not be retrieved from TEI because the document could not be normalized.');
            return [];
        }

        // Capture the content between each <pb/> and the next <pb/> or the end of the string.
        $pattern = '/<pb[^>]*facs="([^"]+)"[^>]*\/>([\s\S]*?)(?=<pb[^>]*\/>|$)/u';
        if (!preg_match_all($pattern, $normalizedXml, $matches, PREG_SET_ORDER)) {
            return [];
        }

        $pages = [];
        foreach ($matches as [, $facsValue, $pageContent]) {
            $facsValue = trim($facsValue);
            $facsId = str_starts_with($facsValue, '#') ? substr($facsValue, 1) : $facsValue;

            // The content stems from serialized XML: strip the remaining tags
            // first, then decode entities (e.g. "&amp;") to get the actual text.
            $pages[$facsId] = trim(
                html_entity_decode(strip_tags($pageContent), ENT_QUOTES | ENT_XML1, 'UTF-8')
            );
        }

        return $pages;
    }

    /**
     * This registers the "TEI" XPath prefix for the TEI namespace
     *
     * The prefix is registered unconditionally: this is harmless for documents
     * which do not use the namespace, and it also covers documents which
     * declare the namespace on an element other than the root.
     *
     * @access private
     *
     * @param \SimpleXMLElement $xml The XML to register the namespace for
     *
     * @return void
     */
    private function registerTeiNamespace(\SimpleXMLElement $xml): void
    {
        $xml->registerXPathNamespace('TEI', self::TEI_NAMESPACE);
    }
}
0 commit comments