diff --git a/examples/discovery-search-app/package.json b/examples/discovery-search-app/package.json index 283978d2a..0e67e055f 100644 --- a/examples/discovery-search-app/package.json +++ b/examples/discovery-search-app/package.json @@ -28,6 +28,7 @@ "carbon-components": "^10.6.0", "carbon-components-react": "^7.7.0", "classnames": "^2.2.6", + "core-js": "^2.6.12", "cors": "^2.8.5", "dotenv": "^8.1.0", "express": "^4.17.1", diff --git a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx index b06e33612..342cce130 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx +++ b/packages/discovery-react-components/src/components/DocumentPreview/DocumentPreview.tsx @@ -8,6 +8,7 @@ import SimpleDocument from './components/SimpleDocument/SimpleDocument'; import withErrorBoundary, { WithErrorBoundaryProps } from 'utils/hoc/withErrorBoundary'; import { defaultMessages, Messages } from './messages'; import HtmlView from './components/HtmlView/HtmlView'; +import PdfViewerWithHighlight from './components/PdfViewerHighlight/PdfViewerWithHighlight'; import { isCsvFile, isJsonFile } from './utils/documentData'; const { ZOOM_IN, ZOOM_OUT } = PreviewToolbar; @@ -154,6 +155,7 @@ function PreviewDocument({ const ErrorBoundDocumentPreview: any = withErrorBoundary(DocumentPreview); ErrorBoundDocumentPreview.PreviewToolbar = PreviewToolbar; ErrorBoundDocumentPreview.PreviewDocument = PreviewDocument; +ErrorBoundDocumentPreview.PdfViewerWithHighlight = PdfViewerWithHighlight; export default ErrorBoundDocumentPreview; export { ErrorBoundDocumentPreview as DocumentPreview }; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts new file mode 100644 index 
000000000..9d62e2f10 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf.ts @@ -0,0 +1,2 @@ +export const document = + ''; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json new file mode 100644 index 000000000..4789d596d --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json @@ -0,0 +1,55 @@ +{ + "document_id": "feab8705259090b89fbcbb15942cb10d", + "result_metadata": { + "collection_id": "b6cdf1cd-902c-8ea3-0000-017d32224d8f" + }, + "enriched_text": [ + { + "entities": [ + { + "model_name": "natural_language_understanding", + "mentions": [ + { + "confidence": 0.9950965, + "location": { + "end": 2, + "begin": 0 + }, + "text": "最初" + } + ], + "text": "最初", + "type": "Ordinal" + } + ] + } + ], + "metadata": { + "parent_document_id": "feab8705259090b89fbcbb15942cb10d", + "customer_id": "IBMid-270001M55T" + }, + "extracted_metadata": { + "sha1": "4FF2B41ED7A77975ABB21D9E4025DF31335E6451", + "numPages": "1", + "filename": "DiscoComponents-ja-updated.pdf", + "file_type": "pdf", + "text_mappings": 
"{\"text_mappings\":[{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,87.82411193847656,400.4930725097656,194.260009765625]},\"field\":{\"name\":\"title\",\"index\":0,\"span\":[0,20]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,411.83612060546875,262.9510192871094,425.62003993988037]},\"field\":{\"name\":\"subtitle\",\"index\":0,\"span\":[0,19]}},{\"page\":{\"page_number\":1,\"bbox\":[268.46466064453125,416.1183776855469,325.5726318359375,425.375319480896]},\"field\":{\"name\":\"subtitle\",\"index\":1,\"span\":[0,3]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,644.3582763671875,313.07745361328125,653.6152181625366]},\"field\":{\"name\":\"subtitle\",\"index\":2,\"span\":[0,15]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,456.12786865234375,95.6172866821289,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[0,4]}},{\"page\":{\"page_number\":1,\"bbox\":[100.0745620727539,452.9471435546875,257.0570983886719,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[4,27]}},{\"page\":{\"page_number\":1,\"bbox\":[261.5120849609375,452.9471435546875,408.1592712402344,463.0600233078003]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[27,49]}},{\"page\":{\"page_number\":1,\"bbox\":[412.5315856933594,456.12786865234375,464.3571472167969,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[49,54]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,452.9471435546875,534.0211791992188,596.2600049972534]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[54,234]}},{\"page\":{\"page_number\":1,\"bbox\":[54.519996643066406,679.4979858398438,535.1033325195312,723.2200269699097]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[234,353]}}],\"pages\":[{\"page_number\":0,\"height\":842.0,\"width\":595.0,\"origin\":\"TopLeft\"}]}", + "title": "Discovery Component README Japanese", + "publicationdate": "2021-11-18" + }, + "subtitle": ["Discovery Component", "の使用", 
"サンプルアプリケーションの実行"], + "html": "Discovery Component README Japanese

Discovery Components

Discovery Component

の使用

最初に

IBM Watson Discovery の

Improve and Customize

ページで

Document retrieval プロジェクトをカスタマイズする必要があります。たとえばファセットや検索 バーや検索結果を設定できます。その後 Discovery component を使ったアプリケ ーションを作成します。アプリケーションは指定したプロジェクトの設定をロードしま す。 必要なソフトウェア: git, nvm, yarn または npm

サンプルアプリケーションの実行

• サンプルアプリケーションはこのライブラリーが提供するコアコンポーネントのカタログです。実際のデ ータを使ってコンポーネントがどのように動くかを簡単に見ることができます。コードを変更して、カスタ マイズする方法を確認することもできます。

", + "text": [ + "最初に IBM Watson Discovery の Improve and Customize ページで Document retrieval プロジェクトをカスタマイズする必要があります。たとえばファセットや検索 バーや検索結果を設定できます。その後 Discovery component を使ったアプリケ ーションを作成します。アプリケーションは指定したプロジェクトの設定をロードしま す。 必要なソフトウェア: git, nvm, yarn または npm • サンプルアプリケーションはこのライブラリーが提供するコアコンポーネントのカタログです。実際のデ ータを使ってコンポーネントがどのように動くかを簡単に見ることができます。コードを変更して、カスタ マイズする方法を確認することもできます。" + ], + "title": "Discovery Components", + "document_passages": [ + { + "passage_text": "Discovery Components", + "start_offset": 0, + "end_offset": 20, + "field": "title" + } + ], + "table_results_references": [] +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx new file mode 100644 index 000000000..326b33afa --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerHighlight.tsx @@ -0,0 +1,131 @@ +import React, { FC, useMemo, useEffect } from 'react'; +import cx from 'classnames'; +import { settings } from 'carbon-components'; +import { QueryResult } from 'ibm-watson/discovery/v2'; +import { ProcessedDoc } from 'utils/document'; +import { TextMappings } from '../../types'; +import { PdfDisplayProps } from '../PdfViewer/types'; +import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; +import { ExtractedDocumentInfo } from './utils/common/documentUtils'; +import { Highlighter } from './utils/Highlighter'; +import { HighlightProps } from './types'; + +type Props = PdfDisplayProps & + HighlightProps & { + /** + * Class name to style highlight layer + */ + className?: string; + + /** + * Parsed document information + */ + parsedDocument: ExtractedDocumentInfo | null; + + /** + * PDF text content information in a page from parsed PDF + */ + pdfRenderedText: PdfRenderedText | null; + }; + +/** + * Text highlight layer for 
PdfViewer + */ +const PdfViewerHighlight: FC = ({ + className, + highlightClassName, + document, + parsedDocument, + page, + highlights, + pdfRenderedText, + scale, + _useHtmlBbox = true, + _usePdfTextItem = true +}) => { /* highlighter stays null until parsedDocument provides textMappings */ + const highlighter = useHighlighter({ + document, + textMappings: parsedDocument?.textMappings, + processedDoc: _useHtmlBbox ? parsedDocument?.processedDoc : undefined, + pdfRenderedText: (_usePdfTextItem && pdfRenderedText) || undefined, + pageNum: page + }); + /* keep the highlighter's text content elements in sync with the rendered text layer */ + const { textDivs } = pdfRenderedText || {}; + useEffect(() => { + if (highlighter) { + highlighter.setTextContentDivs(textDivs); + } + }, [highlighter, textDivs]); + /* one shape per entry of highlights; entries are undefined while there is no highlighter */ + const highlightBoxes = useMemo(() => { + return highlights.map(highlight => { + return highlighter?.getHighlight(highlight); + }); + }, [highlighter, highlights]); + + return ( +
+ {highlightBoxes.map((hl, hlIndex) => { + return ( + + {hl?.boxes.map((item, index) => { + const padding = 0; + const [left, top, right, bottom] = item.bbox; + return ( +
+ ); + })} + + ); + })} +
+ ); +}; +/** Memoized Highlighter for the given page: null without textMappings; htmlBboxInfo is passed only when processedDoc is set, and pdfTextContentInfo only when pdfRenderedText has both textContent and viewport. */ +const useHighlighter = ({ + document, + textMappings, + processedDoc, + pdfRenderedText, + pageNum +}: { + document: QueryResult; + textMappings?: TextMappings; + processedDoc?: ProcessedDoc; + pdfRenderedText?: PdfRenderedText; + pageNum: number; +}) => { + return useMemo(() => { + if (textMappings) { + return new Highlighter({ + document, + textMappings, + pageNum, + htmlBboxInfo: processedDoc && { + bboxes: processedDoc.bboxes, + styles: processedDoc.styles + }, + pdfTextContentInfo: + pdfRenderedText?.textContent && pdfRenderedText?.viewport ? pdfRenderedText : undefined + }); + } + return null; + }, [document, pageNum, pdfRenderedText, processedDoc, textMappings]); +}; + +export default PdfViewerHighlight; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss new file mode 100644 index 000000000..5703cca25 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.scss @@ -0,0 +1,28 @@ +// Carbon highlight color for white theme +// https://www.carbondesignsystem.com/guidelines/color/usage/ +$highlight: #d0e2ff; + +.withTextSelection { + display: flex; + + .rightPane { + flex: 1 1 30%; + height: 100vh; + overflow-y: scroll; + + p { + margin-bottom: 0.5rem; + } + } + .text { + overflow-wrap: break-word; + white-space: pre-wrap; + font-size: 10pt; + font-family: 'Courier New', Courier, monospace; + } + + .highlight { + opacity: 0.3; + background: darken($highlight, 30%); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx new file mode 100644 index 000000000..fe2dde415 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.stories.tsx @@ -0,0 +1,208 @@ +import React, { useCallback, useMemo, useRef, useState } from 'react'; +import { storiesOf } from '@storybook/react'; +import { withKnobs, radios, number } from '@storybook/addon-knobs'; +import { action } from '@storybook/addon-actions'; +import PdfViewerWithHighlight from './PdfViewerWithHighlight'; +import { flatten } from 'lodash'; +import { DocumentFieldHighlight } from './types'; +import './PdfViewerWithHighlight.stories.scss'; + +import { document as doc } from 'components/DocumentPreview/__fixtures__/Art Effects.pdf'; +import document from 'components/DocumentPreview/__fixtures__/Art Effects Koya Creative Base TSA 2008.pdf.json'; + +import { document as docJa } from 'components/DocumentPreview/__fixtures__/DiscoComponent-ja.pdf'; +import documentJa from 'components/DocumentPreview/__fixtures__/DiscoComponents-ja_document.json'; + +import PDFJS from 'pdfjs-dist'; +import { getDocFieldValue } from './utils/common/documentUtils'; +(PDFJS as any).cMapUrl = './node_modules/pdfjs-dist/cmaps/'; +(PDFJS as any).cMapPacked = true; + +const pageKnob = { + label: 'Page', + options: { + range: true, + min: 1, + max: 8, + step: 1 + }, + defaultValue: 1 +}; + +const zoomKnob = { + label: 'Zoom', + options: { + 'Zoom out (50%)': '0.5', + 'Default (100%)': '1', + 'Zoom in (150%)': '1.5' + }, + defaultValue: '1' +}; + +const EMPTY: never[] = []; +/** Story component: tracks a selected field, encoded as "<field>|||<index>", and turns a text selection on it into a single highlight. */ +const WithTextSelection: typeof PdfViewerWithHighlight = props => { + const [selectedField, setSelectedField] = useState('text|||0'); + const { document } = props; + + const handleOnChangeField = useCallback((e: React.ChangeEvent) => { + setSelectedField(e.target.value); + }, []); +
const [selectedFieldName, selectedFieldIndex] = useMemo(() => { + const [n, i] = selectedField?.split('|||') || []; + return [n, Number(i)]; + }, [selectedField]); + const fieldOptions = useMemo(() => { + const fields = Object.keys(document).filter(field => { + return !field.match(/^(document_id|extracted_|enriched_)/) && document[field]?.length > 0; + }); + /* one option per string value of each remaining field */ + return flatten( + fields.map(field => { + const documentFields = Array.isArray(document[field]) ? document[field] : [document[field]]; + return documentFields + .map((content: any, index: number) => { + if (typeof content === 'string') { + return { + value: `${field}|||${index}`, + label: `${field}[${index}]` + }; + } + return null; + }) + .filter((x: any) => !!x); + }) + ); + }, [document]); + + // text selection & highlights + const [highlights, setHighlights] = useState([]); + + const fieldTextNodeRef = useRef(null); /* a selection is ignored unless both of its ends sit directly inside this node */ + const getFieldTextSelection = () => { + const selection = window.getSelection(); + if (!fieldTextNodeRef.current) { + return null; + } + if (!selection || selection.rangeCount < 1 || selection.isCollapsed) { + return null; + } + + const { anchorNode, focusNode, anchorOffset, focusOffset } = selection; + const anchorParentNode = anchorNode?.parentNode as HTMLElement; + const focusParentNode = focusNode?.parentNode as HTMLElement; + if ( + anchorParentNode !== fieldTextNodeRef.current || + focusParentNode !== fieldTextNodeRef.current + ) { + return null; + } + + const text = selection.toString(); + return { text, begin: anchorOffset, end: focusOffset }; + }; + const handleOnMouseUp = (_: MouseEvent) => { + const textSelection = getFieldTextSelection(); + if (!textSelection) { + return; + } + + const { begin, end } = textSelection; + const fieldText = getDocFieldValue(document, selectedFieldName, selectedFieldIndex); + /* anchor and focus offsets come in selection order, so they are sorted for the location */ + const highlight: DocumentFieldHighlight = { + field: selectedFieldName, + fieldIndex: selectedFieldIndex, + location: { begin: Math.min(begin, end), end: Math.max(begin, end) 
}, + text: fieldText?.substring(begin, end) + } as DocumentFieldHighlight; + setHighlights([highlight]); + }; + + return ( +
+ +
+
+ +
+

+ {/* eslint-disable-next-line jsx-a11y/no-onchange*/} + +

+
Select text to highlight
+ {/* eslint-disable-next-line jsx-a11y/no-noninteractive-element-interactions */} +

+ {selectedField && + getDocFieldValue(document, selectedFieldName, selectedFieldIndex)! + .replace(/ /g, '\u00a0') // NBSP + .replace(/\n/g, '\\n')} +

+
+
+ ); +}; +/* stories: default viewer, viewer with text selection, viewer with a Japanese PDF */ +storiesOf('DocumentPreview/components/PdfViewerWithHighlight', module) + .addDecorator(withKnobs) + .add('default', () => { + const page = number(pageKnob.label, pageKnob.defaultValue, pageKnob.options); + const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); + const scale = parseFloat(zoom); + const setLoadingAction = action('setLoading'); + + return ( + + ); + }) + .add('with text selection', () => { + const page = number(pageKnob.label, pageKnob.defaultValue, pageKnob.options); + const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); + const scale = parseFloat(zoom); + const setLoadingAction = action('setLoading'); + + return ( + + ); + }) + .add('with PDF in Japanese', () => { + const page = number(pageKnob.label, pageKnob.defaultValue, pageKnob.options); + const zoom = radios(zoomKnob.label, zoomKnob.options, zoomKnob.defaultValue); + const scale = parseFloat(zoom); + const setLoadingAction = action('setLoading'); + + return ( + + ); + }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx new file mode 100644 index 000000000..e706cb321 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/PdfViewerWithHighlight.tsx @@ -0,0 +1,47 @@ +import React, { FC, useState, useCallback } from 'react'; +import useAsyncFunctionCall from 'utils/useAsyncFunctionCall'; +import PdfViewer, { PdfViewerProps } from '../PdfViewer/PdfViewer'; +import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; +import PdfViewerHighlight from './PdfViewerHighlight'; +import { extractDocumentInfo } from './utils/common/documentUtils'; +import { HighlightProps } from './types'; + +type Props = PdfViewerProps & HighlightProps; + +/** + * PDF viewer component 
with text highlighting capability + */ +const PdfViewerWithHighlight: FC = ({ + highlightClassName, + document, + highlights, + _useHtmlBbox, + _usePdfTextItem, + ...rest +}) => { + const { page, scale } = rest; + const [renderedText, setRenderedText] = useState(null); + + const documentInfo = useAsyncFunctionCall( + useCallback(async () => await extractDocumentInfo(document), [document]) + ); + + const highlightReady = !!documentInfo && !!renderedText; + return ( + + + + ); +}; + +export default PdfViewerWithHighlight; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/README.md b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/README.md new file mode 100644 index 000000000..daec4a7a5 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/README.md @@ -0,0 +1,45 @@ +## How highlighting works + +### TextLayout + +`TextLayout` describes what text is placed where in a page. A `TextLayout` has multiple `TextLayoutCell`s. Each cell says that a particular text is rendered in a particular boundary box. + +So, `extracted_metadata.text_mappings` is a kind of `TextLayout` because it binds text to boundary boxes. `bbox`es stored in the `html` field can also be a `TextLayout`. Text objects in a PDF page (`TextContentItem`s in the `pdfjs-dist` npm package) can also be a `TextLayout`. + +Depending on its source, each type of text layout has its own granularity, i.e. the text length and the size of the boundary box in a `TextLayoutCell` differ. For example, a cell from `text_mappings` typically has longer text (sometimes a whole paragraph) and a large boundary box. A cell from a PDF text content item has shorter text (say, a word or a short phrase) and a small boundary box. + +For highlighting, smaller boundary boxes produce a more accurate highlight boundary box. 
+ +### Find smaller text layout cell using `TextBoxMappings` + +So, we build mappings from larger cells to smaller cells. In more detail, each mapping goes from a span on the text of a larger cell to a span on the text of a smaller cell. + +To find a highlight boundary box, we typically start with cells from `text_mappings` because we can find a cell and a span on it from a span on a field. Then, we use the mappings to find smaller cells, which are typically from PDF text content items. + +However, calculation of the mapping is not straightforward. A smaller cell can overlap with two or more larger cells. The order of smaller cells may not be the same as the order of the text in a larger cell. This makes it hard to find a smaller cell from a span on the text in a larger cell. `getTextBoxMappings` and its helpers `TextNormalizer`, `TextProvider`, `CellProvider` are used to calculate a good mapping even in these situations. + +#### How to build mappings + +`CellProvider` denotes a fine-grained text layout. It provides small text layout cells with their text. `MappingTargetBoxProvider` wraps `CellProvider` mainly for normalizing text. Normalization is important because the text in the original PDF can be refined in the field text. For example, two consecutive spaces are normalized to one, and quotation marks can be normalized. + +`TextProvider` provides text from coarse-grained text layout cells. Users can consume spans on the text (i.e. mark the text span as used) and the class manages the text which is yet to be consumed. The class can find a `match` for a given text in the remaining text and returns the score of the match. `MappingSourceTextProvider` wraps `TextProvider` for text normalization. + +With these classes, `getTextBoxMappings` builds mappings as follows: + +1. Load text from `CellProvider`. It may span multiple text layout cells +2. Find a match in `TextProvider`, and then consume the matched text +3. For each text layout cell in the matched text, + 1. associate the text layout cell and a span on the matched text + 2. 
mark the text layout cell as consumed + +### Text layout cell to boundary box + +Now, we have small cells for highlighting. + +Even with a small cell, the text to highlight may be only a span on the cell text. In that case, we have to calculate the boundary box for the span. By default, cells approximate the boundary box by assigning width evenly to every character in the cell text. + +Some `TextLayoutCell`s have the capability of calculating the boundary box for a sub-span of their text. For example, cells for PDF text items (`PdfTextContentTextLayoutCell`) can calculate boundary boxes for given text spans. They internally use the DOM and its `getBoundingClientRect` to get the result. + +### Highlighter + +`Highlighter` manages the available information about a document and a page, and calculates boundary boxes for given spans on fields. diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts new file mode 100644 index 000000000..a6936c80b --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/types.ts @@ -0,0 +1,69 @@ +import { Bbox as DocPreviewBbox, TextSpan as DocPreviewTextSpan } from '../../types'; +import { Location } from 'utils/document/processDoc'; +import { QueryResult } from 'ibm-watson/discovery/v2'; + +// (re-)export useful types +export type Bbox = DocPreviewBbox; +export type TextSpan = DocPreviewTextSpan; + +/** + * A document. 
Same as QueryResult, but focused on document fields + */ +export type DocumentFields = { [fieldName: string]: string[] | undefined }; + +/** + * Highlight on a document field + */ +export type DocumentFieldHighlight = { + field: string; + fieldIndex: number; + location: Location; + className?: string; +}; + +/** + * Highlight shape on a page, which consists of boundary boxes + */ +export interface HighlightShape { + boxes: HighlightShapeBox[]; + className?: string; +} + +/** + * Boundary box for a highlight + */ +export interface HighlightShapeBox { + bbox: Bbox; + dir?: string; // e.g. ltr, rtl. ltr by default + isStart?: boolean; + isEnd?: boolean; +} + +export interface HighlightProps { + /** + * Class name to style each highlight + */ + highlightClassName?: string; + + /** + * Document data returned by query + */ + document: QueryResult; + + /** + * Highlight spans on fields in document + */ + highlights: DocumentFieldHighlight[]; + + /** + * Consider bboxes in the HTML field for highlighting. + * True by default. This is for testing purposes. + */ + _useHtmlBbox?: boolean; + + /** + * Flag indicating whether to use PDF text items for finding bbox for highlighting. + * True by default. This is for testing and debugging purposes. 
+ */ + _usePdfTextItem?: boolean; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts new file mode 100644 index 000000000..0dc14f75b --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/Highlighter.ts @@ -0,0 +1,195 @@ +import { TextMappings } from 'components/DocumentPreview/types'; +import flatMap from 'lodash/flatMap'; +import { PDFPageViewport, TextContent } from 'pdfjs-dist'; +import { nonEmpty } from 'utils/nonEmpty'; +import { + DocumentFields, + DocumentFieldHighlight, + HighlightShape, + HighlightShapeBox +} from '../types'; +import { spanOffset, START } from '../../../utils/textSpan'; +import { getTextBoxMappings } from './textBoxMapping'; +import { TextBoxMapping, TextBoxMappingResult } from './textBoxMapping/types'; +import { HtmlBboxTextLayout, PdfTextContentTextLayout, TextMappingsTextLayout } from './textLayout'; +import { HtmlBboxInfo, TextLayout, TextLayoutCell } from './textLayout/types'; + +const debugOut = require('debug')?.('pdf:Highlighter'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +/** + * Highlighter - calculate highlight bbox from spans on text fields + */ +export class Highlighter { + readonly pageNum: number; + private readonly textMappingsLayout: TextMappingsTextLayout; + private pdfTextContentLayout: PdfTextContentTextLayout | null = null; + private textToHtmlBboxMappings: TextBoxMapping | null = null; + private textToPdfTextItemMappings: TextBoxMapping | null = null; + + constructor({ + document, + textMappings, + pageNum, + htmlBboxInfo, + pdfTextContentInfo + }: { + document: DocumentFields; + textMappings: TextMappings; + pageNum: number; + htmlBboxInfo?: HtmlBboxInfo; + pdfTextContentInfo?: { + textContent: TextContent; + viewport: 
PDFPageViewport; + spans?: HTMLElement[]; + }; + }) { + this.pageNum = pageNum; + this.textMappingsLayout = new TextMappingsTextLayout({ document, textMappings }, pageNum); + if (htmlBboxInfo) { + this.setHtmlBboxInfo(htmlBboxInfo); + } + if (pdfTextContentInfo) { + this.setTextContentItems( + pdfTextContentInfo.textContent, + pdfTextContentInfo.viewport, + pdfTextContentInfo.spans, + htmlBboxInfo + ); + } + } + + /** + * Update highlighter with bboxes in HTML field in document + * @param htmlBoxInfo processed document info including bboxes + */ + setHtmlBboxInfo(htmlBoxInfo: HtmlBboxInfo) { + const htmlLayout = new HtmlBboxTextLayout(htmlBoxInfo, this.pageNum); + this.textToHtmlBboxMappings = getTextBoxMappings(this.textMappingsLayout, htmlLayout); + } + + /** + * Update highlighter with PDF text content + * @param textContent PDF text content of the current page + * @param viewport viewport of the currently rendered PDF page + * @param textContentDivs HTML elements where text content items are rendered + * @param htmlBoxInfo processed document info including bboxes + */ + setTextContentItems( + textContent: TextContent, + viewport: PDFPageViewport, + textContentDivs?: HTMLElement[], + htmlBoxInfo?: HtmlBboxInfo + ) { + this.pdfTextContentLayout = new PdfTextContentTextLayout( + { textContent, viewport }, + this.pageNum, + htmlBoxInfo + ); + this.textToPdfTextItemMappings = getTextBoxMappings( + this.textMappingsLayout, + this.pdfTextContentLayout + ); + this.setTextContentDivs(textContentDivs); + } + + /** + * Update text content HTML elements + * @param textContentDivs HTML elements where text content items are rendered + */ + setTextContentDivs(textContentDivs?: HTMLElement[]) { + this.pdfTextContentLayout?.setDivs(textContentDivs); + } + + /** + * Get highlight shape from a span on a field + * @param highlight a span on a document field to highlight + * @returns highlight shape + */ + getHighlight( + highlight: T + ): HighlightShape & Omit { + 
debug('getHighlight: %o', highlight); + const { field, fieldIndex, location, className, ...rest } = highlight; + const items = this.getHighlightTextMappingResult({ field, fieldIndex, location }); + debug('getHighlight - items: %o', items); + + const boxShapes: HighlightShapeBox[] = items + .map((item, index) => { + const { cell: baseCell, span: baseSpan } = item.cell?.getNormalized() || {}; + if (baseCell) { + let bbox = baseCell.bbox; + if (baseSpan) { + // try the bbox the cell reports for the span first, then the `useRatio` + // variant, and fall back to the bbox of the whole cell + bbox = + baseCell.getBboxForTextSpan(baseSpan) || + baseCell.getBboxForTextSpan(baseSpan, { useRatio: true }) || + baseCell.bbox; + } + debug('getHighlight - cell(%i): %o', index, item.cell); + debug(' box: %o', bbox); + return { + bbox, + isStart: index === 0, + isEnd: index === items.length - 1 + }; + } + debug('getHighlight - cell(%i) is not mapped. source span: %o', index, item.sourceSpan); + return null; + }) + .filter(nonEmpty); + return { + boxes: boxShapes, + className, + ...rest + }; + } + + /** + * Get text layout cells from a span on a field + * @param highlight a span on a document field to highlight + * @returns TextLayoutCells representing the given highlight + */ + private getHighlightTextMappingResult(highlight: DocumentFieldHighlight): TextBoxMappingResult { + let items = this.textMappingsLayout.getHighlight(highlight); + + // Replace every item whose normalized base cell belongs to `parent` with the items + // `textBoxMapping` yields for it, passing their source spans through `spanOffset` with + // the item's source span start. Other items pass through; items without a cell are dropped. + const doMapping = ( + items: TextBoxMappingResult, + textBoxMapping: TextBoxMapping, + parent: TextLayout + ) => + flatMap(items, item => { + if (item.cell) { + const { cell: baseCell } = item.cell.getNormalized(); + if (baseCell.parent === parent) { + const newItems = textBoxMapping.apply(item.cell); + return newItems.map(({ cell, sourceSpan }) => { + return { + cell, + sourceSpan: spanOffset(sourceSpan, item.sourceSpan[START]) + }; + }); + } + return item; + } + return []; + }); + + // PDF text item mappings are applied first; the HTML bbox mappings then only touch + // items whose base cell is still on the text mappings layout + const { textToPdfTextItemMappings, textToHtmlBboxMappings } = this; + if (textToPdfTextItemMappings) { + items = doMapping(items, textToPdfTextItemMappings, this.textMappingsLayout); + } + if 
(textToHtmlBboxMappings) { + items = doMapping(items, textToHtmlBboxMappings, this.textMappingsLayout); + } + return items; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts new file mode 100644 index 000000000..3fe8387b8 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/TextNormalizer.ts @@ -0,0 +1,324 @@ +import { TextSpan } from '../../types'; +import { END, spanLen, START } from '../../../../utils/textSpan'; + +type SpanMapping = { rawSpan: TextSpan; normalizedSpan: TextSpan }; + +type CharNormalizer = { + /** + * Get normalized character of the original string + */ + normal: (original: string) => string; + /** + * String representation regex that matches to characters to be normalized + */ + regexString: string; +}; + +const SPACES: CharNormalizer = { + normal: (_: string) => ' ', + regexString: '\\s+' +}; + +const DOUBLE_QUOTE: CharNormalizer = { + normal: (_: string) => '"', + regexString: `[${[ + '«', // U+00AB + '»', // U+00BB + '“', // U+201C + '”', // U+201D + '„', // U+201E + '‟', // U+201F + '❝', // U+275D + '❞', // U+275E + '⹂', // U+2E42 + '〝', // U+301D + '〞', // U+301E + '〟', // U+301F + '\uFF02' // U+FF02 + ].join('')}]` +}; + +const QUOTE: CharNormalizer = { + normal: (_: string) => "'", + regexString: `[${[ + '‹', // U+2039 + '›', // U+203A + '’', // U+2019 + '❮', // U+276E + '❯', // U+276F + '‘', // U+2018 + '‚', // U+201A + '‛', // U+201B + '❛', // U+275B + '❜', // U+275C + '❟' // U+275F + ].join('')}]` +}; + +// handle a character that is encoded as a surrogate pair +// in Javascript string (i.e. 
UTF-16), whose length is 2 +// as a single character +const SURROGATE_PAIR: CharNormalizer = { + normal: (_: string) => '_', + regexString: '[\uD800-\uDBFF][\uDC00-\uDFFF]' +}; + +// remove "Combining Diacritical Marks" from the string +// NOTE: we may have to do this after conversion again +// str.normalize("NFD").replace(/[\u0300-\u036f]/g, "") +const DIACRITICAL_MARK: CharNormalizer = { + normal: (_: string) => '', + regexString: '[\u0300-\u036f]' +}; +const DIACRITICAL_MARK_REGEX = new RegExp(DIACRITICAL_MARK.regexString, 'g'); +/** Strip combining marks U+0300-U+036F: decompose with NFD, remove the marks, recompose with NFC. With keepLength, a result of different length is truncated or space-padded to text.length. */ +function normalizeDiacriticalMarks(text: string, keepLength = false): string { + const r = text + .normalize('NFD') + .replace(DIACRITICAL_MARK_REGEX, DIACRITICAL_MARK.normal) + .normalize('NFC'); + if (keepLength && r.length !== text.length) { + // + // String.normalize may change length of a string. `keepLength` flag keeps string + // length after conversion by padding or truncating a string. + // + return r.substring(0, text.length).padEnd(text.length, ' '); + } + return r; +} + +const NORMALIZATIONS = [SPACES, DOUBLE_QUOTE, QUOTE, SURROGATE_PAIR, DIACRITICAL_MARK].map(n => ({ + ...n, + regex: new RegExp(n.regexString, 'g') +})); + +// regex to match all the chars to normalize. +// the regex is: /(\s+)|(["""])|(['''])|([\u8D..FF])|([\u03..6f])/g +const NORMALIZATIONS_REGEX = new RegExp( + NORMALIZATIONS.map(n => `(${n.regexString})`).join('|'), + 'g' +); + +/** + * Normalize the following in text: + * - two or more consecutive spaces to a single space + * - variants of single quote to `'` + * - variants of double quote to `"` + * - surrogate pairs to a single character `_` + * - remove diacritical marks (accent) from characters + * + * This is used for preprocessing to compare texts to ignore minor + * text differences. 
/**
 * Text normalizer with mapping between spans on original and normalized text
 *
 * Normalize the following in a text:
 * - two or more consecutive spaces to a single space
 * - variants of single quote to `'`
 * - variants of double quote to `"`
 * - surrogate pairs to a single character `_`
 * - remove diacritical marks (accent) from characters
 */
export class TextNormalizer {
  readonly rawText: string;
  readonly normalizedText: string;
  private readonly normalizationMappings: SpanMapping[];

  /**
   * Normalize `rawText` and record, segment by segment, which span of the raw
   * text produced which span of the normalized text.
   * @param rawText text to be normalized
   */
  constructor(rawText: string) {
    this.rawText = rawText;

    let normalizedText = '';
    const normalizationMappings: SpanMapping[] = [];

    // Append `text` to the normalized output and record that it originates from `rawSpan`.
    // `normalizeDiacriticalMarks(..., true)` keeps the length of `text`, so the span
    // computed before appending stays valid.
    const append = (rawSpan: TextSpan, text: string) => {
      const normalizedSpan: TextSpan = [normalizedText.length, normalizedText.length + text.length];
      normalizationMappings.push({ rawSpan, normalizedSpan });
      normalizedText += normalizeDiacriticalMarks(text, true);
    };

    // The regex is a shared module-level object with the `g` flag: make sure
    // scanning starts from the beginning regardless of any earlier use.
    const re = NORMALIZATIONS_REGEX;
    re.lastIndex = 0;

    let cur = 0; // raw index up to which text has already been emitted
    for (let match = re.exec(rawText); match != null; match = re.exec(rawText)) {
      const matched = match[0];

      // capture group (i + 1) of the combined regex corresponds to NORMALIZATIONS[i]
      let normalized = matched;
      for (let i = 0; i < match.length - 1; i += 1) {
        if (match[i + 1] != null) {
          normalized = NORMALIZATIONS[i].normal(matched);
          break;
        }
      }

      if (normalized === matched) {
        // nothing to change (e.g. a single space); it is emitted later together
        // with the surrounding unchanged text
        continue;
      }

      if (match.index > cur) {
        // unchanged text between the previous emission and this match
        append([cur, match.index], rawText.substring(cur, match.index));
      }
      append([match.index, match.index + matched.length], normalized);
      cur = re.lastIndex;
    }

    if (cur < rawText.length) {
      // trailing unchanged text
      append([cur, rawText.length], rawText.substring(cur));
    }

    this.normalizedText = normalizedText;
    this.normalizationMappings = optimizeSpanMappings(normalizationMappings);
  }

  /**
   * Convert a span on original text to a span on normalized text
   * @param rawSpan span on original text
   * @returns span on normalized text
   */
  toNormalized(rawSpan: TextSpan): TextSpan {
    const [rawBegin, rawEnd] = rawSpan;
    return [
      this.mapIndex(rawBegin, 'rawSpan', 'normalizedSpan'),
      this.mapIndex(rawEnd, 'rawSpan', 'normalizedSpan')
    ];
  }

  /**
   * Convert a span on normalized text to a span on original text
   * @param normalizedSpan span on normalized text
   * @returns span on original text
   */
  toRaw(normalizedSpan: TextSpan): TextSpan {
    const [normalizedBegin, normalizedEnd] = normalizedSpan;
    return [
      this.mapIndex(normalizedBegin, 'normalizedSpan', 'rawSpan'),
      this.mapIndex(normalizedEnd, 'normalizedSpan', 'rawSpan')
    ];
  }

  /**
   * Normalize a text. @see TextNormalizer for the details of the normalization
   * @param text text to be normalized
   * @returns normalized text
   */
  normalize(text: string): string {
    return normalizeText(text);
  }

  /**
   * Check whether a given text is blank or not
   * @param text text to be tested
   * @returns `true` when the text is empty or only contains white spaces
   */
  isBlank(text: string): boolean {
    return text.trim().length === 0;
  }

  /**
   * Map a character index from one side of the span mappings to the other.
   * - Negative indices are returned unchanged.
   * - When there are no mappings (empty raw text) the index is returned unchanged
   *   instead of dereferencing a non-existent last mapping.
   * - Indices at or beyond the end of the last mapping are shifted by the offset
   *   between the ends of that mapping.
   * @param index char index on the `fromKey` side
   * @param fromKey side of the mapping `index` refers to
   * @param toKey side of the mapping to convert to
   * @returns char index on the `toKey` side
   */
  private mapIndex(index: number, fromKey: keyof SpanMapping, toKey: keyof SpanMapping): number {
    const mappings = this.normalizationMappings;
    if (index < 0 || mappings.length === 0) {
      return index;
    }
    const mapping = mappings.find(m => index < m[fromKey][END]);
    if (mapping) {
      return mapCharIndexOnSpans(index, { from: mapping[fromKey], to: mapping[toKey] });
    }
    const last = mappings[mappings.length - 1];
    return index - last[fromKey][END] + last[toKey][END];
  }
}
sameLength = (mapping: SpanMapping) => + spanLen(mapping.normalizedSpan) === spanLen(mapping.rawSpan); + const isShifted = (a: SpanMapping, b: SpanMapping) => + b.normalizedSpan[START] - a.normalizedSpan[START] === b.rawSpan[START] - a.rawSpan[START]; + + return mappings.reduce((acc, mapping) => { + const lastMapping = acc.length > 0 ? acc[acc.length - 1] : null; + if ( + sameLength(mapping) && + lastMapping && + sameLength(lastMapping) && + isShifted(lastMapping, mapping) + ) { + // merge mappings + lastMapping.normalizedSpan[END] = mapping.normalizedSpan[END]; + lastMapping.rawSpan[END] = mapping.rawSpan[END]; + return acc; + } + acc.push(mapping); + return acc; + }, [] as SpanMapping[]); +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts new file mode 100644 index 000000000..029b1d7c4 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/TextNormalizer.test.ts @@ -0,0 +1,142 @@ +import { TextSpan } from '../../../types'; +import { TextNormalizer } from '../TextNormalizer'; + +describe('TextNormalizer', () => { + it('should do nothing with text that does not have any chars to normalize', () => { + const fieldText = 'This is a sample text content.'; + const expectedNormalizedText = fieldText; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.rawText).toEqual(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.normalizationMappings).toHaveLength(1); + + let spans: TextSpan[] = [ + [0, 10], // start from beginning + [3, 10], // start from one before space + [4, 10], // start from space + [5, 10], // start from one character after space + [10, 20], // end at one char before 
space + [10, 21], // end at space + [10, 22], // end at one char after space, + [10, fieldText.length] + ]; + for (const span of spans) { + expect(matcher.toNormalized(span)).toEqual(span); + expect(matcher.toRaw(span)).toEqual(span); + } + }); + + it('should normalize text with one long blank', () => { + const fieldText = 'This is a sample text content.'; + const expectedNormalizedText = 'This is a sample text content.'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.rawText).toEqual(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.normalizationMappings).toHaveLength(3); + + // test begin + expect(matcher.toNormalized([0, 10])).toEqual([0, 7]); + expect(matcher.toNormalized([3, 10])).toEqual([3, 7]); // one before blank + expect(matcher.toNormalized([4, 10])).toEqual([4, 7]); + expect(matcher.toNormalized([5, 10])).toEqual([4, 7]); + expect(matcher.toNormalized([6, 10])).toEqual([5, 7]); + expect(matcher.toNormalized([7, 10])).toEqual([5, 7]); + expect(matcher.toNormalized([8, 10])).toEqual([5, 7]); + expect(matcher.toNormalized([9, 10])).toEqual([6, 7]); // one after blank + // test end + expect(matcher.toNormalized([0, 3])).toEqual([0, 3]); // one before blank + expect(matcher.toNormalized([0, 4])).toEqual([0, 4]); + expect(matcher.toNormalized([0, 5])).toEqual([0, 4]); + expect(matcher.toNormalized([0, 6])).toEqual([0, 5]); + expect(matcher.toNormalized([0, 7])).toEqual([0, 5]); + expect(matcher.toNormalized([0, 8])).toEqual([0, 5]); + expect(matcher.toNormalized([0, 9])).toEqual([0, 6]); // one after blank + // last + expect(matcher.toNormalized([20, fieldText.length])).toEqual([ + 17, + expectedNormalizedText.length + ]); + + // test begin + expect(matcher.toRaw([0, 7])).toEqual([0, 10]); + expect(matcher.toRaw([3, 7])).toEqual([3, 10]); // one before blank + expect(matcher.toRaw([4, 7])).toEqual([4, 10]); + expect(matcher.toRaw([5, 7])).toEqual([8, 10]); // one after blank + // test end + 
expect(matcher.toRaw([0, 3])).toEqual([0, 3]); // one before blank + expect(matcher.toRaw([0, 4])).toEqual([0, 4]); + expect(matcher.toRaw([0, 5])).toEqual([0, 8]); // one after blank + expect(matcher.toRaw([0, 6])).toEqual([0, 9]); // two after blank + // last + expect(matcher.toRaw([17, expectedNormalizedText.length])).toEqual([20, fieldText.length]); + }); + + it('should normalize text with multiple long blanks', () => { + const fieldText = 'This is a sample text content. '; + const expectedNormalizedText = 'This is a sample text content. '; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.toNormalized([9, 29] /* s a sample te */)).toEqual([6, 19]); + expect(matcher.toRaw([10, 16] /* sample */)).toEqual([17, 23]); + }); + + it('should normalize quotes', () => { + const fieldText = 'This is “double-quoted”. This is ‘single-quoted’.'; + const expectedNormalizedText = 'This is "double-quoted". This is \'single-quoted\'.'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.toNormalized([9, 29])).toEqual([9, 29]); + expect(matcher.toRaw([10, 16])).toEqual([10, 16]); + for (let i = 0; i < fieldText.length; i += 1) { + expect(matcher.toNormalized([0, i + 1])).toEqual([0, i + 1]); + expect(matcher.toNormalized([i, fieldText.length])).toEqual([i, fieldText.length]); + expect(matcher.toRaw([0, i + 1])).toEqual([0, i + 1]); + expect(matcher.toRaw([i, fieldText.length])).toEqual([i, fieldText.length]); + } + }); + + it('should normalize surrogate pairs', () => { + const fieldText = 'This is emoji 😁.'; + const expectedNormalizedText = 'This is emoji _.'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + expect(matcher.toNormalized([14, 16])).toEqual([14, 15]); + expect(matcher.toRaw([14, 15])).toEqual([14, 16]); + }); + + it('should normalize 
diacritical marks', () => { + const fieldText = 'àáâãäåçèéêëìíîïñòóôõöùúûüýÿæœ'; + const expectedNormalizedText = 'aaaaaaceeeeiiiinooooouuuuyyæœ'; + + const matcher = new TextNormalizer(fieldText); + expect(matcher.normalizedText).toEqual(expectedNormalizedText); + + const fieldText2 = fieldText.normalize('NFD'); // à: U+00E0 -> U+0061 U+0300 + expect(fieldText2.length).toBe(fieldText.length * 2 - 2 /* æœ are not changed */); + const matcher2 = new TextNormalizer(fieldText2); + expect(matcher2.normalizedText).toEqual(expectedNormalizedText); + }); + + describe('range conversion', () => { + it('should return mapped indices for negative indices and greater indices than text length', () => { + const matcher = new TextNormalizer('1234567890'); + expect(matcher.toNormalized([-10, 20])).toEqual([-10, 20]); + expect(matcher.toNormalized([20, 30])).toEqual([20, 30]); + expect(matcher.toRaw([-10, 20])).toEqual([-10, 20]); + expect(matcher.toRaw([20, 30])).toEqual([20, 30]); + }); + + it('should return mapped indices for negative indices and greater indices than normalized text length', () => { + const matcher = new TextNormalizer(' '); + expect(matcher.toNormalized([-10, 20])).toEqual([-10, 11]); + expect(matcher.toNormalized([20, 30])).toEqual([11, 21]); + expect(matcher.toRaw([-10, 20])).toEqual([-10, 29]); + expect(matcher.toRaw([20, 30])).toEqual([29, 39]); + }); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts new file mode 100644 index 000000000..153a2e1b3 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/bboxUtils.test.ts @@ -0,0 +1,24 @@ +import { bboxGetSpanByRatio, isNextToEachOther } from '../bboxUtils'; + 
+describe('bboxGetSpanByRatio', () => { + it('should return proper bbox for spans on text', () => { + // text: '0123456789' -> highlight: '0123456789' + expect(bboxGetSpanByRatio([0, 0, 10, 2], 10, [0, 10])).toEqual([0, 0, 10, 2]); + // text: '0123456789' -> highlight: '23' + expect(bboxGetSpanByRatio([0, 0, 10, 2], 10, [2, 4])).toEqual([2, 0, 4, 2]); + // text: '012345' -> highlight: '23' + expect(bboxGetSpanByRatio([0, 0, 10, 2], 5, [2, 4])).toEqual([4, 0, 8, 2]); + }); +}); + +describe('isSideBySideOnLine', () => { + it('should return true for side-by-side boxes', () => { + expect(isNextToEachOther([0, 0, 5, 2], [5, 0, 10, 2])).toBeTruthy(); + }); + it('should return false when boxes are not vertically aligned', () => { + expect(isNextToEachOther([0, 0, 5, 2], [5, 1, 10, 3])).toBeFalsy(); + }); + it('should return false when two boxes are apart from each other', () => { + expect(isNextToEachOther([0, 0, 5, 2], [7, 0, 10, 2])).toBeFalsy(); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts new file mode 100644 index 000000000..3c4abf352 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/common/__tests__/findLargestIndex.test.ts @@ -0,0 +1,46 @@ +import { findLargestIndex } from '../findLargestIndex'; + +describe('findLargestIndex', () => { + it('should find correct index', () => { + expect(findLargestIndex(0, 100, index => (index <= 49 ? index : null))).toEqual({ + index: 49, + value: 49 + }); + expect(findLargestIndex(0, 100, index => (index <= 50 ? index : null))).toEqual({ + index: 50, + value: 50 + }); + expect(findLargestIndex(0, 100, index => (index <= 51 ? 
/**
 * Get bbox for a text span assuming each character takes horizontal spaces evenly
 * @param bbox bbox (`[left, top, right, bottom]`) occupied with a text
 * @param origLength length of the text
 * @param span span on the text. The part outside of `[0, origLength]` is ignored
 * @returns bbox for the span. A zero-width bbox at the left edge of `bbox` when
 *          the text is empty or the span does not cover any of the text
 */
export function bboxGetSpanByRatio(bbox: Bbox, origLength: number, span: TextSpan): Bbox {
  const theSpan = spanIntersection([0, origLength], span);
  if (origLength === 0 || spanLen(theSpan) <= 0) {
    return [bbox[0], bbox[1], bbox[0], bbox[3]] as Bbox;
  }

  // use the span clamped to the text so that the result never exceeds `bbox`
  const [spanStart, spanEnd] = theSpan;
  const [left, top, right, bottom] = bbox;
  const widthPerChar = (right - left) / origLength;
  const resultLeft = left + widthPerChar * spanStart;
  const resultRight = left + widthPerChar * spanEnd;

  return [resultLeft, top, resultRight, bottom];
}

/**
 * Check whether the two bboxes are next to each other in a row.
 * This is used to get a text of a line from a list of small text cells.
 *
 * Two boxes are considered next to each other when all of the following hold:
 * - they do not intersect (as reported by `bboxesIntersect`)
 * - their heights are similar (each is more than `OVERLAP_RATIO` of the other)
 * - their vertical ranges overlap by at least `OVERLAP_RATIO` of the average height
 * - the horizontal gap between them is smaller than the average height
 *
 * @param boxA a bbox (`[left, top, right, bottom]`)
 * @param boxB another bbox
 * @returns `true` when the boxes are side by side on the same line
 */
export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean {
  //
  // The ratio of height used to check whether two bboxes are on the same line or not.
  // With the value 0.8, when more than 80% of range of height of each bbox overlaps
  // one of another, they are considered on the same line.
  //
  const OVERLAP_RATIO = 0.8;

  if (bboxesIntersect(boxA, boxB)) {
    return false;
  }

  const [leftA, topA, rightA, bottomA] = boxA;
  const [leftB, topB, rightB, bottomB] = boxB;
  const heightA = bottomA - topA;
  const heightB = bottomB - topB;

  // compare height ratio: both heights must be within OVERLAP_RATIO of each other.
  // (With `||` this condition held for any pair of positive heights and
  // therefore never rejected anything.)
  const similarHeight = heightA * OVERLAP_RATIO < heightB && heightB * OVERLAP_RATIO < heightA;
  if (!similarHeight) {
    return false;
  }

  const avgHeight = (heightA + heightB) / 2;
  const overlapHeight = Math.max(0, Math.min(bottomA, bottomB) - Math.max(topA, topB));
  if (overlapHeight < avgHeight * OVERLAP_RATIO) {
    return false;
  }

  // see if boxes can be neighbors: the horizontal gap must be smaller than the line height
  const horizontalGap = Math.max(0, leftB - rightA, leftA - rightB);
  return horizontalGap < avgHeight;
}
/**
 * Get value of the specified field from a search result document
 *
 * A field may hold either a single value or an array of values. `index` selects
 * an element of an array field; a single-valued field only has index 0.
 *
 * @param document search result document
 * @param field field name
 * @param index field index. 0 by default
 * @param span (optional) span on the field value to return. Returns the entire field value by default
 * @returns text, or `undefined` when the field or the requested index does not exist
 */
export function getDocFieldValue(
  document: DocumentFields,
  field: string,
  index?: number,
  span?: Location | TextSpan
): string | undefined {
  const fieldValue = document[field];

  let fieldText: string | undefined;
  if (Array.isArray(fieldValue)) {
    fieldText = fieldValue[index ?? 0];
  } else if (!index) {
    fieldText = fieldValue;
  } else {
    // A single-valued field has no element at a non-zero index. Indexing the
    // value here would return one character of the string.
    fieldText = undefined;
  }

  if (!fieldText || !span) {
    return fieldText;
  }

  // `span` is either a `[begin, end]` tuple or a `{ begin, end }` location
  if (Array.isArray(span)) {
    return fieldText.substring(span[0], span[1]);
  }
  return fieldText.substring(span.begin, span.end);
}
/**
 * Find the largest index that satisfies the matchFn, together with the value
 * matchFn returned for that index.
 *
 * The range is searched by bisection, so the result is only the largest matching
 * index when `matchFn` is monotonic over the range: once it returns `null` or
 * `undefined` for an index, it must do so for every larger index.
 *
 * @param begin begin index of the range. inclusive
 * @param end end index of the range. exclusive
 * @param matchFn returns a non-null value when the index matches, `null` otherwise
 * @param splitMid probe the middle of the range first when `true`. By default the
 *                 last index of the range is probed first
 * @returns the largest matching index and its value, or `null` when the range is
 *          empty or no index matches
 */
export function findLargestIndex<V>(
  begin: number,
  end: number,
  matchFn: (index: number) => V | null,
  splitMid?: boolean
): { index: number; value: V } | null {
  if (end - begin < 1) return null;

  const probeIndex = splitMid ? begin + Math.floor((end - begin) / 2) : end - 1;
  const value = matchFn(probeIndex);
  if (value != null) {
    // a match: look for a larger matching index on the right, fall back to this one
    return findLargestIndex(probeIndex + 1, end, matchFn, true) ?? { index: probeIndex, value };
  }
  // no match: the answer, if any, is on the left
  return findLargestIndex(begin, probeIndex, matchFn, true);
}
| null = null; + for (let i = this.cursor; i < this.cells.length; i += 1) { + const currentBox = this.cells[i]; + // maybe we need to break this loop by big box change + const { cell: baseCurrentCell } = currentBox.getNormalized(); + if (lastCell && !isNextToEachOther(lastCell.bbox, baseCurrentCell.bbox)) { + break; + } + result.push(currentBox); + lastCell = baseCurrentCell; + } + + this.getNextCellsCache = { + cells: this.cells, + cursor: this.cursor, + result + }; + return result; + } + + private getNextCellsCache: { + cells: readonly TextLayoutCellBase[]; + cursor: number; + result: TextLayoutCellBase[]; + } | null = null; + + /** + * get text from cells on a line + */ + getNextText(): { texts: string[]; nextCellIndex: number } { + const nextCells = this.getNextCells(); + const texts = nextCells.map(cell => cell.text); + return { texts, nextCellIndex: this.cursor }; + } + + /** + * consume (mark as used) first n chars from the cursor + * @return text layout cells on the consumed text + */ + consume(length: number): TextLayoutCellBase[] { + const result: TextLayoutCellBase[] = []; + + let lengthToConsume = length; + while (lengthToConsume > 0) { + const current = this.cells[this.cursor]; + const bboxTextLength = current.text.length; + + if (lengthToConsume < bboxTextLength) { + // in this case, split bbox and consume matched part + // add prefix to the result + const consumed = current.getPartial([0, lengthToConsume]); + result.push(consumed); + + const remaining = current.getPartial([lengthToConsume, bboxTextLength]); + const newCells = [...this.cells]; + newCells[this.cursor] = remaining; + this.cells = Object.freeze(newCells); + break; + } + + result.push(current); + lengthToConsume -= bboxTextLength; + this.cursor += 1; + } + return result; + } + + /** + * skip the current cell + */ + skip() { + this.skippedCells.push(this.cells[this.cursor]); + this.cursor += 1; + } +} diff --git 
a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts new file mode 100644 index 000000000..337fe72d9 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/MappingSourceTextProvider.ts @@ -0,0 +1,74 @@ +import minBy from 'lodash/minBy'; +import { spanGetText, spanLen, START } from '../../../../utils/textSpan'; +import { TextSpan } from '../../types'; +import { TextNormalizer } from '../common/TextNormalizer'; +import { TextLayoutCell } from '../textLayout/types'; +import { TextProvider } from './TextProvider'; + +const debugOut = require('debug')?.('pdf:mapping:MappingSourceTextProvider'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +/** + * TextProvider with normalization + * @see TextProvider + */ +export class MappingSourceTextProvider { + private readonly cell: TextLayoutCell; + private readonly normalizer: TextNormalizer; + private readonly provider: TextProvider; + + constructor(cell: TextLayoutCell) { + this.cell = cell; + this.normalizer = new TextNormalizer(cell.text); + this.provider = new TextProvider(this.normalizer.normalizedText); + } + + /** + * Find the best span where the give text matches to the rest of the text + */ + getMatch(text: string) { + const normalizedText = this.normalizer.normalize(text); + debug('getMatch "%s", normalized "%s"', text, normalizedText); + const normalizedMatches = this.provider.getMatches(normalizedText); + debug('normalized matches: %o', normalizedMatches); + + // find best + const normalizedResult = minBy(normalizedMatches, m => m.minHistoryDistance); + if (!normalizedResult) { + debug('getMatch result: null'); + return null; + } + + const rawMatchedSpan = 
/**
 * Cell provider with normalization
 *
 * Usage protocol: call `hasNext()` first; when it returns `true`, `getNextInfo()`
 * and `consume()` operate on the text it prepared. `consume()` and `skip()`
 * discard that text, so `hasNext()` has to be called again afterwards.
 * @see CellProvider
 */
export class MappingTargetBoxProvider {
  private readonly cellProvider: CellProvider;
  private current: {
    nextCellIndex: number;
    normalizer: TextNormalizer;
    leadingSpaces: number;
  } | null = null;

  constructor(cells: TextLayoutCellBase[]) {
    this.cellProvider = new CellProvider(cells);
  }

  /**
   * check whether this provider has another item to visit or not
   * @returns `true` when a non-blank text is available. The text is then kept as
   *          the current item
   */
  hasNext(): boolean {
    while (this.cellProvider.hasNext()) {
      const { texts, nextCellIndex } = this.cellProvider.getNextText();
      const text = texts.join('');
      // leading spaces are excluded from normalization and added back in `consume`
      const leadingSpaces = text.match(/^\s*/)?.[0].length ?? 0;
      const trimmedText = text.substring(leadingSpaces);
      if (trimmedText.length > 0) {
        const normalizer = new TextNormalizer(trimmedText);
        this.current = {
          nextCellIndex,
          normalizer,
          leadingSpaces
        };
        return true;
      }
      this.cellProvider.skip(); // skip blank only
    }
    this.current = null;
    return false;
  }

  /**
   * get the next value
   * @returns normalized text of the current item and the index of its first cell
   * @throws Error when there is no current item (`hasNext()` did not return `true`)
   */
  getNextInfo(): { text: string; index: number } {
    const { normalizer, nextCellIndex } = this.requireCurrent();
    return {
      text: normalizer.normalizedText,
      index: nextCellIndex
    };
  }

  /**
   * consume (mark as used) first n chars from the cursor
   * @param length number of chars on the normalized text to consume
   * @return text layout cells on the consumed text
   * @throws Error when there is no current item (`hasNext()` did not return `true`)
   */
  consume(length: number): TextLayoutCellBase[] {
    const { normalizer, leadingSpaces } = this.requireCurrent();
    // convert the length on the normalized text back to a length on the raw cell text
    const rawSpan = normalizer.toRaw([0, length]);
    const rawLength = leadingSpaces + rawSpan[END];
    this.current = null;
    return this.cellProvider.consume(rawLength);
  }

  /**
   * mark the current cell skipped (when no match found in source)
   */
  skip() {
    this.current = null;
    this.cellProvider.skip();
  }

  /**
   * Return the current item, failing with a descriptive error instead of a
   * null dereference when the usage protocol is violated.
   */
  private requireCurrent() {
    const current = this.current;
    if (current == null) {
      throw new Error('MappingTargetBoxProvider: no current item. Call hasNext() first');
    }
    return current;
  }
}
spanGetSubSpan, + spanIntersection, + spanIntersects +} from '../../../../utils/textSpan'; +import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; +import { TextNormalizer } from '../common/TextNormalizer'; +import { TextBoxMapping, TextBoxMappingEntry, TextBoxMappingResult } from './types'; + +const debugOut = require('debug')?.('pdf:mapping:TextBoxMapping'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +/** + * Text box mapping. Mapping between cells (i.e. text box) in a TextLayout + * to ones in another TextLayout. + */ +class TextBoxMappingImpl implements TextBoxMapping { + private readonly mappingEntryMap: Dictionary; + + constructor(mappingEntries: TextBoxMappingEntry[]) { + this.mappingEntryMap = groupBy(mappingEntries, m => m.text.cell.id); + + // sort by span offset + Object.values(this.mappingEntryMap).forEach(value => { + value.sort((a, b) => spanCompare(a.text.span, b.text.span)); + }); + debug('TextBoxMapping created'); + debug(this); + } + + /** + * get text mapping entries for a given span `spanInSourceCell` on a given `sourceCell` + */ + private getEntries( + sourceCell: TextLayoutCell, + spanOnSourceCell: TextSpan + ): TextBoxMappingEntry[] { + return (this.mappingEntryMap[sourceCell.id] || []).filter(m => + spanIntersects(m.text.span, spanOnSourceCell) + ); + } + + /** + * @inheritdoc + */ + apply(source: TextLayoutCellBase, aSpan?: TextSpan): TextBoxMappingResult { + const span: TextSpan = aSpan || [0, source.text.length]; + + const { cell: sourceCell, span: sourceSpan } = source.getNormalized(); + const spanInSourceCell = sourceSpan ? 
spanFromSubSpan(sourceSpan, span) : span; + + debug('applying TextBoxMapping'); + debug(source, span); + const entries = this.getEntries(sourceCell, spanInSourceCell); + const result = entries.map(m => { + if (!m.box) { + return { cell: null, sourceSpan: m.text.span }; + } else { + let boxSpan; + if (equalsSpanText(m.text.cell, m.text.span, source, spanInSourceCell)) { + boxSpan = spanGetSubSpan(m.text.span, spanInSourceCell); + } else { + const n1 = new TextNormalizer(m.text.cell.text); + const normalizedBoxSpan = spanGetSubSpan( + n1.toNormalized(m.text.span), + n1.toNormalized(spanInSourceCell) + ); + const n2 = new TextNormalizer(m.box.cell.text); + boxSpan = n2.toRaw(normalizedBoxSpan); + } + + return { + cell: m.box.cell.getPartial(boxSpan), + sourceSpan: spanIntersection(m.text.span, spanInSourceCell) + }; + } + }); + debug('applying TextBoxMapping - result'); + debug(result); + return result; + } +} + +/** + * Builder for the TextMapping + */ +export class TextBoxMappingBuilder { + mappingEntries: TextBoxMappingEntry[] = []; + + /** + * add new mapping data + */ + addMapping(text: TextBoxMappingEntry['text'], box: TextBoxMappingEntry['box']) { + this.mappingEntries.push({ text, box }); + debug('>> added a new mapping entry (%o) => (cell: %o)', text, text, box?.cell); + } + + toTextBoxMapping() { + return new TextBoxMappingImpl(this.mappingEntries); + } +} + +/** + * Check if text on spans on cells are the same or not + */ +function equalsSpanText( + textCell: TextLayoutCellBase, + textSpan: TextSpan, + sourceCell: TextLayoutCellBase, + sourceSpan: TextSpan +) { + const left = textCell.text.substring(...textSpan); + const right = sourceCell.text.substring(...sourceSpan); + return left === right; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts 
b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts new file mode 100644 index 000000000..700be90ce --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/TextProvider.ts @@ -0,0 +1,118 @@ +import { TextSpan } from '../../types'; +import { + END, + START, + spanIntersects, + spanIncludesIndex, + spanGetText, + spanIntersection +} from '../../../../utils/textSpan'; +import { findLargestIndex } from '../common/findLargestIndex'; + +const MAX_HISTORY = 3; + +export type TextMatch = { + /** + * matched text span + */ + span: TextSpan; + + /** + * text before the matched text. i.e. text that will be skipped by using this match + */ + skipText: string; + + /** + * distance from the nearest cursors + */ + minHistoryDistance: number; + + /** + * text after the matched text + */ + textAfterEnd: string; +}; + +/** + * Manage text in a source (larger) cell. 
+ * - Find text (in a target cell) from the _unused_ text + * - Once a span is mapped to a target (smaller) cell, mark the the correspondent span _used_ + */ +export class TextProvider { + private readonly fieldText: string; + private remainingSpans: TextSpan[]; + private history: number[] = [0]; // Keep MAX_HISTORY last recently consumed + + constructor(fieldText: string) { + this.fieldText = fieldText; + this.remainingSpans = [[0, fieldText.length]]; + } + + /** + * Get how the given `text` matches to the currently available text + */ + getMatches(text: string, minLength = 1, maxLength = text.length): TextMatch[] { + const match = findLargestIndex(minLength, maxLength + 1, index => { + const lengthToMatch = index; + const textToMatch = text.substring(0, lengthToMatch); + + const result: TextMatch[] = []; + for (const aSpan of this.remainingSpans) { + const [spanBegin, spanEnd] = aSpan; + const spanText = this.fieldText.slice(spanBegin, spanEnd); + + const foundIndex = spanText.indexOf(textToMatch); + if (foundIndex >= 0) { + const foundSpanBegin = spanBegin + foundIndex; + const foundSpanEnd = foundSpanBegin + textToMatch.length; + const historyDistances = this.history.map(i => { + const v = foundSpanBegin - i; + return v >= 0 ? v : Number.MAX_SAFE_INTEGER; + }); + result.push({ + span: [foundSpanBegin, foundSpanEnd], + skipText: spanText.substring(0, foundIndex), + minHistoryDistance: Math.min(...historyDistances, this.fieldText.length), + textAfterEnd: this.remainingSpans + .map(span => { + const validSpan = spanIntersection([foundSpanEnd, this.fieldText.length], span); + return spanGetText(this.fieldText, validSpan); + }) + .join('') + }); + } + } + return result.length > 0 ? result : null; + }); + + return match ? 
match.value : []; + } + + /** + * Mark the `span` as used + */ + consume(span: TextSpan) { + const remaining: TextSpan[] = []; + this.remainingSpans.forEach(remainingSpan => { + if (spanIntersects(span, remainingSpan)) { + if (remainingSpan[START] < span[START]) { + remaining.push([remainingSpan[START], span[START]]); + } + if (span[END] < remainingSpan[END]) { + remaining.push([span[END], remainingSpan[END]]); + } + } else { + remaining.push(remainingSpan); + } + }); + this.remainingSpans = remaining; + + // update history + const validSpans = [span[END], ...this.history].filter(index => { + if (spanIncludesIndex(span, index)) return false; + if (!this.remainingSpans.some(s => spanIncludesIndex(s, index))) return false; + return true; + }); + this.history = validSpans.slice(0, MAX_HISTORY); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts new file mode 100644 index 000000000..928b4cabd --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/__tests__/TextProvider.test.ts @@ -0,0 +1,56 @@ +import { TextProvider } from '../TextProvider'; + +describe('TextProvider', () => { + it('should find correct span for a text', () => { + const fieldText = 'This is a sample sample text content.'; + const provider = new TextProvider(fieldText); + + const r = provider.getMatches('sample')[0]; + expect(r?.skipText).toBe('This is a '); + expect(r?.span).toEqual([10, 16]); + expect(r?.minHistoryDistance).toBe(10); + expect(r?.textAfterEnd).toBe(' sample text content.'); + }); + + it('should find correct spans for a text after consuming a span', () => { + const fieldText = 'This is a sample sample text content.'; + const matcher = new 
TextProvider(fieldText); + + // match and consumer a word + let match = matcher.getMatches('sample'); + let r = match[0]; + matcher.consume(r?.span); + + // find span in former of remaining spans + match = matcher.getMatches(' is'); + expect(match).toHaveLength(1); + r = match[0]; + expect(r?.skipText).toBe('This'); + expect(r?.span).toEqual([4, 7]); + expect(r?.minHistoryDistance).toBe(4); + expect(r?.textAfterEnd).toBe(' a sample text content.'); + + // find span in latter of remaining spans + match = matcher.getMatches('sample'); + expect(match).toHaveLength(1); + r = match[0]; + expect(r?.skipText).toBe(' '); + expect(r?.span).toEqual([17, 23]); + expect(r?.minHistoryDistance).toBe(1); + expect(r?.textAfterEnd).toBe(' text content.'); + + // find spans in both of remaining spans + match = matcher.getMatches('s'); + expect(match).toHaveLength(2); + r = match[0]; + expect(r?.skipText).toBe('Thi'); + expect(r?.span).toEqual([3, 4]); + expect(r?.minHistoryDistance).toBe(3); + expect(r?.textAfterEnd).toBe(' is a sample text content.'); + r = match[1]; + expect(r?.skipText).toBe(' '); + expect(r?.span).toEqual([17, 18]); + expect(r?.minHistoryDistance).toBe(1); + expect(r?.textAfterEnd).toBe('ample text content.'); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts new file mode 100644 index 000000000..44f0f0b1d --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/getTextBoxMapping.ts @@ -0,0 +1,205 @@ +import minBy from 'lodash/minBy'; +import { nonEmpty } from 'utils/nonEmpty'; +import { TextSpan } from '../../types'; +import { bboxesIntersect } from '../../../../utils/box'; +import { spanLen, spanMerge } from 
'../../../../utils/textSpan'; +import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; +import { MappingSourceTextProvider } from './MappingSourceTextProvider'; +import { MappingTargetBoxProvider } from './MappingTargetCellProvider'; +import { TextBoxMappingBuilder } from './TextBoxMapping'; +import { TextBoxMapping } from './types'; + +const debugOut = require('debug')?.('pdf:mapping:getTextBoxMapping'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +/** + * Calculate text box mapping from `source` text layout to `target` text layout + * @param source text layout with larger cells + * @param target text layout with smaller cells + * @returns a text box mapping instance + */ +export function getTextBoxMappings< + SourceCell extends TextLayoutCell, + TargetCell extends TextLayoutCell +>(sourceLayout: TextLayout, targetLayout: TextLayout): TextBoxMapping { + debug('getTextBoxMapping: enter'); + + const target = new Target(targetLayout); + const source = new Source(sourceLayout, targetLayout); + const builder = new TextBoxMappingBuilder(); + + target.processText((targetCellId, targetText, markTargetAsMapped) => { + const matchInSource = source.findMatch(targetCellId, targetText); + if (matchInSource) { + const mappedTargetCells = markTargetAsMapped(matchInSource.matchLength); + + let mappedSourceFullSpan: TextSpan = [0, 0]; + mappedTargetCells.forEach(targetCell => { + const mappedSourceSpan = matchInSource.markSourceAsMapped(targetCell.text); + if (mappedSourceSpan) { + builder.addMapping( + { cell: matchInSource.cell, span: mappedSourceSpan }, + { cell: targetCell } + ); + mappedSourceFullSpan = spanMerge(mappedSourceFullSpan, mappedSourceSpan); + } + }); + if (spanLen(mappedSourceFullSpan) > 0) { + matchInSource.markSourceMappedBySpan(mappedSourceFullSpan); + } + } + }); + + return builder.toTextBoxMapping(); +} + +/** + * Utility class for manipulating target text layout in getTextBoxMapping + */ +class Target { 
+ targetProvider: MappingTargetBoxProvider; + + constructor(targetLayout: TextLayout) { + this.targetProvider = new MappingTargetBoxProvider(targetLayout.cells); + } + + /** + * Try to map text fragments (`cellId` and `text` passed to `textMapper`) + * in target using a given `textMapper` + */ + processText( + textMapper: ( + cellId: number, + text: string, + markTargetMapped: (length: number) => TextLayoutCellBase[] + ) => void + ) { + while (this.targetProvider.hasNext()) { + const { index: cellId, text: nextText } = this.targetProvider.getNextInfo(); + debug('> find match at index %d, text: %s', cellId, nextText); + + let isMapped = false; + const markAsMapped = (matchedLength: number) => { + if (matchedLength > 0) { + isMapped = true; + const matchedTargetCells = this.targetProvider.consume(matchedLength); + debug('> raw target cells for matched length: %d', matchedLength); + debug(matchedTargetCells); + + return matchedTargetCells.map(cell => cell.trim()).filter(cell => cell.text.length > 0); + } + return []; + }; + + textMapper(cellId, nextText, markAsMapped); + if (!isMapped) { + this.targetProvider.skip(); + } + } + } +} + +/** + * Utility class for manipulating source text layout and its source text in getTextBoxMapping + */ +class Source { + sourceProviders: MappingSourceTextProvider[]; + targetIndexToSources: { + cell: SourceCell; + provider: MappingSourceTextProvider; + }[][]; + + constructor(sourceLayout: TextLayout, targetLayout: TextLayout) { + this.sourceProviders = sourceLayout.cells.map(cell => new MappingSourceTextProvider(cell)); + this.targetIndexToSources = targetLayout.cells.map(targetCell => { + const cells = sourceLayout.cells + .map((sourceCell, index) => { + if (!bboxesIntersect(sourceCell.bbox, targetCell.bbox)) { + return null; + } + return { cell: sourceCell, provider: this.sourceProviders[index] }; + }) + .filter(nonEmpty); + + if (cells.some(({ cell }) => cell.isInHtmlBbox)) { + return cells.filter(({ cell }) => cell.isInHtmlBbox); + 
} + return cells; + }); + } + + /** + * Find the best (i.e. longest length `text`) match in source which intersects + * with the target cell of given `targetCellId` + * @param targetCellId + * @param text + * @return matched source information and functions to mark the matched span as mapped + */ + findMatch(targetCellId: TargetCell['id'], text: string) { + const candidateSources = this.targetIndexToSources[targetCellId]; + const bestMatch = Source.findBestMatch(candidateSources, text); + debug('> source cell(s) matched: %o', bestMatch); + + if (!bestMatch?.match || spanLen(bestMatch.match.span) === 0) { + return null; + } + + const matchedCell = bestMatch.cell; + const matchedSourceSpan = bestMatch.match.span; + const matchedSourceProvider = bestMatch.provider; + + return { + cell: matchedCell, + matchLength: spanLen(matchedSourceSpan), + markSourceAsMapped: (text: string) => { + const mappedSource = matchedSourceProvider.getMatch(text); + debug('>> target cell %o to source %o', text, mappedSource); + return mappedSource?.span; + }, + markSourceMappedBySpan: (span: TextSpan) => { + if (spanLen(span) > 0) { + matchedSourceProvider.consume(span); + } + } + }; + } + + /** + * Find the best source (larger text layout cell) where text `textToMatch` is in + * @param sources source (larger) text layout cells overlapping the current target cell + * @param textToMatch text form target cell(s) + * @returns the best source where the `textToMatch` is matched and the text location in the source + */ + private static findBestMatch( + sources: { + cell: TextLayoutCell; + provider: MappingSourceTextProvider; + }[], + textToMatch: string + ) { + // find matches + const matches = sources.map(source => { + const match = source.provider.getMatch(textToMatch); + return { ...source, match }; + }); + + // calc cost for each match + let skipTextLen = 0; + const matchesWithCost = matches.map(aMatch => { + const { match: providerMatch } = aMatch; + const cost = !providerMatch + ? 
Number.MAX_SAFE_INTEGER + : skipTextLen + providerMatch.skipText.length - spanLen(providerMatch.span); + + skipTextLen += providerMatch?.approxLenAfterEnd ?? 0; + + return { ...aMatch, cost }; + }); + + // find best match + const bestMatch = minBy(matchesWithCost, match => match.cost); + return bestMatch; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts new file mode 100644 index 000000000..8e16507ac --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/index.ts @@ -0,0 +1 @@ +export { getTextBoxMappings } from './getTextBoxMapping'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts new file mode 100644 index 000000000..2c7e0d666 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textBoxMapping/types.ts @@ -0,0 +1,28 @@ +import { TextSpan } from '../../types'; +import { TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; + +export type TextBoxMappingResult = { + cell: TextLayoutCellBase | null; + sourceSpan: TextSpan; +}[]; + +/** + * Interface for text box mapping + */ +export interface TextBoxMapping { + /** + * Get spans on target (smaller) cells for a given span on a source (larger) cell + * @param source source text layout cell + * @param span span on the source cell + */ + apply(source: TextLayoutCellBase, span?: TextSpan): TextBoxMappingResult; +} + +/** + * Interface for text box mapping entries. + * Internal. 
Used only in text box mapping implementation + */ +export interface TextBoxMappingEntry { + text: { cell: TextLayoutCell; span: TextSpan }; + box: { cell: TextLayoutCellBase } | null; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts new file mode 100644 index 000000000..8732f4571 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/BaseTextLayout.ts @@ -0,0 +1,125 @@ +import { spanGetText, spanIntersection, spanOffset, START } from '../../../../utils/textSpan'; +import { Bbox, TextSpan } from '../../types'; +import { bboxGetSpanByRatio } from '../common/bboxUtils'; +import { TextLayout, TextLayoutCell, TextLayoutCellBase } from './types'; + +/** + * Base implementation of text layout cell + */ +export class BaseTextLayoutCell> + implements TextLayoutCell +{ + readonly parent: Layout; + readonly id: number; + readonly pageNum: number; + readonly bbox: Bbox; + readonly text: string; + + constructor({ + parent, + id, + pageNum, + bbox, + text + }: { + parent: Layout; + id: number; + pageNum: number; + bbox: Bbox; + text: string; + }) { + this.parent = parent; + this.id = id; + this.pageNum = pageNum; + this.bbox = bbox; + this.text = text; + } + + /** + * @inheritdoc + */ + getPartial(span: TextSpan): TextLayoutCellBase { + return new PartialTextLayoutCell(this, span); + } + + /** + * @inheritdoc + */ + getNormalized(): { cell: TextLayoutCell; span?: TextSpan } { + return { cell: this }; + } + + /** + * @inheritdoc + */ + getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { + if (options?.useRatio) { + return bboxGetSpanByRatio(this.bbox, this.text.length, span); + } + return null; + } + + /** + * @inheritdoc + */ + trim(): 
TextLayoutCellBase { + return trimCell(this); + } +} + +/** + * Text span on a base text layout cell + */ +export class PartialTextLayoutCell implements TextLayoutCellBase { + readonly base: TextLayoutCell; + readonly span: TextSpan; + + constructor(base: TextLayoutCell, span: TextSpan) { + this.base = base; + this.span = spanIntersection([0, base.text.length], span); + } + + /* @inheritdoc */ + get text() { + return spanGetText(this.base.text, this.span); + } + + /** + * @inheritdoc + */ + getPartial(span: TextSpan): TextLayoutCellBase { + const newSpan = spanIntersection(this.span, spanOffset(span, this.span[START])); + return new PartialTextLayoutCell(this.base, newSpan); + } + + /** + * @inheritdoc + */ + getNormalized() { + return { cell: this.base, span: this.span }; + } + + /** + * @inheritdoc + */ + trim(): TextLayoutCellBase { + return trimCell(this); + } +} + +/** + * Get a text layout cell that represents a trimmed text of a given `cell` + * @returns a new cell for the trimmed text. 
Zero-length cell when the text of the given `cell` is blank + */ +function trimCell(cell: TextLayoutCellBase) { + const text = cell.text; + const nLeadingSpaces = text.match(/^\s*/)![0].length; + const nTrailingSpaces = text.match(/\s*$/)![0].length; + if (nLeadingSpaces === 0 && nTrailingSpaces === 0) { + return cell; + } + if (text.length > nLeadingSpaces + nTrailingSpaces) { + return cell.getPartial([nLeadingSpaces, text.length - nTrailingSpaces]); + } + return cell.getPartial([0, 0]); // return zero-length cell +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts new file mode 100644 index 000000000..db60ce738 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/HtmlBboxTextLayout.ts @@ -0,0 +1,71 @@ +import { decodeHTML } from 'entities'; +import { ProcessedBbox } from 'utils/document'; +import { Bbox, TextSpan } from '../../types'; +import { BaseTextLayoutCell } from './BaseTextLayout'; +import { HtmlBboxInfo, TextLayout } from './types'; + +/** + * Text layout based on bboxes in HTML field + */ +export class HtmlBboxTextLayout implements TextLayout { + private readonly bboxInfo: HtmlBboxInfo; + readonly cells: HtmlBboxTextLayoutCell[]; + + constructor(bboxInfo: HtmlBboxInfo, pageNum: number) { + this.bboxInfo = bboxInfo; + this.cells = + bboxInfo.bboxes + ?.filter(bbox => bbox.page === pageNum) + .map((bbox, index) => { + return new HtmlBboxTextLayoutCell(this, index, bbox); + }) ?? []; + } + + /** + * @inheritdoc + */ + cellAt(id: number) { + return this.cells[id]; + } + + /** + * Install style to DOM if not yet. 
The style will be used to calculate bbox in `getBboxForTextSpan` + */ + installStyle() { + if (this.bboxInfo.styles) { + // TODO: implement this + } + } +} + +/** + * Text layout cell based on bboxes in HTML field + */ +class HtmlBboxTextLayoutCell extends BaseTextLayoutCell { + private readonly processedBbox: ProcessedBbox; + + constructor(parent: HtmlBboxTextLayout, index: number, processedBbox: ProcessedBbox) { + const id = index; + const pageNum = processedBbox.page; + const bbox: Bbox = [ + processedBbox.left, + processedBbox.top, + processedBbox.right, + processedBbox.bottom + ]; + const text = decodeHTML(processedBbox.innerTextSource ?? ''); + super({ parent, id, pageNum, bbox, text }); + + this.processedBbox = processedBbox; // keep this for later improvement + } + + /** + * @inheritdoc + */ + getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { + if (this.processedBbox != null) { + // TODO: implement this. calculate bbox for text span using text on browser + } + return super.getBboxForTextSpan(span, options); + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts new file mode 100644 index 000000000..a57dd011c --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/PdfTextContentTextLayout.ts @@ -0,0 +1,118 @@ +import { bboxesIntersect } from 'components/DocumentPreview/utils/box'; +import { PDFPageViewport, PDFPageViewportOptions, TextContentItem } from 'pdfjs-dist'; +import { Bbox, TextSpan } from '../../types'; +import { BaseTextLayoutCell } from './BaseTextLayout'; +import { getAdjustedCellByOffsetByDom } from './dom'; +import { HtmlBboxInfo, PdfTextContentInfo, TextLayout } from './types'; + +/** + 
* Text layout based on PDF text objects + */ +export class PdfTextContentTextLayout implements TextLayout { + private readonly textContentInfo: PdfTextContentInfo; + readonly cells: PdfTextContentTextLayoutCell[]; + private divs: HTMLElement[] | undefined; + + constructor(textContentInfo: PdfTextContentInfo, pageNum: number, htmlBboxInfo?: HtmlBboxInfo) { + this.textContentInfo = textContentInfo; + + const textContentItems = textContentInfo.textContent.items; + + this.cells = textContentItems.map((item, index) => { + const cellBbox = getBbox(item, this.viewport); + let isInHtmlBbox = false; + if (htmlBboxInfo?.bboxes?.length) { + isInHtmlBbox = htmlBboxInfo.bboxes.some(bbox => { + return bboxesIntersect(cellBbox, [bbox.left, bbox.top, bbox.right, bbox.bottom]); + }); + } + return new PdfTextContentTextLayoutCell(this, index, item, pageNum, cellBbox, isInHtmlBbox); + }); + } + + /** + * get viewport of the current page + */ + get viewport(): PDFPageViewport { + return this.textContentInfo.viewport; + } + + /** + * @inheritdoc + */ + cellAt(id: number) { + return this.cells[id]; + } + + /** + * set PDF text content item divs + */ + setDivs(divs: HTMLElement[] | undefined) { + this.divs = divs; + } + + /** + * get HTML element for a given cell id + */ + divAt(id: number): HTMLElement | undefined { + return this.divs?.[id]; + } +} + +/** + * Text layout cell based on PDF text objects + */ +class PdfTextContentTextLayoutCell extends BaseTextLayoutCell { + /** + * @inheritdoc + */ + readonly isInHtmlBbox?: boolean; + + constructor( + parent: PdfTextContentTextLayout, + index: number, + textItem: TextContentItem, + pageNum: number, + bbox: Bbox, + isInHtmlBbox: boolean + ) { + const id = index; + const text = textItem.str; + super({ parent, id, pageNum, bbox, text }); + this.isInHtmlBbox = isInHtmlBbox; + } + + /** + * @inheritdoc + */ + getBboxForTextSpan(span: TextSpan, options: { useRatio?: boolean }): Bbox | null { + const spanElement = this.parent.divAt(this.id); + 
if (spanElement && spanElement.parentNode) { + const scale = this.parent.viewport.scale; + const bbox = getAdjustedCellByOffsetByDom(this, span, spanElement, scale); + if (bbox) { + return bbox; + } + } + return super.getBboxForTextSpan(span, options); + } +} + +/** + * Get bbox from a PDF text content item + */ +function getBbox(textItem: TextContentItem, viewport: PDFPageViewport): Bbox { + const { transform } = textItem; + + const patchedViewport = viewport as PDFPageViewportOptions & PDFPageViewport; + const defaultSideways = patchedViewport.rotation % 180 !== 0; + + const [fontHeightPx, , offsetX, offsetY, x, y] = transform; + const [xMin, yMin, , yMax] = patchedViewport.viewBox; + const top = defaultSideways ? x + offsetX + yMin : yMax - (y + offsetY); + const left = defaultSideways ? y - xMin : x - xMin; + const bottom = top + fontHeightPx; + const adjustHeight = fontHeightPx * 0.2; + + return [left, top + adjustHeight, left + textItem.width, bottom + adjustHeight]; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts new file mode 100644 index 000000000..e3486f453 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/TextMappingsTextLayout.ts @@ -0,0 +1,81 @@ +import { Cell, CellField } from '../../../../types'; +import { + spanGetSubSpan, + spanContains, + spanIntersection, + spanIntersects +} from '../../../../utils/textSpan'; +import { DocumentFields, DocumentFieldHighlight, TextSpan } from '../../types'; +import { getDocFieldValue } from '../common/documentUtils'; +import { TextBoxMappingResult } from '../textBoxMapping/types'; +import { BaseTextLayoutCell } from './BaseTextLayout'; +import { TextLayout, TextMappingInfo } from 
'./types'; + +/** + * Text layout based on text mappings + */ +export class TextMappingsTextLayout implements TextLayout { + readonly cells: TextMappingsTextLayoutCell[]; + + constructor(textMappingInfo: TextMappingInfo, pageNum: number) { + const { textMappings, document } = textMappingInfo; + + this.cells = textMappings.text_mappings + .filter(cell => cell.page.page_number === pageNum) + .map((cell, index) => { + return new TextMappingsTextLayoutCell(this, index, document, cell); + }); + } + + /** + * @inheritdoc + */ + cellAt(id: number) { + return this.cells[id]; + } + + /** + * Get highlighted text layout cells from a span on a field in a search result document + * @param highlight field and span to highlight + * @returns a text cell based on + */ + getHighlight(highlight: DocumentFieldHighlight): TextBoxMappingResult { + const highlightSpan: TextSpan = [highlight.location.begin, highlight.location.end]; + const highlightCells = this.cells + .filter(cell => { + const { cellField } = cell; + return ( + cellField.name === highlight.field && + cellField.index === highlight.fieldIndex && + spanIntersects(cellField.span, highlightSpan) + ); + }) + .map(cell => { + const { cellField } = cell; + const currentSpan = spanIntersection(cellField.span, highlightSpan); + if (spanContains(highlightSpan, cellField.span)) { + return { cell, sourceSpan: currentSpan }; + } + const subSpan = spanGetSubSpan(cellField.span, currentSpan); + return { cell: cell.getPartial(subSpan), sourceSpan: currentSpan }; + }); + return highlightCells; + } +} + +/** + * Text layout cell based on a text mapping cell + */ +class TextMappingsTextLayoutCell extends BaseTextLayoutCell { + readonly cellField: CellField; + + constructor(parent: TextMappingsTextLayout, index: number, document: DocumentFields, cell: Cell) { + const id = index; + const pageNum = cell.page.page_number; + const bbox = cell.page.bbox; + const text = + getDocFieldValue(document, cell.field.name, cell.field.index, 
cell.field.span) ?? ''; + super({ parent, id, pageNum, bbox, text }); + this.cellField = cell.field; + } +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts new file mode 100644 index 000000000..73edcfe48 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/dom.ts @@ -0,0 +1,63 @@ +import { forEachRectInRange, getTextNodeAndOffset } from 'utils/document/documentUtils'; +import { Bbox, TextSpan } from '../../types'; +import { END, START } from '../../../../utils/textSpan'; +import { TextLayoutCell } from './types'; + +const debugOut = require('debug')?.('pdf:textLayout:dom'); +function debug(...args: any) { + debugOut?.apply(null, args); +} + +/** + * Get a bbox for a span on a text layout cell using DOM element rendered on browser + * @param cell text layout cell + * @param textSpan span on the text layout cell + * @param spanElement an DOM element where the text layout cell is rendered + * @param scale the current scale factor + * @returns bbox for the span on the cell + */ +export function getAdjustedCellByOffsetByDom( + cell: TextLayoutCell, + textSpan: TextSpan, + spanElement: HTMLElement, + scale: number +): Bbox | null { + if (!(spanElement.firstChild instanceof Text) || !(spanElement.lastChild instanceof Text)) { + debug('unexpected. span dont have text node'); + return null; + } + + const beginOffset = textSpan[START]; + const endOffset = Math.min(cell.text.length, textSpan[END]); + + try { + const { textNode: beginTextNode, textOffset: beginTextOffset } = + beginOffset > 0 + ? getTextNodeAndOffset(spanElement, beginOffset) + : { textNode: spanElement.firstChild, textOffset: 0 }; + const { textNode: endTextNode, textOffset: endTextOffset } = + endOffset > 0 + ? 
getTextNodeAndOffset(spanElement, endOffset) + : { textNode: spanElement.lastChild, textOffset: spanElement.lastChild.length }; + + debug('finding text node for: ', cell.text); + debug(' textContent: ', beginTextNode.textContent); + debug(' beginOffset: ', beginTextOffset); + debug(' textContent: ', endTextNode.textContent); + debug(' endOffset: ', endTextOffset); + + // create highlight rect(s) inside of a field + let [left, top, right, bottom] = cell.bbox; + + const parentRect = spanElement.parentElement?.getBoundingClientRect(); + forEachRectInRange(beginTextNode, beginTextOffset, endTextNode, endTextOffset, rect => { + left = (rect.left - parentRect!.left) / scale; + right = left + rect.width / scale; + }); + + return [left, top, right, bottom]; + } catch (e) { + debug('Caught exception on calculating bbox from DOM: ', e); + } + return null; +} diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts new file mode 100644 index 000000000..cfef239cf --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/index.ts @@ -0,0 +1,3 @@ +export { HtmlBboxTextLayout } from './HtmlBboxTextLayout'; +export { PdfTextContentTextLayout } from './PdfTextContentTextLayout'; +export { TextMappingsTextLayout } from './TextMappingsTextLayout'; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts new file mode 100644 index 000000000..3dc3e9776 --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/components/PdfViewerHighlight/utils/textLayout/types.ts @@ -0,0 +1,97 @@ 
+import { TextMappings } from 'components/DocumentPreview/types'; +import { PDFPageViewport, TextContent } from 'pdfjs-dist'; +import { ProcessedDoc } from 'utils/document'; +import { Bbox, DocumentFields, TextSpan } from '../../types'; + +/** + * Text layout information + */ +export interface TextLayout { + /** + * cells, pairs of bbox and text, of this text layout + */ + readonly cells: CellType[]; + + /** + * get cell by ID + */ + cellAt(id: CellType['id']): CellType; +} + +/** + * Text layout cell. A text and its bbox. + */ +export interface TextLayoutCell extends TextLayoutCellBase { + readonly parent: TextLayout; + + /** + * ID to identify this cell in its parent text layout + */ + readonly id: IDType; + + /** + * text of this cell + */ + readonly text: string; + + readonly pageNum: number; + readonly bbox: Bbox; + + /** + * get bbox for the given text span. + * @returns null when it's not available + */ + getBboxForTextSpan(span: TextSpan, options?: { useRatio?: boolean }): Bbox | null; + + /** + * a special property for PDF text content item cell. True when this cell overlaps an HTML cell + */ + readonly isInHtmlBbox?: boolean; +} + +/** + * Generic text layout cell. Bbox may not be directly available. + * Mainly for sub-string of a text layout cell. 
+ */ +export interface TextLayoutCellBase { + /** + * text of this cell + */ + readonly text: string; + + /** + * get sub-span of this text layout + */ + getPartial(span: TextSpan): TextLayoutCellBase; + + /** + * get normalized form, the base text layout cell and a span on it + */ + getNormalized(): { cell: TextLayoutCell; span?: TextSpan }; + + /** + * get cell for the trimmed text + */ + trim(): TextLayoutCellBase; +} + +/** + * Information to create HtmlBboxTextLayout + */ +export type HtmlBboxInfo = Pick; + +/** + * Information to create PdfTextContentTextLayout + */ +export type PdfTextContentInfo = { + textContent: TextContent; + viewport: PDFPageViewport; +}; + +/** + * Information to create TextMappingsTextLayout + */ +export type TextMappingInfo = { + document: DocumentFields; + textMappings: TextMappings; +}; diff --git a/packages/discovery-react-components/src/components/DocumentPreview/types.ts b/packages/discovery-react-components/src/components/DocumentPreview/types.ts index e72598074..686203d42 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/types.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/types.ts @@ -6,6 +6,9 @@ export interface TextMappings { // [ left, top, right, bottom ] export type Bbox = [number, number, number, number]; +// [ start (inclusive), end (exclusive) ] +export type TextSpan = [number, number]; + export type Origin = 'TopLeft' | 'BottomLeft'; export interface Page { @@ -32,8 +35,7 @@ export interface CellPage { export interface CellField { name: string; index: number; - // [ START, END ] - span: [number, number]; + span: TextSpan; } export interface StyledCell extends CellPage { diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts index 4c3bfeb78..e76b5f828 100644 --- 
a/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/box.test.ts @@ -1,4 +1,4 @@ -import { findMatchingBbox } from '../box'; +import { findMatchingBbox, bboxesIntersect } from '../box'; import { CellPage } from '../../types'; const originalDocBbox = [ { @@ -329,4 +329,21 @@ describe('box', () => { ]; expect(findMatchingBbox(originalDocBbox[1] as CellPage, processedDocBbox)).toEqual(result); }); + + describe('bboxesIntersect', () => { + it('should return true when boxes intersect', () => { + expect(bboxesIntersect([10, 10, 20, 20], [15, 15, 25, 25])).toBeTruthy(); + }); + + it("should return false when boxes don't intersect", () => { + expect(bboxesIntersect([10, 10, 20, 20], [15, 25, 25, 35])).toBeFalsy(); + }); + + it('should return false when one box is on another', () => { + expect(bboxesIntersect([10, 10, 20, 20], [20, 10, 30, 20])).toBeFalsy(); + expect(bboxesIntersect([10, 10, 20, 20], [0, 10, 10, 20])).toBeFalsy(); + expect(bboxesIntersect([10, 10, 20, 20], [10, 20, 20, 30])).toBeFalsy(); + expect(bboxesIntersect([10, 10, 20, 20], [10, 0, 20, 10])).toBeFalsy(); + }); + }); }); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/textSpan.test.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/textSpan.test.ts new file mode 100644 index 000000000..a1f7d18ad --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/__tests__/textSpan.test.ts @@ -0,0 +1,135 @@ +import { + spanCompare, + spanContains, + spanFromSubSpan, + spanGetSubSpan, + spanGetText, + spanIncludesIndex, + spanIntersection, + spanIntersects, + spanLen +} from '../textSpan'; +import { TextSpan } from '../../types'; + +describe('spanGetText', () => { + it('should return valid span text', () => { + expect(spanGetText('0123456789', [3, 5])).toBe('34'); + 
expect(spanGetText('0123456789', [0, 10])).toBe('0123456789'); + }); + it('should return null for null text', () => { + expect(spanGetText(null, [3, 5])).toBe(null); + }); + it('should return empty text for empty or negative span', () => { + expect(spanGetText('0123456789', [0, 0])).toBe(''); + expect(spanGetText('0123456789', [5, 3])).toBe(''); + }); + it('should return text span for negative or large indices', () => { + expect(spanGetText('0123456789', [-10, 20])).toBe('0123456789'); + }); +}); + +describe('spanLen', () => { + it('should return span length', () => { + expect(spanLen([5, 10])).toBe(5); + expect(spanLen([10, 10])).toBe(0); + }); + it('should return zero for negative spans', () => { + expect(spanLen([10, 5])).toBe(0); + }); +}); + +describe('spanIntersects', () => { + it('should properly distinguish span intersection', () => { + expect(spanIntersects([10, 19], [20, 30])).toBeFalsy(); + expect(spanIntersects([10, 20], [20, 30])).toBeFalsy(); + expect(spanIntersects([10, 21], [20, 30])).toBeTruthy(); + expect(spanIntersects([29, 40], [20, 30])).toBeTruthy(); + expect(spanIntersects([30, 41], [20, 30])).toBeFalsy(); + expect(spanIntersects([31, 40], [20, 30])).toBeFalsy(); + + expect(spanIntersects([25, 26], [20, 30])).toBeTruthy(); + + expect(spanIntersects([20, 30], [10, 19])).toBeFalsy(); + expect(spanIntersects([20, 30], [10, 20])).toBeFalsy(); + expect(spanIntersects([20, 30], [10, 21])).toBeTruthy(); + expect(spanIntersects([20, 30], [29, 40])).toBeTruthy(); + expect(spanIntersects([20, 30], [30, 41])).toBeFalsy(); + expect(spanIntersects([20, 30], [31, 40])).toBeFalsy(); + }); +}); + +describe('spanIncludesIndex', () => { + it('should return true for indices inside a span', () => { + expect(spanIncludesIndex([10, 20], 10)).toBeTruthy(); + expect(spanIncludesIndex([10, 20], 15)).toBeTruthy(); + expect(spanIncludesIndex([10, 20], 19)).toBeTruthy(); + }); + it('should return false for indices outside a span', () => { + expect(spanIncludesIndex([10, 
20], 9)).toBeFalsy(); + expect(spanIncludesIndex([10, 20], 20)).toBeFalsy(); + expect(spanIncludesIndex([10, 20], 21)).toBeFalsy(); + }); +}); + +describe('spanContains', () => { + it('should return true when a span contains other span', () => { + expect(spanContains([10, 20], [15, 18])).toBeTruthy(); + expect(spanContains([10, 20], [10, 18])).toBeTruthy(); + expect(spanContains([10, 20], [15, 20])).toBeTruthy(); + }); + it("should return true when a span doesn't contain other span", () => { + expect(spanContains([10, 20], [9, 10])).toBeFalsy(); + expect(spanContains([10, 20], [9, 18])).toBeFalsy(); + expect(spanContains([10, 20], [15, 21])).toBeFalsy(); + expect(spanContains([10, 20], [21, 30])).toBeFalsy(); + }); +}); + +describe('spanIntersection', () => { + it('should return span intersection', () => { + expect(spanIntersection([10, 20], [15, 18])).toEqual([15, 18]); + expect(spanIntersection([10, 20], [10, 18])).toEqual([10, 18]); + expect(spanIntersection([10, 20], [15, 25])).toEqual([15, 20]); + }); + it('should return a span when the span is contained in another span', () => { + const a = [10, 20] as TextSpan; + expect(spanIntersection(a, [0, 30])).toBe(a); + expect(spanIntersection(a, [10, 21])).toBe(a); + expect(spanIntersection([0, 30], a)).toBe(a); + expect(spanIntersection([10, 20], a)).toBe(a); + }); +}); + +describe('spanFromSubSpan', () => { + it('should return a span that represents a sub-span (span in span) in a base span', () => { + expect(spanFromSubSpan([10, 20], [0, 5])).toEqual([10, 15]); + expect(spanFromSubSpan([10, 20], [5, 10])).toEqual([15, 20]); + expect(spanFromSubSpan([10, 20], [5, 20])).toEqual([15, 20]); + }); +}); + +describe('spanGetSubSpan', () => { + it('should return a span on a base span', () => { + expect(spanGetSubSpan([10, 20], [10, 15])).toEqual([0, 5]); + expect(spanGetSubSpan([10, 20], [15, 20])).toEqual([5, 10]); + }); + it('should return an empty span when given spans has no intersection', () => { + 
expect(spanLen(spanGetSubSpan([10, 20], [0, 5]))).toBe(0); + expect(spanLen(spanGetSubSpan([10, 20], [20, 25]))).toBe(0); + }); +}); + +describe('spanCompare', () => { + it('should return zero for same spans', () => { + expect(spanCompare([0, 0], [0, 0])).toBe(0); + expect(spanCompare([10, 20], [10, 20])).toBe(0); + }); + it('should return negative for spans before another', () => { + expect(spanCompare([10, 20], [11, 20]) < 0).toBeTruthy(); + expect(spanCompare([10, 20], [10, 21]) < 0).toBeTruthy(); + }); + it('should return positive for spans after another', () => { + expect(spanCompare([10, 20], [9, 20]) > 0).toBeTruthy(); + expect(spanCompare([10, 20], [10, 19]) > 0).toBeTruthy(); + }); +}); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts index 99d01bdf7..33005c151 100644 --- a/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/box.ts @@ -7,10 +7,16 @@ import { ProcessedBbox } from '../../../utils/document/processDoc'; * @param boxB second bbox * @returns bool */ -function intersects(boxA: number[], boxB: number[]): boolean { +export function bboxesIntersect(boxA: number[], boxB: number[]): boolean { const [leftA, topA, rightA, bottomA, pageA] = boxA; const [leftB, topB, rightB, bottomB, pageB] = boxB; - return !(leftB > rightA || rightB < leftA || topB > bottomA || bottomB < topA || pageA !== pageB); + return !( + leftB >= rightA || + rightB <= leftA || + topB >= bottomA || + bottomB <= topA || + pageA !== pageB + ); } /** @@ -22,7 +28,7 @@ export const findMatchingBbox = (docBox: CellPage, htmlBox: ProcessedBbox[]) => return htmlBox.filter(pBbox => { const { left, top, right, bottom, page } = pBbox; const [left2, top2, right2, bottom2] = docBox.bbox; - return intersects( + return bboxesIntersect( [left2, top2, right2, bottom2, 
docBox.page_number], [left, top, right, bottom, page] ); diff --git a/packages/discovery-react-components/src/components/DocumentPreview/utils/textSpan.ts b/packages/discovery-react-components/src/components/DocumentPreview/utils/textSpan.ts new file mode 100644 index 000000000..2b0f543bf --- /dev/null +++ b/packages/discovery-react-components/src/components/DocumentPreview/utils/textSpan.ts @@ -0,0 +1,109 @@ +import { TextSpan } from '../types'; + +export const START = 0; +export const END = 1; + +/** + * Check whether two spans intersect or not + * TextSpan version of spansIntersect in utils/document/documentUtils.ts + */ +export function spanIntersects([beginA, endA]: TextSpan, [beginB, endB]: TextSpan): boolean { + // TODO: integrate with spansIntersect in documentUtils.ts + // currently, that function returns true for spansIntersect([1,2], [0,1]) + // which is expected to be false here. And fixing it results in test errors. + // We need to further investigate whether we can fix spansIntersect. + return beginA < endB && endA > beginB; +} + +/** + * Get text for a given span + */ +export function spanGetText( + text: T, + span: TextSpan +): string | T { + if (!text) return text; + if (spanLen(span) === 0) return ''; + return text.substring(span[START], span[END]); +} + +/** + * Get span length + */ +export function spanLen(span: TextSpan): number { + return Math.max(0, span[END] - span[START]); +} + +/** + * Check whether a span includes a given character index or not + */ +export function spanIncludesIndex([begin, end]: TextSpan, index: number): boolean { + return begin <= index && index < end; +} + +/** + * Check whether a span contains another span + * (i.e. 
for all index in `other` span, the index is in `span` span) + */ +export function spanContains(span: TextSpan, other: TextSpan): boolean { + return span[START] <= other[START] && other[END] <= span[END]; +} + +/** + * Get the largest span that is contained by both of given spans + * @returns intersection of two spans when the two spans intersects. Zero-length span otherwise. + */ +export function spanIntersection(a: TextSpan, b: TextSpan): TextSpan { + if (spanContains(a, b)) return b; + if (spanContains(b, a)) return a; + const start = Math.max(a[START], b[START]); + const end = Math.min(a[END], b[END]); + return [start, start <= end ? end : start]; +} + +/** + * Get the smallest span that contains both of given spans + */ +export function spanMerge(a: TextSpan, b: TextSpan): TextSpan { + if (spanContains(a, b) || spanLen(b) === 0) return a; + if (spanContains(b, a) || spanLen(a) === 0) return b; + const start = Math.min(a[START], b[START]); + const end = Math.max(a[END], b[END]); + return [start, start <= end ? 
end : start]; +} + +/** + * Offset spans by given offset + */ +export function spanOffset([start, end]: TextSpan, offset: number): TextSpan { + return [start + offset, end + offset]; +} + +/** + * Get a span from a `subSpan` on a given `base` span + * + * For example, `spanFromSubSpan([10, 20], [1, 2]) // [11, 12]` + */ +export function spanFromSubSpan(base: TextSpan, subSpan: TextSpan): TextSpan { + return spanIntersection(base, spanOffset(subSpan, base[START])); +} + +/** + * Get a span within a given `base` span for a `span` + * + * For example, `spanGetSubSpan([10, 20], [11, 12]) // [1, 2]` + */ +export function spanGetSubSpan(base: TextSpan, span: TextSpan): TextSpan { + return spanOffset(spanIntersection(base, span), -base[START]); +} + +/** + * Compare method for spans + * + * @param spanA a span to compare + * @param spanB another span to compare + * @returns a positive number when spanA is after spanB, a negative number when spanA is before spanB, zero when spanA equals to spanB + */ +export function spanCompare([startA, endA]: TextSpan, [startB, endB]: TextSpan): number { + return startA === startB ? 
endA - endB : startA - startB; +} diff --git a/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx b/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx index ad894bfb8..15c424b87 100644 --- a/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx +++ b/packages/discovery-react-components/src/utils/document/__tests__/processDoc.spec.tsx @@ -219,3 +219,32 @@ describe('processDoc', () => { expect(doc.tables![2].bboxes[0]).toEqual(bboxData); }); }); + +describe('processDoc', () => { + let doc: ProcessedDoc; + + beforeAll(async () => { + // parse doc for use in tests + doc = await processDoc(contractData.results[0], { bbox: true, bboxInnerText: true }); + }); + + it('successfully picks up bboxes', () => { + expect(doc.bboxes).toHaveLength(1584); + }); + + it('successfully picks up bbox text source', () => { + expect(doc.bboxes).toHaveLength(1584); + + // + // On 22 December 2008 ART EFFECTS LIMITED and Customer entered into an Information Technology Procurement Framework Agreement ("the + // + expect(doc.bboxes[0].innerTextSource).toEqual( + 'On 22 December 2008 ART EFFECTS LIMITED and Customer entered into an Information Technology Procurement Framework Agreement ("the ' + ); + expect(doc.bboxes[0].innerTextLocation).toEqual({ begin: 2530, end: 2660 }); + + // <Enter Amendment Text> + expect(doc.bboxes[1490].innerTextSource).toEqual('<Enter Amendment Text> '); + expect(doc.bboxes[1490].innerTextLocation).toEqual({ begin: 442990, end: 443016 }); + }); +}); diff --git a/packages/discovery-react-components/src/utils/document/documentUtils.ts b/packages/discovery-react-components/src/utils/document/documentUtils.ts index d82fa9b35..ebadd1cd0 100644 --- a/packages/discovery-react-components/src/utils/document/documentUtils.ts +++ b/packages/discovery-react-components/src/utils/document/documentUtils.ts @@ -144,11 +144,6 @@ export function createFieldRects({ endTextNode, 
endOffset }: CreateFieldRectsProps): void { - // create a Range for each field - const range = document.createRange(); - range.setStart(beginTextNode, Math.min(beginOffset, beginTextNode.length)); - range.setEnd(endTextNode, Math.min(endOffset, endTextNode.length)); - // create a field container const fieldNode = document.createElement('div'); fieldNode.className = 'field'; @@ -158,21 +153,45 @@ export function createFieldRects({ fragment.appendChild(fieldNode); // create highlight rect(s) inside of a field - Array.prototype.forEach.call(uniqRects(range.getClientRects() as DOMRectList), rect => { + forEachRectInRange(beginTextNode, beginOffset, endTextNode, endOffset, rect => { const div = document.createElement('div'); div.className = 'field--rect'; div.setAttribute('data-testid', 'field-rect'); div.setAttribute( 'style', `top: ${rect.top - parentRect.top}px; - left: ${rect.left - parentRect.left}px; - width: ${rect.width}px; - height: ${rect.height}px;` + left: ${rect.left - parentRect.left}px; + width: ${rect.width}px; + height: ${rect.height}px;` ); fieldNode.appendChild(div); }); } +/** + * Iterate over all the DOMRects for a range + * @param beginTextNode + * @param beginOffset + * @param endTextNode + * @param endOffset + * @param callback a callback invoked with each DOMRect in a range + */ +export function forEachRectInRange( + beginTextNode: Text, + beginOffset: number, + endTextNode: Text, + endOffset: number, + callback: (rect: DOMRect) => any +) { + // create a Range + const range = document.createRange(); + range.setStart(beginTextNode, Math.min(beginOffset, beginTextNode.length)); + range.setEnd(endTextNode, Math.min(endOffset, endTextNode.length)); + + // visit rects in the range + Array.prototype.forEach.call(uniqRects(range.getClientRects() as DOMRectList), callback); +} + // Some browsers (Chrome, Safari) return duplicate rects export function uniqRects(rects: DOMRectList): Partial { return uniqWith( diff --git 
a/packages/discovery-react-components/src/utils/document/processDoc.ts b/packages/discovery-react-components/src/utils/document/processDoc.ts index a39b42879..aca49147d 100644 --- a/packages/discovery-react-components/src/utils/document/processDoc.ts +++ b/packages/discovery-react-components/src/utils/document/processDoc.ts @@ -28,6 +28,7 @@ interface Options { sections?: boolean; tables?: boolean; bbox?: boolean; + bboxInnerText?: boolean; itemMap?: boolean; } @@ -66,6 +67,8 @@ export interface ProcessedBbox { page: number; className: string; location: Location; + innerTextSource?: string; + innerTextLocation?: Location; } export interface Table { @@ -130,7 +133,7 @@ export async function processDoc( const parser = new SaxParser(); // setup initial parsing handling - setupDocParser(parser, doc); + setupDocParser(parser, doc, options); const htmlContent = Array.isArray(html) ? html[0] : html; @@ -145,7 +148,7 @@ export async function processDoc( return doc; } -function setupDocParser(parser: SaxParser, doc: ProcessedDoc): void { +function setupDocParser(parser: SaxParser, doc: ProcessedDoc, options: Options): void { parser.pushState({ onopentag: (_: Parser, tagName: string): void => { /* eslint-disable-next-line default-case */ @@ -155,7 +158,7 @@ function setupDocParser(parser: SaxParser, doc: ProcessedDoc): void { break; } case 'body': { - setupBodyParser(parser, doc); + setupBodyParser(parser, doc, options); break; } } @@ -189,11 +192,11 @@ function setupStyleParser(parser: SaxParser, doc: ProcessedDoc): void { }); } -function setupBodyParser(parser: SaxParser, doc: ProcessedDoc): void { +function setupBodyParser(parser: SaxParser, doc: ProcessedDoc, options: Options): void { parser.pushState({ onopentag: (p: Parser, tagName: string, attributes: Attributes): void => { if (SECTION_NAMES.includes(tagName)) { - setupSectionParser(parser, doc, tagName, attributes, p.startIndex, p); + setupSectionParser(parser, doc, tagName, attributes, p.startIndex, p, options); } } 
}); @@ -205,7 +208,8 @@ function setupSectionParser( sectionTagName: string, sectionTagAttrs: Attributes, sectionStartIndex: number, - sectionParser: Parser + sectionParser: Parser, + options: Options ): void { let lastClassName = ''; let currentTable: Table | null = null; @@ -283,6 +287,13 @@ function setupSectionParser( if (doc.bboxes) { doc.bboxes.push(currentBbox); } + if (options.bboxInnerText) { + currentBbox.innerTextSource = ''; + currentBbox.innerTextLocation = { + begin: p.endIndex != null ? p.endIndex + 1 : -1, + end: -1 + }; + } if (currentTable && doc.tables) { currentTable.bboxes.push(currentBbox); } @@ -309,6 +320,10 @@ function setupSectionParser( ); } + if (currentBbox && options.bboxInnerText) { + currentBbox.innerTextSource += text; + } + sectionHtml.push(text); }, @@ -335,6 +350,15 @@ function setupSectionParser( if (doc.bboxes && tagName === BBOX_TAG && currentBbox) { currentBbox.location.end = getChildEndFromCloseTag(p); + + if (options.bboxInnerText && currentBbox.innerTextLocation) { + currentBbox.innerTextLocation.end = getChildEndFromCloseTag(p); + if (currentBbox.innerTextLocation.end < 0 && currentBbox.innerTextSource != null) { + currentBbox.innerTextLocation.begin = + currentBbox.innerTextLocation.end - currentBbox.innerTextSource.length; + } + } + currentBbox = null; } diff --git a/packages/discovery-react-components/src/utils/nonEmpty.ts b/packages/discovery-react-components/src/utils/nonEmpty.ts new file mode 100644 index 000000000..be6fb569f --- /dev/null +++ b/packages/discovery-react-components/src/utils/nonEmpty.ts @@ -0,0 +1,9 @@ +/** + * A filter to drop any null or undefined values from a list. + * Use with `Array.filter` method to get a list of non-null type. 
+ * + * `const list: number[] = [1, null, 2].filter(nonEmpty); // [1,2]` + */ +export function nonEmpty(value: T | null | undefined): value is T { + return value !== null && value !== undefined; +} diff --git a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss index dce987167..28a1d3544 100644 --- a/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss +++ b/packages/discovery-styles/scss/components/document-preview/_document-preview-pdf-viewer.scss @@ -1,4 +1,6 @@ @import './pdfjs_web_mixins'; +@import '../../vars'; +@import './mixins'; .#{$prefix}--document-preview-pdf-viewer { position: relative; @@ -12,3 +14,16 @@ .#{$prefix}--document-preview-pdf-viewer--text { transform-origin: left top 0px; } + +.#{$prefix}--document-preview-pdf-viewer-highlight { + position: absolute; + transform-origin: left top 0px; + top: 0; + left: 0; +} + +.#{$prefix}--document-preview-pdf-viewer-highlight--item { + position: absolute; + opacity: 0.3; + background: darken($highlight, 30%); +} diff --git a/yarn.lock b/yarn.lock index b047d1520..cf9abdb5f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2267,7 +2267,7 @@ __metadata: languageName: node linkType: hard -"@ibm-watson/discovery-react-components@^1.5.0-beta.3, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": +"@ibm-watson/discovery-react-components@^1.5.0-beta.4, @ibm-watson/discovery-react-components@workspace:packages/discovery-react-components": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-react-components@workspace:packages/discovery-react-components" dependencies: @@ -2305,7 +2305,7 @@ __metadata: languageName: unknown linkType: soft -"@ibm-watson/discovery-styles@^1.5.0-beta.2, @ibm-watson/discovery-styles@workspace:packages/discovery-styles": +"@ibm-watson/discovery-styles@^1.5.0-beta.4, 
@ibm-watson/discovery-styles@workspace:packages/discovery-styles": version: 0.0.0-use.local resolution: "@ibm-watson/discovery-styles@workspace:packages/discovery-styles" dependencies: @@ -9102,7 +9102,7 @@ __metadata: languageName: node linkType: hard -"core-js@npm:^2.4.0": +"core-js@npm:^2.4.0, core-js@npm:^2.6.12": version: 2.6.12 resolution: "core-js@npm:2.6.12" checksum: 44fa9934a85f8c78d61e0c8b7b22436330471ffe59ec5076fe7f324d6e8cf7f824b14b1c81ca73608b13bdb0fef035bd820989bf059767ad6fa13123bb8bd016 @@ -10260,12 +10260,13 @@ __metadata: resolution: "discovery-search-app@workspace:examples/discovery-search-app" dependencies: "@carbon/icons": ^10.5.0 - "@ibm-watson/discovery-react-components": ^1.5.0-beta.3 - "@ibm-watson/discovery-styles": ^1.5.0-beta.2 + "@ibm-watson/discovery-react-components": ^1.5.0-beta.4 + "@ibm-watson/discovery-styles": ^1.5.0-beta.4 body-parser: ^1.19.0 carbon-components: ^10.6.0 carbon-components-react: ^7.7.0 classnames: ^2.2.6 + core-js: ^2.6.12 cors: ^2.8.5 cross-env: ^7.0.3 dotenv: ^8.1.0