-
Notifications
You must be signed in to change notification settings - Fork 33
feat: add PDF viewer with highlighting #238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 50 commits
5e1902c
824c2d7
c5e09cb
9f0dcd5
cc06446
de5731f
f64fa07
85e3538
3c08df9
c722221
69ff042
c0771fc
66c7c56
47855f4
66b3e43
e43318a
1d46e11
9b33674
73c2ff8
050db2d
f62c419
3e06fda
b1defdd
7a6ef58
3d02caf
3b810af
488b160
dd9d5a0
5811f4f
17feb71
158b8a2
0705d2e
9e9cad5
6ba0703
052336a
e081c92
68d895d
8c82fae
2c28bd9
032842c
58ec1c5
0ccaf6c
3eedcf8
c13942b
a9dd38e
f6fbcd2
dc09472
0967b05
bb448d5
d651ea8
fbac5ee
e1fe864
0911538
457f2ec
89e4f75
21799dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| { | ||
| "document_id": "feab8705259090b89fbcbb15942cb10d", | ||
| "result_metadata": { | ||
| "collection_id": "b6cdf1cd-902c-8ea3-0000-017d32224d8f" | ||
| }, | ||
| "enriched_text": [ | ||
| { | ||
| "entities": [ | ||
| { | ||
| "model_name": "natural_language_understanding", | ||
| "mentions": [ | ||
| { | ||
| "confidence": 0.9950965, | ||
| "location": { | ||
| "end": 2, | ||
| "begin": 0 | ||
| }, | ||
| "text": "最初" | ||
| } | ||
| ], | ||
| "text": "最初", | ||
| "type": "Ordinal" | ||
| } | ||
| ] | ||
| } | ||
| ], | ||
| "metadata": { | ||
| "parent_document_id": "feab8705259090b89fbcbb15942cb10d", | ||
| "customer_id": "IBMid-270001M55T" | ||
| }, | ||
| "extracted_metadata": { | ||
| "sha1": "4FF2B41ED7A77975ABB21D9E4025DF31335E6451", | ||
| "numPages": "1", | ||
| "filename": "DiscoComponents-ja-updated.pdf", | ||
| "file_type": "pdf", | ||
| "text_mappings": "{\"text_mappings\":[{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,87.82411193847656,400.4930725097656,194.260009765625]},\"field\":{\"name\":\"title\",\"index\":0,\"span\":[0,20]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,411.83612060546875,262.9510192871094,425.62003993988037]},\"field\":{\"name\":\"subtitle\",\"index\":0,\"span\":[0,19]}},{\"page\":{\"page_number\":1,\"bbox\":[268.46466064453125,416.1183776855469,325.5726318359375,425.375319480896]},\"field\":{\"name\":\"subtitle\",\"index\":1,\"span\":[0,3]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,644.3582763671875,313.07745361328125,653.6152181625366]},\"field\":{\"name\":\"subtitle\",\"index\":2,\"span\":[0,15]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,456.12786865234375,95.6172866821289,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[0,4]}},{\"page\":{\"page_number\":1,\"bbox\":[100.0745620727539,452.9471435546875,257.0570983886719,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[4,27]}},{\"page\":{\"page_number\":1,\"bbox\":[261.5120849609375,452.9471435546875,408.1592712402344,463.0600233078003]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[27,49]}},{\"page\":{\"page_number\":1,\"bbox\":[412.5315856933594,456.12786865234375,464.3571472167969,463.06002855300903]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[49,54]}},{\"page\":{\"page_number\":1,\"bbox\":[54.51987838745117,452.9471435546875,534.0211791992188,596.2600049972534]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[54,234]}},{\"page\":{\"page_number\":1,\"bbox\":[54.519996643066406,679.4979858398438,535.1033325195312,723.2200269699097]},\"field\":{\"name\":\"text\",\"index\":0,\"span\":[234,353]}}],\"pages\":[{\"page_number\":0,\"height\":842.0,\"width\":595.0,\"origin\":\"TopLeft\"}]}", | ||
| "title": "Discovery Component README Japanese", | ||
| "publicationdate": "2021-11-18" | ||
| }, | ||
| "subtitle": ["Discovery Component", "の使用", "サンプルアプリケーションの実行"], | ||
| "html": "<html><head><meta charset=\"UTF-8\"/><meta name=\"publicationdate\" content=\"2021-11-18\"/><meta name=\"numPages\" content=\"1\"/><title>Discovery Component README Japanese</title><style>.css_1902558513 { font: bold 18.96pt '/MS-PGothic-Bold'; } .css_904416330 { font: 11.04pt '/SymbolMT'; } .css_1548729052 { font: bold 18.96pt '/Tahoma-Bold-Bold'; } .css_2121319508 { font: bold 54.96pt '/Tahoma-Bold-Bold'; } .css_1950597664 { font: 13.92pt '/Tahoma'; } .css_1579914921 { font: 13.92pt '/MS-PGothic'; }</style></head><body><section id=\"1\" data-level=\"1\"><p text-alignment=\"left\"><span class=\"title css_2121319508\"><bbox page=\"1\" x=\"54.51987838745117\" y=\"87.82411193847656\" height=\"106.43589782714844\" width=\"345.97319412231445\">Discovery Components</bbox></span></p><p text-alignment=\"left\"><span class=\"subtitle css_1548729052\"><bbox page=\"1\" x=\"54.51987838745117\" y=\"411.83612060546875\" height=\"13.783919334411621\" width=\"208.4311408996582\">Discovery Component</bbox></span></p><p text-alignment=\"left\"><span class=\"subtitle css_1902558513\"><bbox page=\"1\" x=\"268.46466064453125\" y=\"416.1183776855469\" height=\"9.256941795349121\" width=\"57.10797119140625\">の使用</bbox></span></p><p text-alignment=\"left\"><span class=\"text css_1579914921\"><bbox page=\"1\" x=\"54.51987838745117\" y=\"456.12786865234375\" height=\"6.932159900665283\" width=\"41.097408294677734\">最初に</bbox></span></p><p text-alignment=\"left\"><span class=\"text css_1950597664\"><bbox page=\"1\" x=\"100.0745620727539\" y=\"452.9471435546875\" height=\"10.112884998321533\" width=\"156.98253631591797\">IBM Watson Discovery の</bbox></span></p><p text-alignment=\"left\"><span class=\"text css_1950597664\"><bbox page=\"1\" x=\"261.5120849609375\" y=\"452.9471435546875\" height=\"10.112879753112793\" width=\"146.64718627929688\">Improve and Customize</bbox></span></p><p text-alignment=\"left\"><span class=\"text css_1579914921\"><bbox page=\"1\" 
x=\"412.5315856933594\" y=\"456.12786865234375\" height=\"6.932159900665283\" width=\"51.8255615234375\">ページで</bbox></span></p><p text-alignment=\"left\"><span class=\"text css_1950597664\"><bbox page=\"1\" x=\"54.51987838745117\" y=\"452.9471435546875\" height=\"143.31286144256592\" width=\"479.5013008117676\">Document retrieval プロジェクトをカスタマイズする必要があります。たとえばファセットや検索 バーや検索結果を設定できます。その後 Discovery component を使ったアプリケ ーションを作成します。アプリケーションは指定したプロジェクトの設定をロードしま す。 必要なソフトウェア: git, nvm, yarn または npm</bbox></span></p><p text-alignment=\"left\"><span class=\"subtitle css_1902558513\"><bbox page=\"1\" x=\"54.51987838745117\" y=\"644.3582763671875\" height=\"9.256941795349121\" width=\"258.5575752258301\">サンプルアプリケーションの実行</bbox></span></p><p text-alignment=\"left\"><span class=\"text css_904416330\"><bbox page=\"1\" x=\"54.519996643066406\" y=\"679.4979858398438\" height=\"43.72204113006592\" width=\"480.58333587646484\">• サンプルアプリケーションはこのライブラリーが提供するコアコンポーネントのカタログです。実際のデ ータを使ってコンポーネントがどのように動くかを簡単に見ることができます。コードを変更して、カスタ マイズする方法を確認することもできます。</bbox></span></p></section></body></html>", | ||
| "text": [ | ||
| "最初に IBM Watson Discovery の Improve and Customize ページで Document retrieval プロジェクトをカスタマイズする必要があります。たとえばファセットや検索 バーや検索結果を設定できます。その後 Discovery component を使ったアプリケ ーションを作成します。アプリケーションは指定したプロジェクトの設定をロードしま す。 必要なソフトウェア: git, nvm, yarn または npm • サンプルアプリケーションはこのライブラリーが提供するコアコンポーネントのカタログです。実際のデ ータを使ってコンポーネントがどのように動くかを簡単に見ることができます。コードを変更して、カスタ マイズする方法を確認することもできます。" | ||
| ], | ||
| "title": "Discovery Components", | ||
| "document_passages": [ | ||
| { | ||
| "passage_text": "Discovery Components", | ||
| "start_offset": 0, | ||
| "end_offset": 20, | ||
| "field": "title" | ||
| } | ||
| ], | ||
| "table_results_references": [] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,157 @@ | ||
| import React, { FC, useMemo, useEffect } from 'react'; | ||
| import cx from 'classnames'; | ||
| import { settings } from 'carbon-components'; | ||
| import { QueryResult } from 'ibm-watson/discovery/v2'; | ||
| import { ProcessedDoc } from 'utils/document'; | ||
| import { TextMappings } from '../../types'; | ||
| import { PdfDisplayProps } from '../PdfViewer/types'; | ||
| import { PdfRenderedText } from '../PdfViewer/PdfViewerTextLayer'; | ||
| import { DocumentFieldHighlight } from './types'; | ||
| import { ExtractedDocumentInfo } from './utils/common/documentUtils'; | ||
| import { Highlighter } from './utils/Highlighter'; | ||
|
|
||
| /** | ||
| * Props accepted by the PDF highlight layer: everything in `PdfDisplayProps` | ||
| * plus the document data and highlight configuration declared below. | ||
| */ | ||
| type Props = PdfDisplayProps & { | ||
| /** | ||
| * Class name applied to the highlight layer container | ||
| */ | ||
| className?: string; | ||
|
|
||
| /** | ||
| * Class name applied to each individual highlight box | ||
| */ | ||
| highlightClassName?: string; | ||
|
|
||
| /** | ||
| * Document data returned by a query | ||
| */ | ||
| document: QueryResult; | ||
|
|
||
| /** | ||
| * Parsed document information, or `null` when it is not available | ||
| */ | ||
| parsedDocument: ExtractedDocumentInfo | null; | ||
|
|
||
| /** | ||
| * Highlight spans on fields in the document | ||
| */ | ||
| highlights: DocumentFieldHighlight[]; | ||
|
|
||
| /** | ||
| * Text content information for a page of the parsed PDF, or `null` when it | ||
| * is not available | ||
| */ | ||
| pdfRenderedText: PdfRenderedText | null; | ||
|
|
||
| /** | ||
| * Whether to use the bbox information from the `html` field of the document. | ||
| * True by default. Intended for testing and debugging only. | ||
| */ | ||
| useHtmlBbox?: boolean; | ||
|
|
||
| /** | ||
| * Whether to use PDF text items when finding the bboxes to highlight. | ||
| * True by default. Intended for testing and debugging only. | ||
| */ | ||
| usePdfTextItem?: boolean; | ||
| }; | ||
|
|
||
| /** | ||
| * Text highlight layer for PdfViewer. | ||
| * | ||
| * Resolves each entry of `highlights` to a list of bounding boxes through a | ||
| * `Highlighter`, then renders one `div` per box whose left/top/width/height | ||
| * are the box coordinates multiplied by `scale`. | ||
| */ | ||
| const PdfViewerHighlight: FC<Props> = ({ | ||
jhpedemonte marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| className, | ||
| highlightClassName, | ||
| document, | ||
| parsedDocument, | ||
| page, | ||
| highlights, | ||
| pdfRenderedText, | ||
| scale, | ||
| useHtmlBbox = true, | ||
| usePdfTextItem = true | ||
| }) => { | ||
| // The two debug flags decide which sources of bbox information the highlighter receives. | ||
| const highlighter = useHighlighter({ | ||
| document, | ||
| textMappings: parsedDocument?.textMappings, | ||
| processedDoc: useHtmlBbox ? parsedDocument?.processedDoc : undefined, | ||
| pdfRenderedText: (usePdfTextItem && pdfRenderedText) || undefined, | ||
| pageNum: page | ||
| }); | ||
|
|
||
| const { textDivs } = pdfRenderedText || {}; | ||
| const highlightBoxes = useMemo(() => { | ||
| // Hand the rendered text divs to the highlighter here rather than in an effect: | ||
| // effects run only after render, i.e. after this memo has already computed the | ||
| // boxes, and a change of `textDivs` alone would never trigger a recompute. | ||
| highlighter?.setTextContentDivs(textDivs); | ||
| return highlights.map(highlight => { | ||
| return highlighter?.getHighlight(highlight); | ||
| }); | ||
| }, [highlighter, highlights, textDivs]); | ||
|
|
||
| return ( | ||
| <div className={cx(`${settings.prefix}--document-preview-pdf-viewer-highlight`, className)}> | ||
| {highlightBoxes.map((hl, hlIndex) => { | ||
| return ( | ||
| <React.Fragment key={`k-${hlIndex}`}> | ||
| {hl?.boxes.map((item, index) => { | ||
| const padding = 0; | ||
| const [left, top, right, bottom] = item.bbox; | ||
| return ( | ||
| <div | ||
| key={`${left}${top}${right}${bottom}_${index}`} | ||
| className={cx( | ||
| `${settings.prefix}--document-preview-pdf-viewer-highlight--item`, | ||
| highlightClassName, | ||
| hl.className | ||
| )} | ||
| style={{ | ||
| left: `${(left - padding) * scale}px`, | ||
| top: `${(top - padding) * scale}px`, | ||
| width: `${(right - left + padding) * scale}px`, | ||
| height: `${(bottom - top + padding) * scale}px` | ||
| }} | ||
| data-testid="highlight" | ||
| /> | ||
| ); | ||
| })} | ||
| </React.Fragment> | ||
| ); | ||
| })} | ||
| </div> | ||
| ); | ||
| }; | ||
|
|
||
| /** | ||
| * Hook that builds a memoized `Highlighter` for one page. | ||
| * | ||
| * Returns `null` while `textMappings` is not available. A new instance is | ||
| * created whenever any of the inputs changes. | ||
| */ | ||
| const useHighlighter = (options: { | ||
| document: QueryResult; | ||
| textMappings?: TextMappings; | ||
| processedDoc?: ProcessedDoc; | ||
| pdfRenderedText?: PdfRenderedText; | ||
| pageNum: number; | ||
| }) => { | ||
| const { document, textMappings, processedDoc, pdfRenderedText, pageNum } = options; | ||
| return useMemo(() => { | ||
| // Without text mappings there is nothing to build a highlighter from. | ||
| if (!textMappings) { | ||
| return null; | ||
| } | ||
| // Only the bbox and style data of the processed document are passed on. | ||
| const htmlBboxInfo = processedDoc && { | ||
| bboxes: processedDoc.bboxes, | ||
| styles: processedDoc.styles | ||
| }; | ||
| // The rendered text is passed on only when both its text content and viewport are set. | ||
| const pdfTextContentInfo = | ||
| pdfRenderedText?.textContent && pdfRenderedText?.viewport ? pdfRenderedText : undefined; | ||
| return new Highlighter({ | ||
| document, | ||
| textMappings, | ||
| pageNum, | ||
| htmlBboxInfo, | ||
| pdfTextContentInfo | ||
| }); | ||
| }, [document, pageNum, pdfRenderedText, processedDoc, textMappings]); | ||
| }; | ||
|
|
||
| export default PdfViewerHighlight; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| /* Flex container with a fixed 800px height; the rules below are nested inside it. */ | ||
| .withTextSelection { | ||
| display: flex; | ||
| height: 800px; | ||
|
||
|
|
||
| /* Flexible pane with a 20% base width that scrolls vertically. */ | ||
| .rightPane { | ||
| flex: 1 1 auto; | ||
| width: 20%; | ||
| overflow-y: scroll; | ||
|
|
||
| p { | ||
| margin-bottom: 0.5rem; | ||
| } | ||
| } | ||
| /* Monospace text that preserves whitespace and breaks long words to fit. */ | ||
| .text { | ||
| overflow-wrap: break-word; | ||
| white-space: pre-wrap; | ||
| font-size: 10pt; | ||
| font-family: 'Courier New', Courier, monospace; | ||
| } | ||
|
|
||
| /* Opaque pink background made translucent by the element's 0.4 opacity. */ | ||
| .highlight { | ||
| opacity: 0.4; | ||
| background: rgba(255, 64, 128, 1); | ||
|
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should make it clearer that these properties aren't meant to be used by end users under normal circumstances. The best I can think of is to prepend an underscore to the name: