Skip to content

Commit 26360c3

Browse files
committed
Add text extractor for an external service
1 parent f6317dd commit 26360c3

File tree

4 files changed

+103
-0
lines changed

4 files changed

+103
-0
lines changed

web/app.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ import { PDFPrintServiceFactory } from "web-print_service";
8989
import { PDFRenderingQueue } from "./pdf_rendering_queue.js";
9090
import { PDFScriptingManager } from "./pdf_scripting_manager.js";
9191
import { PDFSidebar } from "web-pdf_sidebar";
92+
import { PdfTextExtractor } from "./pdf_text_extractor.js";
9293
import { PDFThumbnailViewer } from "web-pdf_thumbnail_viewer";
9394
import { PDFViewer } from "./pdf_viewer.js";
9495
import { Preferences } from "web-preferences";
@@ -129,6 +130,8 @@ const PDFViewerApplication = {
129130
pdfDocumentProperties: null,
130131
/** @type {PDFLinkService} */
131132
pdfLinkService: null,
133+
/** @type {PdfTextExtractor|null} */
134+
pdfTextExtractor: null,
132135
/** @type {PDFHistory} */
133136
pdfHistory: null,
134137
/** @type {PDFSidebar} */
@@ -262,6 +265,8 @@ const PDFViewerApplication = {
262265
}
263266
await this._initializeViewerComponents();
264267

268+
this.pdfTextExtractor = new PdfTextExtractor(this.externalServices);
269+
265270
// Bind the various event handlers *after* the viewer has been
266271
// initialized, to prevent errors if an event arrives too soon.
267272
this.bindEvents();
@@ -1144,6 +1149,7 @@ const PDFViewerApplication = {
11441149
this.pdfViewer.setDocument(null);
11451150
this.pdfLinkService.setDocument(null);
11461151
this.pdfDocumentProperties?.setDocument(null);
1152+
this.pdfTextExtractor?.setDocument(null);
11471153
}
11481154
this.pdfLinkService.externalLinkEnabled = true;
11491155
this.store = null;
@@ -1450,6 +1456,7 @@ const PDFViewerApplication = {
14501456

14511457
const pdfViewer = this.pdfViewer;
14521458
pdfViewer.setDocument(pdfDocument);
1459+
this.pdfTextExtractor.setViewer(pdfViewer);
14531460
const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer;
14541461

14551462
this.pdfThumbnailViewer?.setDocument(pdfDocument);

web/external_services.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ class BaseExternalServices {
3333

3434
reportTelemetry(data) {}
3535

36+
/**
 * Hook for handing extracted text to the embedding environment.
 * The base implementation intentionally does nothing; the Firefox
 * `ExternalServices` subclass (web/firefoxcom.js) overrides it.
 *
 * @param {Object} data - Payload to report; `PdfTextExtractor` passes an
 *   object of the form `{ text, requestId }`.
 */
reportText(data) {}
37+
3638
/**
3739
* @returns {Promise<IL10n>}
3840
*/

web/firefoxcom.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,10 @@ class ExternalServices extends BaseExternalServices {
645645
FirefoxCom.request("reportTelemetry", data);
646646
}
647647

648+
/**
 * Forwards `data` through `FirefoxCom.request` using the "reportText"
 * action name.
 *
 * @param {Object} data - Payload to report; `PdfTextExtractor` passes an
 *   object of the form `{ text, requestId }`.
 */
reportText(data) {
649+
FirefoxCom.request("reportText", data);
650+
}
651+
648652
updateEditorStates(data) {
649653
FirefoxCom.request("updateEditorStates", data);
650654
}

web/pdf_text_extractor.js

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/* Copyright 2024 Mozilla Foundation
2+
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
/**
17+
* This class manages the interaction of extracting the text content of the page
18+
* and passing it back to the external service.
19+
*/
20+
class PdfTextExtractor {
  /**
   * How long (in milliseconds) successfully extracted text stays cached, so
   * that several consumers asking in quick succession share one extraction.
   */
  static #CACHE_DURATION_MS = 5000;

  /** @type {PDFViewer|null} */
  #pdfViewer = null;

  #externalServices;

  /**
   * The in-flight or recently resolved text extraction, if any.
   *
   * @type {?Promise<string>}
   */
  #textPromise = null;

  /**
   * Ids of requests that arrived before a viewer was available.
   *
   * @type {Set<number>}
   */
  #pendingRequests = new Set();

  /**
   * @param {Object} externalServices - Object exposing `reportText(data)`,
   *   used to hand the extracted text back to the requester.
   */
  constructor(externalServices) {
    this.#externalServices = externalServices;

    window.addEventListener("requestTextContent", ({ detail }) => {
      this.#handleRequest(detail.requestId);
    });
  }

  /**
   * The PDF viewer is required to get the page text.
   *
   * Setting the viewer also drops any cached text, since this is called
   * whenever a (new) document has been handed to the viewer.
   *
   * @param {PDFViewer | null} pdfViewer
   */
  setViewer(pdfViewer) {
    this.#pdfViewer = pdfViewer;
    this.#textPromise = null;

    if (!pdfViewer || this.#pendingRequests.size === 0) {
      return;
    }
    // Handle any pending requests that came in while things were loading.
    // Snapshot and clear first so the queue is empty before dispatching.
    const queuedRequests = [...this.#pendingRequests];
    this.#pendingRequests.clear();
    for (const requestId of queuedRequests) {
      this.#handleRequest(requestId);
    }
  }

  /**
   * Notifies the extractor that the active document changed. Any cached text
   * belongs to the previous document and is dropped. When the document is
   * closed (`null`), the viewer is detached as well, so that requests are
   * queued until `setViewer` is called again.
   *
   * @param {PDFDocumentProxy | null} pdfDocument
   */
  setDocument(pdfDocument) {
    this.#textPromise = null;
    if (!pdfDocument) {
      this.#pdfViewer = null;
    }
  }

  /**
   * Builds up all of the text from a PDF and reports it, together with the
   * `requestId`, through the external services. If no viewer is available
   * yet, the request is queued and handled once `setViewer` is called.
   *
   * @param {number} requestId
   * @returns {Promise<void>} Rejects if the text extraction fails.
   */
  async extractTextContent(requestId) {
    if (!this.#pdfViewer) {
      this.#pendingRequests.add(requestId);
      return;
    }

    if (!this.#textPromise) {
      const textPromise = this.#pdfViewer.getAllText();
      this.#textPromise = textPromise;

      const dropFromCache = () => {
        // Only clear the cache if it hasn't been replaced in the meantime.
        if (this.#textPromise === textPromise) {
          this.#textPromise = null;
        }
      };
      textPromise.then(
        () => {
          // After the text resolves, cache the text for a little bit in case
          // multiple consumers call it.
          setTimeout(dropFromCache, PdfTextExtractor.#CACHE_DURATION_MS);
        },
        // Never keep a failed extraction around; the next request retries.
        dropFromCache
      );
    }

    this.#externalServices.reportText({
      text: await this.#textPromise,
      requestId,
    });
  }

  /**
   * Fire-and-forget wrapper around `extractTextContent`, for call-sites that
   * cannot await the result; failures are logged rather than being left as
   * unhandled rejections.
   *
   * @param {number} requestId
   */
  #handleRequest(requestId) {
    this.extractTextContent(requestId).catch(reason => {
      console.error("PdfTextExtractor: unable to extract text.", reason);
    });
  }
}
89+
90+
export { PdfTextExtractor };

0 commit comments

Comments
 (0)