1
1
import { inputData , opt } from '../containers/app.js' ;
2
2
import {
3
- layoutDataTables , layoutRegions , ocrAll , pageMetricsArr ,
3
+ layoutDataTables , layoutRegions , ocrAll , pageMetricsAll ,
4
4
} from '../containers/dataContainer.js' ;
5
5
import { ImageCache } from '../containers/imageContainer.js' ;
6
6
import { reorderOcrPage } from '../modifyOCR.js' ;
7
7
import { saveAs } from '../utils/miscUtils.js' ;
8
- import { writePdf } from './writePdf.js' ;
8
+ import { writePdf } from './pdf/ writePdf.js' ;
9
9
import { writeHocr } from './writeHocr.js' ;
10
10
import { writeText } from './writeText.js' ;
11
11
import { writeHtml } from './writeHtml.js' ;
@@ -45,8 +45,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
45
45
const dimsLimit = { width : - 1 , height : - 1 } ;
46
46
if ( opt . standardizePageSize ) {
47
47
for ( let i = minPage ; i <= maxPage ; i ++ ) {
48
- dimsLimit . height = Math . max ( dimsLimit . height , pageMetricsArr [ i ] . dims . height ) ;
49
- dimsLimit . width = Math . max ( dimsLimit . width , pageMetricsArr [ i ] . dims . width ) ;
48
+ dimsLimit . height = Math . max ( dimsLimit . height , pageMetricsAll [ i ] . dims . height ) ;
49
+ dimsLimit . width = Math . max ( dimsLimit . width , pageMetricsAll [ i ] . dims . width ) ;
50
50
}
51
51
}
52
52
@@ -58,10 +58,30 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
58
58
59
59
const rotateText = ! rotateBackground ;
60
60
61
+ const includeImages = false ;
62
+ /** @type {ImageWrapper[] } */
63
+ let images = [ ] ;
64
+ if ( includeImages ) {
65
+ images = await Promise . all ( ImageCache . nativeSrc ) ;
66
+ }
67
+
61
68
// Page sizes should not be standardized at this step, as the overlayText/overlayTextImage functions will perform this,
62
69
// and assume that the overlay PDF is the same size as the input images.
63
- const pdfStr = await writePdf ( ocrDownload , minPage , maxPage , opt . displayMode , rotateText , rotateBackground ,
64
- { width : - 1 , height : - 1 } , opt . confThreshHigh , opt . confThreshMed , opt . overlayOpacity / 100 ) ;
70
+ const pdfStr = await writePdf ( {
71
+ ocrArr : ocrDownload ,
72
+ pageMetricsArr : pageMetricsAll ,
73
+ minpage : minPage ,
74
+ maxpage : maxPage ,
75
+ textMode : opt . displayMode ,
76
+ rotateText,
77
+ rotateBackground,
78
+ dimsLimit : { width : - 1 , height : - 1 } ,
79
+ confThreshHigh : opt . confThreshHigh ,
80
+ confThreshMed : opt . confThreshMed ,
81
+ proofOpacity : opt . overlayOpacity / 100 ,
82
+ images,
83
+ includeImages,
84
+ } ) ;
65
85
66
86
const enc = new TextEncoder ( ) ;
67
87
const pdfEnc = enc . encode ( pdfStr ) ;
@@ -121,7 +141,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
121
141
122
142
await w . convertImageStart ( { humanReadable : opt . humanReadablePDF } ) ;
123
143
for ( let i = minPage ; i < maxPage + 1 ; i ++ ) {
124
- /** @type {import('../containers/imageContainer.js'). ImageWrapper } */
144
+ /** @type {ImageWrapper } */
125
145
let image ;
126
146
if ( binary ) {
127
147
image = await ImageCache . getBinary ( i , props ) ;
@@ -134,7 +154,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
134
154
// Angle the PDF viewer is instructed to rotate the image by.
135
155
// This method is currently only used when rotation is needed but the user's (unrotated) source images are being used.
136
156
// If the images are being rendered, then rotation is expected to be applied within the rendering process.
137
- const angleImagePdf = rotateBackground && ! renderImage ? ( pageMetricsArr [ i ] . angle || 0 ) * - 1 : 0 ;
157
+ const angleImagePdf = rotateBackground && ! renderImage ? ( pageMetricsAll [ i ] . angle || 0 ) * - 1 : 0 ;
138
158
139
159
await w . convertImageAddPage ( {
140
160
image : image . src , i, pagewidth : dimsLimit . width , pageheight : dimsLimit . height , angle : angleImagePdf ,
@@ -157,8 +177,19 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
157
177
158
178
w . freeDocument ( pdfOverlay ) ;
159
179
} else {
160
- const pdfStr = await writePdf ( ocrDownload , minPage , maxPage , opt . displayMode , false , true , dimsLimit , opt . confThreshHigh , opt . confThreshMed ,
161
- opt . overlayOpacity / 100 ) ;
180
+ const pdfStr = await writePdf ( {
181
+ ocrArr : ocrDownload ,
182
+ pageMetricsArr : pageMetricsAll ,
183
+ minpage : minPage ,
184
+ maxpage : maxPage ,
185
+ textMode : opt . displayMode ,
186
+ rotateText : false ,
187
+ rotateBackground : true ,
188
+ dimsLimit,
189
+ confThreshHigh : opt . confThreshHigh ,
190
+ confThreshMed : opt . confThreshMed ,
191
+ proofOpacity : opt . overlayOpacity / 100 ,
192
+ } ) ;
162
193
163
194
// The PDF is still run through muPDF, even though in eBook mode no background layer is added.
164
195
// This is because muPDF cleans up the PDF we made in the previous step, including:
@@ -186,7 +217,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
186
217
w . freeDocument ( pdf ) ;
187
218
}
188
219
} else if ( format === 'hocr' ) {
189
- content = writeHocr ( ocrDownload , minPage , maxPage ) ;
220
+ content = writeHocr ( { ocrData : ocrDownload , minValue : minPage , maxValue : maxPage } ) ;
190
221
} else if ( format === 'html' ) {
191
222
const images = /** @type {Array<ImageWrapper> } */ ( [ ] ) ;
192
223
if ( opt . includeImages ) {
@@ -218,18 +249,29 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
218
249
ocrPages : ocrDownload , images, minpage : minPage , maxpage : maxPage , reflowText : opt . reflow , removeMargins : opt . removeMargins ,
219
250
} ) ;
220
251
} else if ( format === 'txt' ) {
221
- content = writeText ( ocrDownload , minPage , maxPage , opt . reflow , false ) ;
252
+ content = writeText ( {
253
+ ocrCurrent : ocrDownload ,
254
+ minpage : minPage ,
255
+ maxpage : maxPage ,
256
+ reflowText : opt . reflow ,
257
+ docxMode : false ,
258
+ } ) ;
222
259
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
223
260
// @ts -ignore
224
261
} else if ( typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'docx' ) {
225
262
// Less common export formats are loaded dynamically to reduce initial load time.
226
263
const writeDocx = ( await import ( './writeDocx.js' ) ) . writeDocx ;
227
- content = await writeDocx ( ocrDownload , minPage , maxPage ) ;
264
+ content = await writeDocx ( { hocrCurrent : ocrDownload , minpage : minPage , maxpage : maxPage } ) ;
228
265
// @ts -ignore
229
266
} else if ( typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'xlsx' ) {
230
267
// Less common export formats are loaded dynamically to reduce initial load time.
231
268
const writeXlsx = ( await import ( './writeTabular.js' ) ) . writeXlsx ;
232
- content = await writeXlsx ( ocrDownload , layoutDataTables . pages , minPage , maxPage ) ;
269
+ content = await writeXlsx ( {
270
+ ocrPageArr : ocrDownload ,
271
+ layoutPageArr : layoutDataTables . pages ,
272
+ minpage : minPage ,
273
+ maxpage : maxPage ,
274
+ } ) ;
233
275
} else if ( format === 'scribe' ) {
234
276
const data = {
235
277
ocr : removeCircularRefsOcr ( ocrDownload ) ,
0 commit comments