@@ -45,7 +45,7 @@ def extract(pdfs, opts)
4545
4646 # Does a PDF have any text embedded?
# Shells out to `pdffonts` and checks its combined stdout/stderr output
# against NO_TEXT_DETECTED.
#
# pdf - path of the PDF to inspect.
#
# Returns true when the `pdffonts` output does not match NO_TEXT_DETECTED,
# false when it does.
def contains_text?(pdf)
  # The path is passed through ESCAPE (defined elsewhere; presumably a
  # shell-quoting helper -- confirm) rather than interpolated raw, so that
  # file names containing spaces or shell metacharacters cannot break or
  # alter the command line.
  fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
  !fonts.match(NO_TEXT_DETECTED)
end
5151
@@ -59,19 +59,22 @@ def extract_from_pdf(pdf, pages)
5959 def extract_from_ocr ( pdf , pages )
6060 tempdir = Dir . mktmpdir
6161 base_path = File . join ( @output , @pdf_name )
62+ escaped_pdf = ESCAPE [ pdf ]
6263 if pages
6364 pages . each do |page |
6465 tiff = "#{ tempdir } /#{ @pdf_name } _#{ page } .tif"
66+ escaped_tiff = ESCAPE [ tiff ]
6567 file = "#{ base_path } _#{ page } "
66- run "MAGICK_TMPDIR=#{ tempdir } OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ pdf } [#{ page - 1 } ] #{ tiff } 2>&1"
67- run "tesseract #{ tiff } #{ file } -l eng 2>&1"
68+ run "MAGICK_TMPDIR=#{ tempdir } OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ escaped_pdf } [#{ page - 1 } ] #{ escaped_tiff } 2>&1"
69+ run "tesseract #{ escaped_tiff } #{ ESCAPE [ file ] } -l eng 2>&1"
6870 clean_text ( file + '.txt' ) if @clean_ocr
6971 FileUtils . remove_entry_secure tiff
7072 end
7173 else
7274 tiff = "#{ tempdir } /#{ @pdf_name } .tif"
73- run "MAGICK_TMPDIR=#{ tempdir } OMP_NUM_THREADS=2 gm convert -despeckle #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ pdf } #{ tiff } 2>&1"
74- run "tesseract #{ tiff } #{ base_path } -l eng 2>&1"
75+ escaped_tiff = ESCAPE [ tiff ]
76+ run "MAGICK_TMPDIR=#{ tempdir } OMP_NUM_THREADS=2 gm convert -despeckle #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ escaped_pdf } #{ escaped_tiff } 2>&1"
77+ run "tesseract #{ escaped_tiff } #{ base_path } -l eng 2>&1"
7578 clean_text ( base_path + '.txt' ) if @clean_ocr
7679 end
7780 ensure
@@ -100,14 +103,14 @@ def run(command)
100103 # Extract the full contents of a pdf as a single file, directly.
# Runs `pdftotext -enc UTF-8` over the whole document via `run`, writing to
# "<@pdf_name>.txt" inside the @output directory.
#
# pdf - path of the source PDF.
def extract_full(pdf)
  text_path = File.join(@output, "#{@pdf_name}.txt")
  # Both the input and the output path go through ESCAPE (defined elsewhere;
  # presumably a shell-quoting helper -- confirm) before being interpolated
  # into the command string handed to `run`.
  run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
end
105108
106109 # Extract the contents of a single page of text, directly, adding it to
107110 # the `@pages_to_ocr` list if the text length is inadequate.
108111 def extract_page ( pdf , page )
109112 text_path = File . join ( @output , "#{ @pdf_name } _#{ page } .txt" )
110- run "pdftotext -enc UTF-8 -f #{ page } -l #{ page } #{ pdf } #{ text_path } 2>&1"
113+ run "pdftotext -enc UTF-8 -f #{ page } -l #{ page } #{ ESCAPE [ pdf ] } #{ ESCAPE [ text_path ] } 2>&1"
111114 unless @forbid_ocr
112115 @pages_to_ocr . push ( page ) if File . read ( text_path ) . length < MIN_TEXT_PER_PAGE
113116 end
0 commit comments