Skip to content

Commit f3e9fa1

Browse files
committed
despeckle before OCR.
1 parent 513ad5c commit f3e9fa1

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

lib/docsplit/text_extractor.rb

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -63,14 +63,14 @@ def extract_from_ocr(pdf, pages)
6363
pages.each do |page|
6464
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
6565
file = "#{base_path}_#{page}"
66-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
66+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
6767
run "tesseract #{tiff} #{file} -l eng 2>&1"
6868
clean_text(file + '.txt') if @clean_ocr
6969
FileUtils.remove_entry_secure tiff
7070
end
7171
else
7272
tiff = "#{tempdir}/#{@pdf_name}.tif"
73-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
73+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
7474
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
7575
clean_text(base_path + '.txt') if @clean_ocr
7676
end

0 commit comments

Comments (0)