Default tesseract to English

jashkenas · jashkenas · commit 2c6cd0023f83 · 2011-04-26T16:48:15.000-04:00
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -64,7 +64,7 @@ def extract_from_ocr(pdf, pages)
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
-          run "tesseract #{tiff} #{file} 2>&1"
+          run "tesseract #{tiff} #{file} -l eng 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end