Skip to content

Commit 2d03ce9

Browse files
committed
Merge pull request #12 from vrybas/11_escape_incoming_file_names
11 escape incoming file names
2 parents a106a51 + a65fc0a commit 2d03ce9

12 files changed

+50
-13
lines changed

lib/docsplit.rb

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ module Docsplit
1919

2020
DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
2121

22+
ESCAPE = lambda {|x| Shellwords.shellescape(x) }
23+
2224
# Check for all dependencies, and warn of their absence.
2325
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
2426
DEPENDENCIES.each_key do |dep|
@@ -62,11 +64,13 @@ def self.extract_pdf(docs, opts={})
6264
[docs].flatten.each do |doc|
6365
ext = File.extname(doc)
6466
basename = File.basename(doc, ext)
67+
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
68+
6569
if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
66-
`gm convert "#{doc}" "#{out}/#{basename}.pdf"`
70+
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
6771
else
6872
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
69-
run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
73+
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
7074
end
7175
end
7276
end
@@ -113,6 +117,7 @@ def self.normalize_value(value)
113117

114118
require 'tmpdir'
115119
require 'fileutils'
120+
require 'shellwords'
116121
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
117122
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
118123
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"

lib/docsplit/image_extractor.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil)
3232
basename = File.basename(pdf, File.extname(pdf))
3333
directory = directory_for(size)
3434
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
35+
escaped_pdf = ESCAPE[pdf]
3536
FileUtils.mkdir_p(directory) unless File.exists?(directory)
3637
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
3738
if previous
@@ -40,8 +41,8 @@ def convert(pdf, size, format, previous=nil)
4041
raise ExtractionFailed, result if $? != 0
4142
else
4243
page_list(pages).each do |page|
43-
out_file = File.join(directory, "#{basename}_#{page}.#{format}")
44-
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
44+
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
45+
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
4546
result = `#{cmd}`.chomp
4647
raise ExtractionFailed, result if $? != 0
4748
end

lib/docsplit/info_extractor.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class InfoExtractor
1818
# Pull out a single datum from a pdf.
1919
def extract(key, pdfs, opts)
2020
pdf = [pdfs].flatten.first
21-
cmd = "pdfinfo #{pdf} 2>&1"
21+
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
2222
result = `#{cmd}`.chomp
2323
raise ExtractionFailed, result if $? != 0
2424
match = result.match(MATCHERS[key])

lib/docsplit/page_extractor.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def extract(pdfs, opts)
1111
pdf_name = File.basename(pdf, File.extname(pdf))
1212
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
1313
FileUtils.mkdir_p @output unless File.exists?(@output)
14-
cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
14+
cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
1515
result = `#{cmd}`.chomp
1616
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
1717
raise ExtractionFailed, result if $? != 0

lib/docsplit/text_extractor.rb

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def extract(pdfs, opts)
4545

4646
# Does a PDF have any text embedded?
4747
def contains_text?(pdf)
48-
fonts = `pdffonts #{pdf} 2>&1`
48+
fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
4949
!fonts.match(NO_TEXT_DETECTED)
5050
end
5151

@@ -59,19 +59,22 @@ def extract_from_pdf(pdf, pages)
5959
def extract_from_ocr(pdf, pages)
6060
tempdir = Dir.mktmpdir
6161
base_path = File.join(@output, @pdf_name)
62+
escaped_pdf = ESCAPE[pdf]
6263
if pages
6364
pages.each do |page|
6465
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
66+
escaped_tiff = ESCAPE[tiff]
6567
file = "#{base_path}_#{page}"
66-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
67-
run "tesseract #{tiff} #{file} -l eng 2>&1"
68+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
69+
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
6870
clean_text(file + '.txt') if @clean_ocr
6971
FileUtils.remove_entry_secure tiff
7072
end
7173
else
7274
tiff = "#{tempdir}/#{@pdf_name}.tif"
73-
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
74-
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
75+
escaped_tiff = ESCAPE[tiff]
76+
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
77+
# Escape the output base as well: base_path is built from @output and @pdf_name,
# which may contain spaces or quotes (the paged branch already escapes its output base).
run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l eng 2>&1"
7578
clean_text(base_path + '.txt') if @clean_ocr
7679
end
7780
ensure
@@ -100,14 +103,14 @@ def run(command)
100103
# Extract the full contents of a pdf as a single file, directly.
101104
def extract_full(pdf)
102105
text_path = File.join(@output, "#{@pdf_name}.txt")
103-
run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
106+
run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
104107
end
105108

106109
# Extract the contents of a single page of text, directly, adding it to
107110
# the `@pages_to_ocr` list if the text length is inadequate.
108111
def extract_page(pdf, page)
109112
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
110-
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
113+
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
111114
unless @forbid_ocr
112115
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
113116
end
Binary file not shown.
Binary file not shown.

test/unit/test_convert_to_pdf.rb

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ def test_rtf_conversion
1212
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_hopes.pdf"]
1313
end
1414

15+
def test_png_conversion
16+
Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
17+
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
18+
end
1519
def test_png_conversion
1620
Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
1721
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
@@ -23,4 +27,9 @@ def test_conversion_then_page_extraction
2327
assert Dir["#{OUTPUT}/*.pdf"].length == 8
2428
end
2529

30+
def test_name_escaping_while_converting
31+
Docsplit.extract_pdf('test/fixtures/PDF file with spaces \'single\' and "double quotes".doc', :output => OUTPUT)
32+
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/PDF file with spaces 'single' and \"double quotes\".pdf"]
33+
end
34+
2635
end

test/unit/test_extract_images.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,9 @@ def test_repeated_extraction_in_the_same_directory
4141
assert Dir["#{OUTPUT}/*"] == ['test/output/obama_arts_1.gif', 'test/output/obama_arts_2.gif']
4242
end
4343

44+
def test_name_escaping_while_extracting_images
45+
Docsplit.extract_images('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :format => :gif, :size => "250x", :output => OUTPUT)
46+
assert Dir["#{OUTPUT}/*"] == ['test/output/PDF file with spaces \'single\' and "double quotes"_1.gif', 'test/output/PDF file with spaces \'single\' and "double quotes"_2.gif']
47+
end
48+
4449
end

test/unit/test_extract_info.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,8 @@ def test_password_protected
3232
end
3333
end
3434

35+
def test_name_escaping_while_extracting_info
36+
assert 2 == Docsplit.extract_length('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf')
37+
end
38+
3539
end

0 commit comments

Comments
 (0)