diff --git a/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar b/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar index 8eabe30b..c5e67fbe 100644 Binary files a/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar and b/lib/jars/tabula-1.0.0-SNAPSHOT-jar-with-dependencies.jar differ diff --git a/lib/tabula_job_executor/jobs/detect_tables.rb b/lib/tabula_job_executor/jobs/detect_tables.rb index 48ff3938..ff0fc06f 100644 --- a/lib/tabula_job_executor/jobs/detect_tables.rb +++ b/lib/tabula_job_executor/jobs/detect_tables.rb @@ -5,10 +5,8 @@ class DetectTablesJob < Tabula::Background::Job include Observable def perform - filepath = options[:filepath] + filepath = options[:filepath] output_dir = options[:output_dir] - - page_areas_by_page = [] begin @@ -39,6 +37,6 @@ def perform end at(100, 100, "complete") - return nil + return nil end -end +end \ No newline at end of file diff --git a/lib/tabula_job_executor/jobs/string_search.rb b/lib/tabula_job_executor/jobs/string_search.rb new file mode 100644 index 00000000..163581fa --- /dev/null +++ b/lib/tabula_job_executor/jobs/string_search.rb @@ -0,0 +1,35 @@ +require 'java' + +class StringSearchJob + + def performString(output_dir, boundariesArray) + page_areas_by_page = [] + begin + extractor = Tabula::Extraction::ObjectExtractor.new(File.join(output_dir, 'document.pdf'), :all) + page_count = extractor.page_count + rda = Java::TechnologyTabulaDetectors::StringSearch.new + extractor.extract.each do |page| + areas = rda.detect(page, boundariesArray) + page_areas_by_page << areas.map { |rect| + [ rect.getLeft, + rect.getTop, + rect.getWidth, + rect.getHeight ] + } + end + + rescue Java::JavaLang::Exception => e + warn("String bounds detect failed. You may need to select tables manually.") + end + + File.open(output_dir + "/string.json", 'w') do |f| + f.puts page_areas_by_page.to_json + end + + File.open(output_dir + "/string_list.json", 'a') do |f| + f.puts boundariesArray[0] + "," + boundariesArray[1] + "," + boundariesArray[2] + "," + boundariesArray[3] + "\n" + end + + return page_areas_by_page + end +end \ No newline at end of file diff --git a/webapp/index.html b/webapp/index.html index ca67c155..204f69bb 100644 --- a/webapp/index.html +++ b/webapp/index.html @@ -104,8 +104,45 @@ Copy to Clipboard + +
+ + + + + + + + +
+ + +
+ String Search + + + + + +
+