diff --git a/boilerpipe-ruby.gemspec b/boilerpipe-ruby.gemspec index 54be038..a492123 100644 --- a/boilerpipe-ruby.gemspec +++ b/boilerpipe-ruby.gemspec @@ -23,5 +23,6 @@ Gem::Specification.new do |spec| spec.add_development_dependency 'rake', '>= 12.3.3' spec.add_development_dependency 'rickshaw', '~> 0.5.0' spec.add_development_dependency 'rspec', '~> 3.10' + spec.add_development_dependency 'simplecov', '~> 0.18.5' spec.add_runtime_dependency 'nokogiri', '~> 1.10' end diff --git a/lib/boilerpipe.rb b/lib/boilerpipe.rb index 6a7cf39..8bce16c 100644 --- a/lib/boilerpipe.rb +++ b/lib/boilerpipe.rb @@ -50,7 +50,6 @@ require 'boilerpipe/sax/tag_actions/body' require 'boilerpipe/sax/tag_actions/inline_whitespace' require 'boilerpipe/sax/tag_actions/inline_no_whitespace' -require 'boilerpipe/sax/tag_actions/block_level' require 'boilerpipe/sax/tag_actions/font' require 'boilerpipe/sax/tag_actions/inline_tag_label' require 'boilerpipe/sax/tag_actions/block_tag_label' diff --git a/lib/boilerpipe/document/text_block.rb b/lib/boilerpipe/document/text_block.rb index 3f0bc50..4c9e9e9 100644 --- a/lib/boilerpipe/document/text_block.rb +++ b/lib/boilerpipe/document/text_block.rb @@ -5,7 +5,7 @@ class TextBlock attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text, :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density, - :link_density, :labels, :tag_level, :num_full_text_words + :link_density, :labels, :tag_level attr_accessor :content @@ -16,7 +16,6 @@ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_w @num_words_in_anchor_text = num_words_in_anchor_text @num_words_in_wrapped_lines = num_words_in_wrapped_lines @num_wrapped_lines = num_wrapped_lines - @num_full_text_words = 0 @offset_blocks_start = offset_blocks @offset_blocks_end = offset_blocks @content = false @@ -26,7 +25,7 @@ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_w end def self.empty_start - new('', 0, 0, 0, 0, -1) + new('', 0, 0, 0, 1, -1) end def set_tag_level(level) @@ -67,29 +66,23 @@ def merge_next(other) @num_wrapped_lines += other.num_wrapped_lines @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max - init_densities @content |= other.is_content? - - @num_full_text_words += other.num_full_text_words - - if other.labels - if @labels.nil? - @labels = other.labels.clone - else - @labels.merge(other.labels.clone) - end - end - + @labels.merge(other.labels.clone) @tag_level = [@tag_level, other.tag_level].min + + init_densities end def to_s - # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText(); - labels = 'null' - if !@labels.empty? - labels = "[#{@labels.to_a.join(',')}]" + "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels_to_s}\n#{text}" + end + + def labels_to_s + if @labels.empty? + 'null' + else + "[#{@labels.to_a.join(',')}]" end - "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}" end def clone @@ -101,7 +94,6 @@ def clone def init_densities if @num_words_in_wrapped_lines == 0 @num_words_in_wrapped_lines = @num_words - @num_wrapped_lines = 1 end @text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f @link_density = @num_words == 0 ? 0.0 : @num_words_in_anchor_text / @num_words.to_f diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 8ec40f0..cf38cf1 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -18,12 +18,7 @@ def process(doc) text_blocks = doc.text_blocks return false if text_blocks.size < 2 - prev_block = if @content_only - text_blocks.find { |tb| tb.is_content? } - else - text_blocks.first - end - + prev_block = text_blocks.first return false if prev_block.nil? offset = text_blocks.index(prev_block) + 1 @@ -38,19 +33,20 @@ def process(doc) end diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 - if diff_blocks <= @max_blocks_distance - ok = true - ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only - ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only - - if ok - prev_block.merge_next(tb) - blocks_to_remove << tb - else - prev_block = tb - end + next if diff_blocks > @max_blocks_distance + + ok = true + ok = false if @content_only && prev_block.is_not_content? + ok = false if ok && @same_tag_level_only && prev_block.tag_level != tb.tag_level + + if ok + prev_block.merge_next(tb) + blocks_to_remove << tb + else + prev_block = tb end end + doc.replace_text_blocks!(text_blocks - blocks_to_remove) doc end diff --git a/lib/boilerpipe/sax/html_content_handler.rb b/lib/boilerpipe/sax/html_content_handler.rb index 612bcd8..bd9b34f 100644 --- a/lib/boilerpipe/sax/html_content_handler.rb +++ b/lib/boilerpipe/sax/html_content_handler.rb @@ -7,10 +7,9 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document ANCHOR_TEXT_END = ">\ue00a$" def initialize - @label_stacks = [] + @label_stacks = [[]] @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions @tag_level = 0 - @sb_last_was_whitespace = false @text_buffer = '' @token_buffer = '' @offset_blocks = 0 @@ -28,10 +27,11 @@ def initialize end def start_element(name, attrs = []) - @label_stacks << nil + @label_stacks << [] tag = name.upcase.intern tag_action = @tag_actions[tag] + org = @tag_level if tag_action @tag_level += 1 if tag_action.changes_tag_level? @flush = tag_action.start(self, name, attrs) | @flush @@ -39,6 +39,7 @@ def start_element(name, attrs = []) @tag_level += 1 @flush = true end + puts "before: #{org}, after: #{@tag_level}" @last_event = :START_TAG @last_start_tag = tag @@ -61,18 +62,20 @@ def characters(text) # add a single space if the block was only whitespace if text.empty? append_space - @last_event = :WHITESPACE return end # set block levels - @block_tag_level = @tag_level if @block_tag_level == -1 + if @block_tag_level == -1 + puts "-1 setting block level tag_level: #{@tag_level}" + @block_tag_level = @tag_level + end + puts "block_tag_level: #{@block_tag_level}" append_space if started_with_whitespace append_text(text) append_space if ended_with_whitespace - @last_event = :CHARACTERS end def end_element(name) @@ -112,11 +115,10 @@ def flush_block when 0 return when 1 - clear_buffers if @sb_last_was_whitespace + clear_buffers if @last_event == :WHITESPACE return end - num_tokens = 0 num_words = 0 num_words_current_line = 0 num_words_in_wrapped_lines = 0 @@ -132,7 +134,6 @@ def flush_block elsif ANCHOR_TEXT_END == token @in_anchor_text = false elsif is_word?(token) - num_tokens += 1 num_words += 1 num_words_current_line += 1 num_linked_words += 1 if @in_anchor_text @@ -144,12 +145,10 @@ def flush_block current_line_length = token_length num_words_current_line = 1 end - else - num_tokens += 1 end end - return if num_tokens == 0 + return if tokens.empty? num_words_in_wrapped_lines = 0 if num_wrapped_lines == 0 @@ -163,11 +162,13 @@ def flush_block num_words, num_linked_words, num_words_in_wrapped_lines, - num_wrapped_lines, @offset_blocks) + num_wrapped_lines, + @offset_blocks) @offset_blocks += 1 clear_buffers text_block.set_tag_level(@block_tag_level) + classify_text_block_with_labels(text_block) add_text_block(text_block) @block_tag_level = -1 end @@ -192,16 +193,6 @@ def is_word?(word) word =~ VALID_WORD_CHARACTER end - # public void flushBlock() { - # int numWords = 0; - # int numLinkedWords = 0; - # int numWrappedLines = 0; - # int currentLineLength = -1; // don't count the first space - # final int maxLineLength = 80; - # int numTokens = 0; - # int numWordsCurrentLine = 0; - # } - def increase_in_ignorable_element! @in_ignorable_element += 1 end @@ -227,29 +218,30 @@ def in_anchor_tag? @in_anchor_tag > 0 end - def add_text_block(text_block) - @label_stacks.each do |stack| - next unless stack - - stack.each do |label_action| - text_block.add_label(label_action.labels) if label_action - end + def classify_text_block_with_labels(text_block) + @label_stacks + .flatten + .filter{|stack| stack} + .each do |label_action| + text_block.add_label(label_action.labels) end + end + + def add_text_block(text_block) @text_blocks << text_block end # append space if last character wasn't already one def append_space - return if @sb_last_was_whitespace - - @sb_last_was_whitespace = true + return if @last_event == :WHITESPACE + @last_event = :WHITESPACE @text_buffer << ' ' @token_buffer << ' ' end def append_text(text) - @sb_last_was_whitespace = false + @last_event = :CHARACTERS @text_buffer << text @token_buffer << text end @@ -260,11 +252,6 @@ def append_token(token) def add_label_action(label_action) label_stack = @label_stacks.last - if label_stack.nil? - label_stack = [] - @label_stacks.pop - @label_stacks << label_stack - end label_stack << label_action end diff --git a/lib/boilerpipe/sax/tag_actions/anchor_text.rb b/lib/boilerpipe/sax/tag_actions/anchor_text.rb index a9b6dd7..3097db0 100644 --- a/lib/boilerpipe/sax/tag_actions/anchor_text.rb +++ b/lib/boilerpipe/sax/tag_actions/anchor_text.rb @@ -29,13 +29,11 @@ def changes_tag_level? def append_anchor_text_start(handler) handler.append_space handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START) - handler.append_token(' ') end def append_anchor_text_end(handler) handler.append_space handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END) - handler.append_token(' ') end def nested_achor_tag_error_recovering(handler, name) diff --git a/lib/boilerpipe/sax/tag_actions/block_level.rb b/lib/boilerpipe/sax/tag_actions/block_level.rb deleted file mode 100644 index 321f7ba..0000000 --- a/lib/boilerpipe/sax/tag_actions/block_level.rb +++ /dev/null @@ -1,17 +0,0 @@ -module Boilerpipe::SAX::TagActions - # Explicitly marks this tag a simple "block-level" element, - # which always generates whitespace - class BlockLevel - def start(handler, name, attrs) - true - end - - def end_tag(handler, name) - true - end - - def changes_tag_level? - true - end - end -end diff --git a/spec/document/text_block_spec.rb b/spec/document/text_block_spec.rb index bbc3d34..fb93dc6 100644 --- a/spec/document/text_block_spec.rb +++ b/spec/document/text_block_spec.rb @@ -50,6 +50,101 @@ module Boilerpipe subject.merge_next(another_block) expect(subject.text).to eq "hello\ngood-bye" end + + it 'num words gets combined' do + another_block = Document::TextBlock.new('good-bye', 1) + subject.merge_next(another_block) + expect(subject.num_words).to eq 1 + end + + it 'num_words_in_anchor_text gets combined' do + another_block = Document::TextBlock.new('good-bye', 1, 1) + subject.merge_next(another_block) + expect(subject.num_words_in_anchor_text).to eq 1 + end + + it 'num_words_in_wrapped_lines gets combined' do + another_block = Document::TextBlock.new('good-bye', 1, 1, 1) + subject.merge_next(another_block) + expect(subject.num_words_in_wrapped_lines).to eq 1 + end + + it 'num_wrapped_lines gets combined' do + # one by default + another_block = Document::TextBlock.new('good-bye', 1, 1, 1) + subject.merge_next(another_block) + expect(subject.num_wrapped_lines).to eq 2 + end + + it 'offset_block_start uses the earlier start' do + block = Document::TextBlock.new('one', 1, 1, 1, 1, 5) + another_block = Document::TextBlock.new('two', 1, 1, 1, 1, 3) + block.merge_next(another_block) + expect(block.offset_blocks_start).to eq 3 + end + + it 'offset_block_end uses the later end' do + block = Document::TextBlock.new('one', 1, 1, 1, 1, 5) + another_block = Document::TextBlock.new('two', 1, 1, 1, 1, 3) + block.merge_next(another_block) + expect(block.offset_blocks_end).to eq 5 + end + + it 'recomputes densities' do + block = Document::TextBlock.new('one', 10, 5, 10, 2, 5) + another_block = Document::TextBlock.new('two', 10, 5, 10, 3, 3) + + block.merge_next(another_block) + + expect(block.text_density).to eq 4.0 + expect(block.link_density).to eq 0.5 + end + + it 'resets wrapped lines' do + block = Document::TextBlock.new('one', 10) + another_block = Document::TextBlock.new('two', 10) + + block.merge_next(another_block) + + expect(block.num_words_in_wrapped_lines).to eq 20 + expect(block.num_wrapped_lines).to eq 2 + end + + it 'if one is content the merged block is content' do + block = Document::TextBlock.new('one') + block.content = false + + another_block = Document::TextBlock.new('two') + another_block.content = true + + block.merge_next(another_block) + + expect(block.content).to eq true + end + + it 'merges labels' do + block = Document::TextBlock.new('one') + block.add_label('boom') + + another_block = Document::TextBlock.new('two') + another_block.add_label('pow') + + block.merge_next(another_block) + + expect(block.labels).to eq Set.new(['boom', 'pow']) + end + + it 'sets the tag level to the minimum of the two blocks' do + block = Document::TextBlock.new('one') + block.set_tag_level(2) + + another_block = Document::TextBlock.new('two') + another_block.set_tag_level(1) + + block.merge_next(another_block) + + expect(block.tag_level).to eq 1 + end end describe '#add_label' do diff --git a/spec/filters/block_proximity_fusion_spec.rb b/spec/filters/block_proximity_fusion_spec.rb index e09788e..d1cd849 100644 --- a/spec/filters/block_proximity_fusion_spec.rb +++ b/spec/filters/block_proximity_fusion_spec.rb @@ -2,10 +2,10 @@ module Boilerpipe::Filters describe BlockProximityFusion do - let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 0, 0) } - let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 0, 1) } - let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 0, 2) } - let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 0, 3) } + let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 1, 0) } + let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 1, 1) } + let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 1, 2) } + let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 1, 3) } let(:text_blocks) { [text_block1, text_block2, text_block3, text_block4] } let!(:doc) { Boilerpipe::Document::TextDocument.new('', text_blocks) } @@ -18,6 +18,7 @@ module Boilerpipe::Filters describe '#process' do context 'where blocks exceed distance' do + # only_content: true, same_tag_level: false it 'doesnt change blocks' do expect(doc.text_blocks.size).to eq 4 filter = BlockProximityFusion.new(1, true, false) @@ -27,10 +28,15 @@ module Boilerpipe::Filters end context 'where blocks do not exceed distance' do + # only_content: false, same_tag_level: false it 'Fuses adjacent blocks' do + puts doc.text_blocks.map(&:text).inspect + puts doc.debug_s expect(doc.text_blocks.last.text.size).to eq 4 filter = BlockProximityFusion.new(1, false, false) filter.process(doc) + puts doc.text_blocks.map(&:text).inspect + puts doc.debug_s expect(doc.text_blocks.last.text).to eq "three\nfour" end diff --git a/spec/filters/simple_block_fusion_processor_spec.rb b/spec/filters/simple_block_fusion_processor_spec.rb index 3dea759..61da407 100644 --- a/spec/filters/simple_block_fusion_processor_spec.rb +++ b/spec/filters/simple_block_fusion_processor_spec.rb @@ -6,10 +6,10 @@ module Boilerpipe::Filters let!(:doc) { Boilerpipe::Document::TextDocument.new('', text_blocks) } context 'where blocks have same text density' do - let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 0, 0) } - let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 0, 1) } - let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 0, 2) } - let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 0, 3) } + let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 1, 0) } + let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 1, 1) } + let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 1, 2) } + let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 1, 3) } it 'the blocks are merged' do expect(doc.text_blocks.size).to eq 4 SimpleBlockFusionProcessor.process(doc) diff --git a/spec/sax/handler_spec.rb b/spec/sax/handler_spec.rb index 398a851..f4baf0a 100644 --- a/spec/sax/handler_spec.rb +++ b/spec/sax/handler_spec.rb @@ -26,6 +26,14 @@ module Boilerpipe::SAX end describe '#flush_block' do + it 'resets flush' + it 'sets title with last text from TITLE tag' + it 'clears out text_buffer' + it 'clears out token_buffer' + it 'determins line and word counts' + it 'creates text block' + it 'classifies text block with labels' + it 'adds text block to document' end describe '#text_document' do @@ -68,14 +76,14 @@ module Boilerpipe::SAX end describe '#add_label_action' do - context 'with a nil as the last element in the label stacks' do + context 'with an array as the last element in the label stacks' do before { subject.start_element('boom') } - it 'removes that nil' do - expect(subject.label_stacks.first).to eq nil + it 'adds the label' do + expect(subject.label_stacks.last).to eq [] subject.add_label_action(:boom) - expect(subject.label_stacks.first).to eq [:boom] - expect(subject.label_stacks.size).to eq 1 + expect(subject.label_stacks.last).to eq [:boom] + expect(subject.label_stacks.size).to eq 2 end end end diff --git a/spec/sax/tag_actions/anchor_text_spec.rb b/spec/sax/tag_actions/anchor_text_spec.rb index c77f5fc..22447f8 100644 --- a/spec/sax/tag_actions/anchor_text_spec.rb +++ b/spec/sax/tag_actions/anchor_text_spec.rb @@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions it 'adds anchor text start' do handler = Boilerpipe::SAX::HTMLContentHandler.new - expect { subject.start(handler, nil, nil) }.to change { handler.token_buffer_size }.from(0).to(5) + expect { subject.start(handler, nil, nil) }.to change { handler.token_buffer_size }.from(0).to(4) end it 'returns false' do @@ -24,15 +24,16 @@ module Boilerpipe::SAX::TagActions subject.start(handler, nil, nil) expect(handler.in_anchor_tag).to eq(1) end + it 'doesnt append end anchor text' do handler = Boilerpipe::SAX::HTMLContentHandler.new expect { subject.start(handler, nil, nil) }.to change { handler.in_anchor_tag }.from(0).to(1) # puts handler.token_buffer - expect(handler.token_buffer_size).to eq(5) + expect(handler.token_buffer_size).to eq(4) subject.start(handler, nil, nil) # puts handler.token_buffer - expect(handler.token_buffer_size).to eq(5) + expect(handler.token_buffer_size).to eq(4) end end end @@ -47,7 +48,8 @@ module Boilerpipe::SAX::TagActions it 'adds end anchor text' do handler = Boilerpipe::SAX::HTMLContentHandler.new handler.in_anchor_tag = 1 - expect { subject.end_tag(handler, nil) }.to change { handler.token_buffer_size }.from(0).to(5) + expect { subject.end_tag(handler, nil) }.to change { handler.token_buffer_size }.from(0).to(4) + puts Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END.size end context 'if in nested anchor tag' do diff --git a/spec/sax/tag_actions/block_level_spec.rb b/spec/sax/tag_actions/block_level_spec.rb deleted file mode 100644 index 5e8471d..0000000 --- a/spec/sax/tag_actions/block_level_spec.rb +++ /dev/null @@ -1,21 +0,0 @@ -require 'spec_helper' - -module Boilerpipe::SAX::TagActions - describe BlockLevel do - describe '#start' do - it 'returns true' do - expect(subject.start(nil, nil, nil)).to be true - end - end - describe '#end_tag' do - it 'returns true' do - expect(subject.end_tag(nil, nil)).to be true - end - end - describe '#changes_tag_level?' do - it 'returns true' do - expect(subject.changes_tag_level?).to be true - end - end - end -end diff --git a/spec/sax/tag_actions/block_tag_label_spec.rb b/spec/sax/tag_actions/block_tag_label_spec.rb index f30ec0d..b24e73d 100644 --- a/spec/sax/tag_actions/block_tag_label_spec.rb +++ b/spec/sax/tag_actions/block_tag_label_spec.rb @@ -8,16 +8,19 @@ module Boilerpipe::SAX::TagActions describe '.new' do it 'takes a label action' end + describe '#start' do it 'returns true' do expect(subject.start(handler, nil, nil)).to be true end end + describe '#end_tag' do it 'returns true' do expect(subject.end_tag(handler, nil)).to be true end end + describe '#changes_tag_level?' do it 'returns true' do expect(subject.changes_tag_level?).to be true diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 15499af..0b42506 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,2 +1,4 @@ +require 'simplecov' +SimpleCov.start $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) require 'boilerpipe'