diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 8ec40f0..ca7100e 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -32,25 +32,25 @@ def process(doc) blocks_to_remove = [] blocks.each do |tb| - if tb.is_not_content? +# if tb.is_not_content? +# prev_block = tb +# next +# end +# + block_distance = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 + + ok = block_distance <= @max_blocks_distance + ok = false if ok && @content_only && (prev_block.is_not_content? || tb.is_not_content?) + ok = false if ok && @same_tag_level_only && prev_block.tag_level != tb.tag_level + + if ok + prev_block.merge_next(tb) + blocks_to_remove << tb + else prev_block = tb - next - end - - diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 - if diff_blocks <= @max_blocks_distance - ok = true - ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only - ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only - - if ok - prev_block.merge_next(tb) - blocks_to_remove << tb - else - prev_block = tb - end end end + doc.replace_text_blocks!(text_blocks - blocks_to_remove) doc end diff --git a/lib/boilerpipe/sax/tag_action_map.rb b/lib/boilerpipe/sax/tag_action_map.rb index f643166..30f8ceb 100644 --- a/lib/boilerpipe/sax/tag_action_map.rb +++ b/lib/boilerpipe/sax/tag_action_map.rb @@ -3,13 +3,13 @@ class TagActionMap def self.tag_actions labels = ::Boilerpipe::Labels { - STYLE: TagActions::IgnorableElement.new, - SCRIPT: TagActions::IgnorableElement.new, - OPTION: TagActions::IgnorableElement.new, - OBJECT: TagActions::IgnorableElement.new, - EMBED: TagActions::IgnorableElement.new, APPLET: TagActions::IgnorableElement.new, + EMBED: TagActions::IgnorableElement.new, LINK: TagActions::IgnorableElement.new, + OPTION: TagActions::IgnorableElement.new, + OBJECT: TagActions::IgnorableElement.new, + SCRIPT: TagActions::IgnorableElement.new, + STYLE: TagActions::IgnorableElement.new, A: TagActions::AnchorText.new, BODY: TagActions::Body.new,