From a3dc2f4e7f131a61d5d5c67ec7a263bf850d70d5 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Mon, 15 Feb 2021 18:56:43 -0500 Subject: [PATCH 1/3] wip --- .../filters/block_proximity_fusion.rb | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 8ec40f0..2dd9135 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -37,20 +37,20 @@ def process(doc) next end - diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 - if diff_blocks <= @max_blocks_distance - ok = true - ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only - ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only - - if ok - prev_block.merge_next(tb) - blocks_to_remove << tb - else - prev_block = tb - end + block_distance = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 + + ok = block_distance <= @max_blocks_distance + ok = false if ok && @content_only && prev_block.is_not_content? + ok = false if ok && @same_tag_level_only && prev_block.tag_level != tb.tag_level + + if ok + prev_block.merge_next(tb) + blocks_to_remove << tb + else + prev_block = tb end end + doc.replace_text_blocks!(text_blocks - blocks_to_remove) doc end From ab1934973ba8277f1f15e7658ea76da5a3181800 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Mon, 15 Feb 2021 19:27:55 -0500 Subject: [PATCH 2/3] wip --- lib/boilerpipe/filters/block_proximity_fusion.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/boilerpipe/filters/block_proximity_fusion.rb b/lib/boilerpipe/filters/block_proximity_fusion.rb index 2dd9135..ca7100e 100644 --- a/lib/boilerpipe/filters/block_proximity_fusion.rb +++ b/lib/boilerpipe/filters/block_proximity_fusion.rb @@ -32,15 +32,15 @@ def process(doc) blocks_to_remove = [] blocks.each do |tb| - if tb.is_not_content? - prev_block = tb - next - end - +# if tb.is_not_content? +# prev_block = tb +# next +# end +# block_distance = tb.offset_blocks_start - prev_block.offset_blocks_end - 1 ok = block_distance <= @max_blocks_distance - ok = false if ok && @content_only && prev_block.is_not_content? + ok = false if ok && @content_only && (prev_block.is_not_content? || tb.is_not_content?) ok = false if ok && @same_tag_level_only && prev_block.tag_level != tb.tag_level if ok From 841f82892ffdc1cff551876701544b954bfda0b9 Mon Sep 17 00:00:00 2001 From: Gregory Ostermayr Date: Sun, 21 Feb 2021 18:57:01 -0500 Subject: [PATCH 3/3] order by alpha --- lib/boilerpipe/sax/tag_action_map.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/boilerpipe/sax/tag_action_map.rb b/lib/boilerpipe/sax/tag_action_map.rb index f643166..30f8ceb 100644 --- a/lib/boilerpipe/sax/tag_action_map.rb +++ b/lib/boilerpipe/sax/tag_action_map.rb @@ -3,13 +3,13 @@ class TagActionMap def self.tag_actions labels = ::Boilerpipe::Labels { - STYLE: TagActions::IgnorableElement.new, - SCRIPT: TagActions::IgnorableElement.new, - OPTION: TagActions::IgnorableElement.new, - OBJECT: TagActions::IgnorableElement.new, - EMBED: TagActions::IgnorableElement.new, APPLET: TagActions::IgnorableElement.new, + EMBED: TagActions::IgnorableElement.new, LINK: TagActions::IgnorableElement.new, + OPTION: TagActions::IgnorableElement.new, + OBJECT: TagActions::IgnorableElement.new, + SCRIPT: TagActions::IgnorableElement.new, + STYLE: TagActions::IgnorableElement.new, A: TagActions::AnchorText.new, BODY: TagActions::Body.new,