Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ffcfa78
remove dead check
gregors Jul 25, 2020
bf4d41b
TextBlock#merge adds num_words
gregors Jul 28, 2020
49547b0
num_words_text_anchor spec
gregors Jul 28, 2020
f8de958
spec for num_wrapped lines
gregors Jul 28, 2020
666fae5
remove num_full_text_words not used by anything
gregors Jul 28, 2020
3d78b74
spec for #merge start takes min starting block index
gregors Jul 28, 2020
1234c42
spec for merge - use the later end block
gregors Jul 28, 2020
76a4511
spec for densities
gregors Jul 29, 2020
0824787
num_wrapped_lines needs to default to 1
gregors Jul 29, 2020
a5f76ad
merge OR's content flag
gregors Jul 29, 2020
6be50e4
reorder reads better
gregors Jul 29, 2020
94305a6
merges labels
gregors Jul 29, 2020
cd12dd0
no need for label check, constructor creates an empty set
gregors Jul 29, 2020
2cf7f3e
move code for readability
gregors Jul 29, 2020
f3806d9
merges tag levels
gregors Jul 29, 2020
dec0ca2
clean up text_block#to_s
gregors Jul 29, 2020
ae06224
remove dead code
gregors Aug 7, 2020
4dd160d
add simplecov
gregors Aug 7, 2020
dad94f6
delete comments
gregors Aug 21, 2020
cf913e1
clean up whitespace event logic
gregors Aug 21, 2020
5c9dd16
clean up token count logic
gregors Aug 21, 2020
ab58372
remove extra space from tokens
gregors Aug 21, 2020
716bf59
clean up / split up label logic
gregors Aug 21, 2020
4814318
update specs
gregors Aug 21, 2020
800dc59
remove verbose unnecessary logic
gregors Feb 15, 2021
f05cf0e
invert boolean to reduce nesting
gregors Feb 15, 2021
31f3a4f
reorder boolean checks
gregors Feb 15, 2021
647a6f1
formatting
gregors Aug 21, 2020
6545233
wip
gregors Feb 13, 2021
1f0a335
remove gemfile lock
gregors Feb 15, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions boilerpipe-ruby.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'rake', '>= 12.3.3'
spec.add_development_dependency 'rickshaw', '~> 0.5.0'
spec.add_development_dependency 'rspec', '~> 3.10'
spec.add_development_dependency 'simplecov', '~> 0.18.5'
spec.add_runtime_dependency 'nokogiri', '~> 1.10'
end
1 change: 0 additions & 1 deletion lib/boilerpipe.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
require 'boilerpipe/sax/tag_actions/body'
require 'boilerpipe/sax/tag_actions/inline_whitespace'
require 'boilerpipe/sax/tag_actions/inline_no_whitespace'
require 'boilerpipe/sax/tag_actions/block_level'
require 'boilerpipe/sax/tag_actions/font'
require 'boilerpipe/sax/tag_actions/inline_tag_label'
require 'boilerpipe/sax/tag_actions/block_tag_label'
34 changes: 13 additions & 21 deletions lib/boilerpipe/document/text_block.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ class TextBlock

attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
:num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
:link_density, :labels, :tag_level, :num_full_text_words
:link_density, :labels, :tag_level

attr_accessor :content

Expand All @@ -16,7 +16,6 @@ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_w
@num_words_in_anchor_text = num_words_in_anchor_text
@num_words_in_wrapped_lines = num_words_in_wrapped_lines
@num_wrapped_lines = num_wrapped_lines
@num_full_text_words = 0
@offset_blocks_start = offset_blocks
@offset_blocks_end = offset_blocks
@content = false
Expand All @@ -26,7 +25,7 @@ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_w
end

def self.empty_start
new('', 0, 0, 0, 0, -1)
new('', 0, 0, 0, 1, -1)
end

def set_tag_level(level)
Expand Down Expand Up @@ -67,29 +66,23 @@ def merge_next(other)
@num_wrapped_lines += other.num_wrapped_lines
@offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
@offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
init_densities
@content |= other.is_content?

@num_full_text_words += other.num_full_text_words

if other.labels
if @labels.nil?
@labels = other.labels.clone
else
@labels.merge(other.labels.clone)
end
end

@labels.merge(other.labels.clone)
@tag_level = [@tag_level, other.tag_level].min

init_densities
end

def to_s
# "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
labels = 'null'
if [email protected]?
labels = "[#{@labels.to_a.join(',')}]"
"[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels_to_s}\n#{text}"
end

def labels_to_s
if @labels.empty?
'null'
else
"[#{@labels.to_a.join(',')}]"
end
"[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
end

def clone
Expand All @@ -101,7 +94,6 @@ def clone
def init_densities
if @num_words_in_wrapped_lines == 0
@num_words_in_wrapped_lines = @num_words
@num_wrapped_lines = 1
end
@text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f
@link_density = @num_words == 0 ? 0.0 : @num_words_in_anchor_text / @num_words.to_f
Expand Down
30 changes: 13 additions & 17 deletions lib/boilerpipe/filters/block_proximity_fusion.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,7 @@ def process(doc)
text_blocks = doc.text_blocks
return false if text_blocks.size < 2

prev_block = if @content_only
text_blocks.find { |tb| tb.is_content? }
else
text_blocks.first
end

prev_block = text_blocks.first
return false if prev_block.nil?

offset = text_blocks.index(prev_block) + 1
Expand All @@ -38,19 +33,20 @@ def process(doc)
end

diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1
if diff_blocks <= @max_blocks_distance
ok = true
ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only

if ok
prev_block.merge_next(tb)
blocks_to_remove << tb
else
prev_block = tb
end
next if diff_blocks > @max_blocks_distance

ok = true
ok = false if @content_only && prev_block.is_not_content?
ok = false if ok && @same_tag_level_only && prev_block.tag_level != tb.tag_level

if ok
prev_block.merge_next(tb)
blocks_to_remove << tb
else
prev_block = tb
end
end

doc.replace_text_blocks!(text_blocks - blocks_to_remove)
doc
end
Expand Down
65 changes: 26 additions & 39 deletions lib/boilerpipe/sax/html_content_handler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document
ANCHOR_TEXT_END = ">\ue00a$"

def initialize
@label_stacks = []
@label_stacks = [[]]
@tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
@tag_level = 0
@sb_last_was_whitespace = false
@text_buffer = ''
@token_buffer = ''
@offset_blocks = 0
Expand All @@ -28,17 +27,19 @@ def initialize
end

def start_element(name, attrs = [])
@label_stacks << nil
@label_stacks << []
tag = name.upcase.intern

tag_action = @tag_actions[tag]
org = @tag_level
if tag_action
@tag_level += 1 if tag_action.changes_tag_level?
@flush = tag_action.start(self, name, attrs) | @flush
else
@tag_level += 1
@flush = true
end
puts "before: #{org}, after: #{@tag_level}"

@last_event = :START_TAG
@last_start_tag = tag
Expand All @@ -61,18 +62,20 @@ def characters(text)
# add a single space if the block was only whitespace
if text.empty?
append_space
@last_event = :WHITESPACE
return
end

# set block levels
@block_tag_level = @tag_level if @block_tag_level == -1
if @block_tag_level == -1
puts "-1 setting block level tag_level: #{@tag_level}"
@block_tag_level = @tag_level
end
puts "block_tag_level: #{@block_tag_level}"

append_space if started_with_whitespace
append_text(text)
append_space if ended_with_whitespace

@last_event = :CHARACTERS
end

def end_element(name)
Expand Down Expand Up @@ -112,11 +115,10 @@ def flush_block
when 0
return
when 1
clear_buffers if @sb_last_was_whitespace
clear_buffers if @last_event == :WHITESPACE
return
end

num_tokens = 0
num_words = 0
num_words_current_line = 0
num_words_in_wrapped_lines = 0
Expand All @@ -132,7 +134,6 @@ def flush_block
elsif ANCHOR_TEXT_END == token
@in_anchor_text = false
elsif is_word?(token)
num_tokens += 1
num_words += 1
num_words_current_line += 1
num_linked_words += 1 if @in_anchor_text
Expand All @@ -144,12 +145,10 @@ def flush_block
current_line_length = token_length
num_words_current_line = 1
end
else
num_tokens += 1
end
end

return if num_tokens == 0
return if tokens.empty?

num_words_in_wrapped_lines = 0
if num_wrapped_lines == 0
Expand All @@ -163,11 +162,13 @@ def flush_block
num_words,
num_linked_words,
num_words_in_wrapped_lines,
num_wrapped_lines, @offset_blocks)
num_wrapped_lines,
@offset_blocks)

@offset_blocks += 1
clear_buffers
text_block.set_tag_level(@block_tag_level)
classify_text_block_with_labels(text_block)
add_text_block(text_block)
@block_tag_level = -1
end
Expand All @@ -192,16 +193,6 @@ def is_word?(word)
word =~ VALID_WORD_CHARACTER
end

# public void flushBlock() {
# int numWords = 0;
# int numLinkedWords = 0;
# int numWrappedLines = 0;
# int currentLineLength = -1; // don't count the first space
# final int maxLineLength = 80;
# int numTokens = 0;
# int numWordsCurrentLine = 0;
# }

def increase_in_ignorable_element!
@in_ignorable_element += 1
end
Expand All @@ -227,29 +218,30 @@ def in_anchor_tag?
@in_anchor_tag > 0
end

def add_text_block(text_block)
@label_stacks.each do |stack|
next unless stack

stack.each do |label_action|
text_block.add_label(label_action.labels) if label_action
end
def classify_text_block_with_labels(text_block)
@label_stacks
.flatten
.filter{|stack| stack}
.each do |label_action|
text_block.add_label(label_action.labels)
end
end

def add_text_block(text_block)
@text_blocks << text_block
end

# append space if last character wasn't already one
def append_space
return if @sb_last_was_whitespace

@sb_last_was_whitespace = true
return if @last_event == :WHITESPACE
@last_event = :WHITESPACE

@text_buffer << ' '
@token_buffer << ' '
end

def append_text(text)
@sb_last_was_whitespace = false
@last_event = :CHARACTERS
@text_buffer << text
@token_buffer << text
end
Expand All @@ -260,11 +252,6 @@ def append_token(token)

def add_label_action(label_action)
label_stack = @label_stacks.last
if label_stack.nil?
label_stack = []
@label_stacks.pop
@label_stacks << label_stack
end
label_stack << label_action
end

Expand Down
2 changes: 0 additions & 2 deletions lib/boilerpipe/sax/tag_actions/anchor_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,11 @@ def changes_tag_level?
def append_anchor_text_start(handler)
handler.append_space
handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START)
handler.append_token(' ')
end

def append_anchor_text_end(handler)
handler.append_space
handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END)
handler.append_token(' ')
end

def nested_achor_tag_error_recovering(handler, name)
Expand Down
17 changes: 0 additions & 17 deletions lib/boilerpipe/sax/tag_actions/block_level.rb

This file was deleted.

Loading