Skip to content

Commit 28cfb79

Browse files
committed
clean up / split up label logic
1 parent 0662ee6 commit 28cfb79

File tree

1 file changed

+12
-14
lines changed

1 file changed

+12
-14
lines changed

lib/boilerpipe/sax/html_content_handler.rb

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document
77
ANCHOR_TEXT_END = ">\ue00a$"
88

99
def initialize
10-
@label_stacks = []
10+
@label_stacks = [[]]
1111
@tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
1212
@tag_level = 0
1313
@text_buffer = ''
@@ -27,7 +27,7 @@ def initialize
2727
end
2828

2929
def start_element(name, attrs = [])
30-
@label_stacks << nil
30+
@label_stacks << []
3131
tag = name.upcase.intern
3232

3333
tag_action = @tag_actions[tag]
@@ -155,6 +155,7 @@ def flush_block
155155
@offset_blocks += 1
156156
clear_buffers
157157
text_block.set_tag_level(@block_tag_level)
158+
classify_text_block_with_labels(text_block)
158159
add_text_block(text_block)
159160
@block_tag_level = -1
160161
end
@@ -204,14 +205,16 @@ def in_anchor_tag?
204205
@in_anchor_tag > 0
205206
end
206207

207-
def add_text_block(text_block)
208-
@label_stacks.each do |stack|
209-
next unless stack
210-
211-
stack.each do |label_action|
212-
text_block.add_label(label_action.labels) if label_action
213-
end
208+
def classify_text_block_with_labels(text_block)
209+
@label_stacks
210+
.flatten
211+
.filter{|stack| stack}
212+
.each do |label_action|
213+
text_block.add_label(label_action.labels)
214214
end
215+
end
216+
217+
def add_text_block(text_block)
215218
@text_blocks << text_block
216219
end
217220

@@ -236,11 +239,6 @@ def append_token(token)
236239

237240
def add_label_action(label_action)
238241
label_stack = @label_stacks.last
239-
if label_stack.nil?
240-
label_stack = []
241-
@label_stacks.pop
242-
@label_stacks << label_stack
243-
end
244242
label_stack << label_action
245243
end
246244

0 commit comments

Comments
 (0)