Skip to content

Commit de3804e

Browse files
committed
clean up / split up label logic
1 parent 6912998 commit de3804e

File tree

1 file changed

+12
-14
lines changed

1 file changed

+12
-14
lines changed

lib/boilerpipe/sax/html_content_handler.rb

Lines changed: 12 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document
77
ANCHOR_TEXT_END = ">\ue00a$"
88

99
def initialize
10-
@label_stacks = []
10+
@label_stacks = [[]]
1111
@tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
1212
@tag_level = 0
1313
@text_buffer = ''
@@ -27,7 +27,7 @@ def initialize
2727
end
2828

2929
def start_element(name, attrs = [])
30-
@label_stacks << nil
30+
@label_stacks << []
3131
tag = name.upcase.intern
3232

3333
tag_action = @tag_actions[tag]
@@ -161,6 +161,7 @@ def flush_block
161161
@offset_blocks += 1
162162
clear_buffers
163163
text_block.set_tag_level(@block_tag_level)
164+
classify_text_block_with_labels(text_block)
164165
add_text_block(text_block)
165166
@block_tag_level = -1
166167
end
@@ -210,14 +211,16 @@ def in_anchor_tag?
210211
@in_anchor_tag > 0
211212
end
212213

213-
def add_text_block(text_block)
214-
@label_stacks.each do |stack|
215-
next unless stack
216-
217-
stack.each do |label_action|
218-
text_block.add_label(label_action.labels) if label_action
219-
end
214+
def classify_text_block_with_labels(text_block)
215+
@label_stacks
216+
.flatten
217+
.filter{|stack| stack}
218+
.each do |label_action|
219+
text_block.add_label(label_action.labels)
220220
end
221+
end
222+
223+
def add_text_block(text_block)
221224
@text_blocks << text_block
222225
end
223226

@@ -242,11 +245,6 @@ def append_token(token)
242245

243246
def add_label_action(label_action)
244247
label_stack = @label_stacks.last
245-
if label_stack.nil?
246-
label_stack = []
247-
@label_stacks.pop
248-
@label_stacks << label_stack
249-
end
250248
label_stack << label_action
251249
end
252250

0 commit comments

Comments
 (0)