@@ -7,7 +7,7 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document
77 ANCHOR_TEXT_END = ">\ue00a $"
88
99 def initialize
10- @label_stacks = [ ]
10+ @label_stacks = [ [ ] ]
1111 @tag_actions = ::Boilerpipe ::SAX ::TagActionMap . tag_actions
1212 @tag_level = 0
1313 @text_buffer = ''
@@ -27,7 +27,7 @@ def initialize
2727 end
2828
2929 def start_element ( name , attrs = [ ] )
30- @label_stacks << nil
30+ @label_stacks << [ ]
3131 tag = name . upcase . intern
3232
3333 tag_action = @tag_actions [ tag ]
@@ -161,6 +161,7 @@ def flush_block
161161 @offset_blocks += 1
162162 clear_buffers
163163 text_block . set_tag_level ( @block_tag_level )
164+ classify_text_block_with_labels ( text_block )
164165 add_text_block ( text_block )
165166 @block_tag_level = -1
166167 end
@@ -210,14 +211,16 @@ def in_anchor_tag?
210211 @in_anchor_tag > 0
211212 end
212213
213- def add_text_block ( text_block )
214- @label_stacks . each do |stack |
215- next unless stack
216-
217- stack . each do |label_action |
218- text_block . add_label ( label_action . labels ) if label_action
219- end
# Applies every pending label action to +text_block+.
#
# @label_stacks is an array of arrays of label actions: start_element pushes
# a fresh array for each element it sees, and add_label_action appends to the
# most recent one. All of them are visited here in stack order (oldest first).
#
# text_block - object responding to #add_label; it receives
#              label_action.labels once per non-nil label action.
#
# Returns the array of label actions that were applied.
def classify_text_block_with_labels(text_block)
  # compact drops nil placeholders; it states the intent directly and, unlike
  # Array#filter, is available on Ruby versions older than 2.6.
  @label_stacks.flatten.compact.each do |label_action|
    text_block.add_label(label_action.labels)
  end
end
222+
# Appends +text_block+ to the collected @text_blocks and returns that array.
def add_text_block(text_block)
  @text_blocks.push(text_block)
end
223226
@@ -242,11 +245,6 @@ def append_token(token)
242245
# Appends +label_action+ to the most recently pushed label stack and returns
# that stack. Relies on @label_stacks always ending in an array (initialize
# seeds one and start_element pushes one per element).
def add_label_action(label_action)
  @label_stacks.last.push(label_action)
end
252250
0 commit comments