@@ -7,7 +7,7 @@ class HTMLContentHandler < Nokogiri::XML::SAX::Document
77 ANCHOR_TEXT_END = ">\ue00a $"
88
99 def initialize
10- @label_stacks = [ ]
10+ @label_stacks = [ [ ] ]
1111 @tag_actions = ::Boilerpipe ::SAX ::TagActionMap . tag_actions
1212 @tag_level = 0
1313 @text_buffer = ''
@@ -27,7 +27,7 @@ def initialize
2727 end
2828
2929 def start_element ( name , attrs = [ ] )
30- @label_stacks << nil
30+ @label_stacks << [ ]
3131 tag = name . upcase . intern
3232
3333 tag_action = @tag_actions [ tag ]
@@ -155,6 +155,7 @@ def flush_block
155155 @offset_blocks += 1
156156 clear_buffers
157157 text_block . set_tag_level ( @block_tag_level )
158+ classify_text_block_with_labels ( text_block )
158159 add_text_block ( text_block )
159160 @block_tag_level = -1
160161 end
@@ -204,14 +205,16 @@ def in_anchor_tag?
204205 @in_anchor_tag > 0
205206 end
206207
207- def add_text_block ( text_block )
208- @label_stacks . each do |stack |
209- next unless stack
210-
211- stack . each do |label_action |
212- text_block . add_label ( label_action . labels ) if label_action
213- end
208+ def classify_text_block_with_labels ( text_block )
209+ @label_stacks
210+ . flatten
211+ . filter { |stack | stack }
212+ . each do |label_action |
213+ text_block . add_label ( label_action . labels )
214214 end
215+ end
216+
217+ def add_text_block ( text_block )
215218 @text_blocks << text_block
216219 end
217220
@@ -236,11 +239,6 @@ def append_token(token)
236239
237240 def add_label_action ( label_action )
238241 label_stack = @label_stacks . last
239- if label_stack . nil?
240- label_stack = [ ]
241- @label_stacks . pop
242- @label_stacks << label_stack
243- end
244242 label_stack << label_action
245243 end
246244
0 commit comments