Skip to content

Commit 6545233

Browse files
committed
wip
1 parent 647a6f1 commit 6545233

File tree

3 files changed

+64
-5
lines changed

3 files changed

+64
-5
lines changed

Gemfile.lock

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
PATH
2+
remote: .
3+
specs:
4+
boilerpipe-ruby (0.4.3)
5+
nokogiri (~> 1.10)
6+
7+
GEM
8+
remote: https://rubygems.org/
9+
specs:
10+
diff-lcs (1.4.4)
11+
docile (1.3.2)
12+
mini_portile2 (2.4.0)
13+
nokogiri (1.10.10)
14+
mini_portile2 (~> 2.4.0)
15+
rake (13.0.1)
16+
rickshaw (0.5.0)
17+
rspec (3.9.0)
18+
rspec-core (~> 3.9.0)
19+
rspec-expectations (~> 3.9.0)
20+
rspec-mocks (~> 3.9.0)
21+
rspec-core (3.9.2)
22+
rspec-support (~> 3.9.3)
23+
rspec-expectations (3.9.2)
24+
diff-lcs (>= 1.2.0, < 2.0)
25+
rspec-support (~> 3.9.0)
26+
rspec-mocks (3.9.1)
27+
diff-lcs (>= 1.2.0, < 2.0)
28+
rspec-support (~> 3.9.0)
29+
rspec-support (3.9.3)
30+
simplecov (0.18.5)
31+
docile (~> 1.1)
32+
simplecov-html (~> 0.11)
33+
simplecov-html (0.12.2)
34+
35+
PLATFORMS
36+
ruby
37+
38+
DEPENDENCIES
39+
boilerpipe-ruby!
40+
bundler (~> 2.0)
41+
rake (>= 12.3.3)
42+
rickshaw (~> 0.5.0)
43+
rspec (~> 3.9)
44+
simplecov (~> 0.18.5)
45+
46+
BUNDLED WITH
47+
2.1.4

lib/boilerpipe/sax/html_content_handler.rb

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,15 @@ def start_element(name, attrs = [])
3131
tag = name.upcase.intern
3232

3333
tag_action = @tag_actions[tag]
34+
org = @tag_level
3435
if tag_action
3536
@tag_level += 1 if tag_action.changes_tag_level?
3637
@flush = tag_action.start(self, name, attrs) | @flush
3738
else
3839
@tag_level += 1
3940
@flush = true
4041
end
42+
puts "before: #{org}, after: #{@tag_level}"
4143

4244
@last_event = :START_TAG
4345
@last_start_tag = tag
@@ -64,7 +66,11 @@ def characters(text)
6466
end
6567

6668
# set block levels
67-
@block_tag_level = @tag_level if @block_tag_level == -1
69+
if @block_tag_level == -1
70+
puts "-1 setting block level tag_level: #{@tag_level}"
71+
@block_tag_level = @tag_level
72+
end
73+
puts "block_tag_level: #{@block_tag_level}"
6874

6975
append_space if started_with_whitespace
7076
append_text(text)

spec/filters/block_proximity_fusion_spec.rb

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
module Boilerpipe::Filters
44
describe BlockProximityFusion do
5-
let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 0, 0) }
6-
let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 0, 1) }
7-
let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 0, 2) }
8-
let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 0, 3) }
5+
let(:text_block1) { Boilerpipe::Document::TextBlock.new('one', 0, 0, 0, 1, 0) }
6+
let(:text_block2) { Boilerpipe::Document::TextBlock.new('two', 0, 0, 0, 1, 1) }
7+
let(:text_block3) { Boilerpipe::Document::TextBlock.new('three', 0, 0, 0, 1, 2) }
8+
let(:text_block4) { Boilerpipe::Document::TextBlock.new('four', 0, 0, 0, 1, 3) }
99

1010
let(:text_blocks) { [text_block1, text_block2, text_block3, text_block4] }
1111
let!(:doc) { Boilerpipe::Document::TextDocument.new('', text_blocks) }
@@ -18,6 +18,7 @@ module Boilerpipe::Filters
1818

1919
describe '#process' do
2020
context 'where blocks exceed distance' do
21+
# only_content: true, same_tag_level: false
2122
it 'doesnt change blocks' do
2223
expect(doc.text_blocks.size).to eq 4
2324
filter = BlockProximityFusion.new(1, true, false)
@@ -27,10 +28,15 @@ module Boilerpipe::Filters
2728
end
2829

2930
context 'where blocks do not exceed distance' do
31+
# only_content: false, same_tag_level: false
3032
it 'Fuses adjacent blocks' do
33+
puts doc.text_blocks.map(&:text).inspect
34+
puts doc.debug_s
3135
expect(doc.text_blocks.last.text.size).to eq 4
3236
filter = BlockProximityFusion.new(1, false, false)
3337
filter.process(doc)
38+
puts doc.text_blocks.map(&:text).inspect
39+
puts doc.debug_s
3440
expect(doc.text_blocks.last.text).to eq "three\nfour"
3541
end
3642

0 commit comments

Comments
 (0)