Skip to content

Commit 3a8deca

Browse files
author
Kelley Reynolds
committed
Slightly refactor to allow adding of referers by method instead of file
1 parent d746f84 commit 3a8deca

File tree

3 files changed

+131
-58
lines changed

3 files changed

+131
-58
lines changed

ruby/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ parser = RefererParser::Parser.new('http://example.com/path/to/other/referers.ym
4646
parser = RefererParser::Parser.new
4747
parser.update('/path/to/internal.yml')
4848

49+
# Default referers, then add your own internal domain inline instead of from a file
50+
parser = RefererParser::Parser.new
51+
parser.add_referer('internal', 'SnowPlow', 'snowplowanalytics.com')
52+
4953
# Clear all of the existing referers
5054
parser.clear!
5155
```

ruby/lib/referer-parser/parser.rb

Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ class Parser
2222

2323
# Create a new parser from one or more filenames/uris, defaults to ../data/referers.json
2424
def initialize(uris=DefaultFile)
25+
@domain_index ||= {}
26+
@name_hash ||= {}
27+
2528
update(uris)
2629
end
2730

@@ -37,10 +40,48 @@ def update(uris)
3740
# Clean out the database
3841
def clear!
3942
@domain_index, @name_hash = {}, {}
40-
43+
4144
true
4245
end
4346

47+
# Add a referer to the database with medium, name, domain or array of domains, and a parameter or array of parameters
48+
# If called manually and a domain is added to an existing entry with a path, you may need to call optimize_index! afterwards.
49+
def add_referer(medium, name, domains, parameters=nil)
50+
# The same name can be used with multiple mediums so we make a key here
51+
name_key = "#{name}-#{medium}"
52+
53+
# Update the name hash with the parameter and medium data
54+
@name_hash[name_key] = {:source => name, :medium => medium, :parameters => [parameters].flatten }
55+
56+
# Update the domain to name index
57+
[domains].flatten.each do |domain_url|
58+
domain, *path = domain_url.split('/')
59+
if domain =~ /\Awww\.(.*)\z/i
60+
domain = $1
61+
end
62+
63+
domain.downcase!
64+
65+
@domain_index[domain] ||= []
66+
if !path.empty?
67+
@domain_index[domain] << ['/' + path.join('/'), name_key]
68+
else
69+
@domain_index[domain] << ['/', name_key]
70+
end
71+
end
72+
end
73+
74+
# Prune duplicate entries and sort with the most specific path first if there is more than one entry
75+
# In this case, sorting by the longest string works fine
76+
def optimize_index!
77+
@domain_index.each do |key, val|
78+
# Sort each path/name_key pair by the longest path
79+
@domain_index[key].sort! { |a, b|
80+
b[0].size <=> a[0].size
81+
}.uniq!
82+
end
83+
end
84+
4485
# Given a string or URI, return a hash of data
4586
def parse(obj)
4687
url = obj.is_a?(URI) ? obj : URI.parse(obj.to_s)
@@ -141,7 +182,7 @@ def deserialize_json(data)
141182

142183
def read_referer_data(uri)
143184
# Attempt to read the data from the network if applicable, or the file on the local system
144-
if uri =~ /^(?:ht|f)tps?:\/\//
185+
if uri =~ /\A(?:ht|f)tps?:\/\//
145186
require 'open-uri'
146187
begin
147188
open(uri).read
@@ -160,44 +201,13 @@ def read_referer_data(uri)
160201
# Format of the name_hash:
161202
# { name_key => {:source, :medium, :parameters} }
162203
def parse_referer_data(data)
163-
@domain_index ||= {}
164-
@name_hash ||= {}
165-
166204
data.each do |medium, name_hash|
167205
name_hash.each do |name, name_data|
168-
# The same name can be used with multiple mediums so we make a key here
169-
name_key = "#{name}-#{medium}"
170-
171-
# Update the name hash with the parameter and medium data
172-
@name_hash[name_key] = {:source => name, :medium => medium, :parameters => name_data['parameters'] }
173-
174-
# Update the domain to name index
175-
name_data['domains'].each do |domain_url|
176-
domain, *path = domain_url.split('/')
177-
if domain =~ /\Awww\.(.*)\z/i
178-
domain = $1
179-
end
180-
181-
domain.downcase!
182-
183-
@domain_index[domain] ||= []
184-
if !path.empty?
185-
@domain_index[domain] << ['/' + path.join('/'), name_key]
186-
else
187-
@domain_index[domain] << ['/', name_key]
188-
end
189-
end
206+
add_referer(medium, name, name_data['domains'], name_data['parameters'])
190207
end
191208
end
192209

193-
# Prune duplicate entries and sort with the most specific path first if there is more than one entry
194-
# In this case, sorting by the longest string works fine
195-
@domain_index.each do |key, val|
196-
# Sort each path/name_key pair by the longest path
197-
@domain_index[key].sort! { |a, b|
198-
b[0].size <=> a[0].size
199-
}.uniq!
200-
end
210+
optimize_index!
201211
rescue
202212
raise CorruptReferersError.new("Unable to parse referer data", $!)
203213
end

ruby/spec/parser_spec.rb

Lines changed: 82 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,79 +21,89 @@
2121
let(:internal_parser) { RefererParser::Parser.new(fixture('internal.json')) }
2222
let(:combined_parser) { RefererParser::Parser.new([RefererParser::Parser::DefaultFile, fixture('internal.json')]) }
2323
let(:remote_parser) { RefererParser::Parser.new(remote_file) }
24+
let(:domain_index) { parser.instance_variable_get(:@domain_index) }
25+
let(:name_hash) { parser.instance_variable_get(:@name_hash) }
26+
27+
# This gets overridden for different parsers in subsections
28+
let(:parser) { default_parser }
2429

2530
describe "exceptions" do
2631
it "should raise UnsupportedFormatError" do
27-
lambda { default_parser.update(__FILE__) }.should raise_error(RefererParser::UnsupportedFormatError)
32+
lambda { parser.update(__FILE__) }.should raise_error(RefererParser::UnsupportedFormatError)
2833
end
2934

3035
it "should raise CorruptReferersError with invalid json" do
31-
lambda { default_parser.update(fixture('invalid.json')) }.should raise_error(RefererParser::CorruptReferersError)
36+
lambda { parser.update(fixture('invalid.json')) }.should raise_error(RefererParser::CorruptReferersError)
3237
end
3338

3439
it "should raise CorruptReferersError with invalid yaml" do
35-
lambda { default_parser.update(fixture('invalid.yml')) }.should raise_error(RefererParser::CorruptReferersError)
40+
lambda { parser.update(fixture('invalid.yml')) }.should raise_error(RefererParser::CorruptReferersError)
3641
end
3742

3843
it "should raise CorruptReferersError with valid file with invalid data" do
39-
lambda { default_parser.update(fixture('referer-tests.json')) }.should raise_error(RefererParser::CorruptReferersError)
44+
lambda { parser.update(fixture('referer-tests.json')) }.should raise_error(RefererParser::CorruptReferersError)
4045
end
4146

4247
it "should raise InvalidUriError with insane" do
43-
lambda { default_parser.parse('>total gibberish<') }.should raise_error(RefererParser::InvalidUriError)
48+
lambda { parser.parse('>total gibberish<') }.should raise_error(RefererParser::InvalidUriError)
4449
end
4550

4651
it "should raise InvalidUriError with non http(s)" do
47-
lambda { default_parser.parse('ftp://ftp.really.com/whatever.json') }.should raise_error(RefererParser::InvalidUriError)
52+
lambda { parser.parse('ftp://ftp.really.com/whatever.json') }.should raise_error(RefererParser::InvalidUriError)
4853
end
4954
end
5055

5156
describe "with the default parser" do
5257
it "should have a non-empty domain_index" do
53-
default_parser.instance_variable_get(:@domain_index).should_not be_empty
58+
domain_index.should_not be_empty
5459
end
5560

5661
it "should have a non-empty name_hash" do
57-
default_parser.instance_variable_get(:@name_hash).should_not be_empty
62+
name_hash.should_not be_empty
5863
end
5964

6065
it "should be clearable" do
61-
default_parser.clear!
62-
default_parser.instance_variable_get(:@name_hash).should be_empty
63-
default_parser.instance_variable_get(:@domain_index).should be_empty
66+
parser.clear!
67+
name_hash.should be_empty
68+
domain_index.should be_empty
6469
end
6570

6671
it "should be updatable" do
67-
size = default_parser.instance_variable_get(:@domain_index).size
68-
default_parser.update(fixture('internal.json'))
69-
default_parser.instance_variable_get(:@domain_index).size.should > size
72+
size = domain_index.size
73+
parser.update(fixture('internal.json'))
74+
domain_index.size.should > size
7075
end
7176
end
7277

7378
describe "with the internal parser" do
79+
let(:parser) { internal_parser }
80+
7481
it "should have internal mediums only" do
75-
internal_parser.instance_variable_get(:@domain_index).each_value do |(arr)|
82+
domain_index.each_value do |(arr)|
7683
path, name_key = arr[0], arr[1]
77-
internal_parser.instance_variable_get(:@name_hash)[name_key][:medium].should == 'internal'
84+
name_hash[name_key][:medium].should == 'internal'
7885
end
7986
end
8087
end
8188

8289
describe "with the remote parser" do
90+
let(:parser) { remote_parser }
91+
8392
# These are combined here to reduce network fetches
8493
it "should have a non-empty domain_index and name_hash" do
85-
remote_parser.instance_variable_get(:@domain_index).should_not be_empty
86-
remote_parser.instance_variable_get(:@name_hash).should_not be_empty
94+
domain_index.should_not be_empty
95+
name_hash.should_not be_empty
8796
end
8897
end
8998

9099
describe "sample fixtures" do
100+
let(:parser) { combined_parser }
91101
# Include our internal data as well
92102
JSON.parse(File.read(File.join(File.dirname(__FILE__), 'fixtures', 'referer-tests.json'))).each do |fixture|
93103
it fixture['spec'] do
94104
parsed_as_string, parsed_as_uri = nil, nil
95-
lambda { parsed_as_string = combined_parser.parse(fixture['uri']) }.should_not raise_error
96-
lambda { parsed_as_uri = combined_parser.parse(URI.parse(fixture['uri'])) }.should_not raise_error
105+
lambda { parsed_as_string = parser.parse(fixture['uri']) }.should_not raise_error
106+
lambda { parsed_as_uri = parser.parse(URI.parse(fixture['uri'])) }.should_not raise_error
97107

98108
['source', 'term', 'known', 'medium'].each do |key|
99109
parsed_as_uri[key.to_sym].should == fixture[key]
@@ -105,18 +115,67 @@
105115

106116
describe "general behavior" do
107117
it "should return the better result when the referer contains two or more parameters" do
108-
parsed = default_parser.parse("http://search.tiscali.it/?tiscalitype=web&collection=web&q=&key=hello")
118+
parsed = parser.parse("http://search.tiscali.it/?tiscalitype=web&collection=web&q=&key=hello")
109119
parsed[:term].should == "hello"
110120
end
111121

112122
it "should return the better result when the referer contains same parameters" do
113-
parsed = default_parser.parse("http://search.tiscali.it/?tiscalitype=web&collection=web&key=&key=hello")
123+
parsed = parser.parse("http://search.tiscali.it/?tiscalitype=web&collection=web&key=&key=hello")
114124
parsed[:term].should == "hello"
115125
end
116126

117127
it "should return the normalized domain" do
118-
parsed = default_parser.parse("http://it.images.search.YAHOO.COM/images/view;_ylt=A0PDodgQmGBQpn4AWQgdDQx.;_ylu=X3oDMTBlMTQ4cGxyBHNlYwNzcgRzbGsDaW1n?back=http%3A%2F%2Fit.images.search.yahoo.com%2Fsearch%2Fimages%3Fp%3DEarth%2BMagic%2BOracle%2BCards%26fr%3Dmcafee%26fr2%3Dpiv-web%26tab%3Dorganic%26ri%3D5&w=1064&h=1551&imgurl=mdm.pbzstatic.com%2Foracles%2Fearth-magic-oracle-cards%2Fcard-1.png&rurl=http%3A%2F%2Fwww.psychicbazaar.com%2Foracles%2F143-earth-magic-oracle-cards.html&size=2.8+KB&name=Earth+Magic+Oracle+Cards+-+Psychic+Bazaar&p=Earth+Magic+Oracle+Cards&oid=f0a5ad5c4211efe1c07515f56cf5a78e&fr2=piv-web&fr=mcafee&tt=Earth%2BMagic%2BOracle%2BCards%2B-%2BPsychic%2BBazaar&b=0&ni=90&no=5&ts=&tab=organic&sigr=126n355ib&sigb=13hbudmkc&sigi=11ta8f0gd&.crumb=IZBOU1c0UHU")
128+
parsed = parser.parse("http://it.images.search.YAHOO.COM/images/view;_ylt=A0PDodgQmGBQpn4AWQgdDQx.;_ylu=X3oDMTBlMTQ4cGxyBHNlYwNzcgRzbGsDaW1n?back=http%3A%2F%2Fit.images.search.yahoo.com%2Fsearch%2Fimages%3Fp%3DEarth%2BMagic%2BOracle%2BCards%26fr%3Dmcafee%26fr2%3Dpiv-web%26tab%3Dorganic%26ri%3D5&w=1064&h=1551&imgurl=mdm.pbzstatic.com%2Foracles%2Fearth-magic-oracle-cards%2Fcard-1.png&rurl=http%3A%2F%2Fwww.psychicbazaar.com%2Foracles%2F143-earth-magic-oracle-cards.html&size=2.8+KB&name=Earth+Magic+Oracle+Cards+-+Psychic+Bazaar&p=Earth+Magic+Oracle+Cards&oid=f0a5ad5c4211efe1c07515f56cf5a78e&fr2=piv-web&fr=mcafee&tt=Earth%2BMagic%2BOracle%2BCards%2B-%2BPsychic%2BBazaar&b=0&ni=90&no=5&ts=&tab=organic&sigr=126n355ib&sigb=13hbudmkc&sigi=11ta8f0gd&.crumb=IZBOU1c0UHU")
119129
parsed[:domain].should == "images.search.yahoo.com"
120130
end
121131
end
132+
133+
describe "optimize_index" do
134+
let(:domains) { ['fnord.com', 'fnord.com', 'fnord.com/path'] }
135+
136+
before do
137+
parser.add_referer('internal', 'Fnord', domains)
138+
end
139+
140+
it "should have out of order and duplicate domains before optimization" do
141+
domain_index['fnord.com'].transpose.first.should == ['/', '/', '/path']
142+
end
143+
144+
it "should have out of order domains before optimization" do
145+
parser.optimize_index!
146+
domain_index['fnord.com'].transpose.first.should == ['/path', '/']
147+
end
148+
end
149+
150+
describe "add_referer" do
151+
it "should add a referer to the domain_index" do
152+
domain_index['fnord.com'].should be_nil
153+
parser.add_referer('internal', 'Fnord', 'fnord.com')
154+
domain_index['fnord.com'].should_not be_nil
155+
end
156+
157+
it "should add a referer with multiple domains to the domain_index" do
158+
domain_index['fnord.com'].should be_nil
159+
domain_index['boo.com'].should be_nil
160+
parser.add_referer('internal', 'Fnord', ['fnord.com', 'boo.com'])
161+
domain_index['fnord.com'].should_not be_nil
162+
domain_index['boo.com'].should_not be_nil
163+
end
164+
165+
it "should add a referer to the name_hash" do
166+
name_hash['fnord.com-internal'].should be_nil
167+
parser.add_referer('internal', 'Fnord', 'fnord.com')
168+
name_hash['Fnord-internal'].should_not be_nil
169+
end
170+
171+
it "should add parameters to the name_hash" do
172+
parser.add_referer('internal', 'Fnord', 'fnord.com', ['Q', 'q'])
173+
name_hash['Fnord-internal'][:parameters].should == ['Q', 'q']
174+
end
175+
176+
it "should add a single parameter to the name_hash" do
177+
parser.add_referer('internal', 'Fnord', 'fnord.com', 'q')
178+
name_hash['Fnord-internal'][:parameters].should == ['q']
179+
end
180+
end
122181
end

0 commit comments

Comments
 (0)