Skip to content

Commit c8c0f34

Browse files
committed
updated Python referer_parser to match Java version, added tests
1 parent 79bfd36 commit c8c0f34

File tree

2 files changed

+286
-19
lines changed

2 files changed

+286
-19
lines changed

python/referer_parser/__init__.py

Lines changed: 82 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,91 @@
55

66
JSON_FILE = os.path.join(os.path.dirname(__file__), 'data', 'referers.json')

# Lookup table populated once at import time from JSON_FILE.
# Key:   a domain string exactly as listed under a referer's 'domains'.
# Value: {'name': <referer name>, 'medium': <top-level JSON key>} plus
#        'params' (lower-cased list) when the referer declares 'parameters'.
REFERERS = {}

with open(JSON_FILE) as json_content:
    # The JSON document is a two-level mapping: medium -> referer name -> config.
    for medium, conf_list in json.load(json_content).items():
        for ref, config in conf_list.items():
            for domain in config['domains']:
                # Each domain gets its own dict so entries never alias each other.
                entry = {'name': ref, 'medium': medium}
                if 'parameters' in config:
                    # Must be a real list: it is tested for emptiness and used
                    # for repeated membership checks when parsing a referer.
                    entry['params'] = [p.lower() for p in config['parameters']]
                REFERERS[domain] = entry
1425

1526

1627
class Referer(object):
    """Classify a referer URL against the REFERERS lookup table.

    All results are exposed as attributes set by the constructor:

    known            -- True when the referer URL scheme is http or https.
    referer          -- name of the matched referer, or None.
    medium           -- 'internal' when the referer host equals the host of
                        ``curr_url``; the matched entry's medium when a
                        referer is found; otherwise 'unknown'.
    search_parameter -- query parameter that carried the search term, or None.
    search_term      -- value of that parameter, or None.
    """

    def __init__(self, ref_url, curr_url=None):
        """Parse ``ref_url`` and populate the result attributes.

        ref_url  -- the referer URL to classify.
        curr_url -- optional URL of the current page; when its host matches
                    the referer host the medium is 'internal'.
        """
        self.known = False
        self.referer = None
        self.medium = 'unknown'
        self.search_parameter = None
        self.search_term = None

        ref_uri = urlparse(ref_url)
        ref_host = ref_uri.hostname
        self.known = ref_uri.scheme in {'http', 'https'}
        if not self.known:
            return

        # An http(s) URL can still lack a host (e.g. "http:///path"), in which
        # case hostname is None. There is nothing to look up, and it must not
        # be treated as internal just because curr_url is hostless as well.
        if not ref_host:
            return

        if curr_url:
            curr_host = urlparse(curr_url).hostname
            if curr_host == ref_host:
                self.medium = 'internal'
                return

        # Prefer an entry keyed on host + path; fall back to host only.
        referer = self.__lookup_referer(ref_host, ref_uri.path, True)
        if not referer:
            referer = self.__lookup_referer(ref_host, ref_uri.path, False)
        if not referer:
            return

        self.referer = referer['name']
        self.medium = referer['medium']

        if self.medium != 'search':
            return
        params = referer.get('params')
        if not params:
            return

        # No early exit: when several query parameters match, the last one
        # in the query string is the one reported.
        for param, val in parse_qsl(ref_uri.query):
            if param.lower() in params:
                self.search_parameter = param
                self.search_term = val

    def __lookup_referer(self, ref_host, ref_path, include_path):
        """Return the REFERERS entry for a host, or None when nothing matches.

        The host is tried as given and then with its leftmost label removed,
        repeatedly, until a match is found or no '.' remains.

        ref_host     -- host name to look up (must be a string).
        ref_path     -- path component of the referer URL.
        include_path -- when True, keys are host + full path and then
                        host + '/' + first path segment; when False, the key
                        is the bare host.
        """
        # The first-segment suffix does not depend on the host, so compute it
        # once instead of on every label-stripping step.
        first_segment = None
        if include_path:
            path_parts = ref_path.split('/')
            if len(path_parts) > 1:
                first_segment = '/' + path_parts[1]

        # Iterative on purpose: the host comes from an untrusted header and a
        # value with very many dots would otherwise recurse once per label.
        host = ref_host
        while True:
            if include_path:
                referer = REFERERS.get(host + ref_path)
                if referer is None and first_segment is not None:
                    referer = REFERERS.get(host + first_segment)
            else:
                referer = REFERERS.get(host)
            if referer:
                return referer

            dot = host.find('.')
            if dot == -1:
                return None
            host = host[dot + 1:]

python/test_referer_parser.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import unittest
2+
from referer_parser import Referer
3+
4+
class TestRefererParsing(unittest.TestCase):
5+
def check_equals(self, ref_obj, referer, term, medium):
6+
self.assertTrue(ref_obj.known)
7+
self.assertEqual(ref_obj.referer, referer)
8+
self.assertEqual(ref_obj.search_term, term)
9+
self.assertEqual(ref_obj.medium, medium)
10+
11+
def check_no_term(self, ref_obj, referer, medium):
12+
self.assertTrue(ref_obj.known)
13+
self.assertEqual(ref_obj.referer, referer)
14+
self.assertIsNone(ref_obj.search_term)
15+
self.assertEqual(ref_obj.medium, medium)
16+
17+
def test_google_minimal(self):
18+
""" Google search #1
19+
"""
20+
r = Referer('http://www.google.com/search')
21+
# r.known, r.referer, r.search_term, r.medium
22+
self.assertTrue(r.known)
23+
self.assertEqual(r.referer, 'Google')
24+
self.assertIsNone(r.search_term)
25+
self.assertEqual(r.medium, 'search')
26+
27+
def test_google_term(self):
28+
""" Google search #2
29+
"""
30+
r = Referer('http://www.google.com/search?q=gateway+oracle+cards+denise+linn&hl=\
31+
en&client=safari')
32+
self.check_equals(r, 'Google', 'gateway oracle cards denise linn', 'search')
33+
34+
def test_powered_by_google(self):
35+
""" Powered by Google
36+
"""
37+
r = Referer("""http://isearch.avg.com/pages/images.aspx?q=tarot+card+change&sap=\
38+
dsp&lang=en&mid=209215200c4147d1a9d6d1565005540b-b0d4f81a8999f5981f04537c5ec8468fd523459\
39+
3&cid=%7B50F9298B-C111-4C7E-9740-363BF0015949%7D&v=12.1.0.21&ds=AVG&d=7%2F23%2F2012+10%3\
40+
A31%3A08+PM&pr=fr&sba=06oENya4ZG1YS6vOLJwpLiFdjG91ICt2YE59W2p5ENc2c4w8KvJb5xbvjkj3ceMjny\
41+
TSpZq-e6pj7GQUylIQtuK4psJU60wZuI-8PbjX-OqtdX3eIcxbMoxg3qnIasP0ww2fuID1B-p2qJln8vBHxWztkp\
42+
xeixjZPSppHnrb9fEcx62a9DOR0pZ-V-Kjhd-85bIL0QG5qi1OuA4M1eOP4i_NzJQVRXPQDmXb-CpIcruc2h5FE9\
43+
2Tc8QMUtNiTEWBbX-QiCoXlgbHLpJo5Jlq-zcOisOHNWU2RSHYJnK7IUe_SH6iQ.%2CYT0zO2s9MTA7aD1mNjZmZ\
44+
DBjMjVmZDAxMGU4&snd=hdr&tc=test1""")
45+
self.check_equals(r, 'Google', 'tarot card change', 'search')
46+
47+
def test_google_img_search(self):
48+
""" Google Images search
49+
"""
50+
r = Referer("""http://www.google.fr/imgres?q=Ogham+the+celtic+oracle&hl=fr&safe=\
51+
off&client=firefox-a&hs=ZDu&sa=X&rls=org.mozilla:fr-FR:unofficial&tbm=isch&prmd=imvnsa&t\
52+
bnid=HUVaj-o88ZRdYM:&imgrefurl=http://www.psychicbazaar.com/oracles/101-ogham-the-celtic\
53+
-oracle-set.html&docid=DY5_pPFMliYUQM&imgurl=http://mdm.pbzstatic.com/oracles/ogham-the-\
54+
celtic-oracle-set/montage.png&w=734&h=250&ei=GPdWUIePCOqK0AWp3oCQBA&zoom=1&iact=hc&vpx=1\
55+
29&vpy=276&dur=827&hovh=131&hovw=385&tx=204&ty=71&sig=104115776612919232039&page=1&tbnh=\
56+
69&tbnw=202&start=0&ndsp=26&ved=1t:429,r:13,s:0,i:114&biw=1272&bih=826""")
57+
self.check_equals(r, 'Google Images', 'Ogham the celtic oracle', 'search')
58+
59+
def test_yahoo_search(self):
60+
""" Yahoo! search
61+
"""
62+
r = Referer("""http://es.search.yahoo.com/search;_ylt=A7x9QbwbZXxQ9EMAPCKT.Qt.?p=\
63+
BIEDERMEIER+FORTUNE+TELLING+CARDS&ei=utf-8&type=685749&fr=chr-greentree_gc&xargs=0&pstar\
64+
t=1&b=11""")
65+
self.check_equals(r, 'Yahoo!', 'BIEDERMEIER FORTUNE TELLING CARDS', 'search')
66+
67+
def test_yahoo_img_search(self):
68+
""" Yahoo! Images search
69+
"""
70+
r = Referer("""http://it.images.search.yahoo.com/images/view;_ylt=A0PDodgQmGBQpn\
71+
4AWQgdDQx.;_ylu=X3oDMTBlMTQ4cGxyBHNlYwNzcgRzbGsDaW1n?back=http%3A%2F%2Fit.images.search.\
72+
yahoo.com%2Fsearch%2Fimages%3Fp%3DEarth%2BMagic%2BOracle%2BCards%26fr%3Dmcafee%26fr2%3Dp\
73+
iv-web%26tab%3Dorganic%26ri%3D5&w=1064&h=1551&imgurl=mdm.pbzstatic.com%2Foracles%2Fearth\
74+
-magic-oracle-cards%2Fcard-1.png&rurl=http%3A%2F%2Fwww.psychicbazaar.com%2Foracles%2F143\
75+
-earth-magic-oracle-cards.html&size=2.8+KB&name=Earth+Magic+Oracle+Cards+-+Psychic+Bazaa\
76+
r&p=Earth+Magic+Oracle+Cards&oid=f0a5ad5c4211efe1c07515f56cf5a78e&fr2=piv-web&fr=mcafee&\
77+
tt=Earth%2BMagic%2BOracle%2BCards%2B-%2BPsychic%2BBazaar&b=0&ni=90&no=5&ts=&tab=organic&\
78+
sigr=126n355ib&sigb=13hbudmkc&sigi=11ta8f0gd&.crumb=IZBOU1c0UHU""")
79+
self.check_equals(r, 'Yahoo! Images', 'Earth Magic Oracle Cards', 'search')
80+
81+
def test_price_runner_search(self):
82+
""" PriceRunner search
83+
"""
84+
r = Referer("""http://www.pricerunner.co.uk/search?displayNoHitsMessage=1&q=wild\
85+
+wisdom+of+the+faery+oracle""")
86+
self.check_equals(r, 'PriceRunner', 'wild wisdom of the faery oracle', 'search')
87+
88+
def test_bing_img(self):
89+
""" Bing Images
90+
"""
91+
r = Referer("""http://www.bing.com/images/search?q=psychic+oracle+cards&view=det\
92+
ail&id=D268EDDEA8D3BF20AF887E62AF41E8518FE96F08""")
93+
self.check_equals(r, 'Bing Images', 'psychic oracle cards', 'search')
94+
95+
def test_ixquick(self):
96+
""" IXquick search
97+
"""
98+
r = Referer("""https://s3-us3.ixquick.com/do/search""")
99+
self.assertTrue(r.known)
100+
self.assertEqual(r.referer, 'IXquick')
101+
self.assertIsNone(r.search_term)
102+
self.assertEqual(r.medium, 'search')
103+
104+
def test_aol_search(self):
105+
""" AOL search
106+
"""
107+
r = Referer("""http://aolsearch.aol.co.uk/aol/search?s_chn=hp&enabled_terms=&s_i\
108+
t=aoluk-homePage50&q=pendulums""")
109+
self.check_equals(r, 'AOL', 'pendulums', 'search')
110+
111+
def test_ask_search(self):
112+
""" Ask search
113+
"""
114+
r = Referer("""http://uk.search-results.com/web?qsrc=1&o=1921&l=dis&q=pendulums&\
115+
dm=ctry&atb=sysid%3D406%3Aappid%3D113%3Auid%3D8f40f651e7b608b5%3Auc%3D1346336505%3Aqu%3D\
116+
pendulums%3Asrc%3Dcrt%3Ao%3D1921&locale=en_GB""")
117+
self.check_equals(r, 'Ask', 'pendulums', 'search')
118+
119+
def test_mailru_search(self):
120+
""" Mail.ru search
121+
"""
122+
r = Referer("""http://go.mail.ru/search?q=Gothic%20Tarot%20Cards&where=any&num=1\
123+
0&rch=e&sf=20""")
124+
self.check_equals(r, 'Mail.ru', 'Gothic Tarot Cards', 'search')
125+
126+
def test_yandex_search(self):
127+
""" Yandex search
128+
"""
129+
r = Referer("""http://images.yandex.ru/yandsearch?text=Blue%20Angel%20Oracle%20B\
130+
lue%20Angel%20Oracle&noreask=1&pos=16&rpt=simage&lr=45&img_url=http%3A%2F%2Fmdm.pbzstati\
131+
c.com%2Foracles%2Fblue-angel-oracle%2Fbox-small.png""")
132+
self.check_equals(r, 'Yandex Images', 'Blue Angel Oracle Blue Angel Oracle', 'search')
133+
134+
def test_twitter_redirect(self):
135+
""" Twitter redirect
136+
"""
137+
r = Referer("""http://t.co/chrgFZDb""")
138+
self.check_no_term(r, 'Twitter', 'social')
139+
140+
def test_fb_social(self):
141+
""" Facebook social
142+
"""
143+
r = Referer("""http://www.facebook.com/l.php?u=http%3A%2F%2Fwww.psychicbazaar.co\
144+
m&h=yAQHZtXxS&s=1""")
145+
self.check_no_term(r, 'Facebook', 'social')
146+
147+
def test_fb_mobile(self):
148+
""" Facebook mobile
149+
"""
150+
r = Referer("""http://m.facebook.com/l.php?u=http%3A%2F%2Fwww.psychicbazaar.com%\
151+
2Fblog%2F2012%2F09%2Fpsychic-bazaar-reviews-tarot-foundations-31-days-to-read-tarot-with\
152+
-confidence%2F&h=kAQGXKbf9&s=1""")
153+
self.check_no_term(r, 'Facebook', 'social')
154+
155+
def test_odnoklassniki(self):
156+
""" Odnoklassniki
157+
"""
158+
r = Referer("""http://www.odnoklassniki.ru/dk?cmd=logExternal&st._aid=Conversati\
159+
ons_Openlink&st.name=externalLinkRedirect&st.link=http%3A%2F%2Fwww.psychicbazaar.com%2Fo\
160+
racles%2F187-blue-angel-oracle.html""")
161+
self.check_no_term(r, 'Odnoklassniki', 'social')
162+
163+
def test_tumblr(self):
164+
""" Tumblr social #1
165+
"""
166+
r = Referer("http://www.tumblr.com/dashboard")
167+
self.check_no_term(r, 'Tumblr', 'social')
168+
169+
def test_tumblr_subdomain(self):
170+
""" Tumblr w subdomain
171+
"""
172+
r = Referer("http://psychicbazaar.tumblr.com/")
173+
self.check_no_term(r, 'Tumblr', 'social')
174+
175+
def test_yahoo_mail(self):
176+
""" Yahoo! Mail
177+
"""
178+
r = Referer("""http://36ohk6dgmcd1n-c.c.yom.mail.yahoo.net/om/api/1.0/openmail.a\
179+
pp.invoke/36ohk6dgmcd1n/11/1.0.35/us/en-US/view.html/0""")
180+
self.check_no_term(r, 'Yahoo! Mail', 'email')
181+
182+
def test_outlookcom_mail(self):
183+
""" Outlook.com mail
184+
"""
185+
r = Referer("http://co106w.col106.mail.live.com/default.aspx?rru=inbox")
186+
self.check_no_term(r, 'Outlook.com', 'email')
187+
188+
def test_orange_webmail(self):
189+
""" Orange Webmail
190+
"""
191+
r = Referer("""http://webmail1m.orange.fr/webmail/fr_FR/read.html?FOLDER=SF_INBO\
192+
X&IDMSG=8594&check=&SORTBY=31""")
193+
self.check_no_term(r, 'Orange Webmail', 'email')
194+
195+
def test_internal(self):
196+
r = Referer("http://www.snowplowanalytics.com/about/team",
197+
"http://www.snowplowanalytics.com/account/profile")
198+
self.assertTrue(r.known)
199+
self.assertEqual(r.medium, 'internal')
200+
self.assertIsNone(r.search_term)
201+
self.assertIsNone(r.referer)
202+
203+
# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)