Skip to content

Commit c80e1b3

Browse files
committed
Fix: only the first qualifier was parsed
Fix: only the first reference group was parsed; add url support
1 parent c98be93 commit c80e1b3

File tree

1 file changed

+24
-21
lines changed

1 file changed

+24
-21
lines changed

propertysuggester/parser/JsonReader.py

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,56 +15,59 @@
1515
print "ujson not found"
1616
import json as json
1717

18+
1819
def read_json(input_file):
1920
"""
2021
@rtype : collections.Iterable[Entity]
2122
@type input_file: file or GzipFile or StringIO.StringIO
2223
"""
2324
count = 0
2425
for jsonline in input_file:
25-
count += 1
26+
count += 1
2627
if count % 3000 == 0:
2728
print "processed %.2fMB" % (input_file.tell() / 1024.0 ** 2)
2829

29-
jsonline = jsonline.rstrip(",\r\n")
30-
31-
data = json.loads(jsonline)
30+
if jsonline[0] == "{":
31+
jsonline = jsonline.rstrip(",\r\n")
32+
data = json.loads(jsonline)
33+
if data["type"] == "item":
34+
yield _process_json(data)
3235

33-
if data["type"] == "item":
34-
yield _process_json(data)
3536

3637
def _process_json(data):
3738
title = data["id"]
3839
if not "claims" in data:
3940
return Entity(title, [])
4041
claims = []
41-
for prop, statements in data["claims"].iteritems():
42+
for property_id, statements in data["claims"].iteritems():
4243
for statement in statements:
4344
references = []
4445
if "references" in statement:
45-
for prop, snaks in statement["references"][0]["snaks"].iteritems():
46-
for snak in snaks:
47-
ref = _parse_json_snak(snak)
48-
if ref:
49-
references.append(ref)
46+
for reference in statement["references"]: # TODO: group reference snaks correctly
47+
for ref_id, snaks in reference["snaks"].iteritems():
48+
for snak in snaks:
49+
ref = _parse_json_snak(snak)
50+
if ref:
51+
references.append(ref)
5052
qualifiers = []
51-
if "qualifiers" in statement:
52-
for prop, snaks in statement["qualifiers"].iteritems():
53-
for snak in snaks:
53+
if "qualifiers" in statement:
54+
for qual_id, snaks in statement["qualifiers"].iteritems():
55+
for snak in snaks:
5456
qualifier = _parse_json_snak(snak)
55-
if qualifier:
56-
qualifiers.append(qualifier)
57+
if qualifier:
58+
qualifiers.append(qualifier)
5759
claim = _parse_json_snak(statement["mainsnak"])
5860
if claim:
5961
claims.append(Claim(claim, qualifiers, references))
6062

6163
return Entity(title, claims)
6264

65+
6366
def _parse_json_snak(claim_json):
6467
if claim_json["snaktype"] == "value":
6568
datatype = claim_json["datatype"]
6669
datavalue = claim_json["datavalue"]["value"]
67-
if datatype == "string":
70+
if datatype in ("string", "commonsMedia", "url"):
6871
value = datavalue
6972
elif datatype == "wikibase-item":
7073
if datavalue["entity-type"] == "item":
@@ -76,9 +79,9 @@ def _parse_json_snak(claim_json):
7679
elif datatype == "quantity":
7780
value = datavalue["amount"]
7881
elif datatype == "globe-coordinate":
79-
value = "N{0}, E{1}".format(datavalue["latitude"], datavalue["longitude"])
80-
elif datatype == "commonsMedia":
81-
value = datavalue
82+
value = "N{0[latitude]}, E{0[longitude]}".format(datavalue)
83+
elif datatype == "monolingualtext":
84+
value = u"{0[text]} ({0[language]})".format(datavalue)
8285
elif datatype == "bad":
8386
# for example in Q2241
8487
return None

0 commit comments

Comments
 (0)