Skip to content

Commit 34f9af9

Browse files
committed
Merge branch 'master' of https://github.com/Wikidata-lib/PropertySuggester-Python into ConsiderClassifyingProperties
Conflicts: propertysuggester/parser/JsonReader.py, propertysuggester/test/parser/test_json_reader.py
2 parents deccf9b + 9f97719 commit 34f9af9

File tree

3 files changed

+33
-24
lines changed

3 files changed

+33
-24
lines changed

propertysuggester/parser/JsonReader.py

Lines changed: 31 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -15,52 +15,59 @@
1515
print "ujson not found"
1616
import json as json
1717

18+
1819
def read_json(input_file):
20+
"""
21+
@rtype : collections.Iterable[Entity]
22+
@type input_file: file or GzipFile or StringIO.StringIO
23+
"""
1924
count = 0
2025
for jsonline in input_file:
21-
count += 1
26+
count += 1
2227
if count % 3000 == 0:
2328
print "processed %.2fMB" % (input_file.tell() / 1024.0 ** 2)
24-
jsonline = jsonline[:-2]
25-
try:
29+
30+
if jsonline[0] == "{":
31+
jsonline = jsonline.rstrip(",\r\n")
2632
data = json.loads(jsonline)
27-
except ValueError:
28-
continue
29-
if data["type"] == "item":
30-
yield _process_json(data)
33+
if data["type"] == "item":
34+
yield _process_json(data)
35+
3136

3237
def _process_json(data):
3338
title = data["id"]
3439
if not "claims" in data:
3540
return Entity(title, [])
3641
claims = []
37-
for prop, statements in data["claims"].iteritems():
42+
for property_id, statements in data["claims"].iteritems():
3843
for statement in statements:
3944
references = []
4045
if "references" in statement:
41-
for prop, snaks in statement["references"][0]["snaks"].iteritems():
42-
for snak in snaks:
43-
ref = _parse_json_snak(snak)
44-
if ref:
45-
references.append(ref)
46+
for reference in statement["references"]: # TODO: group reference snaks correctly
47+
for ref_id, snaks in reference["snaks"].iteritems():
48+
for snak in snaks:
49+
ref = _parse_json_snak(snak)
50+
if ref:
51+
references.append(ref)
4652
qualifiers = []
47-
if "qualifiers" in statement:
48-
for prop, snaks in statement["qualifiers"].iteritems():
49-
for snak in snaks:
53+
if "qualifiers" in statement:
54+
for qual_id, snaks in statement["qualifiers"].iteritems():
55+
for snak in snaks:
5056
qualifier = _parse_json_snak(snak)
51-
if qualifier:
52-
qualifiers.append(qualifier)
57+
if qualifier:
58+
qualifiers.append(qualifier)
5359
claim = _parse_json_snak(statement["mainsnak"])
5460
if claim:
5561
claims.append(Claim(claim, qualifiers, references))
5662

5763
return Entity(title, claims)
5864

65+
5966
def _parse_json_snak(claim_json):
6067
if claim_json["snaktype"] == "value":
6168
datatype = claim_json["datatype"]
6269
datavalue = claim_json["datavalue"]["value"]
63-
if datatype == "string":
70+
if datatype in ("string", "commonsMedia", "url"):
6471
value = datavalue
6572
elif datatype == "wikibase-item":
6673
if datavalue["entity-type"] == "item":
@@ -72,13 +79,15 @@ def _parse_json_snak(claim_json):
7279
elif datatype == "quantity":
7380
value = datavalue["amount"]
7481
elif datatype == "globe-coordinate":
75-
value = "N{0}, E{1}".format(datavalue["latitude"], datavalue["longitude"])
82+
value = "N{0[latitude]}, E{0[longitude]}".format(datavalue)
83+
elif datatype == "monolingualtext":
84+
value = u"{0[text]} ({0[language]})".format(datavalue)
7685
elif datatype == "bad":
7786
# for example in Q2241
7887
return None
7988
else:
80-
#print "WARNING unknown wikidata datatype: %s" % datatype
81-
value = "irrelevant"
89+
print "WARNING unknown wikidata datatype: %s" % datatype
90+
return None
8291
else: # novalue, somevalue, ...
8392
datatype = "unknown"
8493
value = claim_json["snaktype"]

propertysuggester/test/parser/test_csv_reader.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def test_universe(self):
2828
def test_multiple_entities(self):
2929
out = StringIO()
3030
out.writelines(["Q1,claim,373,string,Universe\n",
31-
"Q2,claim,143,wikibase-entityid,Q328\n"])
31+
"Q2,claim,143,wikibase-item,Q328\n"])
3232
out.seek(0)
3333
result = list(CsvReader.read_csv(out))
3434

propertysuggester/test/parser/test_json_reader.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ def test_updated_dump(self):
1616
with gzip.open(resource_filename(__name__, "Wikidata-Q15511.json.gz"), "r") as f:
1717
result = list(JsonReader.read_json(f))
1818

19-
self.assertThat(len(result), Equals(1))
19+
self.assertThat(result, HasLength(1))
2020
q15511 = result[0]
2121
self.assertThat(q15511.title, Equals("Q15511"))
2222
self.assertThat(q15511.claims, Contains(Claim(Snak(1082, "quantity", "+25"), [Snak(585, "time", "+00000002001-01-01T00:00:00Z"), Snak(459, "wikibase-item", "Q745221")], [Snak(248, "wikibase-item", "Q17597573")])))

0 commit comments

Comments
 (0)