Skip to content

Commit 9f97719

Browse files
committed
Merge pull request #13 from Wikidata-lib/useJsonDumps
use Json Dumps for analysis
2 parents 4c29008 + c80e1b3 commit 9f97719

File tree

8 files changed

+143
-18
lines changed

8 files changed

+143
-18
lines changed

dumpconverter.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,16 @@
22
import sys
33
import time
44

5-
from propertysuggester.parser import XmlReader, CsvWriter
5+
from propertysuggester.parser import JsonReader, CsvWriter
66
from propertysuggester.utils.CompressedFileType import CompressedFileType
77

88
if __name__ == "__main__":
9-
parser = argparse.ArgumentParser(description="this program converts wikidata XML dumps to CSV data.")
10-
parser.add_argument("input", help="The XML input file (a wikidata dump)", type=CompressedFileType('r'))
9+
parser = argparse.ArgumentParser(description="this program converts wikidata JSON dumps to CSV data.")
10+
parser.add_argument("input", help="The JSON input file (a wikidata dump)", type=CompressedFileType('r'))
1111
parser.add_argument("output", help="The CSV output file (default=sys.stdout)", default=sys.stdout, nargs='?',
1212
type=CompressedFileType('wb'))
13-
parser.add_argument("-p", "--processes", help="Number of processors to use (default 4)", type=int, default=4)
13+
#parser.add_argument("-p", "--processes", help="Number of processors to use (default 4)", type=int, default=4)
1414
args = parser.parse_args()
15-
1615
start = time.time()
17-
CsvWriter.write_csv(XmlReader.read_xml(args.input, args.processes), args.output)
16+
CsvWriter.write_csv(JsonReader.read_json(args.input), args.output)
1817
print "total time: %.2fs" % (time.time() - start)
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""
2+
read_json returns a generator that yields Entities)
3+
4+
usage:
5+
with open("file.csv", "r") as f:
6+
for entity in read_json(f):
7+
do_things()
8+
9+
"""
10+
from propertysuggester.utils.datamodel import Claim, Entity, Snak
11+
12+
try:
13+
import ujson as json
14+
except ImportError:
15+
print "ujson not found"
16+
import json as json
17+
18+
19+
def read_json(input_file):
    """
    Read a Wikidata JSON dump and generate Entity objects, one per item.

    Lines that do not start with "{" (the enclosing "[" / "]" array lines
    of the dump) are skipped; entities whose "type" is not "item"
    (e.g. properties) are skipped as well.

    @type input_file: file or GzipFile or StringIO.StringIO
    @rtype : collections.Iterable[Entity]
    """
    count = 0
    for jsonline in input_file:
        count += 1
        # Progress report every 3000 lines; tell() is the position in the
        # input stream (for GzipFile this is the *uncompressed* offset).
        if count % 3000 == 0:
            print("processed %.2fMB" % (input_file.tell() / 1024.0 ** 2))

        # startswith() is equivalent to jsonline[0] == "{" for file lines,
        # but also safe on empty strings (callers may pass any iterable of
        # strings, where "" would make jsonline[0] raise IndexError).
        if jsonline.startswith("{"):
            # Each entity line in the dump array ends with ",\n" - strip it
            # so the line is valid standalone JSON.
            jsonline = jsonline.rstrip(",\r\n")
            data = json.loads(jsonline)
            if data["type"] == "item":
                yield _process_json(data)
35+
36+
37+
def _process_json(data):
    """
    Convert one decoded entity dict from the dump into an Entity.

    @type data: dict
    @rtype: Entity
    """
    title = data["id"]
    # Entities without any statements still produce an (empty) Entity.
    if "claims" not in data:
        return Entity(title, [])
    claims = []
    # The property-id keys of "claims", "references" and "qualifiers" are
    # never used (each snak carries its own property id), so iterate the
    # value lists directly.
    for statements in data["claims"].itervalues():
        for statement in statements:
            references = []
            if "references" in statement:
                for reference in statement["references"]:  # TODO: group reference snaks correctly
                    for snaks in reference["snaks"].itervalues():
                        for snak in snaks:
                            ref = _parse_json_snak(snak)
                            if ref:
                                references.append(ref)
            qualifiers = []
            if "qualifiers" in statement:
                for snaks in statement["qualifiers"].itervalues():
                    for snak in snaks:
                        qualifier = _parse_json_snak(snak)
                        if qualifier:
                            qualifiers.append(qualifier)
            # Statements with an unusable mainsnak (see _parse_json_snak)
            # are dropped entirely.
            claim = _parse_json_snak(statement["mainsnak"])
            if claim:
                claims.append(Claim(claim, qualifiers, references))

    return Entity(title, claims)
64+
65+
66+
def _parse_json_snak(claim_json):
67+
if claim_json["snaktype"] == "value":
68+
datatype = claim_json["datatype"]
69+
datavalue = claim_json["datavalue"]["value"]
70+
if datatype in ("string", "commonsMedia", "url"):
71+
value = datavalue
72+
elif datatype == "wikibase-item":
73+
if datavalue["entity-type"] == "item":
74+
value = "Q" + str(datavalue["numeric-id"])
75+
else:
76+
print "WARNING unknown entitytype: {0}".format(datavalue["entity-type"])
77+
elif datatype == "time":
78+
value = datavalue["time"]
79+
elif datatype == "quantity":
80+
value = datavalue["amount"]
81+
elif datatype == "globe-coordinate":
82+
value = "N{0[latitude]}, E{0[longitude]}".format(datavalue)
83+
elif datatype == "monolingualtext":
84+
value = u"{0[text]} ({0[language]})".format(datavalue)
85+
elif datatype == "bad":
86+
# for example in Q2241
87+
return None
88+
else:
89+
print "WARNING unknown wikidata datatype: %s" % datatype
90+
return None
91+
else: # novalue, somevalue, ...
92+
datatype = "unknown"
93+
value = claim_json["snaktype"]
94+
property_id = int(claim_json["property"][1:])
95+
return Snak(property_id, datatype, value)

propertysuggester/parser/XmlReader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def _parse_json_snak(claim_json):
114114
if datatype == "string":
115115
value = claim_json[3]
116116
elif datatype == "wikibase-entityid":
117+
datatype = "wikibase-item"
117118
if claim_json[3]["entity-type"] == "item":
118119
value = "Q" + str(claim_json[3]["numeric-id"])
119120
else:
3.43 KB
Binary file not shown.

propertysuggester/test/parser/test_abstract_reader.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ def assert_universe(self, result):
1111

1212
self.assertThat(q1.title, Equals("Q1"))
1313
self.assertThat(q1.claims, Contains(Claim(Snak(373, "string", "Universe"), [],
14-
[Snak(143, "wikibase-entityid", "Q328")])))
15-
self.assertThat(q1.claims, Contains(Claim(Snak(31, "wikibase-entityid", "Q223557"))))
16-
self.assertThat(q1.claims, Contains(Claim(Snak(31, "wikibase-entityid", "Q1088088"))))
17-
self.assertThat(q1.claims, Contains(Claim(Snak(361, "wikibase-entityid", "Q3327819"),
18-
[Snak(31, "wikibase-entityid", "Q41719")], [])))
14+
[Snak(143, "wikibase-item", "Q328")])))
15+
self.assertThat(q1.claims, Contains(Claim(Snak(31, "wikibase-item", "Q223557"))))
16+
self.assertThat(q1.claims, Contains(Claim(Snak(31, "wikibase-item", "Q1088088"))))
17+
self.assertThat(q1.claims, Contains(Claim(Snak(361, "wikibase-item", "Q3327819"),
18+
[Snak(31, "wikibase-item", "Q41719")], [])))

propertysuggester/test/parser/test_csv_reader.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,19 @@ def setUp(self):
1616
def test_universe(self):
1717
out = StringIO()
1818
out.writelines(["Q1,claim,373,string,Universe\n",
19-
"Q1,reference,143,wikibase-entityid,Q328\n"
20-
"Q1,claim,31,wikibase-entityid,Q223557\n",
21-
"Q1,claim,31,wikibase-entityid,Q1088088\n",
22-
"Q1,claim,361,wikibase-entityid,Q3327819\n",
23-
"Q1,qualifier,31,wikibase-entityid,Q41719\n"])
19+
"Q1,reference,143,wikibase-item,Q328\n"
20+
"Q1,claim,31,wikibase-item,Q223557\n",
21+
"Q1,claim,31,wikibase-item,Q1088088\n",
22+
"Q1,claim,361,wikibase-item,Q3327819\n",
23+
"Q1,qualifier,31,wikibase-item,Q41719\n"])
2424
out.seek(0)
2525
result = list(CsvReader.read_csv(out))
2626
self.assert_universe(result)
2727

2828
def test_multiple_entities(self):
2929
out = StringIO()
3030
out.writelines(["Q1,claim,373,string,Universe\n",
31-
"Q2,claim,143,wikibase-entityid,Q328\n"])
31+
"Q2,claim,143,wikibase-item,Q328\n"])
3232
out.seek(0)
3333
result = list(CsvReader.read_csv(out))
3434

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import unittest
2+
import gzip
3+
4+
from pkg_resources import resource_filename
5+
from testtools import TestCase
6+
from testtools.matchers import *
7+
8+
from propertysuggester.test.parser.test_abstract_reader import AbstractUniverseTest
9+
from propertysuggester.parser import JsonReader
10+
from propertysuggester.utils.datamodel import Claim, Snak, Entity
11+
12+
13+
class JsonReaderTest(AbstractUniverseTest):
    # Tests for JsonReader against a small packaged sample dump.

    def test_updated_dump(self):
        # Wikidata-Q15511.json.gz is a one-entity gzipped JSON dump fixture
        # shipped next to this test module.
        with gzip.open(resource_filename(__name__, "Wikidata-Q15511.json.gz"), "r") as f:
            result = list(JsonReader.read_json(f))

        self.assertThat(result, HasLength(1))
        q15511 = result[0]
        self.assertThat(q15511.title, Equals("Q15511"))
        # One known statement carrying both qualifiers (time + item) and a
        # reference (stated-in item), to cover all three snak roles.
        self.assertThat(q15511.claims, Contains(Claim(Snak(1082, "quantity", "+25"), [Snak(585, "time", "+00000002001-01-01T00:00:00Z"), Snak(459, "wikibase-item", "Q745221")], [Snak(248, "wikibase-item", "Q17597573")])))

    def test_special_cases(self):
        # An entity dict without a "claims" key must still produce an
        # Entity with an empty claim list.
        data = dict([("id", "Q1"), ("type", "item")])
        self.assertThat(JsonReader._process_json(data), Equals(Entity("Q1", [])))

if __name__ == '__main__':
    unittest.main()
30+

propertysuggester/test/parser/test_xml_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_updated_dump(self):
2323
self.assertThat(len(result), Equals(1))
2424
q9351 = result[0]
2525
self.assertThat(q9351.title, Equals("Q9351"))
26-
self.assertThat(q9351.claims, Contains(Claim(Snak(156, "wikibase-entityid", "Q1647331"))))
26+
self.assertThat(q9351.claims, Contains(Claim(Snak(156, "wikibase-item", "Q1647331"))))
2727
self.assertThat(q9351.claims, Contains(Claim(Snak(1112, "quantity", "+25"))))
2828

2929
def test_special_cases(self):

0 commit comments

Comments
 (0)