Skip to content

Commit e3f3dbc

Browse files
committed
adapt analysis to Json-Dumps
1 parent 929316c commit e3f3dbc

File tree

3 files changed

+99
-6
lines changed

3 files changed

+99
-6
lines changed

dumpconverter.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,16 @@
22
import sys
33
import time
44

5-
from propertysuggester.parser import XmlReader, CsvWriter
5+
from propertysuggester.parser import JsonReader, CsvWriter
66
from propertysuggester.utils.CompressedFileType import CompressedFileType
77

88
if __name__ == "__main__":
9-
parser = argparse.ArgumentParser(description="this program converts wikidata XML dumps to CSV data.")
10-
parser.add_argument("input", help="The XML input file (a wikidata dump)", type=CompressedFileType('r'))
9+
parser = argparse.ArgumentParser(description="this program converts wikidata JSON dumps to CSV data.")
10+
parser.add_argument("input", help="The JSON input file (a wikidata dump)", type=CompressedFileType('r'))
1111
parser.add_argument("output", help="The CSV output file (default=sys.stdout)", default=sys.stdout, nargs='?',
1212
type=CompressedFileType('wb'))
13-
parser.add_argument("-p", "--processes", help="Number of processors to use (default 4)", type=int, default=4)
13+
#parser.add_argument("-p", "--processes", help="Number of processors to use (default 4)", type=int, default=4)
1414
args = parser.parse_args()
15-
1615
start = time.time()
17-
CsvWriter.write_csv(XmlReader.read_xml(args.input, args.processes), args.output)
16+
CsvWriter.write_csv(newJsonReader.process_json(args.input), args.output)
1817
print "total time: %.2fs" % (time.time() - start)
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
process_json returns a generator that yields Entities)
3+
4+
usage:
5+
with open("file.csv", "r") as f:
6+
for entity in process_json(f):
7+
do_things()
8+
9+
"""
10+
from propertysuggester.utils.datamodel import Claim, Entity, Snak
11+
12+
try:
13+
import ujson as json
14+
except ImportError:
15+
print "ujson not found"
16+
import json as json
17+
18+
def process_json(input_file):
19+
count = 0
20+
for jsonline in input_file:
21+
count += 1
22+
if count % 3000 == 0:
23+
print "processed %.2fMB" % (input_file.tell() / 1024.0 ** 2)
24+
jsonline = jsonline[:-2]
25+
try:
26+
data = json.loads(jsonline)
27+
except ValueError:
28+
continue
29+
if data["type"] == "item":
30+
title = data["id"]
31+
if not "claims" in data:
32+
yield Entity(title, [])
33+
continue
34+
claims = []
35+
for prop, statements in data["claims"].iteritems():
36+
for statement in statements:
37+
references = []
38+
if "references" in statement:
39+
for prop, snaks in statement["references"][0]["snaks"].iteritems():
40+
for snak in snaks:
41+
ref = _parse_json_snak(snak)
42+
if ref:
43+
references.append(ref)
44+
qualifiers = []
45+
if "qualifiers" in statement:
46+
for prop, snaks in statement["qualifiers"].iteritems():
47+
for snak in snaks:
48+
qualifier = _parse_json_snak(snak)
49+
if qualifier:
50+
qualifiers.append(qualifier)
51+
claim = _parse_json_snak(statement["mainsnak"])
52+
if claim:
53+
claims.append(Claim(claim, qualifiers, references))
54+
55+
yield Entity(title, claims)
56+
57+
58+
def _parse_json_snak(claim_json):
59+
if claim_json["snaktype"] == "value":
60+
datatype = claim_json["datatype"]
61+
datavalue = claim_json["datavalue"]["value"]
62+
if datatype == "string":
63+
value = datavalue
64+
elif datatype == "wikibase-item":
65+
if datavalue["entity-type"] == "item":
66+
value = "Q" + str(datavalue["numeric-id"])
67+
else:
68+
print "WARNING unknown entitytype: {0}".format(datavalue["entity-type"])
69+
elif datatype == "time":
70+
value = datavalue["time"]
71+
elif datatype == "quantity":
72+
value = datavalue["amount"]
73+
elif datatype == "globe-coordinate":
74+
value = "N{0}, E{1}".format(datavalue["latitude"], datavalue["longitude"])
75+
elif datatype == "bad":
76+
# for example in Q2241
77+
return None
78+
else:
79+
#print "WARNING unknown wikidata datatype: %s" % datatype
80+
value = "irrelevant"
81+
else: # novalue, somevalue, ...
82+
datatype = "unknown"
83+
value = claim_json["snaktype"]
84+
property_id = claim_json["property"][1:]
85+
return Snak(property_id, datatype, value)

readme.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,14 @@ nosetests
3030

3131
## Release Notes
3232

33+
### 1.2
34+
* Consider classifying Properties
35+
* use Json dumps for analysis
36+
37+
### 1.1
38+
* Generate association rules for qualifiers and references
39+
* Improve ranking to avoid suggestions of human properties
40+
* remove very unlikely rules (<1%)
41+
3342
### 1.0
3443
* Converts a wikidata dump to a CSV file with association rules between properties

0 commit comments

Comments
 (0)