1515 print "ujson not found"
1616 import json as json
1717
18+
def read_json(input_file):
    """
    Lazily yield one entity for every record of type "item" found in a
    JSON dump that stores one JSON object per line.

    Only lines starting with "{" are decoded; everything else (for example
    the "[" and "]" lines that wrap the objects in a JSON array, or empty
    lines) is skipped.  A progress message is printed every 3000 lines.

    Lines that start with "{" but do not contain valid JSON are not
    swallowed: the decoder's error propagates to the caller.

    @rtype : collections.Iterable[Entity]
    @type input_file: file or GzipFile or StringIO.StringIO
    """
    for count, jsonline in enumerate(input_file, 1):
        if count % 3000 == 0:
            # Single parenthesised argument: prints the same text whether
            # `print` is a statement or a function.
            print("processed %.2fMB" % (input_file.tell() / 1024.0 ** 2))

        # startswith() rather than jsonline[0] so that an empty string is
        # skipped instead of raising IndexError.
        if not jsonline.startswith("{"):
            continue

        # Drop the line ending and the trailing "," that separates the
        # objects inside the surrounding JSON array.
        data = json.loads(jsonline.rstrip(", \r\n"))
        if data["type"] == "item":
            yield _process_json(data)
35+
3136
def _collect_snaks(snaks_by_property):
    """
    Parse every snak of a mapping whose values are lists of snaks and
    return the parsed results as one flat list.

    The mapping keys are not needed and are ignored.  Snaks for which
    _parse_json_snak returns a falsy value are left out.

    @type snaks_by_property: dict
    @rtype: list
    """
    parsed = []
    for snaks in snaks_by_property.itervalues():
        for snak in snaks:
            value = _parse_json_snak(snak)
            if value:
                parsed.append(value)
    return parsed


def _process_json(data):
    """
    Build an Entity from one decoded JSON record.

    The entity title is data["id"].  Every statement under data["claims"]
    becomes a Claim made of its parsed main snak, its parsed qualifiers and
    its parsed references; statements whose main snak parses to a falsy
    value are dropped.  A record without a "claims" key yields an Entity
    with an empty claim list.

    @type data: dict
    @rtype: Entity
    """
    title = data["id"]
    if "claims" not in data:
        return Entity(title, [])

    claims = []
    for statements in data["claims"].itervalues():
        for statement in statements:
            # TODO: group reference snaks correctly
            # (the snaks of all references are currently flattened into one list)
            references = []
            for reference in statement.get("references", ()):
                references.extend(_collect_snaks(reference["snaks"]))

            qualifiers = _collect_snaks(statement.get("qualifiers", {}))

            # The main snak is parsed last, after references and qualifiers,
            # to keep the order of the _parse_json_snak calls unchanged.
            claim = _parse_json_snak(statement["mainsnak"])
            if claim:
                claims.append(Claim(claim, qualifiers, references))

    return Entity(title, claims)
5864
65+
5966def _parse_json_snak (claim_json ):
6067 if claim_json ["snaktype" ] == "value" :
6168 datatype = claim_json ["datatype" ]
6269 datavalue = claim_json ["datavalue" ]["value" ]
63- if datatype == "string" :
70+ if datatype in ( "string" , "commonsMedia" , "url" ) :
6471 value = datavalue
6572 elif datatype == "wikibase-item" :
6673 if datavalue ["entity-type" ] == "item" :
@@ -72,13 +79,15 @@ def _parse_json_snak(claim_json):
7279 elif datatype == "quantity" :
7380 value = datavalue ["amount" ]
7481 elif datatype == "globe-coordinate" :
75- value = "N{0}, E{1}" .format (datavalue ["latitude" ], datavalue ["longitude" ])
82+ value = "N{0[latitude]}, E{0[longitude]}" .format (datavalue )
83+ elif datatype == "monolingualtext" :
84+ value = u"{0[text]} ({0[language]})" .format (datavalue )
7685 elif datatype == "bad" :
7786 # for example in Q2241
7887 return None
7988 else :
80- # print "WARNING unknown wikidata datatype: %s" % datatype
81- value = "irrelevant"
89+ print "WARNING unknown wikidata datatype: %s" % datatype
90+ return None
8291 else : # novalue, somevalue, ...
8392 datatype = "unknown"
8493 value = claim_json ["snaktype" ]
0 commit comments