1515 print "ujson not found"
1616 import json as json
1717
18+
def read_json(input_file):
    """
    Lazily parse a line-oriented JSON dump, yielding one entity per item line.

    Only lines that start with "{" are decoded; anything else (for example
    the "[" / "]" lines wrapping the objects, or blank lines) is skipped.
    Decoded objects whose "type" is not "item" are skipped as well.
    A progress message is printed every 3000 lines, using input_file.tell().

    @rtype : collections.Iterable[Entity]
    @type input_file: file or GzipFile or StringIO.StringIO
    """
    count = 0
    for jsonline in input_file:
        count += 1
        if count % 3000 == 0:
            # Single-argument parenthesised form prints the same text
            # whether print is a statement or a function.
            print("processed %.2fMB" % (input_file.tell() / 1024.0 ** 2))

        # startswith() is safe on an empty string, unlike jsonline[0].
        if not jsonline.startswith("{"):
            continue

        # Drop the line terminator and the trailing "," that separates
        # consecutive objects, so the line is a standalone JSON document.
        data = json.loads(jsonline.rstrip(", \r\n"))
        if data["type"] == "item":
            yield _process_json(data)
3536
def _parse_snak_groups(snak_groups):
    """
    Parse every snak in a {property id: [snak, ...]} mapping.

    Returns the parsed values as a flat list; snaks for which
    _parse_json_snak returns a falsy result are left out.
    """
    parsed = []
    for snaks in snak_groups.itervalues():
        for snak in snaks:
            value = _parse_json_snak(snak)
            if value:
                parsed.append(value)
    return parsed


def _process_json(data):
    """
    Build an Entity from one decoded entity dictionary.

    The entity title is data["id"]. When the dictionary has no "claims"
    key the Entity is created with an empty claim list. Otherwise every
    statement of every property becomes a Claim made of its parsed main
    snak, its parsed qualifiers and its parsed references. Statements whose
    main snak does not parse to a truthy value are dropped.
    """
    title = data["id"]
    if "claims" not in data:
        return Entity(title, [])

    claims = []
    for statements in data["claims"].itervalues():
        for statement in statements:
            # TODO: group reference snaks correctly -- for now the snaks of
            # all references of a statement are flattened into one list.
            references = []
            if "references" in statement:
                for reference in statement["references"]:
                    references.extend(_parse_snak_groups(reference["snaks"]))

            qualifiers = []
            if "qualifiers" in statement:
                qualifiers = _parse_snak_groups(statement["qualifiers"])

            claim = _parse_json_snak(statement["mainsnak"])
            if claim:
                claims.append(Claim(claim, qualifiers, references))

    return Entity(title, claims)
6264
65+
6366def _parse_json_snak (claim_json ):
6467 if claim_json ["snaktype" ] == "value" :
6568 datatype = claim_json ["datatype" ]
6669 datavalue = claim_json ["datavalue" ]["value" ]
67- if datatype == "string" :
70+ if datatype in ( "string" , "commonsMedia" , "url" ) :
6871 value = datavalue
6972 elif datatype == "wikibase-item" :
7073 if datavalue ["entity-type" ] == "item" :
@@ -76,9 +79,9 @@ def _parse_json_snak(claim_json):
7679 elif datatype == "quantity" :
7780 value = datavalue ["amount" ]
7881 elif datatype == "globe-coordinate" :
79- value = "N{0}, E{1 }" .format (datavalue [ "latitude" ], datavalue [ "longitude" ] )
80- elif datatype == "commonsMedia " :
81- value = datavalue
82+ value = "N{0[latitude] }, E{0[longitude] }" .format (datavalue )
83+ elif datatype == "monolingualtext " :
84+ value = u"{0[text]} ({0[language]})" . format ( datavalue )
8285 elif datatype == "bad" :
8386 # for example in Q2241
8487 return None
0 commit comments