|
"""
read_json returns a generator that yields Entities.

usage:
with open("file.json", "r") as f:
    for entity in read_json(f):
        do_things()

"""
from propertysuggester.utils.datamodel import Claim, Entity, Snak

# Prefer ujson (a much faster drop-in replacement); fall back to the
# standard-library json module when it is not installed.
try:
    import ujson as json
except ImportError:
    print("ujson not found")
    import json
| 17 | + |
| 18 | + |
def read_json(input_file):
    """
    Read a Wikidata JSON dump line by line and yield parsed entities.

    Only entities with type "item" are yielded; all other lines
    (array brackets, non-item entities) are skipped.

    @type input_file: file or GzipFile or StringIO.StringIO
    @rtype : collections.Iterable[Entity]
    """
    count = 0
    for jsonline in input_file:
        count += 1
        # Progress report every 3000 lines; tell() is the byte offset read so far.
        if count % 3000 == 0:
            print("processed %.2fMB" % (input_file.tell() / 1024.0 ** 2))

        # The dump is one big JSON array with one entity object per line:
        # skip the surrounding "[" / "]" lines, parse only object lines.
        # startswith() is safe even on an empty string, unlike jsonline[0].
        if jsonline.startswith("{"):
            # Entities in the array are separated by trailing commas.
            jsonline = jsonline.rstrip(",\r\n")
            data = json.loads(jsonline)
            if data["type"] == "item":
                yield _process_json(data)
| 35 | + |
| 36 | + |
def _process_json(data):
    """
    Convert one decoded entity dict from the dump into an Entity.

    Claims whose mainsnak cannot be parsed are dropped; qualifier and
    reference snaks that cannot be parsed are silently skipped.

    @type data: dict
    @rtype: Entity
    """
    title = data["id"]
    if "claims" not in data:
        # Entity without any statements.
        return Entity(title, [])
    claims = []
    # The property-id keys are redundant (each snak carries its property),
    # so we only iterate the statement lists.
    for statements in data["claims"].values():
        for statement in statements:
            references = []
            if "references" in statement:
                for reference in statement["references"]:  # TODO: group reference snaks correctly
                    for snaks in reference["snaks"].values():
                        for snak in snaks:
                            ref = _parse_json_snak(snak)
                            if ref:
                                references.append(ref)
            qualifiers = []
            if "qualifiers" in statement:
                for snaks in statement["qualifiers"].values():
                    for snak in snaks:
                        qualifier = _parse_json_snak(snak)
                        if qualifier:
                            qualifiers.append(qualifier)
            claim = _parse_json_snak(statement["mainsnak"])
            if claim:
                claims.append(Claim(claim, qualifiers, references))

    return Entity(title, claims)
| 64 | + |
| 65 | + |
| 66 | +def _parse_json_snak(claim_json): |
| 67 | + if claim_json["snaktype"] == "value": |
| 68 | + datatype = claim_json["datatype"] |
| 69 | + datavalue = claim_json["datavalue"]["value"] |
| 70 | + if datatype in ("string", "commonsMedia", "url"): |
| 71 | + value = datavalue |
| 72 | + elif datatype == "wikibase-item": |
| 73 | + if datavalue["entity-type"] == "item": |
| 74 | + value = "Q" + str(datavalue["numeric-id"]) |
| 75 | + else: |
| 76 | + print "WARNING unknown entitytype: {0}".format(datavalue["entity-type"]) |
| 77 | + elif datatype == "time": |
| 78 | + value = datavalue["time"] |
| 79 | + elif datatype == "quantity": |
| 80 | + value = datavalue["amount"] |
| 81 | + elif datatype == "globe-coordinate": |
| 82 | + value = "N{0[latitude]}, E{0[longitude]}".format(datavalue) |
| 83 | + elif datatype == "monolingualtext": |
| 84 | + value = u"{0[text]} ({0[language]})".format(datavalue) |
| 85 | + elif datatype == "bad": |
| 86 | + # for example in Q2241 |
| 87 | + return None |
| 88 | + else: |
| 89 | + print "WARNING unknown wikidata datatype: %s" % datatype |
| 90 | + return None |
| 91 | + else: # novalue, somevalue, ... |
| 92 | + datatype = "unknown" |
| 93 | + value = claim_json["snaktype"] |
| 94 | + property_id = int(claim_json["property"][1:]) |
| 95 | + return Snak(property_id, datatype, value) |
0 commit comments