|
| 1 | +""" |
| 2 | +process_json returns a generator that yields Entities. |
| 3 | +
|
| 4 | +usage: |
| 5 | +with open("file.json", "r") as f: |
| 6 | + for entity in process_json(f): |
| 7 | + do_things() |
| 8 | +
|
| 9 | +""" |
| 10 | +from propertysuggester.utils.datamodel import Claim, Entity, Snak |
| 11 | + |
| 12 | +try: |
| 13 | + import ujson as json |
| 14 | +except ImportError: |
| 15 | + print "ujson not found" |
| 16 | + import json as json |
| 17 | + |
def _parse_snak_groups(snak_groups):
    """Parse every snak in a mapping of property id -> list of snak dicts.

    Returns a flat list of the results of _parse_json_snak, leaving out the
    snaks for which it returned a falsy value (e.g. None).
    """
    parsed = []
    for snaks in snak_groups.values():
        for snak in snaks:
            result = _parse_json_snak(snak)
            if result:
                parsed.append(result)
    return parsed


def process_json(input_file):
    """Yield an Entity for every "item" record found in input_file.

    input_file is iterated line by line; each line is expected to hold one
    JSON object, optionally followed by a comma. Lines that do not decode,
    that do not decode to a JSON object, or whose "type" is not "item" are
    skipped. For the remaining records an Entity(id, claims) is yielded,
    where claims holds one Claim(mainsnak, qualifiers, references) per
    statement whose main snak could be parsed.

    Every 3000 lines the current input_file.tell() position is printed as a
    progress indicator, so input_file must support tell() once it is that
    long.
    """
    for count, jsonline in enumerate(input_file, 1):
        if count % 3000 == 0:
            print("processed %.2fMB" % (input_file.tell() / 1024.0 ** 2))

        # Drop the line terminator and an optional trailing comma. Blindly
        # cutting the last two characters lost the closing brace of a record
        # that is not followed by a comma (or mangled "\r\n" terminated
        # lines), which silently discarded that record.
        jsonline = jsonline.rstrip().rstrip(",")
        try:
            data = json.loads(jsonline)
        except ValueError:
            # Not a JSON document on its own, e.g. a bare "[" or "]" line.
            continue

        if not isinstance(data, dict) or data.get("type") != "item":
            continue

        title = data["id"]
        claims = []
        # "or {}" also covers an empty claims container that is not a
        # mapping (presumably possible in some dumps - TODO confirm).
        for statements in (data.get("claims") or {}).values():
            for statement in statements:
                # Only the first reference block is read, as before; an
                # empty reference list no longer raises IndexError.
                reference_blocks = statement.get("references")
                if reference_blocks:
                    references = _parse_snak_groups(reference_blocks[0]["snaks"])
                else:
                    references = []

                qualifiers = _parse_snak_groups(statement.get("qualifiers") or {})

                mainsnak = _parse_json_snak(statement["mainsnak"])
                if mainsnak:
                    claims.append(Claim(mainsnak, qualifiers, references))

        yield Entity(title, claims)
| 56 | + |
| 57 | + |
def _parse_json_snak(claim_json):
    """Build a Snak from one decoded JSON snak dict.

    For snaks of snaktype "value" the stored value depends on the datatype:
      * "string":           the raw value
      * "wikibase-item":    "Q" followed by the numeric id
      * "time":             the "time" field
      * "quantity":         the "amount" field
      * "globe-coordinate": "N<latitude>, E<longitude>"
      * "bad":              no Snak is built, None is returned
      * anything else:      the placeholder "irrelevant"
    For every other snaktype (novalue, somevalue, ...) the datatype is
    "unknown" and the snaktype itself is used as the value.

    Returns a Snak(property_id, datatype, value), where property_id is the
    "property" field without its first character, or None when the snak
    cannot be represented.
    """
    snaktype = claim_json["snaktype"]
    if snaktype == "value":
        datatype = claim_json["datatype"]
        datavalue = claim_json["datavalue"]["value"]
        if datatype == "string":
            value = datavalue
        elif datatype == "wikibase-item":
            if datavalue["entity-type"] != "item":
                print("WARNING unknown entitytype: {0}".format(datavalue["entity-type"]))
                # Previously execution fell through with `value` unassigned
                # and crashed with UnboundLocalError below; skip the snak
                # instead, like the "bad" datatype.
                return None
            value = "Q" + str(datavalue["numeric-id"])
        elif datatype == "time":
            value = datavalue["time"]
        elif datatype == "quantity":
            value = datavalue["amount"]
        elif datatype == "globe-coordinate":
            value = "N{0}, E{1}".format(datavalue["latitude"], datavalue["longitude"])
        elif datatype == "bad":
            # for example in Q2241
            return None
        else:
            # Unhandled datatype: keep the snak, the concrete value is not
            # used.
            value = "irrelevant"
    else:  # novalue, somevalue, ...
        datatype = "unknown"
        value = snaktype
    property_id = claim_json["property"][1:]
    return Snak(property_id, datatype, value)
0 commit comments