#!/usr/bin/env python3
# xml2json.py
#
# Read the database XML files and output a JSON representation of the database
# to stdout, and report unresolvable links to stderr.
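#
# Usage (a sketch; the script locates the mem-*.xml fragments and the
# VERSION file in its own directory, and the output filenames here are
# arbitrary):
#
#   ./xml2json.py > qawHaq.json 2> link-errors.log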
#
# The JSON structure is roughly:
#
# {
#   "format_version" : "1",
#   "version" : "<database_version>",
#   "locales" : {
#     "de" : "Deutsch",
#     "en" : "English",
#     ...
#   },
#   "supported_locales" : [
#     "de",
#     "en",
#     ...
#   ],
#   "qawHaq" : {
#     "<search_name>" : {
#       "_id" : "<id>",
#       "entry_name" : "<entry name>",
#       "part_of_speech" : "<part_of_speech>",
#       "definition" : {
#         "de" : "<definition_de>",
#         "en" : "<definition>",
#         ...
#       },
#       "synonyms" : "<synonyms>",
#       "antonyms" : "<antonyms>",
#       "see_also" : "<see_also>",
#       "notes" : {
#         "de" : "<notes_de>",
#         "en" : "<notes>",
#         ...
#       },
#       "hidden_notes" : "<hidden_notes>",
#       "components" : "<components>",
#       "examples" : {
#         "de" : "<examples_de>",
#         "en" : "<examples>",
#         ...
#       },
#       "search_tags" : {
#         "de" : ["<search_tag_de>", ...],
#         "en" : ["<search_tag>", ...],
#         ...
#       },
#       "source" : "<source>"
#     },
#     ...
#   }
# }
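#
# A concrete entry might look like this (hypothetical values):
#
#   "cha:n" : {
#     "_id" : "1",
#     "entry_name" : "cha",
#     "part_of_speech" : "n",
#     "definition" : { "en" : "torpedo" },
#     "search_tags" : { "en" : ["weapon"] }
#   }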
#
# format_version must be incremented if the format changes in a backwards
# incompatible way (adding new fields ought to be backwards compatible).
#
# version is the version of the database
#
# locales is a map of key/value pairs with locale codes as the key and localized
# locale names as the value
#
# supported_locales is a list of locale codes that are considered complete enough
# to display in the menu by default
#
# search_name is constructed as: entry_name:base_part_of_speech(:homophone_num),
# where entry_name is the entry name, base_part_of_speech is the first field of
# the part of speech (e.g. "n" rather than "n:name"), and homophone_num is the
# homophone number parsed from the part_of_speech field, if present. Entries
# with no homophones do not specify a homophone field.
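#
# For example (hypothetical values), an entry named "cha" with part_of_speech
# "n:1" yields the search_name "cha:n:1", while part_of_speech "n:name"
# (no homophone number) yields "cha:n".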
#
# The search_tags fields (search_tags, search_tags_de, etc.) are treated as
# comma-separated lists and split into arrays of separate search tags.
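#
# For example, a search_tags value of "torpedo, weapon" (hypothetical) is
# split into ["torpedo", "weapon"].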
#
# The remaining values are taken directly from the XML database. Empty values
# are omitted from the JSON representation.
import xml.etree.ElementTree as ET
import json
import sys
import os
import re
import unicodedata
from collections import OrderedDict
# A single entry parsed from the XML tree
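# For example, a "notes_de" column is stored as self.data['notes']['de'], a
# bare "definition" column as self.data['definition']['en'], and an
# "examples_zh_HK" column as self.data['examples']['zh_HK'].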
class EntryNode:
    # Constructor from XML node
    def __init__(self, node):
        self.data = {}
        # Iterate over columns in the entry and store their values
        for child in node:
            if child.tag == 'column':
                name = child.attrib['name']
                namesplit = name.split('_')
                # Normalize Unicode characters into decomposed form
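                # (e.g. the precomposed "é" U+00E9 becomes "e" followed by
                # U+0301 COMBINING ACUTE ACCENT)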
                text = unicodedata.normalize('NFKD', ''.join(child.itertext()))
                if text:
                    # Store localized fields hierarchically
                    if namesplit[0] in [
                        'definition',
                        'notes',
                        'search',  # 'search_tags'
                        'examples',
                    ]:
                        if namesplit[0] == 'search':
                            component = 'search_tags'
                        else:
                            component = namesplit[0]
                        if len(namesplit) > 1:
                            locale = namesplit[-1]
                            if locale == 'tags':  # 'search_tags'
                                locale = 'en'
                        else:
                            locale = 'en'
                        if locale == 'HK':  # 'zh_HK'
                            locale = 'zh_HK'
                        if component not in self.data:
                            self.data[component] = {}
                        # Split search tags into an array
                        if component == 'search_tags':
                            data = re.split(', *', text)
                        else:
                            data = text
                        self.data[component][locale] = data
                    # Non-localized fields are stored at the entry's top level
                    else:
                        self.data[name] = text

    # Normalize the search name from the stored entry name and part of speech
    def searchName(self):
        return normalize(self.data['entry_name'], self.data['part_of_speech'])

# Convert an entry name and part of speech, which may include a homophone
# number and non-homophone tags, into a normalized search name
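#
# For example (hypothetical values): normalize('cha', 'n:1') returns
# 'cha:n:1', normalize('cha', 'n:1h') also returns 'cha:n:1' (the hidden
# marker 'h' is stripped), and normalize('cha', 'v') returns 'cha:v'.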
def normalize(name, pos):
    # Split part of speech into separate fields
    posSplit = pos.split(':')
    pos = posSplit[0]
    # If there is a second field, it contains comma-separated tags
    if len(posSplit) > 1:
        flags = posSplit[1]
    else:
        flags = ''
    homophone = ''
    # Look for a homophone number in the flags. Ignore an 'h' which is used
    # to indicate a hidden homophone number.
    for flag in flags.split(','):
        flag = flag.rstrip('h')
        if flag.isdigit():
            homophone = ':' + flag
            break
    return name + ':' + pos + homophone

# Traverse the database tree and try to identify links that cannot be resolved
# unambiguously to an entry. Report any unresolvable links to stderr.
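#
# Links have the form {name:pos(:flags)}, e.g. {cha:n} (hypothetical); for
# the form {phrase@@component1, component2, ...}, each listed component is
# validated separately. An unresolvable link such as {chaqu:n} (hypothetical)
# is reported on stderr as: no entry for {chaqu:n}.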
def validatelinks(root, node):
    # If this node is a dict or a list, recurse into its children
    if isinstance(node, dict):
        for subnode in node:
            validatelinks(root, node[subnode])
    elif isinstance(node, list):
        for item in node:
            validatelinks(root, item)
    else:
        # Find all text in {curly braces}
        remaining = node
        while remaining.find('{') != -1:
            remaining = remaining[remaining.find('{')+1:]
            tag = remaining[0:remaining.find('}')]
            # For {sentences with components@@sentences, with, components},
            # check the individual components.
            if tag.find('@@') != -1:
                for term in tag.split('@@')[1].split(','):
                    validatelinks(root, '{' + term.strip(' ') + '}')
                continue
            tagsplit = tag.split(':')
            if len(tagsplit) > 1:
                # The second field identifies the text type: don't bother
                # validating url links or src attributions.
                if tagsplit[1] == 'url' or tagsplit[1] == 'src':
                    continue
                # Check the flags in the third field and ignore text tagged
                # with the "nolink" flag.
                if len(tagsplit) > 2 and 'nolink' in tagsplit[2].split(','):
                    continue
            # Normalize the search name and check if an entry exists
            normalized = normalize(tagsplit[0], ':'.join(tagsplit[1:]))
            if normalized not in root:
                hom = ''
                # Check if the failure to resolve was due to an ambiguous
                # homophone
                if normalized + ':1' in root:
                    hom = ' (homophone exists)'
                # A homophone number of 0 explicitly indicates that the
                # link is supposed to lead to all homophones
                elif normalized[-2:] == ':0':
                    if normalized[:-2] + ':1' in root:
                        continue
                sys.stderr.write('no entry for {' + tag + '}' + hom + '.\n')

# Section names of the individual XML fragments that make up the database
memparts = ['header', 'b', 'ch', 'D', 'gh', 'H', 'j', 'l', 'm', 'n', 'ng', 'p',
            'q', 'Q', 'r', 'S', 't', 'tlh', 'v', 'w', 'y', 'a', 'e', 'I', 'o',
            'u', 'suffixes', 'extra', 'examples', 'footer']
filenames = []
concat = ''
sdir = os.path.dirname(os.path.realpath(sys.argv[0]))
for i, part in enumerate(memparts):
    filenames.append(os.path.join(sdir, 'mem-{0:02d}-{1}.xml'.format(i, part)))
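# The generated names are mem-00-header.xml, mem-01-b.xml, ...,
# mem-29-footer.xml.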
# Concatenate the individual files into a single database string
for file in filenames:
    with open(file) as fh:
        concat += fh.read()
# Read the database version from the version file
with open(os.path.join(sdir, 'VERSION')) as ver:
    version = next(ver).strip()
# Parse the database XML tree and store the parsed entries in a dict
xmltree = ET.fromstring(concat)
qawHaq = OrderedDict()
overwritten = 0
for child in xmltree[0]:
    node = EntryNode(child)
    if node.searchName() in qawHaq:
        sys.stderr.write(node.searchName() + ' overwrites an existing entry\n')
        overwritten += 1
    # Every entry should have a definition
    if 'definition' in node.data:
        qawHaq[node.searchName()] = node.data
    else:
        sys.stderr.write('no definition for entry ' + node.searchName() + '\n')
# Now that the database has been parsed, search for unresolvable links
validatelinks(qawHaq, qawHaq)
ret = OrderedDict()
ret['format_version'] = '1'
ret['version'] = version
ret['locales'] = OrderedDict()
ret['locales']['de'] = 'Deutsch'
ret['locales']['en'] = 'English'
ret['locales']['fa'] = 'فارسى'
ret['locales']['ru'] = 'Русский язык'
ret['locales']['sv'] = 'Svenska'
ret['locales']['zh_HK'] = '中文 (香港)'
ret['locales']['pt'] = 'Português'
ret['locales']['fi'] = 'Suomi'
ret['locales']['fr'] = 'Français'
ret['supported_locales'] = [
    'de',
    'en',
    'sv',
]
ret['qawHaq'] = qawHaq
# Dump the database as JSON
print(json.dumps(ret))
if overwritten:
    sys.stderr.write('\n*** yIqImqu\' jay\'! ***\n\n')
    sys.stderr.write(str(overwritten) + ' entries overwritten by duplicates!\n')
    sys.exit(1)