klingon-assistant-data/generate_db.sh at main · et-gregor/klingon-assistant-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#!/bin/bash

# Get the directory with the original data. Every relative path used later
# (./export_to_anki.py, VERSION, EXTRA, ...) is resolved from here, so abort
# rather than carry on from the wrong directory if the cd fails.
cd -- "$(dirname -- "$0")" || {
    echo "Cannot change to the script directory." >&2
    exit 1
}
SOURCE_DIR=$PWD

# Sanity check that the export to Anki script isn't broken.
# TODO: Check not only that the script succeeds, but that the output is as
# expected.
if ! ./export_to_anki.py --test > /dev/null; then
    echo "Anki export is broken."
    exit 1
fi

# Consume a leading "--noninteractive" flag. When set, the "press any key"
# prompts and the diff viewer further down are skipped.
case "$1" in
    --noninteractive)
        NONINTERACTIVE=true
        shift
        ;;
esac

# Consume an "--xmlonly" flag: only mem.xml is produced and the script stops
# before building the database. Meant to be exclusive with
# "--noninteractive", although that is not enforced here.
case "$1" in
    --xmlonly)
        XMLONLY=true
        shift
        ;;
esac

# Check whether qawHaq.db exists and is at least as new as the source files
# (every mem-*.xml plus VERSION).
ALREADY_UP_TO_DATE=true
if [[ ! -f "$SOURCE_DIR/qawHaq.db" ]]; then
    ALREADY_UP_TO_DATE=
else
    # Only the mem-*.xml part is meant to glob; the directory is quoted so a
    # checkout path containing whitespace is not split into several words.
    for f in "$SOURCE_DIR"/mem-*.xml "$SOURCE_DIR/VERSION"
    do
        if [[ "$f" -nt "$SOURCE_DIR/qawHaq.db" ]]; then
            ALREADY_UP_TO_DATE=
            # One newer input is enough to require a rebuild.
            break
        fi
    done
fi

# Nothing to do when the database is current, unless only the xml was
# requested, in which case it is always regenerated.
if [[ $ALREADY_UP_TO_DATE && ! $XMLONLY ]]
then
    echo "qawHaq.db is up-to-date."
    exit
fi
if [[ $XMLONLY ]]
then
    echo "Generating mem.xml."
else
    echo "Generating qawHaq.db."
fi

# Pick the sed binary: on MacOS (kernel name "Darwin") use GNU sed under the
# name gsed, everywhere else plain sed.
case "$(uname -s)" in
    Darwin)
        SED=gsed
        ;;
    *)
        SED=sed
        ;;
esac

# Copy files into temporary directory, renumber, and concatenate data into one
# xml file.
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/klingon-assistant-data.XXXXXXXX") || {
    echo "Failed to create a temporary directory." >&2
    exit 1
}
cp "$SOURCE_DIR"/mem-*.xml "$TMP_DIR"
cp "$SOURCE_DIR/clear_autotranslated_notes.sh" "$TMP_DIR"
cp "$SOURCE_DIR/renumber.py" "$TMP_DIR"
# The two helper scripts were copied from the source directory, so if this cd
# failed they would run there, against the original data. Abort instead.
cd "$TMP_DIR" || exit 1
./clear_autotranslated_notes.sh
./renumber.py
# The concatenation order is explicit: header first, footer last.
cat \
    mem-00-header.xml \
    mem-01-b.xml \
    mem-02-ch.xml \
    mem-03-D.xml \
    mem-04-gh.xml \
    mem-05-H.xml \
    mem-06-j.xml \
    mem-07-l.xml \
    mem-08-m.xml \
    mem-09-n.xml \
    mem-10-ng.xml \
    mem-11-p.xml \
    mem-12-q.xml \
    mem-13-Q.xml \
    mem-14-r.xml \
    mem-15-S.xml \
    mem-16-t.xml \
    mem-17-tlh.xml \
    mem-18-v.xml \
    mem-19-w.xml \
    mem-20-y.xml \
    mem-21-a.xml \
    mem-22-e.xml \
    mem-23-I.xml \
    mem-24-o.xml \
    mem-25-u.xml \
    mem-26-suffixes.xml \
    mem-27-extra.xml \
    mem-28-examples.xml \
    mem-29-footer.xml \
    > "$TMP_DIR/mem.xml"
# EXTRA is expected to exist in the temporary directory at this point
# (presumably written by renumber.py; confirm against that script).
cp "$TMP_DIR/EXTRA" "$SOURCE_DIR"
cd "$SOURCE_DIR" || exit 1

# Write the ID of the first entry in the "extra" section to the
# KlingonContentDatabase.java file, when that file is present next to this
# repository. The ID is read from the EXTRA file in the current directory and
# substituted for whatever value ID_OF_FIRST_EXTRA_ENTRY currently has.
JAVA_FILE="$SOURCE_DIR/../app/src/main/java/org/tlhInganHol/android/klingonassistant/KlingonContentDatabase.java"
if [[ -f "$JAVA_FILE" ]]; then
    # The path is quoted so a checkout location containing whitespace is
    # passed to sed as a single file operand.
    ${SED} -i -e "s/\(private static final int ID_OF_FIRST_EXTRA_ENTRY = \).*;/\1$(cat EXTRA);/" "$JAVA_FILE"
else
    echo "Info: KlingonContentDatabase.java not updated."
fi

# We only want the xml file for debugging purposes, so stop.
if [[ $XMLONLY ]]
then
    # Keep the temporary directory if the copy fails, so mem.xml is not lost.
    cp "$TMP_DIR/mem.xml" "$SOURCE_DIR" || exit 1
    # mem.xml has been copied out, so the temporary directory is removed
    # instead of being left behind. The :? guard refuses an empty path.
    rm -R -- "${TMP_DIR:?}"
    exit
fi

# Ensure entries are numbered first: an "_id" column with no content (the tag
# closes immediately after it opens) means renumbering has not filled it in.
# Only the presence of a match matters, so grep runs quietly.
if grep -q "_id\"><" "$TMP_DIR/mem.xml"
then
    echo "Missing IDs: run renumber.py."
    echo
    exit 1
fi

# Write database version number: the first [[VERSION]] placeholder on each
# line of mem.xml is replaced by the contents of the VERSION file.
# NOTE(review): the value is interpolated into a sed expression, so it must
# not contain "/" or "&"; confirm against the VERSION file format.
VERSION=$(cat VERSION)
echo "Writing database version $VERSION..."
${SED} -i -e "s/\[\[VERSION\]\]/$VERSION/" "$TMP_DIR/mem.xml"

# Convert from xml to sql instructions, then drop the quotes around the table
# name in the generated INSERT statements.
./xml2sql.pl "$TMP_DIR" > "$TMP_DIR/mem.sql"
${SED} -i -e 's/INSERT INTO "mem"/INSERT INTO mem/g' "$TMP_DIR/mem.sql"

# Track if any warnings are displayed.
HAS_WARNINGS=

#######################################
# Print a titled report and record that a warning occurred, but only when the
# findings text is non-empty.
# Globals:   HAS_WARNINGS (set to "true" when something is reported)
# Arguments: $1 - heading line printed above the findings
#            $2 - findings text (captured output of a check command)
# Outputs:   heading, findings and one blank line on stdout
#######################################
report_warning() {
    local heading=$1
    local findings=$2
    if [[ -n "$findings" ]]
    then
        echo "$heading"
        echo "$findings"
        echo
        HAS_WARNINGS=true
    fi
}

# Print any entries with duplicate columns.
report_warning "Entries with duplicate columns:" \
    "$(grep "ARRAY" "$TMP_DIR/mem.sql")"

# Print any parts of speech accidentally entered into the definition.
report_warning "Part of speech information entered into definition:" \
    "$(grep -B2 "definition\">\(v\|n\|adv\|conj\|ques\|sen\|excl\)[:<]" "$TMP_DIR/mem.xml")"

# Print any empty German definitions.
report_warning "Missing German definitions:" \
    "$(grep -B3 "definition_de\"><" "$TMP_DIR/mem.xml" | grep "entry_name")"

# Print any empty Portuguese definitions.
report_warning "Missing Portuguese definitions:" \
    "$(grep -B8 "definition_pt\"><" "$TMP_DIR/mem.xml" | grep "entry_name")"

# Print any empty Finnish definitions.
report_warning "Missing Finnish definitions:" \
    "$(grep -B8 "definition_fi\"><" "$TMP_DIR/mem.xml" | grep "entry_name")"

# Print any untranslated entries.
report_warning "Missing translations:" \
    "$(grep ">\(AUTO\)\?TRANSLATE" "$TMP_DIR/mem.xml")"

# Print any mistyped colons.
report_warning "Mistyped colon:" \
    "$(grep ";[nv]" "$TMP_DIR/mem.xml")"

# Print any field beginning in a space, or ending in a space or comma (which is on one line).
report_warning "Misplaced space or comma:" \
    "$(grep -n "> \|>.*  \|[^ >]\+  .*<\|>.*[ ,]<" "$TMP_DIR/mem.xml")"

# Catch some multi-line cases missed by the above.
report_warning "Misplaced space:" \
    "$(grep -n "> \| <\/column>" "$TMP_DIR/mem.xml")"

# Print any lines with trailing whitespace.
report_warning "End-of-line whitespace:" \
    "$(grep -n "[[:space:]]$" "$TMP_DIR/mem.xml")"

# Print any junk that was accidentally added to the XML file at the beginning of a line.
report_warning "Junk at beginning of line:" \
    "$(grep "^\s*[^ ]\+\s*<\(table\|column\)" "$TMP_DIR/mem.xml")"

# Print any full-line examples which were accidentally indented.
report_warning "Full-line examples which are badly indented:" \
    "$(grep "^\s\+▶" "$TMP_DIR/mem.xml")"

# Print badly indented lines.
report_warning "Badly indented lines:" \
    "$(grep -A2 "^\(\s\{0,3\}\|\s\{5,7\}\|\s\{9,12\}\)<table" "$TMP_DIR/mem.xml")"

# Print more badly indented lines.
report_warning "Badly indented lines:" \
    "$(grep -A2 "^\(\s\{0,5\}\|\s\{7,9\}\|\s\{11,13\}\)<column" "$TMP_DIR/mem.xml")"

# Print any broken references. The stdout of xml2json.py is discarded; its
# stderr is passed through sort|uniq, and that de-duplicated text is what gets
# captured and reported.
report_warning "Broken references:" \
    "$(./xml2json.py 2> >(sort|uniq) > /dev/null)"

# Print any sources which are not empty but don't begin with "[".
report_warning "Missing source index:" \
    "$(grep "source\">[^\[<]" "$TMP_DIR/mem.xml")"

# Print any sources missing its type.
report_warning "Missing source type:" \
    "$(grep "source\">.*{[^:]*}" "$TMP_DIR/mem.xml")"

# Print any new entries containing {ngh} or {ngH}. The "xifan hol" expansion
# logic in the Android app needs to be updated if any such entries are added.
report_warning "Changed entries with {ngh} or {ngH}:" \
    "$(grep "entry_name\">.*ng[hH]" "$TMP_DIR/mem.xml" | diff - expected_ngh.txt)"

# Print any new 2-letter verbs. The parsing logic in the Android app needs to
# be updated if any such verbs are added.
report_warning "Changed two-letter verbs:" \
    "$(grep -B1 "part_of_speech\">v" "$TMP_DIR/mem.xml" | grep "entry_name\">..<" | diff - expected_two_letter_verbs.txt)"

# Exit with error if any warnings were displayed.
if [[ $HAS_WARNINGS ]]
then
    echo "Warnings were found. Please fix the issues above."
    exit 1
fi

# Unless running non-interactively, wait for a single keypress so that any
# error output above can be read before the database is touched.
[[ $NONINTERACTIVE ]] || {
    read -n1 -r -p "Press any key to continue..."
    echo
}

# Create db binary.

#######################################
# Dump a SQLite database as SQL text and normalise the dump so that it can be
# diffed against the generated mem.sql.
# Globals:   SED (read) - the sed binary to invoke
# Arguments: $1 - path of the database file to dump
#            $2 - path of the SQL file to write
# Outputs:   writes (and then edits in place) the file named by $2
#######################################
dump_normalized_sql() {
    local db_file=$1
    local sql_file=$2
    sqlite3 "$db_file" .dump > "$sql_file"
    # The first expression unquotes the table name. The remaining three are
    # necessary after sqlite3 v3.19, which wraps text containing newlines in
    # replace(...,'\n',char(10)); they strip that wrapper and turn the
    # literal \n sequences back into real newlines.
    # See: https://stackoverflow.com/questions/44989176/sqlite3-dump-inserts-replace-function-in-dump-change-from-3-18-to-3-19
    ${SED} -i \
        -e 's/INSERT INTO "mem"/INSERT INTO mem/g' \
        -e "s/replace(//g" \
        -e "s/,'\\\\n',char(10))//g" \
        -e "s/\\\\n/\n/g" \
        "$sql_file"
}

if [[ -f "$SOURCE_DIR/qawHaq.db" ]]
then
    if [[ ! $NONINTERACTIVE ]]
    then
        # If the db already exists, show a diff.
        dump_normalized_sql "$SOURCE_DIR/qawHaq.db" "$TMP_DIR/old-mem.sql"
        # EDITOR is left unquoted on purpose so that a value carrying
        # arguments still splits into separate words.
        ${EDITOR:-vim} -d "$TMP_DIR/old-mem.sql" "$TMP_DIR/mem.sql"
        read -n1 -r -p "Press any key to generate new db..."
        echo
    fi
    # Move the previous database in with the other temporary files so the
    # new one is created from scratch.
    mv "$SOURCE_DIR/qawHaq.db" "$TMP_DIR/qawHaq.db~"
fi
sqlite3 "$SOURCE_DIR/qawHaq.db" < "$TMP_DIR/mem.sql"

# Sanity check: dumping the freshly built database must reproduce the SQL it
# was built from.
dump_normalized_sql "$SOURCE_DIR/qawHaq.db" "$TMP_DIR/sanity.sql"
IN_OUT_DIFF=$(diff "$TMP_DIR/mem.sql" "$TMP_DIR/sanity.sql")
if [[ -n "$IN_OUT_DIFF" ]]
then
    echo "Sanity check failed, entries possibly missing or out of order:"
    echo "$IN_OUT_DIFF"
    echo
    # The temporary directory is deliberately kept for inspection.
    echo "Temporary files: $TMP_DIR"
    echo
    exit 1
fi

# Pause (in case of error) so the temporary files can still be inspected
# before they are removed; skipped in non-interactive mode.
if [[ ! $NONINTERACTIVE ]]
then
    read -n1 -r -p "Press any key to delete temporary files..."
    echo
fi

# Clean up temporary files. The path is quoted so it stays a single operand,
# "--" stops option parsing, and :? aborts instead of running rm with an
# empty path.
rm -R -- "${TMP_DIR:?}"