add annotator

vzhong · vzhong · commit a242be9800f9 · 2017-08-16T16:44:51.000Z
diff --git a/README.md b/README.md
@@ -218,3 +218,12 @@ If everything works correctly, the output should be:
   "lf_accuracy": 0.2334609075997813
 }
 ```
+
+
+## Annotation
+
+In addition to the raw data dump, we also release an optional annotation script that annotates WikiSQL using [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/).
+The `annotate.py` script annotates the query, question, and SQL table, and also builds sequence-to-sequence versions of the input and output for convenient use with Seq2Seq models.
+To use `annotate.py`, you must set up the CoreNLP Python client using [Stanford Stanza](https://github.com/stanfordnlp/stanza).
+Note that the sequence output contains symbols to delineate the boundaries of fields.
+In `lib/query.py` you will also find accompanying functions to reconstruct a query given a sequence output in the annotated format.
diff --git a/annotate.py b/annotate.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+import os
+import records
+import ujson as json
+from stanza.nlp.corenlp import CoreNLPClient
+from tqdm import tqdm
+import copy
+from lib.common import count_lines, detokenize
+from lib.query import Query
+
+
+client = None
+
+
+def annotate(sentence, lower=True):
+    global client
+    if client is None:
+        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
+    words, gloss, after = [], [], []
+    for s in client.annotate(sentence):
+        for t in s:
+            words.append(t.word)
+            gloss.append(t.originalText)
+            after.append(t.after)
+    if lower:
+        words = [w.lower() for w in words]
+    return {
+        'gloss': gloss,
+        'words': words,
+        'after': after,
+        }
+
+
+def annotate_example(example, table):
+    ann = {'table_id': example['table_id']}
+    ann['question'] = annotate(example['question'])
+    ann['table'] = {
+        'header': [annotate(h) for h in table['header']],
+    }
+    ann['query'] = sql = copy.deepcopy(example['sql'])
+    for c in ann['query']['conds']:
+        c[-1] = annotate(str(c[-1]))
+
+    q1 = 'SYMSELECT SYMAGG {} SYMCOL {}'.format(Query.agg_ops[sql['agg']], table['header'][sql['sel']])
+    q2 = ['SYMCOL {} SYMOP {} SYMCOND {}'.format(table['header'][col], Query.cond_ops[op], detokenize(cond)) for col, op, cond in sql['conds']]
+    if q2:
+        q2 = 'SYMWHERE ' + ' SYMAND '.join(q2) + ' SYMEND'
+    else:
+        q2 = 'SYMEND'
+    inp = 'SYMSYMS {syms} SYMAGGOPS {aggops} SYMCONDOPS {condops} SYMTABLE {table} SYMQUESTION {question}'.format(
+        syms=' '.join(['SYM' + s for s in Query.syms]),
+        table=' '.join(['SYMCOL ' + s for s in table['header']]),
+        question=example['question'],
+        aggops=' '.join([s for s in Query.agg_ops]),
+        condops=' '.join([s for s in Query.cond_ops]),
+    )
+    ann['seq_input'] = annotate(inp)
+    out = '{q1} {q2}'.format(q1=q1, q2=q2) if q2 else q1
+    ann['seq_output'] = annotate(out)
+    ann['where_output'] = annotate(q2)
+    assert 'symend' in ann['seq_output']['words']
+    assert 'symend' in ann['where_output']['words']
+    return ann
+
+
+def is_valid_example(e):
+    if not all([h['words'] for h in e['table']['header']]):
+        return False
+    headers = [detokenize(h).lower() for h in e['table']['header']]
+    if len(headers) != len(set(headers)):
+        return False
+    input_vocab = set(e['seq_input']['words'])
+    for w in e['seq_output']['words']:
+        if w not in input_vocab:
+            print('query word "{}" is not in input vocabulary.\n{}'.format(w, e['seq_input']['words']))
+            return False
+    input_vocab = set(e['question']['words'])
+    for col, op, cond in e['query']['conds']:
+        for w in cond['words']:
+            if w not in input_vocab:
+                print('cond word "{}" is not in input vocabulary.\n{}'.format(w, e['question']['words']))
+                return False
+    return True
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--din', default='data', help='data directory')
+    parser.add_argument('--dout', default='annotated', help='output directory')
+    args = parser.parse_args()
+
+    if not os.path.isdir(args.dout):
+        os.makedirs(args.dout)
+
+    for split in ['train', 'dev', 'test']:
+        fsplit = os.path.join(args.din, split) + '.jsonl'
+        ftable = os.path.join(args.din, split) + '.tables.jsonl'
+        fout = os.path.join(args.dout, split) + '.jsonl'
+
+        print('annotating {}'.format(fsplit))
+        with open(fsplit) as fs, open(ftable) as ft, open(fout, 'wt') as fo:
+            print('loading tables')
+            tables = {}
+            for line in tqdm(ft, total=count_lines(ftable)):
+                d = json.loads(line)
+                tables[d['id']] = d
+            print('loading examples')
+            n_written = 0
+            for line in tqdm(fs, total=count_lines(fsplit)):
+                d = json.loads(line)
+                a = annotate_example(d, tables[d['table_id']])
+                if not is_valid_example(a):
+                    raise Exception(str(a))
+
+                gold = Query.from_tokenized_dict(a['query'])
+                reconstruct = Query.from_sequence(a['seq_output'], a['table'], lowercase=True)
+                if gold.lower() != reconstruct.lower():
+                    raise Exception ('Expected:\n{}\nGot:\n{}'.format(gold, reconstruct))
+                fo.write(json.dumps(a) + '\n')
+                n_written += 1
+            print('wrote {} examples'.format(n_written))
diff --git a/lib/query.py b/lib/query.py
@@ -3,14 +3,15 @@
 from copy import deepcopy
 import re
 
+
 re_whitespace = re.compile(r'\s+', flags=re.UNICODE)
 
 
 class Query:
 
     agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG']
     cond_ops = ['=', '>', '<', 'OP']
-    syms = ['SELECT', 'WHERE', 'AND', 'COL', 'TABLE', 'CAPTION', 'PAGE', 'SECTION', 'OP', 'COND', 'QUESTION', 'AGG', 'AGGOPS', 'CONDOPS']
+    syms = ['SELECT', 'WHERE', 'AND', 'COL', 'TABLE', 'CAPTION', 'PAGE', 'SECTION', 'OP', 'COND', 'QUESTION', 'AGG', 'AGGOPS', 'CONDOPS', 'END']
 
     def __init__(self, sel_index, agg_index, conditions=tuple()):
         self.sel_index = sel_index
@@ -68,3 +69,161 @@ def from_generated_dict(cls, d):
             end = len(val['words'])
             conds.append([col, op, detokenize(val)])
         return cls(d['sel'], d['agg'], conds)
+
+    @classmethod
+    def from_sequence(cls, sequence, table, lowercase=True):
+        sequence = deepcopy(sequence)
+        if 'symend' in sequence['words']:
+            end = sequence['words'].index('symend')
+            for k, v in sequence.items():
+                sequence[k] = v[:end]
+        terms = [{'gloss': g, 'word': w, 'after': a} for  g, w, a in zip(sequence['gloss'], sequence['words'], sequence['after'])]
+        headers = [detokenize(h) for h in table['header']]
+
+        # lowercase everything and truncate sequence
+        if lowercase:
+            headers = [h.lower() for h in headers]
+            for i, t in enumerate(terms):
+                for k, v in t.items():
+                    t[k] = v.lower()
+        headers_no_whitespcae = [re.sub(re_whitespace, '', h) for h in headers]
+
+        # get select
+        if 'symselect' != terms.pop(0)['word']:
+            raise Exception('Missing symselect operator')
+
+        # get aggregation
+        if 'symagg' != terms.pop(0)['word']:
+            raise Exception('Missing symagg operator')
+        agg_op = terms.pop(0)['word']
+
+        if agg_op == 'symcol':
+            agg_op = ''
+        else:
+            if 'symcol' != terms.pop(0)['word']:
+                raise Exception('Missing aggregation column')
+        try:
+            agg_op = cls.agg_ops.index(agg_op.upper())
+        except Exception as e:
+            raise Exception('Invalid agg op {}'.format(agg_op))
+        
+        def find_column(name):
+            return headers_no_whitespcae.index(re.sub(re_whitespace, '', name))
+
+        def flatten(tokens):
+            ret = {'words': [], 'after': [], 'gloss': []}
+            for t in tokens:
+                ret['words'].append(t['word'])
+                ret['after'].append(t['after'])
+                ret['gloss'].append(t['gloss'])
+            return ret
+        where_index = [i for i, t in enumerate(terms) if t['word'] == 'symwhere']
+        where_index = where_index[0] if where_index else len(terms)
+        flat = flatten(terms[:where_index])
+        try:
+            agg_col = find_column(detokenize(flat))
+        except Exception as e:
+            raise Exception('Cannot find aggregation column {}'.format(flat['words']))
+        where_terms = terms[where_index+1:]
+
+        # get conditions
+        conditions = []
+        while where_terms:
+            t = where_terms.pop(0)
+            flat = flatten(where_terms)
+            if t['word'] != 'symcol':
+                raise Exception('Missing conditional column {}'.format(flat['words']))
+            try:
+                op_index = flat['words'].index('symop')
+                col_tokens = flatten(where_terms[:op_index])
+            except Exception as e:
+                raise Exception('Missing conditional operator {}'.format(flat['words']))
+            cond_op = where_terms[op_index+1]['word']
+            try:
+                cond_op = cls.cond_ops.index(cond_op.upper())
+            except Exception as e:
+                raise Exception('Invalid cond op {}'.format(cond_op))
+            try:
+                cond_col = find_column(detokenize(col_tokens))
+            except Exception as e:
+                raise Exception('Cannot find conditional column {}'.format(col_tokens['words']))
+            try:
+                val_index = flat['words'].index('symcond')
+            except Exception as e:
+                raise Exception('Cannot find conditional value {}'.format(flat['words']))
+
+            where_terms = where_terms[val_index+1:]
+            flat = flatten(where_terms)
+            val_end_index = flat['words'].index('symand') if 'symand' in flat['words'] else len(where_terms)
+            cond_val = detokenize(flatten(where_terms[:val_end_index]))
+            conditions.append([cond_col, cond_op, cond_val])
+            where_terms = where_terms[val_end_index+1:]
+        q = cls(agg_col, agg_op, conditions)
+        return q
+
+    @classmethod
+    def from_partial_sequence(cls, agg_col, agg_op, sequence, table, lowercase=True):
+        sequence = deepcopy(sequence)
+        if 'symend' in sequence['words']:
+            end = sequence['words'].index('symend')
+            for k, v in sequence.items():
+                sequence[k] = v[:end]
+        terms = [{'gloss': g, 'word': w, 'after': a} for  g, w, a in zip(sequence['gloss'], sequence['words'], sequence['after'])]
+        headers = [detokenize(h) for h in table['header']]
+
+        # lowercase everything and truncate sequence
+        if lowercase:
+            headers = [h.lower() for h in headers]
+            for i, t in enumerate(terms):
+                for k, v in t.items():
+                    t[k] = v.lower()
+        headers_no_whitespcae = [re.sub(re_whitespace, '', h) for h in headers]
+
+        def find_column(name):
+            return headers_no_whitespcae.index(re.sub(re_whitespace, '', name))
+
+        def flatten(tokens):
+            ret = {'words': [], 'after': [], 'gloss': []}
+            for t in tokens:
+                ret['words'].append(t['word'])
+                ret['after'].append(t['after'])
+                ret['gloss'].append(t['gloss'])
+            return ret
+        where_index = [i for i, t in enumerate(terms) if t['word'] == 'symwhere']
+        where_index = where_index[0] if where_index else len(terms)
+        where_terms = terms[where_index+1:]
+
+        # get conditions
+        conditions = []
+        while where_terms:
+            t = where_terms.pop(0)
+            flat = flatten(where_terms)
+            if t['word'] != 'symcol':
+                raise Exception('Missing conditional column {}'.format(flat['words']))
+            try:
+                op_index = flat['words'].index('symop')
+                col_tokens = flatten(where_terms[:op_index])
+            except Exception as e:
+                raise Exception('Missing conditional operator {}'.format(flat['words']))
+            cond_op = where_terms[op_index+1]['word']
+            try:
+                cond_op = cls.cond_ops.index(cond_op.upper())
+            except Exception as e:
+                raise Exception('Invalid cond op {}'.format(cond_op))
+            try:
+                cond_col = find_column(detokenize(col_tokens))
+            except Exception as e:
+                raise Exception('Cannot find conditional column {}'.format(col_tokens['words']))
+            try:
+                val_index = flat['words'].index('symcond')
+            except Exception as e:
+                raise Exception('Cannot find conditional value {}'.format(flat['words']))
+
+            where_terms = where_terms[val_index+1:]
+            flat = flatten(where_terms)
+            val_end_index = flat['words'].index('symand') if 'symand' in flat['words'] else len(where_terms)
+            cond_val = detokenize(flatten(where_terms[:val_end_index]))
+            conditions.append([cond_col, cond_op, cond_val])
+            where_terms = where_terms[val_end_index+1:]
+        q = cls(agg_col, agg_op, conditions)
+        return q