def find_quoted_strings(text: str) -> List[int]:
    """Return the indices of all unescaped quote characters in *text*.

    Both single and double quotes are reported. A quote preceded by a
    backslash is treated as escaped and skipped, as is any other
    backslash-escaped character (so ``\\\\`` consumes both characters and
    does not escape whatever follows).

    :param text: the raw SQL string to scan.
    :return: list of 0-based indices of unescaped ``'`` / ``"`` characters,
        in ascending order.
    """
    quote_idxs: List[int] = []
    idx = 0
    n = len(text)
    while idx < n:
        c = text[idx]
        if c == "\\":
            # Skip the backslash AND the character it escapes. Jumping by 2
            # (instead of setting a flag and falling through) cannot read
            # past the end of the string: the loop condition re-checks idx.
            idx += 2
            continue
        if c == "'" or c == '"':
            quote_idxs.append(idx)
        idx += 1
    return quote_idxs