def find_quoted_strings(text: str) -> List[int]:
    """Return the indices of all unescaped quote characters in *text*.

    Both single and double quotes are reported. A quote preceded by a
    backslash is treated as escaped and skipped, as is any other
    backslash-escaped character (so ``\\\\`` consumes both characters and
    does not escape whatever follows).

    :param text: the raw SQL string to scan.
    :return: list of 0-based indices of unescaped ``'`` / ``"`` characters,
        in ascending order.
    """
    quote_idxs: List[int] = []
    idx = 0
    n = len(text)
    while idx < n:
        c = text[idx]
        if c == "\\":
            # Skip the backslash AND the character it escapes. Jumping by 2
            # (instead of setting a flag and falling through) cannot read
            # past the end of the string: the loop condition re-checks idx.
            idx += 2
            continue
        if c == "'" or c == '"':
            quote_idxs.append(idx)
        idx += 1
    return quote_idxs