|
| 1 | +import re |
| 2 | + |
| 3 | + |
def test_bug():
    """Verify the schema-prefix regex consumes the quotes around "public".

    The pattern accepts the schema name either quoted or as a bare word
    delimited by word boundaries. The concern under test is whether the
    bare-word alternative can match the text *inside* a quoted identifier
    and leave the opening quote out of the match.

    Prints the matched text and start offset for manual inspection and
    asserts the expected results so a regression fails loudly.

    Raises:
        AssertionError: if the pattern fails to match, or matches anything
            other than the fully quoted ``"schema"."table"`` prefix.
    """
    # Alternatives for the schema: quoted "public" first, bare public second.
    # Group 1 captures a quoted table name, group 2 an unquoted one.
    pattern_v110 = r'(?i)(?:"public"|\bpublic\b)\s*\.\s*(?:"(\w+)"|(\w+))'

    # Why the bare branch cannot win inside quotes:
    #   * A word boundary does exist between the opening quote and `p`, and
    #     between `c` and the closing quote, so `\bpublic\b` on its own
    #     matches the inner word.
    #   * The pattern then requires optional whitespace followed by a dot,
    #     but the next character is the closing quote, so that attempt fails.
    #   * re.search also tries positions left to right, so the opening quote
    #     (one position earlier) is tried first and the quoted branch
    #     succeeds there.
    sql = 'SELECT * FROM "public"."workflow"'
    match = re.search(pattern_v110, sql)
    assert match is not None, f"pattern did not match {sql!r}"
    print(f"Match: {match.group(0)}")
    print(f"Start: {match.start()}")

    # 'SELECT * FROM ' is 14 characters long, so offset 14 is the opening
    # quote of "public" -- the quotes are part of the match.
    assert match.group(0) == '"public"."workflow"'
    assert match.start() == sql.index('"') == 14
    assert match.group(1) == "workflow"
    assert match.group(2) is None

    # A three-part reference (schema.table.column) appearing before the FROM
    # clause: the first hit must still be the quoted schema.table prefix.
    sql2 = 'SELECT "public"."user"."id" FROM "public"."user"'
    match2 = re.search(pattern_v110, sql2)
    assert match2 is not None, f"pattern did not match {sql2!r}"
    print(f"Match2: {match2.group(0)}")

    assert match2.group(0) == '"public"."user"'
    assert match2.start() == sql2.index('"')
    assert match2.group(1) == "user"


# Allow running the check directly as a script, outside a test runner.
if __name__ == "__main__":
    test_bug()
0 commit comments