Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 71 additions & 5 deletions compiler/syntax/src/res_scanner.ml
Original file line number Diff line number Diff line change
Expand Up @@ -580,9 +580,53 @@ let scan_regex scanner =
bring_buf_up_to_date ~start_offset:last_char_offset;
Buffer.contents buf)
in
let rec scan () =
(* Look ahead from a given absolute offset to see if a valid class closer
exists on the same line.
Semantics:
- Applies BOS rules: an initial '^' does not count as content; the
very first ']' after '[' or after '[^' is treated as literal.
- Skips escaped characters (\\.) while scanning.
- Returns true only if a subsequent unescaped ']' (after some content)
is found before a line break or EOF. *)
let has_valid_class_closer_ahead ~from_offset =
let src = scanner.src in
let len = String.length src in
let i = ref (from_offset + 1) in
(* start scanning after current '[' *)
let bos = ref true in
let rec loop () =
if !i >= len then false
else
match String.unsafe_get src !i with
Copy link

Copilot AI Aug 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using String.unsafe_get can lead to bounds checking violations. Consider using String.get instead, or ensure proper bounds checking is performed before this call.

Suggested change
match String.unsafe_get src !i with
match String.get src !i with

Copilot uses AI. Check for mistakes.

| '\n' | '\r' -> false
| '\\' ->
Copy link

Copilot AI Aug 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The escape handling logic increments i by 2 without checking if i + 1 is within bounds after the increment. Consider adding bounds checking for the second character of the escape sequence.

Copilot uses AI. Check for mistakes.

if !i + 1 < len then (
i := !i + 2;
loop ())
else false
| '^' when !bos ->
incr i;
loop ()
| ']' when !bos ->
(* Leading ']' is literal content; after that, we're no longer at BOS. *)
bos := false;
incr i;
loop ()
| ']' -> true
| _ ->
bos := false;
incr i;
loop ()
in
loop ()
in
Copy link

Copilot AI Aug 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Starting the scan at from_offset + 1 assumes the character at from_offset is '[', but this is not validated. If from_offset doesn't point to '[', the parsing logic may be incorrect.

Suggested change
in
(* Validate that from_offset points to '[' *)
if from_offset < 0 || from_offset >= len || String.unsafe_get src from_offset <> '[' then
false
else
let i = ref (from_offset + 1) in
(* start scanning after current '[' *)
let bos = ref true in
let rec loop () =
if !i >= len then false
else
match String.unsafe_get src !i with
| '\n' | '\r' -> false
| '\\' ->
if !i + 1 < len then (
i := !i + 2;
loop ())
else false
| '^' when !bos ->
incr i;
loop ()
| ']' when !bos ->
(* Leading ']' is literal content; after that, we're no longer at BOS. *)
bos := false;
incr i;
loop ()
| ']' -> true
| _ ->
bos := false;
incr i;
loop ()
in
loop ()

Copilot uses AI. Check for mistakes.


(* Scan until closing '/' that is not inside a character class. Only enter
character-class mode when a valid ']' is present ahead (same line).
Track beginning-of-class to allow a leading ']' (or leading '^' then ']'). *)
let rec scan ~in_class ~class_at_bos =
match scanner.ch with
| '/' ->
| '/' when not in_class ->
let last_char_offset = scanner.offset in
next scanner;
let pattern = result ~first_char_offset ~last_char_offset in
Expand All @@ -606,12 +650,34 @@ let scan_regex scanner =
| '\\' ->
next scanner;
next scanner;
scan ()
(* Escapes count as content when inside a class; clear BOS. *)
scan ~in_class ~class_at_bos:(if in_class then false else class_at_bos)
| '[' when not in_class ->
(* Only enter a character class if a closing ']' exists ahead on the
same line. Otherwise treat '[' as a normal char. *)
if has_valid_class_closer_ahead ~from_offset:scanner.offset then (
next scanner;
scan ~in_class:true ~class_at_bos:true)
else (
next scanner;
scan ~in_class ~class_at_bos)
| '^' when in_class && class_at_bos ->
(* Leading caret does not count as content. *)
next scanner;
scan ~in_class ~class_at_bos:true
| ']' when in_class && class_at_bos ->
(* First ']' after '[' or '[^' is literal, not a closer. *)
next scanner;
scan ~in_class ~class_at_bos:false
| ']' when in_class ->
(* Leave character class. *)
next scanner;
scan ~in_class:false ~class_at_bos:false
| _ ->
next scanner;
scan ()
scan ~in_class ~class_at_bos:(if in_class then false else class_at_bos)
in
let pattern, flags = scan () in
let pattern, flags = scan ~in_class:false ~class_at_bos:false in
let end_pos = position scanner in
(start_pos, end_pos, Token.Regex (pattern, flags))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -499,4 +499,12 @@ let re = [%re {js|/^a*?$/|js}]
let re = [%re {js|/^((a)c)?(ab)$/|js}]
let re = [%re {js|/^([ab]*?)(?=(b)?)c/|js}]
let re = [%re {js|/^([ab]*?)(?!(b))c/|js}]
let re = [%re {js|/^([ab]*?)(?<!(a))c/|js}]
let re = [%re {js|/^([ab]*?)(?<!(a))c/|js}]
let re = [%re {js|/\.[^/.]+$/|js}]
let re = [%re {js|/[]/]/|js}]
let re = [%re {js|/[^]]/|js}]
let re = [%re {js|/[/]/|js}]
let re = [%re {js|/[]]/|js}]
let re = [%re {js|/[\]]/|js}]
let re = [%re {js|/[[]]/|js}]
let re = [%re {js|/[^]/]/|js}]
13 changes: 13 additions & 0 deletions tests/syntax_tests/data/parsing/grammar/expressions/regex.res
Original file line number Diff line number Diff line change
Expand Up @@ -607,3 +607,16 @@ let re = /^((a)c)?(ab)$/
let re = /^([ab]*?)(?=(b)?)c/
let re = /^([ab]*?)(?!(b))c/
let re = /^([ab]*?)(?<!(a))c/

let re = /\.[^/.]+$/

// Leading ']' is literal; '/' inside class must not terminate
let re = /[]/]/
let re = /[^]]/
let re = /[/]/

// Additional leading ']' edge cases
let re = /[]]/
let re = /[\]]/
let re = /[[]]/
let re = /[^]/]/
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible for the compiler to reject /[]/]/ and /[^]/]/? They're not valid regexes in JS and so would cause a runtime error

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally yeah, but I'm not sure how we could do that in a good way. Any ideas?

Copy link
Member

@mediremi mediremi Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude generated the following (diff is against master, not this branch):

diff --git i/compiler/syntax/src/res_scanner.ml w/compiler/syntax/src/res_scanner.ml
index c404d36cc..3415fbd02 100644
--- i/compiler/syntax/src/res_scanner.ml
+++ w/compiler/syntax/src/res_scanner.ml
@@ -580,9 +580,9 @@ let scan_regex scanner =
       bring_buf_up_to_date ~start_offset:last_char_offset;
       Buffer.contents buf)
   in
-  let rec scan () =
+  let rec scan ?(in_char_class = false) () =
     match scanner.ch with
-    | '/' ->
+    | '/' when not in_char_class ->
       let last_char_offset = scanner.offset in
       next scanner;
       let pattern = result ~first_char_offset ~last_char_offset in
@@ -606,10 +606,16 @@ let scan_regex scanner =
     | '\\' ->
       next scanner;
       next scanner;
-      scan ()
+      scan ~in_char_class ()
+    | '[' when not in_char_class ->
+      next scanner;
+      scan ~in_char_class:true ()
+    | ']' when in_char_class ->
+      next scanner;
+      scan ~in_char_class:false ()
     | _ ->
       next scanner;
-      scan ()
+      scan ~in_char_class ()
   in
   let pattern, flags = scan () in
   let end_pos = position scanner in

Which manages to compile these regexes:

let re = /\.[^/.]+$/
let re = /[^]]/
let re = /[/]/

let re = /[]]/
let re = /[\]]/
let re = /[[]]/

And rejects the following:

let re = /[]/]/
let re = /[^]/]/

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's a definition of what are valid regexes?
Eg wondering about 2 vs 3 "/" and how to know one regexp is finished.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a character class you can 0 to infinity (unescaped) /:

let re = /[/]/
let re = /[//]/
let re = /[///]/

But outside of character classes, / has to be escaped

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we need to keep track of the [], and that's it essentially?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mediremi would you be up for taking a stab at fixing what's left of this? Making sure the cases you mention are rejected.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem - I'll finish off this PR later today 👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Feels like it needs a fresh pair of eyes.

Loading