Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 148 additions & 9 deletions packages/parser/lib/Lexer.re
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,28 @@ module Location = Ppxlib.Location;

let (let.ok) = Result.bind;

/* Lexer modes for contextual whitespace handling */
type lexer_mode =
| Inside_selector /* WS is significant (descendant combinator) */
| Inside_declaration /* WS is ignored (cosmetic) */
| Inside_at_rule; /* WS handling TBD, skip for now */

let mode = ref(Inside_declaration);
let reset_mode = m => mode := m;

/* Token buffer for lookahead - stores (token, start_pos, end_pos) */
type buffered_token = (Parser.token, Lexing.position, Lexing.position);
let token_buffer: ref(list(buffered_token)) = ref([]);

/* Block context stack - tracks selector vs declaration context at each brace level */
let context_stack: ref(list(lexer_mode)) = ref([]);

/* Reset all lexer state (called by driver before parsing) */
let reset_lexer_state = () => {
token_buffer := [];
context_stack := [];
};

/** Signals a lexing error at the provided source location. */
exception LexingError((Lexing.position, Lexing.position, string));

Expand Down Expand Up @@ -598,10 +620,63 @@ let handle_tokenizer_error = lexbuf => {
};
};

let rec get_next_token = lexbuf => {
/*
* Lookahead mechanism for determining selector vs declaration context.
*
* After seeing `{`, we peek ahead to find either:
* - `{` at depth 0 → selector context (nested rule like `.foo { .bar { } }`)
* - `;` at depth 0 → declaration context (property like `color: red;`)
*
* We buffer the peeked tokens and filter WS based on the determined context.
*/

/* Peek ahead from current position to determine context.
Returns (context, list of buffered tokens).
Stops when we find `{` or `;` at depth 0, or `}` or EOF. */
let rec peek_for_context = (lexbuf, depth, acc) => {
let (start_pos, _) = Sedlexing.lexing_positions(lexbuf);
let token = get_next_token_raw(lexbuf);
let (_, end_pos) = Sedlexing.lexing_positions(lexbuf);
let buffered = (token, start_pos, end_pos);
let acc = [buffered, ...acc];

switch (token) {
/* Found `{` at depth 0 - this is a nested selector */
| Parser.LEFT_BRACE when depth == 0 => (Inside_selector, List.rev(acc))
/* Found `;` at depth 0 - this is a declaration */
| Parser.SEMI_COLON when depth == 0 => (Inside_declaration, List.rev(acc))
/* Found `}` at depth 0 - end of block, assume declaration context */
| Parser.RIGHT_BRACE when depth == 0 => (Inside_declaration, List.rev(acc))
/* EOF - assume declaration context */
| Parser.EOF => (Inside_declaration, List.rev(acc))
/* Track brace depth */
| Parser.LEFT_BRACE => peek_for_context(lexbuf, depth + 1, acc)
| Parser.RIGHT_BRACE => peek_for_context(lexbuf, depth - 1, acc)
/* Track paren/bracket depth (for things like `calc(...)`) */
| Parser.LEFT_PAREN
| Parser.LEFT_BRACKET => peek_for_context(lexbuf, depth + 1, acc)
| Parser.RIGHT_PAREN
| Parser.RIGHT_BRACKET => peek_for_context(lexbuf, depth - 1, acc)
/* Keep scanning */
| _ => peek_for_context(lexbuf, depth, acc)
};
}

/* Filter WS tokens from buffer based on context.
NOTE: Currently disabled - we keep all WS because:
1. Declaration values need WS (e.g., "1px solid transparent")
2. @media preludes need WS (e.g., "screen and (min-width: 768px)")
The parser grammar handles WS via WS? for optional whitespace. */
and filter_buffer_ws = (_context, buffer) => {
/* Keep all WS - the parser grammar handles it appropriately */
buffer;
}

/* Raw token getter - doesn't do any context-aware filtering */
and get_next_token_raw = lexbuf => {
switch%sedlex (lexbuf) {
| eof => Parser.EOF
| Star(comment) => get_next_token(lexbuf)
| Star(comment) => get_next_token_raw(lexbuf)
| "/*" => discard_comments(lexbuf)
| '.' => DOT
| ';' => SEMI_COLON
Expand Down Expand Up @@ -648,6 +723,58 @@ let rec get_next_token = lexbuf => {
| _ => unreachable(lexbuf)
};
}

/* Context-aware token getter with lookahead for WS handling.
Note: Does NOT handle buffered tokens - that's done by get_next_tokens_with_location.
This function populates the buffer when it sees `{`. */
and get_next_token = lexbuf => {
let token = get_next_token_raw(lexbuf);

switch (token) {
/* On `{`, determine context via lookahead */
| Parser.LEFT_BRACE =>
/* Push current mode onto stack before entering block */
context_stack := [mode^, ...context_stack^];

/* Peek ahead to determine if this block contains selectors or declarations */
let (context, peeked_tokens) = peek_for_context(lexbuf, 0, []);

/* Set mode for this block */
mode := context;

/* Filter WS from peeked tokens based on context and buffer them */
token_buffer := filter_buffer_ws(context, peeked_tokens);

/* Return the LEFT_BRACE */
Parser.LEFT_BRACE;

/* On `}`, restore previous context from stack */
| Parser.RIGHT_BRACE =>
switch (context_stack^) {
| [prev_mode, ...rest] =>
mode := prev_mode;
context_stack := rest;
| [] =>
/* Stack underflow - reset to selector mode (top level) */
mode := Inside_selector
};
Parser.RIGHT_BRACE;

/* On `@`-rules, keep in selector mode to preserve WS in preludes like "@media screen and (...)" */
| Parser.AT_RULE(_)
| Parser.AT_RULE_STATEMENT(_)
| Parser.AT_KEYFRAMES(_) =>
/* At-rule preludes need WS, so stay in selector mode.
The `{` will trigger lookahead for the block content. */
token

/* Keep WS tokens - the parser grammar handles them via WS? */
| Parser.WS => Parser.WS

/* All other tokens pass through */
| _ => token
};
}
and get_dimension = (n, lexbuf) => {
switch%sedlex (lexbuf) {
| length => FLOAT_DIMENSION((n, lexeme(lexbuf)))
Expand All @@ -661,7 +788,7 @@ and get_dimension = (n, lexbuf) => {
and discard_comments = lexbuf => {
let (start_pos, curr_pos) = Sedlexing.lexing_positions(lexbuf);
switch%sedlex (lexbuf) {
| "*/" => get_next_token(lexbuf)
| "*/" => get_next_token_raw(lexbuf)
| any => discard_comments(lexbuf)
| eof =>
raise(
Expand All @@ -671,16 +798,23 @@ and discard_comments = lexbuf => {
"Unterminated comment at the end of the string",
)),
)
| _ => get_next_token(lexbuf)
| _ => get_next_token_raw(lexbuf)
};
};

let get_next_tokens_with_location = lexbuf => {
let (position_start, _) = Sedlexing.lexing_positions(lexbuf);
let token = get_next_token(lexbuf);
let (_, position_end) = Sedlexing.lexing_positions(lexbuf);

(token, position_start, position_end);
/* Check if we have a buffered token with stored positions */
switch (token_buffer^) {
| [(token, start_pos, end_pos), ...rest] =>
token_buffer := rest;
(token, start_pos, end_pos);
| [] =>
/* No buffered token - get fresh token with positions */
let (position_start, _) = Sedlexing.lexing_positions(lexbuf);
let token = get_next_token(lexbuf);
let (_, position_end) = Sedlexing.lexing_positions(lexbuf);
(token, position_start, position_end);
};
};

let check_if_three_codepoints_would_start_an_identifier =
Expand Down Expand Up @@ -973,6 +1107,11 @@ let from_string = string => {
};

let tokenize = input => {
/* Reset lexer state for clean tokenization */
reset_lexer_state();
/* Start in selector mode to emit WS tokens (for testing raw lexer behavior) */
mode := Inside_selector;

let buffer = Sedlexing.Utf8.from_string(input);
let rec from_string = acc => {
switch (get_next_tokens_with_location(buffer)) {
Expand Down
11 changes: 11 additions & 0 deletions packages/parser/lib/Lexer.rei
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
/** Signals a lexing error at the provided source location. */
exception LexingError((Lexing.position, Lexing.position, string));

/* Lexer modes for contextual whitespace handling */
type lexer_mode =
| Inside_selector /* WS is significant (descendant combinator) */
| Inside_declaration /* WS is ignored (cosmetic) */
| Inside_at_rule; /* WS handling TBD, skip for now */

let reset_mode: lexer_mode => unit;

/* Reset all lexer state (buffer, context stack) - call before parsing */
let reset_lexer_state: unit => unit;

type token_with_location = {
txt: result(Tokens.token, (Tokens.token, Tokens.error)),
loc: Ast.loc,
Expand Down
Loading
Loading