add support for parsing unicode ranges (#137)

bartveneman · web-flow · commit 217f88782cfe · 2026-01-20T23:21:23.000+01:00
closes #136
diff --git a/src/arena.ts b/src/arena.ts
@@ -62,7 +62,7 @@ export const FUNCTION = 15 // function: calc(), var()
 export const OPERATOR = 16 // operator: +, -, *, /, comma
 export const PARENTHESIS = 17 // parenthesized expression: (100% - 50px)
 export const URL = 18 // URL: url("file.css"), url(image.png), used in values and @import
-export const VALUE = 19 // Wrapper for declaration values
+export const UNICODE_RANGE = 19 // unicode range: u+0025-00ff, u+4??
 
 // Selector node type constants (for detailed selector parsing)
 export const SELECTOR_LIST = 20 // comma-separated selectors
@@ -90,6 +90,9 @@ export const PRELUDE_OPERATOR = 38 // logical operator: and, or, not
 export const FEATURE_RANGE = 39 // Range syntax: (50px <= width <= 100px)
 export const AT_RULE_PRELUDE = 40 // Wrapper for at-rule prelude children
 
+// Wrapper node types
+export const VALUE = 50 // Wrapper for declaration values
+
 // Flag constants (bit-packed in 1 byte)
 export const FLAG_IMPORTANT = 1 << 0 // Has !important
 export const FLAG_HAS_ERROR = 1 << 1 // Syntax error
diff --git a/src/css-node.ts b/src/css-node.ts
@@ -17,6 +17,7 @@ import {
 	OPERATOR,
 	PARENTHESIS,
 	URL,
+	UNICODE_RANGE,
 	VALUE,
 	SELECTOR_LIST,
 	TYPE_SELECTOR,
@@ -69,6 +70,7 @@ export const TYPE_NAMES = {
 	[OPERATOR]: 'Operator',
 	[PARENTHESIS]: 'Parentheses',
 	[URL]: 'Url',
+	[UNICODE_RANGE]: 'UnicodeRange',
 	[VALUE]: 'Value',
 	[SELECTOR_LIST]: 'SelectorList',
 	[TYPE_SELECTOR]: 'TypeSelector',
diff --git a/src/parse-value.test.ts b/src/parse-value.test.ts
@@ -1,6 +1,6 @@
 import { describe, it, expect } from 'vitest'
 import { parse } from './parse'
-import { IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, VALUE } from './arena'
+import { IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, UNICODE_RANGE, VALUE } from './arena'
 
 describe('Value Node Types', () => {
 	// Helper to get first value node from a declaration
@@ -217,6 +217,30 @@ describe('Value Node Types', () => {
 				expect(value?.column).toBe(15)
 			})
 		})
+
+		describe('UNICODE_RANGE', () => {
+			it('should have correct offset and length', () => {
+				// Single-line input: the range begins at offset 28, reported as column 29
+				const at_rule = parse('@font-face { unicode-range: u+0025-00ff; }').first_child
+				const range = at_rule?.block?.first_child?.first_child?.children[0]
+				expect(range?.start).toBe(28)
+				expect(range?.length).toBe(11)
+				expect(range?.end).toBe(39)
+				expect(range?.line).toBe(1)
+				expect(range?.column).toBe(29)
+			})
+
+			it('should have correct line and column on line 2', () => {
+				// The declaration sits on the second line of the input
+				const at_rule = parse('@font-face {\n  unicode-range: u+4??;\n}').first_child
+				const range = at_rule?.block?.first_child?.first_child?.children[0]
+				expect(range?.start).toBe(30)
+				expect(range?.length).toBe(5)
+				expect(range?.end).toBe(35)
+				expect(range?.line).toBe(2)
+				expect(range?.column).toBe(18)
+			})
+		})
 	})
 
 	describe('Types', () => {
@@ -267,6 +291,14 @@ describe('Value Node Types', () => {
 			const value = getValue('div { background: url("image.png"); }')
 			expect(value?.type).toBe(URL)
 		})
+
+		it('UNICODE_RANGE type constant', () => {
+			// Only the first range of the comma-separated list is inspected
+			const css = '@font-face { unicode-range: u+0460-052f, u+1c80-1c8a, u+20b4, u+2de0-2dff, u+a640-a69f, u+fe2e-fe2f; }'
+			const font_face = parse(css).first_child
+			const first_range = font_face?.block?.first_child?.first_child?.children[0]
+			expect(first_range?.type).toBe(UNICODE_RANGE)
+		})
 	})
 
 	describe('Type Names', () => {
@@ -317,6 +349,12 @@ describe('Value Node Types', () => {
 			const value = getValue('div { background: url("image.png"); }')
 			expect(value?.type_name).toBe('Url')
 		})
+
+		it('UNICODE_RANGE type_name', () => {
+			const declaration = parse('@font-face { unicode-range: u+0025-00ff; }').first_child?.block?.first_child
+			const first_range = declaration?.first_child?.children[0]
+			expect(first_range?.type_name).toBe('UnicodeRange')
+		})
 	})
 
 	describe('Value Properties', () => {
@@ -759,6 +797,90 @@ describe('Value Node Types', () => {
 			})
 		})
 
+		describe('UNICODE_RANGE', () => {
+			it('should parse simple unicode range', () => {
+				const declaration = parse('@font-face { unicode-range: u+0025-00ff; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts).toHaveLength(1)
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+0025-00ff')
+			})
+
+			it('should parse single codepoint', () => {
+				const declaration = parse('@font-face { unicode-range: u+26; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts).toHaveLength(1)
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+26')
+			})
+
+			it('should parse wildcard pattern with question marks', () => {
+				const declaration = parse('@font-face { unicode-range: u+4??; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts).toHaveLength(1)
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+4??')
+			})
+
+			it('should parse uppercase U+', () => {
+				const declaration = parse('@font-face { unicode-range: U+0025-00FF; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts).toHaveLength(1)
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('U+0025-00FF')
+			})
+
+			it('should parse multiple unicode ranges', () => {
+				const declaration = parse('@font-face { unicode-range: u+0460-052f, u+1c80-1c8a, u+20b4; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts).toHaveLength(5) // 3 ranges + 2 commas
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+0460-052f')
+				expect(parts?.[1].type).toBe(OPERATOR)
+				expect(parts?.[1].text).toBe(',')
+				expect(parts?.[2].type).toBe(UNICODE_RANGE)
+				expect(parts?.[2].text).toBe('u+1c80-1c8a')
+				expect(parts?.[3].type).toBe(OPERATOR)
+				expect(parts?.[4].type).toBe(UNICODE_RANGE)
+				expect(parts?.[4].text).toBe('u+20b4')
+			})
+
+			it('should parse short hex values', () => {
+				const declaration = parse('@font-face { unicode-range: u+0; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+0')
+			})
+
+			it('should parse maximum valid unicode', () => {
+				const declaration = parse('@font-face { unicode-range: u+10ffff; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+10ffff')
+			})
+
+			it('should parse wildcard variations', () => {
+				const declaration = parse('@font-face { unicode-range: u+?, u+??, u+???, u+????, u+?????, u+??????; }').first_child?.block?.first_child
+				const parts = declaration?.first_child!.children
+
+				expect(parts?.[0].type).toBe(UNICODE_RANGE)
+				expect(parts?.[0].text).toBe('u+?')
+				expect(parts?.[2].type).toBe(UNICODE_RANGE)
+				expect(parts?.[2].text).toBe('u+??')
+				expect(parts?.[4].type).toBe(UNICODE_RANGE)
+				expect(parts?.[4].text).toBe('u+???')
+				expect(parts?.[10].type).toBe(UNICODE_RANGE)
+				expect(parts?.[10].text).toBe('u+??????')
+			})
+		})
+
 		describe('Mixed values', () => {
 			it('should parse mixed value types', () => {
 				const root = parse('body { border: 1px solid red; }')
diff --git a/src/parse-value.ts b/src/parse-value.ts
@@ -1,6 +1,6 @@
 // Value Parser - Parses CSS declaration values into structured AST nodes
 import { Lexer } from './tokenize'
-import { CSSDataArena, IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, VALUE } from './arena'
+import { CSSDataArena, IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, UNICODE_RANGE, VALUE } from './arena'
 import {
 	TOKEN_IDENT,
 	TOKEN_NUMBER,
@@ -14,6 +14,7 @@ import {
 	TOKEN_EOF,
 	TOKEN_LEFT_PAREN,
 	TOKEN_RIGHT_PAREN,
+	TOKEN_UNICODE_RANGE,
 } from './token-types'
 import { is_whitespace, CHAR_MINUS_HYPHEN, CHAR_PLUS, CHAR_ASTERISK, CHAR_FORWARD_SLASH, str_equals } from './string-utils'
 import { CSSNode } from './css-node'
@@ -130,6 +131,9 @@ export class ValueParser {
 			case TOKEN_HASH:
 				return this.create_node(HASH, start, end)
 
+			case TOKEN_UNICODE_RANGE:
+				return this.create_node(UNICODE_RANGE, start, end)
+
 			case TOKEN_FUNCTION:
 				return this.parse_function_node(start, end)
 
diff --git a/src/token-types.ts b/src/token-types.ts
@@ -27,6 +27,7 @@ export const TOKEN_LEFT_BRACE = 23 // {
 export const TOKEN_RIGHT_BRACE = 24 // }
 export const TOKEN_COMMENT = 25
 export const TOKEN_EOF = 26
+export const TOKEN_UNICODE_RANGE = 27 // u+0025-00ff, u+4??
 
 export type TokenType =
 	| typeof TOKEN_IDENT
@@ -55,6 +56,7 @@ export type TokenType =
 	| typeof TOKEN_RIGHT_BRACE
 	| typeof TOKEN_COMMENT
 	| typeof TOKEN_EOF
+	| typeof TOKEN_UNICODE_RANGE
 
 export type Token = {
 	type: TokenType
diff --git a/src/tokenize.ts b/src/tokenize.ts
@@ -33,6 +33,7 @@ import {
 	TOKEN_LEFT_BRACE,
 	TOKEN_RIGHT_BRACE,
 	TOKEN_EOF,
+	TOKEN_UNICODE_RANGE,
 	type Token,
 	type TokenType,
 } from './token-types'
@@ -63,6 +64,9 @@ const CHAR_PLUS = 0x2b // +
 const CHAR_PERCENT = 0x25 // %
 const CHAR_LOWERCASE_E = 0x65 // e
 const CHAR_UPPERCASE_E = 0x45 // E
+const CHAR_LOWERCASE_U = 0x75 // u
+const CHAR_UPPERCASE_U = 0x55 // U
+const CHAR_QUESTION_MARK = 0x3f // ?
 const CHAR_CARRIAGE_RETURN = 0x0d // \r
 const CHAR_LINE_FEED = 0x0a // \n
 
@@ -488,6 +492,16 @@ export class Lexer {
 			}
 		}
 
+		// Check for unicode-range: u+ or U+
+		// Must be exactly 'u' or 'U' followed by '+'
+		if (this.pos - start === 1) {
+			let first_ch = this.source.charCodeAt(start)
+			if ((first_ch === CHAR_LOWERCASE_U || first_ch === CHAR_UPPERCASE_U) &&
+				this.pos < this.source.length && this.source.charCodeAt(this.pos) === CHAR_PLUS) {
+				return this.consume_unicode_range(start, start_line, start_column)
+			}
+		}
+
 		// Check for function: ident(
 		if (this.pos < this.source.length && this.source.charCodeAt(this.pos) === CHAR_LEFT_PAREN) {
 			this.advance()
@@ -497,6 +511,75 @@ export class Lexer {
 		return this.make_token(TOKEN_IDENT, start, this.pos, start_line, start_column)
 	}
 
+	// Consume a unicode-range token such as u+26, U+0025-00FF or u+4??.
+	// Called with the leading 'u'/'U' already consumed and this.pos on the '+'.
+	// A range only starts when the '+' is directly followed by a hex digit or '?';
+	// otherwise the 'u' is returned as TOKEN_IDENT and the '+' is left for the
+	// next token, so input like 'u+x' never produces an empty 'u+' range.
+	consume_unicode_range(start: number, start_line: number, start_column: number): TokenType {
+		let after_plus = this.pos + 1
+		let starts_range = false
+		if (after_plus < this.source.length) {
+			let next_ch = this.source.charCodeAt(after_plus)
+			starts_range = is_hex_digit(next_ch) || next_ch === CHAR_QUESTION_MARK
+		}
+		if (!starts_range) {
+			// Not a range: emit the lone 'u'/'U' as an identifier, '+' stays unconsumed
+			return this.make_token(TOKEN_IDENT, start, this.pos, start_line, start_column)
+		}
+
+		this.advance() // consume '+'
+
+		let hex_digits = 0
+		let has_question = false
+
+		// Consume hex digits and/or question marks (up to 6 total)
+		while (this.pos < this.source.length && hex_digits < 6) {
+			let ch = this.source.charCodeAt(this.pos)
+			if (is_hex_digit(ch)) {
+				if (has_question) {
+					// Can't have hex digits after question marks
+					break
+				}
+				this.advance()
+				hex_digits++
+			} else if (ch === CHAR_QUESTION_MARK) {
+				this.advance()
+				hex_digits++
+				has_question = true
+			} else {
+				break
+			}
+		}
+
+		// If we have question marks, we're done (no range allowed)
+		if (has_question) {
+			return this.make_token(TOKEN_UNICODE_RANGE, start, this.pos, start_line, start_column)
+		}
+
+		// Check for range syntax: -HHHHHH
+		if (this.pos < this.source.length && this.source.charCodeAt(this.pos) === CHAR_HYPHEN) {
+			// Peek ahead to see if there's a hex digit
+			if (this.pos + 1 < this.source.length && is_hex_digit(this.source.charCodeAt(this.pos + 1))) {
+				this.advance() // consume '-'
+
+				// Consume up to 6 hex digits for the end of the range
+				let end_hex_digits = 0
+				while (this.pos < this.source.length && end_hex_digits < 6) {
+					let ch = this.source.charCodeAt(this.pos)
+					if (is_hex_digit(ch)) {
+						this.advance()
+						end_hex_digits++
+					} else {
+						break
+					}
+				}
+			}
+		}
+
+		return this.make_token(TOKEN_UNICODE_RANGE, start, this.pos, start_line, start_column)
+	}
+
 	consume_at_keyword(start_line: number, start_column: number): TokenType {
 		let start = this.pos
 		this.advance() // Skip @