Skip to content

Commit 217f887

Browse files
authored
add support for parsing unicode ranges (#137)
closes #136
1 parent 04b9aef commit 217f887

File tree

6 files changed

+204
-3
lines changed

6 files changed

+204
-3
lines changed

src/arena.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ export const FUNCTION = 15 // function: calc(), var()
6262
export const OPERATOR = 16 // operator: +, -, *, /, comma
6363
export const PARENTHESIS = 17 // parenthesized expression: (100% - 50px)
6464
export const URL = 18 // URL: url("file.css"), url(image.png), used in values and @import
65-
export const VALUE = 19 // Wrapper for declaration values
65+
export const UNICODE_RANGE = 19 // unicode range: u+0025-00ff, u+4??
6666

6767
// Selector node type constants (for detailed selector parsing)
6868
export const SELECTOR_LIST = 20 // comma-separated selectors
@@ -90,6 +90,9 @@ export const PRELUDE_OPERATOR = 38 // logical operator: and, or, not
9090
export const FEATURE_RANGE = 39 // Range syntax: (50px <= width <= 100px)
9191
export const AT_RULE_PRELUDE = 40 // Wrapper for at-rule prelude children
9292

93+
// Wrapper node types
94+
export const VALUE = 50 // Wrapper for declaration values
95+
9396
// Flag constants (bit-packed in 1 byte)
9497
export const FLAG_IMPORTANT = 1 << 0 // Has !important
9598
export const FLAG_HAS_ERROR = 1 << 1 // Syntax error

src/css-node.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import {
1717
OPERATOR,
1818
PARENTHESIS,
1919
URL,
20+
UNICODE_RANGE,
2021
VALUE,
2122
SELECTOR_LIST,
2223
TYPE_SELECTOR,
@@ -69,6 +70,7 @@ export const TYPE_NAMES = {
6970
[OPERATOR]: 'Operator',
7071
[PARENTHESIS]: 'Parentheses',
7172
[URL]: 'Url',
73+
[UNICODE_RANGE]: 'UnicodeRange',
7274
[VALUE]: 'Value',
7375
[SELECTOR_LIST]: 'SelectorList',
7476
[TYPE_SELECTOR]: 'TypeSelector',

src/parse-value.test.ts

Lines changed: 123 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { describe, it, expect } from 'vitest'
22
import { parse } from './parse'
3-
import { IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, VALUE } from './arena'
3+
import { IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, UNICODE_RANGE, VALUE } from './arena'
44

55
describe('Value Node Types', () => {
66
// Helper to get first value node from a declaration
@@ -217,6 +217,30 @@ describe('Value Node Types', () => {
217217
expect(value?.column).toBe(15)
218218
})
219219
})
220+
221+
describe('UNICODE_RANGE', () => {
222+
it('should have correct offset and length', () => {
223+
const root = parse('@font-face { unicode-range: u+0025-00ff; }')
224+
const declaration = root.first_child?.block?.first_child
225+
const value = declaration?.first_child?.children[0]
226+
expect(value?.start).toBe(28)
227+
expect(value?.length).toBe(11)
228+
expect(value?.end).toBe(39)
229+
expect(value?.line).toBe(1)
230+
expect(value?.column).toBe(29)
231+
})
232+
233+
it('should have correct line and column on line 2', () => {
234+
const root = parse('@font-face {\n unicode-range: u+4??;\n}')
235+
const declaration = root.first_child?.block?.first_child
236+
const value = declaration?.first_child?.children[0]
237+
expect(value?.start).toBe(30)
238+
expect(value?.length).toBe(5)
239+
expect(value?.end).toBe(35)
240+
expect(value?.line).toBe(2)
241+
expect(value?.column).toBe(18)
242+
})
243+
})
220244
})
221245

222246
describe('Types', () => {
@@ -267,6 +291,14 @@ describe('Value Node Types', () => {
267291
const value = getValue('div { background: url("image.png"); }')
268292
expect(value?.type).toBe(URL)
269293
})
294+
295+
it('UNICODE_RANGE type constant', () => {
296+
const root = parse('@font-face { unicode-range: u+0460-052f, u+1c80-1c8a, u+20b4, u+2de0-2dff, u+a640-a69f, u+fe2e-fe2f; }')
297+
const atrule = root.first_child
298+
const declaration = atrule?.block?.first_child
299+
const unicode_range = declaration?.first_child?.children[0]
300+
expect(unicode_range?.type).toBe(UNICODE_RANGE)
301+
})
270302
})
271303

272304
describe('Type Names', () => {
@@ -317,6 +349,12 @@ describe('Value Node Types', () => {
317349
const value = getValue('div { background: url("image.png"); }')
318350
expect(value?.type_name).toBe('Url')
319351
})
352+
353+
it('UNICODE_RANGE type_name', () => {
354+
const root = parse('@font-face { unicode-range: u+0025-00ff; }')
355+
const unicode_range = root.first_child?.block?.first_child?.first_child?.children[0]
356+
expect(unicode_range?.type_name).toBe('UnicodeRange')
357+
})
320358
})
321359

322360
describe('Value Properties', () => {
@@ -759,6 +797,90 @@ describe('Value Node Types', () => {
759797
})
760798
})
761799

800+
describe('UNICODE_RANGE', () => {
801+
it('should parse simple unicode range', () => {
802+
const root = parse('@font-face { unicode-range: u+0025-00ff; }')
803+
const decl = root.first_child?.block?.first_child
804+
805+
expect(decl?.first_child!.children).toHaveLength(1)
806+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
807+
expect(decl?.first_child!.children[0].text).toBe('u+0025-00ff')
808+
})
809+
810+
it('should parse single codepoint', () => {
811+
const root = parse('@font-face { unicode-range: u+26; }')
812+
const decl = root.first_child?.block?.first_child
813+
814+
expect(decl?.first_child!.children).toHaveLength(1)
815+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
816+
expect(decl?.first_child!.children[0].text).toBe('u+26')
817+
})
818+
819+
it('should parse wildcard pattern with question marks', () => {
820+
const root = parse('@font-face { unicode-range: u+4??; }')
821+
const decl = root.first_child?.block?.first_child
822+
823+
expect(decl?.first_child!.children).toHaveLength(1)
824+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
825+
expect(decl?.first_child!.children[0].text).toBe('u+4??')
826+
})
827+
828+
it('should parse uppercase U+', () => {
829+
const root = parse('@font-face { unicode-range: U+0025-00FF; }')
830+
const decl = root.first_child?.block?.first_child
831+
832+
expect(decl?.first_child!.children).toHaveLength(1)
833+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
834+
expect(decl?.first_child!.children[0].text).toBe('U+0025-00FF')
835+
})
836+
837+
it('should parse multiple unicode ranges', () => {
838+
const root = parse('@font-face { unicode-range: u+0460-052f, u+1c80-1c8a, u+20b4; }')
839+
const decl = root.first_child?.block?.first_child
840+
841+
expect(decl?.first_child!.children).toHaveLength(5) // 3 ranges + 2 commas
842+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
843+
expect(decl?.first_child!.children[0].text).toBe('u+0460-052f')
844+
expect(decl?.first_child!.children[1].type).toBe(OPERATOR)
845+
expect(decl?.first_child!.children[1].text).toBe(',')
846+
expect(decl?.first_child!.children[2].type).toBe(UNICODE_RANGE)
847+
expect(decl?.first_child!.children[2].text).toBe('u+1c80-1c8a')
848+
expect(decl?.first_child!.children[3].type).toBe(OPERATOR)
849+
expect(decl?.first_child!.children[4].type).toBe(UNICODE_RANGE)
850+
expect(decl?.first_child!.children[4].text).toBe('u+20b4')
851+
})
852+
853+
it('should parse short hex values', () => {
854+
const root = parse('@font-face { unicode-range: u+0; }')
855+
const decl = root.first_child?.block?.first_child
856+
857+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
858+
expect(decl?.first_child!.children[0].text).toBe('u+0')
859+
})
860+
861+
it('should parse maximum valid unicode', () => {
862+
const root = parse('@font-face { unicode-range: u+10ffff; }')
863+
const decl = root.first_child?.block?.first_child
864+
865+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
866+
expect(decl?.first_child!.children[0].text).toBe('u+10ffff')
867+
})
868+
869+
it('should parse wildcard variations', () => {
870+
const root = parse('@font-face { unicode-range: u+?, u+??, u+???, u+????, u+?????, u+??????; }')
871+
const decl = root.first_child?.block?.first_child
872+
873+
expect(decl?.first_child!.children[0].type).toBe(UNICODE_RANGE)
874+
expect(decl?.first_child!.children[0].text).toBe('u+?')
875+
expect(decl?.first_child!.children[2].type).toBe(UNICODE_RANGE)
876+
expect(decl?.first_child!.children[2].text).toBe('u+??')
877+
expect(decl?.first_child!.children[4].type).toBe(UNICODE_RANGE)
878+
expect(decl?.first_child!.children[4].text).toBe('u+???')
879+
expect(decl?.first_child!.children[10].type).toBe(UNICODE_RANGE)
880+
expect(decl?.first_child!.children[10].text).toBe('u+??????')
881+
})
882+
})
883+
762884
describe('Mixed values', () => {
763885
it('should parse mixed value types', () => {
764886
const root = parse('body { border: 1px solid red; }')

src/parse-value.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// Value Parser - Parses CSS declaration values into structured AST nodes
22
import { Lexer } from './tokenize'
3-
import { CSSDataArena, IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, VALUE } from './arena'
3+
import { CSSDataArena, IDENTIFIER, NUMBER, DIMENSION, STRING, HASH, FUNCTION, OPERATOR, PARENTHESIS, URL, UNICODE_RANGE, VALUE } from './arena'
44
import {
55
TOKEN_IDENT,
66
TOKEN_NUMBER,
@@ -14,6 +14,7 @@ import {
1414
TOKEN_EOF,
1515
TOKEN_LEFT_PAREN,
1616
TOKEN_RIGHT_PAREN,
17+
TOKEN_UNICODE_RANGE,
1718
} from './token-types'
1819
import { is_whitespace, CHAR_MINUS_HYPHEN, CHAR_PLUS, CHAR_ASTERISK, CHAR_FORWARD_SLASH, str_equals } from './string-utils'
1920
import { CSSNode } from './css-node'
@@ -130,6 +131,9 @@ export class ValueParser {
130131
case TOKEN_HASH:
131132
return this.create_node(HASH, start, end)
132133

134+
case TOKEN_UNICODE_RANGE:
135+
return this.create_node(UNICODE_RANGE, start, end)
136+
133137
case TOKEN_FUNCTION:
134138
return this.parse_function_node(start, end)
135139

src/token-types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ export const TOKEN_LEFT_BRACE = 23 // {
2727
export const TOKEN_RIGHT_BRACE = 24 // }
2828
export const TOKEN_COMMENT = 25
2929
export const TOKEN_EOF = 26
30+
export const TOKEN_UNICODE_RANGE = 27 // u+0025-00ff, u+4??
3031

3132
export type TokenType =
3233
| typeof TOKEN_IDENT
@@ -55,6 +56,7 @@ export type TokenType =
5556
| typeof TOKEN_RIGHT_BRACE
5657
| typeof TOKEN_COMMENT
5758
| typeof TOKEN_EOF
59+
| typeof TOKEN_UNICODE_RANGE
5860

5961
export type Token = {
6062
type: TokenType

src/tokenize.ts

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import {
3333
TOKEN_LEFT_BRACE,
3434
TOKEN_RIGHT_BRACE,
3535
TOKEN_EOF,
36+
TOKEN_UNICODE_RANGE,
3637
type Token,
3738
type TokenType,
3839
} from './token-types'
@@ -63,6 +64,9 @@ const CHAR_PLUS = 0x2b // +
6364
const CHAR_PERCENT = 0x25 // %
6465
const CHAR_LOWERCASE_E = 0x65 // e
6566
const CHAR_UPPERCASE_E = 0x45 // E
67+
const CHAR_LOWERCASE_U = 0x75 // u
68+
const CHAR_UPPERCASE_U = 0x55 // U
69+
const CHAR_QUESTION_MARK = 0x3f // ?
6670
const CHAR_CARRIAGE_RETURN = 0x0d // \r
6771
const CHAR_LINE_FEED = 0x0a // \n
6872

@@ -488,6 +492,16 @@ export class Lexer {
488492
}
489493
}
490494

495+
// Check for unicode-range: u+ or U+
496+
// Must be exactly 'u' or 'U' followed by '+'
497+
if (this.pos - start === 1) {
498+
let first_ch = this.source.charCodeAt(start)
499+
if ((first_ch === CHAR_LOWERCASE_U || first_ch === CHAR_UPPERCASE_U) &&
500+
this.pos < this.source.length && this.source.charCodeAt(this.pos) === CHAR_PLUS) {
501+
return this.consume_unicode_range(start, start_line, start_column)
502+
}
503+
}
504+
491505
// Check for function: ident(
492506
if (this.pos < this.source.length && this.source.charCodeAt(this.pos) === CHAR_LEFT_PAREN) {
493507
this.advance()
@@ -497,6 +511,60 @@ export class Lexer {
497511
return this.make_token(TOKEN_IDENT, start, this.pos, start_line, start_column)
498512
}
499513

514+
/**
 * Consume a unicode-range token such as `u+26`, `U+0025-00FF` or `u+4??`.
 *
 * Expects `this.pos` to be on the `+` that directly follows a lone `u`/`U`
 * located at `start`.
 *
 * Accepted forms:
 *  - up to 6 hex digits, optionally followed by `-` and up to 6 more hex digits
 *  - up to 6 characters made of hex digits followed by trailing `?` wildcards
 *    (no `-` range part is consumed after a wildcard)
 *
 * If the character after `+` is neither a hex digit nor `?`, the input does not
 * start a unicode-range (e.g. the selector `u+p`, or `u+` at end of input). In
 * that case the `+` is left unconsumed and the lone `u`/`U` is returned as an
 * identifier token.
 *
 * @param start - Offset of the leading `u`/`U`.
 * @param start_line - Line of the leading `u`/`U`.
 * @param start_column - Column of the leading `u`/`U`.
 * @returns The token produced by `make_token` (unicode-range or identifier).
 */
consume_unicode_range(start: number, start_line: number, start_column: number): TokenType {
	// Peek past the '+' before consuming anything: only a hex digit or '?'
	// may start the range part.
	let after_plus = this.pos + 1
	let starts_range = false
	if (after_plus < this.source.length) {
		let next_ch = this.source.charCodeAt(after_plus)
		starts_range = is_hex_digit(next_ch) || next_ch === CHAR_QUESTION_MARK
	}
	if (!starts_range) {
		// Not a unicode-range: emit the identifier and leave '+' for the next token
		return this.make_token(TOKEN_IDENT, start, this.pos, start_line, start_column)
	}

	this.advance() // consume '+'

	let hex_digits = 0
	let has_question = false

	// Consume hex digits and/or question marks (up to 6 total)
	while (this.pos < this.source.length && hex_digits < 6) {
		let ch = this.source.charCodeAt(this.pos)
		if (is_hex_digit(ch)) {
			if (has_question) {
				// Can't have hex digits after question marks
				break
			}
			this.advance()
			hex_digits++
		} else if (ch === CHAR_QUESTION_MARK) {
			this.advance()
			hex_digits++
			has_question = true
		} else {
			break
		}
	}

	// If we have question marks, we're done (no range allowed)
	if (has_question) {
		return this.make_token(TOKEN_UNICODE_RANGE, start, this.pos, start_line, start_column)
	}

	// Check for range syntax: -HHHHHH
	// The '-' is only consumed when a hex digit follows it, so `u+26-` keeps
	// the hyphen out of the token.
	if (
		this.pos + 1 < this.source.length &&
		this.source.charCodeAt(this.pos) === CHAR_HYPHEN &&
		is_hex_digit(this.source.charCodeAt(this.pos + 1))
	) {
		this.advance() // consume '-'

		// Consume up to 6 hex digits for the end of the range
		let end_hex_digits = 0
		while (this.pos < this.source.length && end_hex_digits < 6) {
			if (!is_hex_digit(this.source.charCodeAt(this.pos))) {
				break
			}
			this.advance()
			end_hex_digits++
		}
	}

	return this.make_token(TOKEN_UNICODE_RANGE, start, this.pos, start_line, start_column)
}
567+
500568
consume_at_keyword(start_line: number, start_column: number): TokenType {
501569
let start = this.pos
502570
this.advance() // Skip @

0 commit comments

Comments
 (0)