From 98068b747ac7a0a0dd92517b5935634cb58e6eb4 Mon Sep 17 00:00:00 2001 From: Matt Kantor Date: Sun, 22 Dec 2024 12:53:20 -0500 Subject: [PATCH 1/2] Add parser.lookaheadNot combinator --- src/parsing/combinators.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/parsing/combinators.ts b/src/parsing/combinators.ts index adfa054..7f5a82b 100644 --- a/src/parsing/combinators.ts +++ b/src/parsing/combinators.ts @@ -37,6 +37,24 @@ export const lazy = input => parser()(input) +export const lookaheadNot = + ( + parser: Parser, + notFollowedBy: Parser, + followedByName: string, + ): Parser => + input => + either.flatMap(parser(input), success => + either.match(notFollowedBy(success.remainingInput), { + left: _ => either.makeRight(success), + right: _ => + either.makeLeft({ + input, + message: `input was unexpectedly followed by ${followedByName}`, + }), + }), + ) + export const map = ( parser: Parser, From e44514867c934031b33fd02d721ce5e5b0adb5d8 Mon Sep 17 00:00:00 2001 From: Matt Kantor Date: Sun, 22 Dec 2024 09:57:00 -0500 Subject: [PATCH 2/2] Add syntax for comments --- src/end-to-end.test.ts | 13 +++++ src/language/parsing/atom.ts | 74 +++++++++++++++++++++-------- src/language/parsing/molecule.ts | 21 +++++--- src/language/parsing/parentheses.ts | 10 ++-- src/language/parsing/syntax-tree.ts | 9 +++- src/language/parsing/trivia.ts | 25 ++++++++++ src/language/parsing/whitespace.ts | 3 -- 7 files changed, 117 insertions(+), 38 deletions(-) create mode 100644 src/language/parsing/trivia.ts delete mode 100644 src/language/parsing/whitespace.ts diff --git a/src/end-to-end.test.ts b/src/end-to-end.test.ts index 9a474b3..81c953e 100644 --- a/src/end-to-end.test.ts +++ b/src/end-to-end.test.ts @@ -62,6 +62,18 @@ testCases(endToEnd, code => code)('end-to-end tests', [ ['{ (a: A) (b: B) }', either.makeRight({ a: 'A', b: 'B' })], ['( { ((a): :(b)) ( ( b ): B ) } )', either.makeRight({ a: 'B', b: 'B' })], ['{ (a: :(")")), (")": (B)) }', either.makeRight({ a: 'B', ')': 'B' })], + [`/**/a/**/`, either.makeRight('a')], + ['hello//world', either.makeRight('hello')], + [`"hello//world"`, either.makeRight('hello//world')], + [`{a/* this works as a delimiter */b}`, either.makeRight({ 0: 'a', 1: 'b' })], + [ + `/**/{/**/a:/**/b/**/,/**/c:/**/d/**/}/**/`, + either.makeRight({ a: 'b', c: 'd' }), + ], + [ + `/**/(/**/a/**/=>/**/:a/**/)(/**/output/**/)/**/`, + either.makeRight('output'), + ], [':match({ a: A })({ tag: a, value: {} })', either.makeRight('A')], [':{string concatenate}(a)(b)', either.makeRight('ba')], [ @@ -70,6 +82,7 @@ testCases(endToEnd, code => code)('end-to-end tests', [ ], [ `{ + // foo: bar "static data":"blah blah blah" "evaluated data": { 0:@runtime diff --git a/src/language/parsing/atom.ts b/src/language/parsing/atom.ts index 38d31af..6a71837 100644 --- a/src/language/parsing/atom.ts +++ b/src/language/parsing/atom.ts @@ -1,5 +1,6 @@ import { parser, type Parser } from '../../parsing.js' import { optionallySurroundedByParentheses } from './parentheses.js' +import { whitespace } from './trivia.js' export type Atom = string @@ -9,31 +10,62 @@ export const isAtom = (value: unknown): value is Atom => export const unit = '' as const export const atomParser: Parser = optionallySurroundedByParentheses( - parser.map( - parser.lazy(() => parser.oneOf([quotedAtom, unquotedAtom])), - output => output.join(''), - ), + parser.lazy(() => parser.oneOf([quotedAtom, unquotedAtom])), +) + +const quotedAtom = parser.map( + parser.sequence([ + parser.as(parser.literal('"'), ''), + parser.map( + parser.zeroOrMore( + parser.oneOf([ + parser.butNot( + parser.anySingleCharacter, + parser.oneOf([parser.literal('"'), parser.literal('\\')]), + '`"` or `\\`', + ), + parser.as(parser.literal('\\"'), '"'), + parser.as(parser.literal('\\\\'), '\\'), + ]), + ), + output => output.join(''), + ), + parser.as(parser.literal('"'), ''), + ]), + ([_1, contents, _2]) => contents, ) -const quotedAtom = parser.sequence([ - parser.as(parser.literal('"'), ''), - parser.map( - parser.zeroOrMore( +const unquotedAtom = parser.map( + parser.oneOrMore( + parser.butNot( + parser.anySingleCharacter, parser.oneOf([ - parser.butNot( - parser.anySingleCharacter, - parser.oneOf([parser.literal('"'), parser.literal('\\')]), - '`"` or `\\`', - ), - parser.as(parser.literal('\\"'), '"'), - parser.as(parser.literal('\\\\'), '\\'), + whitespace, + parser.literal('"'), + parser.literal('{'), + parser.literal('}'), + parser.literal('['), + parser.literal(']'), + parser.literal('('), + parser.literal(')'), + parser.literal('<'), + parser.literal('>'), + parser.literal('#'), + parser.literal('&'), + parser.literal('|'), + parser.literal('\\'), + parser.literal('='), + parser.literal(':'), + parser.literal(';'), + parser.literal(','), + parser.literal('//'), + parser.literal('/*'), + parser.literal('*/'), ]), + 'a forbidden character sequence', ), - output => output.join(''), ), - parser.as(parser.literal('"'), ''), -]) - -const unquotedAtom = parser.oneOrMore( - parser.regularExpression(/[^\s{}[\]()<>#&\|\\=:;,]+/), + characters => characters.join(''), ) + +export { unquotedAtom as unquotedAtomParser } diff --git a/src/language/parsing/molecule.ts b/src/language/parsing/molecule.ts index e54d6f5..b970714 100644 --- a/src/language/parsing/molecule.ts +++ b/src/language/parsing/molecule.ts @@ -1,7 +1,7 @@ import { parser, type Parser } from '../../parsing.js' import { atomParser, type Atom } from './atom.js' import { optionallySurroundedByParentheses } from './parentheses.js' -import { whitespace } from './whitespace.js' +import { trivia } from './trivia.js' export type Molecule = { readonly [key: Atom]: Molecule | Atom } @@ -48,7 +48,14 @@ const makeIncrementingIndexer = (): Indexer => { // Language-specific parsers follow. -const propertyDelimiter = parser.regularExpression(/[\s,]+/) +const propertyDelimiter = parser.oneOf([ + parser.sequence([ + optional(omit(trivia)), + parser.literal(','), + optional(omit(trivia)), + ]), + trivia, +]) const sugaredLookup: Parser = optionallySurroundedByParentheses( @@ -67,9 +74,9 @@ const sugaredFunction: Parser = flat( parser.sequence([ parser.map(atomParser, output => [output]), - omit(whitespace), + omit(trivia), omit(parser.literal('=>')), - omit(whitespace), + omit(trivia), parser.map( parser.lazy(() => propertyValue), output => [output], @@ -90,9 +97,9 @@ const sugaredApply: Parser = parser.map( parser.oneOrMore( parser.sequence([ parser.literal('('), - optional(omit(whitespace)), + optional(omit(trivia)), parser.lazy(() => propertyValue), - optional(omit(whitespace)), + optional(omit(trivia)), parser.literal(')'), ]), ), @@ -120,7 +127,7 @@ const namedProperty = flat( parser.sequence([ propertyKey, omit(parser.literal(':')), - optional(omit(whitespace)), + optional(omit(trivia)), propertyValue, ]), ) diff --git a/src/language/parsing/parentheses.ts b/src/language/parsing/parentheses.ts index eacec96..abbab53 100644 --- a/src/language/parsing/parentheses.ts +++ b/src/language/parsing/parentheses.ts @@ -1,5 +1,5 @@ import { parser, type Parser } from '../../parsing.js' -import { whitespace } from './whitespace.js' +import { trivia } from './trivia.js' const optionallySurroundedBy = ( parser1: Parser, @@ -18,15 +18,15 @@ export const optionallySurroundedByParentheses = ( theParser: Parser, ): Parser => parser.oneOf([ - // This allows `theParser` to greedily consume whitespace. + // This allows `theParser` to greedily consume trivia. optionallySurroundedBy( parser.literal('('), theParser, - parser.sequence([parser.zeroOrMore(whitespace), parser.literal(')')]), + parser.sequence([parser.zeroOrMore(trivia), parser.literal(')')]), ), optionallySurroundedBy( - parser.sequence([parser.literal('('), parser.zeroOrMore(whitespace)]), + parser.sequence([parser.literal('('), parser.zeroOrMore(trivia)]), theParser, - parser.sequence([parser.zeroOrMore(whitespace), parser.literal(')')]), + parser.sequence([parser.zeroOrMore(trivia), parser.literal(')')]), ), ]) diff --git a/src/language/parsing/syntax-tree.ts b/src/language/parsing/syntax-tree.ts index b6f8167..38cef8c 100644 --- a/src/language/parsing/syntax-tree.ts +++ b/src/language/parsing/syntax-tree.ts @@ -10,6 +10,7 @@ import type { import type { KeyPath } from '../semantics.js' import { atomParser, type Atom } from './atom.js' import { moleculeParser, type Molecule } from './molecule.js' +import { trivia } from './trivia.js' declare const _canonicalized: unique symbol export type Canonicalized = { readonly [_canonicalized]: true } @@ -79,6 +80,10 @@ type JSONRecordForbiddingSymbolicKeys = { }> export const syntaxTreeParser: Parser = parser.map( - parser.oneOf([atomParser, moleculeParser]), - canonicalize, + parser.sequence([ + parser.zeroOrMore(trivia), + parser.oneOf([atomParser, moleculeParser]), + parser.zeroOrMore(trivia), + ]), + ([_leadingTrivia, syntaxTree, _trailingTrivia]) => canonicalize(syntaxTree), ) diff --git a/src/language/parsing/trivia.ts b/src/language/parsing/trivia.ts new file mode 100644 index 0000000..e8f534e --- /dev/null +++ b/src/language/parsing/trivia.ts @@ -0,0 +1,25 @@ +import { parser } from '../../parsing.js' + +const blockComment = parser.sequence([ + parser.literal('/*'), + parser.zeroOrMore( + parser.oneOf([ + parser.butNot(parser.anySingleCharacter, parser.literal('*'), '*'), + parser.lookaheadNot(parser.literal('*'), parser.literal('/'), '/'), + ]), + ), + parser.literal('*/'), +]) + +const singleLineComment = parser.sequence([ + parser.literal('//'), + parser.zeroOrMore( + parser.butNot(parser.anySingleCharacter, parser.literal('\n'), 'newline'), + ), +]) + +export const whitespace = parser.regularExpression(/\s+/) + +export const trivia = parser.oneOrMore( + parser.oneOf([whitespace, singleLineComment, blockComment]), +) diff --git a/src/language/parsing/whitespace.ts b/src/language/parsing/whitespace.ts deleted file mode 100644 index 212bb43..0000000 --- a/src/language/parsing/whitespace.ts +++ /dev/null @@ -1,3 +0,0 @@ -import { parser } from '../../parsing.js' - -export const whitespace = parser.regularExpression(/\s+/)