|
| 1 | +/** |
| 2 | + * Terminology inspired by https://www.rfc-editor.org/rfc/rfc9535.html |
| 3 | + * |
| 4 | + * jsonpath-query = segment* |
| 5 | + * segment = .name-shorthand / bracketed-selection |
| 6 | + * bracketed-selection = ['name-selector'] / ["name-selector"] / [index-selector] |
| 7 | + * |
| 8 | + * Useful references: |
| 9 | + * - https://goessner.net/articles/JsonPath/ |
| 10 | + * - https://jsonpath.com/ |
| 11 | + * - https://github.com/jsonpath-standard |
| 12 | + */ |
| 13 | + |
/**
 * Mutable state shared between parseJsonPath and the token predicates while a path is being scanned.
 */
interface ParsingContext {
  // Quote character that opened the quoted name currently being read;
  // set on QUOTE_START, reset to undefined on QUOTE_END
  quote: string | undefined
  // Characters read so far after a backslash (without the backslash itself);
  // undefined when no escape sequence is pending
  escapeSequence: string | undefined
}
| 18 | + |
/**
 * Extract selectors from a simple JSON path expression, return [] for an invalid path
 *
 * The path is read one character at a time. Each character is classified as the first token,
 * among the tokens allowed after the previous one, whose predicate accepts it. A character that
 * matches no allowed token makes the whole path invalid.
 *
 * Supports:
 * - Dot notation: `foo.bar.baz`
 * - Bracket notation: `['foo']["bar"]`
 * - Array indices: `items[0]`, `data['users'][1]`
 * - Escape sequences inside quoted names: `['qu\'x']`, `['\u0041']`
 * - Empty quoted names: `foo['']`
 *
 * Examples:
 * parseJsonPath("['foo'].bar[12]")
 * => ['foo', 'bar', '12']
 *
 * parseJsonPath("foo['']")
 * => ['foo', '']
 *
 * parseJsonPath("['foo")
 * => []
 *
 * @param path - JSON path expression to parse
 * @returns the selectors in path order (array indices are returned as strings), or [] when the
 * path is empty or invalid
 */
export function parseJsonPath(path: string): string[] {
  const selectors: string[] = []
  let previousToken = Token.START
  let currentToken: Token | undefined
  const parsingContext: ParsingContext = { quote: undefined, escapeSequence: undefined }
  let currentSelector = ''
  for (const char of path) {
    // find which kind of token is this char
    currentToken = ALLOWED_NEXT_TOKENS[previousToken].find((token) => TOKEN_PREDICATE[token](char, parsingContext))
    // compare with undefined rather than relying on falsiness: Token is a numeric enum and its
    // first member is 0
    if (currentToken === undefined) {
      return []
    }
    if (parsingContext.escapeSequence !== undefined && currentToken !== Token.ESCAPE_SEQUENCE_CHAR) {
      // the pending escape sequence is complete: validate it and append the char it stands for
      if (!isValidEscapeSequence(parsingContext.escapeSequence)) {
        return []
      }
      currentSelector += resolveEscapeSequence(parsingContext.escapeSequence)
      parsingContext.escapeSequence = undefined
    }
    if (ALLOWED_SELECTOR_TOKENS.includes(currentToken)) {
      // buffer the char if it belongs to the selector
      // ex: the chars of `foo` and `bar` in foo['bar']
      currentSelector += char
    } else if (ALLOWED_SELECTOR_DELIMITER_TOKENS.includes(currentToken)) {
      // close the current selector if we have reached a selector delimiter
      // ex: the `.`, `[` and `]` in foo.bar['qux']
      // A quoted name may be empty (`['']`), so a delimiter that directly follows a closing quote
      // always closes a selector, even when nothing was buffered.
      if (currentSelector !== '' || previousToken === Token.QUOTE_END) {
        selectors.push(currentSelector)
        currentSelector = ''
      }
    } else if (currentToken === Token.ESCAPE_SEQUENCE_CHAR) {
      parsingContext.escapeSequence =
        parsingContext.escapeSequence === undefined ? char : `${parsingContext.escapeSequence}${char}`
    } else if (currentToken === Token.QUOTE_START) {
      parsingContext.quote = char
    } else if (currentToken === Token.QUOTE_END) {
      parsingContext.quote = undefined
    }
    previousToken = currentToken
  }
  // the path must stop on a token after which the end of input is allowed
  if (!ALLOWED_NEXT_TOKENS[previousToken].includes(Token.END)) {
    return []
  }
  // a trailing name-shorthand selector (ex: `bar` in foo.bar) has no closing delimiter
  if (currentSelector !== '') {
    selectors.push(currentSelector)
  }
  return selectors
}
| 81 | + |
/**
 * List of all tokens in the path
 *
 * @example Token sequence for the path foo.bar['qu\'x'][0]
 *
 *  1. START (before first char)
 *  2. NAME_SHORTHAND_FIRST_CHAR: f
 *  3. NAME_SHORTHAND_CHAR: oo
 *  4. DOT: .
 *  5. NAME_SHORTHAND_FIRST_CHAR: b
 *  6. NAME_SHORTHAND_CHAR: ar
 *  7. BRACKET_START: [
 *  8. QUOTE_START: '
 *  9. NAME_SELECTOR_CHAR: qu
 * 10. ESCAPE: \
 * 11. ESCAPE_SEQUENCE_CHAR: '
 * 12. NAME_SELECTOR_CHAR: x
 * 13. QUOTE_END: '
 * 14. BRACKET_END: ]
 * 15. BRACKET_START: [
 * 16. DIGIT: 0
 * 17. BRACKET_END: ]
 * 18. END (after last char)
 */
const enum Token {
  // virtual tokens: they mark the boundaries of the path and never match a char
  START,
  END,

  // dot notation, ex: foo.bar
  NAME_SHORTHAND_FIRST_CHAR,
  NAME_SHORTHAND_CHAR,
  DOT,

  // bracket notation and array indices, ex: [0]
  BRACKET_START,
  BRACKET_END,
  DIGIT,

  // quoted names inside brackets, ex: ['foo'] or ["foo"]
  QUOTE_START,
  QUOTE_END,
  NAME_SELECTOR_CHAR,
  // backslash introducing an escape sequence, then the char(s) of that sequence
  ESCAPE,
  ESCAPE_SEQUENCE_CHAR,
}
| 125 | + |
// First char of a name used in dot notation: ASCII letter, `_` or `$` (no digit)
const NAME_SHORTHAND_FIRST_CHAR_REGEX = /[a-zA-Z_$]/
// Following chars of a name used in dot notation: same as above, plus digits
const NAME_SHORTHAND_CHAR_REGEX = /[a-zA-Z0-9_$]/
// Single decimal digit of an array index
const DIGIT_REGEX = /[0-9]/
// Single hexadecimal digit, as found after `\u` in a Unicode escape sequence
const UNICODE_CHAR_REGEX = /[a-fA-F0-9]/
// Chars that can open a quoted name: single quote and double quote
const QUOTE_CHARS = '\'"'
| 131 | + |
/**
 * For each token, tells whether a given char can be read as that token.
 *
 * The parsing context is only consulted by the predicates that depend on what was read before:
 * the closing quote must match the opening one, and the chars accepted in an escape sequence
 * depend on the chars of the sequence read so far.
 */
const TOKEN_PREDICATE: Record<Token, (char: string, parsingContext: ParsingContext) => boolean> = {
  // START and END are virtual tokens: no char ever matches them
  [Token.START]: () => false,
  [Token.END]: () => false,

  [Token.NAME_SHORTHAND_FIRST_CHAR]: (c) => NAME_SHORTHAND_FIRST_CHAR_REGEX.test(c),
  [Token.NAME_SHORTHAND_CHAR]: (c) => NAME_SHORTHAND_CHAR_REGEX.test(c),
  [Token.DOT]: (c) => c === '.',

  [Token.BRACKET_START]: (c) => c === '[',
  [Token.BRACKET_END]: (c) => c === ']',
  [Token.DIGIT]: (c) => DIGIT_REGEX.test(c),

  [Token.QUOTE_START]: (c) => QUOTE_CHARS.includes(c),
  // only the quote char that opened the name can close it
  [Token.QUOTE_END]: (c, context) => context.quote === c,
  // any char can be used in name selector
  [Token.NAME_SELECTOR_CHAR]: () => true,
  [Token.ESCAPE]: (c) => c === '\\',
  [Token.ESCAPE_SEQUENCE_CHAR]: (c, context) => {
    const pendingSequence = context.escapeSequence
    if (pendingSequence === undefined) {
      // first char after the backslash: the current quote char or one of the escapable chars
      // see https://www.rfc-editor.org/rfc/rfc9535.html#name-semantics-3
      return `${context.quote}/\\bfnrtu`.includes(c)
    }
    // following chars: only the hexadecimal digits of a Unicode escape, up to 4 of them after `u`
    const isIncompleteUnicodeEscape = pendingSequence.startsWith('u') && pendingSequence.length < 5
    return isIncompleteUnicodeEscape && UNICODE_CHAR_REGEX.test(c)
  },
}
| 159 | + |
/**
 * For each token, the tokens allowed to come right after it.
 *
 * Order matters inside each list: parseJsonPath keeps the first token of the list whose predicate
 * accepts the char. NAME_SELECTOR_CHAR accepts any char, so it is listed after the more specific
 * ESCAPE and QUOTE_END tokens.
 *
 * A list containing Token.END means that the path is allowed to stop after that token.
 */
const ALLOWED_NEXT_TOKENS: { [token in Token]: Token[] } = {
  [Token.START]: [Token.NAME_SHORTHAND_FIRST_CHAR, Token.BRACKET_START],
  [Token.END]: [],

  [Token.NAME_SHORTHAND_FIRST_CHAR]: [Token.NAME_SHORTHAND_CHAR, Token.DOT, Token.BRACKET_START, Token.END],
  [Token.NAME_SHORTHAND_CHAR]: [Token.NAME_SHORTHAND_CHAR, Token.DOT, Token.BRACKET_START, Token.END],
  [Token.DOT]: [Token.NAME_SHORTHAND_FIRST_CHAR],

  [Token.BRACKET_START]: [Token.QUOTE_START, Token.DIGIT],
  [Token.BRACKET_END]: [Token.DOT, Token.BRACKET_START, Token.END],
  [Token.DIGIT]: [Token.DIGIT, Token.BRACKET_END],

  [Token.QUOTE_START]: [Token.ESCAPE, Token.QUOTE_END, Token.NAME_SELECTOR_CHAR],
  [Token.QUOTE_END]: [Token.BRACKET_END],
  [Token.NAME_SELECTOR_CHAR]: [Token.ESCAPE, Token.QUOTE_END, Token.NAME_SELECTOR_CHAR],
  [Token.ESCAPE]: [Token.ESCAPE_SEQUENCE_CHAR],
  [Token.ESCAPE_SEQUENCE_CHAR]: [Token.ESCAPE_SEQUENCE_CHAR, Token.ESCAPE, Token.QUOTE_END, Token.NAME_SELECTOR_CHAR],
}
| 178 | + |
// Tokens whose char is appended as is to the selector being built
// ex: in foo['bar\n'][12], the chars of `foo`, `bar` and `12`
// Escape sequence chars are not listed here: they are buffered apart and resolved by parseJsonPath
const ALLOWED_SELECTOR_TOKENS = [
  Token.NAME_SHORTHAND_FIRST_CHAR,
  Token.NAME_SHORTHAND_CHAR,
  Token.DIGIT,
  Token.NAME_SELECTOR_CHAR,
]

// Tokens that close the selector being built
// ex: in foo.bar['qux'], the `.` closes `foo`, the `[` closes `bar` and the `]` closes `qux`
const ALLOWED_SELECTOR_DELIMITER_TOKENS = [Token.DOT, Token.BRACKET_START, Token.BRACKET_END]
| 191 | + |
/**
 * Tell whether the chars following a backslash form a complete escape sequence.
 *
 * Accepted sequences are exactly:
 * - one of the single chars `"`, `'`, `/`, `\`, `b`, `f`, `n`, `r`, `t`
 * - `u` followed by exactly 4 hexadecimal digits
 *
 * @param escapeSequence - the escape sequence, without its leading backslash
 * @returns true when the sequence is complete and valid
 */
function isValidEscapeSequence(escapeSequence: string): boolean {
  if (escapeSequence.length === 1) {
    // the length check matters: `includes` alone is a substring test, which would also accept
    // the empty string and runs of chars such as `bf`
    return '"\'/\\bfnrt'.includes(escapeSequence)
  }
  // check the digits as well as the length, so that the code can always be parsed as hexadecimal
  return /^u[0-9a-fA-F]{4}$/.test(escapeSequence)
}
| 195 | + |
// Char produced by each single-char escape sequence, keyed by the char following the backslash
const ESCAPED_CHARS: Record<string, string> = {
  '"': '"',
  "'": "'",
  '/': '/',
  '\\': '\\',
  b: '\b',
  f: '\f',
  n: '\n',
  r: '\r',
  t: '\t',
}

/**
 * Turn an escape sequence into the char it stands for.
 *
 * @param escapeSequence - the escape sequence, without its leading backslash (ex: `n`, `u00e9`)
 * @returns the resolved char
 */
function resolveEscapeSequence(escapeSequence: string): string {
  const isUnicodeEscape = escapeSequence.startsWith('u')
  if (!isUnicodeEscape) {
    return ESCAPED_CHARS[escapeSequence]
  }
  // everything after the `u` is the hexadecimal code of the char
  const hexCode = escapeSequence.slice(1)
  const charCode = parseInt(hexCode, 16)
  return String.fromCharCode(charCode)
}
0 commit comments