|
| 1 | +/*--------------------------------------------------------------------------------------------- |
| 2 | + * Copyright (c) Microsoft Corporation. All rights reserved. |
| 3 | + * Licensed under the MIT License. See License.txt in the project root for license information. |
| 4 | + *--------------------------------------------------------------------------------------------*/ |
| 5 | + |
| 6 | +import { MarkdownLink } from './tokens/markdownLink.js'; |
| 7 | +import { NewLine } from '../linesCodec/tokens/newLine.js'; |
| 8 | +import { assert } from '../../../../base/common/assert.js'; |
| 9 | +import { FormFeed } from '../simpleCodec/tokens/formFeed.js'; |
| 10 | +import { VSBuffer } from '../../../../base/common/buffer.js'; |
| 11 | +import { VerticalTab } from '../simpleCodec/tokens/verticalTab.js'; |
| 12 | +import { ReadableStream } from '../../../../base/common/stream.js'; |
| 13 | +import { CarriageReturn } from '../linesCodec/tokens/carriageReturn.js'; |
| 14 | +import { BaseDecoder } from '../../../../base/common/codecs/baseDecoder.js'; |
| 15 | +import { LeftBracket, RightBracket } from '../simpleCodec/tokens/brackets.js'; |
| 16 | +import { SimpleDecoder, TSimpleToken } from '../simpleCodec/simpleDecoder.js'; |
| 17 | +import { ParserBase, TAcceptTokenResult } from '../simpleCodec/parserBase.js'; |
| 18 | +import { LeftParenthesis, RightParenthesis } from '../simpleCodec/tokens/parentheses.js'; |
| 19 | + |
/**
 * Tokens emitted by the markdown decoder: either a fully parsed
 * {@linkcode MarkdownLink} or any of the simple tokens passed through as is.
 */
export type TMarkdownToken = MarkdownLink | TSimpleToken;

/**
 * Symbols of the tokens that terminate a markdown link sequence; a link
 * that is still incomplete when one of these is encountered is abandoned.
 */
const MARKDOWN_LINK_STOP_CHARACTERS: readonly string[] = [
	CarriageReturn,
	NewLine,
	VerticalTab,
	FormFeed,
].map(({ symbol }) => symbol);
| 30 | + |
/**
 * Parser for the `caption` part of a markdown link, i.e., the `[caption text]`
 * part of the `[caption text](./some/path)` link.
 *
 * The parser is created from a single `[` token and accumulates every token it
 * receives until the first `]` token arrives. At that point it hands over all
 * the collected tokens (including the closing `]`) to a new
 * {@linkcode MarkdownLinkCaption} parser which continues parsing the link.
 *
 * If one of the {@linkcode MARKDOWN_LINK_STOP_CHARACTERS} arrives before the `]`
 * token, parsing is aborted with a `failure` result and the offending token is
 * reported as not consumed. The caller is then expected to re-emit the
 * {@link tokens} accumulated so far as standalone entities, since they no longer
 * represent a single coherent larger token.
 */
class PartialMarkdownLinkCaption extends ParserBase<TSimpleToken, PartialMarkdownLinkCaption | MarkdownLinkCaption> {
	constructor(token: LeftBracket) {
		super([token]);
	}

	/**
	 * Feed the next token of the stream into the caption sequence.
	 *
	 * @param token The next simple token of the stream.
	 * @returns `failure` for a stop character, otherwise `success` with either this
	 *          same parser (caption still open) or a {@linkcode MarkdownLinkCaption}
	 *          parser (caption closed by `]`).
	 */
	public accept(token: TSimpleToken): TAcceptTokenResult<PartialMarkdownLinkCaption | MarkdownLinkCaption> {
		// a stop character breaks the caption sequence; leave the token
		// unconsumed so that the caller can handle it on its own
		const isStopCharacter = MARKDOWN_LINK_STOP_CHARACTERS.includes(token.text);
		if (isStopCharacter) {
			return { result: 'failure', wasTokenConsumed: false };
		}

		// anything but the `]` character is a part of the caption,
		// so remember the token and keep using this parser instance
		if (!(token instanceof RightBracket)) {
			this.currentTokens.push(token);

			return { result: 'success', nextParser: this, wasTokenConsumed: true };
		}

		// the `]` character completes the caption
		const captionTokens = [...this.tokens, token];

		return {
			result: 'success',
			nextParser: new MarkdownLinkCaption(captionTokens),
			wasTokenConsumed: true,
		};
	}
}
| 79 | + |
/**
 * Parser that glues the `[caption]` and the `(./some/path)` parts of the
 * `[caption](./some/path)` link together: it is created once the caption is
 * complete and decides whether a link reference follows it.
 *
 * The only token accepted by this parser is `(`, which starts the `reference`
 * part of the link; in that case the parser transitions into the
 * {@linkcode PartialMarkdownLink} parser.
 *
 * Any other token yields a `failure` result and is reported as not consumed.
 * The caller is then expected to re-emit the {@link tokens} accumulated so far
 * as standalone entities, since they no longer represent a single coherent
 * larger token.
 */
class MarkdownLinkCaption extends ParserBase<TSimpleToken, MarkdownLinkCaption | PartialMarkdownLink> {
	/**
	 * Check whether the token immediately following the caption starts a link reference.
	 *
	 * @param token The next simple token of the stream.
	 * @returns `success` with a {@linkcode PartialMarkdownLink} parser for the `(` token,
	 *          `failure` for any other token.
	 */
	public accept(token: TSimpleToken): TAcceptTokenResult<MarkdownLinkCaption | PartialMarkdownLink> {
		// only the `(` character can follow the caption of a markdown link
		if (!(token instanceof LeftParenthesis)) {
			return { result: 'failure', wasTokenConsumed: false };
		}

		// hand a copy of the caption tokens over to the reference parser
		const captionTokens = [...this.tokens];

		return {
			result: 'success',
			wasTokenConsumed: true,
			nextParser: new PartialMarkdownLink(captionTokens, token),
		};
	}
}
| 111 | + |
/**
 * Parser for the `link reference` part of a markdown link, i.e., the
 * `(./some/path)` part of the `[caption text](./some/path)` link.
 *
 * The parser is created from the tokens of the `[caption]` part plus the opening
 * `(` token. It accumulates all subsequent tokens until the *final* closing
 * parenthesis (`)`) is encountered (see `[1]` below), at which point it produces
 * the complete {@linkcode MarkdownLink} token and the parsing of the link is over.
 *
 * If one of the {@linkcode MARKDOWN_LINK_STOP_CHARACTERS} arrives before the
 * final `)` token, parsing is aborted with a `failure` result and the offending
 * token is reported as not consumed. The caller is then expected to re-emit the
 * {@link tokens} accumulated so far as standalone entities, since they no longer
 * represent a single coherent larger token.
 *
 * `[1]` The `reference` part may contain nested parentheses as long as they are
 *       balanced, e.g., `[caption](/some/p()th/file.md)` is a complete link, while
 *       `[caption](/some/p(th/file.md)` is not, because its only `)` closes the
 *       nested `(` rather than the reference itself. The reference is therefore
 *       complete as soon as the number of closing parentheses matches the number
 *       of opening ones, hence the word `final` above.
 */
class PartialMarkdownLink extends ParserBase<TSimpleToken, PartialMarkdownLink | MarkdownLink> {
	/**
	 * Number of parentheses that are currently open in the sequence; starts
	 * at `1` to account for the `(` token the parser is created with.
	 */
	private openParensCount: number = 1;

	/**
	 * @param captionTokens Tokens of the already parsed `[caption]` part of the link.
	 * @param token The `(` token that opens the reference part of the link.
	 */
	constructor(
		protected readonly captionTokens: TSimpleToken[],
		token: LeftParenthesis,
	) {
		super([token]);
	}

	/**
	 * All tokens of the link accumulated so far: the caption
	 * tokens followed by the tokens of the reference part.
	 */
	public override get tokens(): readonly TSimpleToken[] {
		return this.captionTokens.concat(this.currentTokens);
	}

	/**
	 * Feed the next token of the stream into the link reference sequence.
	 *
	 * @param token The next simple token of the stream.
	 * @returns `success` with the complete {@linkcode MarkdownLink} when the final `)`
	 *          is received, `success` with this same parser while the reference is
	 *          still open, or `failure` when a stop character is received.
	 */
	public accept(token: TSimpleToken): TAcceptTokenResult<PartialMarkdownLink | MarkdownLink> {
		// every nested `(` must be matched by its own `)` before
		// the reference part of the link can be closed
		if (token instanceof LeftParenthesis) {
			this.openParensCount++;
		}

		if (token instanceof RightParenthesis) {
			this.openParensCount--;

			// sanity check! the link is returned as soon as the parentheses get
			// balanced, hence the counter must never drop below zero
			assert(
				this.openParensCount >= 0,
				`Unexpected right parenthesis token encountered: '${token}'.`,
			);

			// all parentheses are balanced - the link is complete
			if (this.openParensCount === 0) {
				return {
					result: 'success',
					wasTokenConsumed: true,
					nextParser: this.createLink(token),
				};
			}
		}

		// a stop character breaks the link reference sequence
		if (MARKDOWN_LINK_STOP_CHARACTERS.includes(token.text)) {
			return { result: 'failure', wasTokenConsumed: false };
		}

		// any other token is a part of the link reference
		this.currentTokens.push(token);

		return { result: 'success', nextParser: this, wasTokenConsumed: true };
	}

	/**
	 * Append the final `)` token to the reference part and build
	 * the complete link out of all the accumulated tokens.
	 */
	private createLink(closingParenthesis: RightParenthesis): MarkdownLink {
		// the link starts where its caption starts
		const { startLineNumber, startColumn } = this.captionTokens[0].range;

		const caption = this.captionTokens
			.map(({ text }) => text)
			.join('');

		this.currentTokens.push(closingParenthesis);
		const reference = this.currentTokens
			.map(({ text }) => text)
			.join('');

		return new MarkdownLink(startLineNumber, startColumn, caption, reference);
	}
}
| 222 | + |
/**
 * Decoder capable of parsing markdown entities (e.g., links) from a sequence of simpler tokens.
 *
 * The provided binary stream is wrapped into a {@linkcode SimpleDecoder}, and the simple
 * tokens received from it are either combined into a {@linkcode MarkdownLink} token or
 * re-emitted as is when they do not form a complete markdown link.
 */
export class MarkdownDecoder extends BaseDecoder<TMarkdownToken, TSimpleToken> {
	/**
	 * Current parser object that is responsible for parsing a sequence of tokens
	 * into some markdown entity. Not set when the decoder is not inside a sequence
	 * that can potentially become a markdown link.
	 */
	private current?: PartialMarkdownLinkCaption | MarkdownLinkCaption | PartialMarkdownLink;

	/**
	 * @param stream Binary stream to decode the markdown tokens from.
	 */
	constructor(
		stream: ReadableStream<VSBuffer>,
	) {
		super(new SimpleDecoder(stream));
	}

	/**
	 * Handle a single simple token received from the underlying decoder.
	 *
	 * @param token The next simple token of the stream.
	 */
	protected override onStreamData(token: TSimpleToken): void {
		// markdown links start with `[` character, so here we can
		// initiate the process of parsing a markdown link
		if (token instanceof LeftBracket && !this.current) {
			this.current = new PartialMarkdownLinkCaption(token);

			return;
		}

		// if current parser was not initiated before, - we are not inside a
		// sequence of tokens we care about, therefore re-emit the token
		// immediately and continue to the next one
		if (!this.current) {
			this._onData.fire(token);
			return;
		}

		// if there is a current parser object, submit the token to it
		// so it can progress with parsing the tokens sequence
		const parseResult = this.current.accept(token);
		if (parseResult.result === 'success') {
			const { nextParser } = parseResult;

			if (nextParser instanceof MarkdownLink) {
				// got a parsed out `MarkdownLink` back - reset the current
				// parser object first so the decoder state is consistent
				// by the time the link is emitted
				delete this.current;
				this._onData.fire(nextParser);
			} else {
				// otherwise, update the current parser object
				this.current = nextParser;
			}
		} else {
			// if failed to parse a sequence of tokens as a single markdown
			// entity (e.g., a link), re-emit the tokens accumulated so far
			// and reset the current parser object
			this.reemitCurrentTokens();
		}

		// if token was not consumed by the parser, call `onStreamData` again
		// so the token is properly handled by the decoder in the case when a
		// new sequence starts with this token; a token is left unconsumed only
		// on failure, in which case the current parser was reset above, so the
		// call either starts a new sequence or re-emits the token as is
		if (!parseResult.wasTokenConsumed) {
			this.onStreamData(token);
		}
	}

	/**
	 * Handle the end of the underlying stream.
	 */
	protected override onStreamEnd(): void {
		// if the stream has ended and there is a current incomplete parser
		// object present, then re-emit its tokens as standalone entities
		this.reemitCurrentTokens();

		super.onStreamEnd();
	}

	/**
	 * Reset the current parser object (if any) and re-emit all the tokens
	 * it has accumulated so far as standalone entities.
	 *
	 * The tokens are captured and the parser is reset exactly once *before*
	 * anything is emitted, so the decoder never emits a token while still
	 * referencing a parser whose sequence was already abandoned.
	 */
	private reemitCurrentTokens(): void {
		if (!this.current) {
			return;
		}

		const { tokens } = this.current;
		delete this.current;

		for (const accumulatedToken of tokens) {
			this._onData.fire(accumulatedToken);
		}
	}
}
0 commit comments