Skip to content

Commit d665c1b

Browse files
authored
support markdown links syntax in prompt files (microsoft#237698)
* [markdown links]: add II of `MarkdownDecoder` * [markdown links]: add `Colon` and `Hash` simple tokens, implement the new logic inside `ChatPromptDecoder` * [markdown links]: add unit tests for the `MarkdownDecoder`, update other unit tests to account for the new MD links * [markdown links]: refactor and improve docs * [markdown links]: improve unit tests of the `MarkdownDecoder` * [markdown links]: fix recursion issue caused by `##` in `ChatPromptDecoder` and improve unit tests * [markdown links]: improve docs
1 parent 9547e6d commit d665c1b

File tree

24 files changed

+1500
-67
lines changed

24 files changed

+1500
-67
lines changed

src/vs/editor/common/codecs/baseToken.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@ export abstract class BaseToken {
1818
return this._range;
1919
}
2020

21+
/**
 * Return the text representation of the token.
 *
 * The getter is declared `abstract`, so every concrete token class
 * has to provide its own implementation.
 */
24+
public abstract get text(): string;
25+
2126
/**
2227
* Check if this token has the same range as another one.
2328
*/

src/vs/editor/common/codecs/linesCodec/tokens/carriageReturn.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ export class CarriageReturn extends BaseToken {
3131
return CarriageReturn.byte;
3232
}
3333

34+
/**
 * Text of this token, which is always the static
 * `symbol` of the `CarriageReturn` class.
 */
public get text(): string {
	const { symbol } = CarriageReturn;

	return symbol;
}
40+
3441
/**
3542
* Create new `CarriageReturn` token with range inside
3643
* the given `Line` at the given `column number`.

src/vs/editor/common/codecs/linesCodec/tokens/newLine.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ export class NewLine extends BaseToken {
2424
*/
2525
public static readonly byte = VSBuffer.fromString(NewLine.symbol);
2626

27+
/**
 * Text of this token, which is always the static
 * `symbol` of the `NewLine` class.
 */
public get text(): string {
	const { symbol } = NewLine;

	return symbol;
}
33+
2734
/**
2835
* The byte representation of the token.
2936
*/
Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
/*---------------------------------------------------------------------------------------------
2+
* Copyright (c) Microsoft Corporation. All rights reserved.
3+
* Licensed under the MIT License. See License.txt in the project root for license information.
4+
*--------------------------------------------------------------------------------------------*/
5+
6+
import { MarkdownLink } from './tokens/markdownLink.js';
7+
import { NewLine } from '../linesCodec/tokens/newLine.js';
8+
import { assert } from '../../../../base/common/assert.js';
9+
import { FormFeed } from '../simpleCodec/tokens/formFeed.js';
10+
import { VSBuffer } from '../../../../base/common/buffer.js';
11+
import { VerticalTab } from '../simpleCodec/tokens/verticalTab.js';
12+
import { ReadableStream } from '../../../../base/common/stream.js';
13+
import { CarriageReturn } from '../linesCodec/tokens/carriageReturn.js';
14+
import { BaseDecoder } from '../../../../base/common/codecs/baseDecoder.js';
15+
import { LeftBracket, RightBracket } from '../simpleCodec/tokens/brackets.js';
16+
import { SimpleDecoder, TSimpleToken } from '../simpleCodec/simpleDecoder.js';
17+
import { ParserBase, TAcceptTokenResult } from '../simpleCodec/parserBase.js';
18+
import { LeftParenthesis, RightParenthesis } from '../simpleCodec/tokens/parentheses.js';
19+
20+
/**
 * Union of all token types this decoder can emit: either a simple token
 * passed through as-is, or a fully parsed {@linkcode MarkdownLink}.
 */
export type TMarkdownToken = TSimpleToken | MarkdownLink;
24+
25+
/**
 * Symbols of the tokens that abort a markdown link sequence: a link
 * can span neither a line break nor a vertical tab / form feed.
 */
const MARKDOWN_LINK_STOP_CHARACTERS: readonly string[] = [
	CarriageReturn.symbol,
	NewLine.symbol,
	VerticalTab.symbol,
	FormFeed.symbol,
];
30+
31+
/**
 * Parser for the `markdown link caption` part of a markdown link,
 * i.e., the `[caption text]` part of the `[caption text](./some/path)` link.
 *
 * Parsing starts with a single `[` token and every following token is collected
 * until the first `]` token arrives. On that token the parser hands over to
 * the {@linkcode MarkdownLinkCaption} parser, which continues the general
 * parsing process of the markdown link.
 *
 * If one of the {@linkcode MARKDOWN_LINK_STOP_CHARACTERS} is encountered before
 * the `]` token, parsing is aborted and a `failure` result is returned. The caller
 * is then responsible for re-emitting the {@link tokens} accumulated so far as
 * standalone entities, since they no longer represent a coherent larger token.
 */
class PartialMarkdownLinkCaption extends ParserBase<TSimpleToken, PartialMarkdownLinkCaption | MarkdownLinkCaption> {
	constructor(token: LeftBracket) {
		super([token]);
	}

	/**
	 * Feed the next token of the stream into the caption parser.
	 *
	 * @param token The next simple token of the stream.
	 * @returns A `failure` result (token not consumed) on a stop character,
	 *          otherwise a `success` result carrying the parser to use next.
	 */
	public accept(token: TSimpleToken): TAcceptTokenResult<PartialMarkdownLinkCaption | MarkdownLinkCaption> {
		// a stop character breaks the markdown link caption sequence
		const isStopCharacter = MARKDOWN_LINK_STOP_CHARACTERS.includes(token.text);
		if (isStopCharacter) {
			return { result: 'failure', wasTokenConsumed: false };
		}

		// the `]` character closes the caption, so hand all the tokens
		// collected so far (plus the `]` itself) to the next parser
		if (token instanceof RightBracket) {
			const captionTokens = [...this.tokens, token];

			return {
				result: 'success',
				wasTokenConsumed: true,
				nextParser: new MarkdownLinkCaption(captionTokens),
			};
		}

		// any other token belongs to the caption text; this
		// very parser instance keeps handling the sequence
		this.currentTokens.push(token);

		return { result: 'success', wasTokenConsumed: true, nextParser: this };
	}
}
79+
80+
/**
 * Parser that glues the `[caption]` and the `(./some/path)` parts of
 * the `[caption](./some/path)` link together: it takes over from
 * the {@linkcode PartialMarkdownLinkCaption} parser and hands over
 * to the {@link PartialMarkdownLink} one.
 *
 * The only token that succeeds here is `(`, which starts the `reference`
 * part of a markdown link; on it the parser transitions into
 * the `PartialMarkdownLink` parser type.
 *
 * Any other token yields a `failure` result. The caller is then responsible
 * for re-emitting the {@link tokens} accumulated so far as standalone entities,
 * since they no longer represent a coherent larger token.
 */
class MarkdownLinkCaption extends ParserBase<TSimpleToken, MarkdownLinkCaption | PartialMarkdownLink> {
	/**
	 * Feed the token that directly follows the `[caption]` part.
	 *
	 * @param token The next simple token of the stream.
	 * @returns A `success` result with a new `PartialMarkdownLink` parser
	 *          for the `(` token, a `failure` result (token not consumed)
	 *          for anything else.
	 */
	public accept(token: TSimpleToken): TAcceptTokenResult<MarkdownLinkCaption | PartialMarkdownLink> {
		// nothing but the `(` character can follow the caption
		if (!(token instanceof LeftParenthesis)) {
			return { result: 'failure', wasTokenConsumed: false };
		}

		// the `(` character starts the link reference part
		const referenceParser = new PartialMarkdownLink([...this.tokens], token);

		return {
			result: 'success',
			wasTokenConsumed: true,
			nextParser: referenceParser,
		};
	}
}
111+
112+
/**
 * Parser for the `link reference` part of a markdown link,
 * i.e., the `(./some/path)` part of the `[caption text](./some/path)` link.
 *
 * An instance is created from the tokens of the `[caption]` part of a markdown
 * link plus the `(` token that follows it. All subsequent tokens are collected until
 * the final closing parenthesis (`)`) is encountered (*\*see [1] below*). On that token
 * the parser yields a complete {@linkcode MarkdownLink} token, which ends the entire
 * parsing process of the link text.
 *
 * If one of the {@linkcode MARKDOWN_LINK_STOP_CHARACTERS} is encountered before the final
 * `)` token, parsing is aborted and a `failure` result is returned. The caller is then
 * responsible for re-emitting the {@link tokens} accumulated so far as standalone entities,
 * since they no longer represent a coherent larger token.
 *
 * `[1]` The `reference` part of a markdown link can contain nested parentheses, e.g.,
 *       `[caption](/some/p()th/file.md)` is a valid markdown link, hence the number of
 *       open parentheses must match the number of closing ones, and the reference is
 *       considered complete as soon as this requirement is met. That is why the word
 *       `final` is used in the description above.
 */
class PartialMarkdownLink extends ParserBase<TSimpleToken, PartialMarkdownLink | MarkdownLink> {
	/**
	 * Number of parentheses that are currently open in the sequence; starts
	 * at `1` to account for the `(` token the parser is constructed with.
	 */
	private openParensCount: number = 1;

	constructor(
		protected readonly captionTokens: TSimpleToken[],
		token: LeftParenthesis,
	) {
		super([token]);
	}

	/**
	 * All tokens accumulated so far: the caption ones
	 * followed by the ones of the reference part.
	 */
	public override get tokens(): readonly TSimpleToken[] {
		return [...this.captionTokens, ...this.currentTokens];
	}

	/**
	 * Feed the next token of the link reference part.
	 *
	 * Nested parentheses are allowed inside the reference, but they must be balanced:
	 *  - `[caption](/some/p()th/file.md)` is a valid markdown link
	 *  - `[caption](/some/p(th/file.md)` is an invalid markdown link
	 *
	 * @param token The next simple token of the stream.
	 * @returns A `success` result with a `MarkdownLink` once the parentheses are balanced,
	 *          a `success` result with this parser while the reference is still open, or
	 *          a `failure` result (token not consumed) on a stop character.
	 */
	public accept(token: TSimpleToken): TAcceptTokenResult<PartialMarkdownLink | MarkdownLink> {
		if (token instanceof LeftParenthesis) {
			this.openParensCount++;
		} else if (token instanceof RightParenthesis) {
			this.openParensCount--;

			// sanity check! the counter can never drop below 0 because a complete
			// link is returned as soon as the parentheses become balanced
			assert(
				this.openParensCount >= 0,
				`Unexpected right parenthesis token encountered: '${token}'.`,
			);

			// balanced parentheses mean that the markdown link is complete
			if (this.openParensCount === 0) {
				return this.completeLink(token);
			}
		}

		// a stop character breaks the markdown link reference sequence
		const isStopCharacter = MARKDOWN_LINK_STOP_CHARACTERS.includes(token.text);
		if (isStopCharacter) {
			return { result: 'failure', wasTokenConsumed: false };
		}

		// any other token is a part of the link reference
		this.currentTokens.push(token);

		return { result: 'success', wasTokenConsumed: true, nextParser: this };
	}

	/**
	 * Build the final `MarkdownLink` out of the accumulated
	 * tokens and the closing `)` token.
	 */
	private completeLink(closingToken: RightParenthesis): TAcceptTokenResult<PartialMarkdownLink | MarkdownLink> {
		// the link starts where its caption starts
		const { startLineNumber, startColumn } = this.captionTokens[0].range;

		// link caption string
		const caption = this.captionTokens
			.map((part) => part.text)
			.join('');

		// link reference string, including the closing parenthesis
		this.currentTokens.push(closingToken);
		const reference = this.currentTokens
			.map((part) => part.text)
			.join('');

		const link = new MarkdownLink(startLineNumber, startColumn, caption, reference);

		return { result: 'success', wasTokenConsumed: true, nextParser: link };
	}
}
222+
223+
/**
 * Decoder capable of parsing markdown entities (e.g., links) from a sequence of simpler tokens.
 *
 * Tokens that are not part of a markdown link are re-emitted unchanged; a token
 * sequence that forms a complete link is emitted as a single `MarkdownLink` token.
 */
export class MarkdownDecoder extends BaseDecoder<TMarkdownToken, TSimpleToken> {
	/**
	 * Current parser object that is responsible for parsing a sequence of tokens
	 * into some markdown entity. Not set when no link sequence is in progress.
	 */
	private current?: PartialMarkdownLinkCaption | MarkdownLinkCaption | PartialMarkdownLink;

	constructor(
		stream: ReadableStream<VSBuffer>,
	) {
		super(new SimpleDecoder(stream));
	}

	/**
	 * Handle the next simple token received from the underlying decoder.
	 *
	 * @param token The next simple token of the stream.
	 */
	protected override onStreamData(token: TSimpleToken): void {
		// markdown links start with `[` character, so here we can
		// initiate the process of parsing a markdown link
		if (token instanceof LeftBracket && !this.current) {
			this.current = new PartialMarkdownLinkCaption(token);

			return;
		}

		// if current parser was not initiated before, we are not inside
		// a sequence of tokens we care about, therefore re-emit the token
		// immediately and continue to the next one
		if (!this.current) {
			this._onData.fire(token);
			return;
		}

		// if there is a current parser object, submit the token to it
		// so it can progress with parsing the tokens sequence
		const parseResult = this.current.accept(token);
		if (parseResult.result === 'success') {
			// if got a parsed out `MarkdownLink` back, emit it
			// then reset the current parser object
			if (parseResult.nextParser instanceof MarkdownLink) {
				this._onData.fire(parseResult.nextParser);
				delete this.current;
			} else {
				// otherwise, update the current parser object
				this.current = parseResult.nextParser;
			}
		} else {
			// if failed to parse a sequence of tokens as a single markdown
			// entity (e.g., a link), reset the current parser object and
			// re-emit the tokens accumulated so far
			this.flushCurrent();
		}

		// if token was not consumed by the parser, call `onStreamData` again
		// so the token is properly handled by the decoder in the case when a
		// new sequence starts with this token
		if (!parseResult.wasTokenConsumed) {
			this.onStreamData(token);
		}
	}

	/**
	 * Handle the end of the underlying stream.
	 */
	protected override onStreamEnd(): void {
		// if the stream has ended and there is a current incomplete parser
		// object present, then re-emit its tokens as standalone entities
		this.flushCurrent();

		super.onStreamEnd();
	}

	/**
	 * Reset the current parser object (if any) and re-emit the tokens
	 * it has accumulated so far as standalone entities.
	 *
	 * The parser state is reset *before* the first token is emitted, so
	 * the decoder is already back in its "no sequence in progress" state
	 * while listeners run, even if one of them throws.
	 */
	private flushCurrent(): void {
		if (!this.current) {
			return;
		}

		// snapshot the tokens first, because the parser
		// object is dropped before they are emitted
		const tokens = [...this.current.tokens];
		delete this.current;

		for (const token of tokens) {
			this._onData.fire(token);
		}
	}
}

0 commit comments

Comments
 (0)