Skip to content

Commit 8a2e9fb

Browse files
committed
refactor: keep working on wip docstring parser
The parser is mostly done. This commit will be amended/fixed up once the docstring parser is completely done, so more details can be found in the accompanying PR.
1 parent 93e3c59 commit 8a2e9fb

File tree

2 files changed

+327
-4
lines changed

2 files changed

+327
-4
lines changed

docs/typlodocus/extractor.typ

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,12 @@
1818

1919
if line.starts-with("///") {
2020
in-comment = true
21-
current-comment += line.slice(3).trim() + "\n"
21+
current-comment += line.slice(3) + "\n"
2222
} else if in-comment {
2323
if line.starts-with(regex(`#let\s+`.text)) {
24-
let function = parser.parse-function-signature(lines.slice(i))
2524
comments.push((
26-
comment: parser.parse-docstring(current-comment),
27-
signature: function,
25+
comment: parser.parse-docstring-alt(current-comment),
26+
signature: parser.parse-function-signature(lines.slice(i)),
2827
))
2928
}
3029

docs/typlodocus/parser.typ

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
// TODO: see if we can perform documentation diagnostics:
2+
// (1) the diagnostics would include:
3+
// - default argument type validation (would have to look into how to do
4+
// this with custom types, but it's feasible),
5+
// - argument name validation (feasible considering we preprocess the
6+
// function signature prior to the docstring, though it would
7+
// require further rework),
8+
// (2) the diagnostics should panic such that querying fails and the error
9+
// is reported during the doc building process (to better diagnose
10+
// issues whenever this happens in CI, it would be best if the errors
11+
// were also written to some log file or alternatively replaced the
12+
// query operation, and the Python script reported the actual error
13+
// and wrote to file if a specific json schema in the expected query
14+
// results is detected.)
15+
116
#let parse-function-signature(lines) = {
217
let ident = lines
318
.first()
@@ -150,6 +165,315 @@
150165
)
151166
}
152167

168+
// NOTE: a current limitation of this parser is that Typst code that would get
//       parsed as some output 'A' will get parsed as an output 'B' instead,
//       because whitespace removal is performed eagerly at the start of all
//       lines. As an example, take Typst's lists: they use whitespace to
//       denote offsetting from top-level items, but this won't be the case
//       after this gets parsed.

// Matches one type name (optionally followed by a comma) inside a parameter
// type list.
#let _type-item-re = regex(`([[:alnum:]-_\.]+)(?:,){0,1}`.text)

// Parses a result line of the form `-> (type) description`.
//
// Returns `none` when `line` is not a well-formed result line, otherwise a
// dictionary with:
// - `indent`: the amount of whitespace following the `->` marker,
// - `entry`: `(type: str, text: array of str)`.
#let _parse-result-line(line) = {
  let found = line.match(regex(`^->(\s+)\(?([[:alnum:]-_]+)\)?((?:\s+)(.+))?`.text))
  if found == none { return none }

  let description = found.captures.at(3)
  return (
    indent: found.captures.first().len(),
    entry: (
      type: found.captures.at(1),
      text: if description != none { (description.trim(at: end),) } else { () },
    ),
  )
}

// Parses a parameter line of the form `- name (types) = default: description`,
// where everything after the name is optional.
//
// Returns `none` when `line` is not a well-formed parameter line, otherwise a
// dictionary with:
// - `indent`: the amount of whitespace following the `-` marker,
// - `ambiguous`: `true` when the line carries neither a type list nor a
//   default value, i.e. it may just as well be a native Typst list item,
// - `entry`: `(name: str, types: array of str, default-value: str or none,
//   text: array of str)`.
#let _parse-parameter-line(line) = {
  // TODO: rework the part of the regex parsing the optional type list to
  //       make it accept only comma-optional separated sequences.
  let found = line.match(regex(
    `^-(\s+)((?:\.{2})?[[:alnum:]_-]+)((?:\s+)(?:\()([[:alnum:][:blank:]-_\.,]+)(?:\)))?((?:\s+=\s+)([^:]+))?(:\s+(.+))?`.text,
  ))
  if found == none { return none }

  // Capture layout: 0 = whitespace after `-`, 1 = name, 2 = whole type list,
  // 3 = type list contents, 4 = whole default clause, 5 = default value,
  // 6 = whole description clause, 7 = description text.
  let caps = found.captures
  let type-list = caps.at(3)
  let description = caps.at(7)
  return (
    indent: caps.first().len(),
    ambiguous: caps.at(2) == none and caps.at(4) == none,
    entry: (
      name: caps.at(1),
      types: if type-list != none {
        type-list.matches(_type-item-re).map(it => it.captures.first())
      } else {
        ()
      },
      default-value: caps.at(5),
      text: if description != none { (description.trim(at: end),) } else { () },
    ),
  )
}

// Parses the contents of a documentation comment into prose, parameter
// documentation and result documentation.
//
// - string (str): the docstring, one comment line per `\n`-separated line,
//   with the leading `///` already removed.
// -> (dictionary) a dictionary with the keys:
//    - `raw`: the unmodified input,
//    - `text`: the free-form prose (including ```example blocks),
//    - `arguments`: array of `(name, types, default-value, text)`,
//    - `result`: array of `(type, text)`.
//
// The parser is a line-based state machine. A parameter (`- name ...`) or
// result (`-> type ...`) entry stays open for as long as the following lines
// are indented past its marker; any shallower line closes the entry and is
// then dispatched like a regular top-level line.
#let parse-docstring-alt(string) = {
  let example-open = "```example"
  let example-close = "```"
  let param-marker = "-"
  let result-marker = "->"
  let type-list-open = "("
  let type-list-close = ")"
  // Closing line of a multi-line type list: `) = default: description`.
  // Captures: 0 = whole default clause, 1 = default value, 2 = description.
  let type-list-close-re = regex(`\)(\s+=\s+([^:]+))?:(?:\s*)?(.*)?`.text)
  // NOTE: this matches the empty string on non-matching haystacks, so it's
  //       never `none`.
  let leading-ws-re = regex(`^([[:blank:]]*).*`.text)

  // `none`, "parameter" or "result": the kind of entry currently open.
  let state = none
  let in-example = false
  let in-type-list = false
  // Indentation of the last top-level line and of the open entry's body
  // relative to its marker.
  let indent-ws = 0
  let entry-indent = 0
  // Lines of a `- name` entry that may still turn out to be a native Typst
  // list item rather than the head of a multi-line parameter type list.
  let pending = none
  let arguments = ()
  let result = ()
  let text = ()

  for raw-line in string.split("\n") {
    let ws = raw-line.match(leading-ws-re).captures.first().len()
    let line = raw-line.slice(ws)

    // Example blocks are copied through until their closing fence.
    if in-example {
      if line.starts-with(example-close) { in-example = false }
      text.push(line)
      continue
    }

    if state == "parameter" {
      if ws >= indent-ws + entry-indent + param-marker.len() {
        // Continuation of the open parameter entry.
        let current = arguments.pop()
        if in-type-list {
          if line.starts-with(type-list-close) {
            in-type-list = false
            let tail = line.match(type-list-close-re)
            if tail != none {
              if tail.captures.at(1) != none {
                current.default-value = tail.captures.at(1)
              }
              if tail.captures.at(2) != none {
                current.text.push(tail.captures.at(2).trim(at: end))
              }
            }
          } else {
            current.types += line.matches(_type-item-re).map(it => it.captures.first())
          }
        } else if pending != none {
          if pending.len() == 1 and line.starts-with(type-list-open) {
            // The entry is confirmed to be a parameter; it must no longer be
            // rolled back into the prose.
            in-type-list = true
            pending = none
          } else {
            pending.push(line)
          }
        } else {
          current.text.push(line.trim(at: end))
        }
        arguments.push(current)
        continue
      }

      // The line is too shallow: close the parameter entry.
      state = none
      in-type-list = false
      if pending != none {
        // It was a native list item after all; move it back into the prose.
        let _ = arguments.pop()
        text += pending
        pending = none
      }
    } else if state == "result" {
      if ws >= indent-ws + entry-indent + result-marker.len() {
        result.last().text.push(line.trim(at: end))
        continue
      }
      state = none
    }

    // Top-level line.
    indent-ws = ws

    if line.len() == 0 {
      // Collapse runs of blank lines and drop leading ones (the prose gets
      // trimmed at the end anyway).
      if text.len() > 0 and text.last().len() > 0 { text.push("") }
      continue
    }

    if line.starts-with(example-open) {
      in-example = true
      text.push(line.trim(at: end))
      continue
    }

    if line.starts-with(result-marker) {
      let parsed = _parse-result-line(line)
      if parsed != none {
        result.push(parsed.entry)
        entry-indent = parsed.indent
        state = "result"
        continue
      }
    } else if line.starts-with(param-marker) {
      let parsed = _parse-parameter-line(line)
      if parsed != none {
        arguments.push(parsed.entry)
        entry-indent = parsed.indent
        state = "parameter"
        if parsed.ambiguous { pending = (line,) }
        continue
      }
    }

    // Regular prose, including marker-like lines that failed to parse.
    text.push(line.trim(at: end))
  }

  // An ambiguous entry that was never confirmed is prose, not a parameter.
  if pending != none {
    let _ = arguments.pop()
    text += pending
  }

  let join-lines(lines) = lines.join("\n", default: "").trim()

  return (
    raw: string,
    text: join-lines(text),
    arguments: arguments.map(it => (..it, text: join-lines(it.text))),
    result: result.map(it => (..it, text: join-lines(it.text))),
  )
}
476+
153477
#let parse-docstring(string) = {
154478
let argument-re = regex("-\s+(\.*[_a-zA-Z]+[-\w]*)\s+(\\(.*?\\))?(\s*=\s*.*?)?:(.*)")
155479
let result-re = regex("->\s+\(?([_a-zA-Z]+[-\w]*)\)?\s*(.*)")

0 commit comments

Comments
 (0)