Skip to content

Commit f2f5748

Browse files
bors[bot]matklad
andauthored
Merge #11118
11118: internal: move ws attachment logic to the parser crate r=matklad a=matklad This has to re-introduce the `sink` pattern, because doing this purely with iterators is awkward :( Maaaybe the event vector was a false start? But, anyway, I like the current factoring more -- it sort-of obvious that we do want to keep ws-attachment business in the parser, and that we also don't want that to depend on the particular tree structure. I think `shortcuts` module achieves that. bors r+ 🤖 Co-authored-by: Aleksey Kladov <[email protected]>
2 parents c456b21 + f4cb0ff commit f2f5748

File tree

6 files changed

+255
-221
lines changed

6 files changed

+255
-221
lines changed

crates/parser/src/lexed_str.rs

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -122,31 +122,6 @@ impl<'a> LexedStr<'a> {
122122
self.error.iter().map(|it| (it.token as usize, it.msg.as_str()))
123123
}
124124

125-
pub fn to_input(&self) -> crate::Input {
126-
let mut res = crate::Input::default();
127-
let mut was_joint = false;
128-
for i in 0..self.len() {
129-
let kind = self.kind(i);
130-
if kind.is_trivia() {
131-
was_joint = false
132-
} else {
133-
if kind == SyntaxKind::IDENT {
134-
let token_text = self.text(i);
135-
let contextual_kw = SyntaxKind::from_contextual_keyword(token_text)
136-
.unwrap_or(SyntaxKind::IDENT);
137-
res.push_ident(contextual_kw);
138-
} else {
139-
if was_joint {
140-
res.was_joint();
141-
}
142-
res.push(kind);
143-
}
144-
was_joint = true;
145-
}
146-
}
147-
res
148-
}
149-
150125
fn push(&mut self, kind: SyntaxKind, offset: usize) {
151126
self.kind.push(kind);
152127
self.start.push(offset as u32);

crates/parser/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ mod parser;
2626
mod grammar;
2727
mod input;
2828
mod output;
29+
mod shortcuts;
2930

3031
#[cfg(test)]
3132
mod tests;
@@ -36,6 +37,7 @@ pub use crate::{
3637
input::Input,
3738
lexed_str::LexedStr,
3839
output::{Output, Step},
40+
shortcuts::StrStep,
3941
syntax_kind::SyntaxKind,
4042
};
4143

crates/parser/src/shortcuts.rs

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
//! Shortcuts that span lexer/parser abstraction.
2+
//!
3+
//! The way Rust works, the parser doesn't necessarily parse text, and you might
4+
//! tokenize text without parsing it further. So, it makes sense to keep
5+
//! abstract token parsing, and string tokenization as completely separate
6+
//! layers.
7+
//!
8+
//! However, often you do parse text into syntax trees and the glue code for
9+
//! that needs to live somewhere. Rather than putting it to lexer or parser, we
10+
//! use a separate shortcuts module for that.
11+
12+
use std::mem;
13+
14+
use crate::{
15+
LexedStr, Step,
16+
SyntaxKind::{self, *},
17+
};
18+
19+
/// A single instruction for building a syntax tree out of source text,
/// emitted by [`LexedStr::intersperse_trivia`] into its `sink` callback.
pub enum StrStep<'a> {
    /// Attach a leaf token with the given kind and source text.
    Token { kind: SyntaxKind, text: &'a str },
    /// Open an interior node of the given kind.
    Enter { kind: SyntaxKind },
    /// Close the most recently opened node.
    Exit,
    /// Report a parse error; `pos` is a byte offset into the source text.
    Error { msg: &'a str, pos: usize },
}
25+
26+
impl<'a> LexedStr<'a> {
    /// Converts lexed tokens into parser [`crate::Input`], dropping trivia
    /// (anything for which `kind.is_trivia()` holds) and recording whether
    /// adjacent non-trivia tokens were joint (not separated by trivia).
    pub fn to_input(&self) -> crate::Input {
        let mut res = crate::Input::default();
        // Tracks whether the previous pushed token touches the next one.
        let mut was_joint = false;
        for i in 0..self.len() {
            let kind = self.kind(i);
            if kind.is_trivia() {
                // Trivia separates tokens, breaking joint-ness.
                was_joint = false
            } else {
                if kind == SyntaxKind::IDENT {
                    // Idents may be contextual keywords (e.g. `union`);
                    // resolve that here so the parser need not look at text.
                    let token_text = self.text(i);
                    let contextual_kw = SyntaxKind::from_contextual_keyword(token_text)
                        .unwrap_or(SyntaxKind::IDENT);
                    res.push_ident(contextual_kw);
                } else {
                    if was_joint {
                        res.was_joint();
                    }
                    res.push(kind);
                }
                was_joint = true;
            }
        }
        res
    }

    /// Replays the parser's `output` events over this token stream, weaving
    /// the trivia (whitespace/comments) that the parser never saw back in,
    /// and feeds the resulting tree-building steps to `sink`.
    ///
    /// When `synthetic_root` is true, the whole stream is wrapped in a
    /// `SOURCE_FILE` node (used when reparsing a fragment).
    ///
    /// Returns `true` if every lexed token was consumed (i.e. parsing
    /// reached the end of the input).
    pub fn intersperse_trivia(
        &self,
        output: &crate::Output,
        synthetic_root: bool,
        sink: &mut dyn FnMut(StrStep),
    ) -> bool {
        let mut builder = Builder { lexed: self, pos: 0, state: State::PendingEnter, sink };

        if synthetic_root {
            builder.enter(SyntaxKind::SOURCE_FILE);
        }
        for event in output.iter() {
            match event {
                Step::Token { kind, n_input_tokens: n_raw_tokens } => {
                    builder.token(kind, n_raw_tokens)
                }
                Step::Enter { kind } => builder.enter(kind),
                Step::Exit => builder.exit(),
                Step::Error { msg } => {
                    // Report the error at the text offset of the next
                    // unconsumed token.
                    let text_pos = builder.lexed.text_start(builder.pos);
                    (builder.sink)(StrStep::Error { msg, pos: text_pos });
                }
            }
        }
        if synthetic_root {
            builder.exit();
        }

        // The final `Exit` is deferred (State::PendingExit) so that trailing
        // trivia can be attached inside the root node before closing it.
        match mem::replace(&mut builder.state, State::Normal) {
            State::PendingExit => {
                builder.eat_trivias();
                (builder.sink)(StrStep::Exit);
            }
            State::PendingEnter | State::Normal => unreachable!(),
        }

        let is_eof = builder.pos == builder.lexed.len();
        is_eof
    }
}
92+
93+
/// Internal state for [`LexedStr::intersperse_trivia`]: walks the lexed
/// tokens in lock-step with parser events, deciding where trivia attaches.
struct Builder<'a, 'b> {
    // The full token stream, including trivia.
    lexed: &'a LexedStr<'a>,
    // Index of the next unconsumed lexed token.
    pos: usize,
    // Deferred-event state; see `State` for the meaning of each variant.
    state: State,
    // Receives the produced tree-building steps.
    sink: &'b mut dyn FnMut(StrStep<'_>),
}
99+
100+
/// Builder state used to defer `Enter`/`Exit` emission, so that trivia
/// around node boundaries can be attached to the right node.
enum State {
    /// The very first `Enter` has not been emitted yet.
    PendingEnter,
    /// No event is deferred.
    Normal,
    /// An `Exit` is buffered; it is flushed by the next event (or at the
    /// end), allowing following trivia to be examined first.
    PendingExit,
}
105+
106+
impl Builder<'_, '_> {
    /// Emits a leaf token, first flushing any pending `Exit` and consuming
    /// the trivia that precedes the token.
    fn token(&mut self, kind: SyntaxKind, n_tokens: u8) {
        match mem::replace(&mut self.state, State::Normal) {
            // A token can never come before the first `Enter`.
            State::PendingEnter => unreachable!(),
            State::PendingExit => (self.sink)(StrStep::Exit),
            State::Normal => (),
        }
        self.eat_trivias();
        self.do_token(kind, n_tokens as usize);
    }

    /// Opens a node, deciding how many of the preceding trivia tokens belong
    /// *inside* the new node (e.g. a doc comment right before an item).
    fn enter(&mut self, kind: SyntaxKind) {
        match mem::replace(&mut self.state, State::Normal) {
            State::PendingEnter => {
                (self.sink)(StrStep::Enter { kind });
                // No need to attach trivias to previous node: there is no
                // previous node.
                return;
            }
            State::PendingExit => (self.sink)(StrStep::Exit),
            State::Normal => (),
        }

        // Count the run of trivia tokens sitting before the node's first
        // real token, then ask the heuristic how many of them (counted from
        // the node's side) should be attached to the new node.
        let n_trivias =
            (self.pos..self.lexed.len()).take_while(|&it| self.lexed.kind(it).is_trivia()).count();
        let leading_trivias = self.pos..self.pos + n_trivias;
        let n_attached_trivias = n_attached_trivias(
            kind,
            // Iterate from the trivia closest to the new node outwards.
            leading_trivias.rev().map(|it| (self.lexed.kind(it), self.lexed.text(it))),
        );
        // Unattached trivia stays outside the node; attached trivia goes in.
        self.eat_n_trivias(n_trivias - n_attached_trivias);
        (self.sink)(StrStep::Enter { kind });
        self.eat_n_trivias(n_attached_trivias);
    }

    /// Closes a node. The actual `Exit` is deferred via `PendingExit` so the
    /// next event can decide where the following trivia belongs.
    fn exit(&mut self) {
        match mem::replace(&mut self.state, State::PendingExit) {
            State::PendingEnter => unreachable!(),
            // Flush the previously deferred `Exit` before deferring this one.
            State::PendingExit => (self.sink)(StrStep::Exit),
            State::Normal => (),
        }
    }

    /// Consumes every trivia token at the current position.
    fn eat_trivias(&mut self) {
        while self.pos < self.lexed.len() {
            let kind = self.lexed.kind(self.pos);
            if !kind.is_trivia() {
                break;
            }
            self.do_token(kind, 1);
        }
    }

    /// Consumes exactly `n` tokens, asserting each one is trivia.
    fn eat_n_trivias(&mut self, n: usize) {
        for _ in 0..n {
            let kind = self.lexed.kind(self.pos);
            assert!(kind.is_trivia());
            self.do_token(kind, 1);
        }
    }

    /// Emits one `Token` step covering the next `n_tokens` lexed tokens
    /// (several raw tokens may be glued into one, e.g. `>>`).
    fn do_token(&mut self, kind: SyntaxKind, n_tokens: usize) {
        let text = &self.lexed.range_text(self.pos..self.pos + n_tokens);
        self.pos += n_tokens;
        (self.sink)(StrStep::Token { kind, text });
    }
}
173+
174+
/// Heuristic: given a node `kind` and its leading trivia iterated from the
/// token *closest to the node* outwards, returns how many of those trivia
/// tokens should be attached inside the node (so doc comments stick to the
/// item they document).
fn n_attached_trivias<'a>(
    kind: SyntaxKind,
    trivias: impl Iterator<Item = (SyntaxKind, &'a str)>,
) -> usize {
    match kind {
        // Only item-like nodes get trivia attached.
        CONST | ENUM | FN | IMPL | MACRO_CALL | MACRO_DEF | MACRO_RULES | MODULE | RECORD_FIELD
        | STATIC | STRUCT | TRAIT | TUPLE_FIELD | TYPE_ALIAS | UNION | USE | VARIANT => {
            let mut res = 0;
            let mut trivias = trivias.enumerate().peekable();

            while let Some((i, (kind, text))) = trivias.next() {
                match kind {
                    // A blank line usually ends attachment...
                    WHITESPACE if text.contains("\n\n") => {
                        // we check whether the next token is a doc-comment
                        // and skip the whitespace in this case
                        if let Some((COMMENT, peek_text)) = trivias.peek().map(|(_, pair)| pair) {
                            if is_outer(peek_text) {
                                continue;
                            }
                        }
                        break;
                    }
                    COMMENT => {
                        // Inner (`//!`) comments document the enclosing
                        // scope, never the following item.
                        if is_inner(text) {
                            break;
                        }
                        // Attach everything up to and including this comment.
                        res = i + 1;
                    }
                    _ => (),
                }
            }
            res
        }
        _ => 0,
    }
}
210+
211+
/// Is `text` an outer doc comment (`///` or `/**`)?
///
/// Runs of four-plus slashes (`////…`) and `/***` openers are ordinary
/// comments per Rust's doc-comment rules, so they are excluded.
fn is_outer(text: &str) -> bool {
    let not_a_doc = text.starts_with("////") || text.starts_with("/***");
    !not_a_doc && (text.starts_with("///") || text.starts_with("/**"))
}
217+
218+
/// Is `text` an inner doc comment (`//!` or `/*!`), i.e. one that documents
/// the enclosing scope rather than the following item?
fn is_inner(text: &str) -> bool {
    ["//!", "/*!"].iter().any(|prefix| text.starts_with(prefix))
}

crates/syntax/src/parsing.rs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
//! Lexing, bridging to parser (which does the actual parsing) and
22
//! incremental reparsing.
33
4-
mod text_tree_sink;
54
mod reparsing;
65

7-
use crate::{
8-
parsing::text_tree_sink::build_tree, syntax_node::GreenNode, AstNode, SyntaxError, SyntaxNode,
9-
};
6+
use rowan::TextRange;
7+
8+
use crate::{syntax_node::GreenNode, AstNode, SyntaxError, SyntaxNode, SyntaxTreeBuilder};
109

1110
pub(crate) use crate::parsing::reparsing::incremental_reparse;
1211

@@ -37,3 +36,32 @@ pub(crate) fn parse_text_as<T: AstNode>(
3736

3837
SyntaxNode::new_root(node).first_child().and_then(T::cast).ok_or(())
3938
}
39+
40+
/// Glue between the parser crate and rowan: replays `parser_output` over the
/// `lexed` tokens, building a green tree and collecting syntax errors.
///
/// Returns the green root node, all errors (parser errors plus lexer
/// errors), and whether the parser consumed the entire input.
pub(crate) fn build_tree(
    lexed: parser::LexedStr<'_>,
    parser_output: parser::Output,
    synthetic_root: bool,
) -> (GreenNode, Vec<SyntaxError>, bool) {
    let mut builder = SyntaxTreeBuilder::default();

    // Translate each abstract tree-building step into rowan builder calls.
    let is_eof = lexed.intersperse_trivia(&parser_output, synthetic_root, &mut |step| match step {
        parser::StrStep::Token { kind, text } => builder.token(kind, text),
        parser::StrStep::Enter { kind } => builder.start_node(kind),
        parser::StrStep::Exit => builder.finish_node(),
        parser::StrStep::Error { msg, pos } => {
            builder.error(msg.to_string(), pos.try_into().unwrap())
        }
    });

    let (node, mut errors) = builder.finish_raw();
    // Append lexer errors (e.g. unterminated strings), converting the
    // lexer's usize offsets into rowan's TextRange.
    for (i, err) in lexed.errors() {
        let text_range = lexed.text_range(i);
        let text_range = TextRange::new(
            text_range.start.try_into().unwrap(),
            text_range.end.try_into().unwrap(),
        );
        errors.push(SyntaxError::new(err, text_range))
    }

    (node, errors, is_eof)
}

crates/syntax/src/parsing/reparsing.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use parser::Reparser;
1010
use text_edit::Indel;
1111

1212
use crate::{
13-
parsing::text_tree_sink::build_tree,
13+
parsing::build_tree,
1414
syntax_node::{GreenNode, GreenToken, NodeOrToken, SyntaxElement, SyntaxNode},
1515
SyntaxError,
1616
SyntaxKind::*,

0 commit comments

Comments
 (0)