Skip to content

Commit 775d828

Browse files
authored
Allow multi-character symbols/variants (#92)
1 parent 9ac86f9 commit 775d828

File tree

3 files changed

+118
-70
lines changed

3 files changed

+118
-70
lines changed

build.rs

Lines changed: 40 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -34,28 +34,28 @@ enum Def<'a> {
3434

3535
/// A symbol, either a leaf or with modifiers with optional deprecation.
3636
enum Symbol<'a> {
37-
Single(char),
38-
Multi(Vec<(ModifierSet<&'a str>, char, Option<&'a str>)>),
37+
Single(String),
38+
Multi(Vec<(ModifierSet<&'a str>, String, Option<&'a str>)>),
3939
}
4040

4141
/// A single line during parsing.
42-
#[derive(Debug, Copy, Clone)]
42+
#[derive(Debug, Clone)]
4343
enum Line<'a> {
4444
Blank,
4545
Deprecated(&'a str),
4646
ModuleStart(&'a str),
4747
ModuleEnd,
48-
Symbol(&'a str, Option<char>),
49-
Variant(ModifierSet<&'a str>, char),
48+
Symbol(&'a str, Option<String>),
49+
Variant(ModifierSet<&'a str>, String),
5050
Eof,
5151
}
5252

53-
#[derive(Debug, Copy, Clone)]
53+
#[derive(Debug, Clone)]
5454
enum Declaration<'a> {
5555
ModuleStart(&'a str, Option<&'a str>),
5656
ModuleEnd,
57-
Symbol(&'a str, Option<char>, Option<&'a str>),
58-
Variant(ModifierSet<&'a str>, char, Option<&'a str>),
57+
Symbol(&'a str, Option<String>, Option<&'a str>),
58+
Variant(ModifierSet<&'a str>, String, Option<&'a str>),
5959
}
6060

6161
fn main() {
@@ -103,11 +103,11 @@ fn process(buf: &mut String, file: &Path, name: &str, desc: &str) {
103103
Some(Ok(Declaration::ModuleEnd))
104104
}
105105
}
106-
Ok(Line::Symbol(name, c)) => {
107-
Some(Ok(Declaration::Symbol(name, c, deprecation.take())))
106+
Ok(Line::Symbol(name, value)) => {
107+
Some(Ok(Declaration::Symbol(name, value, deprecation.take())))
108108
}
109-
Ok(Line::Variant(modifiers, c)) => {
110-
Some(Ok(Declaration::Variant(modifiers, c, deprecation.take())))
109+
Ok(Line::Variant(modifiers, value)) => {
110+
Some(Ok(Declaration::Variant(modifiers, value, deprecation.take())))
111111
}
112112
Ok(Line::Eof) => {
113113
deprecation.map(|_| Err(String::from("dangling `@deprecated:`")))
@@ -156,12 +156,12 @@ fn tokenize(line: &str) -> StrResult<Line> {
156156
for part in rest.split('.') {
157157
validate_ident(part)?;
158158
}
159-
let c = decode_char(tail.ok_or("missing char")?)?;
160-
Line::Variant(ModifierSet::from_raw_dotted(rest), c)
159+
let value = decode_value(tail.ok_or("missing char")?)?;
160+
Line::Variant(ModifierSet::from_raw_dotted(rest), value)
161161
} else {
162162
validate_ident(head)?;
163-
let c = tail.map(decode_char).transpose()?;
164-
Line::Symbol(head, c)
163+
let value = tail.map(decode_value).transpose()?;
164+
Line::Symbol(head, value)
165165
})
166166
}
167167

@@ -174,20 +174,23 @@ fn validate_ident(string: &str) -> StrResult<()> {
174174
Err(format!("invalid identifier: {string:?}"))
175175
}
176176

177-
/// Extracts either a single char or parses a U+XXXX escape.
178-
fn decode_char(text: &str) -> StrResult<char> {
179-
if let Some(hex) = text.strip_prefix("U+") {
180-
u32::from_str_radix(hex, 16)
181-
.ok()
182-
.and_then(|n| char::try_from(n).ok())
183-
.ok_or_else(|| format!("invalid unicode escape {text:?}"))
184-
} else {
185-
let mut chars = text.chars();
186-
match (chars.next(), chars.next()) {
187-
(Some(c), None) => Ok(c),
188-
_ => Err(format!("expected exactly one char, found {text:?}")),
189-
}
177+
/// Decodes the value of a symbol or variant, replacing each `\u{XXXX}` escape with its character.
178+
fn decode_value(text: &str) -> StrResult<String> {
179+
let mut iter = text.split("\\u{");
180+
let mut res = iter.next().unwrap().to_string();
181+
for other in iter {
182+
let (hex, rest) = other.split_once("}").ok_or_else(|| {
183+
format!("unclosed unicode escape \\u{{{}", other.escape_debug())
184+
})?;
185+
res.push(
186+
u32::from_str_radix(hex, 16)
187+
.ok()
188+
.and_then(|n| char::try_from(n).ok())
189+
.ok_or_else(|| format!("invalid unicode escape \\u{{{hex}}}"))?,
190+
);
191+
res += rest;
190192
}
193+
Ok(res)
191194
}
192195

193196
/// Turns a stream of lines into a list of definitions.
@@ -200,23 +203,23 @@ fn parse<'a>(
200203
None | Some(Declaration::ModuleEnd) => {
201204
break;
202205
}
203-
Some(Declaration::Symbol(name, c, deprecation)) => {
206+
Some(Declaration::Symbol(name, value, deprecation)) => {
204207
let mut variants = vec![];
205-
while let Some(Declaration::Variant(name, c, deprecation)) =
208+
while let Some(Declaration::Variant(name, value, deprecation)) =
206209
p.peek().cloned().transpose()?
207210
{
208-
variants.push((name, c, deprecation));
211+
variants.push((name, value, deprecation));
209212
p.next();
210213
}
211214

212215
let symbol = if !variants.is_empty() {
213-
if let Some(c) = c {
214-
variants.insert(0, (ModifierSet::default(), c, None));
216+
if let Some(value) = value {
217+
variants.insert(0, (ModifierSet::default(), value, None));
215218
}
216219
Symbol::Multi(variants)
217220
} else {
218-
let c = c.ok_or("symbol needs char or variants")?;
219-
Symbol::Single(c)
221+
let value = value.ok_or("symbol needs char or variants")?;
222+
Symbol::Single(value)
220223
};
221224

222225
defs.push((name, Binding { def: Def::Symbol(symbol), deprecation }));
@@ -251,7 +254,7 @@ fn encode(buf: &mut String, module: &Module) {
251254
Def::Symbol(symbol) => {
252255
buf.push_str("Def::Symbol(Symbol::");
253256
match symbol {
254-
Symbol::Single(c) => write!(buf, "Single({c:?})").unwrap(),
257+
Symbol::Single(value) => write!(buf, "Single({value:?})").unwrap(),
255258
Symbol::Multi(list) => write!(buf, "Multi(&{list:?})").unwrap(),
256259
}
257260
buf.push(')');

src/lib.rs

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,15 @@ pub enum Def {
6262
#[derive(Debug, Copy, Clone)]
6363
pub enum Symbol {
6464
/// A symbol without modifiers.
65-
Single(char),
65+
Single(&'static str),
6666
/// A symbol with named modifiers. The symbol defaults to its first variant.
67-
Multi(&'static [(ModifierSet<&'static str>, char, Option<&'static str>)]),
67+
Multi(&'static [(ModifierSet<&'static str>, &'static str, Option<&'static str>)]),
6868
}
6969

7070
impl Symbol {
7171
/// Get the symbol's value for a given set of modifiers, alongside an optional deprecation
7272
/// message.
73-
pub fn get(&self, modifs: ModifierSet<&str>) -> Option<(char, Option<&str>)> {
73+
pub fn get(&self, modifs: ModifierSet<&str>) -> Option<(&'static str, Option<&str>)> {
7474
match self {
7575
Self::Single(c) => modifs.is_empty().then_some((*c, None)),
7676
Self::Multi(list) => {
@@ -84,13 +84,13 @@ impl Symbol {
8484
/// Each variant is represented by a tuple `(modifiers, value, deprecation)`.
8585
pub fn variants(
8686
&self,
87-
) -> impl Iterator<Item = (ModifierSet<&str>, char, Option<&str>)> {
87+
) -> impl Iterator<Item = (ModifierSet<&str>, &'static str, Option<&str>)> {
8888
enum Variants {
89-
Single(std::iter::Once<char>),
89+
Single(std::iter::Once<&'static str>),
9090
Multi(
9191
std::slice::Iter<
9292
'static,
93-
(ModifierSet<&'static str>, char, Option<&'static str>),
93+
(ModifierSet<&'static str>, &'static str, Option<&'static str>),
9494
>,
9595
),
9696
}
@@ -124,6 +124,7 @@ include!(concat!(env!("OUT_DIR"), "/out.rs"));
124124
#[cfg(test)]
125125
mod test {
126126
use super::*;
127+
use std::collections::BTreeSet;
127128

128129
#[test]
129130
fn all_modules_sorted() {
@@ -139,4 +140,48 @@ mod test {
139140

140141
assert_sorted_recursively(ROOT);
141142
}
143+
144+
#[test]
145+
fn unicode_escapes() {
146+
let Def::Symbol(wj) = SYM.get("wj").unwrap().def else { panic!() };
147+
assert_eq!(wj.get(ModifierSet::default()).unwrap().0, "\u{2060}");
148+
let Def::Symbol(space) = SYM.get("space").unwrap().def else { panic!() };
149+
assert_eq!(space.get(ModifierSet::default()).unwrap().0, " ");
150+
assert_eq!(
151+
space.get(ModifierSet::from_raw_dotted("nobreak")).unwrap().0,
152+
"\u{A0}"
153+
);
154+
}
155+
156+
#[test]
157+
fn random_sample() {
158+
for (key, control) in [
159+
("backslash", [("", "\\"), ("circle", "⦸"), ("not", "⧷")].as_slice()),
160+
("chi", &[("", "χ")]),
161+
("forces", &[("", "⊩"), ("not", "⊮")]),
162+
("interleave", &[("", "⫴"), ("big", "⫼"), ("struck", "⫵")]),
163+
("uranus", &[("", "⛢"), ("alt", "♅")]),
164+
] {
165+
let Def::Symbol(s) = SYM.get(key).unwrap().def else {
166+
panic!("{key:?} is not a symbol")
167+
};
168+
let variants = s
169+
.variants()
170+
.map(|(m, v, _)| (m.into_iter().collect::<BTreeSet<_>>(), v))
171+
.collect::<BTreeSet<_>>();
172+
let control = control
173+
.iter()
174+
.map(|&(m, v)| {
175+
(
176+
ModifierSet::from_raw_dotted(m)
177+
.into_iter()
178+
.collect::<BTreeSet<_>>(),
179+
v,
180+
)
181+
})
182+
.collect::<BTreeSet<_>>();
183+
184+
assert_eq!(variants, control);
185+
}
186+
}
142187
}

src/modules/sym.txt

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,25 @@
11
// Control.
2-
wj U+2060
3-
zwj U+200D
4-
zwnj U+200C
5-
zws U+200B
6-
lrm U+200E
7-
rlm U+200F
2+
wj \u{2060}
3+
zwj \u{200D}
4+
zwnj \u{200C}
5+
zws \u{200B}
6+
lrm \u{200E}
7+
rlm \u{200F}
88

99
// Spaces.
10-
space U+20
11-
.nobreak U+A0
12-
.nobreak.narrow U+202F
13-
.en U+2002
14-
.quad U+2003
15-
.third U+2004
16-
.quarter U+2005
17-
.sixth U+2006
18-
.med U+205F
19-
.fig U+2007
20-
.punct U+2008
21-
.thin U+2009
22-
.hair U+200A
10+
space \u{20}
11+
.nobreak \u{A0}
12+
.nobreak.narrow \u{202F}
13+
.en \u{2002}
14+
.quad \u{2003}
15+
.third \u{2004}
16+
.quarter \u{2005}
17+
.sixth \u{2006}
18+
.med \u{205F}
19+
.fig \u{2007}
20+
.punct \u{2008}
21+
.thin \u{2009}
22+
.hair \u{200A}
2323

2424
// Delimiters.
2525
paren
@@ -30,9 +30,9 @@ paren
3030
.t ⏜
3131
.b ⏝
3232
brace
33-
.l U+7B
33+
.l \u{7B}
3434
.l.double ⦃
35-
.r U+7D
35+
.r \u{7D}
3636
.r.double ⦄
3737
.t ⏞
3838
.b ⏟
@@ -141,14 +141,14 @@ dash
141141
.wave.double 〰
142142
dot
143143
.op ⋅
144-
.basic U+2E
144+
.basic \u{2E}
145145
.c ·
146146
.circle ⊙
147147
.circle.big ⨀
148148
.square ⊡
149149
.double ¨
150-
.triple U+20DB
151-
.quad U+20DC
150+
.triple \u{20DB}
151+
.quad \u{20DC}
152152
excl !
153153
.double ‼
154154
.inv ¡
@@ -161,10 +161,10 @@ interrobang ‽
161161
.inv ⸘
162162
hash #
163163
hyph ‐
164-
.minus U+2D
165-
.nobreak U+2011
164+
.minus \u{2D}
165+
.nobreak \u{2011}
166166
.point ‧
167-
.soft U+AD
167+
.soft \u{AD}
168168
numero №
169169
percent %
170170
permille ‰

0 commit comments

Comments
 (0)