Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit 4d02f09

Browse files
author
Christoph Hegemann
authored
scip-syntax: adds strict SCIP symbol parsing and formatting (#63443)
Adds strict and performant symbol parsing/formatting for `scip-syntax`. Parsing is "zero"-allocation when the symbol does not contain escapes (technically, it still allocates a `Vec` to hold the descriptors).

Final benchmark numbers:

```
symbol parsing/parse_v1/10000    [19.158 ms 19.231 ms 19.306 ms]
symbol parsing/parse_v1/100000   [252.10 ms 252.48 ms 252.86 ms]
symbol parsing/parse_v1/1000000  [2.9972 s 3.0033 s 3.0094 s]
symbol parsing/parse_v2/10000    [1.1307 ms 1.1357 ms 1.1413 ms]
symbol parsing/parse_v2/100000   [15.645 ms 15.670 ms 15.697 ms]
symbol parsing/parse_v2/1000000  [191.80 ms 192.11 ms 192.44 ms]
```

## Test plan

Some basic unit tests. Verified manually that it produces the same symbols as the existing parser for all of chromium.scip.
1 parent afea138 commit 4d02f09

File tree

12 files changed

+629
-16
lines changed

12 files changed

+629
-16
lines changed

docker-images/syntax-highlighter/Cargo.Bazel.lock

Lines changed: 97 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docker-images/syntax-highlighter/Cargo.lock

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docker-images/syntax-highlighter/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ tree-sitter-highlight = "0.20.1"
6161
walkdir = "2"
6262
path-clean = "1"
6363
camino = "1.1"
64+
nom = "7.1.3"
6465

6566
scip = "0.3.2"
6667
protobuf = "3"

docker-images/syntax-highlighter/crates/scip-syntax/BUILD.bazel

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ WORKSPACE_DEPS = [
2828
rust_library(
2929
name = "scip_syntax_lib",
3030
srcs = glob(
31-
[
32-
"src/*.rs",
33-
],
31+
["src/**/*.rs"],
3432
allow_empty = False,
3533
exclude = ["src/main.rs"],
3634
),
@@ -49,7 +47,11 @@ rust_library(
4947
rust_test(
5048
name = "unit_test",
5149
size = "small",
52-
srcs = glob(["src/*.rs"]),
50+
srcs = glob(
51+
["src/**/*.rs"],
52+
allow_empty = False,
53+
exclude = ["src/main.rs"],
54+
),
5355
proc_macro_deps = all_crate_deps(
5456
proc_macro = True,
5557
),
@@ -68,7 +70,11 @@ rust_test(
6870
rust_test(
6971
name = "integration_test",
7072
size = "small",
71-
srcs = glob(["tests/*.rs"]),
73+
srcs = glob(
74+
["src/**/*.rs"],
75+
allow_empty = False,
76+
exclude = ["src/main.rs"],
77+
),
7278
compile_data = glob(
7379
[
7480
"testdata/**",

docker-images/syntax-highlighter/crates/scip-syntax/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ edition = "2021"
66
[[bin]]
77
name = "scip-syntax"
88

9+
[[bench]]
10+
name = "symbol_parsing"
11+
harness = false
12+
913
[dependencies]
1014
assert_cmd = "2.0.12"
1115
predicates = "3.0.4"
@@ -26,10 +30,12 @@ walkdir = { workspace = true }
2630
path-clean = { workspace = true }
2731
camino = { workspace = true }
2832
tree-sitter = { workspace = true }
33+
nom = { workspace = true }
2934

3035
syntax-analysis = { path = "../syntax-analysis" }
3136
tree-sitter-all-languages = { path = "../tree-sitter-all-languages" }
3237
tar = "0.4.40"
3338

3439
[dev-dependencies]
3540
tempfile="3.10.1"
41+
criterion = { version = "0.4", features = ["html_reports"] }
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
use camino::Utf8Path;
2+
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
3+
use scip_syntax::{io::read_index_from_file, scip_strict};
4+
5+
fn parse_symbols(symbols: &[&str]) {
6+
for symbol in symbols {
7+
scip::symbol::parse_symbol(symbol).unwrap();
8+
}
9+
}
10+
11+
fn parse_symbols_v2(symbols: &[&str]) {
12+
for symbol in symbols {
13+
scip_strict::Symbol::parse(&symbol).unwrap();
14+
}
15+
}
16+
17+
fn symbols_from_index(path: &str) -> impl Iterator<Item = String> {
18+
let index = read_index_from_file(Utf8Path::new(path))
19+
.unwrap();
20+
index
21+
.documents
22+
.into_iter()
23+
.flat_map(|document| {
24+
document
25+
.occurrences
26+
.into_iter()
27+
.map(|occurrence| occurrence.symbol)
28+
})
29+
}
30+
31+
fn bench_symbol_parsing(c: &mut Criterion) {
32+
// let all_symbols: Vec<String> = symbols_from_index("~/work/scip-indices/spring-framework-syntactic.scip").collect();
33+
let all_symbols: Vec<String> = symbols_from_index("/Users/creek/work/scip-indices/chromium-1.scip").collect();
34+
let mut group = c.benchmark_group("symbol parsing");
35+
for n in [10_000, 100_000, 1_000_000] {
36+
let symbols: Vec<&str> = all_symbols.iter().take(n).map(|s| s.as_str()).collect();
37+
group.bench_with_input(BenchmarkId::new("parse_v1", n), &symbols, |b, syms| {
38+
b.iter(|| parse_symbols(syms))
39+
});
40+
group.bench_with_input(BenchmarkId::new("parse_v2", n), &symbols, |b, syms| {
41+
b.iter(|| parse_symbols_v2(syms))
42+
});
43+
}
44+
}
45+
46+
criterion_group!(benches, bench_symbol_parsing);
47+
criterion_main!(benches);

docker-images/syntax-highlighter/crates/scip-syntax/src/evaluate.rs

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use serde::Serializer;
1414
use string_interner::{symbol::SymbolU32, StringInterner, Symbol};
1515
use syntax_analysis::range::Range;
1616

17-
use crate::{io::read_index_from_file, progress::*};
17+
use crate::{io::read_index_from_file, progress::*, scip_strict};
1818

1919
pub fn evaluate_command(
2020
candidate: &Utf8Path,
@@ -731,16 +731,12 @@ impl SymbolFormatter {
731731

732732
fn try_strip_package_details<T: Copy>(&mut self, sym: SymbolId<T>) -> SymbolId<T> {
733733
let s = self.display_symbol(sym);
734-
if s.as_bytes().iter().filter(|&c| *c == b' ').count() != 5 {
734+
let Result::Ok(scip_strict::Symbol::NonLocal(mut symbol)) = scip_strict::Symbol::parse(s)
735+
else {
735736
return sym;
736-
}
737-
let parts: Vec<&str> = s.splitn(5, ' ').collect();
738-
let scheme = parts[0];
739-
let _manager = parts[1];
740-
let _package_name = parts[2];
741-
let _version = parts[3];
742-
let descriptor = parts[4];
743-
self.make_symbol_id(&format!("{scheme} . . . {descriptor}"))
737+
};
738+
symbol.package = scip_strict::Package::default();
739+
self.make_symbol_id(&symbol.to_string())
744740
}
745741
}
746742

docker-images/syntax-highlighter/crates/scip-syntax/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ pub mod evaluate;
22
pub mod index;
33
pub mod io;
44
pub mod progress;
5+
pub mod scip_strict;

0 commit comments

Comments
 (0)