Skip to content

Commit e4ba021

Browse files
feat: Minimize allocations in SCIP symbol parsing.
See PR for benchmarks.
1 parent 79fb777 commit e4ba021

File tree

25 files changed

+3506
-2413
lines changed

25 files changed

+3506
-2413
lines changed

.github/workflows/golang.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ jobs:
1818
- uses: ./.github/actions/asdf
1919
with:
2020
golang: true
21-
- run: go test ./... -v
21+
- run: go test ./... -v -tags asserts

.tool-versions

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
golang 1.20.14
1+
golang 1.22.0
22
nodejs 16.20.2
33
shellcheck 0.7.1
44
yarn 1.22.22

Development.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
- [Project structure](#project-structure)
44
- [Code generation](#code-generation)
55
- [Debugging](#debugging)
6+
- [Benchmarking](#benchmarking)
67
- [Testing and adding new SCIP semantics](#testing-and-adding-new-scip-semantics)
78
- [Release a new version](#release-a-new-version)
89

@@ -69,6 +70,24 @@ and is not recommended for use in other settings.
6970
scip lint /path/to/index.scip
7071
```
7172

73+
## Benchmarking
74+
75+
For benchmarks, one can put test SCIP indexes under `dev/sample_indexes`.
76+
77+
Sourcegraph teammates can download several large indexes
78+
from this [Google drive folder](https://drive.google.com/drive/folders/1z62Se7eHaa5T89a16-y7s0Z1qbRY4VCg).
79+
80+
After that you can run:
81+
82+
```bash
83+
go run ./bindings/go/scip/speedtest
84+
```
85+
86+
to see the results.
87+
88+
Make sure to share benchmark results when making changes to
89+
the symbol parsing logic.
90+
7291
## Testing and adding new SCIP semantics
7392

7493
It is helpful to use reprolang to check the existing code navigation behavior,

bindings/go/scip/assertions.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//go:build asserts
2+
3+
package scip
4+
5+
func assert(cond bool, msg string) {
6+
if !cond {
7+
panic(msg)
8+
}
9+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
//go:build !asserts
2+
3+
package scip
4+
5+
func assert(cond bool, msg string) {}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package internal
2+
3+
import (
4+
"io"
5+
"os"
6+
"path/filepath"
7+
"sync/atomic"
8+
"testing"
9+
10+
"github.com/sourcegraph/beaut"
11+
"github.com/sourcegraph/beaut/lib/knownwf"
12+
conciter "github.com/sourcegraph/conc/iter"
13+
"github.com/sourcegraph/scip/bindings/go/scip"
14+
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
15+
"github.com/stretchr/testify/require"
16+
"google.golang.org/protobuf/proto"
17+
)
18+
19+
func TestParseCompat(t *testing.T) {
20+
for _, path := range shared.SampleIndexes() {
21+
t.Run(filepath.Base(path), func(t *testing.T) {
22+
t.Parallel()
23+
scipReader, err := os.Open(path)
24+
require.Nil(t, err)
25+
scipBytes, err := io.ReadAll(scipReader)
26+
require.Nil(t, err)
27+
scipIndex := scip.Index{}
28+
require.NoError(t, proto.Unmarshal(scipBytes, &scipIndex))
29+
var total atomic.Int64
30+
conciter.ForEach(scipIndex.Documents, func(docPtr **scip.Document) {
31+
document := *docPtr
32+
if total.Load() > 1000*1000 {
33+
return
34+
}
35+
total.Add(int64(len(document.Occurrences)))
36+
var sym scip.Symbol
37+
for i := 0; i < len(document.Occurrences); i++ {
38+
occ := document.Occurrences[i]
39+
old, err := ParsePartialSymbolV1ToBeDeleted(occ.Symbol, true)
40+
require.NoError(t, err)
41+
require.NotPanics(t, func() {
42+
str := beaut.NewUTF8StringUnchecked(occ.Symbol, knownwf.UTF8DeserializedFromProtobufString)
43+
err = scip.ParseSymbolUTF8With(str, scip.ParseSymbolOptions{
44+
IncludeDescriptors: true,
45+
RecordOutput: &sym,
46+
})
47+
}, "panic for symbol: %q", occ.Symbol)
48+
require.NoError(t, err)
49+
require.Equal(t, old.Scheme, sym.Scheme)
50+
require.Equal(t, old.Package, sym.Package)
51+
require.Equalf(t, len(old.Descriptors), len(sym.Descriptors), "symbol: %v, d1: %+v, d2: %+v", occ.Symbol,
52+
old.Descriptors, sym.Descriptors)
53+
for i, d := range old.Descriptors {
54+
dnew := sym.Descriptors[i]
55+
require.Equal(t, d.Name, dnew.Name, "symbol: %v", occ.Symbol)
56+
require.Equal(t, d.Suffix, dnew.Suffix, "symbol: %v", occ.Symbol)
57+
require.Equal(t, d.Disambiguator, dnew.Disambiguator, "symbol: %v", occ.Symbol)
58+
}
59+
}
60+
})
61+
})
62+
}
63+
}
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
package internal
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
7+
"github.com/cockroachdb/errors"
8+
"github.com/sourcegraph/scip/bindings/go/scip"
9+
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
10+
)
11+
12+
func tryParseLocalSymbol(symbol string) (string, error) {
13+
if !strings.HasPrefix(symbol, "local ") {
14+
return "", nil
15+
}
16+
suffix := symbol[6:]
17+
if len(suffix) > 0 && shared.IsSimpleIdentifier(suffix) {
18+
return suffix, nil
19+
}
20+
return "", errors.Newf("expected format 'local <simple-identifier>' but got: %v", symbol)
21+
}
22+
23+
// ParsePartialSymbolV1ToBeDeleted parses an SCIP string into the Symbol message
24+
// with the option to exclude the `.Descriptor` field.
25+
//
26+
// Nov 30 2024: This is currently only present for benchmarking + compatibility
27+
// reasons. We can remove this in the future once we're confident that the new
28+
// parser handles everything correctly.
29+
func ParsePartialSymbolV1ToBeDeleted(symbol string, includeDescriptors bool) (*scip.Symbol, error) {
30+
// TODO: Rip out this, and call
31+
local, err := tryParseLocalSymbol(symbol)
32+
if err != nil {
33+
return nil, err
34+
}
35+
if local != "" {
36+
return newLocalSymbol(local), nil
37+
}
38+
s := newSymbolParser(symbol)
39+
scheme, err := s.acceptSpaceEscapedIdentifier("scheme")
40+
if err != nil {
41+
return nil, err
42+
}
43+
manager, err := s.acceptSpaceEscapedIdentifier("package manager")
44+
if err != nil {
45+
return nil, err
46+
}
47+
if manager == "." {
48+
manager = ""
49+
}
50+
packageName, err := s.acceptSpaceEscapedIdentifier("package name")
51+
if err != nil {
52+
return nil, err
53+
}
54+
if packageName == "." {
55+
packageName = ""
56+
}
57+
packageVersion, err := s.acceptSpaceEscapedIdentifier("package version")
58+
if err != nil {
59+
return nil, err
60+
}
61+
if packageVersion == "." {
62+
packageVersion = ""
63+
}
64+
var descriptors []*scip.Descriptor
65+
if includeDescriptors {
66+
descriptors, err = s.parseDescriptors()
67+
}
68+
return &scip.Symbol{
69+
Scheme: scheme,
70+
Package: &scip.Package{
71+
Manager: manager,
72+
Name: packageName,
73+
Version: packageVersion,
74+
},
75+
Descriptors: descriptors,
76+
}, err
77+
}
78+
79+
func newLocalSymbol(id string) *scip.Symbol {
80+
return &scip.Symbol{
81+
Scheme: "local",
82+
Descriptors: []*scip.Descriptor{
83+
{Name: id, Suffix: scip.Descriptor_Local},
84+
},
85+
}
86+
}
87+
88+
type symbolParser struct {
89+
Symbol []rune
90+
index int
91+
SymbolString string
92+
}
93+
94+
func newSymbolParser(symbol string) *symbolParser {
95+
return &symbolParser{
96+
SymbolString: symbol,
97+
Symbol: []rune(symbol),
98+
index: 0,
99+
}
100+
}
101+
102+
func (s *symbolParser) error(message string) error {
103+
return errors.Newf("%s\n%s\n%s^", message, s.SymbolString, strings.Repeat("_", s.index))
104+
}
105+
106+
func (s *symbolParser) current() rune {
107+
if s.index < len(s.Symbol) {
108+
return s.Symbol[s.index]
109+
}
110+
return '\x00'
111+
}
112+
113+
func (s *symbolParser) peekNext() rune {
114+
if s.index+1 < len(s.Symbol) {
115+
return s.Symbol[s.index]
116+
}
117+
return 0
118+
}
119+
120+
func (s *symbolParser) parseDescriptors() ([]*scip.Descriptor, error) {
121+
var result []*scip.Descriptor
122+
for s.index < len(s.Symbol) {
123+
descriptor, err := s.parseDescriptor()
124+
if err != nil {
125+
return nil, err
126+
}
127+
result = append(result, descriptor)
128+
}
129+
return result, nil
130+
}
131+
132+
func (s *symbolParser) parseDescriptor() (*scip.Descriptor, error) {
133+
start := s.index
134+
switch s.peekNext() {
135+
case '(':
136+
s.index++
137+
name, err := s.acceptIdentifier("parameter name")
138+
if err != nil {
139+
return nil, err
140+
}
141+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Parameter}, s.acceptCharacter(')', "closing parameter name")
142+
case '[':
143+
s.index++
144+
name, err := s.acceptIdentifier("type parameter name")
145+
if err != nil {
146+
return nil, err
147+
}
148+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_TypeParameter}, s.acceptCharacter(']', "closing type parameter name")
149+
default:
150+
name, err := s.acceptIdentifier("descriptor name")
151+
if err != nil {
152+
return nil, err
153+
}
154+
suffix := s.current()
155+
s.index++
156+
switch suffix {
157+
case '(':
158+
disambiguator := ""
159+
if s.peekNext() != ')' {
160+
disambiguator, err = s.acceptIdentifier("method disambiguator")
161+
if err != nil {
162+
return nil, err
163+
}
164+
}
165+
err = s.acceptCharacter(')', "closing method")
166+
if err != nil {
167+
return nil, err
168+
}
169+
return &scip.Descriptor{Name: name, Disambiguator: disambiguator, Suffix: scip.Descriptor_Method}, s.acceptCharacter('.', "closing method")
170+
case '/':
171+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Namespace}, nil
172+
case '.':
173+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Term}, nil
174+
case '#':
175+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Type}, nil
176+
case ':':
177+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Meta}, nil
178+
case '!':
179+
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Macro}, nil
180+
default:
181+
}
182+
}
183+
184+
end := s.index
185+
if s.index > len(s.Symbol) {
186+
end = len(s.Symbol)
187+
}
188+
return nil, errors.Newf("unrecognized descriptor %q", string(s.Symbol[start:end]))
189+
}
190+
191+
func (s *symbolParser) acceptIdentifier(what string) (string, error) {
192+
if s.current() == '`' {
193+
s.index++
194+
return s.acceptBacktickEscapedIdentifier(what)
195+
}
196+
start := s.index
197+
for s.index < len(s.Symbol) && shared.IsSimpleIdentifierCharacter(s.current()) {
198+
s.index++
199+
}
200+
if start == s.index {
201+
return "", s.error("empty identifier")
202+
}
203+
return string(s.Symbol[start:s.index]), nil
204+
}
205+
206+
func (s *symbolParser) acceptSpaceEscapedIdentifier(what string) (string, error) {
207+
return s.acceptEscapedIdentifier(what, ' ')
208+
}
209+
210+
func (s *symbolParser) acceptBacktickEscapedIdentifier(what string) (string, error) {
211+
return s.acceptEscapedIdentifier(what, '`')
212+
}
213+
214+
func (s *symbolParser) acceptEscapedIdentifier(what string, escapeCharacter rune) (string, error) {
215+
builder := strings.Builder{}
216+
for s.index < len(s.Symbol) {
217+
ch := s.current()
218+
if ch == escapeCharacter {
219+
s.index++
220+
if s.index >= len(s.Symbol) {
221+
break
222+
}
223+
if s.current() == escapeCharacter {
224+
// Escaped space character.
225+
builder.WriteRune(ch)
226+
} else {
227+
return builder.String(), nil
228+
}
229+
} else {
230+
builder.WriteRune(ch)
231+
}
232+
s.index++
233+
}
234+
return "", s.error(fmt.Sprintf("reached end of symbol while parsing <%s>, expected a '%v' character", what, string(escapeCharacter)))
235+
}
236+
237+
func (s *symbolParser) acceptCharacter(r rune, what string) error {
238+
if s.current() == r {
239+
s.index++
240+
return nil
241+
}
242+
return s.error(fmt.Sprintf("expected '%v', obtained '%v', while parsing %v", string(r), string(s.current()), what))
243+
}

0 commit comments

Comments
 (0)