
Commit 6c29cf0

Allow code search by filename
Signed-off-by: Bruno Sofiato <[email protected]>
1 parent: fa35ace

33 files changed, +648 −40 lines changed


models/fixtures/repository.yml

Lines changed: 31 additions & 0 deletions
@@ -1768,3 +1768,34 @@
   size: 0
   is_fsck_enabled: true
   close_issues_via_commit_in_any_branch: false
+
+-
+  id: 62
+  owner_id: 42
+  owner_name: org42
+  lower_name: search-by-path
+  name: search-by-path
+  default_branch: master
+  num_watches: 0
+  num_stars: 0
+  num_forks: 0
+  num_issues: 0
+  num_closed_issues: 0
+  num_pulls: 1
+  num_closed_pulls: 0
+  num_milestones: 0
+  num_closed_milestones: 0
+  num_projects: 0
+  num_closed_projects: 0
+  is_private: false
+  is_empty: false
+  is_archived: false
+  is_mirror: false
+  status: 0
+  is_fork: false
+  fork_id: 0
+  is_template: false
+  template_id: 0
+  size: 0
+  is_fsck_enabled: true
+  close_issues_via_commit_in_any_branch: false

models/fixtures/user.yml

Lines changed: 37 additions & 0 deletions
@@ -1517,3 +1517,40 @@
   repo_admin_change_team_access: false
   theme: ""
   keep_activity_private: false
+
+-
+  id: 42
+  lower_name: org42
+  name: org42
+  full_name: Org42
+  email: [email protected]
+  keep_email_private: false
+  email_notifications_preference: onmention
+  passwd: ZogKvWdyEx:password
+  passwd_hash_algo: dummy
+  must_change_password: false
+  login_source: 0
+  login_name: org42
+  type: 1
+  salt: ZogKvWdyEx
+  max_repo_creation: -1
+  is_active: false
+  is_admin: false
+  is_restricted: false
+  allow_git_hook: false
+  allow_import_local: false
+  allow_create_organization: true
+  prohibit_login: false
+  avatar: avatar42
+  avatar_email: [email protected]
+  use_custom_avatar: false
+  num_followers: 0
+  num_following: 0
+  num_stars: 0
+  num_repos: 1
+  num_teams: 2
+  num_members: 3
+  visibility: 0
+  repo_admin_change_team_access: false
+  theme: ""
+  keep_activity_private: false

modules/indexer/code/bleve/bleve.go

Lines changed: 37 additions & 7 deletions
@@ -17,6 +17,7 @@ import (
     "code.gitea.io/gitea/modules/charset"
     "code.gitea.io/gitea/modules/git"
     "code.gitea.io/gitea/modules/gitrepo"
+    path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
     "code.gitea.io/gitea/modules/indexer/code/internal"
     indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
     inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"

@@ -53,6 +54,7 @@ type RepoIndexerData struct {
     RepoID    int64
     CommitID  string
     Content   string
+    Filename  string
     Language  string
     UpdatedAt time.Time
 }

@@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {

 const (
     repoIndexerAnalyzer      = "repoIndexerAnalyzer"
+    filenameIndexerAnalyzer  = "filenameIndexerAnalyzer"
+    filenameIndexerTokenizer = "filenameIndexerTokenizer"
     repoIndexerDocType       = "repoIndexerDocType"
-    repoIndexerLatestVersion = 6
+    repoIndexerLatestVersion = 7
 )

 // generateBleveIndexMapping generates a bleve index mapping for the repo indexer

@@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
     textFieldMapping.IncludeInAll = false
     docMapping.AddFieldMappingsAt("Content", textFieldMapping)

+    fileNamedMapping := bleve.NewTextFieldMapping()
+    fileNamedMapping.IncludeInAll = false
+    fileNamedMapping.Analyzer = filenameIndexerAnalyzer
+    docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
+
     termFieldMapping := bleve.NewTextFieldMapping()
     termFieldMapping.IncludeInAll = false
     termFieldMapping.Analyzer = analyzer_keyword.Name

@@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
     docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)

     mapping := bleve.NewIndexMapping()
+
     if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
         return nil, err
     } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{

@@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
     }); err != nil {
         return nil, err
     }
+
+    if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
+        "type":          analyzer_custom.Name,
+        "char_filters":  []string{},
+        "tokenizer":     unicode.Name,
+        "token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
+    }); err != nil {
+        return nil, err
+    }
+
     mapping.DefaultAnalyzer = repoIndexerAnalyzer
     mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
     mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())

@@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
     return batch.Index(id, &RepoIndexerData{
         RepoID:    repo.ID,
         CommitID:  commitSha,
+        Filename:  update.Filename,
         Content:   string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
         Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
         UpdatedAt: time.Now().UTC(),

@@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
         keywordQuery query.Query
     )

-    phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
-    phraseQuery.FieldVal = "Content"
-    phraseQuery.Analyzer = repoIndexerAnalyzer
-    keywordQuery = phraseQuery
+    pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
+    pathQuery.FieldVal = "Filename"
+    pathQuery.SetBoost(10)
+
+    contentQuery := bleve.NewMatchQuery(opts.Keyword)
+    contentQuery.FieldVal = "Content"
+
     if opts.IsKeywordFuzzy {
-        phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+        contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
     }

+    keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
+
     if len(opts.RepoIDs) > 0 {
         repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
         for _, repoID := range opts.RepoIDs {

@@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int

     from, pageSize := opts.GetSkipTake()
     searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
-    searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+    searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
     searchRequest.IncludeLocations = true

     if len(opts.Language) == 0 {

@@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
                 endIndex = locationEnd
             }
         }
+        if len(hit.Locations["Filename"]) > 0 {
+            startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
+        }
+
         language := hit.Fields["Language"].(string)
         var updatedUnix timeutil.TimeStamp
         if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
New file under modules/indexer/code/bleve/token/path/

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+    "slices"
+    "strings"
+
+    "github.com/blevesearch/bleve/v2/analysis"
+    "github.com/blevesearch/bleve/v2/registry"
+)
+
+const (
+    Name = "gitea/path"
+)
+
+type TokenFilter struct{}
+
+func NewTokenFilter() *TokenFilter {
+    return &TokenFilter{}
+}
+
+func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
+    return NewTokenFilter(), nil
+}
+
+func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+    if len(input) == 1 {
+        // if there is only one token, we dont need to generate the reversed chain
+        return generatePathTokens(input, false)
+    }
+
+    normal := generatePathTokens(input, false)
+    reversed := generatePathTokens(input, true)
+
+    return append(normal, reversed...)
+}
+
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+    terms := make([]string, 0, len(input))
+    longestTerm := 0
+
+    if reversed {
+        slices.Reverse(input)
+    }
+
+    for i := 0; i < len(input); i++ {
+        var sb strings.Builder
+        sb.WriteString(string(input[0].Term))
+
+        for j := 1; j < i; j++ {
+            sb.WriteString("/")
+            sb.WriteString(string(input[j].Term))
+        }
+
+        term := sb.String()
+
+        if longestTerm < len(term) {
+            longestTerm = len(term)
+        }
+
+        terms = append(terms, term)
+    }
+
+    output := make(analysis.TokenStream, 0, len(terms))
+
+    for _, term := range terms {
+        var start, end int
+
+        if reversed {
+            start = 0
+            end = len(term)
+        } else {
+            start = longestTerm - len(term)
+            end = longestTerm
+        }
+
+        token := analysis.Token{
+            Position: 1,
+            Start:    start,
+            End:      end,
+            Type:     analysis.AlphaNumeric,
+            Term:     []byte(term),
+        }
+
+        output = append(output, &token)
+    }
+
+    return output
+}
+
+func init() {
+    registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
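
As a quick illustration of how this filter is meant to be used, the sketch below (not part of the commit) pushes one made-up path through the same unicode tokenizer and path token filter that filenameIndexerAnalyzer combines, then prints the emitted terms. The full analyzer additionally applies the unicode-normalize and lowercase filters, which are omitted here, and the input path and import alias are assumptions for the example.

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"

    path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
)

func main() {
    // Tokenize an example path the same way the custom analyzer does,
    // then expand it into hierarchical terms with the new filter.
    tokens := unicode.NewUnicodeTokenizer().Tokenize([]byte("modules/indexer/code/bleve/bleve.go"))
    for _, tok := range path_filter.NewTokenFilter().Filter(tokens) {
        fmt.Printf("%q\n", tok.Term) // prefix chains of the path, plus reversed chains
    }
}
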
New test file under modules/indexer/code/bleve/token/path/

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+// Copyright 2019 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+    "fmt"
+    "testing"
+
+    "github.com/blevesearch/bleve/v2/analysis"
+    "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+    "github.com/stretchr/testify/assert"
+)
+
+type Scenario struct {
+    Input  string
+    Tokens []string
+}
+
+func TestTokenFilter(t *testing.T) {
+    scenarios := []struct {
+        Input string
+        Terms []string
+    }{
+        {
+            Input: "Dockerfile",
+            Terms: []string{"Dockerfile"},
+        },
+        {
+            Input: "Dockerfile.rootless",
+            Terms: []string{"Dockerfile.rootless"},
+        },
+        {
+            Input: "a/b/c/Dockerfile.rootless",
+            Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
+        },
+        {
+            Input: "",
+            Terms: []string{},
+        },
+    }
+
+    for _, scenario := range scenarios {
+        t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
+            terms := extractTerms(scenario.Input)
+
+            assert.Len(t, terms, len(scenario.Terms))
+
+            for _, term := range terms {
+                assert.Contains(t, scenario.Terms, term)
+            }
+        })
+    }
+}
+
+func extractTerms(input string) []string {
+    tokens := tokenize(input)
+    filteredTokens := filter(tokens)
+    terms := make([]string, 0, len(filteredTokens))
+
+    for _, token := range filteredTokens {
+        terms = append(terms, string(token.Term))
+    }
+
+    return terms
+}
+
+func filter(input analysis.TokenStream) analysis.TokenStream {
+    filter := NewTokenFilter()
+    return filter.Filter(input)
+}
+
+func tokenize(input string) analysis.TokenStream {
+    tokenizer := unicode.NewUnicodeTokenizer()
+    return tokenizer.Tokenize([]byte(input))
+}
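
Assuming the two new files live under modules/indexer/code/bleve/token/path/ (the directory the new import in bleve.go points at), the scenarios above can be run from the repository root with:

    go test ./modules/indexer/code/bleve/token/path/ -run TestTokenFilter -v
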
