Skip to content

Commit d7dfe3b

Browse files
Harshil Goel and matthewmcneely authored
feat(core): Add a new string index: n-gram (#9463)
Co-authored-by: mattthew <[email protected]>
1 parent c2ccd6e commit d7dfe3b

File tree

83 files changed

+2312
-980
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+2312
-980
lines changed

dgraph/cmd/alpha/dashboard.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@ func keywordHandler(w http.ResponseWriter, r *http.Request) {
8585
"min",
8686
"mutation",
8787
"near",
88+
"ngram",
8889
"not",
8990
"offset",
9091
"or",

dgraphtest/image.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -154,6 +154,7 @@ func buildDgraphBinary(dir, binaryDir, version string) error {
154154

155155
cmd := exec.Command("make", "dgraph")
156156
cmd.Dir = filepath.Join(dir, "dgraph")
157+
cmd.Env = append(os.Environ(), "GOOS=linux", "GOARCH=amd64")
157158
if out, err := cmd.CombinedOutput(); err != nil {
158159
return errors.Wrapf(err, "error while building dgraph binary\noutput:%v", string(out))
159160
}

dql/parser.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1700,7 +1700,7 @@ func validFuncName(name string) bool {
17001700
}
17011701

17021702
switch name {
1703-
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext",
1703+
case "regexp", "anyofterms", "allofterms", "alloftext", "anyoftext", "ngram",
17041704
"has", "uid", "uid_in", "anyof", "allof", "type", "match", "similar_to":
17051705
return true
17061706
}

graphql/e2e/common/common.go

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,13 @@ type movie struct {
196196
Director []*director `json:"moviedirector,omitempty"`
197197
}
198198

199+
// article is the Go-side result type for the ngram e2e tests. Its JSON tags
// (id, title, summaries) are the field names the queryArticle responses are
// decoded from.
200+
type article struct {
201+
ID string `json:"id,omitempty"`
202+
Title string `json:"title,omitempty"`
203+
Summaries []string `json:"summaries,omitempty"`
204+
}
205+
199206
type director struct {
200207
ID string `json:"id,omitempty"`
201208
Name string `json:"name,omitempty"`
@@ -818,6 +825,13 @@ func RunAll(t *testing.T) {
818825
t.Run("enum filter", enumFilter)
819826
t.Run("default enum filter", defaultEnumFilter)
820827
t.Run("query by multiple invalid ids", queryByMultipleInvalidIds)
828+
t.Run("ngram filters", ngramFilters)
829+
t.Run("ngram stemming", ngramStemming)
830+
t.Run("ngram stop words", ngramStopWords)
831+
t.Run("ngram array fields", ngramArrayFields)
832+
t.Run("ngram case insensitive", ngramCaseInsensitive)
833+
t.Run("ngram compound words", ngramCompoundWords)
834+
t.Run("ngram linguistic variations", ngramLinguisticVariations)
821835
t.Run("query typename", queryTypename)
822836
t.Run("query nested typename", queryNestedTypename)
823837
t.Run("typename for interface", typenameForInterface)

graphql/e2e/common/query.go

Lines changed: 233 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4093,3 +4093,236 @@ func queryWithIDFieldAndInterfaceArg(t *testing.T) {
40934093
// Cleanup
40944094
DeleteGqlType(t, "LibraryMember", map[string]interface{}{}, 1, nil)
40954095
}
4096+
4097+
func ngramFilters(t *testing.T) {
4098+
// Test 1: Single word queries
4099+
queryArticleByTitle(t, "Running", []*article{
4100+
{Title: "Running for Cardiovascular Health"},
4101+
})
4102+
4103+
queryArticleByTitle(t, "Training", []*article{
4104+
{Title: "Weight Training and Muscle Building"},
4105+
})
4106+
4107+
queryArticleByTitle(t, "Nutrition", []*article{
4108+
{Title: "Nutrition and Healthy Eating Habits"},
4109+
})
4110+
4111+
// Test 2: Two word queries
4112+
queryArticleByTitle(t, "Cardiovascular Health", []*article{
4113+
{Title: "Running for Cardiovascular Health"},
4114+
})
4115+
4116+
queryArticleByTitle(t, "Muscle Building", []*article{
4117+
{Title: "Weight Training and Muscle Building"},
4118+
})
4119+
4120+
queryArticleByTitle(t, "Eating Habits", []*article{
4121+
{Title: "Nutrition and Healthy Eating Habits"},
4122+
})
4123+
4124+
// Test 3: Three word queries
4125+
queryArticleByTitle(t, "Weight Training and", []*article{
4126+
{Title: "Weight Training and Muscle Building"},
4127+
})
4128+
}
4129+
4130+
func ngramStemming(t *testing.T) {
4131+
// Test stemming with single words
4132+
queryArticleByTitle(t, "run", []*article{
4133+
{Title: "Running for Cardiovascular Health"},
4134+
})
4135+
4136+
queryArticleByTitle(t, "train", []*article{
4137+
{Title: "Weight Training and Muscle Building"},
4138+
})
4139+
4140+
queryArticleByTitle(t, "eat", []*article{
4141+
{Title: "Nutrition and Healthy Eating Habits"},
4142+
})
4143+
4144+
// Test stemming in array field
4145+
queryArticleBySummaries(t, "strengthen", []*article{
4146+
{Title: "Running for Cardiovascular Health"},
4147+
})
4148+
4149+
queryArticleBySummaries(t, "build", []*article{
4150+
{Title: "Weight Training and Muscle Building"},
4151+
})
4152+
4153+
queryArticleBySummaries(t, "maintain", []*article{
4154+
{Title: "Nutrition and Healthy Eating Habits"},
4155+
})
4156+
}
4157+
4158+
func ngramStopWords(t *testing.T) {
4159+
// Test stop words in two word queries
4160+
queryArticleByTitle(t, "for Health", []*article{
4161+
{Title: "Running for Cardiovascular Health"},
4162+
})
4163+
4164+
queryArticleByTitle(t, "and Muscle", []*article{
4165+
{Title: "Weight Training and Muscle Building"},
4166+
})
4167+
4168+
queryArticleByTitle(t, "and Healthy", []*article{
4169+
{Title: "Nutrition and Healthy Eating Habits"},
4170+
})
4171+
4172+
// Test stop words in summaries with longer phrases
4173+
queryArticleBySummaries(t, "is the best", []*article{
4174+
{Title: "Running for Cardiovascular Health"},
4175+
})
4176+
4177+
queryArticleBySummaries(t, "for beginners and", []*article{
4178+
{Title: "Weight Training and Muscle Building"},
4179+
})
4180+
4181+
queryArticleBySummaries(t, "that support long-term", []*article{
4182+
{Title: "Nutrition and Healthy Eating Habits"},
4183+
})
4184+
}
4185+
4186+
func ngramCaseInsensitive(t *testing.T) {
4187+
// Test case insensitivity in titles
4188+
queryArticleByTitle(t, "RUNNING", []*article{
4189+
{Title: "Running for Cardiovascular Health"},
4190+
})
4191+
4192+
queryArticleByTitle(t, "weight training", []*article{
4193+
{Title: "Weight Training and Muscle Building"},
4194+
})
4195+
4196+
queryArticleByTitle(t, "NUTRITION AND HEALTHY", []*article{
4197+
{Title: "Nutrition and Healthy Eating Habits"},
4198+
})
4199+
4200+
// Test case insensitivity in summaries
4201+
queryArticleBySummaries(t, "CARDIOVASCULAR BENEFITS", []*article{
4202+
{Title: "Running for Cardiovascular Health"},
4203+
})
4204+
4205+
queryArticleBySummaries(t, "strength training", []*article{
4206+
{Title: "Weight Training and Muscle Building"},
4207+
})
4208+
}
4209+
4210+
func ngramCompoundWords(t *testing.T) {
4211+
// Test compound words and hyphenated terms
4212+
queryArticleBySummaries(t, "long-term", []*article{
4213+
{Title: "Nutrition and Healthy Eating Habits"},
4214+
})
4215+
4216+
// Test compound concepts
4217+
queryArticleByTitle(t, "Cardiovascular", []*article{
4218+
{Title: "Running for Cardiovascular Health"},
4219+
})
4220+
4221+
// Test multi-word technical terms
4222+
queryArticleBySummaries(t, "bone density", []*article{
4223+
{Title: "Weight Training and Muscle Building"},
4224+
})
4225+
4226+
queryArticleBySummaries(t, "heart health", []*article{
4227+
{Title: "Running for Cardiovascular Health"},
4228+
})
4229+
}
4230+
4231+
func ngramArrayFields(t *testing.T) {
4232+
// Test ngram search specifically on array field (summaries)
4233+
queryArticleBySummaries(t, "exercise", []*article{
4234+
{Title: "Running for Cardiovascular Health"},
4235+
})
4236+
4237+
queryArticleBySummaries(t, "muscle mass", []*article{
4238+
{Title: "Weight Training and Muscle Building"},
4239+
})
4240+
4241+
queryArticleBySummaries(t, "optimal performance", []*article{
4242+
{Title: "Nutrition and Healthy Eating Habits"},
4243+
})
4244+
4245+
// Test array field with multiple word phrases
4246+
queryArticleBySummaries(t, "progressive resistance", []*article{
4247+
{Title: "Weight Training and Muscle Building"},
4248+
})
4249+
}
4250+
4251+
func ngramLinguisticVariations(t *testing.T) {
4252+
// Test negative cases - should return empty results
4253+
queryArticleByTitle(t, "nonexistent", []*article{})
4254+
queryArticleBySummaries(t, "swimming", []*article{})
4255+
4256+
// Test partial word boundaries
4257+
queryArticleBySummaries(t, "wellness", []*article{
4258+
{Title: "Nutrition and Healthy Eating Habits"},
4259+
})
4260+
4261+
// Test technical terminology variations
4262+
queryArticleBySummaries(t, "nutritional guideline", []*article{
4263+
{Title: "Nutrition and Healthy Eating Habits"},
4264+
})
4265+
4266+
// Test cross-phrase matching in array field
4267+
queryArticleBySummaries(t, "advanced athletes", []*article{
4268+
{Title: "Weight Training and Muscle Building"},
4269+
})
4270+
4271+
// Test punctuation variations
4272+
queryArticleBySummaries(t, " beginners advanced", []*article{
4273+
{Title: "Weight Training and Muscle Building"},
4274+
})
4275+
queryArticleBySummaries(t, "eat nutrition healthy", []*article{
4276+
{Title: "Nutrition and Healthy Eating Habits"},
4277+
})
4278+
}
4279+
4280+
func queryArticleByTitle(t *testing.T, title string, expectedArticles []*article) {
4281+
getArticleParams := &GraphQLParams{
4282+
Query: `query queryArticle($title: String!) {
4283+
queryArticle(filter: { title: { ngram: $title } }) {
4284+
title
4285+
}
4286+
}`,
4287+
Variables: map[string]interface{}{"title": title},
4288+
}
4289+
4290+
gqlResponse := getArticleParams.ExecuteAsPost(t, GraphqlURL)
4291+
RequireNoGQLErrors(t, gqlResponse)
4292+
4293+
var result struct {
4294+
QueryArticle []*article
4295+
}
4296+
err := json.Unmarshal(gqlResponse.Data, &result)
4297+
require.NoError(t, err)
4298+
4299+
opt := cmpopts.IgnoreFields(article{}, "ID")
4300+
if diff := cmp.Diff(expectedArticles, result.QueryArticle, opt); diff != "" {
4301+
t.Errorf("result mismatch (-want +got):\n%s", diff)
4302+
}
4303+
}
4304+
4305+
func queryArticleBySummaries(t *testing.T, summaries string, expectedArticles []*article) {
4306+
getArticleParams := &GraphQLParams{
4307+
Query: `query queryArticle($summaries: String!) {
4308+
queryArticle(filter: { summaries: { ngram: $summaries } }) {
4309+
title
4310+
}
4311+
}`,
4312+
Variables: map[string]interface{}{"summaries": summaries},
4313+
}
4314+
4315+
gqlResponse := getArticleParams.ExecuteAsPost(t, GraphqlURL)
4316+
RequireNoGQLErrors(t, gqlResponse)
4317+
4318+
var result struct {
4319+
QueryArticle []*article
4320+
}
4321+
err := json.Unmarshal(gqlResponse.Data, &result)
4322+
require.NoError(t, err)
4323+
4324+
opt := cmpopts.IgnoreFields(article{}, "ID")
4325+
if diff := cmp.Diff(expectedArticles, result.QueryArticle, opt); diff != "" {
4326+
t.Errorf("result mismatch (-want +got):\n%s", diff)
4327+
}
4328+
}

graphql/e2e/directives/schema.graphql

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -424,3 +424,9 @@ type LibraryManager {
424424
name: String! @id
425425
manages: [LibraryMember]
426426
}
427+
428+
# Article carries an ngram search index on title and on each entry of the
# summaries list; title additionally has a regexp index.
type Article {
429+
id: ID!
430+
title: String! @search(by: [ngram, regexp])
431+
summaries: [String] @search(by: [ngram])
432+
}

0 commit comments

Comments
 (0)