Skip to content

Commit e46095c

Browse files
authored
add ability to override bm25 relevance params (#31675)
1 parent d72dc37 commit e46095c

File tree

9 files changed

+470
-42
lines changed

9 files changed

+470
-42
lines changed

ydb/core/kqp/common/kqp_yql.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@ TKqpReadTableFullTextIndexSettings TKqpReadTableFullTextIndexSettings::Parse(con
217217
} else if (name == TKqpReadTableFullTextIndexSettings::SkipLimitSettingName) {
218218
YQL_ENSURE(tuple.Value().IsValid());
219219
settings.SkipLimit = tuple.Value().Cast().Ptr();
220+
} else if (name == TKqpReadTableFullTextIndexSettings::BFactorSettingName) {
221+
YQL_ENSURE(tuple.Value().IsValid());
222+
settings.BFactor = tuple.Value().Cast().Ptr();
223+
} else if (name == TKqpReadTableFullTextIndexSettings::K1FactorSettingName) {
224+
YQL_ENSURE(tuple.Value().IsValid());
225+
settings.K1Factor = tuple.Value().Cast().Ptr();
220226
} else {
221227
YQL_ENSURE(false, "Unknown KqpReadTableFullTextIndex setting name '" << name << "'");
222228
}
@@ -243,6 +249,20 @@ NNodes::TCoNameValueTupleList TKqpReadTableFullTextIndexSettings::BuildNode(TExp
243249
.Done());
244250
}
245251

252+
if (BFactor) {
253+
settings.emplace_back(Build<TCoNameValueTuple>(ctx, pos)
254+
.Name().Build(BFactorSettingName)
255+
.Value(BFactor)
256+
.Done());
257+
}
258+
259+
if (K1Factor) {
260+
settings.emplace_back(Build<TCoNameValueTuple>(ctx, pos)
261+
.Name().Build(K1FactorSettingName)
262+
.Value(K1Factor)
263+
.Done());
264+
}
265+
246266
return Build<TCoNameValueTupleList>(ctx, pos)
247267
.Add(settings)
248268
.Done();

ydb/core/kqp/common/kqp_yql.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,17 @@ struct TKqpReadTableFullTextIndexSettings: public TSortingOperator<ERequestSorti
130130
public:
131131
static constexpr TStringBuf ItemsLimitSettingName = "ItemsLimit";
132132
static constexpr TStringBuf SkipLimitSettingName = "SkipLimit";
133-
133+
static constexpr TStringBuf BFactorSettingName = "BFactor";
134+
static constexpr TStringBuf K1FactorSettingName = "K1Factor";
134135
TExprNode::TPtr ItemsLimit;
135136
TExprNode::TPtr SkipLimit;
137+
TExprNode::TPtr BFactor;
138+
TExprNode::TPtr K1Factor;
136139

137140
void SetItemsLimit(const TExprNode::TPtr& expr) { ItemsLimit = expr; }
138141
void SetSkipLimit(const TExprNode::TPtr& expr) { SkipLimit = expr; }
142+
void SetBFactor(const TExprNode::TPtr& expr) { BFactor = expr; }
143+
void SetK1Factor(const TExprNode::TPtr& expr) { K1Factor = expr; }
139144

140145
static TKqpReadTableFullTextIndexSettings Parse(const NNodes::TCoNameValueTupleList& node);
141146
NNodes::TCoNameValueTupleList BuildNode(TExprContext& ctx, TPositionHandle pos) const;

ydb/core/kqp/executer_actor/kqp_tasks_graph.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2595,6 +2595,25 @@ void TKqpTasksGraph::BuildFullTextScanTasksFromSource(TStageInfo& stageInfo, TQu
25952595
}
25962596
}
25972597

2598+
if (fullTextSource.HasBFactor()) {
2599+
auto value = ExtractPhyValue(
2600+
stageInfo, fullTextSource.GetBFactor(),
2601+
TxAlloc->HolderFactory, TxAlloc->TypeEnv, NUdf::TUnboxedValuePod());
2602+
2603+
if (value.HasValue()) {
2604+
settings->SetBFactor(value.Get<double>());
2605+
}
2606+
}
2607+
2608+
if (fullTextSource.HasK1Factor()) {
2609+
auto value = ExtractPhyValue(
2610+
stageInfo, fullTextSource.GetK1Factor(),
2611+
TxAlloc->HolderFactory, TxAlloc->TypeEnv, NUdf::TUnboxedValuePod());
2612+
if (value.HasValue()) {
2613+
settings->SetK1Factor(value.Get<double>());
2614+
}
2615+
}
2616+
25982617
for(const auto& column : fullTextSource.GetQuerySettings().GetColumns()) {
25992618
auto* protoColumn = settings->MutableQuerySettings()->AddColumns();
26002619
protoColumn->SetId(column.GetId());

ydb/core/kqp/opt/logical/kqp_opt_log_indexes.cpp

Lines changed: 201 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,38 +1136,210 @@ TReadMatch ExtractFullTextRead(const TExprBase& node, const TKqpOptimizeContext&
11361136
return read;
11371137
}
11381138

1139-
TMaybeNode<TCoApply> FindMatchingApply(const TExprBase& node) {
1140-
TMaybeNode<TCoApply> matchingApply;
1139+
struct TFullTextApplyParseResult {
1140+
TExprNode::TPtr Apply;
1141+
TExprNode::TPtr SearchColumn;
1142+
TExprNode::TPtr SearchQuery;
1143+
TExprNode::TPtr BFactor;
1144+
TExprNode::TPtr K1Factor;
1145+
TStringBuf MethodName;
1146+
1147+
TFullTextApplyParseResult()
1148+
{}
1149+
1150+
bool IsFullTextApply() {
1151+
return IsIn({"FullText.Contains", "FullText.Relevance", "FullText.ContainsUtf8", "FullText.RelevanceUtf8"}, MethodName);
1152+
}
1153+
1154+
bool IsRelevanceApply() {
1155+
return IsIn({"FullText.Relevance", "FullText.RelevanceUtf8"}, MethodName);
1156+
}
1157+
1158+
bool ValidateBFactor() {
1159+
if (!BFactor) {
1160+
return true;
1161+
}
1162+
1163+
if (!TExprBase(BFactor).Maybe<TCoJust>()) {
1164+
return false;
1165+
}
1166+
1167+
auto just = TExprBase(BFactor).Maybe<TCoJust>().Cast();
1168+
if (!just.Input().Maybe<TCoDouble>() && !just.Input().Maybe<TCoParameter>() && !just.Input().Maybe<TCoFloat>()) {
1169+
return false;
1170+
}
1171+
1172+
return true;
1173+
}
1174+
1175+
bool ValidateK1Factor() {
1176+
if (!K1Factor) {
1177+
return true;
1178+
}
1179+
1180+
if (!TExprBase(K1Factor).Maybe<TCoJust>()) {
1181+
return false;
1182+
}
1183+
1184+
auto just = TExprBase(K1Factor).Maybe<TCoJust>().Cast();
1185+
if (!just.Input().Maybe<TCoDouble>() && !just.Input().Maybe<TCoParameter>() && !just.Input().Maybe<TCoFloat>()) {
1186+
return false;
1187+
}
1188+
1189+
return true;
1190+
}
1191+
1192+
bool ValidateRequiredSettings() {
1193+
if (!TExprBase(SearchColumn).Maybe<TCoMember>()) {
1194+
return false;
1195+
}
1196+
1197+
1198+
if (!TExprBase(SearchQuery).Maybe<TCoString>() &&
1199+
!TExprBase(SearchQuery).Maybe<TCoAtom>() &&
1200+
!TExprBase(SearchQuery).Maybe<TCoParameter>())
1201+
{
1202+
return false;
1203+
}
1204+
1205+
return true;
1206+
}
1207+
1208+
TVector<TCoNameValueTuple> Settings(TExprContext& ctx, TPositionHandle pos) {
1209+
TVector<TCoNameValueTuple> settings;
1210+
if (BFactor) {
1211+
settings.push_back(Build<TCoNameValueTuple>(ctx, pos)
1212+
.Name<TCoAtom>()
1213+
.Value(TKqpReadTableFullTextIndexSettings::BFactorSettingName)
1214+
.Build()
1215+
.Value(BFactor)
1216+
.Done());
1217+
}
1218+
if (K1Factor) {
1219+
settings.push_back(Build<TCoNameValueTuple>(ctx, pos)
1220+
.Name<TCoAtom>()
1221+
.Value(TKqpReadTableFullTextIndexSettings::K1FactorSettingName)
1222+
.Build()
1223+
.Value(K1Factor)
1224+
.Done());
1225+
}
1226+
return settings;
1227+
}
1228+
};
1229+
1230+
TFullTextApplyParseResult FindMatchingApply(const TExprBase& node, TExprContext& ctx) {
1231+
1232+
TFullTextApplyParseResult result;
1233+
11411234
VisitExpr(node.Ptr(), [&] (const TExprNode::TPtr& expr) {
1235+
if (expr->Content() == "NamedApply") {
1236+
if (!EnsureMinArgsCount(*expr, 3, ctx)) {
1237+
return false;
1238+
}
1239+
1240+
auto callable = expr->Child(0);
1241+
if (!EnsureCallable(*callable, ctx)) {
1242+
return false;
1243+
}
1244+
1245+
if (TCoUdf::Match(callable)) {
1246+
auto udf = TExprBase(callable).Cast<TCoUdf>();
1247+
result.MethodName = udf.MethodName().Value();
1248+
if (!result.IsFullTextApply()) {
1249+
return false;
1250+
}
1251+
}
1252+
1253+
if (!EnsureTuple(*expr->Child(1), ctx)) {
1254+
return false;
1255+
}
1256+
1257+
auto args = expr->Child(1);
1258+
if (args->ChildrenSize() != 2) {
1259+
return false;
1260+
}
1261+
1262+
result.SearchColumn = args->Child(0);
1263+
result.SearchQuery = args->Child(1);
1264+
if (!result.ValidateRequiredSettings()) {
1265+
return false;
1266+
}
1267+
1268+
if (expr->Child(2)->Content() != "AsStruct") {
1269+
return false;
1270+
}
1271+
1272+
auto namedApplyArgs = expr->Child(2);
1273+
for(auto& arg : namedApplyArgs->Children()) {
1274+
if (!TExprBase(arg).Maybe<TCoNameValueTuple>()) {
1275+
return false;
1276+
}
1277+
1278+
auto nameValueTuple = TExprBase(arg).Cast<TCoNameValueTuple>();
1279+
if (nameValueTuple.Name().StringValue() == "B") {
1280+
result.BFactor = nameValueTuple.Value().Cast().Ptr();
1281+
}
1282+
1283+
if (nameValueTuple.Name().StringValue() == "K1") {
1284+
result.K1Factor = nameValueTuple.Value().Cast().Ptr();
1285+
}
1286+
}
1287+
1288+
if (!result.ValidateBFactor()) {
1289+
return false;
1290+
}
1291+
1292+
if (!result.ValidateK1Factor()) {
1293+
return false;
1294+
}
1295+
1296+
result.Apply = expr;
1297+
return false;
1298+
}
1299+
11421300
if (TCoApply::Match(expr.Get())) {
11431301
auto apply = TExprBase(expr).Cast<TCoApply>();
1144-
if (!apply.Callable().Maybe<TCoUdf>() || apply.Args().Count() != 3) {
1302+
if (!apply.Callable().Maybe<TCoUdf>()) {
11451303
return false;
11461304
}
11471305

1148-
if (!apply.Args().Get(1).Maybe<TCoMember>()) {
1306+
if (apply.Args().Count() < 3 || apply.Args().Count() > 5) {
11491307
return false;
11501308
}
11511309

1152-
if (!apply.Args().Get(2).Maybe<TCoString>() &&
1153-
!apply.Args().Get(2).Maybe<TCoAtom>() &&
1154-
!apply.Args().Get(2).Maybe<TCoParameter>())
1155-
{
1310+
result.SearchQuery = apply.Args().Get(2).Ptr();
1311+
result.SearchColumn = apply.Args().Get(1).Ptr();
1312+
if (!result.ValidateRequiredSettings()) {
11561313
return false;
11571314
}
11581315

1316+
if (apply.Args().Count() >= 4) {
1317+
result.BFactor = apply.Args().Get(3).Ptr();
1318+
if (!result.ValidateBFactor()) {
1319+
return false;
1320+
}
1321+
}
1322+
1323+
if (apply.Args().Count() == 5) {
1324+
result.K1Factor = apply.Args().Get(4).Ptr();
1325+
if (!result.ValidateK1Factor()) {
1326+
return false;
1327+
}
1328+
}
1329+
11591330
auto udf = apply.Callable().Maybe<TCoUdf>().Cast();
1160-
if (IsIn({"FullText.Contains", "FullText.Relevance", "FullText.ContainsUtf8", "FullText.RelevanceUtf8"}, udf.MethodName().Value())) {
1161-
matchingApply = apply;
1331+
result.MethodName = udf.MethodName().Value();
1332+
if (!result.IsFullTextApply()) {
11621333
return false;
11631334
}
1164-
1335+
result.Apply = apply.Ptr();
1336+
return false;
11651337
}
11661338

11671339
return true;
11681340
});
11691341

1170-
return matchingApply;
1342+
return result;
11711343
}
11721344

11731345

@@ -1195,15 +1367,13 @@ TExprBase KqpRewriteFlatMapOverFullTextRelevance(const NYql::NNodes::TExprBase&
11951367
return node;
11961368
}
11971369

1198-
auto maybeApply = FindMatchingApply(topSort.KeySelectorLambda().Body());
1199-
if (!maybeApply.IsValid()) {
1370+
auto result = FindMatchingApply(topSort.KeySelectorLambda().Body(), ctx);
1371+
if (!result.Apply) {
12001372
return node;
12011373
}
1202-
auto apply = maybeApply.Cast();
12031374

1204-
auto args = apply.Args();
1205-
auto searchQuery = args.Get(2);
1206-
auto searchColumn = args.Get(1).Maybe<TCoMember>().Cast();
1375+
auto searchQuery = TExprBase(result.SearchQuery);
1376+
auto searchColumn = TExprBase(result.SearchColumn).Maybe<TCoMember>().Cast();
12071377

12081378
auto searchColumns = Build<TCoAtomList>(ctx, node.Pos())
12091379
.Add(Build<TCoAtom>(ctx, node.Pos())
@@ -1224,10 +1394,10 @@ TExprBase KqpRewriteFlatMapOverFullTextRelevance(const NYql::NNodes::TExprBase&
12241394
.Add(resultColumnsVector)
12251395
.Done();
12261396

1227-
TVector<TCoNameValueTuple> settings;
1397+
auto settings = result.Settings(ctx, node.Pos());
12281398
settings.push_back(Build<TCoNameValueTuple>(ctx, node.Pos())
12291399
.Name<TCoAtom>()
1230-
.Value("ItemsLimit")
1400+
.Value(TKqpReadTableFullTextIndexSettings::ItemsLimitSettingName)
12311401
.Build()
12321402
.Value(topSort.Count())
12331403
.Done());
@@ -1247,7 +1417,7 @@ TExprBase KqpRewriteFlatMapOverFullTextRelevance(const NYql::NNodes::TExprBase&
12471417
.Struct(searchColumn.Struct())
12481418
.Done();
12491419

1250-
replaces.emplace(apply.Raw(), newMember.Ptr());
1420+
replaces.emplace(result.Apply.Get(), newMember.Ptr());
12511421
auto newLambda = TCoLambda{ctx.NewLambda(
12521422
topSort.KeySelectorLambda().Pos(),
12531423
std::move(topSort.KeySelectorLambda().Args().Ptr()),
@@ -1291,17 +1461,15 @@ TExprBase KqpRewriteFlatMapOverFullTextContains(const NYql::NNodes::TExprBase& n
12911461
return node;
12921462
}
12931463

1294-
auto maybeApply = FindMatchingApply(flatMap.Lambda().Body());
1295-
if (!maybeApply.IsValid()) {
1464+
auto result = FindMatchingApply(flatMap.Lambda().Body(), ctx);
1465+
if (!result.Apply) {
12961466
return node;
12971467
}
12981468

1299-
auto apply = maybeApply.Cast();
1300-
auto args = apply.Args();
1301-
auto searchQuery = args.Get(2);
1302-
auto searchColumn = args.Get(1).Maybe<TCoMember>().Cast();
1469+
auto searchQuery = TExprBase(result.SearchQuery);
1470+
auto searchColumn = TExprBase(result.SearchColumn).Maybe<TCoMember>().Cast();
13031471

1304-
bool isRelevance = IsIn({"FullText.Relevance", "FullText.RelevanceUtf8"}, apply.Callable().Maybe<TCoUdf>().Cast().MethodName().Value());
1472+
bool isRelevance = IsIn({"FullText.Relevance", "FullText.RelevanceUtf8"}, result.MethodName);
13051473

13061474
auto searchColumns = Build<TCoAtomList>(ctx, node.Pos())
13071475
.Add(Build<TCoAtom>(ctx, node.Pos())
@@ -1320,6 +1488,8 @@ TExprBase KqpRewriteFlatMapOverFullTextContains(const NYql::NNodes::TExprBase& n
13201488
.Done());
13211489
}
13221490

1491+
auto settings = result.Settings(ctx, node.Pos());
1492+
13231493
auto resultColumns = Build<TCoAtomList>(ctx, node.Pos())
13241494
.Add(resultColumnsVector)
13251495
.Done();
@@ -1330,7 +1500,7 @@ TExprBase KqpRewriteFlatMapOverFullTextContains(const NYql::NNodes::TExprBase& n
13301500
.Columns(searchColumns.Ptr())
13311501
.Query(searchQuery.Ptr())
13321502
.ResultColumns(resultColumns.Ptr())
1333-
.Settings<TCoNameValueTupleList>().Build()
1503+
.Settings<TCoNameValueTupleList>().Add(settings).Build()
13341504
.Done();
13351505

13361506
TNodeOnNodeOwnedMap replaces;
@@ -1340,10 +1510,10 @@ TExprBase KqpRewriteFlatMapOverFullTextContains(const NYql::NNodes::TExprBase& n
13401510
.Name().Build("_yql_full_text_relevance")
13411511
.Struct(searchColumn.Struct())
13421512
.Done();
1343-
replaces.emplace(apply.Raw(), newMember.Ptr());
1513+
replaces.emplace(result.Apply.Get(), newMember.Ptr());
13441514
} else {
13451515
auto newMember = Build<TCoBool>(ctx, searchColumn.Pos()).Literal().Build("true").Done().Ptr();
1346-
replaces.emplace(apply.Raw(), newMember);
1516+
replaces.emplace(result.Apply.Get(), newMember);
13471517
}
13481518

13491519
auto newLambdaBody = TCoLambda{ctx.NewLambda(

0 commit comments

Comments
 (0)