Skip to content

Commit 225fd15

Browse files
authored
LUCENE-9952: Fix dim count inaccuracies in SSDV faceting when a dim is multi-valued (#621)
1 parent fa63c68 commit 225fd15

File tree

6 files changed

+121
-34
lines changed

6 files changed

+121
-34
lines changed

lucene/CHANGES.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ API Changes
2929
* LUCENE-10349: WordListLoader methods now return unmodifiable CharArraySets. (Uwe Schindler)
3030

3131
* LUCENE-10377: SortField.getComparator() has changed signature. The second parameter is now
32-
a boolean indicating whether or not skipping should be enabled on the comparator.
32+
a boolean indicating whether or not skipping should be enabled on the comparator.
3333
(Alan Woodward)
3434

3535
* LUCENE-10381: Require users to provide FacetsConfig for SSDV faceting. (Greg Miller)
@@ -155,12 +155,15 @@ Bug Fixes
155155
* LUCENE-10352: Fixed ctor argument checks: JapaneseKatakanaStemFilter,
156156
DoubleMetaphoneFilter (Uwe Schindler, Robert Muir)
157157

158-
* LLUCENE-10353: Add random null injection to TestRandomChains. (Robert Muir,
158+
* LUCENE-10353: Add random null injection to TestRandomChains. (Robert Muir,
159159
Uwe Schindler)
160160

161161
* LUCENE-10377: CheckIndex could incorrectly throw an error when checking index sorts
162162
defined on older indexes. (Alan Woodward)
163163

164+
* LUCENE-9952: Address inaccurate dim counts for SSDV faceting in cases where a dim is configured
165+
as multi-valued. (Greg Miller)
166+
164167
Other
165168
---------------------
166169

lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -486,10 +486,14 @@ private void processSSDVFacetFields(
486486
for (SortedSetDocValuesFacetField facetField : ent.getValue()) {
487487
FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.path);
488488
DimConfig dimConfig = getDimConfig(facetField.dim);
489+
490+
// For facet counts:
489491
if (dimConfig.hierarchical) {
492+
// Index each member of the path explicitly. This is required for hierarchical counting
493+
// to work properly since we need to ensure every unique path has a corresponding ordinal
494+
// in the SSDV field:
490495
for (int i = 0; i < facetLabel.length; i++) {
491496
String fullPath = pathToString(facetLabel.components, i + 1);
492-
// For facet counts:
493497
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
494498
}
495499
} else {
@@ -501,10 +505,15 @@ private void processSSDVFacetFields(
501505
+ facetField.path.length
502506
+ " components");
503507
}
508+
if (dimConfig.multiValued && dimConfig.requireDimCount) {
509+
// If non-hierarchical but multi-valued and requiring dim count, make sure to
510+
// explicitly index the dimension so we can get accurate counts for it:
511+
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(facetField.dim)));
512+
}
504513
String fullPath = pathToString(facetLabel.components, facetLabel.length);
505-
// For facet counts:
506514
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
507515
}
516+
508517
// For drill-down:
509518
indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
510519
}

lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,14 +103,16 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
103103
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
104104
}
105105

106-
if (stateConfig.getDimConfig(dim).hierarchical) {
106+
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
107+
108+
if (dimConfig.hierarchical) {
107109
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
108110
if (pathOrd < 0) {
109111
// path was never indexed
110112
return null;
111113
}
112114
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
113-
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
115+
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
114116
} else {
115117
if (path.length > 0) {
116118
throw new IllegalArgumentException(
@@ -121,12 +123,25 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
121123
// means dimension was never indexed
122124
return null;
123125
}
124-
return getDim(dim, null, -1, ordRange.iterator(), topN);
126+
int dimOrd = ordRange.start;
127+
PrimitiveIterator.OfInt childIt = ordRange.iterator();
128+
if (dimConfig.multiValued && dimConfig.requireDimCount) {
129+
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
130+
// the dimension and we need to skip past it so the iterator is positioned on the first
131+
// child:
132+
childIt.next();
133+
}
134+
return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN);
125135
}
126136
}
127137

128-
private FacetResult getDim(
129-
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
138+
private FacetResult getPathResult(
139+
FacetsConfig.DimConfig dimConfig,
140+
String dim,
141+
String[] path,
142+
int pathOrd,
143+
PrimitiveIterator.OfInt childOrds,
144+
int topN)
130145
throws IOException {
131146

132147
TopOrdAndIntQueue q = null;
@@ -175,11 +190,17 @@ private FacetResult getDim(
175190
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
176191
}
177192

178-
if (pathOrd == -1) {
179-
// not hierarchical facet
193+
if (dimConfig.hierarchical == false) {
194+
// see if dimCount is actually reliable or needs to be reset
195+
if (dimConfig.multiValued) {
196+
if (dimConfig.requireDimCount) {
197+
dimCount = counts.get(pathOrd);
198+
} else {
199+
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
200+
}
201+
}
180202
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
181203
} else {
182-
// hierarchical facet
183204
return new FacetResult(dim, path, counts.get(pathOrd), labelValues, childCount);
184205
}
185206
}
@@ -394,15 +415,25 @@ public List<FacetResult> getAllDims(int topN) throws IOException {
394415

395416
List<FacetResult> results = new ArrayList<>();
396417
for (String dim : state.getDims()) {
397-
if (stateConfig.getDimConfig(dim).hierarchical) {
418+
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
419+
if (dimConfig.hierarchical) {
398420
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
399-
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
421+
int dimOrd = dimTree.dimStartOrd;
422+
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
400423
if (fr != null) {
401424
results.add(fr);
402425
}
403426
} else {
404427
OrdRange ordRange = state.getOrdRange(dim);
405-
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
428+
int dimOrd = ordRange.start;
429+
PrimitiveIterator.OfInt childIt = ordRange.iterator();
430+
if (dimConfig.multiValued && dimConfig.requireDimCount) {
431+
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
432+
// the dimension and we need to skip past it so the iterator is positioned on the first
433+
// child:
434+
childIt.next();
435+
}
436+
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
406437
if (fr != null) {
407438
results.add(fr);
408439
}

lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,9 @@ private int createOneFlatFacetDimState(SortedSetDocValues dv, int dimStartOrd)
215215

216216
BytesRef nextTerm = dv.lookupOrd(dimEndOrd);
217217
String[] nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString());
218-
if (nextComponents.length != 2) {
218+
// The first entry should always be length 1 or 2 (either just the dim itself if we explicitly
219+
// indexed it, or the first child):
220+
if (nextComponents.length > 2) {
219221
throw new IllegalArgumentException(
220222
"dimension not configured to handle hierarchical field; got: "
221223
+ Arrays.toString(nextComponents)
@@ -238,6 +240,7 @@ private int createOneFlatFacetDimState(SortedSetDocValues dv, int dimStartOrd)
238240
break;
239241
}
240242

243+
// Each entry should have a length of exactly 2 since the dim is non-hierarchical:
241244
if (nextComponents.length != 2) {
242245
throw new IllegalArgumentException(
243246
"dimension not configured to handle hierarchical field; got: "

lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@
6161
* <p><b>NOTE</b>: this class should be instantiated and then used from a single thread, because it
6262
* holds a thread-private instance of {@link SortedSetDocValues}.
6363
*
64-
* <p><b>NOTE:</b>: tie-break is by unicode sort order
64+
* <p><b>NOTE</b>: tie-break is by unicode sort order
65+
*
66+
* <p><b>NOTE</b>: if you have multi-valued dims that require dim counts (see {@link FacetsConfig},
67+
* make sure to provide your {@code FacetsConfig} instance when instantiating {@link
68+
* SortedSetDocValuesReaderState}, or else dim counts can be inaccurate
6569
*
6670
* @lucene.experimental
6771
*/
@@ -102,14 +106,16 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
102106
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
103107
}
104108

105-
if (stateConfig.getDimConfig(dim).hierarchical) {
109+
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
110+
111+
if (dimConfig.hierarchical) {
106112
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
107113
if (pathOrd < 0) {
108114
// path was never indexed
109115
return null;
110116
}
111117
DimTree dimTree = state.getDimTree(dim);
112-
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
118+
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
113119
} else {
114120
if (path.length > 0) {
115121
throw new IllegalArgumentException(
@@ -120,12 +126,25 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
120126
// means dimension was never indexed
121127
return null;
122128
}
123-
return getDim(dim, null, -1, ordRange.iterator(), topN);
129+
int dimOrd = ordRange.start;
130+
PrimitiveIterator.OfInt childIt = ordRange.iterator();
131+
if (dimConfig.multiValued && dimConfig.requireDimCount) {
132+
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
133+
// the dimension and we need to skip past it so the iterator is positioned on the first
134+
// child:
135+
childIt.next();
136+
}
137+
return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN);
124138
}
125139
}
126140

127-
private FacetResult getDim(
128-
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
141+
private FacetResult getPathResult(
142+
FacetsConfig.DimConfig dimConfig,
143+
String dim,
144+
String[] path,
145+
int pathOrd,
146+
PrimitiveIterator.OfInt childOrds,
147+
int topN)
129148
throws IOException {
130149

131150
TopOrdAndIntQueue q = null;
@@ -173,11 +192,17 @@ private FacetResult getDim(
173192
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
174193
}
175194

176-
if (pathOrd == -1) {
177-
// not hierarchical facet
195+
if (dimConfig.hierarchical == false) {
196+
// see if dimCount is actually reliable or needs to be reset
197+
if (dimConfig.multiValued) {
198+
if (dimConfig.requireDimCount) {
199+
dimCount = counts[pathOrd];
200+
} else {
201+
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
202+
}
203+
}
178204
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
179205
} else {
180-
// hierarchical facet
181206
return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount);
182207
}
183208
}
@@ -347,15 +372,25 @@ public List<FacetResult> getAllDims(int topN) throws IOException {
347372

348373
List<FacetResult> results = new ArrayList<>();
349374
for (String dim : state.getDims()) {
350-
if (stateConfig.getDimConfig(dim).hierarchical) {
375+
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
376+
if (dimConfig.hierarchical) {
351377
DimTree dimTree = state.getDimTree(dim);
352-
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
378+
int dimOrd = dimTree.dimStartOrd;
379+
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
353380
if (fr != null) {
354381
results.add(fr);
355382
}
356383
} else {
357384
OrdRange ordRange = state.getOrdRange(dim);
358-
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
385+
int dimOrd = ordRange.start;
386+
PrimitiveIterator.OfInt childIt = ordRange.iterator();
387+
if (dimConfig.multiValued && dimConfig.requireDimCount) {
388+
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
389+
// the dimension and we need to skip past it so the iterator is positioned on the first
390+
// child:
391+
childIt.next();
392+
}
393+
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
359394
if (fr != null) {
360395
results.add(fr);
361396
}

lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,20 +63,24 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
6363
public void testBasic() throws Exception {
6464
FacetsConfig config = new FacetsConfig();
6565
config.setMultiValued("a", true);
66+
config.setMultiValued("b", true);
67+
config.setRequireDimCount("b", true);
6668
try (Directory dir = newDirectory();
6769
RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
6870
Document doc = new Document();
6971
doc.add(new SortedSetDocValuesFacetField("a", "foo"));
7072
doc.add(new SortedSetDocValuesFacetField("a", "bar"));
7173
doc.add(new SortedSetDocValuesFacetField("a", "zoo"));
7274
doc.add(new SortedSetDocValuesFacetField("b", "baz"));
75+
doc.add(new SortedSetDocValuesFacetField("b", "buzz"));
7376
writer.addDocument(config.build(doc));
7477
if (random().nextBoolean()) {
7578
writer.commit();
7679
}
7780

7881
doc = new Document();
7982
doc.add(new SortedSetDocValuesFacetField("a", "foo"));
83+
doc.add(new SortedSetDocValuesFacetField("b", "buzz"));
8084
writer.addDocument(config.build(doc));
8185

8286
// NRT open
@@ -91,12 +95,13 @@ public void testBasic() throws Exception {
9195
try {
9296
Facets facets = getAllFacets(searcher, state, exec);
9397

94-
// value should ideally be 2 but SSDV facets are bugged here
98+
// value for dim a should be -1 since it's multivalued but doesn't require dim counts:
9599
assertEquals(
96-
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
100+
"dim=a path=[] value=-1 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
97101
facets.getTopChildren(10, "a").toString());
102+
// value for dim b should be 2 since it's multivalued but _does_ require dim counts:
98103
assertEquals(
99-
"dim=b path=[] value=1 childCount=1\n baz (1)\n",
104+
"dim=b path=[] value=2 childCount=2\n buzz (2)\n baz (1)\n",
100105
facets.getTopChildren(10, "b").toString());
101106

102107
// DrillDown:
@@ -115,6 +120,7 @@ public void testBasic() throws Exception {
115120
public void testBasicHierarchical() throws Exception {
116121
FacetsConfig config = new FacetsConfig();
117122
config.setMultiValued("a", true);
123+
config.setRequireDimCount("a", true);
118124
config.setMultiValued("c", true);
119125
config.setHierarchical("c", true);
120126
try (Directory dir = newDirectory();
@@ -152,10 +158,10 @@ public void testBasicHierarchical() throws Exception {
152158
try {
153159
Facets facets = getAllFacets(searcher, state, exec);
154160

155-
// since a is not set to be hierarchical, it's value count will be bugged as ancestral
156-
// paths are not indexed
161+
// since a is not set to be hierarchical but _is_ multi-valued, we expect a value of 2
162+
// (since two unique docs contain at least one value for this dim):
157163
assertEquals(
158-
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
164+
"dim=a path=[] value=2 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
159165
facets.getTopChildren(10, "a").toString());
160166
assertEquals(
161167
"dim=b path=[] value=1 childCount=1\n baz (1)\n",

0 commit comments

Comments
 (0)