Skip to content

Commit 36ed965

Browse files
authored
Insert ql:langtag triple when inserting triple with an object with a language tag (#2674)
For each triple with an object with a language tag, QLever stores two additional so-called "internal" triples. For example, for `wd:Q873 rdfs:label "Meryl Streep"@en`, QLever also stores `wd:Q873 @en@rdfs:label "Meryl Streep"@en` and `"Meryl Streep"@en ql:langtag ql:@en`. These triples are used for the efficient processing of queries involving a language filter like `FILTER (LANG(?label) = "en")`. Specifically, the internal `@en@rdfs:label` triple is used when the predicate of the graph pattern introducing `?label` is fixed, while the internal `ql:langtag` triple is used when the predicate is a variable. Since #2561 and #2461, the `@lang@...` triples are properly taken care of when processing an update operation. However, the corresponding `ql:langtag` triple was not updated so far, thus leading to potentially incorrect results for queries involving a language filter for an object introduced by a graph pattern with a variable predicate. With this change, the corresponding `ql:langtag` triple is added when inserting a triple with an object with a language tag. However, when deleting such a triple, the corresponding `ql:langtag` triple is **not** deleted (because this would require keeping track of how many subjects require that triple). This does not harm correctness. It does potentially increase the number of delta triples beyond what is required.
1 parent 745bb54 commit 36ed965

File tree

10 files changed

+168
-112
lines changed

10 files changed

+168
-112
lines changed

src/engine/SpatialJoinParser.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ namespace ad_utility::detail::parallel_wkt_parser {
1313
// _____________________________________________________________________________
1414
WKTParser::WKTParser(sj::Sweeper* sweeper, size_t numThreads,
1515
bool usePrefiltering,
16-
const std::optional<util::geo::DBox>& prefilterLatLngBox,
16+
const std::optional<::util::geo::DBox>& prefilterLatLngBox,
1717
const Index& index)
1818
: sj::WKTParserBase<SpatialJoinParseJob>(sweeper, numThreads),
1919
_numSkipped(numThreads),
@@ -66,22 +66,22 @@ void WKTParser::processQueue(size_t t) {
6666
parseCounter++;
6767
} else if (dt == Datatype::GeoPoint) {
6868
const auto& p = job.valueId.getGeoPoint();
69-
const util::geo::DPoint utilPoint{p.getLng(), p.getLat()};
69+
const ::util::geo::DPoint utilPoint{p.getLng(), p.getLat()};
7070

7171
// If point is not contained in the prefilter box, we can skip it
7272
// immediately instead of feeding it to the parser.
7373
if (_prefilterLatLngBox.has_value() &&
74-
!util::geo::intersects(_prefilterLatLngBox.value(), utilPoint)) {
74+
!::util::geo::intersects(_prefilterLatLngBox.value(), utilPoint)) {
7575
prefilterCounter++;
7676
continue;
7777
}
7878
// parse point directly
7979
auto mercPoint = latLngToWebMerc(utilPoint);
8080

81-
util::geo::I32Point addPoint{
81+
::util::geo::I32Point addPoint{
8282
static_cast<int32_t>(mercPoint.getX() * PREC),
8383
static_cast<int32_t>(mercPoint.getY() * PREC)};
84-
_bboxes[t] = util::geo::extendBox(
84+
_bboxes[t] = ::util::geo::extendBox(
8585
_sweeper->add(addPoint, std::to_string(job.line), job.side, w),
8686
_bboxes[t]);
8787
parseCounter++;

src/engine/SpatialJoinParser.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ inline bool operator==(const SpatialJoinParseJob& a,
4040
class WKTParser : public sj::WKTParserBase<SpatialJoinParseJob> {
4141
public:
4242
WKTParser(sj::Sweeper* sweeper, size_t numThreads, bool usePrefiltering,
43-
const std::optional<util::geo::DBox>& prefilterLatLngBox,
43+
const std::optional<::util::geo::DBox>& prefilterLatLngBox,
4444
const Index& index);
4545

4646
// Enqueue a new row from the input table (given the `ValueId` of the
@@ -69,7 +69,7 @@ class WKTParser : public sj::WKTParserBase<SpatialJoinParseJob> {
6969

7070
// Configure prefiltering geometries by bounding box.
7171
bool _usePrefiltering;
72-
std::optional<util::geo::DBox> _prefilterLatLngBox;
72+
std::optional<::util::geo::DBox> _prefilterLatLngBox;
7373

7474
// A reference to QLever's index is needed to access precomputed geometry
7575
// bounding boxes and to resolve `ValueId`s into WKT literals.

src/index/DeltaTriples.cpp

Lines changed: 46 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include "index/Index.h"
2121
#include "index/IndexImpl.h"
2222
#include "index/LocatedTriples.h"
23-
#include "util/LruCache.h"
2423
#include "util/Serializer/TripleSerializer.h"
2524

2625
// ____________________________________________________________________________
@@ -167,16 +166,22 @@ DeltaTriplesCount DeltaTriples::getCounts() const {
167166
}
168167

169168
// _____________________________________________________________________________
170-
DeltaTriples::Triples DeltaTriples::makeInternalTriples(
171-
const Triples& triples) {
169+
DeltaTriples::Triples DeltaTriples::makeInternalTriples(const Triples& triples,
170+
bool insertion) {
172171
// NOTE: If this logic is ever changed, you need to also change the code
173172
// in `IndexBuilderTypes.h`, the function `getIdMapLambdas` specifically,
174173
// which adds the same extra triples for language tags to the internal triples
175174
// on the initial index build.
176175
Triples internalTriples;
177-
constexpr size_t predicateCacheSize = 50;
178-
ad_utility::util::LRUCache<Id::T, ad_utility::triple_component::Iri>
179-
predicateCache{predicateCacheSize};
176+
// Initialize on first use.
177+
if (languagePredicate_.isUndefined()) {
178+
languagePredicate_ =
179+
TripleComponent{
180+
ad_utility::triple_component::Iri::fromIriref(LANGUAGE_PREDICATE)}
181+
.toValueId(index_.getVocab(), localVocab_,
182+
index_.encodedIriManager());
183+
}
184+
ad_utility::HashSet<Id> addedObjects;
180185
for (const auto& triple : triples) {
181186
const auto& ids = triple.ids();
182187
Id objectId = ids.at(2);
@@ -188,28 +193,45 @@ DeltaTriples::Triples DeltaTriples::makeInternalTriples(
188193
continue;
189194
}
190195
const auto& predicate =
191-
predicateCache.getOrCompute(ids.at(1).getBits(), [this](Id::T bits) {
196+
predicateCache_.getOrCompute(ids.at(1).getBits(), [this](Id::T bits) {
192197
auto optionalPredicate = ExportQueryExecutionTrees::idToLiteralOrIri(
193198
index_, Id::fromBits(bits), localVocab_, true);
194199
AD_CORRECTNESS_CHECK(optionalPredicate.has_value());
195200
AD_CORRECTNESS_CHECK(optionalPredicate.value().isIri());
196201
return std::move(optionalPredicate.value().getIri());
197202
});
198-
auto specialPredicate = ad_utility::convertToLanguageTaggedPredicate(
199-
predicate,
200-
asStringViewUnsafe(optionalLiteralOrIri.value().getLanguageTag()));
203+
auto langtag =
204+
asStringViewUnsafe(optionalLiteralOrIri.value().getLanguageTag());
205+
auto specialPredicate =
206+
ad_utility::convertToLanguageTaggedPredicate(predicate, langtag);
201207
Id specialId = TripleComponent{std::move(specialPredicate)}.toValueId(
202208
index_.getVocab(), localVocab_, index_.encodedIriManager());
203-
// NOTE: We currently only add one of the language triples, specifically
204-
// `<subject> @language@<predicate> "object"@language` because it is
205-
// directly tied to a regular triple, so insertion and removal work exactly
206-
// the same. `<object> ql:langtag <@language>` on the other hand needs
207-
// either some sort of reference counting or we have to keep it
208-
// indefinitely, even if the object is removed. This means that some queries
209-
// will return no results for entries with language tags that were inserted
210-
// via an UPDATE operation.
209+
// Extra triple `<subject> @language@<predicate> "object"@language`.
211210
internalTriples.push_back(
212-
IdTriple<0>{std::array{ids.at(0), specialId, ids.at(2), ids.at(3)}});
211+
IdTriple<0>{std::array{ids.at(0), specialId, objectId, ids.at(3)}});
212+
// If we have already added the triple for this object with its langtag we
213+
// can't add it a second time.
214+
if (addedObjects.contains(objectId)) {
215+
continue;
216+
}
217+
Id langtagId =
218+
languageTagCache_.getOrCompute(langtag, [this](const std::string& tag) {
219+
return TripleComponent{ad_utility::convertLangtagToEntityUri(tag)}
220+
.toValueId(index_.getVocab(), localVocab_,
221+
index_.encodedIriManager());
222+
});
223+
224+
// Because we don't track the exact counts of existing objects, we just
225+
// conservatively add these internal triples on insertion, and never remove
226+
// them. This is inefficient, but never wrong because queries that use these
227+
// internal triples will always join these internal triples with a regular
228+
// index scan.
229+
if (insertion) {
230+
// Extra triple `"object"@language ql:langtag <@language>`.
231+
internalTriples.push_back(IdTriple<0>{
232+
std::array{objectId, languagePredicate_, langtagId, ids.at(3)}});
233+
addedObjects.emplace(objectId);
234+
}
213235
}
214236
// Because of the special predicates, we need to re-sort the triples.
215237
ql::ranges::sort(internalTriples);
@@ -221,7 +243,7 @@ void DeltaTriples::insertTriples(CancellationHandle cancellationHandle,
221243
Triples triples,
222244
ad_utility::timer::TimeTracer& tracer) {
223245
tracer.beginTrace("makeInternalTriples");
224-
auto internalTriples = makeInternalTriples(triples);
246+
auto internalTriples = makeInternalTriples(triples, true);
225247
tracer.endTrace("makeInternalTriples");
226248
tracer.beginTrace("externalPermutation");
227249
modifyTriplesImpl<false, true>(cancellationHandle, std::move(triples),
@@ -240,7 +262,7 @@ void DeltaTriples::deleteTriples(CancellationHandle cancellationHandle,
240262
Triples triples,
241263
ad_utility::timer::TimeTracer& tracer) {
242264
tracer.beginTrace("makeInternalTriples");
243-
auto internalTriples = makeInternalTriples(triples);
265+
auto internalTriples = makeInternalTriples(triples, false);
244266
tracer.endTrace("makeInternalTriples");
245267
tracer.beginTrace("externalPermutation");
246268
modifyTriplesImpl<false, false>(cancellationHandle, std::move(triples),
@@ -344,11 +366,7 @@ void DeltaTriples::modifyTriplesImpl(CancellationHandle cancellationHandle,
344366
tracer.beginTrace("rewriteLocalVocabEntries");
345367
rewriteLocalVocabEntriesAndBlankNodes(triples);
346368
tracer.endTrace("rewriteLocalVocabEntries");
347-
// TODO<joka921> Once the migration is finished, check whether we can remove
348-
// the `ifndef` here again.
349-
#ifndef QLEVER_CPP_17
350369
AD_EXPENSIVE_CHECK(ql::ranges::is_sorted(triples));
351-
#endif
352370
AD_EXPENSIVE_CHECK(std::unique(triples.begin(), triples.end()) ==
353371
triples.end());
354372
tracer.beginTrace("removeExistingTriples");
@@ -417,6 +435,9 @@ DeltaTriplesCount operator-(const DeltaTriplesCount& lhs,
417435
DeltaTriples::DeltaTriples(const Index& index)
418436
: DeltaTriples(index.getImpl()) {}
419437

438+
// ____________________________________________________________________________
439+
DeltaTriples::DeltaTriples(const IndexImpl& index) : index_{index} {}
440+
420441
// ____________________________________________________________________________
421442
DeltaTriplesManager::DeltaTriplesManager(const IndexImpl& index)
422443
: deltaTriples_{index},

src/index/DeltaTriples.h

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "index/IndexBuilderTypes.h"
2020
#include "index/LocatedTriples.h"
2121
#include "index/Permutation.h"
22+
#include "util/LruCache.h"
2223
#include "util/Synchronized.h"
2324
#include "util/TimeTracer.h"
2425

@@ -128,6 +129,23 @@ class DeltaTriples {
128129
// See the documentation of `setPersist()` below.
129130
std::optional<std::string> filenameForPersisting_;
130131

132+
// Store the id of the `ql:langtag` predicate to avoid repeated disk lookups.
133+
// This is initialized on first use.
134+
Id languagePredicate_ = Id::makeUndefined();
135+
136+
// Store commonly used language tags of the form `<@lang>` to avoid repeated
137+
// disk lookups.
138+
static constexpr size_t languageTagCacheSize_ = 1000;
139+
ad_utility::util::LRUCache<std::string, Id> languageTagCache_{
140+
languageTagCacheSize_};
141+
142+
// Cache commonly used predicates and their IRI representation between calls
143+
// of `makeInternalTriples`. For example in wikidata `wdt:P31`, or `wdt:P279`
144+
// are frequently used, so we try to avoid an expensive lookup from disk.
145+
static constexpr size_t predicateCacheSize_ = 1000;
146+
ad_utility::util::LRUCache<Id::T, ad_utility::triple_component::Iri>
147+
predicateCache_{predicateCacheSize_};
148+
131149
// Assert that the Permutation Enum values have the expected int values.
132150
// This is used to store and lookup items that exist for permutation in an
133151
// array.
@@ -167,7 +185,7 @@ class DeltaTriples {
167185
public:
168186
// Construct for given index.
169187
explicit DeltaTriples(const Index& index);
170-
explicit DeltaTriples(const IndexImpl& index) : index_{index} {}
188+
explicit DeltaTriples(const IndexImpl& index);
171189

172190
// Disable accidental copying.
173191
DeltaTriples(const DeltaTriples&) = delete;
@@ -213,8 +231,11 @@ class DeltaTriples {
213231
// bunch of triples to be inserted into the internal permutation to make
214232
// things like efficient language filters work. This currently performs a
215233
// lookup from disk to check the language tag, but in the future this may be
216-
// implemented more efficiently.
217-
Triples makeInternalTriples(const Triples& triples);
234+
// implemented more efficiently. If `insertion` is false, this indicates that
235+
// the triples are meant for deletion. In that case no triples are returned
236+
// that may be unsafe to delete. In particular this refers to triples of the
237+
// form `<object> ql:langtag <@language>`.
238+
Triples makeInternalTriples(const Triples& triples, bool insertion);
218239

219240
// Insert triples.
220241
void insertTriples(CancellationHandle cancellationHandle, Triples triples,

src/rdfTypes/GeometryInfoHelpersImpl.h

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ using ParsedWkt =
4646
MultiPoint<CoordType>, MultiLine<CoordType>,
4747
MultiPolygon<CoordType>, Collection<CoordType>>;
4848
using ParseResult = std::pair<WKTType, std::optional<ParsedWkt>>;
49-
using DAnyGeometry = util::geo::AnyGeometry<CoordType>;
49+
using DAnyGeometry = AnyGeometry<CoordType>;
5050

5151
template <typename T>
5252
CPP_concept WktSingleGeometryType =
@@ -245,19 +245,16 @@ inline std::optional<std::string_view> wktTypeToIri(uint8_t type) {
245245

246246
// Reverse projection applied by `sj::WKTParser`: convert coordinates from web
247247
// mercator int32 to normal lat-long double coordinates.
248-
inline util::geo::DPoint projectInt32WebMercToDoubleLatLng(
249-
const util::geo::I32Point& p) {
250-
return util::geo::webMercToLatLng<double>(
251-
static_cast<double>(p.getX()) / PREC,
252-
static_cast<double>(p.getY()) / PREC);
253-
};
248+
inline DPoint projectInt32WebMercToDoubleLatLng(const I32Point& p) {
249+
return webMercToLatLng<double>(static_cast<double>(p.getX()) / PREC,
250+
static_cast<double>(p.getY()) / PREC);
251+
}
254252

255253
// Same as above, but for a bounding box.
256-
inline util::geo::DBox projectInt32WebMercToDoubleLatLng(
257-
const util::geo::I32Box& box) {
254+
inline DBox projectInt32WebMercToDoubleLatLng(const I32Box& box) {
258255
return {projectInt32WebMercToDoubleLatLng(box.getLowerLeft()),
259256
projectInt32WebMercToDoubleLatLng(box.getUpperRight())};
260-
};
257+
}
261258

262259
// Counts the number of geometries in a geometry collection.
263260
inline uint32_t countChildGeometries(const ParsedWkt& geom) {
@@ -657,7 +654,7 @@ struct MetricDistanceVisitor {
657654
// Delegate the actual distance computation to `pb_util`.
658655
CPP_template(typename T, typename U)(requires IsPairOfUtilGeoms<T, U>) double
659656
operator()(const T& a, const U& b) const {
660-
return util::geo::webMercMeterDist<T, U>(a, b);
657+
return webMercMeterDist<T, U>(a, b);
661658
}
662659

663660
// Handle optional geometries that may be contained in a `ParseResult`.

src/util/LruCache.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ class LRUCache {
3535
// found. Otherwise, compute the value using `computeFunction` and store it in
3636
// the cache. If the cache is already at maximum capacity, evict the least
3737
// recently used element.
38-
CPP_template(typename Func)(
38+
CPP_template(typename Key, typename Func)(
3939
requires ad_utility::InvocableWithConvertibleReturnType<
40-
Func, V, const K&>) const V& getOrCompute(const K& key,
40+
Func, V, const K&>) const V& getOrCompute(Key&& key,
4141
Func computeFunction) {
4242
auto it = cache_.find(key);
4343
if (it != cache_.end()) {
@@ -56,9 +56,10 @@ class LRUCache {
5656
lruKey = key;
5757
} else {
5858
// Push new element if not full
59-
keys_.push_front(key);
59+
keys_.push_front(K{key});
6060
}
61-
auto result = cache_.try_emplace(key, computeFunction(key), keys_.begin());
61+
auto result = cache_.try_emplace(
62+
AD_FWD(key), computeFunction(keys_.front()), keys_.begin());
6263
AD_CORRECTNESS_CHECK(result.second);
6364
return result.first->second.first;
6465
}

0 commit comments

Comments
 (0)