Skip to content

Commit 687975b

Browse files
ryanjdewMarkLogic Builder
authored and committed
DHFPROD-8164: Better handle massive match events
1 parent 9f37490 commit 687975b

File tree

8 files changed

+226
-39
lines changed

8 files changed

+226
-39
lines changed

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/impl/process.xqy

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -398,28 +398,59 @@ declare function proc-impl:build-match-summary(
398398
$match-options-node//(*:max-scan|maxScan) ! xs:integer(.),
399399
500
400400
))
401+
let $outlier-map := map:map()
402+
let $estimate-map := map:map()
401403
let $all-matches :=
402404
let $start-elapsed := xdmp:elapsed-time()
403405
let $matches :=
404406
map:new((
405-
$normalized-input !
406-
map:entry(
407-
(. => map:get("uri")),
408-
let $match-results := matcher:find-document-matches-by-options(
409-
(. => map:get("value")),
410-
$match-options,
411-
1,
412-
$max-scan,
413-
$minimum-threshold,
414-
$fine-grain-provenance,
415-
$filter-query
407+
util-impl:process-items-in-set-time(
408+
function ($input) {
409+
map:entry(
410+
($input => map:get("uri")),
411+
let $match-results := matcher:find-document-matches-by-options(
412+
($input => map:get("value")),
413+
$match-options,
414+
1,
415+
$max-scan,
416+
$minimum-threshold,
417+
$fine-grain-provenance,
418+
$filter-query
419+
)
420+
return (
421+
if ($lock-for-update) then
422+
$match-results/result[fn:exists(@action)]/@uri ! merge-impl:lock-for-update(fn:string(.))
423+
else (),
424+
$match-results
425+
)
426+
)
427+
},
428+
$normalized-input,
429+
function($input) {
430+
if (map:contains($estimate-map,($input => map:get("uri")))) then
431+
map:get($estimate-map,($input => map:get("uri")))/@total ! xs:unsignedLong(.)
432+
else
433+
let $estimate := matcher:find-document-matches-by-options(
434+
($input => map:get("value")),
435+
$match-options,
436+
1,
437+
$max-scan,
438+
$minimum-threshold,
439+
$fine-grain-provenance,
440+
$filter-query,
441+
fn:false()
442+
)
443+
return (
444+
map:put($estimate-map,($input => map:get("uri")), $estimate-map),
445+
$estimate/@total ! xs:unsignedLong(.)
416446
)
417-
return (
418-
if ($lock-for-update) then
419-
$match-results/result[fn:exists(@action)]/@uri ! merge-impl:lock-for-update(fn:string(.))
420-
else (),
421-
$match-results
422-
)
447+
},
448+
function($outliers) {
449+
for $outlier in $outliers
450+
let $uri := ($outlier => map:get("uri"))
451+
let $output := map:get($estimate-map,$uri)
452+
return map:put($outlier-map, $uri, $output)
453+
}
423454
)
424455
))
425456
return (
@@ -509,6 +540,17 @@ declare function proc-impl:build-match-summary(
509540
let $notifications :=
510541
(: Process notifications :)
511542
map:new((
543+
let $outlier-threshold := "Unable to process"
544+
for $outlier-uri in map:keys($outlier-map)
545+
let $notification-uri := notify-impl:build-notification-uri($outlier-threshold, $outlier-uri)
546+
return map:entry(
547+
$notification-uri,
548+
map:map()
549+
=> map:with("action", "notify")
550+
=> map:with("threshold", $outlier-threshold)
551+
=> map:with("uris", json:to-array($outlier-uri))
552+
=> map:with("query", ($outlier-map => map:get($outlier-uri))/match-query/* ! cts:query(.))
553+
),
512554
for $notification in $consolidated-notifies
513555
let $_lock := if ($lock-for-update) then (merge-impl:lock-for-update($notification)) else ()
514556
let $parts := fn:tokenize($notification, $STRING-TOKEN)
@@ -739,9 +781,10 @@ declare function proc-impl:build-content-objects-from-match-summary(
739781
)
740782
case "notify" return
741783
let $uris := $action-details => map:get("uris") => json:array-values()
784+
let $query := ($action-details => map:get("query")) ! cts:query(.)
742785
let $threshold := $action-details => map:get("threshold")
743786
let $provenance := $action-details => map:get("provenance")
744-
let $match-write-object := matcher:build-match-notification($threshold, $uris, $merge-options-node)
787+
let $match-write-object := matcher:build-match-notification($threshold, $uris, $merge-options-node, $query)
745788
where fn:exists($match-write-object)
746789
return
747790
if (fn:exists($provenance)) then

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/impl/util.xqy

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,3 +264,66 @@ declare function util-impl:function-is-javascript(
264264
) as xs:boolean {
265265
fn:exists($fun) and fn:ends-with(xdmp:function-module($fun), "js")
266266
};
267+
268+
declare variable $SESSION_TIMEOUT := 600;
269+
270+
(:
 : Applies $fun to every item and returns the results. If the pass fails with
 : XDMP-EXTIME, the items whose cost is a high outlier (as judged by
 : util-impl:determine-high-outliers) are set aside and the remaining items are
 : run again from the start; results of the failed pass are discarded.
 : Any other error is rethrown unchanged.
 :
 : @param $fun function called with one item; its results are returned
 : @param $items as item()* the items to process
 : @param $item-cost-fun function called with one item, returning that item's
 :        cost as an xs:unsignedLong; only called after an XDMP-EXTIME
 : @param $outlier-handler optional function called once with the sequence of
 :        items that were set aside as outliers
 : @return item()* results of $fun for the items that were processed, followed
 :         by anything $outlier-handler returns
 :)
declare function util-impl:process-items-in-set-time(
  $fun,
  $items as item()*,
  $item-cost-fun,
  $outlier-handler
) {
  try {
    for $item in $items
    return $fun($item)
  } catch ($e) {
    if ($e/error:code eq "XDMP-EXTIME") then
      (: exactly one cost per item, so positions in $items-cost line up with $items
         even when the cost function returns nothing for an item :)
      let $items-cost :=
        for $item in $items
        return ($item-cost-fun($item), xs:unsignedLong(0))[1]
      let $high-outliers-cost := util-impl:determine-high-outliers($items-cost)
      let $high-outlier-indexes :=
        for $cost at $pos in $items-cost
        where $cost = $high-outliers-cost
        return $pos
      let $high-outliers := $items[fn:position() = $high-outlier-indexes]
      let $others := $items[fn:not(fn:position() = $high-outlier-indexes)]
      return
        if (fn:empty($high-outliers)) then
          (: nothing can be set aside, so a retry would repeat the exact same work;
             surface the original timeout instead of recursing without progress :)
          xdmp:rethrow()
        else (
          util-impl:process-items-in-set-time(
            $fun,
            $others,
            $item-cost-fun,
            $outlier-handler
          ),
          if (fn:exists($outlier-handler)) then
            $outlier-handler($high-outliers)
          else (),
          for $outlier in $high-outliers
          return
            xdmp:log("Unable to process "|| xdmp:describe($outlier, (), ()) || " outlier item.", "warning")
        )
    else
      xdmp:rethrow()
  }
};
310+
311+
(:
 : Picks the unusually large values out of a sequence of costs.
 : The values are sorted ascending and split at position (count idiv 2). A value
 : is a high outlier when it is greater than the mean of the upper part plus
 : 1.5 times the gap between the means of the upper and lower parts.
 : @param $items as xs:unsignedLong*
 : @return the members of $items above that cut-off
 :)
declare function util-impl:determine-high-outliers(
  $items as xs:unsignedLong*) {
  let $sorted :=
    for $value in $items
    order by $value ascending
    return $value
  let $split := fn:count($items) idiv 2
  let $lower-mean := fn:avg(fn:subsequence($sorted, 1, $split))
  let $upper-mean := fn:avg(fn:subsequence($sorted, $split + 1))
  let $cut-off := $upper-mean + (1.5 * ($upper-mean - $lower-mean))
  return $items[. gt $cut-off]
};

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/matcher-impl/helper-impl.xqy

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ declare function helper-impl:group-queries-by-scope($queries as cts:query*, $gro
218218
for $query in $queries
219219
let $is-json-prop-scope := $query instance of cts:json-property-scope-query
220220
let $is-element-scope := $query instance of cts:element-query
221+
let $is-and-not := $query instance of cts:and-not-query
221222
let $key :=
222223
if ($is-json-prop-scope) then
223224
"json-prop:" || fn:string-join(
@@ -229,6 +230,8 @@ declare function helper-impl:group-queries-by-scope($queries as cts:query*, $gro
229230
for $qn in cts:element-query-element-name($query) order by $qn return xdmp:key-from-QName($qn),
230231
$string-token
231232
)
233+
else if ($is-and-not) then
234+
"and-not"
232235
else
233236
"_other"
234237
let $values :=
@@ -253,6 +256,10 @@ declare function helper-impl:group-queries-by-scope($queries as cts:query*, $gro
253256
cts:json-property-scope-query(fn:tokenize(fn:substring-after($key, "json-prop:"), $string-token), $grouped-queries)
254257
else if (fn:starts-with($key, "element:")) then
255258
cts:element-query(fn:tokenize(fn:substring-after($key, "element:"), $string-token) ! xdmp:QName-from-key(.), $grouped-queries)
259+
else if ($key eq "and-not" and fn:count($grouped-queries) gt 1) then
260+
let $positive-queries := helper-impl:group-queries-by-scope(for $q in $grouped-queries return cts:and-not-query-positive-query($q), $grouping-query-fun)
261+
let $negative-queries := helper-impl:group-queries-by-scope(for $q in $grouped-queries return cts:and-not-query-negative-query($q), $grouping-query-fun)
262+
return cts:and-not-query($positive-queries, $negative-queries)
256263
else
257264
$grouped-queries
258265
return

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/matcher-impl/matcher-impl.xqy

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,9 @@ declare function match-impl:find-document-matches-by-options(
262262
else
263263
let $estimate := xdmp:estimate(cts:search(fn:collection(), $match-query, "unfiltered"))
264264
return (
265+
if ($estimate ge 250) then
266+
xdmp:log("A large number ("|| $estimate ||") of potential matches were discovered for document '" || $document-uri || "' with the following query: " || xdmp:describe($match-query, (), ()), "warning")
267+
else (),
265268
if ($match-trace-is-enabled) then
266269
xdmp:trace($const:TRACE-MATCH-RESULTS, "Estimated " || $estimate || " doc(s) found for cts.doc('"|| $document-uri ||"') in " || xdmp:database-name(xdmp:database()))
267270
else (),

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/matcher-impl/notification-impl.xqy

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,14 @@ declare function notify-impl:save-match-notification(
6767
};
6868

6969
declare variable $_notifications-inserted := map:map();
70+
(:
 : Three-argument form of notify-impl:build-match-notification. Delegates to the
 : four-argument form, passing the empty sequence as the query.
 :)
declare function notify-impl:build-match-notification(
  $threshold-label as xs:string,
  $uris as xs:string*,
  $options as item()?
) as map:map?
{
  let $no-query := ()
  return
    notify-impl:build-match-notification($threshold-label, $uris, $options, $no-query)
};
7078
(:
7179
: Create a new notification document. If there is already a notification for
7280
: this combination of label and URIs, that notification will be replaced.
@@ -77,7 +85,8 @@ declare variable $_notifications-inserted := map:map();
7785
declare function notify-impl:build-match-notification(
7886
$threshold-label as xs:string,
7987
$uris as xs:string*,
80-
$options as item()?
88+
$options as item()?,
89+
$query as cts:query?
8190
) as map:map?
8291
{
8392
let $options-node := merge-impl:options-to-node($options)
@@ -109,7 +118,8 @@ declare function notify-impl:build-match-notification(
109118
element sm:threshold-label {$threshold-label},
110119
element sm:document-uris {
111120
$doc-uris
112-
}
121+
},
122+
$query ! element sm:query {.}
113123
}
114124
),
115125
map:entry("context",

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/matcher-impl/options-impl.xqy

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ import module namespace helper-impl = "http://marklogic.com/smart-mastering/help
2525
at "/com.marklogic.smart-mastering/matcher-impl/helper-impl.xqy";
2626
import module namespace json="http://marklogic.com/xdmp/json"
2727
at "/MarkLogic/json/json.xqy";
28+
import module namespace sem = "http://marklogic.com/semantics"
29+
at "/MarkLogic/semantics.xqy";
2830
import module namespace util-impl = "http://marklogic.com/smart-mastering/util-impl"
2931
at "/com.marklogic.smart-mastering/impl/util.xqy";
3032

@@ -545,21 +547,7 @@ declare function opt-impl:compile-match-options(
545547
map:entry("orderedThresholds", $ordered-thresholds),
546548
map:entry("minimumThresholdCombinations", $minimum-threshold-combinations),
547549
map:entry("propertyNamesToValues", $property-names-to-values),
548-
map:entry("baseContentQuery",
549-
if (fn:exists($target-entity-type-def)) then
550-
cts:or-query((
551-
cts:json-property-scope-query(
552-
"info",
553-
cts:json-property-value-query("title", fn:string($target-entity-type-def/entityTitle), (), 0)
554-
),
555-
cts:element-query(
556-
xs:QName("es:info"),
557-
cts:element-value-query(xs:QName("es:title"), fn:string($target-entity-type-def/entityTitle), (), 0)
558-
)
559-
))
560-
else
561-
opt-impl:build-collection-query(coll:content-collections($match-options))
562-
)
550+
map:entry("baseContentQuery", opt-impl:build-base-query($match-options, $target-entity-type-def, $target-entity-type-iri))
563551
))
564552
let $cache-ids := (
565553
$cache-id,
@@ -584,6 +572,30 @@ declare function opt-impl:compile-match-options(
584572
)
585573
};
586574

575+
576+
(:
 : Builds the base content query for a set of match options and returns it
 : wrapped in a cts:registered-query.
 :
 : With a target entity type definition:
 :   - if an entity type IRI is supplied and at least one document matches an
 :     rdf:type triple for that IRI, the triple-range query is used;
 :   - otherwise the query matches the entity title in either the JSON
 :     "info"/"title" property or the es:info/es:title element.
 : Without a definition, the query is built from the content collections of
 : the match options.
 :
 : @param $match-options match options, used to look up content collections
 : @param $target-entity-type-def entity type definition; its entityTitle child is read
 : @param $target-entity-type-iri IRI of the entity type, used as the rdf:type object
 : @return a cts:registered-query over the chosen base query
 :)
declare function opt-impl:build-base-query($match-options as item()?, $target-entity-type-def as item()?, $target-entity-type-iri as xs:string?) {
  (: An empty object in cts:triple-range-query is a wildcard, so without an IRI the
     query would match every rdf:type triple; only build it when an IRI is given. :)
  let $triple-query :=
    if (fn:exists($target-entity-type-def) and fn:normalize-space($target-entity-type-iri) ne "") then
      cts:triple-range-query((), sem:curie-expand("rdf:type"), sem:iri($target-entity-type-iri))
    else ()
  let $use-triple-query :=
    if (fn:exists($triple-query)) then
      xdmp:exists(cts:search(fn:collection(), $triple-query))
    else
      fn:false()
  let $base-query :=
    if (fn:exists($target-entity-type-def)) then
      if ($use-triple-query) then
        $triple-query
      else
        cts:or-query((
          cts:json-property-scope-query(
            "info",
            cts:json-property-value-query("title", fn:string($target-entity-type-def/entityTitle), (), 0)
          ),
          cts:element-query(
            xs:QName("es:info"),
            cts:element-value-query(xs:QName("es:title"), fn:string($target-entity-type-def/entityTitle), (), 0)
          )
        ))
    else
      opt-impl:build-collection-query(coll:content-collections($match-options))
  return cts:registered-query(cts:register($base-query))
};
598+
587599
declare function opt-impl:convert-match-rule-for-custom-module($match-rule, $match-options, $custom-algorithm)
588600
{
589601
if (fn:empty($custom-algorithm)) then
@@ -734,7 +746,7 @@ declare function opt-impl:minimum-threshold-combinations($query-results, $thresh
734746
else
735747
(: Each of $queries-ge-threshold has a weight high enough to hit the $threshold :)
736748
let $queries-ge-threshold := $query-results[fn:empty((. => map:get("weight"))) or (. => map:get("weight")) >= $threshold]
737-
let $queries-lt-threshold := $query-results[(. => map:get("weight")) <= $threshold]
749+
let $queries-lt-threshold := $query-results[(. => map:get("weight")) < $threshold]
738750
return (
739751
$queries-ge-threshold ! (map:entry("queries", .) => map:with("weight", map:get(., "weight"))),
740752
opt-impl:filter-for-required-queries($queries-lt-threshold, 0, $threshold, ())

marklogic-data-hub/src/main/resources/ml-modules/root/com.marklogic.smart-mastering/matcher.xqy

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,36 @@ declare function matcher:find-document-matches-by-options(
186186
)
187187
};
188188

189+
(:
 : Looks for a page of potential matches for the given document, using
 : previously-saved matching options. This form adds $include-results and
 : passes every argument straight through to
 : match-impl:find-document-matches-by-options.
 :
 : @param $document  document to find matches for
 : @param $options  match options saved using matcher:save-options
 : @param $start  starting index for potential match results (starts at 1)
 : @param $page-length  maximum number of results to return in this call
 : @param $minimum-threshold  value of the lowest threshold score; the match query
 :        will require matches to score at least this high to be returned
 : @param $include-matches  whether the response should list the matched
 :        properties for each potential match
 : @param $filter-query  a cts:query used to restrict matches to a set, such as a
 :        specific entity type or collection
 : @param $include-results  a boolean that determines if results should be
 :        retrieved or just an estimate
 : @return  the queries used for search and the search results themselves
 : @see https://marklogic-community.github.io/smart-mastering-core/docs/match-results/
 :)
declare function matcher:find-document-matches-by-options(
  $document,
  $options as item(), (: as (element(matcher:options)|object-node()) :)
  $start as xs:integer,
  $page-length as xs:integer,
  $minimum-threshold as xs:double,
  $include-matches as xs:boolean,
  $filter-query,
  $include-results as xs:boolean
) as element(results)
{
  let $results :=
    match-impl:find-document-matches-by-options(
      $document,
      $options,
      $start,
      $page-length,
      $minimum-threshold,
      $include-matches,
      $filter-query,
      $include-results
    )
  return $results
};
189219
(:
190220
: Convert match results from XML to JSON.
191221
: @param $results-xml XML match results as returned from the
@@ -489,6 +519,25 @@ declare function matcher:build-match-notification(
489519
notify-impl:build-match-notification($threshold-label, $uris, $options)
490520
};
491521

522+
(:
 : Builds a map action for a new notification. If a notification document already
 : exists for this label/URIs combination, it will be replaced with the new
 : notification.
 : @param $threshold-label  human-readable label used to indicate the
 :                          likelihood of the match
 : @param $uris  URIs of the content documents that are merge candidates
 : @param $options  merge options for determining notification collections
 : @param $query  optional cts:query passed through to
 :                notify-impl:build-match-notification along with the other arguments
 : @return  content of the newly-constructed notification
 :)
declare function matcher:build-match-notification(
  $threshold-label as xs:string,
  $uris as xs:string*,
  $options as item()?,
  $query as cts:query?
) as map:map?
{
  let $notification :=
    notify-impl:build-match-notification($threshold-label, $uris, $options, $query)
  return $notification
};
492541
(:
493542
: Delete the specified notification.
494543
: @param $uri URI of the notification document to be deleted

0 commit comments

Comments
 (0)