diff --git a/docs/reference/esql/esql-commands.asciidoc b/docs/reference/esql/esql-commands.asciidoc index 33e748d7eb7c1..e200ce760f110 100644 --- a/docs/reference/esql/esql-commands.asciidoc +++ b/docs/reference/esql/esql-commands.asciidoc @@ -42,9 +42,7 @@ ifeval::["{release-state}"=="unreleased"] endif::[] * <> * <> -ifeval::["{release-state}"=="unreleased"] -//* experimental:[] <> -endif::[] +* experimental:[] <> * experimental:[] <> * <> * <> @@ -67,9 +65,7 @@ ifeval::["{release-state}"=="unreleased"] endif::[] include::processing-commands/keep.asciidoc[] include::processing-commands/limit.asciidoc[] -ifeval::["{release-state}"=="unreleased"] -//include::processing-commands/lookup.asciidoc[] -endif::[] +include::processing-commands/lookup.asciidoc[] include::processing-commands/mv_expand.asciidoc[] include::processing-commands/rename.asciidoc[] include::processing-commands/sort.asciidoc[] diff --git a/docs/reference/esql/esql-language.asciidoc b/docs/reference/esql/esql-language.asciidoc index 151ca803bf2eb..cb2d8260469f6 100644 --- a/docs/reference/esql/esql-language.asciidoc +++ b/docs/reference/esql/esql-language.asciidoc @@ -12,6 +12,7 @@ Detailed reference documentation for the {esql} language: * <> * <> * <> +* <> * <> * <> * <> @@ -23,5 +24,6 @@ include::metadata-fields.asciidoc[] include::multivalued-fields.asciidoc[] include::esql-process-data-with-dissect-grok.asciidoc[] include::esql-enrich-data.asciidoc[] +include::esql-lookup-join.asciidoc[] include::implicit-casting.asciidoc[] include::time-spans.asciidoc[] diff --git a/docs/reference/esql/esql-lookup-join.asciidoc b/docs/reference/esql/esql-lookup-join.asciidoc new file mode 100644 index 0000000000000..400afabdb03b2 --- /dev/null +++ b/docs/reference/esql/esql-lookup-join.asciidoc @@ -0,0 +1,189 @@ +=== LOOKUP JOIN + +++++ +Correlate data with LOOKUP JOIN +++++ + +The {esql} <> +processing command combines data from your {esql} query results +table with matching records from a specified lookup 
index. It adds +fields from the lookup index as new columns to your results table based +on matching values in the join field. + +Teams often have data scattered across multiple indices – like logs, +IPs, user IDs, hosts, employees, etc. Without a direct way to enrich or +correlate each event with reference data, root-cause analysis, security +checks, and operational insights become time-consuming. + +For example, you can use `LOOKUP JOIN` to: + +* Retrieve environment or ownership details for each host to correlate with +your metrics data. +* Quickly see if any source IPs match known malicious addresses. +* Tag logs with the owning team or escalation info for faster triage and +incident response. + +<<esql-lookup-join,`LOOKUP JOIN`>> is similar to <<esql-enrich,`ENRICH`>> +in that they both help you join data together. You should use +`LOOKUP JOIN` when: + +* Your enrichment data changes frequently +* You want to avoid index-time processing +* You're working with regular indices +* You need to preserve distinct matches +* You need to match on any field in a lookup index +* You use document- or field-level security +* You want to restrict users to specific lookup indices that they can +use + +[discrete] +[[esql-how-lookup-join-works]] +==== How the `LOOKUP JOIN` command works + +The `LOOKUP JOIN` command adds new columns to a table, with data from +{es} indices. + +image::images/esql/esql-lookup-join.png[align="center"] + +[[esql-lookup-join-lookup-index]] +lookup_index:: +The name of the lookup index. This must +be a specific index name - wildcards, aliases, and remote cluster +references are not supported. + +[[esql-lookup-join-field-name]] +field_name:: +The field to join on. This field must exist +in both your current query results and in the lookup index. If the field +contains multi-valued entries, those entries will not match anything +(the added fields will contain `null` for those rows).
+ +[discrete] +[[esql-lookup-join-example]] +==== Example + +`LOOKUP JOIN` has left-join behavior. If no rows match in the lookup index, `LOOKUP JOIN` retains the incoming row and adds `null`s. If many rows in the lookup index match, `LOOKUP JOIN` adds one row per match. + +In this example, we have two sample tables: + +*employees* + +[cols=",,,,,",options="header",] +|=== +|birth++_++date |emp++_++no |first++_++name |gender |hire++_++date +|language +|1955-10-04T00:00:00Z |10091 |Amabile |M |1992-11-18T00:00:00Z |3 + +|1964-10-18T00:00:00Z |10092 |Valdiodio |F |1989-09-22T00:00:00Z |1 + +|1964-06-11T00:00:00Z |10093 |Sailaja |M |1996-11-05T00:00:00Z |3 + +|1957-05-25T00:00:00Z |10094 |Arumugam |F |1987-04-18T00:00:00Z |5 + +|1965-01-03T00:00:00Z |10095 |Hilari |M |1986-07-15T00:00:00Z |4 +|=== + +*languages++_++lookup++_++non++_++unique++_++key* + +[cols=",,",options="header",] +|=== +|language++_++code |language++_++name |country +|1 |English |Canada +|1 |English | +|1 | |United Kingdom +|1 |English |United States of America +|2 |German |++[++Germany{vbar}Austria++]++ +|2 |German |Switzerland +|2 |German | +|4 |Quenya | +|5 | |Atlantis +|++[++6{vbar}7++]++ |Mv-Lang |Mv-Land +|++[++7{vbar}8++]++ |Mv-Lang2 |Mv-Land2 +| |Null-Lang |Null-Land +| |Null-Lang2 |Null-Land2 +|=== + +Running the following query would provide the results shown below.
+ +[source,esql] +---- +FROM employees +| EVAL language_code = emp_no % 10 +| LOOKUP JOIN languages_lookup_non_unique_key ON language_code +| WHERE emp_no > 10090 AND emp_no < 10096 +| SORT emp_no, country +| KEEP emp_no, language_code, language_name, country; +---- + +[cols=",,,",options="header",] +|=== +|emp++_++no |language++_++code |language++_++name |country +|10091 |1 |English |Canada +|10091 |1 |null |United Kingdom +|10091 |1 |English |United States of America +|10091 |1 |English |null +|10092 |2 |German |++[++Germany, Austria++]++ +|10092 |2 |German |Switzerland +|10092 |2 |German |null +|10093 |3 |null |null +|10094 |4 |Quenya |null +|10095 |5 |null |Atlantis +|=== + +[IMPORTANT] +==== +`LOOKUP JOIN` does not guarantee the output to be in +any particular order. If a certain order is required, users should use a +link:/reference/query-languages/esql/esql-commands.md#esql-sort[`SORT`] +somewhere after the `LOOKUP JOIN`. +==== + +[discrete] +[[esql-lookup-join-prereqs]] +==== Prerequisites + +To use `LOOKUP JOIN`, the following requirements must be met: + +* *Compatible data types*: The join key and join field in the lookup +index must have compatible data types. This means: +** The data types must either be identical or be internally represented +as the same type in Elasticsearch's type system +** Numeric types follow these compatibility rules: +*** `short` and `byte` are compatible with `integer` (all represented as +`int`) +*** `float`, `half_float`, and `scaled_float` are compatible +with `double` (all represented as `double`) +** For text fields: You can use text fields on the left-hand side of the +join only if they have a `.keyword` subfield + +For a complete list of supported data types and their internal +representations, see the +link:/reference/query-languages/esql/limitations.md#_supported_types[Supported +Field Types documentation].
+ +[discrete] +[[esql-lookup-join-limitations]] +==== Limitations + +The following are the current limitations with `LOOKUP JOIN`: + +* `LOOKUP JOIN` will be successful if the join field in the lookup index +is a `KEYWORD` type. If the main index's join field is `TEXT` type, it +must have an exact `.keyword` subfield that can be matched with the +lookup index's `KEYWORD` field. +* Indices in +link:/reference/elasticsearch/index-settings/index-modules.md#index-mode-setting[lookup] +mode are always single-sharded. +* Cross-cluster search is unsupported. Both source and lookup indices +must be local. +* `LOOKUP JOIN` can only use a single match field and a single index. +Wildcards, aliases, datemath, and data streams are not supported. +* The name of the match field in +`LOOKUP JOIN lu++_++idx ON match++_++field` must match an existing field +in the query. This may require renames or evals to achieve. +* The query will circuit break if there are too many matching documents +in the lookup index, or if the documents are too large. More precisely, +`LOOKUP JOIN` works in batches of, normally, about 10,000 rows; a large +amount of heap space is needed if the matching documents from the lookup +index for a batch are multiple megabytes or larger. This is roughly the +same as for `ENRICH`. diff --git a/docs/reference/esql/processing-commands/lookup.asciidoc b/docs/reference/esql/processing-commands/lookup.asciidoc new file mode 100644 index 0000000000000..f7146ab9084b9 --- /dev/null +++ b/docs/reference/esql/processing-commands/lookup.asciidoc @@ -0,0 +1,111 @@ +[discrete] +[[esql-lookup-join]] +=== `LOOKUP JOIN` + +[WARNING] +==== +This functionality is in technical preview and may be +changed or removed in a future release. Elastic will work to fix any +issues, but features in technical preview are not subject to the support +SLA of official GA features.
+==== +`LOOKUP JOIN` enables you to add data from another index, also known as a 'lookup' +index, to your {esql} query results, simplifying data enrichment +and analysis workflows. + +*Syntax* + +.... +FROM <source_index> +| LOOKUP JOIN <lookup_index> ON <field_name> +.... + +[source,esql] +---- +FROM firewall_logs +| LOOKUP JOIN threat_list ON source.IP +| WHERE threat_level IS NOT NULL +---- + +*Parameters* + +`lookup_index`:: +The name of the lookup index. This must be a specific index name - wildcards, aliases, and remote cluster +references are not supported. + +`field_name`:: +The field to join on. This field must exist +in both your current query results and in the lookup index. If the field +contains multi-valued entries, those entries will not match anything +(the added fields will contain `null` for those rows). + +*Description* + +The `LOOKUP JOIN` command adds new columns to your {esql} query +results table by finding documents in a lookup index that share the same +join field value as your result rows. + +For each row in your results table that matches a document in the lookup +index based on the join field, all fields from the matching document are +added as new columns to that row. + +If multiple documents in the lookup index match a single row in your +results, the output will contain one row for each matching combination. + +*Examples* + +[TIP] +==== +In case of name collisions, the newly created columns will override existing columns. +==== + +*IP Threat correlation*: This query would allow you to see if any source +IPs match known malicious addresses. + +[source,esql] +---- +FROM firewall_logs +| LOOKUP JOIN threat_list ON source.IP +---- + +*Host metadata correlation*: This query pulls in environment or +ownership details for each host to correlate with your metrics data.
+ +[source,esql] +---- +FROM system_metrics +| LOOKUP JOIN host_inventory ON host.name +| LOOKUP JOIN employees ON host.name +---- + +*Service ownership mapping*: This query would show logs with the owning +team or escalation information for faster triage and incident response. + +[source,esql] +---- +FROM app_logs +| LOOKUP JOIN service_owners ON service_id +---- + +`LOOKUP JOIN` is generally faster when there are fewer rows to join +with. {esql} will try to perform any `WHERE` clause before the +`LOOKUP JOIN` where possible. + +The following two examples will have the same results. One has the +`WHERE` clause before the `LOOKUP JOIN` and the other has it after. It does not +matter how you write your query; the optimizer will move the filter +before the lookup when the query is run. + +[source,esql] +---- +FROM Left +| WHERE Language IS NOT NULL +| LOOKUP JOIN Right ON Key +---- + +[source,esql] +---- +FROM Left +| LOOKUP JOIN Right ON Key +| WHERE Language IS NOT NULL +----
- -*Examples* - -// tag::examples[] -[source,console,id=esql-lookup-example] ----- -POST /_query?format=txt -{ - "query": """ - FROM library - | SORT page_count DESC - | KEEP name, author - | LOOKUP era ON author - | LIMIT 5 - """, - "tables": { - "era": { - "author": {"keyword": ["Frank Herbert", "Peter F. Hamilton", "Vernor Vinge", "Alastair Reynolds", "James S.A. Corey"]}, - "era": {"keyword": [ "The New Wave", "Diamond", "Diamond", "Diamond", "Hadron"]} - } - } -} ----- -// TEST[setup:library] - -Which returns: - -[source,text] ----- - name | author | era ---------------------+-----------------+--------------- -Pandora's Star |Peter F. Hamilton|Diamond -A Fire Upon the Deep|Vernor Vinge |Diamond -Dune |Frank Herbert |The New Wave -Revelation Space |Alastair Reynolds|Diamond -Leviathan Wakes |James S.A. Corey |Hadron ----- -// TESTRESPONSE[s/\|/\\|/ s/\+/\\+/] -// TESTRESPONSE[non_json] -// end::examples[] diff --git a/docs/reference/images/esql/esql-lookup-join.png b/docs/reference/images/esql/esql-lookup-join.png new file mode 100644 index 0000000000000..de220b0638a06 Binary files /dev/null and b/docs/reference/images/esql/esql-lookup-join.png differ diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc index 73e2db6e45e34..e431a1bb6e1aa 100644 --- a/docs/reference/index-modules.asciidoc +++ b/docs/reference/index-modules.asciidoc @@ -113,6 +113,8 @@ Index mode supports the following values: `standard`::: Standard indexing with default settings. +`lookup`::: Index that can be used for lookup joins in ES|QL. Limited to 1 shard. + `time_series`::: _(data streams only)_ Index mode optimized for storage of metrics. For more information, see <>. `logsdb`::: _(data streams only)_ Index mode optimized for <>.