diff --git a/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/ElasticsearchTestBasePlugin.java b/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/ElasticsearchTestBasePlugin.java index 0bd766752a377..02eee97f79d43 100644 --- a/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/ElasticsearchTestBasePlugin.java +++ b/build-tools-internal/src/main/java/org/elasticsearch/gradle/internal/ElasticsearchTestBasePlugin.java @@ -130,6 +130,10 @@ public void execute(Task t) { "--add-opens=java.base/java.time=ALL-UNNAMED", "--add-opens=java.management/java.lang.management=ALL-UNNAMED", "--enable-native-access=ALL-UNNAMED", + // Arrow (may need to be replaced by org.apache.arrow.memory.core once modularized) + "--add-opens=java.base/java.nio=ALL-UNNAMED", + // Define the allocation manager type to avoid classpath scanning to locate one. + "-Darrow.allocation.manager.type=Unsafe", "-XX:+HeapDumpOnOutOfMemoryError" ); diff --git a/distribution/src/config/jvm.options b/distribution/src/config/jvm.options index f4cc3b1bf6191..2abdf948e9e22 100644 --- a/distribution/src/config/jvm.options +++ b/distribution/src/config/jvm.options @@ -89,3 +89,12 @@ ## GC logging -Xlog:gc*,gc+age=trace,safepoint:file=gc.log:utctime,level,pid,tags:filecount=32,filesize=64m + +## Arrow +# Allow accessing a private field of java.nio.Buffer for direct memory access. +# See org.apache.arrow.memory.MemoryUtil and https://arrow.apache.org/docs/java/install.html +# See also libs/arrow/src/main/java/module-info.java-disabled for why we open to ALL-UNNAMED +# instead of limiting to org.apache.arrow.memory.core +--add-opens=java.base/java.nio=ALL-UNNAMED +# Define the allocation manager type to avoid classpath scanning to locate one. +-Darrow.allocation.manager.type=Unsafe diff --git a/docs/changelog/125040.yaml b/docs/changelog/125040.yaml new file mode 100644 index 0000000000000..cdc1989c7285e --- /dev/null +++ b/docs/changelog/125040.yaml @@ -0,0 +1,5 @@ +pr: 125040 +summary: Add Apache Arrow as a bulk ingestion format +area: CRUD +type: enhancement +issues: [] diff --git a/gradle/verification-metadata.xml b/gradle/verification-metadata.xml index 86146db87dbc1..b6929c5dabf12 100644 --- a/gradle/verification-metadata.xml +++ b/gradle/verification-metadata.xml @@ -687,6 +687,11 @@ + + + + + @@ -697,6 +702,11 @@ + + + + + @@ -3728,6 +3738,11 @@ + + + + + @@ -4263,6 +4278,11 @@ + + + + + diff --git a/libs/arrow/build.gradle b/libs/arrow/build.gradle new file mode 100644 index 0000000000000..c3c01d27c3a0e --- /dev/null +++ b/libs/arrow/build.gradle @@ -0,0 +1,81 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +// Notes: +// - additional JVM arguments are added to distribution/src/config/jvm.options and ElasticsearchTestBasePlugin +// - additional permissions are added to server/src/main/resources/org/elasticsearch/bootstrap/security.policy + +import org.elasticsearch.gradle.internal.precommit.CheckForbiddenApisTask + +apply plugin: 'elasticsearch.build' + +var arrowVersion = "18.3.0" + +dependencies { + //implementation(project(":libs:x-content")) + + // jackson-core is provided by :libs:x-content:impl. If declared here, there's a module issue that prevents ES from starting: + // + // fatal exception while booting Elasticsearch java.lang.IllegalAccessError: class org.elasticsearch.xcontent.provider.json.JsonXContentImpl (in module org.elasticsearch.xcontent.impl) cannot access class com.fasterxml.jackson.core.JsonFactoryBuilder (in unnamed module @0x4727e5fc) because module org.elasticsearch.xcontent.impl does not read unnamed module @0x4727e5fc + // at org.elasticsearch.xcontent.impl@9.0.0-SNAPSHOT/org.elasticsearch.xcontent.provider.json.JsonXContentImpl.(JsonXContentImpl.java:50) + // at org.elasticsearch.xcontent.impl@9.0.0-SNAPSHOT/org.elasticsearch.xcontent.provider.XContentProviderImpl$2.XContent(XContentProviderImpl.java:54) + // at org.elasticsearch.xcontent@9.0.0-SNAPSHOT/org.elasticsearch.xcontent.json.JsonXContent.(JsonXContent.java:37) + // at org.elasticsearch.xcontent@9.0.0-SNAPSHOT/org.elasticsearch.xcontent.XContentType.(XContentType.java:28) + // at org.elasticsearch.server@9.0.0-SNAPSHOT/org.elasticsearch.common.settings.Setting.arrayToParsableString(Setting.java:1883) + //implementation(project(":libs:x-content:impl")) + + // arrow-vector + api("org.apache.arrow:arrow-vector:${arrowVersion}") + api("com.fasterxml.jackson.core:jackson-core:${versions.jackson}") + api("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}") + api("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}") + api("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${versions.jackson}") + + api("com.google.flatbuffers:flatbuffers-java:25.2.10") + api("commons-codec:commons-codec:${versions.commonscodec}") // Arrow 18 -> commons-codec 1.17.1 + api("org.slf4j:slf4j-api:${versions.slf4j}") + api("org.immutables:value-annotations:2.10.1") // provided dependency + + // arrow-format + api("org.apache.arrow:arrow-format:${arrowVersion}") + // also depends on flatbuffers + + // arrow-memory-core + api("org.apache.arrow:arrow-memory-core:${arrowVersion}") + api("com.google.errorprone:error_prone_annotations:2.31.0") // provided dependency + api('org.checkerframework:checker-qual:3.48.1') // provided dependency + // also depends on value-annotations (provided dependency) + + // arrow-memory-unsafe + api("org.apache.arrow:arrow-memory-unsafe:${arrowVersion}") + // also depends on value-annotations (provided dependency) + + testImplementation(project(":test:framework")) { + exclude group: 'org.elasticsearch', module: 'arrow' + } +} + +tasks.named("dependencyLicenses").configure { + mapping from: /jackson-.*/, to: 'jackson' + mapping from: /arrow-.*/, to: 'arrow' + mapping from: /value-annotations.*/, to: 'org-immutables' +} + +tasks.named("thirdPartyAudit").configure { + ignoreViolations( + 'org.apache.arrow.memory.util.MemoryUtil', + 'org.apache.arrow.memory.util.MemoryUtil$1', + ) +} + +tasks.withType(CheckForbiddenApisTask).configureEach { + // Remove server signatures as they will fail on classes missing in this lib's classpath, + // like org.apache.lucene.util.IOUtils + 
replaceSignatureFiles('jdk-signatures') +} diff --git a/x-pack/plugin/esql/arrow/licenses/arrow-LICENSE.txt b/libs/arrow/licenses/arrow-LICENSE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/arrow-LICENSE.txt rename to libs/arrow/licenses/arrow-LICENSE.txt diff --git a/x-pack/plugin/esql/arrow/licenses/arrow-NOTICE.txt b/libs/arrow/licenses/arrow-NOTICE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/arrow-NOTICE.txt rename to libs/arrow/licenses/arrow-NOTICE.txt diff --git a/x-pack/plugin/esql/arrow/licenses/checker-qual-LICENSE.txt b/libs/arrow/licenses/checker-qual-LICENSE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/checker-qual-LICENSE.txt rename to libs/arrow/licenses/checker-qual-LICENSE.txt diff --git a/x-pack/plugin/esql/arrow/licenses/checker-qual-NOTICE.txt b/libs/arrow/licenses/checker-qual-NOTICE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/checker-qual-NOTICE.txt rename to libs/arrow/licenses/checker-qual-NOTICE.txt diff --git a/x-pack/plugin/esql/arrow/licenses/flatbuffers-java-LICENSE.txt b/libs/arrow/licenses/commons-codec-LICENSE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/flatbuffers-java-LICENSE.txt rename to libs/arrow/licenses/commons-codec-LICENSE.txt diff --git a/libs/arrow/licenses/commons-codec-NOTICE.txt b/libs/arrow/licenses/commons-codec-NOTICE.txt new file mode 100644 index 0000000000000..56916449bbe10 --- /dev/null +++ b/libs/arrow/licenses/commons-codec-NOTICE.txt @@ -0,0 +1,17 @@ +Apache Commons Codec +Copyright 2002-2015 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java +contains test data from http://aspell.net/test/orig/batch0.tab. +Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org) + +=============================================================================== + +The content of package org.apache.commons.codec.language.bm has been translated +from the original php source code available at http://stevemorse.org/phoneticinfo.htm +with permission from the original authors. +Original source copyright: +Copyright (c) 2008 Alexander Beider & Stephen P. Morse. diff --git a/libs/arrow/licenses/error_prone_annotations-LICENSE.txt b/libs/arrow/licenses/error_prone_annotations-LICENSE.txt new file mode 100644 index 0000000000000..5c304d1a4a7b4 --- /dev/null +++ b/libs/arrow/licenses/error_prone_annotations-LICENSE.txt @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/x-pack/plugin/esql/arrow/licenses/flatbuffers-java-NOTICE.txt b/libs/arrow/licenses/error_prone_annotations-NOTICE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/flatbuffers-java-NOTICE.txt rename to libs/arrow/licenses/error_prone_annotations-NOTICE.txt diff --git a/x-pack/plugin/esql/arrow/licenses/jackson-LICENSE.txt b/libs/arrow/licenses/flatbuffers-java-LICENSE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/jackson-LICENSE.txt rename to libs/arrow/licenses/flatbuffers-java-LICENSE.txt diff --git a/x-pack/plugin/esql/arrow/licenses/jackson-NOTICE.txt b/libs/arrow/licenses/flatbuffers-java-NOTICE.txt similarity index 100% rename from x-pack/plugin/esql/arrow/licenses/jackson-NOTICE.txt rename to libs/arrow/licenses/flatbuffers-java-NOTICE.txt diff --git a/libs/arrow/licenses/jackson-LICENSE.txt b/libs/arrow/licenses/jackson-LICENSE.txt new file mode 100644 index 0000000000000..d645695673349 --- /dev/null +++ b/libs/arrow/licenses/jackson-LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/libs/arrow/licenses/jackson-NOTICE.txt b/libs/arrow/licenses/jackson-NOTICE.txt new file mode 100644 index 0000000000000..4c976b7b4cc58 --- /dev/null +++ b/libs/arrow/licenses/jackson-NOTICE.txt @@ -0,0 +1,20 @@ +# Jackson JSON processor + +Jackson is a high-performance, Free/Open Source JSON processing library. +It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +been in development since 2007. +It is currently developed by a community of developers, as well as supported +commercially by FasterXML.com. + +## Licensing + +Jackson core and extension components may licensed under different licenses. +To find the details that apply to this artifact see the accompanying LICENSE file. +For more information, including possible other licensing options, contact +FasterXML.com (http://fasterxml.com). 
+ +## Credits + +A list of contributors may be found from CREDITS file, which is included +in some artifacts (usually source distributions); but is always available +from the source code management (SCM) system project uses. diff --git a/libs/arrow/licenses/org-immutables-LICENSE.txt b/libs/arrow/licenses/org-immutables-LICENSE.txt new file mode 100644 index 0000000000000..5c304d1a4a7b4 --- /dev/null +++ b/libs/arrow/licenses/org-immutables-LICENSE.txt @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/x-pack/plugin/esql/arrow/licenses/slf4j-NOTICE.txt b/libs/arrow/licenses/org-immutables-NOTICE.txt
similarity index 100%
rename from x-pack/plugin/esql/arrow/licenses/slf4j-NOTICE.txt
rename to libs/arrow/licenses/org-immutables-NOTICE.txt
diff --git a/x-pack/plugin/esql/arrow/licenses/slf4j-LICENSE.txt b/libs/arrow/licenses/slf4j-api-LICENSE.txt
similarity index 100%
rename from x-pack/plugin/esql/arrow/licenses/slf4j-LICENSE.txt
rename to libs/arrow/licenses/slf4j-api-LICENSE.txt
diff --git a/libs/arrow/licenses/slf4j-api-NOTICE.txt b/libs/arrow/licenses/slf4j-api-NOTICE.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/libs/arrow/src/main/java/module-info.java-disabled b/libs/arrow/src/main/java/module-info.java-disabled
new file mode 100644
index 0000000000000..33b421033f218
--- /dev/null
+++ b/libs/arrow/src/main/java/module-info.java-disabled
@@ -0,0 +1,35 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+// Note: we cannot use Java modules for this library.
+//
+// For performance reasons Arrow accesses some private fields of ByteBuffer, and this requires "opens"
+// permission on the java.base/java.nio package. See https://arrow.apache.org/docs/java/install.html
+//
+// - if this module is loaded from a plugin, the JVM will not know it at startup, and will not honor
+//   --add-opens=java.base/java.nio=org.apache.arrow.memory.core.
+//   The startup logs will contain: "Unknown module: org.apache.arrow.memory.core specified to --add-opens"
+//
+// - if loaded from the server or core modules, it reveals version conflicts:
+//   - because of xcontent:impl, on com.fasterxml.jackson.core:jackson-core between versions 2.17.2 and 2.15.0
+//   - because of ??, on commons-codec:commons-codec between versions 1.16.1 and 1.15
+//   Fixing them isn't a trivial task (it also impacts serverless).
+//
+// So we have to disable Java modules for this Gradle module and use --add-opens=java.base/java.nio=ALL-UNNAMED
+// in distribution/src/config/jvm.options until the version conflict issue is solved and we can load it from
+// the server or core modules.
+
+module org.elasticsearch.libs.arrow {
+    exports org.elasticsearch.libs.arrow;
+
+    requires transitive org.apache.arrow.memory.core;
+    requires transitive org.apache.arrow.vector;
+    requires transitive org.apache.arrow.format;
+    requires org.apache.arrow.memory.unsafe;
+}
diff --git a/libs/arrow/src/main/java/org/elasticsearch/libs/arrow/Arrow.java b/libs/arrow/src/main/java/org/elasticsearch/libs/arrow/Arrow.java
new file mode 100644
index 0000000000000..d3f306fc64acc
--- /dev/null
+++ b/libs/arrow/src/main/java/org/elasticsearch/libs/arrow/Arrow.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.libs.arrow;
+
+import org.apache.arrow.memory.AllocationListener;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+
+public class Arrow {
+
+    /**
+     * Arrow IPC stream media type.
+     *
+     * @see <a href="https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc">Format docs</a>
+     * @see <a href="https://www.iana.org/assignments/media-types/application/vnd.apache.arrow.stream">IANA assignment</a>
+     */
+    public static final String MEDIA_TYPE = "application/vnd.apache.arrow.stream";
+
+    private static final RootAllocator ROOT_ALLOCATOR = new RootAllocator();
+
+    /**
+     * Returns the global root allocator. Do not use it to allocate memory directly; use {@link #newChildAllocator(String, long, long)}
+     * instead, to enforce allocation limits and to track potential memory leaks when the child allocator is closed.
+     */
+    public static RootAllocator rootAllocator() {
+        return ROOT_ALLOCATOR;
+    }
+
+    /**
+     * Creates a new allocator, child of the root allocator.
+     */
+    public static BufferAllocator newChildAllocator(String name, long initReservation, long maxAllocation) {
+        return ROOT_ALLOCATOR.newChildAllocator(name, initReservation, maxAllocation);
+    }
+
+    /**
+     * Creates a new allocator, child of the root allocator, notifying the given {@link AllocationListener}.
+     */
+    public static BufferAllocator newChildAllocator(String name, AllocationListener listener, long initReservation, long maxAllocation) {
+        return ROOT_ALLOCATOR.newChildAllocator(name, listener, initReservation, maxAllocation);
+    }
+}
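The allocator API above is small, but the usage pattern it encourages matters. Below is a minimal sketch (illustrative only, not part of this change; the allocator name and limits are made up) of how a consumer of this library is expected to use a bounded child allocator and rely on try-with-resources so that closing the allocator surfaces leaks:

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.IntVector;

import org.elasticsearch.libs.arrow.Arrow;

public class ArrowAllocatorSketch {
    public static void main(String[] args) {
        // Child allocator with no initial reservation and a 1 MiB ceiling;
        // allocations beyond the limit fail instead of growing unbounded.
        try (BufferAllocator allocator = Arrow.newChildAllocator("sketch", 0, 1024 * 1024)) {
            try (IntVector vector = new IntVector("values", allocator)) {
                vector.allocateNew(3);
                for (int i = 0; i < 3; i++) {
                    vector.setSafe(i, i * 10);
                }
                vector.setValueCount(3);
            }
            // If any buffer allocated from this child allocator were still alive
            // here, closing it would throw IllegalStateException; that is how
            // the "track potential memory leaks" behavior in the javadoc shows up.
        }
    }
}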
diff --git a/libs/arrow/src/main/java/org/elasticsearch/libs/arrow/ArrowFormatException.java b/libs/arrow/src/main/java/org/elasticsearch/libs/arrow/ArrowFormatException.java
new file mode 100644
index 0000000000000..bc350c9061258
--- /dev/null
+++ b/libs/arrow/src/main/java/org/elasticsearch/libs/arrow/ArrowFormatException.java
@@ -0,0 +1,17 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.libs.arrow;
+
+public class ArrowFormatException extends RuntimeException {
+
+    public ArrowFormatException(String msg) {
+        super(msg);
+    }
+}
diff --git a/modules/arrow/build.gradle b/modules/arrow/build.gradle
new file mode 100644
index 0000000000000..403d88b22cd0b
--- /dev/null
+++ b/modules/arrow/build.gradle
@@ -0,0 +1,31 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+apply plugin: 'elasticsearch.internal-es-plugin'
+apply plugin: 'elasticsearch.internal-cluster-test'
+
+esplugin {
+  name = 'arrow'
+  description = 'Provides Arrow integration for Elasticsearch'
+  classname = 'org.elasticsearch.arrow.ArrowPlugin'
+}
+
+dependencies {
+  implementation(project(":libs:x-content"))
+  implementation(project(":libs:arrow"))
+  implementation("com.fasterxml.jackson.core:jackson-core:${versions.jackson}")
+  implementation("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}")
+  implementation("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}")
+
+  testImplementation(project(":test:framework"))
+}
+
+tasks.named("dependencyLicenses").configure {
+  mapping from: /jackson-.*/, to: 'jackson'
+}
diff --git a/modules/arrow/licenses/jackson-LICENSE.txt b/modules/arrow/licenses/jackson-LICENSE.txt
new file mode 100644
index 0000000000000..d645695673349
--- /dev/null
+++ b/modules/arrow/licenses/jackson-LICENSE.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/modules/arrow/licenses/jackson-NOTICE.txt b/modules/arrow/licenses/jackson-NOTICE.txt
new file mode 100644
index 0000000000000..4c976b7b4cc58
--- /dev/null
+++ b/modules/arrow/licenses/jackson-NOTICE.txt
@@ -0,0 +1,20 @@
+# Jackson JSON processor
+
+Jackson is a high-performance, Free/Open Source JSON processing library.
+It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has
+been in development since 2007.
+It is currently developed by a community of developers, as well as supported
+commercially by FasterXML.com.
+
+## Licensing
+
+Jackson core and extension components may be licensed under different licenses.
+To find the details that apply to this artifact see the accompanying LICENSE file.
+For more information, including possible other licensing options, contact
+FasterXML.com (http://fasterxml.com).
+
+## Credits
+
+A list of contributors may be found from CREDITS file, which is included
+in some artifacts (usually source distributions); but is always available
+from the source code management (SCM) system project uses.
diff --git a/modules/arrow/src/internalClusterTest/java/org/elasticsearch/arrow/bulk/ArrowBulkActionIT.java b/modules/arrow/src/internalClusterTest/java/org/elasticsearch/arrow/bulk/ArrowBulkActionIT.java
new file mode 100644
index 0000000000000..3f289d8d0db5e
--- /dev/null
+++ b/modules/arrow/src/internalClusterTest/java/org/elasticsearch/arrow/bulk/ArrowBulkActionIT.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowStreamReader;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.Text;
+import org.apache.http.entity.AbstractHttpEntity;
+import org.apache.http.entity.ByteArrayEntity;
+import org.apache.http.entity.ContentType;
+import org.apache.http.entity.InputStreamEntity;
+import org.elasticsearch.arrow.ArrowPlugin;
+import org.elasticsearch.client.Request;
+import org.elasticsearch.client.RestClient;
+import org.elasticsearch.libs.arrow.Arrow;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.xcontent.XContentParserConfiguration;
+import org.elasticsearch.xcontent.XContentType;
+import org.junit.After;
+import org.junit.Before;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * End-to-end test for Arrow bulk ingestion. Tests for the various Arrow datatypes and
+ * bulk actions are in {@code ArrowBulkIncrementalParserTests}
+ */
+public class ArrowBulkActionIT extends ESSingleNodeRestTestCase {
+
+    private RestClient restClient;
+
+    @Before
+    public void init() {
+        restClient = createRestClient();
+    }
+
+    @After
+    public void cleanup() throws IOException {
+        restClient.close();
+    }
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        return Collections.singletonList(ArrowPlugin.class);
+    }
+
+    /**
+     * An end-to-end test that checks that Arrow data is correctly indexed and can be searched.
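+     * Batch count, row count and chunked transfer encoding are randomized.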
+ */ + public void testBulk() throws Exception { + + String index = "arrow_bulk_test"; + + { + // Check that the index doesn't exist + var request = new Request("HEAD", "/" + index); + var response = restClient.performRequest(request); + assertEquals(404, response.getStatusLine().getStatusCode()); + } + + // Create a dataframe with two columns: integer and string + Field intField = new Field("ints", FieldType.nullable(new ArrowType.Int(32, true)), null); + Field strField = new Field("strings", FieldType.nullable(new ArrowType.Utf8()), null); + Schema schema = new Schema(List.of(intField, strField)); + + int batchCount = randomIntBetween(1, 10); + int rowCount = randomIntBetween(1, 10); + boolean chunked = randomBoolean(); + + byte[] payload; + + // Create vectors and write them to a byte array + try (var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); var root = VectorSchemaRoot.create(schema, allocator);) { + var baos = new ByteArrayOutputStream(); + IntVector intVector = (IntVector) root.getVector(0); + VarCharVector stringVector = (VarCharVector) root.getVector(1); + + try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, baos)) { + for (int batch = 0; batch < batchCount; batch++) { + for (int row = 0; row < rowCount; row++) { + int globalRow = row + batch * rowCount; + intVector.setSafe(row, globalRow); + stringVector.setSafe(row, new Text("row" + globalRow)); + } + root.setRowCount(rowCount); + writer.writeBatch(); + } + } + payload = baos.toByteArray(); + } + + { + // Bulk insert the arrow stream + var request = new Request("POST", "/_arrow/" + index + "/_bulk"); + request.addParameter("refresh", "wait_for"); + request.addParameter("error_trace", "true"); + request.setOptions(request.getOptions().toBuilder().addHeader("Content-type", "application/vnd.apache.arrow.stream")); + AbstractHttpEntity entity; + if (chunked) { + entity = new InputStreamEntity(new ByteArrayInputStream(payload), ContentType.create(Arrow.MEDIA_TYPE)); + entity.setChunked(true); + } else { + entity = new ByteArrayEntity(payload, ContentType.create(Arrow.MEDIA_TYPE)); + } + request.setEntity(entity); + + var response = restClient.performRequest(request); + + // Response is an Arrow stream with empty vectors, indicating success + assertEquals(Arrow.MEDIA_TYPE, response.getHeader("Content-Type")); + try ( + var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); + var reader = new ArrowStreamReader(response.getEntity().getContent(), allocator); + ) { + reader.loadNextBatch(); + var root = reader.getVectorSchemaRoot(); + var itemNoVector = root.getVector(ArrowBulkAction.ERR_ITEM_NO); + assertNotNull(itemNoVector); + assertEquals(0, itemNoVector.getValueCount()); + } + } + + { + // Check that the index effectively contains what we sent + var request = new Request("GET", "/" + index + "/_count"); + var response = restClient.performRequest(request); + var result = XContentType.JSON.xContent() + .createParser(XContentParserConfiguration.EMPTY, response.getEntity().getContent()) + .map(); + + assertEquals(batchCount * rowCount, result.get("count")); + } + } +} diff --git a/modules/arrow/src/internalClusterTest/java/org/elasticsearch/arrow/bulk/ESSingleNodeRestTestCase.java b/modules/arrow/src/internalClusterTest/java/org/elasticsearch/arrow/bulk/ESSingleNodeRestTestCase.java new file mode 100644 index 0000000000000..5d33862db9d74 --- /dev/null +++ b/modules/arrow/src/internalClusterTest/java/org/elasticsearch/arrow/bulk/ESSingleNodeRestTestCase.java @@ -0,0 +1,42 @@ +/* + * 
Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.arrow.bulk; + +import org.apache.http.HttpHost; +import org.elasticsearch.client.RestClient; +import org.elasticsearch.common.network.NetworkAddress; +import org.elasticsearch.http.HttpInfo; +import org.elasticsearch.node.NodeService; +import org.elasticsearch.test.ESSingleNodeTestCase; + +import java.net.InetSocketAddress; + +/** + * An {@link ESSingleNodeTestCase} with a Rest client (a feature that is provided by {@code ESIntegTestCase}). + */ +public abstract class ESSingleNodeRestTestCase extends ESSingleNodeTestCase { + + @Override + protected boolean addMockHttpTransport() { + return false; + } + + public RestClient createRestClient() { + NodeService instance = node().injector().getInstance(NodeService.class); + var httpInfo = instance.info(false, false, false, false, false, false, true, false, false, false, false, false) + .getInfo(HttpInfo.class); + + assertNotNull("Couldn't get the node's http info", httpInfo); + + InetSocketAddress address = httpInfo.address().publishAddress().address(); + HttpHost host = new HttpHost(NetworkAddress.format(address.getAddress()), address.getPort(), "http"); + return RestClient.builder(host).build(); + } +} diff --git a/modules/arrow/src/main/java/module-info.java-disabled b/modules/arrow/src/main/java/module-info.java-disabled new file mode 100644 index 0000000000000..a3b51edc7b9ed --- /dev/null +++ b/modules/arrow/src/main/java/module-info.java-disabled @@ -0,0 +1,24 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +// See why Java modules are disabled here in libs/arrow/src/main/java/module-info.java-disabled + +module org.elasticsearch.arrow { + requires org.apache.arrow.vector; + requires org.apache.arrow.memory.core; + requires org.apache.arrow.format; + + requires com.fasterxml.jackson.core; + requires com.fasterxml.jackson.databind; + requires org.elasticsearch.libs.arrow; + requires org.elasticsearch.xcontent; + requires org.elasticsearch.server; + requires org.elasticsearch.base; + requires org.apache.lucene.core; +} diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/ArrowPlugin.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/ArrowPlugin.java new file mode 100644 index 0000000000000..66724e2c9aeca --- /dev/null +++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/ArrowPlugin.java @@ -0,0 +1,77 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.arrow; + +import org.elasticsearch.arrow.bulk.ArrowBulkAction; +import org.elasticsearch.client.internal.Client; +import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.common.component.AbstractLifecycleComponent; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.IndexScopedSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsFilter; +import org.elasticsearch.features.NodeFeature; +import org.elasticsearch.libs.arrow.Arrow; +import org.elasticsearch.plugins.ActionPlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.rest.RestController; +import org.elasticsearch.rest.RestHandler; +import org.elasticsearch.telemetry.TelemetryProvider; + +import java.util.Collection; +import java.util.List; +import java.util.function.Predicate; +import java.util.function.Supplier; + +public class ArrowPlugin extends Plugin implements ActionPlugin { + + private Client client; + private TelemetryProvider telemetryProvider; + + @Override + public Collection createComponents(PluginServices services) { + this.client = services.client(); + this.telemetryProvider = services.telemetryProvider(); + + return List.of(new AbstractLifecycleComponent() { + @Override + protected void doStart() { + // Make sure Arrow is initialized + Arrow.rootAllocator(); + } + + @Override + protected void doStop() {} + + @Override + protected void doClose() {} + }); + } + + /** + * Rest handlers added by this plugin. + */ + @Override + public Collection getRestHandlers( + Settings settings, + NamedWriteableRegistry namedWriteableRegistry, + RestController restController, + ClusterSettings clusterSettings, + IndexScopedSettings indexScopedSettings, + SettingsFilter settingsFilter, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier nodesInCluster, + Predicate clusterSupportsFeature + ) { + return List.of(new ArrowBulkAction(this.client, this.telemetryProvider, settings)); + } +} diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkAction.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkAction.java new file mode 100644 index 0000000000000..0b09fc5739270 --- /dev/null +++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkAction.java @@ -0,0 +1,249 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.apache.arrow.vector.UInt2Vector;
+import org.apache.arrow.vector.UInt4Vector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.DocWriteRequest;
+import org.elasticsearch.action.bulk.BulkItemResponse;
+import org.elasticsearch.action.bulk.BulkRequest;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.bulk.BulkShardRequest;
+import org.elasticsearch.action.bulk.IncrementalBulkService;
+import org.elasticsearch.action.support.ActiveShardCount;
+import org.elasticsearch.client.internal.Client;
+import org.elasticsearch.client.internal.node.NodeClient;
+import org.elasticsearch.common.bytes.ReleasableBytesReference;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.index.IndexingPressure;
+import org.elasticsearch.libs.arrow.Arrow;
+import org.elasticsearch.rest.BaseRestHandler;
+import org.elasticsearch.rest.RestChannel;
+import org.elasticsearch.rest.RestRequest;
+import org.elasticsearch.rest.RestResponse;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.rest.RestUtils;
+import org.elasticsearch.rest.action.RestRefCountedChunkedToXContentListener;
+import org.elasticsearch.rest.action.RestToXContentListener;
+import org.elasticsearch.rest.action.document.RestBulkAction;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
+import org.elasticsearch.telemetry.TelemetryProvider;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import static org.elasticsearch.rest.RestRequest.Method.POST;
+import static org.elasticsearch.rest.RestRequest.Method.PUT;
+
+public class ArrowBulkAction extends BaseRestHandler {
+
+    public static final String ID = "_id";
+    public static final String INDEX = "_index";
+    public static final String ACTION = "_bulk_action";
+
+    public static final String ERR_ITEM_NO = "item_no";
+    public static final String ERR_STATUS = "status";
+    public static final String ERR_TYPE = "type";
+    public static final String ERR_REASON = "reason";
+
+    private final IncrementalBulkService bulkHandler;
+
+    public ArrowBulkAction(Client client, TelemetryProvider telemetryProvider, Settings settings) {
+        this.bulkHandler = new IncrementalBulkService(client, new IndexingPressure(settings), telemetryProvider.getMeterRegistry());
+    }
+
+    @Override
+    public String getName() {
+        return "arrow_bulk_action";
+    }
+
+    @Override
+    public List<Route> routes() {
+        return List.of(
+            new Route(POST, "/_arrow/_bulk"),
+            new Route(PUT, "/_arrow/_bulk"),
+            new Route(POST, "/_arrow/{index}/_bulk"),
+            new Route(PUT, "/_arrow/{index}/_bulk")
+        );
+    }
+
+    public boolean mediaTypesValid(RestRequest request) {
+        return ArrowBulkRequestParser.isArrowRequest(request);
+    }
+
+    @Override
+    public boolean supportsContentStream() {
+        return true;
+    }
+
+    @Override
+    protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException {
+
+        if (request.isStreamedContent() == false) {
+            // FIXME: can we ever land here since supportsContentStream() returns true?
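+            // Non-streamed content: the whole request body has been buffered, so it is parsed here
+            // in a single pass, unlike the streamed branch below that feeds chunks to the incremental parser.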
+            BulkRequest bulkRequest = new BulkRequest();
+            String defaultIndex = request.param("index");
+            String defaultRouting = request.param("routing");
+            FetchSourceContext defaultFetchSourceContext = FetchSourceContext.parseFromRestRequest(request);
+            String defaultPipeline = request.param("pipeline");
+            boolean defaultListExecutedPipelines = request.paramAsBoolean("list_executed_pipelines", false);
+            String waitForActiveShards = request.param("wait_for_active_shards");
+            if (waitForActiveShards != null) {
+                bulkRequest.waitForActiveShards(ActiveShardCount.parseString(waitForActiveShards));
+            }
+            Boolean defaultRequireAlias = request.paramAsBoolean(DocWriteRequest.REQUIRE_ALIAS, false);
+            boolean defaultRequireDataStream = request.paramAsBoolean(DocWriteRequest.REQUIRE_DATA_STREAM, false);
+            bulkRequest.timeout(request.paramAsTime("timeout", BulkShardRequest.DEFAULT_TIMEOUT));
+            bulkRequest.setRefreshPolicy(request.param("refresh"));
+            bulkRequest.includeSourceOnError(RestUtils.getIncludeSourceOnError(request));
+            bulkRequest.requestParamsUsed(request.params().keySet());
+            ReleasableBytesReference content = request.content();
+            String accept = request.header("Accept");
+            boolean arrowResponse = accept == null || Arrow.MEDIA_TYPE.equals(accept);
+
+            try {
+                ArrowBulkRequestParser parser = new ArrowBulkRequestParser(request);
+                parser.parse(
+                    content,
+                    defaultIndex,
+                    defaultRouting,
+                    defaultFetchSourceContext,
+                    defaultPipeline,
+                    defaultRequireAlias,
+                    defaultRequireDataStream,
+                    defaultListExecutedPipelines,
+                    false,
+                    null,
+                    (req, type) -> bulkRequest.add(req),
+                    bulkRequest::add,
+                    bulkRequest::add
+                );
+            } catch (Exception e) {
+                return channel -> new RestToXContentListener<>(channel).onFailure(
+                    new ElasticsearchParseException("Failed to parse Arrow format", e)
+                );
+            }
+            return channel -> {
+                // FIXME: review ref counting and release in nominal and failure mode
+                content.mustIncRef();
+                var parent = new RestRefCountedChunkedToXContentListener<BulkResponse>(channel);
+                client.bulk(bulkRequest, ActionListener.releaseAfter(new ArrowResponseListener(channel, arrowResponse, parent), content));
+            };
+
+        } else {
+
+            String waitForActiveShards = request.param("wait_for_active_shards");
+            TimeValue timeout = request.paramAsTime("timeout", BulkShardRequest.DEFAULT_TIMEOUT);
+            String refresh = request.param("refresh");
+            // Return an Arrow response if Accept is missing or is the Arrow media type.
+            // The Arrow response only contains failures, and will be an empty table if there are no failures.
+            // Global (request-level) failures are always returned as JSON.
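+            // e.g. an Accept header equal to Arrow.MEDIA_TYPE (or no Accept header at all) selects the
+            // Arrow response; any other value falls back to the standard JSON bulk response.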
+ String accept = request.header("Accept"); + boolean arrowResponse = accept == null || Arrow.MEDIA_TYPE.equals(accept); + + return new RestBulkAction.ChunkHandler( + false, + request, + () -> bulkHandler.newBulkRequest(waitForActiveShards, timeout, refresh, request.params().keySet()), + new ArrowBulkRequestParser(request) + ) { + @Override + protected ActionListener createResponseListener(RestChannel channel) { + return new ArrowResponseListener(channel, arrowResponse, super.createResponseListener(channel)); + } + }; + } + } + + private record ArrowResponseListener(RestChannel channel, boolean arrowResponse, ActionListener parent) + implements + ActionListener { + + @Override + public void onResponse(BulkResponse bulkItemResponses) { + if (arrowResponse == false) { + // JSON response + parent.onResponse(bulkItemResponses); + return; + } + + // FIXME: we can be more efficient and stream the response, like we do in ESQL's ArrowResponse + var output = new BytesReferenceOutputStream(); + try ( + var allocator = Arrow.newChildAllocator("bulk_response", 0, 10_000_000L); + var itemNoVector = new UInt4Vector(ERR_ITEM_NO, allocator); + // Could be dictionary-encoded to reduce response size - only beneficial when there are errors + var indexVector = new VarCharVector(INDEX, allocator); + var idVector = new VarCharVector(ID, allocator); + var statusVector = new UInt2Vector(ERR_STATUS, allocator); + // Could be dictionary-encoded to reduce payload size + var typeVector = new VarCharVector(ERR_TYPE, allocator); + var reasonVector = new VarCharVector(ERR_REASON, allocator); + + var root = new VectorSchemaRoot(List.of(itemNoVector, indexVector, idVector, statusVector, typeVector, reasonVector)); + var writer = new ArrowStreamWriter(root, null, output); + ) { + int failureCount = 0; + var items = bulkItemResponses.getItems(); + for (int itemNo = 0; itemNo < items.length; itemNo++) { + var item = items[itemNo]; + if (item.isFailed()) { + BulkItemResponse.Failure failure = item.getFailure(); + + itemNoVector.setSafe(failureCount, itemNo); + addValue(indexVector, failureCount, failure.getIndex()); + addValue(idVector, failureCount, failure.getId()); + statusVector.setSafe(failureCount, failure.getStatus().getStatus()); + + Throwable cause = ExceptionsHelper.unwrapCause(failure.getCause()); + addValue(typeVector, failureCount, cause == null ? null : ElasticsearchException.getExceptionName(cause)); + addValue(reasonVector, failureCount, cause == null ? 
+
+                        failureCount++;
+                    }
+                }
+
+                for (var vec : root.getFieldVectors()) {
+                    vec.setValueCount(failureCount);
+                }
+                root.setRowCount(failureCount);
+
+                writer.writeBatch();
+            } catch (IOException e) {
+                this.onFailure(e);
+                // The parent listener has already sent an error response, don't send another one.
+                return;
+            }
+            var response = new RestResponse(RestStatus.OK, Arrow.MEDIA_TYPE, output.asBytesReference());
+            channel.sendResponse(response);
+        }
+
+        private void addValue(VarCharVector vector, int position, String value) {
+            if (value == null) {
+                vector.setNull(position);
+            } else {
+                vector.setSafe(position, value.getBytes(StandardCharsets.UTF_8));
+            }
+        }
+
+        @Override
+        public void onFailure(Exception e) {
+            // Output the failure as JSON
+            parent.onFailure(e);
+        }
+    }
+}
diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkIncrementalParser.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkIncrementalParser.java
new file mode 100644
index 0000000000000..fd902018e0fd7
--- /dev/null
+++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkIncrementalParser.java
@@ -0,0 +1,316 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.complex.MapVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.dictionary.Dictionary;
+import org.apache.arrow.vector.types.Types;
+import org.apache.arrow.vector.util.Text;
+import org.elasticsearch.action.DocWriteRequest;
+import org.elasticsearch.action.bulk.BulkRequestParser;
+import org.elasticsearch.action.delete.DeleteRequest;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.update.UpdateRequest;
+import org.elasticsearch.arrow.xcontent.ArrowToString;
+import org.elasticsearch.arrow.xcontent.ArrowToXContent;
+import org.elasticsearch.arrow.xcontent.XContentBuffer;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.libs.arrow.Arrow;
+import org.elasticsearch.libs.arrow.ArrowFormatException;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
+import org.elasticsearch.xcontent.XContent;
+import org.elasticsearch.xcontent.XContentParserConfiguration;
+import org.elasticsearch.xcontent.XContentType;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Map;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+class ArrowBulkIncrementalParser extends BulkRequestParser.XContentIncrementalParser {
+
+    /** XContent format used to encode source documents */
+    private static final XContent SOURCE_XCONTENT = XContentType.CBOR.xContent();
+
+    private final DocWriteRequest.OpType defaultOpType;
+
+    private final ArrowIncrementalParser arrowParser;
+    private VectorSchemaRoot schemaRoot;
+    private Map<Long, Dictionary> dictionaries;
+
+    private Integer idField = null;
+    private Integer indexField = null;
+    private Integer actionField = null;
+    private BitSet valueFields;
+
+    private final ArrowToXContent arrowToXContent = new ArrowToXContent();
+
+    ArrowBulkIncrementalParser(
+        DocWriteRequest.OpType defaultOpType,
+        @Nullable String defaultIndex,
+        @Nullable String defaultRouting,
+        @Nullable FetchSourceContext defaultFetchSourceContext,
+        @Nullable String defaultPipeline,
+        @Nullable Boolean defaultRequireAlias,
+        @Nullable Boolean defaultRequireDataStream,
+        @Nullable Boolean defaultListExecutedPipelines,
+        boolean allowExplicitIndex,
+        XContentType xContentType,
+        XContentParserConfiguration config,
+        BiConsumer<IndexRequest, String> indexRequestConsumer,
+        Consumer<UpdateRequest> updateRequestConsumer,
+        Consumer<DeleteRequest> deleteRequestConsumer
+    ) {
+        super(
+            defaultIndex,
+            defaultRouting,
+            defaultFetchSourceContext,
+            defaultPipeline,
+            defaultRequireAlias,
+            defaultRequireDataStream,
+            defaultListExecutedPipelines,
+            allowExplicitIndex,
+            true, // deprecateOrErrorOnType
+            xContentType,
+            config,
+            indexRequestConsumer,
+            updateRequestConsumer,
+            deleteRequestConsumer
+        );
+
+        this.defaultOpType = defaultOpType;
+
+        // FIXME: hard-coded limit to 100 MiB per record batch. Should we add an AllocationListener that calls ES memory management?
+        BufferAllocator allocator = Arrow.newChildAllocator("bulk-ingestion", 0, 100 * 1024 * 1024);
+
+        this.arrowParser = new ArrowIncrementalParser(allocator, new ArrowIncrementalParser.Listener() {
+            @Override
+            public void startStream(VectorSchemaRoot schemaRoot) throws IOException {
+                startArrowStream(schemaRoot);
+            }
+
+            @Override
+            public void nextBatch(Map<Long, Dictionary> dictionary) throws IOException {
+                nextArrowBatch(dictionary);
+            }
+
+            @Override
+            public void endStream() throws IOException {
+                endArrowStream();
+            }
+        });
+    }
+
+    @Override
+    public int parse(BytesReference data, boolean lastData) throws IOException {
+        return arrowParser.parse(data, lastData);
+    }
+
+    @Override
+    public void close() {
+        super.close();
+        if (schemaRoot != null) {
+            schemaRoot.close();
+            schemaRoot = null;
+        }
+    }
+
+    private void startArrowStream(VectorSchemaRoot root) {
+
+        this.schemaRoot = root;
+
+        var schemaFields = root.getFieldVectors();
+        var valueFields = new BitSet(schemaFields.size());
+
+        for (int i = 0; i < schemaFields.size(); i++) {
+            var field = schemaFields.get(i);
+
+            switch (field.getName()) {
+                case ArrowBulkAction.ID -> idField = i;
+                case ArrowBulkAction.INDEX -> indexField = i;
+                case ArrowBulkAction.ACTION -> {
+                    var type = field.getMinorType();
+                    if (type != Types.MinorType.MAP && type != Types.MinorType.STRUCT) {
+                        throw new ArrowFormatException("Field '" + ArrowBulkAction.ACTION + "' should be a map or a struct");
+                    }
+                    actionField = i;
+                }
+                // Regular field that will be added to the document.
+                default -> valueFields.set(i);
+            }
+        }
+
+        this.valueFields = valueFields;
+    }
+
+    private void nextArrowBatch(Map<Long, Dictionary> dictionary) throws IOException {
+        this.dictionaries = dictionary;
+        int rowCount = schemaRoot.getRowCount();
+        FieldVector idVector = idField == null ? null : schemaRoot.getVector(idField);
+        FieldVector indexVector = indexField == null ? null : schemaRoot.getVector(indexField);
+        FieldVector actionVector = actionField == null ? null : schemaRoot.getVector(actionField);
+
+        for (int i = 0; i < rowCount; i++) {
+            String id = idVector == null ? null : ArrowToString.getString(idVector, i, dictionary);
+            String index = indexVector == null ? null : ArrowToString.getString(indexVector, i, dictionary);
+
+            var action = parseAction(actionVector, i, id, index);
+            switch (action) {
+                case IndexRequest ir -> {
+                    ir.source(generateSource(i), SOURCE_XCONTENT.type());
+                    indexRequestConsumer.accept(ir, null);
+                }
+                case UpdateRequest ur -> {
+                    // Script updates aren't supported in Arrow format
+                    ur.doc(generateSource(i), SOURCE_XCONTENT.type());
+                    updateRequestConsumer.accept(ur);
+                }
+                case DeleteRequest dr -> {
+                    deleteRequestConsumer.accept(dr);
+                }
+                default -> {
+                }
+            }
+        }
+    }
+
+    protected BytesReference generateSource(int position) throws IOException {
+        var output = new BytesReferenceOutputStream();
+        try (var generator = SOURCE_XCONTENT.createGenerator(output)) {
+            generator.writeStartObject();
+            int fieldCount = schemaRoot.getFieldVectors().size();
+            for (int i = 0; i < fieldCount; i++) {
+                if (valueFields.get(i)) {
+                    arrowToXContent.writeField(schemaRoot.getVector(i), position, dictionaries, generator);
+                }
+            }
+            generator.writeEndObject();
+        }
+
+        return output.asBytesReference();
+    }
+
+    private void endArrowStream() {
+        close();
+    }
+
+    // Visible for testing
+    DocWriteRequest<?> parseAction(@Nullable FieldVector actionVector, int position, String id, String index) throws IOException {
+
+        DocWriteRequest<?> request;
+
+        try (var generator = new XContentBuffer()) {
+
+            if (actionVector == null) {
+                // Create a `{ defaultOpType: {} }` action
+                generator.writeStartObject();
+                generator.writeFieldName(defaultOpType.getLowercase());
+                generator.writeStartObject();
+                generator.writeEndObject();
+                generator.writeEndObject();
+            } else {
+                String opType = getNamedString(actionVector, "op_type", position);
+                if (opType == null) {
+                    opType = defaultOpType.getLowercase();
+                }
+                // Create a `{ opType: { properties } }` action
+                // Note: the "op_type" property may also exist, but the action parser accepts it.
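+                // e.g. an action row with op_type "create" and a "routing" property becomes the
+                // bulk action `{ "create": { "op_type": "create", "routing": "..." } }`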
+                generator.writeStartObject();
+                generator.writeFieldName(opType);
+                arrowToXContent.writeValue(actionVector, position, dictionaries, generator);
+                generator.writeEndObject();
+            }
+
+            request = parseActionLine(generator.asParser());
+        }
+
+        if (id != null) {
+            if (request.id() != null) {
+                throw new ArrowFormatException(
+                    "'"
+                        + ArrowBulkAction.ID
+                        + "' found both as top-level field and in '"
+                        + ArrowBulkAction.ACTION
+                        + "' at position ["
+                        + position
+                        + "]"
+                );
+            }
+
+            switch (request) {
+                case IndexRequest ir -> ir.id(id);
+                case UpdateRequest ur -> ur.id(id);
+                case DeleteRequest dr -> dr.id(id);
+                default -> throw new IllegalArgumentException("Unknown request type [" + request.opType() + "]");
+            }
+        }
+
+        if (index != null) {
+            // Comparing references on purpose, to detect the default index passed down to the request
+            if (request.index() != defaultIndex) {
+                throw new ArrowFormatException(
+                    "'"
+                        + ArrowBulkAction.INDEX
+                        + "' found both as top-level field and in '"
+                        + ArrowBulkAction.ACTION
+                        + "' at position ["
+                        + position
+                        + "]"
+                );
+            }
+            request.index(index);
+        }
+
+        return request;
+    }
+
+    private String getNamedString(FieldVector vector, String name, int position) {
+        byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8);
+
+        if (vector instanceof MapVector mapVector) {
+            // A Map is a variable-size list of structs with two fields, key and value (in this order)
+            var data = mapVector.getDataVector();
+            var keyVec = (VarCharVector) data.getChildrenFromFields().get(0);
+            var valueVec = data.getChildrenFromFields().get(1);
+
+            var key = new Text();
+            for (int pos = mapVector.getElementStartIndex(position); pos < mapVector.getElementEndIndex(position); pos++) {
+                keyVec.read(pos, key);
+                if (Arrays.equals(nameBytes, 0, nameBytes.length, key.getBytes(), 0, (int) key.getLength())) {
+                    return ArrowToString.getString(valueVec, pos, this.dictionaries);
+                }
+            }
+            // Not found
+            return null;
+        }
+
+        if (vector instanceof StructVector structVector) {
+            var childVector = structVector.getChild(name);
+            return childVector == null ? null : ArrowToString.getString(childVector, position, this.dictionaries);
+        }
+
+        for (var child : vector.getChildrenFromFields()) {
+            if (child instanceof ValueVector valueVector && valueVector.getName().equals(name)) {
+                return ArrowToString.getString(valueVector, position, this.dictionaries);
+            }
+        }
+        return null;
+    }
+}
diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkRequestParser.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkRequestParser.java
new file mode 100644
index 0000000000000..a9079b547a54e
--- /dev/null
+++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowBulkRequestParser.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.elasticsearch.action.DocWriteRequest;
+import org.elasticsearch.action.bulk.AbstractBulkRequestParser;
+import org.elasticsearch.action.delete.DeleteRequest;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.update.UpdateRequest;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.RestApiVersion;
+import org.elasticsearch.libs.arrow.Arrow;
+import org.elasticsearch.rest.RestRequest;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
+import org.elasticsearch.xcontent.XContentParserConfiguration;
+import org.elasticsearch.xcontent.XContentType;
+
+import java.io.IOException;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+public class ArrowBulkRequestParser extends AbstractBulkRequestParser {
+
+    public static boolean isArrowRequest(RestRequest request) {
+        return request.getParsedContentType().mediaTypeWithoutParameters().equals(Arrow.MEDIA_TYPE);
+    }
+
+    private final RestApiVersion apiVersion;
+    private final DocWriteRequest.OpType defaultOpType;
+
+    public ArrowBulkRequestParser(RestRequest request) {
+        // Default operation read from the "op_type" query parameter
+        // We default to create requests as it's safe and versatile:
+        // - accepts requests with and without an id,
+        // - if an id is present, ensures we don't accidentally overwrite an existing document,
+        // - datastreams only accept create operations.
+        String str = request.param("op_type", DocWriteRequest.OpType.CREATE.getLowercase());
+        this.defaultOpType = DocWriteRequest.OpType.fromString(str);
+        this.apiVersion = request.getRestApiVersion();
+    }
+
+    @Override
+    public void parse(
+        BytesReference data,
+        @Nullable String defaultIndex,
+        @Nullable String defaultRouting,
+        @Nullable FetchSourceContext defaultFetchSourceContext,
+        @Nullable String defaultPipeline,
+        @Nullable Boolean defaultRequireAlias,
+        @Nullable Boolean defaultRequireDataStream,
+        @Nullable Boolean defaultListExecutedPipelines,
+        boolean allowExplicitIndex,
+        XContentType xContentType,
+        BiConsumer<IndexRequest, String> indexRequestConsumer,
+        Consumer<UpdateRequest> updateRequestConsumer,
+        Consumer<DeleteRequest> deleteRequestConsumer
+    ) throws IOException {
+        try (
+            IncrementalParser parser = incrementalParser(
+                defaultIndex,
+                defaultRouting,
+                defaultFetchSourceContext,
+                defaultPipeline,
+                defaultRequireAlias,
+                defaultRequireDataStream,
+                defaultListExecutedPipelines,
+                allowExplicitIndex,
+                xContentType,
+                indexRequestConsumer,
+                updateRequestConsumer,
+                deleteRequestConsumer
+            )
+        ) {
+            parser.parse(data, true);
+        }
+    }
+
+    @Override
+    public IncrementalParser incrementalParser(
+        @Nullable String defaultIndex,
+        @Nullable String defaultRouting,
+        @Nullable FetchSourceContext defaultFetchSourceContext,
+        @Nullable String defaultPipeline,
+        @Nullable Boolean defaultRequireAlias,
+        @Nullable Boolean defaultRequireDataStream,
+        @Nullable Boolean defaultListExecutedPipelines,
+        boolean allowExplicitIndex,
+        XContentType xContentType,
+        BiConsumer<IndexRequest, String> indexRequestConsumer,
+        Consumer<UpdateRequest> updateRequestConsumer,
+        Consumer<DeleteRequest> deleteRequestConsumer
+    ) {
+        return new ArrowBulkIncrementalParser(
+            defaultOpType,
+            defaultIndex,
+            defaultRouting,
+            defaultFetchSourceContext,
+            defaultPipeline,
+            defaultRequireAlias,
+            defaultRequireDataStream,
+            defaultListExecutedPipelines,
+            allowExplicitIndex,
+            xContentType,
+            XContentParserConfiguration.EMPTY.withRestApiVersion(apiVersion),
+            indexRequestConsumer,
+            updateRequestConsumer,
+            deleteRequestConsumer
+        );
+    }
+
+}
diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowIncrementalParser.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowIncrementalParser.java
new file mode 100644
index 0000000000000..000180de2c3a1
--- /dev/null
+++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/ArrowIncrementalParser.java
@@ -0,0 +1,165 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.dictionary.Dictionary;
+import org.apache.arrow.vector.ipc.ArrowStreamReader;
+import org.apache.arrow.vector.ipc.ReadChannel;
+import org.apache.arrow.vector.ipc.message.MessageMetadataResult;
+import org.apache.arrow.vector.ipc.message.MessageSerializer;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.libs.arrow.ArrowFormatException;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * An incremental reader for Arrow dataframes.
+ */
+public class ArrowIncrementalParser implements Closeable {
+
+    public interface Listener {
+        /**
+         * Start of the Arrow stream. It's the responsibility of the listener to close this vector root,
+         * as it may need to live longer than the parser.
+         */
+        void startStream(VectorSchemaRoot schemaRoot) throws IOException;
+
+        /**
+         * A new {@code RecordBatch} was read. Its vectors are available in the {@code VectorSchemaRoot} that
+         * was passed to {@link #startStream(VectorSchemaRoot)}.
+         */
+        void nextBatch(Map<Long, Dictionary> dictionary) throws IOException;
+
+        /**
+         * Reached the end of the Arrow stream.
+         */
+        void endStream() throws IOException;
+    }
+
+    private final Listener listener;
+    private BytesReferenceChannel channel;
+    private long expectedDataLength;
+    private ArrowStreamReader reader;
+
+    private static final int PREFIX_LEN = 8;
+
+    public ArrowIncrementalParser(BufferAllocator allocator, Listener listener) {
+        this.listener = listener;
+        this.expectedDataLength = PREFIX_LEN;
+        this.channel = new BytesReferenceChannel();
+        this.reader = new ArrowStreamReader(channel, allocator);
+    }
+
+    /**
+     * When {@link #parse(BytesReference, boolean)} returns zero, provides the number of bytes
+     * that are needed to continue parsing the Arrow stream. Note that {@code parse()} can
+     * return zero multiple times with an increasing expected data length.
+     */
+    public long expectedDataLength() {
+        return this.expectedDataLength;
+    }
+
+    @Override
+    public void close() throws IOException {
+        if (this.reader != null) {
+            this.reader.close(); // Will also close channel.
+            this.channel = null;
+            this.reader = null;
+        }
+    }
+
+    public int parse(BytesReference data, boolean lastData) throws IOException {
+        int total = 0;
+        int consumed;
+        int chunkLength = data.length();
+        while ((consumed = doParse(data, lastData)) > 0) {
+            total += consumed;
+            data = data.slice(consumed, data.length() - consumed);
+            // Start a new message
+            expectedDataLength = PREFIX_LEN;
+        }
+
+        if (lastData && total != chunkLength) {
+            throw new ArrowFormatException("Incomplete or invalid Arrow stream");
+        }
+        return total;
+    }
+
+    /**
+     * Parse an Arrow message (metadata + body). If there aren't enough bytes available, return zero.
+     */
+    private int doParse(BytesReference data, boolean lastData) throws IOException {
+
+        if (data.length() < expectedDataLength) {
+            return 0;
+        }
+
+        // See https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc
+
+        var continuation = data.getIntLE(0);
+        if (continuation != 0xFFFFFFFF) {
+            throw new IOException("Bad Arrow continuation prefix [" + Integer.toHexString(continuation) + "]");
+        }
+
+        var metadataSize = data.getIntLE(4);
+
+        if (metadataSize == 0) {
+            // End of stream
+            return PREFIX_LEN;
+        }
+
+        // FIXME: enforce a hard limit on metadata size?
+        int trailing = metadataSize % 8;
+        if (trailing != 0) {
+            // padded to 8 bytes
+            metadataSize += (8 - trailing);
+        }
+
+        expectedDataLength = PREFIX_LEN + metadataSize;
+        if (data.length() < expectedDataLength) {
+            return 0;
+        }
+
+        // We may expect some data after the metadata, read metadata to find body length.
+        // The Arrow library doesn't make it easy to read metadata and then the body, so we read
+        // the metadata once to get the body length (overhead is low since flatbuffers is zero-copy)
+        ReadChannel ch = new ReadChannel(new BytesReferenceChannel(data));
+        MessageMetadataResult metadata = MessageSerializer.readMessage(ch);
+        // FIXME: enforce a hard limit on body length?
+        expectedDataLength += metadata.getMessageBodyLength();
+        if (data.length() < expectedDataLength) {
+            return 0;
+        }
+
+        // We now have enough data to read a batch (message + data)
+        channel.setData(data, lastData);
+        long initialBytesRead = reader.bytesRead();
+
+        if (reader.bytesRead() == 0) {
+            VectorSchemaRoot root = reader.getVectorSchemaRoot();
+            listener.startStream(root);
+
+        } else {
+            if (reader.loadNextBatch()) {
+                listener.nextBatch(reader.getDictionaryVectors());
+            } else {
+                expectedDataLength = 0;
+                listener.endStream();
+                close();
+            }
+        }
+
+        return (int) (reader.bytesRead() - initialBytesRead);
+    }
+}
diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/BytesReferenceChannel.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/BytesReferenceChannel.java
new file mode 100644
index 0000000000000..b60322cd6df93
--- /dev/null
+++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/BytesReferenceChannel.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+import org.elasticsearch.common.bytes.BytesReference;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.ReadableByteChannel;
+
+/**
+ * A {@code ReadableByteChannel} that reads from {@code BytesReference} data. That data
+ * can be updated, allowing incremental parsing from a single channel.
+ */
+class BytesReferenceChannel implements ReadableByteChannel {
+
+    private BytesRefIterator iterator;
+    private BytesRef current;
+    private int currentOffset;
+    private int endOffset;
+    private boolean lastData = false;
+
+    BytesReferenceChannel() {
+        // Keep zero/null values
+    }
+
+    BytesReferenceChannel(BytesReference data) throws IOException {
+        setData(data, true);
+    }
+
+    void setData(BytesReference data, boolean lastData) throws IOException {
+        this.lastData = lastData;
+        this.iterator = data.iterator();
+        nextBytesRef();
+    }
+
+    private void nextBytesRef() throws IOException {
+        this.current = iterator.next();
+        if (this.current == null) {
+            this.currentOffset = 0;
+            this.endOffset = 0;
+        } else {
+            this.currentOffset = this.current.offset;
+            this.endOffset = this.currentOffset + this.current.length;
+        }
+    }
+
+    @Override
+    public int read(ByteBuffer dst) throws IOException {
+        int written = 0;
+        int remaining;
+        while ((remaining = dst.remaining()) > 0 && this.current != null) {
+            int len = Math.min(remaining, this.endOffset - this.currentOffset);
+            dst.put(this.current.bytes, this.currentOffset, len);
+            this.currentOffset += len;
+            written += len;
+
+            if (this.currentOffset == this.endOffset) {
+                nextBytesRef();
+            }
+        }
+
+        return written == 0 && lastData ? -1 : written;
+    }
+
+    @Override
+    public boolean isOpen() {
+        return iterator != null;
+    }
+
+    @Override
+    public void close() {
+        iterator = null;
+        current = null;
+    }
+}
diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/BytesReferenceOutputStream.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/BytesReferenceOutputStream.java
new file mode 100644
index 0000000000000..8f36b0911ac7a
--- /dev/null
+++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/bulk/BytesReferenceOutputStream.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.elasticsearch.common.bytes.BytesArray;
+
+import java.io.ByteArrayOutputStream;
+
+/**
+ * A byte array stream that can be converted to {@code BytesReference} with zero copy.
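+ * The returned {@code BytesArray} wraps the stream's internal buffer directly, where {@code toByteArray()} would copy it.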
+ */ +class BytesReferenceOutputStream extends ByteArrayOutputStream { + BytesArray asBytesReference() { + return new BytesArray(buf, 0, count); + } +} diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowJsonXContentParser.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowJsonXContentParser.java new file mode 100644 index 0000000000000..f2b7aedf931f5 --- /dev/null +++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowJsonXContentParser.java @@ -0,0 +1,313 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.arrow.xcontent; + +import com.fasterxml.jackson.core.JsonLocation; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.exc.InputCoercionException; +import com.fasterxml.jackson.core.io.JsonEOFException; + +import org.elasticsearch.xcontent.XContentEOFException; +import org.elasticsearch.xcontent.XContentLocation; +import org.elasticsearch.xcontent.XContentParseException; +import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.support.AbstractXContentParser; + +import java.io.IOException; +import java.nio.CharBuffer; + +// copy of JsonXContentParser in :libs:x-content:impl +class ArrowJsonXContentParser extends AbstractXContentParser { + + final JsonParser parser; + + ArrowJsonXContentParser(XContentParserConfiguration config, JsonParser parser) { + super(config.registry(), config.deprecationHandler(), config.restApiVersion()); + // this.parser = ((XContentParserConfigurationImpl) config).filter(parser); + this.parser = parser; + } + + @Override + public XContentType contentType() { + return XContentType.JSON; + } + + @Override + public void allowDuplicateKeys(boolean allowDuplicateKeys) { + parser.configure(JsonParser.Feature.STRICT_DUPLICATE_DETECTION, allowDuplicateKeys == false); + } + + private static XContentParseException newXContentParseException(JsonProcessingException e) { + JsonLocation loc = e.getLocation(); + throw new XContentParseException(new XContentLocation(loc.getLineNr(), loc.getColumnNr()), e.getMessage(), e); + } + + @Override + public Token nextToken() throws IOException { + try { + return convertToken(parser.nextToken()); + } catch (JsonEOFException e) { + JsonLocation location = e.getLocation(); + throw new XContentEOFException(new XContentLocation(location.getLineNr(), location.getColumnNr()), "Unexpected end of file", e); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public String nextFieldName() throws IOException { + try { + return parser.nextFieldName(); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public void skipChildren() throws IOException { + parser.skipChildren(); + } + + @Override + public Token currentToken() { + return convertToken(parser.getCurrentToken()); 
+ } + + @Override + public NumberType numberType() throws IOException { + return convertNumberType(parser.getNumberType()); + } + + @Override + public String currentName() throws IOException { + return parser.getCurrentName(); + } + + @Override + protected boolean doBooleanValue() throws IOException { + try { + return parser.getBooleanValue(); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public String text() throws IOException { + if (currentToken().isValue() == false) { + throwOnNoText(); + } + return parser.getText(); + } + + private void throwOnNoText() { + throw new IllegalArgumentException("Expected text at " + getTokenLocation() + " but found " + currentToken()); + } + + @Override + public CharBuffer charBuffer() throws IOException { + try { + return CharBuffer.wrap(parser.getTextCharacters(), parser.getTextOffset(), parser.getTextLength()); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public Object objectText() throws IOException { + JsonToken currentToken = parser.getCurrentToken(); + if (currentToken == JsonToken.VALUE_STRING) { + return text(); + } else if (currentToken == JsonToken.VALUE_NUMBER_INT || currentToken == JsonToken.VALUE_NUMBER_FLOAT) { + return parser.getNumberValue(); + } else if (currentToken == JsonToken.VALUE_TRUE) { + return Boolean.TRUE; + } else if (currentToken == JsonToken.VALUE_FALSE) { + return Boolean.FALSE; + } else if (currentToken == JsonToken.VALUE_NULL) { + return null; + } else { + return text(); + } + } + + @Override + public Object objectBytes() throws IOException { + JsonToken currentToken = parser.getCurrentToken(); + if (currentToken == JsonToken.VALUE_STRING) { + return charBuffer(); + } else if (currentToken == JsonToken.VALUE_NUMBER_INT || currentToken == JsonToken.VALUE_NUMBER_FLOAT) { + return parser.getNumberValue(); + } else if (currentToken == JsonToken.VALUE_TRUE) { + return Boolean.TRUE; + } else if (currentToken == JsonToken.VALUE_FALSE) { + return Boolean.FALSE; + } else if (currentToken == JsonToken.VALUE_NULL) { + return null; + } else { + return charBuffer(); + } + } + + @Override + public boolean hasTextCharacters() { + return parser.hasTextCharacters(); + } + + @Override + public char[] textCharacters() throws IOException { + try { + return parser.getTextCharacters(); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public int textLength() throws IOException { + try { + return parser.getTextLength(); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public int textOffset() throws IOException { + try { + return parser.getTextOffset(); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public Number numberValue() throws IOException { + try { + return parser.getNumberValue(); + } catch (InputCoercionException | JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public short doShortValue() throws IOException { + try { + return parser.getShortValue(); + } catch (InputCoercionException | JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public int doIntValue() throws IOException { + try { + return parser.getIntValue(); + } catch (InputCoercionException | JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public long doLongValue() throws IOException { + try { + return 
parser.getLongValue(); + } catch (InputCoercionException | JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public float doFloatValue() throws IOException { + try { + return parser.getFloatValue(); + } catch (InputCoercionException | JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public double doDoubleValue() throws IOException { + try { + return parser.getDoubleValue(); + } catch (InputCoercionException | JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public byte[] binaryValue() throws IOException { + try { + return parser.getBinaryValue(); + } catch (JsonParseException e) { + throw newXContentParseException(e); + } + } + + @Override + public XContentLocation getTokenLocation() { + JsonLocation loc = parser.getTokenLocation(); + if (loc == null) { + return null; + } + return new XContentLocation(loc.getLineNr(), loc.getColumnNr()); + } + + @Override + public void close() { + // noinspection EmptyCatchBlock + try { + parser.close(); + } catch (final IOException | RuntimeException e) {} + } + + private static NumberType convertNumberType(JsonParser.NumberType numberType) { + return switch (numberType) { + case INT -> NumberType.INT; + case BIG_INTEGER -> NumberType.BIG_INTEGER; + case LONG -> NumberType.LONG; + case FLOAT -> NumberType.FLOAT; + case DOUBLE -> NumberType.DOUBLE; + case BIG_DECIMAL -> NumberType.BIG_DECIMAL; + }; + } + + private static Token convertToken(JsonToken token) { + if (token == null) { + return null; + } + return switch (token) { + case START_OBJECT -> Token.START_OBJECT; + case END_OBJECT -> Token.END_OBJECT; + case START_ARRAY -> Token.START_ARRAY; + case END_ARRAY -> Token.END_ARRAY; + case FIELD_NAME -> Token.FIELD_NAME; + case VALUE_EMBEDDED_OBJECT -> Token.VALUE_EMBEDDED_OBJECT; + case VALUE_STRING -> Token.VALUE_STRING; + case VALUE_NUMBER_INT, VALUE_NUMBER_FLOAT -> Token.VALUE_NUMBER; + case VALUE_FALSE, VALUE_TRUE -> Token.VALUE_BOOLEAN; + case VALUE_NULL -> Token.VALUE_NULL; + default -> throw unknownTokenException(token); + }; + } + + private static IllegalStateException unknownTokenException(JsonToken token) { + return new IllegalStateException("No matching token for json_token [" + token + "]"); + } + + @Override + public boolean isClosed() { + return parser.isClosed(); + } +} diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowToString.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowToString.java new file mode 100644 index 0000000000000..9c913bf069596 --- /dev/null +++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowToString.java @@ -0,0 +1,56 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.arrow.xcontent; + +import org.apache.arrow.vector.BaseIntVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VariableWidthFieldVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.elasticsearch.libs.arrow.ArrowFormatException; + +import java.nio.charset.StandardCharsets; +import java.util.Map; + +public class ArrowToString { + + public static String getString(ValueVector vector, int position, Map dictionaries) { + if (vector.isNull(position)) { + return null; + } + + return switch (vector.getMinorType()) { + + case VARCHAR, LARGEVARCHAR, VIEWVARCHAR -> { + var bytesVector = (VariableWidthFieldVector) vector; + yield new String(bytesVector.get(position), StandardCharsets.UTF_8); + } + + case TINYINT, SMALLINT, INT, BIGINT, UINT1, UINT2, UINT4, UINT8 -> String.valueOf( + ((BaseIntVector) vector).getValueAsLong(position) + ); + + case UNION -> { + UnionVector unionVector = (UnionVector) vector; + // Find the child field that isn't null, which is the active variant. + for (var variantVec : unionVector.getChildrenFromFields()) { + if (variantVec.isNull(position) == false) { + yield getString(variantVec, position, dictionaries); + } + } + yield null; + } + + default -> { + throw new ArrowFormatException("Arrow type [" + vector.getMinorType() + "] cannot be converted to string"); + } + }; + } +} diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowToXContent.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowToXContent.java new file mode 100644 index 0000000000000..4f632efd8ce01 --- /dev/null +++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/ArrowToXContent.java @@ -0,0 +1,467 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.arrow.xcontent; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.databind.util.ByteBufferBackedInputStream; + +import org.apache.arrow.vector.BaseIntVector; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VariableWidthFieldVector; +import org.apache.arrow.vector.complex.BaseListVector; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.util.ReusableByteArray; +import org.elasticsearch.libs.arrow.ArrowFormatException; +import org.elasticsearch.xcontent.XContentGenerator; +import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.EnumSet; +import java.util.Map; + +/** + * Utility methods to serialize Arrow dataframes to XContent events. + *

+ * <p>
+ * Limitations and caveats:
+ * <ul>
+ *     <li>time and timestamps are converted to milliseconds or nanoseconds depending on their unit</li>
+ *     <li>some types aren't implemented</li>
+ * </ul>
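+ * <p>
+ * A minimal usage sketch (illustrative; {@code root}, {@code row}, {@code dictionaries} and {@code generator}
+ * are assumed to be provided by the caller): serialize one dataframe row as an XContent object by writing
+ * every column as a field.
+ * <pre>{@code
+ * ArrowToXContent converter = new ArrowToXContent();
+ * generator.writeStartObject();
+ * for (FieldVector column : root.getFieldVectors()) {
+ *     converter.writeField(column, row, dictionaries, generator);
+ * }
+ * generator.writeEndObject();
+ * }</pre>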
+ *
+ * @see Arrow data types
+ * @see Arrow schema
+ */
+public class ArrowToXContent {
+
+    private static final EnumSet<Types.MinorType> STRING_TYPES = EnumSet.of(
+        Types.MinorType.VARCHAR,
+        Types.MinorType.LARGEVARCHAR,
+        Types.MinorType.VIEWVARCHAR
+    );
+
+    // Reusable buffer to transfer strings and byte values of length smaller than MAX_BUFFER_SIZE
+    private final ReusableByteArray bytesBuffer = new ReusableByteArray();
+    private static final int MAX_BUFFER_SIZE = 1024 * 1024;
+
+    private ReusableByteArray getBuffer(int length) {
+        return length > MAX_BUFFER_SIZE ? new ReusableByteArray() : bytesBuffer;
+    }
+
+    /**
+     * Write a field and its value from an Arrow vector as XContent
+     *
+     * @param vector the Arrow vector
+     * @param position the value position in the vector
+     * @param dictionaries to look up values for dictionary-encoded vectors
+     * @param generator XContent output
+     */
+    public void writeField(ValueVector vector, int position, Map<Long, Dictionary> dictionaries, XContentGenerator generator)
+        throws IOException {
+        generator.writeFieldName(vector.getName());
+        writeValue(vector, position, dictionaries, generator);
+    }
+
+    /**
+     * Write a value from an Arrow vector as XContent
+     *
+     * @param vector the Arrow vector
+     * @param position the value position in the vector
+     * @param dictionaries to look up values for dictionary-encoded vectors
+     * @param generator XContent output
+     */
+    public void writeValue(ValueVector vector, int position, Map<Long, Dictionary> dictionaries, XContentGenerator generator)
+        throws IOException {
+
+        if (vector.isNull(position)) {
+            generator.writeNull();
+            return;
+        }
+
+        var field = vector.getField();
+        var extension = field.getMetadata().get(ArrowType.ExtensionType.EXTENSION_METADATA_KEY_NAME);
+
+        var dictEncoding = field.getDictionary();
+        if (dictEncoding != null) {
+            // Note: to improve performance and reduce GC thrashing, we could eagerly convert dictionary
+            // VarCharVectors to String arrays (likely the most frequent use of dictionaries)
+            Dictionary dictionary = dictionaries.get(dictEncoding.getId());
+            // The spec allows any integer type, although signed 32 bits are recommended
+            position = (int) ((BaseIntVector) vector).getValueAsLong(position);
+            vector = dictionary.getVector();
+
+            // Dictionary entries can be null
+            if (vector.isNull(position)) {
+                generator.writeNull();
+                return;
+            }
+        }
+
+        if (extension != null) {
+            switch (extension) {
+                case "arrow.json" -> {
+                    writeJsonExtensionValue(vector, position, generator);
+                    return;
+                }
+                // Other canonical extensions: uuid, tensors, opaque, 8-bit boolean
+                // See https://arrow.apache.org/docs/format/CanonicalExtensions.html
+                //
+                // TODO: GeoArrow (non canonical)
+                // See https://geoarrow.org/
+            }
+        }
+
+        // Use an expression switch to make sure the compiler checks that every enumeration member is used.
+
+        Void x = switch (vector.getMinorType()) {
+
+            // ---- Numbers
+            // Performance: we could have cast the vector to the common BaseIntVector/FloatingPoint interface,
+            // but this would cause more costly casts and polymorphic dispatch to access the value, whereas
+            // concrete classes are final, allowing better optimizations or even inlining.
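+            // For instance (an illustrative micro-example, not a measured claim):
+            //   long l = ((BaseIntVector) vector).getValueAsLong(position); // interface dispatch, widened to long
+            //   int i = ((IntVector) vector).get(position);                 // monomorphic call on a final class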
+ case TINYINT -> { + generator.writeNumber(((TinyIntVector) vector).get(position)); + yield null; + } + + case SMALLINT -> { + generator.writeNumber(((SmallIntVector) vector).get(position)); + yield null; + } + + case INT -> { + generator.writeNumber(((IntVector) vector).getValueAsLong(position)); + yield null; + } + + case BIGINT -> { + generator.writeNumber(((BigIntVector) vector).get(position)); + yield null; + } + + case UINT1 -> { + generator.writeNumber(((UInt1Vector) vector).getValueAsLong(position)); + yield null; + } + + case UINT2 -> { + generator.writeNumber(((UInt2Vector) vector).get(position)); + yield null; + } + + case UINT4 -> { + // Use valueAsLong to have unsigned integers if the value is greater than 0x7FFF_FFFF + generator.writeNumber(((UInt4Vector) vector).getValueAsLong(position)); + yield null; + } + + case UINT8 -> { + generator.writeNumber(((UInt8Vector) vector).get(position)); + yield null; + } + + case FLOAT2 -> { + generator.writeNumber(((Float2Vector) vector).getValueAsFloat(position)); + yield null; + } + + case FLOAT4 -> { + generator.writeNumber(((Float4Vector) vector).get(position)); + yield null; + } + + case FLOAT8 -> { + generator.writeNumber(((Float8Vector) vector).get(position)); + yield null; + } + + case DECIMAL -> { + var dVector = (DecimalVector) vector; + generator.writeNumber(dVector.getObjectNotNull(position)); + yield null; + } + + case DECIMAL256 -> { + var dVector = (Decimal256Vector) vector; + generator.writeNumber(dVector.getObjectNotNull(position)); + yield null; + } + + // ---- Booleans + + case BIT -> { + generator.writeBoolean(((BitVector) vector).get(position) != 0); + yield null; + } + + // ---- Strings + + case VARCHAR, LARGEVARCHAR, VIEWVARCHAR -> { + var bytesVector = (VariableWidthFieldVector) vector; + var buffer = getBuffer(bytesVector.getValueLength(position)); + bytesVector.read(position, buffer); + generator.writeUTF8String(buffer.getBuffer(), 0, (int) buffer.getLength()); + yield null; + } + + // ---- Binary + + case VARBINARY, LARGEVARBINARY, VIEWVARBINARY -> { + var bytesVector = (VariableWidthFieldVector) vector; + var buffer = getBuffer(bytesVector.getValueLength(position)); + bytesVector.read(position, buffer); + generator.writeBinary(buffer.getBuffer(), 0, (int) buffer.getLength()); + yield null; + } + + case FIXEDSIZEBINARY -> { + var bytesVector = (FixedSizeBinaryVector) vector; + var buffer = getBuffer(bytesVector.getByteWidth()); + bytesVector.read(position, buffer); + generator.writeBinary(buffer.getBuffer(), 0, (int) buffer.getLength()); + yield null; + } + + // ----- Timestamps + // + // Timestamp values are relative to the Unix epoch in UTC, with an optional timezone. + // The ES date type has no timezone, so we drop this information. + // (TODO: define where the TZ should go, e.g. providing the name of a TZ field in the field's metadata) + // + // Seconds and millis are stored as millis, and micros and nanos as nanos, so that there's + // no precision loss. 
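+            // For example (illustrative values): a TIMESTAMPSEC value of 1_700_000_000 (seconds) is
+            // emitted as 1_700_000_000_000 (milliseconds), and a TIMESTAMPMICRO value of
+            // 1_700_000_000_000_000 (microseconds) is emitted as 1_700_000_000_000_000_000 (nanoseconds).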
+            // (FIXME: define this conversion using the ES field type)
+
+            case TIMESTAMPSEC, TIMESTAMPMICRO, TIMESTAMPSECTZ, TIMESTAMPMICROTZ -> {
+                var tsVector = (TimeStampVector) vector;
+                generator.writeNumber(tsVector.get(position) * 1000L);
+                yield null;
+            }
+
+            case TIMESTAMPMILLI, TIMESTAMPNANO, TIMESTAMPMILLITZ, TIMESTAMPNANOTZ -> {
+                var tsVector = (TimeStampVector) vector;
+                generator.writeNumber(tsVector.get(position));
+                yield null;
+            }
+
+            // ---- Date
+            //
+            // Time since the epoch, in days or millis evenly divisible by 86_400_000
+            // Stored as millis
+
+            case DATEDAY -> {
+                var ddVector = (DateDayVector) vector;
+                // Multiply as a long: an int multiplication would overflow for any date after 1970-01-25
+                generator.writeNumber(ddVector.get(position) * 86_400_000L);
+                yield null;
+            }
+
+            case DATEMILLI -> {
+                var dmVector = (DateMilliVector) vector;
+                generator.writeNumber(dmVector.get(position));
+                yield null;
+            }
+
+            // ----- Time
+            //
+            // Time since midnight, either a 32-bit or 64-bit signed integer.
+            // There is no equivalent in ES, but we still convert to millis or nanos
+            // to be consistent with timestamps.
+
+            case TIMESEC -> {
+                var tVector = (TimeSecVector) vector;
+                generator.writeNumber(tVector.get(position) * 1000);
+                yield null;
+            }
+
+            case TIMEMILLI -> {
+                var tVector = (TimeMilliVector) vector;
+                generator.writeNumber(tVector.get(position));
+                yield null;
+            }
+
+            case TIMEMICRO -> {
+                var tVector = (TimeMicroVector) vector;
+                generator.writeNumber(tVector.get(position) * 1000);
+                yield null;
+            }
+
+            case TIMENANO -> {
+                var tsVector = (TimeNanoVector) vector;
+                generator.writeNumber(tsVector.get(position));
+                yield null;
+            }
+
+            // ---- Other fixed size types
+
+            case DURATION -> {
+                var dVector = (DurationVector) vector;
+                long value = DurationVector.get(dVector.getDataBuffer(), position);
+
+                value *= switch (dVector.getUnit()) {
+                    case SECOND, MICROSECOND -> 1000L;
+                    case MILLISECOND, NANOSECOND -> 1L;
+                };
+
+                generator.writeNumber(value);
+                yield null;
+            }
+
+            // ---- Structured types
+
+            case LIST, FIXED_SIZE_LIST, LISTVIEW -> {
+                var listVector = (BaseListVector) vector;
+                var valueVector = listVector.getChildrenFromFields().getFirst();
+                int start = listVector.getElementStartIndex(position);
+                int end = listVector.getElementEndIndex(position);
+
+                generator.writeStartArray();
+                for (int i = start; i < end; i++) {
+                    writeValue(valueVector, i, dictionaries, generator);
+                }
+                generator.writeEndArray();
+                yield null;
+            }
+
+            case MAP -> {
+                // A map is a container vector that is composed of a list of struct values with "key" and "value" fields. The MapVector
+                // is nullable, but if a map is set at a given index, there must be an entry. In other words, the StructVector data is
+                // non-nullable. Also for a given entry, the "key" is non-nullable, however the "value" can be null.
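+                // Layout sketch (illustrative): {"a": 1, "b": null} is stored as two consecutive
+                // entries ("a", 1) and ("b", null) in the child struct vector, delimited by this
+                // row's start and end offsets.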
+ + var mapVector = (MapVector) vector; + var structVector = (StructVector) mapVector.getChildrenFromFields().getFirst(); + var kVector = structVector.getChildrenFromFields().getFirst(); + if (STRING_TYPES.contains(kVector.getMinorType()) == false) { + throw new ArrowFormatException("Maps must have string keys"); + } + + var keyVector = (VarCharVector) kVector; + var valueVector = structVector.getChildrenFromFields().get(1); + + int start = mapVector.getElementStartIndex(position); + int end = mapVector.getElementEndIndex(position); + + generator.writeStartObject(); + for (int i = start; i < end; i++) { + if (keyVector.isNull(i)) { + throw new ArrowFormatException("Null map key found at position [" + position + "]"); + } + var key = new String(keyVector.get(i), StandardCharsets.UTF_8); + generator.writeFieldName(key); + writeValue(valueVector, i, dictionaries, generator); + } + generator.writeEndObject(); + yield null; + } + + case STRUCT -> { + var structVector = (StructVector) vector; + generator.writeStartObject(); + for (var structField : structVector.getChildrenFromFields()) { + generator.writeFieldName(structField.getName()); + writeValue(structField, position, dictionaries, generator); + } + generator.writeEndObject(); + yield null; + } + + case DENSEUNION -> { + var unionVector = (DenseUnionVector) vector; + var typeId = unionVector.getTypeId(position); + var valueVector = unionVector.getVectorByType(typeId); + var valuePosition = unionVector.getOffset(position); + + writeValue(valueVector, valuePosition, dictionaries, generator); + yield null; + } + + case UNION -> { // sparse union + var unionVector = (UnionVector) vector; + var typeId = unionVector.getTypeValue(position); + var valueVector = unionVector.getVectorByType(typeId); + + writeValue(valueVector, position, dictionaries, generator); + yield null; + } + + case NULL -> { + // Should have been handled at the beginning of this method, + // but keep it to have exhaustive coverage of enum values. + generator.writeNull(); + yield null; + } + + case INTERVALYEAR, INTERVALDAY, INTERVALMONTHDAYNANO, // ES doesn't have any interval types + LARGELIST, LARGELISTVIEW // 64-bit vector support is incomplete + -> throw new JsonParseException( + "Arrow type [" + vector.getMinorType() + "] not supported for field [" + vector.getName() + "]" + ); + + case RUNENDENCODED -> { + var reVector = (RunEndEncodedVector) vector; + // Caveat: performance could be improved. getRunEnd() does a binary search for the position + // in the value array, and so does isNull() at the top of this method. If run-end encoding + // is heavily used, we could use an optimized cursor structure that is moved forward at + // each iteration in the calling loop. 
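+                // Illustrative encoding: the logical values [A, A, A, B, B] are stored as
+                // run ends [3, 5] and values [A, B]; getRunEnd() locates the run covering a
+                // logical position and returns its index in the values vector.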
+ writeValue(reVector.getValuesVector(), reVector.getRunEnd(position), dictionaries, generator); + yield null; + } + + case EXTENSIONTYPE -> throw new JsonParseException( + "Arrow extension [" + vector.getMinorType() + "] not supported for field [" + vector.getName() + "]" + ); + }; + } + + private static void writeJsonExtensionValue(ValueVector vector, int position, XContentGenerator generator) throws IOException { + if (STRING_TYPES.contains(vector.getMinorType()) == false) { + throw new ArrowFormatException("Json vectors must be strings"); + } + // Parse directly from the Arrow buffer wrapped in a ByteBuffer + var pointer = ((VariableWidthFieldVector) vector).getDataPointer(position); + var buf = pointer.getBuf().nioBuffer(pointer.getOffset(), (int) pointer.getLength()); + + var parser = XContentType.JSON.xContent().createParser(XContentParserConfiguration.EMPTY, new ByteBufferBackedInputStream(buf)); + + generator.copyCurrentStructure(parser); + } +} diff --git a/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/XContentBuffer.java b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/XContentBuffer.java new file mode 100644 index 0000000000000..f26001257efe9 --- /dev/null +++ b/modules/arrow/src/main/java/org/elasticsearch/arrow/xcontent/XContentBuffer.java @@ -0,0 +1,443 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.arrow.xcontent; + +import com.fasterxml.jackson.core.JsonGenerationException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.util.TokenBuffer; + +import org.elasticsearch.core.CheckedConsumer; +import org.elasticsearch.xcontent.XContentFactory; +import org.elasticsearch.xcontent.XContentGenerationException; +import org.elasticsearch.xcontent.XContentGenerator; +import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; + +/** + * A buffer of {@code XContent} events that can be replayed as an {@code XContentParser}. Useful to create synthetic + * JSON documents that are fed to existing JSON parsers. + */ +public class XContentBuffer implements XContentGenerator { + + /** Buffer used to write content **/ + private final TokenBuffer generator; + + public XContentBuffer() { + this.generator = new TokenBuffer(new ObjectMapper(), false); + } + + /** + * Return this buffer as an {@code XContent} parser. Events can be added to the buffer while events are + * consumed from the parser, but these appends are not thread-safe. 
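+     * <p>
+     * A minimal sketch (illustrative) of the buffer-and-replay round trip:
+     * <pre>{@code
+     * XContentBuffer buffer = new XContentBuffer();
+     * buffer.writeStartObject();
+     * buffer.writeStringField("field", "value");
+     * buffer.writeEndObject();
+     * try (XContentParser parser = buffer.asParser()) {
+     *     Map<String, Object> doc = parser.map(); // replays the buffered events
+     * }
+     * }</pre>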
+ */ + public XContentParser asParser() { + return new ArrowJsonXContentParser(XContentParserConfiguration.EMPTY, this.generator.asParser()); + } + + @Override + public XContentType contentType() { + return null; + } + + @Override + public final void usePrettyPrint() {} + + @Override + public boolean isPrettyPrint() { + return false; + } + + @Override + public void usePrintLineFeedAtEnd() {} + + @Override + public void writeStartObject() throws IOException { + try { + generator.writeStartObject(); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeEndObject() throws IOException { + try { + generator.writeEndObject(); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeStartArray() throws IOException { + try { + generator.writeStartArray(); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeEndArray() throws IOException { + try { + generator.writeEndArray(); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeFieldName(String name) throws IOException { + try { + generator.writeFieldName(name); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNull() throws IOException { + try { + generator.writeNull(); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNullField(String name) throws IOException { + try { + generator.writeNullField(name); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeBooleanField(String name, boolean value) throws IOException { + try { + generator.writeBooleanField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeBoolean(boolean value) throws IOException { + try { + generator.writeBoolean(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumberField(String name, double value) throws IOException { + try { + generator.writeNumberField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(double value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumberField(String name, float value) throws IOException { + try { + generator.writeNumberField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(float value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumberField(String name, int value) throws IOException { + try { + generator.writeNumberField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumberField(String name, BigInteger value) throws IOException { + // as jackson's JsonGenerator doesn't have this method for BigInteger + // we have to implement it 
ourselves + try { + generator.writeFieldName(name); + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumberField(String name, BigDecimal value) throws IOException { + try { + generator.writeNumberField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(int value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumberField(String name, long value) throws IOException { + try { + generator.writeNumberField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(long value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(short value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(BigInteger value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeNumber(BigDecimal value) throws IOException { + try { + generator.writeNumber(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeStringField(String name, String value) throws IOException { + try { + generator.writeStringField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeString(String value) throws IOException { + try { + generator.writeString(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeStringArray(String[] array) throws IOException { + try { + generator.writeArray(array, 0, array.length); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeString(char[] value, int offset, int len) throws IOException { + try { + generator.writeString(value, offset, len); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeUTF8String(byte[] value, int offset, int length) throws IOException { + try { + // TokenBuffer doesn't support writeUTF8String + generator.writeString(new String(value, offset, length, StandardCharsets.UTF_8)); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeBinaryField(String name, byte[] value) throws IOException { + try { + generator.writeBinaryField(name, value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeBinary(byte[] value) throws IOException { + try { + generator.writeBinary(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeBinary(byte[] value, int offset, int len) throws IOException { + try { + generator.writeBinary(value, offset, len); + } catch 
(JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void writeRawField(String name, InputStream content) throws IOException { + if (content.markSupported() == false) { + // needed for the XContentFactory.xContentType call + content = new BufferedInputStream(content); + } + XContentType contentType = XContentFactory.xContentType(content); + if (contentType == null) { + throw new IllegalArgumentException("Can't write raw bytes whose xcontent-type can't be guessed"); + } + writeRawField(name, content, contentType); + } + + @Override + public void writeRawField(String name, InputStream content, XContentType contentType) throws IOException { + writeFieldName(name); + writeRawValue(content, contentType); + } + + @Override + public void writeRawValue(InputStream content, XContentType contentType) throws IOException { + try (XContentParser parser = XContentFactory.xContent(contentType).createParser(XContentParserConfiguration.EMPTY, content)) { + parser.nextToken(); + copyCurrentStructure(parser); + } + } + + @Override + public void writeRawValue(String value) throws IOException { + try { + generator.writeRawValue(value); + } catch (JsonGenerationException e) { + throw new XContentGenerationException(e); + } + } + + @Override + public void copyCurrentStructure(XContentParser parser) throws IOException { + // the start of the parser + if (parser.currentToken() == null) { + parser.nextToken(); + } + copyCurrentStructure(this, parser); + } + + /** + * Low level implementation detail of {@link XContentGenerator#copyCurrentStructure(XContentParser)}. + */ + private static void copyCurrentStructure(XContentGenerator destination, XContentParser parser) throws IOException { + XContentParser.Token token = parser.currentToken(); + + // Let's handle field-name separately first + if (token == XContentParser.Token.FIELD_NAME) { + destination.writeFieldName(parser.currentName()); + token = parser.nextToken(); + // fall-through to copy the associated value + } + + switch (token) { + case START_ARRAY -> { + destination.writeStartArray(); + while (parser.nextToken() != XContentParser.Token.END_ARRAY) { + copyCurrentStructure(destination, parser); + } + destination.writeEndArray(); + } + case START_OBJECT -> { + destination.writeStartObject(); + while (parser.nextToken() != XContentParser.Token.END_OBJECT) { + copyCurrentStructure(destination, parser); + } + destination.writeEndObject(); + } + default -> // others are simple: + destination.copyCurrentEvent(parser); + } + } + + @Override + public void writeDirectField(String name, CheckedConsumer writer) throws IOException { + throw new UnsupportedOperationException("writeDirectField is not supported"); + } + + @Override + public void flush() throws IOException { + generator.flush(); + } + + @Override + public void close() throws IOException { + if (generator.isClosed()) { + return; + } + generator.close(); + } + + @Override + public boolean isClosed() { + return generator.isClosed(); + } +} diff --git a/modules/arrow/src/main/plugin-metadata/plugin-security.policy b/modules/arrow/src/main/plugin-metadata/plugin-security.policy new file mode 100644 index 0000000000000..eb16f071ae2dd --- /dev/null +++ b/modules/arrow/src/main/plugin-metadata/plugin-security.policy @@ -0,0 +1,24 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +// Arrow uses Unsafe to access memory and accesses a private field in java.nio.Buffer +// See also additional global permissions below +grant codebase "${codebase.arrow-memory-core}" { + permission java.lang.RuntimePermission "accessDeclaredMembers"; + permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; + permission java.lang.RuntimePermission "accessClassInPackage.sun.misc"; +}; + +//// Everything else: +grant { + // This should be covered by arrow-memory-core above, but field access to java.nio.Buffer.address + // from org.apache.arrow.memory.util.MemoryUtil isn't in a privileged section. + permission java.lang.RuntimePermission "accessDeclaredMembers"; + permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; +}; diff --git a/modules/arrow/src/test/java/org/elasticsearch/arrow/bulk/ArrowBulkIncrementalParserTests.java b/modules/arrow/src/test/java/org/elasticsearch/arrow/bulk/ArrowBulkIncrementalParserTests.java new file mode 100644 index 0000000000000..eb49feaad48b2 --- /dev/null +++ b/modules/arrow/src/test/java/org/elasticsearch/arrow/bulk/ArrowBulkIncrementalParserTests.java @@ -0,0 +1,465 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */
+
+package org.elasticsearch.arrow.bulk;
+
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.complex.MapVector;
+import org.apache.arrow.vector.complex.StructVector;
+import org.apache.arrow.vector.dictionary.Dictionary;
+import org.apache.arrow.vector.dictionary.DictionaryEncoder;
+import org.apache.arrow.vector.dictionary.DictionaryProvider;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+import org.apache.arrow.vector.types.Types;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.Text;
+import org.elasticsearch.action.DocWriteRequest;
+import org.elasticsearch.action.delete.DeleteRequest;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.update.UpdateRequest;
+import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.core.RestApiVersion;
+import org.elasticsearch.libs.arrow.Arrow;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xcontent.XContentParserConfiguration;
+import org.elasticsearch.xcontent.XContentType;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+import static org.hamcrest.Matchers.equalTo;
+
+public class ArrowBulkIncrementalParserTests extends ESTestCase {
+
+    // ----- Test Arrow batches and incremental parsing
+
+    public void testBatchingAndChunking() throws IOException {
+        checkBatchingAndChunking(1, 10, false);
+        checkBatchingAndChunking(1, 10, true);
+        checkBatchingAndChunking(2, 10, false);
+        checkBatchingAndChunking(2, 10, true);
+    }
+
+    /** Create a payload for a two-column dataframe (int and string), given a number of batches and rows per batch */
+    private void checkBatchingAndChunking(int batchCount, int rowCount, boolean incremental) throws IOException {
+        byte[] payload;
+
+        // Create a dataframe with two columns: integer and string
+        Field intField = new Field("ints", FieldType.nullable(new ArrowType.Int(32, true)), null);
+        Field strField = new Field("strings", FieldType.nullable(new ArrowType.Utf8()), null);
+        Schema schema = new Schema(List.of(intField, strField));
+
+        // Create vectors and write them to a byte array
+        try (var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); var root = VectorSchemaRoot.create(schema, allocator)) {
+            var baos = new ByteArrayOutputStream();
+            IntVector intVector = (IntVector) root.getVector(0);
+            VarCharVector stringVector = (VarCharVector) root.getVector(1);
+
+            try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, baos)) {
+                for (int batch = 0; batch < batchCount; batch++) {
+                    intVector.allocateNew(rowCount);
+                    stringVector.allocateNew(rowCount);
+                    for (int row = 0; row < rowCount; row++) {
+                        int globalRow = row + batch * rowCount;
+                        intVector.set(row, globalRow);
+                        stringVector.set(row, new Text("row" + globalRow));
+                    }
+                    root.setRowCount(rowCount);
+                    writer.writeBatch();
+                }
+            }
+            payload = baos.toByteArray();
+        }
+
+        var operations = new ArrayList<DocWriteRequest<?>>();
+        try (var parser = createParser("test", operations)) {
+            parse(parser, payload, incremental);
+        }
+
+        assertEquals(batchCount * rowCount, operations.size());
+
+        for (int i = 0; i < operations.size(); i++) {
+            IndexRequest operation = (IndexRequest) operations.get(i);
+
+            assertEquals(DocWriteRequest.OpType.INDEX, operation.opType());
+            assertEquals("test", operation.index());
+
+            assertEquals(XContentType.CBOR, operation.getContentType());
+
+            var map = operation.sourceAsMap();
+            assertEquals(i, map.get("ints"));
+            assertEquals("row" + i, map.get("strings"));
+        }
+    }
+
+    public void testInlineIdAndIndex() throws Exception {
+        byte[] payload;
+
+        Field indexField = new Field("_index", FieldType.nullable(new ArrowType.Utf8()), null);
+        Field idField = new Field("_id", FieldType.nullable(new ArrowType.Utf8()), null);
+        Field intField = new Field("ints", FieldType.nullable(new ArrowType.Int(32, true)), null);
+        Field strField = new Field("strings", FieldType.nullable(new ArrowType.Utf8()), null);
+        Schema schema = new Schema(List.of(indexField, idField, intField, strField));
+
+        try (var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); var root = VectorSchemaRoot.create(schema, allocator)) {
+            var baos = new ByteArrayOutputStream();
+            VarCharVector indexVector = (VarCharVector) root.getVector(0);
+            VarCharVector idVector = (VarCharVector) root.getVector(1);
+            IntVector intVector = (IntVector) root.getVector(2);
+            VarCharVector stringVector = (VarCharVector) root.getVector(3);
+
+            try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, baos)) {
+                indexVector.allocateNew(4);
+                idVector.allocateNew(4);
+                intVector.allocateNew(4);
+                stringVector.allocateNew(4);
+
+                // No index, no id
+                indexVector.setNull(0);
+                idVector.setNull(0);
+                stringVector.set(0, new Text("row0"));
+                intVector.set(0, 0);
+
+                // No index, id
+                indexVector.setNull(1);
+                idVector.set(1, new Text("id1"));
+                stringVector.set(1, new Text("row1"));
+                intVector.set(1, 1);
+
+                // Index, no id
+                indexVector.set(2, new Text("index2"));
+                idVector.setNull(2);
+                stringVector.set(2, new Text("row2"));
+                intVector.set(2, 2);
+
+                // Index & id
+                indexVector.set(3, new Text("index3"));
+                idVector.set(3, new Text("id3"));
+                stringVector.set(3, new Text("row3"));
+                intVector.set(3, 3);
+
+                root.setRowCount(4);
+                writer.writeBatch();
+            }
+            payload = baos.toByteArray();
+        }
+
+        var operations = new ArrayList<DocWriteRequest<?>>();
+        try (var parser = createParser("defaultIndex", operations)) {
+            parse(parser, payload, false);
+        }
+
+        IndexRequest operation = (IndexRequest) operations.get(0);
+        assertEquals("defaultIndex", operation.index());
+        assertNull(operation.id());
+
+        operation = (IndexRequest) operations.get(1);
+        assertEquals("defaultIndex", operation.index());
+        assertEquals("id1", operation.id());
+
+        operation = (IndexRequest) operations.get(2);
+        assertEquals("index2", operation.index());
+        assertNull(operation.id());
+
+        operation = (IndexRequest) operations.get(3);
+        assertEquals("index3", operation.index());
+        assertEquals("id3", operation.id());
+    }
+
+    // ----- Test action decoding
+
+    /** Action as a map of (string, string) */
+    public void testActionsAsStringMap() throws Exception {
+
+        try (
+            var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE);
+            var vector = new MapVector("action", allocator, FieldType.nullable(new ArrowType.Map(false)), null);
+            var parser = createParser("default-index", List.of())
+        ) {
+            var w = vector.getWriter();
+
+            w.startMap();
+
+            // Override operation type (default is
create) + w.startEntry(); + w.key().varChar().writeVarChar("op_type"); + w.value().varChar().writeVarChar("update"); + w.endEntry(); + + // Override default "default-index" index + w.startEntry(); + w.key().varChar().writeVarChar("_index"); + w.value().varChar().writeVarChar("first-index"); + w.endEntry(); + + // Set if_seq_no as a string, to test a lazy approach with a simple (string, string) map + w.startEntry(); + w.key().varChar().writeVarChar("if_seq_no"); + w.value().varChar().writeVarChar("3"); + w.endEntry(); + + w.endMap(); + + w.startMap(); + + // Override default "default-index" index + w.startEntry(); + w.key().varChar().writeVarChar("_index"); + w.value().varChar().writeVarChar("second-index"); + w.endEntry(); + + // Override operation type (default is create) + w.startEntry(); + w.key().varChar().writeVarChar("op_type"); + w.value().varChar().writeVarChar("index"); + w.endEntry(); + + // Set version as a string, to test a lazy approach with a simple (string, string) map + w.startEntry(); + w.key().varChar().writeVarChar("if_seq_no"); + w.value().varChar().writeVarChar("4"); + w.endEntry(); + + w.endMap(); + + vector.setValueCount(w.getPosition()); + // Value type is varchar + assertEquals(Types.MinorType.VARCHAR, vector.getChildrenFromFields().get(0).getChildrenFromFields().get(1).getMinorType()); + + { + var request = parser.parseAction(vector, 0, null, null); + assertEquals(DocWriteRequest.OpType.UPDATE, request.opType()); + assertEquals("first-index", request.index()); + assertEquals(3, request.ifSeqNo()); + } + + { + var request = parser.parseAction(vector, 1, null, null); + assertEquals(DocWriteRequest.OpType.INDEX, request.opType()); + assertEquals("second-index", request.index()); + assertEquals(4, request.ifSeqNo()); + } + } + } + + /** Action as a map of (string, union(string, int)) */ + public void testActionsAsUnionMap() throws Exception { + + try ( + var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); + var vector = new MapVector("action", allocator, FieldType.nullable(new ArrowType.Map(false)), null); + var parser = createParser("default-index", List.of()) + ) { + var w = vector.getWriter(); + + w.startMap(); + + // Override operation type (default is create) + w.startEntry(); + w.key().varChar().writeVarChar("op_type"); + w.value().varChar().writeVarChar("update"); + w.endEntry(); + + // Override default "default-index" index + w.startEntry(); + w.key().varChar().writeVarChar("_index"); + w.value().varChar().writeVarChar("some-index"); + w.endEntry(); + + // Set version as a number. 
This promotes the value field to a union type + w.startEntry(); + w.key().varChar().writeVarChar("if_seq_no"); + w.value().integer().writeInt(3); + w.endEntry(); + + w.endMap(); + + vector.setValueCount(w.getPosition()); + var request = parser.parseAction(vector, 0, null, null); + + // Value type is a union + assertEquals(Types.MinorType.UNION, vector.getChildrenFromFields().get(0).getChildrenFromFields().get(1).getMinorType()); + + assertEquals(DocWriteRequest.OpType.UPDATE, request.opType()); + assertEquals("some-index", request.index()); + assertEquals(3, request.ifSeqNo()); + } + } + + /** Action as a struct */ + public void testActionsAsStruct() throws Exception { + + try ( + var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); + var vector = new StructVector("action", allocator, FieldType.nullable(new ArrowType.Struct()), null); + var parser = createParser("default-index", List.of()) + ) { + var w = vector.getWriter(); + + w.start(); + w.varChar("op_type").writeVarChar("update"); + w.varChar("_index").writeVarChar("first-index"); + w.integer("if_seq_no").writeInt(3); + w.end(); + + w.start(); + w.varChar("op_type").writeVarChar("index"); + w.varChar("_index").writeVarChar("second-index"); + w.integer("if_seq_no").writeInt(4); + w.end(); + + vector.setValueCount(w.getPosition()); + + { + var request = parser.parseAction(vector, 0, null, null); + assertEquals(DocWriteRequest.OpType.UPDATE, request.opType()); + assertEquals("first-index", request.index()); + assertEquals(3, request.ifSeqNo()); + } + + { + var request = parser.parseAction(vector, 1, null, null); + assertEquals(DocWriteRequest.OpType.INDEX, request.opType()); + assertEquals("second-index", request.index()); + assertEquals(4, request.ifSeqNo()); + } + } + } + + // ----- Dictionary encoding + public void testDictionaryEncoding() throws Exception { + + ByteArrayOutputStream payload = new ByteArrayOutputStream(); + + try ( + var allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); + VarCharVector dictVector = new VarCharVector("dict", allocator); + VarCharVector vector = new VarCharVector("data_field", allocator); + DictionaryProvider.MapDictionaryProvider dictionaryProvider = new DictionaryProvider.MapDictionaryProvider(); + ) { + // create dictionary lookup vector + dictVector.allocateNewSafe(); + dictVector.setSafe(0, new Text("aa")); + dictVector.setSafe(1, new Text("bb")); + dictVector.setSafe(2, new Text("cc")); + dictVector.setValueCount(3); + + // create dictionary + long dictionaryId = 1L; + Dictionary dictionary = new Dictionary(dictVector, new DictionaryEncoding(dictionaryId, false, /*indexType=*/null)); + + dictionaryProvider.put(dictionary); + + // create original data vector + vector.allocateNewSafe(); + vector.setSafe(0, new Text("bb")); + vector.setSafe(1, new Text("bb")); + vector.setSafe(2, new Text("cc")); + vector.setSafe(3, new Text("aa")); + vector.setValueCount(4); + + // Encode the vector with the dictionary + IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary); + + // create VectorSchemaRoot + List fields = List.of(encodedVector.getField()); + List vectors = List.of(encodedVector); + + try ( + VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors); + ArrowStreamWriter writer = new ArrowStreamWriter(root, dictionaryProvider, payload); + ) { + // write data + writer.start(); + writer.writeBatch(); + writer.end(); + } + + var operations = new ArrayList>(); + try (var parser = createParser("defaultIndex", operations)) { + parse(parser, 
payload.toByteArray(), false);
+            }
+
+            // Check that dictionary-encoded values were correctly decoded
+            assertEquals("bb", ((IndexRequest) operations.get(0)).sourceAsMap().get("data_field"));
+            assertEquals("bb", ((IndexRequest) operations.get(1)).sourceAsMap().get("data_field"));
+            assertEquals("cc", ((IndexRequest) operations.get(2)).sourceAsMap().get("data_field"));
+            assertEquals("aa", ((IndexRequest) operations.get(3)).sourceAsMap().get("data_field"));
+        }
+    }
+
+    // ----- Utilities
+
+    private static ArrowBulkIncrementalParser createParser(String defaultIndex, List<DocWriteRequest<?>> requests) {
+
+        DocWriteRequest.OpType defaultOpType = DocWriteRequest.OpType.INDEX;
+        String defaultRouting = null;
+        FetchSourceContext defaultFetchSourceContext = null;
+        String defaultPipeline = null;
+        Boolean defaultRequireAlias = false;
+        Boolean defaultRequireDataStream = false;
+        Boolean defaultListExecutedPipelines = false;
+
+        boolean allowExplicitIndex = true;
+        XContentType xContentType = null;
+        BiConsumer<IndexRequest, String> indexRequestConsumer = (r, t) -> requests.add(r);
+        Consumer<UpdateRequest> updateRequestConsumer = requests::add;
+        Consumer<DeleteRequest> deleteRequestConsumer = requests::add;
+
+        return new ArrowBulkIncrementalParser(
+            defaultOpType,
+            defaultIndex,
+            defaultRouting,
+            defaultFetchSourceContext,
+            defaultPipeline,
+            defaultRequireAlias,
+            defaultRequireDataStream,
+            defaultListExecutedPipelines,
+            allowExplicitIndex,
+            xContentType,
+            XContentParserConfiguration.EMPTY.withRestApiVersion(RestApiVersion.current()),
+            indexRequestConsumer,
+            updateRequestConsumer,
+            deleteRequestConsumer
+        );
+    }
+
+    private void parse(ArrowBulkIncrementalParser parser, byte[] payload, boolean incremental) throws IOException {
+
+        int consumed = 0;
+        var request = new BytesArray(payload);
+
+        if (incremental) {
+            // Borrowed from BulkRequestParserTests
+            for (int i = 0; i < request.length() - 1; ++i) {
+                consumed += parser.parse(request.slice(consumed, i - consumed + 1), false);
+            }
+            consumed += parser.parse(request.slice(consumed, request.length() - consumed), true);
+            assertThat(consumed, equalTo(request.length()));
+        } else {
+            consumed = parser.parse(request, true);
+        }
+
+        assertEquals(payload.length, consumed);
+    }
+}
diff --git a/modules/arrow/src/test/java/org/elasticsearch/arrow/bulk/BytesReferenceChannelTests.java b/modules/arrow/src/test/java/org/elasticsearch/arrow/bulk/BytesReferenceChannelTests.java
new file mode 100644
index 0000000000000..0381b707bd8c9
--- /dev/null
+++ b/modules/arrow/src/test/java/org/elasticsearch/arrow/bulk/BytesReferenceChannelTests.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */ + +package org.elasticsearch.arrow.bulk; + +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.bytes.CompositeBytesReference; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.ArrayList; + +public class BytesReferenceChannelTests extends ESTestCase { + + /** + * Check iteration on the buffers of a composite byteref + */ + public void testMultipleBuffers() throws IOException { + + var chunks = new ArrayList(); + byte[] chunk = null; + for (int i = 0; i < 0x100; i++) { + if (i % 0x10 == 0) { + chunk = new byte[0x10]; + chunks.add(new BytesArray(chunk)); + } + chunk[i % 0x10] = (byte) i; + } + + var bytesref = CompositeBytesReference.of(chunks.toArray(new BytesReference[0])); + + try (var channel = new BytesReferenceChannel(bytesref)) { + var in = Channels.newInputStream(channel); + + for (int i = 0; i < 0x100; i++) { + assertEquals(i, in.read()); + } + + assertEquals(-1, in.read()); + } + } +} diff --git a/modules/arrow/src/test/java/org/elasticsearch/arrow/xcontent/ArrowToXContentTests.java b/modules/arrow/src/test/java/org/elasticsearch/arrow/xcontent/ArrowToXContentTests.java new file mode 100644 index 0000000000000..142657e93d086 --- /dev/null +++ b/modules/arrow/src/test/java/org/elasticsearch/arrow/xcontent/ArrowToXContentTests.java @@ -0,0 +1,531 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.arrow.xcontent; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.Float16; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; +import org.apache.arrow.vector.Decimal256Vector; +import org.apache.arrow.vector.DecimalVector; +import org.apache.arrow.vector.DurationVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.FixedSizeBinaryVector; +import org.apache.arrow.vector.Float2Vector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarBinaryVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeMicroVector; +import org.apache.arrow.vector.TimeMilliVector; +import org.apache.arrow.vector.TimeNanoVector; +import org.apache.arrow.vector.TimeSecVector; +import org.apache.arrow.vector.TimeStampMicroTZVector; +import org.apache.arrow.vector.TimeStampMicroVector; +import org.apache.arrow.vector.TimeStampMilliTZVector; +import org.apache.arrow.vector.TimeStampMilliVector; +import org.apache.arrow.vector.TimeStampNanoTZVector; +import org.apache.arrow.vector.TimeStampNanoVector; +import org.apache.arrow.vector.TimeStampSecTZVector; +import org.apache.arrow.vector.TimeStampSecVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ViewVarBinaryVector; +import org.apache.arrow.vector.ViewVarCharVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.elasticsearch.libs.arrow.Arrow; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xcontent.XContentType; +import org.junit.After; +import org.junit.Before; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; +import java.time.ZoneId; +import java.util.Base64; +import java.util.List; + +public class ArrowToXContentTests extends ESTestCase { + + private BufferAllocator allocator; + + @Before + public void init() { + this.allocator = Arrow.newChildAllocator("test", 0, Long.MAX_VALUE); + } + + @After + public void close() { + this.allocator.close(); + } + + private void checkPosition(FieldVector vector, int position, String json) throws IOException { + var arrowToXContent = new ArrowToXContent(); + + var root = new VectorSchemaRoot(List.of(vector)); + // We don't close `root` as it would close `vector` which is owned by the caller. 
+ // This allows checkPosition() to be called several times with the same vector. + + // Roundtrip the vector through its binary representation + var arrowOut = new ByteArrayOutputStream(); + try (var writer = new ArrowStreamWriter(root, null, arrowOut)) { + writer.writeBatch(); + } + + try (var reader = new ArrowStreamReader(new ByteArrayInputStream(arrowOut.toByteArray()), allocator)) { + reader.loadNextBatch(); + var newVector = reader.getVectorSchemaRoot().getVector(0); + + var jsonOut = new ByteArrayOutputStream(); + try (var generator = XContentType.JSON.xContent().createGenerator(jsonOut)) { + generator.writeStartObject(); + arrowToXContent.writeField(newVector, position, null, generator); + generator.writeEndObject(); + } + + assertEquals(json, jsonOut.toString(StandardCharsets.UTF_8)); + } + } + + private FieldVector newVector(String name, Types.MinorType type) { + return type.getNewVector(name, new FieldType(true, type.getType(), null), allocator, null); + } + + // Tests below are in the same order as ArrowToXContent.writeValue() + // + // Note: dictionary encoding is tested in ArrowBulkIncrementalParserTests as + // dictionaries are attached to the StreamReader and need more than checkPosition() above. + + public void testNullValue() throws Exception { + try (var vector = new IntVector("intField", allocator)) { + vector.allocateNew(1); + vector.setNull(0); + vector.setValueCount(1); + + checkPosition(vector, 0, "{\"intField\":null}"); + } + } + + public void testIntegers() throws IOException { + + try (var vector = new TinyIntVector("intField", allocator)) { + vector.allocateNew(1); + vector.set(0, 123); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"intField\":123}"); + } + + try (var vector = new SmallIntVector("intField", allocator)) { + vector.allocateNew(1); + vector.set(0, 123); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"intField\":123}"); + } + + try (var vector = new IntVector("intField", allocator)) { + vector.allocateNew(1); + vector.set(0, 123); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"intField\":123}"); + } + + try (var vector = new BigIntVector("intField", allocator)) { + vector.allocateNew(1); + vector.set(0, 123); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"intField\":123}"); + } + + try (var vector = new UInt1Vector("intField", allocator)) { + vector.allocateNew(2); + vector.set(0, 123); + vector.set(1, 253); // unsigned > 0x7F + vector.setValueCount(2); + checkPosition(vector, 0, "{\"intField\":123}"); + checkPosition(vector, 1, "{\"intField\":253}"); + } + + try (var vector = new UInt2Vector("intField", allocator)) { + vector.allocateNew(2); + vector.set(0, 123); + vector.set(1, 65533); // unsigned > 0x7FFF + vector.setValueCount(2); + checkPosition(vector, 0, "{\"intField\":123}"); + checkPosition(vector, 1, "{\"intField\":65533}"); + } + + try (var vector = new UInt4Vector("intField", allocator)) { + vector.allocateNew(2); + vector.set(0, 123); + + long x = 0xFFFFFFFDL; + assertEquals(4294967293L, x); + // "A narrowing conversion of a signed integer to an integral type T simply + // discards all but the n lowest order bits" + // https://docs.oracle.com/javase/specs/jls/se11/html/jls-5.html#jls-5.1.3 + vector.set(1, (int) x); + + vector.setValueCount(2); + checkPosition(vector, 0, "{\"intField\":123}"); + checkPosition(vector, 1, "{\"intField\":4294967293}"); + } + + try (var vector = new UInt8Vector("intField", allocator)) { + vector.allocateNew(1); + vector.set(0, 123); + vector.setValueCount(1); + // 
No test for large unsigned value as Java has no support for it. + checkPosition(vector, 0, "{\"intField\":123}"); + } + } + + public void testFloats() throws IOException { + float value = 1.25f; // roundtrips through string. + + try (var vector = new Float2Vector("floatField", allocator)) { + vector.allocateNew(1); + vector.set(0, Float16.toFloat16(value)); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"floatField\":1.25}"); + } + + try (var vector = new Float4Vector("floatField", allocator)) { + vector.allocateNew(1); + vector.set(0, value); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"floatField\":1.25}"); + } + + try (var vector = new Float8Vector("floatField", allocator)) { + vector.allocateNew(1); + vector.set(0, value); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"floatField\":1.25}"); + } + } + + public void testDecimals() throws IOException { + var value = new BigDecimal("1.25"); + + try (var vector = new DecimalVector("decimalField", allocator, value.precision(), value.scale())) { + vector.allocateNew(1); + vector.set(0, new BigDecimal("1.25")); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"decimalField\":1.25}"); + } + + try (var vector = new Decimal256Vector("decimalField", allocator, value.precision(), value.scale())) { + vector.allocateNew(1); + vector.set(0, new BigDecimal("1.25")); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"decimalField\":1.25}"); + } + } + + public void testBoolean() throws IOException { + + try (var vector = new BitVector("bitField", allocator)) { + vector.allocateNew(3); + vector.set(0, 0); // 0 is false, other values are true + vector.set(1, 1); + vector.set(2, 2); + vector.setValueCount(3); + checkPosition(vector, 0, "{\"bitField\":false}"); + checkPosition(vector, 1, "{\"bitField\":true}"); + checkPosition(vector, 2, "{\"bitField\":true}"); + } + } + + public void testVarChar() throws Exception { + try (var vector = new VarCharVector("stringField", allocator)) { + vector.allocateNew(); + vector.set(0, "test".getBytes(StandardCharsets.UTF_8)); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"stringField\":\"test\"}"); + } + + try (var vector = new LargeVarCharVector("stringField", allocator)) { + vector.allocateNew(); + vector.set(0, "test".getBytes(StandardCharsets.UTF_8)); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"stringField\":\"test\"}"); + } + + try (var vector = new ViewVarCharVector("stringField", allocator)) { + vector.allocateNew(); + vector.set(0, "test".getBytes(StandardCharsets.UTF_8)); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"stringField\":\"test\"}"); + } + } + + public void testBinary() throws Exception { + var value = "test".getBytes(StandardCharsets.UTF_8); + var expected = Base64.getEncoder().encodeToString(value); + + try (var vector = new VarBinaryVector("bytesField", allocator)) { + vector.allocateNew(); + vector.set(0, value); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"bytesField\":\"" + expected + "\"}"); + } + + try (var vector = new LargeVarBinaryVector("bytesField", allocator)) { + vector.allocateNew(); + vector.set(0, value); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"bytesField\":\"" + expected + "\"}"); + } + + try (var vector = new ViewVarBinaryVector("bytesField", allocator)) { + vector.allocateNew(); + vector.set(0, value); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"bytesField\":\"" + expected + "\"}"); + } + + try (var vector = new FixedSizeBinaryVector("bytesField", 
allocator, value.length)) {
+            vector.allocateNew();
+            vector.set(0, value);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"bytesField\":\"" + expected + "\"}");
+        }
+    }
+
+    public void testTimeStamp() throws Exception {
+        var millis = 1744304614884L; // Thu Apr 10 19:03:34 CEST 2025
+
+        try (var vector = new TimeStampSecVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, millis / 1000L);
+            vector.setValueCount(1);
+            // Check millis value truncated to seconds
+            checkPosition(vector, 0, "{\"field\":" + (millis / 1000L * 1000L) + "}");
+        }
+
+        try (var vector = new TimeStampMilliVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, millis);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + millis + "}");
+        }
+
+        var nanos = millis * 1_000_000L + 123_456L;
+
+        try (var vector = new TimeStampMicroVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, nanos / 1000L);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + (nanos / 1000L * 1000L) + "}");
+        }
+
+        try (var vector = new TimeStampNanoVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, nanos);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + nanos + "}");
+        }
+    }
+
+    public void testTimeStampTZ() throws Exception {
+        // Only used to create the vector. It is dropped when converting to XContent.
+        var tz = ZoneId.of("UTC+1");
+
+        var millis = 1744304614884L; // Thu Apr 10 19:03:34 CEST 2025
+
+        try (var vector = new TimeStampSecTZVector("field", allocator, tz.getId())) {
+            vector.allocateNew(1);
+            vector.set(0, millis / 1000L);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + (millis / 1000L * 1000L) + "}");
+        }
+
+        try (var vector = new TimeStampMilliTZVector("field", allocator, tz.getId())) {
+            vector.allocateNew(1);
+            vector.set(0, millis);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + millis + "}");
+        }
+
+        var nanos = millis * 1_000_000L + 123_456L;
+
+        try (var vector = new TimeStampMicroTZVector("field", allocator, tz.getId())) {
+            vector.allocateNew(1);
+            vector.set(0, nanos / 1000L);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + (nanos / 1000L * 1000L) + "}");
+        }
+
+        try (var vector = new TimeStampNanoTZVector("field", allocator, tz.getId())) {
+            vector.allocateNew(1);
+            vector.set(0, nanos);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + nanos + "}");
+        }
+    }
+
+    public void testDate() throws IOException {
+        var days = randomIntBetween(1, 20329744); // 2025-08-29
+        var millis = days * 86_400_000L; // long arithmetic, to avoid int overflow
+
+        try (var vector = new DateDayVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, days);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + millis + "}");
+        }
+
+        try (var vector = new DateMilliVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, millis);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + millis + "}");
+        }
+    }
+
+    public void testTime() throws Exception {
+        var millis = randomIntBetween(0, 86_400_000 - 1);
+
+        try (var vector = new TimeSecVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, millis / 1000);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + (millis / 1000 * 1000) + "}");
+        }
+
+        try (var vector = new TimeMilliVector("field", allocator)) {
+            vector.allocateNew(1);
+            vector.set(0, millis);
+            vector.setValueCount(1);
+            checkPosition(vector, 0, "{\"field\":" + millis
+ "}"); + } + + var nanos = randomLongBetween(0, 86_400_000_000_000L - 1); + + try (var vector = new TimeMicroVector("field", allocator)) { + vector.allocateNew(1); + vector.set(0, nanos / 1000L); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"field\":" + (nanos / 1000 * 1000) + "}"); + } + + try (var vector = new TimeNanoVector("field", allocator)) { + vector.allocateNew(1); + vector.set(0, nanos); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"field\":" + nanos + "}"); + } + } + + public void testDuration() throws IOException { + var millis = randomLongBetween(-1_000_000L, 1_000_000L); + + try (var vector = new DurationVector("field", FieldType.nullable(new ArrowType.Duration(TimeUnit.SECOND)), allocator)) { + vector.allocateNew(1); + vector.set(0, millis / 1000L); + vector.setValueCount(1); + // Check millis value truncated to seconds + checkPosition(vector, 0, "{\"field\":" + (millis / 1000L * 1000L) + "}"); + } + + try (var vector = new TimeStampMilliVector("field", allocator)) { + vector.allocateNew(1); + vector.set(0, millis); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"field\":" + millis + "}"); + } + + var nanos = millis * 1_000_000L + 123_456L; + + try (var vector = new TimeStampMicroVector("field", allocator)) { + vector.allocateNew(1); + vector.set(0, nanos / 1000L); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"field\":" + (nanos / 1000L * 1000L) + "}"); + } + + try (var vector = new TimeStampNanoVector("field", allocator)) { + vector.allocateNew(1); + vector.set(0, nanos); + vector.setValueCount(1); + checkPosition(vector, 0, "{\"field\":" + nanos + "}"); + } + + } + + public void testList() throws Exception { + try (var vector = ListVector.empty("listField", allocator)) { + var w = vector.getWriter(); + + w.startList(); + w.writeInt(1); + w.writeInt(2); + w.endList(); + + w.startList(); + w.writeInt(3); + w.writeInt(4); + w.writeInt(5); + w.endList(); + w.setValueCount(w.getPosition()); + + checkPosition(vector, 0, "{\"listField\":[1,2]}"); + checkPosition(vector, 1, "{\"listField\":[3,4,5]}"); + } + } + + public void testMap() throws Exception { + try (var vector = MapVector.empty("mapField", allocator, false)) { + var w = vector.getWriter(); + + w.startMap(); + w.startEntry(); + w.key().varChar().writeVarChar("key1"); + w.value().integer().writeInt(42); + w.endEntry(); + w.endMap(); + w.setValueCount(w.getPosition()); + + checkPosition(vector, 0, "{\"mapField\":{\"key1\":42}}"); + } + } + + // TODO: struct (already exercised in ArrowBulkIncrementalParserTests) + // TODO: dense union + // TODO: union (already exercised in ArrowBulkIncrementalParserTests) + + public void testNullVector() throws Exception { + try (NullVector vector = new NullVector("nullField", 1);) { + checkPosition(vector, 0, "{\"nullField\":null}"); + } + } + +} diff --git a/rest-api-spec/src/main/resources/rest-api-spec/api/arrow.bulk.json b/rest-api-spec/src/main/resources/rest-api-spec/api/arrow.bulk.json new file mode 100644 index 0000000000000..c7ebc9bb7c114 --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/api/arrow.bulk.json @@ -0,0 +1,98 @@ +{ + "arrow.bulk":{ + "documentation":{ + "url":"https://www.elastic.co/guide/en/elasticsearch/reference/master/docs-bulk.html", + "description":"Allows to perform multiple index/update/delete operations in a single request using the Arrow IPC streaming format." 
+    },
+    "stability":"experimental",
+    "visibility":"public",
+    "headers":{
+      "accept": [ "application/json"],
+      "content_type": ["application/vnd.apache.arrow.stream"]
+    },
+    "url":{
+      "paths":[
+        {
+          "path":"/_arrow/_bulk",
+          "methods":[
+            "POST",
+            "PUT"
+          ]
+        },
+        {
+          "path":"/_arrow/{index}/_bulk",
+          "methods":[
+            "POST",
+            "PUT"
+          ],
+          "parts":{
+            "index":{
+              "type":"string",
+              "description":"Default index for items which don't provide one"
+            }
+          }
+        }
+      ]
+    },
+    "params":{
+      "wait_for_active_shards":{
+        "type":"string",
+        "description":"Sets the number of shard copies that must be active before proceeding with the bulk operation. Defaults to 1, meaning the primary shard only. Set to `all` for all shard copies, otherwise set to any non-negative value less than or equal to the total number of copies for the shard (number of replicas + 1)"
+      },
+      "refresh":{
+        "type":"enum",
+        "options":[
+          "true",
+          "false",
+          "wait_for"
+        ],
+        "description":"If `true` then refresh the affected shards to make this operation visible to search, if `wait_for` then wait for a refresh to make this operation visible to search, if `false` (the default) then do nothing with refreshes."
+      },
+      "routing":{
+        "type":"string",
+        "description":"Specific routing value"
+      },
+      "timeout":{
+        "type":"time",
+        "description":"Explicit operation timeout"
+      },
+      "_source":{
+        "type":"list",
+        "description":"True or false to return the _source field or not, or default list of fields to return, can be overridden on each sub-request"
+      },
+      "_source_excludes":{
+        "type":"list",
+        "description":"Default list of fields to exclude from the returned _source field, can be overridden on each sub-request"
+      },
+      "_source_includes":{
+        "type":"list",
+        "description":"Default list of fields to extract and return from the _source field, can be overridden on each sub-request"
+      },
+      "pipeline":{
+        "type":"string",
+        "description":"The pipeline id to preprocess incoming documents with"
+      },
+      "require_alias": {
+        "type": "boolean",
+        "description": "If true, the request's actions must target an index alias. Defaults to false."
+      },
+      "require_data_stream": {
+        "type": "boolean",
+        "description": "If true, the request's actions must target a data stream (existing or to-be-created). Defaults to false."
+      },
+      "list_executed_pipelines": {
+        "type": "boolean",
+        "description": "Sets list_executed_pipelines for all incoming documents. Defaults to unset (false)"
+      },
+      "include_source_on_error": {
+        "type": "boolean",
+        "description": "Whether to include the document source in the error message in case of parsing errors. Defaults to true."
+      }
+    },
+    "body":{
+      "description":"The bulk operation data, encoded using the Arrow IPC streaming format",
+      "required":true,
+      "serialize":"bulk"
+    }
+  }
+}
diff --git a/server/build.gradle b/server/build.gradle
index 7163aad57b390..ef0cc63376cf3 100644
--- a/server/build.gradle
+++ b/server/build.gradle
@@ -14,6 +14,7 @@ apply plugin: 'elasticsearch.internal-test-artifact'
 apply plugin: 'elasticsearch.test-build-info'
 apply plugin: 'elasticsearch.transport-version-references'
 apply plugin: 'elasticsearch.transport-version-resources'
+apply plugin: 'elasticsearch.internal-java-rest-test'
 
 publishing {
   publications {
diff --git a/server/src/main/java/org/elasticsearch/action/bulk/AbstractBulkRequestParser.java b/server/src/main/java/org/elasticsearch/action/bulk/AbstractBulkRequestParser.java
new file mode 100644
index 0000000000000..9e9c72b337c8c
--- /dev/null
+++ b/server/src/main/java/org/elasticsearch/action/bulk/AbstractBulkRequestParser.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.action.bulk;
+
+import org.elasticsearch.action.delete.DeleteRequest;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.update.UpdateRequest;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.Releasable;
+import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
+import org.elasticsearch.xcontent.XContentType;
+
+import java.io.IOException;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+
+public abstract class AbstractBulkRequestParser {
+
+    /**
+     * A parser for streamed data. Every call to {@link #parse(BytesReference, boolean)} should
+     * consume as much data as possible and return the number of bytes that were consumed.
+     */
+    public interface IncrementalParser extends Releasable {
+        /**
+         * @param data the data
+         * @param lastData {@code true} if this is the last chunk of data for the request
+         * @return the number of bytes that were parsed
+         */
+        int parse(BytesReference data, boolean lastData) throws IOException;
+    }
+
+    /**
+     * Parse the provided {@code data} assuming the provided default values. Index requests
+     * will be passed to the {@code indexRequestConsumer}, update requests to the
+     * {@code updateRequestConsumer} and delete requests to the {@code deleteRequestConsumer}.
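+     * <p>A minimal usage sketch; the list and the three consumers are illustrative, not part of this API:
+     * <pre>{@code
+     * List<DocWriteRequest<?>> requests = new ArrayList<>();
+     * parser.parse(data, "my-index", null, null, null, null, null, null, true, XContentType.JSON,
+     *     (indexRequest, type) -> requests.add(indexRequest), requests::add, requests::add);
+     * }</pre>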
+     */
+    public abstract void parse(
+        BytesReference data,
+        @Nullable String defaultIndex,
+        @Nullable String defaultRouting,
+        @Nullable FetchSourceContext defaultFetchSourceContext,
+        @Nullable String defaultPipeline,
+        @Nullable Boolean defaultRequireAlias,
+        @Nullable Boolean defaultRequireDataStream,
+        @Nullable Boolean defaultListExecutedPipelines,
+        boolean allowExplicitIndex,
+        XContentType xContentType,
+        BiConsumer<IndexRequest, String> indexRequestConsumer,
+        Consumer<UpdateRequest> updateRequestConsumer,
+        Consumer<DeleteRequest> deleteRequestConsumer
+    ) throws IOException;
+
+    public abstract IncrementalParser incrementalParser(
+        @Nullable String defaultIndex,
+        @Nullable String defaultRouting,
+        @Nullable FetchSourceContext defaultFetchSourceContext,
+        @Nullable String defaultPipeline,
+        @Nullable Boolean defaultRequireAlias,
+        @Nullable Boolean defaultRequireDataStream,
+        @Nullable Boolean defaultListExecutedPipelines,
+        boolean allowExplicitIndex,
+        XContentType xContentType,
+        BiConsumer<IndexRequest, String> indexRequestConsumer,
+        Consumer<UpdateRequest> updateRequestConsumer,
+        Consumer<DeleteRequest> deleteRequestConsumer
+    );
+}
diff --git a/server/src/main/java/org/elasticsearch/action/bulk/BulkRequestParser.java b/server/src/main/java/org/elasticsearch/action/bulk/BulkRequestParser.java
index 2f336566953ba..3c81e3a94e03f 100644
--- a/server/src/main/java/org/elasticsearch/action/bulk/BulkRequestParser.java
+++ b/server/src/main/java/org/elasticsearch/action/bulk/BulkRequestParser.java
@@ -43,7 +43,7 @@
 /**
  * Helper to parse bulk requests. This should be considered an internal class.
  */
-public final class BulkRequestParser {
+public final class BulkRequestParser extends AbstractBulkRequestParser {
 
     @UpdateForV10(owner = UpdateForV10.Owner.DATA_MANAGEMENT)
     // Remove deprecation logger when its usages in checkBulkActionIsProperlyClosed are removed
@@ -127,6 +127,7 @@ private static BytesReference sliceTrimmingCarriageReturn(
      * will be passed to the {@code indexRequestConsumer}, update requests to the
      * {@code updateRequestConsumer} and delete requests to the {@code deleteRequestConsumer}.
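+     * <p>This implementation hands the entire payload to a short-lived {@link XContentIncrementalParser}
+     * through a single {@code parse(data, true)} call.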
     */
+    @Override
     public void parse(
         BytesReference data,
         @Nullable String defaultIndex,
@@ -142,7 +143,7 @@ public void parse(
         Consumer<UpdateRequest> updateRequestConsumer,
         Consumer<DeleteRequest> deleteRequestConsumer
     ) throws IOException {
-        IncrementalParser incrementalParser = new IncrementalParser(
+        IncrementalParser incrementalParser = new XContentIncrementalParser(
             defaultIndex,
             defaultRouting,
             defaultFetchSourceContext,
@@ -151,7 +152,9 @@ public void parse(
             defaultRequireDataStream,
             defaultListExecutedPipelines,
             allowExplicitIndex,
+            deprecateOrErrorOnType,
             xContentType,
+            config,
             indexRequestConsumer,
             updateRequestConsumer,
             deleteRequestConsumer
@@ -160,6 +163,7 @@
         incrementalParser.parse(data, true);
     }
 
+    @Override
     public IncrementalParser incrementalParser(
         @Nullable String defaultIndex,
         @Nullable String defaultRouting,
@@ -174,7 +178,7 @@ public IncrementalParser incrementalParser(
         Consumer<UpdateRequest> updateRequestConsumer,
         Consumer<DeleteRequest> deleteRequestConsumer
     ) {
-        return new IncrementalParser(
+        return new XContentIncrementalParser(
             defaultIndex,
             defaultRouting,
             defaultFetchSourceContext,
@@ -183,21 +187,23 @@ public IncrementalParser incrementalParser(
             defaultRequireDataStream,
             defaultListExecutedPipelines,
             allowExplicitIndex,
+            deprecateOrErrorOnType,
             xContentType,
+            config,
             indexRequestConsumer,
             updateRequestConsumer,
             deleteRequestConsumer
         );
     }
 
-    public class IncrementalParser {
+    public static class XContentIncrementalParser implements IncrementalParser {
 
         // Bulk requests can contain a lot of repeated strings for the index, pipeline and routing parameters. This map is used to
         // deduplicate duplicate strings parsed for these parameters. While it does not prevent instantiating the duplicate strings, it
         // reduces their lifetime to the lifetime of this parse call instead of the lifetime of the full bulk request.
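+        // The default index and the three request consumers are protected rather than private so that
+        // format-specific parsers (the Arrow bulk parser, for instance) can subclass and reuse them.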
        private final Map<String, String> stringDeduplicator = new HashMap<>();
 
-        private final String defaultIndex;
+        protected final String defaultIndex;
         private final String defaultRouting;
         private final FetchSourceContext defaultFetchSourceContext;
         private final String defaultPipeline;
@@ -205,12 +211,14 @@ public class IncrementalParser {
         private final Boolean defaultRequireDataStream;
         private final Boolean defaultListExecutedPipelines;
         private final boolean allowExplicitIndex;
+        private final boolean deprecateOrErrorOnType;
         private final XContentType xContentType;
+        private final XContentParserConfiguration config;
         private final byte marker;
-        private final BiConsumer<IndexRequest, String> indexRequestConsumer;
-        private final Consumer<UpdateRequest> updateRequestConsumer;
-        private final Consumer<DeleteRequest> deleteRequestConsumer;
+        protected final BiConsumer<IndexRequest, String> indexRequestConsumer;
+        protected final Consumer<UpdateRequest> updateRequestConsumer;
+        protected final Consumer<DeleteRequest> deleteRequestConsumer;
 
         private Exception failure = null;
         private int incrementalFromOffset = 0;
@@ -222,7 +230,7 @@ public class IncrementalParser {
         private boolean currentListExecutedPipelines = false;
         private FetchSourceContext currentFetchSourceContext = null;
 
-        private IncrementalParser(
+        protected XContentIncrementalParser(
            @Nullable String defaultIndex,
            @Nullable String defaultRouting,
            @Nullable FetchSourceContext defaultFetchSourceContext,
@@ -231,7 +239,9 @@ private IncrementalParser(
            @Nullable Boolean defaultRequireDataStream,
            @Nullable Boolean defaultListExecutedPipelines,
            boolean allowExplicitIndex,
+           boolean deprecateOrErrorOnType,
            XContentType xContentType,
+           XContentParserConfiguration config,
            BiConsumer<IndexRequest, String> indexRequestConsumer,
            Consumer<UpdateRequest> updateRequestConsumer,
            Consumer<DeleteRequest> deleteRequestConsumer
@@ -244,13 +254,16 @@ private IncrementalParser(
            this.defaultRequireDataStream = defaultRequireDataStream;
            this.defaultListExecutedPipelines = defaultListExecutedPipelines;
            this.allowExplicitIndex = allowExplicitIndex;
+           this.deprecateOrErrorOnType = deprecateOrErrorOnType;
            this.xContentType = xContentType;
-           this.marker = xContentType.xContent().bulkSeparator();
+           this.config = config;
+           this.marker = xContentType == null ?
0 : xContentType.xContent().bulkSeparator(); // null for Arrow this.indexRequestConsumer = indexRequestConsumer; this.updateRequestConsumer = updateRequestConsumer; this.deleteRequestConsumer = deleteRequestConsumer; } + @Override public int parse(BytesReference data, boolean lastData) throws IOException { if (failure != null) { assert false : failure.getMessage(); @@ -264,6 +277,11 @@ public int parse(BytesReference data, boolean lastData) throws IOException { } } + @Override + public void close() { + // Nothing + } + private int tryParse(BytesReference data, boolean lastData) throws IOException { int from = 0; int consumed = 0; @@ -299,228 +317,235 @@ private int tryParse(BytesReference data, boolean lastData) throws IOException { private boolean parseActionLine(BytesReference data, int from, int to) throws IOException { assert currentRequest == null; + try (XContentParser parser = createParser(xContentType.xContent(), config, data, from, to)) { + currentRequest = parseActionLine(parser); + return currentRequest != null; + } + } + + protected DocWriteRequest parseActionLine(XContentParser parser) throws IOException { + + DocWriteRequest currentRequest = null; + // Reset the fields which are accessed during document line parsing currentType = null; currentPipeline = defaultPipeline; currentListExecutedPipelines = defaultListExecutedPipelines != null && defaultListExecutedPipelines; currentFetchSourceContext = defaultFetchSourceContext; - try (XContentParser parser = createParser(xContentType.xContent(), data, from, to)) { + // Move to START_OBJECT + XContentParser.Token token = parser.nextToken(); + if (token == null) { + return null; + } + if (token != XContentParser.Token.START_OBJECT) { + throw new IllegalArgumentException( + "Malformed action/metadata line [" + + line + + "], expected " + + XContentParser.Token.START_OBJECT + + " but found [" + + token + + "]" + ); + } + // Move to FIELD_NAME, that's the action + token = parser.nextToken(); + if (token != XContentParser.Token.FIELD_NAME) { + throw new IllegalArgumentException( + "Malformed action/metadata line [" + + line + + "], expected " + + XContentParser.Token.FIELD_NAME + + " but found [" + + token + + "]" + ); + } + String action = parser.currentName(); + if (SUPPORTED_ACTIONS.contains(action) == false) { + throw new IllegalArgumentException( + "Malformed action/metadata line [" + + line + + "], expected field [create], [delete], [index] or [update] but found [" + + action + + "]" + ); + } - // Move to START_OBJECT - XContentParser.Token token = parser.nextToken(); - if (token == null) { - return false; - } - if (token != XContentParser.Token.START_OBJECT) { - throw new IllegalArgumentException( - "Malformed action/metadata line [" - + line - + "], expected " - + XContentParser.Token.START_OBJECT - + " but found [" - + token - + "]" - ); - } - // Move to FIELD_NAME, that's the action - token = parser.nextToken(); - if (token != XContentParser.Token.FIELD_NAME) { - throw new IllegalArgumentException( - "Malformed action/metadata line [" - + line - + "], expected " - + XContentParser.Token.FIELD_NAME - + " but found [" - + token - + "]" - ); - } - String action = parser.currentName(); - if (SUPPORTED_ACTIONS.contains(action) == false) { - throw new IllegalArgumentException( - "Malformed action/metadata line [" - + line - + "], expected field [create], [delete], [index] or [update] but found [" - + action - + "]" - ); - } + String index = defaultIndex; + String id = null; + String routing = defaultRouting; + String opType = null; 
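+            // Remaining per-action metadata, set to its defaults here and possibly overridden by the
+            // action's parameter object parsed below.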
+ long version = Versions.MATCH_ANY; + VersionType versionType = VersionType.INTERNAL; + long ifSeqNo = SequenceNumbers.UNASSIGNED_SEQ_NO; + long ifPrimaryTerm = UNASSIGNED_PRIMARY_TERM; + int retryOnConflict = 0; + boolean requireAlias = defaultRequireAlias != null && defaultRequireAlias; + boolean requireDataStream = defaultRequireDataStream != null && defaultRequireDataStream; + Map dynamicTemplates = Map.of(); + + // at this stage, next token can either be END_OBJECT (and use default index and type, with auto generated id) + // or START_OBJECT which will have another set of parameters + token = parser.nextToken(); - String index = defaultIndex; - String id = null; - String routing = defaultRouting; - String opType = null; - long version = Versions.MATCH_ANY; - VersionType versionType = VersionType.INTERNAL; - long ifSeqNo = SequenceNumbers.UNASSIGNED_SEQ_NO; - long ifPrimaryTerm = UNASSIGNED_PRIMARY_TERM; - int retryOnConflict = 0; - boolean requireAlias = defaultRequireAlias != null && defaultRequireAlias; - boolean requireDataStream = defaultRequireDataStream != null && defaultRequireDataStream; - Map dynamicTemplates = Map.of(); - - // at this stage, next token can either be END_OBJECT (and use default index and type, with auto generated id) - // or START_OBJECT which will have another set of parameters - token = parser.nextToken(); - - if (token == XContentParser.Token.START_OBJECT) { - String currentFieldName = null; - while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { - if (token == XContentParser.Token.FIELD_NAME) { - currentFieldName = parser.currentName(); - } else if (token.isValue()) { - if (INDEX.match(currentFieldName, parser.getDeprecationHandler())) { - if (allowExplicitIndex == false) { - throw new IllegalArgumentException("explicit index in bulk is not allowed"); - } - index = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); - } else if (TYPE.match(currentFieldName, parser.getDeprecationHandler())) { - if (deprecateOrErrorOnType) { - throw new IllegalArgumentException( - "Action/metadata line [" + line + "] contains an unknown parameter [" + currentFieldName + "]" - ); - } - currentType = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); - } else if (ID.match(currentFieldName, parser.getDeprecationHandler())) { - id = parser.text(); - } else if (ROUTING.match(currentFieldName, parser.getDeprecationHandler())) { - routing = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); - } else if (OP_TYPE.match(currentFieldName, parser.getDeprecationHandler())) { - opType = parser.text(); - } else if (VERSION.match(currentFieldName, parser.getDeprecationHandler())) { - version = parser.longValue(); - } else if (VERSION_TYPE.match(currentFieldName, parser.getDeprecationHandler())) { - versionType = VersionType.fromString(parser.text()); - } else if (IF_SEQ_NO.match(currentFieldName, parser.getDeprecationHandler())) { - ifSeqNo = parser.longValue(); - } else if (IF_PRIMARY_TERM.match(currentFieldName, parser.getDeprecationHandler())) { - ifPrimaryTerm = parser.longValue(); - } else if (RETRY_ON_CONFLICT.match(currentFieldName, parser.getDeprecationHandler())) { - retryOnConflict = parser.intValue(); - } else if (PIPELINE.match(currentFieldName, parser.getDeprecationHandler())) { - currentPipeline = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); - } else if (SOURCE.match(currentFieldName, parser.getDeprecationHandler())) { - currentFetchSourceContext = 
FetchSourceContext.fromXContent(parser); - } else if (REQUIRE_ALIAS.match(currentFieldName, parser.getDeprecationHandler())) { - requireAlias = parser.booleanValue(); - } else if (REQUIRE_DATA_STREAM.match(currentFieldName, parser.getDeprecationHandler())) { - requireDataStream = parser.booleanValue(); - } else if (LIST_EXECUTED_PIPELINES.match(currentFieldName, parser.getDeprecationHandler())) { - currentListExecutedPipelines = parser.booleanValue(); - } else { + if (token == XContentParser.Token.START_OBJECT) { + String currentFieldName = null; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else if (token.isValue()) { + if (INDEX.match(currentFieldName, parser.getDeprecationHandler())) { + if (allowExplicitIndex == false) { + throw new IllegalArgumentException("explicit index in bulk is not allowed"); + } + index = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); + } else if (TYPE.match(currentFieldName, parser.getDeprecationHandler())) { + if (deprecateOrErrorOnType) { throw new IllegalArgumentException( "Action/metadata line [" + line + "] contains an unknown parameter [" + currentFieldName + "]" ); } - } else if (token == XContentParser.Token.START_ARRAY) { + currentType = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); + } else if (ID.match(currentFieldName, parser.getDeprecationHandler())) { + id = parser.text(); + } else if (ROUTING.match(currentFieldName, parser.getDeprecationHandler())) { + routing = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); + } else if (OP_TYPE.match(currentFieldName, parser.getDeprecationHandler())) { + opType = parser.text(); + } else if (VERSION.match(currentFieldName, parser.getDeprecationHandler())) { + version = parser.longValue(); + } else if (VERSION_TYPE.match(currentFieldName, parser.getDeprecationHandler())) { + versionType = VersionType.fromString(parser.text()); + } else if (IF_SEQ_NO.match(currentFieldName, parser.getDeprecationHandler())) { + ifSeqNo = parser.longValue(); + } else if (IF_PRIMARY_TERM.match(currentFieldName, parser.getDeprecationHandler())) { + ifPrimaryTerm = parser.longValue(); + } else if (RETRY_ON_CONFLICT.match(currentFieldName, parser.getDeprecationHandler())) { + retryOnConflict = parser.intValue(); + } else if (PIPELINE.match(currentFieldName, parser.getDeprecationHandler())) { + currentPipeline = stringDeduplicator.computeIfAbsent(parser.text(), Function.identity()); + } else if (SOURCE.match(currentFieldName, parser.getDeprecationHandler())) { + currentFetchSourceContext = FetchSourceContext.fromXContent(parser); + } else if (REQUIRE_ALIAS.match(currentFieldName, parser.getDeprecationHandler())) { + requireAlias = parser.booleanValue(); + } else if (REQUIRE_DATA_STREAM.match(currentFieldName, parser.getDeprecationHandler())) { + requireDataStream = parser.booleanValue(); + } else if (LIST_EXECUTED_PIPELINES.match(currentFieldName, parser.getDeprecationHandler())) { + currentListExecutedPipelines = parser.booleanValue(); + } else { throw new IllegalArgumentException( - "Malformed action/metadata line [" - + line - + "], expected a simple value for field [" - + currentFieldName - + "] but found [" - + token - + "]" + "Action/metadata line [" + line + "] contains an unknown parameter [" + currentFieldName + "]" ); + } + } else if (token == XContentParser.Token.START_ARRAY) { + throw new IllegalArgumentException( + "Malformed 
action/metadata line [" + + line + + "], expected a simple value for field [" + + currentFieldName + + "] but found [" + + token + + "]" + ); + } else if (token == XContentParser.Token.START_OBJECT + && DYNAMIC_TEMPLATES.match(currentFieldName, parser.getDeprecationHandler())) { + dynamicTemplates = parser.mapStrings(); } else if (token == XContentParser.Token.START_OBJECT - && DYNAMIC_TEMPLATES.match(currentFieldName, parser.getDeprecationHandler())) { - dynamicTemplates = parser.mapStrings(); - } else if (token == XContentParser.Token.START_OBJECT - && SOURCE.match(currentFieldName, parser.getDeprecationHandler())) { - currentFetchSourceContext = FetchSourceContext.fromXContent(parser); - } else if (token != XContentParser.Token.VALUE_NULL) { - throw new IllegalArgumentException( - "Malformed action/metadata line [" - + line - + "], expected a simple value for field [" - + currentFieldName - + "] but found [" - + token - + "]" - ); - } - } - } else if (token != XContentParser.Token.END_OBJECT) { + && SOURCE.match(currentFieldName, parser.getDeprecationHandler())) { + currentFetchSourceContext = FetchSourceContext.fromXContent(parser); + } else if (token != XContentParser.Token.VALUE_NULL) { + throw new IllegalArgumentException( + "Malformed action/metadata line [" + + line + + "], expected a simple value for field [" + + currentFieldName + + "] but found [" + + token + + "]" + ); + } + } + } else if (token != XContentParser.Token.END_OBJECT) { + throw new IllegalArgumentException( + "Malformed action/metadata line [" + + line + + "], expected " + + XContentParser.Token.START_OBJECT + + " or " + + XContentParser.Token.END_OBJECT + + " but found [" + + token + + "]" + ); + } + checkBulkActionIsProperlyClosed(parser, config, line); + + if ("delete".equals(action)) { + if (dynamicTemplates.isEmpty() == false) { throw new IllegalArgumentException( - "Malformed action/metadata line [" - + line - + "], expected " - + XContentParser.Token.START_OBJECT - + " or " - + XContentParser.Token.END_OBJECT - + " but found [" - + token - + "]" + "Delete request in line [" + line + "] does not accept " + DYNAMIC_TEMPLATES.getPreferredName() ); } - checkBulkActionIsProperlyClosed(parser, line); - - if ("delete".equals(action)) { + currentRequest = new DeleteRequest(index).id(id) + .routing(routing) + .version(version) + .versionType(versionType) + .setIfSeqNo(ifSeqNo) + .setIfPrimaryTerm(ifPrimaryTerm); + } else { + // we use internalAdd so we don't fork here, this allows us not to copy over the big byte array to small chunks + // of index request. + if ("index".equals(action) || "create".equals(action)) { + var indexRequest = new IndexRequest(index).id(id) + .routing(routing) + .version(version) + .versionType(versionType) + .setPipeline(currentPipeline) + .setIfSeqNo(ifSeqNo) + .setIfPrimaryTerm(ifPrimaryTerm) + .setDynamicTemplates(dynamicTemplates) + .setRequireAlias(requireAlias) + .setRequireDataStream(requireDataStream) + .setListExecutedPipelines(currentListExecutedPipelines) + .setIncludeSourceOnError(config.includeSourceOnError()); + if ("create".equals(action)) { + indexRequest = indexRequest.create(true); + } else if (opType != null) { + indexRequest = indexRequest.create("create".equals(opType)); + } + currentRequest = indexRequest; + } else if ("update".equals(action)) { + if (version != Versions.MATCH_ANY || versionType != VersionType.INTERNAL) { + throw new IllegalArgumentException( + "Update requests do not support versioning. 
" + "Please use `if_seq_no` and `if_primary_term` instead" + ); + } + if (requireDataStream) { + throw new IllegalArgumentException( + "Update requests do not support the `require_data_stream` flag, " + + "as data streams do not support update operations" + ); + } + // TODO: support dynamic_templates in update requests if (dynamicTemplates.isEmpty() == false) { throw new IllegalArgumentException( - "Delete request in line [" + line + "] does not accept " + DYNAMIC_TEMPLATES.getPreferredName() + "Update request in line [" + line + "] does not accept " + DYNAMIC_TEMPLATES.getPreferredName() ); } - currentRequest = new DeleteRequest(index).id(id) + UpdateRequest updateRequest = new UpdateRequest().index(index) + .id(id) .routing(routing) - .version(version) - .versionType(versionType) + .retryOnConflict(retryOnConflict) .setIfSeqNo(ifSeqNo) - .setIfPrimaryTerm(ifPrimaryTerm); - } else { - // we use internalAdd so we don't fork here, this allows us not to copy over the big byte array to small chunks - // of index request. - if ("index".equals(action) || "create".equals(action)) { - var indexRequest = new IndexRequest(index).id(id) - .routing(routing) - .version(version) - .versionType(versionType) - .setPipeline(currentPipeline) - .setIfSeqNo(ifSeqNo) - .setIfPrimaryTerm(ifPrimaryTerm) - .setDynamicTemplates(dynamicTemplates) - .setRequireAlias(requireAlias) - .setRequireDataStream(requireDataStream) - .setListExecutedPipelines(currentListExecutedPipelines) - .setIncludeSourceOnError(config.includeSourceOnError()); - if ("create".equals(action)) { - indexRequest = indexRequest.create(true); - } else if (opType != null) { - indexRequest = indexRequest.create("create".equals(opType)); - } - currentRequest = indexRequest; - } else if ("update".equals(action)) { - if (version != Versions.MATCH_ANY || versionType != VersionType.INTERNAL) { - throw new IllegalArgumentException( - "Update requests do not support versioning. 
" + "Please use `if_seq_no` and `if_primary_term` instead" - ); - } - if (requireDataStream) { - throw new IllegalArgumentException( - "Update requests do not support the `require_data_stream` flag, " - + "as data streams do not support update operations" - ); - } - // TODO: support dynamic_templates in update requests - if (dynamicTemplates.isEmpty() == false) { - throw new IllegalArgumentException( - "Update request in line [" + line + "] does not accept " + DYNAMIC_TEMPLATES.getPreferredName() - ); - } - UpdateRequest updateRequest = new UpdateRequest().index(index) - .id(id) - .routing(routing) - .retryOnConflict(retryOnConflict) - .setIfSeqNo(ifSeqNo) - .setIfPrimaryTerm(ifPrimaryTerm) - .setRequireAlias(requireAlias) - .routing(routing); - currentRequest = updateRequest; - } + .setIfPrimaryTerm(ifPrimaryTerm) + .setRequireAlias(requireAlias) + .routing(routing); + currentRequest = updateRequest; } } - return true; + return currentRequest; } private void parseAndConsumeDocumentLine(BytesReference data, int from, int to) throws IOException { @@ -532,6 +557,7 @@ private void parseAndConsumeDocumentLine(BytesReference data, int from, int to) try ( XContentParser sliceParser = createParser( xContentType.xContent(), + config, sliceTrimmingCarriageReturn(data, from, to, xContentType) ) ) { @@ -551,7 +577,8 @@ private void parseAndConsumeDocumentLine(BytesReference data, int from, int to) } @UpdateForV10(owner = UpdateForV10.Owner.DATA_MANAGEMENT) // Remove lenient parsing in V8 BWC mode - private void checkBulkActionIsProperlyClosed(XContentParser parser, int line) throws IOException { + private static void checkBulkActionIsProperlyClosed(XContentParser parser, XContentParserConfiguration config, int line) + throws IOException { XContentParser.Token token; try { token = parser.nextToken(); @@ -602,9 +629,10 @@ private void checkBulkActionIsProperlyClosed(XContentParser parser, int line) th } } - private XContentParser createParser(XContent xContent, BytesReference data) throws IOException { + private static XContentParser createParser(XContent xContent, XContentParserConfiguration config, BytesReference data) + throws IOException { if (data.hasArray()) { - return parseBytesArray(xContent, data, 0, data.length()); + return parseBytesArray(xContent, config, data, 0, data.length()); } else { return xContent.createParser(config, data.streamInput()); } @@ -612,21 +640,33 @@ private XContentParser createParser(XContent xContent, BytesReference data) thro // Create an efficient parser of the given bytes, trying to directly parse a byte array if possible and falling back to stream wrapping // otherwise. 
- private XContentParser createParser(XContent xContent, BytesReference data, int from, int nextMarker) throws IOException { + private static XContentParser createParser( + XContent xContent, + XContentParserConfiguration config, + BytesReference data, + int from, + int nextMarker + ) throws IOException { if (data.hasArray()) { - return parseBytesArray(xContent, data, from, nextMarker); + return parseBytesArray(xContent, config, data, from, nextMarker); } else { final int length = nextMarker - from; final BytesReference slice = data.slice(from, length); if (slice.hasArray()) { - return parseBytesArray(xContent, slice, 0, length); + return parseBytesArray(xContent, config, slice, 0, length); } else { return xContent.createParser(config, slice.streamInput()); } } } - private XContentParser parseBytesArray(XContent xContent, BytesReference array, int from, int nextMarker) throws IOException { + private static XContentParser parseBytesArray( + XContent xContent, + XContentParserConfiguration config, + BytesReference array, + int from, + int nextMarker + ) throws IOException { assert array.hasArray(); final int offset = array.arrayOffset(); return xContent.createParser(config, array.array(), offset + from, nextMarker - from); diff --git a/server/src/main/java/org/elasticsearch/action/bulk/IncrementalBulkService.java b/server/src/main/java/org/elasticsearch/action/bulk/IncrementalBulkService.java index b721a8f4f2b6b..e04e09f809215 100644 --- a/server/src/main/java/org/elasticsearch/action/bulk/IncrementalBulkService.java +++ b/server/src/main/java/org/elasticsearch/action/bulk/IncrementalBulkService.java @@ -55,11 +55,15 @@ public class IncrementalBulkService { public IncrementalBulkService(Client client, IndexingPressure indexingPressure, MeterRegistry meterRegistry) { this.client = client; this.indexingPressure = indexingPressure; - this.chunkWaitTimeMillisHistogram = meterRegistry.registerLongHistogram( - CHUNK_WAIT_TIME_HISTOGRAM_NAME, - "Total time in millis spent waiting for next chunk of a bulk request", - "ms" - ); + var chunkWaitTimeMillisHistogram = meterRegistry.getLongHistogram(CHUNK_WAIT_TIME_HISTOGRAM_NAME); + if (chunkWaitTimeMillisHistogram == null) { + chunkWaitTimeMillisHistogram = meterRegistry.registerLongHistogram( + CHUNK_WAIT_TIME_HISTOGRAM_NAME, + "Total time in millis spent waiting for next chunk of a bulk request", + "ms" + ); + } + this.chunkWaitTimeMillisHistogram = chunkWaitTimeMillisHistogram; } public Handler newBulkRequest() { diff --git a/server/src/main/java/org/elasticsearch/rest/RestController.java b/server/src/main/java/org/elasticsearch/rest/RestController.java index 6fca7c2d60c6d..35f1ee0fcce58 100644 --- a/server/src/main/java/org/elasticsearch/rest/RestController.java +++ b/server/src/main/java/org/elasticsearch/rest/RestController.java @@ -87,6 +87,7 @@ public class RestController implements HttpServerTransport.Dispatcher { * https://fetch.spec.whatwg.org/#cors-safelisted-request-header */ static final Set SAFELISTED_MEDIA_TYPES = Set.of("application/x-www-form-urlencoded", "multipart/form-data", "text/plain"); + static final String ARROW_STREAM = "application/vnd.apache.arrow.stream"; static final String ELASTIC_PRODUCT_HTTP_HEADER = "X-elastic-product"; static final String ELASTIC_PRODUCT_HTTP_HEADER_VALUE = "Elasticsearch"; diff --git a/server/src/main/java/org/elasticsearch/rest/action/document/RestBulkAction.java b/server/src/main/java/org/elasticsearch/rest/action/document/RestBulkAction.java index e381b1c207072..98e9c693c5608 100644 --- 
a/server/src/main/java/org/elasticsearch/rest/action/document/RestBulkAction.java
+++ b/server/src/main/java/org/elasticsearch/rest/action/document/RestBulkAction.java
@@ -12,8 +12,10 @@
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.DocWriteRequest;
+import org.elasticsearch.action.bulk.AbstractBulkRequestParser;
 import org.elasticsearch.action.bulk.BulkRequest;
 import org.elasticsearch.action.bulk.BulkRequestParser;
+import org.elasticsearch.action.bulk.BulkResponse;
 import org.elasticsearch.action.bulk.BulkShardRequest;
 import org.elasticsearch.action.bulk.IncrementalBulkService;
 import org.elasticsearch.action.support.ActiveShardCount;
@@ -159,7 +161,7 @@ private static Exception parseFailureException(Exception e) {
         }
     }
 
-    static class ChunkHandler implements BaseRestHandler.RequestBodyChunkConsumer {
+    public static class ChunkHandler implements BaseRestHandler.RequestBodyChunkConsumer {
 
         private final RestRequest request;
@@ -175,24 +177,37 @@ static class ChunkHandler implements BaseRestHandler.RequestBodyChunkConsumer {
         private long requestNextChunkTime;
         private long totalChunkWaitTimeInNanos = 0L;
 
-        ChunkHandler(boolean allowExplicitIndex, RestRequest request, Supplier<IncrementalBulkService.Handler> handlerSupplier) {
+        public ChunkHandler(boolean allowExplicitIndex, RestRequest request, Supplier<IncrementalBulkService.Handler> handlerSupplier) {
+            this(
+                allowExplicitIndex,
+                request,
+                handlerSupplier,
+                new BulkRequestParser(true, RestUtils.getIncludeSourceOnError(request), request.getRestApiVersion())
+            );
+        }
+
+        public ChunkHandler(
+            boolean allowExplicitIndex,
+            RestRequest request,
+            Supplier<IncrementalBulkService.Handler> handlerSupplier,
+            AbstractBulkRequestParser requestParser
+        ) {
             this.request = request;
             this.handlerSupplier = handlerSupplier;
-            this.parser = new BulkRequestParser(true, RestUtils.getIncludeSourceOnError(request), request.getRestApiVersion())
-                .incrementalParser(
-                    request.param("index"),
-                    request.param("routing"),
-                    FetchSourceContext.parseFromRestRequest(request),
-                    request.param("pipeline"),
-                    request.paramAsBoolean(DocWriteRequest.REQUIRE_ALIAS, false),
-                    request.paramAsBoolean(DocWriteRequest.REQUIRE_DATA_STREAM, false),
-                    request.paramAsBoolean("list_executed_pipelines", false),
-                    allowExplicitIndex,
-                    request.getXContentType(),
-                    (indexRequest, type) -> items.add(indexRequest),
-                    items::add,
-                    items::add
-                );
+            this.parser = requestParser.incrementalParser(
+                request.param("index"),
+                request.param("routing"),
+                FetchSourceContext.parseFromRestRequest(request),
+                request.param("pipeline"),
+                request.paramAsBoolean(DocWriteRequest.REQUIRE_ALIAS, false),
+                request.paramAsBoolean(DocWriteRequest.REQUIRE_DATA_STREAM, false),
+                request.paramAsBoolean("list_executed_pipelines", false),
+                allowExplicitIndex,
+                request.getXContentType(),
+                (indexRequest, type) -> items.add(indexRequest),
+                items::add,
+                items::add
+            );
         }
 
         @Override
@@ -203,6 +218,10 @@ public void accept(RestChannel restChannel) {
             request.contentStream().next();
         }
 
+        protected ActionListener<BulkResponse> createResponseListener(RestChannel channel) {
+            return new RestRefCountedChunkedToXContentListener<>(channel);
+        }
+
         @Override
         public void handleChunk(RestChannel channel, ReleasableBytesReference chunk, boolean isLast) {
             assert handler != null;
@@ -254,7 +273,7 @@ public void handleChunk(RestChannel channel, ReleasableBytesReference chunk, boo
                 assert channel != null;
                 ArrayList<DocWriteRequest<?>> toPass = new ArrayList<>(items);
                 items.clear();
-                handler.lastItems(toPass, () -> Releasables.close(releasables), new
RestRefCountedChunkedToXContentListener<>(channel)); + handler.lastItems(toPass, () -> Releasables.close(releasables), createResponseListener(channel)); } handler.updateWaitForChunkMetrics(TimeUnit.NANOSECONDS.toMillis(totalChunkWaitTimeInNanos)); totalChunkWaitTimeInNanos = 0L; @@ -282,7 +301,7 @@ public void streamClose() { private void shortCircuit() { shortCircuited = true; - Releasables.close(handler); + Releasables.close(parser, handler); Releasables.close(unParsedChunks); unParsedChunks.clear(); } diff --git a/server/src/main/resources/org/elasticsearch/bootstrap/security.policy b/server/src/main/resources/org/elasticsearch/bootstrap/security.policy index 55abdc84fc8fb..17c682319c98f 100644 --- a/server/src/main/resources/org/elasticsearch/bootstrap/security.policy +++ b/server/src/main/resources/org/elasticsearch/bootstrap/security.policy @@ -73,6 +73,7 @@ grant codeBase "${codebase.elasticsearch-simdvec}" { permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; }; + //// Everything else: grant { diff --git a/x-pack/plugin/esql/arrow/build.gradle b/x-pack/plugin/esql/arrow/build.gradle index d6fa48982d029..9f96e26cfbd98 100644 --- a/x-pack/plugin/esql/arrow/build.gradle +++ b/x-pack/plugin/esql/arrow/build.gradle @@ -12,40 +12,7 @@ dependencies { compileOnly project(':x-pack:plugin:esql:compute') compileOnly project(':x-pack:plugin:esql-core') compileOnly project(':x-pack:plugin:mapper-version') - implementation('org.apache.arrow:arrow-vector:18.3.0') - implementation('org.apache.arrow:arrow-format:18.3.0') - implementation('org.apache.arrow:arrow-memory-core:18.3.0') - implementation('org.checkerframework:checker-qual:3.42.0') - implementation('com.google.flatbuffers:flatbuffers-java:23.5.26') - // Needed for the json arrow serialization, and loaded even if we don't use it. - implementation("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}") - implementation("com.fasterxml.jackson.core:jackson-core:${versions.jackson}") - implementation("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}") - implementation("org.slf4j:slf4j-api:${versions.slf4j}") - runtimeOnly "org.slf4j:slf4j-nop:${versions.slf4j}" + implementation(project(":libs:arrow")) testImplementation project(':test:framework') - testImplementation('org.apache.arrow:arrow-memory-unsafe:18.3.0') - testImplementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${versions.jackson}") -} - -tasks.named("dependencyLicenses").configure { - mapping from: /jackson-.*/, to: 'jackson' - mapping from: /arrow-.*/, to: 'arrow' - mapping from: /slf4j-.*/, to: 'slf4j' -} - -tasks.named("thirdPartyAudit").configure { - ignoreViolations( - // uses sun.misc.Unsafe. Only used in tests. - 'org.apache.arrow.memory.util.MemoryUtil', - 'org.apache.arrow.memory.util.MemoryUtil$1', - ) - ignoreMissingClasses( - 'org.apache.commons.codec.binary.Hex' - ) -} - -tasks.named("test").configure { - jvmArgs('--add-opens=java.base/java.nio=ALL-UNNAMED') } diff --git a/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/AllocationManagerShim.java b/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/AllocationManagerShim.java deleted file mode 100644 index b52d1053ff595..0000000000000 --- a/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/AllocationManagerShim.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. 
Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - -package org.elasticsearch.xpack.esql.arrow; - -import org.apache.arrow.memory.AllocationManager; -import org.apache.arrow.memory.ArrowBuf; -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.DefaultAllocationManagerOption; -import org.elasticsearch.core.SuppressForbidden; -import org.elasticsearch.logging.LogManager; -import org.elasticsearch.logging.Logger; - -import java.lang.reflect.Field; -import java.security.AccessController; -import java.security.PrivilegedAction; - -/** - * An Arrow memory allocation manager that always fails. - *

- * We don't actually use Arrow's memory manager as we stream dataframe buffers directly from ESQL blocks. - * But Arrow won't initialize properly unless it has one (and requires either the arrow-memory-netty or arrow-memory-unsafe libraries). - * It also does some fancy classpath scanning and calls to {@code setAccessible} which will be rejected by the security manager. - *

- * So we configure an allocation manager that will fail on any attempt to allocate memory. - * - * @see DefaultAllocationManagerOption - */ -public class AllocationManagerShim implements AllocationManager.Factory { - - private static final Logger logger = LogManager.getLogger(AllocationManagerShim.class); - - /** - * Initialize the Arrow memory allocation manager shim. - */ - @SuppressForbidden(reason = "Inject the default Arrow memory allocation manager") - public static void init() { - try { - Class.forName("org.elasticsearch.test.ESTestCase"); - logger.info("We're in tests, not disabling Arrow memory manager so we can use a real runtime for testing"); - } catch (ClassNotFoundException notfound) { - logger.debug("Disabling Arrow's allocation manager"); - AccessController.doPrivileged((PrivilegedAction) () -> { - try { - Field field = DefaultAllocationManagerOption.class.getDeclaredField("DEFAULT_ALLOCATION_MANAGER_FACTORY"); - field.setAccessible(true); - field.set(null, new AllocationManagerShim()); - } catch (Exception e) { - throw new AssertionError("Can't init Arrow", e); - } - return null; - }); - } - } - - @Override - public AllocationManager create(BufferAllocator accountingAllocator, long size) { - throw new UnsupportedOperationException("Arrow memory manager is disabled"); - } - - @Override - public ArrowBuf empty() { - throw new UnsupportedOperationException("Arrow memory manager is disabled"); - } -} diff --git a/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/ArrowResponse.java b/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/ArrowResponse.java index 208d3308d508b..0c02e0e698a7b 100644 --- a/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/ArrowResponse.java +++ b/x-pack/plugin/esql/arrow/src/main/java/org/elasticsearch/xpack/esql/arrow/ArrowResponse.java @@ -128,11 +128,6 @@ public void close() { * the schema header, the data buffers, and the trailer. 
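+     * <p>The static initializer that installed {@code AllocationManagerShim} is removed below, together
+     * with the shim itself, now that this module relies on the shared {@code :libs:arrow} setup.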
*/ protected abstract static class ResponseSegment { - static { - // Init the Arrow memory manager shim - AllocationManagerShim.init(); - } - protected final ArrowResponse response; ResponseSegment(ArrowResponse response) { diff --git a/x-pack/plugin/esql/qa/server/single-node/build.gradle b/x-pack/plugin/esql/qa/server/single-node/build.gradle index ce962ef4c7e74..f34c67bea97e7 100644 --- a/x-pack/plugin/esql/qa/server/single-node/build.gradle +++ b/x-pack/plugin/esql/qa/server/single-node/build.gradle @@ -13,21 +13,10 @@ dependencies { javaRestTestImplementation project(xpackModule('esql:qa:testFixtures')) javaRestTestImplementation project(xpackModule('esql:qa:server')) javaRestTestImplementation project(xpackModule('esql:tools')) + javaRestTestImplementation project(":libs:arrow") javaRestTestImplementation project(xpackModule('esql')) yamlRestTestImplementation project(xpackModule('esql:qa:server')) - javaRestTestImplementation('org.apache.arrow:arrow-vector:18.3.0') - javaRestTestImplementation('org.apache.arrow:arrow-format:18.3.0') - javaRestTestImplementation('org.apache.arrow:arrow-memory-core:18.3.0') - javaRestTestImplementation('org.checkerframework:checker-qual:3.42.0') - javaRestTestImplementation('com.google.flatbuffers:flatbuffers-java:23.5.26') - javaRestTestImplementation("com.fasterxml.jackson.core:jackson-annotations:${versions.jackson}") - javaRestTestImplementation("com.fasterxml.jackson.core:jackson-core:${versions.jackson}") - javaRestTestImplementation("com.fasterxml.jackson.core:jackson-databind:${versions.jackson}") - javaRestTestImplementation("org.slf4j:slf4j-api:${versions.slf4j}") - javaRestTestImplementation("org.slf4j:slf4j-nop:${versions.slf4j}") - javaRestTestImplementation('org.apache.arrow:arrow-memory-unsafe:18.3.0') - clusterPlugins project(':plugins:mapper-size') clusterPlugins project(':plugins:mapper-murmur3') clusterPlugins project(':x-pack:plugin:inference:qa:test-service-plugin') @@ -45,7 +34,6 @@ restResources { tasks.named('javaRestTest') { usesDefaultDistribution("to be triaged") maxParallelForks = 1 - jvmArgs('--add-opens=java.base/java.nio=ALL-UNNAMED') } tasks.named('yamlRestTest') { diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/sagemaker/schema/TaskAndApi.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/sagemaker/schema/TaskAndApi.java index fa258c3275283..87e336e1e8778 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/sagemaker/schema/TaskAndApi.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/sagemaker/schema/TaskAndApi.java @@ -1,3 +1,4 @@ + /* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License