diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 55addaa666c6c9..c223f892d4a541 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -714,10 +714,16 @@ if (ENABLE_CLANG_COVERAGE AND ENABLE_CLANG_COVERAGE STREQUAL ON AND COMPILER_CLA endif () if (MAKE_TEST) - add_compile_options(-fprofile-arcs -ftest-coverage -DGTEST_USE_OWN_TR1_TUPLE=0) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fprofile-arcs -ftest-coverage") - if (NOT OS_MACOSX) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lgcov") + add_compile_options(-DGTEST_USE_OWN_TR1_TUPLE=0) + # Only add GCC-style coverage when NOT using Clang coverage + # to avoid duplicate symbol errors (e.g., __gcov_fork, __gcov_reset) + # between libgcov.a and libclang_rt.profile-x86_64.a + if (NOT (ENABLE_CLANG_COVERAGE STREQUAL "ON" AND COMPILER_CLANG)) + add_compile_options(-fprofile-arcs -ftest-coverage) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fprofile-arcs -ftest-coverage") + if (NOT OS_MACOSX) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lgcov") + endif() endif() add_definitions(-DBE_TEST) if (ARCH_ARM) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 5f78c7d9294a30..0292b5b2578197 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,7 @@ #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" #include "util/cpu_info.h" +#include "util/string_util.h" namespace doris::config { #include "common/compile_check_avoid_begin.h" @@ -1082,6 +1084,20 @@ DEFINE_mInt32(segcompaction_num_threads, "5"); // enable java udf and jdbc scannode DEFINE_Bool(enable_java_support, "true"); +// enable python udf +DEFINE_Bool(enable_python_udf_support, "false"); +// python env mode, options: conda, venv +DEFINE_String(python_env_mode, ""); +// root path of conda runtime, python_env_mode should be conda +DEFINE_String(python_conda_root_path, ""); +// root path of venv runtime, python_env_mode should be venv +DEFINE_String(python_venv_root_path, "${DORIS_HOME}/lib/udf/python"); +// python interpreter paths used by venv, e.g. /usr/bin/python3.7:/usr/bin/python3.6 +DEFINE_String(python_venv_interpreter_paths, ""); +// max python processes in global shared pool, each version can have up to this many processes +// 0 means use CPU core count as default, otherwise use the specified value +DEFINE_mInt32(max_python_process_num, "0"); + // Set config randomly to check more issues in github workflow DEFINE_Bool(enable_fuzzy_mode, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index c9d9fe94ffbdca..29c6cd6cb64a10 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1131,6 +1131,19 @@ DECLARE_mInt32(segcompaction_num_threads); // enable java udf and jdbc scannode DECLARE_Bool(enable_java_support); +// enable python udf +DECLARE_Bool(enable_python_udf_support); +// python env mode, options: conda, venv +DECLARE_String(python_env_mode); +// root path of conda runtime, python_env_mode should be conda +DECLARE_String(python_conda_root_path); +// root path of venv runtime, python_env_mode should be venv +DECLARE_String(python_venv_root_path); +// python interpreter paths used by venv, e.g. 
/usr/bin/python3.7:/usr/bin/python3.6 +DECLARE_String(python_venv_interpreter_paths); +// max python processes in global shared pool, each version can have up to this many processes +DECLARE_mInt32(max_python_process_num); + // Set config randomly to check more issues in github workflow DECLARE_Bool(enable_fuzzy_mode); diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index cb4ad78984b973..e6a4787f2a2952 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -102,6 +102,7 @@ #include "service/backend_options.h" #include "service/backend_service.h" #include "service/point_query_executor.h" +#include "udf/python/python_server.h" #include "util/bfd_parser.h" #include "util/bit_util.h" #include "util/brpc_client_cache.h" @@ -926,6 +927,7 @@ void ExecEnv::destroy() { _s_tracking_memory = false; clear_storage_resource(); + PythonServerManager::instance().shutdown(); LOG(INFO) << "Doris exec envorinment is destoried."; } diff --git a/be/src/runtime/user_function_cache.cpp b/be/src/runtime/user_function_cache.cpp index ce6453fc609918..d54c2f473b49ce 100644 --- a/be/src/runtime/user_function_cache.cpp +++ b/be/src/runtime/user_function_cache.cpp @@ -20,6 +20,7 @@ // IWYU pragma: no_include #include // IWYU pragma: keep #include +#include #include #include #include @@ -41,6 +42,7 @@ #include "io/fs/local_file_system.h" #include "runtime/exec_env.h" #include "runtime/plugin/cloud_plugin_downloader.h" +#include "util/defer_op.h" #include "util/dynamic_util.h" #include "util/md5.h" #include "util/string_util.h" @@ -88,6 +90,9 @@ struct UserFunctionCacheEntry { // And this is used to indicate whether library is downloaded. bool is_downloaded = false; + // Indicates whether the zip file has been unzipped. + bool is_unziped = false; + // used to lookup a symbol void* lib_handle = nullptr; @@ -144,9 +149,12 @@ Status UserFunctionCache::_load_entry_from_lib(const std::string& dir, const std lib_type = LibType::SO; } else if (ends_with(file, ".jar")) { lib_type = LibType::JAR; + } else if (ends_with(file, ".zip") && _check_cache_is_python_udf(dir, file)) { + lib_type = LibType::PY_ZIP; } else { return Status::InternalError( - "unknown library file format. the file type is not end with xxx.jar or xxx.so : " + + "unknown library file format. the file type does not end with xxx.jar or xxx.so" + " or xxx.zip : " + file); } @@ -249,12 +257,117 @@ Status UserFunctionCache::_load_cache_entry(const std::string& url, RETURN_IF_ERROR(_download_lib(url, entry)); } + if (!entry->is_unziped && entry->type == LibType::PY_ZIP) { + RETURN_IF_ERROR(_unzip_lib(entry->lib_file)); + entry->lib_file = entry->lib_file.substr(0, entry->lib_file.size() - 4); + entry->is_unziped = true; + } + if (entry->type == LibType::SO) { RETURN_IF_ERROR(_load_cache_entry_internal(entry)); - } else if (entry->type != LibType::JAR) { + } else if (entry->type != LibType::JAR && entry->type != LibType::PY_ZIP) { return Status::InvalidArgument( - "Unsupported lib type! Make sure your lib type is one of 'so' and 'jar'!"); + "Unsupported lib type! 
Make sure your lib type is one of 'so', 'jar', or " "python 'zip'!"); + } + return Status::OK(); +} + +Status UserFunctionCache::_check_cache_is_python_udf(const std::string& dir, + const std::string& file) { + const std::string& full_path = dir + "/" + file; + RETURN_IF_ERROR(_unzip_lib(full_path)); + std::string unzip_dir = full_path.substr(0, full_path.size() - 4); + + bool has_python_file = false; + + auto scan_cb = [&has_python_file](const io::FileInfo& file) { + if (file.is_file && ends_with(file.file_name, ".py")) { + has_python_file = true; + return false; // Stop iteration once we find a Python file + } + return true; + }; + RETURN_IF_ERROR(io::global_local_filesystem()->iterate_directory(unzip_dir, scan_cb)); + if (!has_python_file) { + return Status::InternalError("No Python file found in the unzipped directory."); + } + return Status::OK(); +} + +Status UserFunctionCache::_unzip_lib(const std::string& zip_file) { + std::string unzip_dir = zip_file.substr(0, zip_file.size() - 4); + RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(unzip_dir)); + + unzFile zip_file_handle = unzOpen(zip_file.c_str()); + if (zip_file_handle == nullptr) { + return Status::InternalError("Failed to open zip file: " + zip_file); + } + + Defer defer([&] { unzClose(zip_file_handle); }); + + unz_global_info global_info; + if (unzGetGlobalInfo(zip_file_handle, &global_info) != UNZ_OK) { + return Status::InternalError("Failed to get global info from zip file: " + zip_file); + } + + for (uLong i = 0; i < global_info.number_entry; ++i) { + unz_file_info file_info; + char filename[256]; + if (unzGetCurrentFileInfo(zip_file_handle, &file_info, filename, sizeof(filename), nullptr, + 0, nullptr, 0) != UNZ_OK) { + return Status::InternalError("Failed to get file info from zip file: " + zip_file); + } + + if (std::string(filename).find("__MACOSX") != std::string::npos) { + if ((i + 1) < global_info.number_entry) { + if (unzGoToNextFile(zip_file_handle) != UNZ_OK) { + return Status::InternalError("Failed to go to next file in zip: " + zip_file); + } + } + continue; + } + + std::string full_filename = unzip_dir + "/" + filename; + if (full_filename.length() > PATH_MAX) { + return Status::InternalError( + fmt::format("File path {}... 
is too long, maximum path length is {}", + full_filename.substr(0, 50), PATH_MAX)); + } + + if (filename[strlen(filename) - 1] == '/') { + RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(full_filename)); + } else { + if (unzOpenCurrentFile(zip_file_handle) != UNZ_OK) { + return Status::InternalError("Failed to open file in zip: " + + std::string(filename)); + } + + FILE* out = fopen(full_filename.c_str(), "wb"); + if (out == nullptr) { + unzCloseCurrentFile(zip_file_handle); + return Status::InternalError("Failed to create file: " + full_filename); + } + char buffer[8192]; + int bytes_read; + while ((bytes_read = unzReadCurrentFile(zip_file_handle, buffer, sizeof(buffer))) > 0) { + fwrite(buffer, bytes_read, 1, out); + } + fclose(out); + unzCloseCurrentFile(zip_file_handle); + if (bytes_read < 0) { + return Status::InternalError("Failed to read file in zip: " + + std::string(filename)); + } + } + + if ((i + 1) < global_info.number_entry) { + if (unzGoToNextFile(zip_file_handle) != UNZ_OK) { + return Status::InternalError("Failed to go to next file in zip: " + zip_file); + } + } } + return Status::OK(); } @@ -348,6 +461,8 @@ std::string UserFunctionCache::_make_lib_file(int64_t function_id, const std::st ss << _lib_dir << '/' << shard << '/' << function_id << '.' << checksum; if (type == LibType::JAR) { ss << '.' << file_name; + } else if (type == LibType::PY_ZIP) { + ss << '.' << file_name; } else { ss << ".so"; } @@ -362,6 +477,14 @@ Status UserFunctionCache::get_jarpath(int64_t fid, const std::string& url, return Status::OK(); } +Status UserFunctionCache::get_pypath(int64_t fid, const std::string& url, + const std::string& checksum, std::string* libpath) { + std::shared_ptr entry = nullptr; + RETURN_IF_ERROR(_get_cache_entry(fid, url, checksum, entry, LibType::PY_ZIP)); + *libpath = entry->lib_file; + return Status::OK(); +} + std::vector UserFunctionCache::_split_string_by_checksum(const std::string& file) { std::vector result; diff --git a/be/src/runtime/user_function_cache.h b/be/src/runtime/user_function_cache.h index f5a04a5858338e..1596f4c2440d03 100644 --- a/be/src/runtime/user_function_cache.h +++ b/be/src/runtime/user_function_cache.h @@ -43,7 +43,7 @@ struct UserFunctionCacheEntry; // with id, this function library is valid. And when user wants to // change its implementation(URL), Doris will generate a new function // id. -enum class LibType { JAR, SO }; +enum class LibType { JAR, SO, PY_ZIP }; class UserFunctionCache { public: @@ -59,6 +59,9 @@ class UserFunctionCache { Status get_jarpath(int64_t fid, const std::string& url, const std::string& checksum, std::string* libpath); + Status get_pypath(int64_t fid, const std::string& url, const std::string& checksum, + std::string* libpath); + private: Status _load_cached_lib(); Status _load_entry_from_lib(const std::string& dir, const std::string& file); @@ -66,6 +69,14 @@ class UserFunctionCache { std::shared_ptr& output_entry, LibType type); Status _load_cache_entry(const std::string& url, std::shared_ptr entry); Status _download_lib(const std::string& url, std::shared_ptr entry); + /** + * Unzip the python udf user file. + */ + Status _unzip_lib(const std::string& file); + /** + * Check if the cache file is python udf. 
+ */ + Status _check_cache_is_python_udf(const std::string& dir, const std::string& file); Status _load_cache_entry_internal(std::shared_ptr entry); std::string _make_lib_file(int64_t function_id, const std::string& checksum, LibType type, diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 8c8c14f92150e6..56e142f6ea541e 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -24,6 +24,7 @@ // IWYU pragma: no_include #include // IWYU pragma: keep #include +#include #if !defined(__SANITIZE_ADDRESS__) && !defined(ADDRESS_SANITIZER) && !defined(LEAK_SANITIZER) && \ !defined(THREAD_SANITIZER) && !defined(USE_JEMALLOC) #include // IWYU pragma: keep @@ -76,9 +77,11 @@ #include "service/backend_service.h" #include "service/brpc_service.h" #include "service/http_service.h" +#include "udf/python/python_env.h" #include "util/debug_util.h" #include "util/disk_info.h" #include "util/mem_info.h" +#include "util/string_util.h" #include "util/thrift_rpc_helper.h" #include "util/thrift_server.h" #include "util/uid_util.h" @@ -499,6 +502,70 @@ int main(int argc, char** argv) { } } + + if (doris::config::enable_python_udf_support) { + if (std::string python_udf_root_path = + fmt::format("{}/lib/udf/python", std::getenv("DORIS_HOME")); + !std::filesystem::exists(python_udf_root_path)) { + std::filesystem::create_directories(python_udf_root_path); + } + + // Normalize and trim all Python-related config parameters + std::string python_env_mode = + std::string(doris::trim(doris::to_lower(doris::config::python_env_mode))); + std::string python_conda_root_path = + std::string(doris::trim(doris::config::python_conda_root_path)); + std::string python_venv_root_path = + std::string(doris::trim(doris::config::python_venv_root_path)); + std::string python_venv_interpreter_paths = + std::string(doris::trim(doris::config::python_venv_interpreter_paths)); + + if (python_env_mode == "conda") { + if (python_conda_root_path.empty()) { + LOG(ERROR) + << "Python conda root path is empty, please set `python_conda_root_path` " + "or set `enable_python_udf_support` to `false`"; + exit(1); + } + LOG(INFO) << "Initializing Doris backend python version manager. Python conda " + "root path: " + << python_conda_root_path; + status = doris::PythonVersionManager::instance().init(doris::PythonEnvType::CONDA, + python_conda_root_path, ""); + } else if (python_env_mode == "venv") { + if (python_venv_root_path.empty()) { + LOG(ERROR) + << "Python venv root path is empty, please set `python_venv_root_path` or " + "set `enable_python_udf_support` to `false`"; + exit(1); + } + if (python_venv_interpreter_paths.empty()) { + LOG(ERROR) + << "Python interpreter paths are empty, please set " + "`python_venv_interpreter_paths` or set `enable_python_udf_support` to " + "`false`"; + exit(1); + } + LOG(INFO) << "Initializing Doris backend python version manager. Python venv " + "root path: " + << python_venv_root_path + << ", python interpreter paths: " << python_venv_interpreter_paths; + status = doris::PythonVersionManager::instance().init(doris::PythonEnvType::VENV, + python_venv_root_path, + python_venv_interpreter_paths); + } else { + status = Status::InvalidArgument( + "Python env mode is invalid, should be `conda` or `venv`. 
If you don't want to " + "enable the Python UDF function, please set `enable_python_udf_support` to " + "`false`"); + } + + if (!status.ok()) { + LOG(ERROR) << "Failed to initialize python version manager: " << status; + exit(1); + } + LOG(INFO) << doris::PythonVersionManager::instance().to_string(); + } + // Doris own signal handler must be register after jvm is init. // Or our own sig-handler for SIGINT & SIGTERM will not be chained ... // https://www.oracle.com/java/technologies/javase/signals.html diff --git a/be/src/udf/CMakeLists.txt b/be/src/udf/CMakeLists.txt index 60ea86cf761043..34e6eecf85d923 100755 --- a/be/src/udf/CMakeLists.txt +++ b/be/src/udf/CMakeLists.txt @@ -20,7 +20,13 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/udf") # where to put generated binaries set(EXECUTABLE_OUTPUT_PATH "${BUILD_DIR}/src/udf") +set(UDF_SOURCES udf.cpp) + +file(GLOB PYTHON_UDF_SOURCES "python/*.cpp") + +list(APPEND UDF_SOURCES ${PYTHON_UDF_SOURCES}) + # Build this library twice. Once to be linked into the main Doris. This version # can have dependencies on our other libs. The second version is shipped as part # of the UDF sdk, which can't use other libs. -add_library(Udf STATIC udf.cpp) \ No newline at end of file +add_library(Udf STATIC ${UDF_SOURCES}) diff --git a/be/src/udf/python/python_client.cpp b/be/src/udf/python/python_client.cpp new file mode 100644 index 00000000000000..4713c91c047f4d --- /dev/null +++ b/be/src/udf/python/python_client.cpp @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
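+// Illustrative lifecycle of this class, inferred from the implementation
+// below (a sketch, not additional API surface): a concrete UDF/UDAF/UDTF
+// client drives one Arrow Flight DoExchange stream against a pooled Python
+// process:
+//
+//   init(func_meta, process)  // Connect(unix socket) + DoExchange(JSON-serialized meta)
+//   begin_stream(schema)      // writer->Begin(), exactly once per stream
+//   write_batch(input)        // push one Arrow RecordBatch to the Python side
+//   read_batch(&output)       // pull the corresponding result batch
+//   close()                   // writer->Close(); close errors are logged, not propagated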
+ +#include "udf/python/python_client.h" + +#include "arrow/flight/client.h" +#include "arrow/flight/server.h" +#include "common/compiler_util.h" +#include "common/config.h" +#include "common/status.h" +#include "udf/python/python_udf_meta.h" +#include "udf/python/python_udf_runtime.h" +#include "util/arrow/utils.h" + +namespace doris { + +Status PythonClient::init(const PythonUDFMeta& func_meta, ProcessPtr process) { + if (_inited) { + return Status::InternalError("PythonClient has already been initialized"); + } + + // Set operation name based on client type + switch (func_meta.client_type) { + case PythonClientType::UDF: + _operation_name = "Python UDF"; + break; + case PythonClientType::UDAF: + _operation_name = "Python UDAF"; + break; + case PythonClientType::UDTF: + _operation_name = "Python UDTF"; + break; + default: + return Status::InternalError("Invalid Python client type"); + } + + // Parse and connect to Python server location + arrow::flight::Location location; + RETURN_DORIS_STATUS_IF_RESULT_ERROR(location, + arrow::flight::Location::Parse(process->get_uri())); + RETURN_DORIS_STATUS_IF_RESULT_ERROR(_arrow_client, FlightClient::Connect(location)); + + // Serialize function metadata to JSON command + std::string command; + RETURN_IF_ERROR(func_meta.serialize_to_json(&command)); + + // Create Flight descriptor and establish bidirectional streaming + FlightDescriptor descriptor = FlightDescriptor::Command(command); + arrow::flight::FlightClient::DoExchangeResult exchange_res; + RETURN_DORIS_STATUS_IF_RESULT_ERROR(exchange_res, _arrow_client->DoExchange(descriptor)); + + _reader = std::move(exchange_res.reader); + _writer = std::move(exchange_res.writer); + _process = std::move(process); + _inited = true; + + return Status::OK(); +} + +Status PythonClient::close() { + if (!_inited || !_writer) { + return Status::OK(); + } + + auto writer_res = _writer->Close(); + if (!writer_res.ok()) { + // Don't propagate error from close, just log it + LOG(WARNING) << "Error closing Python client writer: " << writer_res.message(); + } + + _inited = false; + _begin = false; + _arrow_client.reset(); + _writer.reset(); + _reader.reset(); + _process.reset(); + + return Status::OK(); +} + +Status PythonClient::handle_error(arrow::Status status) { + DCHECK(!status.ok()); + + // Clean up resources + _writer.reset(); + _reader.reset(); + + // Extract and clean error message + std::string msg = status.message(); + LOG(ERROR) << _operation_name << " error: " << msg; + + // Remove Python traceback noise for cleaner error messages + size_t pos = msg.find("The above exception was the direct cause"); + if (pos != std::string::npos) { + msg = msg.substr(0, pos); + } + + return Status::RuntimeError(trim(msg)); +} + +Status PythonClient::begin_stream(const std::shared_ptr& schema) { + if (UNLIKELY(!_begin)) { + auto begin_res = _writer->Begin(schema); + if (!begin_res.ok()) { + return handle_error(begin_res); + } + _begin = true; + } + return Status::OK(); +} + +Status PythonClient::write_batch(const arrow::RecordBatch& input) { + auto write_res = _writer->WriteRecordBatch(input); + if (!write_res.ok()) { + return handle_error(write_res); + } + return Status::OK(); +} + +Status PythonClient::read_batch(std::shared_ptr* output) { + auto read_res = _reader->Next(); + if (!read_res.ok()) { + return handle_error(read_res.status()); + } + + arrow::flight::FlightStreamChunk chunk = std::move(*read_res); + if (!chunk.data) { + return Status::InternalError("Received null RecordBatch from {} server", _operation_name); + 
} + + *output = std::move(chunk.data); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/udf/python/python_client.h b/be/src/udf/python/python_client.h new file mode 100644 index 00000000000000..5cba7d893fcfc9 --- /dev/null +++ b/be/src/udf/python/python_client.h @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/flight/client.h" +#include "common/status.h" +#include "udf/python/python_udf_meta.h" +#include "udf/python/python_udf_runtime.h" +#include "util/arrow/utils.h" + +namespace doris { + +/** + * Base class for Python UDF/UDAF/UDTF clients + * + * Provides common functionality for communicating with Python server via Arrow Flight: + * - Connection management + * - Stream initialization + * - Error handling + * - Process lifecycle management + */ +class PythonClient { +public: + using FlightDescriptor = arrow::flight::FlightDescriptor; + using FlightClient = arrow::flight::FlightClient; + using FlightStreamWriter = arrow::flight::FlightStreamWriter; + using FlightStreamReader = arrow::flight::FlightStreamReader; + + PythonClient() = default; + virtual ~PythonClient() = default; + + /** + * Initialize connection to Python server + * @param func_meta Function metadata (contains client_type for operation name) + * @param process Python process handle + * @return Status + */ + Status init(const PythonUDFMeta& func_meta, ProcessPtr process); + + /** + * Close connection and cleanup resources + * @return Status + */ + Status close(); + + /** + * Handle Arrow Flight error + * @param status Arrow status + * @return Doris Status with formatted error message + */ + Status handle_error(arrow::Status status); + + /** + * Get process information for debugging + * @return Process string representation + */ + std::string print_process() const { return _process ? 
_process->to_string() : "null"; } + + /** + * Get the underlying Python process + * @return Process pointer + */ + ProcessPtr get_process() const { return _process; } + +protected: + /** + * Begin Flight stream with schema (called only once per stream) + * @param schema Input schema + * @return Status + */ + Status begin_stream(const std::shared_ptr& schema); + + /** + * Write RecordBatch to server + * @param input Input RecordBatch + * @return Status + */ + Status write_batch(const arrow::RecordBatch& input); + + /** + * Read RecordBatch from server + * @param output Output RecordBatch + * @return Status + */ + Status read_batch(std::shared_ptr* output); + + // Common state + bool _inited = false; + bool _begin = false; // Track if Begin() has been called + std::string _operation_name; // Operation name for error messages + std::unique_ptr _arrow_client; + std::unique_ptr _writer; + std::unique_ptr _reader; + ProcessPtr _process; + +private: + DISALLOW_COPY_AND_ASSIGN(PythonClient); +}; + +} // namespace doris diff --git a/be/src/udf/python/python_env.cpp b/be/src/udf/python/python_env.cpp new file mode 100644 index 00000000000000..cbee22ef621e98 --- /dev/null +++ b/be/src/udf/python/python_env.cpp @@ -0,0 +1,289 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "python_env.h" + +#include + +#include +#include +#include +#include + +#include "common/status.h" +#include "udf/python/python_server.h" +#include "util/string_util.h" + +namespace doris { + +namespace fs = std::filesystem; + +// extract python version by executing `python --version` and extract "3.9.16" from "Python 3.9.16" +// @param python_path: path to python executable, e.g. "/opt/miniconda3/envs/myenv/bin/python" +// @param version: extracted python version, e.g. 
"3.9.16" +static Status extract_python_version(const std::string& python_path, std::string* version) { + static std::regex python_version_re(R"(^Python (\d+\.\d+\.\d+))"); + + if (!fs::exists(python_path)) { + return Status::NotFound("Python executable not found: {}", python_path); + } + + std::string cmd = fmt::format("\"{}\" --version", python_path); + FILE* pipe = popen(cmd.c_str(), "r"); + if (!pipe) { + return Status::InternalError("Failed to run: {}", cmd); + } + + std::string result; + char buf[128]; + while (fgets(buf, sizeof(buf), pipe)) { + result += buf; + } + pclose(pipe); + + std::smatch match; + if (std::regex_search(result, match, python_version_re)) { + *version = match[1].str(); + return Status::OK(); + } + + return Status::InternalError("Failed to extract Python version from path: {}, result: {}", + python_path, result); +} + +PythonEnvironment::PythonEnvironment(const std::string& name, const PythonVersion& python_version) + : env_name(name), python_version(python_version) {} + +std::string PythonEnvironment::to_string() const { + return fmt::format( + "[env_name: {}, env_base_path: {}, python_base_path: {}, python_full_version: {}]", + env_name, python_version.base_path, python_version.executable_path, + python_version.full_version); +} + +bool PythonEnvironment::is_valid() const { + if (!python_version.is_valid()) return false; + + auto perms = fs::status(python_version.executable_path).permissions(); + if ((perms & fs::perms::owner_exec) == fs::perms::none) { + return false; + } + + std::string version; + if (!extract_python_version(python_version.executable_path, &version).ok()) { + LOG(WARNING) << "Failed to extract python version from path: " + << python_version.executable_path; + return false; + } + + return python_version.full_version == version; +} + +// Scan for environments under the /{conda_root_path}/envs directory from the conda root. +Status PythonEnvironment::scan_from_conda_root_path(const fs::path& conda_root_path, + std::vector* environments) { + DCHECK(!conda_root_path.empty() && environments != nullptr); + + fs::path envs_dir = conda_root_path / "envs"; + if (!fs::exists(envs_dir) || !fs::is_directory(envs_dir)) { + return Status::NotFound("Conda envs directory not found: {}", envs_dir.string()); + } + + for (const auto& entry : fs::directory_iterator(envs_dir)) { + if (!entry.is_directory()) continue; + + std::string env_name = entry.path().filename(); // e.g. "myenv" + std::string env_base_path = entry.path(); // e.g. "/opt/miniconda3/envs/myenv" + std::string python_path = + env_base_path + "/bin/python"; // e.g. "/{env_base_path}/bin/python" + std::string python_full_version; // e.g. 
"3.9.16" + RETURN_IF_ERROR(extract_python_version(python_path, &python_full_version)); + size_t pos = python_full_version.find_last_of('.'); + + if (UNLIKELY(pos == std::string::npos)) { + return Status::InvalidArgument("Invalid python version: {}", python_full_version); + } + + PythonVersion python_version(python_full_version, env_base_path, python_path); + PythonEnvironment conda_env(env_name, python_version); + + if (UNLIKELY(!conda_env.is_valid())) { + LOG(WARNING) << "Invalid conda environment: " << conda_env.to_string(); + continue; + } + + environments->push_back(std::move(conda_env)); + } + + if (environments->empty()) { + return Status::NotFound("No conda python environments found"); + } + + return Status::OK(); +} + +Status PythonEnvironment::scan_from_venv_root_path( + const fs::path& venv_root_path, const std::vector& interpreter_paths, + std::vector* environments) { + DCHECK(!venv_root_path.empty() && environments != nullptr); + + for (const auto& interpreter_path : interpreter_paths) { + if (!fs::exists(interpreter_path) || !fs::is_regular_file(interpreter_path)) { + return Status::NotFound("Interpreter path not found: {}", interpreter_path); + } + std::string python_full_version; + RETURN_IF_ERROR(extract_python_version(interpreter_path, &python_full_version)); + size_t pos = python_full_version.find_last_of('.'); + if (UNLIKELY(pos == std::string::npos)) { + return Status::InvalidArgument("Invalid python version: {}", python_full_version); + } + // Extract major.minor version (e.g., "3.12" from "3.12.0") + std::string python_major_minor_version = python_full_version.substr(0, pos); + + std::string env_name = fmt::format("python{}", python_full_version); // e.g. "python3.9.16" + std::string env_base_path = fmt::format("{}/{}", venv_root_path.string(), + env_name); // e.g. "/opt/venv/python3.9.16" + std::string python_path = + fmt::format("{}/bin/python", env_base_path); // e.g. 
"/{venv_base_path}/bin/python" + + if (!fs::exists(env_base_path) || !fs::exists(python_path)) { + fs::create_directories(env_base_path); + std::string create_venv_cmd = + fmt::format("{} -m venv {}", interpreter_path, env_base_path); + + if (system(create_venv_cmd.c_str()) != 0 || !fs::exists(python_path)) { + return Status::RuntimeError("Failed to create python virtual environment, cmd: {}", + create_venv_cmd); + } + } + + // Use major.minor version for site-packages path (e.g., "python3.12") + std::string python_dependency_path = fmt::format("{}/lib/python{}/site-packages", + env_base_path, python_major_minor_version); + + if (!fs::exists(python_dependency_path)) { + return Status::NotFound("Python dependency path not found: {}", python_dependency_path); + } + + PythonVersion python_version(python_full_version, env_base_path, python_path); + PythonEnvironment venv_env(env_name, python_version); + + if (UNLIKELY(!venv_env.is_valid())) { + LOG(WARNING) << "Invalid venv environment: " << venv_env.to_string(); + continue; + } + + environments->push_back(std::move(venv_env)); + } + + if (environments->empty()) { + return Status::NotFound("No venv python environments found"); + } + + return Status::OK(); +} + +Status PythonEnvScanner::get_versions(std::vector* versions) const { + DCHECK(versions != nullptr); + if (_envs.empty()) { + return Status::InternalError("not found available version"); + } + for (const auto& env : _envs) { + versions->push_back(env.python_version); + } + return Status::OK(); +} + +Status PythonEnvScanner::get_version(const std::string& runtime_version, + PythonVersion* version) const { + if (_envs.empty()) { + return Status::InternalError("not found available version"); + } + std::string_view runtime_version_view(runtime_version); + runtime_version_view = trim(runtime_version_view); + for (const auto& env : _envs) { + if (env.python_version.full_version == runtime_version_view) { + *version = env.python_version; + return Status::OK(); + } + } + return Status::NotFound("not found runtime version: {}", runtime_version); +} + +Status CondaEnvScanner::scan() { + RETURN_IF_ERROR(PythonEnvironment::scan_from_conda_root_path(_env_root_path, &_envs)); + return Status::OK(); +} + +std::string CondaEnvScanner::to_string() const { + std::stringstream ss; + ss << "Conda environments: "; + for (const auto& conda_env : _envs) { + ss << conda_env.to_string() << ", "; + } + return ss.str(); +} + +Status VenvEnvScanner::scan() { + RETURN_IF_ERROR(PythonEnvironment::scan_from_venv_root_path(_env_root_path, _interpreter_paths, + &_envs)); + return Status::OK(); +} + +std::string VenvEnvScanner::to_string() const { + std::stringstream ss; + ss << "Venv environments: "; + for (const auto& venv_env : _envs) { + ss << venv_env.to_string() << ", "; + } + return ss.str(); +} + +Status PythonVersionManager::init(PythonEnvType env_type, const fs::path& python_root_path, + const std::string& python_venv_interpreter_paths) { + switch (env_type) { + case PythonEnvType::CONDA: { + if (!fs::exists(python_root_path) || !fs::is_directory(python_root_path)) { + return Status::InvalidArgument("Invalid conda root path: {}", + python_root_path.string()); + } + _env_scanner = std::make_unique(python_root_path); + break; + } + case PythonEnvType::VENV: { + if (!fs::exists(python_root_path) || !fs::is_directory(python_root_path)) { + return Status::InvalidArgument("Invalid venv root path: {}", python_root_path.string()); + } + std::vector interpreter_paths = split(python_venv_interpreter_paths, ":"); + if 
(interpreter_paths.empty()) { + return Status::InvalidArgument("Invalid python interpreter paths: {}", + python_venv_interpreter_paths); + } + _env_scanner = std::make_unique(python_root_path, interpreter_paths); + break; + } + default: + return Status::NotSupported("Unsupported python runtime type: {}", + static_cast(env_type)); + } + std::vector versions; + RETURN_IF_ERROR(_env_scanner->scan()); + RETURN_IF_ERROR(_env_scanner->get_versions(&versions)); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/udf/python/python_env.h b/be/src/udf/python/python_env.h new file mode 100644 index 00000000000000..4d3a5acca60407 --- /dev/null +++ b/be/src/udf/python/python_env.h @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "common/status.h" + +namespace doris { + +namespace fs = std::filesystem; + +enum class PythonEnvType { CONDA, VENV }; + +struct PythonVersion { + std::string full_version; // e.g. "3.9.16" + std::string base_path; // e.g. "/root/anaconda3/envs/python3.9" + std::string executable_path; // e.g. "{base_path}/bin/python3" + + PythonVersion() = default; + + explicit PythonVersion(std::string full_version, std::string base_path, + std::string executable_path) + : full_version(std::move(full_version)), + base_path(std::move(base_path)), + executable_path(std::move(executable_path)) {} + + bool operator==(const PythonVersion& other) const { + return full_version == other.full_version && base_path == other.base_path && + executable_path == other.executable_path; + } + + const std::string& get_base_path() const { return base_path; } + + const std::string& get_executable_path() const { return executable_path; } + + bool is_valid() const { + return !full_version.empty() && !base_path.empty() && !executable_path.empty() && + fs::exists(base_path) && fs::exists(executable_path); + } + + std::string to_string() const { + return fmt::format("[full_version: {}, base_path: {}, executable_path: {}]", full_version, + base_path, executable_path); + } +}; + +struct PythonEnvironment { + std::string env_name; // e.g. 
"base" or "myenv" + PythonVersion python_version; + + PythonEnvironment(const std::string& name, const PythonVersion& python_version); + + std::string to_string() const; + + bool is_valid() const; + + static Status scan_from_conda_root_path(const fs::path& conda_root_path, + std::vector* environments); + + static Status scan_from_venv_root_path(const fs::path& venv_root_path, + const std::vector& interpreter_paths, + std::vector* environments); +}; + +class PythonEnvScanner { +public: + PythonEnvScanner(const fs::path& env_root_path) : _env_root_path(env_root_path) {} + + virtual ~PythonEnvScanner() = default; + + virtual Status scan() = 0; + + Status get_versions(std::vector* versions) const; + + Status get_version(const std::string& runtime_version, PythonVersion* version) const; + + std::string root_path() const { return _env_root_path.string(); } + + virtual PythonEnvType env_type() const = 0; + + virtual std::string to_string() const = 0; + +protected: + fs::path _env_root_path; + std::vector _envs; +}; + +class CondaEnvScanner : public PythonEnvScanner { +public: + CondaEnvScanner(const fs::path& python_root_path) : PythonEnvScanner(python_root_path) {} + + ~CondaEnvScanner() override = default; + + Status scan() override; + + std::string to_string() const override; + + PythonEnvType env_type() const override { return PythonEnvType::CONDA; } +}; + +class VenvEnvScanner : public PythonEnvScanner { +public: + VenvEnvScanner(const fs::path& python_root_path, + const std::vector& interpreter_paths) + : PythonEnvScanner(python_root_path), _interpreter_paths(interpreter_paths) {} + + ~VenvEnvScanner() override = default; + + Status scan() override; + + std::string to_string() const override; + + PythonEnvType env_type() const override { return PythonEnvType::VENV; } + +private: + std::vector _interpreter_paths; +}; + +class PythonVersionManager { +public: + static PythonVersionManager& instance() { + static PythonVersionManager instance; + return instance; + } + + Status init(PythonEnvType env_type, const fs::path& python_root_path, + const std::string& python_venv_interpreter_paths); + + Status get_version(const std::string& runtime_version, PythonVersion* version) const { + return _env_scanner->get_version(runtime_version, version); + } + + std::string to_string() const { return _env_scanner->to_string(); } + +private: + std::unique_ptr _env_scanner; +}; + +} // namespace doris + +namespace std { +template <> +struct hash { + size_t operator()(const doris::PythonVersion& v) const noexcept { + return hash {}(v.full_version); + } +}; +} // namespace std diff --git a/be/src/udf/python/python_server.cpp b/be/src/udf/python/python_server.cpp new file mode 100644 index 00000000000000..be597647ca5f76 --- /dev/null +++ b/be/src/udf/python/python_server.cpp @@ -0,0 +1,292 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "udf/python/python_server.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "common/config.h" +#include "udf/python/python_udaf_client.h" +#include "udf/python/python_udf_client.h" +#include "udf/python/python_udtf_client.h" +#include "util/cpu_info.h" + +namespace doris { + +template +Status PythonServerManager::get_client(const PythonUDFMeta& func_meta, const PythonVersion& version, + std::shared_ptr* client, + const std::shared_ptr& data_schema) { + // Ensure process pool is initialized for this version + RETURN_IF_ERROR(ensure_pool_initialized(version)); + + ProcessPtr process; + RETURN_IF_ERROR(get_process(version, &process)); + + if constexpr (std::is_same_v) { + RETURN_IF_ERROR(T::create(func_meta, std::move(process), data_schema, client)); + } else { + RETURN_IF_ERROR(T::create(func_meta, std::move(process), client)); + } + + return Status::OK(); +} + +Status PythonServerManager::ensure_pool_initialized(const PythonVersion& version) { + std::lock_guard lock(_pools_mutex); + + // Check if already initialized + if (_initialized_versions.count(version)) return Status::OK(); + + std::vector& pool = _process_pools[version]; + // 0 means use CPU core count as default, otherwise use the specified value + int max_pool_size = config::max_python_process_num > 0 ? config::max_python_process_num + : CpuInfo::num_cores(); + + LOG(INFO) << "Initializing Python process pool for version " << version.to_string() << " with " + << max_pool_size + << " processes (config::max_python_process_num=" << config::max_python_process_num + << ", CPU cores=" << CpuInfo::num_cores() << ")"; + + std::vector> futures; + std::vector temp_processes(max_pool_size); + + for (int i = 0; i < max_pool_size; i++) { + futures.push_back(std::async(std::launch::async, [this, &version, i, &temp_processes]() { + ProcessPtr process; + Status s = fork(version, &process); + if (s.ok()) { + temp_processes[i] = std::move(process); + } + return s; + })); + } + + int success_count = 0; + int failure_count = 0; + for (int i = 0; i < max_pool_size; i++) { + Status s = futures[i].get(); + if (s.ok() && temp_processes[i]) { + pool.push_back(std::move(temp_processes[i])); + success_count++; + } else { + failure_count++; + LOG(WARNING) << "Failed to create Python process " << (i + 1) << "/" << max_pool_size + << ": " << s.to_string(); + } + } + + if (pool.empty()) { + return Status::InternalError( + "Failed to initialize Python process pool: all {} process creation attempts failed", + max_pool_size); + } + + LOG(INFO) << "Python process pool initialized for version " << version.to_string() + << ": created " << success_count << " processes" + << (failure_count > 0 ? 
fmt::format(" ({} failed)", failure_count) : ""); + + _initialized_versions.insert(version); + _start_health_check_thread(); + + return Status::OK(); +} + +Status PythonServerManager::get_process(const PythonVersion& version, ProcessPtr* process) { + std::lock_guard lock(_pools_mutex); + std::vector& pool = _process_pools[version]; + + if (UNLIKELY(pool.empty())) { + return Status::InternalError("Python process pool is empty for version {}", + version.to_string()); + } + + // Find process with minimum load (use_count - 1 gives active client count) + auto min_iter = std::min_element( + pool.begin(), pool.end(), + [](const ProcessPtr& a, const ProcessPtr& b) { return a.use_count() < b.use_count(); }); + + // Return process with minimum load + *process = *min_iter; + return Status::OK(); +} + +Status PythonServerManager::fork(const PythonVersion& version, ProcessPtr* process) { + std::string python_executable_path = version.get_executable_path(); + std::string fight_server_path = get_fight_server_path(); + std::string base_unix_socket_path = get_base_unix_socket_path(); + std::vector args = {"-u", fight_server_path, base_unix_socket_path}; + boost::process::environment env = boost::this_process::environment(); + boost::process::ipstream child_output; + + try { + boost::process::child c( + python_executable_path, args, boost::process::std_out > child_output, + boost::process::env = env, + boost::process::on_exit([](int exit_code, const std::error_code& ec) { + if (ec) { + LOG(WARNING) << "Python UDF server exited with error: " << ec.message(); + } + })); + + // Wait for socket file to be created (indicates server is ready) + std::string expected_socket_path = get_unix_socket_file_path(c.id()); + bool started_successfully = false; + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + const auto timeout = std::chrono::milliseconds(5000); + + while (std::chrono::steady_clock::now() - start < timeout) { + struct stat buffer; + if (stat(expected_socket_path.c_str(), &buffer) == 0) { + started_successfully = true; + break; + } + + if (!c.running()) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + + if (!started_successfully) { + if (c.running()) { + c.terminate(); + c.wait(); + } + return Status::InternalError("Python server start failed: socket file not found at {}", + expected_socket_path); + } + + *process = std::make_shared(std::move(c), std::move(child_output)); + + } catch (const std::exception& e) { + return Status::InternalError("Failed to start Python UDF server: {}", e.what()); + } + + return Status::OK(); +} + +void PythonServerManager::_start_health_check_thread() { + if (_health_check_thread) return; + + LOG(INFO) << "Starting Python process health check thread (interval: 60 seconds)"; + + _health_check_thread = std::make_unique([this]() { + // Health check loop + while (!_shutdown_flag.load(std::memory_order_acquire)) { + // Wait for interval or shutdown signal + { + std::unique_lock lock(_health_check_mutex); + _health_check_cv.wait_for(lock, std::chrono::seconds(60), [this]() { + return _shutdown_flag.load(std::memory_order_acquire); + }); + } + + if (_shutdown_flag.load(std::memory_order_acquire)) break; + + std::lock_guard lock(_pools_mutex); + + int total_checked = 0; + int total_dead = 0; + int total_recreated = 0; + + for (auto& [version, pool] : _process_pools) { + for (size_t i = 0; i < pool.size(); ++i) { + auto& process = pool[i]; + if (!process) continue; + + total_checked++; + if (!process->is_alive()) { + 
total_dead++; + LOG(WARNING) + << "Detected dead Python process (pid=" << process->get_child_pid() + << ", version=" << version.to_string() << "), recreating..."; + + ProcessPtr new_process; + Status s = fork(version, &new_process); + if (s.ok()) { + pool[i] = std::move(new_process); + total_recreated++; + LOG(INFO) << "Successfully recreated Python process for version " + << version.to_string(); + } else { + LOG(ERROR) << "Failed to recreate Python process for version " + << version.to_string() << ": " << s.to_string(); + pool.erase(pool.begin() + i); + --i; + } + } + } + } + + if (total_dead > 0) { + LOG(INFO) << "Health check completed: checked=" << total_checked + << ", dead=" << total_dead << ", recreated=" << total_recreated; + } + } + + LOG(INFO) << "Python process health check thread exiting"; + }); +} + +void PythonServerManager::shutdown() { + // Signal health check thread to stop + _shutdown_flag.store(true, std::memory_order_release); + _health_check_cv.notify_one(); + + if (_health_check_thread && _health_check_thread->joinable()) { + _health_check_thread->join(); + _health_check_thread.reset(); + } + + // Shutdown all processes + std::lock_guard lock(_pools_mutex); + for (auto& [version, pool] : _process_pools) { + for (auto& process : pool) { + if (process) { + process->shutdown(); + } + } + } + _process_pools.clear(); +} + +// Explicit template instantiation for UDF, UDAF and UDTF clients +template Status PythonServerManager::get_client( + const PythonUDFMeta& func_meta, const PythonVersion& version, + std::shared_ptr* client, + const std::shared_ptr& data_schema); + +template Status PythonServerManager::get_client( + const PythonUDFMeta& func_meta, const PythonVersion& version, + std::shared_ptr* client, + const std::shared_ptr& data_schema); + +template Status PythonServerManager::get_client( + const PythonUDFMeta& func_meta, const PythonVersion& version, + std::shared_ptr* client, + const std::shared_ptr& data_schema); + +} // namespace doris \ No newline at end of file diff --git a/be/src/udf/python/python_server.h b/be/src/udf/python/python_server.h new file mode 100644 index 00000000000000..4db368179756e9 --- /dev/null +++ b/be/src/udf/python/python_server.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
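+// Usage sketch: how a caller might obtain a typed client from this manager.
+// The concrete client type name follows the includes in python_server.cpp
+// (python_udf_client.h and friends), so treat this as illustrative rather
+// than a guarantee of the exact API:
+//
+//   doris::PythonVersion version;
+//   RETURN_IF_ERROR(doris::PythonVersionManager::instance().get_version(
+//           func_meta.runtime_version, &version));
+//   std::shared_ptr<doris::PythonUDFClient> client;
+//   RETURN_IF_ERROR(doris::PythonServerManager::instance().get_client(
+//           func_meta, version, &client));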
+ +#pragma once + +#include +#include +#include +#include + +#include "common/status.h" +#include "udf/python/python_udf_meta.h" +#include "udf/python/python_udf_runtime.h" + +namespace doris { + +class PythonServerManager { +public: + PythonServerManager() = default; + + ~PythonServerManager() { shutdown(); } + + static PythonServerManager& instance() { + static PythonServerManager instance; + return instance; + } + + template + Status get_client(const PythonUDFMeta& func_meta, const PythonVersion& version, + std::shared_ptr* client, + const std::shared_ptr& data_schema = nullptr); + + Status fork(const PythonVersion& version, ProcessPtr* process); + + Status get_process(const PythonVersion& version, ProcessPtr* process); + + Status ensure_pool_initialized(const PythonVersion& version); + + void shutdown(); + +private: + /** + * Start health check background thread (called once by ensure_pool_initialized) + * Thread periodically checks process health and recreates dead processes + */ + void _start_health_check_thread(); + + std::unordered_map> _process_pools; + // Protects _process_pools access + std::mutex _pools_mutex; + // Track which versions have been initialized + std::unordered_set _initialized_versions; + // Health check background thread + std::unique_ptr _health_check_thread; + std::atomic _shutdown_flag {false}; + std::condition_variable _health_check_cv; + std::mutex _health_check_mutex; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/udf/python/python_server.py b/be/src/udf/python/python_server.py new file mode 100644 index 00000000000000..d668ee3c8452b0 --- /dev/null +++ b/be/src/udf/python/python_server.py @@ -0,0 +1,2296 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
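+# Startup contract, as inferred from PythonServerManager::fork() on the C++
+# side of this change: the backend launches this script as
+#
+#     <env>/bin/python -u python_server.py <base_unix_socket_path>
+#
+# and then polls until a pid-derived unix socket file appears before it
+# connects. Requests arrive as Arrow Flight DoExchange streams whose
+# FlightDescriptor command carries the JSON-serialized PythonUDFMeta.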
+ +import argparse +import base64 +import gc +import importlib +import inspect +import json +import sys +import os +import traceback +import logging +import time +import threading +import pickle +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Any, Callable, Optional, Tuple, get_origin, Dict +from datetime import datetime +from enum import Enum +from pathlib import Path +from logging.handlers import RotatingFileHandler + +import pandas as pd +import pyarrow as pa +from pyarrow import flight + + +class ServerState: + """Global server state container.""" + + unix_socket_path: str = "" + + PYTHON_SERVER_START_SUCCESS_MSG: str = "Start python server successfully" + + @staticmethod + def setup_logging(): + """Setup logging configuration for the UDF server with rotation.""" + + doris_home = os.getenv("DORIS_HOME") + if not doris_home: + # Fallback to current directory if DORIS_HOME is not set + doris_home = os.getcwd() + + log_dir = os.path.join(doris_home, "lib", "udf", "python") + os.makedirs(log_dir, exist_ok=True) + + # Use shared log file with process ID in each log line + log_file = os.path.join(log_dir, "python_udf_output.log") + max_bytes = 128 * 1024 * 1024 # 128MB + backup_count = 5 + + # Use RotatingFileHandler to automatically manage log file size + file_handler = RotatingFileHandler( + log_file, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8" + ) + + # Include process ID in log format + file_handler.setFormatter( + logging.Formatter( + "[%(asctime)s] [PID:%(process)d] [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s" + ) + ) + + logging.basicConfig( + level=logging.INFO, + handlers=[ + file_handler, + logging.StreamHandler(sys.stderr), # Also log to stderr for debugging + ], + ) + logging.info( + "Logging initialized. Log file: %s (max_size=%dMB, backups=%d)", + log_file, + max_bytes // (1024 * 1024), + backup_count, + ) + + @staticmethod + def extract_base_unix_socket_path(unix_socket_uri: str) -> str: + """ + Extract the file system path from a gRPC Unix socket URI. + + Args: + unix_socket_uri: URI in format 'grpc+unix:///path/to/socket' + + Returns: + The file system path without the protocol prefix + """ + if unix_socket_uri.startswith("grpc+unix://"): + unix_socket_uri = unix_socket_uri[len("grpc+unix://") :] + return unix_socket_uri + + @staticmethod + def remove_unix_socket(unix_socket_uri: str) -> None: + """ + Remove the Unix domain socket file if it exists. + + Args: + unix_socket_uri: URI of the Unix socket to remove + """ + if unix_socket_uri is None: + return + base_unix_socket_path = ServerState.extract_base_unix_socket_path( + unix_socket_uri + ) + if os.path.exists(base_unix_socket_path): + try: + os.unlink(base_unix_socket_path) + logging.info( + "Removed UNIX socket %s successfully", base_unix_socket_path + ) + except OSError as e: + logging.error( + "Failed to remove UNIX socket %s: %s", base_unix_socket_path, e + ) + else: + logging.warning("UNIX socket %s does not exist", base_unix_socket_path) + + @staticmethod + def monitor_parent_exit(): + """ + Monitor the parent process and exit gracefully if it dies. + This prevents orphaned UDF server processes. 
+ """ + parent_pid = os.getppid() + if parent_pid == 1: + # Parent process is init, no need to monitor + logging.info("Parent process is init (PID 1), skipping parent monitoring") + return + + logging.info("Started monitoring parent process (PID: %s)", parent_pid) + + while True: + try: + # os.kill(pid, 0) only checks whether the process exists + # without sending an actual signal + os.kill(parent_pid, 0) + except OSError: + # Parent process died + ServerState.remove_unix_socket(ServerState.unix_socket_path) + logging.error( + "Parent process %s died, exiting UDF server, unix socket path: %s", + parent_pid, + ServerState.unix_socket_path, + ) + os._exit(0) + # Check every 2 seconds + time.sleep(2) + + +ServerState.setup_logging() +monitor_thread = threading.Thread(target=ServerState.monitor_parent_exit, daemon=True) +monitor_thread.start() + + +@contextmanager +def temporary_sys_path(path: str): + """ + Context manager to temporarily add a path to sys.path. + Ensures the path is removed after use to avoid pollution. + + Args: + path: Directory path to add to sys.path + + Yields: + None + """ + path_added = False + if path not in sys.path: + sys.path.insert(0, path) + path_added = True + + try: + yield + finally: + if path_added and path in sys.path: + sys.path.remove(path) + + +class VectorType(Enum): + """Enum representing supported vector types.""" + + LIST = "list" + PANDAS_SERIES = "pandas.Series" + ARROW_ARRAY = "pyarrow.Array" + + @property + def python_type(self): + """ + Returns the Python type corresponding to this VectorType. + + Returns: + The Python type class (list, pd.Series, or pa.Array) + """ + mapping = { + VectorType.LIST: list, + VectorType.PANDAS_SERIES: pd.Series, + VectorType.ARROW_ARRAY: pa.Array, + } + return mapping[self] + + @staticmethod + def resolve_vector_type(param: inspect.Parameter): + """ + Resolves the param's type annotation to the corresponding VectorType enum. + Returns None if the type is unsupported or not a vector type. + """ + if ( + param is None + or param.annotation is None + or param.annotation is inspect.Parameter.empty + ): + return None + + annotation = param.annotation + origin = get_origin(annotation) + raw_type = origin if origin is not None else annotation + + if raw_type is list: + return VectorType.LIST + if raw_type is pd.Series: + return VectorType.PANDAS_SERIES + + return None + + +class ClientType(Enum): + """Enum representing Python client types.""" + + UDF = 0 + UDAF = 1 + UDTF = 2 + UNKNOWN = 3 + + def __str__(self) -> str: + """Return string representation of the client type.""" + return self.name + + +class PythonUDFMeta: + """Metadata container for a Python UDF.""" + + def __init__( + self, + name: str, + symbol: str, + location: str, + udf_load_type: int, + runtime_version: str, + always_nullable: bool, + inline_code: bytes, + input_types: pa.Schema, + output_type: pa.DataType, + client_type: int, + ) -> None: + """ + Initialize Python UDF metadata. 
+ + Args: + name: UDF function name + symbol: Symbol to load (function name or module.function) + location: File path or directory containing the UDF + udf_load_type: 0 for inline code, 1 for module + runtime_version: Python runtime version requirement + always_nullable: Whether the UDF can return NULL values + inline_code: Base64-encoded inline Python code (if applicable) + input_types: PyArrow schema for input parameters + output_type: PyArrow data type for return value + client_type: 0 for UDF, 1 for UDAF, 2 for UDTF + """ + self.name = name + self.symbol = symbol + self.location = location + self.udf_load_type = udf_load_type + self.runtime_version = runtime_version + self.always_nullable = always_nullable + self.inline_code = inline_code + self.input_types = input_types + self.output_type = output_type + self.client_type = ClientType(client_type) + + def is_udf(self) -> bool: + """Check if this is a UDF (User-Defined Function).""" + return self.client_type == ClientType.UDF + + def is_udaf(self) -> bool: + """Check if this is a UDAF (User-Defined Aggregate Function).""" + return self.client_type == ClientType.UDAF + + def is_udtf(self) -> bool: + """Check if this is a UDTF (User-Defined Table Function).""" + return self.client_type == ClientType.UDTF + + def __str__(self) -> str: + """Returns a string representation of the UDF metadata.""" + udf_load_type_str = "INLINE" if self.udf_load_type == 0 else "MODULE" + return ( + f"PythonUDFMeta(name={self.name}, symbol={self.symbol}, " + f"location={self.location}, udf_load_type={udf_load_type_str}, runtime_version={self.runtime_version}, " + f"always_nullable={self.always_nullable}, client_type={self.client_type.name}, " + f"input_types={self.input_types}, output_type={self.output_type})" + ) + + +class AdaptivePythonUDF: + """ + A wrapper around a UDF function that supports both scalar and vectorized execution modes. + The mode is determined by the type hints of the function parameters. + """ + + def __init__(self, python_udf_meta: PythonUDFMeta, func: Callable) -> None: + """ + Initialize the adaptive UDF wrapper. + + Args: + python_udf_meta: Metadata describing the UDF + func: The actual Python function to execute + """ + self.python_udf_meta = python_udf_meta + self._eval_func = func + + def __str__(self) -> str: + """Returns a string representation of the UDF wrapper.""" + input_type_strs = [str(t) for t in self.python_udf_meta.input_types.types] + output_type_str = str(self.python_udf_meta.output_type) + eval_func_str = f"{self.python_udf_meta.name}({', '.join(input_type_strs)}) -> {output_type_str}" + return f"AdaptivePythonUDF(python_udf_meta: {self.python_udf_meta}, eval_func: {eval_func_str})" + + def __call__(self, record_batch: pa.RecordBatch) -> pa.Array: + """ + Executes the UDF on the given record batch. Supports both scalar and vectorized modes. + + :param record_batch: Input data with N columns, each of length num_rows + :return: Output array of length num_rows + """ + if record_batch.num_rows == 0: + return pa.array([], type=self._get_output_type()) + + if self._should_use_vectorized(): + return self._vectorized_call(record_batch) + + return self._scalar_call(record_batch) + + @staticmethod + def _cast_arrow_to_vector(arrow_array: pa.Array, vec_type: VectorType): + """ + Convert a pa.Array to an instance of the specified VectorType. 
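+
+        For example, pa.array([1, 2, 3]) becomes the Python list [1, 2, 3]
+        for VectorType.LIST, or pd.Series([1, 2, 3]) for PANDAS_SERIES.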
+ """ + if vec_type == VectorType.LIST: + return arrow_array.to_pylist() + elif vec_type == VectorType.PANDAS_SERIES: + return arrow_array.to_pandas() + else: + raise ValueError(f"Unsupported vector type: {vec_type}") + + def _should_use_vectorized(self) -> bool: + """ + Determines whether to use vectorized mode based on parameter type annotations. + Returns True if any parameter is annotated as: + - list + - pd.Series + """ + try: + signature = inspect.signature(self._eval_func) + except ValueError: + # Cannot inspect built-in or C functions; default to scalar + return False + + for param in signature.parameters.values(): + if VectorType.resolve_vector_type(param): + return True + + return False + + def _convert_from_arrow_to_py(self, field): + if field is None: + return None + + if pa.types.is_map(field.type): + # pyarrow.lib.MapScalar's as_py() returns a list of tuples, convert to dict + list_of_tuples = field.as_py() + return dict(list_of_tuples) if list_of_tuples is not None else None + return field.as_py() + + def _scalar_call(self, record_batch: pa.RecordBatch) -> pa.Array: + """ + Applies the UDF in scalar mode: one row at a time. + + Args: + record_batch: Input data batch + + Returns: + Output array with results for each row + """ + columns = record_batch.columns + num_rows = record_batch.num_rows + result = [] + + for i in range(num_rows): + converted_args = [self._convert_from_arrow_to_py(col[i]) for col in columns] + + try: + res = self._eval_func(*converted_args) + # Check if result is None when always_nullable is False + if res is None and not self.python_udf_meta.always_nullable: + raise RuntimeError( + f"the result of row {i} is null, but the return type is not nullable, " + f"please check the always_nullable property in create function statement, " + f"it should be true" + ) + result.append(res) + except Exception as e: + logging.error( + "Error in scalar UDF execution at row %s: %s\nArgs: %s\nTraceback: %s", + i, + e, + converted_args, + traceback.format_exc(), + ) + # Return None for failed rows if always_nullable is True + if self.python_udf_meta.always_nullable: + result.append(None) + else: + raise + + return pa.array(result, type=self._get_output_type()) + + def _vectorized_call(self, record_batch: pa.RecordBatch) -> pa.Array: + """ + Applies the UDF in vectorized mode: processes entire columns at once. 
+ + Args: + record_batch: Input data batch + + Returns: + Output array with results + """ + column_args = record_batch.columns + logging.info("Vectorized call with %s columns", len(column_args)) + + sig = inspect.signature(self._eval_func) + params = list(sig.parameters.values()) + + if len(column_args) != len(params): + raise ValueError(f"UDF expects {len(params)} args, got {len(column_args)}") + + converted_args = [] + for param, arrow_col in zip(params, column_args): + vec_type = VectorType.resolve_vector_type(param) + + if vec_type is None: + # For scalar types (int, float, str, etc.), extract the first value + # instead of converting to list + pylist = arrow_col.to_pylist() + if len(pylist) > 0: + converted = pylist[0] + logging.info( + "Converted %s to scalar (first value): %s", + param.name, + type(converted).__name__, + ) + else: + converted = None + logging.info( + "Converted %s to scalar (None, empty column)", param.name + ) + else: + converted = self._cast_arrow_to_vector(arrow_col, vec_type) + logging.info("Converted %s: %s", param.name, vec_type) + + converted_args.append(converted) + + try: + result = self._eval_func(*converted_args) + except Exception as e: + logging.error( + "Error in vectorized UDF: %s\nTraceback: %s", e, traceback.format_exc() + ) + raise RuntimeError(f"Error in vectorized UDF: {e}") from e + + # Convert result to PyArrow Array + result_array = None + if isinstance(result, pd.Series): + result_array = pa.array(result, type=self._get_output_type()) + elif isinstance(result, list): + result_array = pa.array(result, type=self._get_output_type()) + else: + # Scalar result - broadcast to all rows + out_type = self._get_output_type() + logging.warning( + "UDF returned scalar value, broadcasting to %s rows", + record_batch.num_rows, + ) + result_array = pa.array([result] * record_batch.num_rows, type=out_type) + + # Check for None values when always_nullable is False + if not self.python_udf_meta.always_nullable: + null_count = result_array.null_count + if null_count > 0: + # Find the first null index for error message + for i, value in enumerate(result_array): + if value.is_valid is False: + raise RuntimeError( + f"the result of row {i} is null, but the return type is not nullable, " + f"please check the always_nullable property in create function statement, " + f"it should be true" + ) + + return result_array + + def _get_output_type(self) -> pa.DataType: + """ + Returns the expected output type for the UDF. + + Returns: + PyArrow DataType for the output + """ + return self.python_udf_meta.output_type or pa.null() + + +class UDFLoader(ABC): + """Abstract base class for loading UDFs from different sources.""" + + def __init__(self, python_udf_meta: PythonUDFMeta) -> None: + """ + Initialize the UDF loader. + + Args: + python_udf_meta: Metadata describing the UDF to load + """ + self.python_udf_meta = python_udf_meta + + @abstractmethod + def load(self) -> AdaptivePythonUDF: + """Load the UDF and return an AdaptivePythonUDF wrapper.""" + raise NotImplementedError("Subclasses must implement load().") + + +class InlineUDFLoader(UDFLoader): + """Loads a UDF defined directly in inline code.""" + + def load(self) -> AdaptivePythonUDF: + """ + Load and execute inline Python code to extract the UDF function. 
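+
+        For example, with symbol "add_one" the inline code could be the
+        following sketch; exec() publishes it into env, where it is then
+        looked up by name:
+
+            def add_one(x):
+                return None if x is None else x + 1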
+ + Returns: + AdaptivePythonUDF wrapper around the loaded function + + Raises: + RuntimeError: If code execution fails + ValueError: If the function is not found or not callable + """ + symbol = self.python_udf_meta.symbol + inline_code = self.python_udf_meta.inline_code.decode("utf-8") + env: dict[str, Any] = {} + + try: + # Execute the code in a clean environment + # pylint: disable=exec-used + # Note: exec() is necessary here for dynamic UDF loading from inline code + exec(inline_code, env) # nosec B102 + except Exception as e: + logging.error( + "Failed to exec inline code: %s\nTraceback: %s", + e, + traceback.format_exc(), + ) + raise RuntimeError(f"Failed to exec inline code: {e}") from e + + func = env.get(symbol) + if func is None: + available_funcs = [ + k for k, v in env.items() if callable(v) and not k.startswith("_") + ] + logging.error( + "Function '%s' not found in inline code. Available functions: %s", + symbol, + available_funcs, + ) + raise ValueError(f"Function '{symbol}' not found in inline code.") + + if not callable(func): + logging.error( + "'%s' exists but is not callable (type: %s)", symbol, type(func) + ) + raise ValueError(f"'{symbol}' is not a callable function.") + + return AdaptivePythonUDF(self.python_udf_meta, func) + + +class ModuleUDFLoader(UDFLoader): + """Loads a UDF from a Python module file (.py).""" + + # Class-level lock dictionary for thread-safe module imports + # Using RLock allows the same thread to acquire the lock multiple times + _import_locks: Dict[str, threading.RLock] = {} + _import_locks_lock = threading.Lock() + + @classmethod + def _get_import_lock(cls, module_name: str) -> threading.RLock: + """ + Get or create a reentrant lock for the given module name. + + Uses double-checked locking pattern for optimal performance: + - Fast path: return existing lock without acquiring global lock + - Slow path: create new lock under global lock protection + """ + # Fast path: check without lock (read-only, safe for most cases) + if module_name in cls._import_locks: + return cls._import_locks[module_name] + + # Slow path: create lock under protection + with cls._import_locks_lock: + # Double-check: another thread might have created it while we waited + if module_name not in cls._import_locks: + cls._import_locks[module_name] = threading.RLock() + return cls._import_locks[module_name] + + def load(self) -> AdaptivePythonUDF: + """ + Loads a UDF from a Python module file. + + Returns: + AdaptivePythonUDF instance wrapping the loaded function + + Raises: + ValueError: If module file not found + TypeError: If symbol is not callable + """ + symbol = self.python_udf_meta.symbol # [package_name.]module_name.function_name + location = self.python_udf_meta.location # /path/to/module_name[.py] + + if not os.path.exists(location): + raise ValueError(f"Module file not found: {location}") + + package_name, module_name, func_name = self.parse_symbol(symbol) + func = self.load_udf_from_module(location, package_name, module_name, func_name) + + if not callable(func): + raise TypeError( + f"'{symbol}' exists but is not callable (type: {type(func).__name__})" + ) + + return AdaptivePythonUDF(self.python_udf_meta, func) + + def parse_symbol(self, symbol: str): + """ + Parse symbol into (package_name, module_name, func_name) + + Supported formats: + - "module.func" → (None, module, func) + - "package.module.func" → (package, module, func) + """ + if not symbol or "." not in symbol: + raise ValueError( + f"Invalid symbol format: '{symbol}'. 
" + "Expected 'module.function' or 'package.module.function'" + ) + + parts = symbol.split(".") + if len(parts) == 2: + # module.func → Single-file mode + module_name, func_name = parts + package_name = None + if not module_name or not module_name.strip(): + raise ValueError(f"Module name is empty in symbol: '{symbol}'") + if not func_name or not func_name.strip(): + raise ValueError(f"Function name is empty in symbol: '{symbol}'") + elif len(parts) > 2: + package_name = parts[0] + module_name = ".".join(parts[1:-1]) + func_name = parts[-1] + if not package_name or not package_name.strip(): + raise ValueError(f"Package name is empty in symbol: '{symbol}'") + if not module_name or not module_name.strip(): + raise ValueError(f"Module name is empty in symbol: '{symbol}'") + if not func_name or not func_name.strip(): + raise ValueError(f"Function name is empty in symbol: '{symbol}'") + else: + raise ValueError(f"Invalid symbol format: '{symbol}'") + + return package_name, module_name, func_name + + def _get_or_import_module(self, location: str, full_module_name: str) -> Any: + """Get module from cache or import it (thread-safe).""" + # Use a per-module lock to prevent race conditions during import + import_lock = ModuleUDFLoader._get_import_lock(full_module_name) + + with import_lock: + # Double-check pattern: verify module is still not loaded after acquiring lock + if full_module_name in sys.modules: + cached_module = sys.modules[full_module_name] + # Verify the cached module is valid (has __file__ or __path__ attribute) + # This prevents using broken/incomplete modules from failed imports + if cached_module is not None and ( + hasattr(cached_module, "__file__") + or hasattr(cached_module, "__path__") + ): + return cached_module + else: + del sys.modules[full_module_name] + + # Import the module (only one thread will reach here per module) + with temporary_sys_path(location): + try: + module = importlib.import_module(full_module_name) + return module + except Exception as e: + # Clean up any partially-imported modules from sys.modules + # This prevents broken modules from being cached + if full_module_name in sys.modules: + del sys.modules[full_module_name] + raise + + def _extract_function( + self, module: Any, func_name: str, module_name: str + ) -> Callable: + """Extract and validate function from module.""" + func = getattr(module, func_name, None) + if func is None: + # Diagnostic info: log module details to understand why function is missing + module_attrs = dir(module) + module_file = getattr(module, "__file__", "N/A") + module_dict_keys = ( + list(module.__dict__.keys()) if hasattr(module, "__dict__") else [] + ) + + logging.error( + "Function '%s' not found in module '%s'. 
" + "Module file: %s, " + "Public attributes: %s, " + "All dict keys: %s", + func_name, + module_name, + module_file, + [a for a in module_attrs if not a.startswith("_")][:20], + module_dict_keys[:20], + ) + + # Check if module has import errors stored + if hasattr(module, "__import_error__"): + logging.error( + "Module '%s' has stored import error: %s", + module_name, + module.__import_error__, + ) + + raise AttributeError( + f"Function '{func_name}' not found in module '{module_name}'" + ) + if not callable(func): + raise TypeError(f"'{func_name}' is not callable") + return func + + def _load_single_file_udf( + self, location: str, module_name: str, func_name: str + ) -> Callable: + """Load UDF from a single Python file.""" + py_file = os.path.join(location, f"{module_name}.py") + if not os.path.isfile(py_file): + raise ImportError(f"Python file not found: {py_file}") + + try: + udf_module = self._get_or_import_module(location, module_name) + return self._extract_function(udf_module, func_name, module_name) + except (ImportError, AttributeError, TypeError) as e: + raise ImportError( + f"Failed to load single-file UDF '{module_name}.{func_name}': {e}" + ) from e + except Exception as e: + logging.error( + "Unexpected error loading UDF: %s\n%s", e, traceback.format_exc() + ) + raise + + def _ensure_package_init(self, package_path: str, package_name: str) -> None: + """Ensure __init__.py exists in the package directory.""" + init_path = os.path.join(package_path, "__init__.py") + if not os.path.exists(init_path): + logging.warning( + "__init__.py not found in package '%s', attempting to create it", + package_name, + ) + try: + with open(init_path, "w", encoding="utf-8") as f: + f.write( + "# Auto-generated by UDF loader to make directory a Python package\n" + ) + logging.info("Created __init__.py in %s", package_path) + except OSError as e: + raise ImportError( + f"Cannot create __init__.py in package '{package_name}': {e}" + ) from e + + def _build_full_module_name(self, package_name: str, module_name: str) -> str: + """Build the full module name for package mode.""" + if module_name == "__init__": + return package_name + return f"{package_name}.{module_name}" + + def _load_package_udf( + self, location: str, package_name: str, module_name: str, func_name: str + ) -> Callable: + """Load UDF from a Python package.""" + package_path = os.path.join(location, package_name) + if not os.path.isdir(package_path): + raise ImportError(f"Package '{package_name}' not found in '{location}'") + + self._ensure_package_init(package_path, package_name) + + try: + full_module_name = self._build_full_module_name(package_name, module_name) + udf_module = self._get_or_import_module(location, full_module_name) + return self._extract_function(udf_module, func_name, full_module_name) + except (ImportError, AttributeError, TypeError) as e: + raise ImportError( + f"Failed to load packaged UDF '{package_name}.{module_name}.{func_name}': {e}" + ) from e + except Exception as e: + logging.error( + "Unexpected error loading packaged UDF: %s\n%s", + e, + traceback.format_exc(), + ) + raise + + def load_udf_from_module( + self, + location: str, + package_name: Optional[str], + module_name: str, + func_name: str, + ) -> Callable: + """ + Load a UDF from a Python module, supporting both: + 1. Single-file mode: package_name=None, module_name="your_file" + 2. 
Package mode: package_name="your_pkg", module_name="submodule" or "__init__" + + Args: + location: + - In package mode: parent directory of the package + - In single-file mode: directory containing the .py file + package_name: + - If None or empty: treat as single-file mode + - Else: standard package name + module_name: + - In package mode: submodule name (e.g., "main") or "__init__" + - In single-file mode: filename without .py (e.g., "udf_script") + func_name: name of the function to load + + Returns: + The callable UDF function. + """ + if not os.path.isdir(location): + raise ValueError(f"Location is not a directory: {location}") + + if not package_name or package_name.strip() == "": + return self._load_single_file_udf(location, module_name, func_name) + else: + return self._load_package_udf( + location, package_name, module_name, func_name + ) + + +class UDFLoaderFactory: + """Factory to select the appropriate loader based on UDF location.""" + + @staticmethod + def get_loader(python_udf_meta: PythonUDFMeta) -> UDFLoader: + """ + Factory method to create the appropriate UDF loader based on metadata. + + Args: + python_udf_meta: UDF metadata containing load type and location + + Returns: + Appropriate UDFLoader instance (InlineUDFLoader or ModuleUDFLoader) + + Raises: + ValueError: If UDF load type or location is unsupported + """ + location = python_udf_meta.location + udf_load_type = python_udf_meta.udf_load_type # 0: inline, 1: module + + if udf_load_type == 0: + return InlineUDFLoader(python_udf_meta) + elif udf_load_type == 1: + if UDFLoaderFactory.check_module(location): + return ModuleUDFLoader(python_udf_meta) + else: + raise ValueError(f"Unsupported UDF location: {location}") + else: + raise ValueError(f"Unsupported UDF load type: {udf_load_type}") + + @staticmethod + def check_module(location: str) -> bool: + """ + Checks if a location is a valid Python module or package. + + A valid module is either: + - A .py file, or + - A directory containing __init__.py (i.e., a package). + + Raises: + ValueError: If the location does not exist or contains no Python module. + + Returns: + True if valid. + """ + if not os.path.exists(location): + raise ValueError(f"Module not found: {location}") + + if os.path.isfile(location): + if location.endswith(".py"): + return True + else: + raise ValueError(f"File is not a Python module (.py): {location}") + + if os.path.isdir(location): + if UDFLoaderFactory.has_python_file_recursive(location): + return True + else: + raise ValueError( + f"Directory contains no Python (.py) files: {location}" + ) + + raise ValueError(f"Invalid module location (not file or directory): {location}") + + @staticmethod + def has_python_file_recursive(location: str) -> bool: + """ + Recursively checks if a directory contains any Python (.py) files. + + Args: + location: Directory path to search + + Returns: + True if at least one .py file is found, False otherwise + """ + path = Path(location) + if not path.is_dir(): + return False + return any(path.rglob("*.py")) + + +class UDAFClassLoader: + """ + Utility class for loading UDAF classes from various sources. + + This class is responsible for loading UDAF classes from: + - Inline code (embedded in SQL) + - Module files (imported from filesystem) + """ + + @staticmethod + def load_udaf_class(python_udf_meta: PythonUDFMeta) -> type: + """ + Load the UDAF class from metadata. 
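+
+        A minimal class satisfying the contract enforced by
+        validate_udaf_class (an illustrative sketch, not shipped code):
+
+            class SumUDAF:
+                def __init__(self):
+                    self._total = 0
+
+                @property
+                def aggregate_state(self):
+                    return self._total  # must be picklable (see serialize)
+
+                def accumulate(self, value):
+                    if value is not None:
+                        self._total += value
+
+                def merge(self, other_state):
+                    self._total += other_state
+
+                def finish(self):
+                    return self._total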
+ + Args: + python_udf_meta: UDAF metadata + + Returns: + The UDAF class + + Raises: + RuntimeError: If inline code execution fails + ValueError: If class is not found or invalid + """ + loader = UDFLoaderFactory.get_loader(python_udf_meta) + + # For UDAF, we need the class, not an instance + if isinstance(loader, InlineUDFLoader): + return UDAFClassLoader.load_from_inline(python_udf_meta) + elif isinstance(loader, ModuleUDFLoader): + return UDAFClassLoader.load_from_module(python_udf_meta, loader) + else: + raise ValueError(f"Unsupported loader type: {type(loader)}") + + @staticmethod + def load_from_inline(python_udf_meta: PythonUDFMeta) -> type: + """ + Load UDAF class from inline code. + + Args: + python_udf_meta: UDAF metadata with inline code + + Returns: + The UDAF class + """ + symbol = python_udf_meta.symbol + inline_code = python_udf_meta.inline_code.decode("utf-8") + env: dict[str, Any] = {} + + try: + exec(inline_code, env) # nosec B102 + except Exception as e: + raise RuntimeError(f"Failed to exec inline code: {e}") from e + + udaf_class = env.get(symbol) + if udaf_class is None: + raise ValueError(f"UDAF class '{symbol}' not found in inline code") + + if not inspect.isclass(udaf_class): + raise ValueError(f"'{symbol}' is not a class (type: {type(udaf_class)})") + + UDAFClassLoader.validate_udaf_class(udaf_class) + return udaf_class + + @staticmethod + def load_from_module( + python_udf_meta: PythonUDFMeta, loader: ModuleUDFLoader + ) -> type: + """ + Load UDAF class from module file. + + Args: + python_udf_meta: UDAF metadata with module location + loader: Module loader instance + + Returns: + The UDAF class + """ + symbol = python_udf_meta.symbol + location = python_udf_meta.location + + package_name, module_name, class_name = loader.parse_symbol(symbol) + udaf_class = loader.load_udf_from_module( + location, package_name, module_name, class_name + ) + + if not inspect.isclass(udaf_class): + raise ValueError(f"'{symbol}' is not a class (type: {type(udaf_class)})") + + UDAFClassLoader.validate_udaf_class(udaf_class) + return udaf_class + + @staticmethod + def validate_udaf_class(udaf_class: type): + """ + Validate that the UDAF class implements required methods. + + Args: + udaf_class: The class to validate + + Raises: + ValueError: If class doesn't implement required methods or properties + """ + required_methods = ["__init__", "accumulate", "merge", "finish"] + for method in required_methods: + if not hasattr(udaf_class, method): + raise ValueError( + f"UDAF class must implement '{method}' method. " + f"Missing in {udaf_class.__name__}" + ) + + # Check for aggregate_state property + if not hasattr(udaf_class, "aggregate_state"): + raise ValueError( + f"UDAF class must have 'aggregate_state' property. " + f"Missing in {udaf_class.__name__}" + ) + + # Verify it's actually a property + try: + attr = inspect.getattr_static(udaf_class, "aggregate_state") + if not isinstance(attr, property): + raise ValueError( + f"'aggregate_state' must be a @property in {udaf_class.__name__}" + ) + except AttributeError: + raise ValueError( + f"UDAF class must have 'aggregate_state' property. " + f"Missing in {udaf_class.__name__}" + ) + + +class UDAFStateManager: + """ + Manages UDAF aggregate states for Python UDAF execution. 
+ + This class maintains a mapping from place_id to UDAF instances, + following the Snowflake UDAF pattern: + - __init__(): Initialize state + - aggregate_state: Property returning serializable state + - accumulate(*args): Add input values + - merge(other_state): Merge two states + - finish(): Return final result + """ + + def __init__(self): + """Initialize the state manager.""" + self.states: Dict[int, Any] = {} # place_id -> UDAF instance + self.udaf_class = None # UDAF class to instantiate + self._destroy_counter = 0 # Track number of destroys since last GC + self._gc_threshold = 100 # Trigger GC every N destroys + + def set_udaf_class(self, udaf_class: type): + """ + Set the UDAF class to use for creating instances. + + Args: + udaf_class: The UDAF class + + Note: + Validation is performed by UDAFClassLoader before calling this method. + """ + self.udaf_class = udaf_class + + def create_state(self, place_id: int) -> None: + """ + Create a new UDAF state for the given place_id. + + Args: + place_id: Unique identifier for this aggregate state (globally unique) + + Note: + This method assumes C++ layer guarantees no concurrent access to the same place_id. + """ + try: + self.states[place_id] = self.udaf_class() + except Exception as e: + logging.error( + "Failed to create UDAF state for place_id=%s: %s\nUDAF class: %s\nTraceback: %s", + place_id, + e, + self.udaf_class.__name__ if self.udaf_class else "None", + traceback.format_exc(), + ) + raise RuntimeError(f"Failed to create UDAF state: {e}") from e + + def get_state(self, place_id: int) -> Any: + """ + Get the UDAF state for the given place_id. + + Args: + place_id: Unique identifier for the aggregate state + + Returns: + The UDAF instance + """ + return self.states[place_id] + + def accumulate(self, place_id: int, *args) -> None: + """ + Accumulate input values into the aggregate state. + + Args: + place_id: Unique identifier for the aggregate state + *args: Input values to accumulate + """ + state = self.states[place_id] + try: + state.accumulate(*args) + except Exception as e: + logging.error( + "Error in accumulate for place_id %s: %s", + place_id, + e, + ) + raise RuntimeError(f"Error in accumulate: {e}") from e + + def serialize(self, place_id: int) -> bytes: + """ + Serialize the aggregate state to bytes. + + Args: + place_id: Unique identifier for the aggregate state + + Returns: + Serialized state as bytes (using pickle) + """ + state = self.states[place_id] + try: + aggregate_state = state.aggregate_state + serialized = pickle.dumps(aggregate_state) + return serialized + except Exception as e: + logging.error( + "Error serializing state for place_id %s: %s", + place_id, + e, + ) + raise RuntimeError(f"Error serializing state: {e}") from e + + def merge(self, place_id: int, other_state_bytes: bytes) -> None: + """ + Merge another serialized state into this state. + + Args: + place_id: Unique identifier for the aggregate state + other_state_bytes: Serialized state to merge (pickle bytes) + """ + try: + other_state = pickle.loads(other_state_bytes) + except Exception as e: + logging.error("Error deserializing state bytes: %s", e) + raise RuntimeError(f"Error deserializing state: {e}") from e + + state = self.states[place_id] + + try: + state.merge(other_state) + except Exception as e: + logging.error( + "Error in merge for place_id %s: %s", + place_id, + e, + ) + raise RuntimeError(f"Error in merge: {e}") from e + + def finalize(self, place_id: int) -> Any: + """ + Get the final result from the aggregate state. 
+ + Args: + place_id: Unique identifier for the aggregate state + + Returns: + Final aggregation result + """ + state = self.states[place_id] + try: + result = state.finish() + return result + except Exception as e: + logging.error( + "Error finalizing state for place_id %s: %s", + place_id, + e, + ) + raise RuntimeError(f"Error finalizing state: {e}") from e + + def reset(self, place_id: int) -> None: + """ + Reset the aggregate state (for window functions). + + Args: + place_id: Unique identifier for the aggregate state + + Raises: + RuntimeError: If state does not exist for this place_id or UDAF class not set + """ + try: + self.states[place_id] = self.udaf_class() + except Exception as e: + logging.error( + "Error resetting state for place_id %s: %s", + place_id, + e, + ) + raise RuntimeError(f"Error resetting state: {e}") from e + + def destroy(self, place_id: int) -> None: + """ + Destroy the aggregate state and free resources. + + Args: + place_id: Unique identifier for the aggregate state + """ + if place_id not in self.states: + return + + del self.states[place_id] + + self._destroy_counter += 1 + # Trigger GC periodically based on destroy count + if self._destroy_counter >= self._gc_threshold: + remaining = len(self.states) + + # Clear all states - force full cleanup + if remaining == 0: + self.states.clear() + gc.collect() + logging.debug( + "[UDAF GC] Full cleanup: all states destroyed, GC triggered" + ) + # Many states destroyed recently - trigger GC + elif self._destroy_counter >= self._gc_threshold: + gc.collect() + logging.debug( + "[UDAF GC] Periodic GC triggered after %d destroys, %d states remaining", + self._destroy_counter, + remaining, + ) + + self._destroy_counter = 0 + + +class FlightServer(flight.FlightServerBase): + """Arrow Flight server for executing Python UDFs, UDAFs, and UDTFs.""" + + def __init__(self, location: str): + """ + Initialize the Flight server. + + Args: + location: Unix socket path for the server + """ + super().__init__(location) + # Use a dictionary to maintain separate state managers for each UDAF function + # Key: function signature (name + input_types), Value: UDAFStateManager instance + self.udaf_state_managers: Dict[str, UDAFStateManager] = {} + self.udaf_managers_lock = threading.Lock() + + def _get_udaf_state_manager( + self, python_udaf_meta: PythonUDFMeta + ) -> UDAFStateManager: + """ + Get or create a state manager for the given UDAF function. + Each UDAF function gets its own independent state manager. + + Args: + python_udaf_meta: Metadata for the UDAF function + + Returns: + UDAFStateManager instance for this specific UDAF + """ + # Create a unique key based on function name and argument types + type_names = [str(field.type) for field in python_udaf_meta.input_types] + func_key = f"{python_udaf_meta.name}({','.join(type_names)})" + + with self.udaf_managers_lock: + if func_key not in self.udaf_state_managers: + manager = UDAFStateManager() + # Load and set the UDAF class for this manager using UDAFClassLoader + udaf_class = UDAFClassLoader.load_udaf_class(python_udaf_meta) + manager.set_udaf_class(udaf_class) + self.udaf_state_managers[func_key] = manager + + return self.udaf_state_managers[func_key] + + @staticmethod + def parse_python_udf_meta( + descriptor: flight.FlightDescriptor, + ) -> Optional[PythonUDFMeta]: + """ + Parses UDF/UDAF/UDTF metadata from a command descriptor. 
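+
+        The command payload is JSON; an illustrative example (base64 fields
+        abbreviated, values not from a real request):
+
+            {
+                "name": "add_one",
+                "symbol": "my_module.add_one",
+                "location": "/path/to/udf_dir",
+                "udf_load_type": 1,
+                "runtime_version": "3.8",
+                "always_nullable": true,
+                "client_type": 0,
+                "inline_code": "<base64 source>",
+                "input_types": "<base64 Arrow IPC schema>",
+                "return_type": "<base64 Arrow IPC schema>"
+            }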
+ + Returns: + PythonUDFMeta object containing the function metadata + """ + + if descriptor.descriptor_type != flight.DescriptorType.CMD: + logging.error("Invalid descriptor type: %s", descriptor.descriptor_type) + return None + + cmd_json = json.loads(descriptor.command) + name = cmd_json["name"] + symbol = cmd_json["symbol"] + location = cmd_json["location"] + udf_load_type = cmd_json["udf_load_type"] + runtime_version = cmd_json["runtime_version"] + always_nullable = cmd_json["always_nullable"] + # client_type: 0: UDF, 1: UDAF, 2: UDTF + client_type = cmd_json["client_type"] + + inline_code = base64.b64decode(cmd_json["inline_code"]) + input_binary = base64.b64decode(cmd_json["input_types"]) + output_binary = base64.b64decode(cmd_json["return_type"]) + + input_schema = pa.ipc.read_schema(pa.BufferReader(input_binary)) + output_schema = pa.ipc.read_schema(pa.BufferReader(output_binary)) + + if len(output_schema) != 1: + logging.error( + "Output schema must have exactly one field: %s", output_schema + ) + return None + + output_type = output_schema.field(0).type + + python_udf_meta = PythonUDFMeta( + name=name, + symbol=symbol, + location=location, + udf_load_type=udf_load_type, + runtime_version=runtime_version, + always_nullable=always_nullable, + inline_code=inline_code, + input_types=input_schema, + output_type=output_type, + client_type=client_type, + ) + + return python_udf_meta + + @staticmethod + def check_schema( + record_batch: pa.RecordBatch, expected_schema: pa.Schema + ) -> Tuple[bool, str]: + """ + Validates that the input RecordBatch schema matches the expected schema. + Checks that field count and types match, but field names can differ. + + :return: (result, error_message) + """ + actual = record_batch.schema + expected = expected_schema + + # Check field count + if len(actual) != len(expected): + return ( + False, + f"Schema length mismatch, got {len(actual)} fields, expected {len(expected)} fields", + ) + + # Check each field type (ignore field names) + for i, (actual_field, expected_field) in enumerate(zip(actual, expected)): + if not actual_field.type.equals(expected_field.type): + return False, ( + f"Type mismatch at field index {i}, " + f"got {actual_field.type}, expected {expected_field.type}" + ) + + return True, "" + + def _create_unified_response( + self, success: bool, rows_processed: int, data: bytes + ) -> pa.RecordBatch: + """ + Create unified UDAF response batch. + + Schema: [success: bool, rows_processed: int64, serialized_data: binary] + """ + return pa.RecordBatch.from_arrays( + [ + pa.array([success], type=pa.bool_()), + pa.array([rows_processed], type=pa.int64()), + pa.array([data], type=pa.binary()), + ], + schema=pa.schema( + [ + pa.field("success", pa.bool_()), + pa.field("rows_processed", pa.int64()), + pa.field("serialized_data", pa.binary()), + ] + ), + ) + + def _handle_udaf_create( + self, place_id: int, state_manager: UDAFStateManager + ) -> pa.RecordBatch: + """Handle UDAF CREATE operation. 
+ + Returns: [success: bool] + """ + try: + state_manager.create_state(place_id) + success = True + except Exception as e: + logging.error( + "CREATE operation failed for place_id=%s: %s", + place_id, + e, + ) + success = False + + return pa.RecordBatch.from_arrays( + [pa.array([success], type=pa.bool_())], ["success"] + ) + + def _handle_udaf_accumulate( + self, + place_id: int, + is_single_place: bool, + row_start: int, + row_end: int, + data_batch: pa.RecordBatch, + state_manager: UDAFStateManager, + ) -> pa.RecordBatch: + """ + Handle UDAF ACCUMULATE operation with optimized metadata from app_metadata. + + Args: + place_id: Primary place identifier + is_single_place: If True, single aggregation; if False, GROUP BY aggregation + row_start: Start row index in data batch + row_end: End row index in data batch (exclusive) + data_batch: Input data RecordBatch (argument columns + optional places column) + state_manager: UDAF state manager instance + + Returns: [rows_processed: int64] (0 if failed) + """ + if data_batch is None: + raise ValueError("ACCUMULATE requires data_batch, got None") + + rows_processed = 0 + + try: + has_places = ( + data_batch.schema.field(data_batch.num_columns - 1).name == "places" + ) + num_input_cols = ( + data_batch.num_columns - 1 if has_places else data_batch.num_columns + ) + loop_start = row_start + loop_end = min(row_end, data_batch.num_rows) + + if is_single_place: + if place_id not in state_manager.states: + raise KeyError(f"State for place_id {place_id} not found") + state = state_manager.states[place_id] + + # Extract row range using Arrow slicing (zero-copy) + sliced_batch = data_batch.slice(loop_start, loop_end - loop_start) + columns = [sliced_batch.column(j) for j in range(num_input_cols)] + + for i in range(sliced_batch.num_rows): + try: + row_args = tuple(col[i].as_py() for col in columns) + state.accumulate(*row_args) + rows_processed += 1 + except Exception as e: + logging.error( + "Error in accumulate for place_id %s at row %d: %s", + place_id, + loop_start + i, + e, + ) + raise RuntimeError(f"Error in accumulate: {e}") from e + + del columns + del sliced_batch + else: + # Multiple places (GROUP BY): iterate row by row + places_col = data_batch.column(data_batch.num_columns - 1) + num_rows = data_batch.num_rows + data_columns = [data_batch.column(j) for j in range(num_input_cols)] + + # Process each row directly from Arrow arrays (single pass) + for i in range(num_rows): + try: + place_id = places_col[i].as_py() + state = state_manager.states[place_id] + row_args = tuple(col[i].as_py() for col in data_columns) + state.accumulate(*row_args) + rows_processed += 1 + except KeyError: + logging.error( + "State not found for place_id=%s at row %d. " + "CREATE must be called before ACCUMULATE.", + place_id, + i, + ) + raise + except Exception as e: + logging.error( + "Error in accumulate for place_id %s at row %d: %s", + place_id, + i, + e, + ) + raise RuntimeError(f"Error in accumulate: {e}") from e + + del data_columns + del places_col + del data_batch + + except Exception as e: + logging.error( + "ACCUMULATE operation failed at row %d: %s\nTraceback: %s", + rows_processed, + e, + traceback.format_exc(), + ) + raise + + return pa.RecordBatch.from_arrays( + [pa.array([rows_processed], type=pa.int64())], ["rows_processed"] + ) + + def _handle_udaf_serialize( + self, place_id: int, state_manager: UDAFStateManager + ) -> pa.RecordBatch: + """Handle UDAF SERIALIZE operation. 
+ + Returns: [serialized_state: binary] (empty if failed) + """ + try: + serialized = state_manager.serialize(place_id) + except Exception as e: + logging.error( + "SERIALIZE operation failed for place_id=%s: %s", + place_id, + e, + ) + serialized = b"" + + return pa.RecordBatch.from_arrays( + [pa.array([serialized], type=pa.binary())], ["serialized_state"] + ) + + def _handle_udaf_merge( + self, + place_id: int, + data_binary: bytes, + state_manager: UDAFStateManager, + ) -> pa.RecordBatch: + """Handle UDAF MERGE operation. + + data_binary contains the serialized state to merge. + Returns: [success: bool] + """ + if data_binary is None: + raise ValueError(f"MERGE requires data_binary, got None") + + try: + state_manager.merge(place_id, data_binary) + success = True + except Exception as e: + logging.error( + "MERGE operation failed for place_id=%s: %s", + place_id, + e, + ) + success = False + + return pa.RecordBatch.from_arrays( + [pa.array([success], type=pa.bool_())], ["success"] + ) + + def _handle_udaf_finalize( + self, + place_id: int, + output_type: pa.DataType, + state_manager: UDAFStateManager, + ) -> pa.RecordBatch: + """Handle UDAF FINALIZE operation. + + Returns: [result: output_type] (null if failed) + """ + try: + result = state_manager.finalize(place_id) + except Exception as e: + logging.error( + "FINALIZE operation failed for place_id=%s: %s", + place_id, + e, + ) + result = None + + return pa.RecordBatch.from_arrays( + [pa.array([result], type=output_type)], ["result"] + ) + + def _handle_udaf_reset( + self, place_id: int, state_manager: UDAFStateManager + ) -> pa.RecordBatch: + """Handle UDAF RESET operation. + + Returns: [success: bool] + """ + try: + state_manager.reset(place_id) + success = True + except Exception as e: + logging.error( + "RESET operation failed for place_id=%s: %s", + place_id, + e, + ) + success = False + + return pa.RecordBatch.from_arrays( + [pa.array([success], type=pa.bool_())], ["success"] + ) + + def _handle_udaf_destroy( + self, place_ids: list, state_manager: UDAFStateManager + ) -> bool: + """Handle UDAF DESTROY operation for one or more place_ids. 
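+
+        Receives a single-element list for ordinary destroys, or many ids when
+        the client batches DESTROY calls (see the DESTROY branch in
+        _handle_exchange_udaf).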
+ + Args: + place_ids: List of place_ids to destroy (can be single element) + state_manager: UDAF state manager + + Returns: + bool: True if all destroys succeeded, False if any failed + """ + num_ids = len(place_ids) + success_count = 0 + failed_count = 0 + + for place_id in place_ids: + try: + state_manager.destroy(place_id) + success_count += 1 + except Exception as e: + logging.error( + "Failed to destroy place_id=%s: %s", + place_id, + e, + ) + failed_count += 1 + + if failed_count > 0: + if num_ids > 1: + logging.warning( + "[UDAF Memory] Destroy completed with %d succeeded, %d failed", + success_count, + failed_count, + ) + return False + + return True + + def _handle_exchange_udf( + self, + python_udf_meta: PythonUDFMeta, + reader: flight.MetadataRecordBatchReader, + writer: flight.MetadataRecordBatchWriter, + ) -> None: + """Handle bidirectional streaming for UDF execution.""" + loader = UDFLoaderFactory.get_loader(python_udf_meta) + udf = loader.load() + logging.info("Loaded UDF: %s", udf) + + started = False + for chunk in reader: + if not chunk.data: + logging.info("Empty chunk received, skipping") + continue + + check_schema_result, error_msg = self.check_schema( + chunk.data, python_udf_meta.input_types + ) + if not check_schema_result: + logging.error("Schema mismatch: %s", error_msg) + raise ValueError(f"Schema mismatch: {error_msg}") + + result_array = udf(chunk.data) + + if not python_udf_meta.output_type.equals(result_array.type): + logging.error( + "Output type mismatch: got %s, expected %s", + result_array.type, + python_udf_meta.output_type, + ) + raise ValueError( + f"Output type mismatch: got {result_array.type}, expected {python_udf_meta.output_type}" + ) + + result_batch = pa.RecordBatch.from_arrays([result_array], ["result"]) + if not started: + try: + writer.begin(result_batch.schema) + started = True + except Exception as e: + logging.error( + "Failed to begin UDF writer stream (client may have disconnected): %s", + e, + ) + return + + try: + writer.write_batch(result_batch) + except Exception as e: + logging.error( + "Failed to write UDF response batch (client may have disconnected): %s", + e, + ) + return + + def _handle_exchange_udaf( + self, + python_udaf_meta: PythonUDFMeta, + reader: flight.MetadataRecordBatchReader, + writer: flight.MetadataRecordBatchWriter, + ) -> None: + """ + Handle bidirectional streaming for UDAF execution. 
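+
+        As a worked example of the 30-byte app_metadata layout documented
+        below, the equivalent Python construction would be (a sketch; the
+        real buffer is built by the C++ client):
+
+            import struct
+            # meta_version=1, operation=ACCUMULATE(1), is_single_place=1,
+            # place_id=42, row_start=0, row_end=100
+            meta = struct.pack("<IBBqqq", 1, 1, 1, 42, 0, 100)
+            assert len(meta) == 30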
+ + Protocol (optimized with direct RecordBatch transmission): + - app_metadata: 30-byte binary structure containing: + * meta_version: uint32 (4 bytes) - Metadata version (currently 1) + * operation: uint8 (1 byte) - UDAFOperationType enum + * is_single_place: uint8 (1 byte) - Boolean (ACCUMULATE only) + * place_id: int64 (8 bytes) - Aggregate state identifier (globally unique) + * row_start: int64 (8 bytes) - Start row index (ACCUMULATE only) + * row_end: int64 (8 bytes) - End row index (ACCUMULATE only) + + - RecordBatch data: [argument_types..., places: int64, binary_data: binary] + * Schema is function-specific: created from argument_types + places + binary_data columns + * Different operations fill different columns: + - ACCUMULATE (single-place): data columns are filled, places is NULL, binary_data is NULL + - ACCUMULATE (multi-place): data columns are filled, places contains place IDs, binary_data is NULL + - MERGE: data columns are NULL, places is NULL, binary_data contains serialized state + - Other operations (CREATE/SERIALIZE/FINALIZE/RESET/DESTROY): all columns are NULL + * places column: indicates which place each row belongs to in GROUP BY scenarios + * This eliminates extra serialization/deserialization for ACCUMULATE operations + + Response: Unified schema [success: bool, rows_processed: int64, serialized_data: binary] + - Different operations use different fields: + * CREATE/MERGE/RESET/DESTROY: use success only + * ACCUMULATE: use success + rows_processed (number of rows processed) + * SERIALIZE: use success + serialized_data (serialized_state) + * FINALIZE: use success + serialized_data (serialized result) + """ + + # Get or create state manager for this specific UDAF function + state_manager = self._get_udaf_state_manager(python_udaf_meta) + started = False + + # Define unified response schema (consistent with C++ kUnifiedUDAFResponseSchema) + unified_schema = pa.schema( + [ + pa.field("success", pa.bool_()), + pa.field("rows_processed", pa.int64()), + pa.field("serialized_data", pa.binary()), + ] + ) + + for chunk in reader: + if not chunk.data or chunk.data.num_rows == 0: + logging.warning("Empty chunk received, skipping") + continue + + batch = chunk.data + app_metadata = chunk.app_metadata + + # Validate app_metadata + if not app_metadata or len(app_metadata) != 30: + raise ValueError( + f"Invalid app_metadata: expected 30 bytes, got {len(app_metadata) if app_metadata else 0}" + ) + + # Parse fixed-size binary metadata (30 bytes total) + # Layout: meta_version(4) + operation(1) + is_single_place(1) + place_id(8) + row_start(8) + row_end(8) + metadata_bytes = app_metadata.to_pybytes() + + # Validate metadata version + meta_version = int.from_bytes(metadata_bytes[0:4], "little", signed=False) + if meta_version != 1: + raise ValueError( + f"Unsupported metadata version: {meta_version}. Expected version 1. " + "Please upgrade the Python server or downgrade the C++ client." 
+ ) + operation_type = UDAFOperationType(metadata_bytes[4]) + is_single_place = metadata_bytes[5] == 1 + place_id = int.from_bytes(metadata_bytes[6:14], "little", signed=True) + row_start = int.from_bytes(metadata_bytes[14:22], "little", signed=True) + row_end = int.from_bytes(metadata_bytes[22:30], "little", signed=True) + + # Extract data from batch + # RPC schema: [argument_types..., places: int64, binary_data: binary] + # - Second-to-last column is places (int64) + # - Last column is binary_data (binary) + # - ACCUMULATE (single-place): data columns filled, places is NULL, binary_data is NULL + # - ACCUMULATE (multi-place): data columns filled, places contains place IDs, binary_data is NULL + # - MERGE: data columns are NULL, places is NULL, binary_data is filled + # - Other operations: all columns are NULL + + if batch.num_columns < 1: + raise ValueError(f"Expected at least 1 column, got {batch.num_columns}") + + # Last column is binary_data + binary_col = batch.column(batch.num_columns - 1) + binary_data = binary_col[0].as_py() if binary_col[0].is_valid else None + + # Handle different operations and convert to unified format + try: + if operation_type == UDAFOperationType.CREATE: + result_batch = self._handle_udaf_create(place_id, state_manager) + success = result_batch.column(0)[0].as_py() + result_batch = self._create_unified_response( + success=success, rows_processed=0, data=b"" + ) + elif operation_type == UDAFOperationType.ACCUMULATE: + num_data_cols = batch.num_columns - 1 + data_batch = pa.RecordBatch.from_arrays( + [batch.column(i) for i in range(num_data_cols)], + schema=pa.schema( + [batch.schema.field(i) for i in range(num_data_cols)] + ), + ) + result_batch_accumulate = self._handle_udaf_accumulate( + place_id, + is_single_place, + row_start, + row_end, + data_batch, + state_manager, + ) + rows_processed = result_batch_accumulate.column(0)[0].as_py() + result_batch = self._create_unified_response( + success=(rows_processed > 0), + rows_processed=rows_processed, + data=b"", + ) + elif operation_type == UDAFOperationType.SERIALIZE: + result_batch_serialize = self._handle_udaf_serialize( + place_id, state_manager + ) + serialized = result_batch_serialize.column(0)[0].as_py() + result_batch = self._create_unified_response( + success=(len(serialized) > 0) if serialized else False, + rows_processed=0, + data=serialized if serialized else b"", + ) + elif operation_type == UDAFOperationType.MERGE: + # For MERGE: binary_data contains the serialized state + result_batch_merge = self._handle_udaf_merge( + place_id, binary_data, state_manager + ) + success = result_batch_merge.column(0)[0].as_py() + result_batch = self._create_unified_response( + success=success, rows_processed=0, data=b"" + ) + elif operation_type == UDAFOperationType.FINALIZE: + result_batch_finalize = self._handle_udaf_finalize( + place_id, python_udaf_meta.output_type, state_manager + ) + # Serialize the result to binary (including NULL results) + # NULL is a valid aggregation result, not an error + sink = pa.BufferOutputStream() + ipc_writer = pa.ipc.new_stream(sink, result_batch_finalize.schema) + ipc_writer.write_batch(result_batch_finalize) + ipc_writer.close() + result_data = sink.getvalue().to_pybytes() + result_batch = self._create_unified_response( + success=True, + rows_processed=0, + data=result_data, + ) + elif operation_type == UDAFOperationType.RESET: + result_batch_reset = self._handle_udaf_reset( + place_id, state_manager + ) + success = result_batch_reset.column(0)[0].as_py() + result_batch = 
self._create_unified_response( + success=success, rows_processed=0, data=b"" + ) + elif operation_type == UDAFOperationType.DESTROY: + if row_end > 1: + # Batch destroy mode - binary_data contains serialized place_ids + if binary_data is None: + raise ValueError("DESTROY_BATCH: binary_data is None") + data_reader = pa.ipc.open_stream(binary_data) + data_batch = data_reader.read_next_batch() + if data_batch.num_columns != 1: + raise ValueError( + f"DESTROY_BATCH: Expected 1 column (place_ids), got {data_batch.num_columns}" + ) + place_ids_array = data_batch.column(0) + place_ids = [ + place_ids_array[i].as_py() + for i in range(len(place_ids_array)) + ] + else: + # Single destroy mode + place_ids = [place_id] + + success = self._handle_udaf_destroy(place_ids, state_manager) + result_batch = self._create_unified_response( + success=success, rows_processed=0, data=b"" + ) + else: + raise ValueError(f"Unsupported operation type: {operation_type}") + except Exception as e: + logging.error( + "Operation %s failed for place_id=%s: %s\nTraceback: %s", + operation_type, + place_id, + e, + traceback.format_exc(), + ) + result_batch = self._create_unified_response( + success=False, rows_processed=0, data=b"" + ) + + # Begin stream with unified schema on first call + if not started: + try: + writer.begin(unified_schema) + started = True + except Exception as e: + logging.error( + "Failed to begin writer stream (client may have disconnected): %s", + e, + ) + # Client disconnected, stop processing + return + + try: + writer.write_batch(result_batch) + except Exception as e: + logging.error( + "Failed to write response batch (client may have disconnected): %s", + e, + ) + # Client disconnected, stop processing + return + + del result_batch + + def _handle_exchange_udtf( + self, + python_udtf_meta: PythonUDFMeta, + reader: flight.MetadataRecordBatchReader, + writer: flight.MetadataRecordBatchWriter, + ) -> None: + """ + Handle bidirectional streaming for UDTF execution. 
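+
+        A UDTF is typically written as a generator; for instance (an
+        illustrative sketch):
+
+            def split_words(s: str):
+                for word in (s or "").split():
+                    yield word  # one output row per word of the input row
+
+        Single-field outputs may be yielded as scalars or 1-tuples; multi-field
+        outputs must be tuples (see _process_udtf_with_list_array).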
+ + Protocol (ListArray-based): + - Input: RecordBatch with input columns + - Output: RecordBatch with a single ListArray column + * ListArray automatically manages offsets internally + * Each list element contains the outputs for one input row + + Example: + Input: 3 rows + UDTF yields: Row 0 -> 5 outputs, Row 1 -> 2 outputs, Row 2 -> 3 outputs + Output: ListArray with 3 elements (one per input row) + - Element 0: List of 5 structs + - Element 1: List of 2 structs + - Element 2: List of 3 structs + """ + loader = UDFLoaderFactory.get_loader(python_udtf_meta) + adaptive_udtf = loader.load() + udtf_func = adaptive_udtf._eval_func + started = False + + for chunk in reader: + if not chunk.data: + logging.info("Empty chunk received, skipping") + continue + + input_batch = chunk.data + + # Validate input schema + check_schema_result, error_msg = self.check_schema( + input_batch, python_udtf_meta.input_types + ) + if not check_schema_result: + logging.error("Schema mismatch: %s", error_msg) + raise ValueError(f"Schema mismatch: {error_msg}") + + # Process all input rows and build ListArray + try: + response_batch = self._process_udtf_with_list_array( + udtf_func, input_batch, python_udtf_meta.output_type + ) + + # Send the response batch + if not started: + try: + writer.begin(response_batch.schema) + started = True + except Exception as e: + logging.error( + "Failed to begin UDTF writer stream (client may have disconnected): %s", + e, + ) + return + + try: + writer.write_batch(response_batch) + except Exception as e: + logging.error( + "Failed to write UDTF response batch (client may have disconnected): %s", + e, + ) + return + + except Exception as e: + logging.error( + "Error in UDTF execution: %s\nTraceback: %s", + e, + traceback.format_exc(), + ) + raise RuntimeError(f"Error in UDTF execution: {e}") from e + + def _process_udtf_with_list_array( + self, + udtf_func: Callable, + input_batch: pa.RecordBatch, + expected_output_type: pa.DataType, + ) -> pa.RecordBatch: + """ + Process UDTF function on all input rows and generate a ListArray. 
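+
+        For example, if three input rows yield [10, 11], [], and [7], the
+        result is pa.array([[10, 11], [], [7]],
+        type=pa.list_(expected_output_type)).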
+ + Args: + udtf_func: The UDTF function to call + input_batch: Input RecordBatch with N rows + expected_output_type: Expected Arrow type for output data + + Returns: + RecordBatch with a single ListArray column where each element + is a list of outputs for the corresponding input row + """ + all_results = [] # List of lists: one list per input row + + # Check if output is single-field or multi-field + # For single-field output, we allow yielding scalar values directly + is_single_field = not pa.types.is_struct(expected_output_type) + + # Process each input row + for row_idx in range(input_batch.num_rows): + # Extract row as tuple of arguments + row_args = tuple( + input_batch.column(col_idx)[row_idx].as_py() + for col_idx in range(input_batch.num_columns) + ) + + # Call UDTF function - it can yield tuples or scalar values (for single-field output) + result = udtf_func(*row_args) + + # Collect output rows for this input row + row_outputs = [] + if inspect.isgenerator(result): + for output_value in result: + if is_single_field: + # Single-field output: accept both scalar and tuple + if isinstance(output_value, tuple): + # User provided tuple (e.g., (value,)) - extract scalar + if len(output_value) != 1: + raise ValueError( + f"Single-field UDTF should yield 1-tuples or scalars, got {len(output_value)}-tuple" + ) + row_outputs.append( + output_value[0] + ) # Extract scalar from tuple + else: + # User provided scalar - use directly + row_outputs.append(output_value) + else: + # Multi-field output: must be tuple + if not isinstance(output_value, tuple): + raise ValueError( + f"Multi-field UDTF must yield tuples, got {type(output_value)}" + ) + row_outputs.append(output_value) + elif result is not None: + # Function returned a single value instead of yielding + if is_single_field: + # Single-field: accept scalar or tuple + if isinstance(result, tuple): + if len(result) != 1: + raise ValueError( + f"Single-field UDTF should return 1-tuple or scalar, got {len(result)}-tuple" + ) + row_outputs.append(result[0]) # Extract scalar from tuple + else: + row_outputs.append(result) + else: + # Multi-field: must be tuple + if not isinstance(result, tuple): + raise ValueError( + f"Multi-field UDTF must return tuples, got {type(result)}" + ) + row_outputs.append(result) + + all_results.append(row_outputs) + + try: + list_array = pa.array(all_results, type=pa.list_(expected_output_type)) + except Exception as e: + logging.error( + "Failed to create ListArray: %s, element_type: %s", + e, + expected_output_type, + ) + raise RuntimeError(f"Failed to create ListArray: {e}") from e + + # Create RecordBatch with single ListArray column + schema = pa.schema([pa.field("results", pa.list_(expected_output_type))]) + response_batch = pa.RecordBatch.from_arrays([list_array], schema=schema) + + return response_batch + + def do_exchange( + self, + context: flight.ServerCallContext, + descriptor: flight.FlightDescriptor, + reader: flight.MetadataRecordBatchReader, + writer: flight.MetadataRecordBatchWriter, + ) -> None: + """ + Handle bidirectional streaming for UDF, UDAF, and UDTF execution. + + Determines operation type (UDF vs UDAF vs UDTF) from descriptor metadata. 
+ """ + python_udf_meta = self.parse_python_udf_meta(descriptor) + if not python_udf_meta: + raise ValueError("Invalid or missing metadata in descriptor") + + if python_udf_meta.is_udf(): + self._handle_exchange_udf(python_udf_meta, reader, writer) + elif python_udf_meta.is_udaf(): + self._handle_exchange_udaf(python_udf_meta, reader, writer) + elif python_udf_meta.is_udtf(): + self._handle_exchange_udtf(python_udf_meta, reader, writer) + else: + raise ValueError(f"Unsupported client type: {python_udf_meta.client_type}") + + +class UDAFOperationType(Enum): + """Enum representing UDAF operation types.""" + + CREATE = 0 + ACCUMULATE = 1 + SERIALIZE = 2 + MERGE = 3 + FINALIZE = 4 + RESET = 5 + DESTROY = 6 + + +def check_unix_socket_path(unix_socket_path: str) -> bool: + """Validates the Unix domain socket path format.""" + if not unix_socket_path: + logging.error("Unix socket path is empty") + return False + + if not unix_socket_path.startswith("grpc+unix://"): + raise ValueError("gRPC UDS URL must start with 'grpc+unix://'") + + socket_path = unix_socket_path[len("grpc+unix://") :].strip() + if not socket_path: + logging.error("Extracted socket path is empty") + return False + + return True + + +def main(unix_socket_path: str) -> None: + """ + Main entry point for the Python UDF/UDAF/UDTF server. + + The server handles UDF, UDAF, and UDTF operations dynamically. + Operation type is determined from metadata in each request. + + Args: + unix_socket_path: Base path for the Unix domain socket + + Raises: + SystemExit: If socket path is invalid or server fails to start + """ + try: + if not check_unix_socket_path(unix_socket_path): + print(f"ERROR: Invalid socket path: {unix_socket_path}", flush=True) + sys.exit(1) + + current_pid = os.getpid() + ServerState.unix_socket_path = f"{unix_socket_path}_{current_pid}.sock" + + # Start unified server that handles UDF, UDAF, and UDTF + server = FlightServer(ServerState.unix_socket_path) + + print(ServerState.PYTHON_SERVER_START_SUCCESS_MSG, flush=True) + logging.info( + "##### PYTHON UDF/UDAF/UDTF SERVER STARTED AT %s #####", datetime.now() + ) + server.wait() + + except Exception as e: + print( + f"ERROR: Failed to start Python server: {type(e).__name__}: {e}", + flush=True, + ) + tb_lines = traceback.format_exception(type(e), e, e.__traceback__) + if len(tb_lines) > 1: + print(f"DETAIL: {tb_lines[-2].strip()}", flush=True) + sys.exit(1) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run an Arrow Flight UDF/UDAF/UDTF server over Unix socket. " + "The server handles UDF, UDAF, and UDTF operations dynamically." + ) + parser.add_argument( + "unix_socket_path", + type=str, + help="Path to the Unix socket (e.g., grpc+unix:///path/to/socket)", + ) + args = parser.parse_args() + main(args.unix_socket_path) diff --git a/be/src/udf/python/python_udaf_client.cpp b/be/src/udf/python/python_udaf_client.cpp new file mode 100644 index 00000000000000..4af23cffc1aa77 --- /dev/null +++ b/be/src/udf/python/python_udaf_client.cpp @@ -0,0 +1,522 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "udf/python/python_udaf_client.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/compiler_util.h" +#include "common/status.h" +#include "udf/python/python_udf_meta.h" +#include "udf/python/python_udf_runtime.h" +#include "util/arrow/utils.h" + +namespace doris { + +// Unified response structure for UDAF operations +// Arrow Schema: [success: bool, rows_processed: int64, data: binary] +// Different operations use different fields: +// - CREATE/MERGE/RESET/DESTROY: use success only +// - ACCUMULATE: use success + rows_processed (number of rows processed) +// - SERIALIZE: use success + data (serialized_state) +// - FINALIZE: use success + data (serialized result, may be null) +// +// This unified schema allows all operations to return consistent format, +// solving Arrow Flight's limitation that all responses must have the same schema. +static const std::shared_ptr kUnifiedUDAFResponseSchema = arrow::schema({ + arrow::field("success", arrow::boolean()), + arrow::field("rows_processed", arrow::int64()), + arrow::field("serialized_data", arrow::binary()), +}); + +Status PythonUDAFClient::create(const PythonUDFMeta& func_meta, ProcessPtr process, + const std::shared_ptr& data_schema, + PythonUDAFClientPtr* client) { + PythonUDAFClientPtr python_udaf_client = std::make_shared(); + RETURN_IF_ERROR(python_udaf_client->init(func_meta, std::move(process), data_schema)); + *client = std::move(python_udaf_client); + return Status::OK(); +} + +Status PythonUDAFClient::init(const PythonUDFMeta& func_meta, ProcessPtr process, + const std::shared_ptr& data_schema) { + _schema = data_schema; + return PythonClient::init(func_meta, std::move(process)); +} + +Status PythonUDAFClient::create(int64_t place_id) { + std::shared_ptr request_batch; + RETURN_IF_ERROR(_get_empty_request_batch(&request_batch)); + + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::CREATE), + .is_single_place = 0, + .place_id = place_id, + .row_start = 0, + .row_end = 0, + }; + + std::shared_ptr response_batch; + RETURN_IF_ERROR(_send_request(metadata, request_batch, &response_batch)); + + // Parse unified response_batch: [success: bool, rows_processed: int64, serialized_data: binary] + if (response_batch->num_rows() != 1) { + return Status::InternalError("Invalid CREATE response_batch: expected 1 row"); + } + + auto success_array = std::static_pointer_cast(response_batch->column(0)); + if (!success_array->Value(0)) { + return Status::InternalError("CREATE operation failed for place_id={}", place_id); + } + + _created_place_id = place_id; + return Status::OK(); +} + +Status PythonUDAFClient::accumulate(int64_t place_id, bool is_single_place, + const arrow::RecordBatch& input, int64_t row_start, + int64_t row_end) { + // Validate input parameters + if (UNLIKELY(row_start < 0 || row_end < row_start || row_end > input.num_rows())) { + return Status::InvalidArgument( + "Invalid row range: row_start={}, row_end={}, input.num_rows={}", row_start, + row_end, input.num_rows()); + } + + // In 
multi-place mode, input RecordBatch must contain "places" column as last column + if (UNLIKELY(!is_single_place && + (input.num_columns() == 0 || + input.schema()->field(input.num_columns() - 1)->name() != "places"))) { + return Status::InternalError( + "In multi-place mode, input RecordBatch must contain 'places' column as the " + "last column"); + } + + // Create request batch: input data + NULL binary_data column + std::shared_ptr request_batch; + RETURN_IF_ERROR(_create_data_request_batch(input, &request_batch)); + + // Create metadata structure + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::ACCUMULATE), + .is_single_place = static_cast(is_single_place ? 1 : 0), + .place_id = place_id, + .row_start = row_start, + .row_end = row_end, + }; + + // Send to server with metadata in app_metadata + std::shared_ptr response; + RETURN_IF_ERROR(_send_request(metadata, request_batch, &response)); + + // Parse unified response: [success: bool, rows_processed: int64, serialized_data: binary] + if (response->num_rows() != 1) { + return Status::InternalError("Invalid ACCUMULATE response: expected 1 row"); + } + + auto success_array = std::static_pointer_cast(response->column(0)); + auto rows_processed_array = std::static_pointer_cast(response->column(1)); + + if (!success_array->Value(0)) { + return Status::InternalError("ACCUMULATE operation failed for place_id={}", place_id); + } + + int64_t rows_processed = rows_processed_array->Value(0); + int64_t expected_rows = row_end - row_start; + + if (rows_processed < expected_rows) { + return Status::InternalError( + "ACCUMULATE operation only processed {} out of {} rows for place_id={}", + rows_processed, expected_rows, place_id); + } + return Status::OK(); +} + +Status PythonUDAFClient::serialize(int64_t place_id, + std::shared_ptr* serialized_state) { + std::shared_ptr request_batch; + RETURN_IF_ERROR(_get_empty_request_batch(&request_batch)); + + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::SERIALIZE), + .is_single_place = 0, + .place_id = place_id, + .row_start = 0, + .row_end = 0, + }; + + std::shared_ptr response; + RETURN_IF_ERROR(_send_request(metadata, request_batch, &response)); + + // Parse unified response: [success: bool, rows_processed: int64, serialized_data: binary] + auto success_array = std::static_pointer_cast(response->column(0)); + auto data_array = std::static_pointer_cast(response->column(2)); + + if (!success_array->Value(0)) { + return Status::InternalError("SERIALIZE operation failed for place_id={}", place_id); + } + + int32_t length; + const uint8_t* data = data_array->GetValue(0, &length); + + if (length == 0) { + return Status::InternalError("SERIALIZE operation returned empty data for place_id={}", + place_id); + } + + *serialized_state = arrow::Buffer::Wrap(data, length); + return Status::OK(); +} + +Status PythonUDAFClient::merge(int64_t place_id, + const std::shared_ptr& serialized_state) { + std::shared_ptr request_batch; + RETURN_IF_ERROR(_create_binary_request_batch(serialized_state, &request_batch)); + + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::MERGE), + .is_single_place = 0, + .place_id = place_id, + .row_start = 0, + .row_end = 0, + }; + + std::shared_ptr response; + RETURN_IF_ERROR(_send_request(metadata, request_batch, &response)); + + // Parse unified response: [success: bool, rows_processed: int64, serialized_data: binary] + 
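+    // For MERGE, only the success flag is meaningful; rows_processed and serialized_data are ignored.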
if (response->num_rows() != 1) { + return Status::InternalError("Invalid MERGE response: expected 1 row"); + } + + auto success_array = std::static_pointer_cast(response->column(0)); + if (!success_array->Value(0)) { + return Status::InternalError("MERGE operation failed for place_id={}", place_id); + } + + return Status::OK(); +} + +Status PythonUDAFClient::finalize(int64_t place_id, std::shared_ptr* output) { + std::shared_ptr request_batch; + RETURN_IF_ERROR(_get_empty_request_batch(&request_batch)); + + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::FINALIZE), + .is_single_place = 0, + .place_id = place_id, + .row_start = 0, + .row_end = 0, + }; + + std::shared_ptr response_batch; + RETURN_IF_ERROR(_send_request(metadata, request_batch, &response_batch)); + + // Parse unified response_batch: [success: bool, rows_processed: int64, serialized_data: binary] + auto success_array = std::static_pointer_cast(response_batch->column(0)); + auto data_array = std::static_pointer_cast(response_batch->column(2)); + + if (!success_array->Value(0)) { + return Status::InternalError("FINALIZE operation failed for place_id={}", place_id); + } + + // Deserialize data column to get actual result + int32_t length; + const uint8_t* data = data_array->GetValue(0, &length); + + if (length == 0) { + return Status::InternalError("FINALIZE operation returned empty data for place_id={}", + place_id); + } + + auto buffer = arrow::Buffer::Wrap(data, length); + auto input_stream = std::make_shared(buffer); + + auto reader_result = arrow::ipc::RecordBatchStreamReader::Open(input_stream); + if (UNLIKELY(!reader_result.ok())) { + return Status::InternalError("Failed to deserialize FINALIZE result: {}", + reader_result.status().message()); + } + auto reader = std::move(reader_result).ValueOrDie(); + + auto batch_result = reader->Next(); + if (UNLIKELY(!batch_result.ok())) { + return Status::InternalError("Failed to read FINALIZE result: {}", + batch_result.status().message()); + } + + *output = std::move(batch_result).ValueOrDie(); + + return Status::OK(); +} + +Status PythonUDAFClient::reset(int64_t place_id) { + std::shared_ptr request_batch; + RETURN_IF_ERROR(_get_empty_request_batch(&request_batch)); + + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::RESET), + .is_single_place = 0, + .place_id = place_id, + .row_start = 0, + .row_end = 0, + }; + + std::shared_ptr response; + RETURN_IF_ERROR(_send_request(metadata, request_batch, &response)); + + // Parse unified response: [success: bool, rows_processed: int64, serialized_data: binary] + if (response->num_rows() != 1) { + return Status::InternalError("Invalid RESET response: expected 1 row"); + } + + auto success_array = std::static_pointer_cast(response->column(0)); + if (!success_array->Value(0)) { + return Status::InternalError("RESET operation failed for place_id={}", place_id); + } + + return Status::OK(); +} + +Status PythonUDAFClient::destroy(int64_t place_id) { + std::shared_ptr request_batch; + RETURN_IF_ERROR(_get_empty_request_batch(&request_batch)); + + UDAFMetadata metadata { + .meta_version = UDAF_METADATA_VERSION, + .operation = static_cast(UDAFOperation::DESTROY), + .is_single_place = 0, + .place_id = place_id, + .row_start = 0, + .row_end = 0, + }; + + std::shared_ptr response; + Status st = _send_request(metadata, request_batch, &response); + + // Always clear tracking, even if RPC failed + _created_place_id.reset(); + + if 
(!st.ok()) { + LOG(WARNING) << "Failed to destroy place_id=" << place_id << ": " << st.to_string(); + return st; + } + + // Parse unified response: [success: bool, rows_processed: int64, serialized_data: binary] + if (response->num_rows() != 1) { + return Status::InternalError("Invalid DESTROY response: expected 1 row"); + } + + auto success_array = std::static_pointer_cast(response->column(0)); + + if (!success_array->Value(0)) { + LOG(WARNING) << "DESTROY operation failed for place_id=" << place_id; + return Status::InternalError("DESTROY operation failed for place_id={}", place_id); + } + + return Status::OK(); +} + +Status PythonUDAFClient::close() { + if (!_inited || !_writer) return Status::OK(); + + // Destroy the place if it exists (cleanup on client destruction) + if (_created_place_id.has_value()) { + int64_t place_id = _created_place_id.value(); + Status st = destroy(place_id); + if (!st.ok()) { + LOG(WARNING) << "Failed to destroy place_id=" << place_id + << " during close: " << st.to_string(); + // Clear tracking even on failure to prevent issues in base class close + _created_place_id.reset(); + } + } + + return PythonClient::close(); +} + +Status PythonUDAFClient::_send_request(const UDAFMetadata& metadata, + const std::shared_ptr& request_batch, + std::shared_ptr* response_batch) { + DCHECK(response_batch != nullptr); + + // Create app_metadata buffer from metadata struct + auto app_metadata = + arrow::Buffer::Wrap(reinterpret_cast(&metadata), sizeof(metadata)); + + std::lock_guard lock(_operation_mutex); + + // Check if writer/reader are still valid (they could be reset by handle_error) + if (UNLIKELY(!_writer || !_reader)) { + return Status::InternalError("{} writer/reader have been closed due to previous error", + _operation_name); + } + + // Begin stream on first call (using data schema: argument_types + places + binary_data) + if (UNLIKELY(!_begin)) { + auto begin_res = _writer->Begin(_schema); + if (!begin_res.ok()) { + return handle_error(begin_res); + } + _begin = true; + } + + // Write batch with metadata in app_metadata + auto write_res = _writer->WriteWithMetadata(*request_batch, app_metadata); + if (!write_res.ok()) { + return handle_error(write_res); + } + + // Read unified response: [success: bool, rows_processed: int64, serialized_data: binary] + auto read_res = _reader->Next(); + if (!read_res.ok()) { + return handle_error(read_res.status()); + } + + arrow::flight::FlightStreamChunk chunk = std::move(*read_res); + if (!chunk.data) { + return Status::InternalError("Received empty RecordBatch from {} server", _operation_name); + } + + // Validate unified response schema + if (!chunk.data->schema()->Equals(kUnifiedUDAFResponseSchema)) { + return Status::InternalError( + "Invalid response schema: expected [success: bool, rows_processed: int64, " + "serialized_data: binary], got {}", + chunk.data->schema()->ToString()); + } + + *response_batch = std::move(chunk.data); + return Status::OK(); +} + +Status PythonUDAFClient::_create_data_request_batch(const arrow::RecordBatch& input_data, + std::shared_ptr* out) { + // Determine if input has places column + int num_input_columns = input_data.num_columns(); + bool has_places = false; + if (num_input_columns > 0 && + input_data.schema()->field(num_input_columns - 1)->name() == "places") { + has_places = true; + } + + // Expected schema structure: [argument_types..., places, binary_data] + // - Input in single-place mode: [argument_types...] 
+ // - Input in multi-place mode: [argument_types..., places] + std::vector> columns; + // Copy argument_types columns + int num_arg_columns = has_places ? (num_input_columns - 1) : num_input_columns; + + for (int i = 0; i < num_arg_columns; ++i) { + columns.push_back(input_data.column(i)); + } + + // Add places column + if (has_places) { + // Use existing places column from input + columns.push_back(input_data.column(num_input_columns - 1)); + } else { + // Create NULL places column for single-place mode + arrow::Int64Builder places_builder; + std::shared_ptr places_array; + RETURN_DORIS_STATUS_IF_ERROR(places_builder.AppendNulls(input_data.num_rows())); + RETURN_DORIS_STATUS_IF_ERROR(places_builder.Finish(&places_array)); + columns.push_back(places_array); + } + + // Add NULL binary_data column + arrow::BinaryBuilder binary_builder; + std::shared_ptr binary_array; + RETURN_DORIS_STATUS_IF_ERROR(binary_builder.AppendNulls(input_data.num_rows())); + RETURN_DORIS_STATUS_IF_ERROR(binary_builder.Finish(&binary_array)); + columns.push_back(binary_array); + + *out = arrow::RecordBatch::Make(_schema, input_data.num_rows(), columns); + return Status::OK(); +} + +Status PythonUDAFClient::_create_binary_request_batch( + const std::shared_ptr& binary_data, + std::shared_ptr* out) { + std::vector> columns; + + // Create NULL arrays for data columns (all columns except the last binary_data column) + // Schema: [argument_types..., places, binary_data] + int num_data_columns = _schema->num_fields() - 1; + for (int i = 0; i < num_data_columns; ++i) { + std::unique_ptr builder; + std::shared_ptr null_array; + RETURN_DORIS_STATUS_IF_ERROR(arrow::MakeBuilder(arrow::default_memory_pool(), + _schema->field(i)->type(), &builder)); + RETURN_DORIS_STATUS_IF_ERROR(builder->AppendNull()); + RETURN_DORIS_STATUS_IF_ERROR(builder->Finish(&null_array)); + columns.push_back(null_array); + } + + // Create binary_data column + arrow::BinaryBuilder binary_builder; + std::shared_ptr binary_array; + RETURN_DORIS_STATUS_IF_ERROR( + binary_builder.Append(binary_data->data(), static_cast(binary_data->size()))); + RETURN_DORIS_STATUS_IF_ERROR(binary_builder.Finish(&binary_array)); + columns.push_back(binary_array); + + *out = arrow::RecordBatch::Make(_schema, 1, columns); + return Status::OK(); +} + +Status PythonUDAFClient::_get_empty_request_batch(std::shared_ptr* out) { + // Return cached batch if already created + if (_empty_request_batch) { + *out = _empty_request_batch; + return Status::OK(); + } + + // Create empty batch on first use (all columns NULL, 1 row) + std::vector> columns; + + for (int i = 0; i < _schema->num_fields(); ++i) { + auto field = _schema->field(i); + std::unique_ptr builder; + std::shared_ptr null_array; + RETURN_DORIS_STATUS_IF_ERROR( + arrow::MakeBuilder(arrow::default_memory_pool(), field->type(), &builder)); + RETURN_DORIS_STATUS_IF_ERROR(builder->AppendNull()); + RETURN_DORIS_STATUS_IF_ERROR(builder->Finish(&null_array)); + columns.push_back(null_array); + } + + _empty_request_batch = arrow::RecordBatch::Make(_schema, 1, columns); + *out = _empty_request_batch; + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/udf/python/python_udaf_client.h b/be/src/udf/python/python_udaf_client.h new file mode 100644 index 00000000000000..078c34a39ea967 --- /dev/null +++ b/be/src/udf/python/python_udaf_client.h @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "udf/python/python_client.h" + +namespace doris { + +class PythonUDAFClient; + +using PythonUDAFClientPtr = std::shared_ptr; + +// Fixed-size (30 bytes) binary metadata structure for UDAF operations (Request) +struct __attribute__((packed)) UDAFMetadata { + uint32_t meta_version; // 4 bytes: metadata version (current version = 1) + uint8_t operation; // 1 byte: UDAFOperation enum + uint8_t is_single_place; // 1 byte: boolean (0 or 1, ACCUMULATE only) + int64_t place_id; // 8 bytes: aggregate state identifier (globally unique) + int64_t row_start; // 8 bytes: start row index (ACCUMULATE only) + int64_t row_end; // 8 bytes: end row index (exclusive, ACCUMULATE only) +}; + +static_assert(sizeof(UDAFMetadata) == 30, "UDAFMetadata size must be 30 bytes"); + +// Current metadata version constant +constexpr uint32_t UDAF_METADATA_VERSION = 1; + +/** + * Python UDAF Client + * + * Implements Snowflake-style UDAF pattern with the following methods: + * - __init__(): Initialize aggregate state + * - aggregate_state: Property that returns internal state + * - accumulate(input): Add new input to aggregate state + * - merge(other_state): Combine two intermediate states + * - finish(): Generate final result from aggregate state + * + * Communication protocol with Python server: + * 1. CREATE: Initialize UDAF class instance and get initial state + * 2. ACCUMULATE: Send input data batch and get updated states + * 3. SERIALIZE: Get serialized state for shuffle/merge + * 4. MERGE: Combine serialized states + * 5. FINALIZE: Get final result from state + * 6. RESET: Reset state to initial value + * 7. 
DESTROY: Clean up resources + */ +class PythonUDAFClient : public PythonClient { +public: + // UDAF operation types + enum class UDAFOperation : uint8_t { + CREATE = 0, // Create new aggregate state + ACCUMULATE = 1, // Add input rows to state + SERIALIZE = 2, // Serialize state for shuffle + MERGE = 3, // Merge two states + FINALIZE = 4, // Get final result + RESET = 5, // Reset state + DESTROY = 6 // Destroy state + }; + + PythonUDAFClient() = default; + + ~PythonUDAFClient() override { + // Clean up all remaining states on destruction + auto st = close(); + if (!st.ok()) { + LOG(WARNING) << "Failed to close PythonUDAFClient in destructor: " << st.to_string(); + } + } + + static Status create(const PythonUDFMeta& func_meta, ProcessPtr process, + const std::shared_ptr& data_schema, + PythonUDAFClientPtr* client); + + /** + * Initialize UDAF client with data schema + * Overrides base class to set _schema before initialization + * @param func_meta Function metadata + * @param process Python process handle + * @param data_schema Arrow schema for UDAF data + * @return Status + */ + Status init(const PythonUDFMeta& func_meta, ProcessPtr process, + const std::shared_ptr& data_schema); + + /** + * Create aggregate state for a place + * @param place_id Unique identifier for the aggregate state + * @return Status + */ + Status create(int64_t place_id); + + /** + * Accumulate input data into aggregate state + * + * For single-place mode (is_single_place=true): + * - input RecordBatch contains only data columns + * - All rows are accumulated to the same place_id + * + * For multi-place mode (is_single_place=false): + * - input RecordBatch MUST contain a "places" column (int64) as the last column + * - The "places" column indicates which place each row belongs to + * - place_id parameter is ignored (set to 0 by convention) + * + * @param place_id Aggregate state identifier (used only in single-place mode) + * @param is_single_place Whether all rows go to single place + * @param input Input data batch (must contain "places" column if is_single_place=false) + * @param row_start Start row index + * @param row_end End row index (exclusive) + * @return Status + */ + Status accumulate(int64_t place_id, bool is_single_place, const arrow::RecordBatch& input, + int64_t row_start, int64_t row_end); + + /** + * Serialize aggregate state for shuffle/merge + * @param place_id Aggregate state identifier + * @param serialized_state Output serialized state + * @return Status + */ + Status serialize(int64_t place_id, std::shared_ptr* serialized_state); + + /** + * Merge another serialized state into current state + * @param place_id Target aggregate state identifier + * @param serialized_state Serialized state to merge + * @return Status + */ + Status merge(int64_t place_id, const std::shared_ptr& serialized_state); + + /** + * Get final result from aggregate state + * @param place_id Aggregate state identifier + * @param output Output result + * @return Status + */ + Status finalize(int64_t place_id, std::shared_ptr* output); + + /** + * Reset aggregate state to initial value + * @param place_id Aggregate state identifier + * @return Status + */ + Status reset(int64_t place_id); + + /** + * Destroy aggregate state and free resources + * @param place_id Aggregate state identifier + * @return Status + */ + Status destroy(int64_t place_id); + + /** + * Close client connection and cleanup + * Overrides base class to destroy the tracked place first + * @return Status + */ + Status close(); + +private: + 
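+    // Copy/assign are disabled: each client owns a single Flight stream and mutex-protected per-stream state.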
DISALLOW_COPY_AND_ASSIGN(PythonUDAFClient); + + /** + * Send RecordBatch request to Python server with app_metadata + * @param metadata UDAFMetadata structure (will be sent as app_metadata) + * @param request_batch Request RecordBatch (contains data columns + binary_data column) + * @param response_batch Output RecordBatch + * @return Status + */ + Status _send_request(const UDAFMetadata& metadata, + const std::shared_ptr& request_batch, + std::shared_ptr* response_batch); + + /** + * Create request batch with data columns (for ACCUMULATE) + * Appends NULL binary_data column to input data batch + */ + Status _create_data_request_batch(const arrow::RecordBatch& input_data, + std::shared_ptr* out); + + /** + * Create request batch with binary data (for MERGE) + * Creates NULL data columns + binary_data column + */ + Status _create_binary_request_batch(const std::shared_ptr& binary_data, + std::shared_ptr* out); + + /** + * Get or create empty request batch (for CREATE/SERIALIZE/FINALIZE/RESET/DESTROY) + * All columns are NULL. Cached after first creation for reuse. + */ + Status _get_empty_request_batch(std::shared_ptr* out); + + // Arrow Flight schema: [argument_types..., places: int64, binary_data: binary] + std::shared_ptr _schema; + std::shared_ptr _empty_request_batch; + // Track created state for cleanup + std::optional _created_place_id; + // Thread safety: protect gRPC stream operations + // CRITICAL: gRPC ClientReaderWriter does NOT support concurrent Write() calls + // Even within same thread, multiple pipeline tasks may trigger concurrent operations + // (e.g., normal accumulate() + cleanup destroy() during task finalization) + mutable std::mutex _operation_mutex; +}; + +} // namespace doris diff --git a/be/src/udf/python/python_udf_client.cpp b/be/src/udf/python/python_udf_client.cpp new file mode 100644 index 00000000000000..368808a8a48040 --- /dev/null +++ b/be/src/udf/python/python_udf_client.cpp @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
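+
+// PythonUDFClient is intentionally thin: evaluate() drives the base class's
+// begin_stream() / write_batch() / read_batch() helpers for the stateless
+// request/response UDF path.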
+ +#include "udf/python/python_udf_client.h" + +#include "common/status.h" + +namespace doris { + +Status PythonUDFClient::create(const PythonUDFMeta& func_meta, ProcessPtr process, + PythonUDFClientPtr* client) { + PythonUDFClientPtr python_udf_client = std::make_shared(); + RETURN_IF_ERROR(python_udf_client->init(func_meta, std::move(process))); + *client = std::move(python_udf_client); + return Status::OK(); +} + +Status PythonUDFClient::evaluate(const arrow::RecordBatch& input, + std::shared_ptr* output) { + RETURN_IF_ERROR(begin_stream(input.schema())); + RETURN_IF_ERROR(write_batch(input)); + RETURN_IF_ERROR(read_batch(output)); + return Status::OK(); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/udf/python/python_udf_client.h b/be/src/udf/python/python_udf_client.h new file mode 100644 index 00000000000000..969dd8d02d3cbd --- /dev/null +++ b/be/src/udf/python/python_udf_client.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "udf/python/python_client.h" + +namespace doris { + +class PythonUDFClient; + +using PythonUDFClientPtr = std::shared_ptr; + +/** + * Python UDF Client + * + * Implements standard UDF (User-Defined Function) pattern with a single evaluation function: + * - evaluate_func(*args): Process input arguments and return result + * + * UDF Characteristics: + * - Takes scalar or column inputs + * - Returns scalar or column outputs + * - Stateless evaluation (each call is independent) + * - Simple input-output transformation + * + * Example: + * ```python + * def evaluate_func(x, y): + * # Add two numbers + * return x + y + * ``` + * + * Communication protocol with Python server: + * 1. Send input batch (RecordBatch with N rows) + * 2. Python calls evaluate_func() for each row (or vectorized) + * 3. 
Receive output batch (RecordBatch with N rows) + */ +class PythonUDFClient : public PythonClient { +public: + PythonUDFClient() = default; + ~PythonUDFClient() override = default; + + static Status create(const PythonUDFMeta& func_meta, ProcessPtr process, + PythonUDFClientPtr* client); + + /** + * Evaluate UDF on input rows + * + * @param input Input row batch (columns = UDF function parameters) + * @param output Output row batch (single column = UDF return value) + * @return Status + */ + Status evaluate(const arrow::RecordBatch& input, std::shared_ptr* output); + +private: + DISALLOW_COPY_AND_ASSIGN(PythonUDFClient); +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/udf/python/python_udf_meta.cpp b/be/src/udf/python/python_udf_meta.cpp new file mode 100644 index 00000000000000..85428b4033bbb5 --- /dev/null +++ b/be/src/udf/python/python_udf_meta.cpp @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "udf/python/python_udf_meta.h" + +#include +#include +#include +#include + +#include + +#include "common/status.h" +#include "util/arrow/utils.h" +#include "util/string_util.h" + +namespace doris { + +Status PythonUDFMeta::convert_types_to_schema(const vectorized::DataTypes& types, + const std::string& timezone, + std::shared_ptr* schema) { + assert(!types.empty()); + arrow::SchemaBuilder builder; + for (size_t i = 0; i < types.size(); ++i) { + std::shared_ptr arrow_type; + RETURN_IF_ERROR(convert_to_arrow_type(types[i], &arrow_type, timezone)); + std::shared_ptr field = std::make_shared( + "arg" + std::to_string(i), arrow_type, types[i]->is_nullable()); + RETURN_DORIS_STATUS_IF_ERROR(builder.AddField(field)); + } + RETURN_DORIS_STATUS_IF_RESULT_ERROR(schema, builder.Finish()); + return Status::OK(); +} + +Status PythonUDFMeta::serialize_arrow_schema(const std::shared_ptr& schema, + std::shared_ptr* out) { + RETURN_DORIS_STATUS_IF_RESULT_ERROR( + out, arrow::ipc::SerializeSchema(*schema, arrow::default_memory_pool())); + return Status::OK(); +} + +/* + json format: + { + "name": "xxx", + "symbol": "xxx", + "location": "xxx", + "udf_load_type": 0 or 1, + "client_type": 0 (UDF) or 1 (UDAF) or 2 (UDTF), + "runtime_version": "x.xx.xx", + "always_nullable": true, + "inline_code": "base64_inline_code", + "input_types": "base64_input_types", + "return_type": "base64_return_type" + } +*/ +Status PythonUDFMeta::serialize_to_json(std::string* json_str) const { + rapidjson::Document doc; + doc.SetObject(); + auto& allocator = doc.GetAllocator(); + doc.AddMember("name", rapidjson::Value().SetString(name.c_str(), allocator), allocator); + doc.AddMember("symbol", rapidjson::Value().SetString(symbol.c_str(), allocator), allocator); + doc.AddMember("location", rapidjson::Value().SetString(location.c_str(), 
allocator), allocator); + doc.AddMember("udf_load_type", rapidjson::Value().SetInt(static_cast(type)), allocator); + doc.AddMember("client_type", rapidjson::Value().SetInt(static_cast(client_type)), + allocator); + doc.AddMember("runtime_version", + rapidjson::Value().SetString(runtime_version.c_str(), allocator), allocator); + doc.AddMember("always_nullable", rapidjson::Value().SetBool(always_nullable), allocator); + + { + // Serialize base64 inline code to json + std::string base64_str = arrow::util::base64_encode(inline_code); + doc.AddMember("inline_code", rapidjson::Value().SetString(base64_str.c_str(), allocator), + allocator); + } + { + // Serialize base64 input types to json + std::shared_ptr input_schema; + RETURN_IF_ERROR(convert_types_to_schema(input_types, TimezoneUtils::default_time_zone, + &input_schema)); + std::shared_ptr input_schema_buffer; + RETURN_IF_ERROR(serialize_arrow_schema(input_schema, &input_schema_buffer)); + std::string base64_str = + arrow::util::base64_encode({input_schema_buffer->data_as(), + static_cast(input_schema_buffer->size())}); + doc.AddMember("input_types", rapidjson::Value().SetString(base64_str.c_str(), allocator), + allocator); + } + { + // Serialize base64 return type to json + std::shared_ptr return_schema; + RETURN_IF_ERROR(convert_types_to_schema({return_type}, TimezoneUtils::default_time_zone, + &return_schema)); + std::shared_ptr return_schema_buffer; + RETURN_IF_ERROR(serialize_arrow_schema(return_schema, &return_schema_buffer)); + std::string base64_str = + arrow::util::base64_encode({return_schema_buffer->data_as(), + static_cast(return_schema_buffer->size())}); + doc.AddMember("return_type", rapidjson::Value().SetString(base64_str.c_str(), allocator), + allocator); + } + + // Convert document to json string + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + doc.Accept(writer); + *json_str = std::string(buffer.GetString(), buffer.GetSize()); + return Status::OK(); +} + +std::string PythonUDFMeta::to_string() const { + std::stringstream input_types_ss; + input_types_ss << "<"; + for (size_t i = 0; i < input_types.size(); ++i) { + input_types_ss << input_types[i]->get_name(); + if (i != input_types.size() - 1) { + input_types_ss << ", "; + } + } + input_types_ss << ">"; + return fmt::format( + "[name: {}, symbol: {}, location: {}, runtime_version: {}, always_nullable: {}, " + "inline_code: {}][input_types: {}][return_type: {}]", + name, symbol, location, runtime_version, always_nullable, inline_code, + input_types_ss.str(), return_type->get_name()); +} + +Status PythonUDFMeta::check() const { + if (trim(name).empty()) { + return Status::InvalidArgument("Python UDF name is empty"); + } + + if (trim(symbol).empty()) { + return Status::InvalidArgument("Python UDF symbol is empty"); + } + + if (trim(runtime_version).empty()) { + return Status::InvalidArgument("Python UDF runtime version is empty"); + } + + if (input_types.empty()) { + return Status::InvalidArgument("Python UDF input types is empty"); + } + + if (!return_type) { + return Status::InvalidArgument("Python UDF return type is empty"); + } + + if (type == PythonUDFLoadType::UNKNOWN) { + return Status::InvalidArgument( + "Python UDF load type is invalid, please check inline code or file path"); + } + + if (type == PythonUDFLoadType::MODULE) { + if (trim(location).empty()) { + return Status::InvalidArgument("Non-inline Python UDF location is empty"); + } + if (trim(checksum).empty()) { + return Status::InvalidArgument("Non-inline Python UDF checksum is empty"); + 
+        }
+    }
+
+    return Status::OK();
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_meta.h b/be/src/udf/python/python_udf_meta.h
new file mode 100644
index 00000000000000..800f14b47df7c7
--- /dev/null
+++ b/be/src/udf/python/python_udf_meta.h
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+
+#include "arrow/buffer.h"
+#include "arrow/flight/client.h"
+#include "arrow/flight/server.h"
+#include "common/status.h"
+#include "util/arrow/row_batch.h"
+#include "vec/data_types/data_type.h"
+
+namespace doris {
+
+enum class PythonUDFLoadType : uint8_t { INLINE = 0, MODULE = 1, UNKNOWN = 2 };
+
+enum class PythonClientType : uint8_t { UDF = 0, UDAF = 1, UDTF = 2, UNKNOWN = 3 };
+
+struct PythonUDFMeta {
+    int64_t id;
+    std::string name;
+    std::string symbol;
+    std::string location;
+    std::string checksum;
+    std::string runtime_version;
+    std::string inline_code;
+    bool always_nullable;
+    vectorized::DataTypes input_types;
+    vectorized::DataTypePtr return_type;
+    PythonUDFLoadType type;
+    PythonClientType client_type;
+
+    static Status convert_types_to_schema(const vectorized::DataTypes& types,
+                                          const std::string& timezone,
+                                          std::shared_ptr<arrow::Schema>* schema);
+
+    static Status serialize_arrow_schema(const std::shared_ptr<arrow::Schema>& schema,
+                                         std::shared_ptr<arrow::Buffer>* out);
+
+    Status serialize_to_json(std::string* json_str) const;
+
+    std::string to_string() const;
+
+    Status check() const;
+
+    bool operator==(const PythonUDFMeta& other) const { return id == other.id; }
+};
+
+} // namespace doris
+
+namespace std {
+template <>
+struct hash<doris::PythonUDFMeta> {
+    size_t operator()(const doris::PythonUDFMeta& meta) const {
+        return std::hash<int64_t>()(meta.id);
+    }
+};
+} // namespace std
\ No newline at end of file
diff --git a/be/src/udf/python/python_udf_runtime.cpp b/be/src/udf/python/python_udf_runtime.cpp
new file mode 100644
index 00000000000000..9d687c43f40dd6
--- /dev/null
+++ b/be/src/udf/python/python_udf_runtime.cpp
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "udf/python/python_udf_runtime.h" + +#include +#include +#include + +#include + +#include "common/logging.h" + +namespace doris { + +void PythonUDFProcess::remove_unix_socket() { + if (_uri.empty() || _unix_socket_file_path.empty()) return; + + if (unlink(_unix_socket_file_path.c_str()) == 0) { + LOG(INFO) << "Successfully removed unix socket: " << _unix_socket_file_path; + return; + } + + if (errno == ENOENT) { + // File does not exist, this is fine, no need to warn + LOG(INFO) << "Unix socket not found (already removed): " << _uri; + } else { + LOG(WARNING) << "Failed to remove unix socket " << _uri << ": " << std::strerror(errno) + << " (errno=" << errno << ")"; + } +} + +void PythonUDFProcess::shutdown() { + if (!_child.valid() || _is_shutdown) return; + + _child.terminate(); + bool graceful = false; + constexpr std::chrono::milliseconds retry_interval(100); // 100ms + + for (int i = 0; i < TERMINATE_RETRY_TIMES; ++i) { + if (!_child.running()) { + graceful = true; + break; + } + std::this_thread::sleep_for(retry_interval); + } + + if (!graceful) { + LOG(WARNING) << "Python process did not terminate gracefully, sending SIGKILL"; + ::kill(_child_pid, SIGKILL); + _child.wait(); + } + + if (int exit_code = _child.exit_code(); exit_code > 128 && exit_code <= 255) { + int signal = exit_code - 128; + LOG(INFO) << "Python process was killed by signal " << signal; + } else { + LOG(INFO) << "Python process exited normally with code: " << exit_code; + } + + _output_stream.close(); + remove_unix_socket(); + _is_shutdown = true; +} + +std::string PythonUDFProcess::to_string() const { + return fmt::format( + "PythonUDFProcess(child_pid={}, uri={}, " + "unix_socket_file_path={}, is_shutdown={})", + _child_pid, _uri, _unix_socket_file_path, _is_shutdown); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/udf/python/python_udf_runtime.h b/be/src/udf/python/python_udf_runtime.h new file mode 100644 index 00000000000000..17acaf46ac765e --- /dev/null +++ b/be/src/udf/python/python_udf_runtime.h @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
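+
+// Helpers for building the Flight server script path and per-process Unix
+// socket paths, plus PythonUDFProcess, which owns the child interpreter's
+// lifetime (shutdown and socket cleanup).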
+ +#pragma once + +#include + +#include "python_env.h" + +namespace doris { + +static const char* UNIX_SOCKET_PREFIX = "grpc+unix://"; +static const char* BASE_UNIX_SOCKET_PATH_TEMPLATE = "{}{}/lib/udf/python/python_udf"; +static const char* UNIX_SOCKET_PATH_TEMPLATE = "{}_{}.sock"; +static const char* FLIGHT_SERVER_PATH_TEMPLATE = "{}/plugins/python_udf/{}"; +static const char* FLIGHT_SERVER_FILENAME = "python_server.py"; + +inline std::string get_base_unix_socket_path() { + return fmt::format(BASE_UNIX_SOCKET_PATH_TEMPLATE, UNIX_SOCKET_PREFIX, + std::getenv("DORIS_HOME")); +} + +inline std::string get_unix_socket_path(pid_t child_pid) { + return fmt::format(UNIX_SOCKET_PATH_TEMPLATE, get_base_unix_socket_path(), child_pid); +} + +inline std::string get_unix_socket_file_path(pid_t child_pid) { + return fmt::format(UNIX_SOCKET_PATH_TEMPLATE, + fmt::format(BASE_UNIX_SOCKET_PATH_TEMPLATE, "", std::getenv("DORIS_HOME")), + child_pid); +} + +inline std::string get_fight_server_path() { + return fmt::format(FLIGHT_SERVER_PATH_TEMPLATE, std::getenv("DORIS_HOME"), + FLIGHT_SERVER_FILENAME); +} + +class PythonUDFProcess; + +using ProcessPtr = std::shared_ptr; + +class PythonUDFProcess { +public: + PythonUDFProcess(boost::process::child child, boost::process::ipstream output_stream) + : _is_shutdown(false), + _child_pid(child.id()), + _uri(get_unix_socket_path(_child_pid)), + _unix_socket_file_path(get_unix_socket_file_path(_child_pid)), + _child(std::move(child)), + _output_stream(std::move(output_stream)) {} + + ~PythonUDFProcess() { shutdown(); } + + std::string get_uri() const { return _uri; } + + const std::string& get_socket_file_path() const { return _unix_socket_file_path; } + + bool is_shutdown() const { return _is_shutdown; } + + bool is_alive() const { return !_is_shutdown && _child.running(); } + + void remove_unix_socket(); + + void shutdown(); + + std::string to_string() const; + + pid_t get_child_pid() const { return _child_pid; } + + bool operator==(const PythonUDFProcess& other) const { return _child_pid == other._child_pid; } + + bool operator!=(const PythonUDFProcess& other) const { return !(*this == other); } + +private: + constexpr static int TERMINATE_RETRY_TIMES = 10; + constexpr static size_t MAX_ACCUMULATED_LOG_SIZE = 65536; + + bool _is_shutdown {false}; + pid_t _child_pid; + std::string _uri; + std::string _unix_socket_file_path; + mutable boost::process::child _child; + boost::process::ipstream _output_stream; + std::string _accumulated_log; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/udf/python/python_udtf_client.cpp b/be/src/udf/python/python_udtf_client.cpp new file mode 100644 index 00000000000000..a14cc28e423d7d --- /dev/null +++ b/be/src/udf/python/python_udtf_client.cpp @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "udf/python/python_udtf_client.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/record_batch.h" +#include "arrow/type.h" +#include "common/status.h" + +namespace doris { + +Status PythonUDTFClient::create(const PythonUDFMeta& func_meta, ProcessPtr process, + PythonUDTFClientPtr* client) { + PythonUDTFClientPtr python_udtf_client = std::make_shared(); + RETURN_IF_ERROR(python_udtf_client->init(func_meta, std::move(process))); + *client = std::move(python_udtf_client); + return Status::OK(); +} + +Status PythonUDTFClient::evaluate(const arrow::RecordBatch& input, + std::shared_ptr* list_array) { + RETURN_IF_ERROR(begin_stream(input.schema())); + RETURN_IF_ERROR(write_batch(input)); + + // Read the response (ListArray-based) + std::shared_ptr response_batch; + RETURN_IF_ERROR(read_batch(&response_batch)); + + // Validate response structure: should have a single ListArray column + if (response_batch->num_columns() != 1) { + return Status::InternalError( + fmt::format("Invalid UDTF response: expected 1 column (ListArray), got {}", + response_batch->num_columns())); + } + + auto list_array_ptr = response_batch->column(0); + if (list_array_ptr->type_id() != arrow::Type::LIST) { + return Status::InternalError( + fmt::format("Invalid UDTF response: expected ListArray, got type {}", + list_array_ptr->type()->ToString())); + } + + *list_array = std::static_pointer_cast(list_array_ptr); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/udf/python/python_udtf_client.h b/be/src/udf/python/python_udtf_client.h new file mode 100644 index 00000000000000..35ae29d1b7ae4d --- /dev/null +++ b/be/src/udf/python/python_udtf_client.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "udf/python/python_client.h" + +namespace doris { + +class PythonUDTFClient; + +using PythonUDTFClientPtr = std::shared_ptr; + +/** + * Python UDTF Client + * + * Implements simplified UDTF (User-Defined Table Function): + * + * Handler Function: + * - evaluate_func(*args): Process input arguments and yield output rows + * + * UDTF Characteristics: + * - Takes scalar or table inputs + * - Returns table (multiple rows) + * - Simple yield pattern + * + * Example: + * ```python + * def evaluate_func(text, delimiter): + * # Split string by delimiter and return multiple results + * for item in text.split(delimiter): + * # or yield (item, ) + * yield item + * ``` + * + * Communication protocol with Python server: + * 1. Send input row batch to Python + * 2. Python calls evaluate_func() for each input row + * 3. 
Collect all output rows and return + */ +class PythonUDTFClient : public PythonClient { +public: + PythonUDTFClient() = default; + ~PythonUDTFClient() override = default; + + static Status create(const PythonUDFMeta& func_meta, ProcessPtr process, + PythonUDTFClientPtr* client); + + /** + * Evaluate UDTF on input rows + * + * Protocol (ListArray-based): + * Python server returns a RecordBatch with 1 column: + * - Column 0: ListArray where each list element corresponds to one input row's outputs + * + * Example: + * Input: 3 rows + * Output ListArray: + * [0]: [val1, val2, val3] (3 elements for input row 0) + * [1]: [] (0 elements for input row 1) + * [2]: [val4, val5, val6, val7] (4 elements for input row 2) + * + * @param input Input row batch (columns = UDTF function parameters) + * @param list_array Output ListArray (length = num_input_rows) + * @return Status + */ + Status evaluate(const arrow::RecordBatch& input, std::shared_ptr<arrow::ListArray>* list_array); + +private: + DISALLOW_COPY_AND_ASSIGN(PythonUDTFClient); +}; + +} // namespace doris diff --git a/be/src/util/arrow/block_convertor.cpp b/be/src/util/arrow/block_convertor.cpp index 0da5c2261f74d8..daf8151527da6a 100644 --- a/be/src/util/arrow/block_convertor.cpp +++ b/be/src/util/arrow/block_convertor.cpp @@ -53,49 +53,30 @@ class Array; namespace doris { #include "common/compile_check_begin.h" -class FromBlockConverter { -public: - FromBlockConverter(const vectorized::Block& block, const std::shared_ptr<arrow::Schema>& schema, - arrow::MemoryPool* pool, const cctz::time_zone& timezone_obj) - : _block(block), - _schema(schema), - _pool(pool), - _cur_field_idx(-1), - _timezone_obj(timezone_obj) {} - - ~FromBlockConverter() = default; - - Status convert(std::shared_ptr<arrow::RecordBatch>* out); - -private: - const vectorized::Block& _block; - const std::shared_ptr<arrow::Schema>& _schema; - arrow::MemoryPool* _pool; - - size_t _cur_field_idx; - size_t _cur_start; - size_t _cur_rows; - vectorized::ColumnPtr _cur_col; - vectorized::DataTypePtr _cur_type; - arrow::ArrayBuilder* _cur_builder = nullptr; - - const cctz::time_zone& _timezone_obj; - - std::vector<std::shared_ptr<arrow::Array>> _arrays; -}; - -Status FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) { +Status FromBlockToRecordBatchConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) { int num_fields = _schema->num_fields(); if (_block.columns() != num_fields) { return Status::InvalidArgument("number fields not match"); } + // Calculate actual row range to convert + size_t actual_start = _row_range_start; + size_t actual_rows = _row_range_end > 0 ? 
(_row_range_end - _row_range_start) + : (_block.rows() - _row_range_start); + + // Validate range + if (actual_start + actual_rows > _block.rows()) { + return Status::InvalidArgument( + "Row range out of bounds: start={}, num_rows={}, block_rows={}", actual_start, + actual_rows, _block.rows()); + } + _arrays.resize(num_fields); for (int idx = 0; idx < num_fields; ++idx) { _cur_field_idx = idx; - _cur_start = 0; - _cur_rows = _block.rows(); + _cur_start = actual_start; + _cur_rows = actual_rows; _cur_col = _block.get_by_position(idx).column; _cur_type = _block.get_by_position(idx).type; auto column = _cur_col->convert_to_full_column_if_const(); @@ -123,7 +104,31 @@ Status FromBlockConverter::convert(std::shared_ptr<arrow::RecordBatch>* out) { return to_doris_status(arrow_st); } } - *out = arrow::RecordBatch::Make(_schema, _block.rows(), std::move(_arrays)); + *out = arrow::RecordBatch::Make(_schema, actual_rows, std::move(_arrays)); + return Status::OK(); +} + +Status FromRecordBatchToBlockConverter::convert(vectorized::Block* block) { + DCHECK(block); + int num_fields = _batch->num_columns(); + if ((size_t)num_fields != _types.size()) { + return Status::InvalidArgument("number fields not match"); + } + + int64_t num_rows = _batch->num_rows(); + _columns.reserve(num_fields); + + for (int idx = 0; idx < num_fields; ++idx) { + auto doris_type = _types[idx]; + auto doris_column = doris_type->create_column(); + auto arrow_column = _batch->column(idx); + DCHECK_EQ(arrow_column->length(), num_rows); + RETURN_IF_ERROR(doris_type->get_serde()->read_column_from_arrow( + *doris_column, &*arrow_column, 0, num_rows, _timezone_obj)); + _columns.emplace_back(std::move(doris_column), std::move(doris_type), std::to_string(idx)); + } + + block->swap(_columns); return Status::OK(); } @@ -131,9 +136,26 @@ Status convert_to_arrow_batch(const vectorized::Block& block, const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool, std::shared_ptr<arrow::RecordBatch>* result, const cctz::time_zone& timezone_obj) { - FromBlockConverter converter(block, schema, pool, timezone_obj); + FromBlockToRecordBatchConverter converter(block, schema, pool, timezone_obj); + return converter.convert(result); +} + +Status convert_to_arrow_batch(const vectorized::Block& block, + const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool, + std::shared_ptr<arrow::RecordBatch>* result, + const cctz::time_zone& timezone_obj, size_t start_row, + size_t end_row) { + FromBlockToRecordBatchConverter converter(block, schema, pool, timezone_obj, start_row, + end_row); return converter.convert(result); } +Status convert_from_arrow_batch(const std::shared_ptr<arrow::RecordBatch>& batch, + const vectorized::DataTypes& types, vectorized::Block* block, + const cctz::time_zone& timezone_obj) { + FromRecordBatchToBlockConverter converter(batch, types, timezone_obj); + return converter.convert(block); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/util/arrow/block_convertor.h b/be/src/util/arrow/block_convertor.h index 6c3163b05de3e2..5bf9e2665e8bc5 100644 --- a/be/src/util/arrow/block_convertor.h +++ b/be/src/util/arrow/block_convertor.h @@ -22,7 +22,9 @@ #include #include "common/status.h" +#include "vec/columns/column.h" #include "vec/core/block.h" +#include "vec/data_types/data_type.h" // This file will convert Doris Block to/from Arrow's RecordBatch // Block is used by Doris query engine to exchange data between @@ -38,9 +40,87 @@ class Schema; namespace doris { +class FromBlockToRecordBatchConverter { +public: + FromBlockToRecordBatchConverter(const vectorized::Block& block, + const 
std::shared_ptr<arrow::Schema>& schema, + arrow::MemoryPool* pool, const cctz::time_zone& timezone_obj) + : _block(block), + _schema(schema), + _pool(pool), + _cur_field_idx(-1), + _timezone_obj(timezone_obj), + _row_range_start(0), + _row_range_end(0) {} + + FromBlockToRecordBatchConverter(const vectorized::Block& block, + const std::shared_ptr<arrow::Schema>& schema, + arrow::MemoryPool* pool, const cctz::time_zone& timezone_obj, + size_t start_row, size_t end_row) + : _block(block), + _schema(schema), + _pool(pool), + _cur_field_idx(-1), + _timezone_obj(timezone_obj), + _row_range_start(start_row), + _row_range_end(end_row) {} + + ~FromBlockToRecordBatchConverter() = default; + + Status convert(std::shared_ptr<arrow::RecordBatch>* out); + +private: + const vectorized::Block& _block; + const std::shared_ptr<arrow::Schema>& _schema; + arrow::MemoryPool* _pool; + + size_t _cur_field_idx; + size_t _cur_start; + size_t _cur_rows; + vectorized::ColumnPtr _cur_col; + vectorized::DataTypePtr _cur_type; + arrow::ArrayBuilder* _cur_builder = nullptr; + + const cctz::time_zone& _timezone_obj; + + // Row range for zero-copy slicing (0 means use all rows from _row_range_start) + size_t _row_range_start; + size_t _row_range_end; + + std::vector<std::shared_ptr<arrow::Array>> _arrays; +}; + +class FromRecordBatchToBlockConverter { +public: + FromRecordBatchToBlockConverter(const std::shared_ptr<arrow::RecordBatch>& batch, + const vectorized::DataTypes& types, + const cctz::time_zone& timezone_obj) + : _batch(batch), _types(types), _timezone_obj(timezone_obj) {} + + ~FromRecordBatchToBlockConverter() = default; + + Status convert(vectorized::Block* block); + +private: + const std::shared_ptr<arrow::RecordBatch>& _batch; + const vectorized::DataTypes& _types; + const cctz::time_zone& _timezone_obj; + vectorized::ColumnsWithTypeAndName _columns; +}; + Status convert_to_arrow_batch(const vectorized::Block& block, const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool, std::shared_ptr<arrow::RecordBatch>* result, const cctz::time_zone& timezone_obj); +Status convert_to_arrow_batch(const vectorized::Block& block, + const std::shared_ptr<arrow::Schema>& schema, arrow::MemoryPool* pool, + std::shared_ptr<arrow::RecordBatch>* result, + const cctz::time_zone& timezone_obj, size_t start_row, + size_t end_row); + +Status convert_from_arrow_batch(const std::shared_ptr<arrow::RecordBatch>& batch, + const vectorized::DataTypes& types, vectorized::Block* block, + const cctz::time_zone& timezone_obj); + } // namespace doris diff --git a/be/src/util/arrow/utils.h b/be/src/util/arrow/utils.h index 0a731bafbd5b5c..7794906b384475 100644 --- a/be/src/util/arrow/utils.h +++ b/be/src/util/arrow/utils.h @@ -17,8 +17,11 @@ #pragma once +#include + #include +#include "common/compiler_util.h" #include "common/status.h" // This files contains some utilities to convert Doris internal @@ -72,4 +75,33 @@ Status arrow_pretty_print(const arrow::Array& rb, std::ostream* os); Status to_doris_status(const arrow::Status& status); arrow::Status to_arrow_status(const Status& status); +template <typename T> +inline void assign_from_result(T& output, const arrow::Result<T>& result) { + output = *result; +} + +template <typename T> +inline void assign_from_result(T& output, arrow::Result<T>&& result) { + output = std::move(*result); +} + +template <typename T> +inline void assign_from_result(T* output, const arrow::Result<T>& result) { + *output = *result; +} + +template <typename T> +inline void assign_from_result(T* output, arrow::Result<T>&& result) { + *output = std::move(*result); +} + +#define RETURN_DORIS_STATUS_IF_RESULT_ERROR(output, result_expr) \ + do { \ + auto&& _result_ = (result_expr); \ + if (UNLIKELY(!_result_.ok())) { \ + return to_doris_status(_result_.status()); \ + 
assign_from_result(output, std::forward<decltype(_result_)>(_result_)); \ + } while (0) + } // namespace doris diff --git a/be/src/vec/aggregate_functions/aggregate_function_python_udaf.cpp b/be/src/vec/aggregate_functions/aggregate_function_python_udaf.cpp new file mode 100644 index 00000000000000..3904ccc5fa8732 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_python_udaf.cpp @@ -0,0 +1,430 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/aggregate_functions/aggregate_function_python_udaf.h" + +#include +#include +#include +#include +#include + +#include "common/exception.h" +#include "common/logging.h" +#include "runtime/define_primitive_type.h" +#include "runtime/user_function_cache.h" +#include "udf/python/python_env.h" +#include "udf/python/python_server.h" +#include "util/arrow/block_convertor.h" +#include "util/arrow/row_batch.h" +#include "util/timezone_utils.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_vector.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/data_type_nullable.h" + +namespace doris::vectorized { + +Status AggregatePythonUDAFData::create(int64_t place) { + DCHECK(client) << "Client must be set before calling create"; + RETURN_IF_ERROR(client->create(place)); + return Status::OK(); +} + +Status AggregatePythonUDAFData::add(int64_t place_id, const IColumn** columns, + int64_t row_num_start, int64_t row_num_end, + const DataTypes& argument_types) { + DCHECK(client) << "Client must be set before calling add"; + + // Zero-copy: Use full columns with range specification + Block input_block; + for (size_t i = 0; i < argument_types.size(); ++i) { + input_block.insert( + ColumnWithTypeAndName(columns[i]->get_ptr(), argument_types[i], std::to_string(i))); + } + + std::shared_ptr<arrow::Schema> schema; + RETURN_IF_ERROR( + get_arrow_schema_from_block(input_block, &schema, TimezoneUtils::default_time_zone)); + cctz::time_zone timezone_obj; + TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, timezone_obj); + + std::shared_ptr<arrow::RecordBatch> batch; + // Zero-copy: convert only the specified range + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(), + &batch, timezone_obj, row_num_start, row_num_end)); + // Send the batch (already sliced in convert_to_arrow_batch) + // Single place mode: no places column needed + RETURN_IF_ERROR(client->accumulate(place_id, true, *batch, 0, batch->num_rows())); + return Status::OK(); +} + +Status AggregatePythonUDAFData::add_batch(AggregateDataPtr* places, size_t place_offset, + size_t num_rows, const IColumn** columns, + const DataTypes& argument_types, size_t start, + size_t end) { + DCHECK(client) << "Client must be set before calling add_batch"; + DCHECK(end > 
start) << "end must be greater than start"; + DCHECK(end <= num_rows) << "end must not exceed num_rows"; + + size_t slice_rows = end - start; + Block input_block; + for (size_t i = 0; i < argument_types.size(); ++i) { + DCHECK(columns[i]->size() == num_rows) << "Column size must match num_rows"; + input_block.insert( + ColumnWithTypeAndName(columns[i]->get_ptr(), argument_types[i], std::to_string(i))); + } + + auto places_col = ColumnInt64::create(num_rows); + auto& places_data = places_col->get_data(); + + // Fill places column with place IDs for the slice [start, end) + for (size_t i = start; i < end; ++i) { + places_data[i] = reinterpret_cast(places[i] + place_offset); + } + + static DataTypePtr places_type = + DataTypeFactory::instance().create_data_type(PrimitiveType::TYPE_BIGINT, false); + input_block.insert(ColumnWithTypeAndName(std::move(places_col), places_type, "places")); + + std::shared_ptr schema; + RETURN_IF_ERROR( + get_arrow_schema_from_block(input_block, &schema, TimezoneUtils::default_time_zone)); + cctz::time_zone timezone_obj; + TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, timezone_obj); + + std::shared_ptr batch; + // Zero-copy: convert only the [start, end) range + // This slice includes the places column automatically + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(), + &batch, timezone_obj, start, end)); + // Send entire batch (already contains places column) to Python + // place_id=0 is ignored when is_single_place=false + RETURN_IF_ERROR(client->accumulate(0, false, *batch, 0, slice_rows)); + return Status::OK(); +} + +Status AggregatePythonUDAFData::merge(const AggregatePythonUDAFData& rhs, int64_t place) { + DCHECK(client) << "Client must be set before calling merge"; + + // Get serialized state from rhs (already stored in serialize_data by read()) + auto serialized_state = arrow::Buffer::Wrap( + reinterpret_cast(rhs.serialize_data.data()), rhs.serialize_data.size()); + RETURN_IF_ERROR(client->merge(place, serialized_state)); + return Status::OK(); +} + +Status AggregatePythonUDAFData::write(BufferWritable& buf, int64_t place) const { + DCHECK(client) << "Client must be set before calling write"; + + // Serialize state from Python server + std::shared_ptr serialized_state; + RETURN_IF_ERROR(client->serialize(place, &serialized_state)); + const char* data = reinterpret_cast(serialized_state->data()); + size_t size = serialized_state->size(); + buf.write_binary(StringRef {data, size}); + return Status::OK(); +} + +void AggregatePythonUDAFData::read(BufferReadable& buf) { + // Read serialized state from buffer into serialize_data + // This will be used later by merge() in deserialize_and_merge() + buf.read_binary(serialize_data); +} + +Status AggregatePythonUDAFData::reset(int64_t place) { + DCHECK(client) << "Client must be set before calling reset"; + RETURN_IF_ERROR(client->reset(place)); + // After reset, state still exists but is back to initial state + return Status::OK(); +} + +Status AggregatePythonUDAFData::destroy(int64_t place) { + DCHECK(client) << "Client must be set before calling destroy"; + RETURN_IF_ERROR(client->destroy(place)); + return Status::OK(); +} + +Status AggregatePythonUDAFData::get(IColumn& to, const DataTypePtr& result_type, + int64_t place) const { + DCHECK(client) << "Client must be set before calling get"; + + // Get final result from Python server + std::shared_ptr result; + RETURN_IF_ERROR(client->finalize(place, &result)); + + // Convert Arrow RecordBatch to Block + 
Block result_block; + DataTypes types = {result_type}; + cctz::time_zone timezone_obj; + TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, timezone_obj); + RETURN_IF_ERROR(convert_from_arrow_batch(result, types, &result_block, timezone_obj)); + + // Insert the result value into output column + if (result_block.rows() != 1) { + return Status::InternalError("Expected 1 row in result block, got {}", result_block.rows()); + } + + auto& result_column = result_block.get_by_position(0).column; + to.insert_from(*result_column, 0); + return Status::OK(); +} + +Status AggregatePythonUDAF::open() { + // Build function metadata from TFunction + _func_meta.id = _fn.id; + _func_meta.name = _fn.name.function_name; + + // For UDAF, symbol is in aggregate_fn + if (_fn.__isset.aggregate_fn && _fn.aggregate_fn.__isset.symbol) { + _func_meta.symbol = _fn.aggregate_fn.symbol; + } else { + return Status::InvalidArgument("Python UDAF symbol is not set"); + } + + // Determine load type (inline code or module) + if (!_fn.function_code.empty()) { + _func_meta.type = PythonUDFLoadType::INLINE; + _func_meta.location = "inline"; + _func_meta.inline_code = _fn.function_code; + } else if (!_fn.hdfs_location.empty()) { + _func_meta.type = PythonUDFLoadType::MODULE; + _func_meta.location = _fn.hdfs_location; + _func_meta.checksum = _fn.checksum; + } else { + _func_meta.type = PythonUDFLoadType::UNKNOWN; + _func_meta.location = "unknown"; + } + + _func_meta.input_types = argument_types; + _func_meta.return_type = _return_type; + _func_meta.client_type = PythonClientType::UDAF; + + // Get Python version + if (_fn.__isset.runtime_version && !_fn.runtime_version.empty()) { + RETURN_IF_ERROR(PythonVersionManager::instance().get_version(_fn.runtime_version, + &_python_version)); + } else { + return Status::InvalidArgument("Python UDAF runtime version is not set"); + } + + _func_meta.runtime_version = _python_version.full_version; + RETURN_IF_ERROR(_func_meta.check()); + _func_meta.always_nullable = _return_type->is_nullable(); + + LOG(INFO) << fmt::format("Creating Python UDAF: {}, runtime_version: {}, func_meta: {}", + _fn.name.function_name, _python_version.to_string(), + _func_meta.to_string()); + + if (_func_meta.type == PythonUDFLoadType::MODULE) { + RETURN_IF_ERROR(UserFunctionCache::instance()->get_pypath( + _func_meta.id, _func_meta.location, _func_meta.checksum, &_func_meta.location)); + } + + return Status::OK(); +} + +void AggregatePythonUDAF::create(AggregateDataPtr __restrict place) const { + std::call_once(_schema_init_flag, [this]() { + std::vector> fields; + + std::string timezone = TimezoneUtils::default_time_zone; + for (size_t i = 0; i < argument_types.size(); ++i) { + std::shared_ptr arrow_type; + Status st = convert_to_arrow_type(argument_types[i], &arrow_type, timezone); + if (!st.ok()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Failed to convert argument type {} to Arrow type: {}", i, + st.to_string()); + } + fields.push_back(arrow::field(std::to_string(i), arrow_type)); + } + + // Add places column for GROUP BY aggregation (always included, NULL in single-place mode) + fields.push_back(arrow::field("places", arrow::int64())); + // Add binary_data column for merge operations + fields.push_back(arrow::field("binary_data", arrow::binary())); + _schema = arrow::schema(fields); + }); + + // Initialize the data structure + new (place) Data(); + DCHECK(reinterpret_cast(place)) << "Place must not be null"; + + if (Status st = PythonServerManager::instance().get_client( + 
_func_meta, _python_version, &(this->data(place).client), _schema); + UNLIKELY(!st.ok())) { + this->data(place).~Data(); + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Failed to get Python UDAF client: {}", + st.to_string()); + } + + // Initialize UDAF state in Python server + int64_t place_id = reinterpret_cast(place); + if (Status st = this->data(place).create(place_id); UNLIKELY(!st.ok())) { + this->data(place).~Data(); + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +void AggregatePythonUDAF::destroy(AggregateDataPtr __restrict place) const noexcept { + try { + int64_t place_id = reinterpret_cast(place); + + // Destroy state in Python server + if (this->data(place).client) { + Status st = this->data(place).destroy(place_id); + if (UNLIKELY(!st.ok())) { + LOG(WARNING) << "Failed to destroy Python UDAF state for place_id=" << place_id + << ", function=" << _func_meta.name << ": " << st.to_string(); + } + + this->data(place).client.reset(); + } + + this->data(place).~Data(); + } catch (const std::exception& e) { + LOG(ERROR) << "Exception in AggregatePythonUDAF::destroy: " << e.what(); + } catch (...) { + LOG(ERROR) << "Unknown exception in AggregatePythonUDAF::destroy"; + } +} + +void AggregatePythonUDAF::add(AggregateDataPtr __restrict place, const IColumn** columns, + ssize_t row_num, Arena&) const { + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).add(place_id, columns, row_num, row_num + 1, argument_types); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +void AggregatePythonUDAF::add_batch(size_t batch_size, AggregateDataPtr* places, + size_t place_offset, const IColumn** columns, Arena&, + bool /*agg_many*/) const { + if (batch_size == 0) return; + + size_t start = 0; + while (start < batch_size) { + // Get the starting place for this segment + AggregateDataPtr start_place = places[start] + place_offset; + auto& start_place_data = this->data(start_place); + // Get the process for this segment + const auto* current_process = start_place_data.client->get_process().get(); + + // Scan forward to find the end of this consecutive segment (same process) + size_t end = start + 1; + while (end < batch_size) { + AggregateDataPtr end_place = places[end] + place_offset; + auto& end_place_data = this->data(end_place); + const auto* next_process = end_place_data.client->get_process().get(); + // If different process, end the current segment + if (*next_process != *current_process) break; + ++end; + } + + // Send this segment to Python with zero-copy + // Pass places array and let add_batch construct place_ids on-demand + Status st = start_place_data.add_batch(places, place_offset, batch_size, columns, + argument_types, start, end); + + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Failed to send segment to Python: " + st.to_string()); + } + + start = end; + } +} + +void AggregatePythonUDAF::add_batch_single_place(size_t batch_size, AggregateDataPtr place, + const IColumn** columns, Arena&) const { + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).add(place_id, columns, 0, batch_size, argument_types); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +void AggregatePythonUDAF::add_range_single_place(int64_t partition_start, int64_t partition_end, + int64_t frame_start, int64_t frame_end, + AggregateDataPtr place, const IColumn** columns, + Arena& arena, 
UInt8* current_window_empty, + UInt8* current_window_has_inited) const { + // Calculate actual frame range + frame_start = std::max(frame_start, partition_start); + frame_end = std::min(frame_end, partition_end); + + if (frame_start >= frame_end) { + if (!*current_window_has_inited) { + *current_window_empty = true; + } + return; + } + + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).add(place_id, columns, frame_start, frame_end, argument_types); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } + + *current_window_empty = false; + *current_window_has_inited = true; +} + +void AggregatePythonUDAF::reset(AggregateDataPtr place) const { + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).reset(place_id); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +void AggregatePythonUDAF::merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, + Arena&) const { + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).merge(this->data(rhs), place_id); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +void AggregatePythonUDAF::serialize(ConstAggregateDataPtr __restrict place, + BufferWritable& buf) const { + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).write(buf, place_id); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +void AggregatePythonUDAF::deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, + Arena&) const { + this->data(place).read(buf); +} + +void AggregatePythonUDAF::insert_result_into(ConstAggregateDataPtr __restrict place, + IColumn& to) const { + int64_t place_id = reinterpret_cast(place); + Status st = this->data(place).get(to, _return_type, place_id); + if (UNLIKELY(!st.ok())) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, st.to_string()); + } +} + +} // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_python_udaf.h b/be/src/vec/aggregate_functions/aggregate_function_python_udaf.h new file mode 100644 index 00000000000000..b81388a51eab75 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_python_udaf.h @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
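The write()/read()/merge() trio in the .cpp above carries UDAF state across the shuffle: write() asks the Python server to serialize a place's state, read() only caches the raw bytes on the receiving side, and a later merge() ships those cached bytes back into a live Python state. A minimal sketch of that round trip, under the assumption of already-connected client states and hypothetical `out`/`in` buffers viewing the same bytes (this is not the engine's actual driver code):

```cpp
#include "vec/aggregate_functions/aggregate_function_python_udaf.h"

namespace doris::vectorized {
// Sketch only: how the shuffle path composes, given the methods defined above.
// Buffers, state objects, and place ids are all hypothetical.
Status shuffle_round_trip(AggregatePythonUDAFData& sender, AggregatePythonUDAFData& receiver,
                          BufferWritable& out, BufferReadable& in, int64_t sender_place,
                          int64_t receiver_place) {
    RETURN_IF_ERROR(sender.write(out, sender_place)); // client->serialize(): Python state -> bytes
    AggregatePythonUDAFData scratch;                  // bytes-only holder, no remote state
    scratch.read(in);                                 // cache bytes in scratch.serialize_data
    return receiver.merge(scratch, receiver_place);   // client->merge(): bytes -> receiver's state
}
} // namespace doris::vectorized
```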
+ +#pragma once + +#include "common/status.h" +#include "udf/python/python_env.h" +#include "udf/python/python_udaf_client.h" +#include "udf/python/python_udf_meta.h" +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/columns/column.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +/** + * Aggregate state data for Python UDAF + * + * Python UDAF state is managed remotely (in Python server). + * We cache serialized state for shuffle/merge operations (similar to Java UDAF). + */ +struct AggregatePythonUDAFData { + std::string serialize_data; + PythonUDAFClientPtr client; + + AggregatePythonUDAFData() = default; + + AggregatePythonUDAFData(const AggregatePythonUDAFData& other) + : serialize_data(other.serialize_data), client(other.client) {} + + ~AggregatePythonUDAFData() = default; + + Status create(int64_t place); + + Status add(int64_t place_id, const IColumn** columns, int64_t row_num_start, + int64_t row_num_end, const DataTypes& argument_types); + + Status add_batch(AggregateDataPtr* places, size_t place_offset, size_t num_rows, + const IColumn** columns, const DataTypes& argument_types, size_t start, + size_t end); + + Status merge(const AggregatePythonUDAFData& rhs, int64_t place); + + Status write(BufferWritable& buf, int64_t place) const; + + void read(BufferReadable& buf); + + Status reset(int64_t place); + + Status destroy(int64_t place); + + Status get(IColumn& to, const DataTypePtr& result_type, int64_t place) const; +}; + +/** + * Python UDAF Aggregate Function + * + * Implements Snowflake-style UDAF pattern: + * - __init__(): Initialize aggregate state + * - aggregate_state: Property returning serializable state + * - accumulate(*args): Add input to state + * - merge(other_state): Combine two states + * - finish(): Get final result + * + * Communication with Python server via PythonUDAFClient using Arrow Flight. 
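+ *
+ * Illustrative call sequence for a SUM-style UDAF (hypothetical user class,
+ * shown only to make the pattern above concrete):
+ *   s = MySum()                        # __init__ -> running total 0
+ *   s.accumulate(1); s.accumulate(2)   # running total 3
+ *   s.merge(other.aggregate_state)     # fold in a shuffled partial state
+ *   s.finish()                         # final result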
+ */ +class AggregatePythonUDAF final + : public IAggregateFunctionDataHelper, + VarargsExpression, + NullableAggregateFunction { +public: + ENABLE_FACTORY_CREATOR(AggregatePythonUDAF); + + AggregatePythonUDAF(const TFunction& fn, const DataTypes& argument_types_, + const DataTypePtr& return_type) + : IAggregateFunctionDataHelper(argument_types_), _fn(fn), _return_type(return_type) {} + + ~AggregatePythonUDAF() override = default; + + static AggregateFunctionPtr create(const TFunction& fn, const DataTypes& argument_types_, + const DataTypePtr& return_type) { + return std::make_shared(fn, argument_types_, return_type); + } + + String get_name() const override { return _fn.name.function_name; } + + DataTypePtr get_return_type() const override { return _return_type; } + + /** + * Initialize function metadata + */ + Status open(); + + /** + * Create aggregate state in Python server + */ + void create(AggregateDataPtr __restrict place) const override; + + /** + * Destroy aggregate state in Python server + */ + void destroy(AggregateDataPtr __restrict place) const noexcept override; + + /** + * Add single row to aggregate state + */ + void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, + Arena&) const override; + + /** + * Add batch of rows to multiple aggregate states (GROUP BY) + */ + void add_batch(size_t batch_size, AggregateDataPtr* places, size_t place_offset, + const IColumn** columns, Arena&, bool /*agg_many*/) const override; + + /** + * Add batch of rows to single aggregate state (no GROUP BY) + */ + void add_batch_single_place(size_t batch_size, AggregateDataPtr place, const IColumn** columns, + Arena&) const override; + + /** + * Add range of rows to single place (for window functions) + */ + void add_range_single_place(int64_t partition_start, int64_t partition_end, int64_t frame_start, + int64_t frame_end, AggregateDataPtr place, const IColumn** columns, + Arena& arena, UInt8* current_window_empty, + UInt8* current_window_has_inited) const override; + + /** + * Reset aggregate state to initial value + */ + void reset(AggregateDataPtr place) const override; + + /** + * Merge two aggregate states + */ + void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena&) const override; + + /** + * Serialize aggregate state for shuffle + */ + void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override; + + /** + * Deserialize aggregate state from shuffle + */ + void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, Arena&) const override; + + /** + * Get final result and insert into output column + */ + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override; + +private: + TFunction _fn; + DataTypePtr _return_type; + PythonUDFMeta _func_meta; + PythonVersion _python_version; + // Arrow Flight schema: [argument_types..., places: int64, binary_data: binary] + // Used for all UDAF RPC operations + // - places column is always present (NULL in single-place mode, actual place_id values in GROUP BY mode) + // - binary_data column contains serialized data for MERGE operations (NULL for ACCUMULATE) + mutable std::shared_ptr _schema; + mutable std::once_flag _schema_init_flag; +}; + +} // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/exprs/table_function/python_udtf_function.cpp b/be/src/vec/exprs/table_function/python_udtf_function.cpp new file mode 100644 index 00000000000000..b1e598b127d86e --- /dev/null +++ 
b/be/src/vec/exprs/table_function/python_udtf_function.cpp @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exprs/table_function/python_udtf_function.h" + +#include +#include +#include +#include +#include + +#include "runtime/runtime_state.h" +#include "runtime/user_function_cache.h" +#include "udf/python/python_env.h" +#include "udf/python/python_server.h" +#include "udf/python/python_udf_meta.h" +#include "util/arrow/block_convertor.h" +#include "util/arrow/row_batch.h" +#include "util/arrow/utils.h" +#include "util/timezone_utils.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_nullable.h" +#include "vec/common/assert_cast.h" +#include "vec/core/block.h" +#include "vec/core/column_numbers.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/serde/data_type_array_serde.h" +#include "vec/exprs/vexpr.h" +#include "vec/exprs/vexpr_context.h" +#include "vec/functions/array/function_array_utils.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +PythonUDTFFunction::PythonUDTFFunction(const TFunction& t_fn) : TableFunction(), _t_fn(t_fn) { + _fn_name = _t_fn.name.function_name; + TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, _timezone_obj); + + // Like Java UDTF, FE passes the element type T, and we wrap it into array here + // This makes the behavior consistent with Java UDTF + DataTypePtr element_type = DataTypeFactory::instance().create_data_type(t_fn.ret_type); + _return_type = make_nullable(std::make_shared(make_nullable(element_type))); +} + +Status PythonUDTFFunction::open() { + PythonUDFMeta python_udf_meta; + python_udf_meta.id = _t_fn.id; + python_udf_meta.name = _t_fn.name.function_name; + python_udf_meta.symbol = _t_fn.scalar_fn.symbol; + + if (!_t_fn.function_code.empty()) { + python_udf_meta.type = PythonUDFLoadType::INLINE; + python_udf_meta.location = "inline"; + python_udf_meta.inline_code = _t_fn.function_code; + } else if (!_t_fn.hdfs_location.empty()) { + python_udf_meta.type = PythonUDFLoadType::MODULE; + python_udf_meta.location = _t_fn.hdfs_location; + python_udf_meta.checksum = _t_fn.checksum; + } else { + python_udf_meta.type = PythonUDFLoadType::UNKNOWN; + python_udf_meta.location = "unknown"; + } + + python_udf_meta.client_type = PythonClientType::UDTF; + + if (python_udf_meta.type == PythonUDFLoadType::MODULE) { + RETURN_IF_ERROR(UserFunctionCache::instance()->get_pypath( + python_udf_meta.id, python_udf_meta.location, python_udf_meta.checksum, + &python_udf_meta.location)); + } + + PythonVersion version; + if (_t_fn.__isset.runtime_version && !_t_fn.runtime_version.empty()) { + RETURN_IF_ERROR( + 
PythonVersionManager::instance().get_version(_t_fn.runtime_version, &version)); + python_udf_meta.runtime_version = version.full_version; + } else { + return Status::InvalidArgument("Python UDTF runtime version is not set"); + } + + for (const auto& arg_type : _t_fn.arg_types) { + DataTypePtr doris_type = DataTypeFactory::instance().create_data_type(arg_type); + python_udf_meta.input_types.push_back(doris_type); + } + + // For Python UDTF, FE passes the element type T (like Java UDTF) + // Use it directly as the UDF's return type for Python metadata + python_udf_meta.return_type = DataTypeFactory::instance().create_data_type(_t_fn.ret_type); + python_udf_meta.always_nullable = python_udf_meta.return_type->is_nullable(); + RETURN_IF_ERROR(python_udf_meta.check()); + + RETURN_IF_ERROR( + PythonServerManager::instance().get_client(python_udf_meta, version, &_udtf_client)); + + if (!_udtf_client) { + return Status::InternalError("Failed to create Python UDTF client"); + } + + return Status::OK(); +} + +Status PythonUDTFFunction::process_init(Block* block, RuntimeState* state) { + // Step 1: Extract input columns from child expressions + auto child_size = _expr_context->root()->children().size(); + ColumnNumbers child_column_idxs; + child_column_idxs.resize(child_size); + for (int i = 0; i < child_size; ++i) { + int result_id = -1; + RETURN_IF_ERROR(_expr_context->root()->children()[i]->execute(_expr_context.get(), block, + &result_id)); + DCHECK_NE(result_id, -1); + child_column_idxs[i] = result_id; + } + + // Step 2: Build input block and convert to Arrow format + vectorized::Block input_block; + for (uint32_t i = 0; i < child_column_idxs.size(); ++i) { + input_block.insert(block->get_by_position(child_column_idxs[i])); + } + std::shared_ptr input_schema; + std::shared_ptr input_batch; + RETURN_IF_ERROR(get_arrow_schema_from_block(input_block, &input_schema, + TimezoneUtils::default_time_zone)); + RETURN_IF_ERROR(convert_to_arrow_batch(input_block, input_schema, arrow::default_memory_pool(), + &input_batch, _timezone_obj)); + + // Step 3: Call Python UDTF to evaluate all rows at once (similar to Java UDTF's JNI call) + // Python returns a ListArray where each element contains outputs for one input row + std::shared_ptr list_array; + RETURN_IF_ERROR(_udtf_client->evaluate(*input_batch, &list_array)); + + // Step 4: Convert Python server output (ListArray) to Doris array column + RETURN_IF_ERROR(_convert_list_array_to_array_column(list_array)); + + // Step 5: Extract array column metadata using extract_column_array_info + if (!extract_column_array_info(*_array_result_column, _array_column_detail)) { + return Status::NotSupported("column type {} not supported now", + _array_result_column->get_name()); + } + + return Status::OK(); +} + +void PythonUDTFFunction::process_row(size_t row_idx) { + TableFunction::process_row(row_idx); + + // Check if array is null for this row + if (!_array_column_detail.array_nullmap_data || + !_array_column_detail.array_nullmap_data[row_idx]) { + _array_offset = (*_array_column_detail.offsets_ptr)[row_idx - 1]; + _cur_size = (*_array_column_detail.offsets_ptr)[row_idx] - _array_offset; + } + // When it's NULL at row_idx, _cur_size stays 0, meaning current_empty() + // If outer function: will continue with insert_default + // If not outer function: will not insert any value +} + +void PythonUDTFFunction::process_close() { + _array_result_column = nullptr; + _array_column_detail.reset(); + _array_offset = 0; +} + +void 
PythonUDTFFunction::get_same_many_values(MutableColumnPtr& column, int length) { + size_t pos = _array_offset + _cur_offset; + if (current_empty() || (_array_column_detail.nested_nullmap_data && + _array_column_detail.nested_nullmap_data[pos])) { + column->insert_many_defaults(length); + } else { + if (_is_nullable) { + auto* nullable_column = assert_cast(column.get()); + auto nested_column = nullable_column->get_nested_column_ptr(); + auto nullmap_column = nullable_column->get_null_map_column_ptr(); + nested_column->insert_many_from(*_array_column_detail.nested_col, pos, length); + assert_cast(nullmap_column.get())->insert_many_defaults(length); + } else { + column->insert_many_from(*_array_column_detail.nested_col, pos, length); + } + } +} + +int PythonUDTFFunction::get_value(MutableColumnPtr& column, int max_step) { + max_step = std::min(max_step, (int)(_cur_size - _cur_offset)); + size_t pos = _array_offset + _cur_offset; + + if (current_empty()) { + column->insert_default(); + max_step = 1; + } else { + if (_is_nullable) { + auto* nullable_column = assert_cast(column.get()); + auto nested_column = nullable_column->get_nested_column_ptr(); + auto* nullmap_column = + assert_cast(nullable_column->get_null_map_column_ptr().get()); + + nested_column->insert_range_from(*_array_column_detail.nested_col, pos, max_step); + size_t old_size = nullmap_column->size(); + nullmap_column->resize(old_size + max_step); + memcpy(nullmap_column->get_data().data() + old_size, + _array_column_detail.nested_nullmap_data + pos * sizeof(UInt8), + max_step * sizeof(UInt8)); + } else { + column->insert_range_from(*_array_column_detail.nested_col, pos, max_step); + } + } + forward(max_step); + return max_step; +} + +Status PythonUDTFFunction::close() { + // Close UDTF client + if (_udtf_client) { + Status status = _udtf_client->close(); + if (!status.ok()) { + LOG(WARNING) << "Failed to close UDTF client: " << status.to_string(); + } + _udtf_client.reset(); + } + + return TableFunction::close(); +} + +Status PythonUDTFFunction::_convert_list_array_to_array_column( + const std::shared_ptr& list_array) { + if (!list_array) { + return Status::InternalError("Received null ListArray from Python UDTF"); + } + + size_t num_input_rows = list_array->length(); + + // Handle nullable array column + MutableColumnPtr array_col_ptr = _return_type->create_column(); + ColumnNullable* nullable_col = nullptr; + ColumnArray* array_col = nullptr; + + if (_return_type->is_nullable()) { + nullable_col = assert_cast(array_col_ptr.get()); + array_col = assert_cast( + nullable_col->get_nested_column_ptr()->assume_mutable().get()); + } else { + array_col = assert_cast(array_col_ptr.get()); + } + + // Create DataTypeArraySerDe for direct Arrow conversion + DataTypePtr element_type = DataTypeFactory::instance().create_data_type(_t_fn.ret_type); + DataTypePtr array_type = std::make_shared(make_nullable(element_type)); + auto array_serde = array_type->get_serde(); + + // Use read_column_from_arrow for optimized conversion + // This directly converts Arrow ListArray to Doris ColumnArray + // No struct unwrapping needed - Python server sends the correct format! 
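+    // Illustrative mapping (hypothetical data): an incoming ListArray
+    //   [[v1, v2], [], [v3]]
+    // becomes a ColumnArray with cumulative offsets [2, 2, 3] and nested data
+    // [v1, v2, v3]; the empty list in row 1 stays a non-null, zero-length array.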
+ RETURN_IF_ERROR(array_serde->read_column_from_arrow( + array_col->assume_mutable_ref(), list_array.get(), 0, num_input_rows, _timezone_obj)); + + // Handle nullable wrapper: all array elements are non-null + // (empty arrays [] are non-null, different from NULL) + if (nullable_col) { + auto& null_map = nullable_col->get_null_map_data(); + null_map.resize_fill(num_input_rows, 0); // All non-null + } + + _array_result_column = std::move(array_col_ptr); + return Status::OK(); +} + +#include "common/compile_check_end.h" +} // namespace doris::vectorized diff --git a/be/src/vec/exprs/table_function/python_udtf_function.h b/be/src/vec/exprs/table_function/python_udtf_function.h new file mode 100644 index 00000000000000..c4a62aa9864948 --- /dev/null +++ b/be/src/vec/exprs/table_function/python_udtf_function.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "common/status.h" +#include "udf/python/python_udtf_client.h" +#include "vec/columns/column.h" +#include "vec/data_types/data_type.h" +#include "vec/exprs/table_function/table_function.h" +#include "vec/functions/array/function_array_utils.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +/** + * PythonUDTFFunction - Python User-Defined Table Function + * + * Execution Flow: + * 1. open() - Create Python UDTF client and establish RPC connection + * 2. process_init(block) - Batch evaluate all rows: + * - Convert input block to Arrow RecordBatch + * - Call Python UDTF server via RPC (evaluates all rows in one call) + * - Receive Arrow ListArray (one list per input row) + * - Convert to array column using DataTypeArraySerDe + * 3. process_row(row_idx) - Set array offset for current row + * 4. get_value()/get_same_many_values() - Extract values from array column + * 5. process_close() - Clean up batch state (array column, offsets) + * 6. 
close() - Close Python UDTF client and RPC connection + */ +class PythonUDTFFunction final : public TableFunction { + ENABLE_FACTORY_CREATOR(PythonUDTFFunction); + +public: + PythonUDTFFunction(const TFunction& t_fn); + ~PythonUDTFFunction() override = default; + + Status open() override; + Status process_init(Block* block, RuntimeState* state) override; + void process_row(size_t row_idx) override; + void process_close() override; + void get_same_many_values(MutableColumnPtr& column, int length) override; + int get_value(MutableColumnPtr& column, int max_step) override; + Status close() override; + +private: + /** + * Convert Python UDTF output (Arrow ListArray) to Doris array column + * + * Input from Python server (via Arrow RPC): + * - list_array: Arrow ListArray where each element corresponds to one input row's outputs + * + * Format: + * - Single-column output: List (e.g., List, List) + * - Multi-column output: List> + * + * Example: 3 input rows producing variable output rows + * ListArray structure: + * [0]: [val1, val2, val3] (3 elements) + * [1]: [] (0 elements - empty array) + * [2]: [val4, val5, val6, val7] (4 elements) + * + * @param list_array Arrow ListArray containing UDTF output (length = num_input_rows) + * @return Status indicating success or validation/conversion errors + */ + Status _convert_list_array_to_array_column(const std::shared_ptr& list_array); + + const TFunction& _t_fn; + DataTypePtr _return_type; + PythonUDTFClientPtr _udtf_client; + cctz::time_zone _timezone_obj; + + // Result storage (similar to Java UDTF) + ColumnPtr _array_result_column; // Array column storing all results + ColumnArrayExecutionData _array_column_detail; // Array metadata for efficient access + int64_t _array_offset = 0; // Offset into array for current row +}; + +#include "common/compile_check_end.h" +} // namespace doris::vectorized diff --git a/be/src/vec/exprs/table_function/table_function_factory.cpp b/be/src/vec/exprs/table_function/table_function_factory.cpp index 9b4aaa29ea763b..0990b7754ed385 100644 --- a/be/src/vec/exprs/table_function/table_function_factory.cpp +++ b/be/src/vec/exprs/table_function/table_function_factory.cpp @@ -24,6 +24,7 @@ #include "agent/be_exec_version_manager.h" #include "common/object_pool.h" +#include "vec/exprs/table_function/python_udtf_function.h" #include "vec/exprs/table_function/table_function.h" #include "vec/exprs/table_function/udf_table_function.h" #include "vec/exprs/table_function/vexplode.h" @@ -66,6 +67,12 @@ Status TableFunctionFactory::get_fn(const TFunction& t_fn, ObjectPool* pool, Tab (*fn)->set_outer(); } return Status::OK(); + } else if (t_fn.binary_type == TFunctionBinaryType::PYTHON_UDF) { + *fn = pool->add(PythonUDTFFunction::create_unique(t_fn).release()); + if (is_outer) { + (*fn)->set_outer(); + } + return Status::OK(); } else { const std::string& fn_name_raw = t_fn.name.function_name; const std::string& fn_name_real = diff --git a/be/src/vec/exprs/vectorized_agg_fn.cpp b/be/src/vec/exprs/vectorized_agg_fn.cpp index 7da847d8d6361d..c8495e9b39fff8 100644 --- a/be/src/vec/exprs/vectorized_agg_fn.cpp +++ b/be/src/vec/exprs/vectorized_agg_fn.cpp @@ -31,6 +31,7 @@ #include "common/object_pool.h" #include "vec/aggregate_functions/aggregate_function_ai_agg.h" #include "vec/aggregate_functions/aggregate_function_java_udaf.h" +#include "vec/aggregate_functions/aggregate_function_python_udaf.h" #include "vec/aggregate_functions/aggregate_function_rpc.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" #include 
"vec/aggregate_functions/aggregate_function_sort.h" @@ -161,6 +162,18 @@ Status AggFnEvaluator::prepare(RuntimeState* state, const RowDescriptor& desc, "Java UDAF is not enabled, you can change be config enable_java_support to " "true and restart be."); } + } else if (_fn.binary_type == TFunctionBinaryType::PYTHON_UDF) { + if (config::enable_python_udf_support) { + _function = AggregatePythonUDAF::create(_fn, argument_types, _data_type); + RETURN_IF_ERROR(static_cast(_function.get())->open()); + LOG(INFO) << fmt::format( + "Created Python UDAF: {}, runtime_version: {}, function_code: {}", + _fn.name.function_name, _fn.runtime_version, _fn.function_code); + } else { + return Status::InternalError( + "Python UDAF is not enabled, you can change be config " + "enable_python_udf_support to true and restart be."); + } } else if (_fn.binary_type == TFunctionBinaryType::RPC) { _function = AggregateRpcUdaf::create(_fn, argument_types, _data_type); } else if (_fn.binary_type == TFunctionBinaryType::AGG_STATE) { diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp b/be/src/vec/exprs/vectorized_fn_call.cpp index 98f1eae272887a..f708316b33f490 100644 --- a/be/src/vec/exprs/vectorized_fn_call.cpp +++ b/be/src/vec/exprs/vectorized_fn_call.cpp @@ -17,6 +17,7 @@ #include "vec/exprs/vectorized_fn_call.h" +#include #include #include // IWYU pragma: keep #include @@ -57,6 +58,7 @@ #include "vec/functions/function_agg_state.h" #include "vec/functions/function_fake.h" #include "vec/functions/function_java_udf.h" +#include "vec/functions/function_python_udf.h" #include "vec/functions/function_rpc.h" #include "vec/functions/simple_function_factory.h" #include "vec/utils/util.hpp" @@ -115,6 +117,25 @@ Status VectorizedFnCall::prepare(RuntimeState* state, const RowDescriptor& desc, "Java UDF is not enabled, you can change be config enable_java_support to true " "and restart be."); } + } else if (_fn.binary_type == TFunctionBinaryType::PYTHON_UDF) { + if (config::enable_python_udf_support) { + if (_fn.is_udtf_function) { + // fake function. it's no use and can't execute. + // Python UDTF is executed via PythonUDTFFunction in table function path + auto builder = + std::make_shared(FunctionFake::create()); + _function = builder->build(argument_template, std::make_shared()); + } else { + _function = PythonFunctionCall::create(_fn, argument_template, _data_type); + LOG(INFO) << fmt::format( + "create python function call: {}, runtime version: {}, function code: {}", + _fn.name.function_name, _fn.runtime_version, _fn.function_code); + } + } else { + return Status::InternalError( + "Python UDF is not enabled, you can change be config enable_python_udf_support " + "to true and restart be."); + } } else if (_fn.binary_type == TFunctionBinaryType::AGG_STATE) { DataTypes argument_types; for (auto column : argument_template) { diff --git a/be/src/vec/functions/function_python_udf.cpp b/be/src/vec/functions/function_python_udf.cpp new file mode 100644 index 00000000000000..bc35b76d802813 --- /dev/null +++ b/be/src/vec/functions/function_python_udf.cpp @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/functions/function_python_udf.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/status.h" +#include "runtime/user_function_cache.h" +#include "udf/python/python_server.h" +#include "udf/python/python_udf_client.h" +#include "udf/python/python_udf_meta.h" +#include "util/arrow/block_convertor.h" +#include "util/arrow/row_batch.h" +#include "util/timezone_utils.h" +#include "vec/core/block.h" +#include "vec/exec/jni_connector.h" + +namespace doris::vectorized { + +PythonFunctionCall::PythonFunctionCall(const TFunction& fn, const DataTypes& argument_types, + const DataTypePtr& return_type) + : _fn(fn), _argument_types(argument_types), _return_type(return_type) {} + +Status PythonFunctionCall::open(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + if (scope == FunctionContext::FunctionStateScope::FRAGMENT_LOCAL) { + LOG(INFO) << "Open python UDF fragment local"; + return Status::OK(); + } + + PythonVersion version; + PythonUDFMeta func_meta; + func_meta.id = _fn.id; + func_meta.name = _fn.name.function_name; + func_meta.symbol = _fn.scalar_fn.symbol; + if (!_fn.function_code.empty()) { + func_meta.type = PythonUDFLoadType::INLINE; + func_meta.location = "inline"; + func_meta.inline_code = _fn.function_code; + } else if (!_fn.hdfs_location.empty()) { + func_meta.type = PythonUDFLoadType::MODULE; + func_meta.location = _fn.hdfs_location; + func_meta.checksum = _fn.checksum; + } else { + func_meta.type = PythonUDFLoadType::UNKNOWN; + func_meta.location = "unknown"; + } + + func_meta.input_types = _argument_types; + func_meta.return_type = _return_type; + func_meta.client_type = PythonClientType::UDF; + + if (_fn.__isset.runtime_version && !_fn.runtime_version.empty()) { + RETURN_IF_ERROR( + PythonVersionManager::instance().get_version(_fn.runtime_version, &version)); + } else { + return Status::InvalidArgument("Python UDF runtime version is not set"); + } + + func_meta.runtime_version = version.full_version; + RETURN_IF_ERROR(func_meta.check()); + func_meta.always_nullable = _return_type->is_nullable(); + LOG(INFO) << fmt::format("runtime_version: {}, func_meta: {}", version.to_string(), + func_meta.to_string()); + + if (func_meta.type == PythonUDFLoadType::MODULE) { + RETURN_IF_ERROR(UserFunctionCache::instance()->get_pypath( + func_meta.id, func_meta.location, func_meta.checksum, &func_meta.location)); + } + + PythonUDFClientPtr client = nullptr; + RETURN_IF_ERROR(PythonServerManager::instance().get_client(func_meta, version, &client)); + + if (!client) { + return Status::InternalError("Python UDF client is null"); + } + + context->set_function_state(FunctionContext::THREAD_LOCAL, client); + LOG(INFO) << fmt::format("Successfully get python UDF client, process: {}", + client->print_process()); + return Status::OK(); +} + +Status PythonFunctionCall::execute_impl(FunctionContext* context, Block& block, + const ColumnNumbers& arguments, uint32_t result, + size_t num_rows) const { + auto client = reinterpret_cast( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + if 
(!client) {
+        LOG(WARNING) << "Python UDF client is null";
+        return Status::InternalError("Python UDF client is null");
+    }
+
+    int64_t input_rows = block.rows();
+    uint32_t input_columns = block.columns();
+    DCHECK(input_columns > 0 && result < input_columns &&
+           _argument_types.size() == arguments.size());
+    vectorized::Block input_block;
+    vectorized::Block output_block;
+
+    if (!_return_type->equals(*block.get_by_position(result).type)) {
+        return Status::InternalError(fmt::format("Python UDF output type {} not equal to {}",
+                                                 block.get_by_position(result).type->get_name(),
+                                                 _return_type->get_name()));
+    }
+
+    for (uint32_t i = 0; i < arguments.size(); ++i) {
+        if (!_argument_types[i]->equals(*block.get_by_position(arguments[i]).type)) {
+            return Status::InternalError(
+                    fmt::format("Python UDF input type {} not equal to {}",
+                                block.get_by_position(arguments[i]).type->get_name(),
+                                _argument_types[i]->get_name()));
+        }
+        input_block.insert(block.get_by_position(arguments[i]));
+    }
+
+    std::shared_ptr<arrow::Schema> schema;
+    RETURN_IF_ERROR(
+            get_arrow_schema_from_block(input_block, &schema, TimezoneUtils::default_time_zone));
+    std::shared_ptr<arrow::RecordBatch> input_batch;
+    std::shared_ptr<arrow::RecordBatch> output_batch;
+    cctz::time_zone _timezone_obj; // default UTC
+    RETURN_IF_ERROR(convert_to_arrow_batch(input_block, schema, arrow::default_memory_pool(),
+                                           &input_batch, _timezone_obj));
+    RETURN_IF_ERROR(client->evaluate(*input_batch, &output_batch));
+    int64_t output_rows = output_batch->num_rows();
+
+    if (output_batch->num_columns() != 1) {
+        return Status::InternalError(fmt::format("Python UDF output columns {} not equal to 1",
+                                                 output_batch->num_columns()));
+    }
+
+    if (input_rows != output_rows) {
+        return Status::InternalError(fmt::format(
+                "Python UDF output rows {} not equal to input rows {}", output_rows, input_rows));
+    }
+
+    RETURN_IF_ERROR(
+            convert_from_arrow_batch(output_batch, {_return_type}, &output_block, _timezone_obj));
+    DCHECK_EQ(output_block.columns(), 1);
+    block.replace_by_position(result, std::move(output_block.get_by_position(0).column));
+    return Status::OK();
+}
+
+Status PythonFunctionCall::close(FunctionContext* context,
+                                 FunctionContext::FunctionStateScope scope) {
+    auto client = reinterpret_cast(
+            context->get_function_state(FunctionContext::THREAD_LOCAL));
+    if (!client) {
+        LOG(WARNING) << "Python UDF client is null";
+        return Status::InternalError("Python UDF client is null");
+    }
+    RETURN_IF_ERROR(client->close());
+    return Status::OK();
+}
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/function_python_udf.h b/be/src/vec/functions/function_python_udf.h
new file mode 100644
index 00000000000000..e13bf4943945cc
--- /dev/null
+++ b/be/src/vec/functions/function_python_udf.h
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+
+#include
+#include
+
+#include "common/status.h"
+#include "udf/udf.h"
+#include "vec/core/block.h"
+#include "vec/core/column_numbers.h"
+#include "vec/core/columns_with_type_and_name.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type.h"
+#include "vec/functions/function.h"
+
+namespace doris::vectorized {
+
+class PythonUDFPreparedFunction : public PreparedFunctionImpl {
+public:
+    using execute_call_back = std::function<Status(FunctionContext*, Block&, const ColumnNumbers&,
+                                                   uint32_t, size_t)>;
+
+    explicit PythonUDFPreparedFunction(const execute_call_back& func, const std::string& name)
+            : callback_function(func), name(name) {}
+
+    String get_name() const override { return name; }
+
+protected:
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const override {
+        return callback_function(context, block, arguments, result, input_rows_count);
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; }
+
+private:
+    execute_call_back callback_function;
+    std::string name;
+};
+
+class PythonFunctionCall : public IFunctionBase {
+public:
+    PythonFunctionCall(const TFunction& fn, const DataTypes& argument_types,
+                       const DataTypePtr& return_type);
+
+    static FunctionBasePtr create(const TFunction& fn, const ColumnsWithTypeAndName& argument_types,
+                                  const DataTypePtr& return_type) {
+        DataTypes data_types(argument_types.size());
+        for (size_t i = 0; i < argument_types.size(); ++i) {
+            data_types[i] = argument_types[i].type;
+        }
+        return std::make_shared<PythonFunctionCall>(fn, data_types, return_type);
+    }
+
+    /// Get the main function name.
+    String get_name() const override { return _fn.name.function_name; }
+
+    const DataTypes& get_argument_types() const override { return _argument_types; }
+    const DataTypePtr& get_return_type() const override { return _return_type; }
+
+    PreparedFunctionPtr prepare(FunctionContext* context, const Block& sample_block,
+                                const ColumnNumbers& arguments, uint32_t result) const override {
+        return std::make_shared<PythonUDFPreparedFunction>(
+                [this](auto&& PH1, auto&& PH2, auto&& PH3, auto&& PH4, auto&& PH5) {
+                    return PythonFunctionCall::execute_impl(
+                            std::forward<decltype(PH1)>(PH1), std::forward<decltype(PH2)>(PH2),
+                            std::forward<decltype(PH3)>(PH3), std::forward<decltype(PH4)>(PH4),
+                            std::forward<decltype(PH5)>(PH5));
+                },
+                _fn.name.function_name);
+    }
+
+    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const;
+
+    Status close(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
+
+    bool is_use_default_implementation_for_constants() const override { return true; }
+
+    bool is_udf_function() const override { return true; }
+
+private:
+    const TFunction& _fn;
+    const DataTypes _argument_types;
+    const DataTypePtr _return_type {nullptr};
+};
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
index 87dd33a1ec0485..51fa4a784d2384 100644
--- a/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_arrow_test.cpp
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -42,6 +43,7 @@
 #include
 #include "olap/hll.h"
+#include "runtime/define_primitive_type.h"
 #include "runtime/descriptors.cpp"
 #include "util/arrow/block_convertor.h"
 #include "util/arrow/row_batch.h"
@@ -77,8 +79,8 @@ namespace doris::vectorized {

-void serialize_and_deserialize_arrow_test(std::vector<PrimitiveType> cols, int row_num,
-                                          bool is_nullable) {
+std::shared_ptr<Block> create_test_block(std::vector<PrimitiveType> cols, int row_num,
+                                         bool is_nullable) {
     auto block = std::make_shared<Block>();
     for (int i = 0; i < cols.size(); i++) {
         std::string col_name = std::to_string(i);
@@ -397,6 +399,12 @@
             LOG(FATAL) << "error column type";
         }
     }
+    return block;
+}
+
+void serialize_and_deserialize_arrow_test(std::vector<PrimitiveType> cols, int row_num,
+                                          bool is_nullable) {
+    std::shared_ptr<Block> block = create_test_block(cols, row_num, is_nullable);
     std::shared_ptr<arrow::RecordBatch> record_batch = CommonDataTypeSerdeTest::serialize_arrow(block);
     auto assert_block = std::make_shared<Block>(block->clone_empty());
@@ -404,6 +412,25 @@
     CommonDataTypeSerdeTest::compare_two_blocks(block, assert_block);
 }

+void block_converter_test(std::vector<PrimitiveType> cols, int row_num, bool is_nullable) {
+    std::shared_ptr<Block> source_block = create_test_block(cols, row_num, is_nullable);
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    std::shared_ptr<arrow::Schema> schema;
+    Status status = Status::OK();
+    status = get_arrow_schema_from_block(*source_block, &schema, TimezoneUtils::default_time_zone);
+    ASSERT_TRUE(status.ok() && schema);
+    cctz::time_zone default_timezone; // default UTC
+    status = convert_to_arrow_batch(*source_block, schema, arrow::default_memory_pool(),
+                                    &record_batch, default_timezone);
+    ASSERT_TRUE(status.ok() && record_batch);
+    auto target_block = std::make_shared<Block>(source_block->clone_empty());
+    DataTypes source_data_types = source_block->get_data_types();
+    status = convert_from_arrow_batch(record_batch, source_data_types, &*target_block,
+                                      default_timezone);
+    ASSERT_TRUE(status.ok() && target_block);
+    CommonDataTypeSerdeTest::compare_two_blocks(source_block, target_block);
+}
+
 TEST(DataTypeSerDeArrowTest, DataTypeScalaSerDeTest) {
     std::vector<PrimitiveType> cols = {
             TYPE_INT,       TYPE_INT,       TYPE_STRING, TYPE_DECIMAL128I, TYPE_BOOLEAN,
@@ -485,4 +512,14 @@
     CommonDataTypeSerdeTest::compare_two_blocks(block, assert_block);
 }

+TEST(DataTypeSerDeArrowTest, BlockConverterTest) {
+    std::vector<PrimitiveType> cols = {
+            TYPE_INT,        TYPE_INT,       TYPE_STRING, TYPE_DECIMAL128I, TYPE_BOOLEAN,
+            TYPE_DECIMAL32,  TYPE_DECIMAL64, TYPE_IPV4,   TYPE_IPV6,        TYPE_DATETIME,
+            TYPE_DATETIMEV2, TYPE_DATE,      TYPE_DATEV2,
+    };
+    block_converter_test(cols, 7, true);
+    block_converter_test(cols, 7, false);
+}
+
 } // namespace doris::vectorized
diff --git a/build.sh b/build.sh
index c3851b7c7d6afa..580f5d3a047db4 100755
--- a/build.sh
+++ b/build.sh
@@ -996,9 +996,11 @@ EOF
     mkdir -p "${DORIS_OUTPUT}/be/storage"
     mkdir -p "${DORIS_OUTPUT}/be/plugins/jdbc_drivers/"
     mkdir -p "${DORIS_OUTPUT}/be/plugins/java_udf/"
+    mkdir -p "${DORIS_OUTPUT}/be/plugins/python_udf/"
     mkdir -p "${DORIS_OUTPUT}/be/plugins/connectors/"
     mkdir -p "${DORIS_OUTPUT}/be/plugins/hadoop_conf/"
     mkdir -p "${DORIS_OUTPUT}/be/plugins/java_extensions/"
+    cp -r -p "${DORIS_HOME}/be/src/udf/python/python_server.py" "${DORIS_OUTPUT}/be/plugins/python_udf/"
 fi

 if [[ "${BUILD_BROKER}" -eq 1 ]]; then
diff --git a/conf/fe.conf b/conf/fe.conf
index fee2eabc963890..0b9eb2978114b9 100644
--- a/conf/fe.conf
+++ b/conf/fe.conf
@@ -70,4 +70,4 @@ sys_log_mode = ASYNC
 # meta_delay_toleration_second = 10
 # qe_max_connection = 1024
 # qe_query_timeout_second = 300
-# qe_slow_log_ms = 5000
+# qe_slow_log_ms = 5000
\ No newline at end of file
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 58e0562daf37ab..1f2351e5e6d87e 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -2886,6 +2886,14 @@ public class Config extends ConfigBase {
     })
     public static boolean enable_udf_in_load = false;

+    @ConfField(description = {
+            "开启python_udf, 默认为false。如果该配置为false,则禁止创建和使用python_udf。在一些场景下关闭该配置可防止命令注入攻击。",
+            "Used to enable python_udf, default is false. If this configuration is false, creation and use of "
+                    + "python_udf is disabled. In some scenarios it may be necessary to disable this configuration "
+                    + "to prevent command injection attacks."
+    })
+    public static boolean enable_python_udf = false;
+
     @ConfField(description = {
             "是否忽略 Image 文件中未知的模块。如果为 true,不在 PersistMetaModules.MODULE_NAMES 中的元数据模块将被忽略并跳过。"
                     + "默认为 false,如果 Image 文件中包含未知的模块,Doris 将会抛出异常。"
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
index d11e2706281891..9e8840c0f51bd5 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
@@ -56,6 +56,7 @@ LEFT_BRACKET: '[';
 RIGHT_BRACKET: ']';
 LEFT_BRACE: '{';
 RIGHT_BRACE: '}';
+DOLLAR_QUOTED_STRING: '$$' ( ~'$' | '$' ~'$' )* '$$';

 // TODO: add a doc to list reserved words
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
index fe25fa2c04528d..ce8343f4058fdc 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
@@ -233,7 +233,8 @@ supportedCreateStatement
         (TABLES | AGGREGATE)? FUNCTION (IF NOT EXISTS)? functionIdentifier
         LEFT_PAREN functionArguments? RIGHT_PAREN
         RETURNS returnType=dataType (INTERMEDIATE intermediateType=dataType)?
-        properties=propertyClause?    #createUserDefineFunction
+        properties=propertyClause?
+        (AS functionCode=dollarQuotedString)?    #createUserDefineFunction
     | CREATE statementScope? ALIAS FUNCTION (IF NOT EXISTS)? functionIdentifier
         LEFT_PAREN functionArguments? RIGHT_PAREN
         WITH PARAMETER LEFT_PAREN parameters=identifierSeq? RIGHT_PAREN
@@ -1900,6 +1901,10 @@ number
     | SUBTRACT? (EXPONENT_VALUE | DECIMAL_VALUE)    #decimalLiteral
     ;

+dollarQuotedString
+    : DOLLAR_QUOTED_STRING
+    ;
+
 // there are 1 kinds of keywords in Doris.
 // - Non-reserved keywords:
 //      normal version of non-reserved keywords.
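Note: the DOLLAR_QUOTED_STRING token above accepts any body that never contains two consecutive '$' characters, so the closing '$$' is unambiguous; the FE later strips the delimiters before storing functionCode (see analyzePythonUdaf in CreateFunctionCommand below). A rough Python approximation of the token, for intuition only; the regex and the inline UDF body here are illustrative and not part of this patch:

    import re

    # Mirrors '$$' ( ~'$' | '$' ~'$' )* '$$': the body may contain a lone '$',
    # but never two in a row, so the first following '$$' ends the token.
    DOLLAR_QUOTED = re.compile(r"\$\$((?:[^$]|\$[^$])*)\$\$")

    stmt_tail = """$$
    def add_one(x):
        return None if x is None else x + 1
    $$"""

    function_code = DOLLAR_QUOTED.search(stmt_tail).group(1)  # delimiters stripped
    print(function_code)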
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java index f28fbb4a0cde3c..be74b957d25077 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java @@ -23,6 +23,7 @@ import org.apache.doris.thrift.TFunction; import org.apache.doris.thrift.TFunctionBinaryType; +import com.google.common.base.Strings; import com.google.gson.annotations.SerializedName; import java.util.Arrays; @@ -408,6 +409,15 @@ public TFunction toThrift(Type realReturnType, Type[] realArgTypes, Boolean[] re } // agg_fn.setIgnores_distinct(ignoresDistinct); fn.setAggregateFn(aggFn); + + // Set runtime_version and function_code for Python UDAF + if (getBinaryType() == TFunctionBinaryType.PYTHON_UDF) { + if (!Strings.isNullOrEmpty(functionCode)) { + fn.setFunctionCode(functionCode); + } + fn.setRuntimeVersion(runtimeVersion); + } + return fn; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java index 8b9bdad45aa79e..ef4f5249ed53ee 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Function.java @@ -103,6 +103,10 @@ public enum NullableMode { protected boolean isStaticLoad = false; @SerializedName("eT") protected long expirationTime = 360; // default 6 hours; + @SerializedName("rv") + protected String runtimeVersion; + @SerializedName("fc") + protected String functionCode; // Only used for serialization protected Function() { @@ -161,6 +165,8 @@ public Function(Function other) { this.isUDTFunction = other.isUDTFunction; this.isStaticLoad = other.isStaticLoad; this.expirationTime = other.expirationTime; + this.runtimeVersion = other.runtimeVersion; + this.functionCode = other.functionCode; } public Function clone() { @@ -268,6 +274,22 @@ public void setGlobal(boolean global) { isGlobal = global; } + public String getRuntimeVersion() { + return runtimeVersion; + } + + public void setRuntimeVersion(String runtimeVersion) { + this.runtimeVersion = runtimeVersion; + } + + public String getFunctionCode() { + return functionCode; + } + + public void setFunctionCode(String functionCode) { + this.functionCode = functionCode; + } + // TODO(cmy): Currently we judge whether it is UDF by wheter the 'location' is set. // Maybe we should use a separate variable to identify, // but additional variables need to modify the persistence information. 
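Note: the runtimeVersion persisted above is validated at create time against CreateFunctionCommand's PYTHON_VERSION_PATTERN (shown further below): major version 3, a 1-2 digit minor version, and an optional 1-2 digit micro version. A Python re-expression of that check, illustrative only:

    import re

    # Same pattern as PYTHON_VERSION_PATTERN in CreateFunctionCommand.
    PYTHON_VERSION = re.compile(r"^3\.\d{1,2}(?:\.\d{1,2})?$")

    for v in ["3.10.2", "3.8", "2.7", "3.123", "3.10.2.1"]:
        print(v, bool(PYTHON_VERSION.match(v)))
    # 3.10.2 True, 3.8 True, 2.7 False, 3.123 False, 3.10.2.1 False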
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java index 8e62eb4412af47..6a75179d151501 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionUtil.java @@ -25,7 +25,11 @@ import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdaf; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdtf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdaf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdtf; import org.apache.doris.nereids.types.DataType; +import org.apache.doris.thrift.TFunctionBinaryType; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; @@ -191,12 +195,24 @@ private static void translateToNereidsImpl(String dbName, Function function) { AliasUdf.translateToNereidsFunction(dbName, ((AliasFunction) function)); } else if (function instanceof ScalarFunction) { if (function.isUDTFunction()) { - JavaUdtf.translateToNereidsFunction(dbName, ((ScalarFunction) function)); + if (function.getBinaryType() == TFunctionBinaryType.JAVA_UDF) { + JavaUdtf.translateToNereidsFunction(dbName, ((ScalarFunction) function)); + } else if (function.getBinaryType() == TFunctionBinaryType.PYTHON_UDF) { + PythonUdtf.translateToNereidsFunction(dbName, ((ScalarFunction) function)); + } } else { - JavaUdf.translateToNereidsFunction(dbName, ((ScalarFunction) function)); + if (function.getBinaryType() == TFunctionBinaryType.JAVA_UDF) { + JavaUdf.translateToNereidsFunction(dbName, ((ScalarFunction) function)); + } else if (function.getBinaryType() == TFunctionBinaryType.PYTHON_UDF) { + PythonUdf.translateToNereidsFunction(dbName, (ScalarFunction) function); + } } } else if (function instanceof AggregateFunction) { - JavaUdaf.translateToNereidsFunction(dbName, ((AggregateFunction) function)); + if (function.getBinaryType() == TFunctionBinaryType.JAVA_UDF) { + JavaUdaf.translateToNereidsFunction(dbName, ((AggregateFunction) function)); + } else if (function.getBinaryType() == TFunctionBinaryType.PYTHON_UDF) { + PythonUdaf.translateToNereidsFunction(dbName, ((AggregateFunction) function)); + } } } @@ -219,4 +235,9 @@ public static void checkEnableJavaUdf() throws AnalysisException { } } + public static void checkEnablePythonUdf() throws AnalysisException { + if (!Config.enable_python_udf) { + throw new AnalysisException("python_udf has been disabled."); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java index babb02595d2f64..5599b58e152943 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarFunction.java @@ -24,6 +24,7 @@ import org.apache.doris.thrift.TFunctionBinaryType; import org.apache.doris.thrift.TScalarFunction; +import com.google.common.base.Strings; import com.google.gson.annotations.SerializedName; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -249,11 +250,18 @@ public String toSql(boolean ifNotExists) { public TFunction toThrift(Type realReturnType, Type[] realArgTypes, Boolean[] realArgTypeNullables) { TFunction fn = 
super.toThrift(realReturnType, realArgTypes, realArgTypeNullables); fn.setScalarFn(new TScalarFunction()); - if (getBinaryType() == TFunctionBinaryType.JAVA_UDF || getBinaryType() == TFunctionBinaryType.RPC) { + if (getBinaryType() == TFunctionBinaryType.JAVA_UDF || getBinaryType() == TFunctionBinaryType.RPC + || getBinaryType() == TFunctionBinaryType.PYTHON_UDF) { fn.getScalarFn().setSymbol(symbolName); } else { fn.getScalarFn().setSymbol(""); } + if (getBinaryType() == TFunctionBinaryType.PYTHON_UDF) { + if (!Strings.isNullOrEmpty(functionCode)) { + fn.setFunctionCode(functionCode); + } + fn.setRuntimeVersion(runtimeVersion); + } if (dictFunction != null) { fn.setDictFunction(dictFunction); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java index 6a5c5703c61087..6c04200043a307 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java @@ -102,6 +102,9 @@ import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdaf; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdtf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdaf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdtf; import org.apache.doris.nereids.trees.expressions.functions.window.WindowFunction; import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.visitor.DefaultExpressionVisitor; @@ -908,6 +911,30 @@ public Expr visitJavaUdaf(JavaUdaf udaf, PlanTranslatorContext context) { return new FunctionCallExpr(udaf.getCatalogFunction(), exprs, udaf.nullable()); } + @Override + public Expr visitPythonUdf(PythonUdf udf, PlanTranslatorContext context) { + FunctionParams exprs = new FunctionParams(udf.children().stream() + .map(expression -> expression.accept(this, context)) + .collect(Collectors.toList())); + return new FunctionCallExpr(udf.getCatalogFunction(), exprs, udf.nullable()); + } + + @Override + public Expr visitPythonUdaf(PythonUdaf udaf, PlanTranslatorContext context) { + FunctionParams exprs = new FunctionParams(udaf.isDistinct(), udaf.children().stream() + .map(expression -> expression.accept(this, context)) + .collect(Collectors.toList())); + return new FunctionCallExpr(udaf.getCatalogFunction(), exprs, udaf.nullable()); + } + + @Override + public Expr visitPythonUdtf(PythonUdtf udtf, PlanTranslatorContext context) { + FunctionParams exprs = new FunctionParams(udtf.children().stream() + .map(expression -> expression.accept(this, context)) + .collect(Collectors.toList())); + return new FunctionCallExpr(udtf.getCatalogFunction(), exprs, udtf.nullable()); + } + // TODO: Supports for `distinct` private Expr translateAggregateFunction(AggregateFunction function, List currentPhaseArguments, List aggFnArguments, diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 35d64c4455e2e8..49c1d438d55ec4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -5141,9 +5141,10 @@ public Command visitCreateUserDefineFunction(CreateUserDefineFunctionContext ctx Map properties = ctx.propertyClause() != null ? Maps.newHashMap(visitPropertyClause(ctx.propertyClause())) : Maps.newHashMap(); + String functionCode = ctx.dollarQuotedString() != null ? ctx.dollarQuotedString().getText() : ""; return new CreateFunctionCommand(statementScope, ifNotExists, isAggFunction, false, isTableFunction, function, functionArgTypesInfo, returnType, intermediateType, - null, null, properties); + null, null, properties, functionCode); } @Override @@ -5161,7 +5162,7 @@ public Command visitCreateAliasFunction(CreateAliasFunctionContext ctx) { Expression originFunction = getExpression(ctx.expression()); return new CreateFunctionCommand(statementScope, ifNotExists, false, true, false, function, functionArgTypesInfo, VarcharType.MAX_VARCHAR_TYPE, null, - parameters, originFunction, null); + parameters, originFunction, null, null); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java new file mode 100644 index 00000000000000..19303bf5a6e315 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdaf.java @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.udf; + +import org.apache.doris.analysis.FunctionName; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.Function; +import org.apache.doris.catalog.Function.NullableMode; +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.catalog.Type; +import org.apache.doris.common.util.URI; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.Udf; +import org.apache.doris.nereids.trees.expressions.functions.agg.AggregateFunction; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.thrift.TFunctionBinaryType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Python UDAF for Nereids + */ +public class PythonUdaf extends AggregateFunction implements ExplicitlyCastableSignature, Udf { + private final String dbName; + private final long functionId; + private final TFunctionBinaryType binaryType; + private final FunctionSignature signature; + private final DataType intermediateType; + private final NullableMode nullableMode; + private final String objectFile; + private final String symbol; + private final String initFn; + private final String updateFn; + private final String mergeFn; + private final String serializeFn; + private final String finalizeFn; + private final String getValueFn; + private final String removeFn; + private final String checkSum; + private final boolean isStaticLoad; + private final long expirationTime; + private final String runtimeVersion; + private final String functionCode; + + /** + * Constructor of UDAF + */ + public PythonUdaf(String name, long functionId, String dbName, TFunctionBinaryType binaryType, + FunctionSignature signature, + DataType intermediateType, NullableMode nullableMode, + String objectFile, String symbol, + String initFn, String updateFn, String mergeFn, + String serializeFn, String finalizeFn, String getValueFn, String removeFn, + boolean isDistinct, String checkSum, boolean isStaticLoad, long expirationTime, + String runtimeVersion, String functionCode, Expression... args) { + super(name, isDistinct, args); + this.dbName = dbName; + this.functionId = functionId; + this.binaryType = binaryType; + this.signature = signature; + this.intermediateType = intermediateType == null ? 
signature.returnType : intermediateType; + this.nullableMode = nullableMode; + this.objectFile = objectFile; + this.symbol = symbol; + this.initFn = initFn; + this.updateFn = updateFn; + this.mergeFn = mergeFn; + this.serializeFn = serializeFn; + this.finalizeFn = finalizeFn; + this.getValueFn = getValueFn; + this.removeFn = removeFn; + this.checkSum = checkSum; + this.isStaticLoad = isStaticLoad; + this.expirationTime = expirationTime; + this.runtimeVersion = runtimeVersion; + this.functionCode = functionCode; + } + + @Override + public List getSignatures() { + return ImmutableList.of(signature); + } + + @Override + public boolean hasVarArguments() { + return signature.hasVarArgs; + } + + @Override + public int arity() { + return signature.argumentsTypes.size(); + } + + @Override + public NullableMode getNullableMode() { + return nullableMode; + } + + /** + * withChildren. + */ + @Override + public PythonUdaf withDistinctAndChildren(boolean isDistinct, List children) { + Preconditions.checkArgument(children.size() == this.children.size()); + return new PythonUdaf(getName(), functionId, dbName, binaryType, signature, intermediateType, nullableMode, + objectFile, symbol, initFn, updateFn, mergeFn, serializeFn, finalizeFn, getValueFn, removeFn, + isDistinct, checkSum, isStaticLoad, expirationTime, runtimeVersion, functionCode, + children.toArray(new Expression[0])); + } + + /** + * translate catalog python udaf to nereids python udaf + */ + public static void translateToNereidsFunction(String dbName, org.apache.doris.catalog.AggregateFunction aggregate) { + String fnName = aggregate.functionName(); + DataType retType = DataType.fromCatalogType(aggregate.getReturnType()); + List argTypes = Arrays.stream(aggregate.getArgs()) + .map(DataType::fromCatalogType) + .collect(Collectors.toList()); + + FunctionSignature.FuncSigBuilder sigBuilder = FunctionSignature.ret(retType); + FunctionSignature sig = aggregate.hasVarArgs() + ? sigBuilder.varArgs(argTypes.toArray(new DataType[0])) + : sigBuilder.args(argTypes.toArray(new DataType[0])); + + SlotReference[] arguments = argTypes.stream() + .map(type -> new SlotReference(type.toString(), type)) + .toArray(SlotReference[]::new); + + DataType intermediateType = null; + if (aggregate.getIntermediateType() != null) { + intermediateType = DataType.fromCatalogType(aggregate.getIntermediateType()); + } + + PythonUdaf udaf = new PythonUdaf(fnName, aggregate.getId(), dbName, aggregate.getBinaryType(), sig, + intermediateType, + aggregate.getNullableMode(), + aggregate.getLocation() == null ? 
null : aggregate.getLocation().getLocation(), + aggregate.getSymbolName(), + aggregate.getInitFnSymbol(), + aggregate.getUpdateFnSymbol(), + aggregate.getMergeFnSymbol(), + aggregate.getSerializeFnSymbol(), + aggregate.getFinalizeFnSymbol(), + aggregate.getGetValueFnSymbol(), + aggregate.getRemoveFnSymbol(), + false, + aggregate.getChecksum(), + aggregate.isStaticLoad(), + aggregate.getExpirationTime(), + aggregate.getRuntimeVersion(), + aggregate.getFunctionCode(), + arguments); + + PythonUdafBuilder builder = new PythonUdafBuilder(udaf); + Env.getCurrentEnv().getFunctionRegistry().addUdf(dbName, fnName, builder); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitPythonUdaf(this, context); + } + + @Override + public Function getCatalogFunction() { + try { + org.apache.doris.catalog.AggregateFunction expr = new org.apache.doris.catalog.AggregateFunction( + new FunctionName(dbName, getName()), + signature.argumentsTypes.stream().map(DataType::toCatalogDataType).toArray(Type[]::new), + signature.returnType.toCatalogDataType(), + signature.hasVarArgs, + intermediateType.toCatalogDataType(), + objectFile == null ? null : URI.create(objectFile), + initFn, + updateFn, + mergeFn, + serializeFn, + finalizeFn, + getValueFn, + removeFn + ); + expr.setSymbolName(symbol); + expr.setBinaryType(binaryType); + expr.setNullableMode(nullableMode); + expr.setChecksum(checkSum); + expr.setId(functionId); + expr.setStaticLoad(isStaticLoad); + expr.setExpirationTime(expirationTime); + expr.setRuntimeVersion(runtimeVersion); + expr.setFunctionCode(functionCode); + return expr; + } catch (Exception e) { + throw new AnalysisException(e.getMessage(), e.getCause()); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdafBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdafBuilder.java new file mode 100644 index 00000000000000..73a50b6bc400ee --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdafBuilder.java @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.udf; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.common.Pair; +import org.apache.doris.common.util.ReflectionUtils; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.BoundFunction; +import org.apache.doris.nereids.types.DataType; + +import com.google.common.base.Suppliers; + +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * function builder for python udaf + */ +public class PythonUdafBuilder extends UdfBuilder { + private final PythonUdaf udaf; + private final int arity; + private final boolean isVarArgs; + + public PythonUdafBuilder(PythonUdaf udaf) { + this.udaf = udaf; + this.isVarArgs = udaf.hasVarArguments(); + this.arity = udaf.arity(); + } + + @Override + public List getArgTypes() { + return Suppliers.memoize(() -> udaf.getSignatures().get(0).argumentsTypes.stream() + .map(DataType.class::cast) + .collect(Collectors.toList())).get(); + } + + @Override + public List getSignatures() { + return udaf.getSignatures(); + } + + @Override + public Class functionClass() { + return PythonUdaf.class; + } + + @Override + public boolean canApply(List arguments) { + if ((isVarArgs && arity > arguments.size() + 1) || (!isVarArgs && arguments.size() != arity)) { + return false; + } + for (Object argument : arguments) { + if (!(argument instanceof Expression)) { + Optional primitiveType = ReflectionUtils.getPrimitiveType(argument.getClass()); + if (!primitiveType.isPresent() || !Expression.class.isAssignableFrom(primitiveType.get())) { + return false; + } + } + } + return true; + } + + @Override + public Pair build(String name, List arguments) { + return Pair.ofSame((PythonUdaf) udaf.withChildren( + arguments.stream() + .map(Expression.class::cast) + .collect(Collectors.toList())) + ); + } + + @Override + public String parameterDisplayString() { + StringBuilder string = new StringBuilder("("); + for (int i = 0; i < udaf.getArgumentsTypes().size(); ++i) { + if (i > 0) { + string.append(", "); + } + string.append(udaf.getArgumentsTypes().get(i)); + if (isVarArgs && i + 1 == udaf.getArgumentsTypes().size()) { + string.append("..."); + } + } + return string.append(")").toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java new file mode 100644 index 00000000000000..c5870473997629 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdf.java @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.udf; + +import org.apache.doris.analysis.FunctionName; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.Function; +import org.apache.doris.catalog.Function.NullableMode; +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.catalog.Type; +import org.apache.doris.common.util.URI; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.Udf; +import org.apache.doris.nereids.trees.expressions.functions.scalar.ScalarFunction; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.thrift.TFunctionBinaryType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Python UDF for Nereids + */ +public class PythonUdf extends ScalarFunction implements ExplicitlyCastableSignature, Udf { + private final String dbName; + private final long functionId; + private final TFunctionBinaryType binaryType; + private final FunctionSignature signature; + private final NullableMode nullableMode; + private final String objectFile; + private final String symbol; + private final String prepareFn; + private final String closeFn; + private final String checkSum; + private final boolean isStaticLoad; + private final long expirationTime; + private final String runtimeVersion; + private final String functionCode; + + /** + * Constructor of UDF + */ + public PythonUdf(String name, long functionId, String dbName, TFunctionBinaryType binaryType, + FunctionSignature signature, + NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, + String checkSum, boolean isStaticLoad, long expirationTime, + String runtimeVersion, String functionCode, Expression... args) { + super(name, args); + this.dbName = dbName; + this.functionId = functionId; + this.binaryType = binaryType; + this.signature = signature; + this.nullableMode = nullableMode; + this.objectFile = objectFile; + this.symbol = symbol; + this.prepareFn = prepareFn; + this.closeFn = closeFn; + this.checkSum = checkSum; + this.isStaticLoad = isStaticLoad; + this.expirationTime = expirationTime; + this.runtimeVersion = runtimeVersion; + this.functionCode = functionCode; + } + + @Override + public List getSignatures() { + return ImmutableList.of(signature); + } + + @Override + public boolean hasVarArguments() { + return signature.hasVarArgs; + } + + @Override + public int arity() { + return signature.argumentsTypes.size(); + } + + @Override + public NullableMode getNullableMode() { + return nullableMode; + } + + /** + * withChildren. 
+     */
+    @Override
+    public PythonUdf withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == this.children.size());
+        return new PythonUdf(getName(), functionId, dbName, binaryType, signature, nullableMode,
+                objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime,
+                runtimeVersion, functionCode, children.toArray(new Expression[0]));
+    }
+
+    /**
+     * translate catalog python udf to nereids python udf
+     */
+    public static void translateToNereidsFunction(String dbName, org.apache.doris.catalog.ScalarFunction scalar) {
+        String fnName = scalar.functionName();
+        DataType retType = DataType.fromCatalogType(scalar.getReturnType());
+        List<DataType> argTypes = Arrays.stream(scalar.getArgs())
+                .map(DataType::fromCatalogType)
+                .collect(Collectors.toList());
+
+        FunctionSignature.FuncSigBuilder sigBuilder = FunctionSignature.ret(retType);
+        FunctionSignature sig = scalar.hasVarArgs()
+                ? sigBuilder.varArgs(argTypes.toArray(new DataType[0]))
+                : sigBuilder.args(argTypes.toArray(new DataType[0]));
+
+        SlotReference[] arguments = argTypes.stream()
+                .map(type -> new SlotReference(type.toString(), type))
+                .toArray(SlotReference[]::new);
+
+        PythonUdf udf = new PythonUdf(fnName, scalar.getId(), dbName, scalar.getBinaryType(), sig,
+                scalar.getNullableMode(),
+                scalar.getLocation() == null ? null : scalar.getLocation().getLocation(),
+                scalar.getSymbolName(),
+                scalar.getPrepareFnSymbol(),
+                scalar.getCloseFnSymbol(),
+                scalar.getChecksum(), scalar.isStaticLoad(), scalar.getExpirationTime(),
+                scalar.getRuntimeVersion(),
+                scalar.getFunctionCode(),
+                arguments);
+
+        PythonUdfBuilder builder = new PythonUdfBuilder(udf);
+        Env.getCurrentEnv().getFunctionRegistry().addUdf(dbName, fnName, builder);
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitPythonUdf(this, context);
+    }
+
+    @Override
+    public Function getCatalogFunction() {
+        try {
+            org.apache.doris.catalog.ScalarFunction expr = org.apache.doris.catalog.ScalarFunction.createUdf(
+                    binaryType,
+                    new FunctionName(dbName, getName()),
+                    signature.argumentsTypes.stream().map(DataType::toCatalogDataType).toArray(Type[]::new),
+                    signature.returnType.toCatalogDataType(),
+                    signature.hasVarArgs,
+                    objectFile == null ? null : URI.create(objectFile),
+                    symbol,
+                    prepareFn,
+                    closeFn
+            );
+            expr.setNullableMode(nullableMode);
+            expr.setChecksum(checkSum);
+            expr.setId(functionId);
+            expr.setStaticLoad(isStaticLoad);
+            expr.setExpirationTime(expirationTime);
+            expr.setRuntimeVersion(runtimeVersion);
+            expr.setFunctionCode(functionCode);
+            return expr;
+        } catch (Exception e) {
+            throw new AnalysisException(e.getMessage(), e.getCause());
+        }
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java
new file mode 100644
index 00000000000000..7185594099b87c
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdfBuilder.java
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.udf;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.common.Pair;
+import org.apache.doris.common.util.ReflectionUtils;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.BoundFunction;
+import org.apache.doris.nereids.types.DataType;
+import org.apache.doris.nereids.util.TypeCoercionUtils;
+
+import com.google.common.base.Suppliers;
+import com.google.common.collect.Lists;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+/**
+ * function builder for python udf
+ */
+public class PythonUdfBuilder extends UdfBuilder {
+    private final PythonUdf udf;
+    private final int arity;
+    private final boolean isVarArgs;
+
+    public PythonUdfBuilder(PythonUdf udf) {
+        this.udf = udf;
+        this.isVarArgs = udf.hasVarArguments();
+        this.arity = udf.arity();
+    }
+
+    @Override
+    public List<DataType> getArgTypes() {
+        return Suppliers.memoize(() -> udf.getSignatures().get(0).argumentsTypes.stream()
+                .map(DataType.class::cast)
+                .collect(Collectors.toList())).get();
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return udf.getSignatures();
+    }
+
+    @Override
+    public Class functionClass() {
+        return PythonUdf.class;
+    }
+
+    @Override
+    public boolean canApply(List arguments) {
+        if ((isVarArgs && arity > arguments.size() + 1) || (!isVarArgs && arguments.size() != arity)) {
+            return false;
+        }
+        for (Object argument : arguments) {
+            if (!(argument instanceof Expression)) {
+                Optional<Class<?>> primitiveType = ReflectionUtils.getPrimitiveType(argument.getClass());
+                if (!primitiveType.isPresent() || !Expression.class.isAssignableFrom(primitiveType.get())) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public Pair build(String name, List arguments) {
+        List<Expression> exprs = arguments.stream().map(Expression.class::cast).collect(Collectors.toList());
+        List<DataType> argTypes = udf.getSignatures().get(0).argumentsTypes;
+
+        List<Expression> processedExprs = Lists.newArrayList();
+        for (int i = 0; i < exprs.size(); ++i) {
+            processedExprs.add(TypeCoercionUtils.castIfNotSameType(exprs.get(i), argTypes.get(i)));
+        }
+        return Pair.ofSame(udf.withChildren(processedExprs));
+    }
+
+    @Override
+    public String parameterDisplayString() {
+        StringBuilder string = new StringBuilder("(");
+        for (int i = 0; i < udf.getArgumentsTypes().size(); ++i) {
+            if (i > 0) {
+                string.append(", ");
+            }
+            string.append(udf.getArgumentsTypes().get(i));
+            if (isVarArgs && i + 1 == udf.getArgumentsTypes().size()) {
+                string.append("...");
+            }
+        }
+        return string.append(")").toString();
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java
new file mode 100644
index 00000000000000..e0214d9e5b0bc6
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtf.java
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under
one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.udf; + +import org.apache.doris.analysis.FunctionName; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.Function; +import org.apache.doris.catalog.Function.NullableMode; +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.catalog.Type; +import org.apache.doris.common.util.URI; +import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.Udf; +import org.apache.doris.nereids.trees.expressions.functions.generator.TableGeneratingFunction; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.thrift.TFunctionBinaryType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Python UDTF for Nereids + */ +public class PythonUdtf extends TableGeneratingFunction implements ExplicitlyCastableSignature, Udf { + private final String dbName; + private final long functionId; + private final TFunctionBinaryType binaryType; + private final FunctionSignature signature; + private final NullableMode nullableMode; + private final String objectFile; + private final String symbol; + private final String prepareFn; + private final String closeFn; + private final String checkSum; + private final boolean isStaticLoad; + private final long expirationTime; + private final String runtimeVersion; + private final String functionCode; + + /** + * Constructor of Python UDTF + */ + public PythonUdtf(String name, long functionId, String dbName, TFunctionBinaryType binaryType, + FunctionSignature signature, + NullableMode nullableMode, String objectFile, String symbol, String prepareFn, String closeFn, + String checkSum, boolean isStaticLoad, long expirationTime, + String runtimeVersion, String functionCode, Expression... args) { + super(name, args); + this.dbName = dbName; + this.functionId = functionId; + this.binaryType = binaryType; + this.signature = signature; + this.nullableMode = nullableMode; + this.objectFile = objectFile; + this.symbol = symbol; + this.prepareFn = prepareFn; + this.closeFn = closeFn; + this.checkSum = checkSum; + this.isStaticLoad = isStaticLoad; + this.expirationTime = expirationTime; + this.runtimeVersion = runtimeVersion; + this.functionCode = functionCode; + } + + /** + * withChildren. 
+ */ + @Override + public PythonUdtf withChildren(List children) { + Preconditions.checkArgument(children.size() == this.children.size()); + return new PythonUdtf(getName(), functionId, dbName, binaryType, signature, nullableMode, + objectFile, symbol, prepareFn, closeFn, checkSum, isStaticLoad, expirationTime, + runtimeVersion, functionCode, children.toArray(new Expression[0])); + } + + @Override + public List getSignatures() { + return ImmutableList.of(signature); + } + + @Override + public boolean hasVarArguments() { + return signature.hasVarArgs; + } + + @Override + public int arity() { + return signature.argumentsTypes.size(); + } + + @Override + public Function getCatalogFunction() { + try { + org.apache.doris.catalog.ScalarFunction expr = org.apache.doris.catalog.ScalarFunction.createUdf( + binaryType, + new FunctionName(dbName, getName()), + signature.argumentsTypes.stream().map(DataType::toCatalogDataType).toArray(Type[]::new), + signature.returnType.toCatalogDataType(), + signature.hasVarArgs, + objectFile == null ? null : URI.create(objectFile), + symbol, + prepareFn, + closeFn + ); + expr.setNullableMode(nullableMode); + expr.setChecksum(checkSum); + expr.setId(functionId); + expr.setStaticLoad(isStaticLoad); + expr.setExpirationTime(expirationTime); + expr.setUDTFunction(true); + expr.setRuntimeVersion(runtimeVersion); + expr.setFunctionCode(functionCode); + return expr; + } catch (Exception e) { + throw new AnalysisException(e.getMessage(), e.getCause()); + } + } + + /** + * translate catalog python udtf to nereids python udtf + */ + public static void translateToNereidsFunction(String dbName, org.apache.doris.catalog.ScalarFunction scalar) { + String fnName = scalar.functionName(); + DataType retType = DataType.fromCatalogType(scalar.getReturnType()); + List argTypes = Arrays.stream(scalar.getArgs()) + .map(DataType::fromCatalogType) + .collect(Collectors.toList()); + + FunctionSignature.FuncSigBuilder sigBuilder = FunctionSignature.ret(retType); + FunctionSignature sig = scalar.hasVarArgs() + ? sigBuilder.varArgs(argTypes.toArray(new DataType[0])) + : sigBuilder.args(argTypes.toArray(new DataType[0])); + + SlotReference[] arguments = argTypes.stream() + .map(type -> new SlotReference(type.toString(), type)) + .toArray(SlotReference[]::new); + + PythonUdtf udtf = new PythonUdtf(fnName, scalar.getId(), dbName, scalar.getBinaryType(), sig, + scalar.getNullableMode(), + scalar.getLocation() == null ? 
null : scalar.getLocation().getLocation(), + scalar.getSymbolName(), + scalar.getPrepareFnSymbol(), + scalar.getCloseFnSymbol(), + scalar.getChecksum(), + scalar.isStaticLoad(), + scalar.getExpirationTime(), + scalar.getRuntimeVersion(), + scalar.getFunctionCode(), + arguments); + + PythonUdtfBuilder builder = new PythonUdtfBuilder(udtf); + Env.getCurrentEnv().getFunctionRegistry().addUdf(dbName, fnName, builder); + } + + @Override + public NullableMode getNullableMode() { + return nullableMode; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitPythonUdtf(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtfBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtfBuilder.java new file mode 100644 index 00000000000000..3c032ba18abe9d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/udf/PythonUdtfBuilder.java @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.udf; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.common.Pair; +import org.apache.doris.common.util.ReflectionUtils; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.BoundFunction; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.util.TypeCoercionUtils; + +import com.google.common.base.Suppliers; +import com.google.common.collect.Lists; + +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * function builder for python udtf + */ +public class PythonUdtfBuilder extends UdfBuilder { + private final PythonUdtf udtf; + private final int arity; + private final boolean isVarArgs; + + public PythonUdtfBuilder(PythonUdtf udtf) { + this.udtf = udtf; + this.isVarArgs = udtf.hasVarArguments(); + this.arity = udtf.arity(); + } + + @Override + public List getArgTypes() { + return Suppliers.memoize(() -> udtf.getSignatures().get(0).argumentsTypes.stream() + .map(DataType.class::cast) + .collect(Collectors.toList())).get(); + } + + @Override + public List getSignatures() { + return udtf.getSignatures(); + } + + @Override + public Class functionClass() { + return PythonUdtf.class; + } + + @Override + public boolean canApply(List arguments) { + if ((isVarArgs && arity > arguments.size() + 1) || (!isVarArgs && arguments.size() != arity)) { + return false; + } + for (Object argument : arguments) { + if (!(argument instanceof Expression)) { + Optional primitiveType = ReflectionUtils.getPrimitiveType(argument.getClass()); + if (!primitiveType.isPresent() || !Expression.class.isAssignableFrom(primitiveType.get())) { + return false; + } + } + } + return true; + } + + @Override + public Pair build(String name, List arguments) { + List exprs = arguments.stream().map(Expression.class::cast).collect(Collectors.toList()); + List argTypes = udtf.getSignatures().get(0).argumentsTypes; + + List processedExprs = Lists.newArrayList(); + for (int i = 0; i < exprs.size(); ++i) { + processedExprs.add(TypeCoercionUtils.castIfNotSameType(exprs.get(i), argTypes.get(i))); + } + return Pair.ofSame(udtf.withChildren(processedExprs)); + } + + @Override + public String parameterDisplayString() { + StringBuilder string = new StringBuilder("("); + for (int i = 0; i < udtf.getArgumentsTypes().size(); ++i) { + if (i > 0) { + string.append(", "); + } + string.append(udtf.getArgumentsTypes().get(i)); + if (isVarArgs && i + 1 == udtf.getArgumentsTypes().size()) { + string.append("..."); + } + } + return string.append(")").toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java index 09ecdd2714fa83..bb4be4ffffaff6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/AggregateFunctionVisitor.java @@ -97,6 +97,7 @@ import org.apache.doris.nereids.trees.expressions.functions.combinator.MergeCombinator; import org.apache.doris.nereids.trees.expressions.functions.combinator.UnionCombinator; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdaf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdaf; /** 
AggregateFunctionVisitor. */ public interface AggregateFunctionVisitor { @@ -419,4 +420,7 @@ default R visitJavaUdaf(JavaUdaf javaUdaf, C context) { return visitAggregateFunction(javaUdaf, context); } + default R visitPythonUdaf(PythonUdaf pythonUdaf, C context) { + return visitAggregateFunction(pythonUdaf, context); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 0e939800b2926d..cb71fceae5fafe 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -562,6 +562,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.YearsSub; import org.apache.doris.nereids.trees.expressions.functions.udf.AliasUdf; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdf; /** * ScalarFunctionVisitor. @@ -2595,6 +2596,10 @@ default R visitJavaUdf(JavaUdf javaUdf, C context) { return visitScalarFunction(javaUdf, context); } + default R visitPythonUdf(PythonUdf pythonUdf, C context) { + return visitScalarFunction(pythonUdf, context); + } + default R visitAliasUdf(AliasUdf aliasUdf, C context) { return visitScalarFunction(aliasUdf, context); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableGeneratingFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableGeneratingFunctionVisitor.java index 9fae7c397cada9..4192ca61465171 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableGeneratingFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/TableGeneratingFunctionVisitor.java @@ -42,6 +42,7 @@ import org.apache.doris.nereids.trees.expressions.functions.generator.PosExplodeOuter; import org.apache.doris.nereids.trees.expressions.functions.generator.TableGeneratingFunction; import org.apache.doris.nereids.trees.expressions.functions.udf.JavaUdtf; +import org.apache.doris.nereids.trees.expressions.functions.udf.PythonUdtf; /** * visitor function for all table generating function. 
@@ -137,6 +138,10 @@ default R visitJavaUdtf(JavaUdtf udtf, C context) { return visitTableGeneratingFunction(udtf, context); } + default R visitPythonUdtf(PythonUdtf udtf, C context) { + return visitTableGeneratingFunction(udtf, context); + } + default R visitPosExplode(PosExplode posExplode, C context) { return visitTableGeneratingFunction(posExplode, context); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java index 7464a80539c347..ba6d783753914f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateFunctionCommand.java @@ -92,6 +92,8 @@ import org.apache.commons.codec.binary.Hex; import org.apache.commons.collections4.map.CaseInsensitiveMap; import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.io.IOException; import java.io.InputStream; @@ -110,6 +112,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.regex.Pattern; import java.util.stream.Collectors; /** @@ -144,6 +147,10 @@ public class CreateFunctionCommand extends Command implements ForwardWithSync { // iff is static load, BE will be cache the udf class load, so only need load once public static final String IS_STATIC_LOAD = "static_load"; public static final String EXPIRATION_TIME = "expiration_time"; + public static final String RUNTIME_VERSION = "runtime_version"; + + private static final Pattern PYTHON_VERSION_PATTERN = Pattern.compile("^3\\.\\d{1,2}(?:\\.\\d{1,2})?$"); + private static final Logger LOG = LogManager.getLogger(CreateFunctionCommand.class); // timeout for both connection and read. 10 seconds is long enough. 
private static final int HTTP_TIMEOUT_MS = 10000; @@ -171,14 +178,16 @@ public class CreateFunctionCommand extends Command implements ForwardWithSync { // if not, will core dump when input is not null column, but need return null // like https://github.com/apache/doris/pull/14002/files private NullableMode returnNullMode = NullableMode.ALWAYS_NULLABLE; + private String runtimeVersion; + private String functionCode; /** * CreateFunctionCommand */ public CreateFunctionCommand(SetType setType, boolean ifNotExists, boolean isAggregate, boolean isAlias, - boolean isTableFunction, FunctionName functionName, FunctionArgTypesInfo argsDef, - DataType returnType, DataType intermediateType, List parameters, - Expression originFunction, Map properties) { + boolean isTableFunction, FunctionName functionName, FunctionArgTypesInfo argsDef, + DataType returnType, DataType intermediateType, List parameters, + Expression originFunction, Map properties, String functionCode) { super(PlanType.CREATE_FUNCTION_COMMAND); this.setType = setType; this.ifNotExists = ifNotExists; @@ -200,6 +209,7 @@ public CreateFunctionCommand(SetType setType, boolean ifNotExists, boolean isAgg } else { this.properties = ImmutableSortedMap.copyOf(properties, String.CASE_INSENSITIVE_ORDER); } + this.functionCode = functionCode; } @Override @@ -300,7 +310,7 @@ private void analyzeCommon(ConnectContext ctx) throws AnalysisException { String type = properties.getOrDefault(BINARY_TYPE, "JAVA_UDF"); binaryType = getFunctionBinaryType(type); if (binaryType == null) { - throw new AnalysisException("unknown function type"); + throw new AnalysisException("Unknown function type: '" + type + "'"); } if (type.equals("NATIVE")) { throw new AnalysisException("do not support 'NATIVE' udf type after doris version 1.2.0," @@ -337,22 +347,48 @@ private void analyzeCommon(ConnectContext ctx) throws AnalysisException { if (staticLoad != null && staticLoad) { isStaticLoad = true; } - String expirationTimeString = properties.get(EXPIRATION_TIME); - if (expirationTimeString != null) { - long timeMinutes = 0; - try { - timeMinutes = Long.parseLong(expirationTimeString); - } catch (NumberFormatException e) { - throw new AnalysisException(e.getMessage()); - } - if (timeMinutes <= 0) { - throw new AnalysisException("expirationTime should greater than zero: "); - } - this.expirationTime = timeMinutes; + extractExpirationTime(); + } else if (binaryType == TFunctionBinaryType.PYTHON_UDF) { + FunctionUtil.checkEnablePythonUdf(); + + // always_nullable the default value is true, equal null means true + Boolean isReturnNull = parseBooleanFromProperties(IS_RETURN_NULL); + if (isReturnNull != null && !isReturnNull) { + returnNullMode = NullableMode.ALWAYS_NOT_NULLABLE; + } + extractExpirationTime(); + String runtimeVersionString = properties.get(RUNTIME_VERSION); + if (runtimeVersionString == null) { + throw new AnalysisException("Python runtime version is not set"); + } else if (!validatePythonRuntimeVersion(runtimeVersionString)) { + throw new AnalysisException( + String.format("Invalid Python runtime version: '%s'. Expected format:" + + "'3.X.X' or '3.XX.XX' (e.g. 
'3.10' or '3.10.2').", runtimeVersionString)); + } + runtimeVersion = runtimeVersionString; + } + } + + private void extractExpirationTime() throws AnalysisException { + String expirationTimeString = properties.get(EXPIRATION_TIME); + if (expirationTimeString != null) { + long timeMinutes = 0; + try { + timeMinutes = Long.parseLong(expirationTimeString); + } catch (NumberFormatException e) { + throw new AnalysisException(e.getMessage()); + } + if (timeMinutes <= 0) { + throw new AnalysisException("expirationTime should be greater than zero: " + expirationTimeString); } + this.expirationTime = timeMinutes; } } + private static boolean validatePythonRuntimeVersion(String runtimeVersionString) { + return runtimeVersionString != null && PYTHON_VERSION_PATTERN.matcher(runtimeVersionString).matches(); + } + private Boolean parseBooleanFromProperties(String propertyString) throws AnalysisException { String valueOfString = properties.get(propertyString); if (valueOfString == null) { @@ -419,9 +455,13 @@ private void analyzeUdtf() throws AnalysisException { throw new AnalysisException("No 'symbol' in properties"); } if (!returnType.isArrayType()) { - throw new AnalysisException("JAVA_UDF OF UDTF return type must be array type"); + throw new AnalysisException("JAVA_UDTF or PYTHON_UDTF return type must be array type"); + } + if (binaryType == TFunctionBinaryType.JAVA_UDF) { + analyzeJavaUdf(symbol); + } else if (binaryType == TFunctionBinaryType.PYTHON_UDF) { + analyzePythonUdtf(symbol); } - analyzeJavaUdf(symbol); URI location; if (!Strings.isNullOrEmpty(originalUserFile)) { location = URI.create(originalUserFile); @@ -435,6 +475,8 @@ private void analyzeUdtf() throws AnalysisException { function.setChecksum(checksum); function.setNullableMode(returnNullMode); function.setUDTFunction(true); + function.setRuntimeVersion(runtimeVersion); + function.setFunctionCode(functionCode); // Todo: maybe in create tables function, need register two function, one is // normal and one is outer as those have different result when result is NULL.
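        // Illustration (hypothetical example, not verbatim from this patch): PYTHON_VERSION_PATTERN
        // accepts a major version of 3 with a one- or two-digit minor and an optional one- or
        // two-digit patch, so "3.10" and "3.10.2" pass while "2.7.18" and "3.100" are rejected.
        // Assuming the usual CREATE FUNCTION surface syntax, an inline Python UDTF would then be
        // declared roughly as:
        //   CREATE TABLES FUNCTION py_split(STRING) RETURNS ARRAY<STRING> PROPERTIES (
        //       "type" = "PYTHON_UDF", "symbol" = "Split", "runtime_version" = "3.10.2")
        //   AS $$ ...python code... $$;
        // with the $$-delimited body carried into this command as functionCode and unwrapped by
        // analyzePythonUdtf.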
} @@ -453,15 +495,18 @@ private void analyzeUdaf() throws AnalysisException { .location(location); String initFnSymbol = properties.get(INIT_KEY); if (initFnSymbol == null && !(binaryType == TFunctionBinaryType.JAVA_UDF + || binaryType == TFunctionBinaryType.PYTHON_UDF || binaryType == TFunctionBinaryType.RPC)) { throw new AnalysisException("No 'init_fn' in properties"); } String updateFnSymbol = properties.get(UPDATE_KEY); - if (updateFnSymbol == null && !(binaryType == TFunctionBinaryType.JAVA_UDF)) { + if (updateFnSymbol == null && !(binaryType == TFunctionBinaryType.JAVA_UDF + || binaryType == TFunctionBinaryType.PYTHON_UDF)) { throw new AnalysisException("No 'update_fn' in properties"); } String mergeFnSymbol = properties.get(MERGE_KEY); - if (mergeFnSymbol == null && !(binaryType == TFunctionBinaryType.JAVA_UDF)) { + if (mergeFnSymbol == null && !(binaryType == TFunctionBinaryType.JAVA_UDF + || binaryType == TFunctionBinaryType.PYTHON_UDF)) { throw new AnalysisException("No 'merge_fn' in properties"); } String serializeFnSymbol = properties.get(SERIALIZE_KEY); @@ -492,6 +537,8 @@ private void analyzeUdaf() throws AnalysisException { throw new AnalysisException("No 'symbol' in properties of java-udaf"); } analyzeJavaUdaf(symbol); + } else if (binaryType == TFunctionBinaryType.PYTHON_UDF) { + analyzePythonUdaf(symbol); } function = builder.initFnSymbol(initFnSymbol).updateFnSymbol(updateFnSymbol).mergeFnSymbol(mergeFnSymbol) .serializeFnSymbol(serializeFnSymbol).finalizeFnSymbol(finalizeFnSymbol) @@ -502,6 +549,8 @@ private void analyzeUdaf() throws AnalysisException { function.setNullableMode(returnNullMode); function.setStaticLoad(isStaticLoad); function.setExpirationTime(expirationTime); + function.setRuntimeVersion(runtimeVersion); + function.setFunctionCode(functionCode); } private void analyzeUdf() throws AnalysisException { @@ -520,6 +569,8 @@ private void analyzeUdf() throws AnalysisException { checkRPCUdf(symbol); } else if (binaryType == TFunctionBinaryType.JAVA_UDF) { analyzeJavaUdf(symbol); + } else if (binaryType == TFunctionBinaryType.PYTHON_UDF) { + analyzePythonUdf(symbol); } URI location; if (!Strings.isNullOrEmpty(originalUserFile)) { @@ -535,6 +586,8 @@ private void analyzeUdf() throws AnalysisException { function.setNullableMode(returnNullMode); function.setStaticLoad(isStaticLoad); function.setExpirationTime(expirationTime); + function.setRuntimeVersion(runtimeVersion); + function.setFunctionCode(functionCode); } private void analyzeJavaUdaf(String clazz) throws AnalysisException { @@ -564,6 +617,26 @@ private void analyzeJavaUdaf(String clazz) throws AnalysisException { } } + private void analyzePythonUdaf(String clazz) throws AnalysisException { + if (Strings.isNullOrEmpty(clazz)) { + throw new AnalysisException("No symbol class name provided for Python UDAF"); + } + + if (Strings.isNullOrEmpty(this.functionCode)) { + return; + } + + this.functionCode = this.functionCode.trim(); + if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { + throw new AnalysisException("Inline Python UDAF code must start with $$ and end with $$"); + } + + this.functionCode = this.functionCode.substring(2, this.functionCode.length() - 2); + if (this.functionCode.isEmpty()) { + throw new AnalysisException("Inline Python UDAF code is empty"); + } + } + private void checkUdafClass(String clazz, ClassLoader cl, HashMap<String, Method> allMethods) throws ClassNotFoundException, AnalysisException { Class<?> udfClass = cl.loadClass(clazz); @@ -715,6 +788,26 @@ private void
analyzeJavaUdf(String clazz) throws AnalysisException { } } + private void analyzePythonUdf(String clazz) throws AnalysisException { + if (Strings.isNullOrEmpty(clazz)) { + throw new AnalysisException("No symbol class name provided for Python UDF"); + } + + if (Strings.isNullOrEmpty(this.functionCode)) { + return; + } + + this.functionCode = this.functionCode.trim(); + if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { + throw new AnalysisException("Inline Python UDF code must start with $$ and end with $$"); + } + + this.functionCode = this.functionCode.substring(2, this.functionCode.length() - 2); + if (this.functionCode.isEmpty()) { + throw new AnalysisException("Inline Python UDF code is empty"); + } + } + private void checkUdfClass(String clazz, ClassLoader cl) throws ClassNotFoundException, AnalysisException { Class<?> udfClass = cl.loadClass(clazz); List<Method> evalList = Arrays.stream(udfClass.getMethods()) @@ -807,6 +900,26 @@ private void checkUdfType(Class clazz, Method method, Type expType, Class pType, } } + private void analyzePythonUdtf(String clazz) throws AnalysisException { + if (Strings.isNullOrEmpty(clazz)) { + throw new AnalysisException("No symbol class name provided for Python UDTF"); + } + + if (Strings.isNullOrEmpty(this.functionCode)) { + return; + } + + this.functionCode = this.functionCode.trim(); + if (!(this.functionCode.startsWith("$$") && this.functionCode.endsWith("$$"))) { + throw new AnalysisException("Inline Python UDTF code must start with $$ and end with $$"); + } + + this.functionCode = this.functionCode.substring(2, this.functionCode.length() - 2); + if (this.functionCode.isEmpty()) { + throw new AnalysisException("Inline Python UDTF code is empty"); + } + } + private void checkRPCUdf(String symbol) throws AnalysisException { // TODO(yangzhg) support check function in FE when function service behind load balancer // the format for load balance can ref https://github.com/apache/incubator-brpc/blob/master/docs/en/client.md#connect-to-a-cluster diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift index a3aa07c6ee0682..87be26403d67cf 100644 --- a/gensrc/thrift/Types.thrift +++ b/gensrc/thrift/Types.thrift @@ -332,7 +332,9 @@ enum TFunctionBinaryType { JAVA_UDF = 5, - AGG_STATE = 6 + AGG_STATE = 6, + + PYTHON_UDF = 7 } // Represents a fully qualified function name. @@ -408,6 +410,8 @@ struct TFunction { 15: optional bool is_static_load = false 16: optional i64 expiration_time //minutes 17: optional TDictFunction dict_function + 18: optional string runtime_version + 19: optional string function_code } enum TJdbcOperation { diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_basic.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_basic.out new file mode 100644 index 00000000000000..434472b1e2ec51 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_basic.out @@ -0,0 +1,86 @@ +-- This file is automatically generated.
You should know what you did if you want to edit this +-- !select_default -- +1 10 100 1.5 cat1 +2 20 200 3 cat2 +3 30 300 4.5 cat0 +4 40 400 6 cat1 +5 50 500 7.5 cat2 +6 60 600 9 cat0 +7 70 700 10.5 cat1 +8 80 800 12 cat2 +9 90 900 13.5 cat0 +10 100 1000 15 cat1 + +-- !select_sum_all -- +550 + +-- !select_sum_group -- +cat0 180 +cat1 220 +cat2 150 + +-- !select_sum_multiple -- +cat0 180 180 +cat1 220 220 +cat2 150 150 + +-- !select_sum_with_null -- +550 + +-- !select_sum_group_with_null -- +cat0 180 +cat1 220 +cat2 150 + +-- !select_avg_all -- +9.75 + +-- !select_avg_group -- +cat0 11.25 11.25 +cat1 8.25 8.25 +cat2 9.75 9.75 + +-- !select_window_partition -- +3 cat0 30 180 +6 cat0 60 180 +9 cat0 90 180 +1 cat1 10 220 +4 cat1 40 220 +7 cat1 70 220 +10 cat1 100 220 +2 cat2 20 150 +5 cat2 50 150 +8 cat2 80 150 + +-- !select_window_order -- +3 cat0 30 30 +6 cat0 60 90 +9 cat0 90 180 +1 cat1 10 10 +4 cat1 40 50 +7 cat1 70 120 +10 cat1 100 220 +2 cat2 20 20 +5 cat2 50 70 +8 cat2 80 150 + +-- !select_window_rows -- +3 cat0 30 90 +6 cat0 60 180 +9 cat0 90 150 +1 cat1 10 50 +4 cat1 40 120 +7 cat1 70 210 +10 cat1 100 170 +2 cat2 20 70 +5 cat2 50 150 +8 cat2 80 130 + +-- !select_global_1 -- +550 + +-- !select_global_2 -- +cat0 180 +cat1 220 +cat2 150 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_aggregation_inline.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_aggregation_inline.out new file mode 100644 index 00000000000000..4b62812a4dd400 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_aggregation_inline.out @@ -0,0 +1,79 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_data -- +1 A 10.5 85 alpha +2 A 20.3 92 beta +3 A 15.7 78 gamma +4 A 30.2 95 delta +5 A 25.1 88 alpha +6 B 12.4 70 beta +7 B 18.9 85 gamma +8 B 22.5 90 alpha +9 B 16.3 82 beta +10 C 35.7 98 delta +11 C 28.4 91 gamma +12 C 31.2 87 alpha +13 C 26.8 93 beta +14 C 29.5 89 delta +15 C 33.1 95 gamma + +-- !variance_all -- +56.98728888888877 + +-- !variance_group -- +A 47.64639999999991 +B 13.60187500000001 +C 8.818055555555702 + +-- !stddev_all -- +7.548992574435927 + +-- !stddev_group -- +A 6.902637177195388 +B 3.688071989535997 +C 2.969521098688424 + +-- !median_all -- +25.1 + +-- !median_group -- +A 20.3 +B 17.6 +C 30.35 + +-- !collect_all -- +alpha,alpha,alpha,alpha,beta,beta,beta,beta,delta,delta,delta,gamma,gamma,gamma,gamma + +-- !collect_group -- +A alpha,alpha,beta,delta,gamma +B alpha,beta,beta,gamma +C alpha,beta,delta,delta,gamma,gamma + +-- !range_all -- +28 + +-- !range_group -- +A 17 +B 20 +C 11 + +-- !geomean_all -- +22.40937175581814 + +-- !geomean_group -- +A 19.09207495673589 +B 17.12235128890801 +C 30.64227404617737 + +-- !weighted_avg_all -- +24.24764795144158 + +-- !weighted_avg_group -- +A 20.69063926940639 +B 17.8474006116208 +C 30.849547920434 + +-- !multi_agg -- +A 47.64639999999991 6.902637177195388 20.3 17 19.09207495673589 20.69063926940639 +B 13.60187500000001 3.688071989535997 17.6 20 17.12235128890801 17.8474006116208 +C 8.818055555555702 2.969521098688424 30.35 11 30.64227404617737 30.849547920434 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_aggregation_module.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_aggregation_module.out new file mode 100644 index 00000000000000..f28adc2bba4f47 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_aggregation_module.out @@ -0,0 +1,138 @@ +-- This file is 
automatically generated. You should know what you did if you want to edit this +-- !select_data -- +1 A 10.5 85 alpha +2 A 20.3 92 beta +3 A 15.7 78 gamma +4 A 30.2 95 delta +5 A 25.1 88 alpha +6 B 12.4 70 beta +7 B 18.9 85 gamma +8 B 22.5 90 alpha +9 B 16.3 82 beta +10 C 35.7 98 delta +11 C 28.4 91 gamma +12 C 31.2 87 alpha +13 C 26.8 93 beta +14 C 29.5 89 delta +15 C 33.1 95 gamma + +-- !variance_all -- +56.98728888888877 + +-- !variance_group -- +A 47.64639999999991 +B 13.60187500000001 +C 8.818055555555702 + +-- !stddev_all -- +7.548992574435927 + +-- !stddev_group -- +A 6.902637177195388 +B 3.688071989535997 +C 2.969521098688424 + +-- !median_all -- +25.1 + +-- !median_group -- +A 20.3 +B 17.6 +C 30.35 + +-- !collect_all -- +alpha,alpha,alpha,alpha,beta,beta,beta,beta,delta,delta,delta,gamma,gamma,gamma,gamma + +-- !collect_group -- +A alpha,alpha,beta,delta,gamma +B alpha,beta,beta,gamma +C alpha,beta,delta,delta,gamma,gamma + +-- !range_all -- +28 + +-- !range_group -- +A 17 +B 20 +C 11 + +-- !geomean_all -- +22.40937175581814 + +-- !geomean_group -- +A 19.09207495673589 +B 17.12235128890801 +C 30.64227404617737 + +-- !weighted_avg_all -- +24.24764795144158 + +-- !weighted_avg_group -- +A 20.69063926940639 +B 17.8474006116208 +C 30.849547920434 + +-- !multi_agg -- +A 47.64639999999991 6.902637177195388 20.3 17 19.09207495673589 20.69063926940639 +B 13.60187500000001 3.688071989535997 17.6 20 17.12235128890801 17.8474006116208 +C 8.818055555555702 2.969521098688424 30.35 11 30.64227404617737 30.849547920434 + +-- !window_partition -- +1 A 10.5 47.64639999999991 20.3 +2 A 20.3 47.64639999999991 20.3 +3 A 15.7 47.64639999999991 20.3 +4 A 30.2 47.64639999999991 20.3 +5 A 25.1 47.64639999999991 20.3 +6 B 12.4 13.60187500000001 17.6 +7 B 18.9 13.60187500000001 17.6 +8 B 22.5 13.60187500000001 17.6 +9 B 16.3 13.60187500000001 17.6 +10 C 35.7 8.818055555555702 30.35 +11 C 28.4 8.818055555555702 30.35 +12 C 31.2 8.818055555555702 30.35 +13 C 26.8 8.818055555555702 30.35 +14 C 29.5 8.818055555555702 30.35 +15 C 33.1 8.818055555555702 30.35 + +-- !window_order -- +1 A 10.5 0 10.5 +2 A 20.3 24.00999999999999 15.4 +3 A 15.7 16.0266666666667 15.7 +4 A 30.2 52.53687499999995 18 +5 A 25.1 47.64639999999991 20.3 +6 B 12.4 0 12.4 +7 B 18.9 10.5625 15.65 +8 B 22.5 17.4688888888889 18.9 +9 B 16.3 13.60187500000001 17.6 +10 C 35.7 0 35.7 +11 C 28.4 13.32250000000022 32.05 +12 C 31.2 9.042222222222335 31.2 +13 C 26.8 11.40687500000024 29.8 +14 C 29.5 9.293600000000083 29.5 +15 C 33.1 8.818055555555702 30.35 + +-- !compare_native -- +A 6.902637177195388 6.902637177195394 47.64639999999991 47.64639999999999 +B 3.688071989535997 3.688071989535996 13.60187500000001 13.601875 +C 2.969521098688424 2.9695210986884 8.818055555555702 8.81805555555556 + +-- !null_handling -- +66.66666666666669 20 10 alpha,alpha,beta + +-- !global_variance -- +56.98728888888877 + +-- !global_variance_group -- +A 47.64639999999991 +B 13.60187500000001 +C 8.818055555555702 + +-- !empty -- +\N + +-- !single -- +0 + +-- !two -- +15.4 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_state_objects_inline.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_state_objects_inline.out new file mode 100644 index 00000000000000..a54c688a128cf2 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_state_objects_inline.out @@ -0,0 +1,71 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !test_user_profile -- +{"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + +-- !test_product_stats -- +{"Desk Lamp": {"avg_price": 45.99, "total_quantity": 1, "total_revenue": 45.99, "transactions": 1}, "HDMI Cable": {"avg_price": 15.99, "total_quantity": 2, "total_revenue": 31.98, "transactions": 1}, "Headphones": {"avg_price": 149.99, "total_quantity": 1, "total_revenue": 149.99, "transactions": 1}, "Keyboard": {"avg_price": 79.99, "total_quantity": 3, "total_revenue": 239.97, "transactions": 2}, "Laptop Pro": {"avg_price": 1299.99, "total_quantity": 2, "total_revenue": 2599.98, "transactions": 2}, "Monitor": {"avg_price": 399.99, "total_quantity": 3, "total_revenue": 1199.97, "transactions": 2}, "Mouse": {"avg_price": 29.99, "total_quantity": 6, "total_revenue": 179.94, "transactions": 3}, "Office Chair": {"avg_price": 299.99, "total_quantity": 1, "total_revenue": 299.99, "transactions": 1}, "USB Cable": {"avg_price": 9.99, "total_quantity": 3, "total_revenue": 29.97, "transactions": 1}, "Webcam": {"avg_price": 89.99, "total_quantity": 1, "total_revenue": 89.99, "transactions": 1}} + +-- !test_transaction_timeline -- +East {"count": 4, "total": 1779.94, "first_transaction": "2024-01-03 14:00:00+08:00", "last_transaction": "2024-01-08 11:00:00+08:00", "first_amount": 1299.99, "last_amount": 89.97} +North {"count": 5, "total": 2069.93, "first_transaction": "2024-01-01 10:00:00+08:00", "last_transaction": "2024-01-06 14:00:00+08:00", "first_amount": 1299.99, "last_amount": 149.99} +South {"count": 4, "total": 955.93, "first_transaction": "2024-01-02 11:00:00+08:00", "last_transaction": "2024-01-07 15:00:00+08:00", "first_amount": 79.99, "last_amount": 45.99} +West {"count": 2, "total": 61.97, "first_transaction": "2024-01-04 15:00:00+08:00", "last_transaction": "2024-01-06 10:00:00+08:00", "first_amount": 29.99, "last_amount": 31.98} + +-- !test_unique_tracker -- +Accessories {"unique_users": 2, "unique_products": 2, "payment_methods": ["Cash"]} +Electronics {"unique_users": 6, "unique_products": 6, "payment_methods": ["Credit", "Debit", "PayPal"]} +Furniture {"unique_users": 1, "unique_products": 1, "payment_methods": ["Credit"]} +Home {"unique_users": 1, "unique_products": 1, "payment_methods": ["Debit"]} + +-- !test_category_summary -- +{"Accessories": {"avg_per_transaction": 30.98, "total_items": 5, "total_revenue": 61.95, "transactions": 2}, "Electronics": {"avg_per_transaction": 405.44, "total_items": 16, "total_revenue": 4459.84, "transactions": 11}, "Furniture": {"avg_per_transaction": 299.99, "total_items": 1, "total_revenue": 299.99, "transactions": 1}, "Home": {"avg_per_transaction": 45.99, "total_items": 1, "total_revenue": 45.99, "transactions": 1}} + +-- !test_hierarchical_agg -- +{"East": {"categories": {"Electronics": {"products": 3, "revenue": 1479.95}, "Furniture": {"products": 1, "revenue": 299.99}}, "total_revenue": 1779.94}, "North": {"categories": {"Electronics": {"products": 5, "revenue": 2069.93}}, "total_revenue": 2069.93}, "South": {"categories": 
{"Accessories": {"products": 1, "revenue": 29.97}, "Electronics": {"products": 2, "revenue": 879.97}, "Home": {"products": 1, "revenue": 45.99}}, "total_revenue": 955.93}, "West": {"categories": {"Accessories": {"products": 1, "revenue": 31.98}, "Electronics": {"products": 1, "revenue": 29.99}}, "total_revenue": 61.97}} + +-- !test_complex_window -- +101 Laptop Pro 1299.99 {"101": {"item_count": 1, "total_spent": 1299.99, "unique_categories": 1}} +101 Mouse 29.99 {"101": {"item_count": 2, "total_spent": 1359.97, "unique_categories": 1}} +101 Monitor 399.99 {"101": {"item_count": 3, "total_spent": 1759.96, "unique_categories": 1}} +101 Headphones 149.99 {"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}} +102 Keyboard 79.99 {"102": {"item_count": 1, "total_spent": 79.99, "unique_categories": 1}} +102 USB Cable 9.99 {"102": {"item_count": 2, "total_spent": 109.96, "unique_categories": 2}} +102 Desk Lamp 45.99 {"102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}} +103 Laptop Pro 1299.99 {"103": {"item_count": 1, "total_spent": 1299.99, "unique_categories": 1}} +103 Webcam 89.99 {"103": {"item_count": 2, "total_spent": 1389.98, "unique_categories": 1}} +103 Mouse 29.99 {"103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}} +104 Mouse 29.99 {"104": {"item_count": 1, "total_spent": 29.99, "unique_categories": 1}} +104 HDMI Cable 15.99 {"104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}} +105 Keyboard 79.99 {"105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}} +106 Monitor 399.99 {"106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}} +107 Office Chair 299.99 {"107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + +-- !test_multi_complex -- +East {"unique_users": 2, "unique_products": 4, "payment_methods": ["Credit", "PayPal"]} {"Electronics": {"avg_per_transaction": 493.32, "total_items": 5, "total_revenue": 1479.95, "transactions": 3}, "Furniture": {"avg_per_transaction": 299.99, "total_items": 1, "total_revenue": 299.99, "transactions": 1}} +North {"unique_users": 2, "unique_products": 5, "payment_methods": ["Credit", "Debit"]} {"Electronics": {"avg_per_transaction": 413.99, "total_items": 7, "total_revenue": 2069.93, "transactions": 5}} +South {"unique_users": 2, "unique_products": 4, "payment_methods": ["Cash", "Credit", "Debit"]} {"Accessories": {"avg_per_transaction": 29.97, "total_items": 3, "total_revenue": 29.97, "transactions": 1}, "Electronics": {"avg_per_transaction": 439.99, "total_items": 3, "total_revenue": 879.97, "transactions": 2}, "Home": {"avg_per_transaction": 45.99, "total_items": 1, "total_revenue": 45.99, "transactions": 1}} +West {"unique_users": 1, "unique_products": 2, "payment_methods": ["Cash", "Credit"]} {"Accessories": {"avg_per_transaction": 31.98, "total_items": 2, "total_revenue": 31.98, "transactions": 1}, "Electronics": {"avg_per_transaction": 29.99, "total_items": 1, "total_revenue": 29.99, "transactions": 1}} + +-- !test_nested_complex -- +East {"Laptop Pro": {"avg_price": 1299.99, "total_quantity": 1, "total_revenue": 1299.99, "transactions": 1}, "Office Chair": {"avg_price": 299.99, "total_quantity": 1, "total_revenue": 299.99, "transactions": 1}, "Webcam": {"avg_price": 89.99, "total_quantity": 1, "total_revenue": 89.99, "transactions": 1}} +North {"Headphones": {"avg_price": 149.99, "total_quantity": 1, "total_revenue": 149.99, "transactions": 1}, "Keyboard": {"avg_price": 79.99, "total_quantity": 2, 
"total_revenue": 159.98, "transactions": 1}, "Laptop Pro": {"avg_price": 1299.99, "total_quantity": 1, "total_revenue": 1299.99, "transactions": 1}, "Monitor": {"avg_price": 399.99, "total_quantity": 1, "total_revenue": 399.99, "transactions": 1}} +South {"Keyboard": {"avg_price": 79.99, "total_quantity": 1, "total_revenue": 79.99, "transactions": 1}, "Monitor": {"avg_price": 399.99, "total_quantity": 2, "total_revenue": 799.98, "transactions": 1}} + +-- !test_complex_shuffle -- +East Electronics {"East": {"categories": {"Electronics": {"products": 3, "revenue": 1479.95}}, "total_revenue": 1479.95}} +East Furniture {"East": {"categories": {"Furniture": {"products": 1, "revenue": 299.99}}, "total_revenue": 299.99}} +North Electronics {"North": {"categories": {"Electronics": {"products": 5, "revenue": 2069.93}}, "total_revenue": 2069.93}} +South Accessories {"South": {"categories": {"Accessories": {"products": 1, "revenue": 29.97}}, "total_revenue": 29.97}} +South Electronics {"South": {"categories": {"Electronics": {"products": 2, "revenue": 879.97}}, "total_revenue": 879.97}} +South Home {"South": {"categories": {"Home": {"products": 1, "revenue": 45.99}}, "total_revenue": 45.99}} +West Accessories {"West": {"categories": {"Accessories": {"products": 1, "revenue": 31.98}}, "total_revenue": 31.98}} +West Electronics {"West": {"categories": {"Electronics": {"products": 1, "revenue": 29.99}}, "total_revenue": 29.99}} + +-- !test_empty_groups -- + +-- !test_null_handling -- +{"101": {"item_count": 1, "total_spent": 100.0, "unique_categories": 1}, "102": {"item_count": 0, "total_spent": 300.0, "unique_categories": 1}, "103": {"item_count": 1, "total_spent": 400.0, "unique_categories": 0}, "104": {"item_count": 1, "total_spent": 0.0, "unique_categories": 1}} + +-- !test_large_state -- +15 {"East": {"categories": {"Electronics": {"products": 3, "revenue": 1479.95}, "Furniture": {"products": 1, "revenue": 299.99}}, "total_revenue": 1779.94}, "North": {"categories": {"Electronics": {"products": 5, "revenue": 2069.93}}, "total_revenue": 2069.93}, "South": {"categories": {"Accessories": {"products": 1, "revenue": 29.97}, "Electronics": {"products": 2, "revenue": 879.97}, "Home": {"products": 1, "revenue": 45.99}}, "total_revenue": 955.93}, "West": {"categories": {"Accessories": {"products": 1, "revenue": 31.98}, "Electronics": {"products": 1, "revenue": 29.99}}, "total_revenue": 61.97}} {"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_state_objects_module.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_state_objects_module.out new file mode 100644 index 00000000000000..03796e9a118a6d --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_complex_state_objects_module.out @@ -0,0 +1,77 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !test_user_profile -- +{"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + +-- !test_product_stats -- +{"Desk Lamp": {"avg_price": 45.99, "total_quantity": 1, "total_revenue": 45.99, "transactions": 1}, "HDMI Cable": {"avg_price": 15.99, "total_quantity": 2, "total_revenue": 31.98, "transactions": 1}, "Headphones": {"avg_price": 149.99, "total_quantity": 1, "total_revenue": 149.99, "transactions": 1}, "Keyboard": {"avg_price": 79.99, "total_quantity": 3, "total_revenue": 239.97, "transactions": 2}, "Laptop Pro": {"avg_price": 1299.99, "total_quantity": 2, "total_revenue": 2599.98, "transactions": 2}, "Monitor": {"avg_price": 399.99, "total_quantity": 3, "total_revenue": 1199.97, "transactions": 2}, "Mouse": {"avg_price": 29.99, "total_quantity": 6, "total_revenue": 179.94, "transactions": 3}, "Office Chair": {"avg_price": 299.99, "total_quantity": 1, "total_revenue": 299.99, "transactions": 1}, "USB Cable": {"avg_price": 9.99, "total_quantity": 3, "total_revenue": 29.97, "transactions": 1}, "Webcam": {"avg_price": 89.99, "total_quantity": 1, "total_revenue": 89.99, "transactions": 1}} + +-- !test_transaction_timeline -- +East {"count": 4, "total": 1779.94, "first_transaction": "2024-01-03 14:00:00+08:00", "last_transaction": "2024-01-08 11:00:00+08:00", "first_amount": 1299.99, "last_amount": 89.97} +North {"count": 5, "total": 2069.93, "first_transaction": "2024-01-01 10:00:00+08:00", "last_transaction": "2024-01-06 14:00:00+08:00", "first_amount": 1299.99, "last_amount": 149.99} +South {"count": 4, "total": 955.93, "first_transaction": "2024-01-02 11:00:00+08:00", "last_transaction": "2024-01-07 15:00:00+08:00", "first_amount": 79.99, "last_amount": 45.99} +West {"count": 2, "total": 61.97, "first_transaction": "2024-01-04 15:00:00+08:00", "last_transaction": "2024-01-06 10:00:00+08:00", "first_amount": 29.99, "last_amount": 31.98} + +-- !test_unique_tracker -- +Accessories {"unique_users": 2, "unique_products": 2, "payment_methods": ["Cash"]} +Electronics {"unique_users": 6, "unique_products": 6, "payment_methods": ["Credit", "Debit", "PayPal"]} +Furniture {"unique_users": 1, "unique_products": 1, "payment_methods": ["Credit"]} +Home {"unique_users": 1, "unique_products": 1, "payment_methods": ["Debit"]} + +-- !test_category_summary -- +{"Accessories": {"avg_per_transaction": 30.98, "total_items": 5, "total_revenue": 61.95, "transactions": 2}, "Electronics": {"avg_per_transaction": 405.44, "total_items": 16, "total_revenue": 4459.84, "transactions": 11}, "Furniture": {"avg_per_transaction": 299.99, "total_items": 1, "total_revenue": 299.99, "transactions": 1}, "Home": {"avg_per_transaction": 45.99, "total_items": 1, "total_revenue": 45.99, "transactions": 1}} + +-- !test_hierarchical_agg -- +{"East": {"categories": {"Electronics": {"products": 3, "revenue": 1479.95}, "Furniture": {"products": 1, "revenue": 299.99}}, "total_revenue": 1779.94}, "North": {"categories": {"Electronics": {"products": 5, "revenue": 2069.93}}, "total_revenue": 2069.93}, "South": {"categories": 
{"Accessories": {"products": 1, "revenue": 29.97}, "Electronics": {"products": 2, "revenue": 879.97}, "Home": {"products": 1, "revenue": 45.99}}, "total_revenue": 955.93}, "West": {"categories": {"Accessories": {"products": 1, "revenue": 31.98}, "Electronics": {"products": 1, "revenue": 29.99}}, "total_revenue": 61.97}} + +-- !test_complex_window -- +101 Laptop Pro 1299.99 {"101": {"item_count": 1, "total_spent": 1299.99, "unique_categories": 1}} +101 Mouse 29.99 {"101": {"item_count": 2, "total_spent": 1359.97, "unique_categories": 1}} +101 Monitor 399.99 {"101": {"item_count": 3, "total_spent": 1759.96, "unique_categories": 1}} +101 Headphones 149.99 {"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}} +102 Keyboard 79.99 {"102": {"item_count": 1, "total_spent": 79.99, "unique_categories": 1}} +102 USB Cable 9.99 {"102": {"item_count": 2, "total_spent": 109.96, "unique_categories": 2}} +102 Desk Lamp 45.99 {"102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}} +103 Laptop Pro 1299.99 {"103": {"item_count": 1, "total_spent": 1299.99, "unique_categories": 1}} +103 Webcam 89.99 {"103": {"item_count": 2, "total_spent": 1389.98, "unique_categories": 1}} +103 Mouse 29.99 {"103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}} +104 Mouse 29.99 {"104": {"item_count": 1, "total_spent": 29.99, "unique_categories": 1}} +104 HDMI Cable 15.99 {"104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}} +105 Keyboard 79.99 {"105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}} +106 Monitor 399.99 {"106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}} +107 Office Chair 299.99 {"107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + +-- !test_multi_complex -- +East {"unique_users": 2, "unique_products": 4, "payment_methods": ["Credit", "PayPal"]} {"Electronics": {"avg_per_transaction": 493.32, "total_items": 5, "total_revenue": 1479.95, "transactions": 3}, "Furniture": {"avg_per_transaction": 299.99, "total_items": 1, "total_revenue": 299.99, "transactions": 1}} +North {"unique_users": 2, "unique_products": 5, "payment_methods": ["Credit", "Debit"]} {"Electronics": {"avg_per_transaction": 413.99, "total_items": 7, "total_revenue": 2069.93, "transactions": 5}} +South {"unique_users": 2, "unique_products": 4, "payment_methods": ["Cash", "Credit", "Debit"]} {"Accessories": {"avg_per_transaction": 29.97, "total_items": 3, "total_revenue": 29.97, "transactions": 1}, "Electronics": {"avg_per_transaction": 439.99, "total_items": 3, "total_revenue": 879.97, "transactions": 2}, "Home": {"avg_per_transaction": 45.99, "total_items": 1, "total_revenue": 45.99, "transactions": 1}} +West {"unique_users": 1, "unique_products": 2, "payment_methods": ["Cash", "Credit"]} {"Accessories": {"avg_per_transaction": 31.98, "total_items": 2, "total_revenue": 31.98, "transactions": 1}, "Electronics": {"avg_per_transaction": 29.99, "total_items": 1, "total_revenue": 29.99, "transactions": 1}} + +-- !test_nested_complex -- +East {"Laptop Pro": {"avg_price": 1299.99, "total_quantity": 1, "total_revenue": 1299.99, "transactions": 1}, "Office Chair": {"avg_price": 299.99, "total_quantity": 1, "total_revenue": 299.99, "transactions": 1}, "Webcam": {"avg_price": 89.99, "total_quantity": 1, "total_revenue": 89.99, "transactions": 1}} +North {"Headphones": {"avg_price": 149.99, "total_quantity": 1, "total_revenue": 149.99, "transactions": 1}, "Keyboard": {"avg_price": 79.99, "total_quantity": 2, 
"total_revenue": 159.98, "transactions": 1}, "Laptop Pro": {"avg_price": 1299.99, "total_quantity": 1, "total_revenue": 1299.99, "transactions": 1}, "Monitor": {"avg_price": 399.99, "total_quantity": 1, "total_revenue": 399.99, "transactions": 1}} +South {"Keyboard": {"avg_price": 79.99, "total_quantity": 1, "total_revenue": 79.99, "transactions": 1}, "Monitor": {"avg_price": 399.99, "total_quantity": 2, "total_revenue": 799.98, "transactions": 1}} + +-- !test_complex_shuffle -- +East Electronics {"East": {"categories": {"Electronics": {"products": 3, "revenue": 1479.95}}, "total_revenue": 1479.95}} +East Furniture {"East": {"categories": {"Furniture": {"products": 1, "revenue": 299.99}}, "total_revenue": 299.99}} +North Electronics {"North": {"categories": {"Electronics": {"products": 5, "revenue": 2069.93}}, "total_revenue": 2069.93}} +South Accessories {"South": {"categories": {"Accessories": {"products": 1, "revenue": 29.97}}, "total_revenue": 29.97}} +South Electronics {"South": {"categories": {"Electronics": {"products": 2, "revenue": 879.97}}, "total_revenue": 879.97}} +South Home {"South": {"categories": {"Home": {"products": 1, "revenue": 45.99}}, "total_revenue": 45.99}} +West Accessories {"West": {"categories": {"Accessories": {"products": 1, "revenue": 31.98}}, "total_revenue": 31.98}} +West Electronics {"West": {"categories": {"Electronics": {"products": 1, "revenue": 29.99}}, "total_revenue": 29.99}} + +-- !test_empty_groups -- + +-- !test_null_handling -- +{"101": {"item_count": 1, "total_spent": 100.0, "unique_categories": 1}, "102": {"item_count": 0, "total_spent": 300.0, "unique_categories": 1}, "103": {"item_count": 1, "total_spent": 400.0, "unique_categories": 0}, "104": {"item_count": 1, "total_spent": 0.0, "unique_categories": 1}} + +-- !test_large_state -- +15 {"East": {"categories": {"Electronics": {"products": 3, "revenue": 1479.95}, "Furniture": {"products": 1, "revenue": 299.99}}, "total_revenue": 1779.94}, "North": {"categories": {"Electronics": {"products": 5, "revenue": 2069.93}}, "total_revenue": 2069.93}, "South": {"categories": {"Accessories": {"products": 1, "revenue": 29.97}, "Electronics": {"products": 2, "revenue": 879.97}, "Home": {"products": 1, "revenue": 45.99}}, "total_revenue": 955.93}, "West": {"categories": {"Accessories": {"products": 1, "revenue": 31.98}, "Electronics": {"products": 1, "revenue": 29.99}}, "total_revenue": 61.97}} {"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + +-- !module_reuse -- +{"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} {"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": 
{"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + +-- !global_test -- +{"101": {"item_count": 4, "total_spent": 1909.95, "unique_categories": 1}, "102": {"item_count": 3, "total_spent": 155.95, "unique_categories": 3}, "103": {"item_count": 3, "total_spent": 1479.95, "unique_categories": 1}, "104": {"item_count": 2, "total_spent": 61.97, "unique_categories": 2}, "105": {"item_count": 1, "total_spent": 159.98, "unique_categories": 1}, "106": {"item_count": 1, "total_spent": 799.98, "unique_categories": 1}, "107": {"item_count": 1, "total_spent": 299.99, "unique_categories": 1}} + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_concurrent.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_concurrent.out new file mode 100644 index 00000000000000..d4257e8a4971b0 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_concurrent.out @@ -0,0 +1,47 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !concurrent_two_udaf -- +550 10 + +-- !concurrent_three_udaf -- +550 10 55 + +-- !concurrent_udaf_group_by -- +A 200 4 50 +B 150 3 50 +C 200 3 66.66666666666667 + +-- !concurrent_five_udaf -- +550 10 55 100 10 + +-- !concurrent_mixed_types -- +550 60 10 5.5 + +-- !concurrent_complex_group -- +A 200 4 50 100 10 180 +B 150 3 50 80 30 82.5 +C 200 3 66.66666666666667 90 50 125 + +-- !concurrent_same_udaf -- +550 55 + +-- !concurrent_with_filter -- +490 7 70 + +-- !concurrent_with_having -- +A 200 4 50 +B 150 3 50 +C 200 3 66.66666666666667 + +-- !concurrent_stress -- +A 200 26 4 4 50 6.5 100 10 22 180 +B 150 13 3 3 50 4.333333333333333 80 30 16.5 82.5 +C 200 16 3 3 66.66666666666667 5.333333333333333 90 50 21.5 125 + +-- !concurrent_verify_sum -- +550 550 10 10 + +-- !concurrent_verify_group -- +A 200 200 4 4 50 50 +B 150 150 3 3 50 50 +C 200 200 3 3 66.66666666666667 66.66666666666667 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_data_types.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_data_types.out new file mode 100644 index 00000000000000..730c26011fc2eb --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_data_types.out @@ -0,0 +1,35 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_data -- +1 10 100 1000 10000 1.5 10.55 100.50 apple 2024-01-01 true +2 20 200 2000 20000 2.5 20.55 200.50 banana 2024-01-02 false +3 30 300 3000 30000 3.5 30.55 300.50 cherry 2024-01-03 true +4 40 400 4000 40000 4.5 40.55 400.50 date 2024-01-04 true +5 50 500 5000 50000 5.5 50.55 500.50 elderberry 2024-01-05 false + +-- !tinyint -- +150 + +-- !smallint -- +1500 + +-- !bigint -- +150000 + +-- !float -- +17.5 + +-- !decimal -- +1502.5 + +-- !string -- +apple,banana,cherry,date,elderberry + +-- !boolean -- +3 + +-- !mixed_types -- +150 1500 150000 17.5 1502.5 apple,banana,cherry,date,elderberry 3 + +-- !type_conversion -- +15000 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_edge_cases.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_edge_cases.out new file mode 100644 index 00000000000000..2462201fae348a --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_edge_cases.out @@ -0,0 +1,72 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_data -- +1 positive 2147483647 1.797693134862316e+308 +2 positive 1000000000 999999999.999 +3 negative -2147483648 -1.797693134862316e+308 +4 negative -1000000000 -999999999.999 +5 zero 0 0 +6 zero 0 0 +7 small 1 1e-10 +8 small -1 -1e-10 +9 duplicate 100 100.5 +10 duplicate 100 100.5 +11 duplicate 100 100.5 +12 mixed 50 -50.5 +13 mixed -50 50.5 + +-- !large_numbers -- +duplicate 300 +mixed 0 +negative -3147483648 +positive 3147483647 +small 0 +zero 0 + +-- !negative_min -- +-2147483648 + +-- !large_max -- +2147483647 + +-- !distinct_count -- +duplicate 1 +mixed 2 +negative 2 +positive 2 +small 2 +zero 1 + +-- !product -- +small -1 + +-- !abs_sum -- +duplicate 300 +mixed 100 +negative 3147483648 +positive 3147483647 +small 2 +zero 0 + +-- !safe_avg -- +duplicate 100 +mixed 0 +negative -1573741824 +positive 1573741823.5 +small 0 +zero 0 + +-- !sign_summary -- +pos:7,neg:4,zero:2 + +-- !all_zeros_sum -- +0 + +-- !all_zeros_avg -- +0 + +-- !all_zeros_product -- +0 + +-- !single_value -- +42 42 42 42 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_inline.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_inline.out new file mode 100644 index 00000000000000..5fff44d15155e4 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_inline.out @@ -0,0 +1,78 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_data -- +1 10 10.5 A +2 20 20.5 A +3 30 30.5 B +4 40 40.5 B +5 50 50.5 C + +-- !test1 -- +150 150 + +-- !test2 -- +A 30 30 +B 70 70 +C 50 50 + +-- !test3 -- +30.5 30.5 + +-- !test4 -- +A 15.5 15.5 +B 35.5 35.5 +C 50.5 50.5 + +-- !test5 -- +5 5 + +-- !test6 -- +A 2 2 +B 2 2 +C 1 1 + +-- !test7 -- +50 50 + +-- !test8 -- +A 20 20 +B 40 40 +C 50 50 + +-- !test_null1 -- +150 150 + +-- !test_null2 -- +5 5 + +-- !test_null3 -- +A 30 30 +B 70 70 +C 50 50 + +-- !test_window1 -- +1 A 10 30 30 +2 A 20 30 30 +3 B 30 70 70 +4 B 40 70 70 +5 C 50 50 50 + +-- !test_window2 -- +1 A 10 10 10 +2 A 20 30 30 +3 B 30 30 30 +4 B 40 70 70 +5 C 50 50 50 + +-- !test_multiple -- +A 30 30 2 2 20 20 30.5 30.5 +B 70 70 2 2 40 40 47.16666666666666 47.16666666666666 +C 50 50 1 1 50 50 50.5 50.5 + +-- !test_global1 -- +150 150 + +-- !test_global2 -- +A 30 30 +B 70 70 +C 50 50 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_inline_simple.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_inline_simple.out new file mode 100644 index 00000000000000..abdebbd3d1a934 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_inline_simple.out @@ -0,0 +1,21 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !data -- +1 10 A +2 20 A +3 30 B +4 40 B +5 50 C + +-- !sum_all -- +150 + +-- !sum_group -- +A 30 +B 70 +C 50 + +-- !compare -- +A 30 30 +B 70 70 +C 50 50 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_nested_query.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_nested_query.out new file mode 100644 index 00000000000000..84f1493598976a --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_nested_query.out @@ -0,0 +1,67 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_orders -- +1 101 1 2 29.99 2024-01-01 +2 101 2 1 49.99 2024-01-02 +3 102 1 3 29.99 2024-01-03 +4 102 3 1 99.98999999999999 2024-01-04 +5 103 2 2 49.99 2024-01-05 +6 103 1 1 29.99 2024-01-06 +7 104 3 2 99.98999999999999 2024-01-07 +8 104 2 3 49.99 2024-01-08 +9 105 1 4 29.99 2024-01-09 +10 105 3 1 99.98999999999999 2024-01-10 + +-- !select_customers -- +101 Alice New York Premium +102 Bob Los Angeles Standard +103 Charlie Chicago Premium +104 David Houston Standard +105 Eve Phoenix Premium + +-- !subquery_1 -- +101 109.97 +102 189.96 +103 129.97 +104 349.95 +105 219.95 + +-- !join_1 -- +Alice New York Premium 109.97 2 +Bob Los Angeles Standard 189.96 2 +Charlie Chicago Premium 129.97 2 +David Houston Standard 349.95 2 +Eve Phoenix Premium 219.95 2 + +-- !cte_1 -- +104 David 349.95 2 +105 Eve 219.95 2 +102 Bob 189.96 2 +103 Charlie 129.97 2 +101 Alice 109.97 2 + +-- !cte_2 -- +Eve 219.95 +Charlie 129.97 +Alice 109.97 + +-- !nested_agg -- +Premium 3 459.89 +Standard 2 539.91 + +-- !having -- +Standard 539.91 4 +Premium 459.89 6 + +-- !self_join -- +1 299.9 +2 299.94 +3 399.96 + +-- !union -- +Premium 459.89 +Standard 539.91 + +-- !complex_nested -- +Premium 153.2966666666667 219.95 109.97 +Standard 269.955 349.95 189.96 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_null_handling.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_null_handling.out new file mode 100644 index 00000000000000..ed430f0e4f9659 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_null_handling.out @@ -0,0 +1,65 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_data -- +1 A 10 10.5 apple +2 A \N 20.3 banana +3 A 30 \N \N +4 A \N \N cherry +5 B 40 40.2 \N +6 B \N \N \N +7 B 60 60.8 date +8 C \N \N \N +9 C \N \N \N +10 C \N \N \N + +-- !count_null_all -- +4 + +-- !count_null_group -- +A 2 +B 2 +C 0 + +-- !sum_null_all -- +131.8 + +-- !sum_null_group -- +A 30.8 +B 101 +C \N + +-- !first_nonnull_all -- +apple + +-- !first_nonnull_group -- +A apple +B date +C \N + +-- !null_count_all -- +6 + +-- !null_count_group -- +A 2 +B 1 +C 3 + +-- !avg_coalesce_all -- +32.95 + +-- !avg_coalesce_group -- +A 15.4 +B 50.5 +C \N + +-- !all_null_count -- +0 + +-- !all_null_sum -- +\N + +-- !empty_count -- +0 + +-- !empty_sum -- +\N + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_simple.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_simple.out new file mode 100644 index 00000000000000..af2f48f16cf596 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_simple.out @@ -0,0 +1,21 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_data -- +1 10 A +2 20 A +3 30 B +4 40 B +5 50 C + +-- !test1 -- +150 + +-- !test2 -- +A 30 +B 70 +C 50 + +-- !test3 -- +A 30 30 +B 70 70 +C 50 50 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_window_advanced_inline.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_window_advanced_inline.out new file mode 100644 index 00000000000000..c41bd2107bb4aa --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_window_advanced_inline.out @@ -0,0 +1,141 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_data -- +2024-01-01T10:00 temperature 20.5 sensor1 room1 +2024-01-01T10:00 humidity 45 sensor2 room1 +2024-01-01T10:00 temperature 19.5 sensor3 room2 +2024-01-01T10:05 temperature 21 sensor1 room1 +2024-01-01T10:05 humidity 46 sensor2 room1 +2024-01-01T10:05 temperature 20 sensor3 room2 +2024-01-01T10:10 temperature 21.5 sensor1 room1 +2024-01-01T10:10 humidity 47.5 sensor2 room1 +2024-01-01T10:10 temperature 20.8 sensor3 room2 +2024-01-01T10:15 temperature 22 sensor1 room1 +2024-01-01T10:15 humidity 48 sensor2 room1 +2024-01-01T10:15 temperature 21.2 sensor3 room2 +2024-01-01T10:20 temperature 22.5 sensor1 room1 +2024-01-01T10:20 humidity 49 sensor2 room1 +2024-01-01T10:20 temperature 21.8 sensor3 room2 + +-- !moving_avg_3period -- +2024-01-01T10:00 sensor1 20.5 20.5 +2024-01-01T10:05 sensor1 21 20.75 +2024-01-01T10:10 sensor1 21.5 21 +2024-01-01T10:15 sensor1 22 21.5 +2024-01-01T10:20 sensor1 22.5 22 +2024-01-01T10:00 sensor3 19.5 19.5 +2024-01-01T10:05 sensor3 20 19.75 +2024-01-01T10:10 sensor3 20.8 20.1 +2024-01-01T10:15 sensor3 21.2 20.66666666666667 +2024-01-01T10:20 sensor3 21.8 21.26666666666667 + +-- !rolling_stddev -- +2024-01-01T10:00 sensor1 20.5 \N +2024-01-01T10:05 sensor1 21 0.25 +2024-01-01T10:10 sensor1 21.5 0.408248290463863 +2024-01-01T10:15 sensor1 22 0.5590169943749475 +2024-01-01T10:20 sensor1 22.5 0.5590169943749475 +2024-01-01T10:00 sensor3 19.5 \N +2024-01-01T10:05 sensor3 20 0.25 +2024-01-01T10:10 sensor3 20.8 0.535412613473634 +2024-01-01T10:15 sensor3 21.2 0.6647367900154164 +2024-01-01T10:20 sensor3 21.8 0.6538348415311012 + +-- !change_from_first -- +2024-01-01T10:00 sensor1 20.5 20.5 +2024-01-01T10:05 sensor1 21 20.75 +2024-01-01T10:10 sensor1 21.5 21 +2024-01-01T10:15 sensor1 22 21.5 +2024-01-01T10:20 sensor1 22.5 22 +2024-01-01T10:00 sensor3 19.5 19.5 
+2024-01-01T10:05 sensor3 20 19.75 +2024-01-01T10:10 sensor3 20.8 20.1 +2024-01-01T10:15 sensor3 21.2 20.66666666666667 +2024-01-01T10:20 sensor3 21.8 21.26666666666667 + +-- !minmax_normalize -- +sensor1 2024-01-01T10:00 20.5 20.5 20.5 +sensor1 2024-01-01T10:05 21 20.5 21 +sensor1 2024-01-01T10:10 21.5 20.5 21.5 +sensor1 2024-01-01T10:15 22 20.5 22 +sensor1 2024-01-01T10:20 22.5 20.5 22.5 +sensor3 2024-01-01T10:00 19.5 19.5 19.5 +sensor3 2024-01-01T10:05 20 19.5 20 +sensor3 2024-01-01T10:10 20.8 19.5 20.8 +sensor3 2024-01-01T10:15 21.2 19.5 21.2 +sensor3 2024-01-01T10:20 21.8 19.5 21.8 + +-- !cumulative_weighted -- +2024-01-01T10:00 room1 humidity 45 45 45 +2024-01-01T10:05 room1 humidity 46 45.5 45.5 +2024-01-01T10:10 room1 humidity 47.5 46.16666666666666 46.75 +2024-01-01T10:15 room1 humidity 48 46.625 47.75 +2024-01-01T10:20 room1 humidity 49 47.1 48.5 +2024-01-01T10:00 room1 temperature 20.5 20.5 20.5 +2024-01-01T10:05 room1 temperature 21 20.75 20.75 +2024-01-01T10:10 room1 temperature 21.5 21 21.25 +2024-01-01T10:15 room1 temperature 22 21.25 21.75 +2024-01-01T10:20 room1 temperature 22.5 21.5 22.25 +2024-01-01T10:00 room2 temperature 19.5 19.5 19.5 +2024-01-01T10:05 room2 temperature 20 19.75 19.75 +2024-01-01T10:10 room2 temperature 20.8 20.1 20.4 +2024-01-01T10:15 room2 temperature 21.2 20.375 21 +2024-01-01T10:20 room2 temperature 21.8 20.66 21.5 + +-- !trend_detection -- +2024-01-01T10:00 sensor1 20.5 20.5 Below MA +2024-01-01T10:05 sensor1 21 20.75 Above MA +2024-01-01T10:10 sensor1 21.5 21 Above MA +2024-01-01T10:15 sensor1 22 21.5 Above MA +2024-01-01T10:20 sensor1 22.5 22 Above MA +2024-01-01T10:00 sensor3 19.5 19.5 Below MA +2024-01-01T10:05 sensor3 20 19.75 Above MA +2024-01-01T10:10 sensor3 20.8 20.1 Above MA +2024-01-01T10:15 sensor3 21.2 20.66666666666667 Above MA +2024-01-01T10:20 sensor3 21.8 21.26666666666667 Above MA + +-- !multi_metric -- +2024-01-01T10:00 room1 20.5 45 20.5 45 +2024-01-01T10:05 room1 21 46 20.75 45.5 +2024-01-01T10:10 room1 21.5 47.5 21 46.16666666666666 +2024-01-01T10:15 room1 22 48 21.5 47.16666666666666 +2024-01-01T10:20 room1 22.5 49 22 48.16666666666666 +2024-01-01T10:00 room2 19.5 \N 19.5 \N +2024-01-01T10:05 room2 20 \N 19.75 \N +2024-01-01T10:10 room2 20.8 \N 20.1 \N +2024-01-01T10:15 room2 21.2 \N 20.66666666666667 \N +2024-01-01T10:20 room2 21.8 \N 21.26666666666667 \N + +-- !gap_analysis -- +A 2024-01-01T10:00 10 10 +A 2024-01-01T10:05 11 11 +A 2024-01-01T10:10 12 12 +A 2024-01-01T10:20 15 15 +A 2024-01-01T10:25 16 16 +B 2024-01-01T10:00 20 20 +B 2024-01-01T10:10 22 22 +B 2024-01-01T10:15 23 23 + +-- !window_percentile -- +room1 2024-01-01T10:00 20.5 20.5 +room1 2024-01-01T10:05 21 20.75 +room1 2024-01-01T10:10 21.5 21 +room1 2024-01-01T10:15 22 21.5 +room1 2024-01-01T10:20 22.5 22 +room2 2024-01-01T10:00 19.5 19.5 +room2 2024-01-01T10:05 20 19.75 +room2 2024-01-01T10:10 20.8 20 +room2 2024-01-01T10:15 21.2 20.8 +room2 2024-01-01T10:20 21.8 21.2 + +-- !cumulative_dist -- +sensor1 20.5 20.5 1 +sensor1 21 20.75 2 +sensor1 21.5 21 3 +sensor1 22 21.25 4 +sensor1 22.5 21.5 5 +sensor3 19.5 19.5 1 +sensor3 20 19.75 2 +sensor3 20.8 20.1 3 +sensor3 21.2 20.375 4 +sensor3 21.8 20.66 5 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_window_advanced_module.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_window_advanced_module.out new file mode 100644 index 00000000000000..dc35cd95697efd --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_window_advanced_module.out @@ -0,0 +1,157 @@ +-- This file is 
automatically generated. You should know what you did if you want to edit this +-- !select_data -- +2024-01-01T10:00 temperature 20.5 sensor1 room1 +2024-01-01T10:00 humidity 45 sensor2 room1 +2024-01-01T10:00 temperature 19.5 sensor3 room2 +2024-01-01T10:05 temperature 21 sensor1 room1 +2024-01-01T10:05 humidity 46 sensor2 room1 +2024-01-01T10:05 temperature 20 sensor3 room2 +2024-01-01T10:10 temperature 21.5 sensor1 room1 +2024-01-01T10:10 humidity 47.5 sensor2 room1 +2024-01-01T10:10 temperature 20.8 sensor3 room2 +2024-01-01T10:15 temperature 22 sensor1 room1 +2024-01-01T10:15 humidity 48 sensor2 room1 +2024-01-01T10:15 temperature 21.2 sensor3 room2 +2024-01-01T10:20 temperature 22.5 sensor1 room1 +2024-01-01T10:20 humidity 49 sensor2 room1 +2024-01-01T10:20 temperature 21.8 sensor3 room2 + +-- !moving_avg_3period -- +2024-01-01T10:00 sensor1 20.5 20.5 +2024-01-01T10:05 sensor1 21 20.75 +2024-01-01T10:10 sensor1 21.5 21 +2024-01-01T10:15 sensor1 22 21.5 +2024-01-01T10:20 sensor1 22.5 22 +2024-01-01T10:00 sensor3 19.5 19.5 +2024-01-01T10:05 sensor3 20 19.75 +2024-01-01T10:10 sensor3 20.8 20.1 +2024-01-01T10:15 sensor3 21.2 20.66666666666667 +2024-01-01T10:20 sensor3 21.8 21.26666666666667 + +-- !rolling_stddev -- +2024-01-01T10:00 sensor1 20.5 \N +2024-01-01T10:05 sensor1 21 0.25 +2024-01-01T10:10 sensor1 21.5 0.408248290463863 +2024-01-01T10:15 sensor1 22 0.5590169943749475 +2024-01-01T10:20 sensor1 22.5 0.5590169943749475 +2024-01-01T10:00 sensor3 19.5 \N +2024-01-01T10:05 sensor3 20 0.25 +2024-01-01T10:10 sensor3 20.8 0.535412613473634 +2024-01-01T10:15 sensor3 21.2 0.6647367900154164 +2024-01-01T10:20 sensor3 21.8 0.6538348415311012 + +-- !change_from_first -- +2024-01-01T10:00 sensor1 20.5 20.5 +2024-01-01T10:05 sensor1 21 20.75 +2024-01-01T10:10 sensor1 21.5 21 +2024-01-01T10:15 sensor1 22 21.5 +2024-01-01T10:20 sensor1 22.5 22 +2024-01-01T10:00 sensor3 19.5 19.5 +2024-01-01T10:05 sensor3 20 19.75 +2024-01-01T10:10 sensor3 20.8 20.1 +2024-01-01T10:15 sensor3 21.2 20.66666666666667 +2024-01-01T10:20 sensor3 21.8 21.26666666666667 + +-- !minmax_normalize -- +sensor1 2024-01-01T10:00 20.5 20.5 20.5 +sensor1 2024-01-01T10:05 21 20.5 21 +sensor1 2024-01-01T10:10 21.5 20.5 21.5 +sensor1 2024-01-01T10:15 22 20.5 22 +sensor1 2024-01-01T10:20 22.5 20.5 22.5 +sensor3 2024-01-01T10:00 19.5 19.5 19.5 +sensor3 2024-01-01T10:05 20 19.5 20 +sensor3 2024-01-01T10:10 20.8 19.5 20.8 +sensor3 2024-01-01T10:15 21.2 19.5 21.2 +sensor3 2024-01-01T10:20 21.8 19.5 21.8 + +-- !cumulative_weighted -- +2024-01-01T10:00 room1 humidity 45 45 45 +2024-01-01T10:05 room1 humidity 46 45.5 45.5 +2024-01-01T10:10 room1 humidity 47.5 46.16666666666666 46.75 +2024-01-01T10:15 room1 humidity 48 46.625 47.75 +2024-01-01T10:20 room1 humidity 49 47.1 48.5 +2024-01-01T10:00 room1 temperature 20.5 20.5 20.5 +2024-01-01T10:05 room1 temperature 21 20.75 20.75 +2024-01-01T10:10 room1 temperature 21.5 21 21.25 +2024-01-01T10:15 room1 temperature 22 21.25 21.75 +2024-01-01T10:20 room1 temperature 22.5 21.5 22.25 +2024-01-01T10:00 room2 temperature 19.5 19.5 19.5 +2024-01-01T10:05 room2 temperature 20 19.75 19.75 +2024-01-01T10:10 room2 temperature 20.8 20.1 20.4 +2024-01-01T10:15 room2 temperature 21.2 20.375 21 +2024-01-01T10:20 room2 temperature 21.8 20.66 21.5 + +-- !trend_detection -- +2024-01-01T10:00 sensor1 20.5 20.5 Below MA +2024-01-01T10:05 sensor1 21 20.75 Above MA +2024-01-01T10:10 sensor1 21.5 21 Above MA +2024-01-01T10:15 sensor1 22 21.5 Above MA +2024-01-01T10:20 sensor1 22.5 22 Above MA +2024-01-01T10:00 
sensor3 19.5 19.5 Below MA +2024-01-01T10:05 sensor3 20 19.75 Above MA +2024-01-01T10:10 sensor3 20.8 20.1 Above MA +2024-01-01T10:15 sensor3 21.2 20.66666666666667 Above MA +2024-01-01T10:20 sensor3 21.8 21.26666666666667 Above MA + +-- !multi_metric -- +2024-01-01T10:00 room1 20.5 45 20.5 45 +2024-01-01T10:05 room1 21 46 20.75 45.5 +2024-01-01T10:10 room1 21.5 47.5 21 46.16666666666666 +2024-01-01T10:15 room1 22 48 21.5 47.16666666666666 +2024-01-01T10:20 room1 22.5 49 22 48.16666666666666 +2024-01-01T10:00 room2 19.5 \N 19.5 \N +2024-01-01T10:05 room2 20 \N 19.75 \N +2024-01-01T10:10 room2 20.8 \N 20.1 \N +2024-01-01T10:15 room2 21.2 \N 20.66666666666667 \N +2024-01-01T10:20 room2 21.8 \N 21.26666666666667 \N + +-- !gap_analysis -- +A 2024-01-01T10:00 10 10 +A 2024-01-01T10:05 11 11 +A 2024-01-01T10:10 12 12 +A 2024-01-01T10:20 15 15 +A 2024-01-01T10:25 16 16 +B 2024-01-01T10:00 20 20 +B 2024-01-01T10:10 22 22 +B 2024-01-01T10:15 23 23 + +-- !window_percentile -- +room1 2024-01-01T10:00 20.5 20.5 +room1 2024-01-01T10:05 21 20.75 +room1 2024-01-01T10:10 21.5 21 +room1 2024-01-01T10:15 22 21.5 +room1 2024-01-01T10:20 22.5 22 +room2 2024-01-01T10:00 19.5 19.5 +room2 2024-01-01T10:05 20 19.75 +room2 2024-01-01T10:10 20.8 20 +room2 2024-01-01T10:15 21.2 20.8 +room2 2024-01-01T10:20 21.8 21.2 + +-- !cumulative_dist -- +sensor1 20.5 20.5 1 +sensor1 21 20.75 2 +sensor1 21.5 21 3 +sensor1 22 21.25 4 +sensor1 22.5 21.5 5 +sensor3 19.5 19.5 1 +sensor3 20 19.75 2 +sensor3 20.8 20.1 3 +sensor3 21.2 20.375 4 +sensor3 21.8 20.66 5 + +-- !module_reuse -- +sensor1 21.5 21.5 +sensor3 20.66 20.66 + +-- !combined_analytics -- +2024-01-01T10:00 sensor1 20.5 20.5 \N 20.5 20.5 +2024-01-01T10:05 sensor1 21 20.75 0.25 20.5 21 +2024-01-01T10:10 sensor1 21.5 21 0.408248290463863 20.5 21.5 +2024-01-01T10:15 sensor1 22 21.5 0.408248290463863 21 22 +2024-01-01T10:20 sensor1 22.5 22 0.408248290463863 21.5 22.5 +2024-01-01T10:00 sensor3 19.5 19.5 \N 19.5 19.5 +2024-01-01T10:05 sensor3 20 19.75 0.25 19.5 20 +2024-01-01T10:10 sensor3 20.8 20.1 0.535412613473634 19.5 20.8 +2024-01-01T10:15 sensor3 21.2 20.66666666666667 0.4988876515698587 20 21.2 +2024-01-01T10:20 sensor3 21.8 21.26666666666667 0.4109609335312652 20.8 21.8 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudaf_window_functions.out b/regression-test/data/pythonudaf_p0/test_pythonudaf_window_functions.out new file mode 100644 index 00000000000000..bee92641c2fc5e --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudaf_window_functions.out @@ -0,0 +1,258 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_data -- +1 2024-01-01 North Laptop 1200 2 Alice +2 2024-01-02 North Mouse 25 10 Alice +3 2024-01-03 North Keyboard 75 5 Bob +4 2024-01-04 South Laptop 1150 1 Charlie +5 2024-01-05 South Monitor 300 3 Charlie +6 2024-01-06 South Mouse 20 15 David +7 2024-01-07 East Laptop 1300 2 Eve +8 2024-01-08 East Keyboard 80 4 Eve +9 2024-01-09 East Monitor 320 2 Frank +10 2024-01-10 West Laptop 1250 3 Grace +11 2024-01-11 West Mouse 22 12 Grace +12 2024-01-12 West Keyboard 70 6 Henry +13 2024-01-13 North Monitor 310 2 Alice +14 2024-01-14 South Keyboard 78 3 Charlie +15 2024-01-15 East Mouse 24 8 Eve + +-- !window_partition_by -- +7 East 1300 1300 +8 East 80 1380 +9 East 320 1700 +15 East 24 1724 +1 North 1200 1200 +2 North 25 1225 +3 North 75 1300 +13 North 310 1610 +4 South 1150 1150 +5 South 300 1450 +6 South 20 1470 +14 South 78 1548 +10 West 1250 1250 +11 West 22 1272 +12 West 70 1342 + +-- !window_order_by -- +1 2024-01-01 1200 1200 1 +2 2024-01-02 25 1225 2 +3 2024-01-03 75 1300 3 +4 2024-01-04 1150 2450 4 +5 2024-01-05 300 2750 5 +6 2024-01-06 20 2770 6 +7 2024-01-07 1300 4070 7 +8 2024-01-08 80 4150 8 +9 2024-01-09 320 4470 9 +10 2024-01-10 1250 5720 10 +11 2024-01-11 22 5742 11 +12 2024-01-12 70 5812 12 +13 2024-01-13 310 6122 13 +14 2024-01-14 78 6200 14 +15 2024-01-15 24 6224 15 + +-- !window_multi_partition -- +East Laptop 1300 1300 3650 1300 +East Keyboard 80 1380 155 690 +East Monitor 320 1700 620 566.6666666666666 +East Mouse 24 1724 91 431 +North Laptop 1200 1200 1200 1200 +North Mouse 25 1225 25 612.5 +North Keyboard 75 1300 75 433.3333333333333 +North Monitor 310 1610 930 402.5 +South Laptop 1150 1150 2350 1150 +South Monitor 300 1450 300 725 +South Mouse 20 1470 45 490 +South Keyboard 78 1548 303 387 +West Laptop 1250 1250 4900 1250 +West Mouse 22 1272 67 636 +West Keyboard 70 1342 225 447.3333333333333 + +-- !window_rows_between -- +1 2024-01-01 1200 1200 +2 2024-01-02 25 1225 +3 2024-01-03 75 1300 +4 2024-01-04 1150 1250 +5 2024-01-05 300 1525 +6 2024-01-06 20 1470 +7 2024-01-07 1300 1620 +8 2024-01-08 80 1400 +9 2024-01-09 320 1700 +10 2024-01-10 1250 1650 +11 2024-01-11 22 1592 +12 2024-01-12 70 1342 +13 2024-01-13 310 402 +14 2024-01-14 78 458 +15 2024-01-15 24 412 + +-- !window_unbounded -- +East 2024-01-07 1300 1300 +East 2024-01-08 80 1380 +East 2024-01-09 320 1700 +East 2024-01-15 24 1724 +North 2024-01-01 1200 1200 +North 2024-01-02 25 1225 +North 2024-01-03 75 1300 +North 2024-01-13 310 1610 +South 2024-01-04 1150 1150 +South 2024-01-05 300 1450 +South 2024-01-06 20 1470 +South 2024-01-14 78 1548 +West 2024-01-10 1250 1250 +West 2024-01-11 22 1272 +West 2024-01-12 70 1342 + +-- !window_salesperson -- +Alice 2024-01-01 1200 1200 1 +Alice 2024-01-02 25 1225 2 +Alice 2024-01-13 310 1535 3 +Bob 2024-01-03 75 75 1 +Charlie 2024-01-04 1150 1150 1 +Charlie 2024-01-05 300 1450 2 +Charlie 2024-01-14 78 1528 3 +David 2024-01-06 20 20 1 +Eve 2024-01-07 1300 1300 1 +Eve 2024-01-08 80 1380 2 +Eve 2024-01-15 24 1404 3 +Frank 2024-01-09 320 320 1 +Grace 2024-01-10 1250 1250 1 +Grace 2024-01-11 22 1272 2 +Henry 2024-01-12 70 70 1 + +-- !window_vs_group_by -- +East 1300 1724 1724 +East 80 1724 1724 +East 320 1724 1724 +East 24 1724 1724 +North 1200 1610 1610 +North 25 1610 1610 +North 75 1610 1610 +North 310 1610 1610 +South 1150 1548 1548 +South 300 1548 1548 +South 20 1548 1548 +South 78 1548 1548 +West 1250 1342 1342 +West 22 1342 1342 +West 70 1342 1342 + +-- !window_multiple_specs -- +Keyboard 2024-01-03 75 75 
433.3333333333333 1 +Keyboard 2024-01-08 80 155 518.75 2 +Keyboard 2024-01-12 70 225 484.3333333333333 3 +Keyboard 2024-01-14 78 303 442.8571428571428 4 +Laptop 2024-01-01 1200 1200 1200 1 +Laptop 2024-01-04 1150 2350 612.5 2 +Laptop 2024-01-07 1300 3650 581.4285714285714 3 +Laptop 2024-01-10 1250 4900 572 4 +Monitor 2024-01-05 300 300 550 1 +Monitor 2024-01-09 320 620 496.6666666666667 2 +Monitor 2024-01-13 310 930 470.9230769230769 3 +Mouse 2024-01-02 25 25 612.5 1 +Mouse 2024-01-06 20 45 461.6666666666667 2 +Mouse 2024-01-11 22 67 522 3 +Mouse 2024-01-15 24 91 414.9333333333333 4 + +-- !window_complex_order -- +East Laptop 1300 2 1300 +East Monitor 320 2 1620 +East Keyboard 80 4 1700 +East Mouse 24 8 1724 +North Laptop 1200 2 1200 +North Monitor 310 2 1510 +North Keyboard 75 5 1585 +North Mouse 25 10 1610 +South Laptop 1150 1 1150 +South Monitor 300 3 1450 +South Keyboard 78 3 1528 +South Mouse 20 15 1548 +West Laptop 1250 3 1250 +West Keyboard 70 6 1320 +West Mouse 22 12 1342 + +-- !window_with_where -- +East Laptop 1300 1300 +East Keyboard 80 1380 +East Monitor 320 1700 +North Laptop 1200 1200 +North Keyboard 75 1275 +North Monitor 310 1585 +South Laptop 1150 1150 +South Monitor 300 1450 +South Keyboard 78 1528 +West Laptop 1250 1250 +West Keyboard 70 1320 + +-- !window_nested -- +East 2024-01-07 1300 1300 0 +East 2024-01-08 80 1380 1300 +East 2024-01-09 320 1700 1380 +East 2024-01-15 24 1724 1700 +North 2024-01-01 1200 1200 0 +North 2024-01-02 25 1225 1200 +North 2024-01-03 75 1300 1225 +North 2024-01-13 310 1610 1300 +South 2024-01-04 1150 1150 0 +South 2024-01-05 300 1450 1150 +South 2024-01-06 20 1470 1450 +South 2024-01-14 78 1548 1470 +West 2024-01-10 1250 1250 0 +West 2024-01-11 22 1272 1250 +West 2024-01-12 70 1342 1272 + +-- !window_range_frame -- +1 2024-01-01 1200 1225 +2 2024-01-02 25 1300 +3 2024-01-03 75 1250 +4 2024-01-04 1150 1525 +5 2024-01-05 300 1470 +6 2024-01-06 20 1620 +7 2024-01-07 1300 1400 +8 2024-01-08 80 1700 +9 2024-01-09 320 1650 +10 2024-01-10 1250 1592 +11 2024-01-11 22 1342 +12 2024-01-12 70 402 +13 2024-01-13 310 458 +14 2024-01-14 78 412 +15 2024-01-15 24 102 + +-- !window_sparse -- +A 100 100 1 +A 200 300 2 +B 300 300 1 +C 400 400 1 +C 500 900 2 +C 600 1500 3 + +-- !window_first_value -- +East 2024-01-07 1300 1300 +East 2024-01-08 80 1300 +East 2024-01-09 320 1300 +East 2024-01-15 24 1300 +North 2024-01-01 1200 1200 +North 2024-01-02 25 1200 +North 2024-01-03 75 1200 +North 2024-01-13 310 1200 +South 2024-01-04 1150 1150 +South 2024-01-05 300 1150 +South 2024-01-06 20 1150 +South 2024-01-14 78 1150 +West 2024-01-10 1250 1250 +West 2024-01-11 22 1250 +West 2024-01-12 70 1250 + +-- !window_with_nulls -- +A 10 10 1 +A \N 10 1 +A 30 40 2 +B \N 0 0 +B 50 50 1 +B \N 50 1 + +-- !window_large_dataset -- +0 20 498.75 +1 20 514.5 +2 20 530.25 +3 20 546 +4 20 561.75 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudf_udaf_mixed.out b/regression-test/data/pythonudaf_p0/test_pythonudf_udaf_mixed.out new file mode 100644 index 00000000000000..160f66557e5154 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudf_udaf_mixed.out @@ -0,0 +1,52 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !mixed_1 -- +1060 8 200 + +-- !mixed_2 -- +BOO 190 3 +CLO 270 2 +ELE 600 3 + +-- !mixed_3 -- +Books 380 3 +Clothing 540 2 +Electronics 1200 3 + +-- !mixed_4 -- +BOO 190 3 39.9 +CLO 270 2 69.90000000000001 +ELE 600 3 199.9 + +-- !mixed_5 -- +Books 190 36.67433333333333 3 +Clothing 270 57.4175 2 +Electronics 600 161.5833333333333 3 + +-- !mixed_6 -- +Books BOO 380 3 39.9 +Clothing CLO 540 2 69.90000000000001 +Electronics ELE 1200 3 199.9 + +-- !mixed_7 -- +Books 190 3 +Clothing 270 2 +Electronics 600 3 + +-- !mixed_8 -- +2120 8 + +-- !mixed_9 -- +Books 140 2 +Clothing 270 2 +Electronics 600 3 + +-- !mixed_10 -- +BOO 190 380 3 39.9 36.67433333333333 +CLO 270 540 2 69.90000000000001 57.4175 +ELE 600 1200 3 199.9 161.5833333333333 + +-- !mixed_verify -- +Books 190 190 3 3 +Clothing 270 270 2 2 +Electronics 600 600 3 3 + diff --git a/regression-test/data/pythonudaf_p0/test_pythonudwf_comprehensive.out b/regression-test/data/pythonudaf_p0/test_pythonudwf_comprehensive.out new file mode 100644 index 00000000000000..56bcca94c2c832 --- /dev/null +++ b/regression-test/data/pythonudaf_p0/test_pythonudwf_comprehensive.out @@ -0,0 +1,598 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !test_partition_by_running_sum -- +7 East 1300.00 1300 +8 East 80.00 1380 +9 East 180.00 1560 +15 East 24.00 1584 +19 East 1280.00 2864 +1 North 1200.50 1200.5 +2 North 25.99 1226.49 +3 North 350.00 1576.49 +13 North 310.00 1886.49 +17 North 380.00 2266.49 +4 South 1150.00 1150 +5 South 200.00 1350 +6 South 300.00 1650 +14 South 78.00 1728 +18 South 295.00 2023 +10 West 1250.00 1250 +11 West 22.50 1272.5 +12 West 450.00 1722.5 +16 West 195.00 1917.5 +20 West 85.00 2002.5 + +-- !test_partition_by_running_avg -- +1 Electronics 1200.50 1200.5 +2 Electronics 25.99 613.245 +4 Electronics 1150.00 792.1633333333333 +6 Electronics 300.00 669.1224999999999 +7 Electronics 1300.00 795.298 +8 Electronics 80.00 676.0816666666666 +10 Electronics 1250.00 758.0699999999999 +11 Electronics 22.50 666.12375 +13 Electronics 310.00 626.5544444444445 +14 Electronics 78.00 571.699 +15 Electronics 24.00 521.9081818181818 +18 Electronics 295.00 502.9991666666667 +19 Electronics 1280.00 562.7684615384616 +20 Electronics 85.00 528.6421428571429 +3 Furniture 350.00 350 +5 Furniture 200.00 275 +9 Furniture 180.00 243.3333333333333 +12 Furniture 450.00 295 +16 Furniture 195.00 275 +17 Furniture 380.00 292.5 + +-- !test_partition_by_running_count -- +1 Alice 2 1 +2 Alice 10 2 +13 Alice 2 3 +3 Bob 1 1 +17 Bob 1 2 +4 Charlie 1 1 +5 Charlie 4 2 +14 Charlie 3 3 +6 David 3 1 +18 David 2 2 +7 Eve 2 1 +8 Eve 5 2 +15 Eve 8 3 +9 Frank 2 1 +19 Frank 1 2 +10 Grace 3 1 +11 Grace 12 2 +16 Grace 5 3 +12 Henry 1 1 +20 Henry 6 2 + +-- !test_multi_partition_columns -- +7 East Electronics 1300.00 1300 +8 East Electronics 80.00 1380 +15 East Electronics 24.00 1404 +19 East Electronics 1280.00 2684 +9 East Furniture 180.00 180 +1 North Electronics 1200.50 1200.5 +2 North Electronics 25.99 1226.49 +13 North Electronics 310.00 1536.49 +3 North Furniture 350.00 350 +17 North Furniture 380.00 730 +4 South Electronics 1150.00 1150 +6 South Electronics 300.00 1450 +14 South Electronics 78.00 1528 +18 South Electronics 295.00 1823 +5 South Furniture 200.00 200 +10 West Electronics 1250.00 1250 +11 West Electronics 22.50 1272.5 +20 West Electronics 85.00 1357.5 +12 West Furniture 450.00 450 +16 West Furniture 195.00 645 + +-- !test_order_by_cumulative_sum -- +1 
2024-01-01 1200.50 1200.5 +2 2024-01-02 25.99 1226.49 +3 2024-01-03 350.00 1576.49 +4 2024-01-04 1150.00 2726.49 +5 2024-01-05 200.00 2926.49 +6 2024-01-06 300.00 3226.49 +7 2024-01-07 1300.00 4526.49 +8 2024-01-08 80.00 4606.49 +9 2024-01-09 180.00 4786.49 +10 2024-01-10 1250.00 6036.49 +11 2024-01-11 22.50 6058.99 +12 2024-01-12 450.00 6508.99 +13 2024-01-13 310.00 6818.99 +14 2024-01-14 78.00 6896.99 +15 2024-01-15 24.00 6920.99 +16 2024-01-16 195.00 7115.99 +17 2024-01-17 380.00 7495.99 +18 2024-01-18 295.00 7790.99 +19 2024-01-19 1280.00 9070.99 +20 2024-01-20 85.00 9155.99 + +-- !test_order_by_cumulative_avg -- +1 2024-01-01 1200.50 1200.5 +2 2024-01-02 25.99 613.245 +3 2024-01-03 350.00 525.4966666666667 +4 2024-01-04 1150.00 681.6224999999999 +5 2024-01-05 200.00 585.298 +6 2024-01-06 300.00 537.7483333333333 +7 2024-01-07 1300.00 646.6414285714285 +8 2024-01-08 80.00 575.81125 +9 2024-01-09 180.00 531.8322222222222 +10 2024-01-10 1250.00 603.649 +11 2024-01-11 22.50 550.8172727272727 +12 2024-01-12 450.00 542.4158333333334 +13 2024-01-13 310.00 524.5376923076923 +14 2024-01-14 78.00 492.6421428571429 +15 2024-01-15 24.00 461.3993333333333 +16 2024-01-16 195.00 444.749375 +17 2024-01-17 380.00 440.9405882352941 +18 2024-01-18 295.00 432.8327777777778 +19 2024-01-19 1280.00 477.4205263157895 +20 2024-01-20 85.00 457.7995 + +-- !test_order_by_desc -- +7 1300.00 1300 +19 1280.00 2580 +10 1250.00 3830 +1 1200.50 5030.5 +4 1150.00 6180.5 +12 450.00 6630.5 +17 380.00 7010.5 +3 350.00 7360.5 +13 310.00 7670.5 +6 300.00 7970.5 +18 295.00 8265.5 +5 200.00 8465.5 +16 195.00 8660.5 +9 180.00 8840.5 +20 85.00 8925.5 +8 80.00 9005.5 +14 78.00 9083.5 +2 25.99 9109.49 +15 24.00 9133.49 +11 22.50 9155.99 + +-- !test_rows_between_moving_avg -- +1 2024-01-01 1200.50 1200.5 +2 2024-01-02 25.99 613.245 +3 2024-01-03 350.00 525.4966666666667 +4 2024-01-04 1150.00 508.6633333333334 +5 2024-01-05 200.00 566.6666666666666 +6 2024-01-06 300.00 550 +7 2024-01-07 1300.00 600 +8 2024-01-08 80.00 560 +9 2024-01-09 180.00 520 +10 2024-01-10 1250.00 503.3333333333333 +11 2024-01-11 22.50 484.1666666666667 +12 2024-01-12 450.00 574.1666666666666 +13 2024-01-13 310.00 260.8333333333333 +14 2024-01-14 78.00 279.3333333333333 +15 2024-01-15 24.00 137.3333333333333 +16 2024-01-16 195.00 99 +17 2024-01-17 380.00 199.6666666666667 +18 2024-01-18 295.00 290 +19 2024-01-19 1280.00 651.6666666666666 +20 2024-01-20 85.00 553.3333333333334 + +-- !test_rows_between_with_partition -- +7 East 1300.00 690 +8 East 80.00 520 +9 East 180.00 94.66666666666667 +15 East 24.00 494.6666666666667 +19 East 1280.00 652 +1 North 1200.50 613.245 +2 North 25.99 525.4966666666667 +3 North 350.00 228.6633333333333 +13 North 310.00 346.6666666666667 +17 North 380.00 345 +4 South 1150.00 675 +5 South 200.00 550 +6 South 300.00 192.6666666666667 +14 South 78.00 224.3333333333333 +18 South 295.00 186.5 +10 West 1250.00 636.25 +11 West 22.50 574.1666666666666 +12 West 450.00 222.5 +16 West 195.00 243.3333333333333 +20 West 85.00 140 + +-- !test_rows_unbounded_preceding -- +1 Electronics 1200.50 1200.5 +2 Electronics 25.99 1226.49 +4 Electronics 1150.00 2376.49 +6 Electronics 300.00 2676.49 +7 Electronics 1300.00 3976.49 +8 Electronics 80.00 4056.49 +10 Electronics 1250.00 5306.49 +11 Electronics 22.50 5328.99 +13 Electronics 310.00 5638.99 +14 Electronics 78.00 5716.99 +15 Electronics 24.00 5740.99 +18 Electronics 295.00 6035.99 +19 Electronics 1280.00 7315.99 +20 Electronics 85.00 7400.99 +3 Furniture 350.00 350 +5 Furniture 200.00 550 +9 
Furniture 180.00 730 +12 Furniture 450.00 1180 +16 Furniture 195.00 1375 +17 Furniture 380.00 1755 + +-- !test_rows_unbounded_following -- +7 East 1300.00 2864 +8 East 80.00 1564 +9 East 180.00 1484 +15 East 24.00 1304 +19 East 1280.00 1280 +1 North 1200.50 2266.49 +2 North 25.99 1065.99 +3 North 350.00 1040 +13 North 310.00 690 +17 North 380.00 380 +4 South 1150.00 2023 +5 South 200.00 873 +6 South 300.00 673 +14 South 78.00 373 +18 South 295.00 295 +10 West 1250.00 2002.5 +11 West 22.50 752.5 +12 West 450.00 730 +16 West 195.00 280 +20 West 85.00 85 + +-- !test_multiple_udwf_same_partition -- +7 East 1300.00 1300 1300 1 +8 East 80.00 1380 690 2 +9 East 180.00 1560 520 3 +15 East 24.00 1584 396 4 +19 East 1280.00 2864 572.8 5 +1 North 1200.50 1200.5 1200.5 1 +2 North 25.99 1226.49 613.245 2 +3 North 350.00 1576.49 525.4966666666667 3 +13 North 310.00 1886.49 471.6225 4 +17 North 380.00 2266.49 453.2979999999999 5 +4 South 1150.00 1150 1150 1 +5 South 200.00 1350 675 2 +6 South 300.00 1650 550 3 +14 South 78.00 1728 432 4 +18 South 295.00 2023 404.6 5 +10 West 1250.00 1250 1250 1 +11 West 22.50 1272.5 636.25 2 +12 West 450.00 1722.5 574.1666666666666 3 +16 West 195.00 1917.5 479.375 4 +20 West 85.00 2002.5 400.5 5 + +-- !test_multiple_udwf_diff_partition -- +1 North Electronics 1200.50 1200.5 1200.5 1200.5 +2 North Electronics 25.99 1226.49 1226.49 1226.49 +3 North Furniture 350.00 1576.49 350 1576.49 +4 South Electronics 1150.00 1150 2376.49 2726.49 +5 South Furniture 200.00 1350 550 2926.49 +6 South Electronics 300.00 1650 2676.49 3226.49 +7 East Electronics 1300.00 1300 3976.49 4526.49 +8 East Electronics 80.00 1380 4056.49 4606.49 +9 East Furniture 180.00 1560 730 4786.49 +10 West Electronics 1250.00 1250 5306.49 6036.49 +11 West Electronics 22.50 1272.5 5328.99 6058.99 +12 West Furniture 450.00 1722.5 1180 6508.99 +13 North Electronics 310.00 1886.49 5638.99 6818.99 +14 South Electronics 78.00 1728 5716.99 6896.99 +15 East Electronics 24.00 1584 5740.99 6920.99 +16 West Furniture 195.00 1917.5 1375 7115.99 +17 North Furniture 380.00 2266.49 1755 7495.99 +18 South Electronics 295.00 2023 6035.99 7790.99 +19 East Electronics 1280.00 2864 7315.99 9070.99 +20 West Electronics 85.00 2002.5 7400.99 9155.99 + +-- !test_mix_udwf_builtin -- +7 East 1300.00 1300 1300.00 1300 1300.0000 +8 East 80.00 1380 1380.00 690 690.0000 +9 East 180.00 1560 1560.00 520 520.0000 +15 East 24.00 1584 1584.00 396 396.0000 +19 East 1280.00 2864 2864.00 572.8 572.8000 +1 North 1200.50 1200.5 1200.50 1200.5 1200.5000 +2 North 25.99 1226.49 1226.49 613.245 613.2450 +3 North 350.00 1576.49 1576.49 525.4966666666667 525.4966 +13 North 310.00 1886.49 1886.49 471.6225 471.6225 +17 North 380.00 2266.49 2266.49 453.2979999999999 453.2980 +4 South 1150.00 1150 1150.00 1150 1150.0000 +5 South 200.00 1350 1350.00 675 675.0000 +6 South 300.00 1650 1650.00 550 550.0000 +14 South 78.00 1728 1728.00 432 432.0000 +18 South 295.00 2023 2023.00 404.6 404.6000 +10 West 1250.00 1250 1250.00 1250 1250.0000 +11 West 22.50 1272.5 1272.50 636.25 636.2500 +12 West 450.00 1722.5 1722.50 574.1666666666666 574.1666 +16 West 195.00 1917.5 1917.50 479.375 479.3750 +20 West 85.00 2002.5 2002.50 400.5 400.5000 + +-- !test_stddev_by_partition -- +AAPL 2024-01-01T09:30 150.25 \N +AAPL 2024-01-01T09:35 151.5 0.625 +AAPL 2024-01-01T09:40 150.75 0.5137011669140814 +AAPL 2024-01-01T09:45 152 0.5137011669140814 +AAPL 2024-01-01T09:50 151.25 0.5137011669140814 +GOOGL 2024-01-01T09:30 2800 \N +GOOGL 2024-01-01T09:35 2815.5 7.75 +GOOGL 2024-01-01T09:40 
2810 6.416125518306719 +GOOGL 2024-01-01T09:45 2825 6.195876765147036 +GOOGL 2024-01-01T09:50 2820.5 6.284902544988268 +MSFT 2024-01-01T09:30 380 \N +MSFT 2024-01-01T09:35 382.5 1.25 +MSFT 2024-01-01T09:40 381 1.027402333828163 +MSFT 2024-01-01T09:45 383.75 1.124228130269337 +MSFT 2024-01-01T09:50 382.25 1.124228130269337 + +-- !test_min_max_moving_window -- +AAPL 2024-01-01T09:30 150.25 150.25 150.25 +AAPL 2024-01-01T09:35 151.5 150.25 151.5 +AAPL 2024-01-01T09:40 150.75 150.25 151.5 +AAPL 2024-01-01T09:45 152 150.75 152 +AAPL 2024-01-01T09:50 151.25 150.75 152 +GOOGL 2024-01-01T09:30 2800 2800 2800 +GOOGL 2024-01-01T09:35 2815.5 2800 2815.5 +GOOGL 2024-01-01T09:40 2810 2800 2815.5 +GOOGL 2024-01-01T09:45 2825 2810 2825 +GOOGL 2024-01-01T09:50 2820.5 2810 2825 +MSFT 2024-01-01T09:30 380 380 380 +MSFT 2024-01-01T09:35 382.5 380 382.5 +MSFT 2024-01-01T09:40 381 380 382.5 +MSFT 2024-01-01T09:45 383.75 381 383.75 +MSFT 2024-01-01T09:50 382.25 381 383.75 + +-- !test_first_last_value -- +AAPL 2024-01-01T09:30 150.25 150.25 151.25 +AAPL 2024-01-01T09:35 151.5 150.25 151.25 +AAPL 2024-01-01T09:40 150.75 150.25 151.25 +AAPL 2024-01-01T09:45 152 150.25 151.25 +AAPL 2024-01-01T09:50 151.25 150.25 151.25 +GOOGL 2024-01-01T09:30 2800 2800 2820.5 +GOOGL 2024-01-01T09:35 2815.5 2800 2820.5 +GOOGL 2024-01-01T09:40 2810 2800 2820.5 +GOOGL 2024-01-01T09:45 2825 2800 2820.5 +GOOGL 2024-01-01T09:50 2820.5 2800 2820.5 +MSFT 2024-01-01T09:30 380 380 382.25 +MSFT 2024-01-01T09:35 382.5 380 382.25 +MSFT 2024-01-01T09:40 381 380 382.25 +MSFT 2024-01-01T09:45 383.75 380 382.25 +MSFT 2024-01-01T09:50 382.25 380 382.25 + +-- !test_complex_growth_analysis -- +East Electronics 2684.00 2684 +East Furniture 180.00 2864 +North Electronics 1536.49 1536.49 +North Furniture 730.00 2266.49 +South Electronics 1823.00 1823 +South Furniture 200.00 2023 +West Electronics 1357.50 1357.5 +West Furniture 645.00 2002.5 + +-- !test_topn_analysis -- +Bob ClassA English 90 1 +Alice ClassA English 88 2 +Charlie ClassA English 85 3 +Alice ClassA Math 95 1 +Charlie ClassA Math 92 2 +Bob ClassA Math 87 3 +Alice ClassA Physics 92 1 +Charlie ClassA Physics 88 2 +Bob ClassA Physics 85 3 +Eve ClassB English 93 1 +Frank ClassB English 87 2 +David ClassB English 82 3 +Eve ClassB Math 90 1 +Frank ClassB Math 85 2 +David ClassB Math 78 3 +Frank ClassB Physics 91 1 +Eve ClassB Physics 89 2 +David ClassB Physics 80 3 + +-- !test_percentile_analysis -- +ClassA English 85 1 +ClassA English 88 2 +ClassA English 90 3 +ClassA Math 87 1 +ClassA Math 92 2 +ClassA Math 95 3 +ClassA Physics 85 1 +ClassA Physics 88 2 +ClassA Physics 92 3 +ClassB English 82 1 +ClassB English 87 2 +ClassB English 93 3 +ClassB Math 78 1 +ClassB Math 85 2 +ClassB Math 90 3 +ClassB Physics 80 1 +ClassB Physics 89 2 +ClassB Physics 91 3 + +-- !test_empty_partition -- +3 North 350.00 350 +17 North 380.00 730 +1 North 1200.50 1200.5 +13 North 310.00 310 +2 North 25.99 25.99 + +-- !test_single_row_partition -- +1 A 100 100 +2 B 200 200 +3 C 300 300 + +-- !test_null_values -- +1 A 100 100 1 100 +2 A \N 100 1 100 +3 A 200 300 2 150 +4 A \N 300 2 150 +5 A 300 600 3 200 +6 B \N 0 0 \N +7 B 150 150 1 150 +8 B \N 150 1 150 + +-- !test_all_nulls_partition -- +1 A \N 0 \N +2 A \N 0 \N +3 B 100 100 100 + +-- !test_large_partition -- +1 Alice 1200.50 1200.5 +2 Alice 25.99 1226.49 +13 Alice 310.00 1536.49 +3 Bob 350.00 350 +17 Bob 380.00 730 +4 Charlie 1150.00 1150 +5 Charlie 200.00 1350 +14 Charlie 78.00 1428 +6 David 300.00 300 +18 David 295.00 595 +7 Eve 1300.00 1300 +8 Eve 80.00 1380 +15 Eve 
24.00 1404 +9 Frank 180.00 180 +19 Frank 1280.00 1460 +10 Grace 1250.00 1250 +11 Grace 22.50 1272.5 +16 Grace 195.00 1467.5 +12 Henry 450.00 450 +20 Henry 85.00 535 + +-- !test_multiple_complex_windows -- +1 North Electronics Laptop 1200.50 1200.5 1200.5 1200.5 1200.5 1 +2 North Electronics Mouse 25.99 1226.49 613.245 25.99 1200.5 1 +3 North Furniture Desk 350.00 1576.49 525.4966666666667 350 350 1 +4 South Electronics Laptop 1150.00 1150 1150 25.99 1200.5 2 +5 South Furniture Chair 200.00 1350 675 200 350 1 +6 South Electronics Monitor 300.00 1650 550 25.99 1200.5 1 +7 East Electronics Laptop 1300.00 1300 1300 25.99 1300 3 +8 East Electronics Keyboard 80.00 1380 690 25.99 1300 1 +9 East Furniture Bookshelf 180.00 1560 520 180 350 1 +10 West Electronics Laptop 1250.00 1250 1250 25.99 1300 4 +11 West Electronics Mouse 22.50 1272.5 636.25 22.5 1300 2 +12 West Furniture Table 450.00 1722.5 574.1666666666666 180 450 1 +13 North Electronics Monitor 310.00 1886.49 471.6225 22.5 1300 2 +14 South Electronics Keyboard 78.00 1728 432 22.5 1300 2 +15 East Electronics Mouse 24.00 1584 396 22.5 1300 3 +16 West Furniture Chair 195.00 1917.5 479.375 180 450 2 +17 North Furniture Desk 380.00 2266.49 453.2979999999999 180 450 2 +18 South Electronics Monitor 295.00 2023 404.6 22.5 1300 3 +19 East Electronics Laptop 1280.00 2864 572.8 22.5 1300 5 +20 West Electronics Keyboard 85.00 2002.5 400.5 22.5 1300 3 + +-- !test_int_type -- +7 East 2 1 +8 East 5 2 +9 East 2 3 +15 East 8 4 +19 East 1 5 +1 North 2 1 +2 North 10 2 +3 North 1 3 +13 North 2 4 +17 North 1 5 +4 South 1 1 +5 South 4 2 +6 South 3 3 +14 South 3 4 +18 South 2 5 +10 West 3 1 +11 West 12 2 +12 West 1 3 +16 West 5 4 +20 West 6 5 + +-- !test_decimal_type -- +1 Electronics 1200.50 1200.5 +2 Electronics 25.99 1226.49 +4 Electronics 1150.00 2376.49 +6 Electronics 300.00 2676.49 +7 Electronics 1300.00 3976.49 +8 Electronics 80.00 4056.49 +10 Electronics 1250.00 5306.49 +11 Electronics 22.50 5328.99 +13 Electronics 310.00 5638.99 +14 Electronics 78.00 5716.99 +15 Electronics 24.00 5740.99 +18 Electronics 295.00 6035.99 +19 Electronics 1280.00 7315.99 +20 Electronics 85.00 7400.99 +3 Furniture 350.00 350 +5 Furniture 200.00 550 +9 Furniture 180.00 730 +12 Furniture 450.00 1180 +16 Furniture 195.00 1375 +17 Furniture 380.00 1755 + +-- !test_double_type -- +1 AAPL 150.25 150.25 +2 AAPL 151.5 150.875 +3 AAPL 150.75 150.8333333333333 +4 AAPL 152 151.125 +5 AAPL 151.25 151.15 +6 GOOGL 2800 2800 +7 GOOGL 2815.5 2807.75 +8 GOOGL 2810 2808.5 +9 GOOGL 2825 2812.625 +10 GOOGL 2820.5 2814.2 +11 MSFT 380 380 +12 MSFT 382.5 381.25 +13 MSFT 381 381.1666666666667 +14 MSFT 383.75 381.8125 +15 MSFT 382.25 381.9 + +-- !test_window_in_subquery -- +East 1737.6 +North 1631.292 +South 1580.2 +West 1633 + +-- !test_window_with_cte -- +East 2864 +North 2266.49 +South 2023 +West 2002.5 + +-- !test_nested_windows -- +East 7 1300.00 1300 1300 +East 8 80.00 1380 1340 +East 9 180.00 1560 1413.333333333333 +East 15 24.00 1584 1456 +East 19 1280.00 2864 1737.6 +North 1 1200.50 1200.5 1200.5 +North 2 25.99 1226.49 1213.495 +North 3 350.00 1576.49 1334.493333333333 +North 13 310.00 1886.49 1472.4925 +North 17 380.00 2266.49 1631.292 +South 4 1150.00 1150 1150 +South 5 200.00 1350 1250 +South 6 300.00 1650 1383.333333333333 +South 14 78.00 1728 1469.5 +South 18 295.00 2023 1580.2 +West 10 1250.00 1250 1250 +West 11 22.50 1272.5 1261.25 +West 12 450.00 1722.5 1415 +West 16 195.00 1917.5 1540.625 +West 20 85.00 2002.5 1633 + +-- !test_window_after_join -- +6 David Bronze 300.00 300 +12 Henry 
Bronze 450.00 750 +18 David Bronze 295.00 1045 +20 Henry Bronze 85.00 1130 +1 Alice Gold 1200.50 1200.5 +2 Alice Gold 25.99 1226.49 +4 Charlie Gold 1150.00 2376.49 +5 Charlie Gold 200.00 2576.49 +7 Eve Gold 1300.00 3876.49 +8 Eve Gold 80.00 3956.49 +10 Grace Gold 1250.00 5206.49 +11 Grace Gold 22.50 5228.99 +13 Alice Gold 310.00 5538.99 +14 Charlie Gold 78.00 5616.99 +15 Eve Gold 24.00 5640.99 +16 Grace Gold 195.00 5835.99 +3 Bob Silver 350.00 350 +9 Frank Silver 180.00 530 +17 Bob Silver 380.00 910 +19 Frank Silver 1280.00 2190 + diff --git a/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertequal.out b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertequal.out new file mode 100644 index 00000000000000..3376296a3cafd4 --- /dev/null +++ b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertequal.out @@ -0,0 +1,4 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +23.34 == 23.34 + diff --git a/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertlessthan.out b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertlessthan.out new file mode 100644 index 00000000000000..41cb52194e89bd --- /dev/null +++ b/regression-test/data/pythonudf_p0/sanity/test_pythonudf_assertlessthan.out @@ -0,0 +1,5 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +0.123 < 0.124 +23.34 < 23.35 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_aggregate.out b/regression-test/data/pythonudf_p0/test_pythonudf_aggregate.out new file mode 100644 index 00000000000000..ac418896a195c6 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_aggregate.out @@ -0,0 +1,38 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_grades -- +1 Alice English 88 B +1 Alice Math 95 A +1 Alice Science 92 A +2 Bob English 85 B +2 Bob Math 78 C +2 Bob Science 80 B +3 Charlie English 70 C +3 Charlie Math 65 D +3 Charlie Science 68 D +4 David English 60 D +4 David Math 55 F +4 David Science 58 F + +-- !select_group_by_grade -- +A 2 93.5 +B 3 84.33333333333333 +C 2 74 +D 3 64.33333333333333 +F 2 56.5 + +-- !select_aggregate_with_udf -- +1 Alice 91.66666666666667 A +2 Bob 81 B +3 Charlie 67.66666666666667 D +4 David 57.66666666666666 F + +-- !select_age_group_aggregate -- +Adult 2 85000 90000 80000 +Minor 1 0 0 0 +Senior 2 105000 110000 100000 +Young Adult 3 51666.66666666666 60000 45000 + +-- !select_having_with_udf -- +1 Alice 91.66666666666667 +2 Bob 81 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_always_nullable.out b/regression-test/data/pythonudf_p0/test_pythonudf_always_nullable.out new file mode 100644 index 00000000000000..055bbad76d6c76 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_always_nullable.out @@ -0,0 +1,42 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_nullable_true_normal -- +20 + +-- !select_nullable_true_null -- +\N + +-- !select_nullable_true_negative -- +\N + +-- !select_nullable_false_normal -- +20 + +-- !select_nullable_false_null -- +0 + +-- !select_nullable_false_returns_none_normal -- +20 + +-- !select_table_nullable_true -- +1 10 20 +2 \N \N +3 -5 \N +4 0 0 +5 100 200 + +-- !select_table_nullable_false -- +1 10 20 +2 \N 0 +3 -5 -10 +4 0 0 +5 100 200 + +-- !select_string_nullable -- +HELLO + +-- !select_string_nullable_null -- +\N + +-- !select_string_nullable_empty -- +\N + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_array.out b/regression-test/data/pythonudf_p0/test_pythonudf_array.out new file mode 100644 index 00000000000000..eda9275d0bd7c7 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_array.out @@ -0,0 +1,109 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +1 2 a1b +2 4 a2b +3 6 a3b +4 8 a4b +5 10 a5b +6 12 a6b +7 14 a7b +8 16 a8b +9 18 a9b +10 20 a10b + +-- !select_1 -- +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 + +-- !select_2 -- +\N + +-- !select_3 -- +[1] 2 +[2] 4 +[3] 6 +[4] 8 +[5] 10 +[6] 12 +[7] 14 +[8] 16 +[9] 18 +[10] 20 + +-- !select_4 -- +[2] 2 +[4] 4 +[6] 6 +[8] 8 +[10] 10 +[12] 12 +[14] 14 +[16] 16 +[18] 18 +[20] 20 + +-- !select_5 -- +\N + +-- !select_6 -- +["a1b"] 2 +["a2b"] 4 +["a3b"] 6 +["a4b"] 8 +["a5b"] 10 +["a6b"] 12 +["a7b"] 14 +["a8b"] 16 +["a9b"] 18 +["a10b"] 20 + +-- !select_7 -- +["a1b1"] 2 +["a2b2"] 4 +["a3b3"] 6 +["a4b4"] 8 +["a5b5"] 10 +["a6b6"] 12 +["a7b7"] 14 +["a8b8"] 16 +["a9b9"] 18 +["a10b10"] 20 + +-- !select_8 -- +\N + +-- !select_9 -- +a1b 2 +a2b 4 +a3b 6 +a4b 8 +a5b 10 +a6b 12 +a7b 14 +a8b 16 +a9b 18 +a10b 20 + +-- !select_10 -- +a1b1 2 +a2b2 4 +a3b3 6 +a4b4 8 +a5b5 10 +a6b6 12 +a7b7 14 +a8b8 16 +a9b9 18 +a10b10 20 + +-- !select_11 -- +\N + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_base_data_type.out b/regression-test/data/pythonudf_p0/test_pythonudf_base_data_type.out new file mode 100644 index 00000000000000..781ea3168073e9 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_base_data_type.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_1 -- +True,127,32767,2147483647,9223372036854775807,170141183460469231731687303715884105727,1.2300000190734863,4.56789,123456.780000000,12345678901.230000000,123456789012345678901.234567890,2023-01-01,2023-01-01 20:34:56+08:00,char_data_1,varchar_data_1,string_data_1 +False,-128,-32768,-2147483648,-9223372036854775808,-170141183460469231731687303715884105728,-2.3399999141693115,-5.6789,-987654.320000000,-98765432.110000000,-987654321098765432.109876540,2024-05-15,2024-05-15 16:22:10+08:00,char_data_2,varchar_data_2,string_data_2 +True,0,0,0,0,0,0.0,0.0,0E-9,0E-9,0E-9,2025-10-15,2025-10-15 08:00:00+08:00,char_zero,varchar_zero,string_zero +False,100,20000,300000000,4000000000000000000,99999999999999999999999999999999999999,3.140000104904175,2.71828,999999.990000000,99999999999999.990000000,100000000000000000000000.000000000,2022-12-31,2023-01-01 07:59:59+08:00,char_max,varchar_max,string_max +True,-50,-10000,-100000000,-5000000000000000000,-99999999999999999999999999999999999999,-1.409999966621399,-0.57721,-0.010000000,-0.010000000,0E-9,2021-07-04,2021-07-04 22:30:00+08:00,char_neg,varchar_neg,string_neg + +-- !select_2 -- +True,127,32767,2147483647,9223372036854775807,170141183460469231731687303715884105727,1.2300000190734863,4.56789,123456.780000000,12345678901.230000000,123456789012345678901.234567890,2023-01-01,2023-01-01 20:34:56+08:00,char_data_1,varchar_data_1,string_data_1 +False,-128,-32768,-2147483648,-9223372036854775808,-170141183460469231731687303715884105728,-2.3399999141693115,-5.6789,-987654.320000000,-98765432.110000000,-987654321098765432.109876540,2024-05-15,2024-05-15 16:22:10+08:00,char_data_2,varchar_data_2,string_data_2 +True,0,0,0,0,0,0.0,0.0,0E-9,0E-9,0E-9,2025-10-15,2025-10-15 08:00:00+08:00,char_zero,varchar_zero,string_zero +False,100,20000,300000000,4000000000000000000,99999999999999999999999999999999999999,3.140000104904175,2.71828,999999.990000000,99999999999999.990000000,100000000000000000000000.000000000,2022-12-31,2023-01-01 07:59:59+08:00,char_max,varchar_max,string_max +True,-50,-10000,-100000000,-5000000000000000000,-99999999999999999999999999999999999999,-1.409999966621399,-0.57721,-0.010000000,-0.010000000,0E-9,2021-07-04,2021-07-04 22:30:00+08:00,char_neg,varchar_neg,string_neg + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_boolean.out b/regression-test/data/pythonudf_p0/test_pythonudf_boolean.out new file mode 100644 index 00000000000000..e43b50260cf09c --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_boolean.out @@ -0,0 +1,28 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +111 true +112 false +113 false +114 true + +-- !select -- +false + +-- !select -- +true + +-- !select -- +false + +-- !select -- +true + +-- !select -- +true + +-- !select -- +111 false +112 true +113 true +114 false + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_complex_data_type.out b/regression-test/data/pythonudf_p0/test_pythonudf_complex_data_type.out new file mode 100644 index 00000000000000..3f3d82127edbe2 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_complex_data_type.out @@ -0,0 +1,37 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_1 -- +[1,2,3]|[a,b,c]|[[1,2],[3,4]] +[]|[]|[] +NULL|[x,NULL,z]|NULL +[0,-1,2147483647]|[hello,world]|[[],[1]] + +-- !select_2 -- +[1,2,3]|[a,b,c]|[[1,2],[3,4]] +[]|[]|[] +NULL|[x,NULL,z]|NULL +[0,-1,2147483647]|[hello,world]|[[],[1]] + +-- !select_3 -- +{1:one,2:two}|{e:2.718,pi:3.14} +{}|{} +NULL|{null_key:NULL} +{-1:minus_one,0:zero}|{max:1.79769e+308} + +-- !select_4 -- +{1:one,2:two}|{e:2.718,pi:3.14} +{}|{} +NULL|{null_key:NULL} +{-1:minus_one,0:zero}|{max:1.79769e+308} + +-- !select_5 -- +(Alice,30,75000.50)|(1.5,2.5,[red,blue]) +(NULL,NULL,NULL)|(0.0,0.0,[]) +(Bob,25,60000.00)|(NULL,3.14,[tag1,NULL,tag3]) +(,0,0.00)|(-1.0,-2.0,NULL) + +-- !select_6 -- +(Alice,30,75000.50)|(1.5,2.5,[red,blue]) +(NULL,NULL,NULL)|(0.0,0.0,[]) +(Bob,25,60000.00)|(NULL,3.14,[tag1,NULL,tag3]) +(,0,0.00)|(-1.0,-2.0,NULL) + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_concurrent.out b/regression-test/data/pythonudf_p0/test_pythonudf_concurrent.out new file mode 100644 index 00000000000000..77fbb3d6d0ee30 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_concurrent.out @@ -0,0 +1,55 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !concurrent_scalar_1 -- +1 30 200 0.6 hello_world 5 +2 70 1200 0.7777777777777778 foo_bar 3 +3 110 3000 0.8461538461538461 test_case 4 +4 150 5600 0.8823529411764706 python_udf 6 +5 190 9000 0.9047619047619048 doris_db 5 + +-- !concurrent_scalar_2 -- +1 40 11 +2 100 7 +3 160 9 +4 220 10 +5 280 8 + +-- !concurrent_scalar_3 -- +3 50 60 +4 70 80 +5 90 100 + +-- !concurrent_vector_1 -- +1 30 10 20 +2 70 10 40 +3 110 10 60 +4 150 10 80 +5 190 10 100 + +-- !concurrent_mixed_1 -- +1 30 30 hello_world 100 +2 70 70 foo_bar 300 +3 110 110 test_case 500 +4 150 150 python_udf 700 +5 190 190 doris_db 900 + +-- !concurrent_with_agg -- +5 550 100 6 + +-- !concurrent_group_by -- +group1 3 210 0.7413105413105413 +group2 2 340 0.8935574229691876 + +-- !concurrent_type_mix -- +1 30 0.6 10_hello +2 70 0.7777777777777778 30_foo +3 110 0.8461538461538461 50_test +4 150 0.8823529411764706 70_python +5 190 0.9047619047619048 90_doris + +-- !concurrent_stress -- +1 30 200 110 100 0.6 1.666666666666667 hello_world 5 5 10 +2 70 1200 130 200 0.7777777777777778 1.285714285714286 foo_bar 3 3 30 +3 110 3000 150 300 0.8461538461538461 1.181818181818182 test_case 4 4 50 +4 150 5600 170 400 0.8823529411764706 1.133333333333333 python_udf 6 3 70 +5 190 9000 190 500 0.9047619047619048 1.105263157894737 doris_db 5 2 90 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_data_types.out b/regression-test/data/pythonudf_p0/test_pythonudf_data_types.out new file mode 100644 index 00000000000000..a79b4993f57bac --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_data_types.out @@ -0,0 +1,24 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_tinyint -- +11 + +-- !select_smallint -- +2000 + +-- !select_bigint -- +1000001000000 + +-- !select_decimal -- +\N + +-- !select_date -- +2024-01-15 + +-- !select_datetime -- +2024-01-15 18:30:45+08:00 + +-- !select_table_types -- +1 11 200 1010000 +2 21 400 1020000 +3 \N \N \N + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_error_handling.out b/regression-test/data/pythonudf_p0/test_pythonudf_error_handling.out new file mode 100644 index 00000000000000..2dc0c07317f316 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_error_handling.out @@ -0,0 +1,50 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_divide_normal -- +5 + +-- !select_divide_zero -- +\N + +-- !select_divide_null -- +\N + +-- !select_substring_valid -- +e + +-- !select_substring_invalid -- +\N + +-- !select_substring_negative -- +\N + +-- !select_parse_valid -- +123 + +-- !select_parse_invalid -- +\N + +-- !select_parse_empty -- +\N + +-- !select_array_valid -- +20 + +-- !select_array_invalid -- +\N + +-- !select_table_error_handling -- +1 100 10 10 123 123 +2 50 0 \N abc \N +3 \N 5 \N \N +4 75 \N \N 456 456 +5 25 5 5 xyz \N + +-- !select_length_normal -- +5 + +-- !select_length_empty -- +0 + +-- !select_length_null -- +\N + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_file_protocol.out b/regression-test/data/pythonudf_p0/test_pythonudf_file_protocol.out new file mode 100644 index 00000000000000..8d1d3a594e5e2b --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_file_protocol.out @@ -0,0 +1,22 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_file_int -- +100 + +-- !select_file_string -- +123****890 + +-- !select_file_float -- +\N + +-- !select_file_bool_true -- +false + +-- !select_file_bool_false -- +true + +-- !select_table_file -- +1 10 11 hello h***o +2 20 21 world w***d +3 30 31 python p****n +4 40 41 doris d***s + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_float.out b/regression-test/data/pythonudf_p0/test_pythonudf_float.out new file mode 100644 index 00000000000000..6baa5258792d33 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_float.out @@ -0,0 +1,45 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +111 11111.11 222222.3 12345678.34455677 1111112 +112 1234556.0 222222.3 222222222.3333333 4444444444444.556 +113 8.765432E7 \N 6666666666.666667 \N + +-- !select -- +-108.2747 + +-- !select -- +-108.2747 + +-- !select -- +\N + +-- !select -- +\N + +-- !select -- +111 -211111.2 +112 1012334.0 +113 \N + +-- !select -- +111 -211111.2 +112 1012334.0 +113 \N + +-- !select -- +113.9475611 + +-- !select -- +113.9475611 + +-- !select -- +\N + +-- !select -- +\N + +-- !select -- +111 24691356.68911354 +112 444444444.6666667 +113 13333333333.33333 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_global_function.out b/regression-test/data/pythonudf_p0/test_pythonudf_global_function.out new file mode 100644 index 00000000000000..ad3d84acd73b87 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_global_function.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_global_multiply -- +56 + +-- !select_global_lower -- +hello world + +-- !select_local_add -- +40 + +-- !select_table_global -- +1 5 6 30 APPLE apple +2 10 20 200 BANANA banana +3 3 7 21 CHERRY cherry +4 \N 5 \N DATE date +5 8 9 72 \N \N + +-- !select_global_power -- +8 + +-- !select_global_power_decimal -- +2.23606797749979 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_inline_complex.out b/regression-test/data/pythonudf_p0/test_pythonudf_inline_complex.out new file mode 100644 index 00000000000000..22015afda6b2f4 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_inline_complex.out @@ -0,0 +1,19 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_array_sum -- +15 + +-- !select_reverse -- +olleH + +-- !select_weighted_avg -- +84 + +-- !select_format_name -- +DOE, John + +-- !select_in_range_true -- +true + +-- !select_in_range_false -- +false + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_inline_scalar.out b/regression-test/data/pythonudf_p0/test_pythonudf_inline_scalar.out new file mode 100644 index 00000000000000..9632ff9817fbfb --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_inline_scalar.out @@ -0,0 +1,28 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_add -- +30 + +-- !select_add_null -- +\N + +-- !select_concat -- +Hello World + +-- !select_concat_null -- +\N + +-- !select_square -- +25 + +-- !select_square_negative -- +9 + +-- !select_positive -- +true + +-- !select_negative -- +false + +-- !select_zero -- +false + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_inline_vector.out b/regression-test/data/pythonudf_p0/test_pythonudf_inline_vector.out new file mode 100644 index 00000000000000..de95543b1effb5 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_inline_vector.out @@ -0,0 +1,85 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !vec_add_int -- +1 10 20 31 +2 30 40 71 +3 \N 50 \N +4 60 \N \N +5 70 80 151 + +-- !vec_multiply_double -- +1 1.5 2.5 3.75 +2 3.5 4.5 15.75 +3 5.5 \N \N +4 \N 6.5 \N +5 7.5 8.5 63.75 + +-- !vec_concat_string -- +1 hello world hello_world +2 foo bar foo_bar +3 \N test \N +4 data \N \N +5 python udf python_udf + +-- !vec_max_int -- +1 10 20 20 +2 30 40 40 +3 \N 50 \N +4 60 \N \N +5 70 80 80 + +-- !vec_sqrt_double -- +1 1.5 1.224744871391589 +2 3.5 1.870828693386971 +3 5.5 2.345207879911715 +4 \N \N +5 7.5 2.738612787525831 + +-- !vec_upper_string -- +1 hello HELLO +2 foo FOO +3 \N \N +4 data DATA +5 python PYTHON + +-- !vec_weighted_sum -- +1 10 20 17 +2 30 40 37 +3 \N 50 \N +4 60 \N \N +5 70 80 77 + +-- !vec_not_bool -- +1 true false +2 false true +3 true false +4 false true +5 true false + +-- !vec_greater_than -- +1 10 20 false +2 30 40 false +3 \N 50 false +4 60 \N false +5 70 80 false + +-- !vec_string_length -- +1 hello 5 +2 foo 3 +3 \N \N +4 data 4 +5 python 6 + +-- !vec_fill_null_int -- +1 10 10 +2 30 30 +3 \N 0 +4 60 60 +5 70 70 + +-- !vec_cumsum_int -- +1 10 10 +2 30 40 +3 \N \N +4 60 100 +5 70 170 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_int.out b/regression-test/data/pythonudf_p0/test_pythonudf_int.out new file mode 100644 index 00000000000000..cd3a1de1f20db2 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_int.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +1 2 3 4 +2 4 6 8 +3 6 9 12 +4 8 12 16 +5 10 15 20 +6 12 18 24 +7 14 21 28 +8 16 24 32 +9 18 27 36 +10 20 30 40 + +-- !select -- +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 + +-- !select -- +\N + +-- !select -- +3 +5 +7 +9 +11 +13 +15 +17 +19 +21 + +-- !select -- +\N + +-- !select -- +4 +7 +10 +13 +16 +19 +22 +25 +28 +31 + +-- !select -- +\N + +-- !select -- +5 +9 +13 +17 +21 +25 +29 +33 +37 +41 + +-- !select -- +\N + +-- !select_global_1 -- +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 + +-- !select_global_2 -- +\N + +-- !select_global_3 -- +4 +4 +4 +4 +4 +4 +4 +4 +4 +4 + +-- !select_global_4 -- +4 +4 +4 +4 +4 +4 +4 +4 +4 +4 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_map.out b/regression-test/data/pythonudf_p0/test_pythonudf_map.out new file mode 100644 index 00000000000000..7c7cf58b2f038c --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_map.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_1 -- +{1:1, 10:1, 100:1} 111 +{2:1, 20:1, 200:1, 2000:1} 2222 +{3:1} 3 + +-- !select_2 -- +{"114":"514", "1919":"810"} 1145141919810 +{"a":"bc", "def":"g", "hij":"k"} abcdefghijk + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_mixed_params.out b/regression-test/data/pythonudf_p0/test_pythonudf_mixed_params.out new file mode 100644 index 00000000000000..45d661e45decef --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_mixed_params.out @@ -0,0 +1,77 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_1 -- +1 100 150 +2 200 300 +3 150 225 +4 300 450 +5 250 375 +6 180 270 +7 220 330 +8 120 180 +9 280 420 +10 350 525 + +-- !select_2 -- +1 100 5 550 +2 200 3 660 +3 150 8 1320 +4 300 2 660 +5 250 6 1650 + +-- !select_3 -- +1 100 0.1 90 +2 200 0.15 170 +3 150 0.2 120 +4 300 0.05 285 +5 250 0.12 220 + +-- !select_4 -- +1 100 5 0.1 460 +2 200 3 0.15 520 +3 150 8 0.2 970 +4 300 2 0.05 580 +5 250 6 0.12 1330 + +-- !select_5 -- +1 A CAT_A +2 B CAT_B +3 A CAT_A +4 C CAT_C +5 B CAT_B + +-- !select_6 -- +1 5 15 +2 3 13 +3 8 18 +4 2 12 +5 6 16 + +-- !select_7 -- +1 100 0.1 100 +2 200 0.15 170 +3 150 0.2 150 +4 300 0.05 285 +5 250 0.12 220 +6 180 0.18 180 +7 220 0.08 202.4 +8 120 0.25 120 +9 280 0.1 252 +10 350 0.15 297.5 + +-- !select_8 -- +1 100 5 600 +2 200 3 720 +3 150 8 1440 + +-- !select_9 -- +1 100 5 120 +2 200 3 200 +3 150 8 180 +4 300 2 300 +5 250 6 300 + +-- !select_10 -- +1 100 109.5 +2 200 214 +3 150 161.75 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module.out b/regression-test/data/pythonudf_p0/test_pythonudf_module.out new file mode 100644 index 00000000000000..a1a8c2163ad02e --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_module.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +1001 5 10 500 62.19368581839511 +1002 40 1 20 2.679441541679836 +1003 15 5 300 40.4622349294233 +1004 -1 3 100 \N +1005 \N 2 200 \N +1006 7 \N 150 \N +1007 30 0 \N \N +1008 0 100 5000 100 +1009 100 2 10 3.595836866004329 +1010 8 8 800 68.85254329722605 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module_advanced.out b/regression-test/data/pythonudf_p0/test_pythonudf_module_advanced.out new file mode 100644 index 00000000000000..67bbf40e0a3cdf --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_module_advanced.out @@ -0,0 +1,57 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_module_ltv_normal -- +100 + +-- !select_module_ltv_null -- +\N + +-- !select_module_ltv_zero -- +100 + +-- !select_customer_analytics -- +1001 Premium 5 50 10000 100 +1002 Regular 30 10 2000 67.19368581839511 +1003 Inactive 60 5 500 20.37527840768416 +1004 VIP 2 100 25000 100 +1005 Regular 15 25 5000 100 +1006 Regular \N 30 6000 \N +1007 Regular 10 \N 3000 \N +1008 Inactive 45 8 \N \N +1009 VIP 0 200 50000 100 +1010 Churned 90 2 100 6.295836866004329 + +-- !select_segment_analysis -- +Churned 1 100 6.295836866004329 +Inactive 2 500 20.37527840768416 +Premium 1 10000 100 +Regular 4 4000 83.59684290919756 +VIP 2 37500 100 + +-- !select_high_value_customers -- + +-- !select_sorted_by_ltv -- +1009 VIP 100 +1005 Regular 100 +1004 VIP 100 +1001 Premium 100 +1002 Regular 67.19368581839511 + +-- !select_complex_query -- +1001 Premium 5 50 10000 100 Low Value +1004 VIP 2 100 25000 100 Low Value +1005 Regular 15 25 5000 100 Low Value +1009 VIP 0 200 50000 100 Low Value +1002 Regular 30 10 2000 67.19368581839511 Low Value +1003 Inactive 60 5 500 20.37527840768416 Low Value +1010 Churned 90 2 100 6.295836866004329 Low Value +1007 Regular 10 \N 3000 \N Unknown +1008 Inactive 45 8 \N \N Unknown +1006 Regular \N 30 6000 \N Unknown + +-- !select_join_with_module_udf -- +1001 Alice Johnson Premium 10000 100 +1004 Diana Prince VIP 25000 100 +1005 Eve Wilson Regular 5000 100 +1002 Bob Smith Regular 2000 67.19368581839511 +1003 Charlie Brown Inactive 500 20.37527840768416 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module_scalar.out b/regression-test/data/pythonudf_p0/test_pythonudf_module_scalar.out new file mode 100644 index 00000000000000..534da01a61418d --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_module_scalar.out @@ -0,0 +1,210 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !add_three -- +1 10 20 30 60 +2 5 15 25 45 +3 100 50 25 175 +4 7 3 11 21 +5 17 19 23 59 + +-- !safe_div -- +1 100 10 10 +2 200 20 10 +3 150 0 \N +4 80 5 16 +5 300 15 20 + +-- !discount -- +1 100 90 75 +2 200 180 150 +3 150 135 112.5 +4 80 72 60 +5 300 270 225 + +-- !compound_interest -- +1 100 162.8894626777442 +2 200 325.7789253554884 +3 150 244.3341940166163 +4 80 130.3115701421954 +5 300 488.6683880332326 + +-- !bmi -- +1 22.86 29.39 +2 22.86 29.39 +3 22.86 29.39 +4 22.86 29.39 +5 22.86 29.39 + +-- !fibonacci -- +1 10 55 +2 5 5 +4 7 13 +5 17 1597 + +-- !is_prime -- +1 10 20 30 false false false +2 5 15 25 true false false +3 100 50 25 false false false +4 7 3 11 true true true +5 17 19 23 true true true + +-- !gcd -- +1 10 20 10 +2 5 15 5 +3 100 50 50 +4 7 3 1 +5 17 19 1 + +-- !lcm -- +1 10 20 20 +2 5 15 15 +3 100 50 100 +4 7 3 21 +5 17 19 323 + +-- !reverse -- +1 hello world dlrow olleh +2 foo bar baz zab rab oof +3 racecar racecar +4 a man a plan a canal panama amanap lanac a nalp a nam a +5 python udf test tset fdu nohtyp + +-- !count_vowels -- +1 hello world 3 +2 foo bar baz 4 +3 racecar 3 +4 a man a plan a canal panama 10 +5 python udf test 3 + +-- !count_words -- +1 hello world 2 +2 foo bar baz 3 +3 racecar 1 +4 a man a plan a canal panama 7 +5 python udf test 3 + +-- !capitalize -- +1 hello world Hello World +2 foo bar baz Foo Bar Baz +3 racecar Racecar +4 a man a plan a canal panama A Man A Plan A Canal Panama +5 python udf test Python Udf Test + +-- !is_palindrome -- +1 hello world false +2 foo bar baz false +3 racecar true +4 a man a plan a canal panama true +5 python udf test false + +-- !similarity -- +1 hello world 50 +2 foo bar baz 10 +3 racecar 14.29 +4 a man a plan a canal panama 10 +5 python udf test 23.08 + +-- !mask_email -- +1 test@example.com t***@example.com +2 user@domain.com u***@domain.com +3 admin@test.org a***@test.org +4 info@company.net i***@company.net +5 contact@site.io c***@site.io + +-- !extract_domain -- +1 test@example.com example.com +2 user@domain.com domain.com +3 admin@test.org test.org +4 info@company.net company.net +5 contact@site.io site.io + +-- !levenshtein -- +1 hello world 0 +2 foo bar baz 10 +3 racecar 10 +4 a man a plan a canal panama 24 +5 python udf test 13 + +-- !days_between -- +1 2024-01-15 2024-01-20 5 +2 2024-02-10 2024-03-15 34 +3 2023-12-01 2024-01-01 31 +4 2024-06-15 2024-06-15 0 +5 2024-03-01 2024-12-31 305 + +-- !is_weekend -- +1 2024-01-15 false +2 2024-02-10 true +3 2023-12-01 false +4 2024-06-15 true +5 2024-03-01 false + +-- !get_quarter -- +1 2024-01-15 1 +2 2024-02-10 1 +3 2023-12-01 4 +4 2024-06-15 2 +5 2024-03-01 1 + +-- !age -- +1 34 +2 34 +3 33 +4 34 +5 34 + +-- !in_range -- +1 10 true +2 5 false +3 100 false +4 7 false +5 17 true + +-- !xor -- +1 true true false +2 false true true +3 true false true +4 false false false +5 true true false + +-- !grade -- +1 100 A +2 200 A +3 150 A +4 80 B +5 300 A + +-- !categorize_age -- +1 10 Child +2 5 Child +3 100 Senior +4 7 Child +5 17 Teenager + +-- !tax -- +1 100 15 +2 200 30 +3 150 22.5 +4 80 12 +5 300 45 + +-- !truncate -- +1 hello world hello w... +2 foo bar baz foo bar... +3 racecar racecar +4 a man a plan a canal panama a man a... +5 python udf test python ... 
+ +-- !null_handling -- +1 10 20 30 60 +2 \N 20 30 \N +3 10 \N 30 \N +4 10 20 \N \N +5 \N \N \N \N + +-- !string_edge -- +1 normal string gnirts lamron 3 2 +2 0 0 +3 0 0 +4 a a 1 1 +5 \N \N \N \N + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_module_vector.out b/regression-test/data/pythonudf_p0/test_pythonudf_module_vector.out new file mode 100644 index 00000000000000..bfcab62d10b990 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_module_vector.out @@ -0,0 +1,106 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !vec_add_const -- +1 10 20 130 +2 30 15 145 +3 50 50 200 +4 5 25 130 +5 100 10 210 + +-- !vec_multiply_round -- +1 1.5 2.5 3.75 +2 3.5 4.5 15.75 +3 5.5 2 11 +4 7.5 1.5 11.25 +5 9.5 3.5 33.25 + +-- !vec_concat_sep -- +1 hello world python udf hello world | python udf +2 foo bar test case foo bar | test case +3 data science machine learning data science | machine learning +4 apache doris database system apache doris | database system +5 vector operations pandas series vector operations | pandas series + +-- !vec_title_case -- +1 hello world Hello World +2 foo bar Foo Bar +3 data science Data Science +4 apache doris Apache Doris +5 vector operations Vector Operations + +-- !vec_conditional -- +1 10 20 20 +2 30 15 30 +3 50 50 50 +4 5 25 25 +5 100 10 100 + +-- !vec_percentage -- +1 1.5 2.5 60 +2 3.5 4.5 77.78 +3 5.5 2 275 +4 7.5 1.5 500 +5 9.5 3.5 271.43 + +-- !vec_in_range -- +1 10 true +2 30 true +3 50 true +4 5 false +5 100 false + +-- !vec_safe_div -- +1 1.5 2.5 0.6 +2 3.5 4.5 0.7777777777777778 +3 5.5 2 2.75 +4 7.5 1.5 5 +5 9.5 3.5 2.714285714285714 + +-- !vec_exp_decay -- +1 1.5 10 1.074796965860684 +2 3.5 30 1.287578044100048 +3 5.5 50 1.03881581560659 +4 7.5 5 6.348612936679606 +5 9.5 100 0.3389029367988978 + +-- !vec_first_word -- +1 hello world hello +2 foo bar foo +3 data science data +4 apache doris apache +5 vector operations vector + +-- !vec_abs_diff -- +1 10 20 10 +2 30 15 15 +3 50 50 0 +4 5 25 20 +5 100 10 90 + +-- !vec_power -- +1 1.5 2.25 +2 3.5 12.25 +3 5.5 30.25 +4 7.5 56.25 +5 9.5 90.25 + +-- !vec_bool_and -- +1 true true true +2 false true false +3 true false false +4 false false false +5 true true true + +-- !vec_bool_or -- +1 true true true +2 false true true +3 true false true +4 false false false +5 true true true + +-- !vec_clip -- +1 10 20 +2 30 30 +3 50 50 +4 5 20 +5 100 60 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_multiline_inline.out b/regression-test/data/pythonudf_p0/test_pythonudf_multiline_inline.out new file mode 100644 index 00000000000000..a6e0c9c5c00e2c --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_multiline_inline.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_complex_calc -- +25 + +-- !select_business_logic_vip -- +MEDIUM:3750.00 + +-- !select_business_logic_regular -- +MEDIUM:1800.00 + +-- !select_text_analyzer -- +len:15,words:3,upper:2,lower:8,digits:3 + +-- !select_statistics -- +mean:25.00,std:11.18,max:40.00,min:10.00 + +-- !select_table_multiline -- +1 VIP 15000 150 HIGH:11250.00 len:22,words:3,upper:1,lower:19,digits:0 +2 PREMIUM 8000 80 MEDIUM:6560.00 len:13,words:2,upper:1,lower:11,digits:0 +3 REGULAR 3000 40 MEDIUM:2700.00 len:13,words:2,upper:1,lower:11,digits:0 +4 VIP 500 10 LOW:400.00 len:15,words:3,upper:4,lower:9,digits:0 +5 REGULAR 12000 200 HIGH:10200.00 len:19,words:3,upper:1,lower:16,digits:0 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_performance.out b/regression-test/data/pythonudf_p0/test_pythonudf_performance.out new file mode 100644 index 00000000000000..d82f52f1df87cd --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_performance.out @@ -0,0 +1,59 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_perf_simple -- +4990 + +-- !select_perf_aggregate -- +A 2500 996 +B 2500 998 +C 2500 1000 +D 2500 1002 + +-- !select_perf_multiple_udf -- +D 1880 +C 1870 +B 1870 +A 1870 + +-- !select_perf_string -- +A 250 +B 250 +C 250 +D 250 + +-- !select_perf_complex -- +A 3372.3 6744.6 0 +B 3373.65 6745.950000000001 1.35 +C 3375 6747.3 2.7 +D 3376.35 6748.650000000001 4.05 + +-- !select_perf_nested -- +D 1002 +C 1000 +B 998 +A 996 + +-- !select_perf_null -- +5000 4000 50 + +-- !select_perf_order -- +9999 999 1998 +8999 999 1998 +7999 999 1998 +6999 999 1998 +5999 999 1998 +4999 999 1998 +3999 999 1998 +2999 999 1998 +1999 999 1998 +999 999 1998 +9998 998 1996 +8998 998 1996 +7998 998 1996 +6998 998 1996 +5998 998 1996 +4998 998 1996 +3998 998 1996 +2998 998 1996 +1998 998 1996 +998 998 1996 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_ret_map.out b/regression-test/data/pythonudf_p0/test_pythonudf_ret_map.out new file mode 100644 index 00000000000000..b1160ebf38bfd9 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_ret_map.out @@ -0,0 +1,17 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_1 -- +{1:1.1, 11:11.1} {10:11, 110:111} +{2:2.2, 22:22.2} {20:22, 220:222} + +-- !select_2 -- +{1:1, 10:1, 100:1} {10:10, 100:10, 1000:10} +{2:2, 20:2, 200:2} {20:20, 200:20, 2000:20} + +-- !select_3 -- +10 1.1 {"11410":"5141.1"} +20 2.2 {"11420":"5142.2"} + +-- !select_4 -- +{"abc":"efg", "h":"i"} {"abc114":"efg514", "h114":"i514"} +{"j":"k"} {"j114":"k514"} + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_runtime_version.out b/regression-test/data/pythonudf_p0/test_pythonudf_runtime_version.out new file mode 100644 index 00000000000000..2658cd549c7a55 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_runtime_version.out @@ -0,0 +1,4 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_version_short -- +42 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_schema_check.out b/regression-test/data/pythonudf_p0/test_pythonudf_schema_check.out new file mode 100644 index 00000000000000..a17c75e6f474df --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_schema_check.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_1 -- +1 10 1000 1010 +2 20 2000 2020 +3 30 3000 3030 +4 40 4000 4040 +5 50 5000 5050 + +-- !select_2 -- +1 100 1000 1100 +2 200 2000 2200 +3 300 3000 3300 +4 400 4000 4400 +5 500 5000 5500 + +-- !select_3 -- +1 1000 10000 11000 +2 2000 20000 22000 +3 3000 30000 33000 +4 4000 40000 44000 +5 5000 50000 55000 + +-- !select_4 -- +1 1.5 10.5 12 +2 2.5 20.5 23 +3 3.5 30.5 34 +4 4.5 40.5 45 +5 5.5 50.5 56 + +-- !select_5 -- +1 10 100 1000 1110 +2 20 200 2000 2220 +3 30 300 3000 3330 +4 40 400 4000 4440 +5 50 500 5000 5550 + +-- !select_6 -- +1 10 100 1000 +2 20 200 4000 +3 30 300 9000 +4 40 400 16000 +5 50 500 25000 + +-- !select_7 -- +1 1.5 10.5 7 +2 2.5 20.5 8.199999999999999 +3 3.5 30.5 8.714285714285714 +4 4.5 40.5 9 +5 5.5 50.5 9.181818181818182 + +-- !select_8 -- +1 1000 1.5 2001.5 +2 2000 2.5 4002.5 +3 3000 3.5 6003.5 +4 4000 4.5 8004.5 +5 5000 5.5 10005.5 + +-- !select_9 -- +1 test1 TEST1 +2 test2 TEST2 +3 test3 TEST3 +4 test4 TEST4 +5 test5 TEST5 + +-- !select_10 -- +1 true false +2 false true +3 true false +4 false true +5 true false + +-- !select_11 -- +1 10 10000 10010 +2 20 20000 20020 +3 30 30000 30030 +4 40 40000 40040 +5 50 50000 50050 + +-- !select_12 -- +1 1000 1500 +2 2000 3000 +3 3000 4500 +4 4000 6000 +5 5000 7500 + +-- !select_13 -- +1 test1 \N + +-- !select_14 -- +1 10000 11000 + +-- !select_15 -- +1 10.5 12.0 + +-- !select_16 -- +1 true 1001 + +-- !select_17 -- +1 2024-01-01 2024-01-01 + +-- !select_18 -- +1 1000 false + +-- !select_19 -- +1 test1 \N + +-- !select_20 -- +1 test1 true \N + +-- !select_22 -- +1 1.5 1001 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudf_string.out b/regression-test/data/pythonudf_p0/test_pythonudf_string.out new file mode 100644 index 00000000000000..59f2f7c776dd51 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudf_string.out @@ -0,0 +1,67 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_default -- +1 1 abcdefg1 poiuytre1abcdefg +2 2 abcdefg2 poiuytre2abcdefg +3 3 abcdefg3 poiuytre3abcdefg +4 4 abcdefg4 poiuytre4abcdefg +5 5 abcdefg5 poiuytre5abcdefg +6 6 abcdefg6 poiuytre6abcdefg +7 7 abcdefg7 poiuytre7abcdefg +8 8 abcdefg8 poiuytre8abcdefg +9 9 abcdefg9 poiuytre9abcdefg + +-- !select_default_2 -- +1 1 abcdefg1 poiuytre1abcdefg +2 2 abcdefg2 poiuytre2abcdefg +3 3 abcdefg3 poiuytre3abcdefg +4 4 abcdefg4 poiuytre4abcdefg +5 5 abcdefg5 poiuytre5abcdefg +6 6 abcdefg6 poiuytre6abcdefg +7 7 abcdefg7 poiuytre7abcdefg +8 8 abcdefg8 poiuytre8abcdefg +9 9 abcdefg9 poiuytre9abcdefg + +-- !select -- +ab***fg1 +ab***fg2 +ab***fg3 +ab***fg4 +ab***fg5 +ab***fg6 +ab***fg7 +ab***fg8 +ab***fg9 + +-- !select -- +po***********efg +po***********efg +po***********efg +po***********efg +po***********efg +po***********efg +po***********efg +po***********efg +po***********efg + +-- !select -- +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg +ab*def ab**efg + +-- !select_4 -- +ab***fg1 ab***fg1 +ab***fg2 ab***fg2 +ab***fg3 ab***fg3 +ab***fg4 ab***fg4 +ab***fg5 ab***fg5 +ab***fg6 ab***fg6 +ab***fg7 ab***fg7 +ab***fg8 ab***fg8 +ab***fg9 ab***fg9 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_array.out b/regression-test/data/pythonudf_p0/test_pythonudtf_array.out new file mode 100644 index 00000000000000..866fd08e7f052a --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudtf_array.out @@ -0,0 +1,28 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +1 2 a1b +2 4 a2b +3 6 a3b + +-- !select_1 -- +1 [1, 2, 3] +1 [1, 2, 3] +1 [1, 2, 3] +2 [1, 2, 3] +2 [1, 2, 3] +2 [1, 2, 3] +3 [1, 2, 3] +3 [1, 2, 3] +3 [1, 2, 3] + +-- !select_2 -- +1 ["Hi", "DataMind", "Good"] +1 ["Hi", "DataMind", "Good"] +1 ["Hi", "DataMind", "Good"] +2 ["Hi", "DataMind", "Good"] +2 ["Hi", "DataMind", "Good"] +2 ["Hi", "DataMind", "Good"] +3 ["Hi", "DataMind", "Good"] +3 ["Hi", "DataMind", "Good"] +3 ["Hi", "DataMind", "Good"] + diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_float.out b/regression-test/data/pythonudf_p0/test_pythonudtf_float.out new file mode 100644 index 00000000000000..907e4e402aba14 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudtf_float.out @@ -0,0 +1,24 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +111 11111.111 222222.33 1.234567834455677E7 1111112.0 +112 1234556.1 222222.33 2.2222222233333334E8 4.444444444444556E12 +113 8.765432E7 \N 6.666666666666667E9 \N + +-- !select1 -- +111 1.234567834455677E7 1.234567834455677E8 +112 2.2222222233333334E8 2.2222222233333335E9 +113 6.666666666666667E9 6.666666666666667E10 + +-- !select2 -- +111 1111112.0 1.111112E7 +112 4.444444444444556E12 4.4444444444445555E13 + +-- !select3 -- +111 11111.111 11101.111 +112 1234556.1 1234546.1 +113 8.765432E7 8.7654312E7 + +-- !select4 -- +111 222222.33 222212.33 +112 222222.33 222212.33 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_int.out b/regression-test/data/pythonudf_p0/test_pythonudtf_int.out new file mode 100644 index 00000000000000..7d35ab6a1014ca --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudtf_int.out @@ -0,0 +1,128 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_default -- +0 3 300 3000 +0 6 600 6000 +0 9 900 9000 +1 1 100 1000 +1 4 400 4000 +1 7 700 7000 +2 2 200 2000 +2 5 500 5000 +2 8 800 8000 + +-- !select1 -- +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +0 0 +1 1 +1 1 +1 1 +1 1 +1 1 +1 1 +1 1 +1 1 +1 1 +2 2 +2 2 +2 2 +2 2 +2 2 +2 2 +2 2 +2 2 +2 2 + +-- !select2 -- +1 1 +1 1 +1 1 +2 2 +2 2 +2 2 +3 3 +3 3 +3 3 +4 4 +4 4 +4 4 +5 5 +5 5 +5 5 +6 6 +6 6 +6 6 +7 7 +7 7 +7 7 +8 8 +8 8 +8 8 +9 9 +9 9 +9 9 + +-- !select3 -- +100 100 +100 100 +100 100 +200 200 +200 200 +200 200 +300 300 +300 300 +300 300 +400 400 +400 400 +400 400 +500 500 +500 500 +500 500 +600 600 +600 600 +600 600 +700 700 +700 700 +700 700 +800 800 +800 800 +800 800 +900 900 +900 900 +900 900 + +-- !select4 -- +1000 1000 +1000 1000 +1000 1000 +2000 2000 +2000 2000 +2000 2000 +3000 3000 +3000 3000 +3000 3000 +4000 4000 +4000 4000 +4000 4000 +5000 5000 +5000 5000 +5000 5000 +6000 6000 +6000 6000 +6000 6000 +7000 7000 +7000 7000 +7000 7000 +8000 8000 +8000 8000 +8000 8000 +9000 9000 +9000 9000 +9000 9000 + diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_map.out b/regression-test/data/pythonudf_p0/test_pythonudtf_map.out new file mode 100644 index 00000000000000..62556750bb3615 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudtf_map.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_1 -- +1 {"114":514, "1919":810} +1 {"114":514, "1919":810} +1 {"114":514, "1919":810} +2 {"a":11, "def":22, "hij":33} +2 {"a":11, "def":22, "hij":33} +2 {"a":11, "def":22, "hij":33} + diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_string.out b/regression-test/data/pythonudf_p0/test_pythonudtf_string.out new file mode 100644 index 00000000000000..da31e541493c20 --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudtf_string.out @@ -0,0 +1,32 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_default -- +1 1 abc,defg poiuytre,abcdefg +2 2 abc,defg poiuytre,abcdefg +0 3 abc,defg poiuytre,abcdefg +1 4 abc,defg poiuytre,abcdefg +2 5 abc,defg poiuytre,abcdefg +0 6 abc,defg poiuytre,abcdefg +1 7 abc,defg poiuytre,abcdefg +2 8 abc,defg poiuytre,abcdefg +9 9 ab,cdefg poiuytreabcde,fg + +-- !select1 -- +0 abc,defg abc +0 abc,defg defg +0 abc,defg abc +0 abc,defg defg +1 abc,defg abc +1 abc,defg defg +1 abc,defg abc +1 abc,defg defg +1 abc,defg abc +1 abc,defg defg +2 abc,defg abc +2 abc,defg defg +2 abc,defg abc +2 abc,defg defg +2 abc,defg abc +2 abc,defg defg +9 ab,cdefg ab +9 ab,cdefg cdefg + diff --git a/regression-test/data/pythonudf_p0/test_pythonudtf_struct.out b/regression-test/data/pythonudf_p0/test_pythonudtf_struct.out new file mode 100644 index 00000000000000..f6410400043b6b --- /dev/null +++ b/regression-test/data/pythonudf_p0/test_pythonudtf_struct.out @@ -0,0 +1,17 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select_default -- +0 +1 +2 + +-- !select1 -- +0 1 0.112 Hello, DataMind +0 1 0.112 Hello, DataMind +0 1 0.112 Hello, DataMind +1 1 0.112 Hello, DataMind +1 1 0.112 Hello, DataMind +1 1 0.112 Hello, DataMind +2 1 0.112 Hello, DataMind +2 1 0.112 Hello, DataMind +2 1 0.112 Hello, DataMind + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_basic_inline.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_basic_inline.out new file mode 100644 index 00000000000000..8fff208efbbe19 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_basic_inline.out @@ -0,0 +1,245 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !split_string -- +apple +banana +cherry + +-- !generate_series -- +1 +2 +3 +4 +5 +10 +11 +12 + +-- !generate_series_multiple -- +1 +2 +3 +4 +5 +10 +11 +12 + +-- !running_sum -- +10 10 +20 20 +30 30 +40 40 + +-- !explode_json -- +apple +banana +cherry + +-- !top_n -- +A 100 1 +A 90 1 +A 80 1 +A 70 1 +B 200 1 +B 190 1 + +-- !duplicate -- +Hello 1 +Hello 2 +Hello 3 + +-- !filter_positive -- +1 +3 +7 + +-- !cartesian -- +A X +A Y +A Z +B X +B Y +B Z + +-- !all_filtered -- + +-- !mixed_filter -- +2 -5 +4 -3 + +-- !empty_input -- + +-- !nullable_true -- +1 1 +2 \N +3 3 +4 \N +5 5 + +-- !non_nullable_false -- +1 20 +2 40 +3 60 + +-- !nullable_with_nulls -- +1 \N +2 \N +3 \N +4 \N + +-- !non_nullable_with_nulls -- +1 0 +2 20 +3 0 +4 40 + +-- !default_nullable -- +1 HELLO +2 \N +3 WORLD +4 \N + +-- !multi_nullable -- +1 a +1 b +1 c +2 \N +2 x +2 z +3 \N +3 \N +3 \N + +-- !scalar_int -- +1 +2 +3 +4 +5 + +-- !scalar_string -- +apple +banana +cherry + +-- !mixed_style -- +10 +20 +30 + +-- !return_scalar -- +HELLO + +-- !multi_field_check -- +42 84 + +-- !outer_without -- +1 20 +5 30 + +-- !outer_with -- +1 20 +2 \N +3 \N +4 \N +5 30 + +-- !outer_string_without -- +1 hello +1 world +5 single + +-- !outer_string_with -- +1 hello +1 world +2 \N +3 \N +4 \N +5 single + +-- !outer_range_without -- +1 1 +1 2 +1 3 +5 1 + +-- !outer_range_with -- +1 1 +1 2 +1 3 +2 \N +3 \N +4 \N +5 1 + +-- !outer_multifield_without -- +1 Alice 30 +5 Charlie 25 + +-- !outer_multifield_with -- +1 Alice 30 +2 \N \N +3 \N \N +4 \N \N +5 Charlie 25 + +-- !outer_mixed_functions -- +1 1 1 +1 1 2 +1 1 3 +1 1 4 +1 1 5 +1 2 1 +1 2 2 +1 2 3 +1 2 4 +1 2 5 +1 3 1 +1 3 2 +1 3 3 +1 3 4 +1 3 5 +1 4 1 +1 4 2 +1 4 3 +1 4 4 +1 4 5 +1 5 1 +1 5 2 +1 5 3 +1 5 4 +1 5 5 +3 1 1 +3 1 2 +3 1 3 +3 2 1 +3 2 2 +3 2 3 +3 3 1 +3 3 2 +3 3 3 + +-- !outer_builtin_explode -- +1 1 +1 2 +1 3 +4 5 + +-- !outer_builtin_explode_outer -- +1 1 +1 2 +1 3 +2 \N +3 \N +4 5 + +-- !outer_doc_inner -- +1 apple,banana +4 cherry + +-- !outer_doc_outer -- +1 apple,banana +2 \N +3 \N +4 cherry + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_basic_module.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_basic_module.out new file mode 100644 index 00000000000000..03c1fbd5947ff0 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_basic_module.out @@ -0,0 +1,71 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !split_string -- +apple +banana +cherry + +-- !generate_series -- +1 +2 +3 +4 +5 +10 +11 +12 + +-- !generate_series_multiple -- +1 +2 +3 +4 +5 +10 +11 +12 + +-- !running_sum -- +10 10 +20 20 +30 30 +40 40 + +-- !explode_json -- +apple +banana +cherry + +-- !top_n -- +A 100 1 +A 90 1 +A 80 1 +A 70 1 +B 200 1 +B 190 1 + +-- !duplicate -- +Hello 1 +Hello 2 +Hello 3 + +-- !filter_positive -- +1 +3 +7 + +-- !cartesian -- +A X +A Y +A Z +B X +B Y +B Z + +-- !all_filtered -- + +-- !mixed_filter -- +2 -5 +4 -3 + +-- !empty_input -- + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_data_types_inline.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_data_types_inline.out new file mode 100644 index 00000000000000..6fc24ef99dea3f --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_data_types_inline.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !tinyint -- +-60 -120 +0 0 +63 126 + +-- !smallint -- +-1000 1000000 +0 0 +1000 1000000 + +-- !bigint -- +-1000000000000 -999999999999 +0 1 +1000000000000 1000000000001 + +-- !float -- +-3.14 -1.57 +0.0 0.0 +2.718 1.359 + +-- !double -- +0.0 0.0 +4.0 2.0 +16.0 4.0 +100.0 10.0 + +-- !boolean -- +false true FALSE +true false TRUE + +-- !string -- +DoRiS 5 DORIS doris +Hello 5 HELLO hello +WORLD 5 WORLD world + +-- !date -- +2024-01-01 2024 1 1 +2024-06-15 2024 6 15 +2024-12-31 2024 12 31 + +-- !datetime -- +2024-01-01T08:30 8 30 +2024-06-15T12:00 12 0 +2024-12-31T23:59 23 59 + +-- !array_int -- +1 0 1 2 +1 1 2 4 +1 2 3 6 +2 0 10 20 +2 1 20 40 +3 0 100 200 + +-- !array_string -- +1 apple 5 +1 banana 6 +2 bird 4 +2 cat 3 +2 dog 3 + +-- !struct -- +Alice 25 adult +Bob 15 child +Charlie 30 adult + +-- !multi_types -- +100 apple apple_100 +200 banana banana_200 + +-- !decimal -- +123.45 246.90 +678.90 1357.80 +999.99 1999.98 + +-- !map_like -- +1 age 25 +1 score 90 +2 age 30 +2 level 3 +2 score 85 + +-- !nested_array -- +1 0 10 +1 0 20 +1 1 30 +1 1 40 +2 0 50 +2 1 60 +2 1 70 +2 1 80 + +-- !array_structs -- +1 Alice 25 90 +1 Bob 30 85 +2 Charlie 28 88 + +-- !struct_array -- +1 Alice 3 sports,music,reading +2 Bob 2 coding,gaming + +-- !json_extract -- +1 age 25 +1 city NYC +1 name Alice +2 age 30 +2 name Bob + +-- !complex_struct -- +1 101 Alice NYC 10001 +2 102 Bob LA 90001 + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_data_types_module.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_data_types_module.out new file mode 100644 index 00000000000000..6fc24ef99dea3f --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_data_types_module.out @@ -0,0 +1,112 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !tinyint -- +-60 -120 +0 0 +63 126 + +-- !smallint -- +-1000 1000000 +0 0 +1000 1000000 + +-- !bigint -- +-1000000000000 -999999999999 +0 1 +1000000000000 1000000000001 + +-- !float -- +-3.14 -1.57 +0.0 0.0 +2.718 1.359 + +-- !double -- +0.0 0.0 +4.0 2.0 +16.0 4.0 +100.0 10.0 + +-- !boolean -- +false true FALSE +true false TRUE + +-- !string -- +DoRiS 5 DORIS doris +Hello 5 HELLO hello +WORLD 5 WORLD world + +-- !date -- +2024-01-01 2024 1 1 +2024-06-15 2024 6 15 +2024-12-31 2024 12 31 + +-- !datetime -- +2024-01-01T08:30 8 30 +2024-06-15T12:00 12 0 +2024-12-31T23:59 23 59 + +-- !array_int -- +1 0 1 2 +1 1 2 4 +1 2 3 6 +2 0 10 20 +2 1 20 40 +3 0 100 200 + +-- !array_string -- +1 apple 5 +1 banana 6 +2 bird 4 +2 cat 3 +2 dog 3 + +-- !struct -- +Alice 25 adult +Bob 15 child +Charlie 30 adult + +-- !multi_types -- +100 apple apple_100 +200 banana banana_200 + +-- !decimal -- +123.45 246.90 +678.90 1357.80 +999.99 1999.98 + +-- !map_like -- +1 age 25 +1 score 90 +2 age 30 +2 level 3 +2 score 85 + +-- !nested_array -- +1 0 10 +1 0 20 +1 1 30 +1 1 40 +2 0 50 +2 1 60 +2 1 70 +2 1 80 + +-- !array_structs -- +1 Alice 25 90 +1 Bob 30 85 +2 Charlie 28 88 + +-- !struct_array -- +1 Alice 3 sports,music,reading +2 Bob 2 coding,gaming + +-- !json_extract -- +1 age 25 +1 city NYC +1 name Alice +2 age 30 +2 name Bob + +-- !complex_struct -- +1 101 Alice NYC 10001 +2 102 Bob LA 90001 + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_edge_cases_inline.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_edge_cases_inline.out new file mode 100644 index 00000000000000..0fd24efbe00d2e --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_edge_cases_inline.out @@ -0,0 +1,70 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !null_int -- +1 \N true -1 +2 0 false 0 +3 10 false 20 +4 \N true -1 + +-- !null_string -- +1 NULL -1 +2 EMPTY 0 +3 NORMAL 5 +4 NULL -1 + +-- !empty_array -- +1 NULL -1 +2 EMPTY 0 +3 NORMAL 3 + +-- !null_struct -- +1 true true Alice_25 +2 true false Bob_no_age +3 false true no_name_30 +4 false false all_fields_null + +-- !empty_table -- + +-- !single_row -- +100 100 +100 101 +100 102 + +-- !long_string -- +1 1000 AAAAAAAAAA AAAAAAAAAA +2 5000 BBBBBBBBBB BBBBBBBBBB + +-- !large_array -- +1 100 100 1 1 +2 50 250 5 5 + +-- !output_explosion -- +1 10 0 9 +2 50 0 49 + +-- !special_numbers -- +1 -2147483648 NEGATIVE true +2 -1 NEGATIVE false +3 0 ZERO false +4 1 POSITIVE false +5 2147483647 POSITIVE true +6 \N NULL false + +-- !special_doubles -- +1 0.0 ZERO +2 1.0E-15 VERY_SMALL +3 1.0E15 VERY_LARGE +4 -1.0E15 VERY_LARGE +5 3.14159 NORMAL + +-- !special_strings -- +1 11 false NORMAL +2 15 false HAS_SYMBOLS +3 8 false HAS_WHITESPACE +4 4 true HAS_UNICODE +5 0 false EMPTY + +-- !boundary_dates -- +1 1970-01-01 1970 true +2 2024-06-15 2024 false +3 9999-12-31 9999 true + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_edge_cases_module.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_edge_cases_module.out new file mode 100644 index 00000000000000..e5238713750663 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_edge_cases_module.out @@ -0,0 +1,70 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !null_int -- +1 \N true -1 +2 0 false 0 +3 10 false 20 +4 \N true -1 + +-- !null_string -- +1 NULL -1 +2 EMPTY 0 +3 NORMAL 5 +4 NULL -1 + +-- !empty_array -- +1 NULL -1 +2 EMPTY 0 +3 NORMAL 3 + +-- !null_struct -- +1 true true Alice_25 +2 true false Bob_no_age +3 false true no_name_30 +4 false false all_fields_null + +-- !empty_table -- + +-- !single_row -- +100 100 +100 101 +100 102 + +-- !long_string -- +1 1000 AAAAAAAAAA AAAAAAAAAA +2 5000 BBBBBBBBBB BBBBBBBBBB + +-- !large_array -- +1 100 100 1 1 +2 50 250 5 5 + +-- !output_explosion -- +1 10 0 9 +2 50 0 49 + +-- !special_numbers -- +1 -2147483648 NEGATIVE true +2 -1 NEGATIVE false +3 0 ZERO false +4 1 POSITIVE false +5 2147483647 POSITIVE true +6 \N NULL false + +-- !special_doubles -- +1 0 ZERO +2 1e-15 VERY_SMALL +3 1000000000000000 VERY_LARGE +4 -1000000000000000 VERY_LARGE +5 3.14159 NORMAL + +-- !special_strings -- +1 11 false NORMAL +2 15 false HAS_SYMBOLS +3 8 false HAS_WHITESPACE +4 4 true HAS_UNICODE +5 0 false EMPTY + +-- !boundary_dates -- +1 1970-01-01 1970 true +2 2024-06-15 2024 false +3 9999-12-31 9999 true + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_exceptions_inline.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_exceptions_inline.out new file mode 100644 index 00000000000000..afd3588411dfe5 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_exceptions_inline.out @@ -0,0 +1,86 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !safe_divide -- +1 10 2 5.0 success +2 10 0 \N division_by_zero +3 0 5 0.0 success +4 -8 4 -2.0 success + +-- !overflow_check -- +1 100 200 safe +2 5000000000000 10000000000000 safe +3 -5000000000000 -10000000000000 safe +4 \N \N null_input + +-- !parse_number -- +1 123 123.0 true +2 45.67 45.67 true +3 abc \N false +4 12.34.56 \N false +5 \N false +6 \N \N false + +-- !type_check -- +1 hello str 5 +2 str 0 +3 12345 str 5 +4 \N NoneType 0 + +-- !safe_index -- +1 3 1 20 success +2 3 5 \N out_of_bounds +3 3 -1 \N out_of_bounds +4 0 0 \N empty_array +5 0 0 \N null_array + +-- !collection_stats -- +1 5 15 3.0 computed +2 0 0 0.0 empty_array +3 0 0 0.0 null_array +4 2 30 15.0 computed + +-- !safe_struct_access -- +1 true true Alice 30 +2 true false Bob \N +3 false true \N 25 +4 false false \N \N + +-- !string_slice -- +1 hello world 0 5 hello success +2 hello world 6 11 world success +3 hello world 20 30 empty_slice +4 hello world 5 2 empty_slice +5 \N 0 5 \N null_string + +-- !check_encoding -- +1 hello 5 5 false +2 你好世界 12 4 true +3 café 5 4 true +4 0 0 false +5 \N 0 0 false + +-- !conditional_process -- +1 -10 negative 10 +2 0 zero 1 +3 50 small_positive 100 +4 200 large_positive 200 +5 \N null 0 + +-- !filter_yield -- +1 10 +5 22 + +-- !number_range -- +1 0.0 zero true +2 1.0E-150 extremely_small true +3 1.0E150 extremely_large true +4 0.5 small true +5 123.456 normal true +6 \N null true + +-- !date_validation -- +1 2024-01-01 2024 true normal +2 2000-02-29 2000 true normal +3 1970-01-01 1970 false normal +4 9999-12-31 9999 false far_future +5 \N 0 false null_date + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_exceptions_module.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_exceptions_module.out new file mode 100644 index 00000000000000..bf33b0a48658ab --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_exceptions_module.out @@ -0,0 +1,86 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !safe_divide -- +1 10 2 5 success +2 10 0 \N division_by_zero +3 0 5 0 success +4 -8 4 -2 success + +-- !overflow_check -- +1 100 200 safe +2 5000000000000 10000000000000 safe +3 -5000000000000 -10000000000000 safe +4 \N \N null_input + +-- !parse_number -- +1 123 123 true +2 45.67 45.67 true +3 abc \N false +4 12.34.56 \N false +5 \N false +6 \N \N false + +-- !type_check -- +1 hello str 5 +2 str 0 +3 12345 str 5 +4 \N NoneType 0 + +-- !safe_index -- +1 3 1 20 success +2 3 5 \N out_of_bounds +3 3 -1 \N out_of_bounds +4 0 0 \N empty_array +5 0 0 \N null_array + +-- !collection_stats -- +1 5 15 3 computed +2 0 0 0 empty_array +3 0 0 0 null_array +4 2 30 15 computed + +-- !safe_struct_access -- +1 true true Alice 30 +2 true false Bob \N +3 false true \N 25 +4 false false \N \N + +-- !string_slice -- +1 hello world 0 5 hello success +2 hello world 6 11 world success +3 hello world 20 30 empty_slice +4 hello world 5 2 empty_slice +5 \N 0 5 \N null_string + +-- !check_encoding -- +1 hello 5 5 false +2 你好世界 12 4 true +3 café 5 4 true +4 0 0 false +5 \N 0 0 false + +-- !conditional_process -- +1 -10 negative 10 +2 0 zero 1 +3 50 small_positive 100 +4 200 large_positive 200 +5 \N null 0 + +-- !filter_yield -- +1 10 +5 22 + +-- !number_range -- +1 0 zero true +2 1e-150 extremely_small true +3 1e+150 extremely_large true +4 0.5 small true +5 123.456 normal true +6 \N null true + +-- !date_validation -- +1 2024-01-01 2024 true normal +2 2000-02-29 2000 true normal +3 1970-01-01 1970 false normal +4 9999-12-31 9999 false far_future +5 \N 0 false null_date + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_io_patterns_inline.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_io_patterns_inline.out new file mode 100644 index 00000000000000..5b2e8d612694d3 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_io_patterns_inline.out @@ -0,0 +1,72 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !one_to_one -- +10 20 +20 40 +30 60 + +-- !one_to_many -- +1 1 +1 2 +1 3 +2 1 +2 2 +3 1 +3 2 +3 3 +3 4 + +-- !one_to_zero -- +2 +4 +6 + +-- !one_to_variable -- +1 hello +2 hello +2 world +4 a +4 b +4 c + +-- !aggregate_pattern -- +large 2 +medium 2 +small 2 + +-- !explosive -- +0 0 +0 1 +0 2 +1 0 +1 1 +1 2 + +-- !conditional -- +0 neutral +0 zero +5 negative +7 positive +10 positive + +-- !all_or_nothing -- +1 h 0 +1 e 1 +1 l 2 +1 l 3 +1 o 4 +3 w 0 +3 o 1 +3 r 2 +3 l 3 +3 d 4 + +-- !empty_input -- + +-- !batch_process -- +10 2 20 +10 3 30 +10 5 50 +20 2 40 +20 3 60 +20 5 100 + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_io_patterns_module.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_io_patterns_module.out new file mode 100644 index 00000000000000..5b2e8d612694d3 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_io_patterns_module.out @@ -0,0 +1,72 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !one_to_one -- +10 20 +20 40 +30 60 + +-- !one_to_many -- +1 1 +1 2 +1 3 +2 1 +2 2 +3 1 +3 2 +3 3 +3 4 + +-- !one_to_zero -- +2 +4 +6 + +-- !one_to_variable -- +1 hello +2 hello +2 world +4 a +4 b +4 c + +-- !aggregate_pattern -- +large 2 +medium 2 +small 2 + +-- !explosive -- +0 0 +0 1 +0 2 +1 0 +1 1 +1 2 + +-- !conditional -- +0 neutral +0 zero +5 negative +7 positive +10 positive + +-- !all_or_nothing -- +1 h 0 +1 e 1 +1 l 2 +1 l 3 +1 o 4 +3 w 0 +3 o 1 +3 r 2 +3 l 3 +3 d 4 + +-- !empty_input -- + +-- !batch_process -- +10 2 20 +10 3 30 +10 5 50 +20 2 40 +20 3 60 +20 5 100 + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_sql_integration_inline.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_sql_integration_inline.out new file mode 100644 index 00000000000000..d452805258fee1 --- /dev/null +++ b/regression-test/data/pythonudtf_p0/test_pythonudtf_sql_integration_inline.out @@ -0,0 +1,213 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !where_before -- +1 A 0 apple +1 A 1 banana +3 A 0 red +3 A 1 green +3 A 2 blue + +-- !where_after -- +1 0 apple +3 0 red +3 1 green +3 2 blue +4 0 one + +-- !where_combined -- +1 A apple +2 B cat +3 A red + +-- !join_inner -- +1 1 one false +1 2 two true +1 3 three false +2 2 two true +2 3 three false +2 4 four true + +-- !join_left -- +1 1 one +1 2 two +2 5 five +2 6 \N +2 7 \N + +-- !join_self -- +1 20 2 20 +1 30 2 30 + +-- !group_by_udtf -- +apple 2 +banana 2 +cat 2 +cherry 1 +dog 1 + +-- !group_by_mixed -- +animal cat 2 +animal dog 1 +fruit apple 2 +fruit banana 2 +fruit cherry 1 + +-- !group_by_having -- +apple 2 +banana 2 +cat 2 + +-- !group_by_multi_agg -- +1 5 1 5 15 3.0 +2 5 3 7 25 5.0 +3 3 10 12 33 11.0 + +-- !order_by_udtf -- +2 Bob 95 +1 Alice 92 +2 Bob 90 +2 Bob 88 +3 Charlie 88 + +-- !order_by_mixed -- +1 Alice 2 78 +1 Alice 1 92 +1 Alice 0 85 +2 Bob 2 95 +2 Bob 1 88 +2 Bob 0 90 +3 Charlie 2 88 +3 Charlie 1 82 +3 Charlie 0 70 + +-- !limit_only -- +1 85 +1 92 +1 78 + +-- !top_n_pattern -- +1 Alice 92 +1 Alice 85 +2 Bob 95 +2 Bob 90 +3 Charlie 88 +3 Charlie 82 + +-- !subquery_in -- +1 A,B,C +3 A,C,E + +-- !subquery_from -- +C 3 +A 2 +B 2 +D 1 +E 1 + +-- !subquery_nested -- +C 3 +A 2 +B 2 + +-- !distinct_udtf -- +blue +green +red +yellow + +-- !count_distinct -- +4 + +-- !union_all -- +1 X +1 Y +2 Y +2 Z + +-- !union_distinct -- +X +Y +Z + +-- !array_filter -- +1 3 +2 3 +2 4 +2 5 +3 3 +3 4 + +-- !array_aggregate -- +1 3 6 2.0 +2 4 14 3.5 +3 2 7 3.5 + +-- !window_function -- +1 A 10 1 100 +2 A 15 2 100 +1 A 20 3 100 +2 A 25 4 100 +1 A 30 5 100 +3 B 5 1 30 +3 B 10 2 30 +3 B 15 3 30 + +-- !case_when -- +1 5 small +1 15 medium +1 25 large +2 10 medium +2 20 large +2 30 large + +-- !nested_2level -- +1 1,2 1 +1 1,2 2 +1 3 3 +2 4,5 4 +2 4,5 5 + +-- !parallel_lateral -- +1 A X +1 A Y +1 B X +1 B Y +2 C Z + +-- !nested_join -- +1 soccer 10 +1 tennis 8 +1 pizza 5 +2 rock 9 + +-- !nested_groupby -- +1 3 450 +2 3 650 + +-- !nested_3level -- +1 0 A +1 0 B +1 0 C +1 1 D +1 1 E +1 2 F + +-- !nested_array_expansion -- +1 1 10 +1 1 20 +1 2 30 +2 1 40 +2 1 50 + +-- !nested_multifilter -- +1 \N 20 +1 \N 30 +1 \N 40 +2 \N 50 +2 \N 60 + +-- !nested_distinct -- +blue +green +red +yellow + diff --git a/regression-test/data/pythonudtf_p0/test_pythonudtf_sql_integration_module.out b/regression-test/data/pythonudtf_p0/test_pythonudtf_sql_integration_module.out new file mode 100644 index 00000000000000..c1107085c7c45a --- /dev/null +++ 
b/regression-test/data/pythonudtf_p0/test_pythonudtf_sql_integration_module.out @@ -0,0 +1,213 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !where_before -- +1 A 0 apple +1 A 1 banana +3 A 0 red +3 A 1 green +3 A 2 blue + +-- !where_after -- +1 0 apple +3 0 red +3 1 green +3 2 blue +4 0 one + +-- !where_combined -- +1 A apple +2 B cat +3 A red + +-- !join_inner -- +1 1 one false +1 2 two true +1 3 three false +2 2 two true +2 3 three false +2 4 four true + +-- !join_left -- +1 1 one +1 2 two +2 5 five +2 6 \N +2 7 \N + +-- !join_self -- +1 20 2 20 +1 30 2 30 + +-- !group_by_udtf -- +apple 2 +banana 2 +cat 2 +cherry 1 +dog 1 + +-- !group_by_mixed -- +animal cat 2 +animal dog 1 +fruit apple 2 +fruit banana 2 +fruit cherry 1 + +-- !group_by_having -- +apple 2 +banana 2 +cat 2 + +-- !group_by_multi_agg -- +1 5 1 5 15 3 +2 5 3 7 25 5 +3 3 10 12 33 11 + +-- !order_by_udtf -- +2 Bob 95 +1 Alice 92 +2 Bob 90 +2 Bob 88 +3 Charlie 88 + +-- !order_by_mixed -- +1 Alice 2 78 +1 Alice 1 92 +1 Alice 0 85 +2 Bob 2 95 +2 Bob 1 88 +2 Bob 0 90 +3 Charlie 2 88 +3 Charlie 1 82 +3 Charlie 0 70 + +-- !limit_only -- +1 85 +1 92 +1 78 + +-- !top_n_pattern -- +1 Alice 92 +1 Alice 85 +2 Bob 95 +2 Bob 90 +3 Charlie 88 +3 Charlie 82 + +-- !subquery_in -- +1 A,B,C +3 A,C,E + +-- !subquery_from -- +C 3 +A 2 +B 2 +D 1 +E 1 + +-- !subquery_nested -- +C 3 +A 2 +B 2 + +-- !distinct_udtf -- +blue +green +red +yellow + +-- !count_distinct -- +4 + +-- !union_all -- +1 X +1 Y +2 Y +2 Z + +-- !union_distinct -- +X +Y +Z + +-- !array_filter -- +1 3 +2 3 +2 4 +2 5 +3 3 +3 4 + +-- !array_aggregate -- +1 3 6 2 +2 4 14 3.5 +3 2 7 3.5 + +-- !window_function -- +1 A 10 1 100 +2 A 15 2 100 +1 A 20 3 100 +2 A 25 4 100 +1 A 30 5 100 +3 B 5 1 30 +3 B 10 2 30 +3 B 15 3 30 + +-- !case_when -- +1 5 small +1 15 medium +1 25 large +2 10 medium +2 20 large +2 30 large + +-- !nested_2level -- +1 1,2 1 +1 1,2 2 +1 3 3 +2 4,5 4 +2 4,5 5 + +-- !parallel_lateral -- +1 A X +1 A Y +1 B X +1 B Y +2 C Z + +-- !nested_join -- +1 soccer 10 +1 tennis 8 +1 pizza 5 +2 rock 9 + +-- !nested_groupby -- +1 3 450 +2 3 650 + +-- !nested_3level -- +1 0 A +1 0 B +1 0 C +1 1 D +1 1 E +1 2 F + +-- !nested_array_expansion -- +1 1 10 +1 1 20 +1 2 30 +2 1 40 +2 1 50 + +-- !nested_multifilter -- +1 \N 20 +1 \N 30 +1 \N 40 +2 \N 50 +2 \N 60 + +-- !nested_distinct -- +blue +green +red +yellow + diff --git a/regression-test/pipeline/cloud_p0/conf/be_custom.conf b/regression-test/pipeline/cloud_p0/conf/be_custom.conf index 4d271ee063cd0a..01b8b407c67848 100644 --- a/regression-test/pipeline/cloud_p0/conf/be_custom.conf +++ b/regression-test/pipeline/cloud_p0/conf/be_custom.conf @@ -54,3 +54,8 @@ enable_batch_get_delete_bitmap=true get_delete_bitmap_bytes_threshold=10 enable_fetch_rowsets_from_peer_replicas = true + +# enable to use python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python \ No newline at end of file diff --git a/regression-test/pipeline/cloud_p0/conf/fe_custom.conf b/regression-test/pipeline/cloud_p0/conf/fe_custom.conf index 992ddbb00a6200..b68a40de572e9e 100644 --- a/regression-test/pipeline/cloud_p0/conf/fe_custom.conf +++ b/regression-test/pipeline/cloud_p0/conf/fe_custom.conf @@ -46,4 +46,7 @@ workload_group_max_num = 25 enable_advance_next_id = true check_table_lock_leaky = true -enable_outfile_to_local=true \ No newline at end of file +enable_outfile_to_local=true + +# enable python udf +enable_python_udf = true \ No newline at end of 
file diff --git a/regression-test/pipeline/cloud_p1/conf/be_custom.conf b/regression-test/pipeline/cloud_p1/conf/be_custom.conf index aed4d69efbf704..bd98ffda4ffbd2 100644 --- a/regression-test/pipeline/cloud_p1/conf/be_custom.conf +++ b/regression-test/pipeline/cloud_p1/conf/be_custom.conf @@ -37,3 +37,8 @@ enable_table_size_correctness_check=true enable_write_index_searcher_cache=true large_cumu_compaction_task_min_thread_num=3 enable_prefill_all_dbm_agg_cache_after_compaction=true + +# enable python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python \ No newline at end of file diff --git a/regression-test/pipeline/cloud_p1/conf/fe_custom.conf b/regression-test/pipeline/cloud_p1/conf/fe_custom.conf index b91a4ed6d38bdf..730fc121393ba4 100644 --- a/regression-test/pipeline/cloud_p1/conf/fe_custom.conf +++ b/regression-test/pipeline/cloud_p1/conf/fe_custom.conf @@ -37,3 +37,6 @@ enable_advance_next_id = true arrow_flight_sql_port = 8081 enable_job_schedule_second_for_test = true + +# enable python udf +enable_python_udf = true \ No newline at end of file diff --git a/regression-test/pipeline/external/conf/be.conf b/regression-test/pipeline/external/conf/be.conf index b2faf3123c7a4e..978e7a3b8125f4 100644 --- a/regression-test/pipeline/external/conf/be.conf +++ b/regression-test/pipeline/external/conf/be.conf @@ -73,4 +73,7 @@ crash_in_memory_tracker_inaccurate = true enable_parquet_page_index=true enable_graceful_exit_check=true - +# enable python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python diff --git a/regression-test/pipeline/external/conf/fe.conf b/regression-test/pipeline/external/conf/fe.conf index 365c0b9337576e..7fdf18c63c6552 100644 --- a/regression-test/pipeline/external/conf/fe.conf +++ b/regression-test/pipeline/external/conf/fe.conf @@ -100,3 +100,6 @@ hms_events_polling_interval_ms=700 KRB5_CONFIG=/keytabs/krb5.conf + +# enable python udf +enable_python_udf = true \ No newline at end of file diff --git a/regression-test/pipeline/nonConcurrent/conf/be.conf b/regression-test/pipeline/nonConcurrent/conf/be.conf index e172968083f003..4779461b41ce08 100644 --- a/regression-test/pipeline/nonConcurrent/conf/be.conf +++ b/regression-test/pipeline/nonConcurrent/conf/be.conf @@ -91,4 +91,7 @@ enable_graceful_exit_check=true enable_fetch_rowsets_from_peer_replicas = true - +# enable python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python diff --git a/regression-test/pipeline/nonConcurrent/conf/fe.conf b/regression-test/pipeline/nonConcurrent/conf/fe.conf index 48085415c08444..14021e12592e07 100644 --- a/regression-test/pipeline/nonConcurrent/conf/fe.conf +++ b/regression-test/pipeline/nonConcurrent/conf/fe.conf @@ -92,3 +92,6 @@ max_query_profile_num = 2000 max_spilled_profile_num = 2000 check_table_lock_leaky=true + +# enable python udf +enable_python_udf = true \ No newline at end of file diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf index 01df493f002f37..62c08e1551f24d 100644 --- a/regression-test/pipeline/p0/conf/be.conf +++ b/regression-test/pipeline/p0/conf/be.conf @@ -92,4 +92,7 @@ enable_graceful_exit_check=true enable_prefill_all_dbm_agg_cache_after_compaction=true enable_fetch_rowsets_from_peer_replicas = true - +# enable python udf +enable_python_udf_support = true +python_env_mode = venv
+python_venv_interpreter_paths = /usr/bin/python diff --git a/regression-test/pipeline/p0/conf/fe.conf b/regression-test/pipeline/p0/conf/fe.conf index df28a5743bd300..02756ec90c09c7 100644 --- a/regression-test/pipeline/p0/conf/fe.conf +++ b/regression-test/pipeline/p0/conf/fe.conf @@ -92,3 +92,6 @@ max_query_profile_num = 2000 max_spilled_profile_num = 2000 check_table_lock_leaky=true + +# enable python udf +enable_python_udf = true \ No newline at end of file diff --git a/regression-test/pipeline/p1/conf/be.conf b/regression-test/pipeline/p1/conf/be.conf index 4d02df140fbe98..f8bce1605abad0 100644 --- a/regression-test/pipeline/p1/conf/be.conf +++ b/regression-test/pipeline/p1/conf/be.conf @@ -77,4 +77,7 @@ enable_graceful_exit_check=true enable_prefill_all_dbm_agg_cache_after_compaction=true - +# enable python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python diff --git a/regression-test/pipeline/p1/conf/fe.conf b/regression-test/pipeline/p1/conf/fe.conf index f20d51b62e6575..1431c264381ddb 100644 --- a/regression-test/pipeline/p1/conf/fe.conf +++ b/regression-test/pipeline/p1/conf/fe.conf @@ -81,3 +81,6 @@ auth_token = 5ff161c3-2c08-4079-b108-26c8850b6598 # sys_log_verbose_modules = enable_advance_next_id = true + +# enable python udf +enable_python_udf = true \ No newline at end of file diff --git a/regression-test/pipeline/performance/conf/be_custom.conf b/regression-test/pipeline/performance/conf/be_custom.conf index 08b91280a5321b..7e0e5f7e7428d1 100644 --- a/regression-test/pipeline/performance/conf/be_custom.conf +++ b/regression-test/pipeline/performance/conf/be_custom.conf @@ -24,4 +24,9 @@ streaming_load_max_mb=102400 # So feature has bug, so by default is false, only open it in pipeline to observe enable_parquet_page_index=true -enable_graceful_exit_check=true \ No newline at end of file +enable_graceful_exit_check=true + +# enable python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python \ No newline at end of file diff --git a/regression-test/pipeline/performance/conf/fe_custom.conf b/regression-test/pipeline/performance/conf/fe_custom.conf index 65af05709caa3a..4cfb162f0fa497 100644 --- a/regression-test/pipeline/performance/conf/fe_custom.conf +++ b/regression-test/pipeline/performance/conf/fe_custom.conf @@ -27,3 +27,6 @@ priority_networks=127.0.0.1/24 meta_dir=/data/doris-meta-${branch_name} stream_load_default_timeout_second=3600 + +# enable python udf +enable_python_udf = true diff --git a/regression-test/pipeline/vault_p0/conf/be_custom.conf b/regression-test/pipeline/vault_p0/conf/be_custom.conf index d201cad3facc27..a71f99dda4a203 100644 --- a/regression-test/pipeline/vault_p0/conf/be_custom.conf +++ b/regression-test/pipeline/vault_p0/conf/be_custom.conf @@ -39,3 +39,8 @@ pipeline_task_leakage_detect_period_sec=1 crash_in_memory_tracker_inaccurate = true enable_table_size_correctness_check=true enable_brpc_connection_check=true + +# enable python udf +enable_python_udf_support = true +python_env_mode = venv +python_venv_interpreter_paths = /usr/bin/python \ No newline at end of file diff --git a/regression-test/pipeline/vault_p0/conf/fe_custom.conf b/regression-test/pipeline/vault_p0/conf/fe_custom.conf index f62ffa19fc7676..17b97a6b5732d0 100644 --- a/regression-test/pipeline/vault_p0/conf/fe_custom.conf +++ b/regression-test/pipeline/vault_p0/conf/fe_custom.conf @@ -40,3 +40,6 @@ enable_job_schedule_second_for_test=true
workload_sched_policy_interval_ms = 1000 enable_advance_next_id = true + +# enable python udf +enable_python_udf = true \ No newline at end of file diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_basic.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_basic.groovy new file mode 100644 index 00000000000000..2c5e976b36ae25 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_basic.groovy @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_basic") { + def pyPath = """${context.file.parent}/udaf_scripts/pyudaf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + sql """ DROP TABLE IF EXISTS test_pythonudaf_basic """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudaf_basic ( + `user_id` INT NOT NULL COMMENT "User ID", + `int_col` INT COMMENT "Integer column (nullable for NULL tests)", + `bigint_col` BIGINT NOT NULL COMMENT "BigInt column", + `double_col` DOUBLE NOT NULL COMMENT "Double column", + `category` VARCHAR(10) NOT NULL COMMENT "Category for grouping" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + + // Insert test data + StringBuilder sb = new StringBuilder() + int i = 1 + for (; i < 10; i++) { + sb.append(""" + (${i}, ${i * 10}, ${i * 100}, ${i * 1.5}, 'cat${i % 3}'), + """) + } + sb.append(""" + (${i}, ${i * 10}, ${i * 100}, ${i * 1.5}, 'cat${i % 3}') + """) + sql """ INSERT INTO test_pythonudaf_basic VALUES + ${sb.toString()} + """ + qt_select_default """ SELECT * FROM test_pythonudaf_basic ORDER BY user_id; """ + + // ======================================== + // Test 1: Basic SumInt UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS python_udaf_sum_int(int) """ + + sql """ CREATE AGGREGATE FUNCTION python_udaf_sum_int(int) RETURNS bigint PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="sum_int.SumInt", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + // Test basic aggregation - sum all int_col values + qt_select_sum_all """ SELECT python_udaf_sum_int(int_col) as result FROM test_pythonudaf_basic; """ + + // Test with GROUP BY + qt_select_sum_group """ SELECT category, python_udaf_sum_int(int_col) as sum_result + FROM test_pythonudaf_basic + GROUP BY category + ORDER BY category; """ + + // Test with multiple aggregates in same query + qt_select_sum_multiple """ SELECT category, + python_udaf_sum_int(int_col) as py_sum, + sum(int_col) as native_sum + FROM test_pythonudaf_basic + GROUP BY category + ORDER BY category; """ + + // Test with NULL handling - insert some NULL values + sql """ INSERT INTO test_pythonudaf_basic VALUES 
(11, NULL, 1100, 16.5, 'cat2'); """ + sql """ INSERT INTO test_pythonudaf_basic VALUES (12, NULL, 1200, 18.0, 'cat0'); """ + + qt_select_sum_with_null """ SELECT python_udaf_sum_int(int_col) as result FROM test_pythonudaf_basic; """ + + qt_select_sum_group_with_null """ SELECT category, python_udaf_sum_int(int_col) as sum_result + FROM test_pythonudaf_basic + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 2: AvgDouble UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS python_udaf_avg_double(double) """ + + sql """ CREATE AGGREGATE FUNCTION python_udaf_avg_double(double) RETURNS double PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="sum_int.AvgDouble", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select_avg_all """ SELECT python_udaf_avg_double(double_col) as result FROM test_pythonudaf_basic; """ + + qt_select_avg_group """ SELECT category, + python_udaf_avg_double(double_col) as py_avg, + avg(double_col) as native_avg + FROM test_pythonudaf_basic + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 3: Window Function Support + // ======================================== + qt_select_window_partition """ SELECT user_id, category, int_col, + python_udaf_sum_int(int_col) OVER(PARTITION BY category) as sum_by_category + FROM test_pythonudaf_basic + WHERE int_col IS NOT NULL + ORDER BY category, user_id; """ + + qt_select_window_order """ SELECT user_id, category, int_col, + python_udaf_sum_int(int_col) OVER(PARTITION BY category ORDER BY user_id) as running_sum + FROM test_pythonudaf_basic + WHERE int_col IS NOT NULL + ORDER BY category, user_id; """ + + qt_select_window_rows """ SELECT user_id, category, int_col, + python_udaf_sum_int(int_col) OVER( + PARTITION BY category + ORDER BY user_id + ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING + ) as window_sum + FROM test_pythonudaf_basic + WHERE int_col IS NOT NULL + ORDER BY category, user_id; """ + + // ======================================== + // Test 4: Global Function + // ======================================== + sql """ DROP GLOBAL FUNCTION IF EXISTS python_udaf_sum_int_global(int) """ + + sql """ CREATE GLOBAL AGGREGATE FUNCTION python_udaf_sum_int_global(int) RETURNS bigint PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="sum_int.SumInt", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select_global_1 """ SELECT python_udaf_sum_int_global(int_col) as result FROM test_pythonudaf_basic; """ + qt_select_global_2 """ SELECT category, python_udaf_sum_int_global(int_col) as sum_result + FROM test_pythonudaf_basic + GROUP BY category + ORDER BY category; """ + + } finally { + try_sql("DROP GLOBAL FUNCTION IF EXISTS python_udaf_sum_int_global(int);") + try_sql("DROP FUNCTION IF EXISTS python_udaf_avg_double(double);") + try_sql("DROP FUNCTION IF EXISTS python_udaf_sum_int(int);") + try_sql("DROP TABLE IF EXISTS test_pythonudaf_basic") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_aggregation_inline.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_aggregation_inline.groovy new file mode 100644 index 00000000000000..da4efc9b4cce17 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_aggregation_inline.groovy @@ -0,0 +1,408 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_complex_aggregation_inline") { + // Test complex aggregation scenarios with Python UDAFs + // Including: variance, standard deviation, median, collect_list, range, geometric mean, weighted average + + def runtime_version = "3.8.10" + + try { + // Create test table with statistical data + sql """ DROP TABLE IF EXISTS stats_test; """ + sql """ + CREATE TABLE stats_test ( + id INT, + category STRING, + value DOUBLE, + score INT, + tag STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO stats_test VALUES + (1, 'A', 10.5, 85, 'alpha'), + (2, 'A', 20.3, 92, 'beta'), + (3, 'A', 15.7, 78, 'gamma'), + (4, 'A', 30.2, 95, 'delta'), + (5, 'A', 25.1, 88, 'alpha'), + (6, 'B', 12.4, 70, 'beta'), + (7, 'B', 18.9, 85, 'gamma'), + (8, 'B', 22.5, 90, 'alpha'), + (9, 'B', 16.3, 82, 'beta'), + (10, 'C', 35.7, 98, 'delta'), + (11, 'C', 28.4, 91, 'gamma'), + (12, 'C', 31.2, 87, 'alpha'), + (13, 'C', 26.8, 93, 'beta'), + (14, 'C', 29.5, 89, 'delta'), + (15, 'C', 33.1, 95, 'gamma'); + """ + + qt_select_data """ SELECT * FROM stats_test ORDER BY id; """ + + // ======================================== + // Test 1: Variance UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_variance(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_variance(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "VarianceUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class VarianceUDAF: + def __init__(self): + self.count = 0 + self.sum_val = 0.0 + self.sum_sq = 0.0 + + @property + def aggregate_state(self): + return (self.count, self.sum_val, self.sum_sq) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum_val += value + self.sum_sq += value * value + + def merge(self, other_state): + other_count, other_sum, other_sum_sq = other_state + self.count += other_count + self.sum_val += other_sum + self.sum_sq += other_sum_sq + + def finish(self): + if self.count == 0: + return None + mean = self.sum_val / self.count + variance = (self.sum_sq / self.count) - (mean * mean) + return variance +\$\$; + """ + + qt_variance_all """ SELECT py_variance(value) as variance FROM stats_test; """ + qt_variance_group """ SELECT category, py_variance(value) as variance + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 2: Standard Deviation UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_stddev(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_stddev(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "StdDevUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import math + +class StdDevUDAF: + def
__init__(self): + self.count = 0 + self.sum_val = 0.0 + self.sum_sq = 0.0 + + @property + def aggregate_state(self): + return (self.count, self.sum_val, self.sum_sq) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum_val += value + self.sum_sq += value * value + + def merge(self, other_state): + other_count, other_sum, other_sum_sq = other_state + self.count += other_count + self.sum_val += other_sum + self.sum_sq += other_sum_sq + + def finish(self): + if self.count == 0: + return None + mean = self.sum_val / self.count + variance = (self.sum_sq / self.count) - (mean * mean) + return math.sqrt(max(0, variance)) +\$\$; + """ + + qt_stddev_all """ SELECT py_stddev(value) as stddev FROM stats_test; """ + qt_stddev_group """ SELECT category, py_stddev(value) as stddev + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 3: Median UDAF (Approximate using sorted list) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_median(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_median(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MedianUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MedianUDAF: + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + sorted_vals = sorted(self.values) + n = len(sorted_vals) + if n % 2 == 0: + return (sorted_vals[n//2 - 1] + sorted_vals[n//2]) / 2.0 + else: + return sorted_vals[n//2] +\$\$; + """ + + qt_median_all """ SELECT py_median(value) as median FROM stats_test; """ + qt_median_group """ SELECT category, py_median(value) as median + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 4: Collect List UDAF (String concatenation) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_collect_list(STRING); """ + sql """ + CREATE AGGREGATE FUNCTION py_collect_list(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CollectListUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CollectListUDAF: + def __init__(self): + self.items = [] + + @property + def aggregate_state(self): + return self.items + + def accumulate(self, value): + if value is not None: + self.items.append(value) + + def merge(self, other_state): + if other_state: + self.items.extend(other_state) + + def finish(self): + if not self.items: + return None + return ','.join(sorted(self.items)) +\$\$; + """ + + qt_collect_all """ SELECT py_collect_list(tag) as tags FROM stats_test; """ + qt_collect_group """ SELECT category, py_collect_list(tag) as tags + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 5: Min-Max Range UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_range(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_range(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RangeUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class RangeUDAF: + def __init__(self): + self.min_val = None + self.max_val = None + + @property + def aggregate_state(self): + return (self.min_val, self.max_val) + + def 
accumulate(self, value): + if value is not None: + if self.min_val is None or value < self.min_val: + self.min_val = value + if self.max_val is None or value > self.max_val: + self.max_val = value + + def merge(self, other_state): + other_min, other_max = other_state + if other_min is not None: + if self.min_val is None or other_min < self.min_val: + self.min_val = other_min + if other_max is not None: + if self.max_val is None or other_max > self.max_val: + self.max_val = other_max + + def finish(self): + if self.min_val is None or self.max_val is None: + return None + return self.max_val - self.min_val +\$\$; + """ + + qt_range_all """ SELECT py_range(score) as score_range FROM stats_test; """ + qt_range_group """ SELECT category, py_range(score) as score_range + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 6: Geometric Mean UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_geomean(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_geomean(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "GeometricMeanUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import math + +class GeometricMeanUDAF: + def __init__(self): + self.log_sum = 0.0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.log_sum, self.count) + + def accumulate(self, value): + if value is not None and value > 0: + self.log_sum += math.log(value) + self.count += 1 + + def merge(self, other_state): + other_log_sum, other_count = other_state + self.log_sum += other_log_sum + self.count += other_count + + def finish(self): + if self.count == 0: + return None + return math.exp(self.log_sum / self.count) +\$\$; + """ + + qt_geomean_all """ SELECT py_geomean(value) as geomean FROM stats_test; """ + qt_geomean_group """ SELECT category, py_geomean(value) as geomean + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 7: Weighted Average UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_weighted_avg(DOUBLE, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_weighted_avg(DOUBLE, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "WeightedAvgUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class WeightedAvgUDAF: + def __init__(self): + self.weighted_sum = 0.0 + self.weight_sum = 0 + + @property + def aggregate_state(self): + return (self.weighted_sum, self.weight_sum) + + def accumulate(self, value, weight): + if value is not None and weight is not None and weight > 0: + self.weighted_sum += value * weight + self.weight_sum += weight + + def merge(self, other_state): + other_weighted_sum, other_weight_sum = other_state + self.weighted_sum += other_weighted_sum + self.weight_sum += other_weight_sum + + def finish(self): + if self.weight_sum == 0: + return None + return self.weighted_sum / self.weight_sum +\$\$; + """ + + qt_weighted_avg_all """ SELECT py_weighted_avg(value, score) as weighted_avg FROM stats_test; """ + qt_weighted_avg_group """ SELECT category, py_weighted_avg(value, score) as weighted_avg + FROM stats_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 8: Multiple aggregations in one query + // ======================================== + qt_multi_agg """ + SELECT + category, + py_variance(value) as variance, + py_stddev(value) as stddev, + py_median(value) as median, + 
py_range(score) as score_range, + py_geomean(value) as geomean, + py_weighted_avg(value, score) as weighted_avg + FROM stats_test + GROUP BY category + ORDER BY category; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_variance(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_stddev(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_median(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_collect_list(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_range(INT);") + try_sql("DROP FUNCTION IF EXISTS py_geomean(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_weighted_avg(DOUBLE, INT);") + try_sql("DROP TABLE IF EXISTS stats_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_aggregation_module.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_aggregation_module.groovy new file mode 100644 index 00000000000000..31e122b88b9895 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_aggregation_module.groovy @@ -0,0 +1,340 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
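+
+// The module-based UDAFs below are resolved via "symbol" = "stats_udaf.<ClassName>"
+// from pyudaf.zip, and are assumed to implement the same aggregate protocol that the
+// inline variant of this suite defines literally: an aggregate_state property plus
+// accumulate/merge/finish methods. As a sketch under that assumption (illustrative
+// only; the stats_udaf.py shipped in udaf_scripts is the source of truth),
+// stats_udaf.VarianceUDAF would look like:
+//
+//     class VarianceUDAF:
+//         def __init__(self):
+//             # running aggregate state: row count, sum, and sum of squares
+//             self.count = 0
+//             self.sum_val = 0.0
+//             self.sum_sq = 0.0
+//
+//         @property
+//         def aggregate_state(self):
+//             # snapshot that is serialized and handed to merge() on another instance
+//             return (self.count, self.sum_val, self.sum_sq)
+//
+//         def accumulate(self, value):
+//             if value is not None:
+//                 self.count += 1
+//                 self.sum_val += value
+//                 self.sum_sq += value * value
+//
+//         def merge(self, other_state):
+//             other_count, other_sum, other_sum_sq = other_state
+//             self.count += other_count
+//             self.sum_val += other_sum
+//             self.sum_sq += other_sum_sq
+//
+//         def finish(self):
+//             # population variance: E[X^2] - (E[X])^2; None on empty input
+//             if self.count == 0:
+//                 return None
+//             mean = self.sum_val / self.count
+//             return (self.sum_sq / self.count) - mean * mean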
+ +suite("test_pythonudaf_complex_aggregation_module") { + // Test complex aggregation scenarios with Python UDAFs using file-based deployment + // UDAFs are loaded from pyudaf.zip file + + def pyPath = """${context.file.parent}/udaf_scripts/pyudaf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // Create test table with statistical data + sql """ DROP TABLE IF EXISTS stats_test_file; """ + sql """ + CREATE TABLE stats_test_file ( + id INT, + category VARCHAR(50), + value DOUBLE, + score INT, + tag VARCHAR(50) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO stats_test_file VALUES + (1, 'A', 10.5, 85, 'alpha'), + (2, 'A', 20.3, 92, 'beta'), + (3, 'A', 15.7, 78, 'gamma'), + (4, 'A', 30.2, 95, 'delta'), + (5, 'A', 25.1, 88, 'alpha'), + (6, 'B', 12.4, 70, 'beta'), + (7, 'B', 18.9, 85, 'gamma'), + (8, 'B', 22.5, 90, 'alpha'), + (9, 'B', 16.3, 82, 'beta'), + (10, 'C', 35.7, 98, 'delta'), + (11, 'C', 28.4, 91, 'gamma'), + (12, 'C', 31.2, 87, 'alpha'), + (13, 'C', 26.8, 93, 'beta'), + (14, 'C', 29.5, 89, 'delta'), + (15, 'C', 33.1, 95, 'gamma'); + """ + + qt_select_data """ SELECT * FROM stats_test_file ORDER BY id; """ + + // ======================================== + // Test 1: Variance UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_variance_file(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_variance_file(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.VarianceUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_variance_all """ SELECT py_variance_file(value) as variance FROM stats_test_file; """ + qt_variance_group """ SELECT category, py_variance_file(value) as variance + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 2: Standard Deviation UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_stddev_file(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_stddev_file(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.StdDevUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_stddev_all """ SELECT py_stddev_file(value) as stddev FROM stats_test_file; """ + qt_stddev_group """ SELECT category, py_stddev_file(value) as stddev + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 3: Median UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_median_file(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_median_file(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.MedianUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_median_all """ SELECT py_median_file(value) as median FROM stats_test_file; """ + qt_median_group """ SELECT category, py_median_file(value) as median + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 4: Collect List UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS 
py_collect_list_file(VARCHAR); """ + sql """ + CREATE AGGREGATE FUNCTION py_collect_list_file(VARCHAR) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.CollectListUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_collect_all """ SELECT py_collect_list_file(tag) as tags FROM stats_test_file; """ + qt_collect_group """ SELECT category, py_collect_list_file(tag) as tags + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 5: Range (Max - Min) UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_value_range(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_value_range(INT) + RETURNS INT + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.RangeUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_range_all """ SELECT py_value_range(score) as score_range FROM stats_test_file; """ + qt_range_group """ SELECT category, py_value_range(score) as score_range + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 6: Geometric Mean UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_geomean_file(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_geomean_file(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.GeometricMeanUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_geomean_all """ SELECT py_geomean_file(value) as geomean FROM stats_test_file; """ + qt_geomean_group """ SELECT category, py_geomean_file(value) as geomean + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 7: Weighted Average UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_weighted_avg_file(DOUBLE, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_weighted_avg_file(DOUBLE, INT) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.WeightedAvgUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_weighted_avg_all """ SELECT py_weighted_avg_file(value, score) as weighted_avg FROM stats_test_file; """ + qt_weighted_avg_group """ SELECT category, py_weighted_avg_file(value, score) as weighted_avg + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 8: Multiple aggregations in one query + // ======================================== + qt_multi_agg """ + SELECT + category, + py_variance_file(value) as variance, + py_stddev_file(value) as stddev, + py_median_file(value) as median, + py_value_range(score) as score_range, + py_geomean_file(value) as geomean, + py_weighted_avg_file(value, score) as weighted_avg + FROM stats_test_file + GROUP BY category + ORDER BY category; + """ + + // ======================================== + // Test 9: Window Function Support + // ======================================== + qt_window_partition """ + SELECT + id, + category, + value, + py_variance_file(value) OVER(PARTITION BY category) as variance_by_category, + py_median_file(value) OVER(PARTITION BY category) as median_by_category + FROM stats_test_file + 
ORDER BY category, id; + """ + + qt_window_order """ + SELECT + id, + category, + value, + py_variance_file(value) OVER(PARTITION BY category ORDER BY id) as running_variance, + py_median_file(value) OVER(PARTITION BY category ORDER BY id) as running_median + FROM stats_test_file + ORDER BY category, id; + """ + + // ======================================== + // Test 10: Comparison with Native Functions + // ======================================== + qt_compare_native """ + SELECT + category, + py_stddev_file(value) as py_stddev, + stddev(value) as native_stddev, + py_variance_file(value) as py_variance, + variance(value) as native_variance + FROM stats_test_file + GROUP BY category + ORDER BY category; + """ + + // ======================================== + // Test 11: NULL Handling + // ======================================== + sql """ DROP TABLE IF EXISTS stats_nulls_file; """ + sql """ + CREATE TABLE stats_nulls_file ( + id INT, + value DOUBLE, + score INT, + tag VARCHAR(50) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO stats_nulls_file VALUES + (1, 10.0, 80, 'alpha'), + (2, NULL, 90, 'beta'), + (3, 20.0, NULL, 'alpha'), + (4, 30.0, 85, NULL), + (5, NULL, NULL, NULL); + """ + + qt_null_handling """ + SELECT + py_variance_file(value) as variance, + py_median_file(value) as median, + py_value_range(score) as range_val, + py_collect_list_file(tag) as tags + FROM stats_nulls_file; + """ + + // ======================================== + // Test 12: Global Functions + // ======================================== + sql """ DROP GLOBAL FUNCTION IF EXISTS py_variance_global(DOUBLE); """ + sql """ + CREATE GLOBAL AGGREGATE FUNCTION py_variance_global(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "stats_udaf.VarianceUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_global_variance """ SELECT py_variance_global(value) as variance FROM stats_test_file; """ + qt_global_variance_group """ SELECT category, py_variance_global(value) as variance + FROM stats_test_file GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 13: Edge Cases + // ======================================== + + // Empty result set + qt_empty """ SELECT py_variance_file(value) as variance FROM stats_test_file WHERE 1=0; """ + + // Single value + qt_single """ SELECT py_variance_file(value) as variance FROM stats_test_file WHERE id = 1; """ + + // Two values + qt_two """ SELECT py_median_file(value) as median FROM stats_test_file WHERE id IN (1, 2); """ + + } finally { + try_sql("DROP GLOBAL FUNCTION IF EXISTS py_variance_global(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_variance_file(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_stddev_file(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_median_file(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_collect_list_file(VARCHAR);") + try_sql("DROP FUNCTION IF EXISTS py_value_range(INT);") + try_sql("DROP FUNCTION IF EXISTS py_geomean_file(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_weighted_avg_file(DOUBLE, INT);") + try_sql("DROP TABLE IF EXISTS stats_test_file;") + try_sql("DROP TABLE IF EXISTS stats_nulls_file;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_state_objects_inline.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_state_objects_inline.groovy new file 
mode 100644 index 00000000000000..51e2b904930abb --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_state_objects_inline.groovy @@ -0,0 +1,678 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_complex_state_objects_inline") { + // Comprehensive test for complex Python objects as aggregate states + // Tests various pickle-serializable data structures: + // 1. Nested dictionaries + // 2. Custom classes (dataclass) + // 3. Lists of tuples + // 4. Sets and frozensets + // 5. Named tuples + // 6. Mixed complex structures + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Setup: Create test tables + // ======================================== + + // Table 1: Transaction data for complex aggregations + sql """ DROP TABLE IF EXISTS complex_transactions; """ + sql """ + CREATE TABLE complex_transactions ( + transaction_id INT, + user_id INT, + product_id INT, + product_name VARCHAR(100), + category VARCHAR(50), + price DECIMAL(10,2), + quantity INT, + timestamp DATETIME, + region VARCHAR(50), + payment_method VARCHAR(50) + ) ENGINE=OLAP + DUPLICATE KEY(transaction_id) + DISTRIBUTED BY HASH(transaction_id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO complex_transactions VALUES + (1, 101, 1001, 'Laptop Pro', 'Electronics', 1299.99, 1, '2024-01-01 10:00:00', 'North', 'Credit'), + (2, 101, 1002, 'Mouse', 'Electronics', 29.99, 2, '2024-01-01 10:05:00', 'North', 'Credit'), + (3, 102, 1003, 'Keyboard', 'Electronics', 79.99, 1, '2024-01-02 11:00:00', 'South', 'Debit'), + (4, 101, 1004, 'Monitor', 'Electronics', 399.99, 1, '2024-01-03 09:30:00', 'North', 'Credit'), + (5, 103, 1001, 'Laptop Pro', 'Electronics', 1299.99, 1, '2024-01-03 14:00:00', 'East', 'PayPal'), + (6, 102, 1005, 'USB Cable', 'Accessories', 9.99, 3, '2024-01-04 10:00:00', 'South', 'Cash'), + (7, 104, 1002, 'Mouse', 'Electronics', 29.99, 1, '2024-01-04 15:00:00', 'West', 'Credit'), + (8, 103, 1006, 'Webcam', 'Electronics', 89.99, 1, '2024-01-05 11:00:00', 'East', 'PayPal'), + (9, 105, 1003, 'Keyboard', 'Electronics', 79.99, 2, '2024-01-05 16:00:00', 'North', 'Debit'), + (10, 104, 1007, 'HDMI Cable', 'Accessories', 15.99, 2, '2024-01-06 10:00:00', 'West', 'Cash'), + (11, 101, 1008, 'Headphones', 'Electronics', 149.99, 1, '2024-01-06 14:00:00', 'North', 'Credit'), + (12, 106, 1004, 'Monitor', 'Electronics', 399.99, 2, '2024-01-07 09:00:00', 'South', 'Credit'), + (13, 102, 1009, 'Desk Lamp', 'Home', 45.99, 1, '2024-01-07 15:00:00', 'South', 'Debit'), + (14, 107, 1010, 'Office Chair', 'Furniture', 299.99, 1, '2024-01-08 10:00:00', 'East', 'Credit'), + (15, 103, 1002, 'Mouse', 'Electronics', 29.99, 3, '2024-01-08 11:00:00', 'East', 'PayPal'); + """ + + // 
======================================== + // UDAF 1: Nested Dictionary State - User Purchase Profile + // Tracks: {user_id: {'total_spent': float, 'items': [product_names], 'categories': set}} + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_user_profile(INT, VARCHAR, VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_user_profile(INT, VARCHAR, VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "UserProfileUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class UserProfileUDAF: + def __init__(self): + # Complex nested structure: dict of dicts with lists and sets + self.profiles = {} + + @property + def aggregate_state(self): + # Convert sets to lists for pickle serialization + serializable = {} + for user_id, profile in self.profiles.items(): + serializable[user_id] = { + 'total_spent': profile['total_spent'], + 'items': profile['items'], + 'categories': list(profile['categories']) + } + return serializable + + def accumulate(self, user_id, product_name, category, price, quantity): + if user_id is None: + return + + if user_id not in self.profiles: + self.profiles[user_id] = { + 'total_spent': 0.0, + 'items': [], + 'categories': set() + } + + revenue = float(price) * int(quantity) if price and quantity else 0.0 + self.profiles[user_id]['total_spent'] += revenue + if product_name: + self.profiles[user_id]['items'].append(product_name) + if category: + self.profiles[user_id]['categories'].add(category) + + def merge(self, other_state): + for user_id, profile in other_state.items(): + if user_id not in self.profiles: + self.profiles[user_id] = { + 'total_spent': 0.0, + 'items': [], + 'categories': set() + } + + self.profiles[user_id]['total_spent'] += profile['total_spent'] + self.profiles[user_id]['items'].extend(profile['items']) + self.profiles[user_id]['categories'].update(profile['categories']) + + def finish(self): + # Return summary as JSON string + import json + result = {} + for user_id, profile in self.profiles.items(): + result[str(user_id)] = { + 'total_spent': round(profile['total_spent'], 2), + 'item_count': len(profile['items']), + 'unique_categories': len(profile['categories']) + } + return json.dumps(result, sort_keys=True) +\$\$; + """ + + // ======================================== + // UDAF 2: Custom Class State - Product Statistics + // Uses a custom StatisticsTracker class + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_product_stats(VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_product_stats(VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "ProductStatsUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +from dataclasses import dataclass +from typing import List +import json + +@dataclass +class ProductStats: + product_name: str + prices: List[float] + quantities: List[int] + + def total_revenue(self): + return sum(p * q for p, q in zip(self.prices, self.quantities)) + + def avg_price(self): + return sum(self.prices) / len(self.prices) if self.prices else 0.0 + + def total_quantity(self): + return sum(self.quantities) + +class ProductStatsUDAF: + def __init__(self): + self.stats = {} # product_name -> ProductStats + + @property + def aggregate_state(self): + # Convert dataclass instances to dicts for serialization + return { + name: { + 'product_name': stat.product_name, + 'prices': stat.prices, + 'quantities': stat.quantities + } + for name, stat 
in self.stats.items() + } + + def accumulate(self, product_name, price, quantity): + if product_name is None: + return + + if product_name not in self.stats: + self.stats[product_name] = ProductStats( + product_name=product_name, + prices=[], + quantities=[] + ) + + if price is not None: + self.stats[product_name].prices.append(float(price)) + if quantity is not None: + self.stats[product_name].quantities.append(int(quantity)) + + def merge(self, other_state): + for name, stat_dict in other_state.items(): + if name not in self.stats: + self.stats[name] = ProductStats( + product_name=stat_dict['product_name'], + prices=stat_dict['prices'][:], + quantities=stat_dict['quantities'][:] + ) + else: + self.stats[name].prices.extend(stat_dict['prices']) + self.stats[name].quantities.extend(stat_dict['quantities']) + + def finish(self): + result = {} + for name, stat in self.stats.items(): + result[name] = { + 'avg_price': round(stat.avg_price(), 2), + 'total_quantity': stat.total_quantity(), + 'total_revenue': round(stat.total_revenue(), 2), + 'transactions': len(stat.prices) + } + return json.dumps(result, sort_keys=True) +\$\$; + """ + + // ======================================== + // UDAF 3: List of Tuples State - Transaction Timeline + // Stores chronological list of (timestamp, amount) tuples + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_transaction_timeline(DATETIME, DECIMAL); """ + sql """ + CREATE AGGREGATE FUNCTION py_transaction_timeline(DATETIME, DECIMAL) + RETURNS VARCHAR + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "TransactionTimelineUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import json +from datetime import datetime + +class TransactionTimelineUDAF: + def __init__(self): + # List of tuples: [(timestamp_str, amount), ...] 
+ self.timeline = [] + + @property + def aggregate_state(self): + return self.timeline + + def accumulate(self, timestamp, amount): + if timestamp is not None and amount is not None: + # Convert datetime to string for serialization + ts_str = str(timestamp) + self.timeline.append((ts_str, float(amount))) + + def merge(self, other_state): + self.timeline.extend(other_state) + + def finish(self): + # Sort by timestamp and return summary + sorted_timeline = sorted(self.timeline, key=lambda x: x[0]) + + if not sorted_timeline: + return json.dumps({'count': 0}) + + total = sum(amount for _, amount in sorted_timeline) + + result = { + 'count': len(sorted_timeline), + 'total': round(total, 2), + 'first_transaction': sorted_timeline[0][0], + 'last_transaction': sorted_timeline[-1][0], + 'first_amount': round(sorted_timeline[0][1], 2), + 'last_amount': round(sorted_timeline[-1][1], 2) + } + return json.dumps(result) +\$\$; + """ + + // ======================================== + // UDAF 4: Set-based State - Unique Value Tracker + // Tracks unique users, products, and payment methods + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_unique_tracker(INT, INT, VARCHAR); """ + sql """ + CREATE AGGREGATE FUNCTION py_unique_tracker(INT, INT, VARCHAR) + RETURNS VARCHAR + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "UniqueTrackerUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import json + +class UniqueTrackerUDAF: + def __init__(self): + # Use sets to track unique values + self.unique_users = set() + self.unique_products = set() + self.payment_methods = set() + + @property + def aggregate_state(self): + # Convert sets to lists for pickle + return { + 'users': list(self.unique_users), + 'products': list(self.unique_products), + 'payments': list(self.payment_methods) + } + + def accumulate(self, user_id, product_id, payment_method): + if user_id is not None: + self.unique_users.add(user_id) + if product_id is not None: + self.unique_products.add(product_id) + if payment_method is not None: + self.payment_methods.add(payment_method) + + def merge(self, other_state): + self.unique_users.update(other_state['users']) + self.unique_products.update(other_state['products']) + self.payment_methods.update(other_state['payments']) + + def finish(self): + return json.dumps({ + 'unique_users': len(self.unique_users), + 'unique_products': len(self.unique_products), + 'payment_methods': sorted(list(self.payment_methods)) + }) +\$\$; + """ + + // ======================================== + // UDAF 5: Named Tuple State - Category Summary + // Uses collections.namedtuple for structured data + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_category_summary(VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_category_summary(VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CategorySummaryUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +from collections import namedtuple +import json + +CategoryData = namedtuple('CategoryData', ['total_revenue', 'total_items', 'transaction_count']) + +class CategorySummaryUDAF: + def __init__(self): + # Dict of category -> namedtuple + self.categories = {} + + @property + def aggregate_state(self): + # Convert namedtuples to tuples for pickle + return { + cat: (data.total_revenue, data.total_items, data.transaction_count) + for cat, data in self.categories.items() + } + + def accumulate(self, category, price, quantity): + if 
category is None: + return + + revenue = float(price) * int(quantity) if price and quantity else 0.0 + items = int(quantity) if quantity else 0 + + if category in self.categories: + old = self.categories[category] + self.categories[category] = CategoryData( + total_revenue=old.total_revenue + revenue, + total_items=old.total_items + items, + transaction_count=old.transaction_count + 1 + ) + else: + self.categories[category] = CategoryData( + total_revenue=revenue, + total_items=items, + transaction_count=1 + ) + + def merge(self, other_state): + for cat, (revenue, items, count) in other_state.items(): + if cat in self.categories: + old = self.categories[cat] + self.categories[cat] = CategoryData( + total_revenue=old.total_revenue + revenue, + total_items=old.total_items + items, + transaction_count=old.transaction_count + count + ) + else: + self.categories[cat] = CategoryData( + total_revenue=revenue, + total_items=items, + transaction_count=count + ) + + def finish(self): + result = {} + for cat, data in self.categories.items(): + result[cat] = { + 'total_revenue': round(data.total_revenue, 2), + 'total_items': data.total_items, + 'transactions': data.transaction_count, + 'avg_per_transaction': round(data.total_revenue / data.transaction_count, 2) if data.transaction_count > 0 else 0.0 + } + return json.dumps(result, sort_keys=True) +\$\$; + """ + + // ======================================== + // UDAF 6: Complex Nested State - Hierarchical Aggregation + // Multi-level nested structure: region -> category -> product -> stats + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_hierarchical_agg(VARCHAR, VARCHAR, VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_hierarchical_agg(VARCHAR, VARCHAR, VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "HierarchicalAggUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import json +from collections import defaultdict + +class HierarchicalAggUDAF: + def __init__(self): + # Complex nested dict: {region: {category: {product: {'revenue': float, 'quantity': int}}}} + self.hierarchy = {} + + @property + def aggregate_state(self): + return self.hierarchy + + def accumulate(self, region, category, product, price, quantity): + if not all([region, category, product]): + return + + if region not in self.hierarchy: + self.hierarchy[region] = {} + if category not in self.hierarchy[region]: + self.hierarchy[region][category] = {} + if product not in self.hierarchy[region][category]: + self.hierarchy[region][category][product] = {'revenue': 0.0, 'quantity': 0} + + revenue = float(price) * int(quantity) if price and quantity else 0.0 + qty = int(quantity) if quantity else 0 + + self.hierarchy[region][category][product]['revenue'] += revenue + self.hierarchy[region][category][product]['quantity'] += qty + + def merge(self, other_state): + for region, categories in other_state.items(): + if region not in self.hierarchy: + self.hierarchy[region] = {} + + for category, products in categories.items(): + if category not in self.hierarchy[region]: + self.hierarchy[region][category] = {} + + for product, stats in products.items(): + if product not in self.hierarchy[region][category]: + self.hierarchy[region][category][product] = {'revenue': 0.0, 'quantity': 0} + + self.hierarchy[region][category][product]['revenue'] += stats['revenue'] + self.hierarchy[region][category][product]['quantity'] += stats['quantity'] + + def finish(self): + # Summarize hierarchy at each 
level + result = {} + for region, categories in self.hierarchy.items(): + region_total = 0.0 + region_data = {} + + for category, products in categories.items(): + category_total = sum(p['revenue'] for p in products.values()) + region_total += category_total + region_data[category] = { + 'revenue': round(category_total, 2), + 'products': len(products) + } + + result[region] = { + 'total_revenue': round(region_total, 2), + 'categories': region_data + } + + return json.dumps(result, sort_keys=True) +\$\$; + """ + + // ======================================== + // Test Cases + // ======================================== + + // Test 1: User Profile Aggregation (Nested Dict) + qt_test_user_profile """ + SELECT + py_user_profile(user_id, product_name, category, price, quantity) as user_profiles + FROM complex_transactions; + """ + + // Test 2: Product Statistics (Custom Class) + qt_test_product_stats """ + SELECT + py_product_stats(product_name, price, quantity) as product_statistics + FROM complex_transactions; + """ + + // Test 3: Transaction Timeline (List of Tuples) + qt_test_transaction_timeline """ + SELECT + region, + py_transaction_timeline(timestamp, price * quantity) as timeline + FROM complex_transactions + GROUP BY region + ORDER BY region; + """ + + // Test 4: Unique Tracker (Sets) + qt_test_unique_tracker """ + SELECT + category, + py_unique_tracker(user_id, product_id, payment_method) as unique_stats + FROM complex_transactions + GROUP BY category + ORDER BY category; + """ + + // Test 5: Category Summary (Named Tuples) + qt_test_category_summary """ + SELECT + py_category_summary(category, price, quantity) as category_summary + FROM complex_transactions; + """ + + // Test 6: Hierarchical Aggregation (Deep Nesting) + qt_test_hierarchical_agg """ + SELECT + py_hierarchical_agg(region, category, product_name, price, quantity) as hierarchy + FROM complex_transactions; + """ + + // Test 7: Complex State with Window Function + qt_test_complex_window """ + SELECT + user_id, + product_name, + price, + py_user_profile(user_id, product_name, category, price, quantity) + OVER (PARTITION BY user_id ORDER BY transaction_id) as running_profile + FROM complex_transactions + ORDER BY user_id, transaction_id; + """ + + // Test 8: Multiple Complex UDAFs in Single Query + qt_test_multi_complex """ + SELECT + region, + py_unique_tracker(user_id, product_id, payment_method) as uniques, + py_category_summary(category, price, quantity) as summary + FROM complex_transactions + GROUP BY region + ORDER BY region; + """ + + // Test 9: Nested Query with Complex State + qt_test_nested_complex """ + SELECT + region, + product_stats + FROM ( + SELECT + region, + py_product_stats(product_name, price, quantity) as product_stats + FROM complex_transactions + WHERE price > 50 + GROUP BY region + ) t + ORDER BY region; + """ + + // Test 10: Complex State Serialization in Shuffle (GROUP BY multiple columns) + qt_test_complex_shuffle """ + SELECT + region, + category, + py_hierarchical_agg(region, category, product_name, price, quantity) as hier_stats + FROM complex_transactions + GROUP BY region, category + ORDER BY region, category; + """ + + // Test 11: Edge Case - Empty Groups + qt_test_empty_groups """ + SELECT + region, + py_user_profile(user_id, product_name, category, price, quantity) as profile + FROM complex_transactions + WHERE 1 = 0 + GROUP BY region; + """ + + // Test 12: Edge Case - NULL Values + sql """ DROP TABLE IF EXISTS complex_nulls; """ + sql """ + CREATE TABLE complex_nulls ( + id INT, + user_id 
INT, + product VARCHAR(50), + category VARCHAR(50), + price DECIMAL(10,2) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO complex_nulls VALUES + (1, 101, 'ItemA', 'Cat1', 100.0), + (2, NULL, 'ItemB', 'Cat2', 200.0), + (3, 102, NULL, 'Cat3', 300.0), + (4, 103, 'ItemC', NULL, 400.0), + (5, 104, 'ItemD', 'Cat4', NULL); + """ + + qt_test_null_handling """ + SELECT + py_user_profile(user_id, product, category, price, 1) as profile_with_nulls + FROM complex_nulls; + """ + + // Test 13: Performance - Large Complex State + qt_test_large_state """ + SELECT + COUNT(*) as total_transactions, + py_hierarchical_agg(region, category, product_name, price, quantity) as full_hierarchy, + py_user_profile(user_id, product_name, category, price, quantity) as all_profiles + FROM complex_transactions; + """ + + } finally { + // Cleanup with try_sql so a failed drop cannot mask an earlier test error, + // matching the other suites in this directory. + try_sql("DROP FUNCTION IF EXISTS py_user_profile(INT, VARCHAR, VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_product_stats(VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_transaction_timeline(DATETIME, DECIMAL);") + try_sql("DROP FUNCTION IF EXISTS py_unique_tracker(INT, INT, VARCHAR);") + try_sql("DROP FUNCTION IF EXISTS py_category_summary(VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_hierarchical_agg(VARCHAR, VARCHAR, VARCHAR, DECIMAL, INT);") + try_sql("DROP TABLE IF EXISTS complex_transactions;") + try_sql("DROP TABLE IF EXISTS complex_nulls;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_state_objects_module.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_state_objects_module.groovy new file mode 100644 index 00000000000000..0a454dd6e2ab7c --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_complex_state_objects_module.groovy @@ -0,0 +1,373 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
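The inline UDAFs above all flatten their state to builtin containers (sets to lists, dataclasses to dicts, namedtuples to tuples) inside aggregate_state. One plausible reason, stated here as an assumption about the embedded interpreter rather than a fact from this diff: pickle stores instances of custom classes by module-qualified reference, and a class defined inside a dynamically executed inline snippet is generally not importable by whichever worker unpickles the state, while builtin containers always round-trip. A small illustration:

import pickle

# A class defined by exec'ing UDF source text, roughly as an engine might.
src = """
class Holder:
    def __init__(self):
        self.items = set()
"""
ns = {}
exec(src, ns)
holder = ns["Holder"]()
holder.items.add("alpha")

try:
    pickle.dumps(holder)  # no other process could import Holder anyway
except Exception as exc:
    print("custom-class state fails to pickle:", exc)

# Flattened to builtins, the same information round-trips anywhere.
state = {"items": sorted(holder.items)}
assert pickle.loads(pickle.dumps(state)) == {"items": ["alpha"]}

The module-based variant below loads the same UDAFs from pyudaf.zip, where the classes do live in an importable module, but the suites keep the flattening convention so inline and module deployments behave identically.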
+ +suite("test_pythonudaf_complex_state_objects_module") { + // Comprehensive test for complex Python objects as aggregate states using MODULE mode + // All UDAFs are loaded from pyudaf.zip (complex_state_udaf.py module) + // Tests various pickle-serializable data structures + + def pyPath = """${context.file.parent}/udaf_scripts/pyudaf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Setup: Create test tables + // ======================================== + + // Table 1: Transaction data for complex aggregations + sql """ DROP TABLE IF EXISTS complex_transactions_mod; """ + sql """ + CREATE TABLE complex_transactions_mod ( + transaction_id INT, + user_id INT, + product_id INT, + product_name VARCHAR(100), + category VARCHAR(50), + price DECIMAL(10,2), + quantity INT, + timestamp DATETIME, + region VARCHAR(50), + payment_method VARCHAR(50) + ) ENGINE=OLAP + DUPLICATE KEY(transaction_id) + DISTRIBUTED BY HASH(transaction_id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO complex_transactions_mod VALUES + (1, 101, 1001, 'Laptop Pro', 'Electronics', 1299.99, 1, '2024-01-01 10:00:00', 'North', 'Credit'), + (2, 101, 1002, 'Mouse', 'Electronics', 29.99, 2, '2024-01-01 10:05:00', 'North', 'Credit'), + (3, 102, 1003, 'Keyboard', 'Electronics', 79.99, 1, '2024-01-02 11:00:00', 'South', 'Debit'), + (4, 101, 1004, 'Monitor', 'Electronics', 399.99, 1, '2024-01-03 09:30:00', 'North', 'Credit'), + (5, 103, 1001, 'Laptop Pro', 'Electronics', 1299.99, 1, '2024-01-03 14:00:00', 'East', 'PayPal'), + (6, 102, 1005, 'USB Cable', 'Accessories', 9.99, 3, '2024-01-04 10:00:00', 'South', 'Cash'), + (7, 104, 1002, 'Mouse', 'Electronics', 29.99, 1, '2024-01-04 15:00:00', 'West', 'Credit'), + (8, 103, 1006, 'Webcam', 'Electronics', 89.99, 1, '2024-01-05 11:00:00', 'East', 'PayPal'), + (9, 105, 1003, 'Keyboard', 'Electronics', 79.99, 2, '2024-01-05 16:00:00', 'North', 'Debit'), + (10, 104, 1007, 'HDMI Cable', 'Accessories', 15.99, 2, '2024-01-06 10:00:00', 'West', 'Cash'), + (11, 101, 1008, 'Headphones', 'Electronics', 149.99, 1, '2024-01-06 14:00:00', 'North', 'Credit'), + (12, 106, 1004, 'Monitor', 'Electronics', 399.99, 2, '2024-01-07 09:00:00', 'South', 'Credit'), + (13, 102, 1009, 'Desk Lamp', 'Home', 45.99, 1, '2024-01-07 15:00:00', 'South', 'Debit'), + (14, 107, 1010, 'Office Chair', 'Furniture', 299.99, 1, '2024-01-08 10:00:00', 'East', 'Credit'), + (15, 103, 1002, 'Mouse', 'Electronics', 29.99, 3, '2024-01-08 11:00:00', 'East', 'PayPal'); + """ + + // ======================================== + // UDAF 1: Nested Dictionary State - User Purchase Profile + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_user_profile_mod(INT, VARCHAR, VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_user_profile_mod(INT, VARCHAR, VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.UserProfileUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 2: Custom Class State - Product Statistics + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_product_stats_mod(VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_product_stats_mod(VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( 
+ "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.ProductStatsUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 3: List of Tuples State - Transaction Timeline + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_transaction_timeline_mod(DATETIME, DECIMAL); """ + sql """ + CREATE AGGREGATE FUNCTION py_transaction_timeline_mod(DATETIME, DECIMAL) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.TransactionTimelineUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 4: Set-based State - Unique Value Tracker + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_unique_tracker_mod(INT, INT, VARCHAR); """ + sql """ + CREATE AGGREGATE FUNCTION py_unique_tracker_mod(INT, INT, VARCHAR) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.UniqueTrackerUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 5: Named Tuple State - Category Summary + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_category_summary_mod(VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_category_summary_mod(VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.CategorySummaryUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 6: Complex Nested State - Hierarchical Aggregation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_hierarchical_agg_mod(VARCHAR, VARCHAR, VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_hierarchical_agg_mod(VARCHAR, VARCHAR, VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.HierarchicalAggUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // Test Cases + // ======================================== + + // Test 1: User Profile Aggregation (Nested Dict) + qt_test_user_profile """ + SELECT + py_user_profile_mod(user_id, product_name, category, price, quantity) as user_profiles + FROM complex_transactions_mod; + """ + + // Test 2: Product Statistics (Custom Class) + qt_test_product_stats """ + SELECT + py_product_stats_mod(product_name, price, quantity) as product_statistics + FROM complex_transactions_mod; + """ + + // Test 3: Transaction Timeline (List of Tuples) + qt_test_transaction_timeline """ + SELECT + region, + py_transaction_timeline_mod(timestamp, price * quantity) as timeline + FROM complex_transactions_mod + GROUP BY region + ORDER BY region; + """ + + // Test 4: Unique Tracker (Sets) + qt_test_unique_tracker """ + SELECT + category, + py_unique_tracker_mod(user_id, product_id, payment_method) as unique_stats + FROM complex_transactions_mod + GROUP BY category + ORDER BY category; + """ + + // Test 5: Category Summary (Named Tuples) + qt_test_category_summary """ + SELECT + 
py_category_summary_mod(category, price, quantity) as category_summary + FROM complex_transactions_mod; + """ + + // Test 6: Hierarchical Aggregation (Deep Nesting) + qt_test_hierarchical_agg """ + SELECT + py_hierarchical_agg_mod(region, category, product_name, price, quantity) as hierarchy + FROM complex_transactions_mod; + """ + + // Test 7: Complex State with Window Function + qt_test_complex_window """ + SELECT + user_id, + product_name, + price, + py_user_profile_mod(user_id, product_name, category, price, quantity) + OVER (PARTITION BY user_id ORDER BY transaction_id) as running_profile + FROM complex_transactions_mod + ORDER BY user_id, transaction_id; + """ + + // Test 8: Multiple Complex UDAFs in Single Query + qt_test_multi_complex """ + SELECT + region, + py_unique_tracker_mod(user_id, product_id, payment_method) as uniques, + py_category_summary_mod(category, price, quantity) as summary + FROM complex_transactions_mod + GROUP BY region + ORDER BY region; + """ + + // Test 9: Nested Query with Complex State + qt_test_nested_complex """ + SELECT + region, + product_stats + FROM ( + SELECT + region, + py_product_stats_mod(product_name, price, quantity) as product_stats + FROM complex_transactions_mod + WHERE price > 50 + GROUP BY region + ) t + ORDER BY region; + """ + + // Test 10: Complex State Serialization in Shuffle (GROUP BY multiple columns) + qt_test_complex_shuffle """ + SELECT + region, + category, + py_hierarchical_agg_mod(region, category, product_name, price, quantity) as hier_stats + FROM complex_transactions_mod + GROUP BY region, category + ORDER BY region, category; + """ + + // Test 11: Edge Case - Empty Groups + qt_test_empty_groups """ + SELECT + region, + py_user_profile_mod(user_id, product_name, category, price, quantity) as profile + FROM complex_transactions_mod + WHERE 1 = 0 + GROUP BY region; + """ + + // Test 12: Edge Case - NULL Values + sql """ DROP TABLE IF EXISTS complex_nulls_mod; """ + sql """ + CREATE TABLE complex_nulls_mod ( + id INT, + user_id INT, + product VARCHAR(50), + category VARCHAR(50), + price DECIMAL(10,2) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO complex_nulls_mod VALUES + (1, 101, 'ItemA', 'Cat1', 100.0), + (2, NULL, 'ItemB', 'Cat2', 200.0), + (3, 102, NULL, 'Cat3', 300.0), + (4, 103, 'ItemC', NULL, 400.0), + (5, 104, 'ItemD', 'Cat4', NULL); + """ + + qt_test_null_handling """ + SELECT + py_user_profile_mod(user_id, product, category, price, 1) as profile_with_nulls + FROM complex_nulls_mod; + """ + + // Test 13: Performance - Large Complex State + qt_test_large_state """ + SELECT + COUNT(*) as total_transactions, + py_hierarchical_agg_mod(region, category, product_name, price, quantity) as full_hierarchy, + py_user_profile_mod(user_id, product_name, category, price, quantity) as all_profiles + FROM complex_transactions_mod; + """ + + // Test 14: Module Reusability - Create another function from same module + sql """ DROP FUNCTION IF EXISTS py_user_profile_mod2(INT, VARCHAR, VARCHAR, DECIMAL, INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_user_profile_mod2(INT, VARCHAR, VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.UserProfileUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_module_reuse """ + SELECT + py_user_profile_mod(user_id, product_name, category, price, quantity) as profile1, 
+ py_user_profile_mod2(user_id, product_name, category, price, quantity) as profile2 + FROM complex_transactions_mod; + """ + + // Test 15: Global Functions + sql """ DROP GLOBAL FUNCTION IF EXISTS py_user_profile_global(INT, VARCHAR, VARCHAR, DECIMAL, INT); """ + sql """ + CREATE GLOBAL AGGREGATE FUNCTION py_user_profile_global(INT, VARCHAR, VARCHAR, DECIMAL, INT) + RETURNS VARCHAR + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "complex_state_udaf.UserProfileUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_global_test """ + SELECT + py_user_profile_global(user_id, product_name, category, price, quantity) as user_profiles + FROM complex_transactions_mod; + """ + + } finally { + // Cleanup + try_sql("DROP GLOBAL FUNCTION IF EXISTS py_user_profile_global(INT, VARCHAR, VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_user_profile_mod(INT, VARCHAR, VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_product_stats_mod(VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_transaction_timeline_mod(DATETIME, DECIMAL);") + try_sql("DROP FUNCTION IF EXISTS py_unique_tracker_mod(INT, INT, VARCHAR);") + try_sql("DROP FUNCTION IF EXISTS py_category_summary_mod(VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_hierarchical_agg_mod(VARCHAR, VARCHAR, VARCHAR, DECIMAL, INT);") + try_sql("DROP FUNCTION IF EXISTS py_user_profile_mod2(INT, VARCHAR, VARCHAR, DECIMAL, INT);") + try_sql("DROP TABLE IF EXISTS complex_transactions_mod;") + try_sql("DROP TABLE IF EXISTS complex_nulls_mod;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_concurrent.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_concurrent.groovy new file mode 100644 index 00000000000000..944bd7479c38f0 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_concurrent.groovy @@ -0,0 +1,400 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
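The concurrent suite that follows asserts that several Python UDAFs in one query never bleed state into each other. The invariant it depends on can be sketched as one UDAF instance per (aggregate expression, group key); the dictionary keying below is an illustrative model of that bookkeeping, not the BE's actual implementation.

class SumUDAF:
    def __init__(self):
        self.total = 0

    def accumulate(self, value):
        if value is not None:
            self.total += value

    def finish(self):
        return self.total

class CountUDAF:
    def __init__(self):
        self.count = 0

    def accumulate(self, value):
        if value is not None:
            self.count += 1

    def finish(self):
        return self.count

def group_by(rows, key, aggs):
    # One dedicated instance per (aggregate index, group key): exactly the
    # isolation property the concurrent suite asserts on.
    states = {}
    for row in rows:
        group = row[key]
        for idx, (udaf_cls, column) in enumerate(aggs):
            inst = states.get((idx, group))
            if inst is None:
                inst = states[(idx, group)] = udaf_cls()
            inst.accumulate(row[column])
    groups = sorted({g for (_, g) in states})
    return {g: [states[(idx, g)].finish() for idx in range(len(aggs))]
            for g in groups}

rows = [{"cat": "A", "v": 10}, {"cat": "A", "v": 20}, {"cat": "B", "v": 30}]
print(group_by(rows, "cat", [(SumUDAF, "v"), (CountUDAF, "v")]))
# {'A': [30, 2], 'B': [30, 1]}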
+ +suite("test_pythonudaf_concurrent") { + // Test multiple Python UDAFs executing concurrently in the same SQL query + // This is the key test case to verify the fix for multi-UDAF state management + + try { + // Create test table + sql """ DROP TABLE IF EXISTS concurrent_udaf_test; """ + sql """ + CREATE TABLE concurrent_udaf_test ( + id INT, + category STRING, + value INT, + price DOUBLE, + quantity INT, + amount DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO concurrent_udaf_test VALUES + (1, 'A', 10, 1.5, 5, 7.5), + (2, 'A', 20, 2.5, 3, 7.5), + (3, 'B', 30, 3.5, 4, 14.0), + (4, 'B', 40, 4.5, 2, 9.0), + (5, 'C', 50, 5.5, 6, 33.0), + (6, 'C', 60, 6.5, 1, 6.5), + (7, 'A', 70, 7.5, 8, 60.0), + (8, 'B', 80, 8.5, 7, 59.5), + (9, 'C', 90, 9.5, 9, 85.5), + (10, 'A', 100, 10.5, 10, 105.0); + """ + + // UDAF 1: Sum aggregation + sql """ DROP FUNCTION IF EXISTS inline_udaf_sum(INT); """ + sql """ + CREATE AGGREGATE FUNCTION inline_udaf_sum(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + // UDAF 2: Count aggregation + sql """ DROP FUNCTION IF EXISTS inline_udaf_count(INT); """ + sql """ + CREATE AGGREGATE FUNCTION inline_udaf_count(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CountUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + // UDAF 3: Average aggregation + sql """ DROP FUNCTION IF EXISTS inline_udaf_avg(INT); """ + sql """ + CREATE AGGREGATE FUNCTION inline_udaf_avg(INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "AvgUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class AvgUDAF: + def __init__(self): + self.count = 0 + self.sum = 0 + + @property + def aggregate_state(self): + return (self.count, self.sum) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum += value + + def merge(self, other_state): + other_count, other_sum = other_state + self.count += other_count + self.sum += other_sum + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count +\$\$; + """ + + // UDAF 4: Max aggregation + sql """ DROP FUNCTION IF EXISTS inline_udaf_max(INT); """ + sql """ + CREATE AGGREGATE FUNCTION inline_udaf_max(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MaxUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MaxUDAF: + def __init__(self): + self.max_value = None + + @property + def aggregate_state(self): + return self.max_value + + def accumulate(self, value): + if value is not None: + if self.max_value is None or value > self.max_value: + self.max_value = value + + def merge(self, other_state): + if other_state is not None: + if self.max_value is None or other_state > self.max_value: + self.max_value = other_state + + def finish(self): + return 
self.max_value +\$\$; + """ + + // UDAF 5: Min aggregation + sql """ DROP FUNCTION IF EXISTS inline_udaf_min(INT); """ + sql """ + CREATE AGGREGATE FUNCTION inline_udaf_min(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MinUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MinUDAF: + def __init__(self): + self.min_value = None + + @property + def aggregate_state(self): + return self.min_value + + def accumulate(self, value): + if value is not None: + if self.min_value is None or value < self.min_value: + self.min_value = value + + def merge(self, other_state): + if other_state is not None: + if self.min_value is None or other_state < self.min_value: + self.min_value = other_state + + def finish(self): + return self.min_value +\$\$; + """ + + // UDAF 6: Sum for DOUBLE type + sql """ DROP FUNCTION IF EXISTS inline_udaf_sum_double(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION inline_udaf_sum_double(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumDoubleUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumDoubleUDAF: + def __init__(self): + self.sum = 0.0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + // Test 1: Two different UDAFs in the same query (Critical test case!) + qt_concurrent_two_udaf """ + SELECT + inline_udaf_sum(value) AS total_sum, + inline_udaf_count(value) AS total_count + FROM concurrent_udaf_test; + """ + + // Test 2: Three different UDAFs in the same query + qt_concurrent_three_udaf """ + SELECT + inline_udaf_sum(value) AS total_sum, + inline_udaf_count(value) AS total_count, + inline_udaf_avg(value) AS avg_value + FROM concurrent_udaf_test; + """ + + // Test 3: Multiple UDAFs with GROUP BY (Critical test case!) 
qt_concurrent_udaf_group_by """ + SELECT + category, + inline_udaf_sum(value) AS sum_value, + inline_udaf_count(value) AS count_value, + inline_udaf_avg(value) AS avg_value + FROM concurrent_udaf_test + GROUP BY category + ORDER BY category; + """ + + // Test 4: Five different UDAFs in the same query + qt_concurrent_five_udaf """ + SELECT + inline_udaf_sum(value) AS total_sum, + inline_udaf_count(value) AS total_count, + inline_udaf_avg(value) AS avg_value, + inline_udaf_max(value) AS max_value, + inline_udaf_min(value) AS min_value + FROM concurrent_udaf_test; + """ + + // Test 5: Multiple UDAFs with different types (INT and DOUBLE) + qt_concurrent_mixed_types """ + SELECT + inline_udaf_sum(value) AS sum_int, + inline_udaf_sum_double(price) AS sum_double, + inline_udaf_count(value) AS count_value, + inline_udaf_avg(quantity) AS avg_quantity + FROM concurrent_udaf_test; + """ + + // Test 6: Multiple UDAFs with GROUP BY on different columns + qt_concurrent_complex_group """ + SELECT + category, + inline_udaf_sum(value) AS sum_value, + inline_udaf_count(value) AS count_value, + inline_udaf_avg(value) AS avg_value, + inline_udaf_max(value) AS max_value, + inline_udaf_min(value) AS min_value, + inline_udaf_sum_double(amount) AS sum_amount + FROM concurrent_udaf_test + GROUP BY category + ORDER BY category; + """ + + // Test 7: Same UDAF function called multiple times on different columns + qt_concurrent_same_udaf """ + SELECT + inline_udaf_sum(value) AS sum_value, + inline_udaf_sum(quantity) AS sum_quantity + FROM concurrent_udaf_test; + """ + + // Test 8: UDAFs combined with a WHERE clause filter + qt_concurrent_with_filter """ + SELECT + inline_udaf_sum(value) AS sum_value, + inline_udaf_count(value) AS count_value, + inline_udaf_avg(value) AS avg_value + FROM concurrent_udaf_test + WHERE value > 30; + """ + + // Test 9: Multiple UDAFs with HAVING clause + qt_concurrent_with_having """ + SELECT + category, + inline_udaf_sum(value) AS sum_value, + inline_udaf_count(value) AS count_value, + inline_udaf_avg(value) AS avg_value + FROM concurrent_udaf_test + GROUP BY category + HAVING inline_udaf_sum(value) > 100 + ORDER BY category; + """ + + // Test 10: Stress test - Multiple UDAFs called multiple times + qt_concurrent_stress """ + SELECT + category, + inline_udaf_sum(value) AS sum_value, + inline_udaf_sum(quantity) AS sum_quantity, + inline_udaf_count(value) AS count_value, + inline_udaf_count(quantity) AS count_quantity, + inline_udaf_avg(value) AS avg_value, + inline_udaf_avg(quantity) AS avg_quantity, + inline_udaf_max(value) AS max_value, + inline_udaf_min(value) AS min_value, + inline_udaf_sum_double(price) AS sum_price, + inline_udaf_sum_double(amount) AS sum_amount + FROM concurrent_udaf_test + GROUP BY category + ORDER BY category; + """ + + // Test 11: Verify correctness - Compare with native functions + qt_concurrent_verify_sum """ + SELECT + inline_udaf_sum(value) AS python_sum, + SUM(value) AS native_sum, + inline_udaf_count(value) AS python_count, + COUNT(value) AS native_count + FROM concurrent_udaf_test; + """ + + qt_concurrent_verify_group """ + SELECT + category, + inline_udaf_sum(value) AS python_sum, + SUM(value) AS native_sum, + inline_udaf_count(value) AS python_count, + COUNT(value) AS native_count, + inline_udaf_avg(value) AS python_avg, + AVG(value) AS native_avg + FROM concurrent_udaf_test + GROUP BY category + ORDER BY category; + """ + + } finally { + // Cleanup + try_sql("DROP FUNCTION IF EXISTS inline_udaf_sum(INT);") + try_sql("DROP FUNCTION IF EXISTS
inline_udaf_count(INT);") + try_sql("DROP FUNCTION IF EXISTS inline_udaf_avg(INT);") + try_sql("DROP FUNCTION IF EXISTS inline_udaf_max(INT);") + try_sql("DROP FUNCTION IF EXISTS inline_udaf_min(INT);") + try_sql("DROP FUNCTION IF EXISTS inline_udaf_sum_double(DOUBLE);") + try_sql("DROP TABLE IF EXISTS concurrent_udaf_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_data_types.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_data_types.groovy new file mode 100644 index 00000000000000..e8d52aee7173c2 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_data_types.groovy @@ -0,0 +1,366 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_data_types") { + // Test Python UDAFs with various data types + // Including: TINYINT, SMALLINT, INT, BIGINT, FLOAT, DECIMAL, STRING, BOOLEAN + + def runtime_version = "3.8.10" + + try { + // Create test table with various data types + sql """ DROP TABLE IF EXISTS data_types_test; """ + sql """ + CREATE TABLE data_types_test ( + id INT, + tiny_val TINYINT, + small_val SMALLINT, + int_val INT, + big_val BIGINT, + float_val FLOAT, + double_val DOUBLE, + decimal_val DECIMAL(10, 2), + str_val STRING, + date_val DATE, + bool_val BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO data_types_test VALUES + (1, 10, 100, 1000, 10000, 1.5, 10.55, 100.50, 'apple', '2024-01-01', true), + (2, 20, 200, 2000, 20000, 2.5, 20.55, 200.50, 'banana', '2024-01-02', false), + (3, 30, 300, 3000, 30000, 3.5, 30.55, 300.50, 'cherry', '2024-01-03', true), + (4, 40, 400, 4000, 40000, 4.5, 40.55, 400.50, 'date', '2024-01-04', true), + (5, 50, 500, 5000, 50000, 5.5, 50.55, 500.50, 'elderberry', '2024-01-05', false); + """ + + qt_select_data """ SELECT * FROM data_types_test ORDER BY id; """ + + // ======================================== + // Test 1: TINYINT aggregation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_tinyint(TINYINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_tinyint(TINYINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumTinyIntUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumTinyIntUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_tinyint """ SELECT py_sum_tinyint(tiny_val) as sum_tiny FROM data_types_test; """ + + // ======================================== + // Test 2: SMALLINT
aggregation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_smallint(SMALLINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_smallint(SMALLINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumSmallIntUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumSmallIntUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_smallint """ SELECT py_sum_smallint(small_val) as sum_small FROM data_types_test; """ + + // ======================================== + // Test 3: BIGINT aggregation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_bigint(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_bigint(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumBigIntUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumBigIntUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_bigint """ SELECT py_sum_bigint(big_val) as sum_big FROM data_types_test; """ + + // ======================================== + // Test 4: FLOAT aggregation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_float(FLOAT); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_float(FLOAT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumFloatUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumFloatUDAF: + def __init__(self): + self.sum = 0.0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += float(value) + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_float """ SELECT py_sum_float(float_val) as sum_float FROM data_types_test; """ + + // ======================================== + // Test 5: DECIMAL aggregation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_decimal(DECIMAL); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_decimal(DECIMAL) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumDecimalUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumDecimalUDAF: + def __init__(self): + self.sum = 0.0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += float(value) + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_decimal """ SELECT py_sum_decimal(decimal_val) as sum_decimal FROM data_types_test; """ + + // ======================================== + // Test 6: STRING concatenation + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_concat_str(STRING); """ + sql """ + CREATE AGGREGATE FUNCTION py_concat_str(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "ConcatStrUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class ConcatStrUDAF: + def __init__(self): + self.values = [] + + @property + def 
aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + return ','.join(sorted(self.values)) +\$\$; + """ + + qt_string """ SELECT py_concat_str(str_val) as concat_result FROM data_types_test; """ + + // ======================================== + // Test 7: BOOLEAN count + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_count_true(BOOLEAN); """ + sql """ + CREATE AGGREGATE FUNCTION py_count_true(BOOLEAN) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountTrueUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CountTrueUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is True: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + qt_boolean """ SELECT py_count_true(bool_val) as true_count FROM data_types_test; """ + + // ======================================== + // Test 8: Mixed data types in one query + // ======================================== + qt_mixed_types """ + SELECT + py_sum_tinyint(tiny_val) as sum_tiny, + py_sum_smallint(small_val) as sum_small, + py_sum_bigint(big_val) as sum_big, + py_sum_float(float_val) as sum_float, + py_sum_decimal(decimal_val) as sum_decimal, + py_concat_str(str_val) as concat_str, + py_count_true(bool_val) as true_count + FROM data_types_test; + """ + + // ======================================== + // Test 9: Type conversion in UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_int_to_double_sum(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_int_to_double_sum(INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "IntToDoubleSumUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class IntToDoubleSumUDAF: + def __init__(self): + self.sum = 0.0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += float(value) + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_type_conversion """ SELECT py_int_to_double_sum(int_val) as sum_as_double FROM data_types_test; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_sum_tinyint(TINYINT);") + try_sql("DROP FUNCTION IF EXISTS py_sum_smallint(SMALLINT);") + try_sql("DROP FUNCTION IF EXISTS py_sum_bigint(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_sum_float(FLOAT);") + try_sql("DROP FUNCTION IF EXISTS py_sum_decimal(DECIMAL);") + try_sql("DROP FUNCTION IF EXISTS py_concat_str(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_count_true(BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS py_int_to_double_sum(INT);") + try_sql("DROP TABLE IF EXISTS data_types_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_edge_cases.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_edge_cases.groovy new file mode 100644 index 00000000000000..26f6b3eeeea14b --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_edge_cases.groovy @@ -0,0 +1,442 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_edge_cases") { + // Test Python UDAFs with edge cases and boundary values + // Including: very large numbers, very small numbers, negative numbers, zero, duplicates + + def runtime_version = "3.8.10" + + try { + // Create test table with edge cases + sql """ DROP TABLE IF EXISTS edge_cases_test; """ + sql """ + CREATE TABLE edge_cases_test ( + id INT, + category STRING, + int_val BIGINT, + double_val DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO edge_cases_test VALUES + (1, 'positive', 2147483647, 1.7976931348623157e+308), -- Max INT, near max DOUBLE + (2, 'positive', 1000000000, 999999999.999), + (3, 'negative', -2147483648, -1.7976931348623157e+308), -- Min INT, near min DOUBLE + (4, 'negative', -1000000000, -999999999.999), + (5, 'zero', 0, 0.0), + (6, 'zero', 0, 0.0), + (7, 'small', 1, 0.0000000001), -- Very small positive + (8, 'small', -1, -0.0000000001), -- Very small negative + (9, 'duplicate', 100, 100.5), + (10, 'duplicate', 100, 100.5), + (11, 'duplicate', 100, 100.5), + (12, 'mixed', 50, -50.5), + (13, 'mixed', -50, 50.5); + """ + + qt_select_data """ SELECT * FROM edge_cases_test ORDER BY id; """ + + // ======================================== + // Test 1: Sum with large numbers + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_large(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_large(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumLargeUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumLargeUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_large_numbers """ SELECT category, py_sum_large(int_val) as sum_result + FROM edge_cases_test + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 2: Min with negative numbers + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_min_val(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_min_val(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MinValUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MinValUDAF: + def __init__(self): + self.min_value = None + + @property + def aggregate_state(self): + return self.min_value + + def accumulate(self, value): + if value is not None: + if self.min_value is None or value < self.min_value: + self.min_value = value + + def merge(self, other_state): + if other_state is not None: + if self.min_value is None or other_state < 
self.min_value: + self.min_value = other_state + + def finish(self): + return self.min_value +\$\$; + """ + + qt_negative_min """ SELECT py_min_val(int_val) as min_value FROM edge_cases_test; """ + + // ======================================== + // Test 3: Max with large numbers + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_max_val(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_max_val(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MaxValUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MaxValUDAF: + def __init__(self): + self.max_value = None + + @property + def aggregate_state(self): + return self.max_value + + def accumulate(self, value): + if value is not None: + if self.max_value is None or value > self.max_value: + self.max_value = value + + def merge(self, other_state): + if other_state is not None: + if self.max_value is None or other_state > self.max_value: + self.max_value = other_state + + def finish(self): + return self.max_value +\$\$; + """ + + qt_large_max """ SELECT py_max_val(int_val) as max_value FROM edge_cases_test; """ + + // ======================================== + // Test 4: Count distinct values + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_count_distinct(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_count_distinct(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountDistinctUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CountDistinctUDAF: + def __init__(self): + self.distinct_values = set() + + @property + def aggregate_state(self): + return list(self.distinct_values) + + def accumulate(self, value): + if value is not None: + self.distinct_values.add(value) + + def merge(self, other_state): + if other_state: + self.distinct_values.update(other_state) + + def finish(self): + return len(self.distinct_values) +\$\$; + """ + + qt_distinct_count """ SELECT category, py_count_distinct(int_val) as distinct_count + FROM edge_cases_test + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 5: Product aggregation (handles overflow) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_product(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_product(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "ProductUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class ProductUDAF: + def __init__(self): + self.product = 1 + self.has_value = False + + @property + def aggregate_state(self): + return (self.product, self.has_value) + + def accumulate(self, value): + if value is not None: + self.product *= value + self.has_value = True + + def merge(self, other_state): + other_product, other_has_value = other_state + if other_has_value: + self.product *= other_product + self.has_value = True + + def finish(self): + return self.product if self.has_value else None +\$\$; + """ + + qt_product """ SELECT category, py_product(int_val) as product_result + FROM edge_cases_test + WHERE category = 'small' + GROUP BY category; """ + + // ======================================== + // Test 6: Absolute sum (sum of absolute values) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_abs_sum(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_abs_sum(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "AbsSumUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ 
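+# Note on the overflow remark in Test 5 above: Python integers are arbitrary
+# precision, so neither the product there nor this absolute-value sum can
+# overflow inside the UDF body itself; narrowing to the declared BIGINT
+# return type is assumed to happen only when the engine converts the value
+# returned by finish().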
+class AbsSumUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += abs(value) + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + qt_abs_sum """ SELECT category, py_abs_sum(int_val) as abs_sum + FROM edge_cases_test + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 7: Safe division (handles division by zero) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_safe_avg(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_safe_avg(BIGINT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SafeAvgUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SafeAvgUDAF: + def __init__(self): + self.sum = 0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.sum, self.count) + + def accumulate(self, value): + if value is not None: + self.sum += value + self.count += 1 + + def merge(self, other_state): + other_sum, other_count = other_state + self.sum += other_sum + self.count += other_count + + def finish(self): + if self.count == 0: + return 0.0 # Safe default + return float(self.sum) / float(self.count) +\$\$; + """ + + qt_safe_avg """ SELECT category, py_safe_avg(int_val) as safe_avg + FROM edge_cases_test + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 8: Sign count (count positive, negative, zero) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sign_summary(BIGINT); """ + sql """ + CREATE AGGREGATE FUNCTION py_sign_summary(BIGINT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SignSummaryUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SignSummaryUDAF: + def __init__(self): + self.positive_count = 0 + self.negative_count = 0 + self.zero_count = 0 + + @property + def aggregate_state(self): + return (self.positive_count, self.negative_count, self.zero_count) + + def accumulate(self, value): + if value is not None: + if value > 0: + self.positive_count += 1 + elif value < 0: + self.negative_count += 1 + else: + self.zero_count += 1 + + def merge(self, other_state): + other_pos, other_neg, other_zero = other_state + self.positive_count += other_pos + self.negative_count += other_neg + self.zero_count += other_zero + + def finish(self): + return f"pos:{self.positive_count},neg:{self.negative_count},zero:{self.zero_count}" +\$\$; + """ + + qt_sign_summary """ SELECT py_sign_summary(int_val) as sign_summary FROM edge_cases_test; """ + + // ======================================== + // Test 9: Test with only zeros + // ======================================== + sql """ DROP TABLE IF EXISTS zero_test; """ + sql """ + CREATE TABLE zero_test ( + id INT, + val BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO zero_test VALUES + (1, 0), (2, 0), (3, 0), (4, 0), (5, 0); + """ + + qt_all_zeros_sum """ SELECT py_sum_large(val) as sum_zeros FROM zero_test; """ + qt_all_zeros_avg """ SELECT py_safe_avg(val) as avg_zeros FROM zero_test; """ + qt_all_zeros_product """ SELECT py_product(val) as product_zeros FROM zero_test; """ + + // ======================================== + // Test 10: Single value aggregation + // ======================================== + sql """ DROP TABLE IF 
EXISTS single_value_test; """ + sql """ + CREATE TABLE single_value_test ( + id INT, + val BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO single_value_test VALUES (1, 42); + """ + + qt_single_value """ SELECT + py_sum_large(val) as sum_val, + py_min_val(val) as min_val, + py_max_val(val) as max_val, + py_safe_avg(val) as avg_val + FROM single_value_test; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_sum_large(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_min_val(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_max_val(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_count_distinct(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_product(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_abs_sum(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_safe_avg(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_sign_summary(BIGINT);") + try_sql("DROP TABLE IF EXISTS edge_cases_test;") + try_sql("DROP TABLE IF EXISTS zero_test;") + try_sql("DROP TABLE IF EXISTS single_value_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_inline.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_inline.groovy new file mode 100644 index 00000000000000..d6acee25b6bc89 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_inline.groovy @@ -0,0 +1,332 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
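+// Every inline UDAF in this suite (and in the suites above) follows the same
+// four-part contract: accumulate(value) folds one input row into the
+// instance, the aggregate_state property exposes the partial state that the
+// engine ships between instances, merge(other_state) folds in another
+// instance's exported state, and finish() produces the final value, with
+// Python None mapping to SQL NULL. A minimal sketch of that shape, kept as a
+// comment only (MinimalSum is illustrative and is not registered anywhere;
+// how the engine serializes the exported state is assumed, based on the
+// scalars, tuples, and lists these tests pass through merge):
+//
+// class MinimalSum:
+//     def __init__(self):
+//         self.total = 0                # partial aggregate
+//     @property
+//     def aggregate_state(self):
+//         return self.total             # exported to sibling instances
+//     def accumulate(self, value):
+//         if value is not None:         # SQL NULL arrives as Python None
+//             self.total += value
+//     def merge(self, other_state):
+//         self.total += other_state     # a sibling's aggregate_state
+//     def finish(self):
+//         return self.total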
+ +suite("test_pythonudaf_inline") { + // Test Python UDAF using Inline mode + + def runtime_version = "3.8.10" + + try { + // Create test table + sql """ DROP TABLE IF EXISTS test_pythonudaf_inline_table """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudaf_inline_table ( + `id` INT NOT NULL COMMENT "ID", + `value` INT COMMENT "Value", + `amount` DOUBLE COMMENT "Amount", + `category` VARCHAR(10) NOT NULL COMMENT "Category" + ) + DISTRIBUTED BY HASH(id) PROPERTIES("replication_num" = "1"); + """ + + // Insert test data + sql """ INSERT INTO test_pythonudaf_inline_table VALUES + (1, 10, 10.5, 'A'), + (2, 20, 20.5, 'A'), + (3, 30, 30.5, 'B'), + (4, 40, 40.5, 'B'), + (5, 50, 50.5, 'C'); + """ + + qt_select_data """ SELECT * FROM test_pythonudaf_inline_table ORDER BY id; """ + + // ======================================== + // Test 1: Simple Sum UDAF (Inline) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udaf_sum_inline(INT); """ + + sql """ + CREATE AGGREGATE FUNCTION udaf_sum_inline(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumUDAF: + def __init__(self): + self.sum = 0 + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + if other_state is not None: + self.sum += other_state + + def finish(self): + return self.sum + + @property + def aggregate_state(self): + return self.sum +\$\$; + """ + + // Test basic aggregation + qt_test1 """ SELECT udaf_sum_inline(value) as total, sum(value) as native_sum FROM test_pythonudaf_inline_table; """ + + // Test with GROUP BY + qt_test2 """ SELECT category, + udaf_sum_inline(value) as sum_val, + sum(value) as native_sum + FROM test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 2: Average UDAF (Inline) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udaf_avg_inline(DOUBLE); """ + + sql """ + CREATE AGGREGATE FUNCTION udaf_avg_inline(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "AvgUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class AvgUDAF: + def __init__(self): + self.count = 0 + self.sum = 0.0 + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum += value + + def merge(self, other_state): + if other_state is not None: + other_count, other_sum = other_state + self.count += other_count + self.sum += other_sum + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count + + @property + def aggregate_state(self): + return (self.count, self.sum) +\$\$; + """ + + qt_test3 """ SELECT udaf_avg_inline(amount) as avg_amount, avg(amount) as native_avg FROM test_pythonudaf_inline_table; """ + + qt_test4 """ SELECT category, + udaf_avg_inline(amount) as py_avg, + avg(amount) as native_avg + FROM test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 3: Count UDAF (Inline) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udaf_count_inline(INT); """ + + sql """ + CREATE AGGREGATE FUNCTION udaf_count_inline(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CountUDAF: + def __init__(self): + self.count = 0 + + def accumulate(self, value): + if value is not None: + self.count += 1 + 
+ def merge(self, other_state): + if other_state is not None: + self.count += other_state + + def finish(self): + return self.count + + @property + def aggregate_state(self): + return self.count +\$\$; + """ + + qt_test5 """ SELECT udaf_count_inline(value) as total_count, count(value) as native_count FROM test_pythonudaf_inline_table; """ + + qt_test6 """ SELECT category, + udaf_count_inline(value) as py_count, + count(value) as native_count + FROM test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 4: Max UDAF (Inline) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udaf_max_inline(INT); """ + + sql """ + CREATE AGGREGATE FUNCTION udaf_max_inline(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MaxUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MaxUDAF: + def __init__(self): + self.max_value = None + + def accumulate(self, value): + if value is not None: + if self.max_value is None or value > self.max_value: + self.max_value = value + + def merge(self, other_state): + if other_state is not None: + if self.max_value is None or other_state > self.max_value: + self.max_value = other_state + + def finish(self): + return self.max_value + + @property + def aggregate_state(self): + return self.max_value +\$\$; + """ + + qt_test7 """ SELECT udaf_max_inline(value) as max_value, max(value) as native_max FROM test_pythonudaf_inline_table; """ + + qt_test8 """ SELECT category, + udaf_max_inline(value) as py_max, + max(value) as native_max + FROM test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 5: NULL handling + // ======================================== + sql """ INSERT INTO test_pythonudaf_inline_table VALUES (6, NULL, 60.5, 'A'); """ + sql """ INSERT INTO test_pythonudaf_inline_table VALUES (7, NULL, 70.5, 'B'); """ + + qt_test_null1 """ SELECT udaf_sum_inline(value) as total, sum(value) as native_sum FROM test_pythonudaf_inline_table; """ + qt_test_null2 """ SELECT udaf_count_inline(value) as count, count(value) as native_count FROM test_pythonudaf_inline_table; """ + qt_test_null3 """ SELECT category, + udaf_sum_inline(value) as sum_val, + sum(value) as native_sum + FROM test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 6: Window Functions + // ======================================== + qt_test_window1 """ SELECT id, category, value, + udaf_sum_inline(value) OVER(PARTITION BY category) as sum_by_cat, + sum(value) OVER(PARTITION BY category) as native_sum + FROM test_pythonudaf_inline_table + WHERE value IS NOT NULL + ORDER BY category, id; """ + + qt_test_window2 """ SELECT id, category, value, + udaf_sum_inline(value) OVER(PARTITION BY category ORDER BY id) as running_sum, + sum(value) OVER(PARTITION BY category ORDER BY id) as native_sum + FROM test_pythonudaf_inline_table + WHERE value IS NOT NULL + ORDER BY category, id; """ + + // ======================================== + // Test 7: Multiple UDAFs in one query + // ======================================== + qt_test_multiple """ SELECT category, + udaf_sum_inline(value) as sum_val, + sum(value) as native_sum, + udaf_count_inline(value) as count_val, + count(value) as native_count, + udaf_max_inline(value) as max_val, + max(value) as native_max, + udaf_avg_inline(amount) as avg_amount, + avg(amount) as native_avg + FROM 
test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + // ======================================== + // Test 8: Global Function + // ======================================== + sql """ DROP GLOBAL FUNCTION IF EXISTS udaf_sum_global(INT); """ + + sql """ + CREATE GLOBAL AGGREGATE FUNCTION udaf_sum_global(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumUDAF: + def __init__(self): + self.sum = 0 + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + if other_state is not None: + self.sum += other_state + + def finish(self): + return self.sum + + @property + def aggregate_state(self): + return self.sum +\$\$; + """ + + qt_test_global1 """ SELECT udaf_sum_global(value) as total, sum(value) as native_sum FROM test_pythonudaf_inline_table; """ + qt_test_global2 """ SELECT category, + udaf_sum_global(value) as sum_val, + sum(value) as native_sum + FROM test_pythonudaf_inline_table + GROUP BY category + ORDER BY category; """ + + } finally { + try_sql("DROP GLOBAL FUNCTION IF EXISTS udaf_sum_global(INT);") + try_sql("DROP FUNCTION IF EXISTS udaf_sum_inline(INT);") + try_sql("DROP FUNCTION IF EXISTS udaf_avg_inline(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udaf_count_inline(INT);") + try_sql("DROP FUNCTION IF EXISTS udaf_max_inline(INT);") + try_sql("DROP TABLE IF EXISTS test_pythonudaf_inline_table") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_inline_simple.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_inline_simple.groovy new file mode 100644 index 00000000000000..4751390fe6f6ef --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_inline_simple.groovy @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
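+// Each qt_ statement below executes its query and diffs the rows against the
+// block with the same tag in this suite's .out file, which is why most tests
+// pair the Python UDAF with the equivalent native aggregate in one SELECT.
+// For ad-hoc debugging outside the golden files, a hedged sketch (the assert
+// is illustrative only and not part of the committed suite; sql returns rows
+// as a list of lists in this framework):
+//
+// def rows = sql """ SELECT inline_sum(val), sum(val) FROM udaf_inline_test; """
+// assert rows[0][0] == rows[0][1]   // Python sum must match native SUM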
+ +suite("test_pythonudaf_inline_simple") { + // Simplest Python UDAF test using inline mode + + def runtime_version = "3.8.10" + + try { + // Create test table + sql """ DROP TABLE IF EXISTS udaf_inline_test """ + sql """ + CREATE TABLE udaf_inline_test ( + id INT, + val INT, + cat VARCHAR(10) + ) DISTRIBUTED BY HASH(id) PROPERTIES("replication_num" = "1"); + """ + + // Insert simple data + sql """ INSERT INTO udaf_inline_test VALUES + (1, 10, 'A'), + (2, 20, 'A'), + (3, 30, 'B'), + (4, 40, 'B'), + (5, 50, 'C'); + """ + + qt_data """ SELECT * FROM udaf_inline_test ORDER BY id; """ + + // Create inline UDAF - Sum + sql """ DROP FUNCTION IF EXISTS inline_sum(INT); """ + sql """ + CREATE AGGREGATE FUNCTION inline_sum(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MySum", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class MySum: + def __init__(self): + self.total = 0 + + def accumulate(self, value): + if value is not None: + self.total += value + + def merge(self, other_state): + if other_state is not None: + self.total += other_state + + def finish(self): + return self.total + + @property + def aggregate_state(self): + return self.total +\$\$; + """ + + // Test 1: Sum all values + qt_sum_all """ SELECT inline_sum(val) as result FROM udaf_inline_test; """ + + // Test 2: Sum with GROUP BY + qt_sum_group """ SELECT cat, inline_sum(val) as sum_result + FROM udaf_inline_test + GROUP BY cat + ORDER BY cat; """ + + // Test 3: Compare with native SUM + qt_compare """ SELECT cat, + inline_sum(val) as py_sum, + sum(val) as native_sum + FROM udaf_inline_test + GROUP BY cat + ORDER BY cat; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS inline_sum(INT);") + try_sql("DROP TABLE IF EXISTS udaf_inline_test") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_nested_query.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_nested_query.groovy new file mode 100644 index 00000000000000..11875810d6e257 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_nested_query.groovy @@ -0,0 +1,363 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudaf_nested_query") { + // Test Python UDAFs in complex nested queries, subqueries, CTEs, and JOINs + + def runtime_version = "3.8.10" + + try { + // Create orders table + sql """ DROP TABLE IF EXISTS orders; """ + sql """ + CREATE TABLE orders ( + order_id INT, + customer_id INT, + product_id INT, + quantity INT, + price DOUBLE, + order_date DATE + ) ENGINE=OLAP + DUPLICATE KEY(order_id) + DISTRIBUTED BY HASH(order_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO orders VALUES + (1, 101, 1, 2, 29.99, '2024-01-01'), + (2, 101, 2, 1, 49.99, '2024-01-02'), + (3, 102, 1, 3, 29.99, '2024-01-03'), + (4, 102, 3, 1, 99.99, '2024-01-04'), + (5, 103, 2, 2, 49.99, '2024-01-05'), + (6, 103, 1, 1, 29.99, '2024-01-06'), + (7, 104, 3, 2, 99.99, '2024-01-07'), + (8, 104, 2, 3, 49.99, '2024-01-08'), + (9, 105, 1, 4, 29.99, '2024-01-09'), + (10, 105, 3, 1, 99.99, '2024-01-10'); + """ + + // Create customers table + sql """ DROP TABLE IF EXISTS customers; """ + sql """ + CREATE TABLE customers ( + customer_id INT, + customer_name STRING, + city STRING, + segment STRING + ) ENGINE=OLAP + DUPLICATE KEY(customer_id) + DISTRIBUTED BY HASH(customer_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO customers VALUES + (101, 'Alice', 'New York', 'Premium'), + (102, 'Bob', 'Los Angeles', 'Standard'), + (103, 'Charlie', 'Chicago', 'Premium'), + (104, 'David', 'Houston', 'Standard'), + (105, 'Eve', 'Phoenix', 'Premium'); + """ + + qt_select_orders """ SELECT * FROM orders ORDER BY order_id; """ + qt_select_customers """ SELECT * FROM customers ORDER BY customer_id; """ + + // Create Python UDAFs + + // UDAF 1: Total Revenue + sql """ DROP FUNCTION IF EXISTS py_total_revenue(INT, DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_total_revenue(INT, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "TotalRevenueUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class TotalRevenueUDAF: + def __init__(self): + self.total = 0.0 + + @property + def aggregate_state(self): + return self.total + + def accumulate(self, quantity, price): + if quantity is not None and price is not None: + self.total += quantity * price + + def merge(self, other_state): + self.total += other_state + + def finish(self): + return self.total +\$\$; + """ + + // UDAF 2: Order Count + sql """ DROP FUNCTION IF EXISTS py_order_count(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_order_count(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "OrderCountUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class OrderCountUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, order_id): + if order_id is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + // UDAF 3: Average Order Value + sql """ DROP FUNCTION IF EXISTS py_avg_order_value(INT, DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_avg_order_value(INT, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "AvgOrderValueUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class AvgOrderValueUDAF: + def __init__(self): + self.total_value = 0.0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.total_value, self.count) + + def accumulate(self, quantity, price): + if quantity is not None and price is not None: + self.total_value 
+= quantity * price + self.count += 1 + + def merge(self, other_state): + other_value, other_count = other_state + self.total_value += other_value + self.count += other_count + + def finish(self): + if self.count == 0: + return None + return self.total_value / self.count +\$\$; + """ + + // ======================================== + // Test 1: Subquery with UDAF + // ======================================== + qt_subquery_1 """ + SELECT customer_id, total_revenue + FROM ( + SELECT customer_id, py_total_revenue(quantity, price) as total_revenue + FROM orders + GROUP BY customer_id + ) t + WHERE total_revenue > 100 + ORDER BY customer_id; + """ + + // ======================================== + // Test 2: JOIN with UDAF aggregation + // ======================================== + qt_join_1 """ + SELECT + c.customer_name, + c.city, + c.segment, + py_total_revenue(o.quantity, o.price) as total_spent, + py_order_count(o.order_id) as order_count + FROM customers c + INNER JOIN orders o ON c.customer_id = o.customer_id + GROUP BY c.customer_name, c.city, c.segment + ORDER BY c.customer_name; + """ + + // ======================================== + // Test 3: CTE with UDAF + // ======================================== + qt_cte_1 """ + WITH customer_stats AS ( + SELECT + customer_id, + py_total_revenue(quantity, price) as revenue, + py_order_count(order_id) as orders + FROM orders + GROUP BY customer_id + ) + SELECT + cs.customer_id, + c.customer_name, + cs.revenue, + cs.orders + FROM customer_stats cs + JOIN customers c ON cs.customer_id = c.customer_id + WHERE cs.revenue > 100 + ORDER BY cs.revenue DESC; + """ + + // ======================================== + // Test 4: Multiple CTEs with UDAFs + // ======================================== + qt_cte_2 """ + WITH premium_customers AS ( + SELECT customer_id, customer_name, segment + FROM customers + WHERE segment = 'Premium' + ), + customer_revenue AS ( + SELECT + o.customer_id, + py_total_revenue(o.quantity, o.price) as revenue + FROM orders o + GROUP BY o.customer_id + ) + SELECT + pc.customer_name, + cr.revenue + FROM premium_customers pc + JOIN customer_revenue cr ON pc.customer_id = cr.customer_id + ORDER BY cr.revenue DESC; + """ + + // ======================================== + // Test 5: Nested aggregation + // ======================================== + qt_nested_agg """ + SELECT + segment, + COUNT(*) as customer_count, + SUM(total_revenue) as segment_revenue + FROM ( + SELECT + c.customer_id, + c.segment, + py_total_revenue(o.quantity, o.price) as total_revenue + FROM customers c + JOIN orders o ON c.customer_id = o.customer_id + GROUP BY c.customer_id, c.segment + ) t + GROUP BY segment + ORDER BY segment; + """ + + // ======================================== + // Test 6: HAVING clause with UDAF + // ======================================== + qt_having """ + SELECT + c.segment, + py_total_revenue(o.quantity, o.price) as segment_revenue, + py_order_count(o.order_id) as order_count + FROM customers c + JOIN orders o ON c.customer_id = o.customer_id + GROUP BY c.segment + HAVING py_total_revenue(o.quantity, o.price) > 300 + ORDER BY segment_revenue DESC; + """ + + // ======================================== + // Test 7: Self-join with UDAF + // ======================================== + qt_self_join """ + SELECT + o1.product_id, + py_total_revenue(o1.quantity, o1.price) as total_revenue + FROM orders o1 + WHERE EXISTS ( + SELECT 1 FROM orders o2 + WHERE o1.product_id = o2.product_id + AND o1.order_id != o2.order_id + ) + GROUP BY o1.product_id + 
ORDER BY o1.product_id; + """ + + // ======================================== + // Test 8: UNION with UDAF + // ======================================== + qt_union """ + SELECT 'Premium' as segment_type, py_total_revenue(o.quantity, o.price) as revenue + FROM customers c + JOIN orders o ON c.customer_id = o.customer_id + WHERE c.segment = 'Premium' + UNION ALL + SELECT 'Standard' as segment_type, py_total_revenue(o.quantity, o.price) as revenue + FROM customers c + JOIN orders o ON c.customer_id = o.customer_id + WHERE c.segment = 'Standard' + ORDER BY segment_type; + """ + + // ======================================== + // Test 9: Complex nested query with multiple UDAFs + // ======================================== + qt_complex_nested """ + SELECT + segment, + avg_revenue, + max_revenue, + min_revenue + FROM ( + SELECT + segment, + AVG(customer_revenue) as avg_revenue, + MAX(customer_revenue) as max_revenue, + MIN(customer_revenue) as min_revenue + FROM ( + SELECT + c.customer_id, + c.segment, + py_total_revenue(o.quantity, o.price) as customer_revenue + FROM customers c + JOIN orders o ON c.customer_id = o.customer_id + GROUP BY c.customer_id, c.segment + ) customer_level + GROUP BY segment + ) segment_level + ORDER BY segment; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_total_revenue(INT, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_order_count(INT);") + try_sql("DROP FUNCTION IF EXISTS py_avg_order_value(INT, DOUBLE);") + try_sql("DROP TABLE IF EXISTS orders;") + try_sql("DROP TABLE IF EXISTS customers;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_null_handling.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_null_handling.groovy new file mode 100644 index 00000000000000..577cef73cae510 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_null_handling.groovy @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
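+// The NULL tests below rely on a two-way mapping: a SQL NULL input reaches
+// accumulate() as Python None, and returning None from finish() yields a SQL
+// NULL result. Aggregates that must distinguish "sum of zeros" from "no
+// values at all" therefore carry a has_value flag in their state, as
+// SumNullUDAF does; a comment-only reduction of that pattern (NullableSum is
+// illustrative and is not registered by this suite):
+//
+// class NullableSum:
+//     def __init__(self):
+//         self.total, self.seen = 0.0, False
+//     @property
+//     def aggregate_state(self):
+//         return (self.total, self.seen)
+//     def accumulate(self, value):
+//         if value is not None:
+//             self.total += value
+//             self.seen = True
+//     def merge(self, other_state):
+//         other_total, other_seen = other_state
+//         self.total += other_total
+//         self.seen = self.seen or other_seen
+//     def finish(self):
+//         return self.total if self.seen else None   # None -> SQL NULL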
+ +suite("test_pythonudaf_null_handling") { + // Test NULL handling in Python UDAFs + // This is critical for data quality and edge cases + + def runtime_version = "3.8.10" + + try { + // Create test table with NULLs + sql """ DROP TABLE IF EXISTS null_test; """ + sql """ + CREATE TABLE null_test ( + id INT, + category STRING, + int_val INT, + double_val DOUBLE, + str_val STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO null_test VALUES + (1, 'A', 10, 10.5, 'apple'), + (2, 'A', NULL, 20.3, 'banana'), + (3, 'A', 30, NULL, NULL), + (4, 'A', NULL, NULL, 'cherry'), + (5, 'B', 40, 40.2, NULL), + (6, 'B', NULL, NULL, NULL), + (7, 'B', 60, 60.8, 'date'), + (8, 'C', NULL, NULL, NULL), + (9, 'C', NULL, NULL, NULL), + (10, 'C', NULL, NULL, NULL); + """ + + qt_select_data """ SELECT * FROM null_test ORDER BY id; """ + + // ======================================== + // Test 1: Count with NULL handling + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_count_nonnull(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_count_nonnull(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountNonNullUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CountNonNullUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + qt_count_null_all """ SELECT py_count_nonnull(int_val) as count_nonnull FROM null_test; """ + qt_count_null_group """ SELECT category, py_count_nonnull(int_val) as count_nonnull + FROM null_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 2: Sum with NULL handling + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_sum_null(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_sum_null(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumNullUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class SumNullUDAF: + def __init__(self): + self.sum = 0.0 + self.has_value = False + + @property + def aggregate_state(self): + return (self.sum, self.has_value) + + def accumulate(self, value): + if value is not None: + self.sum += value + self.has_value = True + + def merge(self, other_state): + other_sum, other_has_value = other_state + self.sum += other_sum + self.has_value = self.has_value or other_has_value + + def finish(self): + return self.sum if self.has_value else None +\$\$; + """ + + qt_sum_null_all """ SELECT py_sum_null(double_val) as sum_result FROM null_test; """ + qt_sum_null_group """ SELECT category, py_sum_null(double_val) as sum_result + FROM null_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 3: First Non-NULL Value UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_first_nonnull(STRING); """ + sql """ + CREATE AGGREGATE FUNCTION py_first_nonnull(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "FirstNonNullUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class FirstNonNullUDAF: + def __init__(self): + self.first_value = None + + @property + def aggregate_state(self): + return self.first_value + + def accumulate(self, value): + if 
self.first_value is None and value is not None: + self.first_value = value + + def merge(self, other_state): + if self.first_value is None and other_state is not None: + self.first_value = other_state + + def finish(self): + return self.first_value +\$\$; + """ + + qt_first_nonnull_all """ SELECT py_first_nonnull(str_val) as first_str FROM null_test; """ + qt_first_nonnull_group """ SELECT category, py_first_nonnull(str_val) as first_str + FROM null_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 4: NULL Count UDAF + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_count_null(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_count_null(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountNullUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class CountNullUDAF: + def __init__(self): + self.null_count = 0 + + @property + def aggregate_state(self): + return self.null_count + + def accumulate(self, value): + if value is None: + self.null_count += 1 + + def merge(self, other_state): + self.null_count += other_state + + def finish(self): + return self.null_count +\$\$; + """ + + qt_null_count_all """ SELECT py_count_null(int_val) as null_count FROM null_test; """ + qt_null_count_group """ SELECT category, py_count_null(int_val) as null_count + FROM null_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 5: Coalesce Average (ignore NULLs) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_avg_coalesce(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_avg_coalesce(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "AvgCoalesceUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class AvgCoalesceUDAF: + def __init__(self): + self.sum = 0.0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.sum, self.count) + + def accumulate(self, value): + if value is not None: + self.sum += value + self.count += 1 + + def merge(self, other_state): + other_sum, other_count = other_state + self.sum += other_sum + self.count += other_count + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count +\$\$; + """ + + qt_avg_coalesce_all """ SELECT py_avg_coalesce(double_val) as avg_result FROM null_test; """ + qt_avg_coalesce_group """ SELECT category, py_avg_coalesce(double_val) as avg_result + FROM null_test GROUP BY category ORDER BY category; """ + + // ======================================== + // Test 6: All NULLs scenario + // ======================================== + sql """ DROP TABLE IF EXISTS all_null_test; """ + sql """ + CREATE TABLE all_null_test ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO all_null_test VALUES + (1, NULL), + (2, NULL), + (3, NULL); + """ + + qt_all_null_count """ SELECT py_count_nonnull(val) as count_result FROM all_null_test; """ + qt_all_null_sum """ SELECT py_sum_null(CAST(val AS DOUBLE)) as sum_result FROM all_null_test; """ + + // ======================================== + // Test 7: Empty table scenario + // ======================================== + sql """ DROP TABLE IF EXISTS empty_test; """ + sql """ + CREATE TABLE empty_test ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + 
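// With zero input rows, the ungrouped aggregates below are still expected to
+ // return exactly one row: accumulate() never runs, so finish() reports the
+ // initial state, i.e. 0 for py_count_nonnull and NULL for py_sum_null
+ // (its has_value flag stays False).
+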
qt_empty_count """ SELECT py_count_nonnull(val) as count_result FROM empty_test; """ + qt_empty_sum """ SELECT py_sum_null(CAST(val AS DOUBLE)) as sum_result FROM empty_test; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_count_nonnull(INT);") + try_sql("DROP FUNCTION IF EXISTS py_sum_null(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_first_nonnull(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_count_null(INT);") + try_sql("DROP FUNCTION IF EXISTS py_avg_coalesce(DOUBLE);") + try_sql("DROP TABLE IF EXISTS null_test;") + try_sql("DROP TABLE IF EXISTS all_null_test;") + try_sql("DROP TABLE IF EXISTS empty_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_simple.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_simple.groovy new file mode 100644 index 00000000000000..8a5c69545e9a10 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_simple.groovy @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_simple") { + def pyPath = """${context.file.parent}/udaf_scripts/pyudaf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + def tableName = "test_pythonudaf_simple" + + try { + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + `id` INT NOT NULL COMMENT "ID", + `value` INT NOT NULL COMMENT "Value", + `category` VARCHAR(10) NOT NULL COMMENT "Category" + ) + DISTRIBUTED BY HASH(id) PROPERTIES("replication_num" = "1"); + """ + + // Insert simple test data + sql """ INSERT INTO ${tableName} VALUES + (1, 10, 'A'), + (2, 20, 'A'), + (3, 30, 'B'), + (4, 40, 'B'), + (5, 50, 'C'); + """ + + qt_select_data """ SELECT * FROM ${tableName} ORDER BY id; """ + + // Create UDAF function + sql """ DROP FUNCTION IF EXISTS py_sum(int) """ + + sql """ CREATE AGGREGATE FUNCTION py_sum(int) RETURNS bigint PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="sum_int.SumInt", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + // Test 1: Basic sum of all values + qt_test1 """ SELECT py_sum(value) as total FROM ${tableName}; """ + + // Test 2: Sum with GROUP BY + qt_test2 """ SELECT category, py_sum(value) as sum_val + FROM ${tableName} + GROUP BY category + ORDER BY category; """ + + // Test 3: Compare with native SUM + qt_test3 """ SELECT category, + py_sum(value) as py_sum, + sum(value) as native_sum + FROM ${tableName} + GROUP BY category + ORDER BY category; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_sum(int);") + try_sql("DROP TABLE IF EXISTS ${tableName}") + } +} diff --git 
a/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_advanced_inline.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_advanced_inline.groovy new file mode 100644 index 00000000000000..f016cac3ac3c97 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_advanced_inline.groovy @@ -0,0 +1,509 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_window_advanced_inline") { + // Advanced window function tests with Python UDAFs + // Including: moving averages, percentiles, custom analytics + + def runtime_version = "3.8.10" + + try { + // Create time series data for advanced analytics + sql """ DROP TABLE IF EXISTS time_series_data; """ + sql """ + CREATE TABLE time_series_data ( + timestamp DATETIME, + metric_name STRING, + metric_value DOUBLE, + device_id STRING, + location STRING + ) ENGINE=OLAP + DUPLICATE KEY(timestamp) + DISTRIBUTED BY HASH(timestamp) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO time_series_data VALUES + ('2024-01-01 10:00:00', 'temperature', 20.5, 'sensor1', 'room1'), + ('2024-01-01 10:05:00', 'temperature', 21.0, 'sensor1', 'room1'), + ('2024-01-01 10:10:00', 'temperature', 21.5, 'sensor1', 'room1'), + ('2024-01-01 10:15:00', 'temperature', 22.0, 'sensor1', 'room1'), + ('2024-01-01 10:20:00', 'temperature', 22.5, 'sensor1', 'room1'), + ('2024-01-01 10:00:00', 'humidity', 45.0, 'sensor2', 'room1'), + ('2024-01-01 10:05:00', 'humidity', 46.0, 'sensor2', 'room1'), + ('2024-01-01 10:10:00', 'humidity', 47.5, 'sensor2', 'room1'), + ('2024-01-01 10:15:00', 'humidity', 48.0, 'sensor2', 'room1'), + ('2024-01-01 10:20:00', 'humidity', 49.0, 'sensor2', 'room1'), + ('2024-01-01 10:00:00', 'temperature', 19.5, 'sensor3', 'room2'), + ('2024-01-01 10:05:00', 'temperature', 20.0, 'sensor3', 'room2'), + ('2024-01-01 10:10:00', 'temperature', 20.8, 'sensor3', 'room2'), + ('2024-01-01 10:15:00', 'temperature', 21.2, 'sensor3', 'room2'), + ('2024-01-01 10:20:00', 'temperature', 21.8, 'sensor3', 'room2'); + """ + + qt_select_data """ SELECT * FROM time_series_data ORDER BY timestamp, device_id; """ + + // ======================================== + // UDAF 1: Moving Average (SMA - Simple Moving Average) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_moving_avg(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_moving_avg(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MovingAvgUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class MovingAvgUDAF: + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, 
other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + return sum(self.values) / len(self.values) +\$\$; + """ + + // ======================================== + // UDAF 2: Standard Deviation (for volatility analysis) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_window_stddev(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_stddev(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "WindowStdDevUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import math + +class WindowStdDevUDAF: + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values or len(self.values) < 2: + return None + mean = sum(self.values) / len(self.values) + variance = sum((x - mean) ** 2 for x in self.values) / len(self.values) + return math.sqrt(variance) +\$\$; + """ + + // ======================================== + // UDAF 3: Delta (Change from previous value) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_last_value(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_last_value(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "LastValueUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class LastValueUDAF: + def __init__(self): + self.last = None + + @property + def aggregate_state(self): + return self.last + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return self.last +\$\$; + """ + + // ======================================== + // UDAF 4: Min-Max Normalization in window + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_window_min(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_min(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "WindowMinUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class WindowMinUDAF: + def __init__(self): + self.min_val = None + + @property + def aggregate_state(self): + return self.min_val + + def accumulate(self, value): + if value is not None: + if self.min_val is None or value < self.min_val: + self.min_val = value + + def merge(self, other_state): + if other_state is not None: + if self.min_val is None or other_state < self.min_val: + self.min_val = other_state + + def finish(self): + return self.min_val +\$\$; + """ + + sql """ DROP FUNCTION IF EXISTS py_window_max(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_max(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "WindowMaxUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class WindowMaxUDAF: + def __init__(self): + self.max_val = None + + @property + def aggregate_state(self): + return self.max_val + + def accumulate(self, value): + if value is not None: + if self.max_val is None or value > self.max_val: + self.max_val = value + + def merge(self, other_state): + if other_state is not None: + if self.max_val is None or other_state > self.max_val: + self.max_val = other_state + + def finish(self): + return self.max_val +\$\$; + """ + + // ======================================== + // Test 1: Moving 
Average with sliding window + // ======================================== + qt_moving_avg_3period """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as sma_3period + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 2: Rolling standard deviation + // ======================================== + qt_rolling_stddev """ + SELECT + timestamp, + device_id, + metric_value, + py_window_stddev(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 3 PRECEDING AND CURRENT ROW + ) as rolling_stddev_4period + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 3: Moving average in window + // ======================================== + qt_change_from_first """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as moving_avg_3 + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 4: Min-Max normalization within window + // ======================================== + qt_minmax_normalize """ + SELECT + device_id, + timestamp, + metric_value, + py_window_min(metric_value) OVER (PARTITION BY device_id ORDER BY timestamp) as window_min, + py_window_max(metric_value) OVER (PARTITION BY device_id ORDER BY timestamp) as window_max + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 5: Exponential smoothing simulation + // ======================================== + qt_cumulative_weighted """ + SELECT + timestamp, + location, + metric_name, + metric_value, + py_moving_avg(metric_value) OVER ( + PARTITION BY location, metric_name + ORDER BY timestamp + ) as overall_avg, + py_moving_avg(metric_value) OVER ( + PARTITION BY location, metric_name + ORDER BY timestamp + ROWS BETWEEN 1 PRECEDING AND CURRENT ROW + ) as two_period_avg + FROM time_series_data + ORDER BY location, metric_name, timestamp; + """ + + // ======================================== + // Test 6: Trend detection (comparing to moving average) + // ======================================== + qt_trend_detection """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as ma_3, + CASE + WHEN metric_value > py_moving_avg(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) THEN 'Above MA' + ELSE 'Below MA' + END as trend + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 7: Multi-metric window analysis with separate queries + // ======================================== + qt_multi_metric """ + SELECT + t.timestamp, + t.location, + t.temp_value, + h.humidity_value, + py_moving_avg(t.temp_value) OVER (PARTITION BY t.location ORDER BY t.timestamp ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as temp_ma, + py_moving_avg(h.humidity_value) OVER (PARTITION BY h.location ORDER BY 
h.timestamp ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as humidity_ma + FROM + (SELECT timestamp, location, metric_value as temp_value + FROM time_series_data + WHERE metric_name = 'temperature') t + LEFT JOIN + (SELECT timestamp, location, metric_value as humidity_value + FROM time_series_data + WHERE metric_name = 'humidity') h + ON t.timestamp = h.timestamp AND t.location = h.location + WHERE t.temp_value IS NOT NULL OR h.humidity_value IS NOT NULL + ORDER BY t.location, t.timestamp; + """ + + // ======================================== + // Test 8: Gap detection in time series + // ======================================== + sql """ DROP TABLE IF EXISTS gap_data; """ + sql """ + CREATE TABLE gap_data ( + id INT, + ts DATETIME, + sensor STRING, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO gap_data VALUES + (1, '2024-01-01 10:00:00', 'A', 10.0), + (2, '2024-01-01 10:05:00', 'A', 11.0), + (3, '2024-01-01 10:10:00', 'A', 12.0), + (4, '2024-01-01 10:20:00', 'A', 15.0), -- gap here + (5, '2024-01-01 10:25:00', 'A', 16.0), + (6, '2024-01-01 10:00:00', 'B', 20.0), + (7, '2024-01-01 10:10:00', 'B', 22.0), -- gap here + (8, '2024-01-01 10:15:00', 'B', 23.0); + """ + + qt_gap_analysis """ + SELECT + sensor, + ts, + value, + py_last_value(value) OVER ( + PARTITION BY sensor + ORDER BY ts + ROWS BETWEEN 1 PRECEDING AND CURRENT ROW + ) as running_last + FROM gap_data + ORDER BY sensor, ts; + """ + + // ======================================== + // Test 9: Percentile approximation in windows + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_percentile_50(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_percentile_50(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "Percentile50UDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class Percentile50UDAF: + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + sorted_vals = sorted(self.values) + n = len(sorted_vals) + if n % 2 == 0: + return (sorted_vals[n//2 - 1] + sorted_vals[n//2]) / 2.0 + else: + return sorted_vals[n//2] +\$\$; + """ + + qt_window_percentile """ + SELECT + location, + timestamp, + metric_value, + py_percentile_50(metric_value) OVER ( + PARTITION BY location + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as rolling_median + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY location, timestamp; + """ + + // ======================================== + // Test 10: Cumulative distribution + // ======================================== + qt_cumulative_dist """ + SELECT + device_id, + metric_value, + py_moving_avg(metric_value) OVER ( + PARTITION BY device_id + ORDER BY metric_value + ) as cumulative_avg, + COUNT(*) OVER ( + PARTITION BY device_id + ORDER BY metric_value + ) as count_up_to_value + FROM time_series_data + WHERE metric_name = 'temperature' + ORDER BY device_id, metric_value; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_moving_avg(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_window_stddev(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_last_value(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_window_min(DOUBLE);") + 
try_sql("DROP FUNCTION IF EXISTS py_window_max(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_percentile_50(DOUBLE);") + try_sql("DROP TABLE IF EXISTS time_series_data;") + try_sql("DROP TABLE IF EXISTS gap_data;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_advanced_module.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_advanced_module.groovy new file mode 100644 index 00000000000000..21e9f91629da30 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_advanced_module.groovy @@ -0,0 +1,448 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_window_advanced_module") { + // Advanced window function tests with Python UDAFs using file-based deployment + // UDAFs are loaded from pyudaf.zip file (window_udaf module) + + def pyPath = """${context.file.parent}/udaf_scripts/pyudaf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // Create time series data for advanced analytics + sql """ DROP TABLE IF EXISTS time_series_data_mod; """ + sql """ + CREATE TABLE time_series_data_mod ( + timestamp DATETIME, + metric_name STRING, + metric_value DOUBLE, + device_id STRING, + location STRING + ) ENGINE=OLAP + DUPLICATE KEY(timestamp) + DISTRIBUTED BY HASH(timestamp) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO time_series_data_mod VALUES + ('2024-01-01 10:00:00', 'temperature', 20.5, 'sensor1', 'room1'), + ('2024-01-01 10:05:00', 'temperature', 21.0, 'sensor1', 'room1'), + ('2024-01-01 10:10:00', 'temperature', 21.5, 'sensor1', 'room1'), + ('2024-01-01 10:15:00', 'temperature', 22.0, 'sensor1', 'room1'), + ('2024-01-01 10:20:00', 'temperature', 22.5, 'sensor1', 'room1'), + ('2024-01-01 10:00:00', 'humidity', 45.0, 'sensor2', 'room1'), + ('2024-01-01 10:05:00', 'humidity', 46.0, 'sensor2', 'room1'), + ('2024-01-01 10:10:00', 'humidity', 47.5, 'sensor2', 'room1'), + ('2024-01-01 10:15:00', 'humidity', 48.0, 'sensor2', 'room1'), + ('2024-01-01 10:20:00', 'humidity', 49.0, 'sensor2', 'room1'), + ('2024-01-01 10:00:00', 'temperature', 19.5, 'sensor3', 'room2'), + ('2024-01-01 10:05:00', 'temperature', 20.0, 'sensor3', 'room2'), + ('2024-01-01 10:10:00', 'temperature', 20.8, 'sensor3', 'room2'), + ('2024-01-01 10:15:00', 'temperature', 21.2, 'sensor3', 'room2'), + ('2024-01-01 10:20:00', 'temperature', 21.8, 'sensor3', 'room2'); + """ + + qt_select_data """ SELECT * FROM time_series_data_mod ORDER BY timestamp, device_id; """ + + // ======================================== + // UDAF 1: Moving Average (SMA - Simple Moving Average) + // ======================================== + sql """ DROP FUNCTION IF EXISTS 
py_moving_avg_mod(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_moving_avg_mod(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.MovingAvgUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 2: Standard Deviation (for volatility analysis) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_window_stddev_mod(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_stddev_mod(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.WindowStdDevUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 3: Last Value (most recent non-null value; used for gap analysis below) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_last_value_mod(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_last_value_mod(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.LastValueUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // UDAF 4: Min-Max Normalization in window + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_window_min_mod(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_min_mod(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.WindowMinUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP FUNCTION IF EXISTS py_window_max_mod(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_max_mod(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.WindowMaxUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // Test 1: Moving Average with sliding window + // ======================================== + qt_moving_avg_3period """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as sma_3period + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 2: Rolling standard deviation + // ======================================== + qt_rolling_stddev """ + SELECT + timestamp, + device_id, + metric_value, + py_window_stddev_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 3 PRECEDING AND CURRENT ROW + ) as rolling_stddev_4period + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 3: Moving average in window + // ======================================== + qt_change_from_first """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as moving_avg_3 + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + //
======================================== + // Test 4: Min-Max normalization within window + // ======================================== + qt_minmax_normalize """ + SELECT + device_id, + timestamp, + metric_value, + py_window_min_mod(metric_value) OVER (PARTITION BY device_id ORDER BY timestamp) as window_min, + py_window_max_mod(metric_value) OVER (PARTITION BY device_id ORDER BY timestamp) as window_max + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 5: Exponential smoothing simulation + // ======================================== + qt_cumulative_weighted """ + SELECT + timestamp, + location, + metric_name, + metric_value, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY location, metric_name + ORDER BY timestamp + ) as overall_avg, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY location, metric_name + ORDER BY timestamp + ROWS BETWEEN 1 PRECEDING AND CURRENT ROW + ) as two_period_avg + FROM time_series_data_mod + ORDER BY location, metric_name, timestamp; + """ + + // ======================================== + // Test 6: Trend detection (comparing to moving average) + // ======================================== + qt_trend_detection """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as ma_3, + CASE + WHEN metric_value > py_moving_avg_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) THEN 'Above MA' + ELSE 'Below MA' + END as trend + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + // ======================================== + // Test 7: Multi-metric window analysis with separate queries + // ======================================== + qt_multi_metric """ + SELECT + t.timestamp, + t.location, + t.temp_value, + h.humidity_value, + py_moving_avg_mod(t.temp_value) OVER (PARTITION BY t.location ORDER BY t.timestamp ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as temp_ma, + py_moving_avg_mod(h.humidity_value) OVER (PARTITION BY h.location ORDER BY h.timestamp ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as humidity_ma + FROM + (SELECT timestamp, location, metric_value as temp_value + FROM time_series_data_mod + WHERE metric_name = 'temperature') t + LEFT JOIN + (SELECT timestamp, location, metric_value as humidity_value + FROM time_series_data_mod + WHERE metric_name = 'humidity') h + ON t.timestamp = h.timestamp AND t.location = h.location + WHERE t.temp_value IS NOT NULL OR h.humidity_value IS NOT NULL + ORDER BY t.location, t.timestamp; + """ + + // ======================================== + // Test 8: Gap detection in time series + // ======================================== + sql """ DROP TABLE IF EXISTS gap_data_mod; """ + sql """ + CREATE TABLE gap_data_mod ( + id INT, + ts DATETIME, + sensor STRING, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO gap_data_mod VALUES + (1, '2024-01-01 10:00:00', 'A', 10.0), + (2, '2024-01-01 10:05:00', 'A', 11.0), + (3, '2024-01-01 10:10:00', 'A', 12.0), + (4, '2024-01-01 10:20:00', 'A', 15.0), + (5, '2024-01-01 10:25:00', 'A', 16.0), + (6, '2024-01-01 10:00:00', 'B', 20.0), + (7, '2024-01-01 10:10:00', 'B', 22.0), + (8, '2024-01-01 10:15:00', 'B', 23.0); + """ + + 
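+ // The 2-row ROWS frame feeds the UDAF at most the previous and current rows in
+ // ts order; with the all non-null data above, running_last therefore echoes the
+ // current row's value. The aggregate contract has no retract method, so state is
+ // presumably rebuilt for each frame rather than slid incrementally.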
qt_gap_analysis """ + SELECT + sensor, + ts, + value, + py_last_value_mod(value) OVER ( + PARTITION BY sensor + ORDER BY ts + ROWS BETWEEN 1 PRECEDING AND CURRENT ROW + ) as running_last + FROM gap_data_mod + ORDER BY sensor, ts; + """ + + // ======================================== + // Test 9: Percentile approximation in windows + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_percentile_50_mod(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_percentile_50_mod(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.Percentile50UDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_window_percentile """ + SELECT + location, + timestamp, + metric_value, + py_percentile_50_mod(metric_value) OVER ( + PARTITION BY location + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as rolling_median + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY location, timestamp; + """ + + // ======================================== + // Test 10: Cumulative distribution + // ======================================== + qt_cumulative_dist """ + SELECT + device_id, + metric_value, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY metric_value + ) as cumulative_avg, + COUNT(*) OVER ( + PARTITION BY device_id + ORDER BY metric_value + ) as count_up_to_value + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, metric_value; + """ + + // ======================================== + // Test 11: Module Reusability Test + // Create another function referencing the same module + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_moving_avg_mod2(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_moving_avg_mod2(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "window_udaf.MovingAvgUDAF", + "type" = "PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_module_reuse """ + SELECT + device_id, + py_moving_avg_mod(metric_value) as avg1, + py_moving_avg_mod2(metric_value) as avg2 + FROM time_series_data_mod + WHERE metric_name = 'temperature' + GROUP BY device_id + ORDER BY device_id; + """ + + // ======================================== + // Test 12: Combined Analytics + // Using multiple window UDAFs together + // ======================================== + qt_combined_analytics """ + SELECT + timestamp, + device_id, + metric_value, + py_moving_avg_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as ma, + py_window_stddev_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as stddev, + py_window_min_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as min_val, + py_window_max_mod(metric_value) OVER ( + PARTITION BY device_id + ORDER BY timestamp + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as max_val + FROM time_series_data_mod + WHERE metric_name = 'temperature' + ORDER BY device_id, timestamp; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_moving_avg_mod(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_moving_avg_mod2(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_window_stddev_mod(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS 
py_last_value_mod(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_window_min_mod(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_window_max_mod(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_percentile_50_mod(DOUBLE);") + try_sql("DROP TABLE IF EXISTS time_series_data_mod;") + try_sql("DROP TABLE IF EXISTS gap_data_mod;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_functions.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_functions.groovy new file mode 100644 index 00000000000000..e61d4faa16d3be --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudaf_window_functions.groovy @@ -0,0 +1,538 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudaf_window_functions") { + // Test Python UDAFs with window functions (OVER clause) + // This tests PARTITION BY, ORDER BY, and frame specifications + + def runtime_version = "3.8.10" + + try { + // Create sales data table for window function tests + sql """ DROP TABLE IF EXISTS sales_data; """ + sql """ + CREATE TABLE sales_data ( + id INT, + sales_date DATE, + region STRING, + product STRING, + sales_amount DOUBLE, + quantity INT, + salesperson STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO sales_data VALUES + (1, '2024-01-01', 'North', 'Laptop', 1200.00, 2, 'Alice'), + (2, '2024-01-02', 'North', 'Mouse', 25.00, 10, 'Alice'), + (3, '2024-01-03', 'North', 'Keyboard', 75.00, 5, 'Bob'), + (4, '2024-01-04', 'South', 'Laptop', 1150.00, 1, 'Charlie'), + (5, '2024-01-05', 'South', 'Monitor', 300.00, 3, 'Charlie'), + (6, '2024-01-06', 'South', 'Mouse', 20.00, 15, 'David'), + (7, '2024-01-07', 'East', 'Laptop', 1300.00, 2, 'Eve'), + (8, '2024-01-08', 'East', 'Keyboard', 80.00, 4, 'Eve'), + (9, '2024-01-09', 'East', 'Monitor', 320.00, 2, 'Frank'), + (10, '2024-01-10', 'West', 'Laptop', 1250.00, 3, 'Grace'), + (11, '2024-01-11', 'West', 'Mouse', 22.00, 12, 'Grace'), + (12, '2024-01-12', 'West', 'Keyboard', 70.00, 6, 'Henry'), + (13, '2024-01-13', 'North', 'Monitor', 310.00, 2, 'Alice'), + (14, '2024-01-14', 'South', 'Keyboard', 78.00, 3, 'Charlie'), + (15, '2024-01-15', 'East', 'Mouse', 24.00, 8, 'Eve'); + """ + + qt_select_data """ SELECT * FROM sales_data ORDER BY id; """ + + // Create Python UDAFs for window functions + + // ======================================== + // UDAF 1: Running Sum (Cumulative Sum) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_running_sum(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_running_sum(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RunningSumUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class 
RunningSumUDAF: + def __init__(self): + self.sum = 0.0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + // ======================================== + // UDAF 2: Running Count + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_running_count(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_running_count(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RunningCountUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class RunningCountUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + // ======================================== + // UDAF 3: Running Average + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_running_avg(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_running_avg(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RunningAvgUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class RunningAvgUDAF: + def __init__(self): + self.sum = 0.0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.sum, self.count) + + def accumulate(self, value): + if value is not None: + self.sum += value + self.count += 1 + + def merge(self, other_state): + other_sum, other_count = other_state + self.sum += other_sum + self.count += other_count + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count +\$\$; + """ + + // ======================================== + // Test 1: Simple window with PARTITION BY + // ======================================== + qt_window_partition_by """ + SELECT + id, + region, + sales_amount, + py_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY id) as region_running_sum + FROM sales_data + ORDER BY region, id; + """ + + // ======================================== + // Test 2: Window with ORDER BY only (no partition) + // ======================================== + qt_window_order_by """ + SELECT + id, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER (ORDER BY sales_date) as cumulative_sales, + py_running_count(quantity) OVER (ORDER BY sales_date) as cumulative_count + FROM sales_data + ORDER BY sales_date; + """ + + // ======================================== + // Test 3: Multiple partitions with different UDAFs + // ======================================== + qt_window_multi_partition """ + SELECT + region, + product, + sales_amount, + py_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY id) as region_sum, + py_running_sum(sales_amount) OVER (PARTITION BY product ORDER BY id) as product_sum, + py_running_avg(sales_amount) OVER (PARTITION BY region ORDER BY id) as region_avg + FROM sales_data + ORDER BY region, id; + """ + + // ======================================== + // Test 4: Window with frame specification - ROWS BETWEEN + // ======================================== + qt_window_rows_between """ + SELECT + id, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER ( + ORDER BY sales_date + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as moving_sum_3days + FROM sales_data + ORDER BY sales_date; + 
""" + + // ======================================== + // Test 5: Window with cumulative sum (default frame) + // ======================================== + qt_window_unbounded """ + SELECT + region, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY sales_date + ) as region_cumulative + FROM sales_data + ORDER BY region, sales_date; + """ + + // ======================================== + // Test 6: Salesperson performance with running totals + // ======================================== + qt_window_salesperson """ + SELECT + salesperson, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER ( + PARTITION BY salesperson + ORDER BY sales_date + ) as person_cumulative_sales, + py_running_count(quantity) OVER ( + PARTITION BY salesperson + ORDER BY sales_date + ) as person_total_transactions + FROM sales_data + ORDER BY salesperson, sales_date; + """ + + // ======================================== + // Test 7: Compare window function with regular aggregation + // ======================================== + qt_window_vs_group_by """ + SELECT + region, + sales_amount, + py_running_sum(sales_amount) OVER (PARTITION BY region) as window_total, + (SELECT py_running_sum(sales_amount) FROM sales_data s2 WHERE s2.region = s1.region) as subquery_total + FROM sales_data s1 + ORDER BY region, id; + """ + + // ======================================== + // Test 8: Multiple window specifications + // ======================================== + qt_window_multiple_specs """ + SELECT + product, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER (PARTITION BY product ORDER BY sales_date) as product_running_sum, + py_running_avg(sales_amount) OVER (ORDER BY sales_date) as overall_avg, + py_running_count(quantity) OVER (PARTITION BY product ORDER BY sales_date) as product_count + FROM sales_data + ORDER BY product, sales_date; + """ + + // ======================================== + // Test 9: Window with complex ordering + // ======================================== + qt_window_complex_order """ + SELECT + region, + product, + sales_amount, + quantity, + py_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY sales_amount DESC, quantity + ) as ranked_cumulative + FROM sales_data + ORDER BY region, sales_amount DESC, quantity; + """ + + // ======================================== + // Test 10: Window function with WHERE clause + // ======================================== + qt_window_with_where """ + SELECT + region, + product, + sales_amount, + py_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY sales_date + ) as region_cumulative + FROM sales_data + WHERE sales_amount > 50 + ORDER BY region, sales_date; + """ + + // ======================================== + // Test 11: Nested query with window functions + // ======================================== + qt_window_nested """ + SELECT + region, + sales_date, + sales_amount, + region_running_sum, + region_running_sum - sales_amount as previous_sum + FROM ( + SELECT + region, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY sales_date + ) as region_running_sum + FROM sales_data + ) t + ORDER BY region, sales_date; + """ + + // ======================================== + // Test 12: Window with RANGE frame (if supported) + // ======================================== + qt_window_range_frame """ + SELECT + id, + sales_date, + sales_amount, + py_running_sum(sales_amount) OVER ( + ORDER BY id + ROWS BETWEEN 1 PRECEDING 
AND 1 FOLLOWING + ) as three_row_sum + FROM sales_data + ORDER BY id; + """ + + // ======================================== + // Test 13: Empty partition handling + // ======================================== + sql """ DROP TABLE IF EXISTS sparse_data; """ + sql """ + CREATE TABLE sparse_data ( + id INT, + category STRING, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO sparse_data VALUES + (1, 'A', 100), + (2, 'A', 200), + (3, 'B', 300), + (4, 'C', 400), + (5, 'C', 500), + (6, 'C', 600); + """ + + qt_window_sparse """ + SELECT + category, + value, + py_running_sum(value) OVER (PARTITION BY category ORDER BY id) as category_sum, + py_running_count(CAST(value AS INT)) OVER (PARTITION BY category ORDER BY id) as category_count + FROM sparse_data + ORDER BY category, id; + """ + + // ======================================== + // Test 14: First and last value in window + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_window_first(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_window_first(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "WindowFirstUDAF", + "runtime_version" = "3.8.10" + ) + AS \$\$ +class WindowFirstUDAF: + def __init__(self): + self.first_value = None + self.has_value = False + + @property + def aggregate_state(self): + return (self.first_value, self.has_value) + + def accumulate(self, value): + if not self.has_value and value is not None: + self.first_value = value + self.has_value = True + + def merge(self, other_state): + other_first, other_has = other_state + if not self.has_value and other_has: + self.first_value = other_first + self.has_value = True + + def finish(self): + return self.first_value +\$\$; + """ + + qt_window_first_value """ + SELECT + region, + sales_date, + sales_amount, + py_window_first(sales_amount) OVER ( + PARTITION BY region + ORDER BY sales_date + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW + ) as first_sale_in_region + FROM sales_data + ORDER BY region, sales_date; + """ + + // ======================================== + // Test 15: Window with NULL values + // ======================================== + sql """ DROP TABLE IF EXISTS window_null_test; """ + sql """ + CREATE TABLE window_null_test ( + id INT, + category STRING, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO window_null_test VALUES + (1, 'A', 10.0), + (2, 'A', NULL), + (3, 'A', 30.0), + (4, 'B', NULL), + (5, 'B', 50.0), + (6, 'B', NULL); + """ + + qt_window_with_nulls """ + SELECT + category, + value, + py_running_sum(value) OVER (PARTITION BY category ORDER BY id) as running_sum, + py_running_count(CAST(value AS INT)) OVER (PARTITION BY category ORDER BY id) as running_count + FROM window_null_test + ORDER BY category, id; + """ + + // ======================================== + // Test 16: Window function performance - larger dataset + // ======================================== + sql """ DROP TABLE IF EXISTS large_window_test; """ + sql """ + CREATE TABLE large_window_test ( + id INT, + group_id INT, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO large_window_test + SELECT + number as id, + number % 5 as group_id, + number * 1.5 as value + FROM numbers("number" = "100"); + """ + + 
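+ // numbers("number" = "100") generates rows 0..99, so number % 5 yields five
+ // groups of 20 rows each; the query below aggregates the per-row running sums
+ // so the checked output stays small and deterministic.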
qt_window_large_dataset """ + SELECT + group_id, + COUNT(*) as total_rows, + AVG(running_sum) as avg_running_sum + FROM ( + SELECT + group_id, + value, + py_running_sum(value) OVER (PARTITION BY group_id ORDER BY id) as running_sum + FROM large_window_test + ) t + GROUP BY group_id + ORDER BY group_id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_running_sum(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_running_count(INT);") + try_sql("DROP FUNCTION IF EXISTS py_running_avg(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_window_first(DOUBLE);") + try_sql("DROP TABLE IF EXISTS sales_data;") + try_sql("DROP TABLE IF EXISTS sparse_data;") + try_sql("DROP TABLE IF EXISTS window_null_test;") + try_sql("DROP TABLE IF EXISTS large_window_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudf_udaf_mixed.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudf_udaf_mixed.groovy new file mode 100644 index 00000000000000..024d8cd2e0de0d --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudf_udaf_mixed.groovy @@ -0,0 +1,355 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
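+
+// Note on the inline UDAF contract exercised throughout pythonudaf_p0: the class
+// named in "symbol" exposes an aggregate_state property (a serializable snapshot
+// of the instance), accumulate(value) called once per input row, merge(other_state)
+// receiving another instance's snapshot, and finish() returning the final value;
+// returning Python None surfaces as SQL NULL.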
+ +suite("test_pythonudf_udaf_mixed") { + // Test mixing Python scalar UDFs and aggregate UDAFs in the same query + // This verifies that UDFs and UDAFs can coexist and work correctly together + + def runtime_version = "3.8.10" + + try { + // Create test table + sql """ DROP TABLE IF EXISTS mixed_test; """ + sql """ + CREATE TABLE mixed_test ( + id INT, + category STRING, + value INT, + price DOUBLE, + discount DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO mixed_test VALUES + (1, 'Electronics', 100, 99.9, 0.1), + (2, 'Electronics', 200, 199.9, 0.15), + (3, 'Books', 50, 29.9, 0.05), + (4, 'Books', 80, 49.9, 0.1), + (5, 'Clothing', 150, 79.9, 0.2), + (6, 'Clothing', 120, 59.9, 0.15), + (7, 'Electronics', 300, 299.9, 0.25), + (8, 'Books', 60, 39.9, 0.08); + """ + + // Create scalar UDFs + + // UDF 1: Calculate final price after discount + sql """ DROP FUNCTION IF EXISTS py_final_price(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_final_price(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "final_price", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def final_price(price, discount): + if price is None or discount is None: + return None + return price * (1 - discount) +\$\$; + """ + + // UDF 2: Double the value + sql """ DROP FUNCTION IF EXISTS py_double(INT); """ + sql """ + CREATE FUNCTION py_double(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "double_value", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def double_value(x): + if x is None: + return None + return x * 2 +\$\$; + """ + + // UDF 3: Get category prefix (first 3 characters) + sql """ DROP FUNCTION IF EXISTS py_category_prefix(STRING); """ + sql """ + CREATE FUNCTION py_category_prefix(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "get_prefix", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def get_prefix(s): + if s is None or len(s) == 0: + return None + return s[:3].upper() +\$\$; + """ + + // Create aggregate UDAFs + + // UDAF 1: Sum + sql """ DROP FUNCTION IF EXISTS py_udaf_sum(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_udaf_sum(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "SumUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class SumUDAF: + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + // UDAF 2: Count + sql """ DROP FUNCTION IF EXISTS py_udaf_count(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_udaf_count(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "CountUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class CountUDAF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + // UDAF 3: Average for DOUBLE + sql """ DROP FUNCTION IF EXISTS py_udaf_avg_double(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION 
py_udaf_avg_double(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "AvgDoubleUDAF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class AvgDoubleUDAF: + def __init__(self): + self.count = 0 + self.sum = 0.0 + + @property + def aggregate_state(self): + return (self.count, self.sum) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum += value + + def merge(self, other_state): + other_count, other_sum = other_state + self.count += other_count + self.sum += other_sum + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count +\$\$; + """ + + // Test 1: Mix scalar UDF with UDAF in SELECT + // Use UDF for constant calculation, UDAF for aggregation + qt_mixed_1 """ + SELECT + py_udaf_sum(value) AS total_value, + py_udaf_count(value) AS total_count, + py_double(100) AS doubled_constant + FROM mixed_test; + """ + + // Test 2: UDF in GROUP BY expression, UDAF in SELECT + qt_mixed_2 """ + SELECT + py_category_prefix(category) AS prefix, + py_udaf_sum(value) AS total_value, + py_udaf_count(value) AS item_count + FROM mixed_test + GROUP BY py_category_prefix(category) + ORDER BY prefix; + """ + + // Test 3: UDAF aggregating UDF results + // Apply UDF to each row, then aggregate with UDAF + qt_mixed_3 """ + SELECT + category, + py_udaf_sum(py_double(value)) AS sum_doubled_value, + py_udaf_count(value) AS count_value + FROM mixed_test + GROUP BY category + ORDER BY category; + """ + + // Test 4: Multiple UDAFs with UDF in GROUP BY + qt_mixed_4 """ + SELECT + py_category_prefix(category) AS prefix, + py_udaf_sum(value) AS total_value, + py_udaf_count(value) AS item_count, + py_udaf_avg_double(price) AS avg_price + FROM mixed_test + GROUP BY py_category_prefix(category) + ORDER BY prefix; + """ + + // Test 5: UDF and UDAF with complex expressions + // Calculate final price with UDF, then aggregate with UDAF + qt_mixed_5 """ + SELECT + category, + py_udaf_sum(value) AS total_value, + py_udaf_avg_double(py_final_price(price, discount)) AS avg_final_price, + py_udaf_count(value) AS count_value + FROM mixed_test + GROUP BY category + ORDER BY category; + """ + + // Test 6: Multiple UDFs and UDAFs together + qt_mixed_6 """ + SELECT + category, + py_category_prefix(category) AS prefix, + py_udaf_sum(py_double(value)) AS sum_doubled, + py_udaf_count(value) AS count_items, + py_udaf_avg_double(price) AS avg_price + FROM mixed_test + GROUP BY category + ORDER BY category; + """ + + // Test 7: HAVING clause with UDAF + qt_mixed_7 """ + SELECT + category, + py_udaf_sum(value) AS total_value, + py_udaf_count(value) AS item_count + FROM mixed_test + GROUP BY category + HAVING py_udaf_sum(value) > 150 + ORDER BY category; + """ + + // Test 8: Subquery with UDF, outer query with UDAF + qt_mixed_8 """ + SELECT + py_udaf_sum(doubled_value) AS total_doubled, + py_udaf_count(doubled_value) AS count_doubled + FROM ( + SELECT py_double(value) AS doubled_value + FROM mixed_test + ) t; + """ + + // Test 9: UDF in WHERE clause, UDAF in SELECT + qt_mixed_9 """ + SELECT + category, + py_udaf_sum(value) AS total_value, + py_udaf_count(value) AS item_count + FROM mixed_test + WHERE py_double(value) > 100 + GROUP BY category + ORDER BY category; + """ + + // Test 10: Complex case - Multiple UDFs and UDAFs with different operations + qt_mixed_10 """ + SELECT + py_category_prefix(category) AS prefix, + py_udaf_sum(value) AS sum_value, + py_udaf_sum(py_double(value)) AS sum_doubled, + py_udaf_count(value) AS count_value, + 
py_udaf_avg_double(price) AS avg_price, + py_udaf_avg_double(py_final_price(price, discount)) AS avg_final_price + FROM mixed_test + GROUP BY py_category_prefix(category) + ORDER BY prefix; + """ + + // Test 11: Verify UDAF results match native functions + qt_mixed_verify """ + SELECT + category, + py_udaf_sum(value) AS python_sum, + SUM(value) AS native_sum, + py_udaf_count(value) AS python_count, + COUNT(value) AS native_count + FROM mixed_test + GROUP BY category + ORDER BY category; + """ + + } finally { + // Cleanup + try_sql("DROP FUNCTION IF EXISTS py_final_price(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_double(INT);") + try_sql("DROP FUNCTION IF EXISTS py_category_prefix(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_udaf_sum(INT);") + try_sql("DROP FUNCTION IF EXISTS py_udaf_count(INT);") + try_sql("DROP FUNCTION IF EXISTS py_udaf_avg_double(DOUBLE);") + try_sql("DROP TABLE IF EXISTS mixed_test;") + } +} diff --git a/regression-test/suites/pythonudaf_p0/test_pythonudwf_comprehensive.groovy b/regression-test/suites/pythonudaf_p0/test_pythonudwf_comprehensive.groovy new file mode 100644 index 00000000000000..32f75d4739934f --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/test_pythonudwf_comprehensive.groovy @@ -0,0 +1,1140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
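+
+// "UDWF" in this suite is shorthand, not a distinct function kind: each one is an
+// ordinary Python UDAF (aggregate_state / accumulate / merge / finish) that Doris
+// evaluates over window frames via the OVER() clause, so no window-specific
+// methods are required.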
+ +suite("test_pythonudwf_comprehensive") { + // Comprehensive test suite for Python User-Defined Window Functions (UDWF) + // Tests cover: PARTITION BY, ORDER BY, frame specifications, edge cases, and complex scenarios + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Setup: Create test tables with diverse data + // ======================================== + + // Table 1: Sales data for basic window function tests + sql """ DROP TABLE IF EXISTS udwf_sales; """ + sql """ + CREATE TABLE udwf_sales ( + order_id INT, + order_date DATE, + region VARCHAR(50), + product VARCHAR(50), + category VARCHAR(50), + sales_amount DECIMAL(10,2), + quantity INT, + salesperson VARCHAR(50) + ) ENGINE=OLAP + DUPLICATE KEY(order_id) + DISTRIBUTED BY HASH(order_id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_sales VALUES + (1, '2024-01-01', 'North', 'Laptop', 'Electronics', 1200.50, 2, 'Alice'), + (2, '2024-01-02', 'North', 'Mouse', 'Electronics', 25.99, 10, 'Alice'), + (3, '2024-01-03', 'North', 'Desk', 'Furniture', 350.00, 1, 'Bob'), + (4, '2024-01-04', 'South', 'Laptop', 'Electronics', 1150.00, 1, 'Charlie'), + (5, '2024-01-05', 'South', 'Chair', 'Furniture', 200.00, 4, 'Charlie'), + (6, '2024-01-06', 'South', 'Monitor', 'Electronics', 300.00, 3, 'David'), + (7, '2024-01-07', 'East', 'Laptop', 'Electronics', 1300.00, 2, 'Eve'), + (8, '2024-01-08', 'East', 'Keyboard', 'Electronics', 80.00, 5, 'Eve'), + (9, '2024-01-09', 'East', 'Bookshelf', 'Furniture', 180.00, 2, 'Frank'), + (10, '2024-01-10', 'West', 'Laptop', 'Electronics', 1250.00, 3, 'Grace'), + (11, '2024-01-11', 'West', 'Mouse', 'Electronics', 22.50, 12, 'Grace'), + (12, '2024-01-12', 'West', 'Table', 'Furniture', 450.00, 1, 'Henry'), + (13, '2024-01-13', 'North', 'Monitor', 'Electronics', 310.00, 2, 'Alice'), + (14, '2024-01-14', 'South', 'Keyboard', 'Electronics', 78.00, 3, 'Charlie'), + (15, '2024-01-15', 'East', 'Mouse', 'Electronics', 24.00, 8, 'Eve'), + (16, '2024-01-16', 'West', 'Chair', 'Furniture', 195.00, 5, 'Grace'), + (17, '2024-01-17', 'North', 'Desk', 'Furniture', 380.00, 1, 'Bob'), + (18, '2024-01-18', 'South', 'Monitor', 'Electronics', 295.00, 2, 'David'), + (19, '2024-01-19', 'East', 'Laptop', 'Electronics', 1280.00, 1, 'Frank'), + (20, '2024-01-20', 'West', 'Keyboard', 'Electronics', 85.00, 6, 'Henry'); + """ + + // Table 2: Stock prices for time-series analysis + sql """ DROP TABLE IF EXISTS udwf_stock_prices; """ + sql """ + CREATE TABLE udwf_stock_prices ( + trade_id INT, + trade_time DATETIME, + symbol VARCHAR(20), + price DOUBLE, + volume INT + ) ENGINE=OLAP + DUPLICATE KEY(trade_id) + DISTRIBUTED BY HASH(trade_id) BUCKETS 2 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_stock_prices VALUES + (1, '2024-01-01 09:30:00', 'AAPL', 150.25, 1000), + (2, '2024-01-01 09:35:00', 'AAPL', 151.50, 1200), + (3, '2024-01-01 09:40:00', 'AAPL', 150.75, 800), + (4, '2024-01-01 09:45:00', 'AAPL', 152.00, 1500), + (5, '2024-01-01 09:50:00', 'AAPL', 151.25, 900), + (6, '2024-01-01 09:30:00', 'GOOGL', 2800.00, 500), + (7, '2024-01-01 09:35:00', 'GOOGL', 2815.50, 600), + (8, '2024-01-01 09:40:00', 'GOOGL', 2810.00, 400), + (9, '2024-01-01 09:45:00', 'GOOGL', 2825.00, 700), + (10, '2024-01-01 09:50:00', 'GOOGL', 2820.50, 550), + (11, '2024-01-01 09:30:00', 'MSFT', 380.00, 2000), + (12, '2024-01-01 09:35:00', 'MSFT', 382.50, 2200), + (13, '2024-01-01 09:40:00', 'MSFT', 381.00, 1800), + (14, '2024-01-01 09:45:00', 'MSFT', 383.75, 2500), + 
(15, '2024-01-01 09:50:00', 'MSFT', 382.25, 1900); + """ + + // Table 3: Student scores for ranking tests + sql """ DROP TABLE IF EXISTS udwf_student_scores; """ + sql """ + CREATE TABLE udwf_student_scores ( + student_id INT, + student_name VARCHAR(50), + class VARCHAR(20), + subject VARCHAR(20), + score INT, + exam_date DATE + ) ENGINE=OLAP + DUPLICATE KEY(student_id) + DISTRIBUTED BY HASH(student_id) BUCKETS 2 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_student_scores VALUES + (1, 'Alice', 'ClassA', 'Math', 95, '2024-01-15'), + (1, 'Alice', 'ClassA', 'English', 88, '2024-01-15'), + (1, 'Alice', 'ClassA', 'Physics', 92, '2024-01-15'), + (2, 'Bob', 'ClassA', 'Math', 87, '2024-01-15'), + (2, 'Bob', 'ClassA', 'English', 90, '2024-01-15'), + (2, 'Bob', 'ClassA', 'Physics', 85, '2024-01-15'), + (3, 'Charlie', 'ClassA', 'Math', 92, '2024-01-15'), + (3, 'Charlie', 'ClassA', 'English', 85, '2024-01-15'), + (3, 'Charlie', 'ClassA', 'Physics', 88, '2024-01-15'), + (4, 'David', 'ClassB', 'Math', 78, '2024-01-15'), + (4, 'David', 'ClassB', 'English', 82, '2024-01-15'), + (4, 'David', 'ClassB', 'Physics', 80, '2024-01-15'), + (5, 'Eve', 'ClassB', 'Math', 90, '2024-01-15'), + (5, 'Eve', 'ClassB', 'English', 93, '2024-01-15'), + (5, 'Eve', 'ClassB', 'Physics', 89, '2024-01-15'), + (6, 'Frank', 'ClassB', 'Math', 85, '2024-01-15'), + (6, 'Frank', 'ClassB', 'English', 87, '2024-01-15'), + (6, 'Frank', 'ClassB', 'Physics', 91, '2024-01-15'); + """ + + // ======================================== + // UDWF Definitions: Various window functions + // ======================================== + + // UDWF 1: Running Sum (Cumulative Sum) + sql """ DROP FUNCTION IF EXISTS py_udwf_running_sum(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_running_sum(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RunningSumUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class RunningSumUDWF: + def __init__(self): + self.sum = 0.0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum +\$\$; + """ + + // UDWF 2: Running Average + sql """ DROP FUNCTION IF EXISTS py_udwf_running_avg(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_running_avg(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RunningAvgUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class RunningAvgUDWF: + def __init__(self): + self.sum = 0.0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.sum, self.count) + + def accumulate(self, value): + if value is not None: + self.sum += value + self.count += 1 + + def merge(self, other_state): + other_sum, other_count = other_state + self.sum += other_sum + self.count += other_count + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count +\$\$; + """ + + // UDWF 3: Running Count + sql """ DROP FUNCTION IF EXISTS py_udwf_running_count(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_running_count(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RunningCountUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class RunningCountUDWF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count 
+= 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + // UDWF 4: Moving Average (for frame-based windows) + sql """ DROP FUNCTION IF EXISTS py_udwf_moving_avg(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_moving_avg(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MovingAvgUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class MovingAvgUDWF: + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + return sum(self.values) / len(self.values) +\$\$; + """ + + // UDWF 5: Standard Deviation (for volatility analysis) + sql """ DROP FUNCTION IF EXISTS py_udwf_stddev(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_stddev(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "StdDevUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +import math + +class StdDevUDWF: + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values or len(self.values) < 2: + return None + mean = sum(self.values) / len(self.values) + variance = sum((x - mean) ** 2 for x in self.values) / len(self.values) + return math.sqrt(variance) +\$\$; + """ + + // UDWF 6: Min Value + sql """ DROP FUNCTION IF EXISTS py_udwf_min(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_min(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MinUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class MinUDWF: + def __init__(self): + self.min_val = None + + @property + def aggregate_state(self): + return self.min_val + + def accumulate(self, value): + if value is not None: + if self.min_val is None or value < self.min_val: + self.min_val = value + + def merge(self, other_state): + if other_state is not None: + if self.min_val is None or other_state < self.min_val: + self.min_val = other_state + + def finish(self): + return self.min_val +\$\$; + """ + + // UDWF 7: Max Value + sql """ DROP FUNCTION IF EXISTS py_udwf_max(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_max(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "MaxUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class MaxUDWF: + def __init__(self): + self.max_val = None + + @property + def aggregate_state(self): + return self.max_val + + def accumulate(self, value): + if value is not None: + if self.max_val is None or value > self.max_val: + self.max_val = value + + def merge(self, other_state): + if other_state is not None: + if self.max_val is None or other_state > self.max_val: + self.max_val = other_state + + def finish(self): + return self.max_val +\$\$; + """ + + // UDWF 8: First Value + sql """ DROP FUNCTION IF EXISTS py_udwf_first_value(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_first_value(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "FirstValueUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class 
FirstValueUDWF: + def __init__(self): + self.first = None + self.has_value = False + + @property + def aggregate_state(self): + return (self.first, self.has_value) + + def accumulate(self, value): + if value is not None and not self.has_value: + self.first = value + self.has_value = True + + def merge(self, other_state): + other_first, other_has_value = other_state + if other_has_value and not self.has_value: + self.first = other_first + self.has_value = True + + def finish(self): + return self.first +\$\$; + """ + + // UDWF 9: Last Value + sql """ DROP FUNCTION IF EXISTS py_udwf_last_value(DOUBLE); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_last_value(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "LastValueUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class LastValueUDWF: + def __init__(self): + self.last = None + + @property + def aggregate_state(self): + return self.last + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return self.last +\$\$; + """ + + // UDWF 10: Rank (frame row count; rank-like, without true DENSE_RANK tie handling) + sql """ DROP FUNCTION IF EXISTS py_udwf_rank(INT); """ + sql """ + CREATE AGGREGATE FUNCTION py_udwf_rank(INT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "RankUDWF", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +class RankUDWF: + def __init__(self): + self.count = 0 + + @property + def aggregate_state(self): + return self.count + + def accumulate(self, value): + if value is not None: + self.count += 1 + + def merge(self, other_state): + self.count += other_state + + def finish(self): + return self.count +\$\$; + """ + + // ======================================== + // Test Category 1: Basic Window Functions with PARTITION BY + // ======================================== + + // Test 1.1: Simple PARTITION BY with running sum + qt_test_partition_by_running_sum """ + SELECT + order_id, + region, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as region_running_sum + FROM udwf_sales + ORDER BY region, order_id; + """ + + // Test 1.2: PARTITION BY with running average + qt_test_partition_by_running_avg """ + SELECT + order_id, + category, + sales_amount, + py_udwf_running_avg(sales_amount) OVER (PARTITION BY category ORDER BY order_date) as category_running_avg + FROM udwf_sales + ORDER BY category, order_date, order_id; + """ + + // Test 1.3: PARTITION BY with running count + qt_test_partition_by_running_count """ + SELECT + order_id, + salesperson, + quantity, + py_udwf_running_count(quantity) OVER (PARTITION BY salesperson ORDER BY order_id) as sales_count + FROM udwf_sales + ORDER BY salesperson, order_id; + """ + + // Test 1.4: Multiple PARTITION BY columns + qt_test_multi_partition_columns """ + SELECT + order_id, + region, + category, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY region, category ORDER BY order_id) as segment_sum + FROM udwf_sales + ORDER BY region, category, order_id; + """ + + // ======================================== + // Test Category 2: Window Functions with ORDER BY only + // ======================================== + + // Test 2.1: ORDER BY with cumulative sum + qt_test_order_by_cumulative_sum """ + SELECT + order_id, + order_date, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (ORDER BY order_date, order_id) as cumulative_sales + FROM udwf_sales + 
ORDER BY order_date, order_id; + """ + + // Test 2.2: ORDER BY with cumulative average + qt_test_order_by_cumulative_avg """ + SELECT + order_id, + order_date, + sales_amount, + py_udwf_running_avg(sales_amount) OVER (ORDER BY order_date) as cumulative_avg + FROM udwf_sales + ORDER BY order_date, order_id; + """ + + // Test 2.3: ORDER BY DESC + qt_test_order_by_desc """ + SELECT + order_id, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (ORDER BY sales_amount DESC) as sum_by_amount_desc + FROM udwf_sales + ORDER BY sales_amount DESC, order_id; + """ + + // ======================================== + // Test Category 3: Window Functions with Frame Specifications + // ======================================== + + // Test 3.1: ROWS BETWEEN frame (moving average - 3 row window) + qt_test_rows_between_moving_avg """ + SELECT + order_id, + order_date, + sales_amount, + py_udwf_moving_avg(sales_amount) OVER ( + ORDER BY order_date + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as moving_avg_3 + FROM udwf_sales + ORDER BY order_date, order_id; + """ + + // Test 3.2: ROWS BETWEEN with partition + qt_test_rows_between_with_partition """ + SELECT + order_id, + region, + sales_amount, + py_udwf_moving_avg(sales_amount) OVER ( + PARTITION BY region + ORDER BY order_id + ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING + ) as moving_avg_region + FROM udwf_sales + ORDER BY region, order_id; + """ + + // Test 3.3: ROWS BETWEEN UNBOUNDED PRECEDING + qt_test_rows_unbounded_preceding """ + SELECT + order_id, + category, + sales_amount, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY category + ORDER BY order_id + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW + ) as category_cumsum + FROM udwf_sales + ORDER BY category, order_id; + """ + + // Test 3.4: ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + qt_test_rows_unbounded_following """ + SELECT + order_id, + region, + sales_amount, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY order_id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + ) as remaining_sum + FROM udwf_sales + ORDER BY region, order_id; + """ + + // ======================================== + // Test Category 4: Multiple Window Functions in Single Query + // ======================================== + + // Test 4.1: Multiple UDWFs with same partition + qt_test_multiple_udwf_same_partition """ + SELECT + order_id, + region, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as running_sum, + py_udwf_running_avg(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as running_avg, + py_udwf_running_count(quantity) OVER (PARTITION BY region ORDER BY order_id) as running_count + FROM udwf_sales + ORDER BY region, order_id; + """ + + // Test 4.2: Multiple UDWFs with different partitions + qt_test_multiple_udwf_diff_partition """ + SELECT + order_id, + region, + category, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as region_sum, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY category ORDER BY order_id) as category_sum, + py_udwf_running_sum(sales_amount) OVER (ORDER BY order_id) as total_sum + FROM udwf_sales + ORDER BY order_id; + """ + + // Test 4.3: Mix of UDWFs and built-in functions + qt_test_mix_udwf_builtin """ + SELECT + order_id, + region, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as py_sum, + SUM(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as builtin_sum, + 
py_udwf_running_avg(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as py_avg, + AVG(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as builtin_avg + FROM udwf_sales + ORDER BY region, order_id; + """ + + // ======================================== + // Test Category 5: Statistical Analysis Functions + // ======================================== + + // Test 5.1: Standard deviation by partition + qt_test_stddev_by_partition """ + SELECT + symbol, + trade_time, + price, + py_udwf_stddev(price) OVER ( + PARTITION BY symbol + ORDER BY trade_time + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as price_volatility + FROM udwf_stock_prices + ORDER BY symbol, trade_time; + """ + + // Test 5.2: Min and Max in moving window + qt_test_min_max_moving_window """ + SELECT + symbol, + trade_time, + price, + py_udwf_min(price) OVER ( + PARTITION BY symbol + ORDER BY trade_time + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as rolling_min, + py_udwf_max(price) OVER ( + PARTITION BY symbol + ORDER BY trade_time + ROWS BETWEEN 2 PRECEDING AND CURRENT ROW + ) as rolling_max + FROM udwf_stock_prices + ORDER BY symbol, trade_time; + """ + + // Test 5.3: First and Last value in window + qt_test_first_last_value """ + SELECT + symbol, + trade_time, + price, + py_udwf_first_value(price) OVER ( + PARTITION BY symbol + ORDER BY trade_time + ) as opening_price, + py_udwf_last_value(price) OVER ( + PARTITION BY symbol + ORDER BY trade_time + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) as closing_price + FROM udwf_stock_prices + ORDER BY symbol, trade_time; + """ + + // ======================================== + // Test Category 6: Complex Analytical Queries + // ======================================== + + // Test 6.1: Category-based cumulative sales analysis + qt_test_complex_growth_analysis """ + SELECT + region, + category, + total_sales, + py_udwf_running_sum(total_sales) OVER ( + PARTITION BY region + ORDER BY category + ) as cumulative_by_category + FROM ( + SELECT + region, + category, + SUM(sales_amount) as total_sales + FROM udwf_sales + GROUP BY region, category + ) t + ORDER BY region, category; + """ + + // Test 6.2: Top-N analysis with ranking + qt_test_topn_analysis """ + SELECT + student_name, + class, + subject, + score, + py_udwf_rank(score) OVER ( + PARTITION BY class, subject + ORDER BY score DESC + ) as rank_in_subject + FROM udwf_student_scores + ORDER BY class, subject, score DESC; + """ + + // Test 6.3: Percentile calculation using window + qt_test_percentile_analysis """ + SELECT + class, + subject, + score, + py_udwf_running_count(score) OVER ( + PARTITION BY class, subject + ORDER BY score + ) as count_below_or_equal + FROM udwf_student_scores + ORDER BY class, subject, score; + """ + + // ======================================== + // Test Category 7: Edge Cases and Null Handling + // ======================================== + + // Test 7.1: Empty partition + qt_test_empty_partition """ + SELECT + order_id, + region, + sales_amount, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY product + ORDER BY order_id + ) as product_sum + FROM udwf_sales + WHERE region = 'North' + ORDER BY product, order_id; + """ + + // Test 7.2: Single row partition + sql """ DROP TABLE IF EXISTS udwf_single_row; """ + sql """ + CREATE TABLE udwf_single_row ( + id INT, + category VARCHAR(10), + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_single_row VALUES 
+ (1, 'A', 100.0), + (2, 'B', 200.0), + (3, 'C', 300.0); + """ + + qt_test_single_row_partition """ + SELECT + id, + category, + value, + py_udwf_running_sum(value) OVER (PARTITION BY category ORDER BY id) as cat_sum + FROM udwf_single_row + ORDER BY id; + """ + + // Test 7.3: NULL values handling + sql """ DROP TABLE IF EXISTS udwf_with_nulls; """ + sql """ + CREATE TABLE udwf_with_nulls ( + id INT, + category VARCHAR(10), + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_with_nulls VALUES + (1, 'A', 100.0), + (2, 'A', NULL), + (3, 'A', 200.0), + (4, 'A', NULL), + (5, 'A', 300.0), + (6, 'B', NULL), + (7, 'B', 150.0), + (8, 'B', NULL); + """ + + qt_test_null_values """ + SELECT + id, + category, + value, + py_udwf_running_sum(value) OVER (PARTITION BY category ORDER BY id) as sum_ignore_null, + py_udwf_running_count(value) OVER (PARTITION BY category ORDER BY id) as count_non_null, + py_udwf_running_avg(value) OVER (PARTITION BY category ORDER BY id) as avg_ignore_null + FROM udwf_with_nulls + ORDER BY category, id; + """ + + // Test 7.4: All NULL values in partition + sql """ DROP TABLE IF EXISTS udwf_all_nulls; """ + sql """ + CREATE TABLE udwf_all_nulls ( + id INT, + category VARCHAR(10), + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_all_nulls VALUES + (1, 'A', NULL), + (2, 'A', NULL), + (3, 'B', 100.0); + """ + + qt_test_all_nulls_partition """ + SELECT + id, + category, + value, + py_udwf_running_sum(value) OVER (PARTITION BY category ORDER BY id) as sum_result, + py_udwf_running_avg(value) OVER (PARTITION BY category ORDER BY id) as avg_result + FROM udwf_all_nulls + ORDER BY category, id; + """ + + // ======================================== + // Test Category 8: Performance and Scalability + // ======================================== + + // Test 8.1: Large partition test (high cardinality) + qt_test_large_partition """ + SELECT + order_id, + salesperson, + sales_amount, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY salesperson + ORDER BY order_id + ) as salesperson_total + FROM udwf_sales + ORDER BY salesperson, order_id; + """ + + // Test 8.2: Multiple complex windows in one query + qt_test_multiple_complex_windows """ + SELECT + order_id, + region, + category, + product, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as region_sum, + py_udwf_running_avg(sales_amount) OVER (PARTITION BY region ORDER BY order_id) as region_avg, + py_udwf_min(sales_amount) OVER (PARTITION BY category ORDER BY order_id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as category_min, + py_udwf_max(sales_amount) OVER (PARTITION BY category ORDER BY order_id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as category_max, + py_udwf_running_count(quantity) OVER (PARTITION BY product ORDER BY order_id) as product_count + FROM udwf_sales + ORDER BY order_id; + """ + + // ======================================== + // Test Category 9: Data Type Coverage + // ======================================== + + // Test 9.1: INT type + qt_test_int_type """ + SELECT + order_id, + region, + quantity, + py_udwf_running_count(quantity) OVER (PARTITION BY region ORDER BY order_id) as qty_count + FROM udwf_sales + ORDER BY region, order_id; + """ + + // Test 9.2: DECIMAL type + qt_test_decimal_type """ + SELECT + order_id, + 
category, + sales_amount, + py_udwf_running_sum(sales_amount) OVER (PARTITION BY category ORDER BY order_id) as decimal_sum + FROM udwf_sales + ORDER BY category, order_id; + """ + + // Test 9.3: DOUBLE type + qt_test_double_type """ + SELECT + trade_id, + symbol, + price, + py_udwf_running_avg(price) OVER (PARTITION BY symbol ORDER BY trade_time) as avg_price + FROM udwf_stock_prices + ORDER BY symbol, trade_time; + """ + + // ======================================== + // Test Category 10: Subquery and CTE with Window Functions + // ======================================== + + // Test 10.1: Window function in subquery + qt_test_window_in_subquery """ + SELECT + region, + AVG(running_sum) as avg_running_sum + FROM ( + SELECT + region, + order_id, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY order_id + ) as running_sum + FROM udwf_sales + ) t + GROUP BY region + ORDER BY region; + """ + + // Test 10.2: Window function with CTE + qt_test_window_with_cte """ + WITH regional_sales AS ( + SELECT + region, + order_id, + sales_amount, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY order_id + ) as cumulative_sales + FROM udwf_sales + ) + SELECT + region, + MAX(cumulative_sales) as max_cumulative + FROM regional_sales + GROUP BY region + ORDER BY region; + """ + + // Test 10.3: Nested window functions (window over window result) + qt_test_nested_windows """ + SELECT + region, + order_id, + sales_amount, + running_sum, + py_udwf_running_avg(running_sum) OVER ( + PARTITION BY region + ORDER BY order_id + ) as avg_of_running_sum + FROM ( + SELECT + region, + order_id, + sales_amount, + py_udwf_running_sum(sales_amount) OVER ( + PARTITION BY region + ORDER BY order_id + ) as running_sum + FROM udwf_sales + ) t + ORDER BY region, order_id; + """ + + // ======================================== + // Test Category 11: JOIN with Window Functions + // ======================================== + + // Test 11.1: Window function after JOIN + sql """ DROP TABLE IF EXISTS udwf_customers; """ + sql """ + CREATE TABLE udwf_customers ( + salesperson VARCHAR(50), + customer_level VARCHAR(20), + commission_rate DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(salesperson) + DISTRIBUTED BY HASH(salesperson) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO udwf_customers VALUES + ('Alice', 'Gold', 0.15), + ('Bob', 'Silver', 0.10), + ('Charlie', 'Gold', 0.15), + ('David', 'Bronze', 0.08), + ('Eve', 'Gold', 0.15), + ('Frank', 'Silver', 0.10), + ('Grace', 'Gold', 0.15), + ('Henry', 'Bronze', 0.08); + """ + + qt_test_window_after_join """ + SELECT + s.order_id, + s.salesperson, + c.customer_level, + s.sales_amount, + py_udwf_running_sum(s.sales_amount) OVER ( + PARTITION BY c.customer_level + ORDER BY s.order_id + ) as level_running_sum + FROM udwf_sales s + JOIN udwf_customers c ON s.salesperson = c.salesperson + ORDER BY c.customer_level, s.order_id; + """ + + // ======================================== + // Cleanup + // ======================================== + + sql """ DROP FUNCTION IF EXISTS py_udwf_running_sum(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_running_avg(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_running_count(INT); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_moving_avg(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_stddev(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_min(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_max(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS 
py_udwf_first_value(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_last_value(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_rank(INT); """ + + sql """ DROP TABLE IF EXISTS udwf_sales; """ + sql """ DROP TABLE IF EXISTS udwf_stock_prices; """ + sql """ DROP TABLE IF EXISTS udwf_student_scores; """ + sql """ DROP TABLE IF EXISTS udwf_single_row; """ + sql """ DROP TABLE IF EXISTS udwf_with_nulls; """ + sql """ DROP TABLE IF EXISTS udwf_all_nulls; """ + sql """ DROP TABLE IF EXISTS udwf_customers; """ + + } finally { + // Ensure cleanup even if tests fail + try { + sql """ DROP FUNCTION IF EXISTS py_udwf_running_sum(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_running_avg(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_running_count(INT); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_moving_avg(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_stddev(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_min(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_max(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_first_value(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_last_value(DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_udwf_rank(INT); """ + + sql """ DROP TABLE IF EXISTS udwf_sales; """ + sql """ DROP TABLE IF EXISTS udwf_stock_prices; """ + sql """ DROP TABLE IF EXISTS udwf_student_scores; """ + sql """ DROP TABLE IF EXISTS udwf_single_row; """ + sql """ DROP TABLE IF EXISTS udwf_with_nulls; """ + sql """ DROP TABLE IF EXISTS udwf_all_nulls; """ + sql """ DROP TABLE IF EXISTS udwf_customers; """ + } catch (Exception e) { + // Ignore cleanup errors + } + } +} diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/complex_state_udaf.py b/regression-test/suites/pythonudaf_p0/udaf_scripts/complex_state_udaf.py new file mode 100644 index 00000000000000..2d87b524fb59aa --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/complex_state_udaf.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
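+# These classes are presumably packaged into the pyudaf.zip fixture added in
+# this PR and bound by class name. A hedged registration sketch (the path,
+# argument types, and module-qualified symbol are illustrative placeholders,
+# not taken from this diff):
+#
+#   CREATE AGGREGATE FUNCTION py_user_profile(INT, STRING, STRING, DOUBLE, INT)
+#   RETURNS STRING
+#   PROPERTIES (
+#       "type" = "PYTHON_UDF",
+#       "file" = "file:///path/to/pyudaf.zip",
+#       "symbol" = "complex_state_udaf.UserProfileUDAF",
+#       "runtime_version" = "3.8.10"
+#   );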
+ +""" +Complex State Object UDAFs for Doris Python UDAF +Tests various pickle-serializable data structures +""" + +import json +from dataclasses import dataclass +from typing import List +from collections import namedtuple, deque +from datetime import datetime + + +# ======================================== +# UDAF 1: Nested Dictionary State - User Purchase Profile +# ======================================== +class UserProfileUDAF: + """Tracks user purchase profiles with nested dict structure""" + + def __init__(self): + # Complex nested structure: dict of dicts with lists and sets + self.profiles = {} + + @property + def aggregate_state(self): + # Convert sets to lists for pickle serialization + serializable = {} + for user_id, profile in self.profiles.items(): + serializable[user_id] = { + 'total_spent': profile['total_spent'], + 'items': profile['items'], + 'categories': list(profile['categories']) + } + return serializable + + def accumulate(self, user_id, product_name, category, price, quantity): + if user_id is None: + return + + if user_id not in self.profiles: + self.profiles[user_id] = { + 'total_spent': 0.0, + 'items': [], + 'categories': set() + } + + revenue = float(price) * int(quantity) if price and quantity else 0.0 + self.profiles[user_id]['total_spent'] += revenue + if product_name: + self.profiles[user_id]['items'].append(product_name) + if category: + self.profiles[user_id]['categories'].add(category) + + def merge(self, other_state): + for user_id, profile in other_state.items(): + if user_id not in self.profiles: + self.profiles[user_id] = { + 'total_spent': 0.0, + 'items': [], + 'categories': set() + } + + self.profiles[user_id]['total_spent'] += profile['total_spent'] + self.profiles[user_id]['items'].extend(profile['items']) + self.profiles[user_id]['categories'].update(profile['categories']) + + def finish(self): + # Return summary as JSON string + result = {} + for user_id, profile in self.profiles.items(): + result[str(user_id)] = { + 'total_spent': round(profile['total_spent'], 2), + 'item_count': len(profile['items']), + 'unique_categories': len(profile['categories']) + } + return json.dumps(result, sort_keys=True) + + +# ======================================== +# UDAF 2: Custom Class State - Product Statistics +# ======================================== +@dataclass +class ProductStats: + product_name: str + prices: List[float] + quantities: List[int] + + def total_revenue(self): + return sum(p * q for p, q in zip(self.prices, self.quantities)) + + def avg_price(self): + return sum(self.prices) / len(self.prices) if self.prices else 0.0 + + def total_quantity(self): + return sum(self.quantities) + + +class ProductStatsUDAF: + """Product statistics using dataclass""" + + def __init__(self): + self.stats = {} # product_name -> ProductStats + + @property + def aggregate_state(self): + # Convert dataclass instances to dicts for serialization + return { + name: { + 'product_name': stat.product_name, + 'prices': stat.prices, + 'quantities': stat.quantities + } + for name, stat in self.stats.items() + } + + def accumulate(self, product_name, price, quantity): + if product_name is None: + return + + if product_name not in self.stats: + self.stats[product_name] = ProductStats( + product_name=product_name, + prices=[], + quantities=[] + ) + + if price is not None: + self.stats[product_name].prices.append(float(price)) + if quantity is not None: + self.stats[product_name].quantities.append(int(quantity)) + + def merge(self, other_state): + for name, stat_dict in 
other_state.items(): + if name not in self.stats: + self.stats[name] = ProductStats( + product_name=stat_dict['product_name'], + prices=stat_dict['prices'][:], + quantities=stat_dict['quantities'][:] + ) + else: + self.stats[name].prices.extend(stat_dict['prices']) + self.stats[name].quantities.extend(stat_dict['quantities']) + + def finish(self): + result = {} + for name, stat in self.stats.items(): + result[name] = { + 'avg_price': round(stat.avg_price(), 2), + 'total_quantity': stat.total_quantity(), + 'total_revenue': round(stat.total_revenue(), 2), + 'transactions': len(stat.prices) + } + return json.dumps(result, sort_keys=True) + + +# ======================================== +# UDAF 3: List of Tuples State - Transaction Timeline +# ======================================== +class TransactionTimelineUDAF: + """Stores chronological list of (timestamp, amount) tuples""" + + def __init__(self): + # List of tuples: [(timestamp_str, amount), ...] + self.timeline = [] + + @property + def aggregate_state(self): + return self.timeline + + def accumulate(self, timestamp, amount): + if timestamp is not None and amount is not None: + # Convert datetime to string for serialization + ts_str = str(timestamp) + self.timeline.append((ts_str, float(amount))) + + def merge(self, other_state): + self.timeline.extend(other_state) + + def finish(self): + # Sort by timestamp and return summary + sorted_timeline = sorted(self.timeline, key=lambda x: x[0]) + + if not sorted_timeline: + return json.dumps({'count': 0}) + + total = sum(amount for _, amount in sorted_timeline) + + result = { + 'count': len(sorted_timeline), + 'total': round(total, 2), + 'first_transaction': sorted_timeline[0][0], + 'last_transaction': sorted_timeline[-1][0], + 'first_amount': round(sorted_timeline[0][1], 2), + 'last_amount': round(sorted_timeline[-1][1], 2) + } + return json.dumps(result) + + +# ======================================== +# UDAF 4: Set-based State - Unique Value Tracker +# ======================================== +class UniqueTrackerUDAF: + """Tracks unique users, products, and payment methods using sets""" + + def __init__(self): + # Use sets to track unique values + self.unique_users = set() + self.unique_products = set() + self.payment_methods = set() + + @property + def aggregate_state(self): + # Convert sets to lists for pickle + return { + 'users': list(self.unique_users), + 'products': list(self.unique_products), + 'payments': list(self.payment_methods) + } + + def accumulate(self, user_id, product_id, payment_method): + if user_id is not None: + self.unique_users.add(user_id) + if product_id is not None: + self.unique_products.add(product_id) + if payment_method is not None: + self.payment_methods.add(payment_method) + + def merge(self, other_state): + self.unique_users.update(other_state['users']) + self.unique_products.update(other_state['products']) + self.payment_methods.update(other_state['payments']) + + def finish(self): + return json.dumps({ + 'unique_users': len(self.unique_users), + 'unique_products': len(self.unique_products), + 'payment_methods': sorted(list(self.payment_methods)) + }) + + +# ======================================== +# UDAF 5: Named Tuple State - Category Summary +# ======================================== +CategoryData = namedtuple('CategoryData', ['total_revenue', 'total_items', 'transaction_count']) + + +class CategorySummaryUDAF: + """Uses collections.namedtuple for structured data""" + + def __init__(self): + # Dict of category -> namedtuple + self.categories = {} + + 
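+    # NOTE: namedtuple instances are picklable, but unpickling one requires
+    # the CategoryData type to be importable at the same module path on the
+    # consuming side; aggregate_state (below) therefore converts them to
+    # plain tuples, presumably to keep the snapshot self-contained when it
+    # is shipped between worker processes.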
@property + def aggregate_state(self): + # Convert namedtuples to tuples for pickle + return { + cat: (data.total_revenue, data.total_items, data.transaction_count) + for cat, data in self.categories.items() + } + + def accumulate(self, category, price, quantity): + if category is None: + return + + revenue = float(price) * int(quantity) if price and quantity else 0.0 + items = int(quantity) if quantity else 0 + + if category in self.categories: + old = self.categories[category] + self.categories[category] = CategoryData( + total_revenue=old.total_revenue + revenue, + total_items=old.total_items + items, + transaction_count=old.transaction_count + 1 + ) + else: + self.categories[category] = CategoryData( + total_revenue=revenue, + total_items=items, + transaction_count=1 + ) + + def merge(self, other_state): + for cat, (revenue, items, count) in other_state.items(): + if cat in self.categories: + old = self.categories[cat] + self.categories[cat] = CategoryData( + total_revenue=old.total_revenue + revenue, + total_items=old.total_items + items, + transaction_count=old.transaction_count + count + ) + else: + self.categories[cat] = CategoryData( + total_revenue=revenue, + total_items=items, + transaction_count=count + ) + + def finish(self): + result = {} + for cat, data in self.categories.items(): + result[cat] = { + 'total_revenue': round(data.total_revenue, 2), + 'total_items': data.total_items, + 'transactions': data.transaction_count, + 'avg_per_transaction': round(data.total_revenue / data.transaction_count, 2) if data.transaction_count > 0 else 0.0 + } + return json.dumps(result, sort_keys=True) + + +# ======================================== +# UDAF 6: Complex Nested State - Hierarchical Aggregation +# ======================================== +class HierarchicalAggUDAF: + """Multi-level nested structure: region -> category -> product -> stats""" + + def __init__(self): + # Complex nested dict: {region: {category: {product: {'revenue': float, 'quantity': int}}}} + self.hierarchy = {} + + @property + def aggregate_state(self): + return self.hierarchy + + def accumulate(self, region, category, product, price, quantity): + if not all([region, category, product]): + return + + if region not in self.hierarchy: + self.hierarchy[region] = {} + if category not in self.hierarchy[region]: + self.hierarchy[region][category] = {} + if product not in self.hierarchy[region][category]: + self.hierarchy[region][category][product] = {'revenue': 0.0, 'quantity': 0} + + revenue = float(price) * int(quantity) if price and quantity else 0.0 + qty = int(quantity) if quantity else 0 + + self.hierarchy[region][category][product]['revenue'] += revenue + self.hierarchy[region][category][product]['quantity'] += qty + + def merge(self, other_state): + for region, categories in other_state.items(): + if region not in self.hierarchy: + self.hierarchy[region] = {} + + for category, products in categories.items(): + if category not in self.hierarchy[region]: + self.hierarchy[region][category] = {} + + for product, stats in products.items(): + if product not in self.hierarchy[region][category]: + self.hierarchy[region][category][product] = {'revenue': 0.0, 'quantity': 0} + + self.hierarchy[region][category][product]['revenue'] += stats['revenue'] + self.hierarchy[region][category][product]['quantity'] += stats['quantity'] + + def finish(self): + # Summarize hierarchy at each level + result = {} + for region, categories in self.hierarchy.items(): + region_total = 0.0 + region_data = {} + + for category, products in 
categories.items(): + category_total = sum(p['revenue'] for p in products.values()) + region_total += category_total + region_data[category] = { + 'revenue': round(category_total, 2), + 'products': len(products) + } + + result[region] = { + 'total_revenue': round(region_total, 2), + 'categories': region_data + } + + return json.dumps(result, sort_keys=True) + + +# ======================================== +# UDAF 7: Deque-based State - Recent Transactions Window +# Modified to use sorted aggregation for deterministic results +# ======================================== +class RecentWindowUDAF: + """Aggregates transactions with deterministic sorting""" + + def __init__(self): + # Keep all transactions for deterministic ordering + self.all_transactions = [] + + @property + def aggregate_state(self): + # Return all transactions for merging + return self.all_transactions + + def accumulate(self, price, quantity): + if price is not None and quantity is not None: + revenue = float(price) * int(quantity) + self.all_transactions.append(revenue) + + def merge(self, other_state): + # Merge all transactions + self.all_transactions.extend(other_state) + + def finish(self): + if not self.all_transactions: + return json.dumps({'count': 0}) + + # Sort for deterministic results, then take last 5 + sorted_trans = sorted(self.all_transactions) + window = sorted_trans[-5:] if len(sorted_trans) > 5 else sorted_trans + + return json.dumps({ + 'count': len(window), + 'values': [round(v, 2) for v in window], + 'avg': round(sum(window) / len(window), 2), + 'max': round(max(window), 2), + 'min': round(min(window), 2) + }) diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip b/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip new file mode 100644 index 00000000000000..1dc76099d43326 Binary files /dev/null and b/regression-test/suites/pythonudaf_p0/udaf_scripts/pyudaf.zip differ diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/simple_udaf.py b/regression-test/suites/pythonudaf_p0/udaf_scripts/simple_udaf.py new file mode 100644 index 00000000000000..e4383f1c1bd2bd --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/simple_udaf.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Simple Python UDAF for testing. 
+""" + + +class SumInt: + """Simple sum aggregation for integers.""" + + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/stats_udaf.py b/regression-test/suites/pythonudaf_p0/udaf_scripts/stats_udaf.py new file mode 100644 index 00000000000000..77525fab13cfd7 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/stats_udaf.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Statistical Aggregate Functions for Doris Python UDAF +Includes: Variance, StdDev, Median, CollectList, Range, GeometricMean, WeightedAvg +""" + +import math + + +# ======================================== +# Variance UDAF +# ======================================== +class VarianceUDAF: + """Calculate population variance""" + + def __init__(self): + self.count = 0 + self.sum_val = 0.0 + self.sum_sq = 0.0 + + @property + def aggregate_state(self): + return (self.count, self.sum_val, self.sum_sq) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum_val += value + self.sum_sq += value * value + + def merge(self, other_state): + other_count, other_sum, other_sum_sq = other_state + self.count += other_count + self.sum_val += other_sum + self.sum_sq += other_sum_sq + + def finish(self): + if self.count == 0: + return None + mean = self.sum_val / self.count + variance = (self.sum_sq / self.count) - (mean * mean) + return variance + + +# ======================================== +# Standard Deviation UDAF +# ======================================== +class StdDevUDAF: + """Calculate population standard deviation""" + + def __init__(self): + self.count = 0 + self.sum_val = 0.0 + self.sum_sq = 0.0 + + @property + def aggregate_state(self): + return (self.count, self.sum_val, self.sum_sq) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum_val += value + self.sum_sq += value * value + + def merge(self, other_state): + other_count, other_sum, other_sum_sq = other_state + self.count += other_count + self.sum_val += other_sum + self.sum_sq += other_sum_sq + + def finish(self): + if self.count == 0: + return None + mean = self.sum_val / self.count + variance = (self.sum_sq / self.count) - (mean * mean) + return math.sqrt(max(0, variance)) + + +# ======================================== +# Median UDAF +# ======================================== +class MedianUDAF: + """Calculate median value""" + + def __init__(self): + self.values = [] + + @property + def 
aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + sorted_vals = sorted(self.values) + n = len(sorted_vals) + if n % 2 == 0: + return (sorted_vals[n//2 - 1] + sorted_vals[n//2]) / 2.0 + else: + return sorted_vals[n//2] + + +# ======================================== +# Collect List UDAF +# ======================================== +class CollectListUDAF: + """Collect values into a sorted, comma-separated string""" + + def __init__(self): + self.items = [] + + @property + def aggregate_state(self): + return self.items + + def accumulate(self, value): + if value is not None: + self.items.append(value) + + def merge(self, other_state): + if other_state: + self.items.extend(other_state) + + def finish(self): + if not self.items: + return None + return ','.join(sorted(self.items)) + + +# ======================================== +# Range (Max - Min) UDAF +# ======================================== +class RangeUDAF: + """Calculate range (max - min)""" + + def __init__(self): + self.min_val = None + self.max_val = None + + @property + def aggregate_state(self): + return (self.min_val, self.max_val) + + def accumulate(self, value): + if value is not None: + if self.min_val is None or value < self.min_val: + self.min_val = value + if self.max_val is None or value > self.max_val: + self.max_val = value + + def merge(self, other_state): + other_min, other_max = other_state + if other_min is not None: + if self.min_val is None or other_min < self.min_val: + self.min_val = other_min + if other_max is not None: + if self.max_val is None or other_max > self.max_val: + self.max_val = other_max + + def finish(self): + if self.min_val is None or self.max_val is None: + return None + return self.max_val - self.min_val + + +# ======================================== +# Geometric Mean UDAF +# ======================================== +class GeometricMeanUDAF: + """Calculate geometric mean using log transformation""" + + def __init__(self): + self.log_sum = 0.0 + self.count = 0 + + @property + def aggregate_state(self): + return (self.log_sum, self.count) + + def accumulate(self, value): + if value is not None and value > 0: + self.log_sum += math.log(value) + self.count += 1 + + def merge(self, other_state): + other_log_sum, other_count = other_state + self.log_sum += other_log_sum + self.count += other_count + + def finish(self): + if self.count == 0: + return None + return math.exp(self.log_sum / self.count) + + +# ======================================== +# Weighted Average UDAF +# ======================================== +class WeightedAvgUDAF: + """Calculate weighted average""" + + def __init__(self): + self.weighted_sum = 0.0 + self.weight_sum = 0 + + @property + def aggregate_state(self): + return (self.weighted_sum, self.weight_sum) + + def accumulate(self, value, weight): + if value is not None and weight is not None and weight > 0: + self.weighted_sum += value * weight + self.weight_sum += weight + + def merge(self, other_state): + other_weighted_sum, other_weight_sum = other_state + self.weighted_sum += other_weighted_sum + self.weight_sum += other_weight_sum + + def finish(self): + if self.weight_sum == 0: + return None + return self.weighted_sum / self.weight_sum diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/sum_int.py 
b/regression-test/suites/pythonudaf_p0/udaf_scripts/sum_int.py new file mode 100644 index 00000000000000..1c9b6680366769 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/sum_int.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Python UDAF implementations for testing. +""" + + +class SumInt: + """Aggregate function that sums integer values.""" + + def __init__(self): + self.sum = 0 + + @property + def aggregate_state(self): + return self.sum + + def accumulate(self, value): + if value is not None: + self.sum += value + + def merge(self, other_state): + self.sum += other_state + + def finish(self): + return self.sum + + +class AvgDouble: + """Aggregate function that calculates average of double values.""" + + def __init__(self): + self.count = 0 + self.sum = 0.0 + + @property + def aggregate_state(self): + return (self.count, self.sum) + + def accumulate(self, value): + if value is not None: + self.count += 1 + self.sum += value + + def merge(self, other_state): + other_count, other_sum = other_state + self.count += other_count + self.sum += other_sum + + def finish(self): + if self.count == 0: + return None + return self.sum / self.count diff --git a/regression-test/suites/pythonudaf_p0/udaf_scripts/window_udaf.py b/regression-test/suites/pythonudaf_p0/udaf_scripts/window_udaf.py new file mode 100644 index 00000000000000..6b94dcf8476484 --- /dev/null +++ b/regression-test/suites/pythonudaf_p0/udaf_scripts/window_udaf.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
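+# All classes below follow the Python UDAF protocol used throughout this PR:
+# __init__ builds empty state, accumulate() folds in one input row, the
+# aggregate_state property exposes a pickle-friendly snapshot, merge()
+# absorbs a snapshot produced by another instance, and finish() emits the
+# final value. A minimal driver sketch, for illustration only (the real
+# driver lives in the BE-side Python server, not in this file):
+#
+#   a, b = MovingAvgUDAF(), MovingAvgUDAF()
+#   a.accumulate(1.0); a.accumulate(2.0)
+#   b.accumulate(4.0)
+#   a.merge(b.aggregate_state)
+#   assert a.finish() == (1.0 + 2.0 + 4.0) / 3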
+ +""" +Window Function UDAFs for Apache Doris +Advanced window analytics including moving averages, volatility, and percentiles +""" + +import math + + +class MovingAvgUDAF: + """Simple Moving Average (SMA) UDAF""" + + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + return sum(self.values) / len(self.values) + + +class WindowStdDevUDAF: + """Standard Deviation for window volatility analysis""" + + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values or len(self.values) < 2: + return None + mean = sum(self.values) / len(self.values) + variance = sum((x - mean) ** 2 for x in self.values) / len(self.values) + return math.sqrt(variance) + + +class LastValueUDAF: + """Last value in window (for delta calculations)""" + + def __init__(self): + self.last = None + + @property + def aggregate_state(self): + return self.last + + def accumulate(self, value): + if value is not None: + self.last = value + + def merge(self, other_state): + if other_state is not None: + self.last = other_state + + def finish(self): + return self.last + + +class WindowMinUDAF: + """Minimum value in window""" + + def __init__(self): + self.min_val = None + + @property + def aggregate_state(self): + return self.min_val + + def accumulate(self, value): + if value is not None: + if self.min_val is None or value < self.min_val: + self.min_val = value + + def merge(self, other_state): + if other_state is not None: + if self.min_val is None or other_state < self.min_val: + self.min_val = other_state + + def finish(self): + return self.min_val + + +class WindowMaxUDAF: + """Maximum value in window""" + + def __init__(self): + self.max_val = None + + @property + def aggregate_state(self): + return self.max_val + + def accumulate(self, value): + if value is not None: + if self.max_val is None or value > self.max_val: + self.max_val = value + + def merge(self, other_state): + if other_state is not None: + if self.max_val is None or other_state > self.max_val: + self.max_val = other_state + + def finish(self): + return self.max_val + + +class Percentile50UDAF: + """50th percentile (median) calculation in window""" + + def __init__(self): + self.values = [] + + @property + def aggregate_state(self): + return self.values + + def accumulate(self, value): + if value is not None: + self.values.append(value) + + def merge(self, other_state): + if other_state: + self.values.extend(other_state) + + def finish(self): + if not self.values: + return None + sorted_vals = sorted(self.values) + n = len(sorted_vals) + if n % 2 == 0: + return (sorted_vals[n//2 - 1] + sorted_vals[n//2]) / 2.0 + else: + return sorted_vals[n//2] diff --git a/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertequal.groovy b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertequal.groovy new file mode 100644 index 00000000000000..b55c2a5d692a0f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertequal.groovy @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_assertequal") { + def pyPath = """${context.file.parent}/../udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_assertequal """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_assertequal ( + `col` varchar(10) NOT NULL, + `col_1` double NOT NULL, + `col_2` double NOT NULL + ) + DISTRIBUTED BY HASH(col) PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO test_pythonudf_assertequal VALUES ('abc', 23.34, 23.34); """ + + File path1 = new File(pyPath) + if (!path1.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! """) + } + + sql """ CREATE FUNCTION assert_equal(double, double) RETURNS string PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="assert_equal_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT assert_equal(col_1, col_2) as a FROM test_pythonudf_assertequal ORDER BY a; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS assert_equal(double, double); ") + try_sql("DROP TABLE IF EXISTS test_pythonudf_assertequal") + } +} diff --git a/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertlessthan.groovy b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertlessthan.groovy new file mode 100644 index 00000000000000..b2a06c15d6167b --- /dev/null +++ b/regression-test/suites/pythonudf_p0/sanity/test_pythonudf_assertlessthan.groovy @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
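+// pyudf.zip is a binary fixture, so the Python modules behind
+// "assert_equal_test.evaluate" and "assert_lessthan_test.evaluate" are not
+// visible in this diff. Judging from the bound symbols, each module
+// plausibly looks like this hypothetical sketch (assumed, not taken from
+// the fixture):
+//
+//     # assert_lessthan_test.py
+//     def evaluate(a, b):
+//         assert a < b, f"{a} is not less than {b}"
+//         return "success"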
+ +suite("test_pythonudf_assertlessthan") { + def tableName = "test_pythonudf_assertlessthan" + def pyPath = """${context.file.parent}/../udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_assertlessthan """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_assertlessthan ( + `col` varchar(10) NOT NULL, + `col_1` double NOT NULL, + `col_2` double NOT NULL + ) + DISTRIBUTED BY HASH(col) PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO test_pythonudf_assertlessthan VALUES ('abc', 23.34, 23.35), ('bcd', 0.123, 0.124); """ + + File path1 = new File(pyPath) + if (!path1.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! """) + } + + sql """ CREATE FUNCTION asser_lessthan(double, double) RETURNS string PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="assert_lessthan_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT asser_lessthan(col_1, col_2) as a FROM test_pythonudf_assertlessthan ORDER BY a; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS asser_lessthan(double, double); ") + try_sql("DROP TABLE IF EXISTS test_pythonudf_assertlessthan") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_aggregate.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_aggregate.groovy new file mode 100644 index 00000000000000..a24e487601709f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_aggregate.groovy @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudf_aggregate") { + def runtime_version = "3.8.10" + + try { + // Test 1: Create simple aggregate function (although Python UDF is mainly for scalar functions) + // Test using Python UDF in aggregate queries + sql """ DROP FUNCTION IF EXISTS py_score_grade(DOUBLE); """ + sql """ + CREATE FUNCTION py_score_grade(DOUBLE) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(score): + if score is None: + return None + if score >= 90: + return 'A' + elif score >= 80: + return 'B' + elif score >= 70: + return 'C' + elif score >= 60: + return 'D' + else: + return 'F' +\$\$; + """ + + // Create test table + sql """ DROP TABLE IF EXISTS student_scores; """ + sql """ + CREATE TABLE student_scores ( + student_id INT, + student_name STRING, + subject STRING, + score DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(student_id) + DISTRIBUTED BY HASH(student_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO student_scores VALUES + (1, 'Alice', 'Math', 95.0), + (1, 'Alice', 'English', 88.0), + (1, 'Alice', 'Science', 92.0), + (2, 'Bob', 'Math', 78.0), + (2, 'Bob', 'English', 85.0), + (2, 'Bob', 'Science', 80.0), + (3, 'Charlie', 'Math', 65.0), + (3, 'Charlie', 'English', 70.0), + (3, 'Charlie', 'Science', 68.0), + (4, 'David', 'Math', 55.0), + (4, 'David', 'English', 60.0), + (4, 'David', 'Science', 58.0); + """ + + // Test using UDF in SELECT + qt_select_grades """ + SELECT + student_id, + student_name, + subject, + score, + py_score_grade(score) AS grade + FROM student_scores + ORDER BY student_id, subject; + """ + + // Test using UDF in GROUP BY + qt_select_group_by_grade """ + SELECT + py_score_grade(score) AS grade, + COUNT(*) AS count, + AVG(score) AS avg_score + FROM student_scores + GROUP BY py_score_grade(score) + ORDER BY grade; + """ + + // Test using UDF in aggregate functions + qt_select_aggregate_with_udf """ + SELECT + student_id, + student_name, + AVG(score) AS avg_score, + py_score_grade(AVG(score)) AS avg_grade + FROM student_scores + GROUP BY student_id, student_name + ORDER BY student_id; + """ + + // Test 2: Create classification function for aggregate analysis + sql """ DROP FUNCTION IF EXISTS py_age_group(INT); """ + sql """ + CREATE FUNCTION py_age_group(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(age): + if age is None: + return None + if age < 18: + return 'Minor' + elif age < 30: + return 'Young Adult' + elif age < 50: + return 'Adult' + else: + return 'Senior' +\$\$; + """ + + sql """ DROP TABLE IF EXISTS users; """ + sql """ + CREATE TABLE users ( + user_id INT, + name STRING, + age INT, + salary DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(user_id) + DISTRIBUTED BY HASH(user_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO users VALUES + (1, 'User1', 16, 0), + (2, 'User2', 25, 50000), + (3, 'User3', 35, 80000), + (4, 'User4', 55, 100000), + (5, 'User5', 28, 60000), + (6, 'User6', 45, 90000), + (7, 'User7', 22, 45000), + (8, 'User8', 60, 110000); + """ + + qt_select_age_group_aggregate """ + SELECT + py_age_group(age) AS age_group, + COUNT(*) AS user_count, + AVG(salary) AS avg_salary, + MAX(salary) AS max_salary, + MIN(salary) AS min_salary + FROM users + GROUP BY py_age_group(age) + ORDER BY age_group; + """ + + // Test 3: Use UDF in HAVING clause + qt_select_having_with_udf """ + SELECT + 
student_id, + student_name, + AVG(score) AS avg_score + FROM student_scores + GROUP BY student_id, student_name + HAVING py_score_grade(AVG(score)) IN ('A', 'B') + ORDER BY student_id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_score_grade(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_age_group(INT);") + try_sql("DROP TABLE IF EXISTS student_scores;") + try_sql("DROP TABLE IF EXISTS users;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_always_nullable.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_always_nullable.groovy new file mode 100644 index 00000000000000..6936de8dd53dbb --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_always_nullable.groovy @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_always_nullable") { + // Test different configurations of always_nullable parameter + + def runtime_version = "3.8.10" + try { + // Test 1: always_nullable = true (default value) + sql """ DROP FUNCTION IF EXISTS py_nullable_true(INT); """ + sql """ + CREATE FUNCTION py_nullable_true(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + if x < 0: + return None + return x * 2 +\$\$; + """ + + qt_select_nullable_true_normal """ SELECT py_nullable_true(10) AS result; """ + qt_select_nullable_true_null """ SELECT py_nullable_true(NULL) AS result; """ + qt_select_nullable_true_negative """ SELECT py_nullable_true(-5) AS result; """ + + // Test 2: always_nullable = false + sql """ DROP FUNCTION IF EXISTS py_nullable_false(INT); """ + sql """ + CREATE FUNCTION py_nullable_false(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return 0 + return x * 2 +\$\$; + """ + + qt_select_nullable_false_normal """ SELECT py_nullable_false(10) AS result; """ + qt_select_nullable_false_null """ SELECT py_nullable_false(NULL) AS result; """ + + // Test 3: always_nullable = false but function returns None + // This tests the edge case where the function violates the always_nullable contract + sql """ DROP FUNCTION IF EXISTS py_nullable_false_returns_none(INT); """ + sql """ + CREATE FUNCTION py_nullable_false_returns_none(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "false", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x < 0: + return None # Returns None even though always_nullable is false + return x * 2 +\$\$; + """ + + 
qt_select_nullable_false_returns_none_normal """ SELECT py_nullable_false_returns_none(10) AS result; """ + + test { + sql """ SELECT py_nullable_false_returns_none(-5) AS result; """ + exception "but the return type is not nullable, please check the always_nullable property in create function statement, it should be true" + } + + // Test 4: Test nullable behavior on table data + sql """ DROP TABLE IF EXISTS nullable_test_table; """ + sql """ + CREATE TABLE nullable_test_table ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO nullable_test_table VALUES + (1, 10), + (2, NULL), + (3, -5), + (4, 0), + (5, 100); + """ + + qt_select_table_nullable_true """ + SELECT + id, + value, + py_nullable_true(value) AS result + FROM nullable_test_table + ORDER BY id; + """ + + qt_select_table_nullable_false """ + SELECT + id, + value, + py_nullable_false(value) AS result + FROM nullable_test_table + ORDER BY id; + """ + + test { + sql """ + SELECT + id, + value, + py_nullable_false_returns_none(value) AS result + FROM nullable_test_table + ORDER BY id; + """ + exception "'<' not supported between instances of 'NoneType' and 'int'" + } + + // Test 5: Nullable test for string type + sql """ DROP FUNCTION IF EXISTS py_string_nullable(STRING); """ + sql """ + CREATE FUNCTION py_string_nullable(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s): + if s is None or s == "": + return None + return s.upper() +\$\$; + """ + + qt_select_string_nullable """ SELECT py_string_nullable('hello') AS result; """ + qt_select_string_nullable_null """ SELECT py_string_nullable(NULL) AS result; """ + qt_select_string_nullable_empty """ SELECT py_string_nullable('') AS result; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_nullable_true(INT);") + try_sql("DROP FUNCTION IF EXISTS py_nullable_false(INT);") + try_sql("DROP FUNCTION IF EXISTS py_nullable_false_returns_none(INT);") + try_sql("DROP FUNCTION IF EXISTS py_string_nullable(STRING);") + try_sql("DROP TABLE IF EXISTS nullable_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_array.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_array.groovy new file mode 100644 index 00000000000000..503d49ef2e805c --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_array.groovy @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
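Note: the suites in this directory resolve symbols such as array_int_test.evaluate from the pyudf.zip shipped under udf_scripts/, and that archive is not part of this diff. As a hedged sketch of the assumed layout, each "module.function" symbol maps to a top-level .py file in the zip; the module below is hypothetical, inferred only from the signature registered in the suite that follows ((array<int>) RETURNS int, always_nullable):

    # array_int_test.py -- hypothetical contents, not the actual zip module.
    def evaluate(arr):
        if arr is None:
            return None
        # Some int-valued reduction over the list; a sum is one plausible choice.
        return sum(x for x in arr if x is not None)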
+ +suite("test_pythonudf_array") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_array """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_array ( + `user_id` INT NOT NULL COMMENT "", + `tinyint_col` TINYINT NOT NULL COMMENT "", + `string_col` STRING NOT NULL COMMENT "" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + StringBuilder sb = new StringBuilder() + int i = 1 + for (; i < 10; i ++) { + sb.append(""" + (${i},${i}*2,'a${i}b'), + """) + } + sb.append(""" + (${i},${i}*2,'a${i}b') + """) + sql """ INSERT INTO test_pythonudf_array VALUES + ${sb.toString()} + """ + qt_select_default """ SELECT * FROM test_pythonudf_array t ORDER BY user_id; """ + + File path = new File(pyPath) + if (!path.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! """) + } + + sql """ DROP FUNCTION IF EXISTS python_udf_array_int_test(array); """ + sql """ CREATE FUNCTION python_udf_array_int_test(array) RETURNS int PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="array_int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + qt_select_1 """ SELECT python_udf_array_int_test(array(user_id)) result FROM test_pythonudf_array ORDER BY result; """ + qt_select_2 """ SELECT python_udf_array_int_test(null) result ; """ + + + sql """ DROP FUNCTION IF EXISTS python_udf_array_return_int_test(array); """ + sql """ CREATE FUNCTION python_udf_array_return_int_test(array) RETURNS array PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="array_return_array_int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + qt_select_3 """ SELECT python_udf_array_return_int_test(array(user_id)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """ + qt_select_4 """ SELECT python_udf_array_return_int_test(array(user_id,user_id)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """ + qt_select_5 """ SELECT python_udf_array_return_int_test(null) result ; """ + + + sql """ DROP FUNCTION IF EXISTS python_udf_array_return_string_test(array); """ + sql """ CREATE FUNCTION python_udf_array_return_string_test(array) RETURNS array PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="array_return_array_string_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + qt_select_6 """ SELECT python_udf_array_return_string_test(array(string_col)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """ + qt_select_7 """ SELECT python_udf_array_return_string_test(array(string_col, cast(user_id as string))), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """ + qt_select_8 """ SELECT python_udf_array_return_string_test(null) result ; """ + + sql """ DROP FUNCTION IF EXISTS python_udf_array_string_test(array); """ + sql """ CREATE FUNCTION python_udf_array_string_test(array) RETURNS string PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="array_string_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + qt_select_9 """ SELECT python_udf_array_string_test(array(string_col)), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """ + qt_select_10 """ SELECT 
python_udf_array_string_test(array(string_col, cast(user_id as string))), tinyint_col as result FROM test_pythonudf_array ORDER BY result; """ + qt_select_11 """ SELECT python_udf_array_string_test(null) result ; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS python_udf_array_int_test(array);") + try_sql("DROP FUNCTION IF EXISTS python_udf_array_return_int_test(array);") + try_sql("DROP FUNCTION IF EXISTS python_udf_array_return_string_test(array);") + try_sql("DROP FUNCTION IF EXISTS python_udf_array_string_test(array);") + try_sql("DROP TABLE IF EXISTS test_pythonudf_array") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_base_data_type.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_base_data_type.groovy new file mode 100644 index 00000000000000..172e6271497dd0 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_base_data_type.groovy @@ -0,0 +1,323 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_base_data_type") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + + // TEST INLINE CASE + try { + sql """ + DROP FUNCTION IF EXISTS row_to_csv_all( + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + LARGEINT, + FLOAT, + DOUBLE, + DECIMAL, + DECIMAL, + DECIMAL, + DATE, + DATETIME, + CHAR, + VARCHAR, + STRING + ); + """ + sql """ +CREATE FUNCTION row_to_csv_all( + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + LARGEINT, + FLOAT, + DOUBLE, + DECIMAL, + DECIMAL, + DECIMAL, + DATE, + DATETIME, + CHAR, + VARCHAR, + STRING +) +RETURNS STRING +PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "row_to_csv_all_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" +) +AS \$\$ +def row_to_csv_all_impl( + bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col, + float_col, double_col, decimal32_col, decimal64_col, decimal128_col, + date_col, datetime_col, char_col, varchar_col, string_col +): + cols = [ + bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col, + float_col, double_col, decimal32_col, decimal64_col, decimal128_col, + date_col, datetime_col, char_col, varchar_col, string_col + ] + + def safe_str(x): + return 'NULL' if x is None else str(x) + + return ','.join(safe_str(col) for col in cols) +\$\$; + """ + sql """ DROP TABLE IF EXISTS test_datatype_table; """ + sql """ + CREATE TABLE test_datatype_table ( + id INT, + bool_value BOOLEAN, + tinyint_value TINYINT, + smallint_value SMALLINT, + int_value INT, + bigint_value BIGINT, + largeint_value LARGEINT, + float_value float, + double_value DOUBLE, + decimal32_value DECIMAL(8, 2), + decimal64_value DECIMAL(16, 
2), + decimal128_value DECIMAL(32, 8), + -- decimal256_value DECIMAL(64, 10), + date_value DATE, + datetime_value DATETIME, + char_value CHAR(100), + varchar_value VARCHAR(100), + string_value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_datatype_table VALUES + (1, TRUE, 127, 32767, 2147483647, 9223372036854775807, 170141183460469231731687303715884105727, + 1.23, 4.56789, 123456.78, 12345678901.2345, 123456789012345678901.234567890, + '2023-01-01', '2023-01-01 12:34:56', 'char_data_1', 'varchar_data_1', 'string_data_1'), + + (2, FALSE, -128, -32768, -2147483648, -9223372036854775808, -170141183460469231731687303715884105728, + -2.34, -5.6789, -987654.32, -98765432.109876543, -987654321098765432.10987654321, + '2024-05-15', '2024-05-15 08:22:10', 'char_data_2', 'varchar_data_2', 'string_data_2'), + + (3, TRUE, 0, 0, 0, 0, 0, + 0.0, 0.0, 0.00, 0.00, 0.00000000, + '2025-10-15', '2025-10-15 00:00:00', 'char_zero', 'varchar_zero', 'string_zero'), + + (4, FALSE, 100, 20000, 300000000, 4000000000000000000, 99999999999999999999999999999999999999, + 3.14, 2.71828, 999999.99, 99999999999999.99, 99999999999999999999999.999999999999999, + '2022-12-31', '2022-12-31 23:59:59', 'char_max', 'varchar_max', 'string_max'), + + (5, TRUE, -50, -10000, -100000000, -5000000000000000000, -99999999999999999999999999999999999999, + -1.41, -0.57721, -0.01, -0.01, -0.000000001, + '2021-07-04', '2021-07-04 14:30:00', 'char_neg', 'varchar_neg', 'string_neg'); + """ + + qt_select_1 """ + SELECT row_to_csv_all( + bool_value, + tinyint_value, + smallint_value, + int_value, + bigint_value, + largeint_value, + float_value, + double_value, + decimal32_value, + decimal64_value, + decimal128_value, + date_value, + datetime_value, + char_value, + varchar_value, + string_value + ) AS csv_row + FROM test_datatype_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS row_to_csv_all( + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + LARGEINT, + FLOAT, + DOUBLE, + DECIMAL, + DECIMAL, + DECIMAL, + DATE, + DATETIME, + CHAR, + VARCHAR, + STRING + );""") + try_sql("DROP TABLE IF EXISTS test_datatype_table;") + } + + // TEST MODULE CASE + try { + sql """ + DROP FUNCTION IF EXISTS row_to_csv_all( + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + LARGEINT, + FLOAT, + DOUBLE, + DECIMAL, + DECIMAL, + DECIMAL, + DATE, + DATETIME, + CHAR, + VARCHAR, + STRING + ); + """ + sql """ + CREATE FUNCTION row_to_csv_all( + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + LARGEINT, + FLOAT, + DOUBLE, + DECIMAL, + DECIMAL, + DECIMAL, + DATE, + DATETIME, + CHAR, + VARCHAR, + STRING + ) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_data_type.row_to_csv_all_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + sql """ DROP TABLE IF EXISTS test_datatype_table; """ + sql """ + CREATE TABLE test_datatype_table ( + id INT, + bool_value BOOLEAN, + tinyint_value TINYINT, + smallint_value SMALLINT, + int_value INT, + bigint_value BIGINT, + largeint_value LARGEINT, + float_value float, + double_value DOUBLE, + decimal32_value DECIMAL(8, 2), + decimal64_value DECIMAL(16, 2), + decimal128_value DECIMAL(32, 8), + -- decimal256_value DECIMAL(64, 10), + date_value DATE, + datetime_value DATETIME, + char_value CHAR(100), + varchar_value VARCHAR(100), + string_value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 
+ PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_datatype_table VALUES + (1, TRUE, 127, 32767, 2147483647, 9223372036854775807, 170141183460469231731687303715884105727, + 1.23, 4.56789, 123456.78, 12345678901.2345, 123456789012345678901.234567890, + '2023-01-01', '2023-01-01 12:34:56', 'char_data_1', 'varchar_data_1', 'string_data_1'), + + (2, FALSE, -128, -32768, -2147483648, -9223372036854775808, -170141183460469231731687303715884105728, + -2.34, -5.6789, -987654.32, -98765432.109876543, -987654321098765432.10987654321, + '2024-05-15', '2024-05-15 08:22:10', 'char_data_2', 'varchar_data_2', 'string_data_2'), + + (3, TRUE, 0, 0, 0, 0, 0, + 0.0, 0.0, 0.00, 0.00, 0.00000000, + '2025-10-15', '2025-10-15 00:00:00', 'char_zero', 'varchar_zero', 'string_zero'), + + (4, FALSE, 100, 20000, 300000000, 4000000000000000000, 99999999999999999999999999999999999999, + 3.14, 2.71828, 999999.99, 99999999999999.99, 99999999999999999999999.999999999999999, + '2022-12-31', '2022-12-31 23:59:59', 'char_max', 'varchar_max', 'string_max'), + + (5, TRUE, -50, -10000, -100000000, -5000000000000000000, -99999999999999999999999999999999999999, + -1.41, -0.57721, -0.01, -0.01, -0.000000001, + '2021-07-04', '2021-07-04 14:30:00', 'char_neg', 'varchar_neg', 'string_neg'); + """ + + qt_select_2 """ + SELECT row_to_csv_all( + bool_value, + tinyint_value, + smallint_value, + int_value, + bigint_value, + largeint_value, + float_value, + double_value, + decimal32_value, + decimal64_value, + decimal128_value, + date_value, + datetime_value, + char_value, + varchar_value, + string_value + ) AS csv_row + FROM test_datatype_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS row_to_csv_all( + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + LARGEINT, + FLOAT, + DOUBLE, + DECIMAL, + DECIMAL, + DECIMAL, + DATE, + DATETIME, + CHAR, + VARCHAR, + STRING + );""") + try_sql("DROP TABLE IF EXISTS test_datatype_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_boolean.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_boolean.groovy new file mode 100644 index 00000000000000..ef3065418a9e64 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_boolean.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudf_boolean") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_boolean """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_boolean ( + `user_id` INT NOT NULL COMMENT "", + `boo_1` BOOLEAN NOT NULL COMMENT "" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + + sql """ INSERT INTO test_pythonudf_boolean (`user_id`,`boo_1`) VALUES + (111,true), + (112,false), + (113,0), + (114,1) + """ + qt_select_default """ SELECT * FROM test_pythonudf_boolean t ORDER BY user_id; """ + + File path1 = new File(pyPath) + if (!path1.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! """) + } + + sql """ CREATE FUNCTION python_udf_boolean_test(BOOLEAN) RETURNS BOOLEAN PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="boolean_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT python_udf_boolean_test(1) as result; """ + qt_select """ SELECT python_udf_boolean_test(0) as result ; """ + qt_select """ SELECT python_udf_boolean_test(true) as result ; """ + qt_select """ SELECT python_udf_boolean_test(false) as result ; """ + qt_select """ SELECT python_udf_boolean_test(null) as result ; """ + qt_select """ SELECT user_id,python_udf_boolean_test(boo_1) as result FROM test_pythonudf_boolean order by user_id; """ + + + + } finally { + try_sql("DROP FUNCTION IF EXISTS python_udf_boolean_test(BOOLEAN);") + try_sql("DROP TABLE IF EXISTS test_pythonudf_boolean") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_complex_data_type.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_complex_data_type.groovy new file mode 100644 index 00000000000000..886d9fcb3c3ce7 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_complex_data_type.groovy @@ -0,0 +1,408 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudf_complex_data_type") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + + // TEST ARRAY INLINE CASE + try { + sql """ + DROP FUNCTION IF EXISTS array_to_csv( + ARRAY, + ARRAY, + ARRAY> + ); + """ + sql """ +CREATE FUNCTION array_to_csv( + ARRAY, + ARRAY, + ARRAY> +) +RETURNS STRING +PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "array_to_csv_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" +) +AS \$\$ +def array_to_csv_impl(int_arr, str_arr, nested_arr): + def safe_str(x): + return 'NULL' if x is None else str(x) + + def format_array(arr): + if arr is None: + return 'NULL' + return '[' + ','.join(safe_str(item) for item in arr) + ']' + + def format_nested_array(arr): + if arr is None: + return 'NULL' + return '[' + ','.join(format_array(inner) for inner in arr) + ']' + + parts = [ + format_array(int_arr), + format_array(str_arr), + format_nested_array(nested_arr) + ] + return '|'.join(parts) +\$\$; + """ + sql """ DROP TABLE IF EXISTS test_array_table; """ + sql """ + CREATE TABLE test_array_table ( + id INT, + int_array ARRAY, + string_array ARRAY, + nested_array ARRAY> + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_array_table VALUES + (1, [1, 2, 3], ['a', 'b', 'c'], [[1,2], [3,4]]), + (2, [], [], []), + (3, NULL, ['x', NULL, 'z'], NULL), + (4, [0, -1, 2147483647], ['hello', 'world'], [[], [1]]); + """ + + qt_select_1 """ + SELECT array_to_csv(int_array, string_array, nested_array) AS result FROM test_array_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS array_to_csv( + ARRAY, + ARRAY, + ARRAY> + );""") + try_sql("DROP TABLE IF EXISTS test_array_table;") + } + + // TEST ARRAY MODULE CASE + try { + sql """ + DROP FUNCTION IF EXISTS array_to_csv( + ARRAY, + ARRAY, + ARRAY> + ); + """ + sql """ + CREATE FUNCTION array_to_csv( + ARRAY, + ARRAY, + ARRAY> + ) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file"="file://${pyPath}", + "symbol" = "python_udf_array_type.array_to_csv_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + sql """ DROP TABLE IF EXISTS test_array_table; """ + sql """ + CREATE TABLE test_array_table ( + id INT, + int_array ARRAY, + string_array ARRAY, + nested_array ARRAY> + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_array_table VALUES + (1, [1, 2, 3], ['a', 'b', 'c'], [[1,2], [3,4]]), + (2, [], [], []), + (3, NULL, ['x', NULL, 'z'], NULL), + (4, [0, -1, 2147483647], ['hello', 'world'], [[], [1]]); + """ + + + qt_select_2 """ + SELECT array_to_csv(int_array, string_array, nested_array) AS result FROM test_array_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS array_to_csv( + ARRAY, + ARRAY, + ARRAY> + );""") + try_sql("DROP TABLE IF EXISTS test_array_table;") + } + + // TEST MAP INLINE CASE + try { + sql """ + DROP FUNCTION IF EXISTS map_to_csv( + MAP, + MAP + ); + """ + sql """ +CREATE FUNCTION map_to_csv( + MAP, + MAP +) +RETURNS STRING +PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "map_to_csv_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" +) +AS \$\$ +def map_to_csv_impl(map1, map2): + def safe_str(x): + return 'NULL' if x is None else str(x) + + def 
format_map(m): + if m is None: + return 'NULL' + # Doris passes MAP as Python dict + items = [f"{safe_str(k)}:{safe_str(v)}" for k, v in m.items()] + return '{' + ','.join(sorted(items)) + '}' + + return '|'.join([format_map(map1), format_map(map2)]) +\$\$; + """ + sql """ DROP TABLE IF EXISTS test_map_table; """ + sql """ + CREATE TABLE test_map_table ( + id INT, + int_string_map MAP, + string_double_map MAP + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_map_table VALUES + (1, {1:'one', 2:'two'}, {'pi':3.14, 'e':2.718}), + (2, {}, {}), + (3, NULL, {'null_key': NULL}), + (4, {0:'zero', -1:'minus_one'}, {'max':1.79769e308}); + """ + + qt_select_3 """ + SELECT map_to_csv(int_string_map, string_double_map) AS result FROM test_map_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS map_to_csv( + MAP, + MAP + );""") + try_sql("DROP TABLE IF EXISTS test_map_table;") + } + + // TEST MAP MODULE CASE + try { + sql """ + DROP FUNCTION IF EXISTS map_to_csv( + MAP, + MAP + ); + """ + sql """ + CREATE FUNCTION map_to_csv( + MAP, + MAP + ) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file"="file://${pyPath}", + "symbol" = "python_udf_map_type.map_to_csv_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + sql """ DROP TABLE IF EXISTS test_map_table; """ + sql """ + CREATE TABLE test_map_table ( + id INT, + int_string_map MAP, + string_double_map MAP + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_map_table VALUES + (1, {1:'one', 2:'two'}, {'pi':3.14, 'e':2.718}), + (2, {}, {}), + (3, NULL, {'null_key': NULL}), + (4, {0:'zero', -1:'minus_one'}, {'max':1.79769e308}); + """ + + qt_select_4 """ + SELECT map_to_csv(int_string_map, string_double_map) AS result FROM test_map_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS map_to_csv( + MAP, + MAP + );""") + try_sql("DROP TABLE IF EXISTS test_map_table;") + } + + // TEST STRUCT INLINE CASE + try { + sql """ + DROP FUNCTION IF EXISTS struct_to_csv( + STRUCT, + STRUCT> + ); + """ + sql """ +CREATE FUNCTION struct_to_csv( + STRUCT, + STRUCT> +) +RETURNS STRING +PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "struct_to_csv_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" +) +AS \$\$ +def struct_to_csv_impl(person, point): + def safe_str(x): + return 'NULL' if x is None else str(x) + + def format_array(arr): + if arr is None: + return 'NULL' + return '[' + ','.join(safe_str(item) for item in arr) + ']' + + def format_struct_dict(s, field_names): + if s is None: + return 'NULL' + parts = [] + for field in field_names: + val = s.get(field) + parts.append(safe_str(val)) + return '(' + ','.join(parts) + ')' + + person_str = format_struct_dict(person, ['name', 'age', 'salary']) + + if point is None: + point_str = 'NULL' + else: + x_val = safe_str(point.get('x')) + y_val = safe_str(point.get('y')) + tags_val = format_array(point.get('tags')) + point_str = f"({x_val},{y_val},{tags_val})" + + return '|'.join([person_str, point_str]) +\$\$; + """ + sql """ DROP TABLE IF EXISTS test_struct_table; """ + sql """ + CREATE TABLE test_struct_table( + id INT, + person STRUCT, + point STRUCT> + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_struct_table VALUES + (1, {'Alice', 
30, 75000.50}, {1.5, 2.5, ['red', 'blue']}), + (2, {NULL, NULL, NULL}, {0.0, 0.0, []}), + (3, {'Bob', 25, 60000.00}, {NULL, 3.14, ['tag1', NULL, 'tag3']}), + (4, {'', 0, 0.0}, {-1.0, -2.0, NULL}); + """ + + qt_select_5 """ + SELECT struct_to_csv(person, point) AS result FROM test_struct_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS struct_to_csv( + STRUCT, + STRUCT> + );""") + try_sql("DROP TABLE IF EXISTS test_struct_table;") + } + + // TEST STRUCT MODULE CASE + try { + sql """ + DROP FUNCTION IF EXISTS struct_to_csv( + STRUCT, + STRUCT> + ); + """ + sql """ + CREATE FUNCTION struct_to_csv( + STRUCT, + STRUCT> + ) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file"="file://${pyPath}", + "symbol" = "python_udf_struct_type.struct_to_csv_impl", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); + """ + sql """ DROP TABLE IF EXISTS test_struct_table; """ + sql """ + CREATE TABLE test_struct_table( + id INT, + person STRUCT, + point STRUCT> + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO test_struct_table VALUES + (1, {'Alice', 30, 75000.50}, {1.5, 2.5, ['red', 'blue']}), + (2, {NULL, NULL, NULL}, {0.0, 0.0, []}), + (3, {'Bob', 25, 60000.00}, {NULL, 3.14, ['tag1', NULL, 'tag3']}), + (4, {'', 0, 0.0}, {-1.0, -2.0, NULL}); + """ + + qt_select_6 """ + SELECT struct_to_csv(person, point) AS result FROM test_struct_table; + """ + } finally { + try_sql("""DROP FUNCTION IF EXISTS struct_to_csv( + STRUCT, + STRUCT> + );""") + try_sql("DROP TABLE IF EXISTS test_struct_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_concurrent.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_concurrent.groovy new file mode 100644 index 00000000000000..cd683ecffb7414 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_concurrent.groovy @@ -0,0 +1,345 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudf_concurrent") { + // Test multiple Python UDFs executing concurrently in the same SQL query + + def runtime_version = "3.8.10" + + try { + // Create test table + sql """ DROP TABLE IF EXISTS concurrent_udf_test; """ + sql """ + CREATE TABLE concurrent_udf_test ( + id INT, + value1 INT, + value2 INT, + value3 DOUBLE, + value4 DOUBLE, + str1 STRING, + str2 STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO concurrent_udf_test VALUES + (1, 10, 20, 1.5, 2.5, 'hello', 'world'), + (2, 30, 40, 3.5, 4.5, 'foo', 'bar'), + (3, 50, 60, 5.5, 6.5, 'test', 'case'), + (4, 70, 80, 7.5, 8.5, 'python', 'udf'), + (5, 90, 100, 9.5, 10.5, 'doris', 'db'); + """ + + // Create multiple scalar UDFs with different operations + + // UDF 1: Integer addition + sql """ DROP FUNCTION IF EXISTS py_add_int(INT, INT); """ + sql """ + CREATE FUNCTION py_add_int(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "add_int", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def add_int(a, b): + if a is None or b is None: + return None + return a + b +\$\$; + """ + + // UDF 2: Integer multiplication + sql """ DROP FUNCTION IF EXISTS py_multiply_int(INT, INT); """ + sql """ + CREATE FUNCTION py_multiply_int(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "multiply_int", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def multiply_int(a, b): + if a is None or b is None: + return None + return a * b +\$\$; + """ + + // UDF 3: Double division + sql """ DROP FUNCTION IF EXISTS py_divide_double(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_divide_double(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "divide_double", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def divide_double(a, b): + if a is None or b is None or b == 0: + return None + return a / b +\$\$; + """ + + // UDF 4: String concatenation + sql """ DROP FUNCTION IF EXISTS py_concat_str(STRING, STRING); """ + sql """ + CREATE FUNCTION py_concat_str(STRING, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "concat_str", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def concat_str(s1, s2): + if s1 is None or s2 is None: + return None + return s1 + '_' + s2 +\$\$; + """ + + // UDF 5: String length + sql """ DROP FUNCTION IF EXISTS py_str_len(STRING); """ + sql """ + CREATE FUNCTION py_str_len(STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "str_len", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def str_len(s): + if s is None: + return None + return len(s) +\$\$; + """ + + // Test 1: Multiple scalar UDFs in SELECT clause + qt_concurrent_scalar_1 """ + SELECT + id, + py_add_int(value1, value2) AS add_result, + py_multiply_int(value1, value2) AS multiply_result, + py_divide_double(value3, value4) AS divide_result, + py_concat_str(str1, str2) AS concat_result, + py_str_len(str1) AS len_result + FROM concurrent_udf_test + ORDER BY id; + """ + + // Test 2: Multiple scalar UDFs with nested calls + qt_concurrent_scalar_2 """ + SELECT + id, + py_add_int(py_multiply_int(value1, 2), value2) AS nested_result1, + py_str_len(py_concat_str(str1, str2)) AS nested_result2 + FROM concurrent_udf_test + ORDER BY id; + """ 
+ + // Test 3: Multiple scalar UDFs in WHERE clause + qt_concurrent_scalar_3 """ + SELECT + id, + value1, + value2 + FROM concurrent_udf_test + WHERE py_add_int(value1, value2) > 50 + AND py_str_len(str1) > 3 + ORDER BY id; + """ + + // Test 4: Multiple vectorized UDFs (using pandas) + sql """ DROP FUNCTION IF EXISTS py_vec_add(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_add(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "vec_add", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def vec_add(a: pd.Series, b: pd.Series) -> pd.Series: + return a + b +\$\$; + """ + + sql """ DROP FUNCTION IF EXISTS py_vec_sub(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_sub(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "vec_sub", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def vec_sub(a: pd.Series, b: pd.Series) -> pd.Series: + return a - b +\$\$; + """ + + sql """ DROP FUNCTION IF EXISTS py_vec_max(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_max(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "vec_max", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd +import numpy as np + +def vec_max(a: pd.Series, b: pd.Series) -> pd.Series: + return pd.Series(np.maximum(a, b)) +\$\$; + """ + + // Test 5: Multiple vectorized UDFs in the same query + qt_concurrent_vector_1 """ + SELECT + id, + py_vec_add(value1, value2) AS vec_add_result, + py_vec_sub(value2, value1) AS vec_sub_result, + py_vec_max(value1, value2) AS vec_max_result + FROM concurrent_udf_test + ORDER BY id; + """ + + // Test 6: Mix of scalar and vectorized UDFs + qt_concurrent_mixed_1 """ + SELECT + id, + py_add_int(value1, value2) AS scalar_add, + py_vec_add(value1, value2) AS vector_add, + py_concat_str(str1, str2) AS scalar_concat, + py_multiply_int(value1, 10) AS scalar_mul + FROM concurrent_udf_test + ORDER BY id; + """ + + // Test 7: Multiple UDFs with aggregation + qt_concurrent_with_agg """ + SELECT + COUNT(*) AS total_count, + SUM(py_add_int(value1, value2)) AS sum_add, + AVG(py_multiply_int(value1, 2)) AS avg_mul, + MAX(py_str_len(str1)) AS max_len + FROM concurrent_udf_test; + """ + + // Test 8: Multiple UDFs with GROUP BY + qt_concurrent_group_by """ + SELECT + CASE WHEN id <= 3 THEN 'group1' ELSE 'group2' END AS grp, + COUNT(*) AS cnt, + SUM(py_add_int(value1, value2)) AS sum_result, + AVG(py_divide_double(value3, value4)) AS avg_result + FROM concurrent_udf_test + GROUP BY grp + ORDER BY grp; + """ + + // Test 9: Multiple UDFs with different parameter types + sql """ DROP FUNCTION IF EXISTS py_int_to_str(INT); """ + sql """ + CREATE FUNCTION py_int_to_str(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "int_to_str", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +def int_to_str(n): + if n is None: + return None + return str(n) +\$\$; + """ + + qt_concurrent_type_mix """ + SELECT + id, + py_add_int(value1, value2) AS int_result, + py_divide_double(value3, value4) AS double_result, + py_concat_str(py_int_to_str(value1), str1) AS mixed_result + FROM concurrent_udf_test + ORDER BY id; + """ + + // Test 10: Stress test - many UDFs in one query + qt_concurrent_stress """ + SELECT + id, + py_add_int(value1, value2) AS r1, + py_multiply_int(value1, value2) AS r2, + py_add_int(value1, 100) AS r3, + 
py_multiply_int(value2, 5) AS r4, + py_divide_double(value3, value4) AS r5, + py_divide_double(value4, value3) AS r6, + py_concat_str(str1, str2) AS r7, + py_str_len(str1) AS r8, + py_str_len(str2) AS r9, + py_int_to_str(value1) AS r10 + FROM concurrent_udf_test + ORDER BY id; + """ + + } finally { + // Cleanup + try_sql("DROP FUNCTION IF EXISTS py_add_int(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_multiply_int(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_divide_double(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_concat_str(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_str_len(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_add(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_sub(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_max(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_int_to_str(INT);") + try_sql("DROP TABLE IF EXISTS concurrent_udf_test;") + } +} + + diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_data_types.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_data_types.groovy new file mode 100644 index 00000000000000..15423e4704aa9e --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_data_types.groovy @@ -0,0 +1,189 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
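The data-type suite below only ever stringifies its DATE and DATETIME arguments via str(d), so it does not pin down the exact Python type the engine passes in. A defensive evaluate that works whether the value arrives as a datetime object or as a pre-formatted string (hypothetical pattern, not taken from the suite):

    import datetime

    def evaluate(d):
        if d is None:
            return None
        # isoformat() for real date/datetime objects, str() otherwise.
        if isinstance(d, (datetime.date, datetime.datetime)):
            return d.isoformat()
        return str(d)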
+ +suite("test_pythonudf_data_types") { + // Test various data types supported by Python UDF + def runtime_version = "3.8.10" + + try { + // Test 1: TINYINT type + sql """ DROP FUNCTION IF EXISTS py_tinyint_test(TINYINT); """ + sql """ + CREATE FUNCTION py_tinyint_test(TINYINT) + RETURNS TINYINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 1 +\$\$; + """ + + qt_select_tinyint """ SELECT py_tinyint_test(CAST(10 AS TINYINT)) AS result; """ + + // Test 2: SMALLINT type + sql """ DROP FUNCTION IF EXISTS py_smallint_test(SMALLINT); """ + sql """ + CREATE FUNCTION py_smallint_test(SMALLINT) + RETURNS SMALLINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x * 2 +\$\$; + """ + + qt_select_smallint """ SELECT py_smallint_test(CAST(1000 AS SMALLINT)) AS result; """ + + // Test 3: BIGINT type + sql """ DROP FUNCTION IF EXISTS py_bigint_test(BIGINT); """ + sql """ + CREATE FUNCTION py_bigint_test(BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x + 1000000 +\$\$; + """ + + qt_select_bigint """ SELECT py_bigint_test(1000000000000) AS result; """ + + // Test 4: DECIMAL type + sql """ DROP FUNCTION IF EXISTS py_decimal_test(DECIMAL(10,2)); """ + sql """ + CREATE FUNCTION py_decimal_test(DECIMAL(10,2)) + RETURNS DECIMAL(10,2) + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x * 1.1 +\$\$; + """ + + qt_select_decimal """ SELECT py_decimal_test(100.50) AS result; """ + + // Test 5: DATE type + sql """ DROP FUNCTION IF EXISTS py_date_test(DATE); """ + sql """ + CREATE FUNCTION py_date_test(DATE) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(d): + if d is None: + return None + return str(d) +\$\$; + """ + + qt_select_date """ SELECT py_date_test('2024-01-15') AS result; """ + + // Test 6: DATETIME type + sql """ DROP FUNCTION IF EXISTS py_datetime_test(DATETIME); """ + sql """ + CREATE FUNCTION py_datetime_test(DATETIME) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(dt): + if dt is None: + return None + return str(dt) +\$\$; + """ + + qt_select_datetime """ SELECT py_datetime_test('2024-01-15 10:30:45') AS result; """ + + // Test 7: Comprehensive test - create table and test multiple data types + sql """ DROP TABLE IF EXISTS data_types_test_table; """ + sql """ + CREATE TABLE data_types_test_table ( + id INT, + tiny_val TINYINT, + small_val SMALLINT, + int_val INT, + big_val BIGINT, + float_val FLOAT, + double_val DOUBLE, + decimal_val DECIMAL(10,2), + string_val STRING, + bool_val BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO data_types_test_table VALUES + (1, 10, 100, 1000, 10000, 1.5, 2.5, 100.50, 'test1', true), + (2, 20, 200, 2000, 20000, 2.5, 3.5, 200.75, 'test2', false), + (3, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); + """ + + 
qt_select_table_types """ + SELECT + id, + py_tinyint_test(tiny_val) AS tiny_result, + py_smallint_test(small_val) AS small_result, + py_bigint_test(big_val) AS big_result + FROM data_types_test_table + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_tinyint_test(TINYINT);") + try_sql("DROP FUNCTION IF EXISTS py_smallint_test(SMALLINT);") + try_sql("DROP FUNCTION IF EXISTS py_bigint_test(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS py_decimal_test(DECIMAL(10,2));") + try_sql("DROP FUNCTION IF EXISTS py_date_test(DATE);") + try_sql("DROP FUNCTION IF EXISTS py_datetime_test(DATETIME);") + try_sql("DROP TABLE IF EXISTS data_types_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_error_handling.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_error_handling.groovy new file mode 100644 index 00000000000000..c6969e8ac4d3d2 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_error_handling.groovy @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
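As the always_nullable suite showed, an exception that escapes a Python UDF (there, a TypeError from comparing None with an int) fails the whole query rather than yielding NULL. The error-handling suite below therefore converts every anticipated failure into a None return. A generic guard of that shape (hypothetical helper, not used by the tests themselves):

    def null_on_error(fn):
        # Wrap an evaluate-style function so any exception becomes NULL.
        def wrapped(*args):
            try:
                return fn(*args)
            except Exception:
                return None
        return wrapped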
+ +suite("test_pythonudf_error_handling") { + // Test error handling and exception cases for Python UDF + + def runtime_version = "3.8.10" + try { + // Test 1: Division by zero error handling + sql """ DROP FUNCTION IF EXISTS py_safe_divide(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_safe_divide(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(a, b): + if a is None or b is None: + return None + if b == 0: + return None + return a / b +\$\$; + """ + + qt_select_divide_normal """ SELECT py_safe_divide(10.0, 2.0) AS result; """ + qt_select_divide_zero """ SELECT py_safe_divide(10.0, 0.0) AS result; """ + qt_select_divide_null """ SELECT py_safe_divide(10.0, NULL) AS result; """ + + // Test 2: String index out of bounds handling + sql """ DROP FUNCTION IF EXISTS py_safe_substring(STRING, INT); """ + sql """ + CREATE FUNCTION py_safe_substring(STRING, INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s, index): + if s is None or index is None: + return None + if index < 0 or index >= len(s): + return None + return s[index] +\$\$; + """ + + qt_select_substring_valid """ SELECT py_safe_substring('hello', 1) AS result; """ + qt_select_substring_invalid """ SELECT py_safe_substring('hello', 10) AS result; """ + qt_select_substring_negative """ SELECT py_safe_substring('hello', -1) AS result; """ + + // Test 3: Type conversion error handling + sql """ DROP FUNCTION IF EXISTS py_safe_int_parse(STRING); """ + sql """ + CREATE FUNCTION py_safe_int_parse(STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s): + if s is None: + return None + try: + return int(s) + except (ValueError, TypeError): + return None +\$\$; + """ + + qt_select_parse_valid """ SELECT py_safe_int_parse('123') AS result; """ + qt_select_parse_invalid """ SELECT py_safe_int_parse('abc') AS result; """ + qt_select_parse_empty """ SELECT py_safe_int_parse('') AS result; """ + + // Test 4: Array out of bounds handling + sql """ DROP FUNCTION IF EXISTS py_safe_array_get(ARRAY, INT); """ + sql """ + CREATE FUNCTION py_safe_array_get(ARRAY, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(arr, index): + if arr is None or index is None: + return None + if index < 0 or index >= len(arr): + return None + return arr[index] +\$\$; + """ + + qt_select_array_valid """ SELECT py_safe_array_get([10, 20, 30], 1) AS result; """ + qt_select_array_invalid """ SELECT py_safe_array_get([10, 20, 30], 5) AS result; """ + + // Test 5: Test error handling on table data + sql """ DROP TABLE IF EXISTS error_handling_test_table; """ + sql """ + CREATE TABLE error_handling_test_table ( + id INT, + numerator DOUBLE, + denominator DOUBLE, + text STRING, + arr_index INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO error_handling_test_table VALUES + (1, 100.0, 10.0, '123', 0), + (2, 50.0, 0.0, 'abc', 1), + (3, NULL, 5.0, '', 2), + (4, 75.0, NULL, '456', -1), + (5, 25.0, 5.0, 'xyz', 10); + """ + + 
qt_select_table_error_handling """ + SELECT + id, + numerator, + denominator, + py_safe_divide(numerator, denominator) AS divide_result, + text, + py_safe_int_parse(text) AS parse_result + FROM error_handling_test_table + ORDER BY id; + """ + + // Test 6: Empty string handling + sql """ DROP FUNCTION IF EXISTS py_safe_length(STRING); """ + sql """ + CREATE FUNCTION py_safe_length(STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s): + if s is None: + return None + return len(s) +\$\$; + """ + + qt_select_length_normal """ SELECT py_safe_length('hello') AS result; """ + qt_select_length_empty """ SELECT py_safe_length('') AS result; """ + qt_select_length_null """ SELECT py_safe_length(NULL) AS result; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_safe_divide(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_safe_substring(STRING, INT);") + try_sql("DROP FUNCTION IF EXISTS py_safe_int_parse(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_safe_array_get(ARRAY, INT);") + try_sql("DROP FUNCTION IF EXISTS py_safe_length(STRING);") + try_sql("DROP TABLE IF EXISTS error_handling_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy new file mode 100644 index 00000000000000..4e1e2e3ec6f09d --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_file_protocol.groovy @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
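The file_protocol suite below ties the "file" and "symbol" properties together: "file" points at the archive via file://, and "symbol" is module.function within it. The layout this implies for pyudf.zip (the archive itself is not part of this diff, so this tree is an inference from the symbols used):

    pyudf.zip
    ├── int_test.py       # evaluate(int) -> int
    ├── string_test.py    # evaluate(str, int, int) -> str
    ├── float_test.py     # evaluate(float) -> float
    ├── boolean_test.py   # evaluate(bool) -> bool
    └── ...               # array_*.py and python_udf_*_type.py used by other suites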
+ +suite("test_pythonudf_file_protocol") { + // Test loading Python UDF from zip package using file:// protocol + + def zipPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(zipPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${zipPath}".toString()) + + try { + // Test 1: Load int_test.py from zip package using file:// protocol + sql """ DROP FUNCTION IF EXISTS py_file_int_add(INT); """ + sql """ + CREATE FUNCTION py_file_int_add(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipPath}", + "symbol" = "int_test.evaluate", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_select_file_int """ SELECT py_file_int_add(99) AS result; """ + + // Test 2: Load string_test.py from zip package using file:// protocol + sql """ DROP FUNCTION IF EXISTS py_file_string_mask(STRING, INT, INT); """ + sql """ + CREATE FUNCTION py_file_string_mask(STRING, INT, INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipPath}", + "symbol" = "string_test.evaluate", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_select_file_string """ SELECT py_file_string_mask('1234567890', 3, 3) AS result; """ + + // Test 3: Load float_test.py from zip package using file:// protocol + sql """ DROP FUNCTION IF EXISTS py_file_float_process(FLOAT); """ + sql """ + CREATE FUNCTION py_file_float_process(FLOAT) + RETURNS FLOAT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipPath}", + "symbol" = "float_test.evaluate", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_select_file_float """ SELECT py_file_float_process(3.14) AS result; """ + + // Test 4: Load boolean_test.py from zip package using file:// protocol + sql """ DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN); """ + sql """ + CREATE FUNCTION py_file_bool_not(BOOLEAN) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipPath}", + "symbol" = "boolean_test.evaluate", + "runtime_version" = "${runtime_version}" + ); + """ + + qt_select_file_bool_true """ SELECT py_file_bool_not(true) AS result; """ + qt_select_file_bool_false """ SELECT py_file_bool_not(false) AS result; """ + + // Test 5: Test UDF with file:// protocol on table data + sql """ DROP TABLE IF EXISTS file_protocol_test_table; """ + sql """ + CREATE TABLE file_protocol_test_table ( + id INT, + num INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO file_protocol_test_table VALUES + (1, 10, 'hello'), + (2, 20, 'world'), + (3, 30, 'python'), + (4, 40, 'doris'); + """ + + qt_select_table_file """ + SELECT + id, + num, + py_file_int_add(num) AS num_result, + text, + py_file_string_mask(text, 1, 1) AS text_result + FROM file_protocol_test_table + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_file_int_add(INT);") + try_sql("DROP FUNCTION IF EXISTS py_file_string_mask(STRING, INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_file_float_process(FLOAT);") + try_sql("DROP FUNCTION IF EXISTS py_file_bool_not(BOOLEAN);") + try_sql("DROP TABLE IF EXISTS file_protocol_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy new file mode 100644 index 00000000000000..5febd5bca5da59 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_float.groovy @@ -0,0 +1,91 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_float") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_float """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_float ( + `user_id` INT NOT NULL COMMENT "", + `float_1` FLOAT NOT NULL COMMENT "", + `float_2` FLOAT COMMENT "", + `double_1` DOUBLE NOT NULL COMMENT "", + `double_2` DOUBLE COMMENT "" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + + + sql """ INSERT INTO test_pythonudf_float (`user_id`,`float_1`,`float_2`,double_1,double_2) VALUES + (111,11111.11111,222222.3333333,12345678.34455677,1111111.999999999999), + (112,1234556.11111,222222.3333333,222222222.3333333333333,4444444444444.555555555555), + (113,87654321.11111,null,6666666666.6666666666,null) + """ + qt_select_default """ SELECT * FROM test_pythonudf_float t ORDER BY user_id; """ + + File path = new File(pyPath) + if (!path.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! 
""") + } + + sql """ DROP FUNCTION IF EXISTS python_udf_float_test(FLOAT,FLOAT) """ + + sql """ CREATE FUNCTION python_udf_float_test(FLOAT,FLOAT) RETURNS FLOAT PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="float_test.evaluate", + "type"="PYTHON_UDF", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); """ + + qt_select """ SELECT python_udf_float_test(cast(2.83645 as float),cast(111.1111111 as float)) as result; """ + qt_select """ SELECT python_udf_float_test(2.83645,111.1111111) as result ; """ + qt_select """ SELECT python_udf_float_test(2.83645,null) as result ; """ + qt_select """ SELECT python_udf_float_test(cast(2.83645 as float),null) as result ; """ + qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """ + createMV("create materialized view udf_mv as SELECT user_id as a1,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id;") + qt_select """ SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; """ + + explain { + sql("SELECT user_id,python_udf_float_test(float_1, float_2) as sum FROM test_pythonudf_float order by user_id; ") + contains "(udf_mv)" + } + + sql """ CREATE FUNCTION python_udf_double_test(DOUBLE,DOUBLE) RETURNS DOUBLE PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="double_test.evaluate", + "type"="PYTHON_UDF", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); """ + + qt_select """ SELECT python_udf_double_test(cast(2.83645 as DOUBLE),cast(111.1111111 as DOUBLE)) as result; """ + qt_select """ SELECT python_udf_double_test(2.83645,111.1111111) as result ; """ + qt_select """ SELECT python_udf_double_test(2.83645,null) as result ; """ + qt_select """ SELECT python_udf_double_test(cast(2.83645 as DOUBLE),null) as result ; """ + qt_select """ SELECT user_id,python_udf_double_test(double_1, double_1) as sum FROM test_pythonudf_float order by user_id; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS python_udf_double_test(DOUBLE,DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS python_udf_float_test(FLOAT,FLOAT);") + try_sql("DROP TABLE IF EXISTS test_pythonudf_float") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_global_function.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_global_function.groovy new file mode 100644 index 00000000000000..2847cb34fa8a03 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_global_function.groovy @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudf_global_function") { + // Test creating global Python UDF with GLOBAL keyword + + def runtime_version = "3.8.10" + try { + // Test 1: Create GLOBAL function + sql """ DROP GLOBAL FUNCTION IF EXISTS py_global_multiply(INT, INT); """ + sql """ + CREATE GLOBAL FUNCTION py_global_multiply(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(a, b): + if a is None or b is None: + return None + return a * b +\$\$; + """ + + qt_select_global_multiply """ SELECT py_global_multiply(7, 8) AS result; """ + + // Test 2: Create GLOBAL string function + sql """ DROP GLOBAL FUNCTION IF EXISTS py_global_lower(STRING); """ + sql """ + CREATE GLOBAL FUNCTION py_global_lower(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s): + if s is None: + return None + return s.lower() +\$\$; + """ + + qt_select_global_lower """ SELECT py_global_lower('HELLO WORLD') AS result; """ + + // Test 3: Create regular (non-GLOBAL) function for comparison + sql """ DROP FUNCTION IF EXISTS py_local_add(INT, INT); """ + sql """ + CREATE FUNCTION py_local_add(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(a, b): + if a is None or b is None: + return None + return a + b +\$\$; + """ + + qt_select_local_add """ SELECT py_local_add(15, 25) AS result; """ + + // Test 4: Test GLOBAL function on table data + sql """ DROP TABLE IF EXISTS global_function_test_table; """ + sql """ + CREATE TABLE global_function_test_table ( + id INT, + val1 INT, + val2 INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO global_function_test_table VALUES + (1, 5, 6, 'APPLE'), + (2, 10, 20, 'BANANA'), + (3, 3, 7, 'CHERRY'), + (4, NULL, 5, 'DATE'), + (5, 8, 9, NULL); + """ + + qt_select_table_global """ + SELECT + id, + val1, + val2, + py_global_multiply(val1, val2) AS multiply_result, + text, + py_global_lower(text) AS lower_result + FROM global_function_test_table + ORDER BY id; + """ + + // Test 5: Mathematical calculation with GLOBAL function + sql """ DROP GLOBAL FUNCTION IF EXISTS py_global_power(DOUBLE, DOUBLE); """ + sql """ + CREATE GLOBAL FUNCTION py_global_power(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(base, exponent): + if base is None or exponent is None: + return None + return base ** exponent +\$\$; + """ + + qt_select_global_power """ SELECT py_global_power(2.0, 3.0) AS result; """ + qt_select_global_power_decimal """ SELECT py_global_power(5.0, 0.5) AS result; """ + + } finally { + try_sql("DROP GLOBAL FUNCTION IF EXISTS py_global_multiply(INT, INT);") + try_sql("DROP GLOBAL FUNCTION IF EXISTS py_global_lower(STRING);") + try_sql("DROP GLOBAL FUNCTION IF EXISTS py_global_power(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_local_add(INT, INT);") + try_sql("DROP TABLE IF EXISTS global_function_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_complex.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_complex.groovy new file mode 100644 index 00000000000000..926d09f7c41288 --- /dev/null +++ 
b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_complex.groovy @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_inline_complex") { + // Test complex Python UDF using Inline mode + + def runtime_version = "3.8.10" + try { + // Test 1: Array processing + sql """ DROP FUNCTION IF EXISTS py_array_sum(ARRAY<INT>); """ + sql """ + CREATE FUNCTION py_array_sum(ARRAY<INT>) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(arr): + if arr is None: + return None + return sum(arr) +\$\$; + """ + + qt_select_array_sum """ SELECT py_array_sum([1, 2, 3, 4, 5]) AS result; """ + + // Test 2: String processing - reverse + sql """ DROP FUNCTION IF EXISTS py_reverse_string(STRING); """ + sql """ + CREATE FUNCTION py_reverse_string(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s): + if s is None: + return None + return s[::-1] +\$\$; + """ + + qt_select_reverse """ SELECT py_reverse_string('Hello') AS result; """ + + // Test 3: Multi-parameter complex calculation + sql """ DROP FUNCTION IF EXISTS py_weighted_avg(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_weighted_avg(DOUBLE, DOUBLE, DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(val1, weight1, val2, weight2): + if any(x is None for x in [val1, weight1, val2, weight2]): + return None + total_weight = weight1 + weight2 + if total_weight == 0: + return None + return (val1 * weight1 + val2 * weight2) / total_weight +\$\$; + """ + + qt_select_weighted_avg """ SELECT py_weighted_avg(80.0, 0.6, 90.0, 0.4) AS result; """ + + // Test 4: String formatting + sql """ DROP FUNCTION IF EXISTS py_format_name(STRING, STRING); """ + sql """ + CREATE FUNCTION py_format_name(STRING, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(first_name, last_name): + if first_name is None or last_name is None: + return None + return f"{last_name.upper()}, {first_name.capitalize()}" +\$\$; + """ + + qt_select_format_name """ SELECT py_format_name('john', 'doe') AS result; """ + + // Test 5: Numeric range validation + sql """ DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT); """ + sql """ + CREATE FUNCTION py_in_range(INT, INT, INT) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(value, min_val, max_val): + if any(x is None for x
in [value, min_val, max_val]): + return None + return min_val <= value <= max_val +\$\$; + """ + + qt_select_in_range_true """ SELECT py_in_range(50, 0, 100) AS result; """ + qt_select_in_range_false """ SELECT py_in_range(150, 0, 100) AS result; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_array_sum(ARRAY<INT>);") + try_sql("DROP FUNCTION IF EXISTS py_reverse_string(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_weighted_avg(DOUBLE, DOUBLE, DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_format_name(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT);") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_scalar.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_scalar.groovy new file mode 100644 index 00000000000000..fb32c48cfec6b9 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_scalar.groovy @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_inline_scalar") { + // Test basic scalar Python UDF using Inline mode + + def runtime_version = "3.8.10" + try { + // Test 1: Simple integer addition + sql """ DROP FUNCTION IF EXISTS py_add(INT, INT); """ + sql """ + CREATE FUNCTION py_add(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(a, b): + return a + b +\$\$; + """ + + qt_select_add """ SELECT py_add(10, 20) AS result; """ + qt_select_add_null """ SELECT py_add(NULL, 20) AS result; """ + + // Test 2: String concatenation + sql """ DROP FUNCTION IF EXISTS py_concat(STRING, STRING); """ + sql """ + CREATE FUNCTION py_concat(STRING, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(s1, s2): + if s1 is None or s2 is None: + return None + return s1 + s2 +\$\$; + """ + + qt_select_concat """ SELECT py_concat('Hello', ' World') AS result; """ + qt_select_concat_null """ SELECT py_concat('Hello', NULL) AS result; """ + + // Test 3: Mathematical operations + sql """ DROP FUNCTION IF EXISTS py_square(DOUBLE); """ + sql """ + CREATE FUNCTION py_square(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(x): + if x is None: + return None + return x * x +\$\$; + """ + + qt_select_square """ SELECT py_square(5.0) AS result; """ + qt_select_square_negative """ SELECT py_square(-3.0) AS result; """ + + // Test 4: Conditional logic + sql """ DROP FUNCTION IF EXISTS py_is_positive(INT); """ + sql """ + CREATE FUNCTION py_is_positive(INT) + RETURNS BOOLEAN + PROPERTIES ( + "type" = 
"PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(num): + if num is None: + return None + return num > 0 +\$\$; + """ + + qt_select_positive """ SELECT py_is_positive(10) AS result; """ + qt_select_negative """ SELECT py_is_positive(-5) AS result; """ + qt_select_zero """ SELECT py_is_positive(0) AS result; """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_add(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_concat(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_square(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_is_positive(INT);") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_inline_vector.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_vector.groovy new file mode 100644 index 00000000000000..5f925d75adc26c --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_inline_vector.groovy @@ -0,0 +1,409 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_inline_vector") { + // Test vectorized Python UDF using Inline mode with pandas.Series + + def runtime_version = "3.8.10" + try { + // Create test table + sql """ DROP TABLE IF EXISTS vector_udf_test_table; """ + sql """ + CREATE TABLE vector_udf_test_table ( + id INT, + int_col1 INT, + int_col2 INT, + double_col1 DOUBLE, + double_col2 DOUBLE, + string_col1 STRING, + string_col2 STRING, + bool_col BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO vector_udf_test_table VALUES + (1, 10, 20, 1.5, 2.5, 'hello', 'world', true), + (2, 30, 40, 3.5, 4.5, 'foo', 'bar', false), + (3, NULL, 50, 5.5, NULL, NULL, 'test', true), + (4, 60, NULL, NULL, 6.5, 'data', NULL, false), + (5, 70, 80, 7.5, 8.5, 'python', 'udf', true); + """ + + // Test 1: Vector INT addition with pandas.Series + sql """ DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_add_int(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "add", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def add(a: pd.Series, b: pd.Series) -> pd.Series: + return a + b + 1 +\$\$; + """ + + qt_vec_add_int """ + SELECT + id, + int_col1, + int_col2, + py_vec_add_int(int_col1, int_col2) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 2: Vector DOUBLE multiplication with pandas.Series + sql """ DROP FUNCTION IF EXISTS py_vec_multiply_double(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_multiply_double(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "multiply", + "runtime_version" = "${runtime_version}", + 
"always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def multiply(a: pd.Series, b: pd.Series) -> pd.Series: + return a * b +\$\$; + """ + + qt_vec_multiply_double """ + SELECT + id, + double_col1, + double_col2, + py_vec_multiply_double(double_col1, double_col2) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 3: Vector STRING concatenation with pandas.Series + sql """ DROP FUNCTION IF EXISTS py_vec_concat_string(STRING, STRING); """ + sql """ + CREATE FUNCTION py_vec_concat_string(STRING, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "concat", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def concat(s1: pd.Series, s2: pd.Series) -> pd.Series: + return s1 + '_' + s2 +\$\$; + """ + + qt_vec_concat_string """ + SELECT + id, + string_col1, + string_col2, + py_vec_concat_string(string_col1, string_col2) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 4: Vector INT with conditional logic using pandas.Series + sql """ DROP FUNCTION IF EXISTS py_vec_max_int(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_max_int(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "get_max", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd +import numpy as np + +def get_max(a: pd.Series, b: pd.Series) -> pd.Series: + return pd.Series(np.maximum(a, b)) +\$\$; + """ + + qt_vec_max_int """ + SELECT + id, + int_col1, + int_col2, + py_vec_max_int(int_col1, int_col2) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 5: Vector DOUBLE with mathematical operations + sql """ DROP FUNCTION IF EXISTS py_vec_sqrt_double(DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_sqrt_double(DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "sqrt", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd +import numpy as np + +def sqrt(x: pd.Series) -> pd.Series: + return np.sqrt(x) +\$\$; + """ + + qt_vec_sqrt_double """ + SELECT + id, + double_col1, + py_vec_sqrt_double(double_col1) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 6: Vector STRING with upper case transformation + sql """ DROP FUNCTION IF EXISTS py_vec_upper_string(STRING); """ + sql """ + CREATE FUNCTION py_vec_upper_string(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "to_upper", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def to_upper(s: pd.Series) -> pd.Series: + return s.str.upper() +\$\$; + """ + + qt_vec_upper_string """ + SELECT + id, + string_col1, + py_vec_upper_string(string_col1) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 7: Vector INT with complex calculation + sql """ DROP FUNCTION IF EXISTS py_vec_weighted_sum(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_weighted_sum(INT, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "weighted_sum", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def weighted_sum(a: pd.Series, b: pd.Series) -> pd.Series: + return a * 0.3 + b * 0.7 +\$\$; + """ + + qt_vec_weighted_sum """ + SELECT + id, + int_col1, + int_col2, + py_vec_weighted_sum(int_col1, int_col2) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 8: Vector BOOLEAN 
operations + sql """ DROP FUNCTION IF EXISTS py_vec_not_bool(BOOLEAN); """ + sql """ + CREATE FUNCTION py_vec_not_bool(BOOLEAN) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "negate", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def negate(b: pd.Series) -> pd.Series: + return ~b +\$\$; + """ + + qt_vec_not_bool """ + SELECT + id, + bool_col, + py_vec_not_bool(bool_col) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 9: Vector INT comparison returning BOOLEAN + sql """ DROP FUNCTION IF EXISTS py_vec_greater_than(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_greater_than(INT, INT) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "greater", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def greater(a: pd.Series, b: pd.Series) -> pd.Series: + return a > b +\$\$; + """ + + qt_vec_greater_than """ + SELECT + id, + int_col1, + int_col2, + py_vec_greater_than(int_col1, int_col2) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 10: Vector STRING length calculation + sql """ DROP FUNCTION IF EXISTS py_vec_string_length(STRING); """ + sql """ + CREATE FUNCTION py_vec_string_length(STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "str_len", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def str_len(s: pd.Series) -> pd.Series: + return s.str.len() +\$\$; + """ + + qt_vec_string_length """ + SELECT + id, + string_col1, + py_vec_string_length(string_col1) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 11: Vector with NULL handling using fillna + sql """ DROP FUNCTION IF EXISTS py_vec_fill_null_int(INT); """ + sql """ + CREATE FUNCTION py_vec_fill_null_int(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "fill_null", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def fill_null(x: pd.Series) -> pd.Series: + return x.fillna(0) +\$\$; + """ + + qt_vec_fill_null_int """ + SELECT + id, + int_col1, + py_vec_fill_null_int(int_col1) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + // Test 12: Vector with aggregation-like operation (cumulative sum) + sql """ DROP FUNCTION IF EXISTS py_vec_cumsum_int(INT); """ + sql """ + CREATE FUNCTION py_vec_cumsum_int(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "cumsum", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def cumsum(x: pd.Series) -> pd.Series: + return x.cumsum() +\$\$; + """ + + qt_vec_cumsum_int """ + SELECT + id, + int_col1, + py_vec_cumsum_int(int_col1) AS result + FROM vector_udf_test_table + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_multiply_double(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_vec_concat_string(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_max_int(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_sqrt_double(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_vec_upper_string(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_weighted_sum(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_not_bool(BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS py_vec_greater_than(INT, INT);") + 
try_sql("DROP FUNCTION IF EXISTS py_vec_string_length(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_fill_null_int(INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_cumsum_int(INT);") + try_sql("DROP TABLE IF EXISTS vector_udf_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_int.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_int.groovy new file mode 100644 index 00000000000000..fad4fd51210502 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_int.groovy @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_int") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_int """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_int ( + `user_id` INT NOT NULL COMMENT "", + `tinyint_col` TINYINT NOT NULL COMMENT "", + `smallint_col` SMALLINT NOT NULL COMMENT "", + `bigint_col` BIGINT NOT NULL COMMENT "" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + StringBuilder sb = new StringBuilder() + int i = 1 + for (; i < 10; i++) { + sb.append(""" + (${i},${i}*2,${i}*3,${i}*4), + """) + } + sb.append(""" + (${i},${i}*2,${i}*3,${i}*4) + """) + sql """ INSERT INTO test_pythonudf_int VALUES + ${sb.toString()} + """ + qt_select_default """ SELECT * FROM test_pythonudf_int t ORDER BY user_id; """ + + File path = new File(pyPath) + if (!path.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! 
""") + } + + sql """ DROP FUNCTION IF EXISTS python_udf_int_test(int) """ + + sql """ CREATE FUNCTION python_udf_int_test(int) RETURNS int PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT python_udf_int_test(user_id) result FROM test_pythonudf_int ORDER BY result; """ + qt_select """ SELECT python_udf_int_test(null) result ; """ + + + sql """ CREATE FUNCTION python_udf_tinyint_test(tinyint) RETURNS tinyint PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT python_udf_tinyint_test(tinyint_col) result FROM test_pythonudf_int ORDER BY result; """ + qt_select """ SELECT python_udf_tinyint_test(null) result ; """ + + + sql """ CREATE FUNCTION python_udf_smallint_test(smallint) RETURNS smallint PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT python_udf_smallint_test(smallint_col) result FROM test_pythonudf_int ORDER BY result; """ + qt_select """ SELECT python_udf_smallint_test(null) result ; """ + + + sql """ CREATE FUNCTION python_udf_bigint_test(bigint) RETURNS bigint PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT python_udf_bigint_test(bigint_col) result FROM test_pythonudf_int ORDER BY result; """ + qt_select """ SELECT python_udf_bigint_test(null) result ; """ + + sql """ CREATE GLOBAL FUNCTION python_udf_int_test_global(int) RETURNS int PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select_global_1 """ SELECT python_udf_int_test_global(user_id) result FROM test_pythonudf_int ORDER BY result; """ + qt_select_global_2 """ SELECT python_udf_int_test_global(null) result ; """ + qt_select_global_3 """ SELECT python_udf_int_test_global(3) result FROM test_pythonudf_int ORDER BY result; """ + qt_select_global_4 """ SELECT abs(python_udf_int_test_global(3)) result FROM test_pythonudf_int ORDER BY result; """ + + } finally { + try_sql("DROP GLOBAL FUNCTION IF EXISTS python_udf_int_test_global(int);") + try_sql("DROP FUNCTION IF EXISTS python_udf_tinyint_test(tinyint);") + try_sql("DROP FUNCTION IF EXISTS python_udf_smallint_test(smallint);") + try_sql("DROP FUNCTION IF EXISTS python_udf_bigint_test(bigint);") + try_sql("DROP FUNCTION IF EXISTS python_udf_int_test(int);") + try_sql("DROP TABLE IF EXISTS test_pythonudf_int") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_map.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_map.groovy new file mode 100644 index 00000000000000..74b9a7cb1174b2 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_map.groovy @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_map") { + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + try_sql("DROP FUNCTION IF EXISTS udfii(Map<INT,INT>);") + try_sql("DROP FUNCTION IF EXISTS udfss(Map<STRING,STRING>);") + try_sql("DROP TABLE IF EXISTS map_ii") + try_sql("DROP TABLE IF EXISTS map_ss") + sql """ + CREATE TABLE IF NOT EXISTS map_ii ( + `id` INT(11) NULL COMMENT "", + `m` Map<INT,INT> NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ); + """ + sql """ INSERT INTO map_ii VALUES(1, {1:1,10:1,100:1}); """ + sql """ INSERT INTO map_ii VALUES(2, {2:1,20:1,200:1,2000:1}); """ + sql """ INSERT INTO map_ii VALUES(3, {3:1}); """ + sql """ DROP FUNCTION IF EXISTS udfii(Map<INT,INT>); """ + sql """ CREATE FUNCTION udfii(Map<INT,INT>) RETURNS INT PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="map_int_int_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + + qt_select_1 """ select m,udfii(m) from map_ii order by id; """ + + sql """ CREATE TABLE IF NOT EXISTS map_ss ( + `id` INT(11) NULL COMMENT "", + `m` Map<STRING,STRING> NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ); """ + sql """ INSERT INTO map_ss VALUES(1, {"114":"514","1919":"810"}); """ + sql """ INSERT INTO map_ss VALUES(2, {"a":"bc","def":"g","hij":"k"}); """ + sql """ DROP FUNCTION IF EXISTS udfss(Map<STRING,STRING>); """ + + sql """ CREATE FUNCTION udfss(Map<STRING,STRING>) RETURNS STRING PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="map_string_string_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select_2 """ select m,udfss(m) from map_ss order by id; """ + } finally { + try_sql("DROP FUNCTION IF EXISTS udfii(Map<INT,INT>);") + try_sql("DROP FUNCTION IF EXISTS udfss(Map<STRING,STRING>);") + try_sql("DROP TABLE IF EXISTS map_ii") + try_sql("DROP TABLE IF EXISTS map_ss") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_mixed_params.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_mixed_params.groovy new file mode 100644 index 00000000000000..a662e9b1acfb3f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_mixed_params.groovy @@ -0,0 +1,443 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_mixed_params") { + // Test vectorized Python UDF with mixed parameter types (pd.Series + scalar) + // This tests the scenario where some parameters are vectorized (pd.Series) + // and some are scalar values (int, float, str) + // + // Key concept: In vectorized UDF, you can mix: + // - pd.Series parameters (process entire column) + // - scalar parameters (single value like int, float, str) + + def runtime_version = "3.8.10" + + try { + // Create test table + sql """ DROP TABLE IF EXISTS test_mixed_params_table; """ + sql """ + CREATE TABLE test_mixed_params_table ( + id INT, + price DOUBLE, + quantity INT, + discount_rate DOUBLE, + category STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + // Insert test data + sql """ + INSERT INTO test_mixed_params_table VALUES + (1, 100.0, 5, 0.1, 'A'), + (2, 200.0, 3, 0.15, 'B'), + (3, 150.0, 8, 0.2, 'A'), + (4, 300.0, 2, 0.05, 'C'), + (5, 250.0, 6, 0.12, 'B'), + (6, 180.0, 4, 0.18, 'A'), + (7, 220.0, 7, 0.08, 'C'), + (8, 120.0, 9, 0.25, 'B'), + (9, 280.0, 1, 0.1, 'A'), + (10, 350.0, 5, 0.15, 'C'); + """ + + sql "sync" + + // ==================== Test 1: pd.Series + scalar float ==================== + log.info("=== Test 1: pd.Series + scalar float ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_multiply_constant(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_multiply_constant(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_multiply_constant", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_multiply_constant(values: pd.Series, multiplier: float) -> pd.Series: + # values: pd.Series (vectorized column data) + # multiplier: float (scalar constant) + return values * multiplier +\$\$; + """ + + qt_select_1 """ + SELECT + id, + price, + py_vec_multiply_constant(price, 1.5) AS price_multiplied + FROM test_mixed_params_table + ORDER BY id; + """ + + // ==================== Test 2: Multiple pd.Series + scalar float ==================== + log.info("=== Test 2: Multiple pd.Series + scalar float ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_calc_total(DOUBLE, INT, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_calc_total(DOUBLE, INT, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_calc_total", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_calc_total(price: pd.Series, quantity: pd.Series, tax_rate: float) -> pd.Series: + # price: pd.Series (vectorized) + # quantity: pd.Series (vectorized) + # tax_rate: float (scalar constant) + subtotal = price * quantity + return subtotal * (1 + tax_rate) +\$\$; + """ + + qt_select_2 """ + SELECT + id, + price, + quantity, + py_vec_calc_total(price, quantity, 0.1) AS total_with_tax + FROM test_mixed_params_table + ORDER BY id + LIMIT 5; + """ + + // ==================== Test 3: Two pd.Series (both vectorized) ==================== + log.info("=== Test 3: Two pd.Series 
(both vectorized) ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_apply_discount(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_apply_discount(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_apply_discount", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_apply_discount(price: pd.Series, discount_rate: pd.Series) -> pd.Series: + # Both are pd.Series (vectorized) + # Each row has its own discount rate from the column + return price * (1 - discount_rate) +\$\$; + """ + + qt_select_3 """ + SELECT + id, + price, + discount_rate, + py_vec_apply_discount(price, discount_rate) AS final_price + FROM test_mixed_params_table + ORDER BY id + LIMIT 5; + """ + + // ==================== Test 4: Complex Mixed Parameters (3 Series + 1 scalar) ==================== + log.info("=== Test 4: Complex calculation with mixed params (3 Series + 1 scalar) ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_complex_calc(DOUBLE, INT, DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_complex_calc(DOUBLE, INT, DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_complex_calc", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_complex_calc(price: pd.Series, quantity: pd.Series, discount_rate: pd.Series, shipping_fee: float) -> pd.Series: + # price: pd.Series (vectorized) + # quantity: pd.Series (vectorized) + # discount_rate: pd.Series (vectorized, per-row discount) + # shipping_fee: float (scalar constant) + + # Calculate: (price * quantity) * (1 - discount) + shipping_fee + subtotal = price * quantity + after_discount = subtotal * (1 - discount_rate) + return after_discount + shipping_fee +\$\$; + """ + + qt_select_4 """ + SELECT + id, + price, + quantity, + discount_rate, + py_vec_complex_calc(price, quantity, discount_rate, 10.0) AS final_total + FROM test_mixed_params_table + ORDER BY id + LIMIT 5; + """ + + // ==================== Test 5: String pd.Series + scalar str ==================== + log.info("=== Test 5: String pd.Series + scalar str ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_add_prefix(STRING, STRING); """ + sql """ + CREATE FUNCTION py_vec_add_prefix(STRING, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_add_prefix", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_add_prefix(categories: pd.Series, prefix: str) -> pd.Series: + # categories: pd.Series (vectorized string column) + # prefix: str (scalar constant) + return prefix + '_' + categories +\$\$; + """ + + qt_select_5 """ + SELECT + id, + category, + py_vec_add_prefix(category, 'CAT') AS prefixed_category + FROM test_mixed_params_table + ORDER BY id + LIMIT 5; + """ + + // ==================== Test 6: pd.Series + scalar int ==================== + log.info("=== Test 6: pd.Series + scalar int ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_add_int(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_add_int", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_add_int(quantities: pd.Series, bonus: int) -> pd.Series: + # quantities: pd.Series (vectorized int column) + # bonus: int (scalar constant) + return quantities + bonus +\$\$; + 
""" + + qt_select_6 """ + SELECT + id, + quantity, + py_vec_add_int(quantity, 10) AS quantity_with_bonus + FROM test_mixed_params_table + ORDER BY id + LIMIT 5; + """ + + // ==================== Test 7: Conditional Logic with Mixed Params ==================== + log.info("=== Test 7: Conditional logic with mixed params (2 Series + 1 scalar) ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_conditional_discount(DOUBLE, DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_conditional_discount(DOUBLE, DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_conditional_discount", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd +import numpy as np + +def py_vec_conditional_discount(price: pd.Series, discount_rate: pd.Series, threshold: float) -> pd.Series: + # price: pd.Series (vectorized) + # discount_rate: pd.Series (vectorized) + # threshold: float (scalar constant - minimum price for discount) + + # Apply discount only if price >= threshold + result = np.where(price >= threshold, + price * (1 - discount_rate), + price) + return pd.Series(result) +\$\$; + """ + + qt_select_7 """ + SELECT + id, + price, + discount_rate, + py_vec_conditional_discount(price, discount_rate, 200.0) AS final_price + FROM test_mixed_params_table + ORDER BY id; + """ + + // ==================== Test 8: Scalar first, then Series ==================== + log.info("=== Test 8: Scalar parameter first, then Series ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_scale_and_add(DOUBLE, DOUBLE, INT); """ + sql """ + CREATE FUNCTION py_vec_scale_and_add(DOUBLE, DOUBLE, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_scale_and_add", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_scale_and_add(scale_factor: float, prices: pd.Series, quantities: pd.Series) -> pd.Series: + # scale_factor: float (scalar constant) + # prices: pd.Series (vectorized) + # quantities: pd.Series (vectorized) + return (prices * quantities) * scale_factor +\$\$; + """ + + qt_select_8 """ + SELECT + id, + price, + quantity, + py_vec_scale_and_add(1.2, price, quantity) AS scaled_total + FROM test_mixed_params_table + ORDER BY id + LIMIT 3; + """ + + // ==================== Test 9: Alternating Series and Scalar ==================== + log.info("=== Test 9: Alternating Series and scalar parameters ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_alternating(DOUBLE, DOUBLE, INT, INT); """ + sql """ + CREATE FUNCTION py_vec_alternating(DOUBLE, DOUBLE, INT, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_alternating", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_alternating(prices: pd.Series, markup: float, quantities: pd.Series, min_qty: int) -> pd.Series: + # prices: pd.Series (vectorized) + # markup: float (scalar constant) + # quantities: pd.Series (vectorized) + # min_qty: int (scalar constant) + + import numpy as np + # Apply markup only if quantity >= min_qty + result = np.where(quantities >= min_qty, + prices * (1 + markup), + prices) + return pd.Series(result) +\$\$; + """ + + qt_select_9 """ + SELECT + id, + price, + quantity, + py_vec_alternating(price, 0.2, quantity, 5) AS conditional_price + FROM test_mixed_params_table + ORDER BY id + LIMIT 5; + """ + + // ==================== Test 10: Multiple scalars with one Series 
==================== + log.info("=== Test 10: Multiple scalar parameters with one Series ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_multi_scalar(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_multi_scalar(DOUBLE, DOUBLE, DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_multi_scalar", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_multi_scalar(prices: pd.Series, tax: float, discount: float, fee: float) -> pd.Series: + # prices: pd.Series (vectorized) + # tax: float (scalar constant) + # discount: float (scalar constant) + # fee: float (scalar constant) + + # Calculate: (price * (1 - discount)) * (1 + tax) + fee + after_discount = prices * (1 - discount) + with_tax = after_discount * (1 + tax) + return with_tax + fee +\$\$; + """ + + qt_select_10 """ + SELECT + id, + price, + py_vec_multi_scalar(price, 0.1, 0.05, 5.0) AS final_price + FROM test_mixed_params_table + ORDER BY id + LIMIT 3; + """ + + log.info("All mixed parameter tests passed!") + + } finally { + // Cleanup + sql """ DROP FUNCTION IF EXISTS py_vec_multiply_constant(DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_vec_calc_total(DOUBLE, INT, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_vec_apply_discount(DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_vec_complex_calc(DOUBLE, INT, DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_vec_add_prefix(STRING, STRING); """ + sql """ DROP FUNCTION IF EXISTS py_vec_add_int(INT, INT); """ + sql """ DROP FUNCTION IF EXISTS py_vec_conditional_discount(DOUBLE, DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_vec_scale_and_add(DOUBLE, DOUBLE, INT); """ + sql """ DROP FUNCTION IF EXISTS py_vec_alternating(DOUBLE, DOUBLE, INT, INT); """ + sql """ DROP FUNCTION IF EXISTS py_vec_multi_scalar(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """ + sql """ DROP TABLE IF EXISTS test_mixed_params_table; """ + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module.groovy new file mode 100644 index 00000000000000..358df2efe7760f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module.groovy @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
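+// The "symbol" property below is a dotted path into the zip:
+// package.module.function. Assumed (hypothetical) layout of
+// python_udf_module_test.zip, inferred from the symbol string rather than
+// from the packaged code:
+//
+//   python_udf_module_test/
+//       __init__.py
+//       main.py    # defines safe_ltv(days_since_last_action, total_actions, total_spend)
+//
+// A minimal sketch of such an entry point (illustrative only):
+//   def safe_ltv(days, actions, spend):
+//       if None in (days, actions, spend) or days < 0:
+//           return None
+//       return spend * actions / (1.0 + days)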
+ +suite("test_pythonudf_module") { + def pyPath = """${context.file.parent}/udf_scripts/python_udf_module_test.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP FUNCTION IF EXISTS python_udf_ltv_score(BIGINT, BIGINT, DOUBLE); """ + sql """ + CREATE FUNCTION python_udf_ltv_score(BIGINT, BIGINT, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file"="file://${pyPath}", + "symbol" = "python_udf_module_test.main.safe_ltv", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + sql """ DROP TABLE IF EXISTS user_behavior_test; """ + sql """ + CREATE TABLE user_behavior_test ( + user_id BIGINT, + days_since_last_action BIGINT, + total_actions BIGINT, + total_spend DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(user_id) + DISTRIBUTED BY HASH(user_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + sql """ + INSERT INTO user_behavior_test VALUES + (1001, 5, 10, 500.0), + (1002, 40, 1, 20.0), + (1003, 15, 5, 300.0), + (1004, -1, 3, 100.0), + (1005, NULL, 2, 200.0), + (1006, 7, NULL, 150.0), + (1007, 30, 0, NULL), + (1008, 0, 100, 5000.0), + (1009, 100, 2, 10.0), + (1010, 8, 8, 800.0); + """ + + qt_select """ SELECT + user_id, + days_since_last_action, + total_actions, + total_spend, + python_udf_ltv_score(days_since_last_action, total_actions, total_spend) AS ltv_score + FROM user_behavior_test + ORDER BY user_id; """ + } finally { + try_sql("DROP FUNCTION IF EXISTS python_udf_ltv_score(BIGINT, BIGINT, DOUBLE);") + try_sql("DROP TABLE IF EXISTS user_behavior_test;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module_advanced.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module_advanced.groovy new file mode 100644 index 00000000000000..d61cfd4a9433bd --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module_advanced.groovy @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_pythonudf_module_advanced") { + // Test advanced Python UDF features using Module mode + + def zipPath = """${context.file.parent}/udf_scripts/python_udf_module_test.zip""" + scp_udf_file_to_all_be(zipPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${zipPath}".toString()) + + try { + // Test 1: Use different module paths in zip package + sql """ DROP FUNCTION IF EXISTS py_module_ltv(BIGINT, BIGINT, DOUBLE); """ + sql """ + CREATE FUNCTION py_module_ltv(BIGINT, BIGINT, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${zipPath}", + "symbol" = "python_udf_module_test.main.safe_ltv", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_select_module_ltv_normal """ SELECT py_module_ltv(10, 100, 5000.0) AS result; """ + qt_select_module_ltv_null """ SELECT py_module_ltv(NULL, 100, 5000.0) AS result; """ + qt_select_module_ltv_zero """ SELECT py_module_ltv(0, 0, 5000.0) AS result; """ + + // Test 2: Use Module UDF in complex queries + sql """ DROP TABLE IF EXISTS customer_analytics; """ + sql """ + CREATE TABLE customer_analytics ( + customer_id BIGINT, + days_inactive BIGINT, + total_orders BIGINT, + total_revenue DOUBLE, + customer_segment STRING + ) ENGINE=OLAP + DUPLICATE KEY(customer_id) + DISTRIBUTED BY HASH(customer_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO customer_analytics VALUES + (1001, 5, 50, 10000.0, 'Premium'), + (1002, 30, 10, 2000.0, 'Regular'), + (1003, 60, 5, 500.0, 'Inactive'), + (1004, 2, 100, 25000.0, 'VIP'), + (1005, 15, 25, 5000.0, 'Regular'), + (1006, NULL, 30, 6000.0, 'Regular'), + (1007, 10, NULL, 3000.0, 'Regular'), + (1008, 45, 8, NULL, 'Inactive'), + (1009, 0, 200, 50000.0, 'VIP'), + (1010, 90, 2, 100.0, 'Churned'); + """ + + qt_select_customer_analytics """ + SELECT + customer_id, + customer_segment, + days_inactive, + total_orders, + total_revenue, + py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score + FROM customer_analytics + ORDER BY customer_id; + """ + + // Test 3: Use Module UDF for group aggregation + qt_select_segment_analysis """ + SELECT + customer_segment, + COUNT(*) AS customer_count, + AVG(total_revenue) AS avg_revenue, + AVG(py_module_ltv(days_inactive, total_orders, total_revenue)) AS avg_ltv_score + FROM customer_analytics + GROUP BY customer_segment + ORDER BY customer_segment; + """ + + // Test 4: Use Module UDF for filtering + qt_select_high_value_customers """ + SELECT + customer_id, + customer_segment, + total_revenue, + py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score + FROM customer_analytics + WHERE py_module_ltv(days_inactive, total_orders, total_revenue) > 100 + ORDER BY ltv_score DESC; + """ + + // Test 5: Use Module UDF for sorting + qt_select_sorted_by_ltv """ + SELECT + customer_id, + customer_segment, + py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score + FROM customer_analytics + ORDER BY py_module_ltv(days_inactive, total_orders, total_revenue) DESC + LIMIT 5; + """ + + // Test 6: Use Module UDF with multiple conditions + qt_select_complex_query """ + SELECT + customer_id, + customer_segment, + days_inactive, + total_orders, + total_revenue, + py_module_ltv(days_inactive, total_orders, total_revenue) AS ltv_score, + CASE + WHEN py_module_ltv(days_inactive, total_orders, total_revenue) > 200 THEN 'High Value' + WHEN py_module_ltv(days_inactive, total_orders, total_revenue) > 100 THEN 'Medium Value' + WHEN 
py_module_ltv(days_inactive, total_orders, total_revenue) IS NOT NULL THEN 'Low Value' + ELSE 'Unknown' + END AS value_category + FROM customer_analytics + ORDER BY ltv_score DESC; + """ + + // Test 7: Use Module UDF with JOIN operations + sql """ DROP TABLE IF EXISTS customer_info; """ + sql """ + CREATE TABLE customer_info ( + customer_id BIGINT, + customer_name STRING, + registration_date DATE + ) ENGINE=OLAP + DUPLICATE KEY(customer_id) + DISTRIBUTED BY HASH(customer_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO customer_info VALUES + (1001, 'Alice Johnson', '2023-01-15'), + (1002, 'Bob Smith', '2023-03-20'), + (1003, 'Charlie Brown', '2022-11-10'), + (1004, 'Diana Prince', '2023-05-01'), + (1005, 'Eve Wilson', '2023-02-14'); + """ + + qt_select_join_with_module_udf """ + SELECT + ci.customer_id, + ci.customer_name, + ca.customer_segment, + ca.total_revenue, + py_module_ltv(ca.days_inactive, ca.total_orders, ca.total_revenue) AS ltv_score + FROM customer_info ci + JOIN customer_analytics ca ON ci.customer_id = ca.customer_id + WHERE py_module_ltv(ca.days_inactive, ca.total_orders, ca.total_revenue) IS NOT NULL + ORDER BY ltv_score DESC; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_module_ltv(BIGINT, BIGINT, DOUBLE);") + try_sql("DROP TABLE IF EXISTS customer_analytics;") + try_sql("DROP TABLE IF EXISTS customer_info;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module_scalar.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module_scalar.groovy new file mode 100644 index 00000000000000..fc5a4cd1b2df5f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module_scalar.groovy @@ -0,0 +1,818 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
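+// python_udf_scalar_ops.zip is assumed to expose a single flat module whose
+// functions are addressed as "python_udf_scalar_ops.function_name". Hedged
+// sketches of two of the entry points used below (illustrative, not the
+// packaged code):
+//
+//   # python_udf_scalar_ops.py
+//   def add_three_numbers(a, b, c):
+//       return None if None in (a, b, c) else a + b + c
+//
+//   def safe_divide_with_precision(a, b, digits):
+//       if a is None or b is None or digits is None or b == 0:
+//           return None
+//       return round(a / b, digits)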
+ +suite("test_pythonudf_module_scalar") { + // Comprehensive test for scalar Python UDF using module mode + + def pyPath = """${context.file.parent}/udf_scripts/python_udf_scalar_ops.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + + log.info("Python module path: ${pyPath}".toString()) + + try { + // Create test table with diverse data types + sql """ DROP TABLE IF EXISTS scalar_module_test_table; """ + sql """ + CREATE TABLE scalar_module_test_table ( + id INT, + int_a INT, + int_b INT, + int_c INT, + double_a DOUBLE, + double_b DOUBLE, + string_a STRING, + string_b STRING, + bool_a BOOLEAN, + bool_b BOOLEAN, + date_a DATE, + date_b DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO scalar_module_test_table VALUES + (1, 10, 20, 30, 100.0, 10.0, 'hello world', 'test@example.com', true, true, '2024-01-15', '2024-01-20'), + (2, 5, 15, 25, 200.0, 20.0, 'foo bar baz', 'user@domain.com', false, true, '2024-02-10', '2024-03-15'), + (3, 100, 50, 25, 150.0, 0.0, 'racecar', 'admin@test.org', true, false, '2023-12-01', '2024-01-01'), + (4, 7, 3, 11, 80.0, 5.0, 'a man a plan a canal panama', 'info@company.net', false, false, '2024-06-15', '2024-06-15'), + (5, 17, 19, 23, 300.0, 15.0, 'python udf test', 'contact@site.io', true, true, '2024-03-01', '2024-12-31'); + """ + + // ==================== Numeric Operations Tests ==================== + + // Test 1: Add three numbers + sql """ DROP FUNCTION IF EXISTS py_add_three(INT, INT, INT); """ + sql """ + CREATE FUNCTION py_add_three(INT, INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.add_three_numbers", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_add_three """ + SELECT + id, + int_a, int_b, int_c, + py_add_three(int_a, int_b, int_c) AS result + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 2: Safe division with precision + sql """ DROP FUNCTION IF EXISTS py_safe_div(DOUBLE, DOUBLE, INT); """ + sql """ + CREATE FUNCTION py_safe_div(DOUBLE, DOUBLE, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.safe_divide_with_precision", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_safe_div """ + SELECT + id, + double_a, double_b, + py_safe_div(double_a, double_b, 2) AS result + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 3: Calculate discount price + sql """ DROP FUNCTION IF EXISTS py_discount(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_discount(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.calculate_discount_price", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_discount """ + SELECT + id, + double_a, + py_discount(double_a, 10.0) AS price_10_off, + py_discount(double_a, 25.0) AS price_25_off + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 4: Compound interest + sql """ DROP FUNCTION IF EXISTS py_compound_interest(DOUBLE, DOUBLE, INT); """ + sql """ + CREATE FUNCTION py_compound_interest(DOUBLE, DOUBLE, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.compound_interest", + "runtime_version" = "${runtime_version}", + 
"always_nullable" = "true" + ); + """ + + qt_compound_interest """ + SELECT + id, + double_a, + py_compound_interest(double_a, 5.0, 10) AS future_value + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 5: Calculate BMI + sql """ DROP FUNCTION IF EXISTS py_bmi(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_bmi(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.calculate_bmi", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_bmi """ + SELECT + id, + py_bmi(70.0, 1.75) AS bmi_normal, + py_bmi(90.0, 1.75) AS bmi_overweight + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 6: Fibonacci number + sql """ DROP FUNCTION IF EXISTS py_fibonacci(INT); """ + sql """ + CREATE FUNCTION py_fibonacci(INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.fibonacci", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_fibonacci """ + SELECT + id, + int_a, + py_fibonacci(int_a) AS fib_result + FROM scalar_module_test_table + WHERE int_a <= 20 + ORDER BY id; + """ + + // Test 7: Is prime number + sql """ DROP FUNCTION IF EXISTS py_is_prime(INT); """ + sql """ + CREATE FUNCTION py_is_prime(INT) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.is_prime", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_is_prime """ + SELECT + id, + int_a, int_b, int_c, + py_is_prime(int_a) AS a_is_prime, + py_is_prime(int_b) AS b_is_prime, + py_is_prime(int_c) AS c_is_prime + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 8: GCD (Greatest Common Divisor) + sql """ DROP FUNCTION IF EXISTS py_gcd(INT, INT); """ + sql """ + CREATE FUNCTION py_gcd(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.gcd", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_gcd """ + SELECT + id, + int_a, int_b, + py_gcd(int_a, int_b) AS gcd_result + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 9: LCM (Least Common Multiple) + sql """ DROP FUNCTION IF EXISTS py_lcm(INT, INT); """ + sql """ + CREATE FUNCTION py_lcm(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.lcm", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_lcm """ + SELECT + id, + int_a, int_b, + py_lcm(int_a, int_b) AS lcm_result + FROM scalar_module_test_table + ORDER BY id; + """ + + // ==================== String Operations Tests ==================== + + // Test 10: Reverse string + sql """ DROP FUNCTION IF EXISTS py_reverse(STRING); """ + sql """ + CREATE FUNCTION py_reverse(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.reverse_string", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_reverse """ + SELECT + id, + string_a, + py_reverse(string_a) AS reversed + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 11: Count vowels + sql """ DROP FUNCTION IF EXISTS py_count_vowels(STRING); """ + sql """ + CREATE FUNCTION py_count_vowels(STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", 
+ "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.count_vowels", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_count_vowels """ + SELECT + id, + string_a, + py_count_vowels(string_a) AS vowel_count + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 12: Count words + sql """ DROP FUNCTION IF EXISTS py_count_words(STRING); """ + sql """ + CREATE FUNCTION py_count_words(STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.count_words", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_count_words """ + SELECT + id, + string_a, + py_count_words(string_a) AS word_count + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 13: Capitalize words + sql """ DROP FUNCTION IF EXISTS py_capitalize(STRING); """ + sql """ + CREATE FUNCTION py_capitalize(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.capitalize_words", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_capitalize """ + SELECT + id, + string_a, + py_capitalize(string_a) AS capitalized + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 14: Is palindrome + sql """ DROP FUNCTION IF EXISTS py_is_palindrome(STRING); """ + sql """ + CREATE FUNCTION py_is_palindrome(STRING) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.is_palindrome", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_is_palindrome """ + SELECT + id, + string_a, + py_is_palindrome(string_a) AS is_palindrome + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 15: String similarity + sql """ DROP FUNCTION IF EXISTS py_similarity(STRING, STRING); """ + sql """ + CREATE FUNCTION py_similarity(STRING, STRING) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.string_similarity", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_similarity """ + SELECT + id, + string_a, + py_similarity(string_a, 'hello') AS similarity_to_hello + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 16: Mask email + sql """ DROP FUNCTION IF EXISTS py_mask_email(STRING); """ + sql """ + CREATE FUNCTION py_mask_email(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.mask_email", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_mask_email """ + SELECT + id, + string_b, + py_mask_email(string_b) AS masked_email + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 17: Extract domain from email + sql """ DROP FUNCTION IF EXISTS py_extract_domain(STRING); """ + sql """ + CREATE FUNCTION py_extract_domain(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.extract_domain", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_extract_domain """ + SELECT + id, + string_b, + py_extract_domain(string_b) AS domain + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 18: Levenshtein distance + sql """ DROP FUNCTION IF EXISTS py_levenshtein(STRING, STRING); """ + sql """ 
+ CREATE FUNCTION py_levenshtein(STRING, STRING) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.levenshtein_distance", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_levenshtein """ + SELECT + id, + string_a, + py_levenshtein(string_a, 'hello world') AS edit_distance + FROM scalar_module_test_table + ORDER BY id; + """ + + // ==================== Date/Time Operations Tests ==================== + + // Test 19: Days between dates + sql """ DROP FUNCTION IF EXISTS py_days_between(DATE, DATE); """ + sql """ + CREATE FUNCTION py_days_between(DATE, DATE) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.days_between_dates", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_days_between """ + SELECT + id, + date_a, date_b, + py_days_between(date_a, date_b) AS days_diff + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 20: Is weekend + sql """ DROP FUNCTION IF EXISTS py_is_weekend(DATE); """ + sql """ + CREATE FUNCTION py_is_weekend(DATE) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.is_weekend", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_is_weekend """ + SELECT + id, + date_a, + py_is_weekend(date_a) AS is_weekend + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 21: Get quarter + sql """ DROP FUNCTION IF EXISTS py_get_quarter(DATE); """ + sql """ + CREATE FUNCTION py_get_quarter(DATE) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.get_quarter", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_get_quarter """ + SELECT + id, + date_a, + py_get_quarter(date_a) AS quarter + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 22: Age in years + sql """ DROP FUNCTION IF EXISTS py_age(DATE, DATE); """ + sql """ + CREATE FUNCTION py_age(DATE, DATE) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.age_in_years", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_age """ + SELECT + id, + py_age('1990-01-01', date_a) AS age + FROM scalar_module_test_table + ORDER BY id; + """ + + // ==================== Boolean/Conditional Operations Tests ==================== + + // Test 23: Is in range + sql """ DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT); """ + sql """ + CREATE FUNCTION py_in_range(INT, INT, INT) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.is_in_range", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_in_range """ + SELECT + id, + int_a, + py_in_range(int_a, 10, 50) AS in_range_10_50 + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 24: XOR operation + sql """ DROP FUNCTION IF EXISTS py_xor(BOOLEAN, BOOLEAN); """ + sql """ + CREATE FUNCTION py_xor(BOOLEAN, BOOLEAN) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.xor_operation", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_xor """ + SELECT + id, + bool_a, bool_b, + 
py_xor(bool_a, bool_b) AS xor_result + FROM scalar_module_test_table + ORDER BY id; + """ + + // ==================== Complex/Mixed Operations Tests ==================== + + // Test 25: Calculate grade + sql """ DROP FUNCTION IF EXISTS py_grade(DOUBLE); """ + sql """ + CREATE FUNCTION py_grade(DOUBLE) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.calculate_grade", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_grade """ + SELECT + id, + double_a, + py_grade(double_a) AS grade + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 26: Categorize age + sql """ DROP FUNCTION IF EXISTS py_categorize_age(INT); """ + sql """ + CREATE FUNCTION py_categorize_age(INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.categorize_age", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_categorize_age """ + SELECT + id, + int_a, + py_categorize_age(int_a) AS age_category + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 27: Calculate tax + sql """ DROP FUNCTION IF EXISTS py_tax(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_tax(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.calculate_tax", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_tax """ + SELECT + id, + double_a, + py_tax(double_a, 15.0) AS tax_15_percent + FROM scalar_module_test_table + ORDER BY id; + """ + + // Test 28: Truncate string with suffix + sql """ DROP FUNCTION IF EXISTS py_truncate(STRING, INT, STRING); """ + sql """ + CREATE FUNCTION py_truncate(STRING, INT, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_scalar_ops.truncate_string", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_truncate """ + SELECT + id, + string_a, + py_truncate(string_a, 10, '...') AS truncated + FROM scalar_module_test_table + ORDER BY id; + """ + + // ==================== Edge Cases and NULL Handling Tests ==================== + + // Test 29: NULL handling in numeric operations + sql """ DROP TABLE IF EXISTS null_test_table; """ + sql """ + CREATE TABLE null_test_table ( + id INT, + val1 INT, + val2 INT, + val3 INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO null_test_table VALUES + (1, 10, 20, 30), + (2, NULL, 20, 30), + (3, 10, NULL, 30), + (4, 10, 20, NULL), + (5, NULL, NULL, NULL); + """ + + qt_null_handling """ + SELECT + id, + val1, val2, val3, + py_add_three(val1, val2, val3) AS sum_result + FROM null_test_table + ORDER BY id; + """ + + // Test 30: Empty string handling + sql """ DROP TABLE IF EXISTS string_edge_test; """ + sql """ + CREATE TABLE string_edge_test ( + id INT, + str_val STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO string_edge_test VALUES + (1, 'normal string'), + (2, ''), + (3, ' '), + (4, 'a'), + (5, NULL); + """ + + qt_string_edge """ + SELECT + id, + str_val, + py_reverse(str_val) AS reversed, + py_count_vowels(str_val) AS vowels, + py_count_words(str_val) AS words + FROM string_edge_test + ORDER BY id; + """ + + } 
finally { + // Cleanup all functions + try_sql("DROP FUNCTION IF EXISTS py_add_three(INT, INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_safe_div(DOUBLE, DOUBLE, INT);") + try_sql("DROP FUNCTION IF EXISTS py_discount(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_compound_interest(DOUBLE, DOUBLE, INT);") + try_sql("DROP FUNCTION IF EXISTS py_bmi(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_fibonacci(INT);") + try_sql("DROP FUNCTION IF EXISTS py_is_prime(INT);") + try_sql("DROP FUNCTION IF EXISTS py_gcd(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_lcm(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_reverse(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_count_vowels(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_count_words(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_capitalize(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_is_palindrome(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_similarity(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_mask_email(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_extract_domain(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_levenshtein(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_days_between(DATE, DATE);") + try_sql("DROP FUNCTION IF EXISTS py_is_weekend(DATE);") + try_sql("DROP FUNCTION IF EXISTS py_get_quarter(DATE);") + try_sql("DROP FUNCTION IF EXISTS py_age(DATE, DATE);") + try_sql("DROP FUNCTION IF EXISTS py_in_range(INT, INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_xor(BOOLEAN, BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS py_grade(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_categorize_age(INT);") + try_sql("DROP FUNCTION IF EXISTS py_tax(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_truncate(STRING, INT, STRING);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS scalar_module_test_table;") + try_sql("DROP TABLE IF EXISTS null_test_table;") + try_sql("DROP TABLE IF EXISTS string_edge_test;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_module_vector.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_module_vector.groovy new file mode 100644 index 00000000000000..01a3b81912bfaa --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_module_vector.groovy @@ -0,0 +1,429 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
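The vector suite that follows loads symbols such as python_udf_vector_ops.vec_safe_divide from python_udf_vector_ops.zip which, per the suite comment, operate on whole pandas.Series columns rather than scalar rows. The shipped module is not part of this diff; a hedged sketch of two of the referenced functions (the rounding precision and the zero-divisor masking are assumptions inferred from the function names):

import pandas as pd

def vec_multiply_and_round(a: pd.Series, b: pd.Series) -> pd.Series:
    # Element-wise multiply, then round; two decimals is an assumed precision.
    return (a * b).round(2)

def vec_safe_divide(a: pd.Series, b: pd.Series) -> pd.Series:
    # Mask zero divisors to null instead of producing inf;
    # pandas propagates existing nulls element-wise on its own.
    return (a / b).where(b != 0)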
+ +suite("test_pythonudf_module_vector") { + // Test vectorized Python UDF using module mode with pandas.Series + + def pyPath = """${context.file.parent}/udf_scripts/python_udf_vector_ops.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + + log.info("Python module path: ${pyPath}".toString()) + + try { + // Create test table + sql """ DROP TABLE IF EXISTS vector_module_test_table; """ + sql """ + CREATE TABLE vector_module_test_table ( + id INT, + int_a INT, + int_b INT, + double_a DOUBLE, + double_b DOUBLE, + string_a STRING, + string_b STRING, + bool_a BOOLEAN, + bool_b BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO vector_module_test_table VALUES + (1, 10, 20, 1.5, 2.5, 'hello world', 'python udf', true, true), + (2, 30, 15, 3.5, 4.5, 'foo bar', 'test case', false, true), + (3, 50, 50, 5.5, 2.0, 'data science', 'machine learning', true, false), + (4, 5, 25, 7.5, 1.5, 'apache doris', 'database system', false, false), + (5, 100, 10, 9.5, 3.5, 'vector operations', 'pandas series', true, true); + """ + + // Test 1: Vector addition with constant + sql """ DROP FUNCTION IF EXISTS py_vec_add_const(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_add_const(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_add_with_constant", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_add_const """ + SELECT + id, + int_a, + int_b, + py_vec_add_const(int_a, int_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 2: Vector multiplication and rounding + sql """ DROP FUNCTION IF EXISTS py_vec_multiply_round(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_multiply_round(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_multiply_and_round", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_multiply_round """ + SELECT + id, + double_a, + double_b, + py_vec_multiply_round(double_a, double_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 3: Vector string concatenation with separator + sql """ DROP FUNCTION IF EXISTS py_vec_concat_sep(STRING, STRING); """ + sql """ + CREATE FUNCTION py_vec_concat_sep(STRING, STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_string_concat_with_separator", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_concat_sep """ + SELECT + id, + string_a, + string_b, + py_vec_concat_sep(string_a, string_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 4: Vector string title case + sql """ DROP FUNCTION IF EXISTS py_vec_title_case(STRING); """ + sql """ + CREATE FUNCTION py_vec_title_case(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_string_title_case", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_title_case """ + SELECT + id, + string_a, + py_vec_title_case(string_a) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 5: Vector conditional value (max of two values) + sql """ DROP FUNCTION IF EXISTS py_vec_conditional(INT, 
INT); """ + sql """ + CREATE FUNCTION py_vec_conditional(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_conditional_value", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_conditional """ + SELECT + id, + int_a, + int_b, + py_vec_conditional(int_a, int_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 6: Vector percentage calculation + sql """ DROP FUNCTION IF EXISTS py_vec_percentage(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_percentage(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_percentage_calculation", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_percentage """ + SELECT + id, + double_a, + double_b, + py_vec_percentage(double_a, double_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 7: Vector range check + sql """ DROP FUNCTION IF EXISTS py_vec_in_range(INT, INT, INT); """ + sql """ + CREATE FUNCTION py_vec_in_range(INT, INT, INT) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_is_in_range", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_in_range """ + SELECT + id, + int_a, + py_vec_in_range(int_a, 10, 50) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 8: Vector safe division + sql """ DROP FUNCTION IF EXISTS py_vec_safe_div(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_safe_div(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_safe_divide", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_safe_div """ + SELECT + id, + double_a, + double_b, + py_vec_safe_div(double_a, double_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 9: Vector exponential decay + sql """ DROP FUNCTION IF EXISTS py_vec_exp_decay(DOUBLE, INT); """ + sql """ + CREATE FUNCTION py_vec_exp_decay(DOUBLE, INT) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_exponential_decay", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_exp_decay """ + SELECT + id, + double_a, + int_a, + py_vec_exp_decay(double_a, int_a) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 10: Vector string extract first word + sql """ DROP FUNCTION IF EXISTS py_vec_first_word(STRING); """ + sql """ + CREATE FUNCTION py_vec_first_word(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_string_extract_first_word", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_first_word """ + SELECT + id, + string_a, + py_vec_first_word(string_a) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 11: Vector absolute difference + sql """ DROP FUNCTION IF EXISTS py_vec_abs_diff(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_abs_diff(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_abs_difference", + 
"runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_abs_diff """ + SELECT + id, + int_a, + int_b, + py_vec_abs_diff(int_a, int_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 12: Vector power operation + sql """ DROP FUNCTION IF EXISTS py_vec_power(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_power(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_power", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_power """ + SELECT + id, + double_a, + py_vec_power(double_a, 2.0) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 13: Vector boolean AND operation + sql """ DROP FUNCTION IF EXISTS py_vec_bool_and(BOOLEAN, BOOLEAN); """ + sql """ + CREATE FUNCTION py_vec_bool_and(BOOLEAN, BOOLEAN) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_boolean_and", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_bool_and """ + SELECT + id, + bool_a, + bool_b, + py_vec_bool_and(bool_a, bool_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 14: Vector boolean OR operation + sql """ DROP FUNCTION IF EXISTS py_vec_bool_or(BOOLEAN, BOOLEAN); """ + sql """ + CREATE FUNCTION py_vec_bool_or(BOOLEAN, BOOLEAN) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_boolean_or", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_bool_or """ + SELECT + id, + bool_a, + bool_b, + py_vec_bool_or(bool_a, bool_b) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + // Test 15: Vector clip values + sql """ DROP FUNCTION IF EXISTS py_vec_clip(INT, INT, INT); """ + sql """ + CREATE FUNCTION py_vec_clip(INT, INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "file" = "file://${pyPath}", + "symbol" = "python_udf_vector_ops.vec_clip_values", + "runtime_version" = "${runtime_version}", + "always_nullable" = "true" + ); + """ + + qt_vec_clip """ + SELECT + id, + int_a, + py_vec_clip(int_a, 20, 60) AS result + FROM vector_module_test_table + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_vec_add_const(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_multiply_round(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_vec_concat_sep(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_title_case(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_conditional(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_percentage(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_vec_in_range(INT, INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_safe_div(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_vec_exp_decay(DOUBLE, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_first_word(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_vec_abs_diff(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_vec_power(DOUBLE, DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS py_vec_bool_and(BOOLEAN, BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS py_vec_bool_or(BOOLEAN, BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS py_vec_clip(INT, INT, INT);") + try_sql("DROP TABLE IF EXISTS vector_module_test_table;") + } +} diff --git 
a/regression-test/suites/pythonudf_p0/test_pythonudf_multiline_inline.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_multiline_inline.groovy new file mode 100644 index 00000000000000..ecf8f0b2bccd3e --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_multiline_inline.groovy @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_multiline_inline") { + // Test complex multi-line inline Python code + + def runtime_version = "3.8.10" + try { + // Test 1: Inline code with helper functions + sql """ DROP FUNCTION IF EXISTS py_complex_calculation(INT, INT); """ + sql """ + CREATE FUNCTION py_complex_calculation(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def helper_function(x): + return x * x + +def evaluate(a, b): + if a is None or b is None: + return None + result = helper_function(a) + helper_function(b) + return result +\$\$; + """ + + qt_select_complex_calc """ SELECT py_complex_calculation(3, 4) AS result; """ + + // Test 2: Complex function with conditional logic + sql """ DROP FUNCTION IF EXISTS py_business_logic(STRING, DOUBLE, INT); """ + sql """ + CREATE FUNCTION py_business_logic(STRING, DOUBLE, INT) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(customer_type, amount, quantity): + if customer_type is None or amount is None or quantity is None: + return 'INVALID' + + # Calculate discount + discount = 0 + if customer_type == 'VIP': + discount = 0.2 + elif customer_type == 'PREMIUM': + discount = 0.15 + elif customer_type == 'REGULAR': + discount = 0.1 + else: + discount = 0 + + # Bulk discount + if quantity >= 100: + discount += 0.05 + elif quantity >= 50: + discount += 0.03 + + # Calculate final price + final_amount = amount * (1 - discount) + + # Return result + if final_amount > 10000: + return f'HIGH:{final_amount:.2f}' + elif final_amount > 1000: + return f'MEDIUM:{final_amount:.2f}' + else: + return f'LOW:{final_amount:.2f}' +\$\$; + """ + + qt_select_business_logic_vip """ SELECT py_business_logic('VIP', 5000.0, 120) AS result; """ + qt_select_business_logic_regular """ SELECT py_business_logic('REGULAR', 2000.0, 30) AS result; """ + + // Test 3: Complex string processing logic + sql """ DROP FUNCTION IF EXISTS py_text_analyzer(STRING); """ + sql """ + CREATE FUNCTION py_text_analyzer(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(text): + if text is None: + return None + + # Collect statistics + length = len(text) + words = text.split() + 
word_count = len(words) + + # Count character types + upper_count = sum(1 for c in text if c.isupper()) + lower_count = sum(1 for c in text if c.islower()) + digit_count = sum(1 for c in text if c.isdigit()) + + # Build result + result = f"len:{length},words:{word_count},upper:{upper_count},lower:{lower_count},digits:{digit_count}" + return result +\$\$; + """ + + qt_select_text_analyzer """ SELECT py_text_analyzer('Hello World 123') AS result; """ + + // Test 4: Complex mathematical calculation function + sql """ DROP FUNCTION IF EXISTS py_statistics(DOUBLE, DOUBLE, DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_statistics(DOUBLE, DOUBLE, DOUBLE, DOUBLE) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "evaluate", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def evaluate(v1, v2, v3, v4): + if any(x is None for x in [v1, v2, v3, v4]): + return None + + values = [v1, v2, v3, v4] + + # Calculate statistics + total = sum(values) + count = len(values) + mean = total / count + + # Calculate variance + variance = sum((x - mean) ** 2 for x in values) / count + + # Calculate standard deviation + import math + std_dev = math.sqrt(variance) + + # Find max and min values + max_val = max(values) + min_val = min(values) + + result = f"mean:{mean:.2f},std:{std_dev:.2f},max:{max_val:.2f},min:{min_val:.2f}" + return result +\$\$; + """ + + qt_select_statistics """ SELECT py_statistics(10.0, 20.0, 30.0, 40.0) AS result; """ + + // Test 5: Test complex inline code on table data + sql """ DROP TABLE IF EXISTS multiline_test_table; """ + sql """ + CREATE TABLE multiline_test_table ( + id INT, + customer_type STRING, + amount DOUBLE, + quantity INT, + description STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO multiline_test_table VALUES + (1, 'VIP', 15000.0, 150, 'Premium customer order'), + (2, 'PREMIUM', 8000.0, 80, 'Good customer'), + (3, 'REGULAR', 3000.0, 40, 'Regular order'), + (4, 'VIP', 500.0, 10, 'Small VIP order'), + (5, 'REGULAR', 12000.0, 200, 'Large regular order'); + """ + + qt_select_table_multiline """ + SELECT + id, + customer_type, + amount, + quantity, + py_business_logic(customer_type, amount, quantity) AS pricing_result, + py_text_analyzer(description) AS text_analysis + FROM multiline_test_table + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS py_complex_calculation(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS py_business_logic(STRING, DOUBLE, INT);") + try_sql("DROP FUNCTION IF EXISTS py_text_analyzer(STRING);") + try_sql("DROP FUNCTION IF EXISTS py_statistics(DOUBLE, DOUBLE, DOUBLE, DOUBLE);") + try_sql("DROP TABLE IF EXISTS multiline_test_table;") + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_ret_map.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_ret_map.groovy new file mode 100644 index 00000000000000..f31f763616b0dc --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_ret_map.groovy @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_ret_map") {
+    def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip"""
+    scp_udf_file_to_all_be(pyPath)
+    def runtime_version = "3.8.10"
+    log.info("Python Zip path: ${pyPath}".toString())
+    try {
+        try_sql("DROP FUNCTION IF EXISTS retii(map<int,int>);")
+        try_sql("DROP FUNCTION IF EXISTS retss(map<string,string>);")
+        try_sql("DROP FUNCTION IF EXISTS retid(map<int,double>);")
+        try_sql("DROP FUNCTION IF EXISTS retidss(int, double);")
+        try_sql("DROP TABLE IF EXISTS db")
+        try_sql("DROP TABLE IF EXISTS dbss")
+        sql """
+        CREATE TABLE IF NOT EXISTS db(
+            `id` INT NULL COMMENT "",
+            `i` INT NULL COMMENT "",
+            `d` Double NULL COMMENT "",
+            `mii` Map<INT,INT> NULL COMMENT "",
+            `mid` Map<INT,DOUBLE> NULL COMMENT ""
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY HASH(`id`) BUCKETS 1
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "storage_format" = "V2");
+        """
+        sql """ INSERT INTO db VALUES(1, 10,1.1,{1:1,10:1,100:1},{1:1.1,11:11.1}); """
+        sql """ INSERT INTO db VALUES(2, 20,2.2,{2:2,20:2,200:2},{2:2.2,22:22.2}); """
+
+        sql """
+        CREATE TABLE IF NOT EXISTS dbss(
+            `id` INT NULL COMMENT "",
+            `m` Map<STRING,STRING> NULL COMMENT ""
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY HASH(`id`) BUCKETS 1
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "storage_format" = "V2");
+        """
+
+        sql """ INSERT INTO dbss VALUES(1,{"abc":"efg","h":"i"}); """
+        sql """ INSERT INTO dbss VALUES(2,{"j":"k"}); """
+
+        sql """
+        CREATE FUNCTION retii(map<int,int>) RETURNS map<int,int> PROPERTIES (
+            "file"="file://${pyPath}",
+            "symbol"="map_ret_int_int_test.evaluate",
+            "type"="PYTHON_UDF",
+            "always_nullable" = "true",
+            "runtime_version" = "${runtime_version}"
+        );
+        """
+
+        sql """
+        CREATE FUNCTION retss(map<string,string>) RETURNS map<string,string> PROPERTIES (
+            "file"="file://${pyPath}",
+            "symbol"="map_ret_string_string_test.evaluate",
+            "type"="PYTHON_UDF",
+            "always_nullable" = "true",
+            "runtime_version" = "${runtime_version}"
+        );
+        """
+
+        sql """
+        CREATE FUNCTION retid(map<int,double>) RETURNS map<int,double> PROPERTIES (
+            "file"="file://${pyPath}",
+            "symbol"="map_ret_int_double_test.evaluate",
+            "type"="PYTHON_UDF",
+            "always_nullable" = "true",
+            "runtime_version" = "${runtime_version}"
+        );
+        """
+
+        sql """
+        CREATE FUNCTION retidss(int, double) RETURNS map<string,string> PROPERTIES (
+            "file"="file://${pyPath}",
+            "symbol"="map_int_double_ret_string_string_test.evaluate",
+            "type"="PYTHON_UDF",
+            "always_nullable" = "true",
+            "runtime_version" = "${runtime_version}"
+        );
+        """
+
+        qt_select_1 """ select mid, retid(mid) from db order by id; """
+
+        qt_select_2 """ select mii, retii(mii) from db order by id; """
+
+        qt_select_3 """ select i, d, retidss(i, d) from db order by id; """
+
+        qt_select_4 """ select m, retss(m) from dbss order by id; """
+    } finally {
+        try_sql("DROP FUNCTION IF EXISTS retii(map<int,int>);")
+        try_sql("DROP FUNCTION IF EXISTS retss(map<string,string>);")
+        try_sql("DROP FUNCTION IF EXISTS retid(map<int,double>);")
+        try_sql("DROP FUNCTION IF EXISTS retidss(int, double);")
+        try_sql("DROP TABLE IF EXISTS db")
+        try_sql("DROP TABLE IF EXISTS dbss")
+    }
+}
diff --git
a/regression-test/suites/pythonudf_p0/test_pythonudf_schema_check.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_schema_check.groovy new file mode 100644 index 00000000000000..2309574fae8023 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_schema_check.groovy @@ -0,0 +1,544 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudf_schema_check") { + // Test type compatibility in Python UDF + // Users can specify compatible types instead of exact matching types + // For example: TINYINT can be used where INT is expected + + def runtime_version = "3.8.10" + + try { + // Create test table with various integer types + sql """ DROP TABLE IF EXISTS test_type_compat_table; """ + sql """ + CREATE TABLE test_type_compat_table ( + id INT, + tiny_val TINYINT, + small_val SMALLINT, + int_val INT, + big_val BIGINT, + float_val FLOAT, + double_val DOUBLE, + str_val STRING, + bool_val BOOLEAN, + date_val DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + // Insert test data + sql """ + INSERT INTO test_type_compat_table VALUES + (1, 10, 100, 1000, 10000, 1.5, 10.5, 'test1', true, '2024-01-01'), + (2, 20, 200, 2000, 20000, 2.5, 20.5, 'test2', false, '2024-01-02'), + (3, 30, 300, 3000, 30000, 3.5, 30.5, 'test3', true, '2024-01-03'), + (4, 40, 400, 4000, 40000, 4.5, 40.5, 'test4', false, '2024-01-04'), + (5, 50, 500, 5000, 50000, 5.5, 50.5, 'test5', true, '2024-01-05'); + """ + + // ==================== Test 1: Integer Type Promotion (TINYINT -> INT) ==================== + log.info("=== Test 1: TINYINT can be used where INT is expected ===") + + sql """ DROP FUNCTION IF EXISTS py_add_int_sc(INT, INT); """ + sql """ + CREATE FUNCTION py_add_int_sc(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_add_int_sc", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_add_int_sc(a, b): + if a is None or b is None: + return None + return a + b +\$\$; + """ + + // Pass TINYINT where INT is expected + qt_select_1 """ + SELECT + id, + tiny_val, + int_val, + py_add_int_sc(tiny_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 2: Integer Type Promotion (SMALLINT -> INT) ==================== + log.info("=== Test 2: SMALLINT can be used where INT is expected ===") + + qt_select_2 """ + SELECT + id, + small_val, + int_val, + py_add_int_sc(small_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 3: Integer Type Promotion (INT -> BIGINT) ==================== + log.info("=== Test 3: INT can be used where BIGINT is expected ===") + + sql """ DROP FUNCTION IF EXISTS py_add_bigint(BIGINT, BIGINT); """ 
+ sql """ + CREATE FUNCTION py_add_bigint(BIGINT, BIGINT) + RETURNS BIGINT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_add_bigint", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_add_bigint(a, b): + return a + b +\$\$; + """ + + qt_select_3 """ + SELECT + id, + int_val, + big_val, + py_add_bigint(int_val, big_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 4: Float Type Promotion (FLOAT -> DOUBLE) ==================== + log.info("=== Test 4: FLOAT can be used where DOUBLE is expected ===") + + sql """ DROP FUNCTION IF EXISTS py_add_double(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_add_double(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_add_double", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_add_double(a, b): + return a + b +\$\$; + """ + + qt_select_4 """ + SELECT + id, + float_val, + double_val, + py_add_double(float_val, double_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 5: Mixed Integer Types ==================== + log.info("=== Test 5: Mixed integer types (TINYINT, SMALLINT, INT) ===") + + sql """ DROP FUNCTION IF EXISTS py_sum_three(INT, INT, INT); """ + sql """ + CREATE FUNCTION py_sum_three(INT, INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_sum_three", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_sum_three(a, b, c): + return a + b + c +\$\$; + """ + + qt_select_5 """ + SELECT + id, + tiny_val, + small_val, + int_val, + py_sum_three(tiny_val, small_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 6: Vectorized UDF with Type Promotion ==================== + log.info("=== Test 6: Vectorized UDF with integer type promotion ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_multiply(INT, INT); """ + sql """ + CREATE FUNCTION py_vec_multiply(INT, INT) + RETURNS INT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_multiply", + "runtime_version" = "${runtime_version}", + "vectorized" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_multiply(a: pd.Series, b: pd.Series) -> pd.Series: + return a * b +\$\$; + """ + + // Use TINYINT and SMALLINT where INT is expected + qt_select_6 """ + SELECT + id, + tiny_val, + small_val, + py_vec_multiply(tiny_val, small_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 7: Vectorized UDF with Float Promotion ==================== + log.info("=== Test 7: Vectorized UDF with float type promotion ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_divide(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_divide(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_divide", + "runtime_version" = "${runtime_version}", + "vectorized" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_divide(a: pd.Series, b: pd.Series) -> pd.Series: + return a / b +\$\$; + """ + + // Use FLOAT where DOUBLE is expected + qt_select_7 """ + SELECT + id, + float_val, + double_val, + py_vec_divide(double_val, float_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 8: Mixed Types in Vectorized UDF ==================== + log.info("=== Test 8: Mixed integer and float types in vectorized UDF ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_calc(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION 
py_vec_calc(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_calc", + "runtime_version" = "${runtime_version}", + "vectorized" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_calc(a: pd.Series, b: pd.Series) -> pd.Series: + return a * 2.0 + b +\$\$; + """ + + // Use INT and FLOAT where DOUBLE is expected + qt_select_8 """ + SELECT + id, + int_val, + float_val, + py_vec_calc(int_val, float_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 9: String Type Compatibility ==================== + log.info("=== Test 9: String type compatibility ===") + + sql """ DROP FUNCTION IF EXISTS py_string_upper(STRING); """ + sql """ + CREATE FUNCTION py_string_upper(STRING) + RETURNS STRING + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_string_upper", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_string_upper(s): + return s.upper() if s else None +\$\$; + """ + + qt_select_9 """ + SELECT + id, + str_val, + py_string_upper(str_val) AS upper_str + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 10: Boolean Type ==================== + log.info("=== Test 10: Boolean type compatibility ===") + + sql """ DROP FUNCTION IF EXISTS py_bool_not(BOOLEAN); """ + sql """ + CREATE FUNCTION py_bool_not(BOOLEAN) + RETURNS BOOLEAN + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_bool_not", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_bool_not(b): + return not b if b is not None else None +\$\$; + """ + + qt_select_10 """ + SELECT + id, + bool_val, + py_bool_not(bool_val) AS negated + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 11: Complex Type Promotion Chain ==================== + log.info("=== Test 11: Complex type promotion chain (TINYINT -> BIGINT) ===") + + qt_select_11 """ + SELECT + id, + tiny_val, + big_val, + py_add_bigint(tiny_val, big_val) AS result + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 12: Vectorized with Mixed Scalar and Series ==================== + log.info("=== Test 12: Vectorized UDF with type promotion and mixed params ===") + + sql """ DROP FUNCTION IF EXISTS py_vec_scale(DOUBLE, DOUBLE); """ + sql """ + CREATE FUNCTION py_vec_scale(DOUBLE, DOUBLE) + RETURNS DOUBLE + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_vec_scale", + "runtime_version" = "${runtime_version}", + "vectorized" = "true" + ) + AS \$\$ +import pandas as pd + +def py_vec_scale(values: pd.Series, factor: float) -> pd.Series: + return values * factor +\$\$; + """ + + // Use INT (promoted to DOUBLE) with scalar FLOAT + qt_select_12 """ + SELECT + id, + int_val, + py_vec_scale(int_val, 1.5) AS scaled + FROM test_type_compat_table + ORDER BY id; + """ + + // ==================== Test 13: Type Incompatibility - STRING to INT ==================== + log.info("=== Test 13: Type incompatibility - STRING cannot be used where INT is expected ===") + + qt_select_13 """ + SELECT + id, + str_val, + py_add_int_sc(str_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 14: Type Incompatibility - BIGINT to INT ==================== + log.info("=== Test 14: Type incompatibility - BIGINT cannot be downcast to INT ===") + + qt_select_14 """ + SELECT + id, + big_val, + py_add_int_sc(big_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 15: Type 
Incompatibility - DOUBLE to FLOAT ==================== + log.info("=== Test 15: Type incompatibility - DOUBLE cannot be downcast to FLOAT ===") + + sql """ DROP FUNCTION IF EXISTS py_add_float(FLOAT, FLOAT); """ + sql """ + CREATE FUNCTION py_add_float(FLOAT, FLOAT) + RETURNS FLOAT + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "py_add_float", + "runtime_version" = "${runtime_version}" + ) + AS \$\$ +def py_add_float(a, b): + return a + b +\$\$; + """ + + qt_select_15 """ + SELECT + id, + double_val, + py_add_float(double_val, float_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 16: Type Incompatibility - BOOLEAN to INT ==================== + log.info("=== Test 16: Type incompatibility - BOOLEAN cannot be used where INT is expected ===") + + qt_select_16 """ + SELECT + id, + bool_val, + py_add_int_sc(bool_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 17: Type Incompatibility - DATE to STRING ==================== + log.info("=== Test 17: Type incompatibility - DATE cannot be directly used where STRING is expected ===") + + qt_select_17 """ + SELECT + id, + date_val, + py_string_upper(date_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 18: Type Incompatibility - INT to BOOLEAN ==================== + log.info("=== Test 18: Type incompatibility - INT cannot be used where BOOLEAN is expected ===") + + qt_select_18 """ + SELECT + id, + int_val, + py_bool_not(int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 19: Type Incompatibility in Vectorized UDF - STRING to INT ==================== + log.info("=== Test 19: Type incompatibility in vectorized UDF - STRING to INT ===") + + qt_select_19 """ + SELECT + id, + str_val, + py_vec_multiply(str_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 20: Type Incompatibility - Mixed incompatible types ==================== + log.info("=== Test 20: Type incompatibility - Mixed incompatible types ===") + + qt_select_20 """ + SELECT + id, + str_val, + bool_val, + py_add_int_sc(str_val, bool_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + // ==================== Test 21: Wrong number of arguments ==================== + log.info("=== Test 21: Wrong number of arguments ===") + + test { + sql """ + SELECT + id, + py_add_int_sc(int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + exception "Can not found function 'py_add_int_sc' which has 1 arity. 
Candidate functions are: [py_add_int_sc(INT, INT)]" + } + + // ==================== Test 22: Type Incompatibility - FLOAT to INT ==================== + log.info("=== Test 22: Type incompatibility - FLOAT cannot be used where INT is expected ===") + + qt_select_22 """ + SELECT + id, + float_val, + py_add_int_sc(float_val, int_val) AS result + FROM test_type_compat_table + ORDER BY id + LIMIT 1; + """ + + log.info("All type compatibility tests (including negative tests) passed!") + + } finally { + // Cleanup + sql """ DROP FUNCTION IF EXISTS py_add_int_sc(INT, INT); """ + sql """ DROP FUNCTION IF EXISTS py_add_bigint(BIGINT, BIGINT); """ + sql """ DROP FUNCTION IF EXISTS py_add_double(DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_add_float(FLOAT, FLOAT); """ + sql """ DROP FUNCTION IF EXISTS py_sum_three(INT, INT, INT); """ + sql """ DROP FUNCTION IF EXISTS py_vec_multiply(INT, INT); """ + sql """ DROP FUNCTION IF EXISTS py_vec_divide(DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_vec_calc(DOUBLE, DOUBLE); """ + sql """ DROP FUNCTION IF EXISTS py_string_upper(STRING); """ + sql """ DROP FUNCTION IF EXISTS py_bool_not(BOOLEAN); """ + sql """ DROP FUNCTION IF EXISTS py_vec_scale(DOUBLE, DOUBLE); """ + sql """ DROP TABLE IF EXISTS test_type_compat_table; """ + } +} diff --git a/regression-test/suites/pythonudf_p0/test_pythonudf_string.groovy b/regression-test/suites/pythonudf_p0/test_pythonudf_string.groovy new file mode 100644 index 00000000000000..7c382f19fa5242 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/test_pythonudf_string.groovy @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
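The string suite that follows creates python_udf_string_test(string, int, int) from string_test.evaluate inside pyudf.zip and calls it as python_udf_string_test(col, 2, 3). The script itself is not shown in this diff; given the signature, a purely hypothetical reconstruction is a substring-style helper:

def evaluate(s, start, length):
    # Hypothetical body: take 'length' characters starting at offset 'start'.
    if s is None or start is None or length is None:
        return None
    return s[start:start + length]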
+ +suite("test_pythonudf_string") { + def tableName = "test_pythonudf_string" + def pyPath = """${context.file.parent}/udf_scripts/pyudf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python Zip path: ${pyPath}".toString()) + try { + sql """ DROP TABLE IF EXISTS test_pythonudf_string """ + sql """ DROP TABLE IF EXISTS test_pythonudf_string_2 """ + sql """ + CREATE TABLE IF NOT EXISTS test_pythonudf_string ( + `user_id` INT NOT NULL COMMENT "用户id", + `char_col` CHAR NOT NULL COMMENT "", + `varchar_col` VARCHAR(10) NOT NULL COMMENT "", + `string_col` STRING NOT NULL COMMENT "" + ) + DISTRIBUTED BY HASH(user_id) PROPERTIES("replication_num" = "1"); + """ + StringBuilder sb = new StringBuilder() + int i = 1 + for (; i < 9; i ++) { + sb.append(""" + (${i}, '${i}','abcdefg${i}','poiuytre${i}abcdefg'), + """) + } + sb.append(""" + (${i}, '${i}','abcdefg${i}','poiuytre${i}abcdefg') + """) + sql """ INSERT INTO test_pythonudf_string VALUES + ${sb.toString()} + """ + sql """ create table test_pythonudf_string_2 like test_pythonudf_string """ + sql """ insert into test_pythonudf_string_2 select * from test_pythonudf_string; """ + qt_select_default """ SELECT * FROM test_pythonudf_string t ORDER BY user_id; """ + qt_select_default_2 """ SELECT * FROM test_pythonudf_string_2 t ORDER BY user_id; """ + + File path = new File(pyPath) + if (!path.exists()) { + throw new IllegalStateException("""${pyPath} doesn't exist! """) + } + + sql """ CREATE FUNCTION python_udf_string_test(string, int, int) RETURNS string PROPERTIES ( + "file"="file://${pyPath}", + "symbol"="string_test.evaluate", + "type"="PYTHON_UDF", + "always_nullable" = "true", + "runtime_version" = "${runtime_version}" + ); """ + + qt_select """ SELECT python_udf_string_test(varchar_col, 2, 3) result FROM test_pythonudf_string ORDER BY result; """ + qt_select """ SELECT python_udf_string_test(string_col, 2, 3) result FROM test_pythonudf_string ORDER BY result; """ + qt_select """ SELECT python_udf_string_test('abcdef', 2, 3), python_udf_string_test('abcdefg', 2, 3) result FROM test_pythonudf_string ORDER BY result; """ + + qt_select_4 """ + SELECT + COALESCE( + python_udf_string_test(test_pythonudf_string.varchar_col, 2, 3), + 'not1' + ), + COALESCE( + python_udf_string_test(test_pythonudf_string.varchar_col, 2, 3), + 'not2' + ) + FROM + test_pythonudf_string + JOIN test_pythonudf_string_2 ON test_pythonudf_string.user_id = test_pythonudf_string_2.user_id order by 1,2; + """ + } finally { + try_sql("DROP FUNCTION IF EXISTS python_udf_string_test(string, int, int);") + try_sql("DROP TABLE IF EXISTS test_pythonudf_string") + try_sql("DROP TABLE IF EXISTS test_pythonudf_string_2") + } +} diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py new file mode 100644 index 00000000000000..ef3020985d4a4d --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_int_test.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(res): + value = 0 + for data in res: + if data is not None: + value += data + return value \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py new file mode 100644 index 00000000000000..7781d788f0794c --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_int_test.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(res): + value = 0 + for data in res: + if data is not None: + value += data + result = [] + result.append(value) + return result \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py new file mode 100644 index 00000000000000..92864bc800cb1f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_return_array_string_test.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +def evaluate(res): + value = "" + for data in res: + if data is not None: + value += data + result = [] + result.append(value) + return result \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py new file mode 100644 index 00000000000000..ede02c1201e713 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/array_string_test.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(res): + value = "" + for data in res: + if data is not None: + value += data + return value \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/assert_equal_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/assert_equal_test.py new file mode 100644 index 00000000000000..43501d1041a2fc --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/assert_equal_test.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(val1, val2): + if val1 != val2: + raise RuntimeError("Assertion Not Met :: ! ( " + str(val1) + " == " + str(val2) + " )") + else: + return str(val1) + " == " + str(val2) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/assert_lessthan_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/assert_lessthan_test.py new file mode 100644 index 00000000000000..b4ca8ff944c159 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/assert_lessthan_test.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(smaller, bigger): + if smaller is None or bigger is None: + raise RuntimeError("Null values found :: " + str(smaller) + " < " + str(bigger)) + if not (smaller < bigger): + raise RuntimeError("Assertion Not Met :: ! ( " + str(smaller) + " < " + str(bigger) + " )") + else: + return str(smaller) + " < " + str(bigger) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/boolean_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/boolean_test.py new file mode 100644 index 00000000000000..b6443e33e2923d --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/boolean_test.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(arg): + if arg is True: + return False + else: + return True \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/double_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/double_test.py new file mode 100644 index 00000000000000..8667bc07e25192 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/double_test.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
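+# The scalar scripts under udf_scripts/ are presumably what gets packaged
+# into pyudf.zip; Doris resolves the entry point via the "symbol" property
+# as "<module>.<function>" (e.g. "string_test.evaluate" in
+# test_pythonudf_string.groovy). A minimal local sanity check for the
+# function below -- illustrative only, not part of the suite:
+#
+#     import double_test
+#     assert double_test.evaluate(1.5, 2.5) == 4.0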
+ + +def evaluate(arg1, arg2): + return arg1 + arg2 \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py new file mode 100644 index 00000000000000..3b2d726ff406f7 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/float_test.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(arg1, arg2): + return arg1 - arg2 \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py new file mode 100644 index 00000000000000..b96f6b0d4029ef --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/int_test.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(arg): + return int(arg + 1) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_int_double_ret_string_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_double_ret_string_string_test.py new file mode 100644 index 00000000000000..f8be7d9e9d379f --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_double_ret_string_string_test.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(i, d): + ans = {} + ans["114" + str(i)] = "514" + str(d) + return ans \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_int_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_int_test.py new file mode 100644 index 00000000000000..87e27ecdd408a9 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_int_int_test.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(hashMap): + mul = 0 + for key, value in hashMap.items(): + mul += key * value + return mul \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_double_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_double_test.py new file mode 100644 index 00000000000000..3fc0028c473781 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_double_test.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(mid): + ans = {} + for key, value in mid.items(): + ans[key * 10] = value * 10 + return ans \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_int_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_int_test.py new file mode 100644 index 00000000000000..5e57f3d57d17cb --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_int_int_test.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(mii): + ans = {} + for key, value in mii.items(): + ans[key * 10] = value * 10 + return ans \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_string_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_string_string_test.py new file mode 100644 index 00000000000000..b6eb3a34dcfbba --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_ret_string_string_test.py @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(mp): + ans = {} + for key, value in mp.items(): + ans[key + "114"] = value + "514" + return ans \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/map_string_string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/map_string_string_test.py new file mode 100644 index 00000000000000..2121c9239c6baf --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/map_string_string_test.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
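+# MAP arguments arrive as plain Python dicts (see the comment in
+# python_udf_map_type.py). Worked example for the evaluate below:
+# {"a": "1", "b": "2"} -> sortSet {"a1", "b2"} -> returns "a1b2".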
+ + +def evaluate(hashMap): + sb = [] + sortSet = set() + + for key, value in hashMap.items(): + sortSet.add(key + value) + + for item in sorted(sortSet): + sb.append(item) + + ans = ''.join(sb) + return ans \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_array_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_array_type.py new file mode 100644 index 00000000000000..6d8af80fba1e5d --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_array_type.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def array_to_csv_impl(int_arr, str_arr, nested_arr): + def safe_str(x): + return 'NULL' if x is None else str(x) + + def format_array(arr): + if arr is None: + return 'NULL' + return '[' + ','.join(safe_str(item) for item in arr) + ']' + + def format_nested_array(arr): + if arr is None: + return 'NULL' + return '[' + ','.join(format_array(inner) for inner in arr) + ']' + + parts = [ + format_array(int_arr), + format_array(str_arr), + format_nested_array(nested_arr) + ] + return '|'.join(parts) diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_data_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_data_type.py new file mode 100644 index 00000000000000..4786e9780c276c --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_data_type.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
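+# Flattens one row of all 16 supported scalar types into a CSV string,
+# rendering SQL NULLs as the literal 'NULL'. Illustrative behaviour of
+# the helper (assumed, not asserted by the suite):
+#
+#     safe_str(None)  -> 'NULL'
+#     safe_str(3.14)  -> '3.14'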
+ + +def row_to_csv_all_impl( + bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col, + float_col, double_col, decimal32_col, decimal64_col, decimal128_col, + date_col, datetime_col, char_col, varchar_col, string_col +): + cols = [ + bool_col, tinyint_col, smallint_col, int_col, bigint_col, largeint_col, + float_col, double_col, decimal32_col, decimal64_col, decimal128_col, + date_col, datetime_col, char_col, varchar_col, string_col + ] + + def safe_str(x): + return 'NULL' if x is None else str(x) + + return ','.join(safe_str(col) for col in cols) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_map_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_map_type.py new file mode 100644 index 00000000000000..bd6f099760163b --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_map_type.py @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def map_to_csv_impl(map1, map2): + def safe_str(x): + return 'NULL' if x is None else str(x) + + def format_map(m): + if m is None: + return 'NULL' + # Doris passes MAP as Python dict + items = [f"{safe_str(k)}:{safe_str(v)}" for k, v in m.items()] + return '{' + ','.join(sorted(items)) + '}' + + return '|'.join([format_map(map1), format_map(map2)]) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_module_test.zip b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_module_test.zip new file mode 100644 index 00000000000000..6dc6d9540592ee Binary files /dev/null and b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_module_test.zip differ diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.py new file mode 100644 index 00000000000000..95de4dc6120367 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.py @@ -0,0 +1,413 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Scalar Python UDF operations - row-by-row processing +""" + +import math +import re +from datetime import datetime, timedelta +from decimal import Decimal + + +# ==================== Numeric Operations ==================== + +def add_three_numbers(a, b, c): + """Add three numbers""" + if a is None or b is None or c is None: + return None + return a + b + c + + +def multiply_with_default(a, b, default=1): + """Multiply two numbers, return default if any is None""" + if a is None or b is None: + return default + return a * b + + +def safe_divide_with_precision(numerator, denominator, precision=2): + """Safe division with specified decimal precision""" + if numerator is None or denominator is None or denominator == 0: + return None + result = numerator / denominator + return round(result, precision) + + +def calculate_discount_price(original_price, discount_percent): + """Calculate price after discount""" + if original_price is None or discount_percent is None: + return None + if discount_percent < 0 or discount_percent > 100: + return original_price + return original_price * (1 - discount_percent / 100) + + +def compound_interest(principal, rate, years): + """Calculate compound interest: P * (1 + r)^t""" + if principal is None or rate is None or years is None: + return None + if principal <= 0 or rate < 0 or years < 0: + return None + return principal * math.pow(1 + rate / 100, years) + + +def calculate_bmi(weight_kg, height_m): + """Calculate Body Mass Index""" + if weight_kg is None or height_m is None or height_m <= 0: + return None + return round(weight_kg / (height_m * height_m), 2) + + +def fibonacci(n): + """Calculate nth Fibonacci number""" + if n is None or n < 0: + return None + if n <= 1: + return n + a, b = 0, 1 + for _ in range(2, n + 1): + a, b = b, a + b + return b + + +def is_prime(n): + """Check if a number is prime""" + if n is None or n < 2: + return False + if n == 2: + return True + if n % 2 == 0: + return False + for i in range(3, int(math.sqrt(n)) + 1, 2): + if n % i == 0: + return False + return True + + +def gcd(a, b): + """Calculate Greatest Common Divisor""" + if a is None or b is None: + return None + a, b = abs(a), abs(b) + while b: + a, b = b, a % b + return a + + +def lcm(a, b): + """Calculate Least Common Multiple""" + if a is None or b is None or a == 0 or b == 0: + return None + return abs(a * b) // gcd(a, b) + + +# ==================== String Operations ==================== + +def reverse_string(s): + """Reverse a string""" + if s is None: + return None + return s[::-1] + + +def count_vowels(s): + """Count number of vowels in a string""" + if s is None: + return None + vowels = 'aeiouAEIOU' + return sum(1 for char in s if char in vowels) + + +def count_words(s): + """Count number of words in a string""" + if s is None: + return None + return len(s.split()) + + +def string_length_custom(s): + """Calculate string length (custom implementation for testing)""" + if s is None: + return None + return len(s) + + +def capitalize_words(s): + """Capitalize first letter of each word""" + if s is None: + return None + return ' '.join(word.capitalize() for word in s.split()) + + +def remove_whitespace(s): + """Remove all whitespace from string""" + if s is None: + return None + return ''.join(s.split()) + + +def extract_numbers(s): + """Extract all numbers from string and concatenate""" + if s is None: + return None + numbers = re.findall(r'\d+', s) + return 
','.join(numbers) if numbers else '' + + +def is_palindrome(s): + """Check if string is a palindrome (case-insensitive)""" + if s is None: + return None + cleaned = ''.join(c.lower() for c in s if c.isalnum()) + return cleaned == cleaned[::-1] + + +def string_similarity(s1, s2): + """Calculate simple string similarity (0-100)""" + if s1 is None or s2 is None: + return None + if s1 == s2: + return 100.0 + # Simple character overlap ratio + set1, set2 = set(s1.lower()), set(s2.lower()) + if not set1 or not set2: + return 0.0 + intersection = len(set1 & set2) + union = len(set1 | set2) + return round(intersection / union * 100, 2) + + +def mask_email(email): + """Mask email address: user@domain.com -> u***@domain.com""" + if email is None or '@' not in email: + return None + parts = email.split('@') + if len(parts[0]) <= 1: + return email + masked_user = parts[0][0] + '***' + return f"{masked_user}@{parts[1]}" + + +def extract_domain(email): + """Extract domain from email address""" + if email is None or '@' not in email: + return None + return email.split('@')[1] + + +def truncate_string(s, max_length, suffix='...'): + """Truncate string to max length with suffix""" + if s is None: + return None + if len(s) <= max_length: + return s + return s[:max_length - len(suffix)] + suffix + + +# ==================== Date/Time Operations ==================== + +def days_between_dates(date1_str, date2_str): + """Calculate days between two dates (YYYY-MM-DD format)""" + if date1_str is None or date2_str is None: + return None + try: + d1 = datetime.strptime(str(date1_str), '%Y-%m-%d') + d2 = datetime.strptime(str(date2_str), '%Y-%m-%d') + return abs((d2 - d1).days) + except: + return None + + +def is_weekend(date_str): + """Check if date is weekend (Saturday or Sunday)""" + if date_str is None: + return None + try: + date = datetime.strptime(str(date_str), '%Y-%m-%d') + return date.weekday() >= 5 # 5=Saturday, 6=Sunday + except: + return None + + +def get_quarter(date_str): + """Get quarter (1-4) from date""" + if date_str is None: + return None + try: + date = datetime.strptime(str(date_str), '%Y-%m-%d') + return (date.month - 1) // 3 + 1 + except: + return None + + +def age_in_years(birth_date_str, current_date_str): + """Calculate age in years""" + if birth_date_str is None or current_date_str is None: + return None + try: + birth = datetime.strptime(str(birth_date_str), '%Y-%m-%d') + current = datetime.strptime(str(current_date_str), '%Y-%m-%d') + age = current.year - birth.year + if (current.month, current.day) < (birth.month, birth.day): + age -= 1 + return age + except: + return None + + +# ==================== Boolean/Conditional Operations ==================== + +def is_in_range(value, min_val, max_val): + """Check if value is in range [min_val, max_val]""" + if value is None or min_val is None or max_val is None: + return None + return min_val <= value <= max_val + + +def xor_operation(a, b): + """XOR operation on two booleans""" + if a is None or b is None: + return None + return (a or b) and not (a and b) + + +def all_true(*args): + """Check if all arguments are True""" + if any(arg is None for arg in args): + return None + return all(args) + + +def any_true(*args): + """Check if any argument is True""" + if any(arg is None for arg in args): + return None + return any(args) + + +def count_true(*args): + """Count number of True values""" + if any(arg is None for arg in args): + return None + return sum(1 for arg in args if arg) + + +# ==================== Complex/Mixed Operations 
==================== + +def calculate_grade(score): + """Convert numeric score to letter grade""" + if score is None: + return None + if score >= 90: + return 'A' + elif score >= 80: + return 'B' + elif score >= 70: + return 'C' + elif score >= 60: + return 'D' + else: + return 'F' + + +def categorize_age(age): + """Categorize age into groups""" + if age is None: + return None + if age < 0: + return 'Invalid' + elif age < 13: + return 'Child' + elif age < 20: + return 'Teenager' + elif age < 60: + return 'Adult' + else: + return 'Senior' + + +def calculate_tax(income, tax_rate): + """Calculate tax with progressive rates""" + if income is None or tax_rate is None: + return None + if income <= 0: + return 0.0 + return round(income * tax_rate / 100, 2) + + +def format_phone_number(phone): + """Format phone number: 1234567890 -> (123) 456-7890""" + if phone is None: + return None + digits = ''.join(c for c in str(phone) if c.isdigit()) + if len(digits) != 10: + return phone + return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}" + + +def validate_credit_card_luhn(card_number): + """Validate credit card using Luhn algorithm""" + if card_number is None: + return False + digits = [int(d) for d in str(card_number) if d.isdigit()] + if not digits: + return False + + checksum = 0 + for i, digit in enumerate(reversed(digits)): + if i % 2 == 1: + digit *= 2 + if digit > 9: + digit -= 9 + checksum += digit + return checksum % 10 == 0 + + +def json_extract_value(json_str, key): + """Extract value from simple JSON string""" + if json_str is None or key is None: + return None + try: + import json + data = json.loads(json_str) + return str(data.get(key, '')) + except: + return None + + +def levenshtein_distance(s1, s2): + """Calculate Levenshtein distance between two strings""" + if s1 is None or s2 is None: + return None + if len(s1) < len(s2): + return levenshtein_distance(s2, s1) + if len(s2) == 0: + return len(s1) + + previous_row = range(len(s2) + 1) + for i, c1 in enumerate(s1): + current_row = [i + 1] + for j, c2 in enumerate(s2): + insertions = previous_row[j + 1] + 1 + deletions = current_row[j] + 1 + substitutions = previous_row[j] + (c1 != c2) + current_row.append(min(insertions, deletions, substitutions)) + previous_row = current_row + + return previous_row[-1] diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.zip b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.zip new file mode 100644 index 00000000000000..15192eff5d2336 Binary files /dev/null and b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_scalar_ops.zip differ diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_struct_type.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_struct_type.py new file mode 100644 index 00000000000000..b785691c790e02 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_struct_type.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def struct_to_csv_impl(person, point): + def safe_str(x): + return 'NULL' if x is None else str(x) + + def format_array(arr): + if arr is None: + return 'NULL' + return '[' + ','.join(safe_str(item) for item in arr) + ']' + + def format_struct_dict(s, field_names): + if s is None: + return 'NULL' + parts = [] + for field in field_names: + val = s.get(field) + parts.append(safe_str(val)) + return '(' + ','.join(parts) + ')' + + person_str = format_struct_dict(person, ['name', 'age', 'salary']) + + if point is None: + point_str = 'NULL' + else: + x_val = safe_str(point.get('x')) + y_val = safe_str(point.get('y')) + tags_val = format_array(point.get('tags')) + point_str = f"({x_val},{y_val},{tags_val})" + + return '|'.join([person_str, point_str]) \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.py b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.py new file mode 100644 index 00000000000000..31dd4117eb33f6 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
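+# Unlike the scalar scripts above, every function in this module maps
+# pandas.Series -> pandas.Series, so one call processes a whole column
+# batch instead of a single row. A quick local check -- illustrative
+# only, assuming pandas is installed:
+#
+#     import pandas as pd
+#     from python_udf_vector_ops import add_constant
+#     assert add_constant(pd.Series([1, 2]), pd.Series([7, 7])).tolist() == [8, 9]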
+ +""" +Vector Python UDF operations using pandas.Series +""" + +import pandas as pd +import numpy as np + + +def add_constant(a: pd.Series, constant: pd.Series) -> pd.Series: + """Add a constant to series""" + # constant is a series but we use the first value + const_val = constant.iloc[0] if len(constant) > 0 else 0 + return a + const_val + + +def multiply_by_constant(a: pd.Series, constant: pd.Series) -> pd.Series: + """Multiply series by a constant""" + const_val = constant.iloc[0] if len(constant) > 0 else 1 + return a * const_val + + +def calculate_discount(price: pd.Series, discount_percent: pd.Series) -> pd.Series: + """Calculate price after discount""" + return price * (1 - discount_percent) + + +def string_length(s: pd.Series) -> pd.Series: + """Calculate length of each string in series""" + return s.str.len() + + +def to_uppercase(s: pd.Series) -> pd.Series: + """Convert strings to uppercase""" + return s.str.upper() + + +def vec_add_with_constant(a: pd.Series, b: pd.Series) -> pd.Series: + """Add two series and add a constant""" + return a + b + 100 + + +def vec_multiply_and_round(a: pd.Series, b: pd.Series) -> pd.Series: + """Multiply two series and round to 2 decimal places""" + return (a * b).round(2) + + +def vec_string_concat_with_separator(s1: pd.Series, s2: pd.Series) -> pd.Series: + """Concatenate two string series with a separator""" + return s1 + ' | ' + s2 + + +def vec_string_title_case(s: pd.Series) -> pd.Series: + """Convert string series to title case""" + return s.str.title() + + +def vec_conditional_value(a: pd.Series, b: pd.Series) -> pd.Series: + """Return a if a > b, else return b""" + return pd.Series(np.where(a > b, a, b)) + + +def vec_percentage_calculation(part: pd.Series, total: pd.Series) -> pd.Series: + """Calculate percentage: (part / total) * 100""" + return (part / total * 100).round(2) + + +def vec_is_in_range(value: pd.Series, min_val: pd.Series, max_val: pd.Series) -> pd.Series: + """Check if value is between min_val and max_val""" + return (value >= min_val) & (value <= max_val) + + +def vec_safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series: + """Safe division, return 0 when denominator is 0 or None""" + result = numerator / denominator + # Replace inf and -inf with 0 + result = result.replace([np.inf, -np.inf], 0) + # Fill NaN with 0 + return result.fillna(0) + + +def vec_exponential_decay(value: pd.Series, days: pd.Series) -> pd.Series: + """Calculate exponential decay: value * exp(-days/30)""" + return value * np.exp(-days / 30.0) + + +def vec_string_extract_first_word(s: pd.Series) -> pd.Series: + """Extract the first word from a string""" + return s.str.split().str[0] + + +def vec_normalize_to_range(value: pd.Series) -> pd.Series: + """Normalize values to 0-1 range using min-max normalization""" + min_val = value.min() + max_val = value.max() + if max_val == min_val: + return pd.Series([0.5] * len(value)) + return (value - min_val) / (max_val - min_val) + + +def vec_moving_average(value: pd.Series) -> pd.Series: + """Calculate 3-point moving average""" + return value.rolling(window=3, min_periods=1).mean() + + +def vec_z_score(value: pd.Series) -> pd.Series: + """Calculate z-score: (value - mean) / std""" + mean = value.mean() + std = value.std() + if std == 0 or pd.isna(std): + return pd.Series([0.0] * len(value)) + return (value - mean) / std + + +def vec_clip_values(value: pd.Series, min_val: pd.Series, max_val: pd.Series) -> pd.Series: + """Clip values to be within min_val and max_val""" + return 
value.clip(lower=min_val, upper=max_val) + + +def vec_boolean_and(a: pd.Series, b: pd.Series) -> pd.Series: + """Logical AND operation on two boolean series""" + return a & b + + +def vec_boolean_or(a: pd.Series, b: pd.Series) -> pd.Series: + """Logical OR operation on two boolean series""" + return a | b + + +def vec_string_contains(s: pd.Series, pattern: pd.Series) -> pd.Series: + """Check if string contains pattern (case-insensitive)""" + # For simplicity, use the first pattern value for all rows + if len(pattern) > 0 and not pd.isna(pattern.iloc[0]): + pattern_str = str(pattern.iloc[0]) + return s.str.contains(pattern_str, case=False, na=False) + return pd.Series([False] * len(s)) + + +def vec_abs_difference(a: pd.Series, b: pd.Series) -> pd.Series: + """Calculate absolute difference between two series""" + return (a - b).abs() + + +def vec_power(base: pd.Series, exponent: pd.Series) -> pd.Series: + """Calculate base raised to the power of exponent""" + return base ** exponent + + +def vec_log_transform(value: pd.Series) -> pd.Series: + """Calculate natural logarithm, return 0 for non-positive values""" + result = np.log(value) + return result.replace([np.inf, -np.inf], 0).fillna(0) diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.zip b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.zip new file mode 100644 index 00000000000000..3efd38158ca8ba Binary files /dev/null and b/regression-test/suites/pythonudf_p0/udf_scripts/python_udf_vector_ops.zip differ diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip new file mode 100644 index 00000000000000..b4ed70a402bc02 Binary files /dev/null and b/regression-test/suites/pythonudf_p0/udf_scripts/pyudf.zip differ diff --git a/regression-test/suites/pythonudf_p0/udf_scripts/string_test.py b/regression-test/suites/pythonudf_p0/udf_scripts/string_test.py new file mode 100644 index 00000000000000..3505617e8e97e1 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udf_scripts/string_test.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(arg1, a, b): + return arg1[:a] + "*" * (len(arg1) - a - b) + arg1[-b:] \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/array_int_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/array_int_test.py new file mode 100644 index 00000000000000..78c1fcee957e4d --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/array_int_test.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(count): + for i in range(3): + yield [1, 2, 3] \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/array_string_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/array_string_test.py new file mode 100644 index 00000000000000..7fb1f02294f5e1 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/array_string_test.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(count): + for i in range(3): + yield ['Hi', 'DataMind', 'Good'] \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/double_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/double_test.py new file mode 100644 index 00000000000000..275c49355e043e --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/double_test.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
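+# UDTF entry points are generators: each `yield` emits one output row,
+# so a single input row can expand into many. The evaluate below yields
+# exactly one row per input, e.g. evaluate(2.0) yields 20.0.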
+ + +def evaluate(val): + yield val * 10 \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/float_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/float_test.py new file mode 100644 index 00000000000000..de321bae8d6dee --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/float_test.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(val): + yield val - 10 \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/int_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/int_test.py new file mode 100644 index 00000000000000..15ccedbf7da0a3 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/int_test.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(count): + for i in range(3): + yield count \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/map_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/map_test.py new file mode 100644 index 00000000000000..290da850b42b29 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/map_test.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
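+# Yields the input map three times, expanding each input row into three
+# output rows; the UDTF suites consume this via LATERAL VIEW.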
+ + +def evaluate(val): + for i in range(3): + yield val \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/string_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/string_test.py new file mode 100644 index 00000000000000..78939b8083b21a --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/string_test.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(value, separator): + for part in value.split(separator): + yield part \ No newline at end of file diff --git a/regression-test/suites/pythonudf_p0/udtf_scripts/struct_test.py b/regression-test/suites/pythonudf_p0/udtf_scripts/struct_test.py new file mode 100644 index 00000000000000..1a93ba3caf3eb3 --- /dev/null +++ b/regression-test/suites/pythonudf_p0/udtf_scripts/struct_test.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +def evaluate(val): + for i in range(3): + yield 1, 0.112, "Hello, DataMind" \ No newline at end of file diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_basic_inline.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_basic_inline.groovy new file mode 100644 index 00000000000000..dd2266c1195ceb --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_basic_inline.groovy @@ -0,0 +1,1247 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_basic_inline") { + // Basic Python UDTF tests following Snowflake syntax + // UDTF (User-Defined Table Function) returns table (multiple rows) from scalar/table input + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Test 1: Simple String Split UDTF + // Input: Single string + // Output: Multiple rows (one per split part) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_split_string(STRING); """ + sql """ + CREATE TABLES FUNCTION py_split_string(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "split_string_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def split_string_udtf(input_str): + '''Split comma-separated string into rows''' + if input_str: + parts = input_str.split(',') + for part in parts: + yield (part.strip(),) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS temp_input; """ + sql """ + CREATE TABLE temp_input ( + id INT, + input_str STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_input VALUES (1, 'apple,banana,cherry'); + """ + + qt_split_string """ + SELECT part + FROM temp_input + LATERAL VIEW py_split_string(input_str) tmp AS part; + """ + + // ======================================== + // Test 2: Generate Series UDTF + // Input: start, end integers + // Output: Sequence of integers + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_generate_series(INT, INT); """ + sql """ + CREATE TABLES FUNCTION py_generate_series(INT, INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "generate_series_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def generate_series_udtf(start, end): + '''Generate integer series from start to end''' + if start is not None and end is not None: + for i in range(start, end + 1): + yield (i,) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS temp_series; """ + sql """ + CREATE TABLE temp_series ( + id INT, + start_val INT, + end_val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_series VALUES (1, 1, 5), (2, 10, 12); + """ + + qt_generate_series """ + SELECT tmp.value + FROM temp_series + LATERAL VIEW py_generate_series(start_val, end_val) tmp AS value; + """ + + qt_generate_series_multiple """ + SELECT tmp.value + FROM temp_series + LATERAL VIEW py_generate_series(start_val, end_val) tmp AS value + ORDER BY tmp.value; + """ + + // ======================================== + // Test 3: Running Sum UDTF (without state management) + // Note: Function-based UDTFs cannot maintain state across calls + // Each row is processed independently + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_running_sum(INT); """ + sql """ + CREATE TABLES FUNCTION py_running_sum(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "running_sum_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def running_sum_udtf(value): + '''Return value with itself as cumulative sum (stateless)''' + # Note: Function-based UDTF cannot maintain state + # This is simplified to return (value, value) + if value is not None: + yield (value, value) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS numbers_table; """ + sql """ + CREATE TABLE 
numbers_table ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO numbers_table VALUES + (1, 10), + (2, 20), + (3, 30), + (4, 40); + """ + + qt_running_sum """ + SELECT original_value, cumulative_sum + FROM numbers_table + LATERAL VIEW py_running_sum(value) tmp AS original_value, cumulative_sum + ORDER BY original_value; + """ + + // ======================================== + // Test 4: Explode Array UDTF + // Similar to LATERAL VIEW explode in Hive + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_explode_json_array(STRING); """ + sql """ + CREATE TABLES FUNCTION py_explode_json_array(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "explode_json_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import json + +def explode_json_udtf(json_str): + '''Explode JSON ARRAY into rows''' + if json_str: + try: + data = json.loads(json_str) + if isinstance(data, list): + for item in data: + yield (str(item),) + except: + pass # Skip invalid JSON +\$\$; + """ + + sql """ DROP TABLE IF EXISTS temp_json; """ + sql """ + CREATE TABLE temp_json ( + id INT, + json_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_json VALUES (1, '["apple", "banana", "cherry"]'); + """ + + qt_explode_json """ + SELECT element + FROM temp_json + LATERAL VIEW py_explode_json_array(json_data) tmp AS element; + """ + + // ======================================== + // Test 5: Top-N UDTF (stateless version) + // Note: Without state, this simply returns first n values per row + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_top_n(INT, INT); """ + sql """ + CREATE TABLES FUNCTION py_top_n(INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "top_n_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def top_n_udtf(value, n): + '''Return single value with rank 1 (stateless)''' + # Without state, each row is independent + if value is not None and n is not None and n > 0: + yield (value, 1) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS ranked_data; """ + sql """ + CREATE TABLE ranked_data ( + id INT, + category STRING, + value INT, + top_n INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO ranked_data VALUES + (1, 'A', 100, 2), + (2, 'A', 90, 2), + (3, 'A', 80, 2), + (4, 'A', 70, 2), + (5, 'B', 200, 2), + (6, 'B', 190, 2); + """ + + qt_top_n """ + SELECT category, tmp.value, tmp.rank + FROM ranked_data + LATERAL VIEW py_top_n(value, top_n) tmp AS value, rank + ORDER BY category, tmp.rank; + """ + + // ======================================== + // Test 6: Multiple Outputs per Input + // One input row can generate multiple output rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_duplicate_n_times(STRING, INT); """ + sql """ + CREATE TABLES FUNCTION py_duplicate_n_times(STRING, INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "duplicate_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def duplicate_udtf(text, n): + '''Duplicate input text N times''' + if text and n: + for i in range(n): + yield (text, i + 1) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS temp_dup; """ + sql """ + CREATE TABLE temp_dup ( + id INT, + text STRING, + 
times INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_dup VALUES (1, 'Hello', 3); + """ + + qt_duplicate """ + SELECT output, idx + FROM temp_dup + LATERAL VIEW py_duplicate_n_times(text, times) tmp AS output, idx; + """ + + // ======================================== + // Test 7: Conditional Output (Skip Rows) + // UDTF can skip rows by not yielding + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_filter_positive(INT); """ + sql """ + CREATE TABLES FUNCTION py_filter_positive(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "filter_positive_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def filter_positive_udtf(value): + '''Only output positive values''' + if value is not None and value > 0: + yield (value,) + # If value <= 0, don't yield (skip this row) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS mixed_numbers; """ + sql """ + CREATE TABLE mixed_numbers ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO mixed_numbers VALUES (1, -5), (2, 0), (3, 3), (4, -2), (5, 7), (6, 1); + """ + + qt_filter_positive """ + SELECT positive_value + FROM mixed_numbers + LATERAL VIEW py_filter_positive(value) tmp AS positive_value + ORDER BY positive_value; + """ + + // ======================================== + // Test 8: Cartesian Product UDTF + // Generate all combinations + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_cartesian(STRING, STRING); """ + sql """ + CREATE TABLES FUNCTION py_cartesian(STRING, STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "cartesian_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def cartesian_udtf(list1, list2): + '''Generate cartesian product of two comma-separated lists''' + if list1 and list2: + items1 = [x.strip() for x in list1.split(',')] + items2 = [y.strip() for y in list2.split(',')] + + for x in items1: + for y in items2: + yield (x, y) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS temp_cart; """ + sql """ + CREATE TABLE temp_cart ( + id INT, + list1 STRING, + list2 STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_cart VALUES (1, 'A,B', 'X,Y,Z'); + """ + + qt_cartesian """ + SELECT item1, item2 + FROM temp_cart + LATERAL VIEW py_cartesian(list1, list2) tmp AS item1, item2 + ORDER BY item1, item2; + """ + + // ======================================== + // Test 9: All Rows Filtered (Empty Output) + // Tests data_batch = None case + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_filter_negative(INT); """ + sql """ + CREATE TABLES FUNCTION py_filter_negative(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "filter_negative_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def filter_negative_udtf(value): + '''Only output negative values (filter all positive numbers)''' + if value is not None and value < 0: + yield (value,) + # For positive numbers, don't yield anything +\$\$; + """ + + sql """ DROP TABLE IF EXISTS temp_all_positive; """ + sql """ + CREATE TABLE temp_all_positive ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // Insert only positive numbers 
- all should be filtered + sql """ + INSERT INTO temp_all_positive VALUES (1, 10), (2, 20), (3, 30); + """ + + // Expected: No output rows (all filtered), but should not crash + qt_all_filtered """ + SELECT id, neg_value + FROM temp_all_positive + LATERAL VIEW py_filter_negative(value) tmp AS neg_value + ORDER BY id; + """ + + // ======================================== + // Test 10: Mixed - Some Filtered, Some Not + // ======================================== + sql """ DROP TABLE IF EXISTS temp_mixed; """ + sql """ + CREATE TABLE temp_mixed ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // Mix of positive and negative - only negative should pass + sql """ + INSERT INTO temp_mixed VALUES (1, 10), (2, -5), (3, 20), (4, -3); + """ + + qt_mixed_filter """ + SELECT id, neg_value + FROM temp_mixed + LATERAL VIEW py_filter_negative(value) tmp AS neg_value + ORDER BY id, neg_value; + """ + + // ======================================== + // Test 11: Empty Input Table + // Tests empty batch case + // ======================================== + sql """ DROP TABLE IF EXISTS temp_empty; """ + sql """ + CREATE TABLE temp_empty ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // No data inserted - empty table + qt_empty_input """ + SELECT id, neg_value + FROM temp_empty + LATERAL VIEW py_filter_negative(value) tmp AS neg_value; + """ + + // ======================================== + // Test 12: always_nullable = true (default) + // Function can return NULL even with NOT NULL input + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_nullable_processor(INT); """ + sql """ + CREATE TABLES FUNCTION py_nullable_processor(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "nullable_processor_udtf", + "runtime_version" = "3.8.10", + "always_nullable" = "true" + ) + AS \$\$ +def nullable_processor_udtf(value): + '''Return NULL for even numbers, value for odd numbers''' + if value is None: + yield (None,) + elif value % 2 == 0: + yield (None,) # Return NULL for even numbers + else: + yield (value,) # Return original value for odd numbers +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_nullable; """ + sql """ + CREATE TABLE test_nullable ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nullable VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5); + """ + + // Should return NULL for even values, original value for odd + qt_nullable_true """ + SELECT id, result + FROM test_nullable + LATERAL VIEW py_nullable_processor(value) tmp AS result + ORDER BY id; + """ + + // ======================================== + // Test 13: always_nullable = false + // Function guarantees NOT NULL output with NOT NULL input + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_non_nullable_processor(INT); """ + sql """ + CREATE TABLES FUNCTION py_non_nullable_processor(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "non_nullable_processor_udtf", + "runtime_version" = "3.8.10", + "always_nullable" = "false" + ) + AS \$\$ +def non_nullable_processor_udtf(value): + '''Always return non-NULL value, double the input''' + if value is None: + yield (0,) # Return 0 for NULL input + else: + yield (value * 2,) # Always return 
non-NULL +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_non_nullable; """ + sql """ + CREATE TABLE test_non_nullable ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_non_nullable VALUES (1, 10), (2, 20), (3, 30); + """ + + // Should return doubled values, all NOT NULL + qt_non_nullable_false """ + SELECT id, result + FROM test_non_nullable + LATERAL VIEW py_non_nullable_processor(value) tmp AS result + ORDER BY id; + """ + + // ======================================== + // Test 14: always_nullable with NULL inputs + // Test how both modes handle NULL inputs + // ======================================== + sql """ DROP TABLE IF EXISTS test_null_inputs; """ + sql """ + CREATE TABLE test_null_inputs ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_inputs VALUES (1, NULL), (2, 10), (3, NULL), (4, 20); + """ + + // Test with always_nullable = true (can return NULL) + qt_nullable_with_nulls """ + SELECT id, result + FROM test_null_inputs + LATERAL VIEW py_nullable_processor(value) tmp AS result + ORDER BY id; + """ + + // Test with always_nullable = false (converts NULL to 0) + qt_non_nullable_with_nulls """ + SELECT id, result + FROM test_null_inputs + LATERAL VIEW py_non_nullable_processor(value) tmp AS result + ORDER BY id; + """ + + // ======================================== + // Test 15: always_nullable default behavior + // If not specified, should default to true + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_default_nullable(STRING); """ + sql """ + CREATE TABLES FUNCTION py_default_nullable(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "default_nullable_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def default_nullable_udtf(text): + '''Return NULL for empty strings, uppercase for non-empty''' + if not text or text.strip() == '': + yield (None,) + else: + yield (text.upper(),) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_default_nullable; """ + sql """ + CREATE TABLE test_default_nullable ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_default_nullable VALUES (1, 'hello'), (2, ''), (3, 'world'), (4, ' '); + """ + + // Should return NULL for empty/blank strings (default nullable behavior) + qt_default_nullable """ + SELECT id, result + FROM test_default_nullable + LATERAL VIEW py_default_nullable(text) tmp AS result + ORDER BY id; + """ + + // ======================================== + // Test 16: always_nullable with multiple outputs + // Test nullable behavior with functions returning multiple rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_nullable_explode(STRING); """ + sql """ + CREATE TABLES FUNCTION py_nullable_explode(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "nullable_explode_udtf", + "runtime_version" = "3.8.10", + "always_nullable" = "true" + ) + AS \$\$ +def nullable_explode_udtf(csv_string): + '''Split CSV and return NULL for empty parts''' + if not csv_string: + return + parts = csv_string.split(',') + for part in parts: + stripped = part.strip() + if stripped: + yield (stripped,) + else: + yield (None,) # Return NULL for empty parts +\$\$; + """ + 
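+        // For orientation, what the generator above produces for the rows
+        // inserted below (a pure-Python sketch of a hypothetical local run;
+        // the suite itself only checks the qt_multi_nullable output):
+        //
+        //   list(nullable_explode_udtf('a,b,c'))  ->  [('a',), ('b',), ('c',)]
+        //   list(nullable_explode_udtf('x,,z'))   ->  [('x',), (None,), ('z',)]
+        //   list(nullable_explode_udtf(',,'))     ->  [(None,), (None,), (None,)]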
+ sql """ DROP TABLE IF EXISTS test_multi_nullable; """ + sql """ + CREATE TABLE test_multi_nullable ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_multi_nullable VALUES + (1, 'a,b,c'), + (2, 'x,,z'), + (3, ',,'); + """ + + // Should return NULL for empty parts in CSV + qt_multi_nullable """ + SELECT id, part + FROM test_multi_nullable + LATERAL VIEW py_nullable_explode(data) tmp AS part + ORDER BY id, part; + """ + + // ======================================== + // Test: Scalar Value Support (New Feature) + // Single-field UDTF can yield scalar values directly + // ======================================== + + // Test Case 1: yield scalar (int) + sql """ DROP FUNCTION IF EXISTS py_scalar_int(INT, INT); """ + sql """ + CREATE TABLES FUNCTION py_scalar_int(INT, INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "scalar_int_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def scalar_int_udtf(start, end): + '''Yield scalar integers directly (no tuple wrapping)''' + if start is not None and end is not None: + for i in range(start, end + 1): + yield i # Direct scalar, not (i,) +\$\$; + """ + + qt_scalar_int """ + SELECT tmp.value + FROM (SELECT 1 as start_val, 5 as end_val) t + LATERAL VIEW py_scalar_int(start_val, end_val) tmp AS value + ORDER BY tmp.value; + """ + + // Test Case 2: yield scalar (string) + sql """ DROP FUNCTION IF EXISTS py_scalar_string(STRING); """ + sql """ + CREATE TABLES FUNCTION py_scalar_string(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "scalar_string_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def scalar_string_udtf(text): + '''Split string and yield parts directly as scalars''' + if text: + for part in text.split(','): + yield part.strip() # Direct string, not (part.strip(),) +\$\$; + """ + + qt_scalar_string """ + SELECT tmp.value + FROM (SELECT 'apple,banana,cherry' as text) t + LATERAL VIEW py_scalar_string(text) tmp AS value + ORDER BY tmp.value; + """ + + // Test Case 3: Mixed - both scalar and tuple should work + sql """ DROP FUNCTION IF EXISTS py_mixed_style(INT); """ + sql """ + CREATE TABLES FUNCTION py_mixed_style(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "mixed_style_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def mixed_style_udtf(n): + '''Test mixing scalar and tuple yields (should both work)''' + if n is not None and n > 0: + # First yield as scalar + yield n + # Then yield as tuple + yield (n * 2,) + # Then scalar again + yield n * 3 +\$\$; + """ + + qt_mixed_style """ + SELECT tmp.value + FROM (SELECT 10 as n) t + LATERAL VIEW py_mixed_style(n) tmp AS value + ORDER BY tmp.value; + """ + + // Test Case 4: return scalar (not yield) + sql """ DROP FUNCTION IF EXISTS py_return_scalar(STRING); """ + sql """ + CREATE TABLES FUNCTION py_return_scalar(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "return_scalar_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def return_scalar_udtf(text): + '''Return scalar value instead of yielding''' + if text: + return text.upper() # Direct return, not (text.upper(),) +\$\$; + """ + + qt_return_scalar """ + SELECT tmp.value + FROM (SELECT 'hello' as text) t + LATERAL VIEW py_return_scalar(text) tmp AS value; + """ + + // Test Case 5: Verify multi-field still requires tuples + sql """ DROP FUNCTION IF EXISTS py_multi_field_check(INT); """ + sql """ + CREATE 
TABLES FUNCTION py_multi_field_check(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "multi_field_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def multi_field_udtf(n): + '''Multi-field UDTF still requires tuples''' + if n is not None: + yield (n, n * 2) # Must be tuple for multi-field +\$\$; + """ + + qt_multi_field_check """ + SELECT tmp.original, tmp.doubled + FROM (SELECT 42 as n) t + LATERAL VIEW py_multi_field_check(n) tmp AS original, doubled; + """ + + // ======================================== + // Test: OUTER Semantics + // When function is registered as func, both func and func_outer are available + // func: skips NULL/empty results (no output row) + // func_outer: outputs NULL for NULL/empty results (guaranteed output row) + // ======================================== + + // Test Case 1: Simple UDTF with NULL handling + sql """ DROP FUNCTION IF EXISTS py_process_value(INT); """ + sql """ + CREATE TABLES FUNCTION py_process_value(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_value_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_value_udtf(value): + '''Process value: if positive, yield doubled value; otherwise yield nothing''' + if value is not None and value > 0: + yield (value * 2,) + # If value is None or <= 0, don't yield anything +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_outer_basic; """ + sql """ + CREATE TABLE test_outer_basic ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_outer_basic VALUES + (1, 10), -- positive: should output 20 + (2, NULL), -- NULL: func skips, func_outer outputs NULL + (3, 0), -- zero: func skips, func_outer outputs NULL + (4, -5), -- negative: func skips, func_outer outputs NULL + (5, 15); -- positive: should output 30 + """ + + // Test without _outer: NULL/non-positive values are skipped (no output row) + qt_outer_without """ + SELECT id, result + FROM test_outer_basic + LATERAL VIEW py_process_value(value) tmp AS result + ORDER BY id; + """ + + // Test with _outer: NULL/non-positive values output NULL (guaranteed row per input) + qt_outer_with """ + SELECT id, result + FROM test_outer_basic + LATERAL VIEW py_process_value_outer(value) tmp AS result + ORDER BY id; + """ + + // Test Case 2: String split with empty/NULL strings + sql """ DROP FUNCTION IF EXISTS py_split_words(STRING); """ + sql """ + CREATE TABLES FUNCTION py_split_words(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "split_words_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def split_words_udtf(text): + '''Split text by spaces. 
Empty/NULL strings yield nothing.''' + if text and text.strip(): + words = text.strip().split() + for word in words: + yield (word,) + # Empty or NULL strings: no output +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_outer_strings; """ + sql """ + CREATE TABLE test_outer_strings ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_outer_strings VALUES + (1, 'hello world'), -- should split into 2 rows + (2, NULL), -- NULL + (3, ''), -- empty string + (4, ' '), -- whitespace only + (5, 'single'); -- single word + """ + + // Without _outer: only rows 1 and 5 produce output + qt_outer_string_without """ + SELECT id, word + FROM test_outer_strings + LATERAL VIEW py_split_words(text) tmp AS word + ORDER BY id, word; + """ + + // With _outer: all rows produce at least one output (NULL for rows 2,3,4) + qt_outer_string_with """ + SELECT id, word + FROM test_outer_strings + LATERAL VIEW py_split_words_outer(text) tmp AS word + ORDER BY id, word; + """ + + // Test Case 3: Array expansion with empty arrays + sql """ DROP FUNCTION IF EXISTS py_expand_range(INT); """ + sql """ + CREATE TABLES FUNCTION py_expand_range(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "expand_range_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def expand_range_udtf(n): + '''Generate numbers from 1 to n. If n <= 0 or NULL, yield nothing.''' + if n is not None and n > 0: + for i in range(1, n + 1): + yield (i,) + # If n is None or <= 0, no output +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_outer_range; """ + sql """ + CREATE TABLE test_outer_range ( + id INT, + count INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_outer_range VALUES + (1, 3), -- should generate 1,2,3 + (2, NULL), -- NULL + (3, 0), -- zero (no range) + (4, -2), -- negative (no range) + (5, 1); -- should generate 1 + """ + + // Without _outer: only rows 1 and 5 produce output + qt_outer_range_without """ + SELECT id, num + FROM test_outer_range + LATERAL VIEW py_expand_range(count) tmp AS num + ORDER BY id, num; + """ + + // With _outer: all rows produce output (NULL for rows 2,3,4) + qt_outer_range_with """ + SELECT id, num + FROM test_outer_range + LATERAL VIEW py_expand_range_outer(count) tmp AS num + ORDER BY id, num; + """ + + // Test Case 4: Multiple column output with OUTER + sql """ DROP FUNCTION IF EXISTS py_parse_csv(STRING); """ + sql """ + CREATE TABLES FUNCTION py_parse_csv(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "parse_csv_udtf", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def parse_csv_udtf(csv_line): + '''Parse CSV line into field1,field2. 
Empty/NULL yields nothing.''' + if csv_line and csv_line.strip(): + parts = csv_line.split(',') + if len(parts) >= 2: + yield (parts[0].strip(), parts[1].strip()) + # Empty or invalid CSV: no output +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_outer_multifield; """ + sql """ + CREATE TABLE test_outer_multifield ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_outer_multifield VALUES + (1, 'Alice,30'), -- valid CSV + (2, NULL), -- NULL + (3, ''), -- empty + (4, 'Bob'), -- incomplete CSV (only 1 field) + (5, 'Charlie,25'); -- valid CSV + """ + + // Without _outer: only rows 1 and 5 produce output + qt_outer_multifield_without """ + SELECT id, field1, field2 + FROM test_outer_multifield + LATERAL VIEW py_parse_csv(data) tmp AS field1, field2 + ORDER BY id; + """ + + // With _outer: all rows produce output (NULL,NULL for rows 2,3,4) + qt_outer_multifield_with """ + SELECT id, field1, field2 + FROM test_outer_multifield + LATERAL VIEW py_parse_csv_outer(data) tmp AS field1, field2 + ORDER BY id; + """ + + // Test Case 5: Combining regular and outer table functions + sql """ DROP TABLE IF EXISTS test_outer_combined; """ + sql """ + CREATE TABLE test_outer_combined ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_outer_combined VALUES + (1, 5), + (2, NULL), + (3, 3); + """ + + // Mix regular and outer: regular filters, outer preserves + qt_outer_mixed_functions """ + SELECT id, r1.num as regular_num, r2.num as outer_num + FROM test_outer_combined + LATERAL VIEW py_expand_range(value) r1 AS num + LATERAL VIEW py_expand_range_outer(value) r2 AS num + ORDER BY id, regular_num, outer_num; + """ + + // Test Case 6: Verify outer behavior with built-in functions + sql """ DROP TABLE IF EXISTS test_outer_builtin; """ + sql """ + CREATE TABLE test_outer_builtin ( + id INT, + arr ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_outer_builtin VALUES + (1, [1, 2, 3]), -- normal array + (2, NULL), -- NULL array + (3, []), -- empty array + (4, [5]); -- single element + """ + + // Built-in explode (no outer): skips NULL and empty + qt_outer_builtin_explode """ + SELECT id, elem + FROM test_outer_builtin + LATERAL VIEW explode(arr) tmp AS elem + ORDER BY id, elem; + """ + + // Built-in explode_outer: preserves NULL and empty rows + qt_outer_builtin_explode_outer """ + SELECT id, elem + FROM test_outer_builtin + LATERAL VIEW explode_outer(arr) tmp AS elem + ORDER BY id, elem; + """ + + // Test Case 7: Documentation example - LEFT OUTER JOIN semantics + sql """ DROP TABLE IF EXISTS orders; """ + sql """ + CREATE TABLE orders ( + order_id INT, + items STRING + ) ENGINE=OLAP + DUPLICATE KEY(order_id) + DISTRIBUTED BY HASH(order_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO orders VALUES + (1, 'apple,banana'), -- order with items + (2, NULL), -- order with NULL items + (3, ''), -- order with empty items + (4, 'cherry'); -- order with one item + """ + + // Without outer: orders 2 and 3 disappear (like INNER JOIN) + qt_outer_doc_inner """ + SELECT order_id, item + FROM orders + LATERAL VIEW py_split_words(items) tmp AS item + ORDER BY order_id, item; + """ + + // With outer: all orders preserved (like LEFT OUTER JOIN) + 
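+        // Conceptually, the _outer variant behaves like this wrapper around the
+        // registered generator (an illustrative Python sketch only; the actual
+        // NULL padding is done by the engine, not in user code):
+        //
+        //   def outer(udtf, *args):
+        //       produced = False
+        //       for row in udtf(*args):
+        //           produced = True
+        //           yield row
+        //       if not produced:
+        //           yield None   # one all-NULL output row for this input row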
+        qt_outer_doc_outer """
+            SELECT order_id, item
+            FROM orders
+            LATERAL VIEW py_split_words_outer(items) tmp AS item
+            ORDER BY order_id, item;
+        """
+
+    } finally {
+        try_sql("DROP FUNCTION IF EXISTS py_split_string(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_generate_series(INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_running_sum(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_explode_json_array(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_top_n(INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_duplicate_n_times(STRING, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_filter_positive(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_cartesian(STRING, STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_filter_negative(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_nullable_processor(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_non_nullable_processor(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_default_nullable(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_nullable_explode(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_scalar_int(INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_scalar_string(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_mixed_style(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_return_scalar(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_multi_field_check(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_process_value(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_split_words(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_expand_range(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_parse_csv(STRING);")
+        try_sql("DROP TABLE IF EXISTS temp_input;")
+        try_sql("DROP TABLE IF EXISTS temp_series;")
+        try_sql("DROP TABLE IF EXISTS numbers_table;")
+        try_sql("DROP TABLE IF EXISTS temp_json;")
+        try_sql("DROP TABLE IF EXISTS ranked_data;")
+        try_sql("DROP TABLE IF EXISTS temp_dup;")
+        try_sql("DROP TABLE IF EXISTS mixed_numbers;")
+        try_sql("DROP TABLE IF EXISTS temp_cart;")
+        try_sql("DROP TABLE IF EXISTS temp_all_positive;")
+        try_sql("DROP TABLE IF EXISTS temp_mixed;")
+        try_sql("DROP TABLE IF EXISTS temp_empty;")
+        try_sql("DROP TABLE IF EXISTS test_nullable;")
+        try_sql("DROP TABLE IF EXISTS test_non_nullable;")
+        try_sql("DROP TABLE IF EXISTS test_null_inputs;")
+        try_sql("DROP TABLE IF EXISTS test_default_nullable;")
+        try_sql("DROP TABLE IF EXISTS test_multi_nullable;")
+        try_sql("DROP TABLE IF EXISTS test_outer_basic;")
+        try_sql("DROP TABLE IF EXISTS test_outer_strings;")
+        try_sql("DROP TABLE IF EXISTS test_outer_range;")
+        try_sql("DROP TABLE IF EXISTS test_outer_multifield;")
+        try_sql("DROP TABLE IF EXISTS test_outer_combined;")
+        try_sql("DROP TABLE IF EXISTS test_outer_builtin;")
+        try_sql("DROP TABLE IF EXISTS orders;")
+    }
+}
diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_basic_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_basic_module.groovy
new file mode 100644
index 00000000000000..861317823f382b
--- /dev/null
+++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_basic_module.groovy
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_basic_module") { + // Basic Python UDTF tests using module-based deployment + + def pyPath = """${context.file.parent}/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Test 1: Simple String Split UDTF + // Input: Single string + // Output: Multiple rows (one per split part) + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_split_string_module(STRING); """ + sql """ + CREATE TABLES FUNCTION py_split_string_module(STRING) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.split_string_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS temp_input_module; """ + sql """ + CREATE TABLE temp_input_module ( + id INT, + input_str STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_input_module VALUES (1, 'apple,banana,cherry'); + """ + + qt_split_string """ + SELECT part + FROM temp_input_module + LATERAL VIEW py_split_string_module(input_str) tmp AS part; + """ + + // ======================================== + // Test 2: Generate Series UDTF + // Input: start, end integers + // Output: Sequence of integers + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_generate_series_module(INT, INT); """ + sql """ + CREATE TABLES FUNCTION py_generate_series_module(INT, INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.generate_series_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS temp_series_module; """ + sql """ + CREATE TABLE temp_series_module ( + id INT, + start_val INT, + end_val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_series_module VALUES (1, 1, 5), (2, 10, 12); + """ + + qt_generate_series """ + SELECT tmp.value + FROM temp_series_module + LATERAL VIEW py_generate_series_module(start_val, end_val) tmp AS value; + """ + + qt_generate_series_multiple """ + SELECT tmp.value + FROM temp_series_module + LATERAL VIEW py_generate_series_module(start_val, end_val) tmp AS value + ORDER BY tmp.value; + """ + + // ======================================== + // Test 3: Running Sum UDTF (without state management) + // Note: Function-based UDTFs cannot maintain state across calls + // Each row is processed independently + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_running_sum_module(INT); """ + sql """ + CREATE TABLES FUNCTION py_running_sum_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.running_sum_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP 
TABLE IF EXISTS numbers_table_module; """ + sql """ + CREATE TABLE numbers_table_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO numbers_table_module VALUES + (1, 10), + (2, 20), + (3, 30), + (4, 40); + """ + + qt_running_sum """ + SELECT original_value, cumulative_sum + FROM numbers_table_module + LATERAL VIEW py_running_sum_module(value) tmp AS original_value, cumulative_sum + ORDER BY original_value; + """ + + // ======================================== + // Test 4: Explode Array UDTF + // Similar to LATERAL VIEW explode in Hive + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_explode_json_array_module(STRING); """ + sql """ + CREATE TABLES FUNCTION py_explode_json_array_module(STRING) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.explode_json_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS temp_json_module; """ + sql """ + CREATE TABLE temp_json_module ( + id INT, + json_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_json_module VALUES (1, '["apple", "banana", "cherry"]'); + """ + + qt_explode_json """ + SELECT element + FROM temp_json_module + LATERAL VIEW py_explode_json_array_module(json_data) tmp AS element; + """ + + // ======================================== + // Test 5: Top-N UDTF (stateless version) + // Note: Without state, this simply returns first n values per row + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_top_n_module(INT, INT); """ + sql """ + CREATE TABLES FUNCTION py_top_n_module(INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.top_n_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS ranked_data_module; """ + sql """ + CREATE TABLE ranked_data_module ( + id INT, + category STRING, + value INT, + top_n INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO ranked_data_module VALUES + (1, 'A', 100, 2), + (2, 'A', 90, 2), + (3, 'A', 80, 2), + (4, 'A', 70, 2), + (5, 'B', 200, 2), + (6, 'B', 190, 2); + """ + + qt_top_n """ + SELECT category, tmp.value, tmp.rank + FROM ranked_data_module + LATERAL VIEW py_top_n_module(value, top_n) tmp AS value, rank + ORDER BY category, tmp.rank; + """ + + // ======================================== + // Test 6: Multiple Outputs per Input + // One input row can generate multiple output rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_duplicate_n_times_module(STRING, INT); """ + sql """ + CREATE TABLES FUNCTION py_duplicate_n_times_module(STRING, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.duplicate_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS temp_dup_module; """ + sql """ + CREATE TABLE temp_dup_module ( + id INT, + text STRING, + times INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_dup_module VALUES (1, 'Hello', 
3); + """ + + qt_duplicate """ + SELECT output, idx + FROM temp_dup_module + LATERAL VIEW py_duplicate_n_times_module(text, times) tmp AS output, idx; + """ + + // ======================================== + // Test 7: Conditional Output (Skip Rows) + // UDTF can skip rows by not yielding + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_filter_positive_module(INT); """ + sql """ + CREATE TABLES FUNCTION py_filter_positive_module(INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.filter_positive_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS mixed_numbers_module; """ + sql """ + CREATE TABLE mixed_numbers_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO mixed_numbers_module VALUES (1, -5), (2, 0), (3, 3), (4, -2), (5, 7), (6, 1); + """ + + qt_filter_positive """ + SELECT positive_value + FROM mixed_numbers_module + LATERAL VIEW py_filter_positive_module(value) tmp AS positive_value + ORDER BY positive_value; + """ + + // ======================================== + // Test 8: Cartesian Product UDTF + // Generate all combinations + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_cartesian_module(STRING, STRING); """ + sql """ + CREATE TABLES FUNCTION py_cartesian_module(STRING, STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.cartesian_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS temp_cart_module; """ + sql """ + CREATE TABLE temp_cart_module ( + id INT, + list1 STRING, + list2 STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO temp_cart_module VALUES (1, 'A,B', 'X,Y,Z'); + """ + + qt_cartesian """ + SELECT item1, item2 + FROM temp_cart_module + LATERAL VIEW py_cartesian_module(list1, list2) tmp AS item1, item2 + ORDER BY item1, item2; + """ + + // ======================================== + // Test 9: All Rows Filtered (Empty Output) + // Tests data_batch = None case + // ======================================== + sql """ DROP FUNCTION IF EXISTS py_filter_negative_module(INT); """ + sql """ + CREATE TABLES FUNCTION py_filter_negative_module(INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.basic_udtf.filter_negative_udtf", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS temp_all_positive_module; """ + sql """ + CREATE TABLE temp_all_positive_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // Insert only positive numbers - all should be filtered + sql """ + INSERT INTO temp_all_positive_module VALUES (1, 10), (2, 20), (3, 30); + """ + + // Expected: No output rows (all filtered), but should not crash + qt_all_filtered """ + SELECT id, neg_value + FROM temp_all_positive_module + LATERAL VIEW py_filter_negative_module(value) tmp AS neg_value + ORDER BY id; + """ + + // ======================================== + // Test 10: Mixed - Some Filtered, Some Not + // ======================================== + sql """ DROP TABLE IF EXISTS 
temp_mixed_module; """
+        sql """
+            CREATE TABLE temp_mixed_module (
+                id INT,
+                value INT
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        // Mix of positive and negative - only negative should pass
+        sql """
+            INSERT INTO temp_mixed_module VALUES (1, 10), (2, -5), (3, 20), (4, -3);
+        """
+
+        qt_mixed_filter """
+            SELECT id, neg_value
+            FROM temp_mixed_module
+            LATERAL VIEW py_filter_negative_module(value) tmp AS neg_value
+            ORDER BY id, neg_value;
+        """
+
+        // ========================================
+        // Test 11: Empty Input Table
+        // Tests empty batch case
+        // ========================================
+        sql """ DROP TABLE IF EXISTS temp_empty_module; """
+        sql """
+            CREATE TABLE temp_empty_module (
+                id INT,
+                value INT
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        // No data inserted - empty table
+        qt_empty_input """
+            SELECT id, neg_value
+            FROM temp_empty_module
+            LATERAL VIEW py_filter_negative_module(value) tmp AS neg_value;
+        """
+
+        // ========================================
+        // Test 12: always_nullable = true (default)
+        // Function can return NULL even with NOT NULL input
+        // ========================================
+        sql """ DROP FUNCTION IF EXISTS py_nullable_processor_module(INT); """
+        sql """
+            CREATE TABLES FUNCTION py_nullable_processor_module(INT)
+            RETURNS ARRAY<INT>
+            PROPERTIES (
+                "file" = "file://${pyPath}",
+                "symbol" = "pyudtf_module.basic_udtf.nullable_processor_udtf",
+                "type" = "PYTHON_UDF",
+                "runtime_version" = "${runtime_version}",
+                "always_nullable" = "true"
+            );
+        """
+
+    } finally {
+        try_sql("DROP FUNCTION IF EXISTS py_split_string_module(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_generate_series_module(INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_running_sum_module(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_explode_json_array_module(STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_top_n_module(INT, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_duplicate_n_times_module(STRING, INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_filter_positive_module(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_cartesian_module(STRING, STRING);")
+        try_sql("DROP FUNCTION IF EXISTS py_filter_negative_module(INT);")
+        try_sql("DROP FUNCTION IF EXISTS py_nullable_processor_module(INT);")
+        try_sql("DROP TABLE IF EXISTS temp_input_module;")
+        try_sql("DROP TABLE IF EXISTS temp_series_module;")
+        try_sql("DROP TABLE IF EXISTS numbers_table_module;")
+        try_sql("DROP TABLE IF EXISTS temp_json_module;")
+        try_sql("DROP TABLE IF EXISTS ranked_data_module;")
+        try_sql("DROP TABLE IF EXISTS temp_dup_module;")
+        try_sql("DROP TABLE IF EXISTS mixed_numbers_module;")
+        try_sql("DROP TABLE IF EXISTS temp_cart_module;")
+        try_sql("DROP TABLE IF EXISTS temp_all_positive_module;")
+        try_sql("DROP TABLE IF EXISTS temp_mixed_module;")
+        try_sql("DROP TABLE IF EXISTS temp_empty_module;")
+    }
+}
diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_data_types_inline.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_data_types_inline.groovy
new file mode 100644
index 00000000000000..ea4d24e1568f89
--- /dev/null
+++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_data_types_inline.groovy
@@ -0,0 +1,979 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_data_types_inline") { + // Test Python UDTF with Various Data Types + // Coverage: Basic types, numeric types, date/time types, complex types + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Type 1: TINYINT (1-byte integer: -128 to 127) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_tinyint(TINYINT); """ + sql """ + CREATE TABLES FUNCTION udtf_tinyint(TINYINT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_tinyint", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_tinyint(v): + '''Process TINYINT: test small integer range''' + if v is not None: + yield (v, v * 2) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_tinyint; """ + sql """ + CREATE TABLE test_tinyint ( + id INT, + v TINYINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_tinyint VALUES (1, -60), (2, 0), (3, 63); + """ + + qt_tinyint """ + SELECT tmp.original, tmp.doubled + FROM test_tinyint + LATERAL VIEW udtf_tinyint(v) tmp AS original, doubled + ORDER BY tmp.original; + """ + + // ======================================== + // Type 2: SMALLINT (2-byte integer: -32768 to 32767) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_smallint(SMALLINT); """ + sql """ + CREATE TABLES FUNCTION udtf_smallint(SMALLINT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_smallint", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_smallint(v): + '''Process SMALLINT: test medium integer range''' + if v is not None: + yield (v, v * v) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_smallint; """ + sql """ + CREATE TABLE test_smallint ( + id INT, + v SMALLINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_smallint VALUES (1, -1000), (2, 0), (3, 1000); + """ + + qt_smallint """ + SELECT tmp.original, tmp.squared + FROM test_smallint + LATERAL VIEW udtf_smallint(v) tmp AS original, squared + ORDER BY tmp.original; + """ + + // ======================================== + // Type 3: BIGINT (8-byte integer) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_bigint(BIGINT); """ + sql """ + CREATE TABLES FUNCTION udtf_bigint(BIGINT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_bigint", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_bigint(v): + '''Process BIGINT: test large integer range''' + if v is not None: + yield (v, v + 1) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_bigint; """ + sql """ + CREATE TABLE test_bigint ( + id INT, + v BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_bigint VALUES (1, -1000000000000), (2, 0), (3, 1000000000000); + """ + + qt_bigint """ + SELECT tmp.original, tmp.incremented + FROM test_bigint 
+ LATERAL VIEW udtf_bigint(v) tmp AS original, incremented + ORDER BY tmp.original; + """ + + // ======================================== + // Type 4: FLOAT (4-byte floating point) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_float(FLOAT); """ + sql """ + CREATE TABLES FUNCTION udtf_float(FLOAT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_float", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_float(v): + '''Process FLOAT: test floating point numbers''' + if v is not None: + yield (v, v / 2.0) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_float; """ + sql """ + CREATE TABLE test_float ( + id INT, + v FLOAT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_float VALUES (1, -3.14), (2, 0.0), (3, 2.718); + """ + + qt_float """ + SELECT tmp.original, tmp.halved + FROM test_float + LATERAL VIEW udtf_float(v) tmp AS original, halved + ORDER BY tmp.original; + """ + + // ======================================== + // Type 5: DOUBLE (8-byte floating point) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_double(DOUBLE); """ + sql """ + CREATE TABLES FUNCTION udtf_double(DOUBLE) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_double", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import math + +def process_double(v): + '''Process DOUBLE: test high precision floating point''' + if v is not None and v >= 0: + yield (v, math.sqrt(v)) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_double; """ + sql """ + CREATE TABLE test_double ( + id INT, + v DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_double VALUES (1, 0.0), (2, 4.0), (3, 16.0), (4, 100.0); + """ + + qt_double """ + SELECT tmp.original, tmp.sqrt_value + FROM test_double + LATERAL VIEW udtf_double(v) tmp AS original, sqrt_value + ORDER BY tmp.original; + """ + + // ======================================== + // Type 6: BOOLEAN + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_boolean(BOOLEAN); """ + sql """ + CREATE TABLES FUNCTION udtf_boolean(BOOLEAN) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_boolean", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_boolean(v): + '''Process BOOLEAN: test true/false values''' + if v is not None: + yield (v, not v, 'TRUE' if v else 'FALSE') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_boolean; """ + sql """ + CREATE TABLE test_boolean ( + id INT, + v BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_boolean VALUES (1, true), (2, false); + """ + + qt_boolean """ + SELECT tmp.original, tmp.negated, tmp.as_string + FROM test_boolean + LATERAL VIEW udtf_boolean(v) tmp AS original, negated, as_string + ORDER BY tmp.original; + """ + + // ======================================== + // Type 7: STRING (Variable length text) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_string(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_string(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_string", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_string(v): + '''Process STRING: test text 
manipulation''' + if v is not None: + yield (v, len(v), v.upper(), v.lower()) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_string; """ + sql """ + CREATE TABLE test_string ( + id INT, + v STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_string VALUES (1, 'Hello'), (2, 'WORLD'), (3, 'DoRiS'); + """ + + qt_string """ + SELECT tmp.original, tmp.length, tmp.upper, tmp.lower + FROM test_string + LATERAL VIEW udtf_string(v) tmp AS original, length, upper, lower + ORDER BY tmp.original; + """ + + // ======================================== + // Type 8: DATE (Date without time) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_date(DATE); """ + sql """ + CREATE TABLES FUNCTION udtf_date(DATE) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_date", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_date(v): + '''Process DATE: extract date components''' + if v is not None: + # v is a datetime.date object + yield (v, v.year, v.month, v.day) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_date; """ + sql """ + CREATE TABLE test_date ( + id INT, + v DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_date VALUES (1, '2024-01-01'), (2, '2024-06-15'), (3, '2024-12-31'); + """ + + qt_date """ + SELECT tmp.original, tmp.year, tmp.month, tmp.day + FROM test_date + LATERAL VIEW udtf_date(v) tmp AS original, year, month, day + ORDER BY tmp.original; + """ + + // ======================================== + // Type 9: DATETIME (Date with time) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_datetime(DATETIME); """ + sql """ + CREATE TABLES FUNCTION udtf_datetime(DATETIME) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_datetime", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_datetime(v): + '''Process DATETIME: extract time components''' + if v is not None: + # v is a datetime.datetime object + yield (v, v.hour, v.minute) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_datetime; """ + sql """ + CREATE TABLE test_datetime ( + id INT, + v DATETIME + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_datetime VALUES + (1, '2024-01-01 08:30:00'), + (2, '2024-06-15 12:00:00'), + (3, '2024-12-31 23:59:00'); + """ + + qt_datetime """ + SELECT tmp.original, tmp.hour, tmp.minute + FROM test_datetime + LATERAL VIEW udtf_datetime(v) tmp AS original, hour, minute + ORDER BY tmp.original; + """ + + // ======================================== + // Type 10: ARRAY (Array of integers) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_array_int(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_array_int(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_array_int", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_array_int(arr): + '''Process ARRAY: explode array and process each element''' + if arr is not None: + for i, elem in enumerate(arr): + if elem is not None: + yield (i, elem, elem * 2) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_array_int; """ + sql """ + CREATE TABLE test_array_int ( + id INT, + v ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + 
PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_int VALUES + (1, [1, 2, 3]), + (2, [10, 20]), + (3, [100]); + """ + + qt_array_int """ + SELECT id, tmp.arr_pos, tmp.element, tmp.doubled + FROM test_array_int + LATERAL VIEW udtf_array_int(v) tmp AS arr_pos, element, doubled + ORDER BY id, tmp.arr_pos; + """ + + // ======================================== + // Type 11: ARRAY (Array of strings) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_array_string(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_array_string(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_array_string", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_array_string(arr): + '''Process ARRAY: explode and get string lengths''' + if arr is not None: + for elem in arr: + if elem is not None: + yield (elem, len(elem)) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_array_string; """ + sql """ + CREATE TABLE test_array_string ( + id INT, + v ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_string VALUES + (1, ['apple', 'banana']), + (2, ['cat', 'dog', 'bird']); + """ + + qt_array_string """ + SELECT id, tmp.element, tmp.length + FROM test_array_string + LATERAL VIEW udtf_array_string(v) tmp AS element, length + ORDER BY id, tmp.element; + """ + + // ======================================== + // Type 12: STRUCT (Structured data) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_struct(STRUCT); """ + sql """ + CREATE TABLES FUNCTION udtf_struct(STRUCT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_struct", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_struct(person): + '''Process STRUCT: access struct fields''' + if person is not None: + name = person['name'] if 'name' in person else None + age = person['age'] if 'age' in person else None + + if name is not None and age is not None: + category = 'child' if age < 18 else 'adult' + yield (name, age, category) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_struct; """ + sql """ + CREATE TABLE test_struct ( + id INT, + person STRUCT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_struct VALUES + (1, named_struct('name', 'Alice', 'age', 25)), + (2, named_struct('name', 'Bob', 'age', 15)), + (3, named_struct('name', 'Charlie', 'age', 30)); + """ + + qt_struct """ + SELECT tmp.name, tmp.age, tmp.category + FROM test_struct + LATERAL VIEW udtf_struct(person) tmp AS name, age, category + ORDER BY tmp.name; + """ + + // ======================================== + // Type 13: Multiple Input Types (INT, STRING) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_multi_types(INT, STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_multi_types(INT, STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_multi_types", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_multi_types(num, text): + '''Process multiple input types''' + if num is not None and text is not None: + yield (num, text, f"{text}_{num}") +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_multi_types; """ + sql """ + CREATE TABLE test_multi_types ( + id INT, + num INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) 
+
+        // ========================================
+        // Type 14: DECIMAL (High precision decimal)
+        // ========================================
+        sql """ DROP FUNCTION IF EXISTS udtf_decimal(DECIMAL(10,2)); """
+        sql """
+            CREATE TABLES FUNCTION udtf_decimal(DECIMAL(10,2))
+            RETURNS ARRAY<STRUCT<original:DECIMAL(10,2), doubled:DECIMAL(10,2)>>
+            PROPERTIES (
+                "type" = "PYTHON_UDF",
+                "symbol" = "process_decimal",
+                "runtime_version" = "3.8.10"
+            )
+            AS \$\$
+from decimal import Decimal
+
+def process_decimal(v):
+    '''Process DECIMAL: high precision arithmetic'''
+    if v is not None:
+        doubled = v * 2
+        yield (v, doubled)
+\$\$;
+        """
+
+        sql """ DROP TABLE IF EXISTS test_decimal; """
+        sql """
+            CREATE TABLE test_decimal (
+                id INT,
+                v DECIMAL(10,2)
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        sql """
+            INSERT INTO test_decimal VALUES (1, 123.45), (2, 678.90), (3, 999.99);
+        """
+
+        qt_decimal """
+            SELECT tmp.original, tmp.doubled
+            FROM test_decimal
+            LATERAL VIEW udtf_decimal(v) tmp AS original, doubled
+            ORDER BY tmp.original;
+        """
+
+        // ========================================
+        // Section: P1 - Complex Data Types
+        // ========================================
+
+        // Test P1.1: MAP type (if supported)
+        // Note: Doris may not fully support MAP in UDTF, so this is tested
+        // with a string-based workaround
+        sql """ DROP FUNCTION IF EXISTS udtf_map_processor(STRING); """
+        sql """
+            CREATE TABLES FUNCTION udtf_map_processor(STRING)
+            RETURNS ARRAY<STRUCT<k:STRING, v:INT>>
+            PROPERTIES (
+                "type" = "PYTHON_UDF",
+                "symbol" = "process_map_string",
+                "runtime_version" = "3.8.10"
+            )
+            AS \$\$
+def process_map_string(map_str):
+    '''Process map-like string (key1:val1,key2:val2)'''
+    if map_str:
+        pairs = map_str.split(',')
+        for pair in pairs:
+            if ':' in pair:
+                k, val = pair.split(':', 1)
+                try:
+                    yield (k.strip(), int(val.strip()))
+                except ValueError:
+                    pass
+\$\$;
+        """
+
+        sql """ DROP TABLE IF EXISTS test_map_like; """
+        sql """
+            CREATE TABLE test_map_like (
+                id INT,
+                map_data STRING
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        sql """
+            INSERT INTO test_map_like VALUES
+            (1, 'age:25,score:90'),
+            (2, 'age:30,score:85,level:3');
+        """
+
+        qt_map_like """
+            SELECT id, tmp.k, tmp.v
+            FROM test_map_like
+            LATERAL VIEW udtf_map_processor(map_data) tmp AS k, v
+            ORDER BY id, tmp.k;
+        """
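+
+        // Note: the ValueError guard in process_map_string above silently drops
+        // any pair whose value is not an integer, so qt_map_like should only
+        // ever see integer-valued entries.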
+
+        // Test P1.2: Nested ARRAY (ARRAY<ARRAY<INT>> simulated)
+        sql """ DROP FUNCTION IF EXISTS udtf_nested_array(STRING); """
+        sql """
+            CREATE TABLES FUNCTION udtf_nested_array(STRING)
+            RETURNS ARRAY<STRUCT<group_idx:INT, element:INT>>
+            PROPERTIES (
+                "type" = "PYTHON_UDF",
+                "symbol" = "process_nested_array",
+                "runtime_version" = "3.8.10"
+            )
+            AS \$\$
+def process_nested_array(nested_str):
+    '''Process nested array string ([[1,2],[3,4]])'''
+    if nested_str:
+        # Remove brackets and split by ],[
+        nested_str = nested_str.strip('[]')
+        groups = nested_str.split('],[')
+
+        for group_idx, group in enumerate(groups):
+            elements = group.strip('[]').split(',')
+            for elem in elements:
+                try:
+                    yield (group_idx, int(elem.strip()))
+                except ValueError:
+                    pass
+\$\$;
+        """
+
+        sql """ DROP TABLE IF EXISTS test_nested_array; """
+        sql """
+            CREATE TABLE test_nested_array (
+                id INT,
+                nested_data STRING
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        sql """
+            INSERT INTO test_nested_array VALUES
+            (1, '[[10,20],[30,40]]'),
+            (2, '[[50],[60,70,80]]');
+        """
+
+        qt_nested_array """
+            SELECT id, tmp.group_idx, tmp.element
+            FROM test_nested_array
+            LATERAL VIEW udtf_nested_array(nested_data) tmp AS group_idx, element
+            ORDER BY id, tmp.group_idx, tmp.element;
+        """
+
+        // Test P1.3: ARRAY<STRUCT<...>> (simulated)
+        sql """ DROP FUNCTION IF EXISTS udtf_array_of_structs(STRING); """
+        sql """
+            CREATE TABLES FUNCTION udtf_array_of_structs(STRING)
+            RETURNS ARRAY<STRUCT<name:STRING, age:INT, score:INT>>
+            PROPERTIES (
+                "type" = "PYTHON_UDF",
+                "symbol" = "process_array_structs",
+                "runtime_version" = "3.8.10"
+            )
+            AS \$\$
+def process_array_structs(data):
+    '''Process array of structs (name:age:score|name:age:score)'''
+    if data:
+        items = data.split('|')
+        for item in items:
+            parts = item.split(':')
+            if len(parts) == 3:
+                try:
+                    yield (parts[0], int(parts[1]), int(parts[2]))
+                except ValueError:
+                    pass
+\$\$;
+        """
+
+        sql """ DROP TABLE IF EXISTS test_array_structs; """
+        sql """
+            CREATE TABLE test_array_structs (
+                id INT,
+                data STRING
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        sql """
+            INSERT INTO test_array_structs VALUES
+            (1, 'Alice:25:90|Bob:30:85'),
+            (2, 'Charlie:28:88');
+        """
+
+        qt_array_structs """
+            SELECT id, tmp.name, tmp.age, tmp.score
+            FROM test_array_structs
+            LATERAL VIEW udtf_array_of_structs(data) tmp AS name, age, score
+            ORDER BY id, tmp.name;
+        """
+
+        // Test P1.4: STRUCT with nested ARRAY
+        sql """ DROP FUNCTION IF EXISTS udtf_struct_with_array(STRING); """
+        sql """
+            CREATE TABLES FUNCTION udtf_struct_with_array(STRING)
+            RETURNS ARRAY<STRUCT<person_name:STRING, tag_count:INT, tags:STRING>>
+            PROPERTIES (
+                "type" = "PYTHON_UDF",
+                "symbol" = "process_struct_array",
+                "runtime_version" = "3.8.10"
+            )
+            AS \$\$
+def process_struct_array(data):
+    '''Process struct with array (name:tag1,tag2,tag3)'''
+    if data and ':' in data:
+        name, tags = data.split(':', 1)
+        tag_list = tags.split(',')
+        yield (name, len(tag_list), ','.join(tag_list))
+\$\$;
+        """
+
+        sql """ DROP TABLE IF EXISTS test_struct_array; """
+        sql """
+            CREATE TABLE test_struct_array (
+                id INT,
+                person_tags STRING
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
+        sql """
+            INSERT INTO test_struct_array VALUES
+            (1, 'Alice:sports,music,reading'),
+            (2, 'Bob:coding,gaming');
+        """
+
+        qt_struct_array """
+            SELECT id, tmp.person_name, tmp.tag_count, tmp.tags
+            FROM test_struct_array
+            LATERAL VIEW udtf_struct_with_array(person_tags) tmp AS person_name, tag_count, tags
+            ORDER BY id;
+        """
+
+        // Test P1.5: JSON-like data processing
+        sql """ DROP FUNCTION IF EXISTS udtf_json_extract(STRING); """
+        sql """
+            CREATE TABLES FUNCTION udtf_json_extract(STRING)
+            RETURNS ARRAY<STRUCT<field:STRING, v:STRING>>
+            PROPERTIES (
+                "type" = "PYTHON_UDF",
+                "symbol" = "extract_json_fields",
+                "runtime_version" = "3.8.10"
+            )
+            AS \$\$
+import json
+
+def extract_json_fields(json_str):
+    '''Extract JSON fields'''
+    if json_str:
+        try:
+            data = json.loads(json_str)
+            if isinstance(data, dict):
+                for k, v in data.items():
+                    yield (k, str(v))
+        except ValueError:  # malformed JSON; json.JSONDecodeError is a ValueError
+            pass
+\$\$;
+        """
+
+        sql """ DROP TABLE IF EXISTS test_json_data; """
+        sql """
+            CREATE TABLE test_json_data (
+                id INT,
+                json_content STRING
+            ) ENGINE=OLAP
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES("replication_num" = "1");
+        """
+
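+        // Given the rows inserted below, extract_json_fields should emit one
+        // (field, value) pair per JSON key: ('age','25'), ('city','NYC'),
+        // ('name','Alice') for row 1, and ('age','30'), ('name','Bob') for row 2.
+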
sql """ + INSERT INTO test_json_data VALUES + (1, '{"name":"Alice","age":25,"city":"NYC"}'), + (2, '{"name":"Bob","age":30}'); + """ + + qt_json_extract """ + SELECT id, tmp.field, tmp.v + FROM test_json_data + LATERAL VIEW udtf_json_extract(json_content) tmp AS field, v + ORDER BY id, tmp.field; + """ + + // Test P1.6: Complex nested STRUCT + sql """ DROP FUNCTION IF EXISTS udtf_complex_struct(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_complex_struct(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_complex_struct", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_complex_struct(data): + '''Process complex struct (id:name:city:zip)''' + if data: + parts = data.split(':') + if len(parts) == 4: + try: + yield (int(parts[0]), parts[1], parts[2], parts[3]) + except ValueError: + pass +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_complex_struct; """ + sql """ + CREATE TABLE test_complex_struct ( + id INT, + user_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_complex_struct VALUES + (1, '101:Alice:NYC:10001'), + (2, '102:Bob:LA:90001'); + """ + + qt_complex_struct """ + SELECT id, tmp.user_id, tmp.user_name, tmp.address_city, tmp.address_zip + FROM test_complex_struct + LATERAL VIEW udtf_complex_struct(user_data) tmp AS user_id, user_name, address_city, address_zip + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS udtf_tinyint(TINYINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_smallint(SMALLINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_bigint(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_float(FLOAT);") + try_sql("DROP FUNCTION IF EXISTS udtf_double(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udtf_boolean(BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS udtf_string(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_date(DATE);") + try_sql("DROP FUNCTION IF EXISTS udtf_datetime(DATETIME);") + try_sql("DROP FUNCTION IF EXISTS udtf_array_int(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_array_string(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_struct(STRUCT);") + try_sql("DROP FUNCTION IF EXISTS udtf_multi_types(INT, STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_decimal(DECIMAL(10,2));") + try_sql("DROP FUNCTION IF EXISTS udtf_map_processor(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_nested_array(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_array_of_structs(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_struct_with_array(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_json_extract(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_complex_struct(STRING);") + try_sql("DROP TABLE IF EXISTS test_tinyint;") + try_sql("DROP TABLE IF EXISTS test_smallint;") + try_sql("DROP TABLE IF EXISTS test_bigint;") + try_sql("DROP TABLE IF EXISTS test_float;") + try_sql("DROP TABLE IF EXISTS test_double;") + try_sql("DROP TABLE IF EXISTS test_boolean;") + try_sql("DROP TABLE IF EXISTS test_string;") + try_sql("DROP TABLE IF EXISTS test_date;") + try_sql("DROP TABLE IF EXISTS test_datetime;") + try_sql("DROP TABLE IF EXISTS test_array_int;") + try_sql("DROP TABLE IF EXISTS test_array_string;") + try_sql("DROP TABLE IF EXISTS test_struct;") + try_sql("DROP TABLE IF EXISTS test_multi_types;") + try_sql("DROP TABLE IF EXISTS test_decimal;") + try_sql("DROP TABLE IF EXISTS test_map_like;") + try_sql("DROP TABLE IF EXISTS test_nested_array;") + try_sql("DROP TABLE IF EXISTS 
test_array_structs;") + try_sql("DROP TABLE IF EXISTS test_struct_array;") + try_sql("DROP TABLE IF EXISTS test_json_data;") + try_sql("DROP TABLE IF EXISTS test_complex_struct;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_data_types_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_data_types_module.groovy new file mode 100644 index 00000000000000..771b4fbe62f569 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_data_types_module.groovy @@ -0,0 +1,827 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_data_types_module") { + // Test Python UDTF with Various Data Types using module-based deployment + // UDTFs are loaded from pyudtf.zip file + + def pyPath = """${context.file.parent}/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Type 1: TINYINT (1-byte integer: -128 to 127) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_tinyint_module(TINYINT); """ + sql """ + CREATE TABLES FUNCTION udtf_tinyint_module(TINYINT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_tinyint", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_tinyint_module; """ + sql """ + CREATE TABLE test_tinyint_module ( + id INT, + v TINYINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_tinyint_module VALUES (1, -60), (2, 0), (3, 63); + """ + + qt_tinyint """ + SELECT tmp.original, tmp.doubled + FROM test_tinyint_module + LATERAL VIEW udtf_tinyint_module(v) tmp AS original, doubled + ORDER BY tmp.original; + """ + + // ======================================== + // Type 2: SMALLINT (2-byte integer: -32768 to 32767) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_smallint_module(SMALLINT); """ + sql """ + CREATE TABLES FUNCTION udtf_smallint_module(SMALLINT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_smallint", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_smallint_module; """ + sql """ + CREATE TABLE test_smallint_module ( + id INT, + v SMALLINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_smallint_module VALUES (1, -1000), (2, 0), (3, 1000); + """ + + qt_smallint """ + SELECT 
tmp.original, tmp.squared + FROM test_smallint_module + LATERAL VIEW udtf_smallint_module(v) tmp AS original, squared + ORDER BY tmp.original; + """ + + // ======================================== + // Type 3: BIGINT (8-byte integer) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_bigint_module(BIGINT); """ + sql """ + CREATE TABLES FUNCTION udtf_bigint_module(BIGINT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_bigint", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_bigint_module; """ + sql """ + CREATE TABLE test_bigint_module ( + id INT, + v BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_bigint_module VALUES (1, -1000000000000), (2, 0), (3, 1000000000000); + """ + + qt_bigint """ + SELECT tmp.original, tmp.incremented + FROM test_bigint_module + LATERAL VIEW udtf_bigint_module(v) tmp AS original, incremented + ORDER BY tmp.original; + """ + + // ======================================== + // Type 4: FLOAT (4-byte floating point) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_float_module(FLOAT); """ + sql """ + CREATE TABLES FUNCTION udtf_float_module(FLOAT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_float", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_float_module; """ + sql """ + CREATE TABLE test_float_module ( + id INT, + v FLOAT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_float_module VALUES (1, -3.14), (2, 0.0), (3, 2.718); + """ + + qt_float """ + SELECT tmp.original, tmp.halved + FROM test_float_module + LATERAL VIEW udtf_float_module(v) tmp AS original, halved + ORDER BY tmp.original; + """ + + // ======================================== + // Type 5: DOUBLE (8-byte floating point) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_double_module(DOUBLE); """ + sql """ + CREATE TABLES FUNCTION udtf_double_module(DOUBLE) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_double", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_double_module; """ + sql """ + CREATE TABLE test_double_module ( + id INT, + v DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_double_module VALUES (1, 0.0), (2, 4.0), (3, 16.0), (4, 100.0); + """ + + qt_double """ + SELECT tmp.original, tmp.sqrt_value + FROM test_double_module + LATERAL VIEW udtf_double_module(v) tmp AS original, sqrt_value + ORDER BY tmp.original; + """ + + // ======================================== + // Type 6: BOOLEAN + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_boolean_module(BOOLEAN); """ + sql """ + CREATE TABLES FUNCTION udtf_boolean_module(BOOLEAN) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_boolean", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP 
TABLE IF EXISTS test_boolean_module; """ + sql """ + CREATE TABLE test_boolean_module ( + id INT, + v BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_boolean_module VALUES (1, true), (2, false); + """ + + qt_boolean """ + SELECT tmp.original, tmp.negated, tmp.as_string + FROM test_boolean_module + LATERAL VIEW udtf_boolean_module(v) tmp AS original, negated, as_string + ORDER BY tmp.original; + """ + + // ======================================== + // Type 7: STRING (Variable length text) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_string_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_string_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_string", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_string_module; """ + sql """ + CREATE TABLE test_string_module ( + id INT, + v STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_string_module VALUES (1, 'Hello'), (2, 'WORLD'), (3, 'DoRiS'); + """ + + qt_string """ + SELECT tmp.original, tmp.length, tmp.upper, tmp.lower + FROM test_string_module + LATERAL VIEW udtf_string_module(v) tmp AS original, length, upper, lower + ORDER BY tmp.original; + """ + + // ======================================== + // Type 8: DATE (Date without time) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_date_module(DATE); """ + sql """ + CREATE TABLES FUNCTION udtf_date_module(DATE) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_date", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_date_module; """ + sql """ + CREATE TABLE test_date_module ( + id INT, + v DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_date_module VALUES (1, '2024-01-01'), (2, '2024-06-15'), (3, '2024-12-31'); + """ + + qt_date """ + SELECT tmp.original, tmp.year, tmp.month, tmp.day + FROM test_date_module + LATERAL VIEW udtf_date_module(v) tmp AS original, year, month, day + ORDER BY tmp.original; + """ + + // ======================================== + // Type 9: DATETIME (Date with time) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_datetime_module(DATETIME); """ + sql """ + CREATE TABLES FUNCTION udtf_datetime_module(DATETIME) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_datetime", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_datetime_module; """ + sql """ + CREATE TABLE test_datetime_module ( + id INT, + v DATETIME + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_datetime_module VALUES + (1, '2024-01-01 08:30:00'), + (2, '2024-06-15 12:00:00'), + (3, '2024-12-31 23:59:00'); + """ + + qt_datetime """ + SELECT tmp.original, tmp.hour, tmp.minute + FROM test_datetime_module + LATERAL VIEW udtf_datetime_module(v) tmp AS original, hour, minute + 
ORDER BY tmp.original; + """ + + // ======================================== + // Type 10: ARRAY (Array of integers) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_array_int_module(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_array_int_module(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_array_int", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_array_int_module; """ + sql """ + CREATE TABLE test_array_int_module ( + id INT, + v ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_int_module VALUES + (1, [1, 2, 3]), + (2, [10, 20]), + (3, [100]); + """ + + qt_array_int """ + SELECT id, tmp.arr_pos, tmp.element, tmp.doubled + FROM test_array_int_module + LATERAL VIEW udtf_array_int_module(v) tmp AS arr_pos, element, doubled + ORDER BY id, tmp.arr_pos; + """ + + // ======================================== + // Type 11: ARRAY (Array of strings) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_array_string_module(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_array_string_module(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_array_string", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_array_string_module; """ + sql """ + CREATE TABLE test_array_string_module ( + id INT, + v ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_string_module VALUES + (1, ['apple', 'banana']), + (2, ['cat', 'dog', 'bird']); + """ + + qt_array_string """ + SELECT id, tmp.element, tmp.length + FROM test_array_string_module + LATERAL VIEW udtf_array_string_module(v) tmp AS element, length + ORDER BY id, tmp.element; + """ + + // ======================================== + // Type 12: STRUCT (Structured data) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_struct_module(STRUCT); """ + sql """ + CREATE TABLES FUNCTION udtf_struct_module(STRUCT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_struct", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_struct_module; """ + sql """ + CREATE TABLE test_struct_module ( + id INT, + person STRUCT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_struct_module VALUES + (1, named_struct('name', 'Alice', 'age', 25)), + (2, named_struct('name', 'Bob', 'age', 15)), + (3, named_struct('name', 'Charlie', 'age', 30)); + """ + + qt_struct """ + SELECT tmp.name, tmp.age, tmp.category + FROM test_struct_module + LATERAL VIEW udtf_struct_module(person) tmp AS name, age, category + ORDER BY tmp.name; + """ + + // ======================================== + // Type 13: Multiple Input Types (INT, STRING) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_multi_types_module(INT, STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_multi_types_module(INT, STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = 
"file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_multi_types", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_multi_types_module; """ + sql """ + CREATE TABLE test_multi_types_module ( + id INT, + num INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_multi_types_module VALUES (1, 100, 'apple'), (2, 200, 'banana'); + """ + + qt_multi_types """ + SELECT tmp.number, tmp.text, tmp.combined + FROM test_multi_types_module + LATERAL VIEW udtf_multi_types_module(num, text) tmp AS number, text, combined + ORDER BY tmp.number; + """ + + // ======================================== + // Type 14: DECIMAL (High precision decimal) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_decimal_module(DECIMAL(10,2)); """ + sql """ + CREATE TABLES FUNCTION udtf_decimal_module(DECIMAL(10,2)) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_decimal", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_decimal_module; """ + sql """ + CREATE TABLE test_decimal_module ( + id INT, + v DECIMAL(10,2) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_decimal_module VALUES (1, 123.45), (2, 678.90), (3, 999.99); + """ + + qt_decimal """ + SELECT tmp.original, tmp.doubled + FROM test_decimal_module + LATERAL VIEW udtf_decimal_module(v) tmp AS original, doubled + ORDER BY tmp.original; + """ + + // ======================================== + // Section: P1 - Complex Data Types + // ======================================== + + // Test P1.1: MAP type (if supported) + // Note: Doris may not fully support MAP in UDTF, test with workaround + sql """ DROP FUNCTION IF EXISTS udtf_map_processor_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_map_processor_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_map_string", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_map_like_module; """ + sql """ + CREATE TABLE test_map_like_module ( + id INT, + map_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_map_like_module VALUES + (1, 'age:25,score:90'), + (2, 'age:30,score:85,level:3'); + """ + + qt_map_like """ + SELECT id, tmp.k, tmp.v + FROM test_map_like_module + LATERAL VIEW udtf_map_processor_module(map_data) tmp AS k, v + ORDER BY id, tmp.k; + """ + + // Test P1.2: Nested ARRAY (ARRAY> simulated) + sql """ DROP FUNCTION IF EXISTS udtf_nested_array_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_nested_array_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_nested_array", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_nested_array_module; """ + sql """ + CREATE TABLE test_nested_array_module ( + id INT, + nested_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql 
""" + INSERT INTO test_nested_array_module VALUES + (1, '[[10,20],[30,40]]'), + (2, '[[50],[60,70,80]]'); + """ + + qt_nested_array """ + SELECT id, tmp.group_idx, tmp.element + FROM test_nested_array_module + LATERAL VIEW udtf_nested_array_module(nested_data) tmp AS group_idx, element + ORDER BY id, tmp.group_idx, tmp.element; + """ + + // Test P1.3: ARRAY> + sql """ DROP FUNCTION IF EXISTS udtf_array_of_structs_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_array_of_structs_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_array_structs", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_array_structs_module; """ + sql """ + CREATE TABLE test_array_structs_module ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_structs_module VALUES + (1, 'Alice:25:90|Bob:30:85'), + (2, 'Charlie:28:88'); + """ + + qt_array_structs """ + SELECT id, tmp.name, tmp.age, tmp.score + FROM test_array_structs_module + LATERAL VIEW udtf_array_of_structs_module(data) tmp AS name, age, score + ORDER BY id, tmp.name; + """ + + // Test P1.4: STRUCT with nested ARRAY + sql """ DROP FUNCTION IF EXISTS udtf_struct_with_array_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_struct_with_array_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.process_struct_array", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_struct_array_module; """ + sql """ + CREATE TABLE test_struct_array_module ( + id INT, + person_tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_struct_array_module VALUES + (1, 'Alice:sports,music,reading'), + (2, 'Bob:coding,gaming'); + """ + + qt_struct_array """ + SELECT id, tmp.person_name, tmp.tag_count, tmp.tags + FROM test_struct_array_module + LATERAL VIEW udtf_struct_with_array_module(person_tags) tmp AS person_name, tag_count, tags + ORDER BY id; + """ + + // Test P1.5: JSON-like data processing + sql """ DROP FUNCTION IF EXISTS udtf_json_extract_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_json_extract_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.data_types_udtf.extract_json_fields", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_json_data_module; """ + sql """ + CREATE TABLE test_json_data_module ( + id INT, + json_content STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_json_data_module VALUES + (1, '{"name":"Alice","age":25,"city":"NYC"}'), + (2, '{"name":"Bob","age":30}'); + """ + + qt_json_extract """ + SELECT id, tmp.field, tmp.v + FROM test_json_data_module + LATERAL VIEW udtf_json_extract_module(json_content) tmp AS field, v + ORDER BY id, tmp.field; + """ + + // Test P1.6: Complex nested STRUCT + sql """ DROP FUNCTION IF EXISTS udtf_complex_struct_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_complex_struct_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + 
"symbol" = "pyudtf_module.data_types_udtf.process_complex_struct", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_complex_struct_module; """ + sql """ + CREATE TABLE test_complex_struct_module ( + id INT, + user_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_complex_struct_module VALUES + (1, '101:Alice:NYC:10001'), + (2, '102:Bob:LA:90001'); + """ + + qt_complex_struct """ + SELECT id, tmp.user_id, tmp.user_name, tmp.address_city, tmp.address_zip + FROM test_complex_struct_module + LATERAL VIEW udtf_complex_struct_module(user_data) tmp AS user_id, user_name, address_city, address_zip + ORDER BY id; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS udtf_tinyint_module(TINYINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_smallint_module(SMALLINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_bigint_module(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_float_module(FLOAT);") + try_sql("DROP FUNCTION IF EXISTS udtf_double_module(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udtf_boolean_module(BOOLEAN);") + try_sql("DROP FUNCTION IF EXISTS udtf_string_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_date_module(DATE);") + try_sql("DROP FUNCTION IF EXISTS udtf_datetime_module(DATETIME);") + try_sql("DROP FUNCTION IF EXISTS udtf_array_int_module(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_array_string_module(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_struct_module(STRUCT);") + try_sql("DROP FUNCTION IF EXISTS udtf_multi_types_module(INT, STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_decimal_module(DECIMAL(10,2));") + try_sql("DROP FUNCTION IF EXISTS udtf_map_processor_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_nested_array_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_array_of_structs_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_struct_with_array_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_json_extract_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_complex_struct_module(STRING);") + try_sql("DROP TABLE IF EXISTS test_tinyint_module;") + try_sql("DROP TABLE IF EXISTS test_smallint_module;") + try_sql("DROP TABLE IF EXISTS test_bigint_module;") + try_sql("DROP TABLE IF EXISTS test_float_module;") + try_sql("DROP TABLE IF EXISTS test_double_module;") + try_sql("DROP TABLE IF EXISTS test_boolean_module;") + try_sql("DROP TABLE IF EXISTS test_string_module;") + try_sql("DROP TABLE IF EXISTS test_date_module;") + try_sql("DROP TABLE IF EXISTS test_datetime_module;") + try_sql("DROP TABLE IF EXISTS test_array_int_module;") + try_sql("DROP TABLE IF EXISTS test_array_string_module;") + try_sql("DROP TABLE IF EXISTS test_struct_module;") + try_sql("DROP TABLE IF EXISTS test_multi_types_module;") + try_sql("DROP TABLE IF EXISTS test_decimal_module;") + try_sql("DROP TABLE IF EXISTS test_map_like_module;") + try_sql("DROP TABLE IF EXISTS test_nested_array_module;") + try_sql("DROP TABLE IF EXISTS test_array_structs_module;") + try_sql("DROP TABLE IF EXISTS test_struct_array_module;") + try_sql("DROP TABLE IF EXISTS test_json_data_module;") + try_sql("DROP TABLE IF EXISTS test_complex_struct_module;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_edge_cases_inline.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_edge_cases_inline.groovy new file mode 100644 index 
00000000000000..cbf27821979010 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_edge_cases_inline.groovy @@ -0,0 +1,701 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_edge_cases_inline") { + // Test Python UDTF Edge Cases and Boundary Conditions + // Coverage: NULL handling, extreme cases, special values + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Section 1: NULL Value Handling + // ======================================== + + // Test 1.1: NULL Integer Input + sql """ DROP FUNCTION IF EXISTS udtf_null_int(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_null_int(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "handle_null_int", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def handle_null_int(value): + '''Handle NULL integer values''' + if value is None: + yield (None, True, -1) # NULL indicator + else: + yield (value, False, value * 2) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_null_int; """ + sql """ + CREATE TABLE test_null_int ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_int VALUES (1, NULL), (2, 0), (3, 10), (4, NULL); + """ + + qt_null_int """ + SELECT id, tmp.input_value, tmp.is_null, tmp.result + FROM test_null_int + LATERAL VIEW udtf_null_int(value) tmp AS input_value, is_null, result + ORDER BY id; + """ + + // Test 1.2: Empty String vs NULL String + sql """ DROP FUNCTION IF EXISTS udtf_null_string(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_null_string(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "handle_null_string", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def handle_null_string(value): + '''Distinguish NULL from empty string''' + if value is None: + yield ('NULL', -1) + elif value == '': + yield ('EMPTY', 0) + else: + yield ('NORMAL', len(value)) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_null_string; """ + sql """ + CREATE TABLE test_null_string ( + id INT, + value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_string VALUES (1, NULL), (2, ''), (3, 'hello'), (4, NULL); + """ + + qt_null_string """ + SELECT id, tmp.value_type, tmp.length + FROM test_null_string + LATERAL VIEW udtf_null_string(value) tmp AS value_type, length + ORDER BY id; + """ + + // Test 1.3: Empty Array + sql """ DROP FUNCTION IF EXISTS udtf_empty_array(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_empty_array(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "handle_empty_array", + 
"runtime_version" = "3.8.10" + ) + AS \$\$ +def handle_empty_array(arr): + '''Handle NULL vs empty array''' + if arr is None: + yield ('NULL', -1) + elif len(arr) == 0: + yield ('EMPTY', 0) + else: + yield ('NORMAL', len(arr)) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_empty_array; """ + sql """ + CREATE TABLE test_empty_array ( + id INT, + value ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_empty_array VALUES + (1, NULL), + (2, []), + (3, [1, 2, 3]); + """ + + qt_empty_array """ + SELECT id, tmp.array_type, tmp.size + FROM test_empty_array + LATERAL VIEW udtf_empty_array(value) tmp AS array_type, size + ORDER BY id; + """ + + // Test 1.4: NULL Fields in STRUCT + sql """ DROP FUNCTION IF EXISTS udtf_null_struct(STRUCT); """ + sql """ + CREATE TABLES FUNCTION udtf_null_struct(STRUCT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "handle_null_struct", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def handle_null_struct(person): + '''Handle NULL fields in STRUCT''' + if person is None: + yield (False, False, 'struct_is_null') + else: + name = person.get('name') + age = person.get('age') + has_name = name is not None + has_age = age is not None + + if has_name and has_age: + summary = f"{name}_{age}" + elif has_name: + summary = f"{name}_no_age" + elif has_age: + summary = f"no_name_{age}" + else: + summary = "all_fields_null" + + yield (has_name, has_age, summary) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_null_struct; """ + sql """ + CREATE TABLE test_null_struct ( + id INT, + person STRUCT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_struct VALUES + (1, named_struct('name', 'Alice', 'age', 25)), + (2, named_struct('name', 'Bob', 'age', NULL)), + (3, named_struct('name', NULL, 'age', 30)), + (4, named_struct('name', NULL, 'age', NULL)); + """ + + qt_null_struct """ + SELECT id, tmp.has_name, tmp.has_age, tmp.summary + FROM test_null_struct + LATERAL VIEW udtf_null_struct(person) tmp AS has_name, has_age, summary + ORDER BY id; + """ + + // ======================================== + // Section 2: Extreme Cases + // ======================================== + + // Test 2.1: Empty Table (0 rows) + sql """ DROP FUNCTION IF EXISTS udtf_empty_table(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_empty_table(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_empty_table", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_empty_table(value): + '''This should never be called for empty table''' + if value is not None: + yield (value * 2,) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_empty_table; """ + sql """ + CREATE TABLE test_empty_table ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // No INSERT - table remains empty + + qt_empty_table """ + SELECT tmp.value + FROM test_empty_table + LATERAL VIEW udtf_empty_table(value) tmp AS value; + """ + + // Test 2.2: Single Row Table + sql """ DROP FUNCTION IF EXISTS udtf_single_row(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_single_row(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_single_row", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_single_row(value): + '''Process single row input''' + if value is not 
None: + for i in range(3): + yield (value, value + i) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_single_row; """ + sql """ + CREATE TABLE test_single_row ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_single_row VALUES (1, 100); + """ + + qt_single_row """ + SELECT tmp.original, tmp.generated + FROM test_single_row + LATERAL VIEW udtf_single_row(value) tmp AS original, generated + ORDER BY tmp.generated; + """ + + // Test 2.3: Large Field - Long String + sql """ DROP FUNCTION IF EXISTS udtf_long_string(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_long_string(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_long_string", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_long_string(text): + '''Process very long string''' + if text is not None: + length = len(text) + first_10 = text[:10] if length >= 10 else text + last_10 = text[-10:] if length >= 10 else text + yield (length, first_10, last_10) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_long_string; """ + sql """ + CREATE TABLE test_long_string ( + id INT, + value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_long_string VALUES + (1, REPEAT('A', 1000)), + (2, REPEAT('B', 5000)); + """ + + qt_long_string """ + SELECT id, tmp.length, tmp.first_10, tmp.last_10 + FROM test_long_string + LATERAL VIEW udtf_long_string(value) tmp AS length, first_10, last_10 + ORDER BY id; + """ + + // Test 2.4: Large Array + sql """ DROP FUNCTION IF EXISTS udtf_large_array(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_large_array(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_large_array", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_large_array(arr): + '''Process large array - compute statistics instead of exploding''' + if arr is not None and len(arr) > 0: + total = len(arr) + total_sum = sum(x for x in arr if x is not None) + first = arr[0] if len(arr) > 0 else None + last = arr[-1] if len(arr) > 0 else None + yield (total, total_sum, first, last) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_large_array; """ + sql """ + CREATE TABLE test_large_array ( + id INT, + value ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_large_array VALUES + (1, ARRAY_REPEAT(1, 100)), + (2, ARRAY_REPEAT(5, 50)); + """ + + qt_large_array """ + SELECT id, tmp.total_elements, tmp.sum_value, tmp.first_elem, tmp.last_elem + FROM test_large_array + LATERAL VIEW udtf_large_array(value) tmp AS total_elements, sum_value, first_elem, last_elem + ORDER BY id; + """ + + // Test 2.5: Output Explosion - Controlled + sql """ DROP FUNCTION IF EXISTS udtf_output_explosion(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_output_explosion(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "output_explosion", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def output_explosion(n): + '''Generate many outputs from single input (controlled explosion)''' + if n is not None and 0 < n <= 100: # Safety limit + for i in range(n): + yield (i,) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_output_explosion; """ + sql """ + CREATE TABLE test_output_explosion ( + id INT, + multiplier INT + ) ENGINE=OLAP + DUPLICATE 
KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_output_explosion VALUES (1, 10), (2, 50); + """ + + qt_output_explosion """ + SELECT id, COUNT(*) as output_count, MIN(tmp.value) as min_val, MAX(tmp.value) as max_val + FROM test_output_explosion + LATERAL VIEW udtf_output_explosion(multiplier) tmp AS value + GROUP BY id + ORDER BY id; + """ + + // ======================================== + // Section 3: Special Values + // ======================================== + + // Test 3.1: Special Numeric Values (0, negative, boundary) + sql """ DROP FUNCTION IF EXISTS udtf_special_numbers(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_special_numbers(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_special_numbers", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_special_numbers(value): + '''Categorize special numeric values''' + INT_MIN = -2147483648 + INT_MAX = 2147483647 + + if value is None: + yield (None, 'NULL', False) + elif value == 0: + yield (value, 'ZERO', False) + elif value == INT_MIN or value == INT_MAX: + category = 'POSITIVE' if value > 0 else 'NEGATIVE' + yield (value, category, True) # is_boundary = True + elif value > 0: + yield (value, 'POSITIVE', False) + else: + yield (value, 'NEGATIVE', False) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_special_numbers; """ + sql """ + CREATE TABLE test_special_numbers ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_special_numbers VALUES + (1, -2147483648), -- INT MIN + (2, -1), + (3, 0), + (4, 1), + (5, 2147483647), -- INT MAX + (6, NULL); + """ + + qt_special_numbers """ + SELECT id, tmp.original, tmp.category, tmp.is_boundary + FROM test_special_numbers + LATERAL VIEW udtf_special_numbers(value) tmp AS original, category, is_boundary + ORDER BY id; + """ + + // Test 3.2: Special Double Values (infinity, very small numbers) + sql """ DROP FUNCTION IF EXISTS udtf_special_doubles(DOUBLE); """ + sql """ + CREATE TABLES FUNCTION udtf_special_doubles(DOUBLE) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_special_doubles", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import math + +def process_special_doubles(value): + '''Classify special double values''' + if value is None: + yield (None, 'NULL') + elif math.isnan(value): + yield (value, 'NAN') + elif math.isinf(value): + if value > 0: + yield (value, 'POSITIVE_INF') + else: + yield (value, 'NEGATIVE_INF') + elif value == 0.0: + yield (value, 'ZERO') + elif abs(value) < 1e-10: + yield (value, 'VERY_SMALL') + elif abs(value) > 1e10: + yield (value, 'VERY_LARGE') + else: + yield (value, 'NORMAL') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_special_doubles; """ + sql """ + CREATE TABLE test_special_doubles ( + id INT, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_special_doubles VALUES + (1, 0.0), + (2, 1e-15), + (3, 1e15), + (4, -1e15), + (5, 3.14159); + """ + + qt_special_doubles """ + SELECT id, tmp.original, tmp.classification + FROM test_special_doubles + LATERAL VIEW udtf_special_doubles(value) tmp AS original, classification + ORDER BY id; + """ + + // Test 3.3: Special String Values (special characters, Unicode) + sql """ DROP FUNCTION IF EXISTS udtf_special_strings(STRING); """ + sql """ + 
CREATE TABLES FUNCTION udtf_special_strings(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_special_strings", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_special_strings(text): + '''Process strings with special characters''' + if text is None: + yield (0, False, 'NULL') + elif text == '': + yield (0, False, 'EMPTY') + else: + length = len(text) + has_special = any(ord(c) > 127 for c in text) + + if has_special: + desc = 'HAS_UNICODE' + elif any(c in text for c in ['\\n', '\\t', '\\r']): + desc = 'HAS_WHITESPACE' + elif any(c in text for c in ['!', '@', '#', '\$', '%']): + desc = 'HAS_SYMBOLS' + else: + desc = 'NORMAL' + + yield (length, has_special, desc) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_special_strings; """ + sql """ + CREATE TABLE test_special_strings ( + id INT, + value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_special_strings VALUES + (1, 'normal text'), + (2, 'hello@world.com'), + (3, 'tab\\there'), + (4, '你好世界'), + (5, ''); + """ + + qt_special_strings """ + SELECT id, tmp.length, tmp.has_special, tmp.description + FROM test_special_strings + LATERAL VIEW udtf_special_strings(value) tmp AS length, has_special, description + ORDER BY id; + """ + + // Test 3.4: Boundary Dates + sql """ DROP FUNCTION IF EXISTS udtf_boundary_dates(DATE); """ + sql """ + CREATE TABLES FUNCTION udtf_boundary_dates(DATE) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_boundary_dates", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_boundary_dates(dt): + '''Process boundary date values''' + if dt is None: + yield (None, 0, False) + else: + year = dt.year + # Check if it's a boundary date + is_boundary = year in [1970, 9999] or (year == 1970 and dt.month == 1 and dt.day == 1) + yield (dt, year, is_boundary) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_boundary_dates; """ + sql """ + CREATE TABLE test_boundary_dates ( + id INT, + value DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_boundary_dates VALUES + (1, '1970-01-01'), + (2, '2024-06-15'), + (3, '9999-12-31'); + """ + + qt_boundary_dates """ + SELECT id, tmp.original, tmp.year, tmp.is_boundary + FROM test_boundary_dates + LATERAL VIEW udtf_boundary_dates(value) tmp AS original, year, is_boundary + ORDER BY id; + """ + + } finally { + // Cleanup functions + try_sql("DROP FUNCTION IF EXISTS udtf_null_int(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_null_string(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_empty_array(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_null_struct(STRUCT);") + try_sql("DROP FUNCTION IF EXISTS udtf_empty_table(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_single_row(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_long_string(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_large_array(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_output_explosion(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_special_numbers(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_special_doubles(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udtf_special_strings(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_boundary_dates(DATE);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS test_null_int;") + try_sql("DROP TABLE IF EXISTS test_null_string;") + try_sql("DROP TABLE IF EXISTS 
test_empty_array;") + try_sql("DROP TABLE IF EXISTS test_null_struct;") + try_sql("DROP TABLE IF EXISTS test_empty_table;") + try_sql("DROP TABLE IF EXISTS test_single_row;") + try_sql("DROP TABLE IF EXISTS test_long_string;") + try_sql("DROP TABLE IF EXISTS test_large_array;") + try_sql("DROP TABLE IF EXISTS test_output_explosion;") + try_sql("DROP TABLE IF EXISTS test_special_numbers;") + try_sql("DROP TABLE IF EXISTS test_special_doubles;") + try_sql("DROP TABLE IF EXISTS test_special_strings;") + try_sql("DROP TABLE IF EXISTS test_boundary_dates;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_edge_cases_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_edge_cases_module.groovy new file mode 100644 index 00000000000000..25d0cffc43e1a7 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_edge_cases_module.groovy @@ -0,0 +1,554 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_edge_cases_module") { + // Test Python UDTF Edge Cases and Boundary Conditions + // Coverage: NULL handling, extreme cases, special values + + def pyPath = """${context.file.parent}/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Section 1: NULL Value Handling + // ======================================== + + // Test 1.1: NULL Integer Input + sql """ DROP FUNCTION IF EXISTS udtf_null_int_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_null_int_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.handle_null_int", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_null_int_module; """ + sql """ + CREATE TABLE test_null_int_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_int_module VALUES (1, NULL), (2, 0), (3, 10), (4, NULL); + """ + + qt_null_int """ + SELECT id, tmp.input_value, tmp.is_null, tmp.result + FROM test_null_int_module + LATERAL VIEW udtf_null_int_module(value) tmp AS input_value, is_null, result + ORDER BY id; + """ + + // Test 1.2: Empty String vs NULL String + sql """ DROP FUNCTION IF EXISTS udtf_null_string_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_null_string_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.handle_null_string", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS 
test_null_string_module; """ + sql """ + CREATE TABLE test_null_string_module ( + id INT, + value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_string_module VALUES (1, NULL), (2, ''), (3, 'hello'), (4, NULL); + """ + + qt_null_string """ + SELECT id, tmp.value_type, tmp.length + FROM test_null_string_module + LATERAL VIEW udtf_null_string_module(value) tmp AS value_type, length + ORDER BY id; + """ + + // Test 1.3: Empty Array + sql """ DROP FUNCTION IF EXISTS udtf_empty_array_module(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_empty_array_module(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.handle_empty_array", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_empty_array_module; """ + sql """ + CREATE TABLE test_empty_array_module ( + id INT, + value ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_empty_array_module VALUES + (1, NULL), + (2, []), + (3, [1, 2, 3]); + """ + + qt_empty_array """ + SELECT id, tmp.array_type, tmp.size + FROM test_empty_array_module + LATERAL VIEW udtf_empty_array_module(value) tmp AS array_type, size + ORDER BY id; + """ + + // Test 1.4: NULL Fields in STRUCT + sql """ DROP FUNCTION IF EXISTS udtf_null_struct_module(STRUCT); """ + sql """ + CREATE TABLES FUNCTION udtf_null_struct_module(STRUCT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.handle_null_struct", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_null_struct_module; """ + sql """ + CREATE TABLE test_null_struct_module ( + id INT, + person STRUCT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_null_struct_module VALUES + (1, named_struct('name', 'Alice', 'age', 25)), + (2, named_struct('name', 'Bob', 'age', NULL)), + (3, named_struct('name', NULL, 'age', 30)), + (4, named_struct('name', NULL, 'age', NULL)); + """ + + qt_null_struct """ + SELECT id, tmp.has_name, tmp.has_age, tmp.summary + FROM test_null_struct_module + LATERAL VIEW udtf_null_struct_module(person) tmp AS has_name, has_age, summary + ORDER BY id; + """ + + // ======================================== + // Section 2: Extreme Cases + // ======================================== + + // Test 2.1: Empty Table (0 rows) + sql """ DROP FUNCTION IF EXISTS udtf_empty_table_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_empty_table_module(INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_empty_table", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_empty_table_module; """ + sql """ + CREATE TABLE test_empty_table_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // No INSERT - table remains empty + + qt_empty_table """ + SELECT tmp.value + FROM test_empty_table_module + LATERAL VIEW udtf_empty_table_module(value) tmp AS value; + """ + + // Test 2.2: Single Row Table + sql """ DROP FUNCTION IF EXISTS 
udtf_single_row_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_single_row_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_single_row", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_single_row_module; """ + sql """ + CREATE TABLE test_single_row_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_single_row_module VALUES (1, 100); + """ + + qt_single_row """ + SELECT tmp.original, tmp.generated + FROM test_single_row_module + LATERAL VIEW udtf_single_row_module(value) tmp AS original, generated + ORDER BY tmp.generated; + """ + + // Test 2.3: Large Field - Long String + sql """ DROP FUNCTION IF EXISTS udtf_long_string_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_long_string_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_long_string", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_long_string_module; """ + sql """ + CREATE TABLE test_long_string_module ( + id INT, + value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_long_string_module VALUES + (1, REPEAT('A', 1000)), + (2, REPEAT('B', 5000)); + """ + + qt_long_string """ + SELECT id, tmp.length, tmp.first_10, tmp.last_10 + FROM test_long_string_module + LATERAL VIEW udtf_long_string_module(value) tmp AS length, first_10, last_10 + ORDER BY id; + """ + + // Test 2.4: Large Array + sql """ DROP FUNCTION IF EXISTS udtf_large_array_module(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_large_array_module(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_large_array", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_large_array_module; """ + sql """ + CREATE TABLE test_large_array_module ( + id INT, + value ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_large_array_module VALUES + (1, ARRAY_REPEAT(1, 100)), + (2, ARRAY_REPEAT(5, 50)); + """ + + qt_large_array """ + SELECT id, tmp.total_elements, tmp.sum_value, tmp.first_elem, tmp.last_elem + FROM test_large_array_module + LATERAL VIEW udtf_large_array_module(value) tmp AS total_elements, sum_value, first_elem, last_elem + ORDER BY id; + """ + + // Test 2.5: Output Explosion - Controlled + sql """ DROP FUNCTION IF EXISTS udtf_output_explosion_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_output_explosion_module(INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.output_explosion", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_output_explosion_module; """ + sql """ + CREATE TABLE test_output_explosion_module ( + id INT, + multiplier INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_output_explosion_module VALUES (1, 10), (2, 50); + """ + + qt_output_explosion 
""" + SELECT id, COUNT(*) as output_count, MIN(tmp.value) as min_val, MAX(tmp.value) as max_val + FROM test_output_explosion_module + LATERAL VIEW udtf_output_explosion_module(multiplier) tmp AS value + GROUP BY id + ORDER BY id; + """ + + // ======================================== + // Section 3: Special Values + // ======================================== + + // Test 3.1: Special Numeric Values (0, negative, boundary) + sql """ DROP FUNCTION IF EXISTS udtf_special_numbers_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_special_numbers_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_special_numbers", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_special_numbers_module; """ + sql """ + CREATE TABLE test_special_numbers_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_special_numbers_module VALUES + (1, -2147483648), -- INT MIN + (2, -1), + (3, 0), + (4, 1), + (5, 2147483647), -- INT MAX + (6, NULL); + """ + + qt_special_numbers """ + SELECT id, tmp.original, tmp.category, tmp.is_boundary + FROM test_special_numbers_module + LATERAL VIEW udtf_special_numbers_module(value) tmp AS original, category, is_boundary + ORDER BY id; + """ + + // Test 3.2: Special Double Values (infinity, very small numbers) + sql """ DROP FUNCTION IF EXISTS udtf_special_doubles_module(DOUBLE); """ + sql """ + CREATE TABLES FUNCTION udtf_special_doubles_module(DOUBLE) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_special_doubles", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_special_doubles_module; """ + sql """ + CREATE TABLE test_special_doubles_module ( + id INT, + value DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_special_doubles_module VALUES + (1, 0.0), + (2, 1e-15), + (3, 1e15), + (4, -1e15), + (5, 3.14159); + """ + + qt_special_doubles """ + SELECT id, tmp.original, tmp.classification + FROM test_special_doubles_module + LATERAL VIEW udtf_special_doubles_module(value) tmp AS original, classification + ORDER BY id; + """ + + // Test 3.3: Special String Values (special characters, Unicode) + sql """ DROP FUNCTION IF EXISTS udtf_special_strings_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_special_strings_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_special_strings", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_special_strings_module; """ + sql """ + CREATE TABLE test_special_strings_module ( + id INT, + value STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_special_strings_module VALUES + (1, 'normal text'), + (2, 'hello@world.com'), + (3, 'tab\\there'), + (4, '你好世界'), + (5, ''); + """ + + qt_special_strings """ + SELECT id, tmp.length, tmp.has_special, tmp.description + FROM test_special_strings_module + LATERAL VIEW udtf_special_strings_module(value) tmp AS length, has_special, description + ORDER BY id; + 
""" + + // Test 3.4: Boundary Dates + sql """ DROP FUNCTION IF EXISTS udtf_boundary_dates_module(DATE); """ + sql """ + CREATE TABLES FUNCTION udtf_boundary_dates_module(DATE) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.edge_cases_udtf.process_boundary_dates", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_boundary_dates_module; """ + sql """ + CREATE TABLE test_boundary_dates_module ( + id INT, + value DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_boundary_dates_module VALUES + (1, '1970-01-01'), + (2, '2024-06-15'), + (3, '9999-12-31'); + """ + + qt_boundary_dates """ + SELECT id, tmp.original, tmp.year, tmp.is_boundary + FROM test_boundary_dates_module + LATERAL VIEW udtf_boundary_dates_module(value) tmp AS original, year, is_boundary + ORDER BY id; + """ + + } finally { + // Cleanup functions + try_sql("DROP FUNCTION IF EXISTS udtf_null_int_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_null_string_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_empty_array_module(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_null_struct_module(STRUCT);") + try_sql("DROP FUNCTION IF EXISTS udtf_empty_table_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_single_row_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_long_string_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_large_array_module(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_output_explosion_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_special_numbers_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_special_doubles_module(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udtf_special_strings_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_boundary_dates_module(DATE);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS test_null_int_module;") + try_sql("DROP TABLE IF EXISTS test_null_string_module;") + try_sql("DROP TABLE IF EXISTS test_empty_array_module;") + try_sql("DROP TABLE IF EXISTS test_null_struct_module;") + try_sql("DROP TABLE IF EXISTS test_empty_table_module;") + try_sql("DROP TABLE IF EXISTS test_single_row_module;") + try_sql("DROP TABLE IF EXISTS test_long_string_module;") + try_sql("DROP TABLE IF EXISTS test_large_array_module;") + try_sql("DROP TABLE IF EXISTS test_output_explosion_module;") + try_sql("DROP TABLE IF EXISTS test_special_numbers_module;") + try_sql("DROP TABLE IF EXISTS test_special_doubles_module;") + try_sql("DROP TABLE IF EXISTS test_special_strings_module;") + try_sql("DROP TABLE IF EXISTS test_boundary_dates_module;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_exceptions_inline.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_exceptions_inline.groovy new file mode 100644 index 00000000000000..007440c9444deb --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_exceptions_inline.groovy @@ -0,0 +1,788 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_exceptions_inline") { + // Test Python UDTF Exception Handling + // Coverage: Runtime errors, type errors, logic errors, edge cases + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Section 1: Arithmetic Exceptions + // ======================================== + + // Test 1.1: Division by Zero - Handled + sql """ DROP FUNCTION IF EXISTS udtf_safe_divide(INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_safe_divide(INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "safe_divide", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def safe_divide(a, b): + '''Safe division with error handling''' + try: + if b == 0: + yield (a, b, None, 'division_by_zero') + else: + result = a / b + yield (a, b, result, 'success') + except Exception as e: + yield (a, b, None, f'error_{type(e).__name__}') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_division; """ + sql """ + CREATE TABLE test_division ( + id INT, + num INT, + denom INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_division VALUES + (1, 10, 2), + (2, 10, 0), + (3, 0, 5), + (4, -8, 4); + """ + + qt_safe_divide """ + SELECT id, tmp.numerator, tmp.denominator, tmp.result, tmp.error_msg + FROM test_division + LATERAL VIEW udtf_safe_divide(num, denom) tmp AS numerator, denominator, result, error_msg + ORDER BY id; + """ + + // Test 1.2: Integer Overflow Detection + sql """ DROP FUNCTION IF EXISTS udtf_overflow_check(BIGINT); """ + sql """ + CREATE TABLES FUNCTION udtf_overflow_check(BIGINT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "check_overflow", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def check_overflow(value): + '''Check for potential overflow in operations''' + if value is None: + yield (None, None, 'null_input') + else: + # BIGINT range: -2^63 to 2^63-1 + MAX_BIGINT = 9223372036854775807 + MIN_BIGINT = -9223372036854775808 + + doubled = value * 2 + + # Check if doubled value is within safe range + if doubled > MAX_BIGINT or doubled < MIN_BIGINT: + yield (value, None, 'would_overflow') + else: + yield (value, doubled, 'safe') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_overflow; """ + sql """ + CREATE TABLE test_overflow ( + id INT, + big_val BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_overflow VALUES + (1, 100), + (2, 5000000000000), + (3, -5000000000000), + (4, NULL); + """ + + qt_overflow_check """ + SELECT id, tmp.original, tmp.doubled, tmp.status + FROM test_overflow + LATERAL VIEW udtf_overflow_check(big_val) tmp AS original, doubled, status + ORDER BY id; + """ + + // ======================================== + // Section 2: Type Conversion Errors + // ======================================== + + // Test 2.1: String to Number Conversion + sql """ DROP FUNCTION IF EXISTS udtf_parse_number(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_parse_number(STRING) + 
RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "parse_number", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def parse_number(text): + '''Parse string to number with error handling''' + if text is None: + yield (None, None, False) + else: + try: + num = float(text) + yield (text, num, True) + except ValueError: + yield (text, None, False) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_parse; """ + sql """ + CREATE TABLE test_parse ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_parse VALUES + (1, '123'), + (2, '45.67'), + (3, 'abc'), + (4, '12.34.56'), + (5, ''), + (6, NULL); + """ + + qt_parse_number """ + SELECT id, tmp.input, tmp.parsed, tmp.is_valid + FROM test_parse + LATERAL VIEW udtf_parse_number(text) tmp AS input, parsed, is_valid + ORDER BY id; + """ + + // Test 2.2: Type Mismatch Handling + sql """ DROP FUNCTION IF EXISTS udtf_type_check(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_type_check(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "check_type", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def check_type(value): + '''Check and report value type''' + type_name = type(value).__name__ + + if value is None: + yield (None, 'NoneType', 0) + elif isinstance(value, str): + yield (value, type_name, len(value)) + else: + # Unexpected type - convert to string + yield (str(value), type_name, len(str(value))) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_types; """ + sql """ + CREATE TABLE test_types ( + id INT, + val STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_types VALUES + (1, 'hello'), + (2, ''), + (3, '12345'), + (4, NULL); + """ + + qt_type_check """ + SELECT id, tmp.value, tmp.type_name, tmp.length + FROM test_types + LATERAL VIEW udtf_type_check(val) tmp AS value, type_name, length + ORDER BY id; + """ + + // ======================================== + // Section 3: Collection/Array Errors + // ======================================== + + // Test 3.1: Array Index Out of Bounds + sql """ DROP FUNCTION IF EXISTS udtf_safe_index(ARRAY, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_safe_index(ARRAY, INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "safe_array_access", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def safe_array_access(arr, position): + '''Safe array element access''' + if arr is None: + yield (0, position, None, 'null_array') + elif len(arr) == 0: + yield (0, position, None, 'empty_array') + elif position < 0 or position >= len(arr): + yield (len(arr), position, None, 'out_of_bounds') + else: + yield (len(arr), position, arr[position], 'success') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_array_access; """ + sql """ + CREATE TABLE test_array_access ( + id INT, + arr ARRAY, + pos INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_access VALUES + (1, [10, 20, 30], 1), + (2, [10, 20, 30], 5), + (3, [10, 20, 30], -1), + (4, [], 0), + (5, NULL, 0); + """ + + qt_safe_index """ + SELECT id, tmp.arr_size, tmp.target_pos, tmp.value, tmp.status + FROM test_array_access + LATERAL VIEW udtf_safe_index(arr, pos) tmp AS arr_size, target_pos, value, status + ORDER BY id; + """ + + // Test 3.2: Empty Collection Handling + sql """ DROP 
FUNCTION IF EXISTS udtf_collection_stats(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_collection_stats(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "compute_stats", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def compute_stats(arr): + '''Compute statistics with empty array handling''' + if arr is None: + yield (0, 0, 0.0, 'null_array') + elif len(arr) == 0: + yield (0, 0, 0.0, 'empty_array') + else: + count = len(arr) + total = sum(x for x in arr if x is not None) + avg = total / count if count > 0 else 0.0 + yield (count, total, avg, 'computed') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_collection_stats; """ + sql """ + CREATE TABLE test_collection_stats ( + id INT, + data ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_collection_stats VALUES + (1, [1, 2, 3, 4, 5]), + (2, []), + (3, NULL), + (4, [10, 20]); + """ + + qt_collection_stats """ + SELECT id, tmp.count, tmp.total, tmp.avg, tmp.status + FROM test_collection_stats + LATERAL VIEW udtf_collection_stats(data) tmp AS count, total, avg, status + ORDER BY id; + """ + + // ======================================== + // Section 4: Dictionary/STRUCT Errors + // ======================================== + + // Test 4.1: Missing Dictionary Keys + sql """ DROP FUNCTION IF EXISTS udtf_safe_struct_access(STRUCT); """ + sql """ + CREATE TABLES FUNCTION udtf_safe_struct_access(STRUCT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "access_struct_fields", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def access_struct_fields(person): + '''Safe STRUCT field access''' + if person is None: + yield (False, False, None, None) + else: + # Use .get() to safely access dictionary keys + name = person.get('name') + age = person.get('age') + + has_name = name is not None + has_age = age is not None + + yield (has_name, has_age, name, age) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_struct_access; """ + sql """ + CREATE TABLE test_struct_access ( + id INT, + person STRUCT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_struct_access VALUES + (1, named_struct('name', 'Alice', 'age', 30)), + (2, named_struct('name', 'Bob', 'age', NULL)), + (3, named_struct('name', NULL, 'age', 25)), + (4, NULL); + """ + + qt_safe_struct_access """ + SELECT id, tmp.has_name, tmp.has_age, tmp.name_val, tmp.age_val + FROM test_struct_access + LATERAL VIEW udtf_safe_struct_access(person) tmp AS has_name, has_age, name_val, age_val + ORDER BY id; + """ + + // ======================================== + // Section 5: String Processing Errors + // ======================================== + + // Test 5.1: Invalid String Operations + sql """ DROP FUNCTION IF EXISTS udtf_string_slice(STRING, INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_string_slice(STRING, INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "slice_string", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def slice_string(text, start, end): + '''Safe string slicing''' + if text is None: + yield (None, start, end, None, 'null_string') + elif start is None or end is None: + yield (text, start, end, None, 'null_index') + else: + length = len(text) + + # Clamp indices to valid range + safe_start = max(0, min(start, length)) + safe_end = max(0, min(end, length)) + + if safe_start >= safe_end: + yield (text, start, 
end, '', 'empty_slice') + else: + result = text[safe_start:safe_end] + yield (text, start, end, result, 'success') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_string_slice; """ + sql """ + CREATE TABLE test_string_slice ( + id INT, + text STRING, + start_pos INT, + end_pos INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_string_slice VALUES + (1, 'hello world', 0, 5), + (2, 'hello world', 6, 11), + (3, 'hello world', 20, 30), + (4, 'hello world', 5, 2), + (5, NULL, 0, 5); + """ + + qt_string_slice """ + SELECT id, tmp.original, tmp.start_pos, tmp.end_pos, tmp.slice, tmp.status + FROM test_string_slice + LATERAL VIEW udtf_string_slice(text, start_pos, end_pos) tmp AS original, start_pos, end_pos, slice, status + ORDER BY id; + """ + + // Test 5.2: Encoding/Decoding Errors + sql """ DROP FUNCTION IF EXISTS udtf_check_encoding(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_check_encoding(STRING) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "check_text_encoding", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def check_text_encoding(text): + '''Check string encoding properties''' + if text is None: + yield (None, 0, 0, False) + else: + byte_len = len(text.encode('utf-8')) + char_len = len(text) + has_unicode = byte_len > char_len + + yield (text, byte_len, char_len, has_unicode) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_encoding; """ + sql """ + CREATE TABLE test_encoding ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_encoding VALUES + (1, 'hello'), + (2, '你好世界'), + (3, 'café'), + (4, ''), + (5, NULL); + """ + + qt_check_encoding """ + SELECT id, tmp.text, tmp.byte_length, tmp.char_length, tmp.has_unicode + FROM test_encoding + LATERAL VIEW udtf_check_encoding(text) tmp AS text, byte_length, char_length, has_unicode + ORDER BY id; + """ + + // ======================================== + // Section 6: Logic and State Errors + // ======================================== + + // Test 6.1: Conditional Logic Errors + sql """ DROP FUNCTION IF EXISTS udtf_conditional_process(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_conditional_process(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "process_conditional", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def process_conditional(value): + '''Process value based on multiple conditions''' + if value is None: + yield (None, 'null', 0) + elif value < 0: + # For negative: take absolute value + yield (value, 'negative', abs(value)) + elif value == 0: + # Zero case: return 1 + yield (value, 'zero', 1) + elif value > 0 and value <= 100: + # Small positive: double it + yield (value, 'small_positive', value * 2) + else: + # Large positive: return as-is + yield (value, 'large_positive', value) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_conditional; """ + sql """ + CREATE TABLE test_conditional ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_conditional VALUES + (1, -10), + (2, 0), + (3, 50), + (4, 200), + (5, NULL); + """ + + qt_conditional_process """ + SELECT id, tmp.input, tmp.category, tmp.result + FROM test_conditional + LATERAL VIEW udtf_conditional_process(val) tmp AS input, category, result + ORDER BY id; + """ + + // Test 6.2: 
Yield Control - No Output Case + sql """ DROP FUNCTION IF EXISTS udtf_filter_yield(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_filter_yield(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "conditional_yield", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def conditional_yield(value): + '''Only yield for even positive numbers''' + if value is not None and value > 0 and value % 2 == 0: + yield (value,) + # For other cases, yield nothing (filter out) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_filter_yield; """ + sql """ + CREATE TABLE test_filter_yield ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_filter_yield VALUES + (1, 10), + (2, 15), + (3, -4), + (4, 0), + (5, 22), + (6, NULL); + """ + + qt_filter_yield """ + SELECT id, tmp.value + FROM test_filter_yield + LATERAL VIEW udtf_filter_yield(val) tmp AS value + ORDER BY id; + """ + + // ======================================== + // Section 7: Edge Cases in Computation + // ======================================== + + // Test 7.1: Very Small and Very Large Numbers + sql """ DROP FUNCTION IF EXISTS udtf_number_range(DOUBLE); """ + sql """ + CREATE TABLES FUNCTION udtf_number_range(DOUBLE) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "classify_number_range", + "runtime_version" = "3.8.10" + ) + AS \$\$ +import math + +def classify_number_range(value): + '''Classify number by magnitude''' + if value is None: + yield (None, 'null', True) + elif math.isnan(value): + yield (value, 'nan', False) + elif math.isinf(value): + yield (value, 'infinity', False) + elif value == 0.0: + yield (value, 'zero', True) + elif abs(value) < 1e-100: + yield (value, 'extremely_small', True) + elif abs(value) > 1e100: + yield (value, 'extremely_large', True) + elif abs(value) < 1.0: + yield (value, 'small', True) + else: + yield (value, 'normal', True) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_number_range; """ + sql """ + CREATE TABLE test_number_range ( + id INT, + val DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_number_range VALUES + (1, 0.0), + (2, 1e-150), + (3, 1e150), + (4, 0.5), + (5, 123.456), + (6, NULL); + """ + + qt_number_range """ + SELECT id, tmp.value, tmp.magnitude, tmp.is_finite + FROM test_number_range + LATERAL VIEW udtf_number_range(val) tmp AS value, magnitude, is_finite + ORDER BY id; + """ + + // Test 7.2: Date/Time Edge Cases + sql """ DROP FUNCTION IF EXISTS udtf_date_validation(DATE); """ + sql """ + CREATE TABLES FUNCTION udtf_date_validation(DATE) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "validate_date", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def validate_date(dt): + '''Validate and classify dates''' + if dt is None: + yield (None, 0, False, 'null_date') + else: + year = dt.year + + # Check if leap year + is_leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0) + + # Classify date + if year < 1900: + status = 'very_old' + elif year > 2100: + status = 'far_future' + else: + status = 'normal' + + yield (dt, year, is_leap, status) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_date_validation; """ + sql """ + CREATE TABLE test_date_validation ( + id INT, + dt DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + 
INSERT INTO test_date_validation VALUES + (1, '2024-01-01'), + (2, '2000-02-29'), + (3, '1970-01-01'), + (4, '9999-12-31'), + (5, NULL); + """ + + qt_date_validation """ + SELECT id, tmp.input_date, tmp.year, tmp.is_leap_year, tmp.status + FROM test_date_validation + LATERAL VIEW udtf_date_validation(dt) tmp AS input_date, year, is_leap_year, status + ORDER BY id; + """ + + } finally { + // Cleanup functions + try_sql("DROP FUNCTION IF EXISTS udtf_safe_divide(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_overflow_check(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_parse_number(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_type_check(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_safe_index(ARRAY, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_collection_stats(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_safe_struct_access(STRUCT);") + try_sql("DROP FUNCTION IF EXISTS udtf_string_slice(STRING, INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_check_encoding(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_conditional_process(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_filter_yield(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_number_range(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udtf_date_validation(DATE);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS test_division;") + try_sql("DROP TABLE IF EXISTS test_overflow;") + try_sql("DROP TABLE IF EXISTS test_parse;") + try_sql("DROP TABLE IF EXISTS test_types;") + try_sql("DROP TABLE IF EXISTS test_array_access;") + try_sql("DROP TABLE IF EXISTS test_collection_stats;") + try_sql("DROP TABLE IF EXISTS test_struct_access;") + try_sql("DROP TABLE IF EXISTS test_string_slice;") + try_sql("DROP TABLE IF EXISTS test_encoding;") + try_sql("DROP TABLE IF EXISTS test_conditional;") + try_sql("DROP TABLE IF EXISTS test_filter_yield;") + try_sql("DROP TABLE IF EXISTS test_number_range;") + try_sql("DROP TABLE IF EXISTS test_date_validation;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_exceptions_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_exceptions_module.groovy new file mode 100644 index 00000000000000..f2f64ce807cbb7 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_exceptions_module.groovy @@ -0,0 +1,609 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
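The module-based suite that follows registers the same generators from pyudtf.zip instead of inlining their source, so only the "symbol" paths appear in this diff. For reference, a minimal sketch of what the archive presumably contains (the file name and package layout are assumptions inferred from symbols such as "pyudtf_module.exceptions_udtf.safe_divide"; the bodies mirror the inline definitions above):

# pyudtf_module/exceptions_udtf.py -- hypothetical layout, inferred from the
# "symbol" = "pyudtf_module.exceptions_udtf.<name>" properties; not part of this diff.

def safe_divide(a, b):
    '''Mirrors the inline version: one (numerator, denominator, result, status) tuple per row.'''
    try:
        if b == 0:
            yield (a, b, None, 'division_by_zero')
        else:
            yield (a, b, a / b, 'success')
    except Exception as e:
        yield (a, b, None, f'error_{type(e).__name__}')

def parse_number(text):
    '''Mirrors the inline version: parse a string to float, flagging unparseable input.'''
    if text is None:
        yield (None, None, False)
    else:
        try:
            yield (text, float(text), True)
        except ValueError:
            yield (text, None, False)

Apart from loading via the "file"/"symbol" properties rather than an inline AS $$ body, the test cases below are the same as in the inline suite.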
+ +suite("test_pythonudtf_exceptions_module") { + // Test Python UDTF Exception Handling + // Coverage: Runtime errors, type errors, logic errors, edge cases + + def pyPath = """${context.file.parent}/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Section 1: Arithmetic Exceptions + // ======================================== + + // Test 1.1: Division by Zero - Handled + sql """ DROP FUNCTION IF EXISTS udtf_safe_divide_module(INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_safe_divide_module(INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.safe_divide", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_division_module; """ + sql """ + CREATE TABLE test_division_module ( + id INT, + num INT, + denom INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_division_module VALUES + (1, 10, 2), + (2, 10, 0), + (3, 0, 5), + (4, -8, 4); + """ + + qt_safe_divide """ + SELECT id, tmp.numerator, tmp.denominator, tmp.result, tmp.error_msg + FROM test_division_module + LATERAL VIEW udtf_safe_divide_module(num, denom) tmp AS numerator, denominator, result, error_msg + ORDER BY id; + """ + + // Test 1.2: Integer Overflow Detection + sql """ DROP FUNCTION IF EXISTS udtf_overflow_check_module(BIGINT); """ + sql """ + CREATE TABLES FUNCTION udtf_overflow_check_module(BIGINT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.check_overflow", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_overflow_module; """ + sql """ + CREATE TABLE test_overflow_module ( + id INT, + big_val BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_overflow_module VALUES + (1, 100), + (2, 5000000000000), + (3, -5000000000000), + (4, NULL); + """ + + qt_overflow_check """ + SELECT id, tmp.original, tmp.doubled, tmp.status + FROM test_overflow_module + LATERAL VIEW udtf_overflow_check_module(big_val) tmp AS original, doubled, status + ORDER BY id; + """ + + // ======================================== + // Section 2: Type Conversion Errors + // ======================================== + + // Test 2.1: String to Number Conversion + sql """ DROP FUNCTION IF EXISTS udtf_parse_number_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_parse_number_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.parse_number", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_parse_module; """ + sql """ + CREATE TABLE test_parse_module ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_parse_module VALUES + (1, '123'), + (2, '45.67'), + (3, 'abc'), + (4, '12.34.56'), + (5, ''), + (6, NULL); + """ + + qt_parse_number """ + SELECT id, tmp.input, tmp.parsed, tmp.is_valid + FROM test_parse_module + LATERAL VIEW udtf_parse_number_module(text) tmp AS input, parsed, is_valid + 
ORDER BY id; + """ + + // Test 2.2: Type Mismatch Handling + sql """ DROP FUNCTION IF EXISTS udtf_type_check_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_type_check_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.check_type", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_types_module; """ + sql """ + CREATE TABLE test_types_module ( + id INT, + val STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_types_module VALUES + (1, 'hello'), + (2, ''), + (3, '12345'), + (4, NULL); + """ + + qt_type_check """ + SELECT id, tmp.value, tmp.type_name, tmp.length + FROM test_types_module + LATERAL VIEW udtf_type_check_module(val) tmp AS value, type_name, length + ORDER BY id; + """ + + // ======================================== + // Section 3: Collection/Array Errors + // ======================================== + + // Test 3.1: Array Index Out of Bounds + sql """ DROP FUNCTION IF EXISTS udtf_safe_index_module(ARRAY, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_safe_index_module(ARRAY, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.safe_array_access", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_array_access_module; """ + sql """ + CREATE TABLE test_array_access_module ( + id INT, + arr ARRAY, + pos INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_array_access_module VALUES + (1, [10, 20, 30], 1), + (2, [10, 20, 30], 5), + (3, [10, 20, 30], -1), + (4, [], 0), + (5, NULL, 0); + """ + + qt_safe_index """ + SELECT id, tmp.arr_size, tmp.target_pos, tmp.value, tmp.status + FROM test_array_access_module + LATERAL VIEW udtf_safe_index_module(arr, pos) tmp AS arr_size, target_pos, value, status + ORDER BY id; + """ + + // Test 3.2: Empty Collection Handling + sql """ DROP FUNCTION IF EXISTS udtf_collection_stats_module(ARRAY); """ + sql """ + CREATE TABLES FUNCTION udtf_collection_stats_module(ARRAY) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.compute_stats", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_collection_stats_module; """ + sql """ + CREATE TABLE test_collection_stats_module ( + id INT, + data ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_collection_stats_module VALUES + (1, [1, 2, 3, 4, 5]), + (2, []), + (3, NULL), + (4, [10, 20]); + """ + + qt_collection_stats """ + SELECT id, tmp.count, tmp.total, tmp.avg, tmp.status + FROM test_collection_stats_module + LATERAL VIEW udtf_collection_stats_module(data) tmp AS count, total, avg, status + ORDER BY id; + """ + + // ======================================== + // Section 4: Dictionary/STRUCT Errors + // ======================================== + + // Test 4.1: Missing Dictionary Keys + sql """ DROP FUNCTION IF EXISTS udtf_safe_struct_access_module(STRUCT); """ + sql """ + CREATE TABLES FUNCTION udtf_safe_struct_access_module(STRUCT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = 
"pyudtf_module.exceptions_udtf.access_struct_fields", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_struct_access_module; """ + sql """ + CREATE TABLE test_struct_access_module ( + id INT, + person STRUCT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_struct_access_module VALUES + (1, named_struct('name', 'Alice', 'age', 30)), + (2, named_struct('name', 'Bob', 'age', NULL)), + (3, named_struct('name', NULL, 'age', 25)), + (4, NULL); + """ + + qt_safe_struct_access """ + SELECT id, tmp.has_name, tmp.has_age, tmp.name_val, tmp.age_val + FROM test_struct_access_module + LATERAL VIEW udtf_safe_struct_access_module(person) tmp AS has_name, has_age, name_val, age_val + ORDER BY id; + """ + + // ======================================== + // Section 5: String Processing Errors + // ======================================== + + // Test 5.1: Invalid String Operations + sql """ DROP FUNCTION IF EXISTS udtf_string_slice_module(STRING, INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_string_slice_module(STRING, INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.slice_string", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_string_slice_module; """ + sql """ + CREATE TABLE test_string_slice_module ( + id INT, + text STRING, + start_pos INT, + end_pos INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_string_slice_module VALUES + (1, 'hello world', 0, 5), + (2, 'hello world', 6, 11), + (3, 'hello world', 20, 30), + (4, 'hello world', 5, 2), + (5, NULL, 0, 5); + """ + + qt_string_slice """ + SELECT id, tmp.original, tmp.start_pos, tmp.end_pos, tmp.slice, tmp.status + FROM test_string_slice_module + LATERAL VIEW udtf_string_slice_module(text, start_pos, end_pos) tmp AS original, start_pos, end_pos, slice, status + ORDER BY id; + """ + + // Test 5.2: Encoding/Decoding Errors + sql """ DROP FUNCTION IF EXISTS udtf_check_encoding_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_check_encoding_module(STRING) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.check_text_encoding", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_encoding_module; """ + sql """ + CREATE TABLE test_encoding_module ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_encoding_module VALUES + (1, 'hello'), + (2, '你好世界'), + (3, 'café'), + (4, ''), + (5, NULL); + """ + + qt_check_encoding """ + SELECT id, tmp.text, tmp.byte_length, tmp.char_length, tmp.has_unicode + FROM test_encoding_module + LATERAL VIEW udtf_check_encoding_module(text) tmp AS text, byte_length, char_length, has_unicode + ORDER BY id; + """ + + // ======================================== + // Section 6: Logic and State Errors + // ======================================== + + // Test 6.1: Conditional Logic Errors + sql """ DROP FUNCTION IF EXISTS udtf_conditional_process_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_conditional_process_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = 
"file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.process_conditional", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_conditional_module; """ + sql """ + CREATE TABLE test_conditional_module ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_conditional_module VALUES + (1, -10), + (2, 0), + (3, 50), + (4, 200), + (5, NULL); + """ + + qt_conditional_process """ + SELECT id, tmp.input, tmp.category, tmp.result + FROM test_conditional_module + LATERAL VIEW udtf_conditional_process_module(val) tmp AS input, category, result + ORDER BY id; + """ + + // Test 6.2: Yield Control - No Output Case + sql """ DROP FUNCTION IF EXISTS udtf_filter_yield_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_filter_yield_module(INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.conditional_yield", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_filter_yield_module; """ + sql """ + CREATE TABLE test_filter_yield_module ( + id INT, + val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_filter_yield_module VALUES + (1, 10), + (2, 15), + (3, -4), + (4, 0), + (5, 22), + (6, NULL); + """ + + qt_filter_yield """ + SELECT id, tmp.value + FROM test_filter_yield_module + LATERAL VIEW udtf_filter_yield_module(val) tmp AS value + ORDER BY id; + """ + + // ======================================== + // Section 7: Edge Cases in Computation + // ======================================== + + // Test 7.1: Very Small and Very Large Numbers + sql """ DROP FUNCTION IF EXISTS udtf_number_range_module(DOUBLE); """ + sql """ + CREATE TABLES FUNCTION udtf_number_range_module(DOUBLE) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.classify_number_range", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_number_range_module; """ + sql """ + CREATE TABLE test_number_range_module ( + id INT, + val DOUBLE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_number_range_module VALUES + (1, 0.0), + (2, 1e-150), + (3, 1e150), + (4, 0.5), + (5, 123.456), + (6, NULL); + """ + + qt_number_range """ + SELECT id, tmp.value, tmp.magnitude, tmp.is_finite + FROM test_number_range_module + LATERAL VIEW udtf_number_range_module(val) tmp AS value, magnitude, is_finite + ORDER BY id; + """ + + // Test 7.2: Date/Time Edge Cases + sql """ DROP FUNCTION IF EXISTS udtf_date_validation_module(DATE); """ + sql """ + CREATE TABLES FUNCTION udtf_date_validation_module(DATE) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.exceptions_udtf.validate_date", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_date_validation_module; """ + sql """ + CREATE TABLE test_date_validation_module ( + id INT, + dt DATE + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_date_validation_module VALUES + (1, '2024-01-01'), + (2, 
'2000-02-29'), + (3, '1970-01-01'), + (4, '9999-12-31'), + (5, NULL); + """ + + qt_date_validation """ + SELECT id, tmp.input_date, tmp.year, tmp.is_leap_year, tmp.status + FROM test_date_validation_module + LATERAL VIEW udtf_date_validation_module(dt) tmp AS input_date, year, is_leap_year, status + ORDER BY id; + """ + + } finally { + // Cleanup functions + try_sql("DROP FUNCTION IF EXISTS udtf_safe_divide_module(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_overflow_check_module(BIGINT);") + try_sql("DROP FUNCTION IF EXISTS udtf_parse_number_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_type_check_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_safe_index_module(ARRAY, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_collection_stats_module(ARRAY);") + try_sql("DROP FUNCTION IF EXISTS udtf_safe_struct_access_module(STRUCT);") + try_sql("DROP FUNCTION IF EXISTS udtf_string_slice_module(STRING, INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_check_encoding_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_conditional_process_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_filter_yield_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_number_range_module(DOUBLE);") + try_sql("DROP FUNCTION IF EXISTS udtf_date_validation_module(DATE);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS test_division_module;") + try_sql("DROP TABLE IF EXISTS test_overflow_module;") + try_sql("DROP TABLE IF EXISTS test_parse_module;") + try_sql("DROP TABLE IF EXISTS test_types_module;") + try_sql("DROP TABLE IF EXISTS test_array_access_module;") + try_sql("DROP TABLE IF EXISTS test_collection_stats_module;") + try_sql("DROP TABLE IF EXISTS test_struct_access_module;") + try_sql("DROP TABLE IF EXISTS test_string_slice_module;") + try_sql("DROP TABLE IF EXISTS test_encoding_module;") + try_sql("DROP TABLE IF EXISTS test_conditional_module;") + try_sql("DROP TABLE IF EXISTS test_filter_yield_module;") + try_sql("DROP TABLE IF EXISTS test_number_range_module;") + try_sql("DROP TABLE IF EXISTS test_date_validation_module;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_inline.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_inline.groovy new file mode 100644 index 00000000000000..2dd5c13223b37c --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_inline.groovy @@ -0,0 +1,522 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
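The suite that follows exercises output-cardinality patterns. Conceptually, a Python UDTF is a generator, and LATERAL VIEW acts as a flat-map of that generator across the input rows, so each input row can contribute zero, one, or many output rows. A pure-Python sketch of that semantics (illustration only, not Doris BE internals):

# Flat-map model of LATERAL VIEW over a Python UDTF. Illustration only;
# the actual BE execution path is not shown in this diff.

def one_to_many(n):
    # Mirrors Pattern 2 below: an input row carrying n expands to n output rows.
    if n is not None and n > 0:
        for i in range(1, n + 1):
            yield (i,)

def lateral_view(rows, udtf):
    '''Join each input row to the 0..N tuples its generator yields.'''
    for row in rows:
        for out in udtf(*row):
            yield row + out

print(list(lateral_view([(3,), (0,), (2,)], one_to_many)))
# [(3, 1), (3, 2), (3, 3), (2, 1), (2, 2)]  (the n=0 row yields nothing)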
+ +suite("test_pythonudtf_io_patterns_inline") { + // Test Python UDTF Input/Output Patterns + // Testing different cardinality patterns: 1-to-1, 1-to-N, 1-to-0, N-to-M + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Pattern 1: One-to-One (1 input row → 1 output row) + // Each input row produces exactly one output row + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_one(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_one(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "one_to_one", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def one_to_one(value): + '''Each input row produces exactly one output row''' + if value is not None: + yield (value, value * 2) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_one_to_one; """ + sql """ + CREATE TABLE test_one_to_one ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_one VALUES (1, 10), (2, 20), (3, 30); + """ + + qt_one_to_one """ + SELECT tmp.input, tmp.doubled + FROM test_one_to_one + LATERAL VIEW udtf_one_to_one(value) tmp AS input, doubled + ORDER BY tmp.input; + """ + + // ======================================== + // Pattern 2: One-to-Many (1 input row → N output all_rows) + // Each input row produces multiple output all_rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_many(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_many(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "one_to_many", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def one_to_many(n): + '''Each input row produces N output all_rows (1 to n)''' + if n is not None and n > 0: + for i in range(1, n + 1): + yield (i,) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_one_to_many; """ + sql """ + CREATE TABLE test_one_to_many ( + id INT, + count INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_many VALUES (1, 3), (2, 2), (3, 4); + """ + + qt_one_to_many """ + SELECT id, tmp.value + FROM test_one_to_many + LATERAL VIEW udtf_one_to_many(count) tmp AS value + ORDER BY id, tmp.value; + """ + + // ======================================== + // Pattern 3: One-to-Zero (1 input row → 0 output all_rows) + // Some input all_rows produce no output (filtering) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_zero(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_zero(INT) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "one_to_zero", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def one_to_zero(value): + '''Only output even numbers, skip odd numbers (zero output)''' + if value is not None and value % 2 == 0: + yield (value,) + # Odd numbers: no yield, zero output all_rows +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_one_to_zero; """ + sql """ + CREATE TABLE test_one_to_zero ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_zero VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6); + """ + + qt_one_to_zero """ + SELECT tmp.value + FROM test_one_to_zero + LATERAL VIEW udtf_one_to_zero(value) tmp AS value + ORDER BY tmp.value; + """ + + // 
======================================== + // Pattern 4: One-to-Variable (1 input row → 0/1/N output all_rows) + // Different input all_rows produce different numbers of output all_rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_variable(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_variable(STRING) + RETURNS ARRAY + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "one_to_variable", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def one_to_variable(text): + ''' + - Empty string → 0 all_rows + - Single word → 1 row + - Multiple words → N all_rows + ''' + if text: + words = text.split() + for word in words: + yield (word,) + # Empty or None: no yield, zero output +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_one_to_variable; """ + sql """ + CREATE TABLE test_one_to_variable ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_variable VALUES + (1, 'hello'), -- 1 output + (2, 'hello world'), -- 2 outputs + (3, ''), -- 0 outputs + (4, 'a b c'); -- 3 outputs + """ + + qt_one_to_variable """ + SELECT id, tmp.word + FROM test_one_to_variable + LATERAL VIEW udtf_one_to_variable(text) tmp AS word + ORDER BY id, tmp.word; + """ + + // ======================================== + // Pattern 5: Many-to-One (N input all_rows → aggregate to fewer all_rows) + // Note: This simulates batch processing where each row independently + // produces output, but conceptually represents aggregation pattern + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_aggregate_pattern(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_aggregate_pattern(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "aggregate_pattern", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def aggregate_pattern(value): + '''Categorize numbers into ranges''' + if value is not None: + if value < 10: + category = 'small' + elif value < 100: + category = 'medium' + else: + category = 'large' + yield (value, category) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_aggregate_pattern; """ + sql """ + CREATE TABLE test_aggregate_pattern ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_aggregate_pattern VALUES + (1, 5), (2, 50), (3, 500), (4, 8), (5, 80), (6, 800); + """ + + qt_aggregate_pattern """ + SELECT tmp.category, COUNT(*) as count + FROM test_aggregate_pattern + LATERAL VIEW udtf_aggregate_pattern(value) tmp AS value, category + GROUP BY tmp.category + ORDER BY tmp.category; + """ + + // ======================================== + // Pattern 6: Explosive Growth (1 input row → many output all_rows) + // Testing large multiplication factor + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_explosive(INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_explosive(INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "explosive", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def explosive(all_rows, all_cols): + '''Generate all_rows * all_cols output all_rows (cartesian product)''' + if all_rows is not None and all_cols is not None and all_rows > 0 and all_cols > 0: + for r in range(all_rows): + for c in range(all_cols): + yield (r, c) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_explosive; """ + sql """ + CREATE 
+ + sql """ DROP TABLE IF EXISTS test_explosive; """ + sql """ + CREATE TABLE test_explosive ( + id INT, + all_rows INT, + all_cols INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_explosive VALUES (1, 2, 3); + """ + + qt_explosive """ + SELECT tmp.row_id, tmp.col_id + FROM test_explosive + LATERAL VIEW udtf_explosive(all_rows, all_cols) tmp AS row_id, col_id + ORDER BY tmp.row_id, tmp.col_id; + """ + + // ======================================== + // Pattern 7: Conditional Branching (different logic paths) + // Same function but different output counts based on condition + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_conditional(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_conditional(INT) + RETURNS ARRAY<STRUCT<value:INT, type:STRING>> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "conditional", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def conditional(value): + ''' + - Positive: output (value, 'positive') + - Negative: output (abs(value), 'negative') + - Zero: output both (0, 'zero') and (0, 'neutral') + ''' + if value is not None: + if value > 0: + yield (value, 'positive') + elif value < 0: + yield (abs(value), 'negative') + else: + yield (0, 'zero') + yield (0, 'neutral') +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_conditional; """ + sql """ + CREATE TABLE test_conditional ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_conditional VALUES (1, 10), (2, -5), (3, 0), (4, 7); + """ + + qt_conditional """ + SELECT tmp.value, tmp.type + FROM test_conditional + LATERAL VIEW udtf_conditional(value) tmp AS value, type + ORDER BY tmp.value, tmp.type; + """ + + // ======================================== + // Pattern 8: All-or-Nothing (either all rows or no rows) + // Based on a condition, output all or nothing + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_all_or_nothing(STRING, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_all_or_nothing(STRING, INT) + RETURNS ARRAY<STRUCT<char:STRING, pos:INT>> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "all_or_nothing", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def all_or_nothing(text, min_length): + ''' + If text length >= min_length: output each character with position + Otherwise: output nothing + ''' + if text and len(text) >= min_length: + for i, char in enumerate(text): + yield (char, i) + # If condition not met: no yield +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_all_or_nothing; """ + sql """ + CREATE TABLE test_all_or_nothing ( + id INT, + text STRING, + min_len INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_all_or_nothing VALUES + (1, 'hello', 3), -- 5 outputs (length=5 >= 3) + (2, 'hi', 5), -- 0 outputs (length=2 < 5) + (3, 'world', 4); -- 5 outputs (length=5 >= 4) + """ + + qt_all_or_nothing """ + SELECT id, tmp.char, tmp.pos + FROM test_all_or_nothing + LATERAL VIEW udtf_all_or_nothing(text, min_len) tmp AS char, pos + ORDER BY id, tmp.pos; + """ + + // ======================================== + // Pattern 9: Empty Input Table (0 input rows) + // Test behavior with no input data + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_empty_input(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_empty_input(INT) + RETURNS ARRAY<INT> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" =
"empty_input", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def empty_input(value): + '''Simple identity function''' + if value is not None: + yield (value,) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_empty_input; """ + sql """ + CREATE TABLE test_empty_input ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // No INSERT - table is empty + + qt_empty_input """ + SELECT tmp.value + FROM test_empty_input + LATERAL VIEW udtf_empty_input(value) tmp AS value; + """ + + // ======================================== + // Pattern 10: Batch Processing Simulation + // Multiple input all_rows, each producing variable outputs + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_batch_process(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_batch_process(INT) + RETURNS ARRAY> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "batch_process", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def batch_process(value): + '''For each input, generate multiples (2x, 3x, 5x)''' + if value is not None and value > 0: + for factor in [2, 3, 5]: + yield (value, factor, value * factor) +\$\$; + """ + + sql """ DROP TABLE IF EXISTS test_batch_process; """ + sql """ + CREATE TABLE test_batch_process ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_batch_process VALUES (1, 10), (2, 20); + """ + + qt_batch_process """ + SELECT tmp.original, tmp.factor, tmp.result + FROM test_batch_process + LATERAL VIEW udtf_batch_process(value) tmp AS original, factor, result + ORDER BY tmp.original, tmp.factor; + """ + + } finally { + // Cleanup functions + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_one(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_many(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_zero(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_variable(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_aggregate_pattern(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_explosive(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_conditional(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_all_or_nothing(STRING, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_empty_input(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_batch_process(INT);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS test_one_to_one;") + try_sql("DROP TABLE IF EXISTS test_one_to_many;") + try_sql("DROP TABLE IF EXISTS test_one_to_zero;") + try_sql("DROP TABLE IF EXISTS test_one_to_variable;") + try_sql("DROP TABLE IF EXISTS test_aggregate_pattern;") + try_sql("DROP TABLE IF EXISTS test_explosive;") + try_sql("DROP TABLE IF EXISTS test_conditional;") + try_sql("DROP TABLE IF EXISTS test_all_or_nothing;") + try_sql("DROP TABLE IF EXISTS test_empty_input;") + try_sql("DROP TABLE IF EXISTS test_batch_process;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_module.groovy new file mode 100644 index 00000000000000..71e40e8502f61c --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_module.groovy @@ -0,0 +1,442 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_module.groovy new file mode 100644 index 00000000000000..71e40e8502f61c --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_io_patterns_module.groovy @@ -0,0 +1,442 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_io_patterns_module") { + // Test Python UDTF Input/Output Patterns + // Testing different cardinality patterns: 1-to-1, 1-to-N, 1-to-0, N-to-M + + def pyPath = """${context.file.parent}/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Pattern 1: One-to-One (1 input row → 1 output row) + // Each input row produces exactly one output row + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_one_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_one_module(INT) + RETURNS ARRAY<STRUCT<input:INT, doubled:INT>> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.one_to_one", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_one_to_one_module; """ + sql """ + CREATE TABLE test_one_to_one_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_one_module VALUES (1, 10), (2, 20), (3, 30); + """ + + qt_one_to_one """ + SELECT tmp.input, tmp.doubled + FROM test_one_to_one_module + LATERAL VIEW udtf_one_to_one_module(value) tmp AS input, doubled + ORDER BY tmp.input; + """ + + // ======================================== + // Pattern 2: One-to-Many (1 input row → N output rows) + // Each input row produces multiple output rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_many_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_many_module(INT) + RETURNS ARRAY<INT> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.one_to_many", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_one_to_many_module; """ + sql """ + CREATE TABLE test_one_to_many_module ( + id INT, + count INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_many_module VALUES (1, 3), (2, 2), (3, 4); + """ + + qt_one_to_many """ + SELECT id, tmp.value + FROM test_one_to_many_module + LATERAL VIEW udtf_one_to_many_module(count) tmp AS value + ORDER BY id, tmp.value; + """ + + // ======================================== + // Pattern 3: One-to-Zero (1 input row → 0 output rows) + // Some input rows produce no output (filtering) + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_zero_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_zero_module(INT) + RETURNS ARRAY<INT> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.one_to_zero", + "type" = "PYTHON_UDF", +
"runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_one_to_zero_module; """ + sql """ + CREATE TABLE test_one_to_zero_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_zero_module VALUES (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6); + """ + + qt_one_to_zero """ + SELECT tmp.value + FROM test_one_to_zero_module + LATERAL VIEW udtf_one_to_zero_module(value) tmp AS value + ORDER BY tmp.value; + """ + + // ======================================== + // Pattern 4: One-to-Variable (1 input row → 0/1/N output all_rows) + // Different input all_rows produce different numbers of output all_rows + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_one_to_variable_module(STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_one_to_variable_module(STRING) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.one_to_variable", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_one_to_variable_module; """ + sql """ + CREATE TABLE test_one_to_variable_module ( + id INT, + text STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_one_to_variable_module VALUES + (1, 'hello'), -- 1 output + (2, 'hello world'), -- 2 outputs + (3, ''), -- 0 outputs + (4, 'a b c'); -- 3 outputs + """ + + qt_one_to_variable """ + SELECT id, tmp.word + FROM test_one_to_variable_module + LATERAL VIEW udtf_one_to_variable_module(text) tmp AS word + ORDER BY id, tmp.word; + """ + + // ======================================== + // Pattern 5: Many-to-One (N input all_rows → aggregate to fewer all_rows) + // Note: This simulates batch processing where each row independently + // produces output, but conceptually represents aggregation pattern + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_aggregate_pattern_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_aggregate_pattern_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.aggregate_pattern", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_aggregate_pattern_module; """ + sql """ + CREATE TABLE test_aggregate_pattern_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_aggregate_pattern_module VALUES + (1, 5), (2, 50), (3, 500), (4, 8), (5, 80), (6, 800); + """ + + qt_aggregate_pattern """ + SELECT tmp.category, COUNT(*) as count + FROM test_aggregate_pattern_module + LATERAL VIEW udtf_aggregate_pattern_module(value) tmp AS value, category + GROUP BY tmp.category + ORDER BY tmp.category; + """ + + // ======================================== + // Pattern 6: Explosive Growth (1 input row → many output all_rows) + // Testing large multiplication factor + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_explosive_module(INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_explosive_module(INT, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.explosive", + "type" = 
"PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_explosive_module; """ + sql """ + CREATE TABLE test_explosive_module ( + id INT, + all_rows INT, + all_cols INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_explosive_module VALUES (1, 2, 3); + """ + + qt_explosive """ + SELECT tmp.row_id, tmp.col_id + FROM test_explosive_module + LATERAL VIEW udtf_explosive_module(all_rows, all_cols) tmp AS row_id, col_id + ORDER BY tmp.row_id, tmp.col_id; + """ + + // ======================================== + // Pattern 7: Conditional Branching (different logic paths) + // Same function but different output counts based on condition + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_conditional_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_conditional_module(INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.conditional", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_conditional_module; """ + sql """ + CREATE TABLE test_conditional_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_conditional_module VALUES (1, 10), (2, -5), (3, 0), (4, 7); + """ + + qt_conditional """ + SELECT tmp.value, tmp.type + FROM test_conditional_module + LATERAL VIEW udtf_conditional_module(value) tmp AS value, type + ORDER BY tmp.value, tmp.type; + """ + + // ======================================== + // Pattern 8: All-or-Nothing (either all all_rows or no all_rows) + // Based on a condition, output all or nothing + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_all_or_nothing_module(STRING, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_all_or_nothing_module(STRING, INT) + RETURNS ARRAY> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.all_or_nothing", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_all_or_nothing_module; """ + sql """ + CREATE TABLE test_all_or_nothing_module ( + id INT, + text STRING, + min_len INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_all_or_nothing_module VALUES + (1, 'hello', 3), -- 5 outputs (length=5 >= 3) + (2, 'hi', 5), -- 0 outputs (length=2 < 5) + (3, 'world', 4); -- 5 outputs (length=5 >= 4) + """ + + qt_all_or_nothing """ + SELECT id, tmp.char, tmp.pos + FROM test_all_or_nothing_module + LATERAL VIEW udtf_all_or_nothing_module(text, min_len) tmp AS char, pos + ORDER BY id, tmp.pos; + """ + + // ======================================== + // Pattern 9: Empty Input Table (0 input all_rows) + // Test behavior with no input data + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_empty_input_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_empty_input_module(INT) + RETURNS ARRAY + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.empty_input", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_empty_input_module; """ + sql """ + CREATE TABLE 
test_empty_input_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + // No INSERT - table is empty + + qt_empty_input """ + SELECT tmp.value + FROM test_empty_input_module + LATERAL VIEW udtf_empty_input_module(value) tmp AS value; + """ + + // ======================================== + // Pattern 10: Batch Processing Simulation + // Multiple input rows, each producing variable outputs + // ======================================== + sql """ DROP FUNCTION IF EXISTS udtf_batch_process_module(INT); """ + sql """ + CREATE TABLES FUNCTION udtf_batch_process_module(INT) + RETURNS ARRAY<STRUCT<original:INT, factor:INT, result:INT>> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.io_patterns_udtf.batch_process", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + sql """ DROP TABLE IF EXISTS test_batch_process_module; """ + sql """ + CREATE TABLE test_batch_process_module ( + id INT, + value INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_batch_process_module VALUES (1, 10), (2, 20); + """ + + qt_batch_process """ + SELECT tmp.original, tmp.factor, tmp.result + FROM test_batch_process_module + LATERAL VIEW udtf_batch_process_module(value) tmp AS original, factor, result + ORDER BY tmp.original, tmp.factor; + """ + + } finally { + // Cleanup functions + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_one_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_many_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_zero_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_one_to_variable_module(STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_aggregate_pattern_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_explosive_module(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_conditional_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_all_or_nothing_module(STRING, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_empty_input_module(INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_batch_process_module(INT);") + + // Cleanup tables + try_sql("DROP TABLE IF EXISTS test_one_to_one_module;") + try_sql("DROP TABLE IF EXISTS test_one_to_many_module;") + try_sql("DROP TABLE IF EXISTS test_one_to_zero_module;") + try_sql("DROP TABLE IF EXISTS test_one_to_variable_module;") + try_sql("DROP TABLE IF EXISTS test_aggregate_pattern_module;") + try_sql("DROP TABLE IF EXISTS test_explosive_module;") + try_sql("DROP TABLE IF EXISTS test_conditional_module;") + try_sql("DROP TABLE IF EXISTS test_all_or_nothing_module;") + try_sql("DROP TABLE IF EXISTS test_empty_input_module;") + try_sql("DROP TABLE IF EXISTS test_batch_process_module;") + } +}
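The next two suites exercise the same generator UDTFs inside larger SQL plans, so it helps to fix what a single LATERAL VIEW expansion looks like before reading them. An illustrative pure-Python rendering of Test 1.1's expectation, with a hypothetical lateral_view helper that is not part of this PR:

    def split_with_position(text, delimiter):
        # Same body as the inline udtf_split defined in the suite below.
        if text and delimiter:
            for i, part in enumerate(text.split(delimiter)):
                yield (i, part.strip())

    def lateral_view(rows, udtf, col):
        # Keep the source row's columns beside every yielded (position, value) tuple.
        for row in rows:
            for position, value in udtf(row[col], ','):
                yield (row['id'], row['category'], position, value)

    rows = [{'id': 1, 'category': 'A', 'data': 'apple,banana'}]
    assert list(lateral_view(rows, split_with_position, 'data')) == [
        (1, 'A', 0, 'apple'),
        (1, 'A', 1, 'banana'),
    ]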
diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_sql_integration_inline.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_sql_integration_inline.groovy new file mode 100644 index 00000000000000..e0609a02a0bbf9 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_sql_integration_inline.groovy @@ -0,0 +1,1001 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_sql_integration_inline") { + // Test Python UDTF Integration with SQL Operations + // Coverage: WHERE, JOIN, GROUP BY, ORDER BY, LIMIT, Subqueries, DISTINCT, UNION, + // Window Functions, CASE WHEN, and nested LATERAL VIEWs + + def runtime_version = "3.8.10" + + try { + // ======================================== + // Prepare Common UDTF Functions + // ======================================== + + // Helper UDTF: Split string into multiple records + sql """ DROP FUNCTION IF EXISTS udtf_split(STRING, STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_split(STRING, STRING) + RETURNS ARRAY<STRUCT<position:INT, value:STRING>> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "split_with_position", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def split_with_position(text, delimiter): + '''Split string and return with position''' + if text and delimiter: + parts = text.split(delimiter) + for i, part in enumerate(parts): + yield (i, part.strip()) +\$\$; + """ + + // Helper UDTF: Generate number sequence + sql """ DROP FUNCTION IF EXISTS udtf_range(INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_range(INT, INT) + RETURNS ARRAY<INT> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "generate_range", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def generate_range(start, end): + '''Generate integer range''' + if start is not None and end is not None: + for i in range(start, end + 1): + yield (i,) +\$\$; + """ + + // Helper UDTF: Expand array elements + sql """ DROP FUNCTION IF EXISTS udtf_explode_array(ARRAY<INT>); """ + sql """ + CREATE TABLES FUNCTION udtf_explode_array(ARRAY<INT>) + RETURNS ARRAY<STRUCT<element:INT, element_index:INT>> + PROPERTIES ( + "type" = "PYTHON_UDF", + "symbol" = "explode_with_index", + "runtime_version" = "3.8.10" + ) + AS \$\$ +def explode_with_index(arr): + '''Explode array with index''' + if arr: + for i, elem in enumerate(arr): + if elem is not None: + yield (elem, i) +\$\$; + """ + + // ======================================== + // Section 1: UDTF with WHERE Clause + // ======================================== + + // Test 1.1: Filter BEFORE UDTF (reduce input) + sql """ DROP TABLE IF EXISTS test_where_before; """ + sql """ + CREATE TABLE test_where_before ( + id INT, + category STRING, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_where_before VALUES + (1, 'A', 'apple,banana'), + (2, 'B', 'cat,dog'), + (3, 'A', 'red,green,blue'), + (4, 'C', 'one,two'); + """ + + qt_where_before """ + SELECT id, category, tmp.position, tmp.value + FROM test_where_before + LATERAL VIEW udtf_split(data, ',') tmp AS position, value + WHERE category = 'A' + ORDER BY id, tmp.position; + """ + + // Test 1.2: Filter AFTER UDTF (filter expanded results) + qt_where_after """ + SELECT id, tmp.position, tmp.value + FROM test_where_before + LATERAL VIEW udtf_split(data, ',') tmp AS position, value + WHERE tmp.value LIKE '%e%' + ORDER BY id, tmp.position; + """ + + // Test 1.3: Combined Filter (before and after UDTF) +
qt_where_combined """ + SELECT id, category, tmp.value + FROM test_where_before + LATERAL VIEW udtf_split(data, ',') tmp AS position, value + WHERE category IN ('A', 'B') AND tmp.position = 0 + ORDER BY id; + """ + + // ======================================== + // Section 2: UDTF with JOIN Operations + // ======================================== + + // Prepare dimension table + sql """ DROP TABLE IF EXISTS dim_numbers; """ + sql """ + CREATE TABLE dim_numbers ( + num INT, + num_name STRING, + is_even BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(num) + DISTRIBUTED BY HASH(num) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO dim_numbers VALUES + (1, 'one', false), + (2, 'two', true), + (3, 'three', false), + (4, 'four', true), + (5, 'five', false); + """ + + // Prepare fact table + sql """ DROP TABLE IF EXISTS fact_ranges; """ + sql """ + CREATE TABLE fact_ranges ( + id INT, + start_num INT, + end_num INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO fact_ranges VALUES + (1, 1, 3), + (2, 2, 4); + """ + + // Test 2.1: INNER JOIN with UDTF + qt_join_inner """ + SELECT + f.id, + tmp.num, + d.num_name, + d.is_even + FROM fact_ranges f + LATERAL VIEW udtf_range(f.start_num, f.end_num) tmp AS num + INNER JOIN dim_numbers d ON tmp.num = d.num + ORDER BY f.id, tmp.num; + """ + + // Test 2.2: LEFT JOIN with UDTF (some generated values may not match) + sql """ DROP TABLE IF EXISTS fact_ranges_extended; """ + sql """ + CREATE TABLE fact_ranges_extended ( + id INT, + start_num INT, + end_num INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO fact_ranges_extended VALUES + (1, 1, 2), + (2, 5, 7); + """ + + qt_join_left """ + SELECT + f.id, + tmp.num, + d.num_name + FROM fact_ranges_extended f + LATERAL VIEW udtf_range(f.start_num, f.end_num) tmp AS num + LEFT JOIN dim_numbers d ON tmp.num = d.num + ORDER BY f.id, tmp.num; + """ + + // Test 2.3: Self-JOIN through UDTF + sql """ DROP TABLE IF EXISTS test_self_join; """ + sql """ + CREATE TABLE test_self_join ( + id INT, + value_list STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_self_join VALUES + (1, '10,20,30'), + (2, '20,30,40'); + """ + + qt_join_self """ + SELECT + t1.id AS id1, + value1, + t2.id AS id2, + value2 + FROM test_self_join t1 + LATERAL VIEW udtf_split(t1.value_list, ',') tmp1 AS pos1, value1 + INNER JOIN test_self_join t2 + LATERAL VIEW udtf_split(t2.value_list, ',') tmp2 AS pos2, value2 + ON value1 = value2 AND t1.id < t2.id + ORDER BY t1.id, value1, t2.id; + """ + + // ======================================== + // Section 3: UDTF with GROUP BY and Aggregation + // ======================================== + + sql """ DROP TABLE IF EXISTS test_group_by; """ + sql """ + CREATE TABLE test_group_by ( + id INT, + category STRING, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_group_by VALUES + (1, 'fruit', 'apple,banana,apple'), + (2, 'fruit', 'banana,cherry'), + (3, 'animal', 'cat,dog,cat'); + """ + + // Test 3.1: GROUP BY after UDTF expansion + qt_group_by_udtf """ + SELECT + tmp.value AS tag, + COUNT(*) AS occurrence_count + FROM test_group_by + LATERAL VIEW udtf_split(tags, ',') tmp AS position, value + 
GROUP BY tmp.value + ORDER BY occurrence_count DESC, tag; + """ + + // Test 3.2: GROUP BY with original table columns + qt_group_by_mixed """ + SELECT + category, + tmp.value AS tag, + COUNT(*) AS tag_count + FROM test_group_by + LATERAL VIEW udtf_split(tags, ',') tmp AS position, value + GROUP BY category, tmp.value + ORDER BY category, tag_count DESC, tag; + """ + + // Test 3.3: Aggregation with HAVING clause + qt_group_by_having """ + SELECT + tmp.value AS tag, + COUNT(*) AS cnt + FROM test_group_by + LATERAL VIEW udtf_split(tags, ',') tmp AS position, value + GROUP BY tmp.value + HAVING COUNT(*) > 1 + ORDER BY cnt DESC, tag; + """ + + // Test 3.4: Multiple aggregation functions + sql """ DROP TABLE IF EXISTS test_agg_numbers; """ + sql """ + CREATE TABLE test_agg_numbers ( + id INT, + start_val INT, + end_val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_agg_numbers VALUES + (1, 1, 5), + (2, 3, 7), + (3, 10, 12); + """ + + qt_group_by_multi_agg """ + SELECT + id, + COUNT(*) AS total_count, + MIN(tmp.num) AS min_num, + MAX(tmp.num) AS max_num, + SUM(tmp.num) AS sum_num, + AVG(tmp.num) AS avg_num + FROM test_agg_numbers + LATERAL VIEW udtf_range(start_val, end_val) tmp AS num + GROUP BY id + ORDER BY id; + """ + + // ======================================== + // Section 4: UDTF with ORDER BY and LIMIT + // ======================================== + + sql """ DROP TABLE IF EXISTS test_order_limit; """ + sql """ + CREATE TABLE test_order_limit ( + id INT, + name STRING, + scores STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_order_limit VALUES + (1, 'Alice', '85,92,78'), + (2, 'Bob', '90,88,95'), + (3, 'Charlie', '70,82,88'); + """ + + // Test 4.1: ORDER BY UDTF output + qt_order_by_udtf """ + SELECT + id, + name, + tmp.value AS score + FROM test_order_limit + LATERAL VIEW udtf_split(scores, ',') tmp AS position, value + ORDER BY CAST(tmp.value AS INT) DESC, name + LIMIT 5; + """ + + // Test 4.2: ORDER BY original and UDTF columns + qt_order_by_mixed """ + SELECT + id, + name, + tmp.position, + tmp.value AS score + FROM test_order_limit + LATERAL VIEW udtf_split(scores, ',') tmp AS position, value + ORDER BY id ASC, tmp.position DESC; + """ + + // Test 4.3: LIMIT without ORDER BY + qt_limit_only """ + SELECT + id, + tmp.value + FROM test_order_limit + LATERAL VIEW udtf_split(scores, ',') tmp AS position, value + LIMIT 3; + """ + + // Test 4.4: TOP-N pattern (ORDER BY + LIMIT per group) + qt_top_n_pattern """ + SELECT id, name, score + FROM ( + SELECT + id, + name, + CAST(tmp.value AS INT) AS score, + ROW_NUMBER() OVER (PARTITION BY id ORDER BY CAST(tmp.value AS INT) DESC) AS rn + FROM test_order_limit + LATERAL VIEW udtf_split(scores, ',') tmp AS position, value + ) ranked + WHERE rn <= 2 + ORDER BY id, score DESC; + """ + + // ======================================== + // Section 5: UDTF in Subqueries + // ======================================== + + sql """ DROP TABLE IF EXISTS test_subquery; """ + sql """ + CREATE TABLE test_subquery ( + id INT, + item_list STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_subquery VALUES + (1, 'A,B,C'), + (2, 'B,C,D'), + (3, 'A,C,E'); + """ + + // Test 5.1: UDTF in WHERE IN subquery + qt_subquery_in """ + SELECT id, item_list + FROM 
test_subquery + WHERE id IN ( + SELECT DISTINCT id + FROM test_subquery + LATERAL VIEW udtf_split(item_list, ',') tmp AS position, value + WHERE tmp.value = 'A' + ) + ORDER BY id; + """ + + // Test 5.2: UDTF in FROM subquery + qt_subquery_from """ + SELECT + item, + COUNT(DISTINCT source_id) AS source_count + FROM ( + SELECT id AS source_id, tmp.value AS item + FROM test_subquery + LATERAL VIEW udtf_split(item_list, ',') tmp AS position, value + ) expanded + GROUP BY item + ORDER BY source_count DESC, item; + """ + + // Test 5.3: Nested subqueries with UDTF + qt_subquery_nested """ + SELECT item, total_occurrences + FROM ( + SELECT item, COUNT(*) AS total_occurrences + FROM ( + SELECT id, tmp.value AS item + FROM test_subquery + LATERAL VIEW udtf_split(item_list, ',') tmp AS position, value + ) level1 + GROUP BY item + ) level2 + WHERE total_occurrences >= 2 + ORDER BY total_occurrences DESC, item; + """ + + // ======================================== + // Section 6: UDTF with DISTINCT + // ======================================== + + sql """ DROP TABLE IF EXISTS test_distinct; """ + sql """ + CREATE TABLE test_distinct ( + id INT, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_distinct VALUES + (1, 'red,blue,red'), + (2, 'blue,green'), + (3, 'red,yellow'); + """ + + // Test 6.1: DISTINCT on UDTF output + qt_distinct_udtf """ + SELECT DISTINCT tmp.value AS tag + FROM test_distinct + LATERAL VIEW udtf_split(tags, ',') tmp AS position, value + ORDER BY tag; + """ + + // Test 6.2: COUNT DISTINCT + qt_count_distinct """ + SELECT COUNT(DISTINCT tmp.value) AS unique_tag_count + FROM test_distinct + LATERAL VIEW udtf_split(tags, ',') tmp AS position, value; + """ + + // ======================================== + // Section 7: UDTF with UNION + // ======================================== + + sql """ DROP TABLE IF EXISTS test_union_a; """ + sql """ + CREATE TABLE test_union_a ( + id INT, + items STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_union_a VALUES (1, 'X,Y'); + """ + + sql """ DROP TABLE IF EXISTS test_union_b; """ + sql """ + CREATE TABLE test_union_b ( + id INT, + items STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_union_b VALUES (2, 'Y,Z'); + """ + + // Test 7.1: UNION ALL with UDTF + qt_union_all """ + SELECT id, tmp.value AS item + FROM test_union_a + LATERAL VIEW udtf_split(items, ',') tmp AS position, value + UNION ALL + SELECT id, tmp.value AS item + FROM test_union_b + LATERAL VIEW udtf_split(items, ',') tmp AS position, value + ORDER BY id, item; + """ + + // Test 7.2: UNION (removes duplicates) + qt_union_distinct """ + SELECT tmp.value AS item + FROM test_union_a + LATERAL VIEW udtf_split(items, ',') tmp AS position, value + UNION + SELECT tmp.value AS item + FROM test_union_b + LATERAL VIEW udtf_split(items, ',') tmp AS position, value + ORDER BY item; + """ + + // ======================================== + // Section 8: UDTF with Complex Array Operations + // ======================================== + + sql """ DROP TABLE IF EXISTS test_array_ops; """ + sql """ + CREATE TABLE test_array_ops ( + id INT, + numbers ARRAY<INT> + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """
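+ + // Note: explode_with_index deliberately skips NULL array elements (the inner + // "if elem is not None" guard in its body above), so NULL entries would never + // reach the tmp.element > 2 filter in Test 8.1.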
+ + sql """ + INSERT INTO test_array_ops VALUES + (1, [1, 2, 3]), + (2, [2, 3, 4, 5]), + (3, [3, 4]); + """ + + // Test 8.1: Filter array elements through UDTF + qt_array_filter """ + SELECT + id, + tmp.element + FROM test_array_ops + LATERAL VIEW udtf_explode_array(numbers) tmp AS element, element_index + WHERE tmp.element > 2 + ORDER BY id, tmp.element; + """ + + // Test 8.2: Aggregate array elements + qt_array_aggregate """ + SELECT + id, + COUNT(*) AS element_count, + SUM(tmp.element) AS element_sum, + AVG(tmp.element) AS element_avg + FROM test_array_ops + LATERAL VIEW udtf_explode_array(numbers) tmp AS element, element_index + GROUP BY id + ORDER BY id; + """ + + // ======================================== + // Section 9: UDTF with Window Functions + // ======================================== + + sql """ DROP TABLE IF EXISTS test_window; """ + sql """ + CREATE TABLE test_window ( + id INT, + category STRING, + value_list STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_window VALUES + (1, 'A', '10,20,30'), + (2, 'A', '15,25'), + (3, 'B', '5,10,15'); + """ + + // Test 9.1: Window function over UDTF results + qt_window_function """ + SELECT + id, + category, + CAST(tmp.value AS INT) AS val, + ROW_NUMBER() OVER (PARTITION BY category ORDER BY CAST(tmp.value AS INT)) AS rn, + SUM(CAST(tmp.value AS INT)) OVER (PARTITION BY category) AS category_total + FROM test_window + LATERAL VIEW udtf_split(value_list, ',') tmp AS position, value + ORDER BY category, val; + """ + + // ======================================== + // Section 10: UDTF with CASE WHEN + // ======================================== + + sql """ DROP TABLE IF EXISTS test_case_when; """ + sql """ + CREATE TABLE test_case_when ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_case_when VALUES + (1, '5,15,25'), + (2, '10,20,30'); + """ + + // Test 10.1: CASE WHEN on UDTF results + qt_case_when """ + SELECT + id, + tmp.value, + CASE + WHEN CAST(tmp.value AS INT) < 10 THEN 'small' + WHEN CAST(tmp.value AS INT) < 20 THEN 'medium' + ELSE 'large' + END AS size_category + FROM test_case_when + LATERAL VIEW udtf_split(data, ',') tmp AS position, value + ORDER BY id, CAST(tmp.value AS INT); + """ + + // ======================================== + // Section 11: Multiple LATERAL VIEW Nesting + // ======================================== + + // Test 11.1: Two-level LATERAL VIEW nesting (sequential) + sql """ DROP TABLE IF EXISTS test_nested_2level; """ + sql """ + CREATE TABLE test_nested_2level ( + id INT, + categories STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_2level VALUES + (1, 'A:1,2|B:3'), + (2, 'C:4,5'); + """ + + qt_nested_2level """ + SELECT + id, + cat, + CAST(num AS INT) AS num + FROM test_nested_2level + LATERAL VIEW udtf_split(categories, '|') t1 AS p1, cat_nums + LATERAL VIEW udtf_split(cat_nums, ':') t2 AS p2, cat + LATERAL VIEW udtf_split(cat, ',') t3 AS p3, num + WHERE p2 = 1 + ORDER BY id, cat, num; + """ + + // Test 11.2: Parallel LATERAL VIEWs (cartesian product) + sql """ DROP TABLE IF EXISTS test_parallel_lateral; """ + sql """ + CREATE TABLE test_parallel_lateral ( + id INT, + list1 STRING, + list2 STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 +
PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_parallel_lateral VALUES + (1, 'A,B', 'X,Y'), + (2, 'C', 'Z'); + """ + + qt_parallel_lateral """ + SELECT + id, + item1, + item2 + FROM test_parallel_lateral + LATERAL VIEW udtf_split(list1, ',') t1 AS p1, item1 + LATERAL VIEW udtf_split(list2, ',') t2 AS p2, item2 + ORDER BY id, item1, item2; + """ + + // Test 11.3: Nested LATERAL VIEW with JOIN + sql """ DROP TABLE IF EXISTS test_nested_join_base; """ + sql """ + CREATE TABLE test_nested_join_base ( + user_id INT, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(user_id) + DISTRIBUTED BY HASH(user_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_join_base VALUES + (1, 'sports:soccer,tennis|food:pizza'), + (2, 'music:rock'); + """ + + sql """ DROP TABLE IF EXISTS dim_tag_info; """ + sql """ + CREATE TABLE dim_tag_info ( + tag VARCHAR(50), + score INT + ) ENGINE=OLAP + DUPLICATE KEY(tag) + DISTRIBUTED BY HASH(tag) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO dim_tag_info VALUES + ('soccer', 10), + ('tennis', 8), + ('pizza', 5), + ('rock', 9); + """ + + qt_nested_join """ + SELECT + u.user_id, + tag_name, + d.score + FROM test_nested_join_base u + LATERAL VIEW udtf_split(u.tags, '|') t1 AS p1, cat_tags + LATERAL VIEW udtf_split(cat_tags, ':') t2 AS p2, part + LATERAL VIEW udtf_split(part, ',') t3 AS p3, tag_name + INNER JOIN dim_tag_info d ON d.tag = tag_name + WHERE p2 = 1 + ORDER BY u.user_id, d.score DESC; + """ + + // Test 11.4: Nested LATERAL VIEW with GROUP BY aggregation + sql """ DROP TABLE IF EXISTS test_nested_groupby; """ + sql """ + CREATE TABLE test_nested_groupby ( + store_id INT, + sales_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(store_id) + DISTRIBUTED BY HASH(store_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_groupby VALUES + (1, 'day1:100,200|day2:150'), + (2, 'day1:300|day2:250,100'); + """ + + qt_nested_groupby """ + SELECT + store_id, + COUNT(*) as sale_count, + SUM(CAST(amount AS INT)) as total_amount + FROM test_nested_groupby + LATERAL VIEW udtf_split(sales_data, '|') t1 AS p1, day_amounts + LATERAL VIEW udtf_split(day_amounts, ':') t2 AS p2, part + LATERAL VIEW udtf_split(part, ',') t3 AS p3, amount + WHERE p2 = 1 + GROUP BY store_id + ORDER BY store_id; + """ + + // Test 11.5: Three-level deep nesting + sql """ DROP TABLE IF EXISTS test_nested_3level; """ + sql """ + CREATE TABLE test_nested_3level ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_3level VALUES + (1, 'A,B,C|D,E|F'); + """ + + qt_nested_3level """ + SELECT + id, + grp_pos, + item + FROM test_nested_3level + LATERAL VIEW udtf_split(data, '|') t1 AS grp_pos, group_items + LATERAL VIEW udtf_split(group_items, ',') t2 AS item_pos, item + ORDER BY id, grp_pos, item_pos; + """ + + // Test 11.6: Nested with array expansion + sql """ DROP TABLE IF EXISTS test_nested_array_expansion; """ + sql """ + CREATE TABLE test_nested_array_expansion ( + id INT, + group_id INT, + numbers ARRAY + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_array_expansion VALUES + (1, 1, [10, 20]), + (1, 2, [30]), + (2, 1, [40, 50]); + """ + + qt_nested_array_expansion """ + SELECT + id, + group_id, + element + FROM test_nested_array_expansion + 
+ + // Test 11.6: Nested with array expansion + sql """ DROP TABLE IF EXISTS test_nested_array_expansion; """ + sql """ + CREATE TABLE test_nested_array_expansion ( + id INT, + group_id INT, + numbers ARRAY<INT> + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_array_expansion VALUES + (1, 1, [10, 20]), + (1, 2, [30]), + (2, 1, [40, 50]); + """ + + qt_nested_array_expansion """ + SELECT + id, + group_id, + element + FROM test_nested_array_expansion + LATERAL VIEW udtf_explode_array(numbers) t1 AS element, idx + ORDER BY id, group_id, element; + """ + + // Test 11.7: Nested with WHERE filtering at multiple levels + sql """ DROP TABLE IF EXISTS test_nested_multifilter; """ + sql """ + CREATE TABLE test_nested_multifilter ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_multifilter VALUES + (1, 'A:10,20,30|B:40'), + (2, 'C:50,60'); + """ + + qt_nested_multifilter """ + SELECT + id, + cat_name, + CAST(num AS INT) AS num + FROM ( + SELECT + id, + p1, + CASE WHEN p2 = 0 THEN part END AS cat_name, + CASE WHEN p2 = 1 THEN part END AS nums + FROM test_nested_multifilter + LATERAL VIEW udtf_split(data, '|') t1 AS p1, cat_nums + LATERAL VIEW udtf_split(cat_nums, ':') t2 AS p2, part + ) t + LATERAL VIEW udtf_split(nums, ',') t3 AS p3, num + WHERE nums IS NOT NULL AND CAST(num AS INT) >= 20 + ORDER BY id, p1, num; + """ + + // Test 11.8: Nested with DISTINCT across levels + sql """ DROP TABLE IF EXISTS test_nested_distinct; """ + sql """ + CREATE TABLE test_nested_distinct ( + id INT, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_distinct VALUES + (1, 'red,blue|red,green'), + (2, 'blue,yellow'); + """ + + qt_nested_distinct """ + SELECT DISTINCT color + FROM test_nested_distinct + LATERAL VIEW udtf_split(tags, '|') t1 AS p1, color_list + LATERAL VIEW udtf_split(color_list, ',') t2 AS p2, color + ORDER BY color; + """ + + } finally { + try_sql("DROP FUNCTION IF EXISTS udtf_split(STRING, STRING);") + try_sql("DROP FUNCTION IF EXISTS udtf_range(INT, INT);") + try_sql("DROP FUNCTION IF EXISTS udtf_explode_array(ARRAY<INT>);") + try_sql("DROP TABLE IF EXISTS test_where_before;") + try_sql("DROP TABLE IF EXISTS dim_numbers;") + try_sql("DROP TABLE IF EXISTS fact_ranges;") + try_sql("DROP TABLE IF EXISTS fact_ranges_extended;") + try_sql("DROP TABLE IF EXISTS test_self_join;") + try_sql("DROP TABLE IF EXISTS test_group_by;") + try_sql("DROP TABLE IF EXISTS test_agg_numbers;") + try_sql("DROP TABLE IF EXISTS test_order_limit;") + try_sql("DROP TABLE IF EXISTS test_subquery;") + try_sql("DROP TABLE IF EXISTS test_distinct;") + try_sql("DROP TABLE IF EXISTS test_union_a;") + try_sql("DROP TABLE IF EXISTS test_union_b;") + try_sql("DROP TABLE IF EXISTS test_array_ops;") + try_sql("DROP TABLE IF EXISTS test_window;") + try_sql("DROP TABLE IF EXISTS test_case_when;") + try_sql("DROP TABLE IF EXISTS test_nested_2level;") + try_sql("DROP TABLE IF EXISTS test_parallel_lateral;") + try_sql("DROP TABLE IF EXISTS test_nested_join_base;") + try_sql("DROP TABLE IF EXISTS dim_tag_info;") + try_sql("DROP TABLE IF EXISTS test_nested_groupby;") + try_sql("DROP TABLE IF EXISTS test_nested_3level;") + try_sql("DROP TABLE IF EXISTS test_nested_array_expansion;") + try_sql("DROP TABLE IF EXISTS test_nested_multifilter;") + try_sql("DROP TABLE IF EXISTS test_nested_distinct;") + } +}
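The module variant that follows resolves every "symbol" out of pyudtf.zip instead of an inline function body. Conceptually that is an ordinary zip import; a sketch of such a lookup, assuming the archive contains pyudtf_module/sql_integration_udtf.py (the helper name and loading strategy are illustrative assumptions, not the PR's actual loader):

    import importlib
    import sys

    def resolve_symbol(zip_path, symbol):
        # 'pyudtf_module.sql_integration_udtf.split_with_position' splits into a
        # module path and a function name; zipimport makes .zip entries importable.
        module_path, func_name = symbol.rsplit('.', 1)
        sys.path.insert(0, zip_path)
        try:
            return getattr(importlib.import_module(module_path), func_name)
        finally:
            sys.path.remove(zip_path)

    # fn = resolve_symbol('pyudtf.zip', 'pyudtf_module.sql_integration_udtf.split_with_position')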
diff --git a/regression-test/suites/pythonudtf_p0/test_pythonudtf_sql_integration_module.groovy b/regression-test/suites/pythonudtf_p0/test_pythonudtf_sql_integration_module.groovy new file mode 100644 index 00000000000000..b23d89e5a21357 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/test_pythonudtf_sql_integration_module.groovy @@ -0,0 +1,984 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_pythonudtf_sql_integration_module") { + // Test Python UDTF Integration with SQL Operations + // Coverage: WHERE, JOIN, GROUP BY, ORDER BY, LIMIT, Subqueries, DISTINCT, UNION, + // Window Functions, CASE WHEN, and nested LATERAL VIEWs + + def pyPath = """${context.file.parent}/udtf_scripts/pyudtf.zip""" + scp_udf_file_to_all_be(pyPath) + def runtime_version = "3.8.10" + log.info("Python zip path: ${pyPath}".toString()) + + try { + // ======================================== + // Prepare Common UDTF Functions + // ======================================== + + // Helper UDTF: Split string into multiple records + sql """ DROP FUNCTION IF EXISTS udtf_split_module(STRING, STRING); """ + sql """ + CREATE TABLES FUNCTION udtf_split_module(STRING, STRING) + RETURNS ARRAY<STRUCT<position:INT, value:STRING>> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.sql_integration_udtf.split_with_position", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + // Helper UDTF: Generate number sequence + sql """ DROP FUNCTION IF EXISTS udtf_range_module(INT, INT); """ + sql """ + CREATE TABLES FUNCTION udtf_range_module(INT, INT) + RETURNS ARRAY<INT> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.sql_integration_udtf.generate_range", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + // Helper UDTF: Expand array elements + sql """ DROP FUNCTION IF EXISTS udtf_explode_array_module(ARRAY<INT>); """ + sql """ + CREATE TABLES FUNCTION udtf_explode_array_module(ARRAY<INT>) + RETURNS ARRAY<STRUCT<element:INT, element_index:INT>> + PROPERTIES ( + "file" = "file://${pyPath}", + "symbol" = "pyudtf_module.sql_integration_udtf.explode_with_index", + "type" = "PYTHON_UDF", + "runtime_version" = "${runtime_version}" + ); + """ + + // ======================================== + // Section 1: UDTF with WHERE Clause + // ======================================== + + // Test 1.1: Filter BEFORE UDTF (reduce input) + sql """ DROP TABLE IF EXISTS test_where_before_module; """ + sql """ + CREATE TABLE test_where_before_module ( + id INT, + category STRING, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_where_before_module VALUES + (1, 'A', 'apple,banana'), + (2, 'B', 'cat,dog'), + (3, 'A', 'red,green,blue'), + (4, 'C', 'one,two'); + """ + + qt_where_before """ + SELECT id, category, tmp.position, tmp.value + FROM test_where_before_module + LATERAL VIEW udtf_split_module(data, ',') tmp AS position, value + WHERE category = 'A' + ORDER BY id, tmp.position; + """ + + // Test 1.2: Filter AFTER UDTF (filter expanded results) + qt_where_after """ + SELECT id, tmp.position, tmp.value + FROM test_where_before_module + LATERAL VIEW udtf_split_module(data, ',') tmp AS position, value + WHERE tmp.value LIKE '%e%' + ORDER BY id, tmp.position; + """ + + // Test
1.3: Combined Filter (before and after UDTF) + qt_where_combined """ + SELECT id, category, tmp.value + FROM test_where_before_module + LATERAL VIEW udtf_split_module(data, ',') tmp AS position, value + WHERE category IN ('A', 'B') AND tmp.position = 0 + ORDER BY id; + """ + + // ======================================== + // Section 2: UDTF with JOIN Operations + // ======================================== + + // Prepare dimension table + sql """ DROP TABLE IF EXISTS dim_numbers_module; """ + sql """ + CREATE TABLE dim_numbers_module ( + num INT, + num_name STRING, + is_even BOOLEAN + ) ENGINE=OLAP + DUPLICATE KEY(num) + DISTRIBUTED BY HASH(num) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO dim_numbers_module VALUES + (1, 'one', false), + (2, 'two', true), + (3, 'three', false), + (4, 'four', true), + (5, 'five', false); + """ + + // Prepare fact table + sql """ DROP TABLE IF EXISTS fact_ranges_module; """ + sql """ + CREATE TABLE fact_ranges_module ( + id INT, + start_num INT, + end_num INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO fact_ranges_module VALUES + (1, 1, 3), + (2, 2, 4); + """ + + // Test 2.1: INNER JOIN with UDTF + qt_join_inner """ + SELECT + f.id, + tmp.num, + d.num_name, + d.is_even + FROM fact_ranges_module f + LATERAL VIEW udtf_range_module(f.start_num, f.end_num) tmp AS num + INNER JOIN dim_numbers_module d ON tmp.num = d.num + ORDER BY f.id, tmp.num; + """ + + // Test 2.2: LEFT JOIN with UDTF (some generated values may not match) + sql """ DROP TABLE IF EXISTS fact_ranges_extended_module; """ + sql """ + CREATE TABLE fact_ranges_extended_module ( + id INT, + start_num INT, + end_num INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO fact_ranges_extended_module VALUES + (1, 1, 2), + (2, 5, 7); + """ + + qt_join_left """ + SELECT + f.id, + tmp.num, + d.num_name + FROM fact_ranges_extended_module f + LATERAL VIEW udtf_range_module(f.start_num, f.end_num) tmp AS num + LEFT JOIN dim_numbers_module d ON tmp.num = d.num + ORDER BY f.id, tmp.num; + """ + + // Test 2.3: Self-JOIN through UDTF + sql """ DROP TABLE IF EXISTS test_self_join_module; """ + sql """ + CREATE TABLE test_self_join_module ( + id INT, + value_list STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_self_join_module VALUES + (1, '10,20,30'), + (2, '20,30,40'); + """ + + qt_join_self """ + SELECT + t1.id AS id1, + value1, + t2.id AS id2, + value2 + FROM test_self_join_module t1 + LATERAL VIEW udtf_split_module(t1.value_list, ',') tmp1 AS pos1, value1 + INNER JOIN test_self_join_module t2 + LATERAL VIEW udtf_split_module(t2.value_list, ',') tmp2 AS pos2, value2 + ON value1 = value2 AND t1.id < t2.id + ORDER BY t1.id, value1, t2.id; + """ + + // ======================================== + // Section 3: UDTF with GROUP BY and Aggregation + // ======================================== + + sql """ DROP TABLE IF EXISTS test_group_by_module; """ + sql """ + CREATE TABLE test_group_by_module ( + id INT, + category STRING, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_group_by_module VALUES + (1, 'fruit', 'apple,banana,apple'), + (2, 'fruit', 'banana,cherry'), + (3, 'animal', 
'cat,dog,cat'); + """ + + // Test 3.1: GROUP BY after UDTF expansion + qt_group_by_udtf """ + SELECT + tmp.value AS tag, + COUNT(*) AS occurrence_count + FROM test_group_by_module + LATERAL VIEW udtf_split_module(tags, ',') tmp AS position, value + GROUP BY tmp.value + ORDER BY occurrence_count DESC, tag; + """ + + // Test 3.2: GROUP BY with original table columns + qt_group_by_mixed """ + SELECT + category, + tmp.value AS tag, + COUNT(*) AS tag_count + FROM test_group_by_module + LATERAL VIEW udtf_split_module(tags, ',') tmp AS position, value + GROUP BY category, tmp.value + ORDER BY category, tag_count DESC, tag; + """ + + // Test 3.3: Aggregation with HAVING clause + qt_group_by_having """ + SELECT + tmp.value AS tag, + COUNT(*) AS cnt + FROM test_group_by_module + LATERAL VIEW udtf_split_module(tags, ',') tmp AS position, value + GROUP BY tmp.value + HAVING COUNT(*) > 1 + ORDER BY cnt DESC, tag; + """ + + // Test 3.4: Multiple aggregation functions + sql """ DROP TABLE IF EXISTS test_agg_numbers_module; """ + sql """ + CREATE TABLE test_agg_numbers_module ( + id INT, + start_val INT, + end_val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_agg_numbers_module VALUES + (1, 1, 5), + (2, 3, 7), + (3, 10, 12); + """ + + qt_group_by_multi_agg """ + SELECT + id, + COUNT(*) AS total_count, + MIN(tmp.num) AS min_num, + MAX(tmp.num) AS max_num, + SUM(tmp.num) AS sum_num, + AVG(tmp.num) AS avg_num + FROM test_agg_numbers_module + LATERAL VIEW udtf_range_module(start_val, end_val) tmp AS num + GROUP BY id + ORDER BY id; + """ + + // ======================================== + // Section 4: UDTF with ORDER BY and LIMIT + // ======================================== + + sql """ DROP TABLE IF EXISTS test_order_limit_module; """ + sql """ + CREATE TABLE test_order_limit_module ( + id INT, + name STRING, + scores STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_order_limit_module VALUES + (1, 'Alice', '85,92,78'), + (2, 'Bob', '90,88,95'), + (3, 'Charlie', '70,82,88'); + """ + + // Test 4.1: ORDER BY UDTF output + qt_order_by_udtf """ + SELECT + id, + name, + tmp.value AS score + FROM test_order_limit_module + LATERAL VIEW udtf_split_module(scores, ',') tmp AS position, value + ORDER BY CAST(tmp.value AS INT) DESC, name + LIMIT 5; + """ + + // Test 4.2: ORDER BY original and UDTF columns + qt_order_by_mixed """ + SELECT + id, + name, + tmp.position, + tmp.value AS score + FROM test_order_limit_module + LATERAL VIEW udtf_split_module(scores, ',') tmp AS position, value + ORDER BY id ASC, tmp.position DESC; + """ + + // Test 4.3: LIMIT without ORDER BY + qt_limit_only """ + SELECT + id, + tmp.value + FROM test_order_limit_module + LATERAL VIEW udtf_split_module(scores, ',') tmp AS position, value + LIMIT 3; + """ + + // Test 4.4: TOP-N pattern (ORDER BY + LIMIT per group) + qt_top_n_pattern """ + SELECT id, name, score + FROM ( + SELECT + id, + name, + CAST(tmp.value AS INT) AS score, + ROW_NUMBER() OVER (PARTITION BY id ORDER BY CAST(tmp.value AS INT) DESC) AS rn + FROM test_order_limit_module + LATERAL VIEW udtf_split_module(scores, ',') tmp AS position, value + ) ranked + WHERE rn <= 2 + ORDER BY id, score DESC; + """ + + // ======================================== + // Section 5: UDTF in Subqueries + // ======================================== + + sql """ DROP TABLE IF EXISTS 
test_subquery_module; """ + sql """ + CREATE TABLE test_subquery_module ( + id INT, + item_list STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_subquery_module VALUES + (1, 'A,B,C'), + (2, 'B,C,D'), + (3, 'A,C,E'); + """ + + // Test 5.1: UDTF in WHERE IN subquery + qt_subquery_in """ + SELECT id, item_list + FROM test_subquery_module + WHERE id IN ( + SELECT DISTINCT id + FROM test_subquery_module + LATERAL VIEW udtf_split_module(item_list, ',') tmp AS position, value + WHERE tmp.value = 'A' + ) + ORDER BY id; + """ + + // Test 5.2: UDTF in FROM subquery + qt_subquery_from """ + SELECT + item, + COUNT(DISTINCT source_id) AS source_count + FROM ( + SELECT id AS source_id, tmp.value AS item + FROM test_subquery_module + LATERAL VIEW udtf_split_module(item_list, ',') tmp AS position, value + ) expanded + GROUP BY item + ORDER BY source_count DESC, item; + """ + + // Test 5.3: Nested subqueries with UDTF + qt_subquery_nested """ + SELECT item, total_occurrences + FROM ( + SELECT item, COUNT(*) AS total_occurrences + FROM ( + SELECT id, tmp.value AS item + FROM test_subquery_module + LATERAL VIEW udtf_split_module(item_list, ',') tmp AS position, value + ) level1 + GROUP BY item + ) level2 + WHERE total_occurrences >= 2 + ORDER BY total_occurrences DESC, item; + """ + + // ======================================== + // Section 6: UDTF with DISTINCT + // ======================================== + + sql """ DROP TABLE IF EXISTS test_distinct_module; """ + sql """ + CREATE TABLE test_distinct_module ( + id INT, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_distinct_module VALUES + (1, 'red,blue,red'), + (2, 'blue,green'), + (3, 'red,yellow'); + """ + + // Test 6.1: DISTINCT on UDTF output + qt_distinct_udtf """ + SELECT DISTINCT tmp.value AS tag + FROM test_distinct_module + LATERAL VIEW udtf_split_module(tags, ',') tmp AS position, value + ORDER BY tag; + """ + + // Test 6.2: COUNT DISTINCT + qt_count_distinct """ + SELECT COUNT(DISTINCT tmp.value) AS unique_tag_count + FROM test_distinct_module + LATERAL VIEW udtf_split_module(tags, ',') tmp AS position, value; + """ + + // ======================================== + // Section 7: UDTF with UNION + // ======================================== + + sql """ DROP TABLE IF EXISTS test_union_a_module; """ + sql """ + CREATE TABLE test_union_a_module ( + id INT, + items STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_union_a_module VALUES (1, 'X,Y'); + """ + + sql """ DROP TABLE IF EXISTS test_union_b_module; """ + sql """ + CREATE TABLE test_union_b_module ( + id INT, + items STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_union_b_module VALUES (2, 'Y,Z'); + """ + + // Test 7.1: UNION ALL with UDTF + qt_union_all """ + SELECT id, tmp.value AS item + FROM test_union_a_module + LATERAL VIEW udtf_split_module(items, ',') tmp AS position, value + UNION ALL + SELECT id, tmp.value AS item + FROM test_union_b_module + LATERAL VIEW udtf_split_module(items, ',') tmp AS position, value + ORDER BY id, item; + """ + + // Test 7.2: UNION (removes duplicates) + qt_union_distinct """ + SELECT tmp.value AS item + FROM test_union_a_module + 
LATERAL VIEW udtf_split_module(items, ',') tmp AS position, value
+ UNION
+ SELECT tmp.value AS item
+ FROM test_union_b_module
+ LATERAL VIEW udtf_split_module(items, ',') tmp AS position, value
+ ORDER BY item;
+ """
+
+ // ========================================
+ // Section 8: UDTF with Complex Array Operations
+ // ========================================
+
+ sql """ DROP TABLE IF EXISTS test_array_ops_module; """
+ sql """
+ CREATE TABLE test_array_ops_module (
+ id INT,
+ numbers ARRAY<INT>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_array_ops_module VALUES
+ (1, [1, 2, 3]),
+ (2, [2, 3, 4, 5]),
+ (3, [3, 4]);
+ """
+
+ // Test 8.1: Filter array elements through UDTF
+ qt_array_filter """
+ SELECT
+ id,
+ tmp.element
+ FROM test_array_ops_module
+ LATERAL VIEW udtf_explode_array_module(numbers) tmp AS element, element_index
+ WHERE tmp.element > 2
+ ORDER BY id, tmp.element;
+ """
+
+ // Test 8.2: Aggregate array elements
+ qt_array_aggregate """
+ SELECT
+ id,
+ COUNT(*) AS element_count,
+ SUM(tmp.element) AS element_sum,
+ AVG(tmp.element) AS element_avg
+ FROM test_array_ops_module
+ LATERAL VIEW udtf_explode_array_module(numbers) tmp AS element, element_index
+ GROUP BY id
+ ORDER BY id;
+ """
+
+ // ========================================
+ // Section 9: UDTF with Window Functions
+ // ========================================
+
+ sql """ DROP TABLE IF EXISTS test_window_module; """
+ sql """
+ CREATE TABLE test_window_module (
+ id INT,
+ category STRING,
+ value_list STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_window_module VALUES
+ (1, 'A', '10,20,30'),
+ (2, 'A', '15,25'),
+ (3, 'B', '5,10,15');
+ """
+
+ // Test 9.1: Window function over UDTF results
+ qt_window_function """
+ SELECT
+ id,
+ category,
+ CAST(tmp.value AS INT) AS val,
+ ROW_NUMBER() OVER (PARTITION BY category ORDER BY CAST(tmp.value AS INT)) AS rn,
+ SUM(CAST(tmp.value AS INT)) OVER (PARTITION BY category) AS category_total
+ FROM test_window_module
+ LATERAL VIEW udtf_split_module(value_list, ',') tmp AS position, value
+ ORDER BY category, val;
+ """
+
+ // ========================================
+ // Section 10: UDTF with CASE WHEN
+ // ========================================
+
+ sql """ DROP TABLE IF EXISTS test_case_when_module; """
+ sql """
+ CREATE TABLE test_case_when_module (
+ id INT,
+ data STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_case_when_module VALUES
+ (1, '5,15,25'),
+ (2, '10,20,30');
+ """
+
+ // Test 10.1: CASE WHEN on UDTF results
+ qt_case_when """
+ SELECT
+ id,
+ tmp.value,
+ CASE
+ WHEN CAST(tmp.value AS INT) < 10 THEN 'small'
+ WHEN CAST(tmp.value AS INT) < 20 THEN 'medium'
+ ELSE 'large'
+ END AS size_category
+ FROM test_case_when_module
+ LATERAL VIEW udtf_split_module(data, ',') tmp AS position, value
+ ORDER BY id, CAST(tmp.value AS INT);
+ """
+
+ // ========================================
+ // Section 11: Multiple LATERAL VIEW Nesting
+ // ========================================
+
+ // Test 11.1: Two-level LATERAL VIEW nesting (sequential)
+ sql """ DROP TABLE IF EXISTS test_nested_2level_module; """
+ sql """
+ CREATE TABLE test_nested_2level_module (
+ id INT,
+ categories STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id)
BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_2level_module VALUES + (1, 'A:1,2|B:3'), + (2, 'C:4,5'); + """ + + qt_nested_2level """ + SELECT + id, + cat, + CAST(num AS INT) as num + FROM test_nested_2level_module + LATERAL VIEW udtf_split_module(categories, '|') t1 AS p1, cat_nums + LATERAL VIEW udtf_split_module(cat_nums, ':') t2 AS p2, cat + LATERAL VIEW udtf_split_module(cat, ',') t3 AS p3, num + WHERE p2 = 1 + ORDER BY id, cat, num; + """ + + // Test 11.2: Parallel LATERAL VIEWs (cartesian product) + sql """ DROP TABLE IF EXISTS test_parallel_lateral_module; """ + sql """ + CREATE TABLE test_parallel_lateral_module ( + id INT, + list1 STRING, + list2 STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_parallel_lateral_module VALUES + (1, 'A,B', 'X,Y'), + (2, 'C', 'Z'); + """ + + qt_parallel_lateral """ + SELECT + id, + item1, + item2 + FROM test_parallel_lateral_module + LATERAL VIEW udtf_split_module(list1, ',') t1 AS p1, item1 + LATERAL VIEW udtf_split_module(list2, ',') t2 AS p2, item2 + ORDER BY id, item1, item2; + """ + + // Test 11.3: Nested LATERAL VIEW with JOIN + sql """ DROP TABLE IF EXISTS test_nested_join_base_module; """ + sql """ + CREATE TABLE test_nested_join_base_module ( + user_id INT, + tags STRING + ) ENGINE=OLAP + DUPLICATE KEY(user_id) + DISTRIBUTED BY HASH(user_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_join_base_module VALUES + (1, 'sports:soccer,tennis|food:pizza'), + (2, 'music:rock'); + """ + + sql """ DROP TABLE IF EXISTS dim_tag_info_module; """ + sql """ + CREATE TABLE dim_tag_info_module ( + tag VARCHAR(50), + score INT + ) ENGINE=OLAP + DUPLICATE KEY(tag) + DISTRIBUTED BY HASH(tag) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO dim_tag_info_module VALUES + ('soccer', 10), + ('tennis', 8), + ('pizza', 5), + ('rock', 9); + """ + + qt_nested_join """ + SELECT + u.user_id, + tag_name, + d.score + FROM test_nested_join_base_module u + LATERAL VIEW udtf_split_module(u.tags, '|') t1 AS p1, cat_tags + LATERAL VIEW udtf_split_module(cat_tags, ':') t2 AS p2, part + LATERAL VIEW udtf_split_module(part, ',') t3 AS p3, tag_name + INNER JOIN dim_tag_info_module d ON d.tag = tag_name + WHERE p2 = 1 + ORDER BY u.user_id, d.score DESC; + """ + + // Test 11.4: Nested LATERAL VIEW with GROUP BY aggregation + sql """ DROP TABLE IF EXISTS test_nested_groupby_module; """ + sql """ + CREATE TABLE test_nested_groupby_module ( + store_id INT, + sales_data STRING + ) ENGINE=OLAP + DUPLICATE KEY(store_id) + DISTRIBUTED BY HASH(store_id) BUCKETS 1 + PROPERTIES("replication_num" = "1"); + """ + + sql """ + INSERT INTO test_nested_groupby_module VALUES + (1, 'day1:100,200|day2:150'), + (2, 'day1:300|day2:250,100'); + """ + + qt_nested_groupby """ + SELECT + store_id, + COUNT(*) as sale_count, + SUM(CAST(amount AS INT)) as total_amount + FROM test_nested_groupby_module + LATERAL VIEW udtf_split_module(sales_data, '|') t1 AS p1, day_amounts + LATERAL VIEW udtf_split_module(day_amounts, ':') t2 AS p2, part + LATERAL VIEW udtf_split_module(part, ',') t3 AS p3, amount + WHERE p2 = 1 + GROUP BY store_id + ORDER BY store_id; + """ + + // Test 11.5: Three-level deep nesting + sql """ DROP TABLE IF EXISTS test_nested_3level_module; """ + sql """ + CREATE TABLE test_nested_3level_module ( + id INT, + data STRING + ) ENGINE=OLAP + DUPLICATE KEY(id) + 
DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_nested_3level_module VALUES
+ (1, 'A,B,C|D,E|F');
+ """
+
+ qt_nested_3level """
+ SELECT
+ id,
+ grp_pos,
+ item
+ FROM test_nested_3level_module
+ LATERAL VIEW udtf_split_module(data, '|') t1 AS grp_pos, group_items
+ LATERAL VIEW udtf_split_module(group_items, ',') t2 AS item_pos, item
+ ORDER BY id, grp_pos, item_pos;
+ """
+
+ // Test 11.6: Nested with array expansion
+ sql """ DROP TABLE IF EXISTS test_nested_array_expansion_module; """
+ sql """
+ CREATE TABLE test_nested_array_expansion_module (
+ id INT,
+ group_id INT,
+ numbers ARRAY<INT>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_nested_array_expansion_module VALUES
+ (1, 1, [10, 20]),
+ (1, 2, [30]),
+ (2, 1, [40, 50]);
+ """
+
+ qt_nested_array_expansion """
+ SELECT
+ id,
+ group_id,
+ element
+ FROM test_nested_array_expansion_module
+ LATERAL VIEW udtf_explode_array_module(numbers) t1 AS element, idx
+ ORDER BY id, group_id, element;
+ """
+
+ // Test 11.7: Nested with WHERE filtering at multiple levels
+ sql """ DROP TABLE IF EXISTS test_nested_multifilter_module; """
+ sql """
+ CREATE TABLE test_nested_multifilter_module (
+ id INT,
+ data STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_nested_multifilter_module VALUES
+ (1, 'A:10,20,30|B:40'),
+ (2, 'C:50,60');
+ """
+
+ qt_nested_multifilter """
+ SELECT
+ id,
+ cat_name,
+ CAST(num AS INT) as num
+ FROM (
+ SELECT
+ id,
+ p1,
+ CASE WHEN p2 = 0 THEN part END AS cat_name,
+ CASE WHEN p2 = 1 THEN part END AS nums
+ FROM test_nested_multifilter_module
+ LATERAL VIEW udtf_split_module(data, '|') t1 AS p1, cat_nums
+ LATERAL VIEW udtf_split_module(cat_nums, ':') t2 AS p2, part
+ ) t
+ LATERAL VIEW udtf_split_module(nums, ',') t3 AS p3, num
+ WHERE nums IS NOT NULL AND CAST(num AS INT) >= 20
+ ORDER BY id, p1, num;
+ """
+
+ // Test 11.8: Nested with DISTINCT across levels
+ sql """ DROP TABLE IF EXISTS test_nested_distinct_module; """
+ sql """
+ CREATE TABLE test_nested_distinct_module (
+ id INT,
+ tags STRING
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_nested_distinct_module VALUES
+ (1, 'red,blue|red,green'),
+ (2, 'blue,yellow');
+ """
+
+ qt_nested_distinct """
+ SELECT DISTINCT color
+ FROM test_nested_distinct_module
+ LATERAL VIEW udtf_split_module(tags, '|') t1 AS p1, color_list
+ LATERAL VIEW udtf_split_module(color_list, ',') t2 AS p2, color
+ ORDER BY color;
+ """
+
+ } finally {
+ try_sql("DROP FUNCTION IF EXISTS udtf_split_module(STRING, STRING);")
+ try_sql("DROP FUNCTION IF EXISTS udtf_range_module(INT, INT);")
+ try_sql("DROP FUNCTION IF EXISTS udtf_explode_array_module(ARRAY<INT>);")
+ try_sql("DROP TABLE IF EXISTS test_where_before_module;")
+ try_sql("DROP TABLE IF EXISTS dim_numbers_module;")
+ try_sql("DROP TABLE IF EXISTS fact_ranges_module;")
+ try_sql("DROP TABLE IF EXISTS fact_ranges_extended_module;")
+ try_sql("DROP TABLE IF EXISTS test_self_join_module;")
+ try_sql("DROP TABLE IF EXISTS test_group_by_module;")
+ try_sql("DROP TABLE IF EXISTS test_agg_numbers_module;")
+ try_sql("DROP TABLE IF EXISTS test_order_limit_module;")
+ try_sql("DROP TABLE IF EXISTS test_subquery_module;")
+ try_sql("DROP TABLE IF EXISTS
test_distinct_module;") + try_sql("DROP TABLE IF EXISTS test_union_a_module;") + try_sql("DROP TABLE IF EXISTS test_union_b_module;") + try_sql("DROP TABLE IF EXISTS test_array_ops_module;") + try_sql("DROP TABLE IF EXISTS test_window_module;") + try_sql("DROP TABLE IF EXISTS test_case_when_module;") + try_sql("DROP TABLE IF EXISTS test_nested_2level_module;") + try_sql("DROP TABLE IF EXISTS test_parallel_lateral_module;") + try_sql("DROP TABLE IF EXISTS test_nested_join_base_module;") + try_sql("DROP TABLE IF EXISTS dim_tag_info_module;") + try_sql("DROP TABLE IF EXISTS test_nested_groupby_module;") + try_sql("DROP TABLE IF EXISTS test_nested_3level_module;") + try_sql("DROP TABLE IF EXISTS test_nested_array_expansion_module;") + try_sql("DROP TABLE IF EXISTS test_nested_multifilter_module;") + try_sql("DROP TABLE IF EXISTS test_nested_distinct_module;") + } +} diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip new file mode 100644 index 00000000000000..f04942c97849c9 Binary files /dev/null and b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf.zip differ diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/basic_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/basic_udtf.py new file mode 100644 index 00000000000000..c4ece6f1b71e38 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/basic_udtf.py @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
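+
+# Illustrative note (not part of the shipped test assets): each UDTF below is a
+# plain Python generator that yields one tuple per output row, so it can be
+# smoke-tested locally without a Doris cluster, e.g.:
+#   >>> list(split_string_udtf('a, b ,c'))
+#   [('a',), ('b',), ('c',)]
+#   >>> list(generate_series_udtf(1, 3))
+#   [(1,), (2,), (3,)]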
+ +"""Basic UDTF implementations - copied from inline tests""" + +import json + + +def split_string_udtf(input_str): + '''Split comma-separated string into rows''' + if input_str: + parts = input_str.split(',') + for part in parts: + yield (part.strip(),) + + +def generate_series_udtf(start, end): + '''Generate integer series from start to end''' + if start is not None and end is not None: + for i in range(start, end + 1): + yield (i,) + + +def running_sum_udtf(value): + '''Return value with itself as cumulative sum (stateless)''' + # Note: Function-based UDTF cannot maintain state + # This is simplified to return (value, value) + if value is not None: + yield (value, value) + + +def explode_json_udtf(json_str): + '''Explode JSON ARRAY into rows''' + if json_str: + try: + data = json.loads(json_str) + if isinstance(data, list): + for item in data: + yield (str(item),) + except: + pass # Skip invalid JSON + + +def top_n_udtf(value, n): + '''Return single value with rank 1 (stateless)''' + # Without state, each row is independent + if value is not None and n is not None and n > 0: + yield (value, 1) + + +def duplicate_udtf(text, n): + '''Duplicate input text N times''' + if text and n: + for i in range(n): + yield (text, i + 1) + + +def filter_positive_udtf(value): + '''Only output positive values''' + if value is not None and value > 0: + yield (value,) + # If value <= 0, don't yield (skip this row) + + +def cartesian_udtf(list1, list2): + '''Generate cartesian product of two comma-separated lists''' + if list1 and list2: + items1 = [x.strip() for x in list1.split(',')] + items2 = [y.strip() for y in list2.split(',')] + + for x in items1: + for y in items2: + yield (x, y) + + +def filter_negative_udtf(value): + '''Only output negative values (filter all positive numbers)''' + if value is not None and value < 0: + yield (value,) + # For positive numbers, don't yield anything + diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/data_types_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/data_types_udtf.py new file mode 100644 index 00000000000000..8a32317c44bb7f --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/data_types_udtf.py @@ -0,0 +1,191 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
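+
+# Illustrative note (not part of the shipped test assets): these functions assume
+# the Python UDTF runtime passes SQL values as their natural Python counterparts
+# (int, float, bool, str, datetime.date/datetime.datetime, list for ARRAY, dict
+# for STRUCT), so they can be exercised directly, e.g.:
+#   >>> list(process_array_int([10, None, 20]))
+#   [(0, 10, 20), (2, 20, 40)]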
+ +"""Data type handling UDTF implementations - copied from inline tests""" + +import math +import json +from decimal import Decimal + + +def process_tinyint(v): + '''Process TINYINT: test small integer range''' + if v is not None: + yield (v, v * 2) + + +def process_smallint(v): + '''Process SMALLINT: test medium integer range''' + if v is not None: + yield (v, v * v) + + +def process_bigint(v): + '''Process BIGINT: test large integer range''' + if v is not None: + yield (v, v + 1) + + +def process_float(v): + '''Process FLOAT: test floating point numbers''' + if v is not None: + yield (v, v / 2.0) + + +def process_double(v): + '''Process DOUBLE: test high precision floating point''' + if v is not None and v >= 0: + yield (v, math.sqrt(v)) + + +def process_boolean(v): + '''Process BOOLEAN: test true/false values''' + if v is not None: + yield (v, not v, 'TRUE' if v else 'FALSE') + + +def process_string(v): + '''Process STRING: test text manipulation''' + if v is not None: + yield (v, len(v), v.upper(), v.lower()) + + +def process_date(v): + '''Process DATE: extract date components''' + if v is not None: + # v is a datetime.date object + yield (v, v.year, v.month, v.day) + + +def process_datetime(v): + '''Process DATETIME: extract time components''' + if v is not None: + # v is a datetime.datetime object + yield (v, v.hour, v.minute) + + +def process_array_int(arr): + '''Process ARRAY: explode array and process each element''' + if arr is not None: + for i, elem in enumerate(arr): + if elem is not None: + yield (i, elem, elem * 2) + + +def process_array_string(arr): + '''Process ARRAY: explode and get string lengths''' + if arr is not None: + for elem in arr: + if elem is not None: + yield (elem, len(elem)) + + +def process_struct(person): + '''Process STRUCT: access struct fields''' + if person is not None: + name = person['name'] if 'name' in person else None + age = person['age'] if 'age' in person else None + + if name is not None and age is not None: + category = 'child' if age < 18 else 'adult' + yield (name, age, category) + + +def process_multi_types(num, text): + '''Process multiple input types''' + if num is not None and text is not None: + yield (num, text, f"{text}_{num}") + + +def process_decimal(v): + '''Process DECIMAL: high precision arithmetic''' + if v is not None: + doubled = v * 2 + yield (v, doubled) + + +def process_map_string(map_str): + '''Process map-like string (key1:val1,key2:val2)''' + if map_str: + pairs = map_str.split(',') + for pair in pairs: + if ':' in pair: + k, val = pair.split(':', 1) + try: + yield (k.strip(), int(val.strip())) + except ValueError: + pass + + +def process_nested_array(nested_str): + '''Process nested array string ([[1,2],[3,4]])''' + if nested_str: + # Remove brackets and split by ],[ + nested_str = nested_str.strip('[]') + groups = nested_str.split('],[') + + for group_idx, group in enumerate(groups): + elements = group.strip('[]').split(',') + for elem in elements: + try: + yield (group_idx, int(elem.strip())) + except ValueError: + pass + + +def process_array_structs(data): + '''Process array of structs (name:age:score|name:age:score)''' + if data: + items = data.split('|') + for item in items: + parts = item.split(':') + if len(parts) == 3: + try: + yield (parts[0], int(parts[1]), int(parts[2])) + except ValueError: + pass + + +def process_struct_array(data): + '''Process struct with array (name:tag1,tag2,tag3)''' + if data and ':' in data: + name, tags = data.split(':', 1) + tag_list = tags.split(',') + yield (name, len(tag_list), 
','.join(tag_list)) + + +def extract_json_fields(json_str): + '''Extract JSON fields''' + if json_str: + try: + data = json.loads(json_str) + if isinstance(data, dict): + for k, v in data.items(): + yield (k, str(v)) + except: + pass + + +def process_complex_struct(data): + '''Process complex struct (id:name:city:zip)''' + if data: + parts = data.split(':') + if len(parts) == 4: + try: + yield (int(parts[0]), parts[1], parts[2], parts[3]) + except ValueError: + pass diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/edge_cases_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/edge_cases_udtf.py new file mode 100644 index 00000000000000..63515790f45c79 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/edge_cases_udtf.py @@ -0,0 +1,181 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Edge cases UDTF implementations""" + +import math + + +def handle_null_int(value): + '''Handle NULL integer values''' + if value is None: + yield (None, True, -1) # NULL indicator + else: + yield (value, False, value * 2) + + +def handle_null_string(value): + '''Distinguish NULL from empty string''' + if value is None: + yield ('NULL', -1) + elif value == '': + yield ('EMPTY', 0) + else: + yield ('NORMAL', len(value)) + + +def handle_empty_array(arr): + '''Handle NULL vs empty array''' + if arr is None: + yield ('NULL', -1) + elif len(arr) == 0: + yield ('EMPTY', 0) + else: + yield ('NORMAL', len(arr)) + + +def handle_null_struct(person): + '''Handle NULL fields in STRUCT''' + if person is None: + yield (False, False, 'struct_is_null') + else: + name = person.get('name') + age = person.get('age') + has_name = name is not None + has_age = age is not None + + if has_name and has_age: + summary = f"{name}_{age}" + elif has_name: + summary = f"{name}_no_age" + elif has_age: + summary = f"no_name_{age}" + else: + summary = "all_fields_null" + + yield (has_name, has_age, summary) + + +def process_empty_table(value): + '''This should never be called for empty table''' + if value is not None: + yield (value * 2,) + + +def process_single_row(value): + '''Process single row input''' + if value is not None: + for i in range(3): + yield (value, value + i) + + +def process_long_string(text): + '''Process very long string''' + if text is not None: + length = len(text) + first_10 = text[:10] if length >= 10 else text + last_10 = text[-10:] if length >= 10 else text + yield (length, first_10, last_10) + + +def process_large_array(arr): + '''Process large array - compute statistics instead of exploding''' + if arr is not None and len(arr) > 0: + total = len(arr) + total_sum = sum(x for x in arr if x is not None) + first = arr[0] if len(arr) > 0 else None + last = arr[-1] if len(arr) > 0 else None + yield 
(total, total_sum, first, last) + + +def output_explosion(n): + '''Generate many outputs from single input (controlled explosion)''' + if n is not None and 0 < n <= 100: # Safety limit + for i in range(n): + yield (i,) + + +def process_special_numbers(value): + '''Categorize special numeric values''' + INT_MIN = -2147483648 + INT_MAX = 2147483647 + + if value is None: + yield (None, 'NULL', False) + elif value == 0: + yield (value, 'ZERO', False) + elif value == INT_MIN or value == INT_MAX: + category = 'POSITIVE' if value > 0 else 'NEGATIVE' + yield (value, category, True) # is_boundary = True + elif value > 0: + yield (value, 'POSITIVE', False) + else: + yield (value, 'NEGATIVE', False) + + +def process_special_doubles(value): + '''Classify special double values''' + if value is None: + yield (None, 'NULL') + elif math.isnan(value): + yield (value, 'NAN') + elif math.isinf(value): + if value > 0: + yield (value, 'POSITIVE_INF') + else: + yield (value, 'NEGATIVE_INF') + elif value == 0.0: + yield (value, 'ZERO') + elif abs(value) < 1e-10: + yield (value, 'VERY_SMALL') + elif abs(value) > 1e10: + yield (value, 'VERY_LARGE') + else: + yield (value, 'NORMAL') + + +def process_special_strings(text): + '''Process strings with special characters''' + if text is None: + yield (0, False, 'NULL') + elif text == '': + yield (0, False, 'EMPTY') + else: + length = len(text) + has_special = any(ord(c) > 127 for c in text) + + if has_special: + desc = 'HAS_UNICODE' + elif any(c in text for c in ['\n', '\t', '\r']): + desc = 'HAS_WHITESPACE' + elif any(c in text for c in ['!', '@', '#', '$', '%']): + desc = 'HAS_SYMBOLS' + else: + desc = 'NORMAL' + + yield (length, has_special, desc) + + +def process_boundary_dates(dt): + '''Process boundary date values''' + if dt is None: + yield (None, 0, False) + else: + year = dt.year + # Check if it's a boundary date + is_boundary = year in [1970, 9999] or (year == 1970 and dt.month == 1 and dt.day == 1) + yield (dt, year, is_boundary) diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py new file mode 100644 index 00000000000000..b663c7aa878dc7 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/exceptions_udtf.py @@ -0,0 +1,213 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
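+
+# Illustrative note (not part of the shipped test assets): the functions below
+# are written to fold failures into an extra status column instead of raising,
+# so every input row still yields a diagnosable output row, e.g.:
+#   >>> list(safe_divide(10, 0))
+#   [(10, 0, None, 'division_by_zero')]
+#   >>> list(safe_divide(10, 4))
+#   [(10, 4, 2.5, 'success')]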
+ +"""Exception handling UDTF implementations""" + +import math + + +def safe_divide(a, b): + '''Safe division with error handling''' + try: + if b == 0: + yield (a, b, None, 'division_by_zero') + else: + result = a / b + yield (a, b, result, 'success') + except Exception as e: + yield (a, b, None, f'error_{type(e).__name__}') + + +def check_overflow(value): + '''Check for potential overflow in operations''' + if value is None: + yield (None, None, 'null_input') + else: + # BIGINT range: -2^63 to 2^63-1 + MAX_BIGINT = 9223372036854775807 + MIN_BIGINT = -9223372036854775808 + + doubled = value * 2 + + # Check if doubled value is within safe range + if doubled > MAX_BIGINT or doubled < MIN_BIGINT: + yield (value, None, 'would_overflow') + else: + yield (value, doubled, 'safe') + + +def parse_number(text): + '''Parse string to number with error handling''' + if text is None: + yield (None, None, False) + else: + try: + num = float(text) + yield (text, num, True) + except ValueError: + yield (text, None, False) + + +def check_type(value): + '''Check and report value type''' + type_name = type(value).__name__ + + if value is None: + yield (None, 'NoneType', 0) + elif isinstance(value, str): + yield (value, type_name, len(value)) + else: + # Unexpected type - convert to string + yield (str(value), type_name, len(str(value))) + + +def safe_array_access(arr, position): + '''Safe array element access''' + if arr is None: + yield (0, position, None, 'null_array') + elif len(arr) == 0: + yield (0, position, None, 'empty_array') + elif position < 0 or position >= len(arr): + yield (len(arr), position, None, 'out_of_bounds') + else: + yield (len(arr), position, arr[position], 'success') + + +def compute_stats(arr): + '''Compute statistics with empty array handling''' + if arr is None: + yield (0, 0, 0.0, 'null_array') + elif len(arr) == 0: + yield (0, 0, 0.0, 'empty_array') + else: + count = len(arr) + total = sum(x for x in arr if x is not None) + avg = total / count if count > 0 else 0.0 + yield (count, total, avg, 'computed') + + +def access_struct_fields(person): + '''Safe STRUCT field access''' + if person is None: + yield (False, False, None, None) + else: + # Use .get() to safely access dictionary keys + name = person.get('name') + age = person.get('age') + + has_name = name is not None + has_age = age is not None + + yield (has_name, has_age, name, age) + + +def slice_string(text, start, end): + '''Safe string slicing''' + if text is None: + yield (None, start, end, None, 'null_string') + elif start is None or end is None: + yield (text, start, end, None, 'null_index') + else: + length = len(text) + + # Clamp indices to valid range + safe_start = max(0, min(start, length)) + safe_end = max(0, min(end, length)) + + if safe_start >= safe_end: + yield (text, start, end, '', 'empty_slice') + else: + result = text[safe_start:safe_end] + yield (text, start, end, result, 'success') + + +def check_text_encoding(text): + '''Check string encoding properties''' + if text is None: + yield (None, 0, 0, False) + else: + byte_len = len(text.encode('utf-8')) + char_len = len(text) + has_unicode = byte_len > char_len + + yield (text, byte_len, char_len, has_unicode) + + +def process_conditional(value): + '''Process value based on multiple conditions''' + if value is None: + yield (None, 'null', 0) + elif value < 0: + # For negative: take absolute value + yield (value, 'negative', abs(value)) + elif value == 0: + # Zero case: return 1 + yield (value, 'zero', 1) + elif value > 0 and value <= 100: + # Small 
positive: double it + yield (value, 'small_positive', value * 2) + else: + # Large positive: return as-is + yield (value, 'large_positive', value) + + +def conditional_yield(value): + '''Only yield for even positive numbers''' + if value is not None and value > 0 and value % 2 == 0: + yield (value,) + # For other cases, yield nothing (filter out) + + +def classify_number_range(value): + '''Classify number by magnitude''' + if value is None: + yield (None, 'null', True) + elif math.isnan(value): + yield (value, 'nan', False) + elif math.isinf(value): + yield (value, 'infinity', False) + elif value == 0.0: + yield (value, 'zero', True) + elif abs(value) < 1e-100: + yield (value, 'extremely_small', True) + elif abs(value) > 1e100: + yield (value, 'extremely_large', True) + elif abs(value) < 1.0: + yield (value, 'small', True) + else: + yield (value, 'normal', True) + + +def validate_date(dt): + '''Validate and classify dates''' + if dt is None: + yield (None, 0, False, 'null_date') + else: + year = dt.year + + # Check if leap year + is_leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0) + + # Classify date + if year < 1900: + status = 'very_old' + elif year > 2100: + status = 'far_future' + else: + status = 'normal' + + yield (dt, year, is_leap, status) diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/io_patterns_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/io_patterns_udtf.py new file mode 100644 index 00000000000000..b275bfd618dd31 --- /dev/null +++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/io_patterns_udtf.py @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
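+
+# Illustrative note (not part of the shipped test assets): each function below
+# pins down one input-row to output-row cardinality, which is easy to check
+# locally:
+#   >>> list(one_to_many(3))     # one row in, N rows out
+#   [(1,), (2,), (3,)]
+#   >>> list(one_to_zero(3))     # odd input, zero rows out
+#   []
+#   >>> list(explosive(2, 2))    # all_rows * all_cols rows out
+#   [(0, 0), (0, 1), (1, 0), (1, 1)]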
+
+"""I/O pattern UDTF implementations - testing various cardinality patterns"""
+
+
+def one_to_one(value):
+ '''Each input row produces exactly one output row'''
+ if value is not None:
+ yield (value, value * 2)
+
+
+def one_to_many(n):
+ '''Each input row produces N output rows (1 to n)'''
+ if n is not None and n > 0:
+ for i in range(1, n + 1):
+ yield (i,)
+
+
+def one_to_zero(value):
+ '''Only output even numbers, skip odd numbers (zero output)'''
+ if value is not None and value % 2 == 0:
+ yield (value,)
+ # Odd numbers: no yield, zero output rows
+
+
+def one_to_variable(text):
+ '''
+ - Empty string → 0 rows
+ - Single word → 1 row
+ - Multiple words → N rows
+ '''
+ if text:
+ words = text.split()
+ for word in words:
+ yield (word,)
+ # Empty or None: no yield, zero output
+
+
+def aggregate_pattern(value):
+ '''Categorize numbers into ranges'''
+ if value is not None:
+ if value < 10:
+ category = 'small'
+ elif value < 100:
+ category = 'medium'
+ else:
+ category = 'large'
+ yield (value, category)
+
+
+def explosive(all_rows, all_cols):
+ '''Generate all_rows * all_cols output rows (cartesian product)'''
+ if all_rows is not None and all_cols is not None and all_rows > 0 and all_cols > 0:
+ for r in range(all_rows):
+ for c in range(all_cols):
+ yield (r, c)
+
+
+def conditional(value):
+ '''
+ - Positive: output (value, 'positive')
+ - Negative: output (abs(value), 'negative')
+ - Zero: output both (0, 'zero') and (0, 'neutral')
+ '''
+ if value is not None:
+ if value > 0:
+ yield (value, 'positive')
+ elif value < 0:
+ yield (abs(value), 'negative')
+ else:
+ yield (0, 'zero')
+ yield (0, 'neutral')
+
+
+def all_or_nothing(text, min_length):
+ '''
+ If text length >= min_length: output each character with position
+ Otherwise: output nothing
+ '''
+ if text and len(text) >= min_length:
+ for i, char in enumerate(text):
+ yield (char, i)
+ # If condition not met: no yield
+
+
+def empty_input(value):
+ '''Simple identity function'''
+ if value is not None:
+ yield (value,)
+
+
+def batch_process(value):
+ '''For each input, generate multiples (2x, 3x, 5x)'''
+ if value is not None and value > 0:
+ for factor in [2, 3, 5]:
+ yield (value, factor, value * factor)
diff --git a/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/sql_integration_udtf.py b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/sql_integration_udtf.py
new file mode 100644
index 00000000000000..63027562918e64
--- /dev/null
+++ b/regression-test/suites/pythonudtf_p0/udtf_scripts/pyudtf_module/sql_integration_udtf.py
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
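+
+# Illustrative note (not part of the shipped test assets): these generators back
+# the LATERAL VIEW regression cases above and behave like ordinary Python
+# iterables outside Doris, e.g.:
+#   >>> list(split_with_position('a, b', ','))
+#   [(0, 'a'), (1, 'b')]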
+ +"""SQL integration UDTF implementations for complex query patterns""" + + +def split_with_position(text, delimiter): + '''Split string and return with position''' + if text and delimiter: + parts = text.split(delimiter) + for i, part in enumerate(parts): + yield (i, part.strip()) + + +def generate_range(start, end): + '''Generate integer range''' + if start is not None and end is not None: + for i in range(start, end + 1): + yield (i,) + + +def explode_with_index(arr): + '''Explode array with index''' + if arr: + for i, elem in enumerate(arr): + if elem is not None: + yield (elem, i)
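+
+
+if __name__ == '__main__':
+    # Minimal local smoke test (illustrative sketch only; the regression suite
+    # never executes this block). Prints the rows each UDTF would emit.
+    print(list(split_with_position('x|y|z', '|')))     # [(0, 'x'), (1, 'y'), (2, 'z')]
+    print(list(generate_range(1, 3)))                  # [(1,), (2,), (3,)]
+    print(list(explode_with_index(['a', None, 'b'])))  # [('a', 0), ('b', 2)]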