diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f26cf764d00..c601dd0ce1d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -588,7 +588,6 @@ add_library( src/join/mixed_join_semi.cu src/join/mixed_join_size_kernel.cu src/join/mixed_join_size_kernel_nulls.cu - src/join/semi_join.cu src/join/sort_merge_join.cu src/json/json_path.cu src/lists/contains.cu diff --git a/cpp/include/cudf/join/join.hpp b/cpp/include/cudf/join/join.hpp index fb29ac75b46..23eb0a67f33 100644 --- a/cpp/include/cudf/join/join.hpp +++ b/cpp/include/cudf/join/join.hpp @@ -201,73 +201,6 @@ full_join(cudf::table_view const& left_keys, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); -/** - * @brief Returns a vector of row indices corresponding to a left semi-join - * between the specified tables. - * - * @deprecated Use the object-oriented filtered_join `cudf::filtered_join::anti_join` instead - * - * The returned vector contains the row indices from the left table - * for which there is a matching row in the right table. - * - * @code{.pseudo} - * TableA: {{0, 1, 2}} - * TableB: {{1, 2, 3}} - * Result: {1, 2} - * @endcode - * - * @param left_keys The left table - * @param right_keys The right table - * @param compare_nulls Controls whether null join-key values should match or not - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A vector `left_indices` that can be used to construct - * the result of performing a left semi join between two tables with - * `left_keys` and `right_keys` as the join keys . 
- */ -[[deprecated]] std::unique_ptr> left_semi_join( - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns a vector of row indices corresponding to a left anti join - * between the specified tables. - * - * @deprecated Use the object-oriented filtered_join `cudf::filtered_join::semi_join` instead - * - * The returned vector contains the row indices from the left table - * for which there is no matching row in the right table. - * - * @code{.pseudo} - * TableA: {{0, 1, 2}} - * TableB: {{1, 2, 3}} - * Result: {0} - * @endcode - * - * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 - * - * @param[in] left_keys The left table - * @param[in] right_keys The right table - * @param[in] compare_nulls controls whether null join-key values - * should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A column `left_indices` that can be used to construct - * the result of performing a left anti join between two tables with - * `left_keys` and `right_keys` as the join keys . 
- */ -[[deprecated]] std::unique_ptr> left_anti_join( - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - /** * @brief Performs a cross join on two tables (`left`, `right`) * diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu deleted file mode 100644 index d9ddbe9e15f..00000000000 --- a/cpp/src/join/semi_join.cu +++ /dev/null @@ -1,86 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "join/join_common_utils.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { - -std::unique_ptr> left_semi_anti_join( - join_kind const kind, - cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); - CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); - - if (is_trivial_join(left_keys, right_keys, kind)) { - return std::make_unique>(0, stream, mr); - } - if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right_keys.num_rows())) { - auto result = - std::make_unique>(left_keys.num_rows(), stream, mr); - thrust::sequence(rmm::exec_policy(stream), result->begin(), result->end()); - return result; - } - - cudf::filtered_join obj(right_keys, compare_nulls, cudf::set_as_build_table::RIGHT, stream); - if (kind == join_kind::LEFT_SEMI_JOIN) { return obj.semi_join(left_keys, stream, mr); } - return obj.anti_join(left_keys, stream, mr); -} - -} // namespace detail - -std::unique_ptr> 
left_semi_join( - cudf::table_view const& left, - cudf::table_view const& right, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); -} - -std::unique_ptr> left_anti_join( - cudf::table_view const& left, - cudf::table_view const& right, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); -} - -} // namespace cudf diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 987f4b84807..1ec5fecb948 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -3511,13 +3512,16 @@ Java_ai_rapids_cudf_Table_mixedFullJoinGatherMaps(JNIEnv* env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoinGatherMap( JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { + double constexpr load_factor = 0.5; return cudf::jni::join_gather_single_map( env, j_left_keys, j_right_keys, compare_nulls_equal, - [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - return cudf::left_semi_join(left, right, nulleq); + [load_factor]( + cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { + cudf::filtered_join obj(right, nulleq, cudf::set_as_build_table::RIGHT, load_factor); + return obj.semi_join(left); }); } @@ -3608,13 +3612,16 @@ Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap(JNIEnv* env, JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean 
compare_nulls_equal) { + double constexpr load_factor = 0.5; return cudf::jni::join_gather_single_map( env, j_left_keys, j_right_keys, compare_nulls_equal, - [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - return cudf::left_anti_join(left, right, nulleq); + [load_factor]( + cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { + cudf::filtered_join obj(right, nulleq, cudf::set_as_build_table::RIGHT, load_factor); + return obj.anti_join(left); }); } diff --git a/python/pylibcudf/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd index a93ec4df2b8..31a998029e3 100644 --- a/python/pylibcudf/pylibcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/join.pxd @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf cimport join as cpp_join from pylibcudf.libcudf.types cimport null_equality from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource @@ -148,3 +150,6 @@ cpdef Column mixed_left_anti_join( Stream stream=*, DeviceMemoryResource mr=*, ) + +cdef class FilteredJoin: + cdef unique_ptr[cpp_join.filtered_join] c_obj diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi index 7219b513f48..81644d0d64c 100644 --- a/python/pylibcudf/pylibcudf/join.pyi +++ b/python/pylibcudf/pylibcudf/join.pyi @@ -1,6 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 +from enum import IntEnum + from rmm.pylibrmm import Stream from rmm.pylibrmm.memory_resource import DeviceMemoryResource @@ -9,6 +11,10 @@ from pylibcudf.expressions import Expression from pylibcudf.table import Table from pylibcudf.types import NullEquality +class SetAsBuildTable(IntEnum): + LEFT = ... 
+ RIGHT = ... + def inner_join( left_keys: Table, right_keys: Table, @@ -135,3 +141,25 @@ def mixed_left_anti_join( stream: Stream | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... + +class FilteredJoin: + def __init__( + self, + build: Table, + compare_nulls: NullEquality, + reuse_tbl: SetAsBuildTable, + load_factor: float = 0.5, + stream: Stream | None = None, + ) -> None: ... + def semi_join( + self, + probe: Table, + stream: Stream | None = None, + mr: DeviceMemoryResource | None = None, + ) -> Column: ... + def anti_join( + self, + probe: Table, + stream: Stream | None = None, + mr: DeviceMemoryResource | None = None, + ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 221f8e3c79b..aefe82ade6b 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -21,6 +21,8 @@ from .expressions cimport Expression from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from pylibcudf.libcudf.join import set_as_build_table as SetAsBuildTable # no-cython-lint + __all__ = [ "conditional_full_join", "conditional_inner_join", @@ -28,6 +30,7 @@ __all__ = [ "conditional_left_join", "conditional_left_semi_join", "cross_join", + "FilteredJoin", "full_join", "inner_join", "left_anti_join", @@ -38,6 +41,7 @@ __all__ = [ "mixed_left_anti_join", "mixed_left_join", "mixed_left_semi_join", + "SetAsBuildTable", ] cdef Column _column_from_gather_map( @@ -189,7 +193,7 @@ cpdef Column left_semi_join( ): """Perform a left semi join between two tables. - For details, see :cpp:func:`left_semi_join`. + For details, see :cpp:class:`cudf::filtered_join`. 
Parameters ---------- @@ -210,11 +214,19 @@ cpdef Column left_semi_join( stream = _get_stream(stream) mr = _get_memory_resource(mr) + cdef unique_ptr[cpp_join.filtered_join] join_obj + with nogil: - c_result = cpp_join.left_semi_join( + join_obj.reset( + new cpp_join.filtered_join( + right_keys.view(), + nulls_equal, + cpp_join.set_as_build_table.RIGHT, + stream.view() + ) + ) + c_result = join_obj.get()[0].semi_join( left_keys.view(), - right_keys.view(), - nulls_equal, stream.view(), mr.get_mr() ) @@ -230,7 +242,7 @@ cpdef Column left_anti_join( ): """Perform a left anti join between two tables. - For details, see :cpp:func:`left_anti_join`. + For details, see :cpp:class:`cudf::filtered_join`. Parameters ---------- @@ -251,11 +263,19 @@ cpdef Column left_anti_join( stream = _get_stream(stream) mr = _get_memory_resource(mr) + cdef unique_ptr[cpp_join.filtered_join] join_obj + with nogil: - c_result = cpp_join.left_anti_join( + join_obj.reset( + new cpp_join.filtered_join( + right_keys.view(), + nulls_equal, + cpp_join.set_as_build_table.RIGHT, + stream.view() + ) + ) + c_result = join_obj.get()[0].anti_join( left_keys.view(), - right_keys.view(), - nulls_equal, stream.view(), mr.get_mr() ) @@ -803,3 +823,131 @@ cpdef Column mixed_left_anti_join( mr.get_mr() ) return _column_from_gather_map(move(c_result), stream, mr) + + +cdef class FilteredJoin: + """ + Filtered hash join that builds hash table on creation and probes + results in subsequent join member functions. + + For details, see :cpp:class:`cudf::filtered_join`. + """ + + def __cinit__( + self, + Table build, + null_equality compare_nulls, + cpp_join.set_as_build_table reuse_tbl, + double load_factor=0.5, + Stream stream=None, + ): + """ + Construct a filtered hash join object for subsequent probe calls. + + Parameters + ---------- + build : Table + The build table, from which the hash map is built. + compare_nulls : NullEquality + Controls whether null join-key values should match or not. 
+ reuse_tbl : SetAsBuildTable + Specifies which table to use as the build table. If LEFT, the build + table is considered as the left table and is reused with multiple right + (probe) tables. If RIGHT, the build table is considered as the + right/filter table and will be applied to multiple left (probe) tables. + load_factor : float, optional + The desired ratio of filled slots to total slots in the hash table, + must be in range (0,1]. Defaults to 0.5. + stream : Stream, optional + CUDA stream used for device memory operations and kernel launches. + """ + stream = _get_stream(stream) + + with nogil: + self.c_obj.reset( + new cpp_join.filtered_join( + build.view(), + compare_nulls, + reuse_tbl, + load_factor, + stream.view() + ) + ) + + def semi_join( + self, + Table probe, + Stream stream=None, + DeviceMemoryResource mr=None, + ): + """ + Returns a column of row indices corresponding to a semi-join + between the build table and probe table. + + For details, see :cpp:func:`cudf::filtered_join::semi_join`. + + Parameters + ---------- + probe : Table + The probe table. + stream : Stream, optional + CUDA stream used for device memory operations and kernel launches. + mr : DeviceMemoryResource, optional + Device memory resource used to allocate the returned column's device memory. + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + + stream = _get_stream(stream) + mr = _get_memory_resource(mr) + + with nogil: + c_result = self.c_obj.get()[0].semi_join( + probe.view(), + stream.view(), + mr.get_mr() + ) + return _column_from_gather_map(move(c_result), stream, mr) + + def anti_join( + self, + Table probe, + Stream stream=None, + DeviceMemoryResource mr=None, + ): + """ + Returns a column of row indices corresponding to an anti-join + between the build table and probe table. + + For details, see :cpp:func:`cudf::filtered_join::anti_join`. 
+ + Parameters + ---------- + probe : Table + The probe table. + stream : Stream, optional + CUDA stream used for device memory operations and kernel launches. + mr : DeviceMemoryResource, optional + Device memory resource used to allocate the returned column's device memory. + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + + stream = _get_stream(stream) + mr = _get_memory_resource(mr) + + with nogil: + c_result = self.c_obj.get()[0].anti_join( + probe.view(), + stream.view(), + mr.get_mr() + ) + return _column_from_gather_map(move(c_result), stream, mr) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 191fdb0ea4e..5baa037311f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -6,8 +6,8 @@ # ============================================================================= set(cython_sources - aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx join.pyx labeling.pyx + reduce.pyx replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index c94ce40f548..ca96e1e69c1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -48,20 +48,6 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: device_memory_resource* mr ) except +libcudf_exception_handler - cdef gather_map_type left_semi_join( - const table_view left_keys, - const table_view right_keys, - cuda_stream_view stream, - device_memory_resource* mr - ) except +libcudf_exception_handler - - 
cdef gather_map_type left_anti_join( - const table_view left_keys, - const table_view right_keys, - cuda_stream_view stream, - device_memory_resource* mr - ) except +libcudf_exception_handler - cdef gather_map_pair_type inner_join( const table_view left_keys, const table_view right_keys, @@ -86,22 +72,6 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: device_memory_resource* mr ) except +libcudf_exception_handler - cdef gather_map_type left_semi_join( - const table_view left_keys, - const table_view right_keys, - null_equality nulls_equal, - cuda_stream_view stream, - device_memory_resource* mr - ) except +libcudf_exception_handler - - cdef gather_map_type left_anti_join( - const table_view left_keys, - const table_view right_keys, - null_equality nulls_equal, - cuda_stream_view stream, - device_memory_resource* mr - ) except +libcudf_exception_handler - cdef unique_ptr[table] cross_join( const table_view left, const table_view right, @@ -244,3 +214,34 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: cuda_stream_view stream, device_memory_resource* mr ) except +libcudf_exception_handler + +cdef extern from "cudf/join/filtered_join.hpp" namespace "cudf" nogil: + cpdef enum class set_as_build_table: + LEFT + RIGHT + + cdef cppclass filtered_join: + filtered_join() except + + filtered_join( + const table_view build, + null_equality compare_nulls, + set_as_build_table reuse_tbl, + cuda_stream_view stream + ) except +libcudf_exception_handler + filtered_join( + const table_view build, + null_equality compare_nulls, + set_as_build_table reuse_tbl, + double load_factor, + cuda_stream_view stream + ) except +libcudf_exception_handler + gather_map_type semi_join( + const table_view probe, + cuda_stream_view stream, + device_memory_resource* mr + ) except +libcudf_exception_handler + gather_map_type anti_join( + const table_view probe, + cuda_stream_view stream, + device_memory_resource* mr + ) except +libcudf_exception_handler diff 
--git a/python/pylibcudf/pylibcudf/libcudf/join.pyx b/python/pylibcudf/pylibcudf/libcudf/join.pyx new file mode 100644 index 00000000000..d43d6889805 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/join.pyx @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +# This file exists to make the cpdef enums in join.pxd importable from Python