From 7b7648b91691471fec1562270defe953b7c1c661 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 09:25:04 -0500 Subject: [PATCH 1/7] test docs build --- docs/examples/UserGuide/agg_window.jl | 17 +++++++++++++++ docs/examples/UserGuide/udfs_ex.jl | 31 ++++++++++++++++++--------- 2 files changed, 38 insertions(+), 10 deletions(-) create mode 100644 docs/examples/UserGuide/agg_window.jl diff --git a/docs/examples/UserGuide/agg_window.jl b/docs/examples/UserGuide/agg_window.jl new file mode 100644 index 00000000..34d5e912 --- /dev/null +++ b/docs/examples/UserGuide/agg_window.jl @@ -0,0 +1,17 @@ +# TidierDB supports all aggregate functions accross the supported databases, as well as window functions. + +# ## Aggregate Functions +# `@summarize`, by default, supports all aggregate functions built in to a SQL database, with the exception that any `'` that would be used in SQL should be replaced wiht `"`. +using TidierDB +db = connect(duckdb()) +mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" +mtcars = db_tbable(db, mtcars_path); + + +# `@mutate`/`@transmute` supports - the following aggregate functions by default: `maximum`, `minimum`, `mean`, `std`, `sum`, `cumsum` +# However, users can expand this list through two different methods outlineed here. + + +# ## Window Functions +# TidierDB's `@mutate`/`@transmute` +# - Window Functions: `lead`, `lag`, `dense_rank`, `nth_value`, `ntile`, `rank_dense`, `row_number`, `first_value`, `last_value`, `cume_dist` diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index 222b62dc..8bc7e669 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -1,12 +1,10 @@ -# TidierDB is unique in its statement parsing flexiblility. This means that using any built in SQL function or user defined functions (or UDFS) or is readily avaialable. +# TidierDB is unique in its statement parsing flexiblility. This means that using any built in SQL function or user defined functions (or UDFS) are readily avaialable. # To use any function built into a database in `@mutate` or in `@summarize`, simply correctly write the correctly, but replace `'` with `"`. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB. +using TidierDB # DuckDB is reexported by TidierDB +db = connect(duckdb()) # ``` -# # Set up the connection -# using TidierDB #rexports DuckDB -# db = DuckDB.DB() -# con = DuckDB.connect(db) # this will be important for UDFs # mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" # mtcars = db_tbable(db, mtcars_path); # ``` @@ -105,7 +103,23 @@ # 1 │ Hornet Sportabout 18.7 8 360.0 175 # ``` -# ## UDF SQLite Example +# ## UDFs in DuckDB +df = db_table(db, DataFrame(a = [1, 2, 3], b = [1, 2, 3]), "df_view") + +# A more in depth disccusion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true). +# define a function +bino = (a, b) -> (a + b) * (a + b) +# create the scalar function +fun = DuckDB.@create_scalar_function quad(a::Int, b::Int)::Int; +DuckDB.register_scalar_function(db, fun); +# use the scalar function in mutate without any further modifcation. +@chain t(df) @mutate(c = bino(a, b)) @collect + +#notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. +quad = (a, b) -> (a + b) * (a - b); +@chain t(df) @mutate(c = bino(a, b)) @collect + +# ## UDFs in SQLite # ``` # using SQLite # sql = connect(sqlite()); @@ -138,7 +152,4 @@ # 8 │ 3 0.8 8.36 # 9 │ 4 0.9 15.19 # 10 │ 5 1.0 24.0 -# ``` - -# ## How to create UDF in DuckDB -# Example coming soon.. \ No newline at end of file +# ``` \ No newline at end of file From 3c5baa5a9cb197201d91389464ac6b14f27610be Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 09:53:13 -0500 Subject: [PATCH 2/7] fix depreciation warning --- Project.toml | 2 +- docs/examples/UserGuide/agg_window.jl | 2 +- docs/examples/UserGuide/udfs_ex.jl | 2 +- src/TBD_macros.jl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Project.toml b/Project.toml index fccf11fa..c312f8c1 100644 --- a/Project.toml +++ b/Project.toml @@ -45,7 +45,7 @@ ClickHouse = "0.2" DataFrames = "1.5" Dates = "1.9" Documenter = "0.27, 1" -DuckDB = "1 - 2" +DuckDB = "1.2" GZip = "0.6" GoogleCloud = "0.11" HTTP = "1.1" diff --git a/docs/examples/UserGuide/agg_window.jl b/docs/examples/UserGuide/agg_window.jl index 34d5e912..876f81b6 100644 --- a/docs/examples/UserGuide/agg_window.jl +++ b/docs/examples/UserGuide/agg_window.jl @@ -5,7 +5,7 @@ using TidierDB db = connect(duckdb()) mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" -mtcars = db_tbable(db, mtcars_path); +mtcars = db_table(db, mtcars_path); # `@mutate`/`@transmute` supports - the following aggregate functions by default: `maximum`, `minimum`, `mean`, `std`, `sum`, `cumsum` diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index 8bc7e669..c802a4ee 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -110,7 +110,7 @@ df = db_table(db, DataFrame(a = [1, 2, 3], b = [1, 2, 3]), "df_view") # define a function bino = (a, b) -> (a + b) * (a + b) # create the scalar function -fun = DuckDB.@create_scalar_function quad(a::Int, b::Int)::Int; +fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int; DuckDB.register_scalar_function(db, fun); # use the scalar function in mutate without any further modifcation. @chain t(df) @mutate(c = bino(a, b)) @collect diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index af10520d..82f5a330 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -446,7 +446,7 @@ end function final_collect(sqlquery::SQLQuery, ::Type{<:duckdb}) final_query = finalize_query(sqlquery) - result = DBInterface.execute(sqlquery.db, final_query) + result = DuckDB.query(sqlquery.db, final_query) return DataFrame(result) end From 88e99692e531fcc3ee1879a7d58ad8d66dd22f34 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 11:37:52 -0500 Subject: [PATCH 3/7] test remove stream for warning? --- docs/examples/UserGuide/agg_window.jl | 29 ++++++- docs/examples/UserGuide/udfs_ex.jl | 105 +++++--------------------- src/TBD_macros.jl | 2 +- 3 files changed, 44 insertions(+), 92 deletions(-) diff --git a/docs/examples/UserGuide/agg_window.jl b/docs/examples/UserGuide/agg_window.jl index 876f81b6..983c9dd4 100644 --- a/docs/examples/UserGuide/agg_window.jl +++ b/docs/examples/UserGuide/agg_window.jl @@ -7,11 +7,34 @@ db = connect(duckdb()) mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" mtcars = db_table(db, mtcars_path); +# ## aggregate function in `@summarize` +# Lets use the DuckDB `kurtosis` aggregate function +@chain t(mtcars) begin + @group_by cyl + @summarize(kurt = kurtosis(mpg)) + @collect + end +## aggregate functions in `@mutate` # `@mutate`/`@transmute` supports - the following aggregate functions by default: `maximum`, `minimum`, `mean`, `std`, `sum`, `cumsum` -# However, users can expand this list through two different methods outlineed here. +# To use aggregate sql functions that are built in to any database not, but exist outside of the TidierDB parser, simply wrap the function call in `agg()` +@chain t(mtcars) begin + @group_by(cyl) + @mutate(kurt = agg(kurtosis(mpg))) + @select cyl mpg kurt + @collect +end + +# Alternatively , if you anticipate regularly using specific aggregate functions, you can use update the underlying parser and drop the need to use `agg` +push!(TidierDB.window_agg_fxns, :kurtosis); +@chain t(mtcars) begin + @group_by(cyl) + @mutate(kurt = kurtosis(mpg)) + @select cyl mpg kurt + @collect +end # ## Window Functions -# TidierDB's `@mutate`/`@transmute` -# - Window Functions: `lead`, `lag`, `dense_rank`, `nth_value`, `ntile`, `rank_dense`, `row_number`, `first_value`, `last_value`, `cume_dist` +# TidierDB's `@mutate`/`@transmute` support all of the window functions below +#`lead`, `lag`, `dense_rank`, `nth_value`, `ntile`, `rank_dense`, `row_number`, `first_value`, `last_value`, `cume_dist` diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index c802a4ee..7cf4ddf1 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -1,91 +1,17 @@ -# TidierDB is unique in its statement parsing flexiblility. This means that using any built in SQL function or user defined functions (or UDFS) are readily avaialable. -# To use any function built into a database in `@mutate` or in `@summarize`, simply correctly write the correctly, but replace `'` with `"`. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB. - +# TidierDB is unique in its statement parsing flexiblility. This means that in addition to using any built in SQL database functions, user defined functions (or UDFS) are readily avaialable in TidierDB. using TidierDB # DuckDB is reexported by TidierDB db = connect(duckdb()) -# ``` -# mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" -# mtcars = db_tbable(db, mtcars_path); -# ``` -# ## aggregate function in `@summarize` -# Lets use the DuckDB `kurtosis` aggregate function -# ``` -# @chain t(mtcars) begin -# @group_by cyl -# @summarize(kurt = kurtosis(mpg)) -# @collect -# end -# 3×2 DataFrame -# Row │ cyl kurt -# │ Int64? Float64? -# ─────┼─────────────────── -# 1 │ 4 -1.43411 -# 2 │ 6 -1.82944 -# 3 │ 8 0.330061 -# ``` - -# ## aggregate functions in `@mutate` -# To aggregate sql functions that are builtin to any database, but exist outside of the TidierDB parser, simply wrap the function call in `agg()` -# ``` -# @chain t(mtcars) begin -# @group_by(cyl) -# @mutate(kurt = agg(kurtosis(mpg))) -# @select cyl mpg kurt -# @collect -# end - -# 32×3 DataFrame -# Row │ cyl mpg kurt -# │ Int64? Float64? Float64? -# ─────┼───────────────────────────── -# 1 │ 8 18.7 0.330061 -# 2 │ 8 14.3 0.330061 -# 3 │ 8 16.4 0.330061 -# 4 │ 8 17.3 0.330061 -# 5 │ 8 15.2 0.330061 -# 6 │ 8 10.4 0.330061 -# 7 │ 8 10.4 0.330061 -# ⋮ │ ⋮ ⋮ ⋮ -# 27 │ 6 21.0 -1.82944 -# 28 │ 6 21.4 -1.82944 -# 29 │ 6 18.1 -1.82944 -# 30 │ 6 19.2 -1.82944 -# 31 │ 6 17.8 -1.82944 -# 32 │ 6 19.7 -1.82944 -# 19 rows omitted -# end - -# ``` +mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"; +mtcars = db_tbable(db, mtcars_path); # ## DuckDB function chaining # In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. -# ``` -# @chain t(mtcars) begin -# @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat(".")) -# @select model model2 -# @collect -# end -# 32×2 DataFrame -# Row │ model model2 -# │ String? String? -# ─────┼─────────────────────────────────────── -# 1 │ Mazda RX4 MAZDA.RX4. -# 2 │ Mazda RX4 Wag MAZDA.RX4.WAG. -# 3 │ Datsun 710 DATSUN.710. -# 4 │ Hornet 4 Drive HORNET.4.DRIVE. -# 5 │ Hornet Sportabout HORNET.SPORTABOUT. -# 6 │ Valiant VALIANT. -# 7 │ Duster 360 DUSTER.360. -# ⋮ │ ⋮ ⋮ -# 27 │ Porsche 914-2 PORSCHE.914-2. -# 28 │ Lotus Europa LOTUS.EUROPA. -# 29 │ Ford Pantera L FORD.PANTERA.L. -# 30 │ Ferrari Dino FERRARI.DINO. -# 31 │ Maserati Bora MASERATI.BORA. -# 32 │ Volvo 142E VOLVO.142E. -# 19 rows omitted -# ``` +@chain t(mtcars) begin + @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat(".")) + @select model model2 + @collect +end # ## `rowid` and pseudocolumns # When a table is not being read directly from a file, `rowid` is avaialable for use. In general, TidierDB should support all pseudocolumns. @@ -104,20 +30,23 @@ db = connect(duckdb()) # ``` # ## UDFs in DuckDB -df = db_table(db, DataFrame(a = [1, 2, 3], b = [1, 2, 3]), "df_view") +# TidierDB's flexibility means that once created, UDFs can immediately be used in with `@mutate` or `@transmute` +df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]) +dfv = db_table(db, df, "df_view"); # A more in depth disccusion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true). # define a function bino = (a, b) -> (a + b) * (a + b) -# create the scalar function +# Create the scalar function fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int; DuckDB.register_scalar_function(db, fun); -# use the scalar function in mutate without any further modifcation. -@chain t(df) @mutate(c = bino(a, b)) @collect -#notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. +# Use the UDF in mutate without any further modifcation. +@chain t(dfv) @mutate(c = bino(a, b)) @collect + +# Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. quad = (a, b) -> (a + b) * (a - b); -@chain t(df) @mutate(c = bino(a, b)) @collect +@chain t(dfv) @mutate(c = bino(a, b)) @collect # ## UDFs in SQLite # ``` diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 82f5a330..28ed5dca 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -507,7 +507,7 @@ macro collect(sqlquery, stream = false) if backend == duckdb() if $stream println("streaming") - stream_collect($(esc(sqlquery))) + final_collect($(esc(sqlquery)), duckdb) else final_collect($(esc(sqlquery)), duckdb) end From c7fa248c8add100c71dd42e19aac4b411121b8bb Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 11:58:39 -0500 Subject: [PATCH 4/7] replace all DuckDB.execute with DBInterface.execute --- docs/examples/UserGuide/udfs_ex.jl | 2 +- src/TBD_macros.jl | 2 +- src/TidierDB.jl | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index 7cf4ddf1..e6526512 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -45,7 +45,7 @@ DuckDB.register_scalar_function(db, fun); @chain t(dfv) @mutate(c = bino(a, b)) @collect # Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. -quad = (a, b) -> (a + b) * (a - b); +bino = (a, b) -> (a + b) * (a - b); @chain t(dfv) @mutate(c = bino(a, b)) @collect # ## UDFs in SQLite diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 28ed5dca..b1128669 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -507,7 +507,7 @@ macro collect(sqlquery, stream = false) if backend == duckdb() if $stream println("streaming") - final_collect($(esc(sqlquery)), duckdb) + # stream_collect($(esc(sqlquery))) else final_collect($(esc(sqlquery)), duckdb) end diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 1a126f2a..d5e1b167 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -206,7 +206,7 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam DESCRIBE SELECT * FROM $(table_name) LIMIT 0 """ end - result = DuckDB.execute(conn, query) |> DataFrame + result = DBInterface.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 if occursin("*" , table_name) || alias != "" if alias != "" @@ -249,8 +249,8 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de table_name2 = "iceberg_scan('$table_name', allow_moved_paths = true)" metadata = get_table_metadata(db, table_name2) elseif delta - DuckDB.execute(db, "INSTALL delta;") - DuckDB.execute(db, "LOAD delta;") + DBInterface.execute(db, "INSTALL delta;") + DBInterface.execute(db, "LOAD delta;") table_name2 = "delta_scan('$table_name')" elseif occursin("docs.google", table_name) table_name2 = "read_gsheet('$table_name')" @@ -399,17 +399,17 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin # Determine the file type based on the extension if startswith(df_or_path, "http") # Install and load the httpfs extension if the path is a URL - DuckDB.execute(conn, "INSTALL httpfs;") - DuckDB.execute(conn, "LOAD httpfs;") + DBInterface.execute(conn, "INSTALL httpfs;") + DBInterface.execute(conn, "LOAD httpfs;") end if occursin(r"\.csv$", df_or_path) # Construct and execute a SQL command for loading a CSV file sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';" - DuckDB.execute(conn, sql_command) + DBInterface.execute(conn, sql_command) elseif occursin(r"\.parquet$", df_or_path) # Construct and execute a SQL command for loading a Parquet file sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';" - DuckDB.execute(conn, sql_command) + DBInterface.execute(conn, sql_command) elseif occursin(r"\.arrow$", df_or_path) # Construct and execute a SQL command for loading a CSV file arrow_table = Arrow.Table(df_or_path) @@ -417,11 +417,11 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin elseif occursin(r"\.json$", df_or_path) # For Arrow files, read the file into a DataFrame and then insert sql_command = "CREATE TABLE $name AS SELECT * FROM read_json('$df_or_path');" - DuckDB.execute(conn, "INSTALL json;") - DuckDB.execute(conn, "LOAD json;") - DuckDB.execute(conn, sql_command) + DBInterface.execute(conn, "INSTALL json;") + DBInterface.execute(conn, "LOAD json;") + DBInterface.execute(conn, sql_command) elseif startswith(df_or_path, "read") - DuckDB.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;") + DBInterface.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;") else error("Unsupported file type for: $df_or_path") end @@ -473,7 +473,7 @@ function connect(::duckdb, db_type::Symbol; access_key::String="", secret_key::S DBInterface.execute(db, "LOAD httpfs;") if db_type == :gbq - DuckDB.execute(db, """ + DBInterface.execute(db, """ CREATE SECRET ( TYPE GCS, KEY_ID '$access_key', From 351873c0a163b3ce46e89eeedb616c0265be1e8f Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 12:23:43 -0500 Subject: [PATCH 5/7] try complete removing streaming 2/2 nextDataChunk --- docs/examples/UserGuide/udfs_ex.jl | 8 +++++++- src/TBD_macros.jl | 31 ------------------------------ 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index e6526512..095c9f42 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -31,23 +31,29 @@ end # ## UDFs in DuckDB # TidierDB's flexibility means that once created, UDFs can immediately be used in with `@mutate` or `@transmute` -df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]) +df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]); dfv = db_table(db, df, "df_view"); # A more in depth disccusion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true). # define a function + bino = (a, b) -> (a + b) * (a + b) + # Create the scalar function fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int; DuckDB.register_scalar_function(db, fun); # Use the UDF in mutate without any further modifcation. + @chain t(dfv) @mutate(c = bino(a, b)) @collect + # Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. + bino = (a, b) -> (a + b) * (a - b); @chain t(dfv) @mutate(c = bino(a, b)) @collect + # ## UDFs in SQLite # ``` # using SQLite diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index b1128669..22172183 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -463,38 +463,7 @@ function final_collect(sqlquery::SQLQuery, ::Type{<:snowflake}) return DataFrame(result) end -function stream_collect(sqlquery::SQLQuery) - final_query = finalize_query(sqlquery) - res = DBInterface.execute(sqlquery.db, final_query, DuckDB.StreamResult) - - # Helper function to get the non-Missing type from a Union{Missing, T} - function non_missing_type(T) - T === Missing && return Any - T <: Union{Missing} ? non_missing_type(Base.typesplit(T)[2]) : T - end - # Initialize DataFrame with correct types - df = DataFrame([name => Vector{non_missing_type(t)}() for (name, t) in zip(res.names, res.types)]) - - while true - chunk = DuckDB.nextDataChunk(res) - chunk === missing && break # All chunks processed - - for (col_idx, col_name) in enumerate(res.names) - # Convert DuckDB data to Julia data - duckdb_logical_type = DuckDB.LogicalType(DuckDB.duckdb_column_logical_type(res.handle, col_idx)) - duckdb_conversion_state = DuckDB.ColumnConversionData([chunk], col_idx, duckdb_logical_type, nothing) - col_data = DuckDB.convert_column(duckdb_conversion_state) - - # Append the data to the DataFrame - append!(df[!, col_name], col_data) - end - - DuckDB.destroy_data_chunk(chunk) - end - - return df -end # COV_EXCL_STOP From 1f698b26bda8d5d564392c8cc1919aa70099d27d Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 12:38:31 -0500 Subject: [PATCH 6/7] add ci fix? --- .github/workflows/CI.yml | 2 ++ docs/examples/UserGuide/udfs_ex.jl | 7 +++---- src/TBD_macros.jl | 31 ++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index cfe5c686..37f81030 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -35,6 +35,8 @@ jobs: - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 + with: + args: "--depwarn=no" # fix duckdb deprec warning - uses: julia-actions/julia-processcoverage@v1 with: directories: src diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index 095c9f42..32c977aa 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -2,8 +2,7 @@ using TidierDB # DuckDB is reexported by TidierDB db = connect(duckdb()) -mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"; -mtcars = db_tbable(db, mtcars_path); +mtcars = db_table(db, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv") # ## DuckDB function chaining # In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. @@ -32,7 +31,7 @@ end # ## UDFs in DuckDB # TidierDB's flexibility means that once created, UDFs can immediately be used in with `@mutate` or `@transmute` df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]); -dfv = db_table(db, df, "df_view"); +dfv = db_table(db, df, "df_view") # A more in depth disccusion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true). # define a function @@ -41,7 +40,7 @@ bino = (a, b) -> (a + b) * (a + b) # Create the scalar function fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int; -DuckDB.register_scalar_function(db, fun); +DuckDB.register_scalar_function(db, fun) # Use the UDF in mutate without any further modifcation. diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 22172183..b1128669 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -463,7 +463,38 @@ function final_collect(sqlquery::SQLQuery, ::Type{<:snowflake}) return DataFrame(result) end +function stream_collect(sqlquery::SQLQuery) + final_query = finalize_query(sqlquery) + res = DBInterface.execute(sqlquery.db, final_query, DuckDB.StreamResult) + + # Helper function to get the non-Missing type from a Union{Missing, T} + function non_missing_type(T) + T === Missing && return Any + T <: Union{Missing} ? non_missing_type(Base.typesplit(T)[2]) : T + end + # Initialize DataFrame with correct types + df = DataFrame([name => Vector{non_missing_type(t)}() for (name, t) in zip(res.names, res.types)]) + + while true + chunk = DuckDB.nextDataChunk(res) + chunk === missing && break # All chunks processed + + for (col_idx, col_name) in enumerate(res.names) + # Convert DuckDB data to Julia data + duckdb_logical_type = DuckDB.LogicalType(DuckDB.duckdb_column_logical_type(res.handle, col_idx)) + duckdb_conversion_state = DuckDB.ColumnConversionData([chunk], col_idx, duckdb_logical_type, nothing) + col_data = DuckDB.convert_column(duckdb_conversion_state) + + # Append the data to the DataFrame + append!(df[!, col_name], col_data) + end + + DuckDB.destroy_data_chunk(chunk) + end + + return df +end # COV_EXCL_STOP From 15af88e7a0ed85b91efc11a54df42bec92f380d7 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 14 Feb 2025 14:42:26 -0500 Subject: [PATCH 7/7] try to fix it a different way --- .github/workflows/CI.yml | 6 ++---- docs/examples/UserGuide/udfs_ex.jl | 23 ++++++----------------- src/TidierDB.jl | 2 +- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 37f81030..f80cc2a9 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -7,12 +7,12 @@ on: pull_request: workflow_dispatch: concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: test: + env: + JULIA_DEPWARN: "false" name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -35,8 +35,6 @@ jobs: - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - with: - args: "--depwarn=no" # fix duckdb deprec warning - uses: julia-actions/julia-processcoverage@v1 with: directories: src diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl index 32c977aa..8550570d 100644 --- a/docs/examples/UserGuide/udfs_ex.jl +++ b/docs/examples/UserGuide/udfs_ex.jl @@ -1,11 +1,10 @@ # TidierDB is unique in its statement parsing flexiblility. This means that in addition to using any built in SQL database functions, user defined functions (or UDFS) are readily avaialable in TidierDB. +# ## DuckDB function chaining +# In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. using TidierDB # DuckDB is reexported by TidierDB db = connect(duckdb()) mtcars = db_table(db, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv") - -# ## DuckDB function chaining -# In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. @chain t(mtcars) begin @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat(".")) @select model model2 @@ -30,29 +29,19 @@ end # ## UDFs in DuckDB # TidierDB's flexibility means that once created, UDFs can immediately be used in with `@mutate` or `@transmute` -df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]); +df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]) dfv = db_table(db, df, "df_view") - # A more in depth disccusion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true). -# define a function - +# define a function and Create the scalar function bino = (a, b) -> (a + b) * (a + b) - -# Create the scalar function -fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int; +fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int DuckDB.register_scalar_function(db, fun) - -# Use the UDF in mutate without any further modifcation. - @chain t(dfv) @mutate(c = bino(a, b)) @collect - # Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. - -bino = (a, b) -> (a + b) * (a - b); +bino = (a, b) -> (a + b) * (a - b) @chain t(dfv) @mutate(c = bino(a, b)) @collect - # ## UDFs in SQLite # ``` # using SQLite diff --git a/src/TidierDB.jl b/src/TidierDB.jl index d5e1b167..9cca624e 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -473,7 +473,7 @@ function connect(::duckdb, db_type::Symbol; access_key::String="", secret_key::S DBInterface.execute(db, "LOAD httpfs;") if db_type == :gbq - DBInterface.execute(db, """ + DuckDB.execute(db, """ CREATE SECRET ( TYPE GCS, KEY_ID '$access_key',