diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index cfe5c686..f80cc2a9 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -7,12 +7,12 @@ on:
   pull_request:
   workflow_dispatch:
 concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 jobs:
   test:
+    env:
+      JULIA_DEPWARN: "false"
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
     strategy:
diff --git a/Project.toml b/Project.toml
index fccf11fa..c312f8c1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -45,7 +45,7 @@ ClickHouse = "0.2"
 DataFrames = "1.5"
 Dates = "1.9"
 Documenter = "0.27, 1"
-DuckDB = "1 - 2"
+DuckDB = "1.2"
 GZip = "0.6"
 GoogleCloud = "0.11"
 HTTP = "1.1"
diff --git a/docs/examples/UserGuide/agg_window.jl b/docs/examples/UserGuide/agg_window.jl
new file mode 100644
index 00000000..983c9dd4
--- /dev/null
+++ b/docs/examples/UserGuide/agg_window.jl
@@ -0,0 +1,40 @@
+# TidierDB supports all aggregate functions across the supported databases, as well as window functions.
+
+# ## Aggregate Functions
+# `@summarize`, by default, supports all aggregate functions built in to a SQL database, with the exception that any `'` that would be used in SQL should be replaced with `"`.
+using TidierDB
+db = connect(duckdb())
+mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
+mtcars = db_table(db, mtcars_path);
+
+# ## aggregate function in `@summarize`
+# Let's use the DuckDB `kurtosis` aggregate function
+@chain t(mtcars) begin
+    @group_by cyl
+    @summarize(kurt = kurtosis(mpg))
+    @collect
+    end
+
+# ## aggregate functions in `@mutate`
+# `@mutate`/`@transmute` supports the following aggregate functions by default: `maximum`, `minimum`, `mean`, `std`, `sum`, `cumsum`
+# To use aggregate SQL functions that are built in to any database, but exist outside of the TidierDB parser, simply wrap the function call in `agg()`
+@chain t(mtcars) begin
+    @group_by(cyl)
+    @mutate(kurt = agg(kurtosis(mpg)))
+    @select cyl mpg kurt
+    @collect
+end
+
+# Alternatively, if you anticipate regularly using specific aggregate functions, you can update the underlying parser and drop the need to use `agg`
+push!(TidierDB.window_agg_fxns, :kurtosis);
+@chain t(mtcars) begin
+    @group_by(cyl)
+    @mutate(kurt = kurtosis(mpg))
+    @select cyl mpg kurt
+    @collect
+end
+
+
+# ## Window Functions
+# TidierDB's `@mutate`/`@transmute` support all of the window functions below
+# `lead`, `lag`, `dense_rank`, `nth_value`, `ntile`, `rank_dense`, `row_number`, `first_value`, `last_value`, `cume_dist`
diff --git a/docs/examples/UserGuide/udfs_ex.jl b/docs/examples/UserGuide/udfs_ex.jl
index 222b62dc..8550570d 100644
--- a/docs/examples/UserGuide/udfs_ex.jl
+++ b/docs/examples/UserGuide/udfs_ex.jl
@@ -1,93 +1,15 @@
-# TidierDB is unique in its statement parsing flexiblility. This means that using any built in SQL function or user defined functions (or UDFS) or is readily avaialable.
-# To use any function built into a database in `@mutate` or in `@summarize`, simply correctly write the correctly, but replace `'` with `"`. This also applies to any UDF.
The example below will illustrate UDFs in the context of DuckDB. - - -# ``` -# # Set up the connection -# using TidierDB #rexports DuckDB -# db = DuckDB.DB() -# con = DuckDB.connect(db) # this will be important for UDFs -# mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" -# mtcars = db_tbable(db, mtcars_path); -# ``` -# ## aggregate function in `@summarize` -# Lets use the DuckDB `kurtosis` aggregate function -# ``` -# @chain t(mtcars) begin -# @group_by cyl -# @summarize(kurt = kurtosis(mpg)) -# @collect -# end -# 3×2 DataFrame -# Row │ cyl kurt -# │ Int64? Float64? -# ─────┼─────────────────── -# 1 │ 4 -1.43411 -# 2 │ 6 -1.82944 -# 3 │ 8 0.330061 -# ``` - -# ## aggregate functions in `@mutate` -# To aggregate sql functions that are builtin to any database, but exist outside of the TidierDB parser, simply wrap the function call in `agg()` -# ``` -# @chain t(mtcars) begin -# @group_by(cyl) -# @mutate(kurt = agg(kurtosis(mpg))) -# @select cyl mpg kurt -# @collect -# end - -# 32×3 DataFrame -# Row │ cyl mpg kurt -# │ Int64? Float64? Float64? -# ─────┼───────────────────────────── -# 1 │ 8 18.7 0.330061 -# 2 │ 8 14.3 0.330061 -# 3 │ 8 16.4 0.330061 -# 4 │ 8 17.3 0.330061 -# 5 │ 8 15.2 0.330061 -# 6 │ 8 10.4 0.330061 -# 7 │ 8 10.4 0.330061 -# ⋮ │ ⋮ ⋮ ⋮ -# 27 │ 6 21.0 -1.82944 -# 28 │ 6 21.4 -1.82944 -# 29 │ 6 18.1 -1.82944 -# 30 │ 6 19.2 -1.82944 -# 31 │ 6 17.8 -1.82944 -# 32 │ 6 19.7 -1.82944 -# 19 rows omitted -# end - -# ``` +# TidierDB is unique in its statement parsing flexiblility. This means that in addition to using any built in SQL database functions, user defined functions (or UDFS) are readily avaialable in TidierDB. # ## DuckDB function chaining # In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this. 
-# ``` -# @chain t(mtcars) begin -# @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat(".")) -# @select model model2 -# @collect -# end -# 32×2 DataFrame -# Row │ model model2 -# │ String? String? -# ─────┼─────────────────────────────────────── -# 1 │ Mazda RX4 MAZDA.RX4. -# 2 │ Mazda RX4 Wag MAZDA.RX4.WAG. -# 3 │ Datsun 710 DATSUN.710. -# 4 │ Hornet 4 Drive HORNET.4.DRIVE. -# 5 │ Hornet Sportabout HORNET.SPORTABOUT. -# 6 │ Valiant VALIANT. -# 7 │ Duster 360 DUSTER.360. -# ⋮ │ ⋮ ⋮ -# 27 │ Porsche 914-2 PORSCHE.914-2. -# 28 │ Lotus Europa LOTUS.EUROPA. -# 29 │ Ford Pantera L FORD.PANTERA.L. -# 30 │ Ferrari Dino FERRARI.DINO. -# 31 │ Maserati Bora MASERATI.BORA. -# 32 │ Volvo 142E VOLVO.142E. -# 19 rows omitted -# ``` +using TidierDB # DuckDB is reexported by TidierDB +db = connect(duckdb()) +mtcars = db_table(db, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv") +@chain t(mtcars) begin + @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat(".")) + @select model model2 + @collect +end # ## `rowid` and pseudocolumns # When a table is not being read directly from a file, `rowid` is avaialable for use. In general, TidierDB should support all pseudocolumns. @@ -105,7 +27,22 @@ # 1 │ Hornet Sportabout 18.7 8 360.0 175 # ``` -# ## UDF SQLite Example +# ## UDFs in DuckDB +# TidierDB's flexibility means that once created, UDFs can immediately be used in with `@mutate` or `@transmute` +df = DataFrame(a = [1, 2, 3], b = [1, 2, 3]) +dfv = db_table(db, df, "df_view") +# A more in depth disccusion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true). 
+# define a function and Create the scalar function +bino = (a, b) -> (a + b) * (a + b) +fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int +DuckDB.register_scalar_function(db, fun) +@chain t(dfv) @mutate(c = bino(a, b)) @collect + +# Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well. +bino = (a, b) -> (a + b) * (a - b) +@chain t(dfv) @mutate(c = bino(a, b)) @collect + +# ## UDFs in SQLite # ``` # using SQLite # sql = connect(sqlite()); @@ -138,7 +75,4 @@ # 8 │ 3 0.8 8.36 # 9 │ 4 0.9 15.19 # 10 │ 5 1.0 24.0 -# ``` - -# ## How to create UDF in DuckDB -# Example coming soon.. \ No newline at end of file +# ``` \ No newline at end of file diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index af10520d..b1128669 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -446,7 +446,7 @@ end function final_collect(sqlquery::SQLQuery, ::Type{<:duckdb}) final_query = finalize_query(sqlquery) - result = DBInterface.execute(sqlquery.db, final_query) + result = DuckDB.query(sqlquery.db, final_query) return DataFrame(result) end @@ -507,7 +507,7 @@ macro collect(sqlquery, stream = false) if backend == duckdb() if $stream println("streaming") - stream_collect($(esc(sqlquery))) + # stream_collect($(esc(sqlquery))) else final_collect($(esc(sqlquery)), duckdb) end diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 1a126f2a..9cca624e 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -206,7 +206,7 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam DESCRIBE SELECT * FROM $(table_name) LIMIT 0 """ end - result = DuckDB.execute(conn, query) |> DataFrame + result = DBInterface.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 if occursin("*" , table_name) || alias != "" if alias != "" @@ -249,8 +249,8 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de table_name2 = "iceberg_scan('$table_name', allow_moved_paths = true)" metadata = 
get_table_metadata(db, table_name2) elseif delta - DuckDB.execute(db, "INSTALL delta;") - DuckDB.execute(db, "LOAD delta;") + DBInterface.execute(db, "INSTALL delta;") + DBInterface.execute(db, "LOAD delta;") table_name2 = "delta_scan('$table_name')" elseif occursin("docs.google", table_name) table_name2 = "read_gsheet('$table_name')" @@ -399,17 +399,17 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin # Determine the file type based on the extension if startswith(df_or_path, "http") # Install and load the httpfs extension if the path is a URL - DuckDB.execute(conn, "INSTALL httpfs;") - DuckDB.execute(conn, "LOAD httpfs;") + DBInterface.execute(conn, "INSTALL httpfs;") + DBInterface.execute(conn, "LOAD httpfs;") end if occursin(r"\.csv$", df_or_path) # Construct and execute a SQL command for loading a CSV file sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';" - DuckDB.execute(conn, sql_command) + DBInterface.execute(conn, sql_command) elseif occursin(r"\.parquet$", df_or_path) # Construct and execute a SQL command for loading a Parquet file sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';" - DuckDB.execute(conn, sql_command) + DBInterface.execute(conn, sql_command) elseif occursin(r"\.arrow$", df_or_path) # Construct and execute a SQL command for loading a CSV file arrow_table = Arrow.Table(df_or_path) @@ -417,11 +417,11 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin elseif occursin(r"\.json$", df_or_path) # For Arrow files, read the file into a DataFrame and then insert sql_command = "CREATE TABLE $name AS SELECT * FROM read_json('$df_or_path');" - DuckDB.execute(conn, "INSTALL json;") - DuckDB.execute(conn, "LOAD json;") - DuckDB.execute(conn, sql_command) + DBInterface.execute(conn, "INSTALL json;") + DBInterface.execute(conn, "LOAD json;") + DBInterface.execute(conn, sql_command) elseif startswith(df_or_path, "read") - DuckDB.execute(conn, "CREATE TABLE 
$name AS SELECT * FROM $df_or_path;") + DBInterface.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;") else error("Unsupported file type for: $df_or_path") end