Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ on:
pull_request:
workflow_dispatch:
concurrency:
# Skip intermediate builds: always.
# Cancel intermediate builds: only if it is a pull request build.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
jobs:
test:
env:
JULIA_DEPWARN: "false"
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
strategy:
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ClickHouse = "0.2"
DataFrames = "1.5"
Dates = "1.9"
Documenter = "0.27, 1"
DuckDB = "1 - 2"
DuckDB = "1.2"
GZip = "0.6"
GoogleCloud = "0.11"
HTTP = "1.1"
Expand Down
40 changes: 40 additions & 0 deletions docs/examples/UserGuide/agg_window.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# TidierDB supports all aggregate functions across the supported databases, as well as window functions.

# ## Aggregate Functions
# `@summarize`, by default, supports all aggregate functions built in to a SQL database, with the exception that any `'` that would be used in SQL should be replaced with `"`.
using TidierDB
db = connect(duckdb())
mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
mtcars = db_table(db, mtcars_path);

# ## aggregate function in `@summarize`
# Let's use the DuckDB `kurtosis` aggregate function
@chain t(mtcars) begin
@group_by cyl
@summarize(kurt = kurtosis(mpg))
@collect
end

## aggregate functions in `@mutate`
# `@mutate`/`@transmute` supports the following aggregate functions by default: `maximum`, `minimum`, `mean`, `std`, `sum`, `cumsum`
# To use aggregate SQL functions that are built into a database but exist outside of the TidierDB parser, simply wrap the function call in `agg()`
@chain t(mtcars) begin
@group_by(cyl)
@mutate(kurt = agg(kurtosis(mpg)))
@select cyl mpg kurt
@collect
end

# Alternatively, if you anticipate regularly using specific aggregate functions, you can update the underlying parser and drop the need to use `agg`
push!(TidierDB.window_agg_fxns, :kurtosis);
@chain t(mtcars) begin
@group_by(cyl)
@mutate(kurt = kurtosis(mpg))
@select cyl mpg kurt
@collect
end


# ## Window Functions
# TidierDB's `@mutate`/`@transmute` support all of the window functions below
#`lead`, `lag`, `dense_rank`, `nth_value`, `ntile`, `rank_dense`, `row_number`, `first_value`, `last_value`, `cume_dist`
118 changes: 26 additions & 92 deletions docs/examples/UserGuide/udfs_ex.jl
Original file line number Diff line number Diff line change
@@ -1,93 +1,15 @@
# TidierDB is unique in its statement parsing flexibility. This means that using any built-in SQL function or user-defined function (UDF) is readily available.
# To use any function built into a database in `@mutate` or in `@summarize`, simply write the function call correctly, but replace `'` with `"`. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB.


# ```
# # Set up the connection
# using TidierDB #rexports DuckDB
# db = DuckDB.DB()
# con = DuckDB.connect(db) # this will be important for UDFs
# mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
# mtcars = db_table(db, mtcars_path);
# ```
# ## aggregate function in `@summarize`
# Let's use the DuckDB `kurtosis` aggregate function
# ```
# @chain t(mtcars) begin
# @group_by cyl
# @summarize(kurt = kurtosis(mpg))
# @collect
# end
# 3×2 DataFrame
# Row │ cyl kurt
# │ Int64? Float64?
# ─────┼───────────────────
# 1 │ 4 -1.43411
# 2 │ 6 -1.82944
# 3 │ 8 0.330061
# ```

# ## aggregate functions in `@mutate`
# To use aggregate SQL functions that are built into any database, but exist outside of the TidierDB parser, simply wrap the function call in `agg()`
# ```
# @chain t(mtcars) begin
# @group_by(cyl)
# @mutate(kurt = agg(kurtosis(mpg)))
# @select cyl mpg kurt
# @collect
# end

# 32×3 DataFrame
# Row │ cyl mpg kurt
# │ Int64? Float64? Float64?
# ─────┼─────────────────────────────
# 1 │ 8 18.7 0.330061
# 2 │ 8 14.3 0.330061
# 3 │ 8 16.4 0.330061
# 4 │ 8 17.3 0.330061
# 5 │ 8 15.2 0.330061
# 6 │ 8 10.4 0.330061
# 7 │ 8 10.4 0.330061
# ⋮ │ ⋮ ⋮ ⋮
# 27 │ 6 21.0 -1.82944
# 28 │ 6 21.4 -1.82944
# 29 │ 6 18.1 -1.82944
# 30 │ 6 19.2 -1.82944
# 31 │ 6 17.8 -1.82944
# 32 │ 6 19.7 -1.82944
# 19 rows omitted
# end

# ```
# TidierDB is unique in its statement parsing flexibility. This means that in addition to using any built-in SQL database functions, user-defined functions (UDFs) are readily available in TidierDB.

# ## DuckDB function chaining
# In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this.
# ```
# @chain t(mtcars) begin
# @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat("."))
# @select model model2
# @collect
# end
# 32×2 DataFrame
# Row │ model model2
# │ String? String?
# ─────┼───────────────────────────────────────
# 1 │ Mazda RX4 MAZDA.RX4.
# 2 │ Mazda RX4 Wag MAZDA.RX4.WAG.
# 3 │ Datsun 710 DATSUN.710.
# 4 │ Hornet 4 Drive HORNET.4.DRIVE.
# 5 │ Hornet Sportabout HORNET.SPORTABOUT.
# 6 │ Valiant VALIANT.
# 7 │ Duster 360 DUSTER.360.
# ⋮ │ ⋮ ⋮
# 27 │ Porsche 914-2 PORSCHE.914-2.
# 28 │ Lotus Europa LOTUS.EUROPA.
# 29 │ Ford Pantera L FORD.PANTERA.L.
# 30 │ Ferrari Dino FERRARI.DINO.
# 31 │ Maserati Bora MASERATI.BORA.
# 32 │ Volvo 142E VOLVO.142E.
# 19 rows omitted
# ```
using TidierDB # DuckDB is reexported by TidierDB
db = connect(duckdb())
mtcars = db_table(db, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv")
@chain t(mtcars) begin
@mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat("."))
@select model model2
@collect
end

# ## `rowid` and pseudocolumns
# When a table is not being read directly from a file, `rowid` is available for use. In general, TidierDB should support all pseudocolumns.
Expand All @@ -105,7 +27,22 @@
# 1 │ Hornet Sportabout 18.7 8 360.0 175
# ```

# ## UDF SQLite Example
# ## UDFs in DuckDB
# TidierDB's flexibility means that once created, UDFs can immediately be used with `@mutate` or `@transmute`
df = DataFrame(a = [1, 2, 3], b = [1, 2, 3])
dfv = db_table(db, df, "df_view")
# A more in-depth discussion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true).
# Define a function and create the scalar function
bino = (a, b) -> (a + b) * (a + b)
fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int
DuckDB.register_scalar_function(db, fun)
@chain t(dfv) @mutate(c = bino(a, b)) @collect

# Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well.
bino = (a, b) -> (a + b) * (a - b)
@chain t(dfv) @mutate(c = bino(a, b)) @collect

# ## UDFs in SQLite
# ```
# using SQLite
# sql = connect(sqlite());
Expand Down Expand Up @@ -138,7 +75,4 @@
# 8 │ 3 0.8 8.36
# 9 │ 4 0.9 15.19
# 10 │ 5 1.0 24.0
# ```

# ## How to create UDF in DuckDB
# Example coming soon..
# ```
4 changes: 2 additions & 2 deletions src/TBD_macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ end

function final_collect(sqlquery::SQLQuery, ::Type{<:duckdb})
final_query = finalize_query(sqlquery)
result = DBInterface.execute(sqlquery.db, final_query)
result = DuckDB.query(sqlquery.db, final_query)
return DataFrame(result)
end

Expand Down Expand Up @@ -507,7 +507,7 @@ macro collect(sqlquery, stream = false)
if backend == duckdb()
if $stream
println("streaming")
stream_collect($(esc(sqlquery)))
# stream_collect($(esc(sqlquery)))
else
final_collect($(esc(sqlquery)), duckdb)
end
Expand Down
22 changes: 11 additions & 11 deletions src/TidierDB.jl
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam
DESCRIBE SELECT * FROM $(table_name) LIMIT 0
"""
end
result = DuckDB.execute(conn, query) |> DataFrame
result = DBInterface.execute(conn, query) |> DataFrame
result[!, :current_selxn] .= 1
if occursin("*" , table_name) || alias != ""
if alias != ""
Expand Down Expand Up @@ -249,8 +249,8 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de
table_name2 = "iceberg_scan('$table_name', allow_moved_paths = true)"
metadata = get_table_metadata(db, table_name2)
elseif delta
DuckDB.execute(db, "INSTALL delta;")
DuckDB.execute(db, "LOAD delta;")
DBInterface.execute(db, "INSTALL delta;")
DBInterface.execute(db, "LOAD delta;")
table_name2 = "delta_scan('$table_name')"
elseif occursin("docs.google", table_name)
table_name2 = "read_gsheet('$table_name')"
Expand Down Expand Up @@ -399,29 +399,29 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin
# Determine the file type based on the extension
if startswith(df_or_path, "http")
# Install and load the httpfs extension if the path is a URL
DuckDB.execute(conn, "INSTALL httpfs;")
DuckDB.execute(conn, "LOAD httpfs;")
DBInterface.execute(conn, "INSTALL httpfs;")
DBInterface.execute(conn, "LOAD httpfs;")
end
if occursin(r"\.csv$", df_or_path)
# Construct and execute a SQL command for loading a CSV file
sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';"
DuckDB.execute(conn, sql_command)
DBInterface.execute(conn, sql_command)
elseif occursin(r"\.parquet$", df_or_path)
# Construct and execute a SQL command for loading a Parquet file
sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';"
DuckDB.execute(conn, sql_command)
DBInterface.execute(conn, sql_command)
elseif occursin(r"\.arrow$", df_or_path)
# Construct and execute a SQL command for loading a CSV file
arrow_table = Arrow.Table(df_or_path)
DuckDB.register_table(conn, arrow_table, name)
elseif occursin(r"\.json$", df_or_path)
# For Arrow files, read the file into a DataFrame and then insert
sql_command = "CREATE TABLE $name AS SELECT * FROM read_json('$df_or_path');"
DuckDB.execute(conn, "INSTALL json;")
DuckDB.execute(conn, "LOAD json;")
DuckDB.execute(conn, sql_command)
DBInterface.execute(conn, "INSTALL json;")
DBInterface.execute(conn, "LOAD json;")
DBInterface.execute(conn, sql_command)
elseif startswith(df_or_path, "read")
DuckDB.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;")
DBInterface.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;")
else
error("Unsupported file type for: $df_or_path")
end
Expand Down
Loading