Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ on:
pull_request:
workflow_dispatch:
concurrency:
# Skip intermediate builds: always.
# Cancel intermediate builds: only if it is a pull request build.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
jobs:
test:
env:
JULIA_DEPWARN: "false"
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
strategy:
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ ClickHouse = "0.2"
DataFrames = "1.5"
Dates = "1.9"
Documenter = "0.27, 1"
DuckDB = "1 - 2"
DuckDB = "1.2"
GZip = "0.6"
GoogleCloud = "0.11"
HTTP = "1.1"
Expand Down
40 changes: 40 additions & 0 deletions docs/examples/UserGuide/agg_window.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# TidierDB supports all aggregate functions across the supported databases, as well as window functions.

# ## Aggregate Functions
# `@summarize`, by default, supports all aggregate functions built in to a SQL database, with the exception that any `'` that would be used in SQL should be replaced with `"`.
using TidierDB
db = connect(duckdb())
mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
mtcars = db_table(db, mtcars_path);

# ## aggregate function in `@summarize`
# Let's use the DuckDB `kurtosis` aggregate function
@chain t(mtcars) begin
@group_by cyl
@summarize(kurt = kurtosis(mpg))
@collect
end

## aggregate functions in `@mutate`
# `@mutate`/`@transmute` supports the following aggregate functions by default: `maximum`, `minimum`, `mean`, `std`, `sum`, `cumsum`
# To use aggregate SQL functions that are built into a database but exist outside of the TidierDB parser, simply wrap the function call in `agg()`
@chain t(mtcars) begin
@group_by(cyl)
@mutate(kurt = agg(kurtosis(mpg)))
@select cyl mpg kurt
@collect
end

# Alternatively, if you anticipate regularly using specific aggregate functions, you can update the underlying parser and drop the need to use `agg`
push!(TidierDB.window_agg_fxns, :kurtosis);
@chain t(mtcars) begin
@group_by(cyl)
@mutate(kurt = kurtosis(mpg))
@select cyl mpg kurt
@collect
end


# ## Window Functions
# TidierDB's `@mutate`/`@transmute` support all of the window functions below
#`lead`, `lag`, `dense_rank`, `nth_value`, `ntile`, `rank_dense`, `row_number`, `first_value`, `last_value`, `cume_dist`
118 changes: 26 additions & 92 deletions docs/examples/UserGuide/udfs_ex.jl
Original file line number Diff line number Diff line change
@@ -1,93 +1,15 @@
# TidierDB is unique in its statement parsing flexibility. This means that using any built-in SQL function or user-defined function (UDF) is readily available.
# To use any function built into a database in `@mutate` or in `@summarize`, simply write the function call correctly, but replace `'` with `"`. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB.


# ```
# # Set up the connection
# using TidierDB #rexports DuckDB
# db = DuckDB.DB()
# con = DuckDB.connect(db) # this will be important for UDFs
# mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
# mtcars = db_table(db, mtcars_path);
# ```
# ## aggregate function in `@summarize`
# Let's use the DuckDB `kurtosis` aggregate function
# ```
# @chain t(mtcars) begin
# @group_by cyl
# @summarize(kurt = kurtosis(mpg))
# @collect
# end
# 3×2 DataFrame
# Row │ cyl kurt
# │ Int64? Float64?
# ─────┼───────────────────
# 1 │ 4 -1.43411
# 2 │ 6 -1.82944
# 3 │ 8 0.330061
# ```

# ## aggregate functions in `@mutate`
# To use aggregate SQL functions that are built into any database, but exist outside of the TidierDB parser, simply wrap the function call in `agg()`
# ```
# @chain t(mtcars) begin
# @group_by(cyl)
# @mutate(kurt = agg(kurtosis(mpg)))
# @select cyl mpg kurt
# @collect
# end

# 32×3 DataFrame
# Row │ cyl mpg kurt
# │ Int64? Float64? Float64?
# ─────┼─────────────────────────────
# 1 │ 8 18.7 0.330061
# 2 │ 8 14.3 0.330061
# 3 │ 8 16.4 0.330061
# 4 │ 8 17.3 0.330061
# 5 │ 8 15.2 0.330061
# 6 │ 8 10.4 0.330061
# 7 │ 8 10.4 0.330061
# ⋮ │ ⋮ ⋮ ⋮
# 27 │ 6 21.0 -1.82944
# 28 │ 6 21.4 -1.82944
# 29 │ 6 18.1 -1.82944
# 30 │ 6 19.2 -1.82944
# 31 │ 6 17.8 -1.82944
# 32 │ 6 19.7 -1.82944
# 19 rows omitted
# end

# ```
# TidierDB is unique in its statement parsing flexibility. This means that in addition to using any built-in SQL database functions, user-defined functions (UDFs) are readily available in TidierDB.

# ## DuckDB function chaining
# In DuckDB, functions can be chained together with `.`. TidierDB lets you leverage this.
# ```
# @chain t(mtcars) begin
# @mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat("."))
# @select model model2
# @collect
# end
# 32×2 DataFrame
# Row │ model model2
# │ String? String?
# ─────┼───────────────────────────────────────
# 1 │ Mazda RX4 MAZDA.RX4.
# 2 │ Mazda RX4 Wag MAZDA.RX4.WAG.
# 3 │ Datsun 710 DATSUN.710.
# 4 │ Hornet 4 Drive HORNET.4.DRIVE.
# 5 │ Hornet Sportabout HORNET.SPORTABOUT.
# 6 │ Valiant VALIANT.
# 7 │ Duster 360 DUSTER.360.
# ⋮ │ ⋮ ⋮
# 27 │ Porsche 914-2 PORSCHE.914-2.
# 28 │ Lotus Europa LOTUS.EUROPA.
# 29 │ Ford Pantera L FORD.PANTERA.L.
# 30 │ Ferrari Dino FERRARI.DINO.
# 31 │ Maserati Bora MASERATI.BORA.
# 32 │ Volvo 142E VOLVO.142E.
# 19 rows omitted
# ```
using TidierDB # DuckDB is reexported by TidierDB
db = connect(duckdb())
mtcars = db_table(db, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv")
@chain t(mtcars) begin
@mutate(model2 = model.upper().string_split(" ").list_aggr("string_agg",".").concat("."))
@select model model2
@collect
end

# ## `rowid` and pseudocolumns
# When a table is not being read directly from a file, `rowid` is available for use. In general, TidierDB should support all pseudocolumns.
Expand All @@ -105,7 +27,22 @@
# 1 │ Hornet Sportabout 18.7 8 360.0 175
# ```

# ## UDF SQLite Example
# ## UDFs in DuckDB
# TidierDB's flexibility means that once created, UDFs can immediately be used with `@mutate` or `@transmute`
df = DataFrame(a = [1, 2, 3], b = [1, 2, 3])
dfv = db_table(db, df, "df_view")
# A more in-depth discussion of UDFs in DuckDB.jl can be found [here](https://discourse.julialang.org/t/is-it-hard-to-support-julia-udfs-in-duckdb/118509/24?u=true).
# Define a function and create the scalar function
bino = (a, b) -> (a + b) * (a + b)
fun = DuckDB.@create_scalar_function bino(a::Int, b::Int)::Int
DuckDB.register_scalar_function(db, fun)
@chain t(dfv) @mutate(c = bino(a, b)) @collect

# Notably, when the function is redefined (with the same arguments), the DuckDB UDF will change as well.
bino = (a, b) -> (a + b) * (a - b)
@chain t(dfv) @mutate(c = bino(a, b)) @collect

# ## UDFs in SQLite
# ```
# using SQLite
# sql = connect(sqlite());
Expand Down Expand Up @@ -138,7 +75,4 @@
# 8 │ 3 0.8 8.36
# 9 │ 4 0.9 15.19
# 10 │ 5 1.0 24.0
# ```

# ## How to create UDF in DuckDB
# Example coming soon..
# ```
4 changes: 2 additions & 2 deletions src/TBD_macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ end

function final_collect(sqlquery::SQLQuery, ::Type{<:duckdb})
final_query = finalize_query(sqlquery)
result = DBInterface.execute(sqlquery.db, final_query)
result = DuckDB.query(sqlquery.db, final_query)
return DataFrame(result)
end

Expand Down Expand Up @@ -507,7 +507,7 @@ macro collect(sqlquery, stream = false)
if backend == duckdb()
if $stream
println("streaming")
stream_collect($(esc(sqlquery)))
# stream_collect($(esc(sqlquery)))
else
final_collect($(esc(sqlquery)), duckdb)
end
Expand Down
22 changes: 11 additions & 11 deletions src/TidierDB.jl
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam
DESCRIBE SELECT * FROM $(table_name) LIMIT 0
"""
end
result = DuckDB.execute(conn, query) |> DataFrame
result = DBInterface.execute(conn, query) |> DataFrame
result[!, :current_selxn] .= 1
if occursin("*" , table_name) || alias != ""
if alias != ""
Expand Down Expand Up @@ -249,8 +249,8 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de
table_name2 = "iceberg_scan('$table_name', allow_moved_paths = true)"
metadata = get_table_metadata(db, table_name2)
elseif delta
DuckDB.execute(db, "INSTALL delta;")
DuckDB.execute(db, "LOAD delta;")
DBInterface.execute(db, "INSTALL delta;")
DBInterface.execute(db, "LOAD delta;")
table_name2 = "delta_scan('$table_name')"
elseif occursin("docs.google", table_name)
table_name2 = "read_gsheet('$table_name')"
Expand Down Expand Up @@ -399,29 +399,29 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin
# Determine the file type based on the extension
if startswith(df_or_path, "http")
# Install and load the httpfs extension if the path is a URL
DuckDB.execute(conn, "INSTALL httpfs;")
DuckDB.execute(conn, "LOAD httpfs;")
DBInterface.execute(conn, "INSTALL httpfs;")
DBInterface.execute(conn, "LOAD httpfs;")
end
if occursin(r"\.csv$", df_or_path)
# Construct and execute a SQL command for loading a CSV file
sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';"
DuckDB.execute(conn, sql_command)
DBInterface.execute(conn, sql_command)
elseif occursin(r"\.parquet$", df_or_path)
# Construct and execute a SQL command for loading a Parquet file
sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';"
DuckDB.execute(conn, sql_command)
DBInterface.execute(conn, sql_command)
elseif occursin(r"\.arrow$", df_or_path)
# Construct and execute a SQL command for loading a CSV file
arrow_table = Arrow.Table(df_or_path)
DuckDB.register_table(conn, arrow_table, name)
elseif occursin(r"\.json$", df_or_path)
# For Arrow files, read the file into a DataFrame and then insert
sql_command = "CREATE TABLE $name AS SELECT * FROM read_json('$df_or_path');"
DuckDB.execute(conn, "INSTALL json;")
DuckDB.execute(conn, "LOAD json;")
DuckDB.execute(conn, sql_command)
DBInterface.execute(conn, "INSTALL json;")
DBInterface.execute(conn, "LOAD json;")
DBInterface.execute(conn, sql_command)
elseif startswith(df_or_path, "read")
DuckDB.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;")
DBInterface.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;")
else
error("Unsupported file type for: $df_or_path")
end
Expand Down
Loading