Commit dec61c2

Merge pull request #125 from TidierOrg/cte-gen
Add `@pivot_wider` + CTE generation improvements
2 parents cdb6109 + 1ebedcd commit dec61c2

File tree

13 files changed (+382, -77 lines)

NEWS.md

Lines changed: 8 additions & 0 deletions

```diff
@@ -1,4 +1,12 @@
 # TidierDB.jl updates
+## v0.8.3 - 2025-04-11
+- adds `@drop_missing`
+- adds `@pivot_wider`
+- `db_table` or `dt` accept paths to .sas7bdat, .xpt, .sav, .zsav, .por, .dta files with DuckDB
+- Improvements to CTE generation
+- adds kwarg `overwrite = false` to `copy_to`, so that by default copying does not replace existing tables with the same name
+- separates `@summary` into its own macro for collecting summary statistics (max, min, q1, q2, q3, avg, std, count, unique) from a table or file
+
 ## v0.8.0 - 2025-03-24
 - adds `@transmute`
 - adds `@separate` and `@unite`
```

Project.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,7 +1,7 @@
 name = "TidierDB"
 uuid = "86993f9b-bbba-4084-97c5-ee15961ad48b"
 authors = ["Daniel Rizk <rizk.daniel.12@gmail.com> and contributors"]
-version = "0.8.2"
+version = "0.8.3"

 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
```

docs/examples/UserGuide/ex_joining.jl

Lines changed: 2 additions & 11 deletions

```diff
@@ -26,21 +26,11 @@
 # ## Examples
 # Examples below will cover how to join tables with different schemas in different databases,
 # and how to write queries on tables and then join them together, and how to do this by leveraging views.
-# <!--
+
 using TidierDB
 db = connect(duckdb())
 mtcars = dt(db, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv")
-# -->

-# ## Setup
-# ```julia
-# using TidierDB
-# db = connect(duckdb(), "md:")
-#
-# mtcars = dt(db, "my_db.mtcars")
-# mt2 = dt(db, "ducks_db.mt2")
-# ```
-#
 # ## Wrangle tables and self join
 query = @chain mtcars begin
     @group_by cyl
@@ -70,6 +60,7 @@ end
 # To connect to a table in a different schema, prefix it with a dot. For example, "schema_name.table_name".
 # In this query, we are also filtering out cars that contain "M" in the name from the `mt2` table before joining.
 # ```julia
+# mt2 = dt(db, "ducks_db.mt2")
 # other_db = @chain dt(db, "ducks_db.mt2") @filter(!str_detect(car, "M"))
 # @chain mtcars begin
 #     @left_join(t(other_db), model == car)
```
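For readers outside Julia, the filter-then-join pattern in the commented block above can be sketched with Python's stdlib `sqlite3`; the table contents here are invented for illustration, and only the shape of the query (filter the right-hand table in a CTE, then LEFT JOIN) mirrors the TidierDB chain:

```python
import sqlite3

# In-memory stand-ins for the two tables joined in the doc example.
con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE mtcars (model TEXT, cyl INTEGER)")
con.execute("CREATE TABLE mt2 (car TEXT, hp INTEGER)")
con.executemany("INSERT INTO mtcars VALUES (?, ?)",
                [("Mazda RX4", 6), ("Valiant", 6)])
con.executemany("INSERT INTO mt2 VALUES (?, ?)",
                [("Mazda RX4", 110), ("Valiant", 105)])

# Filter the right-hand table first (drop cars containing "M"),
# then LEFT JOIN on model == car, as the TidierDB chain does.
rows = con.execute("""
    WITH other_db AS (SELECT * FROM mt2 WHERE car NOT LIKE '%M%')
    SELECT m.model, o.hp
    FROM mtcars AS m
    LEFT JOIN other_db AS o ON m.model = o.car
    ORDER BY m.model
""").fetchall()
```

Rows filtered out of `other_db` surface as `NULL` on the left-join side, just as in the TidierDB example.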

docs/examples/UserGuide/file_reading.jl

Lines changed: 3 additions & 1 deletion

```diff
@@ -9,9 +9,11 @@
 # - S3 buckets
 # - iceberg and delta - require additional args `delta` or `iceberg` to be set to `true`
 # - Google Sheets (first run `connect(db, :gsheets)`)
+# - .sas7bdat, .xpt, .sav, .zsav, .por, .dta : `dt(db, "any/file/path/to.sav")`
 #

-# `dt` allso supports directly using any DuckDB file reading function. This allows for easily reading in compressed files
+# `dt` also supports directly using any DuckDB file reading function. This enables easily reading in compressed files.
+
 # When reading in a compressed path, adding an `alias` is recommended.
 # - `dt(db, "read_csv('/Volumes/Untitled/phd_*_genlab.txt', ignore_errors=true)", alias = "genlab")`
```
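Per the `db_table` change elsewhere in this commit, the statistical-file paths listed above are routed through DuckDB's community `read_stat` extension by dispatching on file extension. A Python sketch of that dispatch, with hypothetical helper names (only the extension list and the `read_stat('…')` wrapping come from the commit):

```python
# Extensions routed to the read_stat community extension.
STAT_EXTENSIONS = (".sas7bdat", ".xpt", ".sav", ".zsav", ".por", ".dta")

def to_table_function(path: str) -> str:
    # Statistical-software files are wrapped in DuckDB's read_stat
    # table function; any other path passes through unchanged.
    if any(path.endswith(ext) for ext in STAT_EXTENSIONS):
        return f"read_stat('{path}')"
    return path
```

The wrapped string is then handed to DuckDB like any other table-producing function call.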

docs/examples/UserGuide/misc_tips.jl

Lines changed: 3 additions & 3 deletions

```diff
@@ -13,8 +13,8 @@ dfv = dt(db, df, "dfv");

 # ## DuckDB's SUMMARIZE
 # DuckDB has a feature to summarize tables that gives information about the table, such as mean, std, q25, q75, etc.
-# To use this feature with TidierDB, simply call an empty `@summarize`.
-@chain dfv @summarize() @collect
+# To use this feature with TidierDB, simply call `@summary` on any table or file before querying it.
+@chain dfv @summary() @collect

 # ## show_query/collect
 # If you find yourself frequently showing a query while collecting, you can define the following function
@@ -25,7 +25,7 @@ sqc(qry) = @chain qry begin

 # Call this function at the end of a chain, similar to the `@show_query` or `@collect` macros
 # _printed query is not seen here as it prints to the REPL_
-@chain dfv @summarize() sqc()
+@chain dfv @summary() sqc()

 # ## Color Printing
 # Queries print with some code words in color to the REPL. To turn off this feature, run one of the following.
```
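For intuition, the per-column statistics the new `@summary` macro collects (max, min, quartiles, avg, std, count, unique) can be sketched in plain Python. This is an illustration of the concept only, not TidierDB's or DuckDB's implementation:

```python
import statistics

def summary_stats(values):
    # Per-column statistics in the spirit of DuckDB's SUMMARIZE:
    # statistics.quantiles with n=4 yields the three quartiles.
    q1, q2, q3 = statistics.quantiles(values, n=4)
    return {
        "min": min(values), "max": max(values),
        "q1": q1, "q2": q2, "q3": q3,
        "avg": statistics.mean(values),
        "std": statistics.stdev(values),
        "count": len(values),
        "unique": len(set(values)),
    }
```

In TidierDB these statistics are computed inside the database rather than in client memory.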

docs/mkdocs.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -118,7 +118,7 @@ nav:
   - "Home": "index.md"
   - "Key Differences from TidierData.jl" : "examples/generated/UserGuide/key_differences.md"
   - "Getting Started" : "examples/generated/UserGuide/getting_started.md"
-  - "File Reading" : "examples/generated/UserGuide/file_reading.md"
+  - "File Reading/Writing" : "examples/generated/UserGuide/file_reading.md"
   - "Joining Tables" : "examples/generated/UserGuide/ex_joining.md"
   - "Aggregate and Window Functions" : "examples/generated/UserGuide/agg_window.md"
   - "Flexible Syntax and UDFs" : "examples/generated/UserGuide/udfs_ex.md"
```

src/TidierDB.jl

Lines changed: 31 additions & 9 deletions

```diff
@@ -19,7 +19,7 @@ using Crayons
 @distinct, @left_join, @right_join, @inner_join, @count, @slice_max, @union,
 @slice_min, @slice_sample, @rename, @relocate, @union_all, @setdiff, @intersect,
 @semi_join, @full_join, @transmute, @anti_join, @head, @unnest_wider, @unnest_longer,
-@separate, @unite, @drop_missing
+@separate, @unite, @drop_missing, @pivot_wider, @summary

 export db_table, set_sql_mode, connect, from_query, update_con,
 clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq,
@@ -72,6 +72,7 @@ include("relocate.jl")
 include("union_intersect_setdiff.jl")
 include("unnest.jl")
 include("sep_unite.jl")
+include("pivots.jl")


 # Unified expr_to_sql function to use right mode
@@ -173,7 +174,11 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de
         # println(table_name2)
         alias == "" ? alias = "gsheet" : alias = alias
         metadata = get_table_metadata(db, table_name2, alias = alias)
-    elseif startswith(table_name, "read")
+    elseif any(endswith(table_name, ext) for ext in [".sas7bdat", ".xpt", ".sav", ".zsav", ".por", ".dta"])
+        DuckDB.query(db, "install read_stat from community; load read_stat")
+        table_name2 = "read_stat('$table_name')"
+        metadata = get_table_metadata(db, table_name2)
+    elseif startswith(table_name, "read")
         table_name2 = "$table_name"
         alias = alias == "" ? "data" : alias
         # println(table_name2)
@@ -291,24 +296,41 @@ function db_table(db, table::Vector{String}, athena_params::Any=nothing)
 end

 function db_table(db, table::DataFrame, alias::String)
+    if any(any(lowercase(string(name)) == word for word in sql_words) for name in names(table))
+        found_words = [word for word in sql_words if any(lowercase(string(name)) == word for name in names(table))]
+        @warn "Column names containing SQL keywords detected: $(join(found_words, ", ")).
+        These may cause issues as they are reserved SQL keywords.
+        Consider renaming the columns before scanning to DuckDB."
+    end
+    # COV_EXCL_STOP
     DuckDB.register_data_frame(db, table, alias)
     metadata = get_table_metadata(db, alias)
     return SQLQuery(from = alias, metadata=metadata, db=db)
 end
 const dt = db_table # COV_EXCL_LINE
-# COV_EXCL_STOP
+
+
+sql_words = ["group", "select", "from", "where", "having", "order", "by", "join", "union", "case", "when", "then", "else", "end", "limit", "right", "left"] # COV_EXCL_LINE

 """
 $docstring_copy_to
 """
-function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::String)
+function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::String; overwrite::Bool=false)
     # Check if the input is a DataFrame
+    rep = overwrite ? "OR REPLACE" : ""
     if isa(df_or_path, DataFrame)
         if current_sql_mode[] == duckdb()
             name_view = name * "view"
             DuckDB.register_data_frame(conn, df_or_path, name_view)
-            DBInterface.execute(conn, "CREATE OR REPLACE TABLE $name AS SELECT * FROM $name_view")
+            DBInterface.execute(conn, "CREATE $rep TABLE $name AS SELECT * FROM $name_view")
             DBInterface.execute(conn, "DROP VIEW $name_view ")
+            # Check for SQL keywords in column names and warn if found
+            if any(any(lowercase(string(name)) == word for word in sql_words) for name in names(df_or_path))
+                found_words = [word for word in sql_words if any(lowercase(string(name)) == word for name in names(df_or_path))]
+                @warn "Column names containing SQL keywords detected: $(join(found_words, ", ")).
+                These may cause issues as they are reserved SQL keywords.
+                Consider renaming the columns before copying to DuckDB."
+            end
         end
     # COV_EXCL_START
     elseif isa(df_or_path, AbstractString)
@@ -323,24 +345,24 @@ function copy_to(conn, df_or_path::Union{DataFrame, AbstractString}, name::Strin
         end
         if occursin(r"\.csv$", df_or_path)
             # Construct and execute a SQL command for loading a CSV file
-            sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';"
+            sql_command = "CREATE $rep TABLE $name AS SELECT * FROM '$df_or_path';"
             DuckDB.execute(conn, sql_command)
         elseif occursin(r"\.parquet$", df_or_path)
             # Construct and execute a SQL command for loading a Parquet file
-            sql_command = "CREATE TABLE $name AS SELECT * FROM '$df_or_path';"
+            sql_command = "CREATE $rep TABLE $name AS SELECT * FROM '$df_or_path';"
             DuckDB.execute(conn, sql_command)
         elseif occursin(r"\.arrow$", df_or_path)
             # Register the Arrow table directly with DuckDB
             arrow_table = Arrow.Table(df_or_path)
             DuckDB.register_table(conn, arrow_table, name)
         elseif occursin(r"\.json$", df_or_path)
             # Load the JSON extension, then read the JSON file into a table
-            sql_command = "CREATE TABLE $name AS SELECT * FROM read_json('$df_or_path');"
+            sql_command = "CREATE $rep TABLE $name AS SELECT * FROM read_json('$df_or_path');"
             DuckDB.execute(conn, "INSTALL json;")
             DuckDB.execute(conn, "LOAD json;")
             DuckDB.execute(conn, sql_command)
         elseif startswith(df_or_path, "read")
-            DuckDB.execute(conn, "CREATE TABLE $name AS SELECT * FROM $df_or_path;")
+            DuckDB.execute(conn, "CREATE $rep TABLE $name AS SELECT * FROM $df_or_path;")
         else
             error("Unsupported file type for: $df_or_path")
         end
```
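Two of the changes above are easy to isolate: the reserved-word check on DataFrame column names, and the `overwrite` kwarg that toggles `OR REPLACE` in the generated DDL. A Python sketch of both (the helper names are mine, not TidierDB's; the keyword list and SQL shape come from the diff):

```python
# Reserved words checked by the diff's sql_words list.
SQL_WORDS = {"group", "select", "from", "where", "having", "order", "by",
             "join", "union", "case", "when", "then", "else", "end",
             "limit", "right", "left"}

def keyword_columns(columns):
    # Column names that collide (case-insensitively) with reserved words;
    # db_table warns about these before registering a DataFrame.
    return [c for c in columns if c.lower() in SQL_WORDS]

def create_table_sql(name, source, overwrite=False):
    # overwrite=False is the new default: "OR REPLACE" is only injected
    # when explicitly requested, so existing tables are left alone.
    rep = "OR REPLACE " if overwrite else ""
    return f"CREATE {rep}TABLE {name} AS SELECT * FROM {source}"
```

With `overwrite=false`, DuckDB would error on a name collision rather than silently replacing the table, which is the behavior change the NEWS entry describes.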

src/TidierDB_macros.jl

Lines changed: 10 additions & 45 deletions

```diff
@@ -10,7 +10,7 @@ macro select(sqlquery, exprs...)
         sq = $(esc(sqlquery))
         sq = sq.post_first ? (t($(esc(sqlquery)))) : sq
         sq.post_first = false;
-        build_cte!(sq)
+        if sq.select != "" build_cte!(sq); sq.select == ""; end
         let columns = parse_tidy_db(exprs_str, sq.metadata)
             columns_str = join(["SELECT ", join([string(column) for column in columns], ", ")])
             sq.select = columns_str
@@ -51,7 +51,6 @@ macro filter(sqlquery, conditions...)
         sq.post_first = false;

         if isa(sq, SQLQuery)
-            # Early handling for non-aggregated context
             if !sq.is_aggregated
                 if sq.post_join
                     combined_conditions = String[]
@@ -61,6 +60,7 @@ macro filter(sqlquery, conditions...)
                         push!(combined_conditions, condition_str)
                     end
                     combined_condition_str = join(combined_conditions, " AND ")
+
                     sq.where = " WHERE " * combined_condition_str
                     # sq.post_join = false
                 else
@@ -72,23 +72,18 @@
                         push!(combined_conditions, condition_str)
                     end
                     combined_condition_str = join(combined_conditions, " AND ")
-                    new_cte = CTE(name=cte_name, select="*", from=(isempty(sq.ctes) ? sq.from : last(sq.ctes).name), where=combined_condition_str)
-                    up_cte_name(sq, cte_name)
-
-                    push!(sq.ctes, new_cte)
-                    sq.from = cte_name
-                    sq.cte_count += 1
-                    # matching_indices = findall(sq.metadata.name .== 2)
-                    # sq.metadata.current_selxn[matching_indices] .= 1
+
+                    sq.where = combined_condition_str
+                    # println(sq.from)
+                    build_cte!(sq)
+                    sq.select = " * "
                 end
             else
                 aggregated_columns = Set{String}()

-                # Check SELECT clause of the main query and all CTEs for aggregation functions
                 if !isempty(sq.select)
                     for part in split(sq.select, ", ")
                         if occursin(" AS ", part)
-                            # Extract the alias used after 'AS' which represents an aggregated column
                             aggregated_column = strip(split(part, " AS ")[2])
                             push!(aggregated_columns, aggregated_column)
                         end
@@ -118,20 +113,9 @@
                     end
                 end
                 if !isempty(non_aggregated_conditions)
                     combined_conditions = join(non_aggregated_conditions, " AND ")
-                    cte_name = "cte_" * string(sq.cte_count + 1)
-                    new_cte = CTE(name=cte_name, select=sq.select, from=(isempty(sq.ctes) ? sq.from : last(sq.ctes).name), groupBy = sq.groupBy, having=sq.having)
-                    up_cte_name(sq, cte_name)
-
-                    push!(sq.ctes, new_cte)
+                    build_cte!(sq)
                     sq.select = "*"
-                    sq.groupBy = ""
-                    sq.having = ""
-
                     sq.where = "WHERE " * join(non_aggregated_conditions, " AND ")
-                    sq.from = cte_name
-                    sq.cte_count += 1
-                    # matching_indices = findall(sq.metadata.name .== 2)
-                    # sq.metadata.current_selxn[matching_indices] .= 1
                 end
             end
@@ -146,13 +130,9 @@ end


 function _colref_to_string(col)
-    # If it's already a bare Symbol, just convert to string
     if isa(col, Symbol)
         return string(col)
-    # If it's an expression using the dot operator, e.g. `sales.id`
     elseif isa(col, Expr) && col.head === :.
-        # col.args[1] = the "parent" (could be another dotted expr)
-        # col.args[2] = the field name (usually a Symbol)
         parent_str = _colref_to_string(col.args[1])
         field_str = string(col.args[2].value)
         return parent_str * "." * field_str
@@ -212,16 +192,6 @@ macro group_by(sqlquery, columns...)

         sq.groupBy = group_clause

-        # if isempty(sq.select) || sq.select == "SELECT "
-        #     sq.select = "SELECT " * join(group_columns, ", ")
-        # else
-        #     for col in group_columns
-        #         if !contains(sq.select, col)
-        #             sq.select = sq.select * ", " * col
-        #         end
-        #     end
-        # end
-
         current_group_columns = group_columns
         summarized_columns = split(sq.select, ", ")[2:end] # Exclude the initial SELECT
         all_columns = unique(vcat(current_group_columns, summarized_columns))
@@ -257,15 +227,9 @@
         cte_select = !isempty(distinct_cols_str) ? " DISTINCT " * distinct_cols_str : " DISTINCT *"
         cte_select *= " FROM " * sq.from

-        # Create the CTE instance
         cte = CTE(name=cte_name, select=cte_select)
-        # Add the CTE to the SQLQuery's CTEs vector
         push!(sq.ctes, cte)
-
-        # Adjust the main query to select from the newly created CTE
         sq.from = cte_name
-
-        # Reset sq.select to ensure the final SELECT * operates correctly
         sq.select = "*"
         sq

@@ -278,7 +242,7 @@ $docstring_count
 macro count(sqlquery, group_by_columns...)
     # Set default sort expression to true.
     sort_expr = :(true)
-    # Check if the last argument is a keyword assignment for sort.
+
     if length(group_by_columns) > 0 &&
        isa(group_by_columns[end], Expr) &&
        group_by_columns[end].head == :(=) &&
@@ -426,6 +390,7 @@ macro show_query(sqlquery)
         formatted_query = replace(formatted_query, " OUTER JOIN " => "\n\tOUTER JOIN ")
         formatted_query = replace(formatted_query, " ASOF " => "\n\tASOF ")
         formatted_query = replace(formatted_query, " LIMIT " => "\n\tLIMIT ")
+        formatted_query = replace(formatted_query, " ANY_VALUE" => "\n\tANY_VALUE")

         pattern = r"\b(cte_\w+|WITH|FROM|SELECT|AS|LEFT|JOIN|RIGHT|OUTER|UNION|INNER|ASOF|GROUP\s+BY|CASE|WHEN|THEN|ELSE|END|WHERE|HAVING|ORDER\s+BY|PARTITION|ASC|DESC|INNER)\b"
         # COV_EXCL_START
```
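The recurring refactor in this file replaces hand-rolled CTE bookkeeping with a shared `build_cte!` step that snapshots the query's pending clauses into a numbered CTE and re-points `FROM` at it. A conceptual Python sketch of that consolidation (a simplification I wrote for illustration, not the Julia helper's actual code):

```python
class Query:
    """Toy model of the query state that build_cte! consolidates."""

    def __init__(self, from_):
        self.from_, self.select, self.where = from_, "", ""
        self.ctes, self.cte_count = [], 0

    def build_cte(self):
        # Snapshot the pending clauses into a numbered CTE, then point
        # the outer query at it with a clean slate for later verbs.
        self.cte_count += 1
        name = f"cte_{self.cte_count}"
        where = f" WHERE {self.where}" if self.where else ""
        self.ctes.append(
            f"{name} AS (SELECT {self.select or '*'} FROM {self.from_}{where})"
        )
        self.from_, self.select, self.where = name, "", ""

    def sql(self):
        with_ = f"WITH {', '.join(self.ctes)} " if self.ctes else ""
        return f"{with_}SELECT {self.select or '*'} FROM {self.from_}"
```

Centralizing this in one helper is what lets `@select`, `@filter`, and friends shed their duplicated `cte_name`/`push!`/`cte_count` blocks in the diff above.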
