Merge pull request #36 from TidierOrg/coltypesxl

drizk1 · web-flow · commit ad7fc93a8321 · 2025-08-11T17:00:25.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierFiles"
 uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db"
 authors = ["Daniel Rizk <rizk.daniel.12@gmail.com> and contributors"]
-version = "0.3.1"
+version = "0.3.2"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ Currently supported file types:
 - `read_dta` and `write_dta` (.dta) 
 - `read_arrow` and `write_arrow`
 - `read_parquet` and `write_parquet`
+- `read_json` and `write_json`
 - `read_rdata` (.rdata and .rds)
 - `read_gsheet` and `write_gsheet` (Google Sheets)
 - `read_json` and `write_json`
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -20,6 +20,7 @@ Currently supported file types:
 - `read_sas` and `write_sas` (.sas7bdat and .xpt)
 - `read_dta` and `write_dta` (.dta) 
 - `read_arrow` and `write_arrow`
+- `read_json` and `write_json`
 - `read_parquet` and `write_parquet`
 - `read_rdata` (.rdata and .rds)
 - `read_gsheet` and `write_gsheet`(Google Sheets)
diff --git a/src/TidierFiles.jl b/src/TidierFiles.jl
@@ -413,6 +413,8 @@ function write_csv(
     file::String;
     missing_value::String = "",
     append::Bool = false,
+    delim = ',',
+    decimal = '.',
     col_names::Bool = true,
     eol::String = "\n",
     num_threads::Int = Threads.nthreads())
@@ -423,6 +425,8 @@ function write_csv(
         x,
         append = append,
         header = col_names && !append,
+        delim = delim,
+        decimal = decimal,
         missingstring = missing_value,
         newline = eol,
         threaded = num_threads > 1    )
diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -199,6 +199,8 @@ Write a DataFrame to a CSV (comma-separated values) file.
 - `x`: The DataFrame to write to the CSV file.
 - `file`: The path to the output CSV file.
 - `missing_value`: = "": The string to represent missing values in the output file. Default is an empty string.
+- `delim`: delimiter for file. can be a character or string. default `,` 
+- `decimal`: decimal for file. supports characters only. default `.` 
 - `append`: Whether to append to the file if it already exists. Default is false.
 - `col_names`: = true: Whether to write column names as the first line of the file. Default is true.
 - `eol`: The end-of-line character to use in the output file. Default is the newline character.
diff --git a/src/xlfiles.jl b/src/xlfiles.jl
@@ -27,17 +27,26 @@ end
 # Function to convert a column to the inferred type
 function convert_column(col, inferred_type)
     if inferred_type == Int
-        return [x === missing ? missing : isa(x, Int) ? x : tryparse(Int, string(x)) for x in col]
+        return [x === missing ? missing :
+                isa(x, Int) ? x :
+                tryparse(Int, string(x)) for x in col]
     elseif inferred_type == Float64
-        return [x === missing ? missing : isa(x, Float64) ? x : tryparse(Float64, string(x)) for x in col]
+        return [x === missing ? missing :
+                isa(x, Float64) ? x :
+                tryparse(Float64, string(x)) for x in col]
     elseif inferred_type == Date
-        return [x === missing ? missing : isa(x, Date) ? x : tryparse(Date, string(x), dateformat"yyyy-mm-dd") for x in col]
+        return [x === missing ? missing :
+                isa(x, Date) ? x :
+                tryparse(Date, string(x), dateformat"yyyy-mm-dd") for x in col]
+    elseif inferred_type == String
+        return [x === missing ? missing : string(x) for x in col]
     else
-        return [x === missing ? missing : convert(String, x) for x in col]
+        return [x === missing ? missing : convert(inferred_type, x) for x in col]
     end
 end
 
 
+
 """
 $docstring_read_xlsx
 """
@@ -49,9 +58,9 @@ function read_xlsx(
     missing_value = "",
     trim_ws = true,
     skip = 0,
-    n_max = Inf
+    n_max = Inf,
+    col_types = Dict{Any,Any}()  # accepts Symbol | String | Int keys, flexible values
 )
-    # Fetch the Excel file (from URL or local path)
     xf = if startswith(path, "http://") || startswith(path, "https://")
         response = HTTP.get(path)
         if response.status != 200
@@ -62,40 +71,62 @@ function read_xlsx(
         XLSX.readxlsx(path)
     end
 
-    # Determine which sheet to read
     sheet_to_read = isnothing(sheet) ? first(XLSX.sheetnames(xf)) : sheet
-
-    # Read the table data from the specified range or full sheet
     table_data = XLSX.gettable(xf[sheet_to_read])
     data = DataFrame(table_data)
 
-    # Infer and apply column types based on the first 5 rows
+    # Build a lookup from normalized header -> actual name
+    name_map = Dict(normalize_name(n) => n for n in names(data))
+
+    # Preprocess user-specified overrides:
+    # - Int key -> positional column
+    # - Symbol/String key -> match case/whitespace-insensitively
+    overrides = Dict{Any,Type}()
+    for (k, v) in col_types
+        tgt_type = resolve_type(v)
+        if k isa Integer
+            1 <= k <= ncol(data) || error("col_types position $(k) is out of bounds (ncol=$(ncol(data)))")
+            overrides[names(data)[k]] = tgt_type
+        else
+            nk = normalize_name(k)
+            if haskey(name_map, nk)
+                overrides[name_map[nk]] = tgt_type
+            else
+                @warn "col_types key $(k) did not match any column header" available_headers=names(data)
+            end
+        end
+    end
+
+    # Infer/apply column types; overrides take precedence
     for col in names(data)
         col_values = data[!, col]
-        inferred_type = infer_column_type(col_values)
+        requested = get(overrides, col, nothing)
+        inferred_type = isnothing(requested) ? infer_column_type(col_values) : requested
         data[!, col] = convert_column(col_values, inferred_type)
     end
 
-    # Skipping rows
     if skip > 0
         data = data[(skip+1):end, :]
     end
 
-    # Limiting the number of rows
     if !isinf(n_max)
         data = data[1:min(n_max, nrow(data)), :]
     end
 
-    # Replace missing strings with `missing` if applicable
     if !isempty(missing_value)
-        for missing_value in missing_value
+        if missing_value isa AbstractVector
+            for mv in missing_value
+                for col in names(data)
+                    data[!, col] = replace(data[!, col], mv => missing)
+                end
+            end
+        else
             for col in names(data)
                 data[!, col] = replace(data[!, col], missing_value => missing)
             end
         end
     end
 
-    # Trim whitespace if requested
     if trim_ws
         for col in names(data)
             if eltype(data[!, col]) == String
@@ -107,6 +138,19 @@ function read_xlsx(
     return data
 end
 
+resolve_type(t) = t isa Type ? t :
+                  t === string ? String :
+                  t === Symbol("string") ? String :
+                  t === :string ? String :
+                  t === :int ? Int :
+                  t === :float ? Float64 :
+                  t === :date ? Date :
+                  t
+
+# Normalize a column name for matching
+normalize_name(x) = lowercase(strip(String(x)))
+
+
 """
 $docstring_write_xlsx
 """
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -32,9 +32,9 @@ doctest(TidierFiles)
 
     @test roundTripDataFrame(df_barley;JSONObjectVector=false)
 
-    df_budget = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budget.json")
+   # df_budget = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budget.json")
 
-    @test roundTripDataFrame(df_budget)
+   # @test roundTripDataFrame(df_budget)
 
     df_budgets = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budgets.json")
 
diff --git a/testdf.json b/testdf.json
@@ -0,0 +1 @@
+[{"year":"1565","wheat":41.0,"wages":5.0},{"year":"1570","wheat":45.0,"wages":5.05},{"year":"1575","wheat":42.0,"wages":5.08},{"year":"1580","wheat":49.0,"wages":5.12},{"year":"1585","wheat":41.5,"wages":5.15},{"year":"1590","wheat":47.0,"wages":5.25},{"year":"1595","wheat":64.0,"wages":5.54},{"year":"1600","wheat":27.0,"wages":5.61},{"year":"1605","wheat":33.0,"wages":5.69},{"year":"1610","wheat":32.0,"wages":5.78},{"year":"1615","wheat":33.0,"wages":5.94},{"year":"1620","wheat":35.0,"wages":6.01},{"year":"1625","wheat":33.0,"wages":6.12},{"year":"1630","wheat":45.0,"wages":6.22},{"year":"1635","wheat":33.0,"wages":6.3},{"year":"1640","wheat":39.0,"wages":6.37},{"year":"1645","wheat":53.0,"wages":6.45},{"year":"1650","wheat":42.0,"wages":6.5},{"year":"1655","wheat":40.5,"wages":6.6},{"year":"1660","wheat":46.5,"wages":6.75},{"year":"1665","wheat":32.0,"wages":6.8},{"year":"1670","wheat":37.0,"wages":6.9},{"year":"1675","wheat":43.0,"wages":7.0},{"year":"1680","wheat":35.0,"wages":7.3},{"year":"1685","wheat":27.0,"wages":7.6},{"year":"1690","wheat":40.0,"wages":8.0},{"year":"1695","wheat":50.0,"wages":8.5},{"year":"1700","wheat":30.0,"wages":9.0},{"year":"1705","wheat":32.0,"wages":10.0},{"year":"1710","wheat":44.0,"wages":11.0},{"year":"1715","wheat":33.0,"wages":11.75},{"year":"1720","wheat":29.0,"wages":12.5},{"year":"1725","wheat":39.0,"wages":13.0},{"year":"1730","wheat":26.0,"wages":13.3},{"year":"1735","wheat":32.0,"wages":13.6},{"year":"1740","wheat":27.0,"wages":14.0},{"year":"1745","wheat":27.5,"wages":14.5},{"year":"1750","wheat":31.0,"wages":15.0},{"year":"1755","wheat":35.5,"wages":15.7},{"year":"1760","wheat":31.0,"wages":16.5},{"year":"1765","wheat":43.0,"wages":17.6},{"year":"1770","wheat":47.0,"wages":18.5},{"year":"1775","wheat":44.0,"wages":19.5},{"year":"1780","wheat":46.0,"wages":21.0},{"year":"1785","wheat":42.0,"wages":23.0},{"year":"1790","wheat":47.5,"wages":25.5},{"year":"1795","wheat":76.0,"wages":27.5},{"year":"1800","wheat":79.0,"wages":28.5},{"year":"1805","wheat":81.0,"wages":29.5},{"year":"1810","wheat":99.0,"wages":30.0},{"year":"1815","wheat":78.0,"wages":null},{"year":"1820","wheat":54.0,"wages":null}]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+[{"year":"1565","wheat":41.0,"wages":5.0},{"year":"1570","wheat":45.0,"wages":5.05},{"year":"1575","wheat":42.0,"wages":5.08},{"year":"1580","wheat":49.0,"wages":5.12},{"year":"1585","wheat":41.5,"wages":5.15},{"year":"1590","wheat":47.0,"wages":5.25},{"year":"1595","wheat":64.0,"wages":5.54},{"year":"1600","wheat":27.0,"wages":5.61},{"year":"1605","wheat":33.0,"wages":5.69},{"year":"1610","wheat":32.0,"wages":5.78},{"year":"1615","wheat":33.0,"wages":5.94},{"year":"1620","wheat":35.0,"wages":6.01},{"year":"1625","wheat":33.0,"wages":6.12},{"year":"1630","wheat":45.0,"wages":6.22},{"year":"1635","wheat":33.0,"wages":6.3},{"year":"1640","wheat":39.0,"wages":6.37},{"year":"1645","wheat":53.0,"wages":6.45},{"year":"1650","wheat":42.0,"wages":6.5},{"year":"1655","wheat":40.5,"wages":6.6},{"year":"1660","wheat":46.5,"wages":6.75},{"year":"1665","wheat":32.0,"wages":6.8},{"year":"1670","wheat":37.0,"wages":6.9},{"year":"1675","wheat":43.0,"wages":7.0},{"year":"1680","wheat":35.0,"wages":7.3},{"year":"1685","wheat":27.0,"wages":7.6},{"year":"1690","wheat":40.0,"wages":8.0},{"year":"1695","wheat":50.0,"wages":8.5},{"year":"1700","wheat":30.0,"wages":9.0},{"year":"1705","wheat":32.0,"wages":10.0},{"year":"1710","wheat":44.0,"wages":11.0},{"year":"1715","wheat":33.0,"wages":11.75},{"year":"1720","wheat":29.0,"wages":12.5},{"year":"1725","wheat":39.0,"wages":13.0},{"year":"1730","wheat":26.0,"wages":13.3},{"year":"1735","wheat":32.0,"wages":13.6},{"year":"1740","wheat":27.0,"wages":14.0},{"year":"1745","wheat":27.5,"wages":14.5},{"year":"1750","wheat":31.0,"wages":15.0},{"year":"1755","wheat":35.5,"wages":15.7},{"year":"1760","wheat":31.0,"wages":16.5},{"year":"1765","wheat":43.0,"wages":17.6},{"year":"1770","wheat":47.0,"wages":18.5},{"year":"1775","wheat":44.0,"wages":19.5},{"year":"1780","wheat":46.0,"wages":21.0},{"year":"1785","wheat":42.0,"wages":23.0},{"year":"1790","wheat":47.5,"wages":25.5},{"year":"1795","wheat":76.0,"wages":27.5},{"year":"1800","wheat":79.0,"wages":28.5},{"year":"1805","wheat":81.0,"wages":29.5},{"year":"1810","wheat":99.0,"wages":30.0},{"year":"1815","wheat":78.0,"wages":null},{"year":"1820","wheat":54.0,"wages":null}]