Skip to content

Commit ad7fc93

Browse files
authored
Merge pull request #36 from TidierOrg/coltypesxl
2 parents 3ec5156 + 8d5dbf6 commit ad7fc93

File tree

8 files changed

+72
-19
lines changed

8 files changed

+72
-19
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "TidierFiles"
22
uuid = "8ae5e7a9-bdd3-4c93-9cc3-9df4d5d947db"
33
authors = ["Daniel Rizk <[email protected]> and contributors"]
4-
version = "0.3.1"
4+
version = "0.3.2"
55

66
[deps]
77
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Currently supported file types:
2525
- `read_dta` and `write_dta` (.dta)
2626
- `read_arrow` and `write_arrow`
2727
- `read_parquet` and `write_parquet`
28+
- `read_json` and `write_json`
2829
- `read_rdata` (.rdata and .rds)
2930
- `read_gsheet` and `write_gsheet` (Google Sheets)
3031
- `read_json` and `write_json`

docs/src/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Currently supported file types:
2020
- `read_sas` and `write_sas` (.sas7bdat and .xpt)
2121
- `read_dta` and `write_dta` (.dta)
2222
- `read_arrow` and `write_arrow`
23+
- `read_json` and `write_json`
2324
- `read_parquet` and `write_parquet`
2425
- `read_rdata` (.rdata and .rds)
2526
- `read_gsheet` and `write_gsheet` (Google Sheets)

src/TidierFiles.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,8 @@ function write_csv(
413413
file::String;
414414
missing_value::String = "",
415415
append::Bool = false,
416+
delim = ',',
417+
decimal = '.',
416418
col_names::Bool = true,
417419
eol::String = "\n",
418420
num_threads::Int = Threads.nthreads())
@@ -423,6 +425,8 @@ function write_csv(
423425
x,
424426
append = append,
425427
header = col_names && !append,
428+
delim = delim,
429+
decimal = decimal,
426430
missingstring = missing_value,
427431
newline = eol,
428432
threaded = num_threads > 1 )

src/docstrings.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ Write a DataFrame to a CSV (comma-separated values) file.
199199
- `x`: The DataFrame to write to the CSV file.
200200
- `file`: The path to the output CSV file.
201201
- `missing_value`: = "": The string to represent missing values in the output file. Default is an empty string.
202+
- `delim`: The delimiter written between fields. Can be a character or a string. Default is `,`.
203+
- `decimal`: The character used as the decimal separator. Only characters are supported. Default is `.`.
202204
- `append`: Whether to append to the file if it already exists. Default is false.
203205
- `col_names`: = true: Whether to write column names as the first line of the file. Default is true.
204206
- `eol`: The end-of-line character to use in the output file. Default is the newline character.

src/xlfiles.jl

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,26 @@ end
2727
# Function to convert a column to the inferred type
2828
function convert_column(col, inferred_type)
2929
if inferred_type == Int
30-
return [x === missing ? missing : isa(x, Int) ? x : tryparse(Int, string(x)) for x in col]
30+
return [x === missing ? missing :
31+
isa(x, Int) ? x :
32+
tryparse(Int, string(x)) for x in col]
3133
elseif inferred_type == Float64
32-
return [x === missing ? missing : isa(x, Float64) ? x : tryparse(Float64, string(x)) for x in col]
34+
return [x === missing ? missing :
35+
isa(x, Float64) ? x :
36+
tryparse(Float64, string(x)) for x in col]
3337
elseif inferred_type == Date
34-
return [x === missing ? missing : isa(x, Date) ? x : tryparse(Date, string(x), dateformat"yyyy-mm-dd") for x in col]
38+
return [x === missing ? missing :
39+
isa(x, Date) ? x :
40+
tryparse(Date, string(x), dateformat"yyyy-mm-dd") for x in col]
41+
elseif inferred_type == String
42+
return [x === missing ? missing : string(x) for x in col]
3543
else
36-
return [x === missing ? missing : convert(String, x) for x in col]
44+
return [x === missing ? missing : convert(inferred_type, x) for x in col]
3745
end
3846
end
3947

4048

49+
4150
"""
4251
$docstring_read_xlsx
4352
"""
@@ -49,9 +58,9 @@ function read_xlsx(
4958
missing_value = "",
5059
trim_ws = true,
5160
skip = 0,
52-
n_max = Inf
61+
n_max = Inf,
62+
col_types = Dict{Any,Any}() # accepts Symbol | String | Int keys, flexible values
5363
)
54-
# Fetch the Excel file (from URL or local path)
5564
xf = if startswith(path, "http://") || startswith(path, "https://")
5665
response = HTTP.get(path)
5766
if response.status != 200
@@ -62,40 +71,62 @@ function read_xlsx(
6271
XLSX.readxlsx(path)
6372
end
6473

65-
# Determine which sheet to read
6674
sheet_to_read = isnothing(sheet) ? first(XLSX.sheetnames(xf)) : sheet
67-
68-
# Read the table data from the specified range or full sheet
6975
table_data = XLSX.gettable(xf[sheet_to_read])
7076
data = DataFrame(table_data)
7177

72-
# Infer and apply column types based on the first 5 rows
78+
# Build a lookup from normalized header -> actual name
79+
name_map = Dict(normalize_name(n) => n for n in names(data))
80+
81+
# Preprocess user-specified overrides:
82+
# - Int key -> positional column
83+
# - Symbol/String key -> match case/whitespace-insensitively
84+
overrides = Dict{Any,Type}()
85+
for (k, v) in col_types
86+
tgt_type = resolve_type(v)
87+
if k isa Integer
88+
1 <= k <= ncol(data) || error("col_types position $(k) is out of bounds (ncol=$(ncol(data)))")
89+
overrides[names(data)[k]] = tgt_type
90+
else
91+
nk = normalize_name(k)
92+
if haskey(name_map, nk)
93+
overrides[name_map[nk]] = tgt_type
94+
else
95+
@warn "col_types key $(k) did not match any column header" available_headers=names(data)
96+
end
97+
end
98+
end
99+
100+
# Infer/apply column types; overrides take precedence
73101
for col in names(data)
74102
col_values = data[!, col]
75-
inferred_type = infer_column_type(col_values)
103+
requested = get(overrides, col, nothing)
104+
inferred_type = isnothing(requested) ? infer_column_type(col_values) : requested
76105
data[!, col] = convert_column(col_values, inferred_type)
77106
end
78107

79-
# Skipping rows
80108
if skip > 0
81109
data = data[(skip+1):end, :]
82110
end
83111

84-
# Limiting the number of rows
85112
if !isinf(n_max)
86113
data = data[1:min(n_max, nrow(data)), :]
87114
end
88115

89-
# Replace missing strings with `missing` if applicable
90116
if !isempty(missing_value)
91-
for missing_value in missing_value
117+
if missing_value isa AbstractVector
118+
for mv in missing_value
119+
for col in names(data)
120+
data[!, col] = replace(data[!, col], mv => missing)
121+
end
122+
end
123+
else
92124
for col in names(data)
93125
data[!, col] = replace(data[!, col], missing_value => missing)
94126
end
95127
end
96128
end
97129

98-
# Trim whitespace if requested
99130
if trim_ws
100131
for col in names(data)
101132
if eltype(data[!, col]) == String
@@ -107,6 +138,19 @@ function read_xlsx(
107138
return data
108139
end
109140

141+
"""
    resolve_type(t)

Translate a user-supplied column-type specification into a Julia type.

- A `Type` is returned unchanged.
- The function `string` maps to `String`.
- The names `string`, `int`, `float` and `date`, given either as a `Symbol` or
  as a string, map to `String`, `Int`, `Float64` and `Date`. Matching ignores
  case and surrounding whitespace, so `:Int`, `"int"` and `" INT "` all work.
- Any other value is returned unchanged, leaving it to the caller to reject it.
"""
function resolve_type(t)
    t isa Type && return t
    t === string && return String
    if t isa Union{Symbol,AbstractString}
        # Normalize so that symbols and strings share a single lookup path.
        key = lowercase(strip(String(t)))
        key == "string" && return String
        key == "int" && return Int
        key == "float" && return Float64
        key == "date" && return Date
    end
    return t
end
149+
150+
# Normalize a column name for matching
151+
# Produce the canonical form of a column name used for header matching:
# convert to a `String`, drop leading/trailing whitespace, then lowercase.
function normalize_name(x)
    trimmed = strip(String(x))
    return lowercase(trimmed)
end
152+
153+
110154
"""
111155
$docstring_write_xlsx
112156
"""

test/runtests.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ doctest(TidierFiles)
3232

3333
@test roundTripDataFrame(df_barley;JSONObjectVector=false)
3434

35-
df_budget = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budget.json")
35+
# df_budget = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budget.json")
3636

37-
@test roundTripDataFrame(df_budget)
37+
# @test roundTripDataFrame(df_budget)
3838

3939
df_budgets = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budgets.json")
4040

testdf.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"year":"1565","wheat":41.0,"wages":5.0},{"year":"1570","wheat":45.0,"wages":5.05},{"year":"1575","wheat":42.0,"wages":5.08},{"year":"1580","wheat":49.0,"wages":5.12},{"year":"1585","wheat":41.5,"wages":5.15},{"year":"1590","wheat":47.0,"wages":5.25},{"year":"1595","wheat":64.0,"wages":5.54},{"year":"1600","wheat":27.0,"wages":5.61},{"year":"1605","wheat":33.0,"wages":5.69},{"year":"1610","wheat":32.0,"wages":5.78},{"year":"1615","wheat":33.0,"wages":5.94},{"year":"1620","wheat":35.0,"wages":6.01},{"year":"1625","wheat":33.0,"wages":6.12},{"year":"1630","wheat":45.0,"wages":6.22},{"year":"1635","wheat":33.0,"wages":6.3},{"year":"1640","wheat":39.0,"wages":6.37},{"year":"1645","wheat":53.0,"wages":6.45},{"year":"1650","wheat":42.0,"wages":6.5},{"year":"1655","wheat":40.5,"wages":6.6},{"year":"1660","wheat":46.5,"wages":6.75},{"year":"1665","wheat":32.0,"wages":6.8},{"year":"1670","wheat":37.0,"wages":6.9},{"year":"1675","wheat":43.0,"wages":7.0},{"year":"1680","wheat":35.0,"wages":7.3},{"year":"1685","wheat":27.0,"wages":7.6},{"year":"1690","wheat":40.0,"wages":8.0},{"year":"1695","wheat":50.0,"wages":8.5},{"year":"1700","wheat":30.0,"wages":9.0},{"year":"1705","wheat":32.0,"wages":10.0},{"year":"1710","wheat":44.0,"wages":11.0},{"year":"1715","wheat":33.0,"wages":11.75},{"year":"1720","wheat":29.0,"wages":12.5},{"year":"1725","wheat":39.0,"wages":13.0},{"year":"1730","wheat":26.0,"wages":13.3},{"year":"1735","wheat":32.0,"wages":13.6},{"year":"1740","wheat":27.0,"wages":14.0},{"year":"1745","wheat":27.5,"wages":14.5},{"year":"1750","wheat":31.0,"wages":15.0},{"year":"1755","wheat":35.5,"wages":15.7},{"year":"1760","wheat":31.0,"wages":16.5},{"year":"1765","wheat":43.0,"wages":17.6},{"year":"1770","wheat":47.0,"wages":18.5},{"year":"1775","wheat":44.0,"wages":19.5},{"year":"1780","wheat":46.0,"wages":21.0},{"year":"1785","wheat":42.0,"wages":23.0},{"year":"1790","wheat":47.5,"wages":25.5},{"year":"1795","wheat":76.0,"wages":27.5},{"year":"1800","wheat":79.0,"wages":
28.5},{"year":"1805","wheat":81.0,"wages":29.5},{"year":"1810","wheat":99.0,"wages":30.0},{"year":"1815","wheat":78.0,"wages":null},{"year":"1820","wheat":54.0,"wages":null}]

0 commit comments

Comments
 (0)