Skip to content

Commit cc071f1

Browse files
committed
add col type for xl
1 parent 3ec5156 commit cc071f1

File tree

2 files changed

+61
-16
lines changed

2 files changed

+61
-16
lines changed

src/xlfiles.jl

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,26 @@ end
2727
"""
    convert_column(col, inferred_type)

Return a new vector holding the elements of `col` converted to `inferred_type`.

`missing` entries are always preserved. For `Int`, `Float64` and `Date`, a cell
that already has the target type is kept as is; any other cell is converted
directly when that is lossless (integer-valued reals for `Int`, `DateTime` for
`Date`) or otherwise parsed from its `string` representation. A cell that cannot
be converted becomes `missing` (never `nothing`), so the resulting column has a
clean `Union{Missing,T}` element type.

For `String`, every non-missing cell is passed through `string`. For any other
type, `convert(inferred_type, x)` is applied and may throw if no conversion
exists.
"""
function convert_column(col, inferred_type)
    if inferred_type == Int
        return [_cell_to_int(x) for x in col]
    elseif inferred_type == Float64
        return [_cell_to_float(x) for x in col]
    elseif inferred_type == Date
        return [_cell_to_date(x) for x in col]
    elseif inferred_type == String
        return [x === missing ? missing : string(x) for x in col]
    else
        return [x === missing ? missing : convert(inferred_type, x) for x in col]
    end
end

# Convert one cell to `Int`, or `missing` when that is not possible.
_cell_to_int(::Missing) = missing
_cell_to_int(x::Int) = x
function _cell_to_int(x::Real)
    # Only integer-valued numbers that fit in `Int` convert losslessly
    # (e.g. 3.0 -> 3); everything else (3.5, NaN, 1e30) is reported as missing.
    if isinteger(x) && typemin(Int) <= x <= typemax(Int)
        return convert(Int, x)
    end
    return missing
end
# `tryparse` signals failure with `nothing`; map that to `missing`.
_cell_to_int(x) = something(tryparse(Int, string(x)), missing)

# Convert one cell to `Float64`, or `missing` when that is not possible.
_cell_to_float(::Missing) = missing
_cell_to_float(x::Float64) = x
_cell_to_float(x) = something(tryparse(Float64, string(x)), missing)

# Convert one cell to `Date`, or `missing` when that is not possible.
_cell_to_date(::Missing) = missing
_cell_to_date(x::Date) = x
# A timestamp cell keeps its calendar date; the time of day is dropped.
_cell_to_date(x::DateTime) = Date(x)
_cell_to_date(x) = something(tryparse(Date, string(x), dateformat"yyyy-mm-dd"), missing)
3947

4048

49+
4150
"""
4251
$docstring_read_xlsx
4352
"""
@@ -49,9 +58,9 @@ function read_xlsx(
4958
missing_value = "",
5059
trim_ws = true,
5160
skip = 0,
52-
n_max = Inf
61+
n_max = Inf,
62+
col_types = Dict{Any,Any}() # accepts Symbol | String | Int keys, flexible values
5363
)
54-
# Fetch the Excel file (from URL or local path)
5564
xf = if startswith(path, "http://") || startswith(path, "https://")
5665
response = HTTP.get(path)
5766
if response.status != 200
@@ -62,40 +71,62 @@ function read_xlsx(
6271
XLSX.readxlsx(path)
6372
end
6473

65-
# Determine which sheet to read
6674
sheet_to_read = isnothing(sheet) ? first(XLSX.sheetnames(xf)) : sheet
67-
68-
# Read the table data from the specified range or full sheet
6975
table_data = XLSX.gettable(xf[sheet_to_read])
7076
data = DataFrame(table_data)
7177

72-
# Infer and apply column types based on the first 5 rows
78+
# Build a lookup from normalized header -> actual name
79+
name_map = Dict(normalize_name(n) => n for n in names(data))
80+
81+
# Preprocess user-specified overrides:
82+
# - Int key -> positional column
83+
# - Symbol/String key -> match case/whitespace-insensitively
84+
overrides = Dict{Any,Type}()
85+
for (k, v) in col_types
86+
tgt_type = resolve_type(v)
87+
if k isa Integer
88+
1 <= k <= ncol(data) || error("col_types position $(k) is out of bounds (ncol=$(ncol(data)))")
89+
overrides[names(data)[k]] = tgt_type
90+
else
91+
nk = normalize_name(k)
92+
if haskey(name_map, nk)
93+
overrides[name_map[nk]] = tgt_type
94+
else
95+
@warn "col_types key $(k) did not match any column header" available_headers=names(data)
96+
end
97+
end
98+
end
99+
100+
# Infer/apply column types; overrides take precedence
73101
for col in names(data)
74102
col_values = data[!, col]
75-
inferred_type = infer_column_type(col_values)
103+
requested = get(overrides, col, nothing)
104+
inferred_type = isnothing(requested) ? infer_column_type(col_values) : requested
76105
data[!, col] = convert_column(col_values, inferred_type)
77106
end
78107

79-
# Skipping rows
80108
if skip > 0
81109
data = data[(skip+1):end, :]
82110
end
83111

84-
# Limiting the number of rows
85112
if !isinf(n_max)
86113
data = data[1:min(n_max, nrow(data)), :]
87114
end
88115

89-
# Replace missing strings with `missing` if applicable
90116
if !isempty(missing_value)
91-
for missing_value in missing_value
117+
if missing_value isa AbstractVector
118+
for mv in missing_value
119+
for col in names(data)
120+
data[!, col] = replace(data[!, col], mv => missing)
121+
end
122+
end
123+
else
92124
for col in names(data)
93125
data[!, col] = replace(data[!, col], missing_value => missing)
94126
end
95127
end
96128
end
97129

98-
# Trim whitespace if requested
99130
if trim_ws
100131
for col in names(data)
101132
if eltype(data[!, col]) == String
@@ -107,6 +138,19 @@ function read_xlsx(
107138
return data
108139
end
109140

141+
"""
    resolve_type(t)

Translate a user-supplied column type specification into a concrete `Type`.

Accepted forms:
- a `Type` (e.g. `Int`, `Float64`, `Date`, `String`), returned unchanged;
- the function `string`, meaning `String`;
- a `Symbol` or string naming one of `string`, `int`, `float`, `date`
  (matched ignoring case and surrounding whitespace).

Throws an `ArgumentError` for anything else, so that an invalid specification
is reported at the point where it is resolved rather than surfacing later as
an unrelated conversion error.
"""
resolve_type(t::Type) = t
resolve_type(::typeof(string)) = String
function resolve_type(t::Union{Symbol,AbstractString})
    key = lowercase(strip(String(t)))
    key == "string" && return String
    key == "int" && return Int
    key == "float" && return Float64
    key == "date" && return Date
    throw(ArgumentError(
        "unrecognised column type $(repr(t)); expected a Type or one of " *
        ":string, :int, :float, :date",
    ))
end
resolve_type(t) = throw(ArgumentError(
    "unrecognised column type $(repr(t)); expected a Type or one of " *
    ":string, :int, :float, :date",
))
149+
150+
# Canonical form of a column name used for header matching: the name is turned
# into a `String`, surrounding whitespace is removed, and the result is
# lower-cased, so `"  Year "` and `:year` compare equal.
function normalize_name(x)
    raw = String(x)
    trimmed = strip(raw)
    return lowercase(trimmed)
end
152+
153+
110154
"""
111155
$docstring_write_xlsx
112156
"""

testdf.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"year":"1565","wheat":41.0,"wages":5.0},{"year":"1570","wheat":45.0,"wages":5.05},{"year":"1575","wheat":42.0,"wages":5.08},{"year":"1580","wheat":49.0,"wages":5.12},{"year":"1585","wheat":41.5,"wages":5.15},{"year":"1590","wheat":47.0,"wages":5.25},{"year":"1595","wheat":64.0,"wages":5.54},{"year":"1600","wheat":27.0,"wages":5.61},{"year":"1605","wheat":33.0,"wages":5.69},{"year":"1610","wheat":32.0,"wages":5.78},{"year":"1615","wheat":33.0,"wages":5.94},{"year":"1620","wheat":35.0,"wages":6.01},{"year":"1625","wheat":33.0,"wages":6.12},{"year":"1630","wheat":45.0,"wages":6.22},{"year":"1635","wheat":33.0,"wages":6.3},{"year":"1640","wheat":39.0,"wages":6.37},{"year":"1645","wheat":53.0,"wages":6.45},{"year":"1650","wheat":42.0,"wages":6.5},{"year":"1655","wheat":40.5,"wages":6.6},{"year":"1660","wheat":46.5,"wages":6.75},{"year":"1665","wheat":32.0,"wages":6.8},{"year":"1670","wheat":37.0,"wages":6.9},{"year":"1675","wheat":43.0,"wages":7.0},{"year":"1680","wheat":35.0,"wages":7.3},{"year":"1685","wheat":27.0,"wages":7.6},{"year":"1690","wheat":40.0,"wages":8.0},{"year":"1695","wheat":50.0,"wages":8.5},{"year":"1700","wheat":30.0,"wages":9.0},{"year":"1705","wheat":32.0,"wages":10.0},{"year":"1710","wheat":44.0,"wages":11.0},{"year":"1715","wheat":33.0,"wages":11.75},{"year":"1720","wheat":29.0,"wages":12.5},{"year":"1725","wheat":39.0,"wages":13.0},{"year":"1730","wheat":26.0,"wages":13.3},{"year":"1735","wheat":32.0,"wages":13.6},{"year":"1740","wheat":27.0,"wages":14.0},{"year":"1745","wheat":27.5,"wages":14.5},{"year":"1750","wheat":31.0,"wages":15.0},{"year":"1755","wheat":35.5,"wages":15.7},{"year":"1760","wheat":31.0,"wages":16.5},{"year":"1765","wheat":43.0,"wages":17.6},{"year":"1770","wheat":47.0,"wages":18.5},{"year":"1775","wheat":44.0,"wages":19.5},{"year":"1780","wheat":46.0,"wages":21.0},{"year":"1785","wheat":42.0,"wages":23.0},{"year":"1790","wheat":47.5,"wages":25.5},{"year":"1795","wheat":76.0,"wages":27.5},{"year":"1800","wheat":79.0,"wages":
28.5},{"year":"1805","wheat":81.0,"wages":29.5},{"year":"1810","wheat":99.0,"wages":30.0},{"year":"1815","wheat":78.0,"wages":null},{"year":"1820","wheat":54.0,"wages":null}]

0 commit comments

Comments
 (0)