Skip to content

Commit 2d17cb6

Browse files
authored
Reading/writing updates (#22)
* fix reading of types that do not convert to string, rewrite Tables.jl writing to dbf * use strip on strings from dbf * better replace for non-ascii strings * Better error message * bump version
1 parent 6156e9a commit 2d17cb6

File tree

3 files changed

+140
-131
lines changed

3 files changed

+140
-131
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "DBFTables"
22
uuid = "75c7ada1-017a-5fb6-b8c7-2125ff2d6c93"
3-
version = "1.1.0"
3+
version = "1.2.0"
44

55
[deps]
66
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

src/DBFTables.jl

Lines changed: 137 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,35 @@ struct FieldDescriptor
1212
ndec::UInt8
1313
end
1414

15+
"Create FieldDescriptor from a column in a Tables.jl table."
16+
function FieldDescriptor(name::Symbol, data::AbstractVector)
    # Field layout is derived from the DBF type code of the non-missing element type:
    #   'D' -> length 8
    #   only-missing column (`Union{}` element type) -> length 1
    #   'C' -> longest `string(x)` over the non-missing values, capped at 254
    #   'N' -> length 20, one decimal for AbstractFloat element types
    #   'L' -> length 1
    T = Base.nonmissingtype(eltype(data))
    char = dbf_type(T)
    ndec = 0x00
    itr = skipmissing(data)
    if char === 'D'
        len = 0x08
    elseif T === Union{} # data is only missings
        len = 0x01
    elseif char === 'C'
        strlen = T <: AbstractString ? length : (x -> length(string(x)))
        # `maximum` throws on an empty collection, which happens when the column has a
        # non-`Missing` element type but holds no non-missing values (or has no rows).
        # Use the same one-character width as the only-missing case above.
        width = isempty(itr) ? 1 : maximum(strlen, itr)
        if width > 254
            @warn "Due to DBF limitations, strings in field $name will be truncated to 254 characters."
            len = UInt8(254)
        else
            len = UInt8(width)
        end
    elseif char === 'N'
        len = UInt8(20)
        ndec = T <: AbstractFloat ? 0x1 : 0x0
    elseif char === 'L'
        len = 0x1
    else
        error("This shouldn't be reachable. Unknown DBF type code: '$char'.")
    end
    return FieldDescriptor(name, T, char, len, ndec)
end
43+
1544
"DBF header, which also holds all field definitions"
1645
struct Header
1746
version::UInt8
@@ -40,30 +69,103 @@ struct Row <: Tables.AbstractRow
4069
row::Int
4170
end
4271

43-
"Convert DBF data type characters to Julia types"
44-
function typemap(fld::Char, ndec::UInt8)
45-
# https://www.clicketyclick.dk/databases/xbase/format/data_types.html
46-
rt = Nothing
47-
if fld == 'C'
48-
rt = String
49-
elseif fld == 'D'
50-
rt = Date
51-
elseif fld == 'N'
52-
if ndec > 0
53-
rt = Float64
54-
else
55-
rt = Int
56-
end
57-
elseif fld == 'F' || fld == 'O'
58-
rt = Float64
59-
elseif fld == 'I' || fld == '+'
60-
rt = Int
61-
elseif fld == 'L'
62-
rt = Bool
63-
else
64-
throw(ArgumentError("Unknown record type $fld"))
72+
#-----------------------------------------------------------------------# conversions: Julia-to-DBF
73+
# These are the only types DBFTables.jl will use to save data as.
74+
"Get the DBF type code from the Julia type. Assumes `Base.nonmissingtype(T)` is the input."
75+
dbf_type(::Type{T}) where {T<:AbstractString} = 'C'
dbf_type(::Type{T}) where {T<:AbstractFloat} = 'N'
dbf_type(::Type{T}) where {T<:Integer} = 'N'
# `Bool` is an `Integer`, but gets its own logical type code.
dbf_type(::Type{Bool}) = 'L'
dbf_type(::Type{Date}) = 'D'
# Element type of a column that contains nothing but `missing`.
dbf_type(::Type{Union{}}) = 'C'
# Anything else is written through `string(x)` as character data, with a warning.
function dbf_type(::Type{T}) where {T}
    @warn "No DBF type associated with Julia type $T. Data will be saved as `string(x)`."
    return 'C'
end
85+
86+
# Encode `val` for `field`, dispatching on the field's DBF type code and length.
function dbf_value(field::FieldDescriptor, val)
    code = Val(field.dbf_type)
    return dbf_value(code, field.length, val)
end
87+
88+
# String (or any other type that gets mapped to 'C')
#
# Returns exactly `len` ASCII characters, so the value always fills its fixed-width
# slot in the record:
#   1. non-ASCII characters are replaced by one '_' per column of `textwidth` first,
#      because the replacement can both lengthen (wide characters) and shorten
#      (zero-width characters) the text;
#   2. the result is cut to `len` characters;
#   3. the remainder is filled with spaces.
function dbf_value(::Val{'C'}, len::UInt8, x)
    n = Int(len)
    ascii_only = replace(string(x), !isascii => c -> '_'^textwidth(c))
    txt = first(ascii_only, n)
    # Pad by character count rather than display width: `txt` is pure ASCII here, so
    # one character is one byte in the file.
    return txt * ' '^(n - length(txt))
end
# A missing value is stored as an all-blank field.
dbf_value(::Val{'C'}, len::UInt8, ::Missing) = ' '^len
94+
95+
# Bool: a single character, 'T' or 'F'; '?' stands in for a missing value
dbf_value(::Val{'L'}, ::UInt8, ::Missing) = '?'
function dbf_value(::Val{'L'}, ::UInt8, x::Bool)
    return ifelse(x, 'T', 'F')
end
98+
99+
# Integer & AbstractFloat
#
# Numbers are written as text into a 20-character field, so the result must be
# exactly 20 characters long:
#   * values are clamped to what 20 characters can hold (with a warning); for
#     integers the lower bound has one digit less to leave room for the sign;
#   * a float whose shortest representation is longer than 20 characters (e.g.
#     `-1.2345678901234567e-100`) is rounded to fewer significant digits until it
#     fits, rather than overflowing into the next field.
function dbf_value(::Val{'N'}, ::UInt8, x::Union{AbstractFloat, Integer})
    maxval = 99999999999999999999                                    # 20 digits
    minval = x isa Integer ? -9999999999999999999 : -maxval          # sign + 19 digits
    if x > maxval || x < minval
        @warn "Due to DBF limitations, a float will be clamped to fit in 20 characters."
    end
    clamped = clamp(x, minval, maxval)
    str = string(clamped)
    if length(str) > 20
        # Only reachable for floats. Go through Float64 so that rounding to a number
        # of significant digits also shortens the printed representation.
        approx = Float64(clamped)
        for sig in 16:-1:1
            str = string(round(approx; sigdigits = sig))
            length(str) <= 20 && break
        end
    end
    return rpad(str, 20)
end
# A missing value is stored as an all-blank field.
dbf_value(::Val{'N'}, ::UInt8, ::Missing) = ' '^20
106+
107+
# Date: eight characters in `yyyymmdd` form; all blanks for a missing value
dbf_value(::Val{'D'}, ::UInt8, ::Missing) = repeat(' ', 8)
function dbf_value(::Val{'D'}, ::UInt8, x::Date)
    return Dates.format(x, "yyyymmdd")
end
110+
111+
# Catch-all: reached only when no method above matches the type code / value pair.
function dbf_value(::Val, ::UInt8, x)
    return error("This should be unreachable. No known conversion from Julia to DBF: $x.")
end
112+
113+
#-----------------------------------------------------------------------# conversions: DBF-to-Julia
114+
"Get the Julia type from the DBF type code and the decimal count"
115+
julia_type(::Val{'C'}, ndec::UInt8) = String
julia_type(::Val{'D'}, ndec::UInt8) = Date
# 'N' is the only code whose Julia type depends on the decimal count.
julia_type(::Val{'N'}, ndec::UInt8) = ndec > 0 ? Float64 : Int
julia_type(::Val{'F'}, ndec::UInt8) = Float64
julia_type(::Val{'O'}, ndec::UInt8) = Float64
julia_type(::Val{'I'}, ndec::UInt8) = Int32
julia_type(::Val{'+'}, ndec::UInt8) = Int64
julia_type(::Val{'L'}, ndec::UInt8) = Bool
# Any other type code: warn and fall back to `String`.
function julia_type(::Val{T}, ndec::UInt8) where {T}
    # The closing backtick around `String` was missing from this message.
    @warn "Unknown DBF type code '$T'. Data will be loaded as `String`."
    return String
end
127+
128+
129+
# Decode the raw field text `s`, dispatching on the field's Julia type and DBF type code.
function julia_value(o::FieldDescriptor, s::AbstractString)
    code = Val(o.dbf_type)
    return julia_value(o.type, code, s)
end
130+
131+
# Character data: surrounding whitespace is stripped and an all-blank field reads as
# `missing`. The result is converted to `String` so callers never get a `SubString`.
#
# The method accepts any type code, not only 'C': `julia_type` maps unknown codes to
# `String`, and without a matching reader those fields would fail with a `MethodError`.
function julia_value(::Type{String}, ::Val, s::AbstractString)
    trimmed = strip(s)
    return isempty(trimmed) ? missing : String(trimmed)
end
135+
# Dates are stored as `yyyymmdd`; a field made up entirely of whitespace is `missing`.
function julia_value(::Type{Date}, ::Val{'D'}, s::AbstractString)
    if all(isspace, s)
        return missing
    end
    return Date(s, dateformat"yyyymmdd")
end
138+
# Text-encoded numbers: anything `tryparse` rejects (including a blank field) is `missing`.
function julia_value(::Type{Int}, ::Val{'N'}, s::AbstractString)
    parsed = tryparse(Int, s)
    return parsed === nothing ? missing : parsed
end
function julia_value(::Type{Float64}, ::Val{'N'}, s::AbstractString)
    parsed = tryparse(Float64, s)
    return parsed === nothing ? missing : parsed
end
function julia_value(::Type{Float64}, ::Val{'F'}, s::AbstractString)
    parsed = tryparse(Float64, s)
    return parsed === nothing ? missing : parsed
end
141+
# 'O', 'I', and '+' do not use string representations.
#
# 'O': the field holds the raw bytes of a Float64. A field with any other byte count
# cannot be reinterpreted and reads as `missing`. The size is checked explicitly
# instead of catching every exception, so unrelated errors are no longer swallowed.
function julia_value(::Type{Float64}, ::Val{'O'}, s::AbstractString)
    bytes = Vector{UInt8}(s)
    length(bytes) == sizeof(Float64) || return missing
    return only(reinterpret(Float64, bytes))
end
149+
# 'I': the field holds the raw bytes of an Int32. A field with any other byte count
# cannot be reinterpreted and reads as `missing`. The size is checked explicitly
# instead of catching every exception, so unrelated errors are no longer swallowed.
function julia_value(::Type{Int32}, ::Val{'I'}, s::AbstractString)
    bytes = Vector{UInt8}(s)
    length(bytes) == sizeof(Int32) || return missing
    return only(reinterpret(Int32, bytes))
end
156+
# '+': the field holds the raw bytes of an Int64. A field with any other byte count
# cannot be reinterpreted and reads as `missing`. The size is checked explicitly
# instead of catching every exception, so unrelated errors are no longer swallowed.
function julia_value(::Type{Int64}, ::Val{'+'}, s::AbstractString)
    bytes = Vector{UInt8}(s)
    length(bytes) == sizeof(Int64) || return missing
    return only(reinterpret(Int64, bytes))
end
163+
# Logical field: exactly one character. '?' is missing, Y/y/T/t is true, N/n/F/f is
# false; anything else is an error.
function julia_value(::Type{Bool}, ::Val{'L'}, s::AbstractString)
    char = only(s)
    if char === '?'
        return missing
    elseif char in "YyTt"
        return true
    elseif char in "NnFf"
        return false
    end
    return error("Unknown logical entry for dbf type code 'L': '$char'.")
end
68170

69171
"Read a field descriptor from the stream, and create a FieldDescriptor struct"
@@ -75,7 +177,7 @@ function read_dbf_field(io::IO)
75177
field_len = read(io, UInt8)
76178
field_dec = read(io, UInt8)
77179
skip(io, 14) # reserved
78-
jltype = typemap(field_type, field_dec)
180+
jltype = julia_type(Val(field_type), field_dec)
79181
return FieldDescriptor(field_name, jltype, field_type, field_len, field_dec)
80182
end
81183

@@ -175,35 +277,6 @@ end
175277

176278
# Turn `nothing` (e.g. a failed `tryparse`) into `missing`; pass anything else through.
function miss(x)
    return x === nothing ? missing : x
end
177279

178-
"Concert a DBF entry string to a Julia value"
179-
function dbf_value(::Type{Bool}, str::AbstractString)
180-
char = first(str)
181-
if char in "YyTt"
182-
true
183-
elseif char in "NnFf"
184-
false
185-
elseif char == '?'
186-
missing
187-
else
188-
throw(ArgumentError("Unknown logical entry: $(repr(char))"))
189-
end
190-
end
191-
192-
dbf_value(::Type{Date}, str::AbstractString) = all(isspace, str) ? missing : Date(str, dateformat"yyyymmdd")
193-
194-
dbf_value(T::Union{Type{Int},Type{Float64}}, str::AbstractString) = miss(tryparse(T, str))
195-
# String to avoid returning SubString{String}
196-
function dbf_value(::Type{String}, str::AbstractString)
197-
stripped = rstrip(str)
198-
if isempty(stripped)
199-
# return missing rather than ""
200-
return missing
201-
else
202-
return String(stripped)
203-
end
204-
end
205-
dbf_value(::Type{Nothing}, ::AbstractString) = missing
206-
207280
# define get functions using getfield since we overload getproperty
208281
"Access the header of a DBF Table"
209282
function getheader(dbf::Table)
    # `getfield` rather than `dbf.header`: property access on a Table is overloaded.
    return getfield(dbf, :header)
end
@@ -265,8 +338,7 @@ function Base.NamedTuple(row::Row)
265338
ncol = length(fields)
266339
rowidx = getrow(row)
267340
@inbounds record = @view str[:, rowidx]
268-
@inbounds prs =
269-
(fields[col].name => dbf_value(fields[col].type, record[col]) for col = 1:ncol)
341+
@inbounds prs = (fields[col].name => julia_value(fields[col], record[col]) for col = 1:ncol)
270342
return (; prs...)
271343
end
272344

@@ -310,17 +382,17 @@ function Tables.getcolumn(row::Row, name::Symbol)
310382
str = getstrings(dbf)
311383
colidx = get(header.fieldcolumns, name, nothing)
312384
colidx === nothing && throw(ArgumentError("Column not present: $name"))
313-
type = @inbounds getfields(dbf)[colidx].type
385+
field = @inbounds getfields(dbf)[colidx]
314386
rowidx = getrow(row)
315-
return @inbounds dbf_value(type, str[colidx, rowidx])
387+
return @inbounds julia_value(field, str[colidx, rowidx])
316388
end
317389

318390
# Positional column access for a single row: look up the i-th field descriptor and
# decode that row's raw string for it.
function Tables.getcolumn(row::Row, i::Int)
    table = gettable(row)
    strings = getstrings(table)
    descriptor = getfields(table)[i]
    r = getrow(row)
    return @inbounds julia_value(descriptor, strings[i, r])
end
325397

326398
# Tables.jl trait: a DBF `Table` is a table.
function Tables.istable(::Type{Table})
    return true
end
@@ -347,9 +419,9 @@ function Base.getproperty(dbf::Table, name::Symbol)
347419
col = get(header.fieldcolumns, name, nothing)
348420
col === nothing && throw(ArgumentError("Column not present: $name"))
349421
nrow = header.records
350-
@inbounds type = getfields(dbf)[col].type
422+
@inbounds field = getfields(dbf)[col]
351423
str = getstrings(dbf)
352-
@inbounds colarr = [dbf_value(type, str[col, i]) for i = 1:nrow]
424+
@inbounds colarr = [julia_value(field, str[col, i]) for i = 1:nrow]
353425
return colarr
354426
end
355427

@@ -363,7 +435,8 @@ write(path::AbstractString, tbl) = open(io -> write(io, tbl), touch(path), "w")
363435

364436
function write(io::IO, tbl)
365437
dct = Tables.dictcolumntable(tbl)
366-
fields, records = get_field_descriptors(dct)
438+
fields = [FieldDescriptor(k, v) for (k,v) in pairs(getfield(dct, :values))]
439+
records = UInt32(length(first(dct)))
367440
fieldcolumns = Dict{Symbol,Int}(f.name => i for (i,f) in enumerate(fields))
368441
hsize = UInt16(length(fields) * 32 + 32)
369442
rsize = UInt16(sum(x -> x.length, fields)) + 1
@@ -385,80 +458,15 @@ function write(io::IO, tbl)
385458
return out
386459
end
387460

388-
function get_field_descriptors(dct)
389-
fields = FieldDescriptor[]
390-
sch = Tables.schema(dct)
391-
for (name, type) in zip(sch.names, sch.types)
392-
ndec = 0x0
393-
len = 0x0
394-
dbf_type = 'C'
395-
T = Base.nonmissingtype(type)
396-
if T isa Date
397-
dbf_type = 'D'
398-
len = 0x08
399-
elseif T <: AbstractString
400-
# TODO: support memos. Currently strings > 254 bytes will error
401-
len = UInt8(maximum(x -> length(string(x)), dct[name]))
402-
if len > 254
403-
@warn "Strings will be truncated to 254 characters."
404-
len = 254
405-
end
406-
dbf_type = 'C'
407-
elseif type === Float64
408-
dbf_type = 'O'
409-
len = 0x08
410-
ndec = 0x01
411-
elseif T <: AbstractFloat
412-
dbf_type = 'F'
413-
len = UInt8(20)
414-
ndec = 0x01
415-
elseif T <: Bool
416-
dbf_type = 'L'
417-
len = 0x1
418-
elseif T <: Date
419-
dbf_type = 'D'
420-
len = 0x8
421-
elseif T <: Integer
422-
dbf_type = 'N'
423-
len = UInt8(maximum(x -> length(string(x)), dct[name]))
424-
else
425-
@warn "Field $name has no known matching DBF data type for $T. Data will be stored as the DBF character data type ('C')."
426-
len = UInt8(maximum(x -> length(string(x)), dct[name]))
427-
end
428-
push!(fields, FieldDescriptor(name, type, dbf_type, len, ndec))
429-
end
430-
fields, UInt32(length(first(dct)))
431-
end
432-
433461
# Write one record to `io`: the deletion marker followed by every value of `row`
# encoded for its field. Returns the number of bytes written.
function write_record(io::IO, fd::Vector{FieldDescriptor}, row)
    # deletion marker ' '=valid, '*'=deleted
    nbytes = 0 + Base.write(io, ' ')
    for (descriptor, value) in zip(fd, row)
        encoded = dbf_value(descriptor, value)
        nbytes += Base.write(io, encoded)
    end
    return nbytes
end
441469

442-
function _val(field::FieldDescriptor, val)::Union{String, Float64}
443-
char = field.dbf_type
444-
if char == 'L'
445-
ismissing(val) && return "?"
446-
val ? "T" : "F"
447-
elseif ismissing(val)
448-
' ' ^ field.length
449-
elseif char == 'C'
450-
replace(rpad(val, field.length), !isascii => ' ')
451-
elseif char == 'D'
452-
Dates.format(val, "yyyymmdd")
453-
elseif char == 'O'
454-
val # <-- the Float64 return value
455-
elseif char == 'F'
456-
rpad(val, 20)[1:20]
457-
elseif char == 'N'
458-
rpad(val, field.length)
459-
else
460-
error("Unknown DBF datatype $char.")
461-
end
462-
end
470+
463471

464472
end # module

test/runtests.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ row, st = iterate(dbf)
2828
@test roundtrip([(; x=1.0), (;x=missing)])
2929
@test roundtrip([(; x=missing), (; x=missing)])
3030

31-
@test_warn "Data will be stored as the DBF character data type" DBFTables.write(tempname(), [(; x = rand(10))])
31+
@test_warn "No DBF type" DBFTables.write(tempname(), [(; x = rand(1))])
32+
@test_warn "truncated to 254 characters" DBFTables.write(tempname(), [(; x = rand(999))])
3233

3334
# Base.write for DBFTables.Table
3435
file = joinpath(tempdir(), "test.dbf")

0 commit comments

Comments
 (0)