Skip to content

Commit 6156e9a

Browse files
authored
Writing (#21)
* wip write method * working write and test * better test * delete test file * simplify test * fix reserved bytes * bump version * generic write working for test case * wip * test * add 'O' dbf type * better tests, add support for Dates because why not * handle Integer and AbstractFloat separately for writing * remove Julia 1.0 from ci * change Julia lower bound to LTS * add test * cleanup * convert unicode to spaces when writing character data to DBF * remove comment * removed unused ifelse, create dictcolumntable in write function
1 parent 6b4ef1a commit 6156e9a

File tree

5 files changed

+201
-19
lines changed

5 files changed

+201
-19
lines changed

.github/workflows/ci.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ jobs:
1515
fail-fast: false
1616
matrix:
1717
version:
18-
- '1.0'
1918
- '1' # automatically expands to the latest stable 1.x release of Julia
2019
- 'nightly'
2120
os:

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@
22
*.jl.*.cov
33
*.jl.mem
44
Manifest.toml
5+
*.dbf
6+
*.DS_Store

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
name = "DBFTables"
22
uuid = "75c7ada1-017a-5fb6-b8c7-2125ff2d6c93"
3-
version = "1.0.0"
3+
version = "1.1.0"
44

55
[deps]
6-
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
6+
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
77
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
88
WeakRefStrings = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
99

1010
[compat]
1111
Tables = "0.2, 1"
1212
WeakRefStrings = "0.6, 1"
13-
julia = "1.0"
13+
julia = "1.6"
1414

1515
[extras]
1616
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

src/DBFTables.jl

Lines changed: 163 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
module DBFTables
22

3-
import Printf, Tables, WeakRefStrings
3+
import Tables, WeakRefStrings
4+
using Dates
45

56
"Field/column descriptor, part of the Header"
67
struct FieldDescriptor
    # Column name
    name::Symbol
    # Julia type associated with the column
    type::Type
    # One-character DBF type code (e.g. 'C', 'D', 'N'), written at byte 11 of the descriptor
    dbf_type::Char
    # Field length, written at byte 16 of the descriptor
    length::UInt8
    # Decimal count, written at byte 17 of the descriptor
    ndec::UInt8
end
1214

1315
"DBF header, which also holds all field definitions"
1416
struct Header
1517
version::UInt8
16-
last_update::String
18+
last_update::Date
1719
records::UInt32
1820
hsize::UInt16
1921
rsize::UInt16
@@ -45,7 +47,7 @@ function typemap(fld::Char, ndec::UInt8)
4547
if fld == 'C'
4648
rt = String
4749
elseif fld == 'D'
48-
rt = String
50+
rt = Date
4951
elseif fld == 'N'
5052
if ndec > 0
5153
rt = Float64
@@ -74,16 +76,29 @@ function read_dbf_field(io::IO)
7476
field_dec = read(io, UInt8)
7577
skip(io, 14) # reserved
7678
jltype = typemap(field_type, field_dec)
77-
return FieldDescriptor(field_name, jltype, field_len, field_dec)
79+
return FieldDescriptor(field_name, jltype, field_type, field_len, field_dec)
80+
end
81+
82+
# `n` zero bytes, used to fill the reserved regions of the DBF header.
reserved(n) = zeros(UInt8, n)
83+
84+
# Write the 32-byte field descriptor `fd` to `io` and return the number of bytes written.
#
# Layout (byte offsets): 0-10 name, 11 type code, 12-15 reserved, 16 length,
# 17 decimal count, 18-31 reserved.
function Base.write(io::IO, fd::FieldDescriptor)
    # The name slot is exactly 11 bytes, zero-filled. Build it on bytes rather than
    # with `rpad`, which never truncates and pads by character width: an over-long or
    # non-ASCII name would otherwise spill past byte 10 and corrupt the descriptor.
    # Spaces are mapped to NUL, matching what was previously written for names that fit.
    raw = replace(collect(codeunits(String(fd.name))), UInt8(' ') => 0x00)
    if length(raw) > 11
        @warn "Field name $(fd.name) does not fit in 11 bytes and will be truncated."
    end
    namefield = zeros(UInt8, 11)
    copyto!(namefield, 1, raw, 1, min(length(raw), 11))

    out = 0
    out += Base.write(io, namefield)     # 0-10
    out += Base.write(io, fd.dbf_type)   # 11
    out += Base.write(io, reserved(4))   # 12-15
    out += Base.write(io, fd.length)     # 16
    out += Base.write(io, fd.ndec)       # 17
    out += Base.write(io, reserved(14))  # 18-31
    return out
end
7893
end
7994

8095
"Read a DBF header from a stream"
8196
function Header(io::IO)
8297
ver = read(io, UInt8)
83-
date1 = read(io, UInt8)
84-
date2 = read(io, UInt8)
85-
date3 = read(io, UInt8)
86-
last_update = Printf.@sprintf("%4d%02d%02d", date1 + 1900, date2, date3)
98+
yy = read(io, UInt8)
99+
mm = read(io, UInt8)
100+
dd = read(io, UInt8)
101+
last_update = Date(yy + 1900, mm, dd)
87102
records = read(io, UInt32)
88103
hsize = read(io, UInt16)
89104
rsize = read(io, UInt16)
@@ -130,6 +145,34 @@ function Header(io::IO)
130145
)
131146
end
132147

148+
149+
150+
# ref: https://www.clicketyclick.dk/databases/xbase/format/dbf.html
151+
# Write the header `h`, followed by all of its field descriptors and the 0x0D
# terminator byte, to `io`. Returns the total number of bytes written.
function Base.write(io::IO, h::Header)
    stamp = h.last_update
    nbytes = Base.write(io, h.version)                          # 0
    # the year is stored as an offset from 1900
    nbytes += Base.write(
        io,
        UInt8(year(stamp) - 1900),
        UInt8(month(stamp)),
        UInt8(day(stamp)),
    )                                                           # 1-3
    nbytes += Base.write(io, h.records)                         # 4-7
    nbytes += Base.write(io, h.hsize)                           # 8-9
    nbytes += Base.write(io, h.rsize)                           # 10-11
    nbytes += Base.write(io, reserved(2))                       # 12-13 reserved
    nbytes += Base.write(io, h.incomplete)                      # 14
    nbytes += Base.write(io, h.encrypted)                       # 15
    nbytes += Base.write(io, reserved(12))                      # 16-19, 20-27 reserved
    nbytes += Base.write(io, h.mdx)                             # 28
    nbytes += Base.write(io, h.lang_id)                         # 29
    nbytes += Base.write(io, reserved(2))                       # 30-31 reserved
    for descriptor in h.fields
        nbytes += Base.write(io, descriptor)
    end
    nbytes += Base.write(io, 0x0d)                              # header terminator
    return nbytes
end
174+
175+
133176
# Map `nothing` (e.g. the result of a failed `tryparse`) to `missing`;
# every other value is passed through unchanged.
miss(::Nothing) = missing
miss(x) = x
134177

135178
"Convert a DBF entry string to a Julia value"
@@ -142,10 +185,12 @@ function dbf_value(::Type{Bool}, str::AbstractString)
142185
elseif char == '?'
143186
missing
144187
else
145-
throw(ArgumentError("Unknown logical $char"))
188+
throw(ArgumentError("Unknown logical entry: $(repr(char))"))
146189
end
147190
end
148191

192+
# Parse a `yyyymmdd` date entry; an entry consisting only of whitespace
# (including the empty string) is `missing`.
function dbf_value(::Type{Date}, str::AbstractString)
    all(isspace, str) && return missing
    return Date(str, dateformat"yyyymmdd")
end
193+
149194
# Parse a numeric entry as `T`; an entry that does not parse is `missing`.
function dbf_value(T::Union{Type{Int},Type{Float64}}, str::AbstractString)
    parsed = tryparse(T, str)
    return parsed === nothing ? missing : parsed
end
150195
# String to avoid returning SubString{String}
151196
function dbf_value(::Type{String}, str::AbstractString)
@@ -196,7 +241,7 @@ function Table(path::AbstractString)
196241
end
197242
end
198243

199-
"Collect all the offsets and lenghts from the header to create a StringArray"
244+
"Collect all the offsets and lengths from the header to create a StringArray"
200245
function _create_stringarray(header::Header, data::AbstractVector)
201246
# first make the lengths and offsets for a single record
202247
lengths_record = UInt32.(getfield.(header.fields, :length))
@@ -308,4 +353,112 @@ function Base.getproperty(dbf::Table, name::Symbol)
308353
return colarr
309354
end
310355

356+
357+
# Write an already-loaded DBF table to `io`: its header, its raw record bytes,
# then the 0x1a end-of-file marker. Returns the number of bytes written.
function Base.write(io::IO, dbf::Table)
    header = getfield(dbf, :header)
    data = getfield(dbf, :data)
    return Base.write(io, header, data, 0x1a)
end
358+
# Write the DBF table `dbf` to the file at `path`, creating the file if needed.
function Base.write(path::AbstractString, dbf::Table)
    return open(touch(path), "w") do io
        Base.write(io, dbf)
    end
end
359+
360+
361+
"Generic .dbf writer for the Tables.jl interface."
362+
function write(path::AbstractString, tbl)
    # `touch` creates the file if it does not exist yet; the stream is closed by `open`
    return open(touch(path), "w") do io
        write(io, tbl)
    end
end
363+
364+
# Write any Tables.jl-compatible table `tbl` to `io` in DBF format: the header with
# one field descriptor per column, one record per row, then the 0x1a end-of-file
# marker. Returns the total number of bytes written.
function write(io::IO, tbl)
    dct = Tables.dictcolumntable(tbl)
    fields, records = get_field_descriptors(dct)
    fieldcolumns = Dict{Symbol,Int}(f.name => i for (i, f) in enumerate(fields))

    # Header size = 32-byte fixed part + 32 bytes per field descriptor + the 1-byte
    # 0x0D terminator that `Base.write(io, ::Header)` emits after the descriptors.
    # Leaving out the terminator makes the stored size one less than the bytes written.
    hsize = UInt16(32 + 32 * length(fields) + 1)
    # Record size = sum of the field lengths + 1 byte for the deletion marker
    rsize = UInt16(sum(x -> x.length, fields) + 1)

    version = 0x03
    last_update = today()
    incomplete = false
    encrypted = false
    mdx = false
    lang_id = 0x00

    h = Header(
        version,
        last_update,
        records,
        hsize,
        rsize,
        incomplete,
        encrypted,
        mdx,
        lang_id,
        fields,
        fieldcolumns,
    )
    out = Base.write(io, h)

    for row in Tables.rows(dct)
        out += write_record(io, fields, row)
    end
    out += Base.write(io, 0x1a)  # EOF marker
    return out
end
387+
388+
# Build one `FieldDescriptor` per column of the column table `dct`.
#
# Returns `(fields, nrecords)`, where `nrecords` is the length of the first column
# as a `UInt32`.
function get_field_descriptors(dct)
    fields = FieldDescriptor[]
    sch = Tables.schema(dct)
    for (name, type) in zip(sch.names, sch.types)
        T = Base.nonmissingtype(type)
        ndec = 0x00
        # Branch order matters: an all-`missing` column has `T == Union{}`, which is a
        # subtype of everything and therefore lands in the first (character) branch;
        # `Bool <: Integer`, so `Bool` must be tested before `Integer`.
        if T <: AbstractString
            # TODO: support memos. Strings longer than 254 characters are truncated.
            dbf_type = 'C'
            len = _character_length(dct[name])
        elseif type === Float64
            # Tests `type`, not `T`: only Float64 columns that cannot hold `missing`
            # are stored as 'O'; Union{Missing,Float64} falls through to 'F'.
            dbf_type = 'O'
            len = 0x08
            ndec = 0x01
        elseif T <: AbstractFloat
            dbf_type = 'F'
            len = UInt8(20)
            ndec = 0x01
        elseif T <: Bool
            dbf_type = 'L'
            len = 0x01
        elseif T <: Date
            dbf_type = 'D'
            len = 0x08
        elseif T <: Integer
            dbf_type = 'N'
            len = UInt8(maximum(x -> length(string(x)), dct[name]))
        else
            @warn "Field $name has no known matching DBF data type for $T. Data will be stored as the DBF character data type ('C')."
            dbf_type = 'C'
            len = _character_length(dct[name])
        end
        push!(fields, FieldDescriptor(name, type, dbf_type, len, ndec))
    end
    return fields, UInt32(length(first(dct)))
end

# Width of a character ('C') field for `column`: the longest `string` representation
# of its values, capped at 254 with a warning.
function _character_length(column)
    # Compare as Int first: converting to UInt8 before the check would throw an
    # InexactError for lengths above 255 and make the cap unreachable.
    maxlen = maximum(x -> length(string(x)), column)
    if maxlen > 254
        @warn "Strings will be truncated to 254 characters."
        maxlen = 254
    end
    return UInt8(maxlen)
end

# Write one record (`row`) to `io`: the deletion marker followed by every field value
# formatted according to its descriptor. Returns the number of bytes written.
function write_record(io::IO, fd::Vector{FieldDescriptor}, row)
    out = 0
    out += Base.write(io, ' ')  # deletion marker ' '=valid, '*'=deleted
    for (field, val) in zip(fd, row)
        out += Base.write(io, _val(field, val))
    end
    return out
end

# Convert `val` to the value that is written for `field`: a `String` for every DBF
# type except 'O', for which the `Float64` itself is returned.
# Throws an error for an unknown DBF type code.
function _val(field::FieldDescriptor, val)::Union{String, Float64}
    char = field.dbf_type
    if char == 'L'
        ismissing(val) && return "?"
        return val ? "T" : "F"
    elseif ismissing(val)
        return ' ' ^ field.length
    elseif char == 'C'
        width = Int(field.length)
        # Non-ASCII characters become spaces, so every character is exactly one byte.
        # Truncate to the field width, then pad by byte count so the value always
        # occupies exactly `width` bytes of the record.
        text = first(replace(string(val), !isascii => ' '), width)
        return text * ' ' ^ (width - ncodeunits(text))
    elseif char == 'D'
        return Dates.format(val, "yyyymmdd")
    elseif char == 'O'
        return val  # <-- the Float64 return value
    elseif char == 'F'
        return rpad(val, 20)[1:20]
    elseif char == 'N'
        return rpad(val, field.length)
    else
        error("Unknown DBF datatype $char.")
    end
end
463+
311464
end # module

test/runtests.jl

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,46 @@ using DBFTables
22
using Test
33
using Tables
44
using DataFrames
5+
using Dates
56

67
test_dbf_path = joinpath(@__DIR__, "test.dbf")
78
dbf = DBFTables.Table(test_dbf_path)
89
df = DataFrame(dbf)
910
row, st = iterate(dbf)
1011

1112
@testset "DBFTables" begin
13+
@testset "Writing" begin
    # Cell-by-cell comparison of two tables in which `missing` only matches `missing`.
    function tables_equal(tbl1, tbl2)
        column_pairs = zip(Tables.columns(tbl1), Tables.columns(tbl2))
        return all(column_pairs) do (col1, col2)
            all(ismissing(a) ? ismissing(b) : a == b for (a, b) in zip(col1, col2))
        end
    end

    # Write `table` to a .dbf file in the temp directory and read it back.
    function _roundtrip(table)
        file = joinpath(tempdir(), "test.dbf")
        DBFTables.write(file, table)
        return DBFTables.Table(file)
    end

    roundtrip(table) = tables_equal(DataFrame(table), DataFrame(_roundtrip(table)))

    @test roundtrip(df)
    @test roundtrip(dbf)
    @test roundtrip([(x=Float32(1), y=1), (x=Float32(2), y=2), (x=missing, y=3)])
    @test roundtrip([(x=true, y="test"), (x=missing, y=missing)])
    @test roundtrip([(x=today(), y=missing), (x=missing, y=today())])
    @test roundtrip([(; x=1.0), (; x=missing)])
    @test roundtrip([(; x=missing), (; x=missing)])

    # A column type without a DBF equivalent falls back to character data with a warning
    @test_warn "Data will be stored as the DBF character data type" DBFTables.write(tempname(), [(; x = rand(10))])

    # Base.write for DBFTables.Table
    file = joinpath(tempdir(), "test.dbf")
    write(file, dbf)
    dbf2 = DBFTables.Table(file)
    @test tables_equal(dbf, dbf2)
end
39+
1240
@testset "DataFrame indexing" begin
1341
@test size(df, 1) == 7 # records
1442
@test size(df, 2) == 6 # fields
1543
@test df[2, :CHAR] == "John"
16-
@test df[1, :DATE] == "19900102"
44+
@test df[1, :DATE] == Date("19900102", dateformat"yyyymmdd")
1745
@test df[3, :BOOL] == false
1846
@test df[1, :FLOAT] == 10.21
1947
@test df[2, :NUMERIC] == 12.21
@@ -30,7 +58,7 @@ row, st = iterate(dbf)
3058
@testset "header" begin
3159
h = DBFTables.Header(open(test_dbf_path))
3260
@test h.version == 3
33-
@test h.last_update == "20140806"
61+
@test h.last_update == Date("20140806", dateformat"yyyymmdd")
3462
@test h.records == 7
3563
@test length(h.fields) == 6
3664
end
@@ -39,18 +67,18 @@ row, st = iterate(dbf)
3967
@test sprint(show, row) === sprint(show, NamedTuple(row))
4068
# use replace to update to julia 1.4 union printing
4169
@test replace(sprint(show, dbf), r"\} +" => "}") ===
42-
"DBFTables.Table with 7 rows and 6 columns\nTables.Schema:\n :CHAR Union{Missing, String}\n :DATE Union{Missing, String}\n :BOOL Union{Missing, Bool}\n :FLOAT Union{Missing, Float64}\n :NUMERIC Union{Missing, Float64}\n :INTEGER Union{Missing, $Int}\n"
70+
"DBFTables.Table with 7 rows and 6 columns\nTables.Schema:\n :CHAR Union{Missing, String}\n :DATE Union{Missing, Date}\n :BOOL Union{Missing, Bool}\n :FLOAT Union{Missing, Float64}\n :NUMERIC Union{Missing, Float64}\n :INTEGER Union{Missing, $Int}\n"
4371
end
4472

4573
@testset "iterate" begin
4674
@test st === 2
4775
@test haskey(row, :CHAR)
4876
@test row.CHAR === "Bob"
49-
@test row[2] === "19900102"
77+
@test row[2] === Date("19900102", dateformat"yyyymmdd")
5078
@test_throws ArgumentError row.nonexistent_field
5179
firstrow = (
5280
CHAR = "Bob",
53-
DATE = "19900102",
81+
DATE = Date("19900102", dateformat"yyyymmdd"),
5482
BOOL = false,
5583
FLOAT = 10.21,
5684
NUMERIC = 11.21,

0 commit comments

Comments
 (0)