Skip to content

Commit e5d4c37

Browse files
authored
(Mostly) Bug Fixes for writing Strings (#35)
- Fix String write for when `length(str) != ncodeunits(str)`
- `write(path::String, table)` now returns `path` instead of number of bytes written.
- Support for writing `Char`
- Attempting to write a String >254 bytes is now an error (previously a warning)
- Change tests to depend on tables created within the tests as opposed to a .dbf file.
- Added quirks/gotchas to README
1 parent f7768b5 commit e5d4c37

File tree

5 files changed

+113
-134
lines changed

5 files changed

+113
-134
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "DBFTables"
22
uuid = "75c7ada1-017a-5fb6-b8c7-2125ff2d6c93"
3-
version = "1.2.5"
3+
version = "1.2.6"
44

55
[deps]
66
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,17 @@ The DBF header contains information on the amount of rows, which columns are pre
4949
The `DBFTables.Table` struct holds both the header and data. All data is read into memory in one go as a `Vector{UInt8}`. To provide efficient access into the individual entries, we use [WeakRefStrings](https://github.com/JuliaData/WeakRefStrings.jl/). WeakRefStrings' `StringArray` only holds the offsets and lengths into the `Vector{UInt8}` with all the data. Then we still need to convert from the string to the julia type. This is done on demand with `dbf_value`.
5050

5151
Note that the format also contains a "record deleted" flag, which is represented by a `'*'` at the start of the row. When this is encountered the record should be treated as if it doesn't exist. Since normally writers strip these records when writing, they are rarely encountered. For that reason this package ignores these flags by default right now. To check for the flags yourself, there is the `isdeleted` function. A sample file with deleted record flags is available [here](https://issues.qgis.org/issues/11007#note-30).
52+
53+
54+
## Quirks and Gotchas
55+
56+
The DBF format is quite old (introduced in 1983). As such, it has some quirks that may not be immediately obvious:
57+
58+
1. An empty string is equivalent to a missing value. Thus an empty string in a table will not survive a `write`/`read` round trip.
59+
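2. Strings are limited to 254 bytes (code units), so a string containing non-ASCII characters may hit the limit with fewer than 254 characters. Attempting to write longer Strings results in an error.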
60+
3. In order to support as many versions of DBF as possible, DBFTables.jl will only write data as one of the following DBF data types:
61+
- `'C'` (Character): `String`s (and anything else that doesn't match one of the other three types).
62+
- `'N'` (Numeric): `Integer`s and `AbstractFloat`s.
63+
- `'L'` (Logical): `Bool`s.
64+
- `'D'` (Date): `Date`s.
65+
4. The `'N'` (Numeric) data type restricts values to fit within 20 printed characters. All `Int64`s fit within 20 characters, but `Float64`s may not. E.g. `string(nextfloat(-Inf))` is 23 characters. DBFTables.jl will remove the least significant digits (loss of precision) in order to fit within the 20 character limit.

src/DBFTables.jl

Lines changed: 19 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,9 @@ function FieldDescriptor(name::Symbol, data::AbstractVector)
2424
elseif T === Union{} # data is only missings
2525
len = 0x01
2626
elseif char === 'C'
27-
width = T <: AbstractString ? maximum(length, itr) : maximum(x -> length(string(x)), itr)
28-
if width > 254
29-
@warn "Due to DBF limitations, strings in field $name will be truncated to 254 characters."
30-
len = UInt8(254)
31-
else
32-
len = UInt8(width)
33-
end
27+
width = maximum(x -> ncodeunits(string(x)), itr)
28+
width > 254 && error("String data must be <254 characters due to DBF limitations. Found: $width.")
29+
len = UInt8(width)
3430
elseif char === 'N'
3531
len = UInt8(20)
3632
ndec = T <: AbstractFloat ? 0x1 : 0x0
@@ -73,7 +69,7 @@ end
7369
#-----------------------------------------------------------------------# conversions: Julia-to-DBF
7470
# These are the only types DBFTables.jl will use to save data as.
7571
"Get the DBF type code from the Julia type. Assumes `Base.nonmissingtype(T)` is the input."
76-
dbf_type(::Type{<:AbstractString}) = 'C'
72+
dbf_type(::Type{<:Union{Char, AbstractString}}) = 'C'
7773
dbf_type(::Type{Bool}) = 'L'
7874
dbf_type(::Type{<:Integer}) = 'N'
7975
dbf_type(::Type{<:AbstractFloat}) = 'N'
@@ -88,8 +84,9 @@ dbf_value(field::FieldDescriptor, val) = dbf_value(Val(field.dbf_type), field.le
8884

8985
# String (or any other type that gets mapped to 'C')
9086
function dbf_value(::Val{'C'}, len::UInt8, x)
91-
out = replace(rpad(x, len), !isascii => x -> '_' ^ textwidth(x))
92-
length(out) > 254 ? out[1:254] : out
87+
s = string(x)
88+
out = s * ' ' ^ (len - ncodeunits(s))
89+
ncodeunits(out) > 254 ? error("The DBF format cannot save strings >254 characters.") : out
9390
end
9491
dbf_value(::Val{'C'}, len::UInt8, ::Missing) = ' ' ^ len
9592

@@ -111,7 +108,7 @@ function dbf_value(::Val{'N'}, ::UInt8, x::AbstractFloat)
111108
s2 = @sprintf "%.20e" x
112109
i = findfirst('e', s2)
113110
s_end = replace(s2[i:end], '+' => "")
114-
len = length(s_end)
111+
len = length(s_end)
115112
n = 20 - len
116113
out = s2[1:n] * s_end
117114
@warn "A DBF limitation has reduced the precision of $x by $(length(s) - 20) digits."
@@ -145,8 +142,8 @@ end
145142
julia_value(o::FieldDescriptor, s::AbstractString) = julia_value(o.type, Val(o.dbf_type), s::AbstractString)
146143

147144
function julia_value_string(s::AbstractString)
148-
s2 = strip(s)
149-
isempty(s2) ? missing : String(s2)
145+
out = strip(x -> isspace(x) || x == '\0', s)
146+
isempty(out) ? missing : out
150147
end
151148

152149
julia_value(::Type{String}, ::Val{'C'}, s::AbstractString) = julia_value_string(s)
@@ -192,19 +189,14 @@ end
192189

193190
"Read a field descriptor from the stream, and create a FieldDescriptor struct"
194191
function read_dbf_field(io::IO)
195-
n_bytes_field_name = 11 # field name can be up to 11 bytes long, delimited by '\0' (end of string, EOS)
196-
field_name_bytes = read(io, n_bytes_field_name)
197-
pos_eos = findfirst(iszero, field_name_bytes)
198-
n = pos_eos === nothing ? n_bytes_field_name : pos_eos - 1
199-
field_name = Symbol(field_name_bytes[1:n])
200-
201-
field_type = read(io, Char)
192+
name = Symbol(filter!(!iszero, read(io, 11))) # 11 bytes padded by '\0'
193+
dbf_type = read(io, Char)
202194
skip(io, 4) # skip
203-
field_len = read(io, UInt8)
204-
field_dec = read(io, UInt8)
195+
length = read(io, UInt8)
196+
ndec = read(io, UInt8)
205197
skip(io, 14) # reserved
206-
jltype = julia_type(Val(field_type), field_dec)
207-
return FieldDescriptor(field_name, jltype, field_type, field_len, field_dec)
198+
type = julia_type(Val(dbf_type), ndec)
199+
return FieldDescriptor(name, type, dbf_type, length, ndec)
208200
end
209201

210202
reserved(n) = fill(0x00, n)
@@ -247,15 +239,7 @@ function Header(io::IO)
247239
fieldcolumns[field.name] = col
248240
push!(fields, field)
249241
col += 1
250-
251-
# peek if we are at the end
252-
mark(io)
253-
trm = read(io, UInt8)
254-
if trm == 0xD
255-
break
256-
else
257-
reset(io)
258-
end
242+
peek(io) == 0x0d && break
259243
end
260244

261245
return Header(
@@ -332,7 +316,7 @@ function Table(io::IO)
332316
# Make sure data is read at the right position
333317
bytes_to_skip = header.hsize - position(io)
334318
bytes_to_skip > 0 && skip(io, bytes_to_skip)
335-
319+
336320
data = Vector{UInt8}(undef, header.rsize * header.records)
337321
read!(io, data)
338322
strings = _create_stringarray(header, data)
@@ -463,7 +447,7 @@ Base.write(path::AbstractString, dbf::Table) = open(io -> Base.write(io, dbf), t
463447

464448

465449
"Generic .dbf writer for the Tables.jl interface."
466-
write(path::AbstractString, tbl) = open(io -> write(io, tbl), touch(path), "w")
450+
write(path::AbstractString, tbl) = (open(io -> write(io, tbl), touch(path), "w"); path)
467451

468452
function write(io::IO, tbl)
469453
dct = Tables.dictcolumntable(tbl)

test/runtests.jl

Lines changed: 79 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -4,118 +4,99 @@ using Tables
44
using DataFrames
55
using Dates
66

7-
test_dbf_path = joinpath(@__DIR__, "test.dbf")
8-
dbf = DBFTables.Table(test_dbf_path)
9-
df = DataFrame(dbf)
10-
row, st = iterate(dbf)
7+
#-----------------------------------------------------------------------------# setup
8+
df = DataFrame(
9+
a = [1,2,3],
10+
b = ["one", "two", "three"],
11+
c = [true, true, false],
12+
d = [today() + Day(i) for i in 1:3],
13+
e = 1.0:3.0,
14+
f = ["😄", "", "∫eˣ123"]
15+
)
1116

17+
# Same as above but with missings
18+
df2 = vcat(df, DataFrame([missing missing missing missing missing missing], names(df)))
19+
20+
# `df2` as a DBFTables.Table
21+
dbf = DBFTables.Table(DBFTables.write(tempname(), df2))
22+
23+
# `dbf` after write/read roundtrip
24+
dbf2 = DBFTables.Table(DBFTables.write(tempname(), dbf))
25+
26+
# Check that data survives a write/read roundtrip
27+
function roundtrip(t)
28+
path = DBFTables.write(tempname(), t)
29+
t2 = DBFTables.Table(path)
30+
isequal(DataFrame(t), DataFrame(t2))
31+
end
32+
33+
#-----------------------------------------------------------------------------# tests
1234
@testset "DBFTables" begin
13-
@testset "Writing" begin
14-
tables_equal(tbl1, tbl2) = all(zip(Tables.columns(tbl1), Tables.columns(tbl2))) do (t1, t2)
15-
all(ismissing(a) ? ismissing(b) : a == b for (a,b) in zip(t1,t2))
16-
end
17-
function _roundtrip(table)
18-
file = joinpath(tempdir(), "test.dbf")
19-
DBFTables.write(file, table)
20-
table2 = DBFTables.Table(file)
21-
end
22-
roundtrip(table) = tables_equal(DataFrame(table), DataFrame(_roundtrip(table)))
35+
@testset "DBFTables.Table roundtrip" begin
36+
@test Tables.schema(df2) == Tables.schema(dbf)
37+
@test Tables.schema(dbf) == Tables.schema(dbf2)
38+
@test isequal(NamedTuple.(dbf), NamedTuple.(dbf2))
39+
40+
@test DBFTables.isdeleted(dbf2) isa BitVector
41+
@test all(.!DBFTables.isdeleted(dbf2))
42+
@test !DBFTables.isdeleted(dbf2, 3)
43+
44+
@test ismissing(dbf2.a[end])
45+
end
46+
47+
@testset "Tables.jl roundtrips" begin
2348
@test roundtrip(df)
49+
@test roundtrip(df2)
2450
@test roundtrip(dbf)
2551
@test roundtrip([(x=Float32(1), y=1), (x=Float32(2), y=2), (x=missing, y=3)])
2652
@test roundtrip([(x=true, y="test"), (x=missing, y=missing)])
2753
@test roundtrip([(x=today(), y=missing), (x=missing, y=today())])
28-
@test roundtrip([(; x=1.0), (;x=missing)])
54+
@test roundtrip([(; x=1.0), (; x=missing)])
2955
@test roundtrip([(; x=missing), (; x=missing)])
3056

31-
@test_warn "No DBF type" DBFTables.write(tempname(), [(; x = rand(1))])
32-
@test_warn "truncated to 254 characters" DBFTables.write(tempname(), [(; x = rand(999))])
33-
34-
# Base.write for DBFTables.Table
35-
file = joinpath(tempdir(), "test.dbf")
36-
write(file, dbf)
37-
dbf2 = DBFTables.Table(file)
38-
@test tables_equal(dbf, dbf2)
57+
@test_warn "No DBF type associated with Julia type Vector{Float64}" DBFTables.write(tempname(), [(; x = rand(5))])
58+
@test_throws Exception DBFTables.write(tempname(), [(; x = rand(999))])
3959
end
4060

41-
@testset "DataFrame indexing" begin
42-
@test size(df, 1) == 7 # records
43-
@test size(df, 2) == 6 # fields
44-
@test df[2, :CHAR] == "John"
45-
@test df[1, :DATE] == Date("19900102", dateformat"yyyymmdd")
46-
@test df[3, :BOOL] == false
47-
@test df[1, :FLOAT] == 10.21
48-
@test df[2, :NUMERIC] == 12.21
49-
@test df[3, :INTEGER] == 102
50-
end
51-
52-
@testset "missing entries" begin
53-
@test ismissing(df[4, :BOOL])
54-
@test ismissing(df[5, :FLOAT])
55-
@test ismissing(df[6, :NUMERIC])
56-
@test ismissing(df[7, :INTEGER])
57-
end
58-
59-
@testset "header" begin
60-
h = DBFTables.Header(open(test_dbf_path))
61-
@test h.version == 3
62-
@test h.last_update == Date("20140806", dateformat"yyyymmdd")
63-
@test h.records == 7
64-
@test length(h.fields) == 6
61+
@testset "Header" begin
62+
# Verify that Header survives write/read roundtrip
63+
h, h2 = getfield(dbf, :header), getfield(dbf2, :header)
64+
for name in fieldnames(DBFTables.Header)
65+
@test getfield(h, name) == getfield(h2, name)
66+
end
6567
end
6668

6769
@testset "show" begin
68-
@test sprint(show, row) === sprint(show, NamedTuple(row))
69-
# use replace to update to julia 1.4 union printing
70-
@test replace(sprint(show, dbf), r"\} +" => "}") ===
71-
"DBFTables.Table with 7 rows and 6 columns\nTables.Schema:\n :CHAR Union{Missing, String}\n :DATE Union{Missing, Date}\n :BOOL Union{Missing, Bool}\n :FLOAT Union{Missing, Float64}\n :NUMERIC Union{Missing, Float64}\n :INTEGER Union{Missing, $Int}\n"
70+
str = """
71+
DBFTables.Table with 4 rows and 6 columns
72+
Tables.Schema:
73+
:a Union{Missing, $Int}
74+
:b Union{Missing, String}
75+
:c Union{Missing, Bool}
76+
:d Union{Missing, Date}
77+
:e Union{Missing, Float64}
78+
:f Union{Missing, String}
79+
"""
80+
@test sprint(show, dbf) == str
7281
end
7382

74-
@testset "iterate" begin
75-
@test st === 2
76-
@test haskey(row, :CHAR)
77-
@test row.CHAR === "Bob"
78-
@test row[2] === Date("19900102", dateformat"yyyymmdd")
79-
@test_throws ArgumentError row.nonexistent_field
80-
firstrow = (
81-
CHAR = "Bob",
82-
DATE = Date("19900102", dateformat"yyyymmdd"),
83-
BOOL = false,
84-
FLOAT = 10.21,
85-
NUMERIC = 11.21,
86-
INTEGER = 100,
87-
)
88-
@test NamedTuple(row) === firstrow
89-
@test row isa DBFTables.Row
90-
@test row isa Tables.AbstractRow
91-
@test length(row) === 6
92-
@test size(row) === (6,)
93-
@test size(row, 1) === 6
94-
@test_throws BoundsError size(row, 2)
95-
@test DBFTables.getrow(row) === 1
96-
@test DBFTables.gettable(row) === dbf
97-
@test sum(1 for row in dbf) === 7
98-
@test sum(1 for cell in row) === 6
99-
@test propertynames(dbf) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER]
100-
@test propertynames(row) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER]
101-
end
83+
@testset "iterate and other Base methods" begin
84+
@test size(dbf) == size(df2)
85+
@test size(dbf, 1) == size(df2, 1)
86+
@test size(dbf, 2) == size(df2, 2)
87+
for row in dbf
88+
@test_throws ArgumentError row.nonexistent_field
89+
@test length(row) == length('a':'f')
90+
@test size(row) == (length(row), )
91+
@test size(row, 1) == length(row)
92+
@test propertynames(row) == Symbol.('a':'f')
93+
for prop in propertynames(row)
94+
@test getproperty(row, prop) isa Any # dummy test to ensure no error is thrown
95+
end
96+
end
10297

103-
@testset "column" begin
104-
@test size(dbf) === (7, 6)
105-
@test size(dbf, 2) === 6
106-
107-
@test length(dbf.CHAR) === 7
108-
@test dbf.CHAR isa Vector{Union{String,Missing}}
109-
@test dbf.INTEGER isa Vector{Union{Int,Missing}}
110-
@test_throws ArgumentError row.nonexistent_field
111-
@test dbf.INTEGER[2] === 101
112-
@test ismissing(dbf.INTEGER[7])
113-
@test dbf.CHAR[2] === "John"
114-
@test ismissing(dbf.CHAR[7])
115-
116-
@test DBFTables.isdeleted(dbf) isa BitVector
117-
@test all(.!DBFTables.isdeleted(dbf))
118-
@test !DBFTables.isdeleted(dbf, 3)
98+
@test sum(1 for row in dbf) === 4
99+
@test sum(1 for cell in first(dbf)) === 6
119100
end
120101

121102
@testset "Numeric 20-character Limit Nonsense" begin
@@ -127,7 +108,7 @@ row, st = iterate(dbf)
127108
@test DBFTables.dbf_value(Val('N'), 0x01, negbig) == string(negbig)
128109
@test_throws Exception DBFTables.dbf_value(Val('N'), 0x01, negbig - 1)
129110

130-
@test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf))
131-
@test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf))
111+
@test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf))
112+
@test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf))
132113
end
133-
end # testset "DBFTables"
114+
end

test/test.dbf

-611 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)