(Mostly) Bug Fixes for writing Strings (#35)

joshday · web-flow · commit e5d4c37b1bac · 2024-06-18T14:39:20.000-04:00
- Fix String write for when `length(str) != ncodeunits(str)`
- `write(path::String, table)` now returns `path` instead of number of bytes written.
- Support for writing `Char`
- Attempting to write a String &gt;254 bytes is now an error (previously a warning)
- Change tests to depend on tables created within the tests as opposed to a .dbf file.
- Added quirks/gotchas to README
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "DBFTables"
 uuid = "75c7ada1-017a-5fb6-b8c7-2125ff2d6c93"
-version = "1.2.5"
+version = "1.2.6"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
diff --git a/README.md b/README.md
@@ -49,3 +49,17 @@ The DBF header contains information on the amount of rows, which columns are pre
 The `DBFTables.Table` struct holds both the header and data. All data is read into memory in one go as a `Vector{UInt8}`. To provide efficient access into the individual entries, we use [WeakRefStrings](https://github.com/JuliaData/WeakRefStrings.jl/). WeakRefStrings' `StringArray` only holds the offsets and lengths into the `Vector{UInt8}` with all the data. Then we still need to convert from the string to the julia type. This is done on demand with `dbf_value`.
 
 Note that the format also contains a "record deleted" flag, which is represented by a `'*'` at the start of the row. When this is encountered the record should be treated as if it doesn't exist. Since normally writers strip these records when writing, they are rarely encountered. For that reason this package ignores these flags by default right now. To check for the flags yourself, there is the `isdeleted` function. A sample file with deleted record flags is available [here](https://issues.qgis.org/issues/11007#note-30).
+
+
+## Quirks and Gotchas
+
+The DBF format is quite old (introduced in 1983).  As such, it has some quirks that may not be immediately obvious:
+
+1. An empty string is equivalent to a missing value.  Thus an empty string in a table will not survive a `write`/`read` round trip.
+2. Strings are limited to 254 characters.  Attempting to write longer Strings results in an error.
+3. In order to support as many versions of DBF as possible, DBFTables.jl will only write data as one of the following DBF data types:
+  - `'C'` (Character): `String`s (and anything else that doesn't doesn't match one of the other three types).
+  - `'N'` (Numeric): `Integer`s and `AbstractFloat`s.
+  - `'L'` (Logical): `Bool`s.
+  - `'D'` (Date): `Date`s.
+4. The `'N` (Numeric) data type restricts values to fit within 20 printed characters.  All `Int64`s fit within 20 characters, but `Float64`s may not.  E.g. `string(nextfloat(-Inf))` is 23 characters.  DBFTables.jl will remove the least significant digits (loss of precision) in order to fit within the 20 character limit.
diff --git a/src/DBFTables.jl b/src/DBFTables.jl
@@ -24,13 +24,9 @@ function FieldDescriptor(name::Symbol, data::AbstractVector)
     elseif T === Union{}  # data is only missings
         len = 0x01
     elseif char === 'C'
-        width = T <: AbstractString ? maximum(length, itr) : maximum(x -> length(string(x)), itr)
-        if width > 254
-            @warn "Due to DBF limitations, strings in field $name will be truncated to 254 characters."
-            len = UInt8(254)
-        else
-            len = UInt8(width)
-        end
+        width = maximum(x -> ncodeunits(string(x)), itr)
+        width > 254 && error("String data must be <254 characters due to DBF limitations.  Found: $width.")
+        len = UInt8(width)
     elseif char === 'N'
         len = UInt8(20)
         ndec = T <: AbstractFloat ? 0x1 : 0x0
@@ -73,7 +69,7 @@ end
 #-----------------------------------------------------------------------# conversions: Julia-to-DBF
 # These are the only types DBFTables.jl will use to save data as.
 "Get the DBF type code from the Julia type.  Assumes `Base.nonmissingtype(T)` is the input."
-dbf_type(::Type{<:AbstractString}) = 'C'
+dbf_type(::Type{<:Union{Char, AbstractString}}) = 'C'
 dbf_type(::Type{Bool}) = 'L'
 dbf_type(::Type{<:Integer}) = 'N'
 dbf_type(::Type{<:AbstractFloat}) = 'N'
@@ -88,8 +84,9 @@ dbf_value(field::FieldDescriptor, val) = dbf_value(Val(field.dbf_type), field.le
 
 # String (or any other type that gets mapped to 'C')
 function dbf_value(::Val{'C'}, len::UInt8, x)
-    out = replace(rpad(x, len), !isascii => x -> '_' ^ textwidth(x))
-    length(out) > 254 ? out[1:254] : out
+    s = string(x)
+    out = s * ' ' ^ (len - ncodeunits(s))
+    ncodeunits(out) > 254 ? error("The DBF format cannot save strings >254 characters.") : out
 end
 dbf_value(::Val{'C'}, len::UInt8, ::Missing) = ' ' ^ len
 
@@ -111,7 +108,7 @@ function dbf_value(::Val{'N'}, ::UInt8, x::AbstractFloat)
     s2 = @sprintf "%.20e" x
     i = findfirst('e', s2)
     s_end = replace(s2[i:end], '+' => "")
-    len  = length(s_end)
+    len = length(s_end)
     n = 20 - len
     out = s2[1:n] * s_end
     @warn "A DBF limitation has reduced the precision of $x by $(length(s) - 20) digits."
@@ -145,8 +142,8 @@ end
 julia_value(o::FieldDescriptor, s::AbstractString) = julia_value(o.type, Val(o.dbf_type), s::AbstractString)
 
 function julia_value_string(s::AbstractString)
-    s2 = strip(s)
-    isempty(s2) ? missing : String(s2)
+    out = strip(x -> isspace(x) || x == '\0', s)
+    isempty(out) ? missing : out
 end
 
 julia_value(::Type{String}, ::Val{'C'}, s::AbstractString) = julia_value_string(s)
@@ -192,19 +189,14 @@ end
 
 "Read a field descriptor from the stream, and create a FieldDescriptor struct"
 function read_dbf_field(io::IO)
-    n_bytes_field_name = 11 # field name can be up to 11 bytes long, delimited by '\0' (end of string, EOS)
-    field_name_bytes = read(io, n_bytes_field_name)
-    pos_eos = findfirst(iszero, field_name_bytes)
-    n = pos_eos === nothing ? n_bytes_field_name : pos_eos - 1
-    field_name = Symbol(field_name_bytes[1:n])
-
-    field_type = read(io, Char)
+    name = Symbol(filter!(!iszero, read(io, 11)))  # 11 bytes padded by '\0'
+    dbf_type = read(io, Char)
     skip(io, 4)  # skip
-    field_len = read(io, UInt8)
-    field_dec = read(io, UInt8)
+    length = read(io, UInt8)
+    ndec = read(io, UInt8)
     skip(io, 14)  # reserved
-    jltype = julia_type(Val(field_type), field_dec)
-    return FieldDescriptor(field_name, jltype, field_type, field_len, field_dec)
+    type = julia_type(Val(dbf_type), ndec)
+    return FieldDescriptor(name, type, dbf_type, length, ndec)
 end
 
 reserved(n) = fill(0x00, n)
@@ -247,15 +239,7 @@ function Header(io::IO)
         fieldcolumns[field.name] = col
         push!(fields, field)
         col += 1
-
-        # peek if we are at the end
-        mark(io)
-        trm = read(io, UInt8)
-        if trm == 0xD
-            break
-        else
-            reset(io)
-        end
+        peek(io) == 0x0d && break
     end
 
     return Header(
@@ -332,7 +316,7 @@ function Table(io::IO)
     # Make sure data is read at the right position
     bytes_to_skip = header.hsize - position(io)
     bytes_to_skip > 0 && skip(io, bytes_to_skip)
-    
+
     data = Vector{UInt8}(undef, header.rsize * header.records)
     read!(io, data)
     strings = _create_stringarray(header, data)
@@ -463,7 +447,7 @@ Base.write(path::AbstractString, dbf::Table) = open(io -> Base.write(io, dbf), t
 
 
 "Generic .dbf writer for the Tables.jl interface."
-write(path::AbstractString, tbl) = open(io -> write(io, tbl), touch(path), "w")
+write(path::AbstractString, tbl) = (open(io -> write(io, tbl), touch(path), "w"); path)
 
 function write(io::IO, tbl)
     dct = Tables.dictcolumntable(tbl)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,118 +4,99 @@ using Tables
 using DataFrames
 using Dates
 
-test_dbf_path = joinpath(@__DIR__, "test.dbf")
-dbf = DBFTables.Table(test_dbf_path)
-df = DataFrame(dbf)
-row, st = iterate(dbf)
+#-----------------------------------------------------------------------------# setup
+df = DataFrame(
+    a = [1,2,3],
+    b = ["one", "two", "three"],
+    c = [true, true, false],
+    d = [today() + Day(i) for i in 1:3],
+    e = 1.0:3.0,
+    f = ["😄", "∱", "∫eˣ123"]
+)
 
+# Same as above but with missings
+df2 = vcat(df, DataFrame([missing missing missing missing missing missing], names(df)))
+
+# `df2` as a DBFTables.Table
+dbf = DBFTables.Table(DBFTables.write(tempname(), df2))
+
+# `dbf` after write/read roundtrip
+dbf2 = DBFTables.Table(DBFTables.write(tempname(), dbf))
+
+# Check that data survives a write/read roundtrip
+function roundtrip(t)
+    path = DBFTables.write(tempname(), t)
+    t2 = DBFTables.Table(path)
+    isequal(DataFrame(t), DataFrame(t2))
+end
+
+#-----------------------------------------------------------------------------# tests
 @testset "DBFTables" begin
-    @testset "Writing" begin
-        tables_equal(tbl1, tbl2) = all(zip(Tables.columns(tbl1), Tables.columns(tbl2))) do (t1, t2)
-            all(ismissing(a) ? ismissing(b) : a == b for (a,b) in zip(t1,t2))
-        end
-        function _roundtrip(table)
-            file = joinpath(tempdir(), "test.dbf")
-            DBFTables.write(file, table)
-            table2 = DBFTables.Table(file)
-        end
-        roundtrip(table) = tables_equal(DataFrame(table), DataFrame(_roundtrip(table)))
+    @testset "DBFTables.Table roundtrip" begin
+        @test Tables.schema(df2) == Tables.schema(dbf)
+        @test Tables.schema(dbf) == Tables.schema(dbf2)
+        @test isequal(NamedTuple.(dbf), NamedTuple.(dbf2))
+
+        @test DBFTables.isdeleted(dbf2) isa BitVector
+        @test all(.!DBFTables.isdeleted(dbf2))
+        @test !DBFTables.isdeleted(dbf2, 3)
+
+        @test ismissing(dbf2.a[end])
+    end
+
+    @testset "Tables.jl roundtrips" begin
         @test roundtrip(df)
+        @test roundtrip(df2)
         @test roundtrip(dbf)
         @test roundtrip([(x=Float32(1), y=1), (x=Float32(2), y=2), (x=missing, y=3)])
         @test roundtrip([(x=true, y="test"), (x=missing, y=missing)])
         @test roundtrip([(x=today(), y=missing), (x=missing, y=today())])
-        @test roundtrip([(; x=1.0), (;x=missing)])
+        @test roundtrip([(; x=1.0), (; x=missing)])
         @test roundtrip([(; x=missing), (; x=missing)])
 
-        @test_warn "No DBF type" DBFTables.write(tempname(), [(; x = rand(1))])
-        @test_warn "truncated to 254 characters" DBFTables.write(tempname(), [(; x = rand(999))])
-
-        # Base.write for DBFTables.Table
-        file = joinpath(tempdir(), "test.dbf")
-        write(file, dbf)
-        dbf2 = DBFTables.Table(file)
-        @test tables_equal(dbf, dbf2)
+        @test_warn "No DBF type associated with Julia type Vector{Float64}" DBFTables.write(tempname(), [(; x = rand(5))])
+        @test_throws Exception DBFTables.write(tempname(), [(; x = rand(999))])
     end
 
-    @testset "DataFrame indexing" begin
-        @test size(df, 1) == 7 # records
-        @test size(df, 2) == 6 # fields
-        @test df[2, :CHAR] == "John"
-        @test df[1, :DATE] == Date("19900102", dateformat"yyyymmdd")
-        @test df[3, :BOOL] == false
-        @test df[1, :FLOAT] == 10.21
-        @test df[2, :NUMERIC] == 12.21
-        @test df[3, :INTEGER] == 102
-    end
-
-    @testset "missing entries" begin
-        @test ismissing(df[4, :BOOL])
-        @test ismissing(df[5, :FLOAT])
-        @test ismissing(df[6, :NUMERIC])
-        @test ismissing(df[7, :INTEGER])
-    end
-
-    @testset "header" begin
-        h = DBFTables.Header(open(test_dbf_path))
-        @test h.version == 3
-        @test h.last_update == Date("20140806", dateformat"yyyymmdd")
-        @test h.records == 7
-        @test length(h.fields) == 6
+    @testset "Header" begin
+        # Verify that Header survives write/read roundtrip
+        h, h2 = getfield(dbf, :header), getfield(dbf2, :header)
+        for name in fieldnames(DBFTables.Header)
+            @test getfield(h, name) == getfield(h2, name)
+        end
     end
 
     @testset "show" begin
-        @test sprint(show, row) === sprint(show, NamedTuple(row))
-        # use replace to update to julia 1.4 union printing
-        @test replace(sprint(show, dbf), r"\} +" => "}") ===
-              "DBFTables.Table with 7 rows and 6 columns\nTables.Schema:\n :CHAR     Union{Missing, String}\n :DATE     Union{Missing, Date}\n :BOOL     Union{Missing, Bool}\n :FLOAT    Union{Missing, Float64}\n :NUMERIC  Union{Missing, Float64}\n :INTEGER  Union{Missing, $Int}\n"
+        str = """
+        DBFTables.Table with 4 rows and 6 columns
+        Tables.Schema:
+         :a  Union{Missing, $Int}
+         :b  Union{Missing, String}
+         :c  Union{Missing, Bool}
+         :d  Union{Missing, Date}
+         :e  Union{Missing, Float64}
+         :f  Union{Missing, String}
+        """
+        @test sprint(show, dbf) == str
     end
 
-    @testset "iterate" begin
-        @test st === 2
-        @test haskey(row, :CHAR)
-        @test row.CHAR === "Bob"
-        @test row[2] === Date("19900102", dateformat"yyyymmdd")
-        @test_throws ArgumentError row.nonexistent_field
-        firstrow = (
-            CHAR = "Bob",
-            DATE = Date("19900102", dateformat"yyyymmdd"),
-            BOOL = false,
-            FLOAT = 10.21,
-            NUMERIC = 11.21,
-            INTEGER = 100,
-        )
-        @test NamedTuple(row) === firstrow
-        @test row isa DBFTables.Row
-        @test row isa Tables.AbstractRow
-        @test length(row) === 6
-        @test size(row) === (6,)
-        @test size(row, 1) === 6
-        @test_throws BoundsError size(row, 2)
-        @test DBFTables.getrow(row) === 1
-        @test DBFTables.gettable(row) === dbf
-        @test sum(1 for row in dbf) === 7
-        @test sum(1 for cell in row) === 6
-        @test propertynames(dbf) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER]
-        @test propertynames(row) == [:CHAR, :DATE, :BOOL, :FLOAT, :NUMERIC, :INTEGER]
-    end
+    @testset "iterate and other Base methods" begin
+        @test size(dbf) == size(df2)
+        @test size(dbf, 1) == size(df2, 1)
+        @test size(dbf, 2) == size(df2, 2)
+        for row in dbf
+            @test_throws ArgumentError row.nonexistent_field
+            @test length(row) == length('a':'f')
+            @test size(row) == (length(row), )
+            @test size(row, 1) == length(row)
+            @test propertynames(row) == Symbol.('a':'f')
+            for prop in propertynames(row)
+                @test getproperty(row, prop) isa Any # dummy test to ensure no error is thrown
+            end
+        end
 
-    @testset "column" begin
-        @test size(dbf) === (7, 6)
-        @test size(dbf, 2) === 6
-
-        @test length(dbf.CHAR) === 7
-        @test dbf.CHAR isa Vector{Union{String,Missing}}
-        @test dbf.INTEGER isa Vector{Union{Int,Missing}}
-        @test_throws ArgumentError row.nonexistent_field
-        @test dbf.INTEGER[2] === 101
-        @test ismissing(dbf.INTEGER[7])
-        @test dbf.CHAR[2] === "John"
-        @test ismissing(dbf.CHAR[7])
-
-        @test DBFTables.isdeleted(dbf) isa BitVector
-        @test all(.!DBFTables.isdeleted(dbf))
-        @test !DBFTables.isdeleted(dbf, 3)
+        @test sum(1 for row in dbf) === 4
+        @test sum(1 for cell in first(dbf)) === 6
     end
 
     @testset "Numeric 20-character Limit Nonsense" begin
@@ -127,7 +108,7 @@ row, st = iterate(dbf)
         @test DBFTables.dbf_value(Val('N'), 0x01, negbig) == string(negbig)
         @test_throws Exception DBFTables.dbf_value(Val('N'), 0x01, negbig - 1)
 
-        @test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf))
-        @test_warn r"DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf))
+        @test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, prevfloat(Inf))
+        @test_warn "DBF limitation" DBFTables.dbf_value(Val('N'), 0x01, nextfloat(-Inf))
     end
-end  # testset "DBFTables"
+end
diff --git a/test/test.dbf b/test/test.dbf