remove numpy format when with_format("julia") (#15)

CarloLucibello · web-flow · commit 50d4ee01af57 · 2023-02-12T17:34:28.000+01:00
* work in prog

* readme

* fix docstring

* no ci

* cleanup
diff --git a/CondaPkg.toml b/CondaPkg.toml
@@ -1,6 +1,7 @@
 channels = ["conda-forge"]
 
 [deps]
+h5py = ""
 pillow = ">=9.1, <10"
 numpy = ">=1.20, <2"
 datasets = ">=2.7, <3"
diff --git a/Project.toml b/Project.toml
@@ -1,19 +1,21 @@
 name = "HuggingFaceDatasets"
 uuid = "d94b9a45-fdf5-4270-b024-5cbb9ef7117d"
 authors = ["Carlo Lucibello"]
-version = "0.2.1"
+version = "0.3.0"
 
 [deps]
 CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
 DLPack = "53c2dc0f-f7d5-43fd-8906-6c0220547083"
+ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 
 [compat]
 CondaPkg = "0.2"
 DLPack = "0.1"
-MLUtils = "0.2, 0.3, 0.4"
-PythonCall = "0.8, 0.9"
+ImageCore = "0.9"
+MLUtils = "0.4.1"
+PythonCall = "0.9"
 julia = "1.7"
 
 [extras]
diff --git a/README.md b/README.md
@@ -25,25 +25,30 @@ Check out the `examples/` folder for usage examples.
 
 ```julia
 julia> train_data = load_dataset("mnist", split = "train")
-Dataset(<py Dataset({
+Dataset({
     features: ['image', 'label'],
     num_rows: 60000
-})>, identity)
+})
 
 # Indexing starts with 1. 
-# By defaul, python types are returned.
+# Python types are returned by default.
 julia> train_data[1]
 Python dict: {'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x2B64E2E90>, 'label': 5}
 
-julia> set_format!(train_data, "julia")
-Dataset(<py Dataset({
-    features: ['image', 'label'],
-    num_rows: 60000
-})>, HuggingFaceDatasets.py2jl)
+julia> length(train_data)
+60000
 
-# Now we have julia types
+# Now we set the julia format
+julia> train_data = load_dataset("mnist", split = "train").with_format("julia");
+
+# Returned observations are julia objects
 julia> train_data[1]
 Dict{String, Any} with 2 entries:
   "label" => 5
-  "image" => UInt8[0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00; … ; 0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00]
+  "image" => ColorTypes.Gray{FixedPointNumbers.N0f8}[Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gray{N0f8}(0.0); Gray{N…
+
+julia> train_data[1:2]
+Dict{String, Vector} with 2 entries:
+  "label" => [5, 0]
+  "image" => Base.ReinterpretArray{Gray{N0f8}, 2, UInt8, Matrix{UInt8}, false}[[Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gra…
 ```
diff --git a/examples/flux_mnist.jl b/examples/flux_mnist.jl
@@ -3,15 +3,23 @@ using Random, Statistics
 using Flux.Losses: logitcrossentropy
 using Flux: onecold
 using HuggingFaceDatasets
+using MLUtils
+using ImageCore
 # using ProfileView, BenchmarkTools
 
-function mnist_transform(x)
-    x = py2jl(x)
-    image = x["image"] ./ 255f0
-    label = Flux.onehotbatch(x["label"], 0:9)
+function mnist_transform(batch)
+    image = ImageCore.channelview.(batch["image"]) # from Matrix{Gray{N0f8}} to Matrix{UInt8}
+    image = Flux.batch(image) ./ 255f0
+    label = Flux.onehotbatch(batch["label"], 0:9)
     return (; image, label)
 end
 
+# Remove when https://github.com/JuliaML/MLUtils.jl/pull/147 is merged and tagged
+Base.getindex(data::MLUtils.MappedData, idx::Int) = getobs(data.f(getobs(data.data, [idx])), 1)
+Base.getindex(data::MLUtils.MappedData, idxs::AbstractVector) = data.f(getobs(data.data, idxs))
+Base.getindex(data::MLUtils.MappedData, ::Colon) = data[1:length(data.data)]
+
+
 function loss_and_accuracy(data_loader, model, device)
     acc = 0
     ls = 0.0f0
@@ -29,18 +37,16 @@ end
 function train(epochs)
     batchsize = 128
     nhidden = 100
-    device = gpu
-
-    dataset = load_dataset("mnist")
-    set_format!(dataset, "julia")
-    set_jltransform!(dataset, mnist_transform)
-
-    # We use [:] to materialize and transform the whole dataset.
-    # This gives much faster iterations.
-    # Omit the [:] if you don't want to load the whole dataset in-memory.
-    train_loader = Flux.DataLoader(dataset["train"][:]; batchsize, shuffle=true) 
-    test_loader = Flux.DataLoader(dataset["test"][:]; batchsize)
+    device = cpu
 
+    train_data = load_dataset("mnist", split="train").with_format("julia")
+    test_data = load_dataset("mnist", split="test").with_format("julia")
+    train_data = mapobs(mnist_transform, train_data)[:] # lazy apply transform then materialize
+    test_data = mapobs(mnist_transform, test_data)[:]
+    
+    train_loader = Flux.DataLoader(train_data; batchsize, shuffle=true) 
+    test_loader = Flux.DataLoader(test_data; batchsize)
+    
     model = Chain([Flux.flatten,
                    Dense(28*28, nhidden, relu),
                    Dense(nhidden, nhidden, relu),
@@ -57,7 +63,7 @@ function train(epochs)
     end
 
     report(0)
-	for epoch in 1:epochs
+	@time for epoch in 1:epochs
 		for (x, y) in train_loader
 			x, y = x |> device, y |> device
 			loss, grads = withgradient(model -> logitcrossentropy(model(x), y), model)
diff --git a/perf/perf.jl b/perf/perf.jl
@@ -1,45 +1,61 @@
 using HuggingFaceDatasets
 using BenchmarkTools
+using MLDatasets
 
 function f(ds)
-    for i in 1:6000
-        ds[i]
+    for i in 1:numobs(ds)
+        getobs(ds, i)
     end
 end
+function fbatch(ds)
+    for i in 1:128:numobs(ds)-128
+        getobs(ds, i:i+127)
+    end
+end
+function fall(ds)
+    getobs(ds, :)
+end
 
-ds_plain = load_dataset("mnist", split="train")
-@btime f(ds_plain)
-
-
-ds_julia = with_format(ds_plain, "julia")
-@btime f(ds_plain)
-
-
-ds_py2jl = with_jltransform(ds_plain, py2jl)
-
-set_transform!(ds2, py2jl)
-
-@btime f1(ds2)
-
-ds2[1]["image"]
-
-set_transform!(ds2, identity)
-@time ds2[1:10000]["image"];
-@time Flux.batch(ds2[1:10000]["image"])
-
-@time Flux.batch(ds2[1:10000]["image"] |> py2jl)
-
-ds[1]
-
-ds2["label"]
-
-ds2.set_format("numpy")
-ds2[1:10] |> py2jl
-
-ds2["image"]
-
-#####
-function set_jltransform!(ds, transform = identity)
-    ds.pyset_format("numpy")
-    ds.jltransform(transform)
+function bench()
+    mld = MNIST(split=:test)
+    ds_plain = load_dataset("mnist", split="test")
+    ds_julia = with_format(ds_plain, "julia")
+    ds_numpy = with_format(ds_plain, "numpy")
+    ds_jnumpy = with_jltransform(py2jl, ds_numpy) # numpy + py2jl
+
+    for (name, ds) in [("mldatasets", mld),
+                      ("plain", ds_plain),
+                      ("julia", ds_julia),
+                      ("numpy", ds_numpy),
+                      ("jnumpy", ds_jnumpy)]
+        println("# $name")
+        @btime f($ds)
+        @btime fbatch($ds)
+        @btime fall($ds)
+    end
 end
+
+# hf is slow at reading image datasets.
+# PyTorch vision is much faster (see the notebook in perf/)
+
+bench()
+# # MLDatasets
+# 19.515 ms (120005 allocations: 34.64 MiB)
+# 4.671 ms (1097 allocations: 29.97 MiB)
+# 717.324 ns (6 allocations: 240 bytes)
+# # plain
+# 602.001 ms (668464 allocations: 18.06 MiB)
+# 266.483 ms (390 allocations: 6.09 KiB)
+# 265.651 ms (5 allocations: 80 bytes)
+# # julia
+# 985.251 ms (2398464 allocations: 93.28 MiB)
+# 379.270 ms (659256 allocations: 27.31 MiB)
+# 378.751 ms (650134 allocations: 27.01 MiB)
+# # numpy
+# 1.264 s (728464 allocations: 19.13 MiB)
+# 311.426 ms (390 allocations: 6.09 KiB)
+# 318.403 ms (5 allocations: 80 bytes)
+# # jnumpy
+# 1.527 s (2208464 allocations: 110.91 MiB)
+# 318.356 ms (13962 allocations: 637.41 KiB)
+# 335.109 ms (179 allocations: 8.17 KiB)
diff --git a/src/HuggingFaceDatasets.jl b/src/HuggingFaceDatasets.jl
@@ -4,6 +4,7 @@ using PythonCall
 using MLUtils: getobs, numobs
 import MLUtils
 using DLPack
+using ImageCore
 
 const datasets = PythonCall.pynew()
 const PIL = PythonCall.pynew()
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -28,6 +28,8 @@ end
 function Base.getproperty(ds::Dataset, s::Symbol)
     if s in fieldnames(Dataset)
         return getfield(ds, s)
+    elseif s === :with_format
+        return format -> with_format(ds, format)
     else
         res = getproperty(getfield(ds, :pyds), s)
         if pycallable(res)
@@ -44,7 +46,7 @@ Base.getindex(ds::Dataset, ::Colon) = ds[1:length(ds)]
 
 function Base.getindex(ds::Dataset, i::AbstractVector{<:Integer})
     @assert all(>(0), i)
-    x = ds.pyds[i .- 1]
+    x = getfield(ds, :pyds)[i .- 1]
     return ds.jltransform(x)
 end
 
@@ -64,6 +66,8 @@ function Base.deepcopy(ds::Dataset)
     return Dataset(pyds, ds.jltransform)
 end
 
+Base.show(io::IO, ds::Dataset) = print(io, ds.pyds)
+
 """
     with_format(ds::Dataset, format)
 
@@ -103,7 +107,7 @@ version of [`with_format`](@ref).
 """
 function set_format!(ds::Dataset, format)
     if format == "julia"
-        ds.pyds.set_format("numpy")
+        # ds.pyds.set_format("numpy")
         ds.jltransform = py2jl
     else
         ds.pyds.set_format(format)
diff --git a/src/datasetdict.jl b/src/datasetdict.jl
@@ -23,6 +23,8 @@ end
 function Base.getproperty(d::DatasetDict, s::Symbol)
     if s in fieldnames(DatasetDict)
         return getfield(d, s)
+    elseif s === :with_format
+        return format -> with_format(d, format)
     else
         res = getproperty(getfield(d, :pyd), s)
         if pycallable(res)
@@ -44,6 +46,9 @@ function Base.deepcopy(d::DatasetDict)
     pyd = copy.deepcopy(d.pyd)
     return DatasetDict(pyd, d.jltransform)
 end
+
+Base.show(io::IO, ds::DatasetDict) = print(io, ds.pyd)
+
 """"
     with_jltransform(d::DatasetDict, transform)
     with_jltransform(transform, d::DatasetDict)
diff --git a/src/load_dataset.jl b/src/load_dataset.jl
@@ -1,17 +1,24 @@
 
 """
-    load_dataset(args...; transform=py2jl, kws...)
+    load_dataset(args...; kws...)
 
 Load a dataset from the [HuggingFace Datasets](https://huggingface.co/datasets) library.
 
 All arguments are passed to the python function `datasets.load_dataset`.
 See the documentation [here](https://huggingface.co/docs/datasets/package_reference/loading_methods.html#datasets.load_dataset).
 
+Returns a [`DatasetDict`](@ref) or a [`Dataset`](@ref) depending on the `split` argument.
+
+Use the `dataset.with_format("julia")` method to lazily convert the observations from the dataset
+to julia types.
+
 # Examples
 
+Without a `split` argument, a `DatasetDict` is returned:
+
 ```julia
 julia> d = load_dataset("glue", "sst2")
-DatasetDict(<py DatasetDict({
+DatasetDict({
     train: Dataset({
         features: ['sentence', 'label', 'idx'],
         num_rows: 67349
@@ -24,26 +31,29 @@ DatasetDict(<py DatasetDict({
         features: ['sentence', 'label', 'idx'],
         num_rows: 1821
     })
-})>, HuggingFaceDatasets.py2jl)
+})
 
 julia> d["train"]
-Dataset(<py Dataset({
+Dataset({
     features: ['sentence', 'label', 'idx'],
     num_rows: 67349
-})>, HuggingFaceDatasets.py2jl)
+})
+```
 
-mnist = load_dataset("mnist", split="train")
+Selecting a split returns a `Dataset` instead. We also
+apply the `"julia"` format.
 
-julia> mnist = load_dataset("mnist", split="train")
-Dataset(<py Dataset({
+```julia
+julia> mnist = load_dataset("mnist", split="train").with_format("julia")
+Dataset({
     features: ['image', 'label'],
     num_rows: 60000
-})>, HuggingFaceDatasets.py2jl)
+})
 
 julia> mnist[1]
 Dict{String, Any} with 2 entries:
   "label" => 5
-  "image" => UInt8[0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00; … ; 0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00]
+  "image" => Gray{N0f8}[Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gray{N0f8}(0.0); Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f…
 ```
 """
 function load_dataset(args...; kws...)
diff --git a/src/observation.jl b/src/observation.jl
@@ -5,7 +5,7 @@ function MLUtils.getobs(py::Py, i::Integer)
     elseif pyisinstance(py, pytype(pylist()))
         # TODO do this only for lists containing numbers
         return py[i-1] 
-    elseif pyisinstance(xpy, np.ndarray)
+    elseif pyisinstance(py, np.ndarray)
         return py[i-1]
     else
         return error("Py type $(pytype(py)) non supported yet")
diff --git a/src/transforms.jl b/src/transforms.jl
@@ -5,9 +5,15 @@ function _pyconvert(x::Py)
         return Dataset(x)
     elseif pyisinstance(x, datasets.DatasetDict)
         return DatasetDict(x)
-    elseif pyisinstance(x, PIL.PngImagePlugin.PngImageFile)
-        # TODO: attempt to convert to a Julia image type. 
-        return numpy2jl(np.array(x))
+    elseif pyisinstance(x, PIL.PngImagePlugin.PngImageFile) || pyisinstance(x, PIL.JpegImagePlugin.JpegImageFile)
+        a = numpy2jl(np.array(x))
+        if ndims(a) == 3 && size(a, 1) == 3
+            return colorview(RGB{N0f8}, a)
+        elseif ndims(a) == 2
+            return reinterpret(Gray{N0f8}, a)
+        else
+            error("Unknown image format")
+        end
     elseif pyisinstance(x, np.ndarray)
         return numpy2jl(x)
     else
diff --git a/test/dataset.jl b/test/dataset.jl
diff --git a/test/no_ci.jl b/test/no_ci.jl
diff --git a/test/runtests.jl b/test/runtests.jl
diff --git a/test/transforms.jl b/test/transforms.jl