JuliaGenAI
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 2 deletions b/‎.gitignore‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎CondaPkg.toml‎
Lines changed: 2 additions & 2 deletions b/‎CondaPkg.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Project.toml‎
Lines changed: 5 additions & 10 deletions b/‎Project.toml‎
Lines changed: 5 additions & 10 deletions
diff --git a/‎README.md‎
Lines changed: 17 additions & 50 deletions b/‎README.md‎
Lines changed: 17 additions & 50 deletions
diff --git a/‎docs/src/api.md‎
Lines changed: 13 additions & 0 deletions b/‎docs/src/api.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎docs/src/index.md‎
Lines changed: 40 additions & 3 deletions b/‎docs/src/index.md‎
Lines changed: 40 additions & 3 deletions
diff --git a/‎examples/Project.toml‎
Lines changed: 6 additions & 0 deletions b/‎examples/Project.toml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/flux_mnist.jl‎
Lines changed: 17 additions & 18 deletions b/‎examples/flux_mnist.jl‎
Lines changed: 17 additions & 18 deletions
diff --git a/‎perf/perf.jl‎
Lines changed: 45 additions & 0 deletions b/‎perf/perf.jl‎
Lines changed: 45 additions & 0 deletions
@@ -1,8 +1,8 @@
 *.jl.*.cov
 *.jl.cov
 *.jl.mem
-/Manifest.toml
+Manifest.toml
 /docs/build/
 .CondaPkg
 .vscode
-
+.ipynb_checkpoints
@@ -1,7 +1,7 @@
 channels = ["conda-forge"]
 
 [deps]
-datasets = ">=2.7, <3"
-numpy = ">=1.20, <2"
 pillow = ">=9.1, <10"
+numpy = ">=1.20, <2"
+datasets = ">=2.7, <3"
 pyarrow = "==6.0.0"
@@ -1,28 +1,23 @@
 name = "HuggingFaceDatasets"
 uuid = "d94b9a45-fdf5-4270-b024-5cbb9ef7117d"
-authors = ["Carlo Lucibello <[email protected]> and contributors"]
-version = "0.1.0"
+authors = ["Carlo Lucibello"]
+version = "0.2.0"
 
 [deps]
 CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
+DLPack = "53c2dc0f-f7d5-43fd-8906-6c0220547083"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
 
 [compat]
 CondaPkg = "0.2"
-Flux = "0.13"
-MLDatasets = "0.7"
+DLPack = "0.1"
 MLUtils = "0.2, 0.3"
 PythonCall = "0.8, 0.9"
 julia = "1.7"
 
 [extras]
-Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-ImageInTerminal = "d8c32880-2388-543b-8c61-d9f865259254"
-ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
-MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
-ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "Flux", "ImageInTerminal", "MLDatasets", "ProfileView", "ImageShow"]
+test = ["Test"]
@@ -5,78 +5,45 @@
 [![Build Status](https://github.com/CarloLucibello/HuggingFaceDatasets.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/CarloLucibello/HuggingFaceDatasets.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/CarloLucibello/HuggingFaceDatasets.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/CarloLucibello/HuggingFaceDatasets.jl) 
 
-A julia wrapper around the Hugging Face `datasets` python package, exposing a large collection
-of machine learning datasets. 
+HuggingFaceDatasets.jl is an unofficial Julia wrapper around the Python package `datasets` from Hugging Face. `datasets` contains a large collection of machine learning datasets (see [here](https://huggingface.co/datasets) for a list) that this package makes available to the Julia ecosystem.
 
 This package is built on top of [PythonCall.jl](https://github.com/cjdoris/PythonCall.jl).
 
 ## Installation
 
-This package is currently under development and not registered yet. 
-You can install it using the Julia package manager with the command
+HuggingFaceDatasets.jl is a registered Julia package. You can easily install it through the package manager:
 
 ```julia
-pkg> add https://github.com/CarloLucibello/HuggingFaceDatasets.jl
+pkg> add HuggingFaceDatasets
 ```
 
-## Usage Examples
+## Usage
 
-HuggingFaceDatasets.jl provides a few wrappers around types from the `datasets` python package,
-along with a few related methods.
+HuggingFaceDatasets.jl provides wrappers around types from the `datasets` python package (e.g. `Dataset` and `DatasetDict`) along with a few related methods.
 
 Check out the `examples/` folder for usage examples.
 
-### `load_dataset` method
-
 ```julia
-julia> using HuggingFaceDatasets
+julia> train_data = load_dataset("mnist", split = "train")
+Dataset(<py Dataset({
+    features: ['image', 'label'],
+    num_rows: 60000
+})>, identity)
 
-julia> train_data = load_dataset("mnist", split="train")
-Reusing dataset mnist (/home/carlo/.cache/huggingface/datasets/mnist/mnist/1.0.0 fda16c03c4ecfb13f165ba7e29cf38129ce035011519968cdaf74894ce91c9d4)
+# Indexing starts with 1. 
+# By default, python types are returned.
+julia> train_data[1]
+Python dict: {'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x2B64E2E90>, 'label': 5}
+
+julia> set_format!(train_data, "julia")
 Dataset(<py Dataset({
     features: ['image', 'label'],
     num_rows: 60000
 })>, HuggingFaceDatasets.py2jl)
 
+# Now we have julia types
 julia> train_data[1]
 Dict{String, Any} with 2 entries:
   "label" => 5
   "image" => UInt8[0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00; … ; 0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00]
 ```
-
-### `set_transform!` method
-
-```julia
-julia> using HuggingFaceDatasets, Flux
-
-julia> train_data = load_dataset("mnist", split="train");
-
-julia> function mnist_transform(x)
-            x = py2jl(x) # `py2jl` converts python types to julia types. This is the default transform.
-            image = Flux.batch(x["image"]) ./ 255f0
-            label = Flux.onehotbatch(x["label"], 0:9)
-            return (; image, label)
-        end
-
-julia> set_transform!(train_data, mnist_transform)
-
-julia> train_data[1:5].image |> summary
-"28×28×5 Array{Float32, 3}"
-
-julia> train_data[1:5].label
-10×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool:
- ⋅  1  ⋅  ⋅  ⋅
- ⋅  ⋅  ⋅  1  ⋅
- ⋅  ⋅  ⋅  ⋅  ⋅
- ⋅  ⋅  ⋅  ⋅  ⋅
- ⋅  ⋅  1  ⋅  ⋅
- 1  ⋅  ⋅  ⋅  ⋅
- ⋅  ⋅  ⋅  ⋅  ⋅
- ⋅  ⋅  ⋅  ⋅  ⋅
- ⋅  ⋅  ⋅  ⋅  ⋅
- ⋅  ⋅  ⋅  ⋅  1
-```
-
-## Datasets list
-
-For a list of the available datasets, see https://huggingface.co/datasets.
@@ -0,0 +1,13 @@
+# API
+
+## Index
+
+```@index
+Pages   = ["api.md"]
+```
+
+## Docs
+
+```@autodocs
+Modules = [HuggingFaceDatasets]
+```
@@ -6,9 +6,46 @@ CurrentModule = HuggingFaceDatasets
 
 Documentation for [HuggingFaceDatasets](https://github.com/CarloLucibello/HuggingFaceDatasets.jl).
 
-```@index
+
+HuggingFaceDatasets.jl is an unofficial Julia wrapper around the Python package `datasets` from Hugging Face. `datasets` contains a large collection of machine learning datasets (see [here](https://huggingface.co/datasets) for a list) that this package makes available to the Julia ecosystem.
+
+This package is built on top of [PythonCall.jl](https://github.com/cjdoris/PythonCall.jl).
+
+## Installation
+
+HuggingFaceDatasets.jl is a registered Julia package. You can easily install it through the package manager:
+
+```julia
+pkg> add HuggingFaceDatasets
 ```
 
-```@autodocs
-Modules = [HuggingFaceDatasets]
+## Usage
+
+HuggingFaceDatasets.jl provides wrappers around types from the `datasets` python package (e.g. `Dataset` and `DatasetDict`) along with a few related methods.
+
+Check out the `examples/` folder for usage examples.
+
+```julia
+julia> train_data = load_dataset("mnist", split = "train")
+Dataset(<py Dataset({
+    features: ['image', 'label'],
+    num_rows: 60000
+})>, identity)
+
+# Indexing starts with 1. 
+# By default, python types are returned.
+julia> train_data[1]
+Python dict: {'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x2B64E2E90>, 'label': 5}
+
+julia> set_format!(train_data, "julia")
+Dataset(<py Dataset({
+    features: ['image', 'label'],
+    num_rows: 60000
+})>, HuggingFaceDatasets.py2jl)
+
+# Now we have julia types
+julia> train_data[1]
+Dict{String, Any} with 2 entries:
+  "label" => 5
+  "image" => UInt8[0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00; … ; 0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00]
 ```
@@ -0,0 +1,6 @@
+[deps]
+Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+HuggingFaceDatasets = "d94b9a45-fdf5-4270-b024-5cbb9ef7117d"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -2,13 +2,12 @@ using Flux, Zygote
 using Random, Statistics
 using Flux.Losses: logitcrossentropy
 using Flux: onecold
-using CUDA
 using HuggingFaceDatasets
 # using ProfileView, BenchmarkTools
 
 function mnist_transform(x)
     x = py2jl(x)
-    image = Flux.batch(x["image"]) ./ 255f0
+    image = x["image"] ./ 255f0
     label = Flux.onehotbatch(x["label"], 0:9)
     return (; image, label)
 end
@@ -18,37 +17,36 @@ function loss_and_accuracy(data_loader, model, device)
     ls = 0.0f0
     num = 0
     for (x, y) in data_loader
-        x, y = device(x), device(y)
-        ŷ = model(x)
+        x, y = x |> device, y |> device
+		ŷ = model(x)
         ls += logitcrossentropy(ŷ, y, agg=sum)
-        acc += sum(onecold(ŷ) .== onecold(y)) ## Decode the output of the model
+        acc += sum(onecold(ŷ) .== onecold(y))
         num +=  size(x)[end]
     end
     return ls / num, acc / num
 end
 
-function train(epochs)	
+function train(epochs)
     batchsize = 128
     nhidden = 100
-    device = cpu
+    device = gpu
 
-    dtrain = load_dataset("mnist", split="train")
-    dtest = load_dataset("mnist", split="test")
-    set_transform!(dtrain, mnist_transform)
-    set_transform!(dtest, mnist_transform)
+    dataset = load_dataset("mnist")
+    set_format!(dataset, "julia")
+    set_jltransform!(dataset, mnist_transform)
 
     # We use [:] to materialize and transform the whole dataset.
     # This gives much faster iterations.
-    train_loader = Flux.DataLoader(dtrain[:]; batchsize, shuffle=true) 
-    test_loader = Flux.DataLoader(dtest[:]; batchsize)
+    # Omit the [:] if you don't want to load the whole dataset in-memory.
+    train_loader = Flux.DataLoader(dataset["train"][:]; batchsize, shuffle=true) 
+    test_loader = Flux.DataLoader(dataset["test"][:]; batchsize)
 
     model = Chain([Flux.flatten,
                    Dense(28*28, nhidden, relu),
                    Dense(nhidden, nhidden, relu),
                    Dense(nhidden, 10)]) |> device
 
-	ps = Flux.params(model)
-	opt = ADAM(1e-4)
+	opt = Flux.setup(AdamW(1e-3), model)
 
     function report(epoch)
 		train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
@@ -62,11 +60,12 @@ function train(epochs)
 	for epoch in 1:epochs
 		for (x, y) in train_loader
 			x, y = x |> device, y |> device
-			loss, gs = withgradient(() -> logitcrossentropy(model(x), y), ps)
-			Flux.update!(opt, ps, gs)
+			loss, grads = withgradient(model -> logitcrossentropy(model(x), y), model)
+            Flux.update!(opt, model, grads[1])
 		end
         report(epoch)
 	end
 end
 
-@time train(2)
+@time train(2)  # 8s on a m1 pro with in-memory loading
+                # 20s on-the-fly loading
@@ -0,0 +1,45 @@
+using HuggingFaceDatasets
+using BenchmarkTools
+
+function f(ds)
+    for i in 1:6000
+        ds[i]
+    end
+end
+
+ds_plain = load_dataset("mnist", split="train")
+@btime f(ds_plain)
+
+
+ds_julia = with_format(ds_plain, "julia")  # dataset with the "julia" format requested
+@btime f(ds_julia)  # time the julia-formatted dataset (previously re-timed ds_plain by mistake)
+
+
+ds_py2jl = with_jltransform(ds_plain, py2jl)
+
+set_transform!(ds2, py2jl)
+
+@btime f1(ds2)
+
+ds2[1]["image"]
+
+set_transform!(ds2, identity)
+@time ds2[1:10000]["image"];
+@time Flux.batch(ds2[1:10000]["image"])
+
+@time Flux.batch(ds2[1:10000]["image"] |> py2jl)
+
+ds[1]
+
+ds2["label"]
+
+ds2.set_format("numpy")
+ds2[1:10] |> py2jl
+
+ds2["image"]
+
+#####
+function set_jltransform!(ds, transform = identity)
+    ds.pyset_format("numpy")
+    ds.jltransform(transform)
+end