Skip to content

Commit 08b34d9

Browse files
updates (#8)
* add docs * with_format * with_format * flux example * finish dataset * init work on datasetdict * fix all tests * more tests * docstrings * cleanup * readme * fixes * cleanup * remove m * more tests * docs * cleanup * fix test
1 parent 289b4ee commit 08b34d9

21 files changed

+784
-314
lines changed

.gitignore

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
*.jl.*.cov
22
*.jl.cov
33
*.jl.mem
4-
/Manifest.toml
4+
Manifest.toml
55
/docs/build/
66
.CondaPkg
77
.vscode
8-
8+
.ipynb_checkpoints

CondaPkg.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
channels = ["conda-forge"]
22

33
[deps]
4-
datasets = ">=2.7, <3"
5-
numpy = ">=1.20, <2"
64
pillow = ">=9.1, <10"
5+
numpy = ">=1.20, <2"
6+
datasets = ">=2.7, <3"
77
pyarrow = "==6.0.0"

Project.toml

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,23 @@
11
name = "HuggingFaceDatasets"
22
uuid = "d94b9a45-fdf5-4270-b024-5cbb9ef7117d"
3-
authors = ["Carlo Lucibello <[email protected]> and contributors"]
4-
version = "0.1.0"
3+
authors = ["Carlo Lucibello"]
4+
version = "0.2.0"
55

66
[deps]
77
CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
8+
DLPack = "53c2dc0f-f7d5-43fd-8906-6c0220547083"
89
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
910
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
1011

1112
[compat]
1213
CondaPkg = "0.2"
13-
Flux = "0.13"
14-
MLDatasets = "0.7"
14+
DLPack = "0.1"
1515
MLUtils = "0.2, 0.3"
1616
PythonCall = "0.8, 0.9"
1717
julia = "1.7"
1818

1919
[extras]
20-
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
21-
ImageInTerminal = "d8c32880-2388-543b-8c61-d9f865259254"
22-
ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
23-
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
24-
ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
2520
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2621

2722
[targets]
28-
test = ["Test", "Flux", "ImageInTerminal", "MLDatasets", "ProfileView", "ImageShow"]
23+
test = ["Test"]

README.md

Lines changed: 17 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -5,78 +5,45 @@
55
[![Build Status](https://github.com/CarloLucibello/HuggingFaceDatasets.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/CarloLucibello/HuggingFaceDatasets.jl/actions/workflows/CI.yml?query=branch%3Amain)
66
[![Coverage](https://codecov.io/gh/CarloLucibello/HuggingFaceDatasets.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/CarloLucibello/HuggingFaceDatasets.jl)
77

8-
A julia wrapper around the Hugging Face `datasets` python package, exposing a large collection
9-
of machine learning datasets.
8+
HuggingFaceDatasets.jl is an unofficial Julia wrapper around the Python package `datasets` from Hugging Face. `datasets` contains a large collection of machine learning datasets (see [here](https://huggingface.co/datasets) for a list) that this package makes available to the Julia ecosystem.
109

1110
This package is built on top of [PythonCall.jl](https://github.com/cjdoris/PythonCall.jl).
1211

1312
## Installation
1413

15-
This package is currently under development and not registered yet.
16-
You can install it using the Julia package manager with the command
14+
HuggingFaceDatasets.jl is a registered Julia package. You can easily install it through the package manager:
1715

1816
```julia
19-
pkg> add https://github.com/CarloLucibello/HuggingFaceDatasets.jl
17+
pkg> add HuggingFaceDatasets
2018
```
2119

22-
## Usage Examples
20+
## Usage
2321

24-
HuggingFaceDatasets.jl provides a few wrappers around types from the `datasets` python package,
25-
along with a few related methods.
22+
HuggingFaceDatasets.jl provides wrappers around types from the `datasets` python package (e.g. `Dataset` and `DatasetDict`) along with a few related methods.
2623

2724
Check out the `examples/` folder for usage examples.
2825

29-
### `load_dataset` method
30-
3126
```julia
32-
julia> using HuggingFaceDatasets
27+
julia> train_data = load_dataset("mnist", split = "train")
28+
Dataset(<py Dataset({
29+
features: ['image', 'label'],
30+
num_rows: 60000
31+
})>, identity)
3332

34-
julia> train_data = load_dataset("mnist", split="train")
35-
Reusing dataset mnist (/home/carlo/.cache/huggingface/datasets/mnist/mnist/1.0.0 fda16c03c4ecfb13f165ba7e29cf38129ce035011519968cdaf74894ce91c9d4)
33+
# Indexing starts with 1.
34+
# By default, Python types are returned.
35+
julia> train_data[1]
36+
Python dict: {'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x2B64E2E90>, 'label': 5}
37+
38+
julia> set_format!(train_data, "julia")
3639
Dataset(<py Dataset({
3740
features: ['image', 'label'],
3841
num_rows: 60000
3942
})>, HuggingFaceDatasets.py2jl)
4043

44+
# Now we have julia types
4145
julia> train_data[1]
4246
Dict{String, Any} with 2 entries:
4347
"label" => 5
4448
"image" => UInt8[0x00 0x00 0x00 0x00; 0x00 0x00 0x00 0x00; ; 0x00 0x00 0x00 0x00; 0x00 0x00 0x00 0x00]
4549
```
46-
47-
### `set_transform!` method
48-
49-
```julia
50-
julia> using HuggingFaceDatasets, Flux
51-
52-
julia> train_data = load_dataset("mnist", split="train");
53-
54-
julia> function mnist_transform(x)
55-
x = py2jl(x) # `py2jl` converts python types to julia types. This is the default transform.
56-
image = Flux.batch(x["image"]) ./ 255f0
57-
label = Flux.onehotbatch(x["label"], 0:9)
58-
return (; image, label)
59-
end
60-
61-
julia> set_transform!(train_data, mnist_transform)
62-
63-
julia> train_data[1:5].image |> summary
64-
"28×28×5 Array{Float32, 3}"
65-
66-
julia> train_data[1:5].label
67-
10×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool:
68-
1
69-
1
70-
71-
72-
1
73-
1
74-
75-
76-
77-
1
78-
```
79-
80-
## Datasets list
81-
82-
For a list of the available datasets, see https://huggingface.co/datasets.

docs/src/api.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# API
2+
3+
## Index
4+
5+
```@index
6+
Pages = ["api.md"]
7+
```
8+
9+
## Docs
10+
11+
```@autodocs
12+
Modules = [HuggingFaceDatasets]
13+
```

docs/src/index.md

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,46 @@ CurrentModule = HuggingFaceDatasets
66

77
Documentation for [HuggingFaceDatasets](https://github.com/CarloLucibello/HuggingFaceDatasets.jl).
88

9-
```@index
9+
10+
HuggingFaceDatasets.jl is an unofficial Julia wrapper around the Python package `datasets` from Hugging Face. `datasets` contains a large collection of machine learning datasets (see [here](https://huggingface.co/datasets) for a list) that this package makes available to the Julia ecosystem.
11+
12+
This package is built on top of [PythonCall.jl](https://github.com/cjdoris/PythonCall.jl).
13+
14+
## Installation
15+
16+
HuggingFaceDatasets.jl is a registered Julia package. You can easily install it through the package manager:
17+
18+
```julia
19+
pkg> add HuggingFaceDatasets
1020
```
1121

12-
```@autodocs
13-
Modules = [HuggingFaceDatasets]
22+
## Usage
23+
24+
HuggingFaceDatasets.jl provides wrappers around types from the `datasets` python package (e.g. `Dataset` and `DatasetDict`) along with a few related methods.
25+
26+
Check out the `examples/` folder for usage examples.
27+
28+
```julia
29+
julia> train_data = load_dataset("mnist", split = "train")
30+
Dataset(<py Dataset({
31+
features: ['image', 'label'],
32+
num_rows: 60000
33+
})>, identity)
34+
35+
# Indexing starts with 1.
36+
# By default, Python types are returned.
37+
julia> train_data[1]
38+
Python dict: {'image': <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28 at 0x2B64E2E90>, 'label': 5}
39+
40+
julia> set_format!(train_data, "julia")
41+
Dataset(<py Dataset({
42+
features: ['image', 'label'],
43+
num_rows: 60000
44+
})>, HuggingFaceDatasets.py2jl)
45+
46+
# Now we have julia types
47+
julia> train_data[1]
48+
Dict{String, Any} with 2 entries:
49+
"label" => 5
50+
"image" => UInt8[0x00 0x00 0x00 0x00; 0x00 0x00 0x00 0x00; ; 0x00 0x00 0x00 0x00; 0x00 0x00 0x00 0x00]
1451
```

examples/Project.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[deps]
2+
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
3+
HuggingFaceDatasets = "d94b9a45-fdf5-4270-b024-5cbb9ef7117d"
4+
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
5+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
6+
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

examples/flux_mnist.jl

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@ using Flux, Zygote
22
using Random, Statistics
33
using Flux.Losses: logitcrossentropy
44
using Flux: onecold
5-
using CUDA
65
using HuggingFaceDatasets
76
# using ProfileView, BenchmarkTools
87

98
function mnist_transform(x)
109
x = py2jl(x)
11-
image = Flux.batch(x["image"]) ./ 255f0
10+
image = x["image"] ./ 255f0
1211
label = Flux.onehotbatch(x["label"], 0:9)
1312
return (; image, label)
1413
end
@@ -18,37 +17,36 @@ function loss_and_accuracy(data_loader, model, device)
1817
ls = 0.0f0
1918
num = 0
2019
for (x, y) in data_loader
21-
x, y = device(x), device(y)
22-
ŷ = model(x)
20+
x, y = x |> device, y |> device
21+
ŷ = model(x)
2322
ls += logitcrossentropy(ŷ, y, agg=sum)
24-
acc += sum(onecold(ŷ) .== onecold(y)) ## Decode the output of the model
23+
acc += sum(onecold(ŷ) .== onecold(y))
2524
num += size(x)[end]
2625
end
2726
return ls / num, acc / num
2827
end
2928

30-
function train(epochs)
29+
function train(epochs)
3130
batchsize = 128
3231
nhidden = 100
33-
device = cpu
32+
device = gpu
3433

35-
dtrain = load_dataset("mnist", split="train")
36-
dtest = load_dataset("mnist", split="test")
37-
set_transform!(dtrain, mnist_transform)
38-
set_transform!(dtest, mnist_transform)
34+
dataset = load_dataset("mnist")
35+
set_format!(dataset, "julia")
36+
set_jltransform!(dataset, mnist_transform)
3937

4038
# We use [:] to materialize and transform the whole dataset.
4139
# This gives much faster iterations.
42-
train_loader = Flux.DataLoader(dtrain[:]; batchsize, shuffle=true)
43-
test_loader = Flux.DataLoader(dtest[:]; batchsize)
40+
# Omit the [:] if you don't want to load the whole dataset in-memory.
41+
train_loader = Flux.DataLoader(dataset["train"][:]; batchsize, shuffle=true)
42+
test_loader = Flux.DataLoader(dataset["test"][:]; batchsize)
4443

4544
model = Chain([Flux.flatten,
4645
Dense(28*28, nhidden, relu),
4746
Dense(nhidden, nhidden, relu),
4847
Dense(nhidden, 10)]) |> device
4948

50-
ps = Flux.params(model)
51-
opt = ADAM(1e-4)
49+
opt = Flux.setup(AdamW(1e-3), model)
5250

5351
function report(epoch)
5452
train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
@@ -62,11 +60,12 @@ function train(epochs)
6260
for epoch in 1:epochs
6361
for (x, y) in train_loader
6462
x, y = x |> device, y |> device
65-
loss, gs = withgradient(() -> logitcrossentropy(model(x), y), ps)
66-
Flux.update!(opt, ps, gs)
63+
loss, grads = withgradient(model -> logitcrossentropy(model(x), y), model)
64+
Flux.update!(opt, model, grads[1])
6765
end
6866
report(epoch)
6967
end
7068
end
7169

72-
@time train(2)
70+
@time train(2) # 8s on a m1 pro with in-memory loading
71+
# 20s on-the-fly loading

perf/perf.jl

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
using HuggingFaceDatasets
2+
using BenchmarkTools
3+
4+
function f(ds)
5+
for i in 1:6000
6+
ds[i]
7+
end
8+
end
9+
10+
ds_plain = load_dataset("mnist", split="train")
11+
@btime f(ds_plain)
12+
13+
14+
ds_julia = with_format(ds_plain, "julia")
15+
@btime f(ds_plain)
16+
17+
18+
ds_py2jl = with_jltransform(ds_plain, py2jl)
19+
20+
set_transform!(ds2, py2jl)
21+
22+
@btime f1(ds2)
23+
24+
ds2[1]["image"]
25+
26+
set_transform!(ds2, identity)
27+
@time ds2[1:10000]["image"];
28+
@time Flux.batch(ds2[1:10000]["image"])
29+
30+
@time Flux.batch(ds2[1:10000]["image"] |> py2jl)
31+
32+
ds[1]
33+
34+
ds2["label"]
35+
36+
ds2.set_format("numpy")
37+
ds2[1:10] |> py2jl
38+
39+
ds2["image"]
40+
41+
#####
42+
function set_jltransform!(ds, transform = identity)
43+
ds.pyset_format("numpy")
44+
ds.jltransform(transform)
45+
end

0 commit comments

Comments
 (0)