
Commit 36acddc

Switch LearnBase + MLDataPattern + DataLoaders -> MLUtils (#229)

* Switch LearnBase + MLDataPattern + DataLoaders -> MLUtils
* Remove unneeded dependencies
* Update docs to use MLUtils.jl

1 parent b60afa8 · commit 36acddc


44 files changed (+192 −435 lines)

CHANGELOG.md

Lines changed: 13 additions & 2 deletions

```diff
@@ -5,7 +5,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v0.4.3
+## v0.5 (unreleased)
+
+### Changed
+
+- (BREAKING) Now uses [MLUtils.jl](https://github.com/JuliaML/MLUtils.jl) to create and load datasets and data containers
+- Replaces dependencies MLDataPattern.jl, LearnBase.jl, and DataLoaders.jl
+- Data containers must now implement the `Base.getindex`/`MLUtils.getobs` and `Base.length`/`MLUtils.numobs` interfaces.
+- Previously exported `MLDataPattern.datasubset` has been replaced by `MLUtils.ObsView`
+- Documentation has been updated appropriately
+
+
+## v0.4.3 (2022/05/14)
 
 ### Added
 
@@ -17,7 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - the old APIs for registries have been removed and functionality for accessing them (`finddatasets`, `loaddataset`) has been deprecated. See the updated docs for how to find functionality using the new feature registries.
 
 
-## v0.4.2
+## v0.4.2 (2022/04/30)
 
 ### Added
 
```
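The breaking interface change above is mechanical in practice. As a hedged sketch (the `SquaresData` type below is hypothetical, not part of FastAI.jl), a custom data container now only needs the two standard `Base` methods, which MLUtils.jl's generic functions fall back to:

```julia
using MLUtils

# Hypothetical toy container: observation i is the pair (i, i^2)
struct SquaresData
    n::Int
end

# MLUtils.numobs falls back to Base.length,
# and MLUtils.getobs falls back to Base.getindex.
Base.length(data::SquaresData) = data.n
Base.getindex(data::SquaresData, i::Int) = (i, i^2)

data = SquaresData(10)
numobs(data)     # 10
getobs(data, 3)  # (3, 9)
```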

Project.toml

Lines changed: 4 additions & 12 deletions

```diff
@@ -4,29 +4,25 @@ authors = ["Lorenz Ohly", "Julia Community"]
 version = "0.4.3"
 
 [deps]
-Animations = "27a7e980-b3e6-11e9-2bcd-0b925532e340"
-BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 ColorVectorSpace = "c3611d14-8923-5661-9e6a-0046d554d3a4"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataAugmentation = "88a5189c-e7ff-4f85-ac6b-e6158070f02e"
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-DataLoaders = "2e981812-ef13-4a9c-bfa0-ab13047b12a9"
 FeatureRegistries = "c6aefb4f-3ac3-4095-8805-528476b02c02"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
 FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 FluxTraining = "7bf95e4d-ca32-48da-9824-f0dc5310474f"
-Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
 ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
 ImageInTerminal = "d8c32880-2388-543b-8c61-d9f865259254"
 IndirectArrays = "9b13fd28-a010-5f03-acff-a1bbcff69959"
 InlineTest = "bd334432-b1e7-49c7-a2dc-dd9149e4ebd6"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-LearnBase = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6"
-MLDataPattern = "9920b226-0b2a-5f5f-9153-9aa70a013f8b"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 MosaicViews = "e94cdb99-869f-56ef-bcf0-1ae2bcbe0389"
 Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
@@ -45,29 +41,25 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
-Animations = "0.4"
-BSON = "0.3"
 CSV = "0.8, 0.9, 0.10"
 ColorVectorSpace = "0.9"
 Colors = "0.12"
 DataAugmentation = "0.2.4"
 DataDeps = "0.7"
 DataFrames = "1"
-DataLoaders = "0.1"
 FeatureRegistries = "0.1"
 FileIO = "1.7"
 FilePathsBase = "0.9"
 FixedPointNumbers = "0.8"
 Flux = "0.12, 0.13"
 FluxTraining = "0.2, 0.3"
-Glob = "1"
 ImageIO = "0.6"
 ImageInTerminal = "0.4"
 IndirectArrays = "0.5, 1"
 InlineTest = "0.2"
 JLD2 = "0.4"
-LearnBase = "0.3, 0.4, 0.6"
-MLDataPattern = "0.5"
+MLDatasets = "0.7"
+MLUtils = "0.2.6"
 MosaicViews = "0.2, 0.3"
 Parameters = "0.12"
 PrettyTables = "1.2"
```

docs/Project.toml

Lines changed: 2 additions & 1 deletion

```diff
@@ -9,10 +9,11 @@ FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 FluxTraining = "7bf95e4d-ca32-48da-9824-f0dc5310474f"
 ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
-ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
 Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
 JuliaSyntax = "70703baa-626e-46a2-a12c-08ffd08c73b4"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
+MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
 ModuleInfo = "3c3ff5e7-c68c-4a09-80d1-9526a1e9878a"
 Pollen = "c88717ad-5130-4874-a664-5a9aba5ec443"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
```

docs/background/datapipelines.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -38,7 +38,7 @@ batchdata = batchviewcollated(taskdata, 16)
 NBATCHES = 200
 
 # sequential data iterator
-@time for (i, batch) in enumerate(getobs(batchdata, i) for i in 1:nobs(batchdata))
+@time for (i, batch) in enumerate(getobs(batchdata, i) for i in 1:numobs(batchdata))
     i != NBATCHES || break
 end
 
@@ -96,11 +96,11 @@ To find performance bottlenecks in the loading of each observation, you'll want
 ```julia
 using BenchmarkTools
 using FastAI
-using FastAI.Datasets
+using FastAI.Datasets, FastAI.MLUtils
 
 # Since loading times can vary per observation, we'll average the measurements over multiple observations
 N = 10
-data = datasubset(data, 1:N)
+data = MLUtils.ObsView(data, 1:N)
 
 # Time it takes to load an `(image, class)` observation
 @btime for i in 1:N
````
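As the hunks above show, `datasubset` becomes `MLUtils.ObsView` and `nobs` becomes `numobs`. A standalone sketch of the renamed calls (using a plain `Vector` as the data container, not the benchmark data from the docs):

```julia
using MLUtils

data = collect(1:100)

# ObsView wraps the container and indices lazily; nothing is copied
subset = ObsView(data, 1:10)

numobs(subset)     # 10
getobs(subset, 2)  # 2
```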

docs/data_containers.md

Lines changed: 4 additions & 4 deletions

````diff
@@ -16,7 +16,7 @@ using FastAI
 data, _ = load(findfirst(datarecipes(datasetid="imagenette2-160")))
 ```
 
-A data container is any type that holds observations of data and allows us to load them with `getobs` and query the number of observations with `nobs`. In this case, each observation is a tuple of an image and the corresponding class; after all, we want to use it for image classification.
+A data container is any type that holds observations of data and allows us to load them with `getobs` and query the number of observations with `numobs`. In this case, each observation is a tuple of an image and the corresponding class; after all, we want to use it for image classification.
 
 {cell=main}
 ```julia
@@ -27,7 +27,7 @@ image
 
 {cell=main}
 ```julia
-nobs(data)
+numobs(data)
 ```
 
 `load(`[`datasets`](#)`[id])` makes it easy to a load a data container that is compatible with some block types, but to get a better feel for what it does, let's look under the hood by creating the same data container using some mid-level APIs.
@@ -41,11 +41,11 @@ Before we recreate the data container, we'll download the dataset and get the pa
 dir = load(datasets()["imagenette2-160"])
 ```
 
-Now we'll start with [`FileDataset`](#) which creates a data container (here a `Vector`) of files given a path. We'll use the path of the downloaded dataset:
+Now we'll start with `loadfolderdata` which creates a data container (here a `Vector`) of files given a path. We'll use the path of the downloaded dataset:
 
 {cell=main}
 ```julia
-files = FileDataset(dir)
+files = loadfolderdata(dir)
 ```
 
 `files` is a data container where each observation is a path to a file. We'll confirm that using `getobs`:
````
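Container transformations compose the same way after the switch. A self-contained sketch using only MLUtils.jl (the string vector and `uppercase` stand in for the file container and an image-loading function):

```julia
using MLUtils

files = ["cat.png", "dog.png", "owl.png"]  # stand-in for a file container

# mapobs is lazy: `uppercase` only runs when an observation is loaded
images = mapobs(uppercase, files)

numobs(images)     # 3
getobs(images, 1)  # "CAT.PNG"
```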

docs/fastai_api_comparison.md

Lines changed: 10 additions & 10 deletions

```diff
@@ -1,6 +1,6 @@
 # fastai API comparison
 
-FastAI.jl is in many ways similar to the original Python [fastai](docs.fast.ai), but also has its differences. This reference goes through all the sections in the [fastai: A Layered API for Deep Learning](https://arxiv.org/abs/2002.04688) paper and comments what the interfaces for the same functionality in FastAI.jl are, and where they differ or functionality is still missing.
+FastAI.jl is in many ways similar to the original Python [fastai](http://docs.fast.ai), but also has its differences. This reference goes through all the sections in the [fastai: A Layered API for Deep Learning](https://arxiv.org/abs/2002.04688) paper and comments what the interfaces for the same functionality in FastAI.jl are, and where they differ or functionality is still missing.
 
 ## Applications
 
@@ -10,15 +10,16 @@ FastAI.jl additionally has a unified API for registering and discovering functio
 
 ### Vision
 
-Computer vision is the most developed part of FastAI.jl with good support for different tasks and optimized data pipelines with N-dimensional images, masks and keypoints. See the tutorial section for many examples.
+Computer vision is well-supported in FastAI.jl with different tasks and optimized data pipelines for N-dimensional images, masks and keypoints. See the tutorial section for many examples.
 
 ### Tabular
 
-Support for tabular data is merged into master but is lacking documentation which will come with the next release (0.2.0).
+FastAI.jl also has support for tabular data.
 
 ### Deployment
 
-Through FastAI.jl's [`LearningTask` interface](./learning_tasks.md), the data processing logic is decoupled from the dataset creation and training and can be easily serialized and loaded to make predictions. See the tutorial on [saving and loading models](../notebooks/serialization.ipynb).
+Through FastAI.jl's [`LearningTask`](#) interface, the data processing logic is decoupled from the dataset creation and training and can be easily serialized and loaded to make predictions. See the tutorial on [saving and loading models](../notebooks/serialization.ipynb).
+
 
 ---
 
@@ -76,8 +77,7 @@ res = lrfind(learner); plot(res)  # Run learning rate finder and plot suggestio
 Since it is a Julia package, FastAI.jl is not written on top of PyTorch, but a Julia library for deep learning: [Flux.jl](http://www.fluxml.ai). In any case, the point of this section is to note that the abstractions in fastai are decoupled and existing projects can easily be reused. This is also the case for FastAI.jl as it is built on top of several decoupled libraries. Many of these were built specifically for FastAI.jl, but they are unaware of each other and useful in their own right:
 
 - [Flux.jl](https://github.com/FluxML/Flux.jl) provides models, optimizers, and loss functions, fulfilling a similar role to PyTorch
-- [MLDataPattern.jl](https://github.com/JuliaML/MLDataPattern.jl) gives you tools for building and transforming data containers
-- [DataLoaders.jl](https://github.com/lorenzoh/DataLoaders.jl) takes care of efficient, parallelized iteration of data containers
+- [MLUtils.jl](https://github.com/JuliaML/MLUtils.jl) gives you tools for building and transforming data containers. Also, it takes care of efficient, parallelized iteration of data containers.
 - [DataAugmentation.jl](https://github.com/lorenzoh/DataAugmentation.jl) takes care of the lower levels of high-performance, composable data augmentations.
 - [FluxTraining.jl](https://github.com/lorenzoh/FluxTraining.jl) contributes a highly extensible training loop with 2-way callbacks
 
@@ -126,14 +126,14 @@ FastAI.jl makes all the same datasets available in `fastai.data.external` availa
 
 ### funcs_kwargs and DataLoader, fastai.data.core
 
-In FastAI.jl, you are not restricted to a specific type of data iterator and can pass any iterator over batches to `Learner`. In cases where performance is important [`DataLoader`](#) can speed up data iteration by loading and batching samples in parallel on background threads. All transformations of data happen through the data container interface which requires a type to implement `LearnBase.getobs` and `LearnBase.nobs`, similar to PyTorch's `torch.utils.data.Dataset`. Data containers are then transformed into other data containers. Some examples:
+In FastAI.jl, you are not restricted to a specific type of data iterator and can pass any iterator over batches to `Learner`. In cases where performance is important [`DataLoader`](#) can speed up data iteration by loading and batching samples in parallel on background threads. All transformations of data happen through the data container interface which requires a type to implement `Base.getindex`/`MLUtils.getobs` and `Base.length`/`MLUtils.numobs`, similar to PyTorch's `torch.utils.data.Dataset`. Data containers are then transformed into other data containers. Some examples:
 
 - [`mapobs`](#)`(f, data)` lazily maps a function `f` of over `data` such that `getobs(mapobs(f, data), idx) == f(getobs(data, idx))`. For example `mapobs(loadfile, files)` turns a vector of image files into a data container of images.
-- `DataLoader(data, batchsize)` is a wrapper around `batchviewcollated` which turns a data container of samples into one of collated batches and `eachobsparallel` which creates a parallel, buffered iterator over the observations (here batches) in the resulting container.
+- `DataLoader(data; batchsize)` is a wrapper around [`BatchView`](#) which turns a data container of samples into one of collated batches and `eachobsparallel` which creates a parallel, buffered iterator over the observations (here batches) in the resulting container.
 - [`groupobs`](#)`(f, data)` splits a container into groups using a grouping function `f`. For example, `groupobs(grandparentname, files)` creates training splits for files where the grandparent folder indicates the split.
-- [`datasubset`](#)`(data, idxs)` lazily takes a subset of the observations in `data`.
+- [`MLUtils.ObsView`](#)`(data, idxs)` lazily takes a subset of the observations in `data`.
 
-For more information, see the [data container tutorial](data_containers.md) and the [MLDataPattern.jl docs](https://mldatapatternjl.readthedocs.io/en/latest/). At a higher level, there are also convenience functions like [`FileDataset`](#) to create data containers.
+For more information, see the [data container tutorial](data_containers.md) and the [MLUtils.jl docs](https://juliaml.github.io/MLUtils.jl/dev/). At a higher level, there are also convenience functions like `loadfolderdata` to create data containers.
 
 ### Layers and architectures
```
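The keyword form `DataLoader(data; batchsize)` introduced above comes from MLUtils.jl. A minimal sketch with toy arrays (not FastAI-specific; observations live along the last array dimension):

```julia
using MLUtils

xs = rand(2, 6)  # 6 observations with 2 features each
ys = rand(6)

# Collated batches of 3: each `x` is 2×3, each `y` has length 3
for (x, y) in DataLoader((xs, ys); batchsize=3)
    @assert size(x) == (2, 3) && length(y) == 3
end
```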
docs/glossary.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -6,7 +6,7 @@ Terms commonly used in *FastAI.jl*.
 
 In many docstrings, generic types are abbreviated with the following symbols. Many of these refer to a learning task; the context should make clear which task is meant.
 
-- `DC{T}`: A [data container](#data-container) of type T, meaning a type that implements the data container interface `getobs` and `nobs` where `getobs : (DC{T}, Int) -> Int`, that is, each observation is of type `T`.
+- `DC{T}`: A [data container](data_containers.md) of type T, meaning a type that implements the data container interface `getindex`/`getobs` and `length`/`numobs` where `getobs : (DC{T}, Int) -> Int`, that is, each observation is of type `T`.
 - `I`: Type of the unprocessed input in the context of a task.
 - `T`: Type of the target variable.
 - `X`: Type of the processed input. This is fed into a `model`, though it may be batched beforehand. `Xs` represents a batch of processed inputs.
@@ -23,7 +23,7 @@ Some examples of these in use:
 
 ### Data container
 
-A data structure that is used to load a number of data observations separately and lazily. It defines how many observations it holds with `nobs` and how to load a single observation with `getobs`.
+A data structure that is used to load a number of data observations separately and lazily. It defines how many observations it holds with `numobs` and how to load a single observation with `getobs`.
 
 ### Learning task
```
docs/introduction.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -33,7 +33,7 @@ ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
 data, blocks = load(datarecipes()["imagenette2-160"])
 ```
 
-This line downloads and loads the [ImageNette](https://github.com/fastai/imagenette) image classification dataset, a small subset of ImageNet with 10 different classes. `data` is a [data container](data_containers.md) that can be used to load individual observations, here of images and the corresponding labels. We can use `getobs(data, i)` to load the `i`-th observation and `nobs` to find out how many observations there are.
+This line downloads and loads the [ImageNette](https://github.com/fastai/imagenette) image classification dataset, a small subset of ImageNet with 10 different classes. `data` is a [data container](data_containers.md) that can be used to load individual observations, here of images and the corresponding labels. We can use `getobs(data, i)` to load the `i`-th observation and `numobs` to find out how many observations there are.
 
 {cell=main }
 ```julia
````

docs/project.jl

Lines changed: 2 additions & 1 deletion

```diff
@@ -6,12 +6,13 @@ Crayons.COLORS[:nothing] = 67
 ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"
 
 using FastAI, Flux, FluxTraining
-import DataAugmentation
+import DataAugmentation, MLUtils
 m = FastAI
 ms = [
     DataAugmentation,
     Flux,
     FluxTraining,
+    MLUtils,
     m,
 ]
```

notebooks/how_to_visualize.ipynb

Lines changed: 1 addition & 1 deletion

```diff
@@ -106,7 +106,7 @@
 }
 ],
 "source": [
-"idxs = rand(1:nobs(data), 9)\n",
+"idxs = rand(1:numobs(data), 9)\n",
 "samples = [getobs(data, i) for i in idxs]\n",
 "xs, ys = makebatch(task, data, idxs)\n",
 "ŷs = gpu(model)(gpu(xs)) |> cpu"
```
