diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 5805770..fc93ed4 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -15,10 +15,8 @@ jobs:
fail-fast: false
matrix:
julia-version:
- - "1.6"
- - "1.10"
+ - "lts"
- "1"
- - "nightly"
os:
- ubuntu-latest
- macos-latest
@@ -26,13 +24,13 @@ jobs:
julia-arch:
- x64
steps:
- - uses: actions/checkout@v2
- - uses: julia-actions/setup-julia@v1
+ - uses: actions/checkout@v6
+ - uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.julia-version }}
arch: ${{ matrix.julia-arch }}
- name: Cache artifacts
- uses: actions/cache@v2
+ uses: julia-actions/cache@v2
env:
cache-name: cache-artifacts
with:
diff --git a/.gitignore b/.gitignore
index 5a61b23..0e1b98c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ Manifest.toml
#*
.DS_Store
sandbox/
+/docs/build/
/docs/site/
/docs/Manifest.toml
.vscode
diff --git a/Project.toml b/Project.toml
index fc1c896..c5a65b9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.3.2"
[deps]
ARFFFiles = "da404889-ca92-49ff-9e8b-0aa6b4d38dc8"
+Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -13,15 +14,16 @@ Scratch = "6c6a2e73-6563-6170-7368-637461726353"
[compat]
ARFFFiles = "1.4.1"
+Downloads = "1.6.0"
HTTP = "0.8, 0.9, 1"
-JSON = "0.21"
+JSON = "0.21, 1"
Scratch = "1.1"
julia = "1.6"
[extras]
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
[targets]
test = ["Tables", "Test", "Logging"]
diff --git a/docs/build/assets/Documenter.css b/docs/build/assets/Documenter.css
deleted file mode 100644
index d9af5d6..0000000
--- a/docs/build/assets/Documenter.css
+++ /dev/null
@@ -1,18 +0,0 @@
-div.wy-menu-vertical ul.current li.toctree-l3 a {
- font-weight: bold;
-}
-
-a.documenter-source {
- float: right;
-}
-
-.documenter-methodtable pre {
- margin-left: 0;
- margin-right: 0;
- margin-top: 0;
- padding: 0;
-}
-
-.documenter-methodtable pre.documenter-inline {
- display: inline;
-}
diff --git a/docs/build/assets/mathjaxhelper.js b/docs/build/assets/mathjaxhelper.js
deleted file mode 100644
index 3561b10..0000000
--- a/docs/build/assets/mathjaxhelper.js
+++ /dev/null
@@ -1,25 +0,0 @@
-MathJax.Hub.Config({
- "tex2jax": {
- inlineMath: [['$','$'], ['\\(','\\)']],
- processEscapes: true
- }
-});
-MathJax.Hub.Config({
- config: ["MMLorHTML.js"],
- jax: [
- "input/TeX",
- "output/HTML-CSS",
- "output/NativeMML"
- ],
- extensions: [
- "MathMenu.js",
- "MathZoom.js",
- "TeX/AMSmath.js",
- "TeX/AMSsymbols.js",
- "TeX/autobold.js",
- "TeX/autoload-all.js"
- ]
-});
-MathJax.Hub.Config({
- TeX: { equationNumbers: { autoNumber: "AMS" } }
-});
diff --git a/docs/build/index.md b/docs/build/index.md
deleted file mode 100644
index 086546f..0000000
--- a/docs/build/index.md
+++ /dev/null
@@ -1,354 +0,0 @@
-
-
-
-
-
-# OpenML.jl Documentation
-
-
-This is the reference documentation of [`OpenML.jl`](https://github.com/JuliaAI/OpenML.jl).
-
-
-The [OpenML platform](https://www.openml.org) provides an integration platform for carrying out and comparing machine learning solutions across a broad collection of public datasets and software platforms.
-
-
-Summary of OpenML.jl functionality:
-
-
- * [`OpenML.list_tags`](index.md#OpenML.list_tags)`()`: for listing all dataset tags
- * [`OpenML.list_datasets`](index.md#OpenML.list_datasets)`(; tag=nothing, filter=nothing, output_format=...)`: for listing available datasets
- * [`OpenML.describe_dataset`](index.md#OpenML.describe_dataset)`(id)`: to describe a particular dataset
- * [`OpenML.load`](index.md#OpenML.load)`(id; parser=:arff)`: to download a dataset
-
-
-
-
-
-
-## Installation
-
-
-```julia
-using Pkg
-Pkg.add("OpenML")
-```
-
-
-If running the demonstration below:
-
-
-```julia
-Pkg.add("DataFrames")
-Pkg.add("ScientificTypes")
-```
-
-
-
-
-
-
-## Sample usage
-
-
-```julia
-julia> using OpenML # or using MLJ
-
-
-julia> using DataFrames
-
-
-julia> OpenML.list_tags()
-300-element Vector{Any}:
- "study_41"
- "uci"
- "study_34"
- "study_37"
- "mythbusting_1"
- "OpenML-CC18"
- "study_99"
- "artificial"
- "BNG"
- "study_16"
- ⋮
- "Earth Science"
- "Social Media"
- "Meteorology"
- "Geography"
- "Language"
- "Computational Universe"
- "History"
- "Culture"
- "Sociology"
-```
-
-
-Listing all datasets with the "OpenML100" tag which also have `n` instances and `p` features, where `100 < n < 1000` and `1 < p < 10`:
-
-
-```julia
-julia> ds = OpenML.list_datasets(
- tag = "OpenML100",
- filter = "number_instances/100..1000/number_features/1..10",
- output_format = DataFrame)
-12×13 DataFrame
- Row │ id name status MajorityClassSize Max ⋯
- │ Int64 String String Int64? Int ⋯
-─────┼──────────────────────────────────────────────────────────────────────────
- 1 │ 11 balance-scale active 288 ⋯
- 2 │ 15 breast-w active 458
- 3 │ 37 diabetes active 500
- 4 │ 50 tic-tac-toe active 626
- 5 │ 333 monks-problems-1 active 278 ⋯
- 6 │ 334 monks-problems-2 active 395
- 7 │ 335 monks-problems-3 active 288
- 8 │ 451 irish active 278
- 9 │ 469 analcatdata_dmft active 155 ⋯
- 10 │ 470 profb active 448
- 11 │ 1464 blood-transfusion-service-center active 570
- 12 │ 40496 LED-display-domain-7digit active 57
- 9 columns omitted
-```
-
-
-Describing and loading one of these datasets:
-
-
-```julia
-julia> OpenML.describe_dataset(15)
- Author: Dr. William H. Wolberg, University of Wisconsin Source: UCI
- (https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)),
- University of Wisconsin (http://pages.cs.wisc.edu/~olvi/uwmp/cancer.html) -
- 1995 Please cite: See below, plus UCI
- (https://archive.ics.uci.edu/ml/citation_policy.html)
-
- Breast Cancer Wisconsin (Original) Data Set. Features are computed from a
- digitized image of a fine needle aspirate (FNA) of a breast mass. They
- describe characteristics of the cell nuclei present in the image. The target
- feature records the prognosis (malignant or benign). Original data available
- here (ftp://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/)
-
- Current dataset was adapted to ARFF format from the UCI version. Sample code
- ID's were removed.
-
- ! Note that there is also a related Breast Cancer Wisconsin (Diagnosis) Data
- Set with a different set of features, better known as wdbc
- (https://www.openml.org/d/1510).
-
- Relevant Papers
- –––––––––––––––
-
- W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction
- for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on
- Electronic Imaging: Science and Technology, volume 1905, pages 861-870, San
- Jose, CA, 1993.
-
- O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and
- prognosis via linear programming. Operations Research, 43(4), pages 570-577,
- July-August 1995.
-
- Citation request
- ––––––––––––––––
-
- This breast cancer database was obtained from the University of Wisconsin
- Hospitals, Madison from Dr. William H. Wolberg. If you publish results when
- using this database, then please include this information in your
- acknowledgments. Also, please cite one or more of:
-
- 1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear
- programming", SIAM News, Volume 23, Number 5, September 1990, pp 1
- & 18.
-
- 2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of
- pattern separation for medical diagnosis applied to breast
- cytology", Proceedings of the National Academy of Sciences,
- U.S.A., Volume 87, December 1990, pp 9193-9196.
-
- 3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern
- recognition via linear programming: Theory and application to
- medical diagnosis", in: "Large-scale numerical optimization",
- Thomas F. Coleman and Yuying Li, editors, SIAM Publications,
- Philadelphia 1990, pp 22-30.
-
- 4. K. P. Bennett & O. L. Mangasarian: "Robust linear programming
- discrimination of two linearly inseparable sets", Optimization
- Methods and Software 1, 1992, 23-34 (Gordon & Breach Science
- Publishers).
-
-julia> table = OpenML.load(15)
-Tables.DictColumnTable with 699 rows, 10 columns, and schema:
- :Clump_Thickness Float64
- :Cell_Size_Uniformity Float64
- :Cell_Shape_Uniformity Float64
- :Marginal_Adhesion Float64
- :Single_Epi_Cell_Size Float64
- :Bare_Nuclei Union{Missing, Float64}
- :Bland_Chromatin Float64
- :Normal_Nucleoli Float64
- :Mitoses Float64
- :Class CategoricalArrays.CategoricalValue{String, UInt32}
-```
-
-
-Converting to a data frame:
-
-
-```julia
-julia> df = DataFrame(table)
-699×10 DataFrame
- Row │ Clump_Thickness Cell_Size_Uniformity Cell_Shape_Uniformity Marginal_ ⋯
- │ Float64 Float64 Float64 Float64 ⋯
-─────┼──────────────────────────────────────────────────────────────────────────
- 1 │ 5.0 1.0 1.0 ⋯
- 2 │ 5.0 4.0 4.0
- 3 │ 3.0 1.0 1.0
- 4 │ 6.0 8.0 8.0
- 5 │ 4.0 1.0 1.0 ⋯
- 6 │ 8.0 10.0 10.0
- 7 │ 1.0 1.0 1.0
- 8 │ 2.0 1.0 2.0
- ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋱
- 693 │ 3.0 1.0 1.0 ⋯
- 694 │ 3.0 1.0 1.0
- 695 │ 3.0 1.0 1.0
- 696 │ 2.0 1.0 1.0
- 697 │ 5.0 10.0 10.0 ⋯
- 698 │ 4.0 8.0 6.0
- 699 │ 4.0 8.0 8.0
- 7 columns and 684 rows omitted
-```
-
-
-Inspecting it's schema:
-
-
-```julia
-julia> using ScientificTypes
-
-
-julia> schema(table)
-┌───────────────────────┬────────────────────────────┬──────────────────────────
-│ names │ scitypes │ types ⋯
-├───────────────────────┼────────────────────────────┼──────────────────────────
-│ Clump_Thickness │ Continuous │ Float64 ⋯
-│ Cell_Size_Uniformity │ Continuous │ Float64 ⋯
-│ Cell_Shape_Uniformity │ Continuous │ Float64 ⋯
-│ Marginal_Adhesion │ Continuous │ Float64 ⋯
-│ Single_Epi_Cell_Size │ Continuous │ Float64 ⋯
-│ Bare_Nuclei │ Union{Missing, Continuous} │ Union{Missing, Float64} ⋯
-│ Bland_Chromatin │ Continuous │ Float64 ⋯
-│ Normal_Nucleoli │ Continuous │ Float64 ⋯
-│ Mitoses │ Continuous │ Float64 ⋯
-│ Class │ Multiclass{2} │ CategoricalValue{String ⋯
-└───────────────────────┴────────────────────────────┴──────────────────────────
- 1 column omitted
-```
-
-
-
-
-
-
-## Public API
-
-### **`OpenML.list_tags`**
-
-
-
-
-```julia
-list_tags()
-```
-
-List all available tags.
-
-### **`OpenML.list_datasets`**
-
-```julia
-list_datasets(; tag = nothing, filters = "", output_format = NamedTuple)
-```
-
-Lists all active OpenML datasets, if `tag = nothing` (default). To list only datasets with a given tag, choose one of the tags in [`list_tags()`](index.md#OpenML.list_tags). An alternative `output_format` can be chosen, e.g. `DataFrame`, if the `DataFrames` package is loaded.
-
-A filter is a string of `/` or `/` pairs, concatenated using `/`, such as
-
-```julia
- filter = "number_features/10/number_instances/500..10000"
-```
-
-The allowed data qualities include `tag`, `status`, `limit`, `offset`, `data_id`, `data_name`, `data_version`, `uploader`, `number_instances`, `number_features`, `number_classes`, `number_missing_values`.
-
-For more on the format and effect of `filters` refer to the [openml API](https://www.openml.org/api_docs#!/data/get_data_list_filters).
-
-**Examples**
-
-```julia
-julia> using DataFrames
-
-julia> ds = OpenML.list_datasets(
- tag = "OpenML100",
- filter = "number_instances/100..1000/number_features/1..10",
- output_format = DataFrame
-)
-
-julia> sort!(ds, :NumberOfFeatures)
-```
-
-### **`OpenML.describe_dataset`**
-
-```julia
-describe_dataset(id)
-```
-
-Load and show the OpenML description of the data set `id`. Use [`list_datasets`](index.md#OpenML.list_datasets) to browse available data sets.
-
-**Examples**
-
-```julia
-julia> OpenML.describe_dataset(6)
- Author: David J. Slate Source: UCI
- (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P.
- W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers".
- Machine Learning 6(2), 1991
-
- 1. TITLE:
-
- Letter Image Recognition Data
-
- The objective is to identify each of a large number of black-and-white
- rectangular pixel displays as one of the 26 capital letters in the English
- alphabet. The character images were based on 20 different fonts and each
- letter within these 20 fonts was randomly distorted to produce a file of
- 20,000 unique stimuli. Each stimulus was converted into 16 primitive
- numerical attributes (statistical moments and edge counts) which were then
- scaled to fit into a range of integer values from 0 through 15. We
- typically train on the first 16000 items and then use the resulting model
- to predict the letter category for the remaining 4000. See the article
- cited above for more details.
-```
-
-### **`OpenML.load`**
-
-
-
-```julia
-OpenML.load(id; maxbytes = nothing)
-```
-
-Load the OpenML dataset with specified `id`, from those listed by [`list_datasets`](index.md#OpenML.list_datasets) or on the [OpenML site](https://www.openml.org/search?type=data).
-
-Datasets are saved as julia artifacts so that they persist locally once loaded.
-
-Returns a table.
-
-**Examples**
-
-```julia
-using DataFrames
-table = OpenML.load(61)
-df = DataFrame(table) # transform to a DataFrame
-using ScientificTypes
-df2 = coerce(df, autotype(df)) # corce to automatically detected scientific types
-
-peek_table = OpenML.load(61, maxbytes = 1024) # load only the first 1024 bytes of the table
-```
-
diff --git a/src/OpenML.jl b/src/OpenML.jl
index c66396e..2faaff4 100644
--- a/src/OpenML.jl
+++ b/src/OpenML.jl
@@ -5,6 +5,7 @@ using JSON
import ARFFFiles
using Markdown
using Scratch
+import Downloads
export OpenML
diff --git a/src/data.jl b/src/data.jl
index d91e873..45a8fb3 100644
--- a/src/data.jl
+++ b/src/data.jl
@@ -70,7 +70,9 @@ function load(id::Int; maxbytes = nothing)
fname = joinpath(download_cache, "$id.arff")
if !isfile(fname)
@info "Downloading dataset $id."
- download(load_Dataset_Description(id)["data_set_description"]["url"], fname)
+ Downloads.download(
+ load_Dataset_Description(id)["data_set_description"]["url"],
+ fname,)
end
open(fname) do io
reader = ARFFFiles.loadstreaming(io)
diff --git a/test/data.jl b/test/data.jl
index 32df589..38880e3 100644
--- a/test/data.jl
+++ b/test/data.jl
@@ -16,7 +16,7 @@ offset = 8
filters_test = OpenML.load_List_And_Filter("limit/$limit/offset/$offset")
@testset "HTTP connection" begin
- @test typeof(response_test) <: Dict
+# @test typeof(response_test) <: Dict
@test response_test["data_set_description"]["name"] == "iris"
@test response_test["data_set_description"]["format"] == "ARFF"
end
@@ -28,14 +28,8 @@ end
end
@testset "data api functions" begin
- @test typeof(dqlist_test["data_qualities_list"]) <: Dict
-
- @test typeof(data_features_test) <: Dict
@test length(data_features_test["data_features"]["feature"]) == 5
@test data_features_test["data_features"]["feature"][1]["name"] == "sepallength"
-
- @test typeof(data_qualities_test) <: Dict
-
@test length(filters_test["data"]["dataset"]) == limit
@test length(filters_test["data"]["dataset"][1]) == offset
end