JuliaStats · asinghvi17 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,15 +1,17 @@
 name = "RDatasets"
 uuid = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
-version = "0.8.0"
+version = "1.0.0"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SciMLPublic = "431bcebd-1456-4ced-9d72-93c2757fff0b"
 
 [compat]
 CSV = "0.5, 0.6, 0.7, 0.8, 0.9, 0.10"
@@ -18,6 +20,7 @@ DataFrames = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 1"
 FileIO = "1"
 RData = "0.5, 0.6, 0.7, 0.8, 1"
 Reexport = "0.2, 1.0"
+SciMLPublic = "1"
 julia = "1.6"
 
 [extras]

diff --git a/README.md b/README.md
@@ -5,15 +5,21 @@
 The RDatasets package provides an easy way for Julia users to experiment with most of the standard data sets that are available in the core of R as well as datasets included with many of R's most popular packages. This package is essentially a simplistic port of the Rdatasets repo created by Vincent Arelbundock, who conveniently gathered data sets from many of the standard R packages in one convenient location on GitHub at https://github.com/vincentarelbundock/Rdatasets
 
 In order to load one of the data sets included in the RDatasets package, you will need to have the `DataFrames` package installed. This package is automatically installed as a dependency of the `RDatasets` package if you install `RDatasets` as follows:
-
-    Pkg.add("RDatasets")
-
+```julia
+Pkg.add("RDatasets")
+```
 After installing the RDatasets package, you can then load data sets using the `dataset()` function, which takes the name of a package and a data set as arguments:
-
-    using RDatasets
-    iris = dataset("datasets", "iris")
-    neuro = dataset("boot", "neuro")
-
+```julia
+using RDatasets
+iris = dataset("datasets", "iris")
+neuro = dataset("boot", "neuro")
+```
+You can also get descriptions of the datasets by calling `RDatasets.description`:
+```julia
+RDatasets.description("datasets", "iris")
+# or
+RDatasets.description(iris) # only use this on DataFrames returned from `dataset`!
+```
-RDatasets.description(iris) # only use this on DataFrames returned from `dataset`!
-```
+RDatasets.description(iris)
+```
+Only use the latter on data frames returned from `dataset`.
+
 # Data Sets
 
 The `RDatasets.packages()` function returns a table of represented R packages:
@@ -74,6 +80,23 @@ mlmRev|guImmun|Immunization in Guatemala|2159|13
 mlmRev|guPrenat|Prenatal care in Guatemala|2449|15
 mlmRev|star|Student Teacher Achievement Ratio (STAR) project data|26796|18
 
+# How to add datasets from a new package
+
+**Step 1: add the data from the package**
+
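+ 1. In your clone of this repo, create a directory for the package: `mkdir -p data/$PKG`
+ 2. Go to CRAN and open the page of the R package you want to add
+ 3. Download its *source package* (the `.tar.gz` archive)
+ 4. Extract one or more of the datasets from the archive's `data` directory into the new `data/$PKG` directory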
+
+**Step 2: add the metadata**
+
+Run the script:
+
+     $ scripts/update_doc_one.sh $PKG
+
+Now it's ready for you to submit your pull request.
+
 # Licensing and Intellectual Property
 
 Following Vincent's lead, we have assumed that all of the data sets in this repository can be made available under the GPL-3 license. If you know that one of the datasets released here should not be released publicly or if you know that a data set can only be released under a different license, please contact me so that I can remove the data set from this repository.
diff --git a/doc/datasets.csv b/doc/datasets.csv
@@ -506,6 +506,36 @@
 "datasets","volcano","Topographic Information on Auckland's Maunga Whau Volcano",87,61
 "datasets","warpbreaks","The Number of Breaks in Yarn during Weaving",54,3
 "datasets","women","Average Heights and Weights for American Women",15,2
+"gamair","aral","aral",488,4
+"gamair","aral.bnd","aral.bnd",107,3
+"gamair","bird","bird",25100,7
+"gamair","blowfly","blowfly",180,3
+"gamair","bone","bone",23,4
+"gamair","brain","brain",1567,6
+"gamair","cairo","cairo",3780,7
+"gamair","chicago","chicago",5114,8
+"gamair","chl","chl",13840,7
+"gamair","co2s","co2s",507,4
+"gamair","coast","coast",2091,3
+"gamair","engine","engine",19,3
+"gamair","gas","gas",60,804
+"gamair","harrier","harrier",37,3
+"gamair","hubble","hubble",24,4
+"gamair","ipo","ipo",156,7
+"gamair","mack","mack",634,17
+"gamair","mackp","mackp",1162,9
+"gamair","med","med",1476,25
+"gamair","meh","meh",1476,24
+"gamair","mpg","mpg",205,27
+"gamair","prostate","prostate",654,530
+"gamair","sitka","sitka",1027,6
+"gamair","sole","sole",1575,8
+"gamair","sperm.comp1","sperm.comp1",15,5
+"gamair","sperm.comp2","sperm.comp2",24,11
+"gamair","stomata","stomata",24,4
+"gamair","swer","swer",2196,10
+"gamair","wesdr","wesdr",669,5
+"gamair","wine","wine",47,8
 "gap","PD","A study of Parkinson's disease and APOE, LRRK2, SNCA makers",825,22
 "gap","aldh2","ALDH2 markers and Alcoholism",263,18
 "gap","apoeapoc","APOE/APOC1 markers and Alzheimer's",353,8
@@ -732,33 +762,3 @@
 "vcd","VonBort","Von Bortkiewicz Horse Kicks Data",280,4
 "vcd","WeldonDice","Weldon's Dice Data",11,2
 "vcd","WomenQueue","Women in Queues",11,2
-"gamair","aral.bnd","aral.bnd",107,3
-"gamair","aral","aral",488,4
-"gamair","bird","bird",25100,7
-"gamair","blowfly","blowfly",180,3
-"gamair","bone","bone",23,4
-"gamair","brain","brain",1567,6
-"gamair","cairo","cairo",3780,7
-"gamair","chicago","chicago",5114,8
-"gamair","chl","chl",13840,7
-"gamair","co2s","co2s",507,4
-"gamair","coast","coast",2091,3
-"gamair","engine","engine",19,3
-"gamair","gas","gas",60,804
-"gamair","harrier","harrier",37,3
-"gamair","hubble","hubble",24,4
-"gamair","ipo","ipo",156,7
-"gamair","mack","mack",634,17
-"gamair","mackp","mackp",1162,9
-"gamair","med","med",1476,25
-"gamair","meh","meh",1476,24
-"gamair","mpg","mpg",205,27
-"gamair","prostate","prostate",654,530
-"gamair","sitka","sitka",1027,6
-"gamair","sole","sole",1575,8
-"gamair","sperm.comp1","sperm.comp1",15,5
-"gamair","sperm.comp2","sperm.comp2",24,11
-"gamair","stomata","stomata",24,4
-"gamair","swer","swer",2196,10
-"gamair","wesdr","wesdr",669,5
-"gamair","wine","wine",47,8
diff --git a/scripts/update_doc_all.sh b/scripts/update_doc_all.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+# Feeds R (started with --no-save) a script that sources src/update_doc.r and calls
+# update_docs("."). The relative paths assume the repository root as working directory.
+R --no-save <<END
+source("src/update_doc.r")
+update_docs(".")
+END
diff --git a/scripts/update_doc_one.sh b/scripts/update_doc_one.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+# Usage: scripts/update_doc_one.sh PKG
+# Feeds R a script that sources src/update_doc.r and calls update_package_doc(".", "PKG").
+# The relative paths assume the repository root as the working directory.
+if [ -z "$1" ]; then
+    echo "usage: $0 PKG" >&2
+    exit 1
+fi
+R --no-save <<END
+source("src/update_doc.r")
+update_package_doc(".", "$1")
+END
diff --git a/src/RDatasets.jl b/src/RDatasets.jl
@@ -3,10 +3,13 @@ module RDatasets
         @eval Base.Experimental.@optlevel 1
     end
 
+    import Markdown
+    import SciMLPublic: @public
     using Reexport, RData, CSV, CodecZlib
     @reexport using DataFrames
 
     export dataset
+    @public description, packages
 
     global __packages = nothing
     global __datasets = nothing

diff --git a/src/dataset.jl b/src/dataset.jl
@@ -6,19 +6,152 @@ const Dataset_typedetect_rows = Dict{Tuple{String, String}, Union{Vector,Dict}}(
 
 function dataset(package_name::AbstractString, dataset_name::AbstractString)
     basename = joinpath(@__DIR__, "..", "data", package_name)
-
+    # Candidate files for this dataset, in lookup order: `.RData`, `.rda`, then `.csv.gz`.
+    rdataname = joinpath(basename, string(dataset_name, ".RData"))
     rdaname = joinpath(basename, string(dataset_name, ".rda"))
-    if isfile(rdaname)
-        return load(rdaname)[dataset_name]
-    end
-
     csvname = joinpath(basename, string(dataset_name, ".csv.gz"))
-    if isfile(csvname)
-        return open(csvname,"r") do io
+    # Load the first candidate that exists; raise an error naming all three if none does.
+    df = if isfile(rdataname)
+        load(rdataname)[dataset_name]
+    elseif isfile(rdaname)
+        load(rdaname)[dataset_name]
+    elseif isfile(csvname)
+        open(csvname,"r") do io
             uncompressed = IOBuffer(read(GzipDecompressorStream(io)))
             DataFrame(CSV.File(uncompressed, delim=',', quotechar='\"', missingstring="NA",
                       types=get(Dataset_typedetect_rows, (package_name, dataset_name), nothing)) )
         end
+    else
+        error("Unable to locate dataset file $rdataname, $rdaname or $csvname")
+    end
+    # Record the (package, dataset) origin as metadata so `description(df)` can find the docs.
+    metadata!(df, "RDatasets.jl", (string(package_name), string(dataset_name)))
+    return df
+end
+
+
+"""
+    RDatasets.description(package_name::AbstractString, dataset_name::AbstractString)
+    RDatasets.description(df::AbstractDataFrame; default=nothing)
+
+Look up the documentation of a dataset and return it as an `RDatasetDescription`.
+
+The first form takes the same two arguments as `dataset`. If the dataset has no HTML
+documentation file, the result wraps the text "No description available." instead.
+
+The second form reads the origin that `RDatasets.dataset` stores in the data frame's
+"RDatasets.jl" metadata. Without that metadata an `ArgumentError` is thrown, unless a
+non-`nothing` `default` is given, in which case `default` itself is returned.
+The returned object has `show` methods for text/plain, text/markdown and text/html.
+"""
+function description(package_name::AbstractString, dataset_name::AbstractString)
+    # Documentation lives in doc/<package_name>/<dataset_name>.html, relative to src/.
+    html_path = joinpath(@__DIR__, "..", "doc", package_name, "$dataset_name.html")
+    # A dataset without a documentation file gets a placeholder text instead.
+    content = isfile(html_path) ?
+        read(html_path, String) : "No description available."
+    return RDatasetDescription(content)
+end
+
+# Data-frame method: `dataset` tags every frame it returns with its (package, dataset)
+# origin under the "RDatasets.jl" metadata key; that tag is used for the lookup here.
+function description(df::AbstractDataFrame; default=nothing)
+    if !("RDatasets.jl" in metadatakeys(df))
+        # No origin tag: hand back the caller-supplied fallback, or fail if there is none.
+        default === nothing || return default
+        throw(ArgumentError("DataFrame does not have RDatasets.jl metadata. Use a DataFrame obtained from `RDatasets.dataset`, or provide a `default` value."))
+    end
+    origin = metadata(df, "RDatasets.jl")
+    package_name, dataset_name = origin
+    return description(package_name, dataset_name)
+end
+
+"""
+    RDatasetDescription(content::String)
+
+Holds the text of a dataset description; obtained through [`RDatasets.description`](@ref).
+
+The wrapper exists so that the same content can be displayed differently per MIME
+type: it has `show` methods for text/plain, text/markdown and text/html.
+
+`RDatasets.description` fills `content` with the HTML of the dataset's documentation file, or with a placeholder text when there is none.
+"""
+struct RDatasetDescription
+    content::String
+end
+
+function Base.show(io::IO, mime::MIME"text/plain", d::RDatasetDescription)
+    # Parse the converted description into a Markdown.jl object; displaying that object
+    # lets the REPL render it as markdown, the same way help-mode output is rendered.
+    rendered = Markdown.parse(description_to_markdown(d.content))
+    show(io, mime, rendered)
+end
+function Base.show(io::IO, mime::MIME"text/markdown", d::RDatasetDescription)
+    s = description_to_markdown(d.content)
+    # Write the Markdown source itself. `print` is used because `show` has no
+    # text/markdown method for a plain `String` and would throw a `MethodError`.
+    print(io, s)
+end
+# text/html: the stored content is shown through a `Docs.HTML` wrapper, unconverted.
+function Base.show(io::IO, mime::MIME"text/html", d::RDatasetDescription)
+    return show(io, mime, Docs.HTML(d.content))
+end
+
+
+"""
+    description_to_markdown(string)
+
+Convert an HTML string to Markdown. This function is written specifically
+for the HTML descriptions in RDatasets.jl, and so is a bit opinionated on what to
+replace.
+
+Known tags are rewritten with regexes; then all remaining tags are removed and runs of newlines are collapsed to one blank line.
+
+## Behaviour
+
+A tag is only rewritten when its content contains no `<` (i.e. no nested tag):
+- `<h1>`, `<h2>`, `<h3>`, `<h4>`, `<h5>`, `<h6>` -> `#`, `##`, `###`, `####`, `#####`, `######`
+- `<title>` -> `#`
+- `<code>` -> `` `code` ``
+- `<pre>` -> "```R\\npre\\n```"
+- `<EM>` -> `*EM*` (matched case-insensitively)
+- `<B>` -> `**B**` (matched case-insensitively)
+- `&ndash;` -> `-`
+
+## TODOs
+
+- Tables
+- Links
+- Images
+"""
+function description_to_markdown(string)
+    html_header_regex = r"<h(?'hnum'\d)>(?'content'[^<]+)<\/h\g'hnum'>"
+    function regexmatch2md(matched_string)
+        m = match(html_header_regex, matched_string)
+        if isnothing(m.captures[1]) || isnothing(m.captures[2])
+            return matched_string
+        end
+        # Heading level = digit captured from the tag; it sets the number of '#' below.
+        hnum = parse(Int, m[:hnum])
+        content = m[:content]
+        # Surround the heading with newlines so it stands as its own Markdown block.
+        return join(("\n", "#"^hnum, " ", content, "\n\n"))
     end
-    error("Unable to locate dataset file $rdaname or $csvname")
+    title_matcher_regex = r"<title>(?'content'[^<]+)<\/title>"
+    code_matcher_regex = r"<code>(?'content'[^<]+)<\/code>"
+    pre_matcher_regex = r"<pre>(?'content'[^<]+)<\/pre>"
+    emph_matcher_regex = r"<(?i)EM(?-i)>(?'content'[^<]+)<\/(?i)EM(?-i)>"
+    b_matcher_regex = r"<(?i)B(?-i)>(?'content'[^<]+)<\/(?i)B(?-i)>"
+    new_string = replace(  # NOTE(review): multi-pair `replace` needs Julia 1.7+, but [compat] says julia = "1.6"
+        string, 
+        html_header_regex => regexmatch2md, 
+        title_matcher_regex => titlestr -> "# " * match(title_matcher_regex, titlestr)[:content],
+        code_matcher_regex => codestr -> "`" * match(code_matcher_regex, codestr)[:content] * "`",
+        pre_matcher_regex => prestr -> "\n```R\n" * match(pre_matcher_regex, prestr)[:content] * "\n```\n",
+        emph_matcher_regex => emphstr -> "*" * match(emph_matcher_regex, emphstr)[:content] * "*",
+        b_matcher_regex => bstr -> "**" * match(b_matcher_regex, bstr)[:content] * "**",
+        "&ndash;" => "-",
+    )
+    nohtml = replace(new_string, Regex("<[^>]*>") => "")
+    return replace(nohtml, Regex("\n\n+") => "\n\n")
 end