added blocks for regression task

codeboy5 · codeboy5 · commit ec6abe555dca · 2022-08-07T15:42:24.000+05:30
diff --git a/FastTimeSeries/src/FastTimeSeries.jl b/FastTimeSeries/src/FastTimeSeries.jl
@@ -54,5 +54,5 @@ function __init__()
 end
 
 export
-    TimeSeriesRow, TSClassificationSingle, TSPreprocessing
+    TimeSeriesRow, TSClassificationSingle, TSPreprocessing, _ts2df
 end
diff --git a/FastTimeSeries/src/container.jl b/FastTimeSeries/src/container.jl
@@ -54,6 +54,7 @@ function _ts2df(
 
     timestamps = false
     class_labels = false
+    target_labels = true
 
     open(full_file_path_and_name, "r") do file
         for ln in eachline(file)
@@ -109,6 +110,21 @@ function _ts2df(
 
                     series_length = parse(Int, tokens[2])
 
+                elseif startswith(ln, "@dimension")
+                    # Check that the associated value is valid
+                    tokens = split(ln, " ")
+                    
+                    num_dimensions = parse(Int, tokens[2])
+                
+                elseif startswith(ln, "@targetlabel")
+                    tokens = split(ln, " ")
+
+                    if tokens[2] == "true"
+                        target_labels = true
+                    else
+                        target_labels = false
+                    end
+
                 elseif startswith(ln, "@classlabel")
                     # Check that the associated value is valid
                     tokens = split(ln, " ")
@@ -150,7 +166,106 @@ function _ts2df(
                     # Check if we dealing with data that has timestamps
 
                     if timestamps
-                        #! Need To Add Code.
+                        
+                        has_another_value = false
+                        has_another_dimension = false
+
+                        timestamps_for_dimension = []
+                        values_for_dimension = []
+
+                        line_len = length(ln)
+                        char_num = 1
+                        num_this_dimension = 1
+                        arr = Array{Float32, 2}(undef, num_dimensions, series_length)
+
+                        while char_num <= line_len
+
+                            # Move through any spaces.
+                            while char_num <= line_len && isspace(ln[char_num])
+                                char_num += 1
+                            end
+
+                            if char_num <= line_len
+
+                                # Check if we have reached a class label
+                                if ln[char_num] != '(' && target_labels
+
+                                    class_val = strip(ln[char_num:end], ' ')
+
+                                    push!(class_val_list, parse(Float64, class_val))
+                                    push!(instance_list, arr)
+
+                                    char_num = line_len
+
+                                    has_another_value = false
+                                    has_another_dimension = false
+
+                                    timestamps_for_dimension = []
+                                    values_for_dimension = []
+
+                                    char_num += 1
+                                    num_this_dimension = 1
+                                    arr = Array{Float32, 2}(undef, num_dimensions, series_length)
+                                
+                                else
+
+                                    char_num += 1
+                                    tuple_data = ""
+
+                                    while (char_num <= line_len && ln[char_num] != ')')
+                                        tuple_data *= ln[char_num]
+                                        char_num += 1
+                                    end
+
+                                    char_num += 1
+
+                                    while char_num <= line_len && isspace(ln[char_num])
+                                        char_num += 1
+                                    end
+
+                                    # Check if there is another value or dimension to process after this tuple.
+                                    if char_num > line_len
+                                        has_another_value = false
+                                        has_another_dimension = false
+                                    elseif ln[char_num] == ','
+                                        has_another_value = true
+                                        has_another_dimension = false
+                                    elseif ln[char_num] == ':'
+                                        has_another_value = false
+                                        has_another_dimension = true
+                                    end
+
+                                    char_num += 1
+
+                                    # Each tuple holds "timestamp,value"; split on its last comma.
+                                    comma_range = findlast(",", tuple_data)
+                                    if isnothing(comma_range)
+                                        throw(ArgumentError("expected \"timestamp,value\" inside parentheses, got \"$(tuple_data)\""))
+                                    end
+                                    last_comma_index = first(comma_range)
+
+                                    value = parse(Float64, tuple_data[last_comma_index+1:end])
+
+                                    timestamp = tuple_data[1:last_comma_index-1]
+
+                                    push!(values_for_dimension, value)
+
+                                    if !has_another_value
+
+                                        arr[num_this_dimension, 1:end] = values_for_dimension
+
+                                        values_for_dimension = []
+
+                                        num_this_dimension += 1
+                                    end
+                                    
+                                end
+
+                            end
+
+                        end
+
+
                     else
                         dimensions = split(ln, ':')
 
@@ -196,13 +311,9 @@ function _ts2df(
                                 data_series = split(dimension, ',')
                                 data_series = [parse(Float32, i) for i in data_series]
                                 arr[dim, 1:end] = data_series
-                                # println(data_series)
-                                # data_series = [parse(Float32, i) for i in data_series]
-                                # push!(instance_list[dim], data_series)
                             else
                                 tmp = Array{Float32, 1}(undef, 100)
                                 arr[dim, 1:end] = tmp
-                                # push!(instance_list[dim], [])
                             end
                         end
 
@@ -229,7 +340,7 @@ function _ts2df(
         end
 
         # Check if we should return any associated class labels separately
-        if class_labels
+        if class_labels || target_labels
             return data, class_val_list
         else
             return data
diff --git a/FastTimeSeries/src/models.jl b/FastTimeSeries/src/models.jl
@@ -10,8 +10,7 @@ function blockmodel(inblock::TimeSeriesRow,
     data   = rand(Float32, inblock.nfeatures, 32, inblock.obslength)
     # data   = [rand(Float32, inblock.nfeatures, 32) for _ ∈ 1:inblock.obslength]
     output = backbone(data)
-    outs   = size(output)[1]
-    return Models.RNNModel(backbone, outsize = length(outblock.classes), recout = outs)
+    return Models.RNNModel(backbone, outsize = length(outblock.classes), recout = size(output, 1))
 end
 
 """
diff --git a/FastTimeSeries/src/models/RNN.jl b/FastTimeSeries/src/models/RNN.jl
@@ -26,7 +26,7 @@ function RNNModel(recbackbone;
                 outsize,
                 recout,
                 kwargs...)
-    return RNNModel{}(recbackbone, Dense(recout, outsize))
+    return RNNModel(recbackbone, Dense(recout, outsize))
 end
 
 function (m::RNNModel)(X)
diff --git a/FastTimeSeries/src/recipes.jl b/FastTimeSeries/src/recipes.jl
@@ -7,10 +7,18 @@ Recipe for loading a time series dataset stored in a .ts file
 Base.@kwdef struct TimeSeriesDatasetRecipe <: Datasets.DatasetRecipe
     train_file
     test_file = nothing
+    regression = false
     loadfn = Datasets.loadfile
 end
 
-Datasets.recipeblocks(::Type{TimeSeriesDatasetRecipe}) = Tuple{TimeSeriesRow, Label}
+function Datasets.recipeblocks(recipe::TimeSeriesDatasetRecipe)
+    # Regression recipes yield a continuous target block; all others a class label.
+    if recipe.regression
+        return Tuple{TimeSeriesRow, Continuous}
+    end
+    return Tuple{TimeSeriesRow, Label}
+end
+# Datasets.recipeblocks(::Type{TimeSeriesDatasetRecipe}) = Tuple{TimeSeriesRow, Label}
 
 #TODO: Add Check if test_file is nothing.
 function Datasets.loadrecipe(recipe::TimeSeriesDatasetRecipe, path)
@@ -23,10 +31,18 @@ function Datasets.loadrecipe(recipe::TimeSeriesDatasetRecipe, path)
     labels = [labels_train; labels_test]
     rows = TimeSeriesDataset(rows)
     data = rows, labels
-    blocks = (
-        setup(TimeSeriesRow,rows),
-        Label(unique(eachobs(labels))),
-    )
+    blocks = nothing
+    if !recipe.regression
+        blocks = (
+            setup(TimeSeriesRow,rows),
+            Label(unique(eachobs(labels))),
+        )
+    else
+        blocks = (
+            setup(TimeSeriesRow,rows),
+            Continuous(1)
+        )
+    end
     return data, blocks
 end
 
@@ -41,11 +57,11 @@ const RECIPES = Dict{String,Vector{Datasets.DatasetRecipe}}(
     ],
     "natops" => [
         TimeSeriesDatasetRecipe(train_file="NATOPS_TEST.ts", test_file="NATOPS_TRAIN.ts")
-    ]
+    ],
     #! TODO.
-    # "appliances_energy" => [
-    #     TimeSeriesDatasetRecipe(train_file="AppliancesEnergy_TRAIN.ts", test_file="AppliancesEnergy_TEST.ts")
-    # ]
+    "appliances_energy" => [
+        TimeSeriesDatasetRecipe(train_file="AppliancesEnergy_TRAIN.ts", test_file="AppliancesEnergy_TEST.ts", regression = true)
+    ]
 )
 
 function _registerrecipes()
diff --git a/src/datasets/fastaidatasets.jl b/src/datasets/fastaidatasets.jl
@@ -18,8 +18,20 @@ struct TSClassificationDataset
     size
 end
 
+struct MonashRegressionDataset  # describes one Monash regression dataset hosted on Zenodo
+    name  # dataset name; used in the remote file names and in the DataDep message
+    dset_id  # Zenodo record id interpolated into the download URL
+    extension  # file extension; defaults to "ts" in the keyword constructor
+    description  # free-text description shown in the DataDep message
+    checksum  # passed to DataDep as the expected hash(es)
+    datadepname  # suffix of the registered DataDep name ("fastai-<datadepname>")
+    splits  # split names, one remote file each (default ["TRAIN", "TEST"])
+    size  # human-readable download size shown in the DataDep message
+end
+
 const ROOT_URL_FastAI = "https://s3.amazonaws.com/fast-ai-"
 const ROOT_URL_TSClassification = "http://www.timeseriesclassification.com/Downloads"
+const ROOT_URL_MonashRegression = "https://zenodo.org/record/"
 
 function FastAIDataset(name, subfolder, checksum = "";
                        extension = "tgz",
@@ -40,13 +52,21 @@ function TSClassificationDataset(
     return TSClassificationDataset(name, extension, description, checksum, datadepname, size)
 end
 
+function MonashRegressionDataset(name, dset_id, checksum = "";  # convenience constructor
+                                 extension = "ts", description = "",
+                                 splits = ["TRAIN", "TEST"], datadepname = "", size = "???")
+    return MonashRegressionDataset(name, dset_id, extension, description, checksum,
+                                   datadepname, splits, size)  # positional order = field order
+end
+
 const DESCRIPTIONS = Dict(
     "imagenette" => "A subset of 10 easily classified classes from Imagenet: tench, English springer, cassette player, chain saw, church, French horn, garbage truck, gas pump, golf ball, parachute",
     "imagewoof" => "A subset of 10 harder to classify classes from Imagenet (all dog breeds): Australian terrier, Border terrier, Samoyed, beagle, Shih-Tzu, English foxhound, Rhodesian ridgeback, dingo, golden retriever, Old English sheepdog",
     "food-101" => "101 food categories, with 101,000 images; 250 test images and 750 training images per class. The training images were not cleaned. All images were rescaled to have a maximum side length of 512 pixels.",
     "ECG5000" => "The original dataset for \"ECG5000\" is a 20-hour long ECG downloaded from Physionet. The name is BIDMC Congestive Heart Failure Database(chfdb) and it is record \"chf07\".",
     "AtrialFibrillation" => "This is a physionet dataset of two-channel ECG recordings has been created from data used in the Computers in Cardiology Challenge 2004, an open competition with the goal of developing automated methods for predicting spontaneous termination of atrial fibrillation (AF).",
-    "NATOPS" => "The data is generated by sensors on the hands, elbows, wrists and thumbs. The data are the x,y,z coordinates for each of the eight locations. "
+    "NATOPS" => "The data is generated by sensors on the hands, elbows, wrists and thumbs. The data are the x,y,z coordinates for each of the eight locations. ",
+    "AppliancesEnergy" => "The goal of this dataset is to predict total energy usage in kWh of a house.",
 )
 
 const DATASETCONFIGS = [
@@ -209,6 +229,9 @@ const DATASETCONFIGS = [
     TSClassificationDataset("AtrialFibrillation", "218abad67d58190a6daa1a27f4bd58ace6e18f80fb59fb2c7385f0d2d4b411a2", description = DESCRIPTIONS["AtrialFibrillation"], datadepname = "atrial", size = "226KB"),
     TSClassificationDataset("NATOPS", "57a8debeedadad7764bfa9c87b4300bd64a999ef95a98a6ee07a830c41de4aa1", description = DESCRIPTIONS["NATOPS"], datadepname = "natops", size = "5.1MB"),
 
+    # monash regression datasets
+    MonashRegressionDataset("AppliancesEnergy", 3902637, ["bbc65fcfa5c01655bb0ec7d558335d44b9c81979d7246f485bbc95a9759a5bff", "0e73676156bdce593059cd03785db9fd5616c1620ba87893b0f0903ef80f2248"],
+    description = DESCRIPTIONS["AppliancesEnergy"], datadepname="appliances_energy", size = "15MB"),
 ]
 
 const DATASETS = [d.datadepname for d in DATASETCONFIGS]
@@ -258,6 +281,20 @@ function DataDeps.DataDep(d::TSClassificationDataset)
     )
 end
 
+# Build the DataDep for a Monash regression dataset: one remote file per entry of `d.splits`,
+# located under the Zenodo record `d.dset_id`; `d.checksum` is passed as the expected hash.
+function DataDeps.DataDep(d::MonashRegressionDataset)
+    remote_paths = ["$(ROOT_URL_MonashRegression)$(d.dset_id)/files/$(d.name)_$(splitname).$(d.extension)" for splitname in d.splits]
+    return DataDep(
+        "fastai-$(d.datadepname)",
+        """
+        "$(d.name)" from the Monash, UEA & UCR Time Series Extrinsic Regression Repository (http://tseregression.org)
+        $(d.description)
+        Download size: $(d.size)
+        """,
+        remote_paths, d.checksum)
+end
+
 function initdatadeps()
     for d in DATASETCONFIGS
         DataDeps.register(DataDep(d))