diff --git a/Project.toml b/Project.toml
index 4f683b0..9df7b12 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "RQADeforestation"
 uuid = "3f1d9318-18cc-4ffd-9937-04025ce33b74"
-authors = ["Felix Cremer , Daniel Loos and contributors"]
 version = "1.0.0-DEV"
+authors = ["Felix Cremer , Daniel Loos and contributors"]
 
 [deps]
 ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3"
diff --git a/src/auxil.jl b/src/auxil.jl
index 9a6ac1c..0d968ee 100644
--- a/src/auxil.jl
+++ b/src/auxil.jl
@@ -34,7 +34,16 @@ function DiskArrays.readblock!(a::LazyAggDiskArray, aout, i::UnitRange{Int}...)
     for (j, it) in enumerate(itime)
         arrays_now = a.arrays[a.inds[it]]
         for ia in eachindex(arrays_now)
-            DiskArrays.readblock!(arrays_now[ia], view(buf, :, :, ia), i1, i2)
+            try
+                DiskArrays.readblock!(arrays_now[ia], view(buf, :, :, ia), i1, i2)
+            catch e
+                if hasproperty(e, :captured) && e.captured.ex isa ArchGDAL.GDAL.GDALError
+                    @warn e.captured.ex.msg
+                    buf[:, :, ia] .= missing
+                else
+                    rethrow(e)
+                end
+            end
         end
         vbuf = view(buf, :, :, 1:length(arrays_now))
         map!(a.f, view(aout, :, :, j), eachslice(vbuf, dims=(1, 2)))
@@ -135,27 +144,7 @@ function stackindices(times, timediff=200000)
     return groups
 end
 
-#=
-function DiskArrays.readblock!(b::GDALBand, aout, r::AbstractUnitRange...)
-    if !isa(aout, Matrix)
-        aout2 = similar(aout)
-        AG.read(b.filename) do ds
-            AG.getband(ds, b.band) do bh
-                DiskArrays.readblock!(bh, aout2, r...)
-            end
-        end
-        aout .= aout2
-    else
-        AG.read(b.filename) do ds
-            AG.getband(ds, b.band) do bh
-                DiskArrays.readblock!(bh, aout, r...)
-            end
-        end
-    end
-end
-=#
-
-function gdalcube(filenames::AbstractVector{<:AbstractString}, stackgroups=:dae)
+function gdalcube(filenames::AbstractVector{<:AbstractString}, stackgroups=:lazyagg)
     dates = getdate.(filenames)
     @show length(dates)
     # Sort the dates and files by DateTime
@@ -165,7 +154,6 @@ function gdalcube(filenames::AbstractVector{<:AbstractString}, stackgroups=:dae)
     #@show sdates
 
     # Put the dates which are 200 seconds apart into groups
-    if stackgroups in [:dae, :lazyagg]
     groupinds = grouptimes(sdates, 200000)
     onefile = first(sfiles)
     gd = backendlist[:gdal]
@@ -184,60 +172,32 @@ function gdalcube(filenames::AbstractVector{<:AbstractString}, stackgroups=:dae)
     cubelist = CFDiskArray.(group_gdbs, (gdbattrs,))
     stackinds = stackindices(sdates)
-    aggdata = if stackgroups == :dae
-        gcube = diskstack(cubelist)
-        aggdata = DAE.aggregate_diskarray(gcube, mean ∘ skipmissing, (3 => stackinds,); strategy=:direct)
-    else
-        println("Construct lazy diskarray")
-        LazyAggDiskArray(mean ∘ skipmissing, cubelist, stackinds)
-    end
+    println("Construct lazy diskarray")
+    aggdata = LazyAggDiskArray(skipmissingmean, cubelist, stackinds)
     # data = DiskArrays.ConcatDiskArray(reshape(groupcubes, (1,1,length(groupcubes))))
     dates_grouped = [sdates[group[begin]] for group in groupinds]
     taxis = DD.Ti(dates_grouped)
     gcube = Cube(sfiles[1])
     return YAXArray((DD.dims(gcube)[1:2]..., taxis), aggdata, gcube.properties)
-    else
-        #datasets = AG.readraster.(sfiles)
-        taxis = DD.Ti(sdates)
-
-        onefile = first(sfiles)
-        gd = backendlist[:gdal]
-        yax1 = gd(onefile)
-        onecube = Cube(onefile)
-        #@show onecube.axes
-        gdb = get_var_handle(yax1, "Gray")
+end
-        #@assert gdb isa GDALBand
-        all_gdbs = map(sfiles) do f
-            BufferGDALBand{eltype(gdb)}(f, gdb.band, gdb.size, gdb.attrs, gdb.cs, Dict{Int,AG.IRasterBand}())
-        end
-        stacked_gdbs = diskstack(all_gdbs)
-        attrs = copy(gdb.attrs)
-        #attrs["add_offset"] = Float16(attrs["add_offset"])
-        if haskey(attrs, "scale_factor")
-            attrs["scale_factor"] = Float16(attrs["scale_factor"])
-        end
-        all_cfs = CFDiskArray(stacked_gdbs, attrs)
-        return YAXArray((onecube.axes..., taxis), all_cfs, onecube.properties)
+function skipmissingmean(x)
+    isempty(x) && return missing
+    s, n = reduce(x, init=(zero(eltype(x)), 0)) do (s, n), ix
+        ismissing(ix) ? (s, n) : (s + ix, n + 1)
     end
-    #datasetgroups = [datasets[group] for group in groupinds]
-    #We have to save the vrts because the usage of nested vrts is not working as a rasterdataset
-    #temp = tempdir()
-    #outpaths = [joinpath(temp, splitext(basename(sfiles[group][1]))[1] * ".vrt") for group in groupinds]
-    #vrt_grouped = AG.unsafe_gdalbuildvrt.(datasetgroups)
-    #AG.write.(vrt_grouped, outpaths)
-    #vrt_grouped = AG.read.(outpaths)
-    #vrt_vv = AG.unsafe_gdalbuildvrt(vrt_grouped, ["-separate"])
-    #rvrt_vv = AG.RasterDataset(vrt_vv)
-    #yaxras = YAXArray.(sfiles)
-    #cube = concatenatecubes(yaxras, taxis)
-    #bandnames = AG.GDAL.gdalgetfilelist(vrt_vv.ptr)
-
-
-
-    # Set the timesteps from the bandnames as time axis
-    #dates_grouped = [sdates[group[begin]] for group in groupinds]
+    n == 0 ? missing : s / n
+end
+
+# Deliberately not defined, because it would be type piracy:
+#Base.∘(::typeof(mean), ::typeof(skipmissing)) = skipmissingmean
+
+@testitem "skipmissingmean" begin
+    @test ismissing(RQADeforestation.skipmissingmean(Float32[]))
+    @test ismissing(RQADeforestation.skipmissingmean(Union{Float32,Missing}[missing, missing]))
+    @test RQADeforestation.skipmissingmean([1, 0, missing]) == 0.5
+    @test RQADeforestation.skipmissingmean([1, 2, 3]) == 2.0
 end
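
Note on the `gdalcube` change: `skipmissingmean` is a single-pass replacement for `mean ∘ skipmissing`. It returns `missing` for empty or all-missing time slices, where `mean(skipmissing(x))` would throw on an empty collection, and giving the reducer its own name avoids overloading `Base.∘`, which would be type piracy (see the commented-out line). A minimal sketch of its behaviour, with illustrative values:

```julia
using RQADeforestation: skipmissingmean

skipmissingmean(Union{Float64,Missing}[1.0, missing, 3.0])  # 2.0, missings are skipped
skipmissingmean(Union{Float64,Missing}[missing, missing])   # missing, instead of an error
skipmissingmean(Float64[])                                  # missing for empty slices
```
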
attrs["scale_factor"] = Float16(attrs["scale_factor"]) - end - all_cfs = CFDiskArray(stacked_gdbs, attrs) - return YAXArray((onecube.axes..., taxis), all_cfs, onecube.properties) +function skipmissingmean(x) + isempty(x) && return missing + s,n = reduce(x,init=(zero(eltype(x)),0)) do (s,n), ix + ismissing(ix) ? (s,n) : (s+ix,n+1) end - #datasetgroups = [datasets[group] for group in groupinds] - #We have to save the vrts because the usage of nested vrts is not working as a rasterdataset - #temp = tempdir() - #outpaths = [joinpath(temp, splitext(basename(sfiles[group][1]))[1] * ".vrt") for group in groupinds] - #vrt_grouped = AG.unsafe_gdalbuildvrt.(datasetgroups) - #AG.write.(vrt_grouped, outpaths) - #vrt_grouped = AG.read.(outpaths) - #vrt_vv = AG.unsafe_gdalbuildvrt(vrt_grouped, ["-separate"]) - #rvrt_vv = AG.RasterDataset(vrt_vv) - #yaxras = YAXArray.(sfiles) - #cube = concatenatecubes(yaxras, taxis) - #bandnames = AG.GDAL.gdalgetfilelist(vrt_vv.ptr) - - - - # Set the timesteps from the bandnames as time axis - #dates_grouped = [sdates[group[begin]] for group in groupinds] + n==0 ? missing : s/n +end + +# I don't dare to make this type piracy. +#Base.∘(::typeof(mean), ::typeof(skipmissing)) = skipmissingmean + +@testitem "skipmissingmean" begin + @test ismissing(RQADeforestation.skipmissingmean(Float32[])) + @test ismissing(RQADeforestation.skipmissingmean(Union{Float32, Missing}[missing, missing])) + @test RQADeforestation.skipmissingmean([1,0,missing]) == 0.5 + @test RQADeforestation.skipmissingmean([1,2,3]) == 2.0 end diff --git a/src/main.jl b/src/main.jl index 3d535f6..7a9cb03 100644 --- a/src/main.jl +++ b/src/main.jl @@ -1,5 +1,6 @@ using ArgParse using YAXArrays: YAXDefaults +using FilePathsBase: exists, Path const argparsesettings = ArgParseSettings() @@ -66,7 +67,7 @@ end function main(; - tiles::Vector{String}, + tiles, continent::String, indir::String, outdir="out.zarr", @@ -76,8 +77,10 @@ function main(; orbit="D", threshold=3.0, folders=["V1M0R1", "V1M1R1", "V1M1R2"], - stack=:dae + stack=:lazyagg, + delete_intermediate=false ) + outdir = Path(outdir) in(orbit, ["A", "D"]) || error("Orbit needs to be either A or D") if isdir(indir) && isempty(indir) error("Input directory $indir must not be empty") @@ -93,23 +96,24 @@ function main(; @warn "Selected time series does not include a multiple of whole years. This might introduce seasonal bias." 
diff --git a/src/rqatrend.jl b/src/rqatrend.jl
index e460845..b2dcb3b 100644
--- a/src/rqatrend.jl
+++ b/src/rqatrend.jl
@@ -6,10 +6,10 @@ using Distances
 
 """rqatrend(cube; thresh=2, outpath=tempname() * ".zarr")
 Compute the RQA trend metric for the datacube `cube` with the epsilon threshold `thresh`.
+`lowerbound` and `upperbound` are forwarded to the classification of the RQA trend result.
 """
-function rqatrend(cube; thresh=2, outpath=tempname() * ".zarr", overwrite=false, kwargs...)
-    @show outpath
-    mapCube(rqatrend, cube, thresh; indims=InDims("Time"), outdims=OutDims(; outtype=Float32, path=outpath, overwrite, kwargs...))
+function rqatrend(cube; thresh=2, lowerbound=-5, upperbound=-0.5, outpath=tempname() * ".zarr", overwrite=false, kwargs...)
+    mapCube(rqatrend, cube, thresh, lowerbound, upperbound; indims=InDims("Time"), outdims=OutDims(; outtype=UInt8, path=outpath, fill_value=255, overwrite, kwargs...))
 end
 
 @testitem "rqatrend cube" begin
@@ -32,27 +32,51 @@
     mock_trend = rqatrend(mock_cube; thresh=0.5)
     @test mock_trend.axes == (mock_cube.X, mock_cube.Y)
     diff = abs(mean(mock_trend))
-    @test diff < 0.5
+    @test diff < 254
 end
 
 """rqatrend(path::AbstractString; thresh=2, outpath=tempname() * ".zarr")
 Compute the RQA trend metric for the data that is available on `path`.
+See the `rqatrend` method for a YAXArray for a description of the parameters.
 """
-rqatrend(path::AbstractString; thresh=2, outpath=tempname() * ".zarr", overwrite=false, kwargs...) =
-    rqatrend(Cube(path); thresh, outpath, overwrite, kwargs...)
+rqatrend(path::AbstractString; thresh=2, lowerbound=-5.0, upperbound=-0.5, outpath=tempname() * ".zarr", overwrite=false, kwargs...) =
+    rqatrend(Cube(path); thresh, lowerbound, upperbound, outpath, overwrite, kwargs...)
 
 """
     rqatrend(xout, xin, thresh)
 Compute the RQA trend metric for the non-missing time steps of xin, and save it to xout.
-`thresh` specifies the epsilon threshold of the Recurrence Plot computation
+`thresh` specifies the epsilon threshold of the Recurrence Plot computation.
+`lowerbound` and `upperbound` are the bounds of the classification into UInt8.
+The result is a UInt8 value between 0 (no change) and 254 (definitive change), with 255 as the sentinel value for missing data.
 """
-function rqatrend(pix_trend, pix, thresh=2)
-    pix_trend .= rqatrend_impl(pix; thresh)
+function rqatrend(pix_trend, pix, thresh=2, lowerbound=-5.0, upperbound=-0.5)
+    pix_trend .= classify_rqatrend(rqatrend_impl(pix; thresh); lowerbound, upperbound)
 end
 
+"""
+    classify_rqatrend(trend; lowerbound=Float32(-5.0), upperbound=Float32(-0.5))
+Classify the RQA trend value into 254 bins so that it fits into a UInt8 encoding.
+This is a compromise between storage size and accuracy of the change detection.
+The value range is 0 (no change) to 254 (definitive change), with 255 kept free as the sentinel value for missing data.
+"""
+function classify_rqatrend(trend; lowerbound=Float32(-5.0), upperbound=Float32(-0.5))
+    isnan(trend) && return UInt8(255)
+    ctrend = clamp(trend, lowerbound, upperbound)
+    rlength = upperbound - lowerbound
+    return round(UInt8, 254 - ((ctrend - lowerbound) / rlength) * 254)
+end
+
+@testitem "classify_rqatrend" begin
+    import AllocCheck
+    @test RQADeforestation.classify_rqatrend(-4.999) === UInt8(254)
+    @test RQADeforestation.classify_rqatrend(1) === UInt8(0)
+    @test RQADeforestation.classify_rqatrend(-0.52) === UInt8(1)
+    @test RQADeforestation.classify_rqatrend(-6) === UInt8(254)
+    @test isempty(AllocCheck.check_allocs(RQADeforestation.classify_rqatrend, (Float32,)))
+end
 
 function rqatrend_impl(data; thresh=2, border=10, theiler=1, metric=CheckedEuclidean())
     # simplified implementation of https://stats.stackexchange.com/a/370175 and https://github.com/joshday/OnlineStats.jl/blob/b89a99679b13e3047ff9c93a03c303c357931832/src/stats/linreg.jl
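
`classify_rqatrend` maps the clamped trend linearly onto 0 to 254, so consumers of the UInt8 output can recover an approximate trend value; the quantization error is at most half a bin (about 0.009 with the default bounds). The inverse below is a hypothetical helper, not part of the patch, assuming the same default bounds:

```julia
# Hypothetical inverse of classify_rqatrend; not part of the patch.
function unclassify_rqatrend(c::UInt8; lowerbound=Float32(-5.0), upperbound=Float32(-0.5))
    c == 0xff && return missing                    # 255 is the missing sentinel
    lowerbound + (1 - c / 254) * (upperbound - lowerbound)
end

unclassify_rqatrend(UInt8(254))  # -5.0 (definitive change)
unclassify_rqatrend(UInt8(0))    # -0.5 (no change)
```
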
diff --git a/test/runtests.jl b/test/runtests.jl
index 968e1c9..2a9782d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,3 +10,16 @@ doctest(RQADeforestation)
 
 using TestItemRunner
 @run_package_tests
+
+
+@testitem "rqa step function" begin
+    using RQADeforestation
+    using Distributions: Normal as DNormal
+    x = range(0, 100, length=1000)
+    ts2 = zero(x)
+    ts2[1:div(end, 2)] .= rand.(DNormal(3, 1))
+    ts2[div(end, 2):end] .= rand.(DNormal(0, 1))
+    pixtrend = UInt8[255]
+    RQADeforestation.rqatrend(pixtrend, ts2)
+    @test pixtrend[] < 20
+end
\ No newline at end of file
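
A note on the synthetic series in the new step-function test: `rand.(DNormal(3, 1))` draws an independent sample per element because Distributions.jl treats a distribution as a scalar under broadcasting, so the fused `.=` calls `rand` once per position. A more explicit construction of the same kind of step series would be:

```julia
using Distributions: Normal

ts = zeros(1000)
ts[1:500]   .= rand(Normal(3, 1), 500)  # first half: high level
ts[501:end] .= rand(Normal(0, 1), 500)  # second half: the level steps down
```
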
+""" +function classify_rqatrend(trend; lowerbound=Float32(-5.0), upperbound=Float32(-0.5)) + isnan(trend) && return UInt8(255) + ctrend = clamp(trend, lowerbound, upperbound) + rlength = upperbound - lowerbound + return round(UInt8, 254-((ctrend - lowerbound) / rlength) * 254) +end + +@testitem "classify_rqatrend" begin + import AllocCheck + @test RQADeforestation.classify_rqatrend(-4.999) === UInt8(254) + @test RQADeforestation.classify_rqatrend(1) === UInt8(0) + @test RQADeforestation.classify_rqatrend(-0.52) === UInt8(1) + @test RQADeforestation.classify_rqatrend(-6) === UInt8(254) + @test isempty( AllocCheck.check_allocs(RQADeforestation.classify_rqatrend, (Float32,))) +end function rqatrend_impl(data; thresh=2, border=10, theiler=1, metric=CheckedEuclidean()) # simplified implementation of https://stats.stackexchange.com/a/370175 and https://github.com/joshday/OnlineStats.jl/blob/b89a99679b13e3047ff9c93a03c303c357931832/src/stats/linreg.jl diff --git a/test/runtests.jl b/test/runtests.jl index 968e1c9..2a9782d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,3 +10,16 @@ doctest(RQADeforestation) using TestItemRunner @run_package_tests + + +@testitem "rqa step function" begin + using RQADeforestation + using Distributions: Normal as DNormal + x = range(0,100,length=1000) + ts2 = zero(x) + ts2[1:div(end,2)] .= rand.(DNormal(3,1)) + ts2[div(end,2):end] .= rand.(DNormal(0,1)) + pixtrend = UInt8[255] + RQADeforestation.rqatrend(pixtrend, ts2) + @test pixtrend[] < 20 +end \ No newline at end of file diff --git a/test/testdata.jl b/test/testdata.jl index 078f78a..0e8f3c2 100644 --- a/test/testdata.jl +++ b/test/testdata.jl @@ -2,12 +2,13 @@ @testitem "testdata main" begin import Pkg: Artifacts.@artifact_str using LazyArtifacts + using FilePathsBase testdatapath = artifact"rqatestdata/RQADeforestationTestData-2.0" testdir = tempname() rm(testdir, recursive=true, force=true) mkpath(testdir) - outdir = "$testdir/out.zarr" + outdir = "$testdir/out" indir = "$testdir/in" cp(testdatapath, indir) @@ -20,13 +21,14 @@ indir=indir, start_date=Date("2021-01-01"), end_date=Date("2022-01-01"), - outdir=outdir + outdir=Path(outdir), + stack=:lazyagg, ) - a = open_dataset(outdir * "/E051N018T3_rqatrend_VH_D022_thresh_3.0.zarr").layer + a = open_dataset(joinpath(outdir, "E051N018T3_rqatrend_VH_D022_thresh_3.0_2021-01-01_2022-01-01.zarr")).layer @test size(a) == (50, 74) - @test minimum(a) < 0 - @test maximum(a) > 0 + @test minimum(a) == 0 + @test maximum(a) > 200 end @testitem "testdata julia_main" begin