
Commit 3ec5156

Merge pull request #31 from stensmo/main
adds `read_json` and `write_json`
2 parents 274f505 + d439c5a commit 3ec5156

7 files changed: +306 −1 lines changed
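
A quick hedged sketch of the new API, distilled from the docstrings added in this commit (the URL is the movies dataset used in the docstring example; the output file name is illustrative):

```julia
using TidierFiles

# Read a JSON file (local path or URL) into a DataFrame
movies = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/movies.json")

# Write a DataFrame back out as JSON; the default is a vector of JSON objects, one per row
write_json(movies, "movies_copy.json")
```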

Project.toml

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
 XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"

 [compat]
 Arrow = "2.7"
@@ -34,6 +35,7 @@ Sockets = "1.9 - 1.11"
 Random = "0.5 - 10"
 XLSX = "0.10"
 julia = "1.10"
+JSON = "0.21"

 [extras]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

README.md

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ Currently supported file types:
 - `read_parquet` and `write_parquet`
 - `read_rdata` (.rdata and .rds)
 - `read_gsheet` and `write_gsheet` (Google Sheets)
+- `read_json` and `write_json`

 Agnostic read and write functions that detect the type and dispatch the appropriate function.
 - `read_file` and `write_file`

src/TidierFiles.jl

Lines changed: 3 additions & 1 deletion
@@ -13,13 +13,14 @@ using RData
 using JSON3
 using Random
 using Sockets
+using JSON

 @reexport using DataFrames: DataFrame

 export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
 read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
 write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata, list_files,
-read_gsheet, connect_gsheet, write_gsheet
+read_gsheet, connect_gsheet, write_gsheet, read_json, write_json


 include("docstrings.jl")
@@ -30,6 +31,7 @@ include("parquet_files.jl")
 include("arrow_files.jl")
 include("r_data.jl")
 include("gsheets.jl")
+include("jsonfiles.jl")

 """
 $docstring_read_csv

src/docstrings.jl

Lines changed: 47 additions & 0 deletions
@@ -789,4 +789,51 @@ julia> df = DataFrame(A=1:5, B=["a", missing, "c", "d", "e"], C=[1.1, 2.2, 3.3,

 julia> write_gsheet(df, full, sheet = "sheet2", append = false)
 ```
+"""
+
+const docstring_read_json =
+"""
+    read_json(path::String; null = missing, convertMixedNumberTypes::Bool = true)
+
+Read data from a JSON file into a DataFrame.
+
+# Arguments
+- `path::String`: A file name or a URL to the JSON file.
+- `null`: The value that a JSON `null` is mapped to (defaults to `missing`).
+- `convertMixedNumberTypes::Bool`: JSON numbers can be parsed as either `Float64` or `Int64`; setting this flag to `true` converts columns that mix both types to `Float64`.
+
+# Examples
+```julia
+
+julia> df = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/movies.json")
+3201×16 DataFrame
+  Row │ Director         Worldwide Gross  Running Time min  US DVD Sales  Source   Distributor   ⋯
+      │ String?          Int64?           Int64?            Int64?        String?  String?       ⋯
+──────┼────────────────────────────────────────────────────────────────────────────────────────────
+    1 │ missing                   146083           missing       missing  missing  Gramercy      ⋯
+    2 │ missing                    10876           missing       missing  missing  Strand
+  ⋮   │        ⋮                ⋮                ⋮               ⋮           ⋮           ⋮        ⋱
+ 3200 │ Martin Campbell        141475336               129       missing  Remake   Sony Picture
+ 3201 │ Martin Campbell        233700000               136       missing  Remake   Sony Picture
+```
+"""
+
+const docstring_write_json =
+"""
+    write_json(df::DataFrame, path::String; JSONObjectVector::Bool = true)
+
+Write the contents of a DataFrame to a specified JSON file.
+
+# Arguments
+- `df::DataFrame`: The DataFrame containing the data to be written to a JSON file.
+- `path::String`: Path to the local JSON file to be written.
+- `JSONObjectVector::Bool`: Determines which JSON format to write; `true` writes a vector of JSON objects (one per row), `false` writes JSON arrays.
+
+# Examples
+```
+julia> df = DataFrame(A=1:5, B=["a", missing, "c", "d", "e"], C=[1.1, 2.2, 3.3, 4.4, 5.5]);
+
+julia> write_json(df, "data.json")
+```
 """

src/gen_fxn.jl

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,8 @@ function read_file(filepath::String, args...; kwargs...)
         return read_parquet(filepath, args...; kwargs...)
     elseif ext == ".rds" || ext == ".RData" || ext == ".rdata"
         return RData.load(filepath)
+    elseif ext == ".json"
+        return read_json(filepath, args...; kwargs...)
     else
         error("Unsupported file format: $ext")
     end
@@ -57,6 +59,8 @@ function write_file(data::DataFrame,path::String, args...; kwargs...)
         return write_arrow(data, path, args...; kwargs...)
     elseif ext == ".parquet"
         return write_parquet(data, path, args...; kwargs...)
+    elseif ext == ".json"
+        return write_json(data, path, args...; kwargs...)
     else
         error("Unsupported file format: $ext")
     end
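
With the two branches above, the generic entry points now cover `.json`; a short usage sketch (file name illustrative):

```julia
using TidierFiles, DataFrames

df = DataFrame(a = 1:3, b = ["x", "y", "z"])

# write_file / read_file inspect the extension and forward to write_json / read_json
write_file(df, "example.json")
df2 = read_file("example.json")
```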

src/jsonfiles.jl

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+function specialConvert(col)
+    return ismissing(col) ? missing : Float64(col)
+end
+
+function fixTypes(df::DataFrame)::DataFrame
+    for (name, col) in pairs(eachcol(df))
+
+        elemtype = eltype(col)
+
+        if typeof(elemtype) == Union
+            ut = Base.uniontypes(elemtype)
+            if Int64 in ut && Float64 in ut
+
+                df[!, name] = specialConvert.(col)
+
+            end
+        end
+
+    end
+
+    return df
+end
+
+
+function checkIfVectorFormat(parsedJSON)
+
+    len = 0
+
+    for (key, value) in pairs(parsedJSON)
+        typeof(value) != Vector{Any} && return false
+
+        thisLen = length(value)
+        len > 0 && thisLen != len && return false
+        len = thisLen
+
+    end
+    return true
+end
+
+function fixTypesVectorFormat(df::DataFrame)
+    for (name, col) in pairs(eachcol(df))
+        try
+            t = typeof(col[1])
+
+            df[!, name] = convert(Vector{t}, col)
+        catch
+            return df
+        end
+
+    end
+    return df
+end
+
+"""
+$docstring_read_json
+"""
+function read_json(path::String; null = missing, convertMixedNumberTypes::Bool = true)
+
+    parsedJSON = nothing
+    df = nothing
+
+    if occursin("http", path)
+        response = HTTP.get(path)
+        # Ensure the request was successful
+        if response.status != 200
+            error("Failed to fetch the JSON file: HTTP status code ", response.status)
+        end
+        file_to_read = IOBuffer(response.body)
+        parsedJSON = JSON.parse(file_to_read; null = null)
+
+    else
+
+        open(path, "r") do io
+            parsedJSON = JSON.parse(io; null = null)
+        end
+
+    end
+
+    if checkIfVectorFormat(parsedJSON)
+        df = DataFrame([v for v in values(parsedJSON)], [k for k in keys(parsedJSON)])
+        return fixTypesVectorFormat(df)
+
+    else
+        df = DataFrame(Tables.dictrowtable(parsedJSON))
+        return (convertMixedNumberTypes) ? fixTypes(df) : df
+    end
+
+end
+
+"""
+$docstring_write_json
+"""
+function write_json(df::DataFrame, path::String; JSONObjectVector::Bool = true)
+    if JSONObjectVector
+        columnNames = names(df)
+
+        dicts = [Dict{String, Any}((columnNames .=> values(row))) for row in eachrow(df)]
+
+        json_string = JSON.json(dicts)
+    else
+        json_string = JSON.json(df)
+    end
+    open(path, "w") do io
+        write(io, json_string)
+    end
+end
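
For orientation, `read_json` handles two JSON layouts: a vector of row objects (the `Tables.dictrowtable` branch) and an object of equal-length column arrays (the `checkIfVectorFormat` branch), while `JSONObjectVector = true` makes `write_json` emit the first layout. A minimal sketch with illustrative file names:

```julia
using TidierFiles, DataFrames

# Layout 1: a vector of row objects
open("rows.json", "w") do io
    write(io, """[{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]""")
end

# Layout 2: an object of equal-length column arrays
open("columns.json", "w") do io
    write(io, """{"a": [1, 2], "b": ["x", "y"]}""")
end

read_json("rows.json")      # 2×2 DataFrame via the row-object branch
read_json("columns.json")   # 2×2 DataFrame via the column-array branch

# The default write path (JSONObjectVector = true) serializes one Dict per row,
# producing layout 1
write_json(DataFrame(a = [1, 2], b = ["x", "y"]), "rows_out.json")
```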

test/runtests.jl

Lines changed: 143 additions & 0 deletions
@@ -10,5 +10,148 @@ end); recursive=true)

 doctest(TidierFiles)

+
+@testset "JSON Test" begin
+
+    function roundTripDataFrame(df::DataFrame; JSONObjectVector = true)
+        write_json(df, "testdf.json"; JSONObjectVector)
+        df_read = read_json("testdf.json")
+        return isequal(df, df_read)
+    end
+
+
+    df_anscombe = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/anscombe.json")
+
+    @test typeof(df_anscombe.X) == Vector{Float64}
+
+    @test roundTripDataFrame(df_anscombe, JSONObjectVector = false)
+
+    df_barley = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/barley.json")
+
+    @test typeof(df_barley.yield) == Vector{Float64}
+
+    @test roundTripDataFrame(df_barley; JSONObjectVector = false)
+
+    df_budget = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budget.json")
+
+    @test roundTripDataFrame(df_budget)
+
+    df_budgets = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budgets.json")
+
+    @test roundTripDataFrame(df_budgets)
+    @test typeof(df_budgets.value) == Vector{Float64}
+    @test typeof(df_budgets.budgetYear) == Vector{Int64}
+
+
+    df_burtin = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/burtin.json")
+
+    @test roundTripDataFrame(df_burtin)
+
+    df_cars = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/cars.json")
+
+    @test sum(skipmissing(df_cars.Horsepower)) == 42033
+
+    @test roundTripDataFrame(df_cars)
+
+    df_countries = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/countries.json")
+
+    @test sum(skipmissing(df_countries.p_life_expect)) ≈ 36591.29 atol=0.01
+
+
+    df_crimea = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/crimea.json")
+
+    @test roundTripDataFrame(df_crimea)
+
+    df_driving = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/driving.json")
+
+    @test roundTripDataFrame(df_driving)
+
+
+    df_flights_200k = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/flights-200k.json")
+
+    @test roundTripDataFrame(df_flights_200k)
+
+    df_football = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/football.json")
+
+    @test roundTripDataFrame(df_football)
+
+    df_income = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/income.json")
+
+    @test roundTripDataFrame(df_income)
+
+    df_jobs = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/jobs.json")
+
+    @test roundTripDataFrame(df_jobs)
+
+
+    df_movies = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/movies.json")
+
+    @test roundTripDataFrame(df_movies)
+
+    df_obesity = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/obesity.json")
+
+    @test sum(df_obesity.rate) ≈ 7.791 atol=0.01
+
+    df_ohlc = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/ohlc.json")
+
+    @test sum(df_ohlc.open) ≈ 1223.04 atol=0.01
+
+    df_penguins = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/penguins.json")
+
+    @test sum(skipmissing(df_penguins."Flipper Length (mm)")) == 68713
+
+    @test roundTripDataFrame(df_penguins)
+
+
+    df_platformer_terrain = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/platformer-terrain.json")
+
+    df_political_contributions = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/political-contributions.json")
+
+    df_population = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/population.json")
+
+    df_udistrict = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/udistrict.json")
+
+    df_unemployment = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/unemployment-across-industries.json")
+
+    @test roundTripDataFrame(df_unemployment)
+
+
+    df_uniform_2d = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/uniform-2d.json")
+
+    df_uniform_2d = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/us-10m.json")
+
+    df_us_state_capitals = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/us-state-capitals.json")
+
+    @test roundTripDataFrame(df_us_state_capitals)
+
+    @test sum(df_us_state_capitals.lat) ≈ 1970.67 atol=0.01
+
+    #df_volcano = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/volcano.json")
+
+    df_weekly_weather = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/weekly-weather.json")
+
+
+    #=
+    df_weekly_weather_without_missing = dropmissing(df_weekly_weather, :forecast)
+
+    df_weekly_weather_unnested = @unnest_wider(df_weekly_weather_without_missing, normal, record, forecast)
+    sum(skipmissing(df_weekly_weather_unnested.forecast_high))
+
+    df_weekly_weather_unnested2 = @unnest_wider(df_weekly_weather, normal, record, forecast)
+    sum(skipmissing(df_weekly_weather_unnested2.forecast_high))
+    =#
+
+    df_wheat = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/wheat.json")
+
+    @test sum(skipmissing(df_wheat.wages)) ≈ 579.08 atol=0.01
+
+    @test typeof(df_wheat.wheat) == Vector{Float64}
+
+    @test roundTripDataFrame(df_wheat)
+
+
+end
+
+
 end
