
Commit 3ec5156

Merge pull request #31 from stensmo/main
adds `read_json` and `write_json`
2 parents 274f505 + d439c5a commit 3ec5156

7 files changed: +306 −1 lines changed
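
A quick hedged sketch of the new API, distilled from the docstrings added in this commit (the URL is the movies dataset used in the docstring example; the output file name is illustrative):

```julia
using TidierFiles

# Read a JSON file (local path or URL) into a DataFrame
movies = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/movies.json")

# Write a DataFrame back out as JSON; the default is a vector of JSON objects, one per row
write_json(movies, "movies_copy.json")
```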

Project.toml

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
 XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"

 [compat]
 Arrow = "2.7"
@@ -34,6 +35,7 @@ Sockets = "1.9 - 1.11"
 Random = "0.5 - 10"
 XLSX = "0.10"
 julia = "1.10"
+JSON = "0.21"

 [extras]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

README.md

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ Currently supported file types:
 - `read_parquet` and `write_parquet`
 - `read_rdata` (.rdata and .rds)
 - `read_gsheet` and `write_gsheet` (Google Sheets)
+- `read_json` and `write_json`

 Agnostic read and write functions that detect the type and dispatch the appropriate function.
 - `read_file` and `write_file`

src/TidierFiles.jl

Lines changed: 3 additions & 1 deletion
@@ -13,13 +13,14 @@ using RData
 using JSON3
 using Random
 using Sockets
+using JSON

 @reexport using DataFrames: DataFrame

 export read_csv, write_csv, read_tsv, write_tsv, read_table, write_table, read_delim, read_xlsx, write_xlsx,
 read_fwf, write_fwf, fwf_empty, fwf_positions, fwf_positions, read_sav, read_sas, read_dta, write_sav, write_sas,
 write_dta, read_arrow, write_arrow, read_parquet, write_parquet, read_csv2, read_file, write_file, read_rdata, list_files,
-read_gsheet, connect_gsheet, write_gsheet
+read_gsheet, connect_gsheet, write_gsheet, read_json, write_json


 include("docstrings.jl")
@@ -30,6 +31,7 @@ include("parquet_files.jl")
 include("arrow_files.jl")
 include("r_data.jl")
 include("gsheets.jl")
+include("jsonfiles.jl")

 """
 $docstring_read_csv

src/docstrings.jl

Lines changed: 47 additions & 0 deletions
@@ -789,4 +789,51 @@ julia> df = DataFrame(A=1:5, B=["a", missing, "c", "d", "e"], C=[1.1, 2.2, 3.3,

 julia> write_gsheet(df, full, sheet = "sheet2", append = false)
 ```
+"""
+
+const docstring_read_json =
+"""
+    read_json(path::String; null = missing, convertMixedNumberTypes::Bool = true)
+
+Read data from a JSON file into a DataFrame.
+
+# Arguments
+- `path::String`: A file name or a URL to the JSON file.
+- `null`: The value that a JSON `null` is mapped to (defaults to `missing`).
+- `convertMixedNumberTypes::Bool`: JSON numbers can be parsed as either `Float64` or `Int64`; setting this flag to `true` converts columns that mix both types to `Float64`.
+
+# Examples
+```julia
+
+julia> df = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/movies.json")
+3201×16 DataFrame
+  Row │ Director         Worldwide Gross  Running Time min  US DVD Sales  Source   Distributor   ⋯
+      │ String?          Int64?           Int64?            Int64?        String?  String?       ⋯
+──────┼────────────────────────────────────────────────────────────────────────────────────────────
+    1 │ missing                   146083           missing       missing  missing  Gramercy      ⋯
+    2 │ missing                    10876           missing       missing  missing  Strand
+  ⋮   │        ⋮                ⋮                ⋮               ⋮           ⋮           ⋮        ⋱
+ 3200 │ Martin Campbell        141475336               129       missing  Remake   Sony Picture
+ 3201 │ Martin Campbell        233700000               136       missing  Remake   Sony Picture
+```
+"""
+
+const docstring_write_json =
+"""
+    write_json(df::DataFrame, path::String; JSONObjectVector::Bool = true)
+
+Write the contents of a DataFrame to a specified JSON file.
+
+# Arguments
+- `df::DataFrame`: The DataFrame containing the data to be written to a JSON file.
+- `path::String`: Path to the local JSON file to be written.
+- `JSONObjectVector::Bool`: Determines which JSON format to write; `true` writes a vector of JSON objects (one per row), `false` writes JSON arrays.
+
+# Examples
+```
+julia> df = DataFrame(A=1:5, B=["a", missing, "c", "d", "e"], C=[1.1, 2.2, 3.3, 4.4, 5.5]);
+
+julia> write_json(df, "data.json")
+```
 """

src/gen_fxn.jl

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,8 @@ function read_file(filepath::String, args...; kwargs...)
         return read_parquet(filepath, args...; kwargs...)
     elseif ext == ".rds" || ext == ".RData" || ext == ".rdata"
         return RData.load(filepath)
+    elseif ext == ".json"
+        return read_json(filepath, args...; kwargs...)
     else
         error("Unsupported file format: $ext")
     end
@@ -57,6 +59,8 @@ function write_file(data::DataFrame,path::String, args...; kwargs...)
         return write_arrow(data, path, args...; kwargs...)
     elseif ext == ".parquet"
         return write_parquet(data, path, args...; kwargs...)
+    elseif ext == ".json"
+        return write_json(data, path, args...; kwargs...)
     else
         error("Unsupported file format: $ext")
     end
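
With the two branches above, the generic entry points now cover `.json`; a short usage sketch (file name illustrative):

```julia
using TidierFiles, DataFrames

df = DataFrame(a = 1:3, b = ["x", "y", "z"])

# write_file / read_file inspect the extension and forward to write_json / read_json
write_file(df, "example.json")
df2 = read_file("example.json")
```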

src/jsonfiles.jl

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+function specialConvert(col)
+    return ismissing(col) ? missing : Float64(col)
+end
+
+function fixTypes(df::DataFrame)::DataFrame
+    for (name, col) in pairs(eachcol(df))
+
+        elemtype = eltype(col)
+
+        if typeof(elemtype) == Union
+            ut = Base.uniontypes(elemtype)
+            if Int64 in ut && Float64 in ut
+
+                df[!, name] = specialConvert.(col)
+
+            end
+        end
+
+    end
+
+    return df
+end
+
+
+function checkIfVectorFormat(parsedJSON)
+
+    len = 0
+
+    for (key, value) in pairs(parsedJSON)
+        typeof(value) != Vector{Any} && return false
+
+        thisLen = length(value)
+        len > 0 && thisLen != len && return false
+        len = thisLen
+
+    end
+    return true
+end
+
+function fixTypesVectorFormat(df::DataFrame)
+    for (name, col) in pairs(eachcol(df))
+        try
+            t = typeof(col[1])
+
+            df[!, name] = convert(Vector{t}, col)
+        catch
+            return df
+        end
+
+    end
+    return df
+end
+
+"""
+$docstring_read_json
+"""
+function read_json(path::String; null = missing, convertMixedNumberTypes::Bool = true)
+
+    parsedJSON = nothing
+    df = nothing
+
+    if occursin("http", path)
+        response = HTTP.get(path)
+        # Ensure the request was successful
+        if response.status != 200
+            error("Failed to fetch the JSON file: HTTP status code ", response.status)
+        end
+        file_to_read = IOBuffer(response.body)
+        parsedJSON = JSON.parse(file_to_read; null = null)
+
+    else
+
+        open(path, "r") do io
+            parsedJSON = JSON.parse(io; null = null)
+        end
+
+    end
+
+    if checkIfVectorFormat(parsedJSON)
+        df = DataFrame([v for v in values(parsedJSON)], [k for k in keys(parsedJSON)])
+        return fixTypesVectorFormat(df)
+
+    else
+        df = DataFrame(Tables.dictrowtable(parsedJSON))
+        return (convertMixedNumberTypes) ? fixTypes(df) : df
+    end
+
+end
+
+"""
+$docstring_write_json
+"""
+function write_json(df::DataFrame, path::String; JSONObjectVector::Bool = true)
+    if JSONObjectVector
+        columnNames = names(df)
+
+        dicts = [Dict{String, Any}((columnNames .=> values(row))) for row in eachrow(df)]
+
+        json_string = JSON.json(dicts)
+    else
+        json_string = JSON.json(df)
+    end
+    open(path, "w") do io
+        write(io, json_string)
+    end
+end
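
For orientation, `read_json` handles two JSON layouts: a vector of row objects (the `Tables.dictrowtable` branch) and an object of equal-length column arrays (the `checkIfVectorFormat` branch), while `JSONObjectVector = true` makes `write_json` emit the first layout. A minimal sketch with illustrative file names:

```julia
using TidierFiles, DataFrames

# Layout 1: a vector of row objects
open("rows.json", "w") do io
    write(io, """[{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]""")
end

# Layout 2: an object of equal-length column arrays
open("columns.json", "w") do io
    write(io, """{"a": [1, 2], "b": ["x", "y"]}""")
end

read_json("rows.json")      # 2×2 DataFrame via the row-object branch
read_json("columns.json")   # 2×2 DataFrame via the column-array branch

# The default write path (JSONObjectVector = true) serializes one Dict per row,
# producing layout 1
write_json(DataFrame(a = [1, 2], b = ["x", "y"]), "rows_out.json")
```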

test/runtests.jl

Lines changed: 143 additions & 0 deletions
@@ -10,5 +10,148 @@ end); recursive=true)

 doctest(TidierFiles)

+
+@testset "JSON Test" begin
+
+    function roundTripDataFrame(df::DataFrame; JSONObjectVector = true)
+        write_json(df, "testdf.json"; JSONObjectVector)
+        df_read = read_json("testdf.json")
+        return isequal(df, df_read)
+    end
+
+
+    df_anscombe = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/anscombe.json")
+
+    @test typeof(df_anscombe.X) == Vector{Float64}
+
+    @test roundTripDataFrame(df_anscombe, JSONObjectVector = false)
+
+    df_barley = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/barley.json")
+
+    @test typeof(df_barley.yield) == Vector{Float64}
+
+    @test roundTripDataFrame(df_barley; JSONObjectVector = false)
+
+    df_budget = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budget.json")
+
+    @test roundTripDataFrame(df_budget)
+
+    df_budgets = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/budgets.json")
+
+    @test roundTripDataFrame(df_budgets)
+    @test typeof(df_budgets.value) == Vector{Float64}
+    @test typeof(df_budgets.budgetYear) == Vector{Int64}
+
+
+    df_burtin = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/burtin.json")
+
+    @test roundTripDataFrame(df_burtin)
+
+    df_cars = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/cars.json")
+
+    @test sum(skipmissing(df_cars.Horsepower)) == 42033
+
+    @test roundTripDataFrame(df_cars)
+
+    df_countries = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/countries.json")
+
+    @test sum(skipmissing(df_countries.p_life_expect)) ≈ 36591.29 atol=0.01
+
+
+    df_crimea = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/crimea.json")
+
+    @test roundTripDataFrame(df_crimea)
+
+    df_driving = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/driving.json")
+
+    @test roundTripDataFrame(df_driving)
+
+
+    df_flights_200k = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/flights-200k.json")
+
+    @test roundTripDataFrame(df_flights_200k)
+
+    df_football = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/football.json")
+
+    @test roundTripDataFrame(df_football)
+
+    df_income = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/income.json")
+
+    @test roundTripDataFrame(df_income)
+
+    df_jobs = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/jobs.json")
+
+    @test roundTripDataFrame(df_jobs)
+
+
+    df_movies = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/movies.json")
+
+    @test roundTripDataFrame(df_movies)
+
+    df_obesity = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/obesity.json")
+
+    @test sum(df_obesity.rate) ≈ 7.791 atol=0.01
+
+    df_ohlc = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/ohlc.json")
+
+    @test sum(df_ohlc.open) ≈ 1223.04 atol=0.01
+
+    df_penguins = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/penguins.json")
+
+    @test sum(skipmissing(df_penguins."Flipper Length (mm)")) == 68713
+
+    @test roundTripDataFrame(df_penguins)
+
+
+    df_platformer_terrain = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/platformer-terrain.json")
+
+    df_political_contributions = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/political-contributions.json")
+
+    df_population = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/population.json")
+
+    df_udistrict = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/udistrict.json")
+
+    df_unemployment = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/unemployment-across-industries.json")
+
+    @test roundTripDataFrame(df_unemployment)
+
+
+    df_uniform_2d = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/uniform-2d.json")
+
+    df_uniform_2d = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/us-10m.json")
+
+    df_us_state_capitals = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/us-state-capitals.json")
+
+    @test roundTripDataFrame(df_us_state_capitals)
+
+    @test sum(df_us_state_capitals.lat) ≈ 1970.67 atol=0.01
+
+    #df_volcano = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/volcano.json")
+
+    df_weekly_weather = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/weekly-weather.json")
+
+
+    #=
+    df_weekly_weather_without_missing = dropmissing(df_weekly_weather, :forecast)
+
+    df_weekly_weather_unnested = @unnest_wider(df_weekly_weather_without_missing, normal, record, forecast)
+    sum(skipmissing(df_weekly_weather_unnested.forecast_high))
+
+    df_weekly_weather_unnested2 = @unnest_wider(df_weekly_weather, normal, record, forecast)
+    sum(skipmissing(df_weekly_weather_unnested2.forecast_high))
+    =#
+
+    df_wheat = read_json("https://raw.githubusercontent.com/vega/vega-datasets/refs/heads/main/data/wheat.json")
+
+    @test sum(skipmissing(df_wheat.wages)) ≈ 579.08 atol=0.01
+
+    @test typeof(df_wheat.wheat) == Vector{Float64}
+
+    @test roundTripDataFrame(df_wheat)
+
+
+end
+
+
 end
