|
| 1 | +IMPORT 'common_macros.pig'; %DEFAULT data_dir '/data/rawd'; %DEFAULT out_dir '/data/out/baseball'; |
| 2 | +-- NOTE(review): bat_seasons is referenced only by the commented-out "Storing JSON" section below; load_bat_seasons is presumably a macro from common_macros.pig (TODO confirm). |
| 3 | +bat_seasons = load_bat_seasons(); |
| 4 | + |
| 5 | +-- *************************************************************************** |
| 6 | +-- |
| 7 | +-- ==== Storing JSON to disk |
| 8 | +-- |
| 9 | + |
| 10 | +-- bats_1900_fl = FILTER bat_seasons BY (year_id == 1900); |
| 11 | +-- |
| 12 | +-- bats_1900 = FOREACH bats_1900_fl { |
| 13 | +-- whatever = {(HR, R)}; |
| 14 | +-- GENERATE |
| 15 | +-- player_id, |
| 16 | +-- year_id, |
| 17 | +-- (name_first,name_last) AS full_name:tuple(name_first:chararray,name_last:chararray), |
| 18 | +-- team_id, lg_id, age, |
| 19 | +-- (G, PA, AB) AS appearances:tuple(G:int, PA:int, AB:int), |
| 20 | +-- ['HBP', HBP, 'SH', SH, 'BB', BB, 'H', H] AS hit_stats:map[int], |
| 21 | +-- -- : int,h1B: int,h2B: int,h3B: int,HR: int,R: int,RBI: int |
| 22 | +-- whatever AS whatever:bag{t:(HR:int, R:int)} |
| 23 | +-- ; |
| 24 | +-- }; |
| 25 | +-- |
| 26 | +-- DESCRIBE bats_1900; |
| 27 | +-- DUMP bats_1900; |
| 28 | + |
| 29 | +-- rmf $out_dir/json_dir_with_schema |
| 30 | +-- STORE bats_1900 INTO '$out_dir/json_dir_with_schema' |
| 31 | +-- USING org.apache.pig.builtin.JsonStorage(); |
| 32 | + |
| 33 | +-- -- *************************************************************************** |
| 34 | +-- -- |
| 35 | +-- -- ==== Loading JSON using a pre-defined schema |
| 36 | +-- -- |
| 37 | + |
| 38 | +-- bats_with_schema = LOAD '$out_dir/json_dir_with_schema' |
| 39 | +-- USING org.apache.pig.builtin.JsonLoader(); |
| 40 | +-- |
| 41 | + |
| 42 | + |
| 43 | +-- *************************************************************************** |
| 44 | +-- |
| 45 | +-- ==== Loading JSON using an in-line schema |
| 46 | +-- |
| 47 | +-- Copy a single part file to a standalone path. The STORE that writes json_dir_with_schema is commented out above, so that directory must already exist from an earlier run. |
| 48 | +cp $out_dir/json_dir_with_schema/part-m-00000 $out_dir/json_no_schema.json |
| 49 | +-- Load the copied file with an in-line schema: only three fields are declared, and full_name is typed as map[chararray] (the commented-out variant below declares it as a tuple). |
| 50 | +bats_no_schema = LOAD '$out_dir/json_no_schema.json' |
| 51 | + USING org.apache.pig.builtin.JsonLoader( |
| 52 | + -- 'player_id: chararray,year_id: int,full_name: (name_first: chararray,name_last: chararray)' --,team_id: chararray,lg_id: chararray,age: int,appearances: (G: int,PA: int,AB: int),hit_stats: map[int],whatever: {t: (HR: int,R: int)}' |
| 53 | + 'player_id: chararray,year_id: int,full_name: map[chararray]' |
| 54 | + ); |
| 55 | + |
| 56 | +-- DUMP bats_with_schema; |
| 57 | +-- DESCRIBE bats_with_schema; |
| 58 | +-- Print the loaded records, then the schema Pig assigned to the relation. |
| 59 | +DUMP bats_no_schema; |
| 60 | +DESCRIBE bats_no_schema; |
| 61 | + |
| 62 | + |
| 63 | + |
| 64 | +-- |
| 65 | +-- -- |
| 66 | +-- -- TODO: work out how to (a) load JSON into a map; or (b) supply a JSON schema; see |
| 67 | +-- -- https://issues.apache.org/jira/browse/PIG-1914 |
| 68 | +-- -- |
| 69 | +-- |
| 70 | +-- -- *************************************************************************** |
| 71 | +-- -- |
| 72 | +-- -- ==== A little script to pretty-print JSON |
| 73 | +-- -- |
| 74 | + |
| 75 | +-- #!/usr/bin/env ruby |
| 76 | +-- |
| 77 | +-- require 'rubygems' |
| 78 | +-- ['yajl/json_gem', 'json', 'json/pure'].each do |gem_name| |
| 79 | +-- begin |
| 80 | +-- require gem_name |
| 81 | +-- rescue LoadError ; next ; end |
| 82 | +-- break |
| 83 | +-- end |
| 84 | +-- |
| 85 | +-- $stdin.each do |line| |
| 86 | +-- puts JSON.pretty_generate(JSON.load(line)) |
| 87 | +-- end |
0 commit comments