Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
runs-on: ${{ matrix.runs-on }}
env:
# We can invalidate the current cache by updating this.
CACHE_VERSION: "2022-08-27"
CACHE_VERSION: "2022-11-22"
steps:
- uses: actions/checkout@v3
- uses: ruby/setup-ruby@v1
Expand Down Expand Up @@ -68,6 +68,13 @@ jobs:
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
- name: Prepare Apache Arrow on macOS
if: |
runner.os == 'macOS'
run: |
brew install apache-arrow
brew install gobject-introspection
brew install apache-arrow-glib
- name: Install dependencies
run: |
bundle install
Expand Down
27 changes: 27 additions & 0 deletions example/tlc-fhv-trip.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env ruby

require "datasets-parquet"

trips = Datasets::TLC::FHVTrip.new(year: 2022, month: 1)

p trips.to_arrow
#<Arrow::Table:0x11441f108 ptr=0x10775f960>
# dispatching_base_num pickup_datetime dropOff_datetime PUlocationID DOlocationIDSR_Flag Affiliated_base_number
# 0 B00009 2022-01-01T09:31:00+09:00 2022-01-01T10:05:00+09:00 (null) (null) (null) B00009
# 1 B00009 2022-01-01T09:37:00+09:00 2022-01-01T10:05:00+09:00 (null) (null) (null) B00009
# ...

trips.each do |trip|
p [
trip.dispatching_base_num,
trip.pickup_datetime,
trip.dropoff_datetime,
trip.pu_location_id,
trip.do_location_id,
trip.sr_flag?,
trip.affiliated_base_number
]
end
# ["B00009", 2022-01-01 09:31:00 +0900, 2022-01-01 10:05:00 +0900, nil, nil, false, "B00009"]
# ["B00009", 2022-01-01 09:37:00 +0900, 2022-01-01 10:05:00 +0900, nil, nil, false, "B00009"]
# ...
46 changes: 46 additions & 0 deletions example/tlc-high-volume-fhv-trip.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env ruby

require "datasets-parquet"

trips = Datasets::TLC::HighVolumeFHVTrip.new(year: 2022, month: 1)

p trips.to_arrow
#<Arrow::Table:0x13f920640 ptr=0x13f180160>
# hvfhs_license_num dispatching_base_num originating_base_num request_datetime on_scene_datetime pickup_datetime dropoff_datetime PULocationID DOLocationID trip_miles trip_time base_passenger_fare tolls bcf sales_tax congestion_surcharge airport_fee tips driver_pay shared_request_flag shared_match_flag access_a_ride_flag wav_request_flag wav_match_flag
# 0 HV0003 B03404 B03404 2022-01-01T09:05:31+09:00 2022-01-01T09:05:40+09:00 2022-01-01T09:07:24+09:00 2022-01-01T09:18:28+09:00 170 161 1.180000 664 24.900000 0.000000 0.750000 2.210000 2.750000 0.000000 0.000000 23.030000 N N N N
# 1 HV0003 B03404 B03404 2022-01-01T09:19:27+09:00 2022-01-01T09:22:08+09:00 2022-01-01T09:22:32+09:00 2022-01-01T09:30:12+09:00 237 161 0.820000 460 11.970000 0.000000 0.360000 1.060000 2.750000 0.000000 0.000000 12.320000 N N N N
# ...


trips.each do |trip|
p [
trip.hvfhs_license_num,
trip.dispatching_base_num,
trip.originating_base_num,
trip.request_datetime,
trip.on_scene_datetime,
trip.pickup_datetime,
trip.dropoff_datetime,
trip.pu_locationID,
trip.do_locationID,
trip.trip_miles,
trip.trip_time,
trip.base_passenger_fare,
trip.tolls,
trip.bcf,
trip.sales_tax,
trip.congestion_surcharge,
trip.airport_fee,
trip.tips,
trip.driver_pay,
trip.shared_request_flag?,
trip.shared_match_flag?,
trip.access_a_ride_flag?,
trip.wav_request_flag?,
trip.wav_match_flag?,
]
end
# [:uber, "B03404", "B03404", 2022-01-01 09:05:31 +0900, 2022-01-01 09:05:40 +0900, 2022-01-01 09:07:24 +0900, 2022-01-01 09:18:28 +0900, 170, 161, 1.18, 664, 24.9, 0.0, 0.75, 2.21, 2.75, 0.0, 0.0, 23.03, false, false, false, false, false]
# [:uber, "B03404", "B03404", 2022-01-01 09:19:27 +0900, 2022-01-01 09:22:08 +0900, 2022-01-01 09:22:32 +0900, 2022-01-01 09:30:12 +0900, 237, 161, 0.82, 460, 11.97, 0.0, 0.36, 1.06, 2.75, 0.0, 0.0, 12.32, false, false, false, false, false]
# [:uber, "B03404", "B03404", 2022-01-01 09:43:53 +0900, 2022-01-01 09:57:37 +0900, 2022-01-01 09:57:37 +0900, 2022-01-01 10:07:32 +0900, 237, 161, 1.18, 595, 29.82, 0.0, 0.89, 2.65, 2.75, 0.0, 0.0, 23.3, false, false, false, false, false]
# ...
2 changes: 2 additions & 0 deletions lib/datasets-parquet.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@

require_relative "datasets-parquet/version"

require_relative "datasets-parquet/tlc/fhv-trip"
require_relative "datasets-parquet/tlc/green-taxi-trip"
require_relative "datasets-parquet/tlc/high-volume-fhv-trip"
require_relative "datasets-parquet/tlc/yellow-taxi-trip"
58 changes: 58 additions & 0 deletions lib/datasets-parquet/tlc/fhv-trip.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module Datasets
module TLC
class FHVTrip < Dataset
class Record < Struct.new(:dispatching_base_num,
:pickup_datetime,
:dropoff_datetime,
:pu_location_id,
:do_location_id,
:sr_flag,
:affiliated_base_number)
alias_method :sr_flag?, :sr_flag

def initialize(*values)
super()
members.zip(values) do |member, value|
__send__("#{member}=", value)
end
end

def sr_flag=(sr_flag)
super(!!sr_flag)
end
end

def initialize(year: Date.today.year, month: Date.today.month)
super()
@metadata.id = "nyc-taxi-and-limousine-commission-for-hire-vehicle-trip"
@metadata.name = "New York city Taxi and Limousine Commission: for hire vehicle trip record dataset"
@metadata.url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
@metadata.licenses = [
{
name: "NYC Open Data Terms of Use",
url: "https://opendata.cityofnewyork.us/overview/#termsofuse",
}
]
@year = year
@month = month
end

def to_arrow
base_name = "fhv_tripdata_%04d-%02d.parquet" % [@year, @month]
data_path = cache_dir_path + base_name
data_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/#{base_name}"
download(data_path, data_url)
Arrow::Table.load(data_path)
end

def each
return to_enum(__method__) unless block_given?

to_arrow.raw_records.each do |raw_record|
record = Record.new(*raw_record)
yield(record)
end
end
end
end
end
108 changes: 108 additions & 0 deletions lib/datasets-parquet/tlc/high-volume-fhv-trip.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
module Datasets
module TLC
class HighVolumeFHVTrip < Dataset
class Record < Struct.new(:hvfhs_license_num,
:dispatching_base_num,
:originating_base_num,
:request_datetime,
:on_scene_datetime,
:pickup_datetime,
:dropoff_datetime,
:pu_locationID,
:do_locationID,
:trip_miles,
:trip_time,
:base_passenger_fare,
:tolls,
:bcf,
:sales_tax,
:congestion_surcharge,
:airport_fee,
:tips,
:driver_pay,
:shared_request_flag,
:shared_match_flag,
:access_a_ride_flag,
:wav_request_flag,
:wav_match_flag)
alias_method :shared_request_flag?, :shared_request_flag
alias_method :shared_match_flag?, :shared_match_flag
alias_method :access_a_ride_flag?, :access_a_ride_flag
alias_method :wav_request_flag?, :wav_request_flag
alias_method :wav_match_flag?, :wav_match_flag

def initialize(*values)
super()
members.zip(values) do |member, value|
__send__("#{member}=", value)
end
end

def hvfhs_license_num=(hvfhs_license_num)
case hvfhs_license_num
when 'HV0002'
super(:juno)
when 'HV0003'
super(:uber)
when 'HV0004'
super(:via)
when 'HV0005'
super(:lyft)
end
end

def shared_request_flag=(shared_request_flag)
super(shared_request_flag == 'Y')
end

def shared_match_flag=(shared_match_flag)
super(shared_match_flag == 'Y')
end

def access_a_ride_flag=(access_a_ride_flag)
super(access_a_ride_flag == 'Y')
end

def wav_request_flag=(wav_request_flag)
super(wav_request_flag == 'Y')
end

def wav_match_flag=(wav_match_flag)
super(wav_match_flag == 'Y')
end
end

def initialize(year: Date.today.year, month: Date.today.month)
super()
@metadata.id = "nyc-taxi-and-limousine-commission-high-volume-for-hire-vehicle-trip"
@metadata.name = "New York city Taxi and Limousine Commission: high volume for hire vehicle trip record dataset"
@metadata.url = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
@metadata.licenses = [
{
name: "NYC Open Data Terms of Use",
url: "https://opendata.cityofnewyork.us/overview/#termsofuse",
}
]
@year = year
@month = month
end

def to_arrow
base_name = "fhvhv_tripdata_%04d-%02d.parquet" % [@year, @month]
data_path = cache_dir_path + base_name
data_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/#{base_name}"
download(data_path, data_url)
Arrow::Table.load(data_path)
end

def each
return to_enum(__method__) unless block_given?

to_arrow.raw_records.each do |raw_record|
record = Record.new(*raw_record)
yield(record)
end
end
end
end
end
69 changes: 69 additions & 0 deletions test/test-tlc-fhv-trip.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
class TLCFHVTripTest < Test::Unit::TestCase
def setup
@default_timezone_env = ENV['TZ']
ENV['TZ'] = 'UTC'
@dataset = Datasets::TLC::FHVTrip.new(year: 2022, month: 1)
end

def teardown
ENV['TZ'] = @default_timezone_env
end

test("#to_arrow") do
assert_equal(<<~TABLE, @dataset.to_arrow.to_s)
\tdispatching_base_num\t pickup_datetime\t dropOff_datetime\tPUlocationID\tDOlocationID\tSR_Flag\tAffiliated_base_number
0\tB00009 \t2022-01-01T00:31:00+00:00\t2022-01-01T01:05:00+00:00\t (null)\t (null)\t (null)\tB00009
1\tB00009 \t2022-01-01T00:37:00+00:00\t2022-01-01T01:05:00+00:00\t (null)\t (null)\t (null)\tB00009
2\tB00037 \t2022-01-01T00:56:37+00:00\t2022-01-01T01:06:11+00:00\t (null)\t 85.000000\t (null)\tB00037
3\tB00037 \t2022-01-01T00:19:54+00:00\t2022-01-01T00:30:47+00:00\t (null)\t 85.000000\t (null)\tB00037
4\tB00037 \t2022-01-01T00:41:49+00:00\t2022-01-01T00:52:16+00:00\t (null)\t 188.000000\t (null)\tB00037
5\tB00037 \t2022-01-01T00:21:32+00:00\t2022-01-01T00:35:06+00:00\t (null)\t 61.000000\t (null)\tB00037
6\tB00037 \t2022-01-01T00:51:19+00:00\t2022-01-01T01:08:06+00:00\t (null)\t 76.000000\t (null)\tB00037
7\tB00111 \t2022-01-01T00:30:00+00:00\t2022-01-01T01:41:00+00:00\t (null)\t (null)\t (null)\tB03406
8\tB00112 \t2022-01-01T00:31:30+00:00\t2022-01-01T01:10:06+00:00\t (null)\t 67.000000\t (null)\tB00112
9\tB00112 \t2022-01-01T00:12:26+00:00\t2022-01-01T00:37:22+00:00\t (null)\t 155.000000\t (null)\tB00112
...
1143681\tB03380 \t2022-01-31T23:39:32+00:00\t2022-01-31T23:47:43+00:00\t 246.000000\t 158.000000\t (null)\tB03380
1143682\tB03380 \t2022-01-31T23:52:52+00:00\t2022-02-01T00:03:14+00:00\t 158.000000\t 107.000000\t (null)\tB03380
1143683\tB03380 \t2022-01-31T23:24:44+00:00\t2022-01-31T23:35:46+00:00\t 231.000000\t 4.000000\t (null)\tB03380
1143684\tB03380 \t2022-01-31T23:21:35+00:00\t2022-01-31T23:32:16+00:00\t 229.000000\t 48.000000\t (null)\tB03380
1143685\tB03380 \t2022-01-31T23:02:50+00:00\t2022-01-31T23:20:07+00:00\t 142.000000\t 113.000000\t (null)\tB03380
1143686\tB03380 \t2022-01-31T23:22:41+00:00\t2022-01-31T23:26:39+00:00\t 234.000000\t 107.000000\t (null)\tB03380
1143687\tB03380 \t2022-01-31T23:42:42+00:00\t2022-01-31T23:52:58+00:00\t 114.000000\t 148.000000\t (null)\tB03380
1143688\tB03380 \t2022-01-31T23:07:13+00:00\t2022-01-31T23:13:40+00:00\t 90.000000\t 113.000000\t (null)\tB03380
1143689\tB03380 \t2022-01-31T23:16:14+00:00\t2022-01-31T23:31:03+00:00\t 113.000000\t 140.000000\t (null)\tB03380
1143690\tB03381 \t2022-01-31T23:47:42+00:00\t2022-02-01T00:15:03+00:00\t (null)\t 122.000000\t (null)\tB03404
TABLE
end

test("#each") do
records = @dataset.each.to_a

assert_equal([
1143691,
{
dispatching_base_num: 'B00009',
pickup_datetime: Time.parse('2022-01-01 00:31:00 +0000'),
dropoff_datetime: Time.parse('2022-01-01 01:05:00 +0000'),
pu_location_id: nil,
do_location_id: nil,
sr_flag: false,
affiliated_base_number: 'B00009'
},
{
dispatching_base_num: 'B03381',
pickup_datetime: Time.parse('2022-01-31 23:47:42 +0000'),
dropoff_datetime: Time.parse('2022-02-01 00:15:03 +0000'),
pu_location_id: nil,
do_location_id: 122.0,
sr_flag: false,
affiliated_base_number: 'B03404'
}
],
[
records.size,
records.first.to_h,
records.last.to_h,
])
end
end
Loading