Skip to content

Commit 24e559b

Browse files
authored
Merge branch 'master' into python-while-loop
2 parents 760114d + 62c140f commit 24e559b

27 files changed

+26165
-0
lines changed

polars-lazyframe/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# How to Work With Polars LazyFrames
2+
3+
This folder contains completed notebooks and other files used in the Real Python tutorial [How to Work With Polars LazyFrames](https://realpython.com/polars-lazyframe/).
4+
5+
**The following files are included:**
6+
7+
- `tutorial_code.ipynb` is a Jupyter Notebook containing all the code used in the tutorial.
8+
- `rides.parquet` contains taxi fare data used throughout the tutorial. [See below]
9+
- `taxi_rides_nnGB.py` is a series of Python scripts, each of which generates a local copy of `2021_Yellow_Taxi_Trip_Data.csv` at a different size. The generated file is used to demonstrate data streaming.
10+
- `programming_languages.csv` is a small file used to illustrate the conversion between a DataFrame and a LazyFrame.
11+
- `dataframe_timer.py` and `lazyframe_timer.py` are two scripts used to compare the speed of performing the same analysis with a DataFrame and with a LazyFrame.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import time
2+
3+
import polars as pl
4+
5+
# Number of times the read-and-query cycle is repeated; the reported figure
# is the mean wall-clock time per repetition.
runs = 10

start = time.perf_counter()

for _ in range(runs):
    # Read the Parquet file with read_parquet on every repetition so that
    # the file read is part of the measured time.
    rides = pl.read_parquet("rides.parquet")
    result = (
        # Keep rides whose pick-up and drop-off values are equal.
        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
        .group_by(pl.col("pick_up"))
        .agg(pl.col("fare").mean())
        .filter(
            pl.col("pick_up").is_in(
                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
            )
        )
    )

end = time.perf_counter()

# A bare f-string expression is evaluated and discarded when the file runs
# as a script, so the message has to be printed explicitly.
print(f"Code finished in {(end - start) / runs:0.4f} seconds.")
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import time
2+
3+
import polars as pl
4+
5+
# Number of times the scan-and-query cycle is repeated; the reported figure
# is the mean wall-clock time per repetition.
runs = 10

start = time.perf_counter()

for _ in range(runs):
    # Build the query on top of scan_parquet; the result is only
    # materialized by the final collect() call.
    rides = pl.scan_parquet("rides.parquet")
    result = (
        # Keep rides whose pick-up and drop-off values are equal.
        rides.filter(pl.col("pick_up") == pl.col("drop_off"))
        .group_by(pl.col("pick_up"))
        .agg(pl.col("fare").mean())
        .filter(
            pl.col("pick_up").is_in(
                ["Brooklyn", "Bronx", "Queens", "Manhattan"]
            )
        )
    ).collect()

end = time.perf_counter()

# A bare f-string expression is evaluated and discarded when the file runs
# as a script, so the message has to be printed explicitly.
print(f"Code finished in {(end - start) / runs:0.4f} seconds.")
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
language_id,language,creator,year
2+
0,Pascal,Niklaus Wirth,1970
3+
1,C,Dennis Ritchie,1973
4+
2,C++,Bjarne Stroustrup,1985
5+
3,Python,Guido van Rossum,1991
6+
4,Java,James Gosling,1995

polars-lazyframe/rides.parquet

5.91 MB
Binary file not shown.
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import csv
2+
import datetime
3+
import random
4+
5+
random.seed(10)
6+
7+
8+
def generate_pu():
    """Return a random pick-up datetime in 2021, truncated to whole seconds.

    One value is drawn from ``random.random()`` and used to scale the span
    between 2021-01-01 00:00:00 and 2021-12-31 23:59:59.
    """
    first_moment = datetime.datetime(2021, 1, 1, 0, 0, 0)
    last_moment = datetime.datetime(2021, 12, 31, 23, 59, 59)
    year_span = last_moment - first_moment
    offset = random.random() * year_span
    # Drop the fractional seconds introduced by scaling the span.
    return (offset + first_moment).replace(microsecond=0)
17+
18+
19+
def generate_do(pick_up):
    """Return a drop-off datetime some random duration after ``pick_up``.

    The duration is 10 to 120 whole minutes plus 0 to 59 seconds.
    """
    # Draw minutes before seconds so the seeded random sequence is unchanged.
    ride_minutes = random.randint(10, 120)
    ride_seconds = random.randint(0, 59)
    ride_length = datetime.timedelta(
        minutes=ride_minutes, seconds=ride_seconds
    )
    return pick_up + ride_length
24+
25+
26+
def generate_choice(*choices):
    """Return one of the positional arguments, picked with ``random.choice``.

    The arguments are collected positionally, so the parameter is named
    ``choices`` rather than ``kwargs``, which conventionally denotes a
    ``**`` keyword mapping.

    Raises:
        IndexError: If called with no arguments.
    """
    return random.choice(choices)
28+
29+
30+
def generate_file(path="2021_Yellow_Taxi_Trip_Data.csv", rows=170_000_000):
    """Write a CSV file of randomly generated taxi rides.

    Args:
        path: Destination file; it is created or overwritten.
        rows: Number of ride rows written after the header row.

    Every row has one field per header column. ``airport_fee`` is written
    as an empty field because no value is generated for it.
    """
    header = (
        "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
        "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
        "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
        "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
    ).split(",")

    # Fixed charges that are the same on every ride.
    improvement_surcharge = 0.3
    congestion_surcharge = 2.5
    # The header declares an airport_fee column; emit an explicit empty
    # field so rows are not one column short of the header.
    airport_fee = ""

    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for _ in range(rows):
            # The order of the random draws below is kept fixed so that a
            # given seed keeps producing the same values.
            fare_amount = round(random.uniform(5.0, 100.0), 2)
            extra = generate_choice(0, 0.5, 3)
            mta_tax = generate_choice(0, 0.5)
            tip_amount = round(random.uniform(0.0, 20.0), 2)
            tolls_amount = generate_choice(0, 6.12)
            total_amount = round(
                fare_amount
                + extra
                + mta_tax
                + tip_amount
                + tolls_amount
                + improvement_surcharge
                + congestion_surcharge,
                2,
            )

            pick_up = generate_pu()

            ride = [
                random.randint(0, 6),  # VendorID
                pick_up,  # tpep_pickup_datetime
                generate_do(pick_up),  # tpep_dropoff_datetime
                random.randint(0, 5),  # passenger_count
                random.randint(0, 15),  # trip_distance
                random.randint(0, 6),  # RatecodeID
                generate_choice("Y", "N"),  # store_and_fwd_flag
                random.randint(1, 265),  # PULocationID
                random.randint(1, 265),  # DOLocationID
                random.randint(0, 5),  # payment_type
                fare_amount,  # fare_amount
                extra,  # extra
                mta_tax,  # mta_tax
                tip_amount,  # tip_amount
                tolls_amount,  # tolls_amount
                improvement_surcharge,  # improvement_surcharge
                total_amount,  # total_amount
                congestion_surcharge,  # congestion_surcharge
                airport_fee,  # airport_fee
            ]

            writer.writerow(ride)
85+
86+
87+
# Generate the file only when this module is run as a script, so that
# importing it does not start writing the CSV file.
if __name__ == "__main__":
    generate_file()
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import csv
2+
import datetime
3+
import random
4+
5+
random.seed(10)
6+
7+
8+
def generate_pu():
    """Return a random pick-up datetime in 2021, truncated to whole seconds.

    One value is drawn from ``random.random()`` and used to scale the span
    between 2021-01-01 00:00:00 and 2021-12-31 23:59:59.
    """
    first_moment = datetime.datetime(2021, 1, 1, 0, 0, 0)
    last_moment = datetime.datetime(2021, 12, 31, 23, 59, 59)
    year_span = last_moment - first_moment
    offset = random.random() * year_span
    # Drop the fractional seconds introduced by scaling the span.
    return (offset + first_moment).replace(microsecond=0)
17+
18+
19+
def generate_do(pick_up):
    """Return a drop-off datetime some random duration after ``pick_up``.

    The duration is 10 to 120 whole minutes plus 0 to 59 seconds.
    """
    # Draw minutes before seconds so the seeded random sequence is unchanged.
    ride_minutes = random.randint(10, 120)
    ride_seconds = random.randint(0, 59)
    ride_length = datetime.timedelta(
        minutes=ride_minutes, seconds=ride_seconds
    )
    return pick_up + ride_length
24+
25+
26+
def generate_choice(*choices):
    """Return one of the positional arguments, picked with ``random.choice``.

    The arguments are collected positionally, so the parameter is named
    ``choices`` rather than ``kwargs``, which conventionally denotes a
    ``**`` keyword mapping.

    Raises:
        IndexError: If called with no arguments.
    """
    return random.choice(choices)
28+
29+
30+
def generate_file(path="2021_Yellow_Taxi_Trip_Data.csv", rows=255_000_000):
    """Write a CSV file of randomly generated taxi rides.

    Args:
        path: Destination file; it is created or overwritten.
        rows: Number of ride rows written after the header row.

    Every row has one field per header column. ``airport_fee`` is written
    as an empty field because no value is generated for it.
    """
    header = (
        "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
        "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
        "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
        "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
    ).split(",")

    # Fixed charges that are the same on every ride.
    improvement_surcharge = 0.3
    congestion_surcharge = 2.5
    # The header declares an airport_fee column; emit an explicit empty
    # field so rows are not one column short of the header.
    airport_fee = ""

    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for _ in range(rows):
            # The order of the random draws below is kept fixed so that a
            # given seed keeps producing the same values.
            fare_amount = round(random.uniform(5.0, 100.0), 2)
            extra = generate_choice(0, 0.5, 3)
            mta_tax = generate_choice(0, 0.5)
            tip_amount = round(random.uniform(0.0, 20.0), 2)
            tolls_amount = generate_choice(0, 6.12)
            total_amount = round(
                fare_amount
                + extra
                + mta_tax
                + tip_amount
                + tolls_amount
                + improvement_surcharge
                + congestion_surcharge,
                2,
            )

            pick_up = generate_pu()

            ride = [
                random.randint(0, 6),  # VendorID
                pick_up,  # tpep_pickup_datetime
                generate_do(pick_up),  # tpep_dropoff_datetime
                random.randint(0, 5),  # passenger_count
                random.randint(0, 15),  # trip_distance
                random.randint(0, 6),  # RatecodeID
                generate_choice("Y", "N"),  # store_and_fwd_flag
                random.randint(1, 265),  # PULocationID
                random.randint(1, 265),  # DOLocationID
                random.randint(0, 5),  # payment_type
                fare_amount,  # fare_amount
                extra,  # extra
                mta_tax,  # mta_tax
                tip_amount,  # tip_amount
                tolls_amount,  # tolls_amount
                improvement_surcharge,  # improvement_surcharge
                total_amount,  # total_amount
                congestion_surcharge,  # congestion_surcharge
                airport_fee,  # airport_fee
            ]

            writer.writerow(ride)
85+
86+
87+
# Generate the file only when this module is run as a script, so that
# importing it does not start writing the CSV file.
if __name__ == "__main__":
    generate_file()
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import csv
2+
import datetime
3+
import random
4+
5+
random.seed(10)
6+
7+
8+
def generate_pu():
    """Return a random pick-up datetime in 2021, truncated to whole seconds.

    One value is drawn from ``random.random()`` and used to scale the span
    between 2021-01-01 00:00:00 and 2021-12-31 23:59:59.
    """
    first_moment = datetime.datetime(2021, 1, 1, 0, 0, 0)
    last_moment = datetime.datetime(2021, 12, 31, 23, 59, 59)
    year_span = last_moment - first_moment
    offset = random.random() * year_span
    # Drop the fractional seconds introduced by scaling the span.
    return (offset + first_moment).replace(microsecond=0)
17+
18+
19+
def generate_do(pick_up):
    """Return a drop-off datetime some random duration after ``pick_up``.

    The duration is 10 to 120 whole minutes plus 0 to 59 seconds.
    """
    # Draw minutes before seconds so the seeded random sequence is unchanged.
    ride_minutes = random.randint(10, 120)
    ride_seconds = random.randint(0, 59)
    ride_length = datetime.timedelta(
        minutes=ride_minutes, seconds=ride_seconds
    )
    return pick_up + ride_length
24+
25+
26+
def generate_choice(*choices):
    """Return one of the positional arguments, picked with ``random.choice``.

    The arguments are collected positionally, so the parameter is named
    ``choices`` rather than ``kwargs``, which conventionally denotes a
    ``**`` keyword mapping.

    Raises:
        IndexError: If called with no arguments.
    """
    return random.choice(choices)
28+
29+
30+
def generate_file(path="2021_Yellow_Taxi_Trip_Data.csv", rows=340_000_000):
    """Write a CSV file of randomly generated taxi rides.

    Args:
        path: Destination file; it is created or overwritten.
        rows: Number of ride rows written after the header row.

    Every row has one field per header column. ``airport_fee`` is written
    as an empty field because no value is generated for it.
    """
    header = (
        "VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,"
        "trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,"
        "payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,"
        "improvement_surcharge,total_amount,congestion_surcharge,airport_fee"
    ).split(",")

    # Fixed charges that are the same on every ride.
    improvement_surcharge = 0.3
    congestion_surcharge = 2.5
    # The header declares an airport_fee column; emit an explicit empty
    # field so rows are not one column short of the header.
    airport_fee = ""

    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for _ in range(rows):
            # The order of the random draws below is kept fixed so that a
            # given seed keeps producing the same values.
            fare_amount = round(random.uniform(5.0, 100.0), 2)
            extra = generate_choice(0, 0.5, 3)
            mta_tax = generate_choice(0, 0.5)
            tip_amount = round(random.uniform(0.0, 20.0), 2)
            tolls_amount = generate_choice(0, 6.12)
            total_amount = round(
                fare_amount
                + extra
                + mta_tax
                + tip_amount
                + tolls_amount
                + improvement_surcharge
                + congestion_surcharge,
                2,
            )

            pick_up = generate_pu()

            ride = [
                random.randint(0, 6),  # VendorID
                pick_up,  # tpep_pickup_datetime
                generate_do(pick_up),  # tpep_dropoff_datetime
                random.randint(0, 5),  # passenger_count
                random.randint(0, 15),  # trip_distance
                random.randint(0, 6),  # RatecodeID
                generate_choice("Y", "N"),  # store_and_fwd_flag
                random.randint(1, 265),  # PULocationID
                random.randint(1, 265),  # DOLocationID
                random.randint(0, 5),  # payment_type
                fare_amount,  # fare_amount
                extra,  # extra
                mta_tax,  # mta_tax
                tip_amount,  # tip_amount
                tolls_amount,  # tolls_amount
                improvement_surcharge,  # improvement_surcharge
                total_amount,  # total_amount
                congestion_surcharge,  # congestion_surcharge
                airport_fee,  # airport_fee
            ]

            writer.writerow(ride)
85+
86+
87+
# Generate the file only when this module is run as a script, so that
# importing it does not start writing the CSV file.
if __name__ == "__main__":
    generate_file()

0 commit comments

Comments
 (0)