Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions queries/pandas/q1.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,12 @@


def q() -> None:
line_item_ds = utils.get_line_item_ds
line_item_ds_fn = utils.get_line_item_ds
# first call one time to cache in case we don't include the IO times
line_item_ds()
line_item_ds_fn()

def query() -> pd.DataFrame:
nonlocal line_item_ds
line_item_ds = line_item_ds()
line_item_ds = line_item_ds_fn()

var1 = date(1998, 9, 2)

Expand Down
76 changes: 76 additions & 0 deletions queries/pandas/q10.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from __future__ import annotations

import pandas as pd

from queries.pandas import utils

Q_NUM = 10


def q() -> None:
customer_ds_fn = utils.get_customer_ds
lineitem_ds_fn = utils.get_line_item_ds
nation_ds_fn = utils.get_nation_ds
orders_ds_fn = utils.get_orders_ds

# first call one time to cache in case we don't include the IO times
customer_ds_fn()
lineitem_ds_fn()
nation_ds_fn()
orders_ds_fn()

def query() -> pd.DataFrame:
customer_ds = customer_ds_fn()
lineitem_ds = lineitem_ds_fn()
nation_ds = nation_ds_fn()
orders_ds = orders_ds_fn()

var1 = pd.Timestamp("1993-10-01")
var2 = pd.Timestamp("1994-01-01")

jn1 = customer_ds.merge(orders_ds, left_on="c_custkey", right_on="o_custkey")
jn2 = jn1.merge(lineitem_ds, left_on="o_orderkey", right_on="l_orderkey")
jn3 = jn2.merge(nation_ds, left_on="c_nationkey", right_on="n_nationkey")

jn3 = jn3[(jn3["o_orderdate"] >= var1) & (jn3["o_orderdate"] < var2)]
jn3 = jn3[jn3["l_returnflag"] == "R"]

jn3["revenue"] = jn3["l_extendedprice"] * (1 - jn3["l_discount"])

gb = jn3.groupby(
[
"c_custkey",
"c_name",
"c_acctbal",
"c_phone",
"n_name",
"c_address",
"c_comment",
],
as_index=False,
)
agg = gb.agg(revenue=pd.NamedAgg(column="revenue", aggfunc="sum"))

sel = agg.loc[
:,
[
"c_custkey",
"c_name",
"revenue",
"c_acctbal",
"n_name",
"c_address",
"c_phone",
"c_comment",
],
]

result_df = sel.sort_values("revenue", ascending=False).head(20)

return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)


if __name__ == "__main__":
q()
53 changes: 53 additions & 0 deletions queries/pandas/q11.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from __future__ import annotations

import pandas as pd

from queries.pandas import utils
from settings import Settings

Q_NUM = 11

settings = Settings()


def q() -> None:
nation_ds_fn = utils.get_nation_ds
part_supp_ds_fn = utils.get_part_supp_ds
supplier_ds_fn = utils.get_supplier_ds

# first call one time to cache in case we don't include the IO times
nation_ds_fn()
part_supp_ds_fn()
supplier_ds_fn()

def query() -> pd.DataFrame:
nation_ds = nation_ds_fn()
part_supp_ds = part_supp_ds_fn()
supplier_ds = supplier_ds_fn()

var1 = "GERMANY"
var2 = 0.0001 / settings.scale_factor

jn1 = part_supp_ds.merge(
supplier_ds, left_on="ps_suppkey", right_on="s_suppkey"
)
jn2 = jn1.merge(nation_ds, left_on="s_nationkey", right_on="n_nationkey")
jn2 = jn2[jn2["n_name"] == var1]

jn2["value"] = jn2["ps_supplycost"] * jn2["ps_availqty"]

threshold = jn2["value"].sum() * var2

gb = jn2.groupby("ps_partkey", as_index=False)
agg = gb.agg(value=pd.NamedAgg(column="value", aggfunc="sum"))

result = agg[agg["value"] > threshold]
result_df = result.sort_values("value", ascending=False)

return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)


if __name__ == "__main__":
q()
51 changes: 51 additions & 0 deletions queries/pandas/q12.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from __future__ import annotations

import pandas as pd

from queries.pandas import utils

Q_NUM = 12


def q() -> None:
lineitem_ds_fn = utils.get_line_item_ds
orders_ds_fn = utils.get_orders_ds

# first call one time to cache in case we don't include the IO times
lineitem_ds_fn()
orders_ds_fn()

def query() -> pd.DataFrame:
lineitem_ds = lineitem_ds_fn()
orders_ds = orders_ds_fn()

var1 = "MAIL"
var2 = "SHIP"
var3 = pd.Timestamp("1994-01-01")
var4 = pd.Timestamp("1995-01-01")

jn = orders_ds.merge(lineitem_ds, left_on="o_orderkey", right_on="l_orderkey")

jn = jn[jn["l_shipmode"].isin([var1, var2])]
jn = jn[jn["l_commitdate"] < jn["l_receiptdate"]]
jn = jn[jn["l_shipdate"] < jn["l_commitdate"]]
jn = jn[(jn["l_receiptdate"] >= var3) & (jn["l_receiptdate"] < var4)]

jn["high_line_count"] = jn["o_orderpriority"].isin(["1-URGENT", "2-HIGH"])
jn["low_line_count"] = ~jn["o_orderpriority"].isin(["1-URGENT", "2-HIGH"])

gb = jn.groupby("l_shipmode", as_index=False)
agg = gb.agg(
high_line_count=pd.NamedAgg(column="high_line_count", aggfunc="sum"),
low_line_count=pd.NamedAgg(column="low_line_count", aggfunc="sum"),
)

result_df = agg.sort_values("l_shipmode")

return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)


if __name__ == "__main__":
q()
55 changes: 55 additions & 0 deletions queries/pandas/q13.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import annotations

import pandas as pd

from queries.pandas import utils

Q_NUM = 13


def q() -> None:
customer_ds_fn = utils.get_customer_ds
orders_ds_fn = utils.get_orders_ds

# first call one time to cache in case we don't include the IO times
customer_ds_fn()
orders_ds_fn()

def query() -> pd.DataFrame:
customer_ds = customer_ds_fn()
orders_ds = orders_ds_fn()

var1 = "special"
var2 = "requests"

filtered_orders = orders_ds[
~orders_ds["o_comment"].str.contains(
f"{var1}.*{var2}", regex=True, na=False
)
]

jn = customer_ds.merge(
filtered_orders,
left_on="c_custkey",
right_on="o_custkey",
how="left",
)

gb1 = jn.groupby("c_custkey", as_index=False)
agg1 = gb1.agg(c_count=pd.NamedAgg(column="o_orderkey", aggfunc="count"))

gb2 = agg1.groupby("c_count", as_index=False)
agg2 = gb2.size()
agg2.columns = ["c_count", "custdist"]

result_df = agg2.sort_values(
by=["custdist", "c_count"], ascending=[False, False]
)

return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)


if __name__ == "__main__":
q()
46 changes: 46 additions & 0 deletions queries/pandas/q14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

import pandas as pd

from queries.pandas import utils

Q_NUM = 14


def q() -> None:
lineitem_ds_fn = utils.get_line_item_ds
part_ds_fn = utils.get_part_ds

# first call one time to cache in case we don't include the IO times
lineitem_ds_fn()
part_ds_fn()

def query() -> pd.DataFrame:
lineitem_ds = lineitem_ds_fn()
part_ds = part_ds_fn()

var1 = pd.Timestamp("1995-09-01")
var2 = pd.Timestamp("1995-10-01")

jn = lineitem_ds.merge(part_ds, left_on="l_partkey", right_on="p_partkey")

jn = jn[(jn["l_shipdate"] >= var1) & (jn["l_shipdate"] < var2)]

jn["revenue"] = jn["l_extendedprice"] * (1 - jn["l_discount"])
jn["promo_revenue"] = jn["revenue"].where(
jn["p_type"].str.startswith("PROMO"), 0
)

promo_revenue = (100.0 * jn["promo_revenue"].sum() / jn["revenue"].sum()).round(
2
)

result_df = pd.DataFrame({"promo_revenue": [promo_revenue]})

return result_df

utils.run_query(Q_NUM, query)


if __name__ == "__main__":
q()
55 changes: 55 additions & 0 deletions queries/pandas/q15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import annotations

import pandas as pd

from queries.pandas import utils

Q_NUM = 15


def q() -> None:
lineitem_ds_fn = utils.get_line_item_ds
supplier_ds_fn = utils.get_supplier_ds

# first call one time to cache in case we don't include the IO times
lineitem_ds_fn()
supplier_ds_fn()

def query() -> pd.DataFrame:
lineitem_ds = lineitem_ds_fn()
supplier_ds = supplier_ds_fn()

var1 = pd.Timestamp("1996-01-01")
var2 = pd.Timestamp("1996-04-01")

filtered_lineitem = lineitem_ds[
(lineitem_ds["l_shipdate"] >= var1) & (lineitem_ds["l_shipdate"] < var2)
]

filtered_lineitem["revenue"] = filtered_lineitem["l_extendedprice"] * (
1 - filtered_lineitem["l_discount"]
)

revenue = filtered_lineitem.groupby("l_suppkey", as_index=False).agg(
total_revenue=pd.NamedAgg(column="revenue", aggfunc="sum")
)
revenue = revenue.rename(columns={"l_suppkey": "supplier_no"})

max_revenue = revenue["total_revenue"].max()

jn = supplier_ds.merge(revenue, left_on="s_suppkey", right_on="supplier_no")
jn = jn[jn["total_revenue"] == max_revenue]

result = jn.loc[
:, ["s_suppkey", "s_name", "s_address", "s_phone", "total_revenue"]
]

result_df = result.sort_values("s_suppkey")

return result_df # type: ignore[no-any-return]

utils.run_query(Q_NUM, query)


if __name__ == "__main__":
q()
Loading