|
25 | 25 | import os |
26 | 26 |
|
27 | 27 | import datafusion |
28 | | -import pyarrow |
| 28 | +import pyarrow as pa |
29 | 29 |
|
30 | 30 | ctx = datafusion.SessionContext() |
31 | 31 |
|
32 | 32 | all_schemas = {} |
33 | 33 |
|
34 | 34 | all_schemas["customer"] = [ |
35 | | - ("C_CUSTKEY", pyarrow.int64()), |
36 | | - ("C_NAME", pyarrow.string()), |
37 | | - ("C_ADDRESS", pyarrow.string()), |
38 | | - ("C_NATIONKEY", pyarrow.int64()), |
39 | | - ("C_PHONE", pyarrow.string()), |
40 | | - ("C_ACCTBAL", pyarrow.decimal128(15, 2)), |
41 | | - ("C_MKTSEGMENT", pyarrow.string()), |
42 | | - ("C_COMMENT", pyarrow.string()), |
| 35 | + ("C_CUSTKEY", pa.int64()), |
| 36 | + ("C_NAME", pa.string()), |
| 37 | + ("C_ADDRESS", pa.string()), |
| 38 | + ("C_NATIONKEY", pa.int64()), |
| 39 | + ("C_PHONE", pa.string()), |
| 40 | + ("C_ACCTBAL", pa.decimal128(15, 2)), |
| 41 | + ("C_MKTSEGMENT", pa.string()), |
| 42 | + ("C_COMMENT", pa.string()), |
43 | 43 | ] |
44 | 44 |
|
45 | 45 | all_schemas["lineitem"] = [ |
46 | | - ("L_ORDERKEY", pyarrow.int64()), |
47 | | - ("L_PARTKEY", pyarrow.int64()), |
48 | | - ("L_SUPPKEY", pyarrow.int64()), |
49 | | - ("L_LINENUMBER", pyarrow.int32()), |
50 | | - ("L_QUANTITY", pyarrow.decimal128(15, 2)), |
51 | | - ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)), |
52 | | - ("L_DISCOUNT", pyarrow.decimal128(15, 2)), |
53 | | - ("L_TAX", pyarrow.decimal128(15, 2)), |
54 | | - ("L_RETURNFLAG", pyarrow.string()), |
55 | | - ("L_LINESTATUS", pyarrow.string()), |
56 | | - ("L_SHIPDATE", pyarrow.date32()), |
57 | | - ("L_COMMITDATE", pyarrow.date32()), |
58 | | - ("L_RECEIPTDATE", pyarrow.date32()), |
59 | | - ("L_SHIPINSTRUCT", pyarrow.string()), |
60 | | - ("L_SHIPMODE", pyarrow.string()), |
61 | | - ("L_COMMENT", pyarrow.string()), |
| 46 | + ("L_ORDERKEY", pa.int64()), |
| 47 | + ("L_PARTKEY", pa.int64()), |
| 48 | + ("L_SUPPKEY", pa.int64()), |
| 49 | + ("L_LINENUMBER", pa.int32()), |
| 50 | + ("L_QUANTITY", pa.decimal128(15, 2)), |
| 51 | + ("L_EXTENDEDPRICE", pa.decimal128(15, 2)), |
| 52 | + ("L_DISCOUNT", pa.decimal128(15, 2)), |
| 53 | + ("L_TAX", pa.decimal128(15, 2)), |
| 54 | + ("L_RETURNFLAG", pa.string()), |
| 55 | + ("L_LINESTATUS", pa.string()), |
| 56 | + ("L_SHIPDATE", pa.date32()), |
| 57 | + ("L_COMMITDATE", pa.date32()), |
| 58 | + ("L_RECEIPTDATE", pa.date32()), |
| 59 | + ("L_SHIPINSTRUCT", pa.string()), |
| 60 | + ("L_SHIPMODE", pa.string()), |
| 61 | + ("L_COMMENT", pa.string()), |
62 | 62 | ] |
63 | 63 |
|
64 | 64 | all_schemas["nation"] = [ |
65 | | - ("N_NATIONKEY", pyarrow.int64()), |
66 | | - ("N_NAME", pyarrow.string()), |
67 | | - ("N_REGIONKEY", pyarrow.int64()), |
68 | | - ("N_COMMENT", pyarrow.string()), |
| 65 | + ("N_NATIONKEY", pa.int64()), |
| 66 | + ("N_NAME", pa.string()), |
| 67 | + ("N_REGIONKEY", pa.int64()), |
| 68 | + ("N_COMMENT", pa.string()), |
69 | 69 | ] |
70 | 70 |
|
71 | 71 | all_schemas["orders"] = [ |
72 | | - ("O_ORDERKEY", pyarrow.int64()), |
73 | | - ("O_CUSTKEY", pyarrow.int64()), |
74 | | - ("O_ORDERSTATUS", pyarrow.string()), |
75 | | - ("O_TOTALPRICE", pyarrow.decimal128(15, 2)), |
76 | | - ("O_ORDERDATE", pyarrow.date32()), |
77 | | - ("O_ORDERPRIORITY", pyarrow.string()), |
78 | | - ("O_CLERK", pyarrow.string()), |
79 | | - ("O_SHIPPRIORITY", pyarrow.int32()), |
80 | | - ("O_COMMENT", pyarrow.string()), |
| 72 | + ("O_ORDERKEY", pa.int64()), |
| 73 | + ("O_CUSTKEY", pa.int64()), |
| 74 | + ("O_ORDERSTATUS", pa.string()), |
| 75 | + ("O_TOTALPRICE", pa.decimal128(15, 2)), |
| 76 | + ("O_ORDERDATE", pa.date32()), |
| 77 | + ("O_ORDERPRIORITY", pa.string()), |
| 78 | + ("O_CLERK", pa.string()), |
| 79 | + ("O_SHIPPRIORITY", pa.int32()), |
| 80 | + ("O_COMMENT", pa.string()), |
81 | 81 | ] |
82 | 82 |
|
83 | 83 | all_schemas["part"] = [ |
84 | | - ("P_PARTKEY", pyarrow.int64()), |
85 | | - ("P_NAME", pyarrow.string()), |
86 | | - ("P_MFGR", pyarrow.string()), |
87 | | - ("P_BRAND", pyarrow.string()), |
88 | | - ("P_TYPE", pyarrow.string()), |
89 | | - ("P_SIZE", pyarrow.int32()), |
90 | | - ("P_CONTAINER", pyarrow.string()), |
91 | | - ("P_RETAILPRICE", pyarrow.decimal128(15, 2)), |
92 | | - ("P_COMMENT", pyarrow.string()), |
| 84 | + ("P_PARTKEY", pa.int64()), |
| 85 | + ("P_NAME", pa.string()), |
| 86 | + ("P_MFGR", pa.string()), |
| 87 | + ("P_BRAND", pa.string()), |
| 88 | + ("P_TYPE", pa.string()), |
| 89 | + ("P_SIZE", pa.int32()), |
| 90 | + ("P_CONTAINER", pa.string()), |
| 91 | + ("P_RETAILPRICE", pa.decimal128(15, 2)), |
| 92 | + ("P_COMMENT", pa.string()), |
93 | 93 | ] |
94 | 94 |
|
95 | 95 | all_schemas["partsupp"] = [ |
96 | | - ("PS_PARTKEY", pyarrow.int64()), |
97 | | - ("PS_SUPPKEY", pyarrow.int64()), |
98 | | - ("PS_AVAILQTY", pyarrow.int32()), |
99 | | - ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)), |
100 | | - ("PS_COMMENT", pyarrow.string()), |
| 96 | + ("PS_PARTKEY", pa.int64()), |
| 97 | + ("PS_SUPPKEY", pa.int64()), |
| 98 | + ("PS_AVAILQTY", pa.int32()), |
| 99 | + ("PS_SUPPLYCOST", pa.decimal128(15, 2)), |
| 100 | + ("PS_COMMENT", pa.string()), |
101 | 101 | ] |
102 | 102 |
|
103 | 103 | all_schemas["region"] = [ |
104 | | - ("r_REGIONKEY", pyarrow.int64()), |
105 | | - ("r_NAME", pyarrow.string()), |
106 | | - ("r_COMMENT", pyarrow.string()), |
| 104 | + ("r_REGIONKEY", pa.int64()), |
| 105 | + ("r_NAME", pa.string()), |
| 106 | + ("r_COMMENT", pa.string()), |
107 | 107 | ] |
108 | 108 |
|
109 | 109 | all_schemas["supplier"] = [ |
110 | | - ("S_SUPPKEY", pyarrow.int64()), |
111 | | - ("S_NAME", pyarrow.string()), |
112 | | - ("S_ADDRESS", pyarrow.string()), |
113 | | - ("S_NATIONKEY", pyarrow.int32()), |
114 | | - ("S_PHONE", pyarrow.string()), |
115 | | - ("S_ACCTBAL", pyarrow.decimal128(15, 2)), |
116 | | - ("S_COMMENT", pyarrow.string()), |
| 110 | + ("S_SUPPKEY", pa.int64()), |
| 111 | + ("S_NAME", pa.string()), |
| 112 | + ("S_ADDRESS", pa.string()), |
| 113 | + ("S_NATIONKEY", pa.int32()), |
| 114 | + ("S_PHONE", pa.string()), |
| 115 | + ("S_ACCTBAL", pa.decimal128(15, 2)), |
| 116 | + ("S_COMMENT", pa.string()), |
117 | 117 | ] |
118 | 118 |
|
119 | 119 | curr_dir = os.path.dirname(os.path.abspath(__file__)) |
|
125 | 125 | # in to handle the trailing | in the file |
126 | 126 | output_cols = [r[0] for r in curr_schema] |
127 | 127 |
|
128 | | - curr_schema = [pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema] |
| 128 | + curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema] |
129 | 129 |
|
130 | 130 | # Trailing | requires extra field for in processing |
131 | | - curr_schema.append(("some_null", pyarrow.null())) |
| 131 | + curr_schema.append(("some_null", pa.null())) |
132 | 132 |
|
133 | | - schema = pyarrow.schema(curr_schema) |
| 133 | + schema = pa.schema(curr_schema) |
134 | 134 |
|
135 | 135 | source_file = os.path.abspath( |
136 | 136 | os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv") |
|
0 commit comments