| 
25 | 25 | import os  | 
26 | 26 | 
 
  | 
27 | 27 | import datafusion  | 
28 |  | -import pyarrow  | 
 | 28 | +import pyarrow as pa  | 
29 | 29 | 
 
  | 
30 | 30 | ctx = datafusion.SessionContext()  | 
31 | 31 | 
 
  | 
32 | 32 | all_schemas = {}  | 
33 | 33 | 
 
  | 
34 | 34 | all_schemas["customer"] = [  | 
35 |  | -    ("C_CUSTKEY", pyarrow.int64()),  | 
36 |  | -    ("C_NAME", pyarrow.string()),  | 
37 |  | -    ("C_ADDRESS", pyarrow.string()),  | 
38 |  | -    ("C_NATIONKEY", pyarrow.int64()),  | 
39 |  | -    ("C_PHONE", pyarrow.string()),  | 
40 |  | -    ("C_ACCTBAL", pyarrow.decimal128(15, 2)),  | 
41 |  | -    ("C_MKTSEGMENT", pyarrow.string()),  | 
42 |  | -    ("C_COMMENT", pyarrow.string()),  | 
 | 35 | +    ("C_CUSTKEY", pa.int64()),  | 
 | 36 | +    ("C_NAME", pa.string()),  | 
 | 37 | +    ("C_ADDRESS", pa.string()),  | 
 | 38 | +    ("C_NATIONKEY", pa.int64()),  | 
 | 39 | +    ("C_PHONE", pa.string()),  | 
 | 40 | +    ("C_ACCTBAL", pa.decimal128(15, 2)),  | 
 | 41 | +    ("C_MKTSEGMENT", pa.string()),  | 
 | 42 | +    ("C_COMMENT", pa.string()),  | 
43 | 43 | ]  | 
44 | 44 | 
 
  | 
45 | 45 | all_schemas["lineitem"] = [  | 
46 |  | -    ("L_ORDERKEY", pyarrow.int64()),  | 
47 |  | -    ("L_PARTKEY", pyarrow.int64()),  | 
48 |  | -    ("L_SUPPKEY", pyarrow.int64()),  | 
49 |  | -    ("L_LINENUMBER", pyarrow.int32()),  | 
50 |  | -    ("L_QUANTITY", pyarrow.decimal128(15, 2)),  | 
51 |  | -    ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)),  | 
52 |  | -    ("L_DISCOUNT", pyarrow.decimal128(15, 2)),  | 
53 |  | -    ("L_TAX", pyarrow.decimal128(15, 2)),  | 
54 |  | -    ("L_RETURNFLAG", pyarrow.string()),  | 
55 |  | -    ("L_LINESTATUS", pyarrow.string()),  | 
56 |  | -    ("L_SHIPDATE", pyarrow.date32()),  | 
57 |  | -    ("L_COMMITDATE", pyarrow.date32()),  | 
58 |  | -    ("L_RECEIPTDATE", pyarrow.date32()),  | 
59 |  | -    ("L_SHIPINSTRUCT", pyarrow.string()),  | 
60 |  | -    ("L_SHIPMODE", pyarrow.string()),  | 
61 |  | -    ("L_COMMENT", pyarrow.string()),  | 
 | 46 | +    ("L_ORDERKEY", pa.int64()),  | 
 | 47 | +    ("L_PARTKEY", pa.int64()),  | 
 | 48 | +    ("L_SUPPKEY", pa.int64()),  | 
 | 49 | +    ("L_LINENUMBER", pa.int32()),  | 
 | 50 | +    ("L_QUANTITY", pa.decimal128(15, 2)),  | 
 | 51 | +    ("L_EXTENDEDPRICE", pa.decimal128(15, 2)),  | 
 | 52 | +    ("L_DISCOUNT", pa.decimal128(15, 2)),  | 
 | 53 | +    ("L_TAX", pa.decimal128(15, 2)),  | 
 | 54 | +    ("L_RETURNFLAG", pa.string()),  | 
 | 55 | +    ("L_LINESTATUS", pa.string()),  | 
 | 56 | +    ("L_SHIPDATE", pa.date32()),  | 
 | 57 | +    ("L_COMMITDATE", pa.date32()),  | 
 | 58 | +    ("L_RECEIPTDATE", pa.date32()),  | 
 | 59 | +    ("L_SHIPINSTRUCT", pa.string()),  | 
 | 60 | +    ("L_SHIPMODE", pa.string()),  | 
 | 61 | +    ("L_COMMENT", pa.string()),  | 
62 | 62 | ]  | 
63 | 63 | 
 
  | 
64 | 64 | all_schemas["nation"] = [  | 
65 |  | -    ("N_NATIONKEY", pyarrow.int64()),  | 
66 |  | -    ("N_NAME", pyarrow.string()),  | 
67 |  | -    ("N_REGIONKEY", pyarrow.int64()),  | 
68 |  | -    ("N_COMMENT", pyarrow.string()),  | 
 | 65 | +    ("N_NATIONKEY", pa.int64()),  | 
 | 66 | +    ("N_NAME", pa.string()),  | 
 | 67 | +    ("N_REGIONKEY", pa.int64()),  | 
 | 68 | +    ("N_COMMENT", pa.string()),  | 
69 | 69 | ]  | 
70 | 70 | 
 
  | 
71 | 71 | all_schemas["orders"] = [  | 
72 |  | -    ("O_ORDERKEY", pyarrow.int64()),  | 
73 |  | -    ("O_CUSTKEY", pyarrow.int64()),  | 
74 |  | -    ("O_ORDERSTATUS", pyarrow.string()),  | 
75 |  | -    ("O_TOTALPRICE", pyarrow.decimal128(15, 2)),  | 
76 |  | -    ("O_ORDERDATE", pyarrow.date32()),  | 
77 |  | -    ("O_ORDERPRIORITY", pyarrow.string()),  | 
78 |  | -    ("O_CLERK", pyarrow.string()),  | 
79 |  | -    ("O_SHIPPRIORITY", pyarrow.int32()),  | 
80 |  | -    ("O_COMMENT", pyarrow.string()),  | 
 | 72 | +    ("O_ORDERKEY", pa.int64()),  | 
 | 73 | +    ("O_CUSTKEY", pa.int64()),  | 
 | 74 | +    ("O_ORDERSTATUS", pa.string()),  | 
 | 75 | +    ("O_TOTALPRICE", pa.decimal128(15, 2)),  | 
 | 76 | +    ("O_ORDERDATE", pa.date32()),  | 
 | 77 | +    ("O_ORDERPRIORITY", pa.string()),  | 
 | 78 | +    ("O_CLERK", pa.string()),  | 
 | 79 | +    ("O_SHIPPRIORITY", pa.int32()),  | 
 | 80 | +    ("O_COMMENT", pa.string()),  | 
81 | 81 | ]  | 
82 | 82 | 
 
  | 
83 | 83 | all_schemas["part"] = [  | 
84 |  | -    ("P_PARTKEY", pyarrow.int64()),  | 
85 |  | -    ("P_NAME", pyarrow.string()),  | 
86 |  | -    ("P_MFGR", pyarrow.string()),  | 
87 |  | -    ("P_BRAND", pyarrow.string()),  | 
88 |  | -    ("P_TYPE", pyarrow.string()),  | 
89 |  | -    ("P_SIZE", pyarrow.int32()),  | 
90 |  | -    ("P_CONTAINER", pyarrow.string()),  | 
91 |  | -    ("P_RETAILPRICE", pyarrow.decimal128(15, 2)),  | 
92 |  | -    ("P_COMMENT", pyarrow.string()),  | 
 | 84 | +    ("P_PARTKEY", pa.int64()),  | 
 | 85 | +    ("P_NAME", pa.string()),  | 
 | 86 | +    ("P_MFGR", pa.string()),  | 
 | 87 | +    ("P_BRAND", pa.string()),  | 
 | 88 | +    ("P_TYPE", pa.string()),  | 
 | 89 | +    ("P_SIZE", pa.int32()),  | 
 | 90 | +    ("P_CONTAINER", pa.string()),  | 
 | 91 | +    ("P_RETAILPRICE", pa.decimal128(15, 2)),  | 
 | 92 | +    ("P_COMMENT", pa.string()),  | 
93 | 93 | ]  | 
94 | 94 | 
 
  | 
95 | 95 | all_schemas["partsupp"] = [  | 
96 |  | -    ("PS_PARTKEY", pyarrow.int64()),  | 
97 |  | -    ("PS_SUPPKEY", pyarrow.int64()),  | 
98 |  | -    ("PS_AVAILQTY", pyarrow.int32()),  | 
99 |  | -    ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)),  | 
100 |  | -    ("PS_COMMENT", pyarrow.string()),  | 
 | 96 | +    ("PS_PARTKEY", pa.int64()),  | 
 | 97 | +    ("PS_SUPPKEY", pa.int64()),  | 
 | 98 | +    ("PS_AVAILQTY", pa.int32()),  | 
 | 99 | +    ("PS_SUPPLYCOST", pa.decimal128(15, 2)),  | 
 | 100 | +    ("PS_COMMENT", pa.string()),  | 
101 | 101 | ]  | 
102 | 102 | 
 
  | 
103 | 103 | all_schemas["region"] = [  | 
104 |  | -    ("r_REGIONKEY", pyarrow.int64()),  | 
105 |  | -    ("r_NAME", pyarrow.string()),  | 
106 |  | -    ("r_COMMENT", pyarrow.string()),  | 
 | 104 | +    ("r_REGIONKEY", pa.int64()),  | 
 | 105 | +    ("r_NAME", pa.string()),  | 
 | 106 | +    ("r_COMMENT", pa.string()),  | 
107 | 107 | ]  | 
108 | 108 | 
 
  | 
109 | 109 | all_schemas["supplier"] = [  | 
110 |  | -    ("S_SUPPKEY", pyarrow.int64()),  | 
111 |  | -    ("S_NAME", pyarrow.string()),  | 
112 |  | -    ("S_ADDRESS", pyarrow.string()),  | 
113 |  | -    ("S_NATIONKEY", pyarrow.int32()),  | 
114 |  | -    ("S_PHONE", pyarrow.string()),  | 
115 |  | -    ("S_ACCTBAL", pyarrow.decimal128(15, 2)),  | 
116 |  | -    ("S_COMMENT", pyarrow.string()),  | 
 | 110 | +    ("S_SUPPKEY", pa.int64()),  | 
 | 111 | +    ("S_NAME", pa.string()),  | 
 | 112 | +    ("S_ADDRESS", pa.string()),  | 
 | 113 | +    ("S_NATIONKEY", pa.int32()),  | 
 | 114 | +    ("S_PHONE", pa.string()),  | 
 | 115 | +    ("S_ACCTBAL", pa.decimal128(15, 2)),  | 
 | 116 | +    ("S_COMMENT", pa.string()),  | 
117 | 117 | ]  | 
118 | 118 | 
 
  | 
119 | 119 | curr_dir = os.path.dirname(os.path.abspath(__file__))  | 
 | 
125 | 125 |     # in to handle the trailing | in the file  | 
126 | 126 |     output_cols = [r[0] for r in curr_schema]  | 
127 | 127 | 
 
  | 
128 |  | -    curr_schema = [pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema]  | 
 | 128 | +    curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema]  | 
129 | 129 | 
 
  | 
130 | 130 |     # Trailing | requires extra field for in processing  | 
131 |  | -    curr_schema.append(("some_null", pyarrow.null()))  | 
 | 131 | +    curr_schema.append(("some_null", pa.null()))  | 
132 | 132 | 
 
  | 
133 |  | -    schema = pyarrow.schema(curr_schema)  | 
 | 133 | +    schema = pa.schema(curr_schema)  | 
134 | 134 | 
 
  | 
135 | 135 |     source_file = os.path.abspath(  | 
136 | 136 |         os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv")  | 
 | 
0 commit comments