Skip to content

Commit 998fc4f

Browse files
committed
add script to load data in bigquery and databricks
1 parent aaeeb47 commit 998fc4f

File tree

3 files changed

+115
-0
lines changed

3 files changed

+115
-0
lines changed

blog-examples/Bench2Cost/bigquery/clickbench/bigquery_extended/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,9 @@ This shows the dataset compresses roughly **10× smaller** on disk compared to t
9999
|-----------------|------------------------------|--------------------------------|
100100
| Active storage | 94.4 GiB | 8.8 GiB |
101101
| Long-term (>90d)| 0 GiB | 0 GiB |
102+
103+
## Inserting large amounts of data
104+
105+
BigQuery’s on-demand model charges based on the amount of data scanned. If you try to scale a 1-billion-row table to 100 billion rows by running `insert into hits_100b select * from hits_1b` a hundred times, the same data is scanned repeatedly, which can become expensive.
106+
107+
You can avoid this by duplicating the rows during a single scan. One option is `CROSS JOIN UNNEST(GENERATE_ARRAY(1, 20))`, which emits 20 copies of each row without rereading the source table. The `load_data.sql` script shows how to apply this approach.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- Fill `test.hits_100B` with 20 copies of every row of `test.hits_1B`
-- (20 billion rows, assuming the source holds 1 billion rows).
-- GENERATE_ARRAY(1, 20) yields 20 elements; cross joining against it repeats
-- each source row once per element, so the copies come from a single pass
-- over the source instead of 20 separate INSERT ... SELECT scans.
INSERT INTO `test.hits_100B`
SELECT src.*                                        -- source columns only; copy_no is not selected
FROM `test.hits_1B` AS src
CROSS JOIN UNNEST(GENERATE_ARRAY(1, 20)) AS copy_no;
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
-- Session defaults; the table below is still referenced by its full
-- three-part name.
USE CATALOG bench2cost;
USE SCHEMA hits;

-- (Re)create the clustered hits table directly from the Parquet files in S3.
-- Columns wrapped in CAST are given an explicit integer / date / timestamp
-- type; columns selected bare are passed through with whatever type
-- read_files produces for them.
-- The table name starts with a digit, so it is back-quoted.
CREATE OR REPLACE TABLE bench2cost.hits.`100m_clustered`
CLUSTER BY (CounterID, EventDate, UserID, EventTime)
AS
SELECT
    CAST(src.WatchID AS BIGINT)                 AS WatchID,
    CAST(src.JavaEnable AS SMALLINT)            AS JavaEnable,
    src.Title,
    CAST(src.GoodEvent AS SMALLINT)             AS GoodEvent,
    CAST(src.EventTime AS TIMESTAMP)            AS EventTime,
    CAST(src.EventDate AS DATE)                 AS EventDate,
    CAST(src.CounterID AS INT)                  AS CounterID,
    CAST(src.ClientIP AS INT)                   AS ClientIP,
    CAST(src.RegionID AS INT)                   AS RegionID,
    CAST(src.UserID AS BIGINT)                  AS UserID,
    CAST(src.CounterClass AS SMALLINT)          AS CounterClass,
    CAST(src.OS AS SMALLINT)                    AS OS,
    CAST(src.UserAgent AS SMALLINT)             AS UserAgent,
    src.URL,
    src.Referer,
    CAST(src.IsRefresh AS SMALLINT)             AS IsRefresh,
    CAST(src.RefererCategoryID AS SMALLINT)     AS RefererCategoryID,
    CAST(src.RefererRegionID AS INT)            AS RefererRegionID,
    CAST(src.URLCategoryID AS SMALLINT)         AS URLCategoryID,
    CAST(src.URLRegionID AS INT)                AS URLRegionID,
    CAST(src.ResolutionWidth AS SMALLINT)       AS ResolutionWidth,
    CAST(src.ResolutionHeight AS SMALLINT)      AS ResolutionHeight,
    CAST(src.ResolutionDepth AS SMALLINT)       AS ResolutionDepth,
    CAST(src.FlashMajor AS SMALLINT)            AS FlashMajor,
    CAST(src.FlashMinor AS SMALLINT)            AS FlashMinor,
    src.FlashMinor2,
    CAST(src.NetMajor AS SMALLINT)              AS NetMajor,
    CAST(src.NetMinor AS SMALLINT)              AS NetMinor,
    CAST(src.UserAgentMajor AS SMALLINT)        AS UserAgentMajor,
    src.UserAgentMinor,
    CAST(src.CookieEnable AS SMALLINT)          AS CookieEnable,
    CAST(src.JavascriptEnable AS SMALLINT)      AS JavascriptEnable,
    CAST(src.IsMobile AS SMALLINT)              AS IsMobile,
    CAST(src.MobilePhone AS SMALLINT)           AS MobilePhone,
    src.MobilePhoneModel,
    src.Params,
    CAST(src.IPNetworkID AS INT)                AS IPNetworkID,
    CAST(src.TraficSourceID AS SMALLINT)        AS TraficSourceID,
    CAST(src.SearchEngineID AS SMALLINT)        AS SearchEngineID,
    src.SearchPhrase,
    CAST(src.AdvEngineID AS SMALLINT)           AS AdvEngineID,
    CAST(src.IsArtifical AS SMALLINT)           AS IsArtifical,
    CAST(src.WindowClientWidth AS SMALLINT)     AS WindowClientWidth,
    CAST(src.WindowClientHeight AS SMALLINT)    AS WindowClientHeight,
    CAST(src.ClientTimeZone AS SMALLINT)        AS ClientTimeZone,
    CAST(src.ClientEventTime AS TIMESTAMP)      AS ClientEventTime,
    CAST(src.SilverlightVersion1 AS SMALLINT)   AS SilverlightVersion1,
    CAST(src.SilverlightVersion2 AS SMALLINT)   AS SilverlightVersion2,
    CAST(src.SilverlightVersion3 AS INT)        AS SilverlightVersion3,
    CAST(src.SilverlightVersion4 AS SMALLINT)   AS SilverlightVersion4,
    src.PageCharset,
    CAST(src.CodeVersion AS INT)                AS CodeVersion,
    CAST(src.IsLink AS SMALLINT)                AS IsLink,
    CAST(src.IsDownload AS SMALLINT)            AS IsDownload,
    CAST(src.IsNotBounce AS SMALLINT)           AS IsNotBounce,
    CAST(src.FUniqID AS BIGINT)                 AS FUniqID,
    src.OriginalURL,
    CAST(src.HID AS INT)                        AS HID,
    CAST(src.IsOldCounter AS SMALLINT)          AS IsOldCounter,
    CAST(src.IsEvent AS SMALLINT)               AS IsEvent,
    CAST(src.IsParameter AS SMALLINT)           AS IsParameter,
    CAST(src.DontCountHits AS SMALLINT)         AS DontCountHits,
    CAST(src.WithHash AS SMALLINT)              AS WithHash,
    src.HitColor,
    CAST(src.LocalEventTime AS TIMESTAMP)       AS LocalEventTime,
    CAST(src.Age AS SMALLINT)                   AS Age,
    CAST(src.Sex AS SMALLINT)                   AS Sex,
    CAST(src.Income AS SMALLINT)                AS Income,
    CAST(src.Interests AS SMALLINT)             AS Interests,
    CAST(src.Robotness AS SMALLINT)             AS Robotness,
    CAST(src.RemoteIP AS INT)                   AS RemoteIP,
    CAST(src.WindowName AS INT)                 AS WindowName,
    CAST(src.OpenerName AS INT)                 AS OpenerName,
    CAST(src.HistoryLength AS SMALLINT)         AS HistoryLength,
    src.BrowserLanguage,
    src.BrowserCountry,
    src.SocialNetwork,
    src.SocialAction,
    CAST(src.HTTPError AS SMALLINT)             AS HTTPError,
    CAST(src.SendTiming AS INT)                 AS SendTiming,
    CAST(src.DNSTiming AS INT)                  AS DNSTiming,
    CAST(src.ConnectTiming AS INT)              AS ConnectTiming,
    CAST(src.ResponseStartTiming AS INT)        AS ResponseStartTiming,
    CAST(src.ResponseEndTiming AS INT)          AS ResponseEndTiming,
    CAST(src.FetchTiming AS INT)                AS FetchTiming,
    CAST(src.SocialSourceNetworkID AS SMALLINT) AS SocialSourceNetworkID,
    src.SocialSourcePage,
    CAST(src.ParamPrice AS BIGINT)              AS ParamPrice,
    src.ParamOrderID,
    src.ParamCurrency,
    CAST(src.ParamCurrencyID AS SMALLINT)       AS ParamCurrencyID,
    src.OpenstatServiceName,
    src.OpenstatCampaignID,
    src.OpenstatAdID,
    src.OpenstatSourceID,
    src.UTMSource,
    src.UTMMedium,
    src.UTMCampaign,
    src.UTMContent,
    src.UTMTerm,
    src.FromTag,
    CAST(src.HasGCLID AS SMALLINT)              AS HasGCLID,
    CAST(src.RefererHash AS BIGINT)             AS RefererHash,
    CAST(src.URLHash AS BIGINT)                 AS URLHash,
    CAST(src.CLID AS INT)                       AS CLID
FROM read_files(
    's3://hits-parquet-100m-sorted-zstd/*.parquet',
    format => 'parquet'
) AS src;

0 commit comments

Comments
 (0)