Skip to content

Commit 7e079fd

Browse files
Merge branch 'cloudfetch' into main (#154)
Supports executing queries with Cloud Fetch for increased performance and caching. Steps taken: - Synced fork `mattdeekay` for both `cloudfetch` and `main` branches - On `mattdeekay:main`, ran `git merge --squash cloudfetch` - Resolved merge conflicts - Fixed `cloudfetch_test.go` end-to-end test to (WithEnableCloudFetch -> WithCloudFetch) - Commit and create PR - Fix `connector_test.go` to add cloud fetch (forgot to add earlier) - Add link expiration test to `batchloader_test.go` - Fix `arrowRows_test.go` - `golangci-lint run`
2 parents 65bde57 + b8c87f7 commit 7e079fd

File tree

17 files changed

+1254
-275
lines changed

17 files changed

+1254
-275
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ You can set query timeout value by appending a `timeout` query parameter (in sec
4646
```
4747
token:[your token]@[Workspace hostname]:[Port number][Endpoint HTTP Path]?timeout=1000&maxRows=1000
4848
```
49+
You can turn on Cloud Fetch to increase the performance of extracting large query results by fetching data in parallel via cloud storage (more info [here](https://www.databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)). To turn on Cloud Fetch, append `useCloudFetch=true`. You can also set the number of concurrently fetching goroutines by setting the `maxDownloadThreads` query parameter (default is 10):
50+
```
51+
token:[your token]@[Workspace hostname]:[Port number][Endpoint HTTP Path]?useCloudFetch=true&maxDownloadThreads=3
52+
```
4953

5054
### Connecting with a new Connector
5155

connection.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ func (c *conn) executeStatement(ctx context.Context, query string, args []driver
283283
GetDirectResults: &cli_service.TSparkGetDirectResults{
284284
MaxRows: int64(c.cfg.MaxRows),
285285
},
286+
CanDecompressLZ4Result_: &c.cfg.UseLz4Compression,
286287
}
287288

288289
if c.cfg.UseArrowBatches {
@@ -295,6 +296,10 @@ func (c *conn) executeStatement(ctx context.Context, query string, args []driver
295296
}
296297
}
297298

299+
if c.cfg.UseCloudFetch {
300+
req.CanDownloadResult_ = &c.cfg.UseCloudFetch
301+
}
302+
298303
ctx = driverctx.NewContextWithConnId(ctx, c.id)
299304
resp, err := c.client.ExecuteStatement(ctx, &req)
300305

connector.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,3 +245,17 @@ func WithTransport(t http.RoundTripper) connOption {
245245
c.Transport = t
246246
}
247247
}
248+
249+
// WithCloudFetch sets up the use of cloud fetch for query execution. Default is false.
250+
func WithCloudFetch(useCloudFetch bool) connOption {
251+
return func(c *config.Config) {
252+
c.UseCloudFetch = useCloudFetch
253+
}
254+
}
255+
256+
// WithMaxDownloadThreads sets up maximum download threads for cloud fetch. Default is 10.
257+
func WithMaxDownloadThreads(numThreads int) connOption {
258+
return func(c *config.Config) {
259+
c.MaxDownloadThreads = numThreads
260+
}
261+
}

connector_test.go

Lines changed: 61 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,24 +36,33 @@ func TestNewConnector(t *testing.T) {
3636
WithSessionParams(sessionParams),
3737
WithRetries(10, 3*time.Second, 60*time.Second),
3838
WithTransport(roundTripper),
39+
WithCloudFetch(true),
40+
WithMaxDownloadThreads(15),
3941
)
42+
expectedCloudFetchConfig := config.CloudFetchConfig{
43+
UseCloudFetch: true,
44+
MaxDownloadThreads: 15,
45+
MaxFilesInMemory: 10,
46+
MinTimeToExpiry: 0 * time.Second,
47+
}
4048
expectedUserConfig := config.UserConfig{
41-
Host: host,
42-
Port: port,
43-
Protocol: "https",
44-
AccessToken: accessToken,
45-
Authenticator: &pat.PATAuth{AccessToken: accessToken},
46-
HTTPPath: "/" + httpPath,
47-
MaxRows: maxRows,
48-
QueryTimeout: timeout,
49-
Catalog: catalog,
50-
Schema: schema,
51-
UserAgentEntry: userAgentEntry,
52-
SessionParams: sessionParams,
53-
RetryMax: 10,
54-
RetryWaitMin: 3 * time.Second,
55-
RetryWaitMax: 60 * time.Second,
56-
Transport: roundTripper,
49+
Host: host,
50+
Port: port,
51+
Protocol: "https",
52+
AccessToken: accessToken,
53+
Authenticator: &pat.PATAuth{AccessToken: accessToken},
54+
HTTPPath: "/" + httpPath,
55+
MaxRows: maxRows,
56+
QueryTimeout: timeout,
57+
Catalog: catalog,
58+
Schema: schema,
59+
UserAgentEntry: userAgentEntry,
60+
SessionParams: sessionParams,
61+
RetryMax: 10,
62+
RetryWaitMin: 3 * time.Second,
63+
RetryWaitMax: 60 * time.Second,
64+
Transport: roundTripper,
65+
CloudFetchConfig: expectedCloudFetchConfig,
5766
}
5867
expectedCfg := config.WithDefaults()
5968
expectedCfg.DriverVersion = DriverVersion
@@ -75,18 +84,25 @@ func TestNewConnector(t *testing.T) {
7584
WithAccessToken(accessToken),
7685
WithHTTPPath(httpPath),
7786
)
87+
expectedCloudFetchConfig := config.CloudFetchConfig{
88+
UseCloudFetch: false,
89+
MaxDownloadThreads: 10,
90+
MaxFilesInMemory: 10,
91+
MinTimeToExpiry: 0 * time.Second,
92+
}
7893
expectedUserConfig := config.UserConfig{
79-
Host: host,
80-
Port: port,
81-
Protocol: "https",
82-
AccessToken: accessToken,
83-
Authenticator: &pat.PATAuth{AccessToken: accessToken},
84-
HTTPPath: "/" + httpPath,
85-
MaxRows: maxRows,
86-
SessionParams: sessionParams,
87-
RetryMax: 4,
88-
RetryWaitMin: 1 * time.Second,
89-
RetryWaitMax: 30 * time.Second,
94+
Host: host,
95+
Port: port,
96+
Protocol: "https",
97+
AccessToken: accessToken,
98+
Authenticator: &pat.PATAuth{AccessToken: accessToken},
99+
HTTPPath: "/" + httpPath,
100+
MaxRows: maxRows,
101+
SessionParams: sessionParams,
102+
RetryMax: 4,
103+
RetryWaitMin: 1 * time.Second,
104+
RetryWaitMax: 30 * time.Second,
105+
CloudFetchConfig: expectedCloudFetchConfig,
90106
}
91107
expectedCfg := config.WithDefaults()
92108
expectedCfg.UserConfig = expectedUserConfig
@@ -109,18 +125,25 @@ func TestNewConnector(t *testing.T) {
109125
WithHTTPPath(httpPath),
110126
WithRetries(-1, 0, 0),
111127
)
128+
expectedCloudFetchConfig := config.CloudFetchConfig{
129+
UseCloudFetch: false,
130+
MaxDownloadThreads: 10,
131+
MaxFilesInMemory: 10,
132+
MinTimeToExpiry: 0 * time.Second,
133+
}
112134
expectedUserConfig := config.UserConfig{
113-
Host: host,
114-
Port: port,
115-
Protocol: "https",
116-
AccessToken: accessToken,
117-
Authenticator: &pat.PATAuth{AccessToken: accessToken},
118-
HTTPPath: "/" + httpPath,
119-
MaxRows: maxRows,
120-
SessionParams: sessionParams,
121-
RetryMax: -1,
122-
RetryWaitMin: 0,
123-
RetryWaitMax: 0,
135+
Host: host,
136+
Port: port,
137+
Protocol: "https",
138+
AccessToken: accessToken,
139+
Authenticator: &pat.PATAuth{AccessToken: accessToken},
140+
HTTPPath: "/" + httpPath,
141+
MaxRows: maxRows,
142+
SessionParams: sessionParams,
143+
RetryMax: -1,
144+
RetryWaitMin: 0,
145+
RetryWaitMax: 0,
146+
CloudFetchConfig: expectedCloudFetchConfig,
124147
}
125148
expectedCfg := config.WithDefaults()
126149
expectedCfg.DriverVersion = DriverVersion

doc.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ Supported optional connection parameters can be specified in param=value and inc
3737
- maxRows: Sets up the max rows fetched per request. Default is 100000
3838
- timeout: Adds timeout (in seconds) for the server query execution. Default is no timeout
3939
- userAgentEntry: Used to identify partners. Set as a string with format <isv-name+product-name>
40+
- useCloudFetch: Used to enable cloud fetch for the query execution. Default is false
41+
- maxDownloadThreads: Sets up the max number of concurrent workers for cloud fetch. Default is 10
4042
4143
Supported optional session parameters can be specified in param=value and include:
4244
@@ -79,6 +81,8 @@ Supported functional options include:
7981
- WithSessionParams(<params_map> map[string]string): Sets up session parameters including "timezone" and "ansi_mode". Optional
8082
- WithTimeout(<timeout> Duration). Adds timeout (in time.Duration) for the server query execution. Default is no timeout. Optional
8183
- WithUserAgentEntry(<isv-name+product-name> string). Used to identify partners. Optional
84+
- WithCloudFetch(<use_cloud_fetch> bool). Used to enable cloud fetch for the query execution. Default is false. Optional
85+
- WithMaxDownloadThreads(<num_threads> int). Sets up the max number of concurrent workers for cloud fetch. Default is 10. Optional
8286
8387
# Query cancellation and timeout
8488

errors/errors.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,13 @@ const (
3131

3232
// Execution error messages (query failure)
3333
ErrQueryExecution = "failed to execute query"
34+
ErrLinkExpired = "link expired"
3435
)
3536

37+
// InvalidDSNFormat builds the message used when a DSN parameter carries a
// value that is not of the expected type.
//
// param is the DSN parameter name, value is the raw text that was supplied
// and expected names the type the value should have had.
func InvalidDSNFormat(param string, value string, expected string) string {
	const format = "invalid DSN: param %s with value %s is not of type %s"
	msg := fmt.Sprintf(format, param, value, expected)
	return msg
}
40+
3641
// ErrInvalidOperationState builds the message reported when an operation is
// found in a state that is not expected to occur; state is the name of that
// state.
func ErrInvalidOperationState(state string) string {
	const format = "invalid operation state %s. This should not have happened"
	msg := fmt.Sprintf(format, state)
	return msg
}

examples/cloudfetch/main.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"database/sql"
6+
"fmt"
7+
dbsql "github.com/databricks/databricks-sql-go"
8+
"log"
9+
"os"
10+
"strconv"
11+
"time"
12+
)
13+
14+
// row is the scan target for one result record in this example. Its fields
// are filled by rows.Scan in runTest, and whole rows are compared with != in
// main, so every field must remain a comparable type.
type row struct {
	symbol, companyName, industry, date string
	open, high, low, close              float64
	volume                              int
	change, changePercentage            float64
	upTrend, volatile                   bool
}
29+
30+
func runTest(withCloudFetch bool, query string) ([]row, error) {
31+
port, err := strconv.Atoi(os.Getenv("DATABRICKS_PORT"))
32+
if err != nil {
33+
return nil, err
34+
}
35+
36+
connector, err := dbsql.NewConnector(
37+
dbsql.WithServerHostname(os.Getenv("DATABRICKS_HOST")),
38+
dbsql.WithPort(port),
39+
dbsql.WithHTTPPath(os.Getenv("DATABRICKS_HTTPPATH")),
40+
dbsql.WithAccessToken(os.Getenv("DATABRICKS_ACCESSTOKEN")),
41+
dbsql.WithTimeout(10),
42+
dbsql.WithInitialNamespace("hive_metastore", "default"),
43+
dbsql.WithCloudFetch(withCloudFetch),
44+
)
45+
if err != nil {
46+
return nil, err
47+
}
48+
db := sql.OpenDB(connector)
49+
defer db.Close()
50+
51+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
52+
defer cancel()
53+
if err := db.PingContext(ctx); err != nil {
54+
return nil, err
55+
}
56+
rows, err1 := db.QueryContext(context.Background(), query)
57+
defer rows.Close()
58+
59+
if err1 != nil {
60+
if err1 == sql.ErrNoRows {
61+
fmt.Println("not found")
62+
return nil, err
63+
} else {
64+
return nil, err
65+
}
66+
}
67+
var res []row
68+
for rows.Next() {
69+
r := row{}
70+
err := rows.Scan(&r.symbol, &r.companyName, &r.industry, &r.date, &r.open, &r.high, &r.low, &r.close, &r.volume, &r.change, &r.changePercentage, &r.upTrend, &r.volatile)
71+
if err != nil {
72+
fmt.Println(err)
73+
return nil, err
74+
}
75+
res = append(res, r)
76+
}
77+
return res, nil
78+
}
79+
80+
func main() {
81+
query := "select * from stock_data where date is not null and volume is not null order by date, symbol limit 10000000"
82+
83+
// Local arrow batch
84+
abRes, err := runTest(false, query)
85+
if err != nil {
86+
log.Fatal(err)
87+
}
88+
89+
// Cloud fetch batch
90+
cfRes, err := runTest(true, query)
91+
if err != nil {
92+
log.Fatal(err)
93+
}
94+
95+
for i := 0; i < len(abRes); i++ {
96+
if abRes[i] != cfRes[i] {
97+
log.Fatal(fmt.Sprintf("not equal for row: %d", i))
98+
}
99+
}
100+
}

0 commit comments

Comments
 (0)