Skip to content

Commit 0fbfbf4

Browse files
alexott and mgyucht
authored
[Exporter] Allow to match resource names by regular expression (#4177)
## Changes <!-- Summary of your changes that are easy to understand --> In addition to the existing `-match` option, this PR allows the matching of names by regex during the listing operation. There are new options: - `-matchRegex` - checks if name matches a regex - this could be useful for exporting notebooks for only specific users, or something like that. - `-excludeRegex` - checks if name matches a regex, and skips processing of that object. For example, it could be used to exclude `databricks_automl` directories. This parameter has higher priority than the `-match` and `-matchRegex`. - `filterDirectoriesDuringWorkspaceWalking` - if we should apply match logic to directory names when we're performing workspace tree walking. *Note: be careful with it as it will be applied to all entries, so if you want to filter only specific users, then you will need to specify the condition for `/Users` as well, so regex will be `^(/Users|/Users/[a-c].*)$`* ## Tests <!-- How is this tested? Please see the checklist below and also describe any other relevant tests --> - [x] `make test` run locally - [x] relevant change in `docs/` folder - [ ] covered with integration tests in `internal/acceptance` - [ ] relevant acceptance tests are passing - [ ] using Go SDK --------- Co-authored-by: Miles Yucht <[email protected]>
1 parent da1f7e4 commit 0fbfbf4

File tree

7 files changed

+230
-36
lines changed

7 files changed

+230
-36
lines changed

docs/guides/experimental-exporter.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ All arguments are optional, and they tune what code is being generated.
6161
* `-listing` - Comma-separated list of services to be listed and further passed on for importing. For each service specified, the exporter performs a listing of available resources using the `List` function and emits them for importing together with their dependencies. The `-services` parameter could be used to control which transitive dependencies will be also imported.
6262
* `-services` - Comma-separated list of services to import. By default, all services are imported.
6363
* `-match` - Match resource names during listing operation. This filter applies to all resources that are getting listed, so if you want to import all dependencies of just one cluster, specify `-match=autoscaling -listing=compute`. By default, it is empty, which matches everything.
64+
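* `-matchRegex` - Match resource names against a given regular expression (Go `regexp` syntax) during the listing operation. The pattern is not anchored implicitly, so use `^` and `$` if the whole name must match. When specified, it is used instead of `-match`. Applicable to all resources selected for listing.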
65+
* `-excludeRegex` - Exclude resource names matching a given regular expression. It is applied during the listing operation and has higher priority than `-match` and `-matchRegex`. Applicable to all resources selected for listing. It can be used, for example, to exclude `databricks_automl` notebooks.
66+
* `-filterDirectoriesDuringWorkspaceWalking` - whether to apply the match logic (`-match`, `-matchRegex`, `-excludeRegex`) to directory names while walking the workspace tree (disabled by default). *Note: use it with care, because it will be applied to all entries. If you want to filter only specific users, you also need to specify a condition for `/Users` itself, so the regex will be `^(/Users|/Users/[a-c].*)$`*.
6467
* `-mounts` - List DBFS mount points, an extremely slow operation that would not trigger unless explicitly specified.
6568
* `-generateProviderDeclaration` - the flag that toggles the generation of `databricks.tf` file with the declaration of the Databricks Terraform provider that is necessary for Terraform versions since Terraform 0.13 (disabled by default).
6669
* `-prefix` - optional prefix that will be added to the name of all exported resources - that's useful for exporting resources from multiple workspaces for merging into a single one.

exporter/command.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ func Run(args ...string) error {
131131
flags.BoolVar(&ic.mounts, "mounts", false, "List DBFS mount points.")
132132
flags.BoolVar(&ic.generateDeclaration, "generateProviderDeclaration", true,
133133
"Generate Databricks provider declaration.")
134+
flags.BoolVar(&ic.filterDirectoriesDuringWorkspaceWalking, "filterDirectoriesDuringWorkspaceWalking", false,
135+
"Apply filtering to directory names during workspace walking")
134136
flags.StringVar(&ic.notebooksFormat, "notebooksFormat", "SOURCE",
135137
"Format to export notebooks: SOURCE, DBC, JUPYTER. Default: SOURCE")
136138
services, listing := ic.allServicesAndListing()
@@ -145,6 +147,12 @@ func Run(args ...string) error {
145147
flags.StringVar(&ic.match, "match", "", "Match resource names during listing operation. "+
146148
"This filter applies to all resources that are getting listed, so if you want to import "+
147149
"all dependencies of just one cluster, specify -listing=compute")
150+
flags.StringVar(&ic.matchRegexStr, "matchRegex", "", "Match resource names during listing operation against a regex. "+
151+
"This filter applies to all resources that are getting listed, so if you want to import "+
152+
"all dependencies of just one cluster, specify -listing=compute")
153+
flags.StringVar(&ic.excludeRegexStr, "excludeRegex", "", "Exclude resource names matching regex during listing operation. "+
154+
"This filter applies to all resources that are getting listed, so if you want to import "+
155+
"all dependencies of just one cluster, specify -listing=compute")
148156
prefix := ""
149157
flags.StringVar(&prefix, "prefix", "", "Prefix that will be added to the name of all exported resources")
150158
newArgs := args

exporter/context.go

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -78,28 +78,33 @@ type importContext struct {
7878
Scope importedResources
7979

8080
// command-line resources (immutable, or set by the single thread)
81-
includeUserDomains bool
82-
importAllUsers bool
83-
exportDeletedUsersAssets bool
84-
incremental bool
85-
mounts bool
86-
noFormat bool
87-
nativeImportSupported bool
88-
services map[string]struct{}
89-
listing map[string]struct{}
90-
match string
91-
lastActiveDays int64
92-
lastActiveMs int64
93-
generateDeclaration bool
94-
exportSecrets bool
95-
meAdmin bool
96-
meUserName string
97-
prefix string
98-
accountLevel bool
99-
shImports map[string]bool
100-
notebooksFormat string
101-
updatedSinceStr string
102-
updatedSinceMs int64
81+
includeUserDomains bool
82+
importAllUsers bool
83+
exportDeletedUsersAssets bool
84+
incremental bool
85+
mounts bool
86+
noFormat bool
87+
nativeImportSupported bool
88+
services map[string]struct{}
89+
listing map[string]struct{}
90+
match string
91+
matchRegexStr string
92+
matchRegex *regexp.Regexp
93+
excludeRegexStr string
94+
excludeRegex *regexp.Regexp
95+
filterDirectoriesDuringWorkspaceWalking bool
96+
lastActiveDays int64
97+
lastActiveMs int64
98+
generateDeclaration bool
99+
exportSecrets bool
100+
meAdmin bool
101+
meUserName string
102+
prefix string
103+
accountLevel bool
104+
shImports map[string]bool
105+
notebooksFormat string
106+
updatedSinceStr string
107+
updatedSinceMs int64
103108

104109
waitGroup *sync.WaitGroup
105110

@@ -297,6 +302,24 @@ func (ic *importContext) Run() error {
297302
return fmt.Errorf("no services to import")
298303
}
299304

305+
if ic.matchRegexStr != "" {
306+
log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.matchRegexStr)
307+
re, err := regexp.Compile(ic.matchRegexStr)
308+
if err != nil {
309+
log.Printf("[ERROR] can't compile regex '%s': %v", ic.matchRegexStr, err)
310+
return err
311+
}
312+
ic.matchRegex = re
313+
}
314+
if ic.excludeRegexStr != "" {
315+
log.Printf("[DEBUG] Using regex '%s' to filter resources", ic.excludeRegexStr)
316+
re, err := regexp.Compile(ic.excludeRegexStr)
317+
if err != nil {
318+
log.Printf("[ERROR] can't compile regex '%s': %v", ic.excludeRegexStr, err)
319+
return err
320+
}
321+
ic.excludeRegex = re
322+
}
300323
if ic.incremental {
301324
if ic.updatedSinceStr == "" {
302325
ic.updatedSinceStr = getLastRunString(statsFileName)

exporter/exporter_test.go

Lines changed: 148 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2349,7 +2349,7 @@ func TestImportingGlobalSqlConfig(t *testing.T) {
23492349
})
23502350
}
23512351

2352-
func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
2352+
func TestImportingNotebooksWorkspaceFilesWithFilter(t *testing.T) {
23532353
fileStatus := workspace.ObjectStatus{
23542354
ObjectID: 123,
23552355
ObjectType: workspace.File,
@@ -2371,7 +2371,135 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
23712371
Method: "GET",
23722372
Resource: "/api/2.0/workspace/list?path=%2F",
23732373
Response: workspace.ObjectList{
2374-
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus},
2374+
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
2375+
{
2376+
ObjectID: 4567,
2377+
ObjectType: workspace.Notebook,
2378+
Path: "/UnmatchedNotebook",
2379+
Language: "PYTHON",
2380+
},
2381+
{
2382+
ObjectID: 1234,
2383+
ObjectType: workspace.File,
2384+
Path: "/UnmatchedFile",
2385+
},
2386+
{
2387+
ObjectID: 456,
2388+
ObjectType: workspace.Directory,
2389+
Path: "/databricks_automl",
2390+
},
2391+
{
2392+
ObjectID: 456,
2393+
ObjectType: workspace.Directory,
2394+
Path: "/.bundle",
2395+
},
2396+
},
2397+
},
2398+
ReuseRequest: true,
2399+
},
2400+
{
2401+
Method: "GET",
2402+
Resource: "/api/2.0/workspace/list?path=%2Fdatabricks_automl",
2403+
Response: workspace.ObjectList{},
2404+
},
2405+
{
2406+
Method: "GET",
2407+
Resource: "/api/2.0/workspace/get-status?path=%2FNotebook",
2408+
Response: notebookStatus,
2409+
ReuseRequest: true,
2410+
},
2411+
{
2412+
Method: "GET",
2413+
Resource: "/api/2.0/workspace/get-status?path=%2FFile",
2414+
Response: fileStatus,
2415+
ReuseRequest: true,
2416+
},
2417+
{
2418+
Method: "GET",
2419+
Resource: "/api/2.0/workspace/export?format=AUTO&path=%2FFile",
2420+
Response: workspace.ExportPath{
2421+
Content: "dGVzdA==",
2422+
},
2423+
ReuseRequest: true,
2424+
},
2425+
{
2426+
Method: "GET",
2427+
Resource: "/api/2.0/workspace/export?format=SOURCE&path=%2FNotebook",
2428+
Response: workspace.ExportPath{
2429+
Content: "dGVzdA==",
2430+
},
2431+
ReuseRequest: true,
2432+
},
2433+
},
2434+
func(ctx context.Context, client *common.DatabricksClient) {
2435+
tmpDir := fmt.Sprintf("/tmp/tf-%s", qa.RandomName())
2436+
defer os.RemoveAll(tmpDir)
2437+
2438+
ic := newImportContext(client)
2439+
ic.Directory = tmpDir
2440+
ic.enableListing("notebooks,wsfiles")
2441+
ic.excludeRegexStr = "databricks_automl"
2442+
ic.matchRegexStr = "^/[FN].*$"
2443+
2444+
err := ic.Run()
2445+
assert.NoError(t, err)
2446+
// check generated code for notebooks
2447+
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
2448+
assert.NoError(t, err)
2449+
contentStr := string(content)
2450+
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
2451+
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
2452+
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
2453+
// check generated code for workspace files
2454+
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
2455+
assert.NoError(t, err)
2456+
contentStr = string(content)
2457+
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
2458+
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
2459+
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
2460+
})
2461+
}
2462+
2463+
func TestImportingNotebooksWorkspaceFilesWithFilterDuringWalking(t *testing.T) {
2464+
fileStatus := workspace.ObjectStatus{
2465+
ObjectID: 123,
2466+
ObjectType: workspace.File,
2467+
Path: "/File",
2468+
}
2469+
notebookStatus := workspace.ObjectStatus{
2470+
ObjectID: 456,
2471+
ObjectType: workspace.Notebook,
2472+
Path: "/Notebook",
2473+
Language: "PYTHON",
2474+
}
2475+
qa.HTTPFixturesApply(t,
2476+
[]qa.HTTPFixture{
2477+
meAdminFixture,
2478+
noCurrentMetastoreAttached,
2479+
emptyRepos,
2480+
emptyIpAccessLIst,
2481+
{
2482+
Method: "GET",
2483+
Resource: "/api/2.0/workspace/list?path=%2F",
2484+
Response: workspace.ObjectList{
2485+
Objects: []workspace.ObjectStatus{notebookStatus, fileStatus,
2486+
{
2487+
ObjectID: 4567,
2488+
ObjectType: workspace.Notebook,
2489+
Path: "/UnmatchedNotebook",
2490+
Language: "PYTHON",
2491+
},
2492+
{
2493+
ObjectID: 1234,
2494+
ObjectType: workspace.File,
2495+
Path: "/UnmatchedFile",
2496+
},
2497+
{
2498+
ObjectID: 456,
2499+
ObjectType: workspace.Directory,
2500+
Path: "/databricks_automl",
2501+
},
2502+
},
23752503
},
23762504
ReuseRequest: true,
23772505
},
@@ -2410,10 +2538,27 @@ func TestImportingNotebooksWorkspaceFiles(t *testing.T) {
24102538

24112539
ic := newImportContext(client)
24122540
ic.Directory = tmpDir
2413-
ic.enableListing("notebooks")
2541+
ic.enableListing("notebooks,wsfiles")
2542+
ic.excludeRegexStr = "databricks_automl"
2543+
ic.matchRegexStr = "^/[FN].*$"
2544+
ic.filterDirectoriesDuringWorkspaceWalking = true
24142545

24152546
err := ic.Run()
24162547
assert.NoError(t, err)
2548+
// check generated code for notebooks
2549+
content, err := os.ReadFile(tmpDir + "/notebooks.tf")
2550+
assert.NoError(t, err)
2551+
contentStr := string(content)
2552+
assert.True(t, strings.Contains(contentStr, `resource "databricks_notebook" "notebook_456"`))
2553+
assert.True(t, strings.Contains(contentStr, `path = "/Notebook"`))
2554+
assert.False(t, strings.Contains(contentStr, `/UnmatchedNotebook`))
2555+
// check generated code for workspace files
2556+
content, err = os.ReadFile(tmpDir + "/wsfiles.tf")
2557+
assert.NoError(t, err)
2558+
contentStr = string(content)
2559+
assert.True(t, strings.Contains(contentStr, `resource "databricks_workspace_file" "file_123"`))
2560+
assert.True(t, strings.Contains(contentStr, `path = "/File"`))
2561+
assert.False(t, strings.Contains(contentStr, `/UnmatchedFile`))
24172562
})
24182563
}
24192564

exporter/util.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,15 @@ func (ic *importContext) isServiceInListing(service string) bool {
3535
}
3636

3737
func (ic *importContext) MatchesName(n string) bool {
38-
if ic.match == "" {
38+
if ic.match == "" && ic.matchRegex == nil && ic.excludeRegex == nil {
3939
return true
4040
}
41+
if ic.excludeRegex != nil && ic.excludeRegex.MatchString(n) {
42+
return false
43+
}
44+
if ic.matchRegex != nil {
45+
return ic.matchRegex.MatchString(n)
46+
}
4147
return strings.Contains(strings.ToLower(n), strings.ToLower(ic.match))
4248
}
4349

exporter/util_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -316,16 +316,16 @@ func TestGetEnvAsInt(t *testing.T) {
316316
}
317317

318318
func TestExcludeAuxiliaryDirectories(t *testing.T) {
319-
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
320-
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{ObjectType: workspace.File}))
321-
assert.True(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/abc",
319+
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "", ObjectType: workspace.Directory}))
320+
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{ObjectType: workspace.File}))
321+
assert.False(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/abc",
322322
ObjectType: workspace.Directory}))
323323
// should be ignored
324-
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/.ide",
324+
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/.ide",
325325
ObjectType: workspace.Directory}))
326-
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Shared/.bundle",
326+
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Shared/.bundle",
327327
ObjectType: workspace.Directory}))
328-
assert.False(t, excludeAuxiliaryDirectories(workspace.ObjectStatus{Path: "/Users/[email protected]/abc/__pycache__",
328+
assert.True(t, isAuxiliaryDirectory(workspace.ObjectStatus{Path: "/Users/[email protected]/abc/__pycache__",
329329
ObjectType: workspace.Directory}))
330330
}
331331

exporter/util_workspace.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,17 +93,18 @@ func (ic *importContext) getAllDirectories() []workspace.ObjectStatus {
9393
var directoriesToIgnore = []string{".ide", ".bundle", "__pycache__"}
9494

9595
// TODO: add ignoring directories of deleted users? This could potentially decrease the number of processed objects...
96-
func excludeAuxiliaryDirectories(v workspace.ObjectStatus) bool {
96+
func isAuxiliaryDirectory(v workspace.ObjectStatus) bool {
9797
if v.ObjectType != workspace.Directory {
98-
return true
98+
return false
9999
}
100100
// TODO: rewrite to use suffix check, etc., instead of split and slice contains?
101101
parts := strings.Split(v.Path, "/")
102102
result := len(parts) > 1 && slices.Contains[[]string, string](directoriesToIgnore, parts[len(parts)-1])
103+
log.Printf("[DEBUG] directory %s: %v", v.Path, result)
103104
if result {
104105
log.Printf("[DEBUG] Ignoring directory %s", v.Path)
105106
}
106-
return !result
107+
return result
107108
}
108109

109110
func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectStatus)) []workspace.ObjectStatus {
@@ -113,7 +114,15 @@ func (ic *importContext) getAllWorkspaceObjects(visitor func([]workspace.ObjectS
113114
t1 := time.Now()
114115
log.Print("[INFO] Starting to list all workspace objects")
115116
notebooksAPI := workspace.NewNotebooksAPI(ic.Context, ic.Client)
116-
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", excludeAuxiliaryDirectories, visitor)
117+
shouldIncludeDirectory := func(v workspace.ObjectStatus) bool {
118+
decision := !isAuxiliaryDirectory(v)
119+
if decision && ic.filterDirectoriesDuringWorkspaceWalking {
120+
decision = ic.MatchesName(v.Path)
121+
}
122+
// log.Printf("[DEBUG] decision of shouldIncludeDirectory for %s: %v", v.Path, decision)
123+
return decision
124+
}
125+
ic.allWorkspaceObjects, _ = ListParallel(notebooksAPI, "/", shouldIncludeDirectory, visitor)
117126
log.Printf("[INFO] Finished listing of all workspace objects. %d objects in total. %v seconds",
118127
len(ic.allWorkspaceObjects), time.Since(t1).Seconds())
119128
}

0 commit comments

Comments
 (0)