Commit 5d1cedf

Exporter: Ignore workspace assets of deleted users and service principals (#2980)

* Exporter: detecting deleted users/service principals & not exporting their data. Initial work, tests should be fixed.
* Fix tests
* Add a command-line flag to export assets of deleted users
* Fix formatting
* Fix tests after the Go SDK upgrade

1 parent 0d6281c

8 files changed (+319, -117 lines)

docs/guides/experimental-exporter.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -45,6 +45,7 @@ All arguments are optional, and they tune what code is being generated.
 * `-skip-interactive` - optionally run in a non-interactive mode.
 * `-includeUserDomains` - optionally include domain name into generated resource name for `databricks_user` resource.
 * `-importAllUsers` - optionally include all users and service principals even if they are only part of the `users` group.
+* `-exportDeletedUsersAssets` - optionally include assets of deleted users and service principals.
 * `-incremental` - experimental option for incremental export of modified resources and merging with existing resources. *Please note that only a limited set of resources (notebooks, SQL queries/dashboards/alerts, ...) provides information about the last modified date - all other resources will be re-exported again! Also, it's impossible to detect the deletion of the resources, so you must do periodic full export if resources are deleted!* **Requires** `-updated-since` option if no `exporter-run-stats.json` file exists in the output directory.
 * `-updated-since` - timestamp (in ISO8601 format supported by Go language) for exporting of resources modified since a given timestamp. I.e., `2023-07-24T00:00:00Z`. If not specified, the exporter will try to load the last run timestamp from the `exporter-run-stats.json` file generated during the export and use it.
 * `-notebooksFormat` - optional format for exporting of notebooks. Supported values are `SOURCE` (default), `DBC`, `JUPYTER`. This option could be used to export notebooks with embedded dashboards.
```
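The new flag is off by default: after this commit the exporter skips workspace assets whose owning user or service principal no longer exists in the workspace, and `-exportDeletedUsersAssets` restores the previous behaviour of exporting them anyway (e.g. `./terraform-provider-databricks exporter -skip-interactive -exportDeletedUsersAssets`; the binary path here is illustrative).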

exporter/command.go

Lines changed: 2 additions & 0 deletions
```diff
@@ -100,6 +100,8 @@ func Run(args ...string) error {
 	flags.BoolVar(&ic.includeUserDomains, "includeUserDomains", false, "Include domain portion in `databricks_user` resource name")
 	flags.BoolVar(&ic.importAllUsers, "importAllUsers", false,
 		"Import all users and service principals, even if they aren't referenced in any resource")
+	flags.BoolVar(&ic.exportDeletedUsersAssets, "exportDeletedUsersAssets", false,
+		"Export assets (notebooks, etc.) of deleted users & service principals")
 	flags.StringVar(&ic.Directory, "directory", cwd,
 		"Directory to generate sources in. Defaults to current directory.")
 	flags.Int64Var(&ic.lastActiveDays, "last-active-days", 3650,
```
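For context, the exporter binds each command-line option straight onto a field of its `importContext` via Go's standard `flag` package. A minimal standalone sketch of the same pattern (the `toolConfig` type and flag-set name below are illustrative, not the exporter's actual code):

```go
package main

import (
	"flag"
	"fmt"
	"os"
)

// toolConfig is a stand-in for the exporter's importContext struct.
type toolConfig struct {
	exportDeletedUsersAssets bool
}

func main() {
	var cfg toolConfig
	flags := flag.NewFlagSet("exporter", flag.ExitOnError)
	// Bind the flag directly to the struct field; the default of false
	// means assets of deleted principals are skipped unless opted in.
	flags.BoolVar(&cfg.exportDeletedUsersAssets, "exportDeletedUsersAssets", false,
		"Export assets (notebooks, etc.) of deleted users & service principals")
	flags.Parse(os.Args[1:])
	fmt.Println("exportDeletedUsersAssets =", cfg.exportDeletedUsersAssets)
}
```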

exporter/context.go

Lines changed: 20 additions & 19 deletions
```diff
@@ -80,25 +80,26 @@ type importContext struct {
 	Scope importedResources
 
 	// command-line resources (immutable, or set by the single thread)
-	includeUserDomains  bool
-	importAllUsers      bool
-	debug               bool
-	incremental         bool
-	mounts              bool
-	noFormat            bool
-	services            string
-	listing             string
-	match               string
-	lastActiveDays      int64
-	lastActiveMs        int64
-	generateDeclaration bool
-	meAdmin             bool
-	prefix              string
-	accountLevel        bool
-	shImports           map[string]bool
-	notebooksFormat     string
-	updatedSinceStr     string
-	updatedSinceMs      int64
+	includeUserDomains       bool
+	importAllUsers           bool
+	exportDeletedUsersAssets bool
+	debug                    bool
+	incremental              bool
+	mounts                   bool
+	noFormat                 bool
+	services                 string
+	listing                  string
+	match                    string
+	lastActiveDays           int64
+	lastActiveMs             int64
+	generateDeclaration      bool
+	meAdmin                  bool
+	prefix                   string
+	accountLevel             bool
+	shImports                map[string]bool
+	notebooksFormat          string
+	updatedSinceStr          string
+	updatedSinceMs           int64
 
 	waitGroup *sync.WaitGroup
```
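Only one field (`exportDeletedUsersAssets`) is new here; the rest of the block changes because `gofmt` aligns consecutive struct fields on the longest name, and the new name widens that column.
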
exporter/exporter_test.go

Lines changed: 20 additions & 1 deletion
```diff
@@ -1559,6 +1559,10 @@ func TestImportingRepos(t *testing.T) {
 	qa.HTTPFixturesApply(t,
 		[]qa.HTTPFixture{
 			meAdminFixture,
+			userListIdUsernameFixture,
+			userListIdUsernameFixture2,
+			userListFixture,
+			userReadFixture,
 			{
 				Method:   "GET",
 				Resource: "/api/2.0/repos?",
@@ -1900,6 +1904,18 @@ func TestImportingDLTPipelines(t *testing.T) {
 				Content: "spark.range(10)",
 			},
 		},
+		{
+			Method:   "GET",
+			Resource: "/api/2.0/preview/scim/v2/Users?attributes=userName%2Cid",
+			Response: scim.UserList{
+				Resources: []scim.User{
+					{
+						ID:       "id",
+						UserName: "id",
+					},
+				},
+			},
+		},
 		{
 			Method:   "GET",
 			Resource: "/api/2.0/instance-profiles/list",
@@ -1971,10 +1987,13 @@ func TestImportingDLTPipelinesMatchingOnly(t *testing.T) {
 		meAdminFixture,
 		emptyRepos,
 		emptyIpAccessLIst,
+		userListIdUsernameFixture,
+		userListIdUsernameFixture2,
+		userListFixture,
+		userReadFixture,
 		{
 			Method:   "GET",
 			Resource: "/api/2.0/pipelines?max_results=50",
-
 			Response: pipelines.PipelineListResponse{
 				Statuses: []pipelines.PipelineStateInfo{
 					{
```
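The added fixtures (`userListIdUsernameFixture`, `userListFixture`, etc.) stub the SCIM endpoints the exporter now queries to decide whether a path's owner still exists. Conceptually the stubbed endpoint behaves like the sketch below; this is an illustration with hand-rolled types, not the provider's actual `qa.HTTPFixture` machinery or its `scim` package:

```go
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
)

// scimUser mirrors the subset of the SCIM user schema the fixture returns.
type scimUser struct {
	ID       string `json:"id"`
	UserName string `json:"userName"`
}

type scimUserList struct {
	Resources []scimUser `json:"Resources"`
}

func main() {
	// Serve a canned SCIM user list, as the test fixture does.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		json.NewEncoder(w).Encode(scimUserList{
			Resources: []scimUser{{ID: "id", UserName: "id"}},
		})
	}))
	defer srv.Close()

	resp, err := http.Get(srv.URL + "/api/2.0/preview/scim/v2/Users?attributes=userName,id")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // {"Resources":[{"id":"id","userName":"id"}]}
}
```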

exporter/importables.go

Lines changed: 5 additions & 11 deletions
```diff
@@ -45,6 +45,7 @@ var (
 	fileNameNormalizationRegex = regexp.MustCompile(`[^-_\w/.@]`)
 	jobClustersRegex           = regexp.MustCompile(`^((job_cluster|task)\.[0-9]+\.new_cluster\.[0-9]+\.)`)
 	dltClusterRegex            = regexp.MustCompile(`^(cluster\.[0-9]+\.)`)
+	userDirRegex               = regexp.MustCompile(`^(/Users/[^/]+)(/.*)?$`)
 	secretPathRegex            = regexp.MustCompile(`^\{\{secrets\/([^\/]+)\/([^}]+)\}\}$`)
 	sqlParentRegexp            = regexp.MustCompile(`^folders/(\d+)$`)
 	dltDefaultStorageRegex     = regexp.MustCompile(`^dbfs:/pipelines/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)
@@ -642,10 +643,7 @@ var resourcesMap map[string]importable = map[string]importable{
 			}
 			if typ == "fixed" && strings.HasPrefix(k, "init_scripts.") &&
 				strings.HasSuffix(k, ".workspace.destination") {
-				ic.Emit(&resource{
-					Resource: "databricks_workspace_file",
-					ID:       eitherString(value, defaultValue),
-				})
+				ic.maybeEmitWorkspaceObject("databricks_workspace_file", eitherString(value, defaultValue))
 			}
 			if typ == "fixed" && (strings.HasPrefix(k, "spark_conf.") || strings.HasPrefix(k, "spark_env_vars.")) {
 				either := eitherString(value, defaultValue)
@@ -1941,14 +1939,10 @@ var resourcesMap map[string]importable = map[string]importable{
 			if res := ignoreIdeFolderRegex.FindStringSubmatch(directory.Path); res != nil {
 				continue
 			}
-			// TODO: don't emit directories for deleted users/SPs (how to identify them?)
-			ic.Emit(&resource{
-				Resource: "databricks_directory",
-				ID:       directory.Path,
-			})
+			ic.maybeEmitWorkspaceObject("databricks_directory", directory.Path)
+
 			if offset%50 == 0 {
-				log.Printf("[INFO] Scanned %d of %d directories",
-					offset+1, len(directoryList))
+				log.Printf("[INFO] Scanned %d of %d directories", offset+1, len(directoryList))
 			}
 		}
 		return nil
```
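The new `userDirRegex` reduces any path under `/Users/` to the owner's home directory, which `shouldEmitForPath` (added in `exporter/util.go` below) then checks against the live user/SP list. A quick standalone check of the pattern (the example paths are made up):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	userDirRegex := regexp.MustCompile(`^(/Users/[^/]+)(/.*)?$`)
	for _, path := range []string{
		"/Users/someone@example.com/project/notebook",
		"/Users/someone@example.com",
	} {
		// ReplaceAllString keeps only the first capture group: the home directory.
		fmt.Println(userDirRegex.ReplaceAllString(path, "$1"))
	}
	// Both iterations print: /Users/someone@example.com
}
```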

exporter/importables_test.go

Lines changed: 10 additions & 9 deletions
```diff
@@ -34,15 +34,16 @@ import (
 func importContextForTest() *importContext {
 	p := provider.DatabricksProvider()
 	return &importContext{
-		Importables: resourcesMap,
-		Resources:   p.ResourcesMap,
-		Files:       map[string]*hclwrite.File{},
-		testEmits:   map[string]bool{},
-		nameFixes:   nameFixes,
-		waitGroup:   &sync.WaitGroup{},
-		allUsers:    map[string]scim.User{},
-		allSps:      map[string]scim.User{},
-		channels:    makeResourcesChannels(p),
+		Importables:              resourcesMap,
+		Resources:                p.ResourcesMap,
+		Files:                    map[string]*hclwrite.File{},
+		testEmits:                map[string]bool{},
+		nameFixes:                nameFixes,
+		waitGroup:                &sync.WaitGroup{},
+		allUsers:                 map[string]scim.User{},
+		allSps:                   map[string]scim.User{},
+		channels:                 makeResourcesChannels(p),
+		exportDeletedUsersAssets: false,
 	}
 }
 
```

exporter/util.go

Lines changed: 53 additions & 32 deletions
```diff
@@ -41,10 +41,7 @@ func (ic *importContext) emitInitScripts(initScripts []clusters.InitScriptStorag
 		})
 	}
 	if is.Workspace != nil {
-		ic.Emit(&resource{
-			Resource: "databricks_workspace_file",
-			ID:       is.Workspace.Destination,
-		})
+		ic.maybeEmitWorkspaceObject("databricks_workspace_file", is.Workspace.Destination)
 	}
 }
```
```diff
@@ -112,27 +109,33 @@ func (ic *importContext) emitUserOrServicePrincipal(userOrSPName string) {
 	if userOrSPName == "" {
 		return
 	}
-	// TODO: think about another way of checking for a user. ideally we need to check against the
-	// list of users/SPs obtained via SCIM API - this will be done in the refactoring requested by the SCIM team
-	if strings.Contains(userOrSPName, "@") {
-		ic.Emit(&resource{
-			Resource:  "databricks_user",
-			Attribute: "user_name",
-			Value:     strings.ToLower(userOrSPName),
-		})
-	} else if common.StringIsUUID(userOrSPName) {
-		ic.Emit(&resource{
-			Resource:  "databricks_service_principal",
-			Attribute: "application_id",
-			Value:     userOrSPName,
-		})
+	if common.StringIsUUID(userOrSPName) {
+		user, err := ic.findSpnByAppID(userOrSPName)
+		if err != nil {
+			log.Printf("[ERROR] Can't find SP with application ID %s", userOrSPName)
+		} else {
+			ic.Emit(&resource{
+				Resource: "databricks_service_principal",
+				ID:       user.ID,
+			})
+		}
+	} else {
+		user, err := ic.findUserByName(strings.ToLower(userOrSPName))
+		if err != nil {
+			log.Printf("[ERROR] Can't find user with name %s", userOrSPName)
+		} else {
+			ic.Emit(&resource{
+				Resource: "databricks_user",
+				ID:       user.ID,
+			})
+		}
 	}
 }
 
 func (ic *importContext) emitUserOrServicePrincipalForPath(path, prefix string) {
 	if strings.HasPrefix(path, prefix) {
 		parts := strings.SplitN(path, "/", 4)
-		if len(parts) >= 3 {
+		if len(parts) >= 3 && parts[2] != "" {
 			ic.emitUserOrServicePrincipal(parts[2])
 		}
 	}
```
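Emission is now driven by an existence check rather than the shape of the name alone: a UUID is looked up as a service principal's application ID, anything else as a (lowercased) user name, and unresolved principals are logged and dropped. A minimal sketch of that dispatch, with map lookups standing in for the exporter's `findSpnByAppID`/`findUserByName` and a hand-rolled regex approximating `common.StringIsUUID` (all names and sample data below are illustrative):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// uuidRegex approximates common.StringIsUUID from the provider.
var uuidRegex = regexp.MustCompile(`^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$`)

// Stubbed SCIM directories of principals that still exist.
var knownUsers = map[string]string{"alice@example.com": "123"}
var knownSPs = map[string]string{"a1b2c3d4-1111-2222-3333-444455556666": "456"}

func emitUserOrServicePrincipal(name string) {
	if name == "" {
		return
	}
	if uuidRegex.MatchString(name) {
		if id, ok := knownSPs[name]; ok {
			fmt.Println("emit databricks_service_principal, ID =", id)
		} else {
			fmt.Println("[ERROR] Can't find SP with application ID", name)
		}
	} else if id, ok := knownUsers[strings.ToLower(name)]; ok {
		fmt.Println("emit databricks_user, ID =", id)
	} else {
		fmt.Println("[ERROR] Can't find user with name", name)
	}
}

func main() {
	emitUserOrServicePrincipal("Alice@example.com")                    // resolves: emits user 123
	emitUserOrServicePrincipal("a1b2c3d4-1111-2222-3333-444455556666") // resolves: emits SP 456
	emitUserOrServicePrincipal("deleted.user@example.com")             // logged and skipped
}
```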
```diff
@@ -143,11 +146,16 @@ func (ic *importContext) IsUserOrServicePrincipalDirectory(path, prefix string)
 		return false
 	}
 	parts := strings.SplitN(path, "/", 4)
-	if len(parts) == 3 || (len(parts) == 4 && parts[3] == "") {
-		// TODO: think about another way of checking for a user. ideally we need to check against the
-		// list of users/SPs obtained via SCIM API - this will be done in the refactoring requested by the SCIM team
+	if (len(parts) == 3 || (len(parts) == 4 && parts[3] == "")) && parts[2] != "" {
 		userOrSPName := parts[2]
-		return strings.Contains(userOrSPName, "@") || common.StringIsUUID(userOrSPName)
+		var err error
+		if common.StringIsUUID(userOrSPName) {
+			_, err = ic.findSpnByAppID(userOrSPName)
+		} else {
+			_, err = ic.findUserByName(strings.ToLower(userOrSPName))
+		}
+		return err == nil
+
 	}
 	return false
 }
@@ -160,10 +168,7 @@ func (ic *importContext) emitNotebookOrRepo(path string) {
 			Value: strings.Join(strings.SplitN(path, "/", 5)[:4], "/"),
 		})
 	} else {
-		ic.Emit(&resource{
-			Resource: "databricks_notebook",
-			ID:       path,
-		})
+		ic.maybeEmitWorkspaceObject("databricks_notebook", path)
 	}
 }
 
@@ -795,6 +800,26 @@ func wsObjectGetModifiedAt(obs workspace.ObjectStatus) int64 {
 	return obs.ModifiedAt
 }
 
+func (ic *importContext) shouldEmitForPath(path string) bool {
+	if !ic.exportDeletedUsersAssets && strings.HasPrefix(path, "/Users/") {
+		userDir := userDirRegex.ReplaceAllString(path, "$1")
+		return ic.IsUserOrServicePrincipalDirectory(userDir, "/Users")
+	}
+	return true
+}
+
+func (ic *importContext) maybeEmitWorkspaceObject(resourceType, path string) {
+	if ic.shouldEmitForPath(path) {
+		ic.Emit(&resource{
+			Resource:    resourceType,
+			ID:          path,
+			Incremental: ic.incremental,
+		})
+	} else {
+		log.Printf("[DEBUG] Not emitting a workspace object %s for deleted user", path)
+	}
+}
+
 func createListWorkspaceObjectsFunc(objType string, resourceType string, objName string) func(ic *importContext) error {
 	return func(ic *importContext) error {
 		// TODO: can we pass a visitor here, that will emit corresponding object earlier?
```
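These two helpers carry the whole feature: every candidate workspace object path under `/Users/` is reduced to its owner's home directory, and the object is emitted only if that owner still resolves (or `-exportDeletedUsersAssets` is set). A self-contained sketch of the gate, with a map standing in for the SCIM-backed `IsUserOrServicePrincipalDirectory` check (sample paths are illustrative):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

var userDirRegex = regexp.MustCompile(`^(/Users/[^/]+)(/.*)?$`)

// existingHomeDirs stands in for the SCIM lookup behind IsUserOrServicePrincipalDirectory.
var existingHomeDirs = map[string]bool{"/Users/alice@example.com": true}

func shouldEmitForPath(path string, exportDeletedUsersAssets bool) bool {
	if !exportDeletedUsersAssets && strings.HasPrefix(path, "/Users/") {
		userDir := userDirRegex.ReplaceAllString(path, "$1")
		return existingHomeDirs[userDir]
	}
	return true
}

func main() {
	fmt.Println(shouldEmitForPath("/Users/alice@example.com/nb", false)) // true: owner exists
	fmt.Println(shouldEmitForPath("/Users/gone@example.com/nb", false))  // false: owner deleted
	fmt.Println(shouldEmitForPath("/Users/gone@example.com/nb", true))   // true: flag overrides
	fmt.Println(shouldEmitForPath("/Shared/init.sh", false))             // true: not a user path
}
```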
```diff
@@ -816,11 +841,7 @@ func createListWorkspaceObjectsFunc(objType string, resourceType string, objName
 			if !ic.MatchesName(object.Path) {
 				continue
 			}
-			ic.Emit(&resource{
-				Resource:    resourceType,
-				ID:          object.Path,
-				Incremental: ic.incremental,
-			})
+			ic.maybeEmitWorkspaceObject(resourceType, object.Path)
 
 			if offset%50 == 0 {
 				log.Printf("[INFO] Scanned %d of %d %ss", offset+1, len(objectsList), objName)
```
