Skip to content

Commit 70d2024

Browse files
authored
Update linting checks for Spark table methods (#1816)
1 parent a5feef4 commit 70d2024

16 files changed

+462
-207
lines changed

src/databricks/labs/ucx/source_code/linters/pyspark.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,6 @@ def __init__(self):
235235
TableNameMatcher("createTable", 1, 1000, 0, "tableName"),
236236
TableNameMatcher("createExternalTable", 1, 1000, 0, "tableName"),
237237
TableNameMatcher("getTable", 1, 1, 0),
238-
TableNameMatcher("table", 1, 1, 0),
239238
TableNameMatcher("isCached", 1, 1, 0),
240239
TableNameMatcher("listColumns", 1, 2, 0, "tableName"),
241240
TableNameMatcher("tableExists", 1, 2, 0, "tableName"),

tests/unit/source_code/linters/test_pyspark.py

Lines changed: 3 additions & 206 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
import pytest
44

5-
from databricks.labs.ucx.source_code.base import Advisory, Deprecation, CurrentSessionState
6-
from databricks.labs.ucx.source_code.linters.pyspark import SparkMatchers, SparkSql, AstHelper, TableNameMatcher
5+
from databricks.labs.ucx.source_code.base import Deprecation, CurrentSessionState
6+
from databricks.labs.ucx.source_code.linters.pyspark import SparkSql, AstHelper, TableNameMatcher
77
from databricks.labs.ucx.source_code.queries import FromTable
88

99

@@ -87,214 +87,11 @@ def test_spark_sql_match_named(migration_index):
8787
] == list(sqf.lint(old_code))
8888

8989

90-
METHOD_NAMES = [
91-
"cacheTable",
92-
"createTable",
93-
"createExternalTable",
94-
"getTable",
95-
"isCached",
96-
"listColumns",
97-
"tableExists",
98-
"recoverPartitions",
99-
"refreshTable",
100-
"uncacheTable",
101-
"table",
102-
"insertInto",
103-
"saveAsTable",
104-
]
105-
106-
107-
@pytest.mark.parametrize("method_name", METHOD_NAMES)
108-
def test_spark_table_match(migration_index, method_name):
109-
spark_matchers = SparkMatchers()
110-
ftf = FromTable(migration_index, CurrentSessionState())
111-
sqf = SparkSql(ftf, migration_index)
112-
matcher = spark_matchers.matchers[method_name]
113-
args_list = ["a"] * min(5, matcher.max_args)
114-
args_list[matcher.table_arg_index] = '"old.things"'
115-
args = ",".join(args_list)
116-
old_code = f"""
117-
spark.read.csv("s3://bucket/path")
118-
for i in range(10):
119-
df = spark.{method_name}({args})
120-
do_stuff_with_df(df)
121-
"""
122-
assert [
123-
Deprecation(
124-
code='direct-filesystem-access',
125-
message='The use of direct filesystem references is deprecated: ' 's3://bucket/path',
126-
start_line=2,
127-
start_col=0,
128-
end_line=2,
129-
end_col=34,
130-
),
131-
Deprecation(
132-
code='table-migrate',
133-
message='Table old.things is migrated to brand.new.stuff in Unity Catalog',
134-
start_line=4,
135-
start_col=9,
136-
end_line=4,
137-
end_col=17 + len(method_name) + len(args),
138-
),
139-
] == list(sqf.lint(old_code))
140-
141-
142-
@pytest.mark.parametrize("method_name", METHOD_NAMES)
143-
def test_spark_table_no_match(migration_index, method_name):
144-
spark_matchers = SparkMatchers()
145-
ftf = FromTable(migration_index, CurrentSessionState())
146-
sqf = SparkSql(ftf, migration_index)
147-
matcher = spark_matchers.matchers[method_name]
148-
args_list = ["a"] * min(5, matcher.max_args)
149-
args_list[matcher.table_arg_index] = '"table.we.know.nothing.about"'
150-
args = ",".join(args_list)
151-
old_code = f"""
152-
for i in range(10):
153-
df = spark.{method_name}({args})
154-
do_stuff_with_df(df)
155-
"""
156-
assert not list(sqf.lint(old_code))
157-
158-
159-
@pytest.mark.parametrize("method_name", METHOD_NAMES)
160-
def test_spark_table_too_many_args(migration_index, method_name):
161-
spark_matchers = SparkMatchers()
162-
ftf = FromTable(migration_index, CurrentSessionState())
163-
sqf = SparkSql(ftf, migration_index)
164-
matcher = spark_matchers.matchers[method_name]
165-
if matcher.max_args > 100:
166-
return
167-
args_list = ["a"] * (matcher.max_args + 1)
168-
args_list[matcher.table_arg_index] = '"table.we.know.nothing.about"'
169-
args = ",".join(args_list)
170-
old_code = f"""
171-
for i in range(10):
172-
df = spark.{method_name}({args})
173-
do_stuff_with_df(df)
174-
"""
175-
assert not list(sqf.lint(old_code))
176-
177-
178-
def test_spark_table_named_args(migration_index):
179-
ftf = FromTable(migration_index, CurrentSessionState())
180-
sqf = SparkSql(ftf, migration_index)
181-
old_code = """
182-
spark.read.csv("s3://bucket/path")
183-
for i in range(10):
184-
df = spark.saveAsTable(format="xyz", name="old.things")
185-
do_stuff_with_df(df)
186-
"""
187-
assert [
188-
Deprecation(
189-
code='direct-filesystem-access',
190-
message='The use of direct filesystem references is deprecated: ' 's3://bucket/path',
191-
start_line=2,
192-
start_col=0,
193-
end_line=2,
194-
end_col=34,
195-
),
196-
Deprecation(
197-
code='table-migrate',
198-
message='Table old.things is migrated to brand.new.stuff in Unity Catalog',
199-
start_line=4,
200-
start_col=9,
201-
end_line=4,
202-
end_col=59,
203-
),
204-
] == list(sqf.lint(old_code))
205-
206-
207-
def test_spark_table_variable_arg(migration_index):
208-
ftf = FromTable(migration_index, CurrentSessionState())
209-
sqf = SparkSql(ftf, migration_index)
210-
old_code = """
211-
spark.read.csv("s3://bucket/path")
212-
for i in range(10):
213-
df = spark.saveAsTable(name)
214-
do_stuff_with_df(df)
215-
"""
216-
assert [
217-
Deprecation(
218-
code='direct-filesystem-access',
219-
message='The use of direct filesystem references is deprecated: ' 's3://bucket/path',
220-
start_line=2,
221-
start_col=0,
222-
end_line=2,
223-
end_col=34,
224-
),
225-
Advisory(
226-
code='table-migrate',
227-
message="Can't migrate 'saveAsTable' because its table name argument is not a constant",
228-
start_line=4,
229-
start_col=9,
230-
end_line=4,
231-
end_col=32,
232-
),
233-
] == list(sqf.lint(old_code))
234-
235-
236-
def test_spark_table_fstring_arg(migration_index):
237-
ftf = FromTable(migration_index, CurrentSessionState())
238-
sqf = SparkSql(ftf, migration_index)
239-
old_code = """
240-
spark.read.csv("s3://bucket/path")
241-
for i in range(10):
242-
df = spark.saveAsTable(f"boop{stuff}")
243-
do_stuff_with_df(df)
244-
"""
245-
assert [
246-
Deprecation(
247-
code='direct-filesystem-access',
248-
message='The use of direct filesystem references is deprecated: ' 's3://bucket/path',
249-
start_line=2,
250-
start_col=0,
251-
end_line=2,
252-
end_col=34,
253-
),
254-
Advisory(
255-
code='table-migrate',
256-
message="Can't migrate 'saveAsTable' because its table name argument is not a constant",
257-
start_line=4,
258-
start_col=9,
259-
end_line=4,
260-
end_col=42,
261-
),
262-
] == list(sqf.lint(old_code))
263-
264-
265-
def test_spark_table_return_value(migration_index):
266-
ftf = FromTable(migration_index, CurrentSessionState())
267-
sqf = SparkSql(ftf, migration_index)
268-
old_code = """
269-
spark.read.csv("s3://bucket/path")
270-
for table in spark.listTables():
271-
do_stuff_with_table(table)
272-
"""
273-
assert [
274-
Deprecation(
275-
code='direct-filesystem-access',
276-
message='The use of direct filesystem references is deprecated: ' 's3://bucket/path',
277-
start_line=2,
278-
start_col=0,
279-
end_line=2,
280-
end_col=34,
281-
),
282-
Advisory(
283-
code='table-migrate',
284-
message="Call to 'listTables' will return a list of <catalog>.<database>.<table> instead of <database>.<table>.",
285-
start_line=3,
286-
start_col=13,
287-
end_line=3,
288-
end_col=31,
289-
),
290-
] == list(sqf.lint(old_code))
291-
292-
29390
def test_spark_table_return_value_apply(migration_index):
29491
ftf = FromTable(migration_index, CurrentSessionState())
29592
sqf = SparkSql(ftf, migration_index)
29693
old_code = """spark.read.csv('s3://bucket/path')
297-
for table in spark.listTables():
94+
for table in spark.catalog.listTables():
29895
do_stuff_with_table(table)"""
29996
fixed_code = sqf.apply(old_code)
30097
# no transformations to apply, only lint messages
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
## Check a literal reference to a known table that is migrated.
2+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
3+
spark.catalog.cacheTable("old.things")
4+
5+
## Check a literal reference to an unknown table (that is not migrated); we expect no warning.
6+
spark.catalog.cacheTable("table.we.know.nothing.about")
7+
8+
## Check that a call with too many positional arguments is ignored as (presumably) something else; we expect no warning.
9+
spark.catalog.cacheTable("old.things", None, "extra-argument")
10+
11+
## Check a call with an out-of-position named argument referencing a table known to be migrated.
12+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
13+
spark.catalog.cacheTable(storageLevel=None, tableName="old.things")
14+
15+
## Some calls that use a variable whose value is unknown: they could potentially reference a migrated table.
16+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'cacheTable' because its table name argument is not a constant
17+
spark.catalog.cacheTable(name)
18+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'cacheTable' because its table name argument is not a constant
19+
spark.catalog.cacheTable(f"boop{stuff}")
20+
21+
## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns.
22+
# FIXME: This is a false positive; any method named 'cacheTable' is triggering the warning.
23+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
24+
something_else.cacheTable("old.things")
25+
a_function("old.things")
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# ucx[direct-filesystem-access:+1:0:+1:0] The use of direct filesystem references is deprecated: s3://bucket/path
2+
spark.read.csv("s3://bucket/path")
3+
for i in range(10):
4+
5+
## Check a literal reference to a known table that is migrated.
6+
# ucx[table-migrate:+3:0:+3:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
7+
# TODO: Implement missing migration warning (on the source argument):
8+
# #ucx[table-migrate:+1:0:+1:0] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
9+
df = spark.catalog.createExternalTable("old.things")
10+
do_stuff_with(df)
11+
12+
## Check a literal reference to an unknown table (that is not migrated); we expect no warning.
13+
# TODO: Implement missing migration warning (on the source argument):
14+
# #ucx[table-migrate:+1:0:+1:0] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
15+
df = spark.catalog.createExternalTable("table.we.know.nothing.about")
16+
do_stuff_with(df)
17+
18+
## Check that a call with too many positional arguments is ignored as (presumably) something else; we expect no warning.
19+
# FIXME: This is a false positive due to an error in the matching specification; only 4 positional args are allowed.
20+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
21+
df = spark.catalog.createExternalTable("old.things", None, None, None, "extra-argument")
22+
do_stuff_with(df)
23+
24+
## Check a call with an out-of-position named argument referencing a table known to be migrated.
25+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
26+
df = spark.catalog.createExternalTable(path="foo", tableName="old.things", source="delta")
27+
do_stuff_with(df)
28+
29+
## Some calls that use a variable whose value is unknown: they could potentially reference a migrated table.
30+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'createExternalTable' because its table name argument is not a constant
31+
df = spark.catalog.createExternalTable(name)
32+
do_stuff_with(df)
33+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'createExternalTable' because its table name argument is not a constant
34+
df = spark.catalog.createExternalTable(f"boop{stuff}")
35+
do_stuff_with(df)
36+
37+
## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns.
38+
# FIXME: This is a false positive; any method named 'createExternalTable' is triggering the warning.
39+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
40+
something_else.createExternalTable("old.things")
41+
a_function("old.things")
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# ucx[direct-filesystem-access:+1:0:+1:0] The use of direct filesystem references is deprecated: s3://bucket/path
2+
spark.read.csv("s3://bucket/path")
3+
for i in range(10):
4+
5+
## Check a literal reference to a known table that is migrated.
6+
# ucx[table-migrate:+3:0:+3:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
7+
# TODO: Implement missing migration warning (on the source argument):
8+
# #ucx[table-migrate:+1:0:+1:0] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
9+
df = spark.catalog.createTable("old.things")
10+
do_stuff_with(df)
11+
12+
## Check a literal reference to an unknown table (that is not migrated); we expect no warning.
13+
# TODO: Implement missing migration warning (on the source argument):
14+
# #ucx[table-migrate:+1:0:+1:0] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
15+
df = spark.catalog.createTable("table.we.know.nothing.about")
16+
do_stuff_with(df)
17+
18+
## Check that a call with too many positional arguments is ignored as (presumably) something else; we expect no warning.
19+
# FIXME: This is a false positive due to an error in the matching specification; only 5 positional args are allowed.
20+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
21+
df = spark.catalog.createTable("old.things", None, None, None, None, "extra-argument")
22+
do_stuff_with(df)
23+
24+
## Check a call with an out-of-position named argument referencing a table known to be migrated.
25+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
26+
df = spark.catalog.createTable(path="foo", tableName="old.things", source="delta")
27+
do_stuff_with(df)
28+
29+
## Some calls that use a variable whose value is unknown: they could potentially reference a migrated table.
30+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'createTable' because its table name argument is not a constant
31+
df = spark.catalog.createTable(name)
32+
do_stuff_with(df)
33+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'createTable' because its table name argument is not a constant
34+
df = spark.catalog.createTable(f"boop{stuff}")
35+
do_stuff_with(df)
36+
37+
## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns.
38+
# FIXME: This is a false positive; any method named 'createTable' is triggering the warning.
39+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
40+
something_else.createTable("old.things")
41+
a_function("old.things")
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# ucx[direct-filesystem-access:+1:0:+1:0] The use of direct filesystem references is deprecated: s3://bucket/path
2+
spark.read.csv("s3://bucket/path")
3+
for i in range(10):
4+
5+
## Check a literal reference to a known table that is migrated.
6+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
7+
table = spark.catalog.getTable("old.things")
8+
do_stuff_with(table)
9+
10+
## Check a literal reference to an unknown table (that is not migrated); we expect no warning.
11+
table = spark.catalog.getTable("table.we.know.nothing.about")
12+
do_stuff_with(table)
13+
14+
## Check that a call with too many positional arguments is ignored as (presumably) something else; we expect no warning.
15+
table = spark.catalog.getTable("old.things", "extra-argument")
16+
do_stuff_with(table)
17+
18+
## Some calls that use a variable whose value is unknown: they could potentially reference a migrated table.
19+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'getTable' because its table name argument is not a constant
20+
table = spark.catalog.getTable(name)
21+
do_stuff_with(table)
22+
# ucx[table-migrate:+1:0:+1:0] Can't migrate 'getTable' because its table name argument is not a constant
23+
table = spark.catalog.getTable(f"boop{stuff}")
24+
do_stuff_with(table)
25+
26+
## Some trivial references to the method or table in unrelated contexts that should not trigger warnigns.
27+
# FIXME: This is a false positive; any method named 'getTable' is triggering the warning.
28+
# ucx[table-migrate:+1:0:+1:0] Table old.things is migrated to brand.new.stuff in Unity Catalog
29+
something_else.getTable("old.things")
30+
a_function("old.things")

0 commit comments

Comments
 (0)