Skip to content

Commit 2c25284

Browse files
gorskysd and Sean Gorsky
authored
Prod 411 hui adobe parser error databricks on azure (#11)
* SQL Events are no longer included unless both exist. Downstream product should be unaffected
* Updated version
* formatting
* Added test

Co-authored-by: Sean Gorsky <[email protected]>
1 parent feb5ebd commit 2c25284

File tree

4 files changed

+18
-4
lines changed

4 files changed

+18
-4
lines changed

spark_log_parser/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Tools for providing Spark event log"""
22

3-
__version__ = "0.1.0"
3+
__version__ = "0.1.1"

spark_log_parser/parsing_models/application_model_v2.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,14 @@ def getSQLinfo(self, appobj):
134134
sql_jobs = []
135135
sql_stages = []
136136
sql_tasks = []
137-
for jid, job in appobj.jobs.items():
138137

139-
if "end_time" not in sql.keys():
140-
sql["end_time"] = appobj.finish_time
138+
# Sometimes an SQL event will be missing. To be informative, both
139+
# events must be present. But this information is not critical, so
140+
# if either event is missing then simply reject the SQL data
141+
if "start_time" not in sql.keys() or "end_time" not in sql.keys():
142+
continue
143+
144+
for jid, job in appobj.jobs.items():
141145

142146
if (job.submission_time >= sql["start_time"]) and (
143147
job.submission_time <= sql["end_time"]
5.49 MB
Binary file not shown.

tests/test_parse.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,13 @@ def test_simple_emr_log():
6464
assert (
6565
parsed["metadata"]["application_info"]["name"] == "Text Similarity"
6666
), "Name is as expected"
67+
68+
69+
def test_emr_missing_sql_events():
70+
event_log_path = Path("tests", "logs", "emr_missing_sql_events.zip").resolve()
71+
72+
with tempfile.TemporaryDirectory() as temp_dir:
73+
event_log = eventlog.EventLogBuilder(event_log_path.as_uri(), temp_dir).build()
74+
obj = sparkApplication(eventlog=str(event_log))
75+
76+
assert list(obj.sqlData.index.values) == [0, 2, 3, 5, 6, 7, 8]

0 commit comments

Comments
 (0)