Skip to content

Commit e904f71

Browse files
authored
Add Datasets section with dataset schemas to the pipelines explorer (#1479)
## Changes Initially, datasets aren't loaded, but you can expand the Datasets section to trigger loading: <img width="507" alt="Screenshot 2024-12-05 at 10 14 41" src="https://github.com/user-attachments/assets/59d39eba-ded7-409a-b6df-54569622a1cb"> If we can't find dataset definitions in the latest runs, we show a "please run or validate" item: <img width="507" alt="Screenshot 2024-12-05 at 10 14 58" src="https://github.com/user-attachments/assets/889a3285-0d5c-4ea8-b70a-a9476fc9449e"> <img width="507" alt="Screenshot 2024-12-05 at 10 15 14" src="https://github.com/user-attachments/assets/4afcb134-8fa0-495c-956b-873a60c80215"> Running or validating the pipeline will also show a 'loading' state for the Datasets section: <img width="507" alt="Screenshot 2024-12-05 at 10 28 09" src="https://github.com/user-attachments/assets/3618ccc8-4c70-4841-a6fc-749c2f95497e"> ## Tests Unit and e2e tests
1 parent 27f8a9b commit e904f71

17 files changed

+581
-186
lines changed

packages/databricks-vscode/src/bundle/BundlePipelinesManager.test.ts

Lines changed: 114 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,21 @@ describe("BundlePipelinesManager", () => {
5454
const firstRun = {
5555
data: {creation_time: 10},
5656
events: [
57-
{origin: {dataset_name: "table1"}},
58-
{origin: {not_a_dataset_name: "table1.5"}},
59-
{origin: {dataset_name: "table2"}},
57+
{
58+
origin: {dataset_name: "table1"},
59+
details: {dataset_definition: {dataset_type: "TABLE"}},
60+
},
61+
{
62+
origin: {not_a_dataset_name: "table1.5"},
63+
},
64+
{
65+
origin: {dataset_name: "table2"},
66+
details: {dataset_definition: {dataset_type: "TABLE"}},
67+
},
68+
{
69+
origin: {dataset_name: "table2.5"},
70+
details: {dataset_definition: {dataset_type: "VIEW"}},
71+
},
6072
],
6173
};
6274
/* eslint-enable @typescript-eslint/naming-convention */
@@ -77,9 +89,17 @@ describe("BundlePipelinesManager", () => {
7789
refresh_selection: ["table3", "table4"],
7890
},
7991
events: [
80-
{origin: {dataset_name: "table3"}},
81-
{origin: {not_a_dataset_name: "table3.5"}},
82-
{origin: {dataset_name: "table4"}},
92+
{
93+
origin: {dataset_name: "table3"},
94+
details: {dataset_definition: {dataset_type: "TABLE"}},
95+
},
96+
{
97+
origin: {not_a_dataset_name: "table3.5"},
98+
},
99+
{
100+
origin: {dataset_name: "table4"},
101+
details: {dataset_definition: {dataset_type: "TABLE"}},
102+
},
83103
],
84104
};
85105
/* eslint-enable @typescript-eslint/naming-convention */
@@ -103,9 +123,17 @@ describe("BundlePipelinesManager", () => {
103123
state: "RUNNING",
104124
},
105125
events: [
106-
{origin: {dataset_name: "table_new"}},
107-
{origin: {not_a_dataset_name: "not a table"}},
108-
{origin: {dataset_name: "table_final"}},
126+
{
127+
origin: {dataset_name: "table_new"},
128+
details: {dataset_definition: {dataset_type: "TABLE"}},
129+
},
130+
{
131+
origin: {not_a_dataset_name: "not a table"},
132+
},
133+
{
134+
origin: {dataset_name: "table_final"},
135+
details: {dataset_definition: {dataset_type: "TABLE"}},
136+
},
109137
],
110138
};
111139
/* eslint-enable @typescript-eslint/naming-convention */
@@ -126,9 +154,17 @@ describe("BundlePipelinesManager", () => {
126154
state: "COMPLETED",
127155
},
128156
events: [
129-
{origin: {dataset_name: "table_new"}},
130-
{origin: {not_a_dataset_name: "not a table"}},
131-
{origin: {dataset_name: "table_final"}},
157+
{
158+
origin: {dataset_name: "table_new"},
159+
details: {dataset_definition: {dataset_type: "TABLE"}},
160+
},
161+
{
162+
origin: {not_a_dataset_name: "not a table"},
163+
},
164+
{
165+
origin: {dataset_name: "table_final"},
166+
details: {dataset_definition: {dataset_type: "TABLE"}},
167+
},
132168
],
133169
};
134170
/* eslint-enable @typescript-eslint/naming-convention */
@@ -137,12 +173,77 @@ describe("BundlePipelinesManager", () => {
137173
await clock.runToLastAsync();
138174

139175
// Only the datasets from the final full-refresh run should be left
140-
datasets = manager.getDatasets("pipeline1");
176+
datasets = manager.getDatasets("pipelines.pipeline1");
141177
assert.strictEqual(datasets.size, 2);
142178
assert(datasets.has("table_new"));
143179
assert(datasets.has("table_final"));
144180
});
145181

182+
it("should extract pipeline schemas from run events", async () => {
183+
const remoteState = {resources: {pipelines: {pipeline1: {}}}};
184+
when(configModel.get("remoteStateConfig")).thenResolve(remoteState);
185+
const runStatuses = new Map();
186+
when(runStatusManager.runStatuses).thenReturn(runStatuses);
187+
188+
/* eslint-disable @typescript-eslint/naming-convention */
189+
const firstRun = {
190+
data: {creation_time: 10},
191+
events: [
192+
{
193+
origin: {dataset_name: "table1"},
194+
details: {
195+
dataset_definition: {
196+
dataset_type: "TABLE",
197+
schema: [
198+
{name: "col1", data_type: "STRING"},
199+
{name: "col2", not_a_data_type: "INTEGER"},
200+
],
201+
},
202+
},
203+
},
204+
{
205+
origin: {not_a_dataset_name: "table1.5"},
206+
},
207+
{
208+
origin: {dataset_name: "table2"},
209+
details: {
210+
dataset_definition: {dataset_type: "TABLE", schema: []},
211+
},
212+
},
213+
{
214+
origin: {dataset_name: "table3"},
215+
details: {
216+
dataset_definition: {
217+
dataset_type: "VIEW",
218+
schema: [{name: "col1", data_type: "STRING"}],
219+
},
220+
},
221+
},
222+
],
223+
};
224+
/* eslint-enable @typescript-eslint/naming-convention */
225+
runStatuses.set("pipelines.pipeline1", firstRun);
226+
227+
eventEmitter.fire();
228+
await clock.runToLastAsync();
229+
230+
const schemas = manager.getSchemas("pipelines.pipeline1");
231+
assert.strictEqual(schemas.size, 2);
232+
assert.deepStrictEqual(schemas.get("table1"), {
233+
name: "table1",
234+
type: "TABLE",
235+
schema: [
236+
{name: "col1", type: "STRING"},
237+
{name: "col2", type: ""},
238+
],
239+
});
240+
assert.deepStrictEqual(schemas.get("table3"), {
241+
name: "table3",
242+
type: "VIEW",
243+
schema: [{name: "col1", type: "STRING"}],
244+
});
245+
});
246+
146247
describe("locationToRange", () => {
147248
it("should return correct range for a given location in a text file", async () => {
148249
const uri = Uri.file("/path/to/file.py");

packages/databricks-vscode/src/bundle/BundlePipelinesManager.ts

Lines changed: 68 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,22 @@ type RunState = {
4040
events: PipelineEvent[] | undefined;
4141
};
4242

43-
type PipelineState = {
44-
key: string;
43+
export type DatasetWithSchema = {
44+
name: string;
45+
type: string;
46+
schema: Array<{name: string; type: string}>;
47+
};
48+
49+
type ResolvedPipelineState = {
4550
datasets: Set<string>;
46-
runs: Set<RunState>;
51+
schemas: Map<string, DatasetWithSchema>;
4752
};
53+
type PreloadedPipelineState = Promise<ResolvedPipelineState | undefined>;
4854

49-
type PreloadedPipelineState = Promise<Set<string> | undefined>;
55+
type PipelineState = {
56+
key: string;
57+
runs: Set<RunState>;
58+
} & ResolvedPipelineState;
5059

5160
type Pick = QuickPickItem & {isDataset?: boolean};
5261

@@ -78,6 +87,7 @@ export class BundlePipelinesManager {
7887
this.configModel.onDidChangeTarget(() => {
7988
this.updateTriggeredPipelinesState();
8089
this.updateDiagnostics();
90+
this.preloadedState.clear();
8191
}),
8292
this.configModel.onDidChangeKey("remoteStateConfig")(async () => {
8393
this.updateTriggeredPipelinesState();
@@ -104,6 +114,7 @@ export class BundlePipelinesManager {
104114
this.triggeredState.set(pipelineKey, {
105115
key: pipelineKey,
106116
datasets: new Set(),
117+
schemas: new Map(),
107118
runs: new Set(),
108119
});
109120
}
@@ -113,7 +124,9 @@ export class BundlePipelinesManager {
113124
);
114125
if (runStatus) {
115126
state.runs.add(runStatus as PipelineRunStatus);
116-
state.datasets = extractPipelineDatasets(state.runs);
127+
const extractedData = extractPipelineDatasets(state.runs);
128+
state.datasets = extractedData.datasets;
129+
state.schemas = extractedData.schemas;
117130
}
118131
});
119132
}
@@ -242,10 +255,19 @@ export class BundlePipelinesManager {
242255
}
243256

244257
public getDatasets(pipelineKey: string) {
245-
return this.triggeredState.get(pipelineKey)?.datasets ?? new Set();
258+
const key = pipelineKey.split(".")[1] ?? pipelineKey;
259+
return this.triggeredState.get(key)?.datasets ?? new Set();
260+
}
261+
262+
public getSchemas(pipelineKey: string) {
263+
const key = pipelineKey.split(".")[1] ?? pipelineKey;
264+
return (
265+
this.triggeredState.get(key ?? pipelineKey)?.schemas ??
266+
new Map<string, DatasetWithSchema>()
267+
);
246268
}
247269

248-
async preloadDatasets(pipelineKey: string): PreloadedPipelineState {
270+
public async preloadDatasets(pipelineKey: string): PreloadedPipelineState {
249271
const remoteState = await this.configModel.get("remoteStateConfig");
250272
if (!remoteState) {
251273
return undefined;
@@ -267,13 +289,13 @@ export class BundlePipelinesManager {
267289
return preloaded;
268290
}
269291

270-
const barrier = new Barrier<Set<string>>();
292+
const barrier = new Barrier<ResolvedPipelineState>();
271293
this.preloadedState.set(pipelineKey, barrier.promise);
272294

273295
try {
274296
const runs = await this.preloadUpdates(client, pipelineId);
275297
if (!runs) {
276-
barrier.resolve(new Set());
298+
barrier.resolve({datasets: new Set(), schemas: new Map()});
277299
return barrier.promise;
278300
}
279301
const listing = this.createPreloadEventsRequest(
@@ -287,8 +309,10 @@ export class BundlePipelinesManager {
287309
runState.events.push(event);
288310
}
289311
}
290-
const datasets = extractPipelineDatasets(new Set(runs.values()));
291-
barrier.resolve(datasets);
312+
const extractedData = extractPipelineDatasets(
313+
new Set(runs.values())
314+
);
315+
barrier.resolve(extractedData);
292316
} catch (e) {
293317
barrier.reject(e);
294318
}
@@ -360,9 +384,9 @@ export class BundlePipelinesManager {
360384
disposables
361385
);
362386
this.preloadDatasets(key)
363-
.then((preloadedDatasets) => {
364-
if (preloadedDatasets && isUIVisible) {
365-
for (const dataset of preloadedDatasets) {
387+
.then((preloadedData) => {
388+
if (preloadedData && isUIVisible) {
389+
for (const dataset of preloadedData.datasets) {
366390
knownDatasets.add(dataset);
367391
}
368392
updateItems(ui, knownDatasets);
@@ -418,7 +442,7 @@ async function confirmFullRefresh() {
418442
);
419443
}
420444

421-
function isFullGraphUpdate(update?: UpdateInfo) {
445+
export function isFullGraphUpdate(update?: UpdateInfo) {
422446
if (!update || update.state !== "COMPLETED") {
423447
return false;
424448
}
@@ -429,37 +453,48 @@ function isFullGraphUpdate(update?: UpdateInfo) {
429453
);
430454
}
431455

432-
// "details" is not a publicly documented field
433-
function extractDatasetName(
434-
event: PipelineEvent & {details?: any}
435-
): string | undefined {
436-
if (!event.origin?.dataset_name) {
437-
return;
438-
}
439-
// VIEWs can't be used for a partial refresh (they are always refreshed)
440-
if (event.details?.dataset_definition?.dataset_type === "VIEW") {
441-
return;
442-
}
443-
return event.origin.dataset_name;
444-
}
445-
446-
function extractPipelineDatasets(runs: Set<RunState>) {
456+
function extractPipelineDatasets(runs: Set<RunState>): ResolvedPipelineState {
447457
const datasets = new Set<string>();
458+
const schemas = new Map<string, DatasetWithSchema>();
448459
const runsByStartTimeDesc = Array.from(runs).sort(
449460
(a, b) => (b.data?.creation_time ?? 0) - (a.data?.creation_time ?? 0)
450461
);
451462
for (const run of runsByStartTimeDesc) {
452463
for (const event of run.events ?? []) {
453-
const datasetName = extractDatasetName(event);
454-
if (datasetName) {
464+
const datasetName = event.origin?.dataset_name;
465+
// 'details' is not documented, but it's safe to rely on if it exists
466+
// @ts-expect-error Property 'details' does not exist
467+
const definition = event.details?.dataset_definition;
468+
if (!datasetName || !definition) {
469+
continue;
470+
}
471+
const datasetType = definition.dataset_type ?? "";
472+
if (datasetType && datasetType !== "VIEW") {
455473
datasets.add(datasetName);
456474
}
475+
if (
476+
Array.isArray(definition.schema) &&
477+
definition.schema.length > 0
478+
) {
479+
const schema = definition.schema.map(
480+
// eslint-disable-next-line @typescript-eslint/naming-convention
481+
(field: {name?: string; data_type?: string}) => ({
482+
name: field.name ?? "",
483+
type: field.data_type ?? "",
484+
})
485+
);
486+
schemas.set(datasetName, {
487+
name: datasetName,
488+
type: datasetType,
489+
schema,
490+
});
491+
}
457492
}
458493
if (isFullGraphUpdate(run.data)) {
459494
break;
460495
}
461496
}
462-
return datasets;
497+
return {datasets, schemas};
463498
}
464499

465500
function createPicks(datasets: Set<string>, manualValue?: string) {

0 commit comments

Comments
 (0)