Skip to content

Commit b3064cb

Browse files
authored
Add orchestration execution tracing (#441)
* WIP of PoC of tracing. * Wiring tracing metadata through test server. * Ensure orchestration traces are parented correctly. * Check orchestration activities are same logical activity. * Test activity traces. * Assert tags of orchestration activities. * Sketch emitting client activity trace. * Exclude Rider files. * Attempt to capture activity/orchestration failures. * Stop orchestration trace when execution completes. * Update protos to match backend. * Add/set partent trace context to suborchestration action. * Port suborchestration tracing. * Add tracing for events sent by worker. * Add tracing for client-sent events. * Add tracing for orchestration timers. * Sketch basic tracing test. * Test activity types. * Add test for sent events. * Add test for timer event. * Add test for client raised events. * Move files to tracing namespace. * Clean up files. * Fixup test for suborchestration tracing. * Add test for suborchestration failure. * Revert change. * Eliminate the unnecessary activity ID (as it can be derived). * Fix up a number of warnings. * Fixup doc and style warnings. * Revert package props. * Updates per PR feedback. * Add orchestration status tag on completion. * Update for move of orchestration trace context to "outer" message. * Fix test breaks. * Updates per more PR feedback. * Update protobuf version.
1 parent 1d3dac5 commit b3064cb

22 files changed

+1773
-49
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,3 +348,6 @@ MigrationBackup/
348348

349349
# Ionide (cross platform F# VS Code tools) working folder
350350
.ionide/
351+
352+
# Rider (cross platform .NET/C# tools) working folder
353+
.idea/

src/Client/Grpc/GrpcDurableTaskClient.cs

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Text;
66
using Google.Protobuf.WellKnownTypes;
77
using Microsoft.DurableTask.Client.Entities;
8+
using Microsoft.DurableTask.Tracing;
89
using Microsoft.Extensions.DependencyInjection;
910
using Microsoft.Extensions.Logging;
1011
using Microsoft.Extensions.Options;
@@ -107,24 +108,6 @@ public override async Task<string> ScheduleNewOrchestrationInstanceAsync(
107108
}
108109
}
109110

110-
if (Activity.Current?.Id != null || Activity.Current?.TraceStateString != null)
111-
{
112-
if (request.ParentTraceContext == null)
113-
{
114-
request.ParentTraceContext = new P.TraceContext();
115-
}
116-
117-
if (Activity.Current?.Id != null)
118-
{
119-
request.ParentTraceContext.TraceParent = Activity.Current?.Id;
120-
}
121-
122-
if (Activity.Current?.TraceStateString != null)
123-
{
124-
request.ParentTraceContext.TraceState = Activity.Current?.TraceStateString;
125-
}
126-
}
127-
128111
DateTimeOffset? startAt = options?.StartAt;
129112
this.logger.SchedulingOrchestration(
130113
request.InstanceId,
@@ -138,6 +121,8 @@ public override async Task<string> ScheduleNewOrchestrationInstanceAsync(
138121
request.ScheduledStartTimestamp = Timestamp.FromDateTimeOffset(startAt.Value.ToUniversalTime());
139122
}
140123

124+
using Activity? newActivity = TraceHelper.StartActivityForNewOrchestration(request);
125+
141126
P.CreateInstanceResponse? result = await this.sidecarClient.StartInstanceAsync(
142127
request, cancellationToken: cancellation);
143128
return result.InstanceId;
@@ -159,6 +144,8 @@ public override async Task RaiseEventAsync(
159144
Input = this.DataConverter.Serialize(eventPayload),
160145
};
161146

147+
using Activity? traceActivity = TraceHelper.StartActivityForNewEventRaisedFromClient(request, instanceId);
148+
162149
await this.sidecarClient.RaiseEventAsync(request, cancellationToken: cancellation);
163150
}
164151

src/Grpc/orchestrator_service.proto

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ message EntityOperationCalledEvent {
193193
}
194194

195195
message EntityLockRequestedEvent {
196-
string criticalSectionId = 1;
196+
string criticalSectionId = 1;
197197
repeated string lockSet = 2;
198198
int32 position = 3;
199199
google.protobuf.StringValue parentInstanceId = 4; // used only within messages, null in histories
@@ -218,7 +218,7 @@ message EntityUnlockSentEvent {
218218
message EntityLockGrantedEvent {
219219
string criticalSectionId = 1;
220220
}
221-
221+
222222
message HistoryEvent {
223223
int32 eventId = 1;
224224
google.protobuf.Timestamp timestamp = 2;
@@ -245,8 +245,8 @@ message HistoryEvent {
245245
ExecutionResumedEvent executionResumed = 22;
246246
EntityOperationSignaledEvent entityOperationSignaled = 23;
247247
EntityOperationCalledEvent entityOperationCalled = 24;
248-
EntityOperationCompletedEvent entityOperationCompleted = 25;
249-
EntityOperationFailedEvent entityOperationFailed = 26;
248+
EntityOperationCompletedEvent entityOperationCompleted = 25;
249+
EntityOperationFailedEvent entityOperationFailed = 26;
250250
EntityLockRequestedEvent entityLockRequested = 27;
251251
EntityLockGrantedEvent entityLockGranted = 28;
252252
EntityUnlockSentEvent entityUnlockSent = 29;
@@ -258,13 +258,15 @@ message ScheduleTaskAction {
258258
google.protobuf.StringValue version = 2;
259259
google.protobuf.StringValue input = 3;
260260
map<string, string> tags = 4;
261+
TraceContext parentTraceContext = 5;
261262
}
262263

263264
message CreateSubOrchestrationAction {
264265
string instanceId = 1;
265266
string name = 2;
266267
google.protobuf.StringValue version = 3;
267268
google.protobuf.StringValue input = 4;
269+
TraceContext parentTraceContext = 5;
268270
}
269271

270272
message CreateTimerAction {
@@ -314,6 +316,11 @@ message OrchestratorAction {
314316
}
315317
}
316318

319+
message OrchestrationTraceContext {
320+
google.protobuf.StringValue spanID = 1;
321+
google.protobuf.Timestamp spanStartTime = 2;
322+
}
323+
317324
message OrchestratorRequest {
318325
string instanceId = 1;
319326
google.protobuf.StringValue executionId = 2;
@@ -322,6 +329,8 @@ message OrchestratorRequest {
322329
OrchestratorEntityParameters entityParameters = 5;
323330
bool requiresHistoryStreaming = 6;
324331
map<string, google.protobuf.Value> properties = 7;
332+
333+
OrchestrationTraceContext orchestrationTraceContext = 8;
325334
}
326335

327336
message OrchestratorResponse {
@@ -333,6 +342,8 @@ message OrchestratorResponse {
333342
// The number of work item events that were processed by the orchestrator.
334343
// This field is optional. If not set, the service should assume that the orchestrator processed all events.
335344
google.protobuf.Int32Value numEventsProcessed = 5;
345+
346+
OrchestrationTraceContext orchestrationTraceContext = 6;
336347
}
337348

338349
message CreateInstanceRequest {
@@ -498,7 +509,7 @@ message SignalEntityRequest {
498509
}
499510

500511
message SignalEntityResponse {
501-
// no payload
512+
// no payload
502513
}
503514

504515
message GetEntityRequest {
@@ -673,16 +684,16 @@ service TaskHubSidecarService {
673684

674685
// Waits for an orchestration instance to reach a running or completion state.
675686
rpc WaitForInstanceStart(GetInstanceRequest) returns (GetInstanceResponse);
676-
687+
677688
// Waits for an orchestration instance to reach a completion state (completed, failed, terminated, etc.).
678689
rpc WaitForInstanceCompletion(GetInstanceRequest) returns (GetInstanceResponse);
679690

680691
// Raises an event to a running orchestration instance.
681692
rpc RaiseEvent(RaiseEventRequest) returns (RaiseEventResponse);
682-
693+
683694
// Terminates a running orchestration instance.
684695
rpc TerminateInstance(TerminateRequest) returns (TerminateResponse);
685-
696+
686697
// Suspends a running orchestration instance.
687698
rpc SuspendInstance(SuspendRequest) returns (SuspendResponse);
688699

@@ -764,7 +775,7 @@ message CompleteTaskResponse {
764775
}
765776

766777
message HealthPing {
767-
// No payload
778+
// No payload
768779
}
769780

770781
message StreamInstanceHistoryRequest {
@@ -777,4 +788,4 @@ message StreamInstanceHistoryRequest {
777788

778789
message HistoryChunk {
779790
repeated HistoryEvent events = 1;
780-
}
791+
}

src/Grpc/versions.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
# The following files were downloaded from branch main at 2025-06-02 21:12:34 UTC
2-
https://raw.githubusercontent.com/microsoft/durabletask-protobuf/fd9369c6a03d6af4e95285e432b7c4e943c06970/protos/orchestrator_service.proto
1+
# The following files were downloaded from branch main at 2025-08-08 16:46:11 UTC
2+
https://raw.githubusercontent.com/microsoft/durabletask-protobuf/e88acbd07ae38b499dbe8c4e333e9e3feeb2a9cc/protos/orchestrator_service.proto

src/Shared/Grpc/ProtoUtils.cs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
using System.Buffers;
55
using System.Buffers.Text;
6+
using System.Diagnostics;
67
using System.Diagnostics.CodeAnalysis;
78
using System.Text;
89
using DurableTask.Core;
@@ -15,6 +16,7 @@
1516
using Google.Protobuf.WellKnownTypes;
1617
using DTCore = DurableTask.Core;
1718
using P = Microsoft.DurableTask.Protobuf;
19+
using TraceHelper = Microsoft.DurableTask.Tracing.TraceHelper;
1820

1921
namespace Microsoft.DurableTask;
2022

@@ -268,43 +270,72 @@ internal static Timestamp ToTimestamp(this DateTime dateTime)
268270
/// Constructs a <see cref="P.OrchestratorResponse" />.
269271
/// </summary>
270272
/// <param name="instanceId">The orchestrator instance ID.</param>
273+
/// <param name="executionId">The orchestrator execution ID.</param>
271274
/// <param name="customStatus">The orchestrator customer status or <c>null</c> if no custom status.</param>
272275
/// <param name="actions">The orchestrator actions.</param>
273276
/// <param name="completionToken">
274277
/// The completion token for the work item. It must be the exact same <see cref="P.WorkItem.CompletionToken" />
275278
/// value that was provided by the corresponding <see cref="P.WorkItem"/> that triggered the orchestrator execution.
276279
/// </param>
277280
/// <param name="entityConversionState">The entity conversion state, or null if no conversion is required.</param>
281+
/// <param name="orchestrationActivity">The <see cref="Activity" /> that represents orchestration execution.</param>
278282
/// <returns>The orchestrator response.</returns>
279283
/// <exception cref="NotSupportedException">When an orchestrator action is unknown.</exception>
280284
internal static P.OrchestratorResponse ConstructOrchestratorResponse(
281285
string instanceId,
286+
string executionId,
282287
string? customStatus,
283288
IEnumerable<OrchestratorAction> actions,
284289
string completionToken,
285-
EntityConversionState? entityConversionState)
290+
EntityConversionState? entityConversionState,
291+
Activity? orchestrationActivity)
286292
{
287293
Check.NotNull(actions);
288294
var response = new P.OrchestratorResponse
289295
{
290296
InstanceId = instanceId,
291297
CustomStatus = customStatus,
292298
CompletionToken = completionToken,
299+
OrchestrationTraceContext =
300+
new()
301+
{
302+
SpanID = orchestrationActivity?.SpanId.ToString(),
303+
SpanStartTime = orchestrationActivity?.StartTimeUtc.ToTimestamp(),
304+
},
293305
};
294306

295307
foreach (OrchestratorAction action in actions)
296308
{
297309
var protoAction = new P.OrchestratorAction { Id = action.Id };
298310

311+
P.TraceContext? CreateTraceContext()
312+
{
313+
if (orchestrationActivity is null)
314+
{
315+
return null;
316+
}
317+
318+
ActivitySpanId clientSpanId = ActivitySpanId.CreateRandom();
319+
ActivityContext clientActivityContext = new(orchestrationActivity.TraceId, clientSpanId, orchestrationActivity.ActivityTraceFlags, orchestrationActivity.TraceStateString);
320+
321+
return new P.TraceContext
322+
{
323+
TraceParent = $"00-{clientActivityContext.TraceId}-{clientActivityContext.SpanId}-0{clientActivityContext.TraceFlags:d}",
324+
TraceState = clientActivityContext.TraceState,
325+
};
326+
}
327+
299328
switch (action.OrchestratorActionType)
300329
{
301330
case OrchestratorActionType.ScheduleOrchestrator:
302331
var scheduleTaskAction = (ScheduleTaskOrchestratorAction)action;
332+
303333
protoAction.ScheduleTask = new P.ScheduleTaskAction
304334
{
305335
Name = scheduleTaskAction.Name,
306336
Version = scheduleTaskAction.Version,
307337
Input = scheduleTaskAction.Input,
338+
ParentTraceContext = CreateTraceContext(),
308339
};
309340

310341
if (scheduleTaskAction.Tags != null)
@@ -324,6 +355,7 @@ internal static P.OrchestratorResponse ConstructOrchestratorResponse(
324355
InstanceId = subOrchestrationAction.InstanceId,
325356
Name = subOrchestrationAction.Name,
326357
Version = subOrchestrationAction.Version,
358+
ParentTraceContext = CreateTraceContext(),
327359
};
328360
break;
329361
case OrchestratorActionType.CreateTimer:
@@ -378,6 +410,12 @@ internal static P.OrchestratorResponse ConstructOrchestratorResponse(
378410
Name = sendEventAction.EventName,
379411
Data = sendEventAction.EventData,
380412
};
413+
414+
// Distributed Tracing: start a new trace activity derived from the orchestration
415+
// for an EventRaisedEvent (external event)
416+
using Activity? traceActivity = TraceHelper.StartTraceActivityForEventRaisedFromWorker(sendEventAction, instanceId, executionId);
417+
418+
traceActivity?.Stop();
381419
}
382420

383421
break;

0 commit comments

Comments
 (0)