Add comprehensive tracing instrumentation and tests

Copilot · rootfs · Copilot · commit 2d4ecc5895f2 · 2025-10-02T23:09:24.000Z
Co-authored-by: rootfs <7062400+rootfs@users.noreply.github.com>
diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go
@@ -560,14 +560,50 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 		}
 
 		if classificationText != "" {
+			// Start classification span
+			classifyCtx, classifySpan := observability.StartSpan(ctx.TraceContext, observability.SpanClassification)
+			classifyStart := time.Now()
+			
 			// Find the most similar task description or classify, then select best model
 			matchedModel := r.classifyAndSelectBestModel(classificationText)
+			classifyTime := time.Since(classifyStart).Milliseconds()
+			
+			// Get category information for the span
+			categoryName := r.findCategoryForClassification(classificationText)
+			
+			observability.SetSpanAttributes(classifySpan,
+				attribute.String(observability.AttrCategoryName, categoryName),
+				attribute.String(observability.AttrClassifierType, "bert"),
+				attribute.Int64(observability.AttrClassificationTimeMs, classifyTime))
+			classifySpan.End()
+			ctx.TraceContext = classifyCtx
+			
 			if matchedModel != originalModel && matchedModel != "" {
-				// Get detected PII for policy checking
+				// Start PII detection span if enabled
 				allContent := pii.ExtractAllContent(userContent, nonUserMessages)
 				if r.PIIChecker.IsPIIEnabled(matchedModel) {
+					piiCtx, piiSpan := observability.StartSpan(ctx.TraceContext, observability.SpanPIIDetection)
+					piiStart := time.Now()
+					
 					observability.Infof("PII policy enabled for model %s", matchedModel)
 					detectedPII := r.Classifier.DetectPIIInContent(allContent)
+					
+					piiTime := time.Since(piiStart).Milliseconds()
+					piiDetected := len(detectedPII) > 0
+					
+					observability.SetSpanAttributes(piiSpan,
+						attribute.Bool(observability.AttrPIIDetected, piiDetected),
+						attribute.Int64(observability.AttrPIIDetectionTimeMs, piiTime))
+					
+					if piiDetected {
+						// Convert detected PII to comma-separated string
+						piiTypesStr := strings.Join(detectedPII, ",")
+						observability.SetSpanAttributes(piiSpan,
+							attribute.String(observability.AttrPIITypes, piiTypesStr))
+					}
+					
+					piiSpan.End()
+					ctx.TraceContext = piiCtx
 
 					// Check if the initially selected model passes PII policy
 					allowed, deniedPII, err := r.PIIChecker.CheckPolicy(matchedModel, detectedPII)
@@ -622,6 +658,9 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 
 				observability.Infof("Routing to model: %s", matchedModel)
 
+				// Start routing decision span
+				routingCtx, routingSpan := observability.StartSpan(ctx.TraceContext, observability.SpanRoutingDecision)
+				
 				// Check reasoning mode for this category using entropy-based approach
 				useReasoning, categoryName, reasoningDecision := r.getEntropyBasedReasoningModeAndCategory(userContent)
 				observability.Infof("Entropy-based reasoning decision for this query: %v on [%s] model (confidence: %.3f, reason: %s)",
@@ -630,6 +669,18 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				effortForMetrics := r.getReasoningEffort(categoryName)
 				metrics.RecordReasoningDecision(categoryName, matchedModel, useReasoning, effortForMetrics)
 
+				// Set routing attributes on span
+				observability.SetSpanAttributes(routingSpan,
+					attribute.String(observability.AttrRoutingStrategy, "auto"),
+					attribute.String(observability.AttrRoutingReason, reasoningDecision.DecisionReason),
+					attribute.String(observability.AttrOriginalModel, originalModel),
+					attribute.String(observability.AttrSelectedModel, matchedModel),
+					attribute.Bool(observability.AttrReasoningEnabled, useReasoning),
+					attribute.String(observability.AttrReasoningEffort, effortForMetrics))
+				
+				routingSpan.End()
+				ctx.TraceContext = routingCtx
+
 				// Track VSR decision information
 				ctx.VSRSelectedCategory = categoryName
 				ctx.VSRSelectedModel = matchedModel
@@ -645,14 +696,28 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				// Update the actual model that will be used
 				actualModel = matchedModel
 
+				// Start backend selection span
+				backendCtx, backendSpan := observability.StartSpan(ctx.TraceContext, observability.SpanBackendSelection)
+				
 				// Select the best endpoint for this model
 				endpointAddress, endpointFound := r.Config.SelectBestEndpointAddressForModel(matchedModel)
 				if endpointFound {
 					selectedEndpoint = endpointAddress
 					observability.Infof("Selected endpoint address: %s for model: %s", selectedEndpoint, matchedModel)
+					
+					// Extract endpoint name from config
+					endpoints := r.Config.GetEndpointsForModel(matchedModel)
+					if len(endpoints) > 0 {
+						observability.SetSpanAttributes(backendSpan,
+							attribute.String(observability.AttrEndpointName, endpoints[0].Name),
+							attribute.String(observability.AttrEndpointAddress, selectedEndpoint))
+					}
 				} else {
 					observability.Warnf("No endpoint found for model %s, using fallback", matchedModel)
 				}
+				
+				backendSpan.End()
+				ctx.TraceContext = backendCtx
 
 				// Modify the model in the request
 				openAIRequest.Model = openai.ChatModel(matchedModel)
@@ -688,21 +753,35 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 					}
 
 					if category != nil && category.SystemPrompt != "" && category.IsSystemPromptEnabled() {
+						// Start system prompt injection span
+						promptCtx, promptSpan := observability.StartSpan(ctx.TraceContext, observability.SpanSystemPromptInjection)
+						
 						mode := category.GetSystemPromptMode()
 						var injected bool
 						modifiedBody, injected, err = addSystemPromptToRequestBody(modifiedBody, category.SystemPrompt, mode)
 						if err != nil {
 							observability.Errorf("Error adding system prompt to request: %v", err)
+							observability.RecordError(promptSpan, err)
 							metrics.RecordRequestError(actualModel, "serialization_error")
+							promptSpan.End()
 							return nil, status.Errorf(codes.Internal, "error adding system prompt: %v", err)
 						}
+						
+						observability.SetSpanAttributes(promptSpan,
+							attribute.Bool("system_prompt.injected", injected),
+							attribute.String("system_prompt.mode", mode),
+							attribute.String(observability.AttrCategoryName, categoryName))
+						
 						if injected {
 							ctx.VSRInjectedSystemPrompt = true
 							observability.Infof("Added category-specific system prompt for category: %s (mode: %s)", categoryName, mode)
 						}
 
 						// Log metadata about system prompt injection (avoid logging sensitive user data)
 						observability.Infof("System prompt injection completed for category: %s, body size: %d bytes", categoryName, len(modifiedBody))
+						
+						promptSpan.End()
+						ctx.TraceContext = promptCtx
 					} else if category != nil && category.SystemPrompt != "" && !category.IsSystemPromptEnabled() {
 						observability.Infof("System prompt disabled for category: %s", categoryName)
 					}
diff --git a/src/semantic-router/pkg/observability/tracing_test.go b/src/semantic-router/pkg/observability/tracing_test.go
@@ -0,0 +1,217 @@
+package observability
+
+import (
+	"context"
+	"testing"
+
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+)
+
+// TestTracingConfiguration passes several TracingConfig values to InitTracing
+// and checks that an error comes back only for cases that expect one. When
+// initialization succeeds, tracing is shut down again before the next case.
+func TestTracingConfiguration(t *testing.T) {
+	// Enabled stdout configuration shared by the cases that turn tracing on.
+	alwaysOn := TracingConfig{
+		Enabled:               true,
+		Provider:              "opentelemetry",
+		ExporterType:          "stdout",
+		SamplingType:          "always_on",
+		ServiceName:           "test-service",
+		ServiceVersion:        "v1.0.0",
+		DeploymentEnvironment: "test",
+	}
+
+	// Same settings, switched to probabilistic sampling at a rate of 0.5.
+	halfSampled := alwaysOn
+	halfSampled.SamplingType = "probabilistic"
+	halfSampled.SamplingRate = 0.5
+
+	cases := []struct {
+		name      string
+		cfg       TracingConfig
+		expectErr bool
+	}{
+		{
+			name:      "disabled tracing",
+			cfg:       TracingConfig{Enabled: false},
+			expectErr: false,
+		},
+		{
+			name:      "stdout exporter",
+			cfg:       alwaysOn,
+			expectErr: false,
+		},
+		{
+			name:      "probabilistic sampling",
+			cfg:       halfSampled,
+			expectErr: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			initErr := InitTracing(context.Background(), tc.cfg)
+			if failed := initErr != nil; failed != tc.expectErr {
+				t.Errorf("InitTracing() error = %v, wantErr %v", initErr, tc.expectErr)
+			}
+
+			if initErr != nil {
+				return
+			}
+
+			// The shutdown result is intentionally discarded.
+			_ = ShutdownTracing(context.Background())
+		})
+	}
+}
+
+// TestSpanCreation initializes tracing, starts a span, and exercises the
+// attribute, error-recording and status calls on it before ending it. It then
+// checks that StartSpan also returned a non-nil context.
+func TestSpanCreation(t *testing.T) {
+	rootCtx := context.Background()
+
+	// Tracing is initialized with the stdout exporter and always_on sampling.
+	tracingCfg := TracingConfig{
+		Enabled:               true,
+		Provider:              "opentelemetry",
+		ExporterType:          "stdout",
+		SamplingType:          "always_on",
+		ServiceName:           "test-service",
+		ServiceVersion:        "v1.0.0",
+		DeploymentEnvironment: "test",
+	}
+
+	if initErr := InitTracing(rootCtx, tracingCfg); initErr != nil {
+		t.Fatalf("Failed to initialize tracing: %v", initErr)
+	}
+	defer func() {
+		// The shutdown result is intentionally discarded during teardown.
+		_ = ShutdownTracing(context.Background())
+	}()
+
+	childCtx, sp := StartSpan(rootCtx, SpanRequestReceived)
+	if sp == nil {
+		t.Fatal("StartSpan returned nil span")
+	}
+
+	// Attach two string attributes through the package helper.
+	requestID := attribute.String(AttrRequestID, "test-request-123")
+	modelName := attribute.String(AttrModelName, "gpt-4")
+	SetSpanAttributes(sp, requestID, modelName)
+
+	// Record an error on the span and set its status to codes.Error.
+	RecordError(sp, context.Canceled)
+	sp.SetStatus(codes.Error, "test error")
+
+	sp.End()
+
+	// The context is only inspected after the span has been ended.
+	if childCtx == nil {
+		t.Fatal("StartSpan returned nil context")
+	}
+}
+
+// TestTraceContextPropagation injects the context of a started span into a
+// header map and then extracts a context back out of those same headers. It
+// only checks that headers were written and that a non-nil context came back.
+func TestTraceContextPropagation(t *testing.T) {
+	baseCtx := context.Background()
+
+	initErr := InitTracing(baseCtx, TracingConfig{
+		Enabled:               true,
+		Provider:              "opentelemetry",
+		ExporterType:          "stdout",
+		SamplingType:          "always_on",
+		ServiceName:           "test-service",
+		ServiceVersion:        "v1.0.0",
+		DeploymentEnvironment: "test",
+	})
+	if initErr != nil {
+		t.Fatalf("Failed to initialize tracing: %v", initErr)
+	}
+	defer func() {
+		// The shutdown result is intentionally discarded during teardown.
+		_ = ShutdownTracing(context.Background())
+	}()
+
+	// Start a span so there is trace context to inject; it is ended on return.
+	activeCtx, sp := StartSpan(baseCtx, "test-span")
+	defer sp.End()
+
+	// Inject into an empty header map; at least one entry must be written.
+	carrier := map[string]string{}
+	InjectTraceContext(activeCtx, carrier)
+	if len(carrier) == 0 {
+		t.Error("InjectTraceContext did not inject any headers")
+	}
+
+	// Extraction starts from the base context, not the span's context.
+	restored := ExtractTraceContext(baseCtx, carrier)
+	if restored == nil {
+		t.Error("ExtractTraceContext returned nil context")
+	}
+}
+
+// TestGetTracerWhenNotInitialized calls GetTracer without InitTracing in this
+// test and checks the tracer can start and end a span. Nil results abort with
+// Fatal, since continuing would call a method on the nil value and panic.
+func TestGetTracerWhenNotInitialized(t *testing.T) {
+	tracer := GetTracer()
+	if tracer == nil {
+		t.Fatal("GetTracer returned nil when not initialized")
+	}
+
+	_, span := tracer.Start(context.Background(), "test-span")
+	if span == nil {
+		t.Fatal("Noop tracer returned nil span")
+	}
+	span.End()
+}
+
+// TestSpanAttributeConstants checks that the listed span-name and attribute-key
+// constants are non-empty, and that each span name has at least 10 bytes.
+func TestSpanAttributeConstants(t *testing.T) {
+	const minSpanNameLen = 10
+
+	spanConsts := []string{
+		SpanRequestReceived,
+		SpanClassification,
+		SpanPIIDetection,
+		SpanJailbreakDetection,
+		SpanCacheLookup,
+		SpanRoutingDecision,
+		SpanBackendSelection,
+		SpanUpstreamRequest,
+		SpanResponseProcessing,
+		SpanToolSelection,
+		SpanSystemPromptInjection,
+	}
+	for _, spanName := range spanConsts {
+		if len(spanName) == 0 {
+			t.Errorf("Span name constant is empty")
+		}
+		if len(spanName) < minSpanNameLen {
+			t.Errorf("Span name %q is too short", spanName)
+		}
+	}
+
+	attrConsts := []string{
+		AttrRequestID,
+		AttrModelName,
+		AttrCategoryName,
+		AttrRoutingStrategy,
+		AttrPIIDetected,
+		AttrJailbreakDetected,
+		AttrCacheHit,
+		AttrReasoningEnabled,
+	}
+	for idx := range attrConsts {
+		if attrConsts[idx] == "" {
+			t.Errorf("Attribute key constant is empty")
+		}
+	}
+}