|
265 | 265 | .WithName("SearchDocuments") |
266 | 266 | .WithOpenApi(); |
267 | 267 |
|
// Diagnostic endpoint for analyzing embeddings and search quality.
// GET /api/diagnostics/embedding?text=...
//   text             - the text to embed (required query parameter).
//   X-GitHub-Token   - optional request header; forwarded as-is to the embedding service.
// Returns 200 with summary statistics of the generated vector (dimensions, first 10 values,
// magnitude, min/max/average and the count of components whose absolute value exceeds 0.001).
// Any exception is logged and reported as 400 with the exception message.
app.MapGet("/api/diagnostics/embedding", async (HttpContext context, string text, IEmbeddingService embeddingService, ILogger<Program> logger) =>
{
    try
    {
        var githubToken = context.Request.Headers["X-GitHub-Token"].FirstOrDefault();
        var hasToken = !string.IsNullOrEmpty(githubToken) && IsValidGitHubToken(githubToken);

        logger.LogInformation("Generating embedding for diagnostic - Text: '{Text}', HasToken: {HasToken}", text, hasToken);

        var embedding = await embeddingService.GenerateEmbeddingAsync(text, githubToken);

        // Materialize the vector once; every statistic below reads from this single copy
        // instead of re-copying the span for each aggregate.
        var values = embedding.ToArray();

        // Min/Max/Average throw on an empty sequence, so an empty embedding is reported
        // as all-zero statistics rather than surfacing as an opaque error.
        var isEmpty = values.Length == 0;

        var stats = new
        {
            Text = text,
            // NOTE(review): this label reflects only the header validation above; it presumably
            // mirrors what the embedding service does with the token - confirm against the service.
            EmbeddingType = hasToken ? "GitHub Models" : "Simple Hash",
            HasGitHubToken = hasToken,
            TokenLength = githubToken?.Length ?? 0,
            EmbeddingDimensions = values.Length,
            EmbeddingSample = values[..Math.Min(10, values.Length)], // First 10 values
            EmbeddingMagnitude = Math.Sqrt(values.Sum(x => x * x)),
            EmbeddingStats = new
            {
                Min = isEmpty ? 0f : values.Min(),
                Max = isEmpty ? 0f : values.Max(),
                Average = isEmpty ? 0f : values.Average(),
                NonZeroCount = values.Count(x => Math.Abs(x) > 0.001f)
            }
        };

        return Results.Ok(stats);
    }
    catch (Exception ex)
    {
        logger.LogError(ex, "Error generating diagnostic embedding for text: {Text}", text);
        return Results.BadRequest(new { Error = ex.Message });
    }
})
.WithName("DiagnosticEmbedding")
.WithOpenApi();
| 308 | + |
// Diagnostic endpoint for searching with detailed analysis.
// GET /api/diagnostics/search?query=...&limit=...
//   query            - the search text (required query parameter).
//   limit            - optional maximum number of results; defaults to 10.
//   X-GitHub-Token   - optional request header; forwarded as-is to the embedding service.
// Embeds the query, runs a similarity search with a threshold of 0.0, and returns 200 with
// per-result details (rank, score, literal text matches) plus query-embedding statistics.
// Any exception is logged and reported as 400 with the exception message.
app.MapGet("/api/diagnostics/search", async (HttpContext context, string query, int? limit, IVectorStorageService vectorStorage, IEmbeddingService embeddingService, ILogger<Program> logger) =>
{
    try
    {
        var searchLimit = limit ?? 10;
        var githubToken = context.Request.Headers["X-GitHub-Token"].FirstOrDefault();
        var hasToken = !string.IsNullOrEmpty(githubToken) && IsValidGitHubToken(githubToken);

        logger.LogInformation("=== DIAGNOSTIC SEARCH ===");
        logger.LogInformation("Query: '{Query}', HasToken: {HasToken}", query, hasToken);

        // Generate query embedding
        var queryEmbedding = await embeddingService.GenerateEmbeddingAsync(query, githubToken);

        // Get raw search results with very low threshold
        var results = await vectorStorage.SearchSimilarAsync(queryEmbedding, searchLimit, 0.0f);

        var diagnosticResults = results.Select((r, index) =>
        {
            var description = r.Document.Description;

            // Evaluate each literal (case-insensitive) match once and reuse it below.
            var titleMatch = r.Document.Title?.Contains(query, StringComparison.OrdinalIgnoreCase) == true;
            var descriptionMatch = description?.Contains(query, StringComparison.OrdinalIgnoreCase) == true;

            return new
            {
                Rank = index + 1,
                Id = r.Document.Id,
                Title = r.Document.Title,
                // Append the ellipsis only when the text was actually cut; a missing description
                // stays null instead of turning into a bare "...".
                Description = description?.Length > 200 ? description.Substring(0, 200) + "..." : description,
                Similarity = r.Score,
                SimilarityPercent = Math.Round(r.Score * 100, 2),
                ContainsQueryTerm = titleMatch || descriptionMatch,
                TitleMatch = titleMatch,
                DescriptionMatch = descriptionMatch
            };
        }).ToList();

        // Single copy of the query vector for the statistics below.
        var queryValues = queryEmbedding.ToArray();
        var textMatchCount = diagnosticResults.Count(r => r.ContainsQueryTerm);

        var analysis = new
        {
            Query = query,
            EmbeddingType = hasToken ? "GitHub Models" : "Simple Hash",
            HasGitHubToken = hasToken,
            QueryEmbeddingStats = new
            {
                Dimensions = queryValues.Length,
                Magnitude = Math.Sqrt(queryValues.Sum(x => x * x)),
                Sample = queryValues[..Math.Min(5, queryValues.Length)]
            },
            TotalResults = diagnosticResults.Count,
            ResultsWithTextMatch = textMatchCount,
            // NOTE(review): first/last are reported as highest/lowest, which assumes the store
            // returns results ordered by descending score - confirm against SearchSimilarAsync.
            HighestSimilarity = diagnosticResults.FirstOrDefault()?.Similarity ?? 0,
            LowestSimilarity = diagnosticResults.LastOrDefault()?.Similarity ?? 0,
            Results = diagnosticResults
        };

        logger.LogInformation("Diagnostic complete - {ResultCount} results, {TextMatches} contain query term",
            diagnosticResults.Count, textMatchCount);

        return Results.Ok(analysis);
    }
    catch (Exception ex)
    {
        logger.LogError(ex, "Error in diagnostic search for query: {Query}", query);
        return Results.BadRequest(new { Error = ex.Message });
    }
})
.WithName("DiagnosticSearch")
.WithOpenApi();
| 372 | + |
// Diagnostic endpoint to browse ingested documents.
// GET /api/documents?search=...&limit=...
//   search - optional text; when non-empty, only documents whose title or description
//            contains it (case-insensitive) are returned.
//   limit  - optional maximum number of documents to load; defaults to 50. The limit is
//            applied when loading, before the search filter runs.
// Returns 200 with the number of loaded documents, the search term, the number of matching
// documents and the (possibly filtered) document list with descriptions cut to 200 characters.
// Any exception is logged and reported as 400 with the exception message.
app.MapGet("/api/documents", async (IVectorStorageService vectorStorage, ILogger<Program> logger, string? search = null, int? limit = null) =>
{
    try
    {
        var searchLimit = limit ?? 50;

        // Materialize once: the sequence is both projected and counted below, and
        // re-enumerating a deferred sequence would repeat the underlying work.
        var documents = (await vectorStorage.GetAllDocumentsAsync(searchLimit)).ToList();

        // Null when no filtering was requested (search missing or empty).
        var term = string.IsNullOrEmpty(search) ? null : search;

        var results = documents.Select(doc => new
        {
            Id = doc.Id,
            Title = doc.Title,
            Description = doc.Description?.Length > 200 ? doc.Description.Substring(0, 200) + "..." : doc.Description,
            Url = doc.Url,
            IngestedAt = doc.IngestedAt,
            // Null-safe on Title so a document without a title counts as "no match"
            // instead of failing the whole request.
            TitleMatch = term is not null && doc.Title?.Contains(term, StringComparison.OrdinalIgnoreCase) == true,
            DescriptionMatch = term is not null && doc.Description?.Contains(term, StringComparison.OrdinalIgnoreCase) == true
        }).ToList();

        if (term is not null)
        {
            // Filter to only documents that contain the search term
            results = results.Where(r => r.TitleMatch || r.DescriptionMatch).ToList();
        }

        return Results.Ok(new
        {
            TotalDocuments = documents.Count,
            SearchTerm = search,
            MatchingDocuments = results.Count,
            Documents = results
        });
    }
    catch (Exception ex)
    {
        logger.LogError(ex, "Error browsing documents for search term: {Search}", search);
        return Results.BadRequest(new { Error = ex.Message });
    }
})
.WithName("BrowseDocuments")
.WithOpenApi();
| 413 | + |
268 | 414 | app.MapDelete("/vector/clear", async (IVectorStorageService vectorStorage) => |
269 | 415 | { |
270 | 416 | try |
|
280 | 426 | .WithName("ClearVectors") |
281 | 427 | .WithOpenApi(); |
282 | 428 |
|
// Diagnostic endpoint to test embedding consistency.
// GET /api/embedding-test?text=...
//   text             - the text to embed (required query parameter).
//   X-GitHub-Token   - optional request header; when present, a third embedding is generated with it.
// Generates an embedding with no token, one with the placeholder token "dummy_token", and
// (if the header is present) one with the supplied token, then returns 200 with a summary of
// each vector (dimensions, first 10 values, magnitude) and two equality comparisons.
// A failure while using the supplied token is logged as a warning and reported as a null
// RealGithubEmbedding; any other exception is logged and reported as 400 with its message.
app.MapGet("/api/embedding-test", async (HttpContext context, string text, IEmbeddingService embeddingService, ILogger<Program> logger) =>
{
    try
    {
        // Test both with and without GitHub token
        var simpleEmbedding = await embeddingService.GenerateEmbeddingAsync(text, null);
        var githubEmbedding = await embeddingService.GenerateEmbeddingAsync(text, "dummy_token"); // Will use simple if token is invalid

        // Try with the actual token from headers
        var githubToken = context.Request.Headers["X-GitHub-Token"].FirstOrDefault();
        ReadOnlyMemory<float>? realGithubEmbedding = null;

        if (!string.IsNullOrEmpty(githubToken))
        {
            try
            {
                realGithubEmbedding = await embeddingService.GenerateEmbeddingAsync(text, githubToken);
            }
            catch (Exception ex)
            {
                logger.LogWarning(ex, "Failed to generate embedding with real GitHub token");
            }
        }

        // One shared projection for all three vectors: copies the vector once and derives
        // the dimensions, leading sample and magnitude from that single copy.
        var summarize = (ReadOnlyMemory<float> vector) =>
        {
            var values = vector.ToArray();
            return new
            {
                Dimensions = values.Length,
                Sample = values[..Math.Min(10, values.Length)],
                Magnitude = Math.Sqrt(values.Sum(x => x * x))
            };
        };

        return Results.Ok(new
        {
            Text = text,
            SimpleEmbedding = summarize(simpleEmbedding),
            GithubEmbedding = summarize(githubEmbedding),
            RealGithubEmbedding = realGithubEmbedding.HasValue ? summarize(realGithubEmbedding.Value) : null,
            AreSimpleAndGithubSame = simpleEmbedding.Span.SequenceEqual(githubEmbedding.Span),
            AreGithubEmbeddingsDifferent = realGithubEmbedding.HasValue && !githubEmbedding.Span.SequenceEqual(realGithubEmbedding.Value.Span)
        });
    }
    catch (Exception ex)
    {
        // Log before responding so failures of this endpoint are visible like those of the
        // other diagnostic endpoints.
        logger.LogError(ex, "Error testing embedding consistency for text: {Text}", text);
        return Results.BadRequest(new { Error = ex.Message });
    }
})
.WithName("TestEmbeddingConsistency")
.WithOpenApi();
| 486 | + |
283 | 487 | app.MapDefaultEndpoints(); |
284 | 488 |
|
285 | 489 | // Helper method for GitHub token validation |
|
0 commit comments