Skip to content

Commit d46a761

Browse files
authored
feat: add source field to knowledge insights, unify connections tool (#113)
* feat: add source field to knowledge insights for agent-discovered knowledge Add a source field to capture_insight that distinguishes user-provided insights from agent-discovered ones. Three source values: user (default), agent_discovery (agent figured it out via data exploration), and enrichment_gap (metadata gap flagged for admin attention). - Migration 000010: add source column with DEFAULT 'user' for backward compat - Types: source constants, ValidateSource, NormalizeSource - Store: INSERT/SELECT/filter support for source field - Toolkit: input schema, validation, tool description updated - Prompt: expanded agent guidance for self-capture and when to ask users - Docs: overview, admin-api, governance, llms.txt, llms-full.txt updated - Swagger: regenerated for new Insight.Source field * test: fix patch coverage for PR #113 Add targeted tests covering previously-uncovered code paths: - datahub/toolkit: RegisterTools with non-nil inner toolkit - trino/toolkit: RegisterTools with non-nil inner toolkit - platform/connections_tool: invoke tool through MCP transport - platform/lifecycle: rollback with nil stop callback - knowledge/toolkit: MarkApplied error path in apply handler - oauth/postgres: cleanup goroutine error handling - oauth/server: cleanup routine error handling Patch coverage: 92.4% (134/145 changed lines)
1 parent 20fdb01 commit d46a761

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+917
-215
lines changed

cmd/mcp-data-platform/main.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ func setupSignalHandler() context.Context {
8080
type serverResult struct {
8181
mcpServer *mcp.Server
8282
platform *platform.Platform
83-
toolkit interface{ Close() error }
8483
}
8584

8685
func createServer(opts serverOptions) (*serverResult, error) {
@@ -95,14 +94,32 @@ func createServer(opts serverOptions) (*serverResult, error) {
9594
return result, nil
9695
}
9796

98-
result.mcpServer, result.toolkit, err = mcpserver.NewWithDefaults()
97+
result.mcpServer, err = mcpserver.NewWithDefaults()
9998
if err != nil {
10099
return nil, fmt.Errorf("creating server with defaults: %w", err)
101100
}
102101
return result, nil
103102
}
104103

104+
// initLogging configures the default slog logger from the LOG_LEVEL
// environment variable.
//
// The value is parsed with slog.Level.UnmarshalText, so level names are
// matched case-insensitively ("debug", "Debug", "DEBUG", "info", "warn",
// "error") and may carry a numeric offset such as "warn+2". When LOG_LEVEL is
// unset or cannot be parsed, the level defaults to info.
//
// Records are written as JSON to stderr.
func initLogging() {
	level := slog.LevelInfo

	if raw := os.Getenv("LOG_LEVEL"); raw != "" {
		// Parse into a scratch value so an invalid setting can never disturb
		// the info default; a bad value is deliberately ignored rather than
		// treated as a startup failure.
		var parsed slog.Level
		if err := parsed.UnmarshalText([]byte(raw)); err == nil {
			level = parsed
		}
	}

	handler := slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{
		Level: level,
	})
	slog.SetDefault(slog.New(handler))
}
120+
105121
func run() error {
122+
initLogging()
106123
opts := parseFlags()
107124

108125
if opts.showVersion {
@@ -129,11 +146,6 @@ func closeServer(result *serverResult) {
129146
slog.Error("shutdown: platform close error", "error", err)
130147
}
131148
}
132-
if result.toolkit != nil {
133-
if err := result.toolkit.Close(); err != nil {
134-
slog.Error("shutdown: toolkit close error", "error", err)
135-
}
136-
}
137149
slog.Info("shutdown: complete")
138150
}
139151

cmd/mcp-data-platform/main_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package main
22

33
import (
44
"context"
5+
"log/slog"
56
"net/http"
67
"net/http/httptest"
78
"os"
@@ -23,6 +24,40 @@ const (
2324
testPreDelay3s = 3 * time.Second
2425
)
2526

27+
func TestInitLogging(t *testing.T) {
28+
tests := []struct {
29+
env string
30+
level slog.Level
31+
}{
32+
{"debug", slog.LevelDebug},
33+
{"DEBUG", slog.LevelDebug},
34+
{"info", slog.LevelInfo},
35+
{"warn", slog.LevelWarn},
36+
{"WARN", slog.LevelWarn},
37+
{"error", slog.LevelError},
38+
{"ERROR", slog.LevelError},
39+
{"", slog.LevelInfo}, // default
40+
{"unknown", slog.LevelInfo}, // unrecognized falls through
41+
}
42+
43+
for _, tt := range tests {
44+
t.Run("LOG_LEVEL="+tt.env, func(t *testing.T) {
45+
t.Setenv("LOG_LEVEL", tt.env)
46+
initLogging()
47+
48+
handler := slog.Default().Handler()
49+
// Verify the handler is enabled at the expected level
50+
if !handler.Enabled(context.Background(), tt.level) {
51+
t.Errorf("expected handler enabled at %v", tt.level)
52+
}
53+
// For non-debug levels, debug should be disabled
54+
if tt.level > slog.LevelDebug && handler.Enabled(context.Background(), slog.LevelDebug) {
55+
t.Errorf("expected debug disabled when LOG_LEVEL=%q", tt.env)
56+
}
57+
})
58+
}
59+
}
60+
2661
func TestRegisterOAuthRoutes(t *testing.T) {
2762
mux := http.NewServeMux()
2863
handler := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {

docs/knowledge/admin-api.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ Returns a paginated list of insights with optional filtering.
3434
| `entity_urn` | string | Filter by related entity URN |
3535
| `captured_by` | string | Filter by the user who captured the insight |
3636
| `confidence` | string | Filter by confidence level: `high`, `medium`, `low` |
37+
| `source` | string | Filter by source: `user`, `agent_discovery`, `enrichment_gap` |
3738
| `since` | RFC 3339 | Filter insights created after this timestamp |
3839
| `until` | RFC 3339 | Filter insights created before this timestamp |
3940
| `page` | integer | Page number, 1-based (default: 1) |
@@ -57,6 +58,7 @@ curl -s "https://mcp.example.com/api/v1/admin/knowledge/insights?status=pending&
5758
"session_id": "sess_abc123",
5859
"captured_by": "analyst@example.com",
5960
"persona": "analyst",
61+
"source": "user",
6062
"category": "correction",
6163
"insight_text": "The amount column represents gross margin before returns, not revenue.",
6264
"confidence": "high",
@@ -348,6 +350,7 @@ Knowledge capture uses two PostgreSQL tables, created by migrations 000006, 0000
348350
| `session_id` | TEXT | MCP session that produced the insight |
349351
| `captured_by` | TEXT | User who shared the knowledge |
350352
| `persona` | TEXT | Active persona at capture time |
353+
| `source` | TEXT | Where the knowledge came from: `user`, `agent_discovery`, `enrichment_gap` |
351354
| `category` | TEXT | Insight category |
352355
| `insight_text` | TEXT | The domain knowledge content |
353356
| `confidence` | TEXT | Confidence level (high, medium, low) |

docs/knowledge/governance.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ sequenceDiagram
265265
Analyst->>AI: That's gross margin before returns,<br/>not revenue like the name suggests
266266
267267
Note over AI,Platform: Capture
268-
AI->>Platform: capture_insight(<br/>category: correction,<br/>entity_urns: [urn:li:dataset:...orders...],<br/>insight_text: "amount column is gross margin<br/>before returns, not revenue",<br/>confidence: high,<br/>suggested_actions: [{<br/> action_type: update_description,<br/> target: amount,<br/> detail: "Gross margin before returns"<br/>}])
268+
AI->>Platform: capture_insight(<br/>category: correction,<br/>source: user,<br/>entity_urns: [urn:li:dataset:...orders...],<br/>insight_text: "amount column is gross margin<br/>before returns, not revenue",<br/>confidence: high,<br/>suggested_actions: [{<br/> action_type: update_description,<br/> target: amount,<br/> detail: "Gross margin before returns"<br/>}])
269269
Platform->>DB: INSERT (status: pending)
270270
Platform-->>AI: Insight captured: a1b2c3
271271

docs/knowledge/overview.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,11 +174,23 @@ personas:
174174
- "apply_knowledge"
175175
```
176176

177+
## Insight Sources
178+
179+
Insights track where the knowledge came from via the `source` field:
180+
181+
| Source | Description | Example |
182+
|--------|-------------|---------|
183+
| `user` | Knowledge shared by the user during conversation (default) | User says "The amount column is gross margin, not revenue" |
184+
| `agent_discovery` | Knowledge the AI agent figured out independently | Agent samples data and discovers a column contains ISO country codes |
185+
| `enrichment_gap` | Metadata gap flagged for admin attention | Table has no description and the agent cannot determine its purpose from the data |
186+
187+
The source field is optional when calling `capture_insight`. When omitted, it defaults to `user`.
188+
177189
## AI Agent Guidance
178190

179191
The toolkit registers an MCP prompt called `knowledge_capture_guidance` that tells AI assistants when to capture insights. The prompt covers:
180192

181-
**When to capture:**
193+
**When to capture (user-provided):**
182194

183195
- User corrects a column description, table purpose, or data interpretation
184196
- User explains what data means in business terms not captured in metadata
@@ -187,12 +199,29 @@ The toolkit registers an MCP prompt called `knowledge_capture_guidance` that tel
187199
- User explains connections between datasets not captured in lineage
188200
- User suggests improvements to existing documentation or metadata
189201

202+
**When to capture (agent-discovered):**
203+
204+
- Agent discovers what a column means by sampling actual data (set `source: "agent_discovery"`)
205+
- Agent finds join relationships not documented in lineage metadata
206+
- Agent identifies data quality patterns (nulls, outliers, encoding issues)
207+
- Agent resolves ambiguous column names by examining values
208+
- Agent encounters metadata that is missing or clearly wrong and cannot resolve it from the data (set `source: "enrichment_gap"`)
209+
210+
**When to ask the user instead:**
211+
212+
- Existing metadata enrichment is insufficient and the agent cannot resolve the gap from the data alone
213+
- Multiple interpretations are equally plausible
214+
- The insight would have high impact (e.g., PII classification, deprecation status)
215+
190216
**When not to capture:**
191217

192218
- Transient questions or debugging ("why is my query slow?")
193219
- Personal preferences ("I prefer using CTEs")
194220
- Information already present in the catalog metadata
195221
- Vague or unverifiable claims without specific context
222+
- Trivially obvious gaps reported without explaining what the data actually means
223+
- Speculative interpretations without evidence from querying
224+
- The same gap more than once within a session
196225

197226
The prompt is available via `prompts/list` and `prompts/get` in the MCP protocol.
198227

docs/llms-full.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,7 @@ Record domain knowledge shared during a session.
987987
| `category` | string | Yes | - | correction, business_context, data_quality, usage_guidance, relationship, enhancement |
988988
| `insight_text` | string | Yes | - | Knowledge to record (10-4000 chars) |
989989
| `confidence` | string | No | medium | high, medium, low |
990+
| `source` | string | No | user | user, agent_discovery, enrichment_gap |
990991
| `entity_urns` | array | No | [] | Related DataHub entity URNs (max 10) |
991992
| `related_columns` | array | No | [] | Related columns (max 20) |
992993
| `suggested_actions` | array | No | [] | Proposed catalog changes (max 5) |
@@ -1154,6 +1155,7 @@ Records domain knowledge shared during a session.
11541155
| `category` | string | Yes | One of: `correction`, `business_context`, `data_quality`, `usage_guidance`, `relationship`, `enhancement` |
11551156
| `insight_text` | string | Yes | The knowledge to record (10-4000 characters) |
11561157
| `confidence` | string | No | `high`, `medium` (default), or `low` |
1158+
| `source` | string | No | `user` (default), `agent_discovery`, or `enrichment_gap` |
11571159
| `entity_urns` | array | No | DataHub URNs this insight relates to (max 10) |
11581160
| `related_columns` | array | No | Columns related to this insight (max 20) |
11591161
| `suggested_actions` | array | No | Proposed catalog changes (max 5) |
@@ -1310,6 +1312,7 @@ HTTP endpoints for managing insights and changesets. All endpoints require admin
13101312
| `entity_urn` | string | Filter by entity URN |
13111313
| `captured_by` | string | Filter by user who captured |
13121314
| `confidence` | string | Filter by confidence level |
1315+
| `source` | string | Filter by source (user, agent_discovery, enrichment_gap) |
13131316
| `since` | RFC3339 | Filter by creation time (after) |
13141317
| `until` | RFC3339 | Filter by creation time (before) |
13151318
| `page` | integer | Page number (1-based) |
@@ -1348,6 +1351,7 @@ Knowledge capture uses two PostgreSQL tables (migrations 000006, 000007, 000008)
13481351
| `session_id` | TEXT | MCP session that produced it |
13491352
| `captured_by` | TEXT | User who shared the knowledge |
13501353
| `persona` | TEXT | Active persona at capture time |
1354+
| `source` | TEXT | Where the knowledge came from: `user`, `agent_discovery`, `enrichment_gap` |
13511355
| `category` | TEXT | Insight category |
13521356
| `insight_text` | TEXT | The domain knowledge |
13531357
| `confidence` | TEXT | Confidence level |

docs/llms.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353

5454
## Knowledge Capture
5555

56-
- [Overview](knowledge/overview.md): Tribal knowledge capture for data catalogs. capture_insight records domain knowledge during AI sessions; apply_knowledge provides admin review, synthesis, and DataHub write-back with changeset tracking and rollback. Insight categories, lifecycle states, governance workflow, configuration, persona integration, AI agent guidance prompt
56+
- [Overview](knowledge/overview.md): Tribal knowledge capture for data catalogs. capture_insight records domain knowledge during AI sessions; apply_knowledge provides admin review, synthesis, and DataHub write-back with changeset tracking and rollback. Insight categories, lifecycle states, insight sources (user, agent_discovery, enrichment_gap), governance workflow, configuration, persona integration, AI agent guidance prompt for both user-provided and agent-discovered insights
5757
- [Governance Workflow](knowledge/governance.md): Active metadata management through human-in-the-loop curation. Bulk review, per-entity review, approve/reject actions, synthesize change proposals, apply changes to DataHub, changeset tracking, rollback. Complete end-to-end workflow example
5858
- [Admin API](knowledge/admin-api.md): REST endpoints for managing insights and changesets. List/filter/get/update insights, approve/reject status transitions, insight statistics. List/get changesets, rollback. Authentication, error responses, database schema reference
5959

internal/apidocs/docs.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2614,6 +2614,9 @@ const docTemplate = `{
26142614
"session_id": {
26152615
"type": "string"
26162616
},
2617+
"source": {
2618+
"type": "string"
2619+
},
26172620
"status": {
26182621
"type": "string"
26192622
},

internal/apidocs/swagger.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2608,6 +2608,9 @@
26082608
"session_id": {
26092609
"type": "string"
26102610
},
2611+
"source": {
2612+
"type": "string"
2613+
},
26112614
"status": {
26122615
"type": "string"
26132616
},

internal/apidocs/swagger.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,8 @@ definitions:
559559
type: string
560560
session_id:
561561
type: string
562+
source:
563+
type: string
562564
status:
563565
type: string
564566
suggested_actions:

0 commit comments

Comments
 (0)