gfernandf
diff --git a/capabilities/_index.yaml b/capabilities/_index.yaml
Lines changed: 13 additions & 1 deletion
diff --git a/capabilities/analysis.theme.cluster.yaml b/capabilities/analysis.theme.cluster.yaml
Lines changed: 61 additions & 0 deletions
diff --git a/capabilities/text.merge.yaml b/capabilities/text.merge.yaml
Lines changed: 45 additions & 0 deletions
diff --git a/capabilities/web.source.normalize.yaml b/capabilities/web.source.normalize.yaml
Lines changed: 42 additions & 0 deletions
diff --git a/capabilities/web.search.yaml b/capabilities/web.source.search.yaml
rename from capabilities/web.search.yaml
rename to capabilities/web.source.search.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/catalog/capabilities.json b/catalog/capabilities.json
Lines changed: 156 additions & 3 deletions
@@ -44,4 +44,16 @@ capabilities:
 
   - id: analysis.risk.extract
     status: experimental
-    description: Extract risks, fragile assumptions, failure modes, and mitigation ideas from a target artifact.
+    description: Extract risks, fragile assumptions, failure modes, and mitigation ideas from a target artifact.
+
+  - id: analysis.theme.cluster
+    status: experimental
+    description: Group text items into coherent thematic clusters with summaries and signal strength.
+
+  - id: web.source.normalize
+    status: experimental
+    description: Normalize web search results into corpus item format for downstream analysis.
+
+  - id: text.merge
+    status: experimental
+    description: Merge multiple text items into a single consolidated text block.
@@ -0,0 +1,61 @@
+id: analysis.theme.cluster
+version: 1.0.0
+description: >
+  Group a collection of text items into coherent thematic clusters.
+  Identifies dominant themes, assigns each item to one or more clusters,
+  and produces a summary per cluster with signal strength. Accepts optional
+  hint labels to guide (not force) the thematic structure.
+
+inputs:
+  items:
+    type: array
+    required: true
+    description: >
+      Items to cluster. Each item must include id and content (text).
+      May also include title, type, source, metadata.
+  hint_labels:
+    type: array
+    required: false
+    description: >
+      Suggested theme labels to guide clustering. The implementation may
+      merge, split, rename, or add themes beyond hints. Example:
+      ["market_overview", "key_players", "trends", "risks", "opportunities"].
+  max_clusters:
+    type: number
+    required: false
+    description: Maximum number of clusters to produce (default 8, hard cap 15).
+  context:
+    type: string
+    required: false
+    description: Background context informing how items should be grouped.
+
+outputs:
+  clusters:
+    type: array
+    required: true
+    description: >
+      Themed clusters. Each cluster contains: theme (label), description,
+      item_ids (list of assigned item ids), summary (text summary of the
+      cluster content), signal_strength (0-1 estimate of how well-supported
+      the theme is by the corpus).
+  unclustered:
+    type: array
+    required: false
+    description: >
+      Items that did not fit any cluster. Each entry contains id and reason.
+  cluster_quality:
+    type: object
+    required: false
+    description: >
+      Self-assessment of clustering quality: coherence_score (0-1),
+      coverage_ratio (fraction of items assigned), overlap_warnings
+      (list of items assigned to multiple clusters).
+
+properties:
+  deterministic: false
+  side_effects: false
+  idempotent: true
+
+metadata:
+  status: experimental
+  tags: [analysis, clustering, themes, structuring]
@@ -0,0 +1,45 @@
+id: text.merge
+version: 1.0.0
+description: >
+  Merge multiple text items into a single consolidated text block.
+  Accepts an array of items with text content and produces a single
+  string with configurable separator. Deterministic, no LLM required.
+
+inputs:
+  items:
+    type: array
+    required: true
+    description: >
+      Items to merge. Each item must include a content field (string).
+      May also include id and title which are used as section headers
+      when include_headers is true.
+  separator:
+    type: string
+    required: false
+    description: >
+      Separator between merged items. Defaults to double newline.
+  include_headers:
+    type: boolean
+    required: false
+    description: >
+      Whether to include item titles as section headers in the merged
+      text. Defaults to true when items have titles.
+
+outputs:
+  text:
+    type: string
+    required: true
+    description: Merged text block from all items.
+  item_count:
+    type: number
+    required: true
+    description: Number of items that contributed content to the merged text.
+
+properties:
+  deterministic: true
+  side_effects: false
+  idempotent: true
+
+metadata:
+  status: experimental
+  tags: [text, merge, preprocessing, utility]
@@ -0,0 +1,42 @@
+id: web.source.normalize
+version: 1.0.0
+description: >
+  Normalize web search results into corpus item format suitable for
+  downstream research and analysis capabilities. Converts raw search
+  result objects (url, title, snippet) into structured corpus items
+  with source_ref for lazy content resolution. Supports quick mode
+  (snippet as content) and deep mode (source_ref for full page fetch).
+
+inputs:
+  results:
+    type: array
+    required: true
+    description: >
+      Web search results. Each result should contain at minimum a url field.
+      May also include title, snippet, rank, domain, date.
+  mode:
+    type: string
+    required: false
+    description: >
+      Processing mode. "quick" uses snippets as content (default).
+      "deep" leaves content empty and sets source_ref for downstream
+      resolution via research.source.retrieve.
+
+outputs:
+  items:
+    type: array
+    required: true
+    description: >
+      Normalized corpus items. Each item contains: id, title, content
+      (populated in quick mode, empty in deep mode), source_ref
+      (always present, type=url), type (web_page), source (original URL),
+      metadata (original snippet, rank, domain).
+
+properties:
+  deterministic: true
+  side_effects: false
+  idempotent: true
+
+metadata:
+  status: experimental
+  tags: [web, normalization, corpus, preprocessing]
@@ -1,6 +1,6 @@
-id: web.search
+id: web.source.search
 version: 1.0.0
-description: Search the web for results matching a query.
+description: Search the web for sources matching a query.
 inputs:
   query:
     type: string
 
@@ -346,6 +346,67 @@
       "idempotent": true
     }
   },
+  {
+    "id": "analysis.theme.cluster",
+    "version": "1.0.0",
+    "description": "Group a collection of text items into coherent thematic clusters. Identifies dominant themes, assigns each item to one or more clusters, and produces a summary per cluster with signal strength. Accepts optional hint labels to guide (not force) the thematic structure.\n",
+    "file": "capabilities/analysis.theme.cluster.yaml",
+    "inputs": {
+      "items": {
+        "type": "array",
+        "required": true,
+        "description": "Items to cluster. Each item must include id and content (text). May also include title, type, source, metadata.\n"
+      },
+      "hint_labels": {
+        "type": "array",
+        "required": false,
+        "description": "Suggested theme labels to guide clustering. The implementation may merge, split, rename, or add themes beyond hints. Example: [\"market_overview\", \"key_players\", \"trends\", \"risks\", \"opportunities\"].\n"
+      },
+      "max_clusters": {
+        "type": "number",
+        "required": false,
+        "description": "Maximum number of clusters to produce (default 8, hard cap 15)."
+      },
+      "context": {
+        "type": "string",
+        "required": false,
+        "description": "Background context informing how items should be grouped."
+      }
+    },
+    "outputs": {
+      "clusters": {
+        "type": "array",
+        "required": true,
+        "description": "Themed clusters. Each cluster contains: theme (label), description, item_ids (list of assigned item ids), summary (text summary of the cluster content), signal_strength (0-1 estimate of how well-supported the theme is by the corpus).\n"
+      },
+      "unclustered": {
+        "type": "array",
+        "required": false,
+        "description": "Items that did not fit any cluster. Each entry contains id and reason.\n"
+      },
+      "cluster_quality": {
+        "type": "object",
+        "required": false,
+        "description": "Self-assessment of clustering quality: coherence_score (0-1), coverage_ratio (fraction of items assigned), overlap_warnings (list of items assigned to multiple clusters).\n"
+      }
+    },
+    "metadata": {
+      "tags": [
+        "analysis",
+        "clustering",
+        "themes",
+        "structuring"
+      ],
+      "category": null,
+      "status": "experimental",
+      "examples": []
+    },
+    "properties": {
+      "deterministic": false,
+      "side_effects": false,
+      "idempotent": true
+    }
+  },
   {
     "id": "audio.transcribe",
     "version": "1.0.0",
@@ -2874,6 +2935,57 @@
       "idempotent": true
     }
   },
+  {
+    "id": "text.merge",
+    "version": "1.0.0",
+    "description": "Merge multiple text items into a single consolidated text block. Accepts an array of items with text content and produces a single string with configurable separator. Deterministic, no LLM required.\n",
+    "file": "capabilities/text.merge.yaml",
+    "inputs": {
+      "items": {
+        "type": "array",
+        "required": true,
+        "description": "Items to merge. Each item must include a content field (string). May also include id and title which are used as section headers when include_headers is true.\n"
+      },
+      "separator": {
+        "type": "string",
+        "required": false,
+        "description": "Separator between merged items. Defaults to double newline.\n"
+      },
+      "include_headers": {
+        "type": "boolean",
+        "required": false,
+        "description": "Whether to include item titles as section headers in the merged text. Defaults to true when items have titles.\n"
+      }
+    },
+    "outputs": {
+      "text": {
+        "type": "string",
+        "required": true,
+        "description": "Merged text block from all items."
+      },
+      "item_count": {
+        "type": "number",
+        "required": true,
+        "description": "Number of items that contributed content to the merged text."
+      }
+    },
+    "metadata": {
+      "tags": [
+        "text",
+        "merge",
+        "preprocessing",
+        "utility"
+      ],
+      "category": null,
+      "status": "experimental",
+      "examples": []
+    },
+    "properties": {
+      "deterministic": true,
+      "side_effects": false,
+      "idempotent": true
+    }
+  },
   {
     "id": "text.summarize",
     "version": "1.0.0",
@@ -3063,10 +3175,51 @@
     }
   },
   {
-    "id": "web.search",
+    "id": "web.source.normalize",
+    "version": "1.0.0",
+    "description": "Normalize web search results into corpus item format suitable for downstream research and analysis capabilities. Converts raw search result objects (url, title, snippet) into structured corpus items with source_ref for lazy content resolution. Supports quick mode (snippet as content) and deep mode (source_ref for full page fetch).\n",
+    "file": "capabilities/web.source.normalize.yaml",
+    "inputs": {
+      "results": {
+        "type": "array",
+        "required": true,
+        "description": "Web search results. Each result should contain at minimum a url field. May also include title, snippet, rank, domain, date.\n"
+      },
+      "mode": {
+        "type": "string",
+        "required": false,
+        "description": "Processing mode. \"quick\" uses snippets as content (default). \"deep\" leaves content empty and sets source_ref for downstream resolution via research.source.retrieve.\n"
+      }
+    },
+    "outputs": {
+      "items": {
+        "type": "array",
+        "required": true,
+        "description": "Normalized corpus items. Each item contains: id, title, content (populated in quick mode, empty in deep mode), source_ref (always present, type=url), type (web_page), source (original URL), metadata (original snippet, rank, domain).\n"
+      }
+    },
+    "metadata": {
+      "tags": [
+        "web",
+        "normalization",
+        "corpus",
+        "preprocessing"
+      ],
+      "category": null,
+      "status": "experimental",
+      "examples": []
+    },
+    "properties": {
+      "deterministic": true,
+      "side_effects": false,
+      "idempotent": true
+    }
+  },
+  {
+    "id": "web.source.search",
     "version": "1.0.0",
-    "description": "Search the web for results matching a query.",
-    "file": "capabilities/web.search.yaml",
+    "description": "Search the web for sources matching a query.",
+    "file": "capabilities/web.source.search.yaml",
     "inputs": {
       "query": {
         "type": "string",