Commit 01c6ca1

Add experimental research synthesis skill set
1 parent 607683c commit 01c6ca1

4 files changed

+723
-0
lines changed

4 files changed

+723
-0
lines changed
Lines changed: 115 additions & 0 deletions
id: research.extract-findings
version: 0.1.0
name: Extract Findings
description: >
  Extracts structured analytical findings from a normalized corpus of research
  items. For each item, it identifies claims, evidence signals, risks,
  opportunities, and uncertainty markers. It also extracts keywords and
  classifies each item by dominant theme. The output is a findings_by_item
  structure suitable for downstream synthesis, conflict detection, and
  thematic grouping.

inputs:
  normalized_items:
    type: array
    required: true
    description: >
      Normalized items as produced by research.normalize-corpus. Each item must
      include at minimum: id, content, type.

  topic:
    type: string
    required: false
    description: >
      Optional topic or research question anchoring the extraction. When
      provided, findings are prioritized relative to this topic.

  goal:
    type: string
    required: false
    description: >
      Optional goal describing the intended use of the synthesis (e.g. "evaluate
      expansion into market X", "assess regulatory risk"). Influences claim
      relevance scoring.

  focus:
    type: string
    required: false
    description: >
      Optional extraction focus hint. Examples: "prioritize risks", "prioritize
      contradictions", "emphasize opportunities".

  max_detail:
    type: string
    required: false
    description: "Extraction depth: brief | standard | detailed. Defaults to standard."

outputs:
  findings_by_item:
    type: array
    required: true
    description: >
      Array of per-item findings objects. Each entry includes: item_id, item_type,
      claims (array of extracted claims with evidence_strength weak|moderate|strong),
      risks (array), opportunities (array), uncertainty_signals (array of phrases or
      areas that are unclear or weakly supported), dominant_theme, and keywords.

  extraction_stats:
    type: object
    required: true
    description: >
      Summary of the extraction pass: items_processed, total_claims, items_with_risks,
      items_with_opportunities, items_with_uncertainty, dominant_themes (array).

steps:

  - id: extract_structured_findings
    uses: model.output.generate
    input:
      instruction: >
        For each normalized item, extract structured analytical findings anchored
        to the provided topic and goal. For each item produce: item_id, item_type,
        claims (each with text and evidence_strength: weak|moderate|strong),
        risks (array of strings), opportunities (array of strings),
        uncertainty_signals (areas unclear or weakly supported), dominant_theme,
        and keywords. Apply the focus hint if provided. Respect the max_detail
        level. Do not fabricate claims not supported by the item content.
        Also produce extraction_stats: items_processed, total_claims,
        items_with_risks, items_with_opportunities, items_with_uncertainty,
        dominant_themes.
      context_items: inputs.normalized_items
      output_schema:
        type: object
        properties:
          findings_by_item:
            type: array
          extraction_stats:
            type: object
        required:
          - findings_by_item
          - extraction_stats
      detail_level: inputs.max_detail
      constraints:
        no_fabrication: true
        evidence_required: true
    output:
      output.findings_by_item: outputs.findings_by_item
      output.extraction_stats: outputs.extraction_stats
      warnings: vars.extraction_warnings

metadata:
  status: experimental
  tags:
    - research
    - extraction
    - findings
    - claims
    - analysis
  category: research
  use_cases:
    - Extract structured claims and risks from research documents
    - Prepare per-item findings for downstream synthesis or comparison
    - Identify uncertainty signals in a research corpus before making decisions
  classification:
    role: utility
    invocation: direct
    effect_mode: read_only
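The findings_by_item contract above can be sketched with a small example. The entry below is hypothetical sample data (item ids, claim text, and themes are invented for illustration); the validator only checks the field names and evidence_strength values that the skill description promises, not any real runtime behavior.

```python
# Allowed evidence_strength values, as named in the skill spec.
ALLOWED_STRENGTH = {"weak", "moderate", "strong"}

def validate_finding(entry: dict) -> list[str]:
    """Return a list of problems; an empty list means the entry is well-formed."""
    problems = []
    # Field names taken from the findings_by_item description above.
    for field in ("item_id", "item_type", "claims", "risks",
                  "opportunities", "uncertainty_signals",
                  "dominant_theme", "keywords"):
        if field not in entry:
            problems.append(f"missing field: {field}")
    for claim in entry.get("claims", []):
        if claim.get("evidence_strength") not in ALLOWED_STRENGTH:
            problems.append(f"bad evidence_strength: {claim.get('evidence_strength')!r}")
    return problems

# Hypothetical example of one findings_by_item entry.
example = {
    "item_id": "item-1",
    "item_type": "article",
    "claims": [{"text": "Market X grew 12% year over year",
                "evidence_strength": "moderate"}],
    "risks": ["regulatory uncertainty in market X"],
    "opportunities": ["underserved enterprise segment"],
    "uncertainty_signals": ["growth figure comes from a single source"],
    "dominant_theme": "market-expansion",
    "keywords": ["market X", "growth", "regulation"],
}

print(validate_finding(example))  # prints [] — the entry is well-formed
```

This is only a shape check for downstream consumers; the skill itself is responsible for producing entries in this form.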
Lines changed: 90 additions & 0 deletions
id: research.normalize-corpus
version: 0.1.0
name: Normalize Corpus
description: >
  Normalizes a heterogeneous collection of research items into a clean, uniform
  representation ready for analysis. Handles text extraction from raw content,
  chunking of long items, deduplication of near-identical entries, and language
  detection. Items may arrive as pre-extracted text or as source references
  (url, pdf_path, fs_path, memory_key), which are resolved during normalization.

inputs:
  items:
    type: array
    required: true
    description: >
      List of input items to normalize. Each item must include an `id` and either
      a `content` string (pre-extracted text) or a `source_ref` object with `type`
      and `location` fields for lazy extraction. Optional fields per item: title,
      type (article | report | note | web_page | search_result | agent_output |
      transcript | raw_text), source, relevance_hint, metadata.

  max_chunk_size:
    type: number
    required: false
    description: >
      Maximum character length per chunk when splitting long items. Items shorter
      than this threshold are not split. Defaults to a runtime-defined value.

outputs:
  normalized_items:
    type: array
    required: true
    description: >
      List of normalized items, each containing: id, content (extracted text),
      type, language, chunks (array), source, title, and any retained metadata.

  normalization_stats:
    type: object
    required: true
    description: >
      Summary statistics of the normalization pass, including total_input,
      extracted_count, chunked_count, deduplicated_count, and language_distribution.

steps:

  - id: assemble_normalized
    uses: model.output.generate
    input:
      instruction: >
        Return one object with exactly two top-level fields: normalized_items and
        normalization_stats. normalized_items must be an array where each item
        includes: id, content, type (inferred from input if not provided,
        defaulting to raw_text), language, chunks, source, title, and any
        retained metadata. normalization_stats must include: total_input,
        extracted_count, chunked_count, deduplicated_count, and
        language_distribution. Do not rename these fields, omit them, or wrap
        them under any additional structure.
      context_items: inputs.items
      output_schema:
        type: object
        properties:
          normalized_items:
            type: array
          normalization_stats:
            type: object
        required:
          - normalized_items
          - normalization_stats
      detail_level: standard
      constraints:
        max_chunk_size: inputs.max_chunk_size
    output:
      output.normalized_items: outputs.normalized_items
      output.normalization_stats: outputs.normalization_stats

metadata:
  status: experimental
  tags:
    - research
    - normalization
    - corpus
    - preprocessing
  category: research
  use_cases:
    - Prepare heterogeneous research material for downstream analysis
    - Normalize agent outputs, PDFs, web pages, and notes into a uniform format
    - Deduplicate and chunk long documents before synthesis
  classification:
    role: utility
    invocation: direct
    effect_mode: read_only
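The max_chunk_size behavior described above — items at or under the threshold pass through whole, longer items are split — can be sketched as follows. The real runtime chunker, its default threshold, and the extraction/dedup/language steps are not defined by this spec, so the default of 2000 and the fixed-size splitting strategy below are assumptions for illustration only.

```python
def chunk_content(content: str, max_chunk_size: int) -> list[str]:
    """Split content into fixed-size character chunks (illustrative strategy)."""
    if len(content) <= max_chunk_size:
        return [content]  # items shorter than the threshold are not split
    return [content[i:i + max_chunk_size]
            for i in range(0, len(content), max_chunk_size)]

def normalize_item(item: dict, max_chunk_size: int = 2000) -> dict:
    """Shape one input item the way normalized_items entries are described.

    The 2000-character default is a placeholder; the spec says the real
    default is runtime-defined.
    """
    content = item.get("content", "")
    return {
        "id": item["id"],
        "content": content,
        "type": item.get("type", "raw_text"),  # default type named in the spec
        "chunks": chunk_content(content, max_chunk_size),
        "source": item.get("source"),
        "title": item.get("title"),
    }

short = normalize_item({"id": "a", "content": "brief note"})
long = normalize_item({"id": "b", "content": "x" * 4500})
print(len(short["chunks"]), len(long["chunks"]))  # prints: 1 3
```

In a pipeline, the normalized_items produced here would feed directly into research.extract-findings, whose normalized_items input expects at minimum id, content, and type per item.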
