Add script for generating new titles/descriptions for snippet metadata.

cpyle0819 · cpyle0819 · commit 333e1ed158b4 · 2025-04-24T14:20:37.000-04:00
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,13 @@
 black==24.3.0
+boto3==1.37.38
+botocore==1.37.38
 flake8==6.1.0
 mypy==1.8.0
 mypy-extensions==1.0.0
 pathspec==0.11.2
 pytest==8.0.0
 PyYAML==6.0.1
 requests==2.32.0
+types-boto3==1.37.38
 types-PyYAML==6.0.12.12
 yamale==4.0.4
diff --git a/scripts/base_prompt.txt b/scripts/base_prompt.txt
@@ -0,0 +1,11 @@
+Provide a 'title', 'title_abbrev', and 'description' for this example in json format.
+Title should be a title, title_abbrev should be a 1-5 word variation on the title, and description should
+explain in one paragraph what the code is doing. Provide the json raw, without markdown fences.
+
+Sample result:
+
+{ 
+    "title": "Get an object by name from an Amazon S3 bucket",
+    "title_abbrev": "Get an object",
+    "description": "Use the AWS SDK for JavaScript to get an object from an Amazon S3 bucket. Steps are included that demonstrate how to split large downloads up into multiple parts."
+}
diff --git a/scripts/snippet_summarize.py b/scripts/snippet_summarize.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Dict, Optional
+
+import boto3
+from botocore.exceptions import ClientError
+
+from aws_doc_sdk_examples_tools.doc_gen import DocGen, Snippet
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def make_doc_gen(root: Path):
+    doc_gen = DocGen.from_root(root)
+    doc_gen.collect_snippets()
+    return doc_gen
+
+
+def generate_descriptions(snippets: Dict[str, Snippet], prompt: Optional[str]):
+    client = boto3.client("bedrock-runtime", region_name="us-west-2")
+    base_prompt = Path(os.path.dirname(__file__), "base_prompt.txt").read_text()
+    model_id = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
+    results = []
+    for snippet_id, snippet in snippets.items():
+        content = [{"text": base_prompt}]
+        if prompt:
+            content.append({"text": prompt})
+        content.append({"text": snippet.code})
+        conversation = [
+            {
+                "role": "user",
+                "content": content,
+            }
+        ]
+
+        try:
+            response = client.converse(
+                modelId=model_id,
+                messages=conversation,
+                inferenceConfig={"maxTokens": 512, "temperature": 0.5, "topP": 0.9},
+            )
+
+            # Extract and print the response text.
+            response_text = response["output"]["message"]["content"][0]["text"]
+            results.append(response_text)
+
+        except (ClientError, Exception) as e:
+            logger.warning(
+                f"ERROR: Can't invoke '{model_id}'. Name: {type(e).__name__}, Reason: {e}"
+            )
+    print(results)
+
+
+def main(doc_gen_root: Path, prompt: Path):
+    doc_gen = make_doc_gen(doc_gen_root)
+    prompt_text = prompt.read_text() if prompt and prompt.exists() else None
+    generate_descriptions(doc_gen.snippets, prompt_text)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate new titles and descriptions for DocGen snippets"
+    )
+    parser.add_argument(
+        "--doc-gen-root", required=True, help="Path to DocGen ready project"
+    )
+    parser.add_argument(
+        "--prompt",
+        help="Path to an additional prompt to be used for refining the output",
+    )
+    args = parser.parse_args()
+
+    doc_gen_root = Path(args.doc_gen_root)
+    prompt = Path(args.prompt) if args.prompt else None
+    main(doc_gen_root, prompt)