awsdocs · cpyle0819 · Aug 1, 2025 · Jul 18, 2025 · Jul 19, 2025 · Jul 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,4 @@ __pycache__
 *.egg-info
 .idea/
 build/
-dist/
-.ailly_iam_policy
-*.log
+dist/
diff --git a/aws_doc_sdk_examples_tools/lliam/domain/commands.py b/aws_doc_sdk_examples_tools/lliam/domain/commands.py
@@ -25,3 +25,9 @@ class UpdateReservoir(Command):
     root: Path
     batches: List[str]
     packages: List[str]
+
+
+@dataclass
+class DedupeReservoir(Command):
+    root: Path  # Handed to DocGen.from_root and write_many by the dedupe handler.
+    packages: List[str]  # Package names to restrict to; an empty list selects all.
diff --git a/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py b/aws_doc_sdk_examples_tools/lliam/entry_points/lliam_app.py
@@ -81,6 +81,23 @@ def update_reservoir(
     handle_domain_errors(errors)
 
 
+@app.command()
+def dedupe_reservoir(
+    iam_tributary_root: str,
+    packages: Annotated[
+        Optional[str], typer.Option(help="Comma delimited list of packages to dedupe")
+    ] = None,
+) -> None:
+    """
+    Enumerate fields that must be unique (e.g. title_abbrev)
+    """
+    # Build the command from CLI input; the message bus routes it to its handler.
+    cmd = commands.DedupeReservoir(
+        root=Path(iam_tributary_root), packages=parse_package_names(packages)
+    )
+    handle_domain_errors(messagebus.handle(cmd))
+
+
 def handle_domain_errors(errors: List[errors.DomainError]):
     if errors:
         for error in errors:

diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py b/aws_doc_sdk_examples_tools/lliam/service_layer/dedupe_reservoir.py
@@ -0,0 +1,43 @@
+from collections import Counter
+from dataclasses import replace
+import logging
+from typing import Dict
+
+from aws_doc_sdk_examples_tools.doc_gen import DocGen
+from aws_doc_sdk_examples_tools.lliam.domain.commands import DedupeReservoir
+from aws_doc_sdk_examples_tools.metadata import Example
+from aws_doc_sdk_examples_tools.yaml_writer import prepare_write, write_many
+
+logger = logging.getLogger(__name__)
+
+
+def make_title_abbreviation(example: Example, counter: Counter):
+    """Return the title_abbrev, suffixed " (N)" on its Nth (N > 1) occurrence."""
+    base = example.title_abbrev
+    counter[base] += 1  # Counter treats a missing key as 0.
+    return base if counter[base] == 1 else f"{base} ({counter[base]})"
+
+
+def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
+    """Number duplicate title_abbrevs among the selected examples and write them.
+
+    When cmd.packages is set, only examples whose file is named
+    "<package>_metadata.yaml" for a listed package are selected; an example with
+    no file cannot match and is skipped rather than bypassing the filter.
+    """
+    doc_gen = DocGen.from_root(cmd.root)
+
+    examples: Dict[str, Example] = {}
+    for example_id, example in doc_gen.examples.items():
+        file = example.file
+        if cmd.packages and not (
+            file and file.name.split("_metadata.yaml")[0] in cmd.packages
+        ):
+            continue
+        examples[example_id] = example
+
+    title_abbrev_counts: Counter = Counter()
+    for example_id, example in examples.items():
+        abbrev = make_title_abbreviation(example, title_abbrev_counts)
+        examples[example_id] = replace(example, title_abbrev=abbrev)
+    write_many(cmd.root, prepare_write(examples))
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py b/aws_doc_sdk_examples_tools/lliam/service_layer/messagebus.py
@@ -3,6 +3,7 @@
 from aws_doc_sdk_examples_tools.lliam.domain import commands
 from aws_doc_sdk_examples_tools.lliam.service_layer import (
     create_prompts,
+    dedupe_reservoir,
     update_doc_gen,
     run_ailly,
     unit_of_work,
@@ -33,4 +34,5 @@ def handle_command(command: commands.Command, uow: Optional[unit_of_work.FsUnitO
     commands.CreatePrompts: create_prompts.create_prompts,
     commands.RunAilly: run_ailly.handle_run_ailly,
     commands.UpdateReservoir: update_doc_gen.handle_update_reservoir,
+    commands.DedupeReservoir: dedupe_reservoir.handle_dedupe_reservoir,
 }
diff --git a/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py b/aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py
@@ -1,9 +1,11 @@
+from dataclasses import replace
 import json
 import logging
 from collections import Counter
 from pathlib import Path
 from typing import Dict, Iterable, List
 
+from aws_doc_sdk_examples_tools.lliam.adapters.repository import DEFAULT_METADATA_PREFIX
 from aws_doc_sdk_examples_tools.yaml_writer import prepare_write, write_many
 
 from aws_doc_sdk_examples_tools.lliam.config import (
@@ -15,19 +17,22 @@
 
 logger = logging.getLogger(__name__)
 
+Updates = Dict[str, List[Dict[str, str]]]
+
 IAM_LANGUAGE = "IAMPolicyGrammar"
 
 
-def examples_from_updates(updates: List[Dict]) -> Iterable[Example]:
+def examples_from_updates(updates: Updates) -> Iterable[Example]:
     """
     Takes a list of example metadata updates and returns an
     iterable of examples with the applied updates.
     """
 
     indexed_updates = {}
-    for item in updates:
-        if "id" in item:
-            indexed_updates[item["id"]] = item
+    for update_list in updates.values():
+        for item in update_list:
+            if "id" in item:
+                indexed_updates[item["id"]] = item
 
     examples = [
         Example(
@@ -43,45 +48,55 @@ def examples_from_updates(updates: List[Dict]) -> Iterable[Example]:
     return examples
 
 
-def make_title_abbreviation(old: Example, new: Example, abbreviations: Counter):
-    language = old.languages[IAM_LANGUAGE]
+def get_source_title(example: Example) -> str:
+    language = example.languages[IAM_LANGUAGE]  # Only the IAM_LANGUAGE entry is read.
     version = language.versions[0]
     source = version.source
-    source_title = source.title if source else ""
-    base = f"{new.title_abbrev} (from '{source_title}' guide)"
-    abbreviations[base] += 1
-    count = abbreviations[base]
-    return f"{base} ({count})" if count > 1 else base
+    return source.title if source else ""  # "" when the first version has no source.
 
 
 def update_examples(doc_gen: DocGen, examples: Iterable[Example]) -> Dict[str, Example]:
     """
     Merge a subset of example properties into a DocGen instance.
     """
-    title_abbrevs = Counter(
-        [example.title_abbrev for example in doc_gen.examples.values()]
-    )
-    updated = {}
+
     for example in examples:
-        if doc_gen_example := doc_gen.examples.get(example.id):
-            doc_gen_example.title = example.title
-            doc_gen_example.title_abbrev = make_title_abbreviation(
-                old=doc_gen_example, new=example, abbreviations=title_abbrevs
+        if example.id in doc_gen.examples:
+            source_title = get_source_title(doc_gen.examples[example.id])
+            # HACK: TCA replaces "AWS" with "&AWS;", and the entity converter then makes
+            # a second pass over it, so we end up with things like "&AWS; &GLUlong;",
+            # which render as "AWS AWS Glue". Look at this closer when time permits.
+            source_title = source_title.replace("&AWS;", "AWS")
+            new_abbrev = f"{example.title_abbrev} (from '{source_title}' guide)"
+            doc_gen_example = replace(
+                doc_gen.examples[example.id],
+                title=example.title,
+                title_abbrev=new_abbrev,
+                synopsis=example.synopsis,
             )
-            doc_gen_example.synopsis = example.synopsis
-            updated[doc_gen_example.id] = doc_gen_example
+            doc_gen.examples[example.id] = doc_gen_example
         else:
             logger.warning(f"Could not find example with id: {example.id}")
-    return updated
+    return doc_gen.examples  # All examples in doc_gen, not only those updated.
 
 
-def update_doc_gen(doc_gen_root: Path, updates: List[Dict]) -> Dict[str, Example]:
-    doc_gen = DocGen.from_root(doc_gen_root)
+def update_doc_gen(doc_gen: DocGen, updates: Updates) -> Dict[str, Example]:  # Updates doc_gen.examples in place.
     examples = examples_from_updates(updates)
     updated_examples = update_examples(doc_gen, examples)
     return updated_examples
 
 
+def merge_updates(a: Updates, b: Updates) -> Updates:
+    """Return a new mapping with a's updates followed by b's for each package."""
+    # Copy each list up front so extend() below never mutates a's lists, and
+    # the result never aliases b's lists either.
+    merged: Updates = {name: list(items) for name, items in a.items()}
+    for package_name, updates in b.items():
+        # Assumption: Updates will not conflict.
+        merged.setdefault(package_name, []).extend(updates)
+    return merged
+
+
 def handle_update_reservoir(cmd: UpdateReservoir, uow: None):
     update_files = (
         [AILLY_DIR_PATH / f"updates_{batch}.json" for batch in cmd.batches]
@@ -93,23 +108,29 @@ def handle_update_reservoir(cmd: UpdateReservoir, uow: None):
         logger.warning("No IAM update files found to process")
         return
 
+    doc_gen = DocGen.from_root(cmd.root)
+
+    combined_updates: Updates = {}
+
     for update_file in sorted(update_files):
         if update_file.exists():
-            logger.info(f"Processing updates from {update_file.name}")
-            updates = json.loads(update_file.read_text())
+            updates: Updates = json.loads(update_file.read_text())
             if cmd.packages:
-                updates = [
-                    update
-                    for package, update_list in updates.items()
-                    if package in cmd.packages
-                    for update in update_list
-                ]
+                updates = {
+                    package_name: update_list
+                    for package_name, update_list in updates.items()
+                    if package_name in cmd.packages
+                }
+
             if not updates:
                 logger.warning(f"No matching updates to run in {update_file.name}")
                 continue
-            examples = update_doc_gen(doc_gen_root=cmd.root, updates=updates)
 
-            writes = prepare_write(examples)
-            write_many(cmd.root, writes)
+            combined_updates = merge_updates(combined_updates, updates)
+
         else:
             logger.warning(f"Update file not found: {update_file}")
+
+    updated_examples = update_doc_gen(doc_gen=doc_gen, updates=combined_updates)
+    writes = prepare_write(updated_examples)
+    write_many(cmd.root, writes)
diff --git a/aws_doc_sdk_examples_tools/metadata_validator.py b/aws_doc_sdk_examples_tools/metadata_validator.py
@@ -163,7 +163,11 @@ def _validate_aws_entity_usage(value: str) -> bool:
         If these counts differ, there's an invalid usage.
         """
         xval = value.replace("&", "&amp;")
-        xtree = xml_tree.fromstring(f"<fake><para>{xval}</para></fake>")
+        try:
+            xtree = xml_tree.fromstring(f"<fake><para>{xval}</para></fake>")
+        except Exception as e:
+            print(xval)
+            raise e
         blocks = (
             xtree.findall(".//programlisting")
             + xtree.findall(".//code")

diff --git a/aws_doc_sdk_examples_tools/yaml_writer.py b/aws_doc_sdk_examples_tools/yaml_writer.py
@@ -2,9 +2,8 @@
 from collections import defaultdict
 from dataclasses import asdict
 from pathlib import Path
-from typing import Any, DefaultDict, Dict, List, Set, Tuple
+from typing import Any, DefaultDict, Dict, List, Tuple
 
-import difflib
 import logging
 import yaml
 
@@ -146,7 +145,8 @@ def report_yaml_differences(
             elif file_path not in after_values:
                 differences.append((file_path, "removed"))
             else:
-                differences.append((file_path, "modified"))
+                diff = f"{before}\n\n---\n\n{after}"
+                differences.append((file_path, diff))
 
     return differences
 
@@ -172,8 +172,8 @@ def main():
     if before_values != after_values:
         differences = report_yaml_differences(before_values, after_values)
         logger.error(f"YAML content changed in {len(differences)} files after writing:")
-        for file_path, diff_type in differences:
-            logger.error(f"  - {file_path}: {diff_type}")
+        for difference in differences:
+            logger.error(difference)
     else:
         logger.info(
             f"Metadata for {root.name} has been normalized and verified for consistency."

diff --git a/aws_doc_sdk_examples_tools/yaml_writer_test.py b/aws_doc_sdk_examples_tools/yaml_writer_test.py
@@ -72,12 +72,11 @@ def test_report_yaml_differences_with_changes():
     differences.sort()
 
     expected = [
-        ("file1.yaml", "modified"),
+        ("file1.yaml", "{'key1': 'value1'}\n\n---\n\n{'key1': 'changed_value'}"),
         ("file2.yaml", "removed"),
         ("file4.yaml", "added"),
     ]
     expected.sort()
-
     assert differences == expected
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,4 @@ __pycache__ @@
     *.egg-info
     .idea/
     build/
-    dist/
-    .ailly_iam_policy
-    *.log
+    dist/