Skip to content

Commit 60f0368

Browse files
authored
Fix Lliam deduplication issues. (#193)
When dedupe_reservoir was run, it was not accounting for title_abbrev fields that were already appended with a number. This change compares the field values without any appended numbers, and then appends a new number based on the count. Initially I wanted to just pick up where the count left off. However, it turned out to be possible to generate duplicate titles on subsequent runs, so there was no way to determine what the 'original' duplicate was.
1 parent a17df93 commit 60f0368

File tree

9 files changed

+191
-23
lines changed

9 files changed

+191
-23
lines changed

aws_doc_sdk_examples_tools/agent/update_doc_gen.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def examples_from_updates(updates_path: Path) -> Iterable[Example]:
2626
examples = [
2727
Example(
2828
id=id,
29-
file=None,
29+
file=Path(),
3030
languages={},
3131
title=update.get("title"),
3232
title_abbrev=update.get("title_abbrev"),

aws_doc_sdk_examples_tools/lliam/__init__.py

Whitespace-only changes.
Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
import logging
2+
import re
3+
14
from collections import Counter
25
from dataclasses import replace
3-
import logging
4-
from typing import Dict
6+
from pathlib import Path
7+
from typing import Dict, Iterable, List
58

69
from aws_doc_sdk_examples_tools.doc_gen import DocGen
710
from aws_doc_sdk_examples_tools.lliam.domain.commands import DedupeReservoir
@@ -12,33 +15,75 @@
1215
logger = logging.getLogger(__name__)
1316

1417

15-
def make_title_abbreviation(example: Example, counter: Counter):
18+
def make_abbrev(example: Example, counter: Counter) -> str:
19+
if not example.title_abbrev:
20+
return ""
21+
1622
count = counter[example.title_abbrev]
1723
abbrev = f"{example.title_abbrev} ({count + 1})" if count else example.title_abbrev
1824
counter[example.title_abbrev] += 1
1925
return abbrev
2026

2127

22-
def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
23-
doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False))
28+
def reset_abbrev_count(examples: Dict[str, Example]) -> Dict[str, Example]:
29+
"""
30+
Reset all duplicate title abbreviations back to their un-enumerated state.
31+
32+
I don't love this. Ideally we would only update new title_abbrev fields
33+
with the incremented count. But there's no way to know which ones are new
34+
or even which particular title_abbrev is the original.
2435
25-
examples: Dict[str, Example] = {}
36+
Ex.
37+
title_abbrev: some policy
38+
title_abbrev: some policy (2)
39+
title_abbrev: some policy
40+
title_abbrev: some policy
2641
27-
for id, example in doc_gen.examples.items():
28-
if cmd.packages and example.file:
29-
package = example.file.name.split("_metadata.yaml")[0]
30-
if package in cmd.packages:
31-
examples[id] = example
32-
else:
33-
examples[id] = example
42+
Which one is the original? Which ones are new?
43+
"""
3444

35-
title_abbrev_counts: Counter = Counter()
45+
updated_examples = {}
3646

3747
for id, example in examples.items():
38-
examples[id] = replace(
48+
updated_examples[id] = replace(
3949
example,
40-
title_abbrev=make_title_abbreviation(example, title_abbrev_counts),
50+
title_abbrev=re.sub(r"(\s\(\d+\))*$", "", example.title_abbrev or ""),
4151
)
4252

53+
return updated_examples
54+
55+
56+
def example_in_packages(example: Example, packages: List[str]) -> bool:
57+
if packages and example.file:
58+
(example_pkg_name, *_) = example.file.name.split("_metadata.yaml")
59+
if not example_pkg_name in packages:
60+
return False
61+
return True
62+
63+
64+
def dedupe_examples(
65+
examples: Dict[str, Example], packages: List[str]
66+
) -> Dict[str, Example]:
67+
filtered = {
68+
id: ex for id, ex in examples.items() if example_in_packages(ex, packages)
69+
}
70+
71+
reset_examples = reset_abbrev_count(filtered)
72+
73+
counter: Counter = Counter()
74+
75+
return {
76+
id: replace(ex, title_abbrev=make_abbrev(ex, counter))
77+
for id, ex in reset_examples.items()
78+
}
79+
80+
81+
def write_examples(examples: Dict[str, Example], root: Path):
4382
writes = prepare_write(examples)
44-
write_many(cmd.root, writes)
83+
write_many(root, writes)
84+
85+
86+
def handle_dedupe_reservoir(cmd: DedupeReservoir, uow: None):
87+
doc_gen = DocGen.from_root(cmd.root, validation=ValidationConfig(check_aws=False))
88+
examples = dedupe_examples(doc_gen.examples, cmd.packages)
89+
write_examples(examples, cmd.root)

aws_doc_sdk_examples_tools/lliam/service_layer/run_ailly.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"ailly",
2323
"--max-depth",
2424
"10",
25+
"--no-overwrite",
2526
"--root",
2627
str(AILLY_DIR_PATH),
2728
]

aws_doc_sdk_examples_tools/lliam/service_layer/update_doc_gen.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from dataclasses import replace
22
import json
33
import logging
4-
from collections import Counter
54
from pathlib import Path
65
from typing import Dict, Iterable, List
76

@@ -37,7 +36,7 @@ def examples_from_updates(updates: Updates) -> Iterable[Example]:
3736
examples = [
3837
Example(
3938
id=id,
40-
file=None,
39+
file=Path(),
4140
languages={},
4241
title=update.get("title"),
4342
title_abbrev=update.get("title_abbrev"),
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from collections import Counter
2+
from pathlib import Path
3+
4+
from aws_doc_sdk_examples_tools.metadata import Example
5+
from aws_doc_sdk_examples_tools.lliam.service_layer.dedupe_reservoir import (
6+
make_abbrev,
7+
example_in_packages,
8+
reset_abbrev_count,
9+
dedupe_examples,
10+
)
11+
12+
13+
def test_make_abbrev_continues_numbering():
14+
"""Test that numbering continues from existing numbered titles."""
15+
counter = Counter({"Some abbrev": 2})
16+
example = Example(id="test", file=Path(), languages={}, title_abbrev="Some abbrev")
17+
result = make_abbrev(example, counter)
18+
19+
assert result == "Some abbrev (3)"
20+
21+
22+
def test_make_abbrev_first_occurrence():
23+
"""Test that first occurrence doesn't get numbered."""
24+
counter = Counter()
25+
example = Example(id="test", file=Path(), languages={}, title_abbrev="New abbrev")
26+
result = make_abbrev(example, counter)
27+
28+
assert result == "New abbrev"
29+
assert counter["New abbrev"] == 1
30+
31+
32+
def test_example_in_packages_no_packages():
33+
"""Test that example is included when no packages specified."""
34+
example = Example(id="test", file=Path("test_metadata.yaml"), languages={})
35+
result = example_in_packages(example, [])
36+
37+
assert result is True
38+
39+
40+
def test_example_in_packages_matching_package():
41+
"""Test that example is included when package matches."""
42+
example = Example(id="test", file=Path("pkg1_metadata.yaml"), languages={})
43+
result = example_in_packages(example, ["pkg1", "pkg2"])
44+
45+
assert result is True
46+
47+
48+
def test_example_in_packages_non_matching_package():
49+
"""Test that example is excluded when package doesn't match."""
50+
example = Example(id="test", file=Path("pkg3_metadata.yaml"), languages={})
51+
result = example_in_packages(example, ["pkg1", "pkg2"])
52+
53+
assert result is False
54+
55+
56+
def test_build_abbrev_counter():
57+
"""Test building counter from examples with existing numbered titles."""
58+
examples = {
59+
"1": Example(id="1", file=Path(), languages={}, title_abbrev="Test (1)"),
60+
"2": Example(id="2", file=Path(), languages={}, title_abbrev="Test (2)"),
61+
"3": Example(id="3", file=Path(), languages={}, title_abbrev="Other"),
62+
"4": Example(id="4", file=Path(), languages={}, title_abbrev="Test"),
63+
}
64+
65+
result = reset_abbrev_count(examples)
66+
67+
assert result["1"].title_abbrev == "Test"
68+
assert result["2"].title_abbrev == "Test"
69+
assert result["3"].title_abbrev == "Other"
70+
assert result["4"].title_abbrev == "Test"
71+
72+
73+
def test_build_abbrev_counter_empty():
74+
"""Test building counter from empty examples list."""
75+
result = reset_abbrev_count({})
76+
77+
assert len(result) == 0
78+
79+
80+
def test_dedupe_examples():
81+
"""Test deduping examples with existing numbered titles."""
82+
examples = {
83+
"ex1": Example(
84+
id="ex1",
85+
file=Path("pkg1_metadata.yaml"),
86+
languages={},
87+
title_abbrev="Test (2) (2)",
88+
),
89+
"ex2": Example(
90+
id="ex2",
91+
file=Path("pkg1_metadata.yaml"),
92+
languages={},
93+
title_abbrev="Test (3) (3) (3)",
94+
),
95+
"ex3": Example(
96+
id="ex3", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
97+
),
98+
"ex4": Example(
99+
id="ex4", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
100+
),
101+
"ex5": Example(
102+
id="ex5", file=Path("pkg1_metadata.yaml"), languages={}, title_abbrev="Test"
103+
),
104+
"ex6": Example(
105+
id="ex6", file=Path("pkg2_metadata.yaml"), languages={}, title_abbrev="Test"
106+
),
107+
}
108+
109+
result = dedupe_examples(examples, [])
110+
111+
assert len(result) == 6
112+
title_abbrevs = sorted(
113+
[ex.title_abbrev for ex in result.values() if ex.title_abbrev]
114+
)
115+
assert title_abbrevs == [
116+
"Test",
117+
"Test (2)",
118+
"Test (3)",
119+
"Test (4)",
120+
"Test (5)",
121+
"Test (6)",
122+
]

aws_doc_sdk_examples_tools/lliam/test/update_doc_gen_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def test_update_examples_title_abbrev(doc_gen_tributary: DocGen):
3232
# Create an example with a title_abbrev to update
3333
update_example = Example(
3434
id="iam_policies_example",
35-
file=None,
35+
file=Path(),
3636
languages={},
3737
title_abbrev="Updated Title Abbrev",
3838
)

aws_doc_sdk_examples_tools/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def validate(self, errors: MetadataErrors, root: Path):
139139
@dataclass
140140
class Example:
141141
id: str
142-
file: Optional[Path]
142+
file: Path
143143
languages: Dict[str, Language]
144144
# Human readable title.
145145
title: Optional[str] = field(default="")

aws_doc_sdk_examples_tools/yaml_mapper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
from pathlib import Path
45
from typing import Dict, Set, Tuple, Any, List, Optional, Union
56
from .metadata import (
67
Example,
@@ -112,7 +113,7 @@ def example_from_yaml(
112113
return (
113114
Example(
114115
id="",
115-
file=None,
116+
file=Path(),
116117
title=title,
117118
title_abbrev=title_abbrev,
118119
category=category,

0 commit comments

Comments
 (0)