Skip to content

Commit 4f5823f

Browse files
committed
Fix update json getting Ailly'd
On subsequent runs, update JSON files were being processed by Ailly because of a pattern match on the update files. This changes the update JSON naming pattern and makes some other refactoring changes.
1 parent 05838f1 commit 4f5823f

File tree

6 files changed

+200
-91
lines changed

6 files changed

+200
-91
lines changed

aws_doc_sdk_examples_tools/agent/bin/main.py

Lines changed: 150 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,31 @@
11
from pathlib import Path
22
from subprocess import run
33
from typing import List
4-
import time
4+
from typing_extensions import Annotated
55
from datetime import timedelta, datetime
66

7+
import json
78
import logging
9+
import time
810
import typer
911

12+
from aws_doc_sdk_examples_tools.agent.shared_constants import BATCH_PREFIX
1013
from aws_doc_sdk_examples_tools.agent.make_prompts import make_prompts
1114
from aws_doc_sdk_examples_tools.agent.process_ailly_files import process_ailly_files
1215
from aws_doc_sdk_examples_tools.agent.update_doc_gen import update_doc_gen
1316
from aws_doc_sdk_examples_tools.yaml_writer import prepare_write, write_many
1417

1518
logging.basicConfig(
16-
level=logging.INFO, filename=f"lliam-run-{datetime.now()}.log", filemode="w"
19+
level=logging.INFO,
20+
filename=f"lliam-run-{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
21+
filemode="w",
1722
)
1823
logger = logging.getLogger(__name__)
1924

2025
app = typer.Typer()
2126

2227
AILLY_DIR = ".ailly_iam_policy"
2328
AILLY_DIR_PATH = Path(AILLY_DIR)
24-
IAM_UPDATES_PATH = AILLY_DIR_PATH / "iam_updates.json"
2529

2630

2731
def format_duration(seconds: float) -> str:
@@ -30,72 +34,160 @@ def format_duration(seconds: float) -> str:
3034

3135

3236
@app.command()
33-
def update(
34-
iam_tributary_root: str,
35-
system_prompts: List[str] = [],
36-
skip_generation: bool = False,
37+
def create_prompts(iam_tributary_root: str, system_prompts: List[str] = []):
38+
doc_gen_root = Path(iam_tributary_root)
39+
40+
make_prompts(
41+
doc_gen_root=doc_gen_root,
42+
system_prompts=system_prompts,
43+
out_dir=AILLY_DIR_PATH,
44+
language="IAMPolicyGrammar",
45+
)
46+
47+
48+
def parse_batch_names(batch_nums_str: str) -> List[str]:
49+
"""
50+
Parse batch numbers from a string.
51+
"""
52+
if not batch_nums_str:
53+
return []
54+
55+
result = []
56+
parts = batch_nums_str.split(",")
57+
58+
for part in parts:
59+
part = part.strip()
60+
if ".." in part:
61+
start, end = part.split("..")
62+
start_num = int(start.strip())
63+
end_num = int(end.strip())
64+
result.extend(range(start_num, end_num + 1))
65+
else:
66+
result.append(int(part))
67+
68+
batch_nums = sorted(list(set(result)))
69+
return map(lambda b: f"{BATCH_PREFIX}{b:03}", batch_nums)
70+
71+
72+
def resolve_requested_batches(batch_names: List[str]) -> List[Path]:
73+
if not batch_names:
74+
batch_paths = [
75+
p
76+
for p in AILLY_DIR_PATH.iterdir()
77+
if p.is_dir() and p.name.startswith(BATCH_PREFIX)
78+
]
79+
80+
return batch_paths
81+
82+
batch_paths = []
83+
84+
for batch_name in batch_names:
85+
batch_path = Path(AILLY_DIR_PATH / batch_name)
86+
if not batch_path.exists():
87+
raise FileNotFoundError(batch_path)
88+
if not batch_path.is_dir():
89+
raise NotADirectoryError(batch_path)
90+
batch_paths.append(batch_path)
91+
92+
return batch_paths
93+
94+
95+
@app.command()
96+
def run_ailly(
97+
batch_nums: Annotated[
98+
str,
99+
typer.Option(
100+
help="Batch numbers to process (e.g., '33', '33..35', '33,35,37')"
101+
),
102+
] = None,
37103
) -> None:
38104
"""
39-
Generate new IAM policy metadata for a tributary.
105+
Run ailly to generate IAM policy content and process the results.
106+
If batch_nums is specified, only those batches will be processed.
107+
If batch_nums is omitted, all batches will be processed.
40108
"""
41-
doc_gen_root = Path(iam_tributary_root)
109+
requested_batches = parse_batch_names(batch_nums)
110+
resolved_batches = resolve_requested_batches(requested_batches)
42111

43-
if not skip_generation:
44-
make_prompts(
45-
doc_gen_root=doc_gen_root,
46-
system_prompts=system_prompts,
47-
out_dir=AILLY_DIR_PATH,
48-
language="IAMPolicyGrammar",
112+
if resolved_batches:
113+
total_start_time = time.time()
114+
115+
for batch in resolved_batches:
116+
run_ailly_single_batch(batch)
117+
118+
total_end_time = time.time()
119+
total_duration = total_end_time - total_start_time
120+
num_batches = len(resolved_batches)
121+
logger.info(
122+
f"[TIMECHECK] {num_batches} batches took {format_duration(total_duration)} to run"
49123
)
50124

51-
batch_dirs = [
52-
d.name
53-
for d in AILLY_DIR_PATH.iterdir()
54-
if d.is_dir() and d.name.startswith("batch_")
55-
]
56125

57-
if batch_dirs:
58-
total_start_time = time.time()
59-
60-
for batch_dir in sorted(batch_dirs):
61-
batch_start_time = time.time()
62-
63-
cmd = [
64-
"ailly",
65-
"--max-depth",
66-
"10",
67-
"--root",
68-
AILLY_DIR,
69-
str(batch_dir),
70-
]
71-
logger.info(f"Running {cmd}")
72-
run(cmd)
73-
74-
batch_end_time = time.time()
75-
batch_duration = batch_end_time - batch_start_time
76-
batch_num = batch_dir.replace("batch_", "")
77-
logger.info(
78-
f"[TIMECHECK] Batch {batch_num} took {format_duration(batch_duration)} to run"
79-
)
80-
81-
total_end_time = time.time()
82-
total_duration = total_end_time - total_start_time
83-
num_batches = len(batch_dirs)
84-
logger.info(
85-
f"[TIMECHECK] {num_batches} batches took {format_duration(total_duration)} to run"
86-
)
87-
88-
logger.info("Processing generated content")
89-
process_ailly_files(
90-
input_dir=str(AILLY_DIR_PATH), output_file=str(IAM_UPDATES_PATH)
126+
@app.command()
127+
def update_reservoir(
128+
iam_tributary_root: str,
129+
batch_nums: Annotated[
130+
str,
131+
typer.Option(
132+
help="Batch numbers to process (e.g., '33', '33..35', '33,35,37')"
133+
),
134+
] = None,
135+
) -> None:
136+
"""
137+
Update the doc_gen reservoir with processed IAM policy updates.
138+
If batch_nums is specified, only those batches will be processed.
139+
If batch_nums is omitted, all available update files will be processed.
140+
"""
141+
doc_gen_root = Path(iam_tributary_root)
142+
batch_names = parse_batch_names(batch_nums) if batch_nums else None
143+
144+
update_files = (
145+
[AILLY_DIR_PATH / f"updates_{batch}.json" for batch in batch_names]
146+
if batch_names
147+
else list(AILLY_DIR_PATH.glob(f"updates_{BATCH_PREFIX}*.json"))
91148
)
92149

93-
doc_gen = update_doc_gen(
94-
doc_gen_root=doc_gen_root, iam_updates_path=IAM_UPDATES_PATH
150+
if not update_files:
151+
logger.warning("No IAM update files found to process")
152+
return
153+
154+
for update_file in sorted(update_files):
155+
if update_file.exists():
156+
logger.info(f"Processing updates from {update_file.name}")
157+
updates = json.loads(update_file.read_text())
158+
159+
doc_gen = update_doc_gen(doc_gen_root=doc_gen_root, updates=updates)
160+
161+
writes = prepare_write(doc_gen.examples)
162+
write_many(doc_gen_root, writes)
163+
else:
164+
logger.warning(f"Update file not found: {update_file}")
165+
166+
167+
def run_ailly_single_batch(batch: Path) -> None:
168+
"""Run ailly and process files for a single batch."""
169+
batch_start_time = time.time()
170+
iam_updates_path = AILLY_DIR_PATH / f"updates_{batch.name}.json"
171+
172+
cmd = [
173+
"ailly",
174+
"--max-depth",
175+
"10",
176+
"--root",
177+
str(AILLY_DIR_PATH),
178+
batch.name,
179+
]
180+
logger.info(f"Running {cmd}")
181+
run(cmd)
182+
183+
batch_end_time = time.time()
184+
batch_duration = batch_end_time - batch_start_time
185+
logger.info(
186+
f"[TIMECHECK] {batch.name} took {format_duration(batch_duration)} to run"
95187
)
96188

97-
writes = prepare_write(doc_gen.examples)
98-
write_many(doc_gen_root, writes)
189+
logger.info(f"Processing generated content for {batch.name}")
190+
process_ailly_files(input_dir=batch, output_file=iam_updates_path)
99191

100192

101193
if __name__ == "__main__":

aws_doc_sdk_examples_tools/agent/make_prompts.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@
33
import logging
44
import os
55
import yaml
6+
from itertools import islice
67
from pathlib import Path
7-
from typing import List
8+
from typing import Any, Generator, Iterable, List, Tuple
89

9-
from aws_doc_sdk_examples_tools.doc_gen import DocGen
10+
from aws_doc_sdk_examples_tools.agent.shared_constants import BATCH_PREFIX
11+
from aws_doc_sdk_examples_tools.doc_gen import DocGen, Example
1012

1113
DEFAULT_METADATA_PREFIX = "DEFAULT"
14+
BATCH_SIZE = 150
1215

1316
logger = logging.getLogger(__name__)
1417

@@ -20,11 +23,21 @@ def make_doc_gen(root: Path) -> DocGen:
2023
return doc_gen
2124

2225

26+
def batched(iterable: Iterable, n: int) -> Generator[Tuple, Any, None]:
27+
"Batch data into tuples of length n. The last batch may be shorter."
28+
# batched('ABCDEFG', 3) --> ABC DEF G
29+
if n < 1:
30+
raise ValueError("n must be at least one")
31+
it = iter(iterable)
32+
while batch := tuple(islice(it, n)):
33+
yield batch
34+
35+
2336
def write_prompts(doc_gen: DocGen, out_dir: Path, language: str) -> None:
2437
examples = doc_gen.examples
2538
snippets = doc_gen.snippets
2639

27-
filtered_examples = []
40+
filtered_examples: Tuple[str, Example] = []
2841
for example_id, example in examples.items():
2942
# TCXContentAnalyzer prefixes new metadata title/title_abbrev entries with
3043
# the DEFAULT_METADATA_PREFIX. Checking this here to make sure we're only
@@ -36,23 +49,15 @@ def write_prompts(doc_gen: DocGen, out_dir: Path, language: str) -> None:
3649
):
3750
filtered_examples.append((example_id, example))
3851

39-
batch_size = 150
40-
total_examples = len(filtered_examples)
41-
num_batches = (total_examples + batch_size - 1) // batch_size
42-
4352
logger.info(
44-
f"Splitting {total_examples} examples into {num_batches} batches of {batch_size}"
53+
f"Splitting {len(filtered_examples)} examples into batches of {BATCH_SIZE}"
4554
)
4655

47-
for batch_num in range(num_batches):
48-
batch_dir = out_dir / f"batch_{(batch_num + 1):03}"
56+
for batch_num, batch in enumerate(batched(filtered_examples, BATCH_SIZE)):
57+
batch_dir = out_dir / f"{BATCH_PREFIX}{(batch_num + 1):03}"
4958
batch_dir.mkdir(exist_ok=True)
5059

51-
start_idx = batch_num * batch_size
52-
end_idx = min((batch_num + 1) * batch_size, total_examples)
53-
54-
for i in range(start_idx, end_idx):
55-
example_id, example = filtered_examples[i]
60+
for example_id, example in batch:
5661
prompt_path = batch_dir / f"{example_id}.md"
5762

5863
try:

aws_doc_sdk_examples_tools/agent/process_ailly_files.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from pathlib import Path
1212
from typing import Any, Dict, List, Set
1313

14-
logging.basicConfig(level=logging.WARNING)
1514
logger = logging.getLogger(__name__)
1615

1716
EXPECTED_KEYS: Set[str] = set(["title", "title_abbrev"])
@@ -91,7 +90,7 @@ def parse_ailly_file(
9190

9291

9392
def process_ailly_files(
94-
input_dir: str, output_file: str, file_pattern: str = "*.md.ailly.md"
93+
input_dir: Path, output_file: Path, file_pattern: str = "*.md.ailly.md"
9594
) -> None:
9695
"""
9796
Process all .md.ailly.md files in the input directory and write the results as JSON to the output file.
@@ -102,10 +101,9 @@ def process_ailly_files(
102101
file_pattern: Pattern to match files (default: "*.md.ailly.md")
103102
"""
104103
results = []
105-
input_path = Path(input_dir)
106104

107105
try:
108-
for file_path in input_path.rglob(file_pattern):
106+
for file_path in input_dir.rglob(file_pattern):
109107
logger.info(f"Processing file: {file_path}")
110108
parsed_data = parse_ailly_file(str(file_path))
111109
if parsed_data:
@@ -115,7 +113,7 @@ def process_ailly_files(
115113
json.dump(results, out_file, indent=2)
116114

117115
logger.info(
118-
f"Successfully processed {len(results)} files. Output written to {output_file}"
116+
f"Successfully processed {len(results)} files. Output written to {output_file.name}"
119117
)
120118

121119
except Exception as e:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
BATCH_PREFIX = "batch_"

0 commit comments

Comments
 (0)