Skip to content

Commit d705845

Browse files
committed
Update Ailly response parser to expect a different format
Previously, we asked the LLM to output JSON. This proved difficult to parse because the LLM frequently produced invalid JSON. This change expects the output to be in a new, custom format instead.
1 parent 85924b0 commit d705845

File tree

6 files changed

+302
-92
lines changed

6 files changed

+302
-92
lines changed

aws_doc_sdk_examples_tools/agent/bin/main.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
import typer
66

7-
from aws_doc_sdk_examples_tools.agent.make_prompts import main as make_prompts
8-
from aws_doc_sdk_examples_tools.agent.parse_json_files import main as parse_json_files
9-
from aws_doc_sdk_examples_tools.agent.update_doc_gen import update as update_doc_gen
7+
from aws_doc_sdk_examples_tools.agent.make_prompts import make_prompts
8+
from aws_doc_sdk_examples_tools.agent.process_ailly_files import process_ailly_files
9+
from aws_doc_sdk_examples_tools.agent.update_doc_gen import update_doc_gen
1010
from aws_doc_sdk_examples_tools.yaml_writer import prepare_write, write_many
1111

1212
app = typer.Typer()
@@ -16,26 +16,28 @@
1616
IAM_UPDATES_PATH = AILLY_DIR_PATH / "iam_updates.json"
1717

1818

19-
def get_ailly_files(dir: Path) -> List[Path]:
20-
return [
21-
file
22-
for file in dir.iterdir()
23-
if file.is_file() and file.name.endswith(".ailly.md")
24-
]
25-
26-
2719
@app.command()
2820
def update(iam_tributary_root: str, system_prompts: List[str] = []) -> None:
21+
"""
22+
Generate new IAM policy metadata for a tributary.
23+
"""
2924
doc_gen_root = Path(iam_tributary_root)
3025
make_prompts(
31-
doc_gen_root=doc_gen_root, system_prompts=system_prompts, out=AILLY_DIR_PATH
26+
doc_gen_root=doc_gen_root,
27+
system_prompts=system_prompts,
28+
out_dir=AILLY_DIR_PATH,
29+
language="IAMPolicyGrammar",
3230
)
3331
run(["npx", "@ailly/cli", "--root", AILLY_DIR])
34-
file_paths = get_ailly_files(AILLY_DIR_PATH)
35-
parse_json_files(file_paths=file_paths, out=IAM_UPDATES_PATH)
32+
33+
process_ailly_files(
34+
input_dir=str(AILLY_DIR_PATH), output_file=str(IAM_UPDATES_PATH)
35+
)
36+
3637
doc_gen = update_doc_gen(
3738
doc_gen_root=doc_gen_root, iam_updates_path=IAM_UPDATES_PATH
3839
)
40+
3941
writes = prepare_write(doc_gen.examples)
4042
write_many(doc_gen_root, writes)
4143

aws_doc_sdk_examples_tools/agent/make_prompts.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,13 @@ def make_doc_gen(root: Path) -> DocGen:
1919
return doc_gen
2020

2121

22-
def write_prompts(doc_gen: DocGen, out: Path) -> None:
23-
out.mkdir(parents=True, exist_ok=True)
22+
def write_prompts(doc_gen: DocGen, out_dir: Path, language: str) -> None:
2423
examples = doc_gen.examples
2524
snippets = doc_gen.snippets
2625
for example_id, example in examples.items():
27-
# Postfix with `.md` so Ailly will pick it up.
28-
prompt_path = out / f"{example_id}.md"
29-
# This assumes we're running DocGen specifically on AWSIAMPolicyExampleReservoir.
26+
prompt_path = out_dir / f"{example_id}.md"
3027
snippet_key = (
31-
example.languages["IAMPolicyGrammar"]
28+
example.languages[language]
3229
.versions[0]
3330
.excerpts[0]
3431
.snippet_files[0]
@@ -38,7 +35,7 @@ def write_prompts(doc_gen: DocGen, out: Path) -> None:
3835
prompt_path.write_text(snippet.code, encoding="utf-8")
3936

4037

41-
def setup_ailly(system_prompts: List[str], out: Path) -> None:
38+
def setup_ailly(system_prompts: List[str], out_dir: Path) -> None:
4239
"""Create the .aillyrc configuration file."""
4340
fence = "---"
4441
options = {"isolated": "true"}
@@ -47,32 +44,33 @@ def setup_ailly(system_prompts: List[str], out: Path) -> None:
4744

4845
content = f"{fence}\n{options_block}\n{fence}\n{prompts_block}"
4946

50-
aillyrc_path = out / ".aillyrc"
51-
aillyrc_path.parent.mkdir(parents=True, exist_ok=True)
47+
aillyrc_path = out_dir / ".aillyrc"
5248
aillyrc_path.write_text(content, encoding="utf-8")
5349

5450

55-
def read_system_prompts(values: List[str]) -> List[str]:
56-
"""Parse system prompts from a list of strings or file paths."""
57-
prompts = []
51+
def read_files(values: List[str]) -> List[str]:
52+
"""Read contents of files into a list of file contents."""
53+
contents = []
5854
for value in values:
5955
if os.path.isfile(value):
6056
with open(value, "r", encoding="utf-8") as f:
61-
prompts.append(f.read())
57+
contents.append(f.read())
6258
else:
63-
prompts.append(value)
64-
return prompts
59+
contents.append(value)
60+
return contents
6561

6662

6763
def validate_root_path(doc_gen_root: Path):
68-
assert "AWSIAMPolicyExampleReservoir" in str(doc_gen_root)
6964
assert doc_gen_root.is_dir()
7065

7166

72-
def main(doc_gen_root: Path, system_prompts: List[str], out: Path) -> None:
67+
def make_prompts(
68+
doc_gen_root: Path, system_prompts: List[str], out_dir: Path, language: str
69+
) -> None:
7370
"""Generate prompts and configuration files for Ailly."""
74-
system_prompts = read_system_prompts(system_prompts)
75-
setup_ailly(system_prompts, out)
7671
validate_root_path(doc_gen_root)
72+
out_dir.mkdir(parents=True, exist_ok=True)
73+
system_prompts = read_files(system_prompts)
74+
setup_ailly(system_prompts, out_dir)
7775
doc_gen = make_doc_gen(doc_gen_root)
78-
write_prompts(doc_gen, out)
76+
write_prompts(doc_gen=doc_gen, out_dir=out_dir, language=language)

aws_doc_sdk_examples_tools/agent/parse_json_files.py

Lines changed: 0 additions & 56 deletions
This file was deleted.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
2+
Parse generated Ailly output for key: value pairs.
3+
4+
This module processes *.md.ailly.md files, extracts key-value pairs,
5+
converts them to JSON entries in an array, and writes the JSON array
6+
to a specified output file.
7+
"""
8+
9+
import json
10+
import logging
11+
from pathlib import Path
12+
from typing import Any, Dict, List
13+
14+
logging.basicConfig(level=logging.WARNING)
15+
logger = logging.getLogger(__name__)
16+
17+
18+
def parse_fenced_blocks(content: str, fence="===") -> List[List[str]]:
    """
    Collect the lines found between pairs of fence markers.

    A line counts as a fence when, after stripping surrounding whitespace, it
    equals `fence`. Fences alternate between opening and closing a block.
    Lines outside of a block are ignored, and a block that is opened but never
    closed is discarded.

    Args:
        content: Full text to scan.
        fence: Marker that opens and closes a block.

    Returns:
        One list of (unstripped) lines per closed block, in order of appearance.
    """
    found: List[List[str]] = []
    pending: List[str] = []
    is_open = False

    for raw_line in content.splitlines():
        is_marker = raw_line.strip() == fence
        if is_marker and is_open:
            # Closing fence: the pending lines form a complete block.
            found.append(pending)
            pending = []
            is_open = False
        elif is_marker:
            # Opening fence: start collecting from the next line.
            is_open = True
        elif is_open:
            pending.append(raw_line)

    return found
33+
34+
35+
def parse_block_lines(block: List[str], key_pairs: Dict[str, str]) -> None:
    """
    Extract `key => value` pairs from a block of lines into `key_pairs`.

    Each line is split on its first `=>`; the text before it becomes the key
    and everything after it becomes the value, both stripped of surrounding
    whitespace. Lines without `=>` are ignored. A key that appears more than
    once keeps the value from its last occurrence.

    Args:
        block: Lines to scan for key/value pairs.
        key_pairs: Dictionary that is updated in place with the pairs found.
    """
    for line in block:
        # partition always returns three parts, so no length check is needed;
        # an empty separator means the line has no `=>` at all.
        key, separator, value = line.partition("=>")
        if not separator:
            continue
        key_pairs[key.strip()] = value.strip()
42+
43+
44+
def parse_ailly_file(file_path: str) -> Dict[str, Any]:
    """
    Read an .md.ailly.md file and collect the key-value pairs it contains.

    Only lines between === fence markers are considered, and each pair is
    expected on a single line in the form `key => value`. This format is
    entirely dependent on the LLM output written by Ailly.

    Two extra entries are added after parsing: `id`, the file name up to the
    `.md.ailly.md` suffix, and `_source_file`, the path that was passed in.

    Any exception raised while reading or parsing is logged rather than
    propagated, and whatever was collected up to that point is returned (an
    empty dictionary if the file could not be read).

    Args:
        file_path: Path to the .md.ailly.md file

    Returns:
        Dictionary containing the extracted key-value pairs
    """
    pairs: Dict[str, str] = {}

    try:
        source = Path(file_path)
        text = source.read_text(encoding="utf-8")

        for fenced_lines in parse_fenced_blocks(text):
            parse_block_lines(fenced_lines, pairs)

        # Everything before the Ailly suffix is the example id.
        pairs["id"] = source.name.split(".md.ailly.md")[0]
        pairs["_source_file"] = file_path
    except Exception as e:
        logger.error(f"Error parsing file {file_path}", exc_info=e)

    return pairs
72+
73+
74+
def process_ailly_files(
    input_dir: str, output_file: str, file_pattern: str = "*.md.ailly.md"
) -> None:
    """
    Parse every matching Ailly file in a directory and write the results as JSON.

    Files are processed in sorted path order so that the output array is the
    same from run to run regardless of how the filesystem lists the directory.
    Files for which parsing yields an empty dictionary are left out of the
    output. Any exception raised while globbing, parsing, or writing is logged
    and not re-raised.

    Args:
        input_dir: Directory containing .md.ailly.md files
        output_file: Path to the output JSON file
        file_pattern: Pattern to match files (default: "*.md.ailly.md")
    """
    results = []
    input_path = Path(input_dir)

    try:
        # glob yields matches in an arbitrary, filesystem-dependent order;
        # sorting keeps the generated JSON stable and diff-friendly.
        for file_path in sorted(input_path.glob(file_pattern)):
            logger.info(f"Processing file: {file_path}")
            parsed_data = parse_ailly_file(str(file_path))
            if parsed_data:
                results.append(parsed_data)

        with open(output_file, "w", encoding="utf-8") as out_file:
            json.dump(results, out_file, indent=2)

        logger.info(
            f"Successfully processed {len(results)} files. Output written to {output_file}"
        )

    except Exception as e:
        logger.error("Error processing files", exc_info=e)

0 commit comments

Comments
 (0)