"""
Parse generated Ailly output for key-value pairs.

This module processes *.md.ailly.md files, extracts key-value pairs,
converts them to JSON entries in an array, and writes the JSON array
to a specified output file.
"""

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Set

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

EXPECTED_KEYS: Set[str] = {"title", "title_abbrev"}
VALUE_PREFIXES: Dict[str, str] = {
    "title": "Example policy: ",
    "title_abbrev": "Example: ",
}
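
# The expected Ailly output looks roughly like the block below. This sample is
# illustrative only (not taken from a real Ailly run); the actual wording of the
# values depends on the prompt and the LLM response.
#
#   ===
#   title => Grant read-only access to an S3 bucket
#   title_abbrev => S3 read-only access
#   ===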


class MissingExpectedKeys(Exception):
    """Raised when a parsed block does not contain all of the expected keys."""


def parse_fenced_blocks(content: str, fence: str = "===") -> List[List[str]]:
    """Return the groups of lines found between consecutive pairs of `fence` lines."""
    blocks: List[List[str]] = []
    inside_fence = False
    current_block: List[str] = []

    for line in content.splitlines():
        if line.strip() == fence:
            if inside_fence:
                blocks.append(current_block)
                current_block = []
            inside_fence = not inside_fence
        elif inside_fence:
            current_block.append(line)

    return blocks
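
# Sketch of the behavior (illustrative example, not taken from the original module):
# parse_fenced_blocks("===\ntitle => Foo\n===\n") returns [["title => Foo"]];
# only the lines between a matching pair of fence markers are kept.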


def parse_block_lines(
    block: List[str],
    key_pairs: Dict[str, str],
    expected_keys: Set[str] = EXPECTED_KEYS,
) -> None:
    """Extract `key => value` lines from a block into key_pairs (modified in place).

    Raises:
        MissingExpectedKeys: If any expected key is still absent after the block is parsed.
    """
    for line in block:
        if "=>" in line:
            parts = line.split("=>", 1)
            key = parts[0].strip()
            value = parts[1].strip()
            key_pairs[key] = value
    if missing_keys := expected_keys - key_pairs.keys():
        raise MissingExpectedKeys(missing_keys)


def parse_ailly_file(
    file_path: str, value_prefixes: Dict[str, str] = VALUE_PREFIXES
) -> Dict[str, Any]:
    """
    Parse an .md.ailly.md file and extract the key-value pairs found between === fence
    markers. Each pair is assumed to be on its own line in the form `key => value`;
    this format depends entirely on the LLM output written by Ailly.

    Args:
        file_path: Path to the .md.ailly.md file
        value_prefixes: Prefixes to prepend to the values of specific keys

    Returns:
        Dictionary containing the extracted key-value pairs
    """
    result: Dict[str, str] = {}

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        blocks = parse_fenced_blocks(content)

        for block in blocks:
            parse_block_lines(block, result)

        # Prepend the configured prefixes to the values of specific keys.
        for key, prefix in value_prefixes.items():
            if key in result:
                result[key] = f"{prefix}{result[key]}"

        # Derive an id from the file name, e.g. "name.md.ailly.md" -> "name".
        result["id"] = Path(file_path).name.split(".md.ailly.md")[0]
        result["_source_file"] = file_path

    except Exception as e:
        logger.error(f"Error parsing file {file_path}", exc_info=e)

    return result

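# Illustrative result (the file name and values below are hypothetical):
# parse_ailly_file("policies/my-policy.md.ailly.md") would return roughly
#   {
#       "title": "Example policy: Grant read-only access to an S3 bucket",
#       "title_abbrev": "Example: S3 read-only access",
#       "id": "my-policy",
#       "_source_file": "policies/my-policy.md.ailly.md",
#   }
# assuming the fenced block contained matching `title => ...` and `title_abbrev => ...` lines.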

def process_ailly_files(
    input_dir: str, output_file: str, file_pattern: str = "*.md.ailly.md"
) -> None:
    """
    Process all .md.ailly.md files in the input directory and write the results as JSON
    to the output file.

    Args:
        input_dir: Directory containing .md.ailly.md files
        output_file: Path to the output JSON file
        file_pattern: Pattern to match files (default: "*.md.ailly.md")
    """
    results = []
    input_path = Path(input_dir)

    try:
        for file_path in input_path.glob(file_pattern):
            logger.info(f"Processing file: {file_path}")
            parsed_data = parse_ailly_file(str(file_path))
            if parsed_data:
                results.append(parsed_data)

        with open(output_file, "w", encoding="utf-8") as out_file:
            json.dump(results, out_file, indent=2)

        logger.info(
            f"Successfully processed {len(results)} files. Output written to {output_file}"
        )

    except Exception as e:
        logger.error("Error processing files", exc_info=e)
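

# Minimal CLI sketch (assumed entry point; the argument names here are illustrative
# and not defined elsewhere in the module).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract key-value pairs from *.md.ailly.md files into a JSON array."
    )
    parser.add_argument("input_dir", help="Directory containing .md.ailly.md files")
    parser.add_argument("output_file", help="Path of the JSON file to write")
    args = parser.parse_args()

    process_ailly_files(args.input_dir, args.output_file)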