-
Notifications
You must be signed in to change notification settings - Fork 53
feat: selective includes and LLM-powered extraction (#190) #552
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
74f7056
9d717a1
94e5a97
3b1265b
c8243ee
af1b7e7
502e46f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -12,6 +12,10 @@ | |||||||||||||||||
| from .connect import connect | ||||||||||||||||||
| from .auth import auth_group | ||||||||||||||||||
| from .misc import preprocess | ||||||||||||||||||
| try: | ||||||||||||||||||
| from .extract import extract | ||||||||||||||||||
| except ImportError: | ||||||||||||||||||
| extract = None | ||||||||||||||||||
|
||||||||||||||||||
| except ImportError: | |
| extract = None | |
| except ModuleNotFoundError as exc: | |
| expected_name = __name__.rsplit(".", 1)[0] + ".extract" | |
| if getattr(exc, "name", None) == expected_name: | |
| extract = None | |
| else: | |
| raise |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -217,17 +217,39 @@ def process_xml_tags(text: str, recursive: bool, _seen: Optional[set] = None) -> | |||||||||||||||||||||||||||||
| _seen = set() | ||||||||||||||||||||||||||||||
| text = process_pdd_tags(text) | ||||||||||||||||||||||||||||||
| text = process_include_tags(text, recursive, _seen=_seen) | ||||||||||||||||||||||||||||||
| text = process_extract_tags(text, recursive) | ||||||||||||||||||||||||||||||
| text = process_include_many_tags(text, recursive) | ||||||||||||||||||||||||||||||
| text = process_shell_tags(text, recursive) | ||||||||||||||||||||||||||||||
| text = process_web_tags(text, recursive) | ||||||||||||||||||||||||||||||
| return text | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| def _parse_attrs(attr_str: str) -> dict: | ||||||||||||||||||||||||||||||
| if not attr_str: | ||||||||||||||||||||||||||||||
| return {} | ||||||||||||||||||||||||||||||
| attrs = {} | ||||||||||||||||||||||||||||||
| # Simple attribute parser: key="value" or key='value' | ||||||||||||||||||||||||||||||
| for match in re.finditer(r'(\w+)\s*=\s*["\']([^"\']*)["\']', attr_str): | ||||||||||||||||||||||||||||||
| attrs[match.group(1)] = match.group(2) | ||||||||||||||||||||||||||||||
| return attrs | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| def process_include_tags(text: str, recursive: bool, _seen: Optional[set] = None) -> str: | ||||||||||||||||||||||||||||||
| if _seen is None: | ||||||||||||||||||||||||||||||
| _seen = set() | ||||||||||||||||||||||||||||||
| pattern = r'<include>(.*?)</include>' | ||||||||||||||||||||||||||||||
| # Support both <include>path</include> and <include path="path" attrs... /> | ||||||||||||||||||||||||||||||
| pattern = r'<include(?P<attrs>\s+[^>]*?)?>(?P<content>.*?)</include>|<include(?P<attrs_self>\s+[^>]*?)\s*/>' | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| def replace_include(match): | ||||||||||||||||||||||||||||||
| file_path = match.group(1).strip() | ||||||||||||||||||||||||||||||
| attrs_str = match.group('attrs') or match.group('attrs_self') or "" | ||||||||||||||||||||||||||||||
| attrs = _parse_attrs(attrs_str) | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| file_path = attrs.get('path') | ||||||||||||||||||||||||||||||
| if file_path: | ||||||||||||||||||||||||||||||
| file_path = get_file_path(file_path) or match.group('content') or "" | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
| file_path = attrs.get('path') | |
| if file_path: | |
| file_path = get_file_path(file_path) or match.group('content') or "" | |
| # Support both attribute and content-based include paths | |
| file_path = attrs.get('path') or match.group('content') or "" |
Copilot
AI
Feb 24, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When selectors/limits are requested (`select`, `lines`, `mode`, `max_tokens`) but `pdd.content_selector` is missing or selection raises an error, the code falls back to including the full content. That can silently defeat token budgeting and change prompt semantics. Consider failing fast (raise an error or return a visible placeholder) when selection was explicitly requested, rather than including the entire file.
| console.print("[yellow]Warning: pdd.content_selector not found. Including full content.[/yellow]") | |
| except Exception as e: | |
| console.print(f"[bold red]Error in content selection:[/bold red] {e}") | |
| console.print("[yellow]Warning: pdd.content_selector not found.[/yellow]") | |
| # When selectors/limits are requested but the content selector | |
| # is unavailable, avoid silently including full content. | |
| # First pass (recursive=True): leave the tag so a later run might resolve it | |
| # Second pass (recursive=False): replace with a visible placeholder | |
| return match.group(0) if recursive else f"[Content selector unavailable for: {file_path}]" | |
| except Exception as e: | |
| console.print(f"[bold red]Error in content selection:[/bold red] {e}") | |
| # On selection errors, do not fall back to full content. | |
| # Follow the same recursive/placeholder pattern as for missing files. | |
| return match.group(0) if recursive else f"[Content selection error for: {file_path}]" |
Outdated
Copilot
AI
Feb 24, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`process_extract_tags` validates `path` (and prints an error/returns an error string) before checking `recursive`. In the recursive pass, `<extract>` tags are supposed to be deferred unchanged; as written, a tag with a missing `path` will be replaced with an error during the recursive pass and cannot be retried later. Move the `if recursive: return match.group(0)` check to the top of `replace_extract`, before any parsing/validation.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| <pdd-reason>Provides precise extraction of file content based on criteria like lines, AST, and regex for selective includes.</pdd-reason> | ||
|
|
||
| <pdd-interface> | ||
| { | ||
| "type": "module", | ||
| "module": { | ||
| "functions": [ | ||
| {"name": "ContentSelector.select", "signature": "(content: str, selectors: list[str], ...)", "returns": "str"} | ||
| ] | ||
| } | ||
| } | ||
| </pdd-interface> | ||
|
|
||
| % You are an expert Python engineer. Your goal is to create a module for deterministic content selection from files. | ||
|
|
||
| % Role & Scope | ||
| The `content_selector` module provides precise extraction of file content based on various criteria (lines, AST, Markdown sections, regex). It is used by the PDD preprocessor to handle selective includes. | ||
|
|
||
| % Requirements | ||
| 1. Implement `ContentSelector` class with a `select(content: str, selectors: list[str], file_path: str = None, mode: str = "full", max_tokens: int = None, overflow: str = "warn") -> str` method. | ||
| 2. Support `lines` selector: `lines:N-M`, `lines:N-`, `lines:-M`, `lines:N` (1-based indices). | ||
| 3. Support Python structural selection using `ast` for `.py` files: | ||
| - `def:function_name`: Extracts the full function definition, including decorators. | ||
| - `class:ClassName`: Extracts the full class definition, including decorators and all members. | ||
| - `class:ClassName.method_name`: Extracts a specific method from a class. | ||
| 4. Support Markdown structural selection for `.md` files: | ||
| - `section:Heading`: Extracts all content under the specified heading until the next heading of the same or higher level. | ||
| 5. Support regex pattern selection: `pattern:/regex/`. | ||
| 6. Support `mode="interface"` for Python: | ||
| - Extract only class/function/method signatures, docstrings, and type hints. | ||
| - Remove function/method bodies (replace with `...`). | ||
| - Exclude private members (starting with `_` but not `__init__`) by default. | ||
| 7. Support `max_tokens` (use `tiktoken` if available, otherwise fall back to `len(content) // 4`) with `overflow` options: | ||
| - `error`: Raise an error if limit exceeded. | ||
| - `truncate`: Truncate content and append a warning. | ||
| - `warn`: Include full content but issue a warning (default). | ||
| 8. Handle multiple selectors (comma-separated string or list) by returning the union of selected parts, preserved in original file order. | ||
| 9. Use `rich` for formatted error reporting and console status (e.g., warnings for truncation/overflow). | ||
| 10. Ensure robust error handling for malformed selectors or missing content, providing descriptive error messages. | ||
|
|
||
| % Dependencies | ||
| <rich_example> | ||
| <include>context/core/errors_example.py</include> | ||
| </rich_example> | ||
|
|
||
| % Deliverables | ||
| - Code: `pdd/content_selector.py` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| <pdd-reason>Provides a CLI command for managing LLM-powered semantic extractions from files.</pdd-reason> | ||
|
|
||
| <pdd-interface> | ||
| { | ||
| "type": "cli", | ||
| "cli": { | ||
| "commands": [ | ||
| {"name": "pdd extract", "description": "Manage LLM-powered semantic extractions."} | ||
| ] | ||
| } | ||
| } | ||
| </pdd-interface> | ||
|
|
||
| % You are an expert Python engineer. Your goal is to create a PDD CLI command for managing LLM-powered extractions. | ||
|
|
||
| % Role & Scope | ||
| The `extract` command provides a CLI interface for the `llm_extractor` module, allowing users to refresh, list, and preview semantic extractions. | ||
|
|
||
| % Requirements | ||
| 1. Implement `extract` command in `pdd/commands/extract.py` using `click`. | ||
| 2. Support subcommands: | ||
| - `refresh [PROMPT_FILE]`: Re-runs all `<extract>` tags found in the specified prompt file(s). | ||
| - `list`: Lists all cached extractions in `.pdd/extracts/` with metadata (source, query, size). | ||
| - `status`: Checks for staleness of all cached extractions relative to their source files. | ||
| - `preview PATH --query QUERY`: Performs a one-off extraction and displays it to stdout without caching. | ||
| 3. Integrate with `LLMExtractor` for extraction logic. | ||
| 4. Use `rich` for beautiful table outputs and status messages. | ||
| 5. Register the command in `pdd/commands/__init__.py` (Note: this is a manual integration point, but the module should be designed for it). | ||
| 6. Support `--force` to skip confirmation prompts where applicable. | ||
|
|
||
| % Dependencies | ||
| <llm_extractor_interface> | ||
| # Expected interface for pdd.llm_extractor (yet to be generated) | ||
| class LLMExtractor: | ||
| def refresh_extractions(self, prompt_path: str = None) -> list[dict]: | ||
| """Refreshes extractions. Returns list of updated extraction metadata.""" | ||
| pass | ||
|
|
||
| def list_extractions(self) -> list[dict]: | ||
| """Returns list of all cached extractions with metadata.""" | ||
| pass | ||
|
|
||
| def check_status(self) -> list[dict]: | ||
| """Checks staleness. Returns list of extraction status dicts.""" | ||
| pass | ||
|
|
||
| def preview_extraction(self, path: str, query: str) -> str: | ||
| """Performs one-off extraction without caching.""" | ||
| pass | ||
| </llm_extractor_interface> | ||
|
|
||
| % Deliverables | ||
| - Code: `pdd/commands/extract.py` |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description’s “Next Steps” uses `pdd extract --refresh ...`, but the README examples here use `pdd extract refresh ...` (subcommand).
Please make the PR description and docs consistent with the actual CLI syntax to avoid user confusion.