Skip to content

Commit 22af5e1

Browse files
author
Jet Xu
committed
Changes to be committed:
modified: CHANGELOG.md modified: llama_github/data_retrieval/github_entities.py modified: llama_github/version.py modified: setup.cfg
1 parent 0f21d99 commit 22af5e1

File tree

4 files changed

+94
-48
lines changed

4 files changed

+94
-48
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.3.2] - 2025-06-23
9+
10+
### Optimized
11+
- Upgraded the `extract_related_issues` method: it now picks aggressive or strict matching patterns based on the PR description length
12+
813
## [0.3.1] - 2025-05-25
914

1015
### Optimized

llama_github/data_retrieval/github_entities.py

Lines changed: 87 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -396,77 +396,118 @@ def get_issue_content(self, number, issue=None) -> str:
396396

397397
def extract_related_issues(self, pr_data: Dict[str, Any]) -> List[int]:
398398
"""
399-
Extracts related issue numbers from all PR data following GitHub's reference syntax.
400-
401-
This function implements GitHub's official autolink reference formats to find:
402-
1. Full GitHub issue/PR URLs
403-
2. Numeric references (#123)
404-
3. Keywords + issue references (fixes #123)
405-
4. Repository cross-references (owner/repo#123)
406-
407-
See: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/autolinked-references-and-urls
408-
399+
Extracts related issue numbers from PR data using adaptive strategies based on content length.
400+
401+
Uses different matching strategies:
402+
- Short descriptions (<200 chars): Aggressive patterns for simple references
403+
- Long descriptions (>=200 chars): Strict patterns to avoid false positives
404+
409405
Args:
410-
pr_data: Dict[str, Any] - The complete pull request data dictionary
406+
pr_data: Complete pull request data dictionary
411407
412408
Returns:
413-
List[int] - A sorted list of unique issue numbers found in the PR data
409+
List[int] - Sorted list of unique issue numbers
414410
"""
415411
# GitHub's official closing keywords, plus looser reference words (address/relate/see) and issue nouns (issue/bug/ticket/todo/task)
416412
closing_keywords = (
417-
'close', 'closes', 'closed',
418-
'fix', 'fixes', 'fixed',
419-
'resolve', 'resolves', 'resolved'
413+
'close', 'closes', 'closed',
414+
'fix', 'fixes', 'fixed',
415+
'resolve', 'resolves', 'resolved',
416+
'address', 'addresses', 'addressing',
417+
'relate', 'relates', 'related',
418+
'see',
419+
'issue', 'bug', 'ticket', 'todo', 'task'
420420
)
421421

422-
# Regex patterns for GitHub issue references
423-
patterns = [
424-
# Full GitHub issue/PR URL pattern
425-
rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
426-
427-
# # Standard #123 reference with proper boundaries
428-
# r'(?:^|[^\w/])#(\d+)(?=[^\w/]|$)',
429-
430-
# Closing keywords (fixes #123)
431-
fr'(?:^|[^\w/])(?:{"|".join(closing_keywords)}):?\s+#(\d+)(?=[^\w/]|$)',
432-
433-
# Cross-repo reference (owner/repo#123)
434-
rf'{re.escape(self.full_name)}#(\d+)',
422+
issues = set()
423+
424+
def get_description_length(data: Dict[str, Any]) -> int:
425+
"""Get the length of PR description for strategy selection"""
426+
try:
427+
description = data.get('pr_metadata', {}).get('description', '')
428+
return len(description) if isinstance(description, str) else 0
429+
except:
430+
return 0
431+
432+
def extract_with_aggressive_patterns(text: str) -> None:
433+
"""Aggressive patterns for short, focused descriptions"""
434+
if not isinstance(text, str):
435+
return
436+
437+
patterns = [
438+
# Simple #123 reference (most common in short descriptions)
439+
r'#(\d+)(?!\d)',
440+
441+
# Full GitHub URLs
442+
rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
443+
444+
# Closing keywords with flexible spacing
445+
fr'(?:{"|".join(closing_keywords)})\s*:?\s*#?(\d+)(?!\d)',
446+
447+
# Action words commonly used in short descriptions
448+
r'(?:addresses?|references?|relates?\s+to|see)\s+#?(\d+)(?!\d)',
449+
]
435450

436-
# Issue keyword reference (issue #123 or issue: #123)
437-
r'(?:^|[^\w/])(?:issue|bug|ticket|todo|task)s?:?\s+#?(\d+)(?=[^\w/]|$)'
438-
]
451+
for pattern in patterns:
452+
matches = re.findall(pattern, text, re.IGNORECASE)
453+
valid_matches = [
454+
int(match) for match in matches
455+
if match.isdigit() and len(match) <= 6 and int(match) > 0
456+
]
457+
issues.update(valid_matches)
439458

440-
issues = set()
441-
442-
def extract_from_text(text: str) -> None:
443-
"""Helper function to extract issue numbers from text"""
459+
def extract_with_strict_patterns(text: str) -> None:
460+
"""Strict patterns for long descriptions to avoid false positives"""
444461
if not isinstance(text, str):
445462
return
446463

464+
patterns = [
465+
# Full GitHub URLs (always reliable)
466+
rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
467+
468+
# Closing keywords with word boundaries
469+
fr'\b(?:{"|".join(closing_keywords)})\s*:?\s*#(\d+)\b',
470+
471+
# Explicit issue references with word boundaries
472+
r'\b(?:issue|bug|ticket|pr|pull\s+request)\s*:?\s*#?(\d+)\b',
473+
474+
# Cross-repo references
475+
rf'\b{re.escape(self.full_name)}#(\d+)\b',
476+
]
477+
447478
for pattern in patterns:
448-
matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
449-
# Validate issue numbers (reasonable length and positive values)
479+
matches = re.findall(pattern, text, re.IGNORECASE)
450480
valid_matches = [
451481
int(match) for match in matches
452-
if match.isdigit() and len(match) <= 7 and int(match) > 0
482+
if match.isdigit() and len(match) <= 6 and int(match) > 0
453483
]
454484
issues.update(valid_matches)
455485

456-
def process_value(value: Any) -> None:
457-
"""Recursively process dictionary values and extract issue numbers"""
486+
def extract_from_text(text: str, use_aggressive: bool = False) -> None:
487+
"""Extract issue numbers using appropriate strategy"""
488+
if use_aggressive:
489+
extract_with_aggressive_patterns(text)
490+
else:
491+
extract_with_strict_patterns(text)
492+
493+
def process_value(value: Any, use_aggressive: bool = False) -> None:
494+
"""Recursively process values and extract issue numbers"""
458495
if isinstance(value, dict):
459496
for v in value.values():
460-
process_value(v)
497+
process_value(v, use_aggressive)
461498
elif isinstance(value, (list, tuple)):
462499
for item in value:
463-
process_value(item)
500+
process_value(item, use_aggressive)
464501
elif isinstance(value, str):
465-
extract_from_text(value)
502+
extract_from_text(value, use_aggressive)
503+
504+
# Determine strategy based on description length
505+
desc_length = get_description_length(pr_data)
506+
use_aggressive_strategy = desc_length < 200
507+
508+
# Process all PR data
509+
process_value(pr_data, use_aggressive_strategy)
466510

467-
# Process all data in pr_data recursively
468-
process_value(pr_data)
469-
470511
return sorted(list(issues))
471512

472513

llama_github/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.3.1'
1+
__version__ = '0.3.2'

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = llama-github
3-
version = 0.3.1
3+
version = 0.3.2
44
author = Jet Xu
55
author_email = [email protected]
66
description = Llama-github is an open-source Python library that empowers LLM Chatbots, AI Agents, and Auto-dev Agents to conduct Retrieval from actively selected GitHub public projects. It Augments through LLMs and Generates context for any coding question, in order to streamline the development of sophisticated AI-driven applications.

0 commit comments

Comments
 (0)