Changes to be committed:

Jet Xu · Jet Xu · commit 22af5e16f478 · 2025-06-24T00:00:06.000+08:00
modified:   CHANGELOG.md
	modified:   llama_github/data_retrieval/github_entities.py
	modified:   llama_github/version.py
	modified:   setup.cfg
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.2] - 2025-06-23
+
+### Optimized
+- Upgrade to extract_related_issues method
+
 ## [0.3.1] - 2025-05-25
 
 ### Optimized
diff --git a/llama_github/data_retrieval/github_entities.py b/llama_github/data_retrieval/github_entities.py
@@ -396,77 +396,118 @@ def get_issue_content(self, number, issue=None) -> str:
     
     def extract_related_issues(self, pr_data: Dict[str, Any]) -> List[int]:
         """
-        Extracts related issue numbers from all PR data following GitHub's reference syntax.
-        
-        This function implements GitHub's official autolink reference formats to find:
-        1. Full GitHub issue/PR URLs
-        2. Numeric references (#123)
-        3. Keywords + issue references (fixes #123)
-        4. Repository cross-references (owner/repo#123)
-        
-        See: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/autolinked-references-and-urls
-        
+        Extracts related issue numbers from PR data using adaptive strategies based on content length.
+
+        Uses different matching strategies:
+        - Short descriptions (<200 chars): Aggressive patterns for simple references
+        - Long descriptions (>=200 chars): Strict patterns to avoid false positives
+
         Args:
-            pr_data: Dict[str, Any] - The complete pull request data dictionary
+            pr_data: Complete pull request data dictionary
             
         Returns:
-            List[int] - A sorted list of unique issue numbers found in the PR data
+            List[int] - Sorted list of unique issue numbers
         """
         # GitHub's official closing keywords
         closing_keywords = (
-            'close', 'closes', 'closed',
-            'fix', 'fixes', 'fixed',
-            'resolve', 'resolves', 'resolved'
+        'close', 'closes', 'closed',
+        'fix', 'fixes', 'fixed',
+        'resolve', 'resolves', 'resolved',
+        'address', 'addresses', 'addressing',
+        'relate', 'relates', 'related',
+        'see',
+        'issue', 'bug', 'ticket', 'todo', 'task'
         )
 
-        # Regex patterns for GitHub issue references
-        patterns = [
-            # Full GitHub issue/PR URL pattern
-            rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
-            
-            # # Standard #123 reference with proper boundaries
-            # r'(?:^|[^\w/])#(\d+)(?=[^\w/]|$)',
-            
-            # Closing keywords (fixes #123)
-            fr'(?:^|[^\w/])(?:{"|".join(closing_keywords)}):?\s+#(\d+)(?=[^\w/]|$)',
-            
-            # Cross-repo reference (owner/repo#123)
-            rf'{re.escape(self.full_name)}#(\d+)',
+        issues = set()
+
+        def get_description_length(data: Dict[str, Any]) -> int:
+            """Get the length of PR description for strategy selection"""
+            try:
+                description = data.get('pr_metadata', {}).get('description', '')
+                return len(description) if isinstance(description, str) else 0
+            except:
+                return 0
+
+        def extract_with_aggressive_patterns(text: str) -> None:
+            """Aggressive patterns for short, focused descriptions"""
+            if not isinstance(text, str):
+                return
+                
+            patterns = [
+                # Simple #123 reference (most common in short descriptions)
+                r'#(\d+)(?!\d)',
+                
+                # Full GitHub URLs
+                rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
+                
+                # Closing keywords with flexible spacing
+                fr'(?:{"|".join(closing_keywords)})\s*:?\s*#?(\d+)(?!\d)',
+                
+                # Action words commonly used in short descriptions
+                r'(?:addresses?|references?|relates?\s+to|see)\s+#?(\d+)(?!\d)',
+            ]
             
-            # Issue keyword reference (issue #123 or issue: #123)
-            r'(?:^|[^\w/])(?:issue|bug|ticket|todo|task)s?:?\s+#?(\d+)(?=[^\w/]|$)'
-        ]
+            for pattern in patterns:
+                matches = re.findall(pattern, text, re.IGNORECASE)
+                valid_matches = [
+                    int(match) for match in matches 
+                    if match.isdigit() and len(match) <= 6 and int(match) > 0
+                ]
+                issues.update(valid_matches)
 
-        issues = set()
-        
-        def extract_from_text(text: str) -> None:
-            """Helper function to extract issue numbers from text"""
+        def extract_with_strict_patterns(text: str) -> None:
+            """Strict patterns for long descriptions to avoid false positives"""
             if not isinstance(text, str):
                 return
                 
+            patterns = [
+                # Full GitHub URLs (always reliable)
+                rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
+                
+                # Closing keywords with word boundaries
+                fr'\b(?:{"|".join(closing_keywords)})\s*:?\s*#(\d+)\b',
+                
+                # Explicit issue references with word boundaries  
+                r'\b(?:issue|bug|ticket|pr|pull\s+request)\s*:?\s*#?(\d+)\b',
+                
+                # Cross-repo references
+                rf'\b{re.escape(self.full_name)}#(\d+)\b',
+            ]
+            
             for pattern in patterns:
-                matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
-                # Validate issue numbers (reasonable length and positive values)
+                matches = re.findall(pattern, text, re.IGNORECASE)
                 valid_matches = [
                     int(match) for match in matches 
-                    if match.isdigit() and len(match) <= 7 and int(match) > 0
+                    if match.isdigit() and len(match) <= 6 and int(match) > 0
                 ]
                 issues.update(valid_matches)
 
-        def process_value(value: Any) -> None:
-            """Recursively process dictionary values and extract issue numbers"""
+        def extract_from_text(text: str, use_aggressive: bool = False) -> None:
+            """Extract issue numbers using appropriate strategy"""
+            if use_aggressive:
+                extract_with_aggressive_patterns(text)
+            else:
+                extract_with_strict_patterns(text)
+
+        def process_value(value: Any, use_aggressive: bool = False) -> None:
+            """Recursively process values and extract issue numbers"""
             if isinstance(value, dict):
                 for v in value.values():
-                    process_value(v)
+                    process_value(v, use_aggressive)
             elif isinstance(value, (list, tuple)):
                 for item in value:
-                    process_value(item)
+                    process_value(item, use_aggressive)
             elif isinstance(value, str):
-                extract_from_text(value)
+                extract_from_text(value, use_aggressive)
+
+        # Determine strategy based on description length
+        desc_length = get_description_length(pr_data)
+        use_aggressive_strategy = desc_length < 200
+
+        # Process all PR data
+        process_value(pr_data, use_aggressive_strategy)
 
-        # Process all data in pr_data recursively
-        process_value(pr_data)
-        
         return sorted(list(issues))
 
 
diff --git a/llama_github/version.py b/llama_github/version.py
@@ -1 +1 @@
-__version__ = '0.3.1'
+__version__ = '0.3.2'
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = llama-github
-version = 0.3.1
+version = 0.3.2
 author = Jet Xu
 author_email = Voldemort.xu@foxmail.com
 description = Llama-github is an open-source Python library that empowers LLM Chatbots, AI Agents, and Auto-dev Agents to conduct Retrieval from actively selected GitHub public projects. It Augments through LLMs and Generates context for any coding question, in order to streamline the development of sophisticated AI-driven applications.

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '0.3.1'`
	`1`	`+__version__ = '0.3.2'`