feat: Filter links to only include external domain URLs

kgritesh · claude · kgritesh · commit 38acdbb34477 · 2025-11-19T11:43:18.000+05:30
- Only store external domain URLs in the 'links' field
- Keep LinkedIn redirect URLs (linkedin.com/redir/), as they point to external sites
- Exclude internal LinkedIn links (profiles, companies, posts, etc.)
- Update docstrings to clarify this behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/src/linkedin_spider/core/scraper.py b/src/linkedin_spider/core/scraper.py
@@ -144,7 +144,7 @@ def search_posts(
             - post_time: ISO 8601 UTC timestamp of when the post was made
             - post_text: The text content of the post in markdown format (links preserved as [text](url))
             - hashtags: List of hashtags used in the post
-            - links: List of URLs found in the post content
+            - links: List of external domain URLs only (excludes internal LinkedIn links)
             - post_url: Direct URL to the post
             - media_urls: List of image/video URLs in the post
             - likes_count: Number of reactions/likes
diff --git a/src/linkedin_spider/scrapers/search.py b/src/linkedin_spider/scrapers/search.py
@@ -435,7 +435,7 @@ def _extract_post_data(self, container: WebElement) -> dict[str, Any]:
             - post_time (ISO 8601 UTC timestamp)
             - post_text (markdown format with links)
             - hashtags
-            - links (list of URLs found in post)
+            - links (list of external domain URLs only - excludes internal LinkedIn links)
             - post_url
             - media_urls (list of image/video URLs)
             - likes_count
@@ -735,7 +735,11 @@ def _parse_relative_time_to_utc(self, relative_time: str) -> str:
             return relative_time
 
     def _extract_post_content(self, container: WebElement) -> dict[str, Any]:
-        """Extract post text, hashtags, links, and posting time."""
+        """Extract post text, hashtags, external links, and posting time.
+
+        Note: Only external domain links are stored (LinkedIn redirect URLs and non-LinkedIn domains).
+        Internal LinkedIn links (profiles, companies, posts) are excluded.
+        """
         content_info = {
             "post_text": "N/A",
             "hashtags": [],
@@ -776,20 +780,29 @@ def _extract_post_content(self, container: WebElement) -> dict[str, Any]:
             if hashtags:
                 content_info["hashtags"] = hashtags
 
-            # Extract links from the post content
+            # Extract links from the post content (external domains only)
             if post_content_elem:
                 links = []
                 link_elements = post_content_elem.find_elements(By.TAG_NAME, "a")
                 for link_elem in link_elements:
                     href = self._extract_attribute_safe(link_elem, "href")
-                    # Skip hashtag links (already extracted separately) and filter valid URLs
-                    # Keep LinkedIn redirect URLs and regular HTTP(S) links
-                    if (
-                        href
-                        and "/search/results/all/?keywords=%23" not in href
-                        and ("linkedin.com/redir/" in href or href.startswith("http"))
-                    ):
+
+                    # Skip hashtag links (already extracted separately)
+                    if not href or "/search/results/all/?keywords=%23" in href:
+                        continue
+
+                    # Keep LinkedIn redirect URLs (these point to external sites)
+                    if "linkedin.com/redir/" in href:
                         links.append(href)
+                        continue
+
+                    # For other links, only keep external domains (not linkedin.com)
+                    if href.startswith("http"):
+                        # Parse the URL to check domain
+                        parsed = urllib.parse.urlparse(href)
+                        # Skip if it's a LinkedIn domain link
+                        if "linkedin.com" not in parsed.netloc:
+                            links.append(href)
 
                 if links:
                     content_info["links"] = links