Skip to content

Commit 38acdbb

Browse files
kgritesh and claude committed
feat: Filter links to only include external domain URLs
- Only store external domain URLs in the 'links' field
- Keep LinkedIn redirect URLs (linkedin.com/redir/) as they point to external sites
- Exclude internal LinkedIn links (profiles, companies, posts, etc.)
- Update docstrings to clarify this behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent cad799f commit 38acdbb

File tree

2 files changed

+24
-11
lines changed

2 files changed

+24
-11
lines changed

src/linkedin_spider/core/scraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def search_posts(
144144
- post_time: ISO 8601 UTC timestamp of when the post was made
145145
- post_text: The text content of the post in markdown format (links preserved as [text](url))
146146
- hashtags: List of hashtags used in the post
147-
- links: List of URLs found in the post content
147+
- links: List of external domain URLs only (excludes internal LinkedIn links)
148148
- post_url: Direct URL to the post
149149
- media_urls: List of image/video URLs in the post
150150
- likes_count: Number of reactions/likes

src/linkedin_spider/scrapers/search.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,7 @@ def _extract_post_data(self, container: WebElement) -> dict[str, Any]:
435435
- post_time (ISO 8601 UTC timestamp)
436436
- post_text (markdown format with links)
437437
- hashtags
438-
- links (list of URLs found in post)
438+
- links (list of external domain URLs only - excludes internal LinkedIn links)
439439
- post_url
440440
- media_urls (list of image/video URLs)
441441
- likes_count
@@ -735,7 +735,11 @@ def _parse_relative_time_to_utc(self, relative_time: str) -> str:
735735
return relative_time
736736

737737
def _extract_post_content(self, container: WebElement) -> dict[str, Any]:
738-
"""Extract post text, hashtags, links, and posting time."""
738+
"""Extract post text, hashtags, external links, and posting time.
739+
740+
Note: Only external domain links are stored (LinkedIn redirect URLs and non-LinkedIn domains).
741+
Internal LinkedIn links (profiles, companies, posts) are excluded.
742+
"""
739743
content_info = {
740744
"post_text": "N/A",
741745
"hashtags": [],
@@ -776,20 +780,29 @@ def _extract_post_content(self, container: WebElement) -> dict[str, Any]:
776780
if hashtags:
777781
content_info["hashtags"] = hashtags
778782

779-
# Extract links from the post content
783+
# Extract links from the post content (external domains only)
780784
if post_content_elem:
781785
links = []
782786
link_elements = post_content_elem.find_elements(By.TAG_NAME, "a")
783787
for link_elem in link_elements:
784788
href = self._extract_attribute_safe(link_elem, "href")
785-
# Skip hashtag links (already extracted separately) and filter valid URLs
786-
# Keep LinkedIn redirect URLs and regular HTTP(S) links
787-
if (
788-
href
789-
and "/search/results/all/?keywords=%23" not in href
790-
and ("linkedin.com/redir/" in href or href.startswith("http"))
791-
):
789+
790+
# Skip hashtag links (already extracted separately)
791+
if not href or "/search/results/all/?keywords=%23" in href:
792+
continue
793+
794+
# Keep LinkedIn redirect URLs (these point to external sites)
795+
if "linkedin.com/redir/" in href:
792796
links.append(href)
797+
continue
798+
799+
# For other links, only keep external domains (not linkedin.com)
800+
if href.startswith("http"):
801+
# Parse the URL to check domain
802+
parsed = urllib.parse.urlparse(href)
803+
# Skip if it's a LinkedIn domain link
804+
if "linkedin.com" not in parsed.netloc:
805+
links.append(href)
793806

794807
if links:
795808
content_info["links"] = links

0 commit comments

Comments
 (0)