Skip to content

Commit fe20976

Browse files
committed
⚡ Add support for extracting every post url
1 parent c912f97 commit fe20976

File tree

3 files changed

+122
-1
lines changed

3 files changed

+122
-1
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ dependencies = [
3232
"python-dotenv>=1.0.0",
3333
"psutil>=5.9.0",
3434
"twine>=6.2.0",
35+
"pyperclip>=1.9.0",
3536
]
3637

3738

src/linkedin_spider/scrapers/search.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import contextlib
12
import urllib.parse
23
from datetime import datetime, timedelta, timezone
34
from typing import Any
45

6+
import pyperclip
57
from selenium.common.exceptions import TimeoutException
68
from selenium.webdriver.common.by import By
79
from selenium.webdriver.remote.webelement import WebElement
@@ -941,10 +943,119 @@ def _extract_engagement_metrics(self, container: WebElement) -> dict[str, int]:
941943

942944
return metrics
943945

946+
def _extract_post_link_via_clipboard(self, container: WebElement) -> str | None:
    """Extract a post URL through the post's "Copy link to post" menu entry.

    Opens the post's three-dots control menu, clicks "Copy link to post",
    and reads the result back from the system clipboard.

    Args:
        container: Element in which the control-menu button is searched.

    Returns:
        The copied URL with surrounding whitespace and any query string
        removed, or ``None`` if any step fails or the clipboard does not
        hold a LinkedIn post URL.

    Side effects:
        Overwrites the system clipboard (it is cleared before the copy).

    NOTE(review): this relies on the browser writing to the same system
    clipboard that pyperclip reads - confirm this holds for headless or
    remote drivers.
    """

    def close_menu() -> None:
        # Best-effort dismissal of the dropdown; failures are irrelevant here.
        with contextlib.suppress(Exception):
            self.driver.execute_script("document.body.click();")

    try:
        # Find the three dots button (control menu button).
        control_menu_selectors = [
            "button[aria-label*='Control menu']",
            "button.feed-shared-control-menu__trigger",
            "button[aria-label*='More actions']",
            ".feed-shared-control-menu__trigger",
        ]

        control_menu_button = None
        for selector in control_menu_selectors:
            control_menu_button = self._find_element_in_parent(
                container, By.CSS_SELECTOR, selector
            )
            if control_menu_button:
                break

        if not control_menu_button:
            self.log_action("DEBUG", "Could not find control menu button")
            return None

        # Scroll the button into view so the click is not intercepted.
        self.driver.execute_script(
            "arguments[0].scrollIntoView({block: 'center'});", control_menu_button
        )
        self.human_behavior.delay(0.3, 0.5)

        # Click the three dots button to open the dropdown.
        try:
            control_menu_button.click()
            self.human_behavior.delay(0.5, 1.0)
        except Exception as e:
            self.log_action("DEBUG", f"Could not click control menu button: {e!s}")
            return None

        # Wait a bit for the dropdown to appear.
        self.human_behavior.delay(0.3, 0.5)

        # The dropdown might be rendered outside the container, so search
        # the entire document rather than only the post container.
        copy_link_button = None
        with contextlib.suppress(Exception):
            copy_link_button = self.driver.find_element(
                By.CSS_SELECTOR, "li.option-share-via div[role='button']"
            )

        # If not found by class, fall back to matching the item headline text.
        if not copy_link_button:
            with contextlib.suppress(Exception):
                all_menu_items = self.driver.find_elements(
                    By.CSS_SELECTOR, "li.feed-shared-control-menu__item"
                )
                for item in all_menu_items:
                    # Items without a headline/button raise; skip those.
                    with contextlib.suppress(Exception):
                        headline = item.find_element(
                            By.CSS_SELECTOR, "h5.feed-shared-control-menu__headline"
                        )
                        if headline and "Copy link to post" in headline.text:
                            copy_link_button = item.find_element(
                                By.CSS_SELECTOR, "div[role='button']"
                            )
                            break

        if not copy_link_button:
            close_menu()
            self.log_action("DEBUG", "Could not find 'Copy link to post' option")
            return None

        # Clear the clipboard first. Otherwise, if the copy click silently
        # fails, a stale value (e.g. the previous post's URL) would pass the
        # validation below and be returned for the wrong post.
        try:
            pyperclip.copy("")
        except Exception as e:
            self.log_action("DEBUG", f"Could not clear clipboard: {e!s}")
            close_menu()
            return None

        # Click "Copy link to post".
        try:
            copy_link_button.click()
            self.human_behavior.delay(0.5, 1.0)
        except Exception as e:
            self.log_action("DEBUG", f"Could not click 'Copy link to post': {e!s}")
            close_menu()
            return None

        # Read the copied link back from the clipboard.
        try:
            clipboard_content = (pyperclip.paste() or "").strip()
            if (
                "linkedin.com/feed/update/" in clipboard_content
                or "linkedin.com/posts/" in clipboard_content
            ):
                # Clean up the URL (remove query parameters).
                return clipboard_content.split("?")[0]
            self.log_action(
                "DEBUG",
                f"Clipboard content does not appear to be a LinkedIn post URL: {clipboard_content[:50]}",
            )
        except Exception as e:
            self.log_action("DEBUG", f"Could not read from clipboard: {e!s}")

    except Exception as e:
        self.log_action("WARNING", f"Error extracting post link via clipboard: {e!s}")

    return None
1054+
9441055
def _extract_post_url(self, container: WebElement) -> str | None:
9451056
"""Extract the URL to the post."""
9461057
try:
947-
# Look for post link in various places
1058+
# First, try to find post link in various places
9481059
link_selectors = [
9491060
"a[href*='/feed/update/']",
9501061
"a[href*='/posts/']",
@@ -958,6 +1069,13 @@ def _extract_post_url(self, container: WebElement) -> str | None:
9581069
if href and ("/feed/update/" in href or "/posts/" in href):
9591070
return href.split("?")[0] # Remove query parameters
9601071

1072+
# If direct link extraction fails, try clipboard method
1073+
self.log_action("DEBUG", "Direct link extraction failed, trying clipboard method")
1074+
clipboard_url = self._extract_post_link_via_clipboard(container)
1075+
if clipboard_url:
1076+
self.log_action("DEBUG", f"Extracted post from URL: {clipboard_url}")
1077+
return clipboard_url
1078+
9611079
except Exception as e:
9621080
self.log_action("WARNING", f"Error extracting post URL: {e!s}")
9631081

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)