Merge pull request #7 from vertexcover-io/feat/open-link

kgritesh · web-flow · commit 1be53e016077 · 2025-11-22T09:28:30.000+05:30
Feat/open link
diff --git a/.gitignore b/.gitignore
@@ -5,7 +5,9 @@ outputs/*
 .idea/*
 venv/*
 __pycache__/*
+**/__pycache__/*
 .claude/*
+*.pyc
 
 docs/source
 deploy/*
@@ -151,3 +153,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+CLAUDE.md
diff --git a/src/linkedin_spider/core/scraper.py b/src/linkedin_spider/core/scraper.py
@@ -159,6 +159,18 @@ def search_posts(
         """
         return self.search_scraper.search_posts(keywords, max_results, scroll_pause)
 
+    def open_link(self, url: str) -> dict[str, Any] | None:
+        """
+        Open a LinkedIn post URL and extract its content (delegates to search_scraper.open_link).
+
+        Args:
+            url: LinkedIn post URL (e.g., https://linkedin.com/feed/update/urn:li:activity:...)
+
+        Returns:
+            Dictionary containing post data (same structure as search_posts), or None if failed
+        """
+        return self.search_scraper.open_link(url)
+
     def scrape_company(self, company_url: str) -> dict[str, Any] | None:
         """Scrape a LinkedIn company page."""
         return self.company_scraper.scrape_company(company_url)
diff --git a/src/linkedin_spider/scrapers/search.py b/src/linkedin_spider/scrapers/search.py
@@ -1083,6 +1083,47 @@ def _load_more_comments(self, container: WebElement, max_comments: int) -> None:
         except Exception as e:
             self.log_action("DEBUG", f"Error loading more comments: {e!s}")
 
+    def open_link(self, url: str) -> dict[str, Any] | None:
+        """
+        Open a LinkedIn post URL and extract data from the first post container found on the page.
+
+        Args:
+            url: LinkedIn post URL (e.g., https://linkedin.com/feed/update/urn:li:activity:...)
+
+        Returns:
+            Post data dict (same structure as search_posts); None if navigation fails, no post container is found, or an exception is raised
+        """
+        try:
+            self.log_action("INFO", f"Opening LinkedIn URL: {url}")
+
+            if not self.navigate_to_url(url):
+                self.log_action("ERROR", f"Failed to navigate to URL: {url}")
+                return None
+
+            # Wait for page load (delay bounds are presumably seconds - TODO confirm)
+            self.human_behavior.delay(2.0, 4.0)
+
+            # Collect the post containers on the page; only the first one is used below
+            post_containers = self._find_post_containers()
+
+            if not post_containers:
+                self.log_action("WARNING", "No post container found on page")
+                return None
+
+            # Extract data from the first post
+            post_data = self._extract_post_data(post_containers[0])
+
+            # Fall back to the requested URL (query string stripped) when post_url is the "N/A" placeholder
+            if post_data.get("post_url") == "N/A":
+                post_data["post_url"] = url.split("?")[0]
+
+            self.log_action("SUCCESS", f"Extracted post from URL: {url}")
+            return post_data
+
+        except Exception as e:
+            self.log_action("ERROR", f"Failed to open link: {e!s}")
+            return None
+
     def _extract_post_comments(
         self, container: WebElement, max_comments: int = 10
     ) -> list[dict[str, Any]]: