feat: Add comment control and date filtering to search_posts

kgritesh · claude · kgritesh · commit fe578a460df9 · 2025-11-22T18:21:16.000+05:30
Add two new optional parameters to search_posts API: - max_comments: Control number of comments fetched per post (0 to skip) - date_posted: Filter posts by date (past-24h, past-week, past-month) Changes: - Update SearchScraper.search_posts() with new parameters - Implement date filtering via URL query parameters - Add conditional comment fetching based on max_comments value - Update CLI with --max-comments and --date-posted options - Update MCP tool with new parameters - Update LinkedinSpider wrapper method - Fix TRY300 linting issue in open_link method All changes maintain backward compatibility with default values. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/src/linkedin_spider/__init__.py b/src/linkedin_spider/__init__.py
@@ -1,7 +1,5 @@
 """LinkedIn Scraper - A modern LinkedIn scraping library."""
 
-__version__ = "1.0.0"
-
 from linkedin_spider.core.auth import AuthManager
 from linkedin_spider.core.config import ScraperConfig
 from linkedin_spider.core.driver import DriverManager
diff --git a/src/linkedin_spider/cli/main.py b/src/linkedin_spider/cli/main.py
@@ -213,6 +213,19 @@ def search_posts(
         float,
         Parameter(name=["-s", "--scroll-pause"], help="Pause duration between scrolls (seconds)"),
     ] = 2.0,
+    max_comments: Annotated[
+        int,
+        Parameter(
+            name=["-c", "--max-comments"], help="Maximum comments per post (0 to skip comments)"
+        ),
+    ] = 10,
+    date_posted: Annotated[
+        str | None,
+        Parameter(
+            name=["-d", "--date-posted"],
+            help="Filter by date posted (past-24h, past-week, past-month)",
+        ),
+    ] = None,
     output: Annotated[
         str | None,
         Parameter(name=["-o", "--output"], help="Output file path (.json or .csv format)"),
@@ -238,9 +251,15 @@ def search_posts(
 
         print(f"Searching for posts with keywords: '{keywords}'")
         print(f"Maximum results: {max_results}")
-        print(f"Scroll pause: {scroll_pause}s\n")
-
-        results = scraper.search_posts(keywords, max_results, scroll_pause)
+        print(f"Scroll pause: {scroll_pause}s")
+        print(f"Max comments per post: {max_comments}")
+        if date_posted:
+            print(f"Date filter: {date_posted}")
+        print()
+
+        results = scraper.search_posts(
+            keywords, max_results, scroll_pause, max_comments, date_posted
+        )
 
         if output:
             _save_results(results, output)
diff --git a/src/linkedin_spider/core/scraper.py b/src/linkedin_spider/core/scraper.py
@@ -125,7 +125,12 @@ def search_profiles(
         return self.search_scraper.search_profiles(query, max_results, filters)
 
     def search_posts(
-        self, keywords: str, max_results: int = 10, scroll_pause: float = 2.0
+        self,
+        keywords: str,
+        max_results: int = 10,
+        scroll_pause: float = 2.0,
+        max_comments: int = 10,
+        date_posted: str | None = None,
     ) -> list[dict[str, Any]]:
         """
         Search for LinkedIn posts by keywords.
@@ -134,6 +139,8 @@ def search_posts(
             keywords: Search keywords (e.g., "bihar elections")
             max_results: Maximum number of posts to scrape
             scroll_pause: Pause duration between scrolls in seconds
+            max_comments: Maximum number of comments to fetch per post (0 to skip comments)
+            date_posted: Filter by date posted ("past-24h", "past-week", "past-month", or None)
 
         Returns:
             List of dictionaries containing post data with keys:
@@ -157,7 +164,9 @@ def search_posts(
                 - comment_time: ISO 8601 UTC timestamp of when comment was posted
                 - reactions_count: Number of reactions on the comment
         """
-        return self.search_scraper.search_posts(keywords, max_results, scroll_pause)
+        return self.search_scraper.search_posts(
+            keywords, max_results, scroll_pause, max_comments, date_posted
+        )
 
     def open_link(self, url: str) -> dict[str, Any] | None:
         """
diff --git a/src/linkedin_spider/mcp/server.py b/src/linkedin_spider/mcp/server.py
@@ -165,13 +165,21 @@ async def scrape_company(company_url: str) -> str:
 
 
 @mcp_app.tool()
-async def search_posts(keywords: str, max_results: int = 10, scroll_pause: float = 2.0) -> str:
+async def search_posts(
+    keywords: str,
+    max_results: int = 10,
+    scroll_pause: float = 2.0,
+    max_comments: int = 10,
+    date_posted: str | None = None,
+) -> str:
     """Search for LinkedIn posts by keywords.
 
     Args:
         keywords: Search keywords for posts
         max_results: Maximum number of posts to retrieve (default: 10)
         scroll_pause: Pause duration between scrolls in seconds (default: 2.0)
+        max_comments: Maximum comments per post, 0 to skip comments (default: 10)
+        date_posted: Filter by date posted: "past-24h", "past-week", "past-month", or None
 
     Returns:
         JSON string containing post data including author info, content, and engagement metrics
@@ -181,7 +189,9 @@ async def search_posts(keywords: str, max_results: int = 10, scroll_pause: float
 
     try:
         scraper = get_scraper()
-        results = scraper.search_posts(keywords, max_results, scroll_pause)
+        results = scraper.search_posts(
+            keywords, max_results, scroll_pause, max_comments, date_posted
+        )
 
         if results:
             return f"posts:\n{json.dumps(results, indent=2, ensure_ascii=False)}"
diff --git a/src/linkedin_spider/scrapers/search.py b/src/linkedin_spider/scrapers/search.py
@@ -233,7 +233,12 @@ def reset_filters(self) -> Any:
         return self.filter_handler.reset_filters()
 
     def search_posts(
-        self, keywords: str, max_results: int = 10, scroll_pause: float = 2.0
+        self,
+        keywords: str,
+        max_results: int = 10,
+        scroll_pause: float = 2.0,
+        max_comments: int = 10,
+        date_posted: str | None = None,
     ) -> list[dict[str, Any]]:
         """
         Search for LinkedIn posts by keywords.
@@ -242,6 +247,8 @@ def search_posts(
             keywords: Search keywords
             max_results: Maximum number of posts to scrape
             scroll_pause: Pause duration between scrolls (seconds)
+            max_comments: Maximum number of comments to fetch per post (0 to skip comments)
+            date_posted: Filter by date posted. Valid values: "past-24h", "past-week", "past-month", or None
 
         Returns:
             List of post data dictionaries
@@ -253,12 +260,26 @@ def search_posts(
                 f"https://www.linkedin.com/search/results/content/?keywords={encoded_keywords}"
             )
 
+            # Add date filter to URL if provided
+            if date_posted:
+                valid_filters = ["past-24h", "past-week", "past-month"]
+                if date_posted in valid_filters:
+                    search_url += f'&datePosted="{date_posted}"'
+                    self.log_action("INFO", f"Applying date filter: {date_posted}")
+                else:
+                    self.log_action(
+                        "WARNING",
+                        f"Invalid date_posted value: {date_posted}. Must be one of {valid_filters}",
+                    )
+
             self.log_action("INFO", f"Searching for posts with keywords: {keywords}")
 
             if not self.navigate_to_url(search_url):
                 self.log_action("ERROR", f"Failed to navigate to post search: {keywords}")
                 return []
 
+            self.log_action("INFO", f"Navigated to post search URL: {search_url}")
+
             # Wait for initial page load
             self.human_behavior.delay(2.0, 4.0)
 
@@ -284,7 +305,7 @@ def search_posts(
                         continue
 
                     # Extract post data
-                    post_data = self._extract_post_data(container)
+                    post_data = self._extract_post_data(container, max_comments=max_comments)
 
                     # Only add if we got meaningful data
                     if post_data and post_data.get("author_name") != "N/A":
@@ -422,10 +443,14 @@ def _extract_post_id(self, container: WebElement) -> str | None:
         else:
             return None
 
-    def _extract_post_data(self, container: WebElement) -> dict[str, Any]:
+    def _extract_post_data(self, container: WebElement, max_comments: int = 10) -> dict[str, Any]:
         """
         Extract comprehensive data from a post container.
 
+        Args:
+            container: Post container WebElement
+            max_comments: Maximum number of comments to fetch (0 to skip comments)
+
         Returns:
             Dictionary containing post data with keys:
             - author_name
@@ -441,7 +466,7 @@ def _extract_post_data(self, container: WebElement) -> dict[str, Any]:
             - likes_count
             - comments_count
             - reposts_count
-            - comments (list of comment dictionaries)
+            - comments (list of comment dictionaries, empty if max_comments=0)
         """
         post_data = {
             "author_name": "N/A",
@@ -483,10 +508,11 @@ def _extract_post_data(self, container: WebElement) -> dict[str, Any]:
             if media_urls:
                 post_data["media_urls"] = media_urls
 
-            # Extract comments if present
-            comments = self._extract_post_comments(container)
-            if comments:
-                post_data["comments"] = comments
+            # Extract comments if max_comments > 0
+            if max_comments > 0:
+                comments = self._extract_post_comments(container, max_comments)
+                if comments:
+                    post_data["comments"] = comments
 
         except Exception as e:
             self.log_action("WARNING", f"Error extracting post data: {e!s}")
@@ -1117,12 +1143,12 @@ def open_link(self, url: str) -> dict[str, Any] | None:
             if post_data.get("post_url") == "N/A":
                 post_data["post_url"] = url.split("?")[0]
 
-            self.log_action("SUCCESS", f"Extracted post from URL: {url}")
-            return post_data
-
         except Exception as e:
             self.log_action("ERROR", f"Failed to open link: {e!s}")
             return None
+        else:
+            self.log_action("SUCCESS", f"Extracted post from URL: {url}")
+            return post_data
 
     def _extract_post_comments(
         self, container: WebElement, max_comments: int = 10