Skip to content

Commit 91e7349

Browse files
authored
feat(news-fetcher): support dynamic snippet extraction for news (#243)
<!-- Thanks for creating this pull request 🤗 Please make sure that the pull request is limited to one type (docs, feature, etc.) and keep it as small as possible. You can open multiple prs instead of opening a huge one. --> <!-- If this pull request closes an issue, please mention the issue number below --> Closes # <!-- Issue # here --> ## 📑 Description <!-- Add a brief description of the pr --> <!-- You can also choose to add a list of changes and if they have been completed or not by using the markdown to-do list syntax - [ ] Not Completed - [x] Completed --> ## ✅ Checks <!-- Make sure your pr passes the CI checks and do check the following fields as needed - --> - [ ] My pull request adheres to the code style of this project - [ ] My code requires changes to the documentation - [ ] I have updated the documentation as required - [ ] All the tests have passed - [ ] Branch name follows `type/descript` (e.g. `feature/add-llm-agents`) - [ ] Ready for code review ## ℹ Additional Information <!-- Any additional information like breaking changes, dependencies added, screenshots, comparisons between new and old behavior, etc. -->
1 parent 63e5446 commit 91e7349

File tree

1 file changed

+35
-13
lines changed

1 file changed

+35
-13
lines changed

live_trade_bench/fetchers/news_fetcher.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,32 @@ def _extract_snippet_from_url(self, url: str) -> str:
127127
# Silently fail - snippet extraction is optional
128128
return ""
129129

def _find_snippet_dynamically(
    self, card, title_text, source_text, date_text
) -> str:
    """Heuristically pick the snippet text out of a Google News result card.

    Google frequently renames the CSS classes that mark the snippet, so
    instead of relying on brittle selectors we scan every ``<div>`` /
    ``<span>`` inside the card and keep the longest run of text that is
    not the article's title, source, or date.

    Args:
        card: Parsed result-card element (anything exposing
            ``find_all``; children must expose ``get_text``) —
            presumably a BeautifulSoup tag, as elsewhere in this fetcher.
        title_text: Already-extracted article title; used to reject the
            title itself and any parent container that contains it.
        source_text: Publisher name; rejected verbatim.
        date_text: Date label; rejected verbatim.

    Returns:
        The best snippet candidate, or ``""`` when nothing qualifies.
    """
    candidates = []
    for el in card.find_all(["div", "span"]):
        text = el.get_text(strip=True)
        # Too short to be a useful snippet.
        if not text or len(text) < 20:
            continue
        # Exactly the title, source, or date — not a snippet.
        if text in (title_text, source_text, date_text):
            continue
        # Any element whose text contains the title is a parent
        # container (its text is the whole card — title, source, date,
        # and snippet concatenated).  BUG FIX: the previous length
        # guard (len(text) < len(title_text) + 50) let *large*
        # containers through, and max(..., key=len) below then
        # returned the entire card text, title included.  Reject
        # title-containing containers unconditionally instead.
        if title_text in text:
            continue
        candidates.append(text)
    # Snippets are usually the wordiest standalone element left over,
    # so prefer the longest remaining candidate.
    if candidates:
        return max(candidates, key=len)
    return ""
130156
def fetch(
131157
self, query: str, start_date: str, end_date: str, max_pages: int = 10
132158
) -> List[Dict[str, Any]]:
@@ -176,26 +202,22 @@ def fetch(
176202
link = self._clean_google_href(a["href"])
177203

178204
title_el = el.select_one("div.MBeuO")
179-
# Try multiple snippet selectors (Google frequently changes these)
180-
snippet_el = (
181-
el.select_one(".GI74Re")
182-
or el.select_one(".yXK7lf")
183-
or el.select_one(".st")
184-
)
185205
date_el = el.select_one(".LfVVr")
186206
source_el = el.select_one(".NUnG9d span")
187207

188208
# Snippet is optional - title, date, and source are required
189209
if not (title_el and date_el and source_el):
190210
continue
191211

192-
# Get snippet with 3-tier fallback
193-
snippet = ""
194-
if snippet_el:
195-
snippet = snippet_el.get_text(strip=True)
196-
elif (
197-
link and page == 0
198-
): # Only fetch from URL for first page to avoid slowdown
212+
# Get snippet with dynamic fallback strategy
213+
snippet = self._find_snippet_dynamically(
214+
el,
215+
title_el.get_text(strip=True),
216+
source_el.get_text(strip=True),
217+
date_el.get_text(strip=True),
218+
)
219+
220+
if not snippet and link and page == 0:
199221
snippet = self._extract_snippet_from_url(link)
200222

201223
ts = self._parse_relative_or_absolute(

0 commit comments

Comments
 (0)