Skip to content

Commit 91e7349

Browse files
authored
feat(news-fetcher): support dynamic snippet extraction for news (#243)
<!-- Thanks for creating this pull request 🤗 Please make sure that the pull request is limited to one type (docs, feature, etc.) and keep it as small as possible. You can open multiple prs instead of opening a huge one. --> <!-- If this pull request closes an issue, please mention the issue number below --> Closes # <!-- Issue # here --> ## 📑 Description <!-- Add a brief description of the pr --> <!-- You can also choose to add a list of changes and if they have been completed or not by using the markdown to-do list syntax - [ ] Not Completed - [x] Completed --> ## ✅ Checks <!-- Make sure your pr passes the CI checks and do check the following fields as needed - --> - [ ] My pull request adheres to the code style of this project - [ ] My code requires changes to the documentation - [ ] I have updated the documentation as required - [ ] All the tests have passed - [ ] Branch name follows `type/descript` (e.g. `feature/add-llm-agents`) - [ ] Ready for code review ## ℹ Additional Information <!-- Any additional information like breaking changes, dependencies added, screenshots, comparisons between new and old behavior, etc. -->
1 parent 63e5446 commit 91e7349

File tree

1 file changed

+35
-13
lines changed

1 file changed

+35
-13
lines changed

live_trade_bench/fetchers/news_fetcher.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,32 @@ def _extract_snippet_from_url(self, url: str) -> str:
127127
# Silently fail - snippet extraction is optional
128128
return ""
129129

def _find_snippet_dynamically(
    self, card, title_text, source_text, date_text
) -> str:
    """Heuristically pick the snippet text out of a Google News result card.

    Google frequently renames the CSS classes that mark the snippet, so
    instead of relying on brittle selectors we scan every ``<div>`` /
    ``<span>`` inside the card and keep the longest run of text that is
    not the article's title, source, or date.

    Args:
        card: Parsed result-card element (anything exposing
            ``find_all``; children must expose ``get_text``) —
            presumably a BeautifulSoup tag, as elsewhere in this fetcher.
        title_text: Already-extracted article title; used to reject the
            title itself and any parent container that contains it.
        source_text: Publisher name; rejected verbatim.
        date_text: Date label; rejected verbatim.

    Returns:
        The best snippet candidate, or ``""`` when nothing qualifies.
    """
    candidates = []
    for el in card.find_all(["div", "span"]):
        text = el.get_text(strip=True)
        # Too short to be a useful snippet.
        if not text or len(text) < 20:
            continue
        # Exactly the title, source, or date — not a snippet.
        if text in (title_text, source_text, date_text):
            continue
        # Any element whose text contains the title is a parent
        # container (its text is the whole card — title, source, date,
        # and snippet concatenated).  BUG FIX: the previous length
        # guard (len(text) < len(title_text) + 50) let *large*
        # containers through, and max(..., key=len) below then
        # returned the entire card text, title included.  Reject
        # title-containing containers unconditionally instead.
        if title_text in text:
            continue
        candidates.append(text)
    # Snippets are usually the wordiest standalone element left over,
    # so prefer the longest remaining candidate.
    if candidates:
        return max(candidates, key=len)
    return ""
130156
def fetch(
131157
self, query: str, start_date: str, end_date: str, max_pages: int = 10
132158
) -> List[Dict[str, Any]]:
@@ -176,26 +202,22 @@ def fetch(
176202
link = self._clean_google_href(a["href"])
177203

178204
title_el = el.select_one("div.MBeuO")
179-
# Try multiple snippet selectors (Google frequently changes these)
180-
snippet_el = (
181-
el.select_one(".GI74Re")
182-
or el.select_one(".yXK7lf")
183-
or el.select_one(".st")
184-
)
185205
date_el = el.select_one(".LfVVr")
186206
source_el = el.select_one(".NUnG9d span")
187207

188208
# Snippet is optional - title, date, and source are required
189209
if not (title_el and date_el and source_el):
190210
continue
191211

192-
# Get snippet with 3-tier fallback
193-
snippet = ""
194-
if snippet_el:
195-
snippet = snippet_el.get_text(strip=True)
196-
elif (
197-
link and page == 0
198-
): # Only fetch from URL for first page to avoid slowdown
212+
# Get snippet with dynamic fallback strategy
213+
snippet = self._find_snippet_dynamically(
214+
el,
215+
title_el.get_text(strip=True),
216+
source_el.get_text(strip=True),
217+
date_el.get_text(strip=True),
218+
)
219+
220+
if not snippet and link and page == 0:
199221
snippet = self._extract_snippet_from_url(link)
200222

201223
ts = self._parse_relative_or_absolute(

0 commit comments

Comments
 (0)