Skip to content

Commit ec45451

Browse files
committed
Parse lines with bs4 to fix #36
Correctly reads three-digit numbers that the page marks up as made-up page links.
1 parent d44a212 commit ec45451

File tree

1 file changed

+13
-2
lines changed

1 file changed

+13
-2
lines changed

videoText.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22
import requests
33
from gazpacho import Soup
4+
from bs4 import BeautifulSoup
45

56
# pylint: disable=R0903
67
# pylint: disable=broad-except
@@ -114,7 +115,15 @@ def extractPage(self, page: int, sub=1):
114115
peas = self.soup.find("span", {"class": "fg"}, partial=True)
115116
peasStyle = self.soup.find("span", {"class" : "style"}, partial=True)
116117
#
117-
self.lines += [self.linefilter(x.text) for x in self.validateSoup(peas)]
118+
# gazpacho extraction got stuck on false page links,
119+
# which are in fact numbers in disguise; the 690s (NBA) of ARD-Text have them.
120+
# So I use bs4 here, which needs less boilerplate code.
121+
for x in self.validateSoup(peas):
122+
addline = self.linefilter( \
123+
BeautifulSoup(x.html, features="html.parser").text.strip() \
124+
)
125+
self.lines.append(addline)
126+
#
118127
# gather info like 'Thema xyz on page 123'
119128
for x in self.validateSoup(peasStyle):
120129
addline = self.linefilter(x.text)
@@ -167,11 +176,13 @@ def appendContent(self):
167176
if len(xAsText) > 1 :
168177
# I am canceling the first line of the text with a timestamp here
169178
# as a timestamp is not a score.
179+
# trying without "and not xAsText[0:2].isdigit()"
170180
if any(sport in xAsText for sport in self.listOfBallGames):
171181
isBallGame = True
172182
if isBallGame or self.currentPage in self.ballgamescorepages \
173-
and not xAsText[0:2].isdigit() and not xAsText.strip().endswith(":"):
183+
and not xAsText.strip().endswith(":"):
174184
xAsText = xAsText.replace("--:--", "noch kein Ergebnis")
185+
xAsText = re.sub(r"-\:-\s+\(-\:-\)", "noch kein Ergebnis", xAsText)
175186
xAsText = xAsText.replace("-:-", "noch kein Ergebnis")
176187
xAsText = re.sub(r"([0-9]{1,}):([0-9]{1,})", r"\1 zu \2", xAsText)
177188
self.content += '\n' + xAsText

0 commit comments

Comments
 (0)