|
1 | 1 | import re |
2 | 2 | import requests |
3 | 3 | from gazpacho import Soup |
| 4 | +from bs4 import BeautifulSoup |
4 | 5 |
|
5 | 6 | # pylint: disable=R0903 |
6 | 7 | # pylint: disable=broad-except |
@@ -114,7 +115,15 @@ def extractPage(self, page: int, sub=1): |
114 | 115 | peas = self.soup.find("span", {"class": "fg"}, partial=True) |
115 | 116 | peasStyle = self.soup.find("span", {"class" : "style"}, partial=True) |
116 | 117 | # |
117 | | - self.lines += [self.linefilter(x.text) for x in self.validateSoup(peas)] |
| 118 | + # gazpacho's text extraction got stuck on false page links, |
| 119 | + # which are in fact plain numbers in disguise; the 690s pages (NBA) of ARD-Text have them. |
| 120 | + # So bs4 is used here to strip the markup with less boilerplate code. |
| 121 | + for x in self.validateSoup(peas): |
| 122 | + addline = self.linefilter( \ |
| 123 | + BeautifulSoup(x.html, features="html.parser").text.strip() \ |
| 124 | + ) |
| 125 | + self.lines.append(addline) |
| 126 | + # |
118 | 127 | # gather info like 'Thema xyz on page 123' |
119 | 128 | for x in self.validateSoup(peasStyle): |
120 | 129 | addline = self.linefilter(x.text) |
@@ -167,11 +176,13 @@ def appendContent(self): |
167 | 176 | if len(xAsText) > 1 : |
168 | 177 | # I am canceling the first line of the text with a timestamp here |
169 | 178 | # as a timestamp is not a score. |
| 179 | + # trying without "and not xAsText[0:2].isdigit()" |
170 | 180 | if any(sport in xAsText for sport in self.listOfBallGames): |
171 | 181 | isBallGame = True |
172 | 182 | if isBallGame or self.currentPage in self.ballgamescorepages \ |
173 | | - and not xAsText[0:2].isdigit() and not xAsText.strip().endswith(":"): |
| 183 | + and not xAsText.strip().endswith(":"): |
174 | 184 | xAsText = xAsText.replace("--:--", "noch kein Ergebnis") |
| 185 | + xAsText = re.sub(r"-\:-\s+\(-\:-\)", "noch kein Ergebnis", xAsText) |
175 | 186 | xAsText = xAsText.replace("-:-", "noch kein Ergebnis") |
176 | 187 | xAsText = re.sub(r"([0-9]{1,}):([0-9]{1,})", r"\1 zu \2", xAsText) |
177 | 188 | self.content += '\n' + xAsText |
|
0 commit comments