
Commit 482e440

refactoring
1 parent f53a833 commit 482e440

File tree

1 file changed: +30 -23 lines changed


videoText.py

Lines changed: 30 additions & 23 deletions
@@ -94,6 +94,34 @@ def validateSoup(self, contents):
             return [contents]
         return []
     #
+    def __extractReadableText(self):
+        peas = self.soup.find("span", {"class": "fg"}, partial=True)
+        # gazpacho extraction got stuck on false page links
+        # which are in fact numbers in disguise. The 690s (NBA) of ARD-Text have that.
+        # so I use bs4 for less boilerplate code.
+        for x in self.validateSoup(peas):
+            foundRunningMatch = 'fgm bgb' in x.html
+            addline = self.linefilter( \
+                BeautifulSoup(x.html, features="html.parser").text.strip() \
+            )
+            if foundRunningMatch:
+                addline = re.sub(r'(\d+\:\d+)', r'\1 (läuft) ', addline)
+            self.lines.append(addline)
+    #
+    def __extractLinkedPages(self):
+        peasStyle = self.soup.find("span", {"class" : "style"}, partial=True)
+        # gather info like 'Thema xyz on page 123'
+        for x in self.validateSoup(peasStyle):
+            addline = self.linefilter(x.text)
+            alist = x.find("a")
+            alist = self.validateSoup(alist)
+            #
+            for y in alist:
+                if len(y.html) > 1:
+                    linkedPages = re.findall(r'\d+', y.html)
+                    addline += " " + linkedPages[-1]
+            self.lines.append(addline)
+    #
     def extractPage(self, page: int, sub=1):
         """Requests content of videotext at page `page`, subpage `sub`/n
 
@@ -112,32 +140,11 @@ def extractPage(self, page: int, sub=1):
             res.raise_for_status()
             #gazpacho
             self.soup = Soup(res.text)
-            peas = self.soup.find("span", {"class": "fg"}, partial=True)
-            peasStyle = self.soup.find("span", {"class" : "style"}, partial=True)
             #
-            # gazpacho extraction got stuck on false page links
-            # which are in fact numbers in disguise. The 690s (NBA) of ARD-Text have that.
-            # so I use bs4 for less boilerplate code.
-            for x in self.validateSoup(peas):
-                foundRunningMatch = 'fgm bgb' in x.html
-                addline = self.linefilter( \
-                    BeautifulSoup(x.html, features="html.parser").text.strip() \
-                )
-                if foundRunningMatch:
-                    addline = re.sub(r'(\d+\:\d+)', r'\1 (läuft) ', addline)
-                self.lines.append(addline)
+            self.__extractReadableText()
             #
-            # gather info like 'Thema xyz on page 123'
-            for x in self.validateSoup(peasStyle):
-                addline = self.linefilter(x.text)
-                alist = x.find("a")
-                alist = self.validateSoup(alist)
+            self.__extractLinkedPages()
             #
-                for y in alist:
-                    if len(y.html) > 1:
-                        linkedPages = re.findall(r'\d+', y.html)
-                        addline += " " + linkedPages[-1]
-                self.lines.append(addline)
         except requests.exceptions.HTTPError as httpErr:
             message = "Die Seite kann nicht angezeigt werden " + \
                 f"(Fehler {httpErr.response.status_code})"
