@@ -94,6 +94,34 @@ def validateSoup(self, contents):
9494 return [contents ]
9595 return []
9696 #
def __extractReadableText(self):
    """Append the visible text of every "fg" span of the page to ``self.lines``.

    Reads ``self.soup`` (the gazpacho ``Soup`` of the fetched page) and
    appends one filtered line per matching span to ``self.lines``.
    Returns nothing.
    """
    # gazpacho extraction got stuck on false page links which are in fact
    # numbers in disguise (the 690s (NBA) of ARD-Text have that), so the
    # text of each span is pulled out with bs4 for less boilerplate code.
    fg_spans = self.validateSoup(
        self.soup.find("span", {"class": "fg"}, partial=True)
    )
    for span in fg_spans:
        markup = span.html
        # 'fgm bgb' in the markup apparently flags a match still in progress
        # (NOTE(review): inferred from the "(läuft)" suffix -- confirm).
        is_running = 'fgm bgb' in markup
        plain_text = BeautifulSoup(markup, features="html.parser").text.strip()
        addline = self.linefilter(plain_text)
        if is_running:
            # tag every "digits:digits" token (score/time) as running
            addline = re.sub(r'(\d+\:\d+)', r'\1 (läuft) ', addline)
        self.lines.append(addline)
110+ #
def __extractLinkedPages(self):
    """Append one line per "style" span, suffixed with its linked page numbers.

    Reads ``self.soup`` and, for every span whose class matches "style",
    builds a line from the span's filtered text followed by the last number
    found in each of its ``<a>`` elements (info like 'Thema xyz on page 123').
    The finished line is appended to ``self.lines``. Returns nothing.

    Anchors whose markup contains no digits are ignored instead of raising
    ``IndexError``.
    """
    styled_spans = self.soup.find("span", {"class": "style"}, partial=True)
    # validateSoup is used here to get something iterable back from find()
    # regardless of how many elements matched.
    for span in self.validateSoup(styled_spans):
        addline = self.linefilter(span.text)
        for anchor in self.validateSoup(span.find("a")):
            if len(anchor.html) > 1:
                page_numbers = re.findall(r'\d+', anchor.html)
                # an anchor without any digits has no page number to report
                if page_numbers:
                    addline += " " + page_numbers[-1]
        self.lines.append(addline)
124+ #
97125 def extractPage (self , page : int , sub = 1 ):
98126 """Requests content of videotext at page `page`, subpage `sub`/n
99127
@@ -112,32 +140,11 @@ def extractPage(self, page: int, sub=1):
112140 res .raise_for_status ()
113141 #gazpacho
114142 self .soup = Soup (res .text )
115- peas = self .soup .find ("span" , {"class" : "fg" }, partial = True )
116- peasStyle = self .soup .find ("span" , {"class" : "style" }, partial = True )
117143 #
118- # gazpacho extraction got stuck on false page links
119- # which are in fact numbers in disguise. The 690s (NBA) of ARD-Text have that.
120- # so I use bs4 for less boilerplate code.
121- for x in self .validateSoup (peas ):
122- foundRunningMatch = 'fgm bgb' in x .html
123- addline = self .linefilter ( \
124- BeautifulSoup (x .html , features = "html.parser" ).text .strip () \
125- )
126- if foundRunningMatch :
127- addline = re .sub (r'(\d+\:\d+)' , r'\1 (läuft) ' , addline )
128- self .lines .append (addline )
144+ self .__extractReadableText ()
129145 #
130- # gather info like 'Thema xyz on page 123'
131- for x in self .validateSoup (peasStyle ):
132- addline = self .linefilter (x .text )
133- alist = x .find ("a" )
134- alist = self .validateSoup (alist )
146+ self .__extractLinkedPages ()
135147 #
136- for y in alist :
137- if len (y .html ) > 1 :
138- linkedPages = re .findall (r'\d+' , y .html )
139- addline += " " + linkedPages [- 1 ]
140- self .lines .append (addline )
141148 except requests .exceptions .HTTPError as httpErr :
142149 message = "Die Seite kann nicht angezeigt werden " + \
143150 f"(Fehler { httpErr .response .status_code } )"
0 commit comments