 from datetime import date, datetime, timezone
 from urllib.parse import urlparse
 
-from shared_util import custom_logging
 from newscatcher import Newscatcher
+from shared_util import custom_logging
 
 from util import stream_helper
 
 rss_datetime_fromat_1 = "%a, %d %b %Y %H:%M:%S %z"
 rss_datetime_fromat_2 = "%a, %d %b %Y %H:%M:%S %Z"
 rss_datetime_fromat_3 = "%a, %d %b %Y %H:%M:%S"
+rss_datetime_fromat_4 = "%A, %B %d, %Y %I:%M %p %z"
 
 
 class TopicNotSupportedError(Exception):
@@ -59,7 +60,7 @@ def retrieve_feed_from_all_topics(url):
         try:
             aggregated_feed.append(retrieve_feed(url, topic=topic))
         except TopicNotSupportedError as error:
-            logger.warn(f"Skipping topic {topic} for {url} because {error}")
+            logger.debug(f"Skipping topic {topic} for {url} because {error}")
 
     return aggregated_feed
 
@@ -112,7 +113,6 @@ def retrieve_feed(url, topic=None):
     news_feeds["articles"] = try_parsing_published_date(news_feeds["articles"])
 
     if not news_feeds:
-        logger.warn(f"Topic {topic} is not supported")
         raise TopicNotSupportedError(f"Topic {topic} is not supported")
     return news_feeds
 
@@ -125,7 +125,11 @@ def get_published_timestamp(str_date):
     try:
         published_datetime = datetime.strptime(str_date, rss_datetime_fromat_2)
     except ValueError:
-        published_datetime = datetime.strptime(str_date, rss_datetime_fromat_3)
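+        # rss_datetime_fromat_4 matches verbose dates, e.g. "Thursday, March 18, 2021 08:06 PM +0200"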
+        try:
+            published_datetime = datetime.strptime(str_date, rss_datetime_fromat_3)
+        except ValueError:
+            published_datetime = datetime.strptime(str_date, rss_datetime_fromat_4)
 
     return published_datetime.replace(tzinfo=timezone.utc)
 
@@ -162,46 +166,57 @@ def create_and_publish_record(news_feed, account_name, platform, last_published_
         try:
             published_timestamp = news_feed_timestamp(article)
         except ValueError:
-            logger.warn(f"Cannot parse published timestamp for {article}")
+            logger.warning(f"Cannot parse published timestamp for {article}")
             continue
 
         if not last_published_timestamp or published_timestamp > datetime.fromisoformat(last_published_timestamp):
             # check if at least one element of list is present in the article summary else skip this article
-            if len(query_str_list) > 0 and not any(keyword in article["summary"] for keyword in query_str_list):
-                logger.debug("Did not find {query_str} in {article}")
-                # Moving to next article since it did not have any of the search key words
-                continue
-
-            text = article["summary"]
-            clean_text = re.sub(cleanr, "", text)
-            text_array = slice_text_into_arrays(clean_text)
-
-            # TODO - move the entities and extended entities to a function
-            # populate image urls
-            id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
-            image_urls = filter_link_types(article["links"], "image/jpeg")
-            entities, extended_entities = dict(), dict()
-            entities["media"], extended_entities["media"] = image_urls, image_urls
-
-            # populate text urls
-            text_urls = filter_link_types(article["links"], "text/html")
-            entities["urls"], extended_entities["urls"] = text_urls, text_urls
-            publish_record(
-                {
-                    "account_name": account_name,
-                    "platform": platform,
-                    "search_query": query_str,
-                    "feed": {
-                        "created_at": published_timestamp.isoformat(),
-                        "entities": entities,
-                        "extended_entities": extended_entities,
-                        "lang": language,
-                        "metadata": {"website": url, "country": country, "topic": topic},
-                    },
-                },
-                id_str,
-                text_array,
-            )
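+            # prefer the article summary; fall back to the title when no summary is present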
+            text = article.get("summary", article.get("title", None))
+            if text:
+                logger.debug(f"Article Detail: {article}")
+                if len(query_str_list) > 0 and not any(keyword in text for keyword in query_str_list):
+                    logger.debug(f"Did not find {query_str} in {article}")
+                    # Moving to next article since it did not have any of the search key words
+                    continue
+
+                clean_text = re.sub(cleanr, "", text)
+                text_array = slice_text_into_arrays(clean_text)
+
+                # populate image urls
+                id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
+                image_urls = filter_link_types(article["links"], "image/jpeg")
+                entities, extended_entities = dict(), dict()
+                entities["media"], extended_entities["media"] = image_urls, image_urls
+
+                # populate text urls
+                text_urls = filter_link_types(article["links"], "text/html")
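+                # fall back to audio enclosures (e.g. podcast feeds) when no html links are present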
+                text_urls = filter_link_types(article["links"], "audio/mpeg") if not text_urls else text_urls
+
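+                # publish only when the article carries at least one usable link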
+                if text_urls:
+                    entities["urls"], extended_entities["urls"] = text_urls, text_urls
+                    publish_record(
+                        {
+                            "account_name": account_name,
+                            "platform": platform,
+                            "search_query": query_str,
+                            "feed": {
+                                "created_at": published_timestamp.isoformat(),
+                                "entities": entities,
+                                "extended_entities": extended_entities,
+                                "lang": language,
+                                "metadata": {"website": url, "country": country, "topic": topic},
+                            },
+                        },
+                        id_str,
+                        text_array,
+                    )
+                else:
+                    logger.debug(f"Skipping news feed from {url} since could not get url from {json.dumps(article)}")
+            else:
+                logger.debug(f"Could not find article in newsfeed {article}")
 
 
 def publish_record(record_to_publish, id_str, text_array):
@@ -221,13 +236,17 @@ def news_feed_timestamp(article):
     published_parsed = article.get("published_parsed", None)
     if published_parsed:
         published_timestamp = get_published_parsed_timestamp(published_parsed)
-    else:
+    elif article.get("published", None):
         # sample published time stamp Thu, 18 Mar 2021 20:06:58 +0200
         try:
             published_timestamp = get_published_timestamp(article["published"])
-        except ValueError:
-            logger.error(f"Could not parse time information and hence skipping record {article}")
-            raise ValueError
+        except (ValueError, KeyError) as error:
+            logger.debug(f"Could not parse time information and hence skipping record {article}")
+            raise error
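+    # no parsable timestamp at all: fall back to "now" in UTC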
+    else:
+        logger.debug(f'Could not retrieve published timestamp for {article}, hence marking it as "now"')
+        published_timestamp = datetime.now().replace(tzinfo=timezone.utc)
     return published_timestamp
 
 