This repository was archived by the owner on Dec 30, 2024. It is now read-only.

Commit 213a18c

Merge pull request #45 from aws-solutions/develop: Update to version v1.6.1

2 parents d130b94 + aaafde6

File tree: 59 files changed, +8111 / -7981 lines


CHANGELOG.md

Lines changed: 8 additions & 0 deletions
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.6.1] - 2021-10-26
+### Fixed
+- GitHub [issue #42](https://github.com/aws-solutions/discovering-hot-topics-using-machine-learning/issues/42). To fix the issue, RSS feed ingestion lambda function and SQLs related to the Amazon QuickSight dashboard were updated.
+
+### Updated
+- AWS CDK version to 1.125.0
+- AWS SDK version to 2.1008.0
+
 ## [1.6.0] - 2021-09-27
 
 ### Added

deployment/build-s3-dist.sh

Lines changed: 18 additions & 1 deletion
@@ -84,8 +84,25 @@ for folder in */ ; do
 done
 
 if [ -e "requirements.txt" ]; then
-    pip3 install -q -r requirements.txt --upgrade --target ./
+    if [ "$function_name" = "capture_news_feed" ]; then
+        echo "Installing $function_name lambda using virtual environment"
+        python3 -m venv .venv-test
+        echo "Activating virtual environment"
+        source .venv-test/bin/activate
+        echo "Executing pip3 install -q -r requirements.txt --upgrade --target ./"
+        pip3 install -q -r requirements.txt --upgrade --target ./
+        echo "Deactivating virtual environment"
+        deactivate
+        echo "Deleting python virtual environment"
+        rm -fr .venv-test
+    else
+        echo "Installing $function_name lambda"
+        echo "Executing pip3 install -q -r requirements.txt --upgrade --target ./"
+        pip3 install -q -r requirements.txt --upgrade --target ./
+    fi
 elif [ -e "package.json" ]; then
+    echo "Installing node dependencies"
+    echo "Executing npm ci --only=prod"
     npm ci --only=prod
 fi
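The new branch packages the capture_news_feed function from inside a throwaway virtual environment, so globally installed packages cannot leak into or conflict with the bundled dependencies. For illustration only, a hypothetical Python equivalent of that shell flow (the function name and paths are assumptions, and the bin/ layout is POSIX-specific):

import shutil
import subprocess
import venv
from pathlib import Path

def install_isolated(target_dir: str, requirements: str = "requirements.txt") -> None:
    # Create a throwaway venv, mirroring ".venv-test" in build-s3-dist.sh.
    env_dir = Path(".venv-test")
    venv.create(env_dir, with_pip=True)
    try:
        # The venv's own pip resolves against a clean interpreter, then drops
        # the packages into target_dir just like --target ./ in the script.
        pip = env_dir / "bin" / "pip3"  # POSIX layout; Windows uses Scripts\
        subprocess.run(
            [str(pip), "install", "-q", "-r", requirements, "--upgrade", "--target", target_dir],
            check=True,
        )
    finally:
        # Equivalent of "deactivate" followed by "rm -fr .venv-test".
        shutil.rmtree(env_dir, ignore_errors=True)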

source/lambda/capture_news_feed/test/test_news_catcher_helper.py

Lines changed: 2 additions & 2 deletions
@@ -115,15 +115,15 @@ def test_with_query_string(self):
 
     def test_slice_text_into_arrays(self):
         original_text_small = "This is fake text"
-        self.assertEquals(len(newscatcher_helper.slice_text_into_arrays(original_text_small)), 1)
+        self.assertEqual(len(newscatcher_helper.slice_text_into_arrays(original_text_small)), 1)
 
         original_text_large_arr = []
         for i in range(300):
             original_text_large_arr.append(original_text_small)
 
         original_text_large = " ".join(original_text_large_arr)
 
-        self.assertEquals(
+        self.assertEqual(
             len(newscatcher_helper.slice_text_into_arrays(original_text_large)),
             len(original_text_large) // 1250
             if len(original_text_large) % 1250 == 0
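Both edits here replace assertEquals with assertEqual; the former is a deprecated unittest alias that emits a DeprecationWarning when warnings are escalated. A minimal standalone illustration (this test case is hypothetical, not part of the suite):

import unittest

class AliasDemo(unittest.TestCase):
    def test_preferred_spelling(self):
        # assertEqual is the documented API; assertEquals is a deprecated
        # alias kept only for backward compatibility.
        self.assertEqual(len("This is fake text".split()), 4)

if __name__ == "__main__":
    unittest.main()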

source/lambda/capture_news_feed/util/ddb_helper.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ def get_query_tracker(account, url, search_query, topic=None, **item_kwargs):
         ScanIndexForward=False,
     )
     if len(response["Items"]) == 0:
-        logger.warn("Query tracker is empty")
+        logger.warning("Query tracker is empty")
         return {"LAST_PUBLISHED_TIMESTAMP": (datetime.now(timezone.utc) - timedelta(days=30)).isoformat()}
 
     return response["Items"][0]  # since limit is 1, it will return only 1 record and hence taking the first index value
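The one-word change matters because Logger.warn is an undocumented, deprecated alias of Logger.warning in the standard library's logging module. A quick standalone sketch (the logger name is hypothetical):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("ddb_helper_demo")

logger.warning("Query tracker is empty")  # supported spelling
# logger.warn("...") still works today, but it is deprecated, flagged by
# linters, and may emit a DeprecationWarning; new code should use warning().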

source/lambda/capture_news_feed/util/newscatcher_helper.py

Lines changed: 58 additions & 44 deletions
@@ -18,8 +18,8 @@
 from datetime import date, datetime, timezone
 from urllib.parse import urlparse
 
-from shared_util import custom_logging
 from newscatcher import Newscatcher
+from shared_util import custom_logging
 
 from util import stream_helper

@@ -28,6 +28,7 @@
 rss_datetime_fromat_1 = "%a, %d %b %Y %H:%M:%S %z"
 rss_datetime_fromat_2 = "%a, %d %b %Y %H:%M:%S %Z"
 rss_datetime_fromat_3 = "%a, %d %b %Y %H:%M:%S"
+rss_datetime_fromat_4 = "%A, %B %d, %Y %I:%M %p %z"
 
 
 class TopicNotSupportedError(Exception):
@@ -59,7 +60,7 @@ def retrieve_feed_from_all_topics(url):
         try:
             aggregated_feed.append(retrieve_feed(url, topic=topic))
         except TopicNotSupportedError as error:
-            logger.warn(f"Skipping topic {topic} for {url} because {error}")
+            logger.debug(f"Skipping topic {topic} for {url} because {error}")
 
     return aggregated_feed

@@ -112,7 +113,6 @@ def retrieve_feed(url, topic=None):
     news_feeds["articles"] = try_parsing_published_date(news_feeds["articles"])
 
     if not news_feeds:
-        logger.warn(f"Topic {topic} is not supported")
         raise TopicNotSupportedError(f"Topic {topic} is not supported")
     return news_feeds

@@ -125,7 +125,10 @@ def get_published_timestamp(str_date):
         try:
             published_datetime = datetime.strptime(str_date, rss_datetime_fromat_2)
         except ValueError:
-            published_datetime = datetime.strptime(str_date, rss_datetime_fromat_3)
+            try:
+                published_datetime = datetime.strptime(str_date, rss_datetime_fromat_3)
+            except ValueError:
+                published_datetime = datetime.strptime(str_date, rss_datetime_fromat_4)
 
     return published_datetime.replace(tzinfo=timezone.utc)
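The nested try/except chain now falls through four formats; the new rss_datetime_fromat_4 covers long-form dates such as "Tuesday, October 26, 2021 10:30 AM +0000". A minimal loop-based sketch of the same fallback (standalone, not the solution's code; the sample strings are hypothetical):

from datetime import datetime, timezone

# Formats mirrored from newscatcher_helper.py ("fromat" is the module's own spelling).
RSS_FORMATS = [
    "%a, %d %b %Y %H:%M:%S %z",
    "%a, %d %b %Y %H:%M:%S %Z",
    "%a, %d %b %Y %H:%M:%S",
    "%A, %B %d, %Y %I:%M %p %z",  # added in this commit
]

def parse_published(str_date):
    # Try each known format in order; surface ValueError only if all fail.
    for fmt in RSS_FORMATS:
        try:
            # Like the source, the parsed value is force-tagged as UTC.
            return datetime.strptime(str_date, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue
    raise ValueError(f"unrecognized RSS date: {str_date}")

print(parse_published("Thu, 18 Mar 2021 20:06:58 +0200"))
print(parse_published("Tuesday, October 26, 2021 10:30 AM +0000"))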

@@ -162,46 +165,54 @@ def create_and_publish_record(news_feed, account_name, platform, last_published_
         try:
             published_timestamp = news_feed_timestamp(article)
         except ValueError:
-            logger.warn(f"Cannot parse published timestamp for {article}")
+            logger.warning(f"Cannot parse published timestamp for {article}")
             continue
 
         if not last_published_timestamp or published_timestamp > datetime.fromisoformat(last_published_timestamp):
             # check if at least one element of list is present in the article summary else skip this article
-            if len(query_str_list) > 0 and not any(keyword in article["summary"] for keyword in query_str_list):
-                logger.debug("Did not find {query_str} in {article}")
-                # Moving to next article since it did not have any of the search key words
-                continue
-
-            text = article["summary"]
-            clean_text = re.sub(cleanr, "", text)
-            text_array = slice_text_into_arrays(clean_text)
-
-            # TODO - move the entities and extended entities to a function
-            # populate image urls
-            id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
-            image_urls = filter_link_types(article["links"], "image/jpeg")
-            entities, extended_entities = dict(), dict()
-            entities["media"], extended_entities["media"] = image_urls, image_urls
-
-            # populate text urls
-            text_urls = filter_link_types(article["links"], "text/html")
-            entities["urls"], extended_entities["urls"] = text_urls, text_urls
-            publish_record(
-                {
-                    "account_name": account_name,
-                    "platform": platform,
-                    "search_query": query_str,
-                    "feed": {
-                        "created_at": published_timestamp.isoformat(),
-                        "entities": entities,
-                        "extended_entities": extended_entities,
-                        "lang": language,
-                        "metadata": {"website": url, "country": country, "topic": topic},
-                    },
-                },
-                id_str,
-                text_array,
-            )
+            text = article.get("summary", article.get("title", None))
+            if text:
+                logger.debug(f"Article Detail: {article}")
+                if len(query_str_list) > 0 and not any(keyword in text for keyword in query_str_list):
+                    logger.debug(f"Did not find {query_str} in {article}")
+                    # Moving to next article since it did not have any of the search key words
+                    continue
+
+                clean_text = re.sub(cleanr, "", text)
+                text_array = slice_text_into_arrays(clean_text)
+
+                # populate image urls
+                id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
+                image_urls = filter_link_types(article["links"], "image/jpeg")
+                entities, extended_entities = dict(), dict()
+                entities["media"], extended_entities["media"] = image_urls, image_urls
+
+                # populate text urls
+                text_urls = filter_link_types(article["links"], "text/html")
+                text_urls = filter_link_types(article["links"], "audio/mpeg") if not text_urls else text_urls
+
+                if text_urls:
+                    entities["urls"], extended_entities["urls"] = text_urls, text_urls
+                    publish_record(
+                        {
+                            "account_name": account_name,
+                            "platform": platform,
+                            "search_query": query_str,
+                            "feed": {
+                                "created_at": published_timestamp.isoformat(),
+                                "entities": entities,
+                                "extended_entities": extended_entities,
+                                "lang": language,
+                                "metadata": {"website": url, "country": country, "topic": topic},
+                            },
+                        },
+                        id_str,
+                        text_array,
+                    )
+                else:
+                    logger.debug(f"Skipping news feed from {url} since could not get url from {json.dumps(article)}")
+            else:
+                logger.debug(f"Could not find article in newsfeed {article}")
 
 
 def publish_record(record_to_publish, id_str, text_array):
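The key behavioral change above: articles without a "summary" now fall back to their "title" instead of raising a KeyError, and articles with neither are skipped with a debug log rather than crashing the ingestion run. A tiny standalone sketch of that selection logic (the article dicts are hypothetical):

articles = [
    {"summary": "Fed raises rates again", "title": "Rates"},
    {"title": "Rates climb for a third month"},  # no summary: title is used
    {},                                           # neither: article is skipped
]

for article in articles:
    text = article.get("summary", article.get("title", None))
    if text:
        print(f"ingest: {text}")
    else:
        print(f"skip: {article}")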
@@ -221,13 +232,16 @@ def news_feed_timestamp(article):
     published_parsed = article.get("published_parsed", None)
     if published_parsed:
         published_timestamp = get_published_parsed_timestamp(published_parsed)
-    else:
+    elif article.get("published", None):
         # sample published time stamp Thu, 18 Mar 2021 20:06:58 +0200
         try:
             published_timestamp = get_published_timestamp(article["published"])
-        except ValueError:
-            logger.error(f"Could not parse time information and hence skipping record {article}")
-            raise ValueError
+        except (ValueError, KeyError) as error:
+            logger.debug(f"Could not parse time information and hence skipping record {article}")
+            raise error
+    else:
+        logger.debug(f'Could not retrieve published timestamp for {article}, hence marking it as "now"')
+        published_timestamp = datetime.now().replace(tzinfo=timezone.utc)
     return published_timestamp
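Taken together, news_feed_timestamp now has three branches instead of two: a pre-parsed struct_time, a raw "published" string, and (new in this commit) a dateless article that is stamped with the current time rather than dropped. A self-contained sketch of the branching (the helper bodies are simplified stand-ins, not the solution's implementations):

from datetime import datetime, timezone

def timestamp_for(article):
    parsed = article.get("published_parsed", None)
    if parsed:
        # feedparser-style 9-tuple; the first six fields are Y, M, D, H, M, S.
        return datetime(*parsed[:6], tzinfo=timezone.utc)
    elif article.get("published", None):
        # Stand-in for get_published_timestamp(); the real code tries four formats.
        return datetime.strptime(article["published"], "%a, %d %b %Y %H:%M:%S %z")
    else:
        # New behavior: no date information at all -> mark the article "now".
        return datetime.now().replace(tzinfo=timezone.utc)

print(timestamp_for({"published": "Thu, 18 Mar 2021 20:06:58 +0200"}))
print(timestamp_for({"published_parsed": (2021, 10, 26, 10, 30, 0, 1, 299, 0)}))
print(timestamp_for({}))  # falls back to the current time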
