This repository was archived by the owner on Dec 30, 2024. It is now read-only.

Commit 213a18c

Merge pull request #45 from aws-solutions/develop: Update to version v1.6.1

2 parents d130b94 + aaafde6

File tree: 59 files changed, +8111 / -7981 lines


CHANGELOG.md

Lines changed: 8 additions & 0 deletions
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.6.1] - 2021-10-26
+### Fixed
+- GitHub [issue #42](https://github.com/aws-solutions/discovering-hot-topics-using-machine-learning/issues/42). To fix the issue, RSS feed ingestion lambda function and SQLs related to the Amazon QuickSight dashboard were updated.
+
+### Updated
+- AWS CDK version to 1.125.0
+- AWS SDK version to 2.1008.0
+
 ## [1.6.0] - 2021-09-27
 
 ### Added

deployment/build-s3-dist.sh

Lines changed: 18 additions & 1 deletion
@@ -84,8 +84,25 @@ for folder in */ ; do
 done
 
 if [ -e "requirements.txt" ]; then
-    pip3 install -q -r requirements.txt --upgrade --target ./
+    if [ "$function_name" = "capture_news_feed" ]; then
+        echo "Installing $function_name lambda using virtual environment"
+        python3 -m venv .venv-test
+        echo "Activating virtual environment"
+        source .venv-test/bin/activate
+        echo "Executing pip3 install -q -r requirements.txt --upgrade --target ./"
+        pip3 install -q -r requirements.txt --upgrade --target ./
+        echo "Deactivating virtual environment"
+        deactivate
+        echo "Deleting python virtual environment"
+        rm -fr .venv-test
+    else
+        echo "Installing $function_name lambda"
+        echo "Executing pip3 install -q -r requirements.txt --upgrade --target ./"
+        pip3 install -q -r requirements.txt --upgrade --target ./
+    fi
 elif [ -e "package.json" ]; then
+    echo "Installing node dependencies"
+    echo "Executing npm ci --only=prod"
     npm ci --only=prod
 fi
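The new branch packages the capture_news_feed function from inside a throwaway virtual environment, so globally installed packages cannot leak into or conflict with the bundled dependencies. For illustration only, a hypothetical Python equivalent of that shell flow (the function name and paths are assumptions, and the bin/ layout is POSIX-specific):

import shutil
import subprocess
import venv
from pathlib import Path

def install_isolated(target_dir: str, requirements: str = "requirements.txt") -> None:
    # Create a throwaway venv, mirroring ".venv-test" in build-s3-dist.sh.
    env_dir = Path(".venv-test")
    venv.create(env_dir, with_pip=True)
    try:
        # The venv's own pip resolves against a clean interpreter, then drops
        # the packages into target_dir just like --target ./ in the script.
        pip = env_dir / "bin" / "pip3"  # POSIX layout; Windows uses Scripts\
        subprocess.run(
            [str(pip), "install", "-q", "-r", requirements, "--upgrade", "--target", target_dir],
            check=True,
        )
    finally:
        # Equivalent of "deactivate" followed by "rm -fr .venv-test".
        shutil.rmtree(env_dir, ignore_errors=True)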

source/lambda/capture_news_feed/test/test_news_catcher_helper.py

Lines changed: 2 additions & 2 deletions
@@ -115,15 +115,15 @@ def test_with_query_string(self):
 
     def test_slice_text_into_arrays(self):
         original_text_small = "This is fake text"
-        self.assertEquals(len(newscatcher_helper.slice_text_into_arrays(original_text_small)), 1)
+        self.assertEqual(len(newscatcher_helper.slice_text_into_arrays(original_text_small)), 1)
 
         original_text_large_arr = []
         for i in range(300):
             original_text_large_arr.append(original_text_small)
 
         original_text_large = " ".join(original_text_large_arr)
 
-        self.assertEquals(
+        self.assertEqual(
             len(newscatcher_helper.slice_text_into_arrays(original_text_large)),
             len(original_text_large) // 1250
             if len(original_text_large) % 1250 == 0
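Both edits here replace assertEquals with assertEqual; the former is a deprecated unittest alias that emits a DeprecationWarning when warnings are escalated. A minimal standalone illustration (this test case is hypothetical, not part of the suite):

import unittest

class AliasDemo(unittest.TestCase):
    def test_preferred_spelling(self):
        # assertEqual is the documented API; assertEquals is a deprecated
        # alias kept only for backward compatibility.
        self.assertEqual(len("This is fake text".split()), 4)

if __name__ == "__main__":
    unittest.main()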

source/lambda/capture_news_feed/util/ddb_helper.py

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ def get_query_tracker(account, url, search_query, topic=None, **item_kwargs):
         ScanIndexForward=False,
     )
     if len(response["Items"]) == 0:
-        logger.warn("Query tracker is empty")
+        logger.warning("Query tracker is empty")
         return {"LAST_PUBLISHED_TIMESTAMP": (datetime.now(timezone.utc) - timedelta(days=30)).isoformat()}
 
     return response["Items"][0]  # since limit is 1, it will return only 1 record and hence taking the first index value
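The one-word change matters because Logger.warn is an undocumented, deprecated alias of Logger.warning in the standard library's logging module. A quick standalone sketch (the logger name is hypothetical):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("ddb_helper_demo")

logger.warning("Query tracker is empty")  # supported spelling
# logger.warn("...") still works today, but it is deprecated, flagged by
# linters, and may emit a DeprecationWarning; new code should use warning().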

source/lambda/capture_news_feed/util/newscatcher_helper.py

Lines changed: 58 additions & 44 deletions
@@ -18,8 +18,8 @@
 from datetime import date, datetime, timezone
 from urllib.parse import urlparse
 
-from shared_util import custom_logging
 from newscatcher import Newscatcher
+from shared_util import custom_logging
 
 from util import stream_helper

@@ -28,6 +28,7 @@
 rss_datetime_fromat_1 = "%a, %d %b %Y %H:%M:%S %z"
 rss_datetime_fromat_2 = "%a, %d %b %Y %H:%M:%S %Z"
 rss_datetime_fromat_3 = "%a, %d %b %Y %H:%M:%S"
+rss_datetime_fromat_4 = "%A, %B %d, %Y %I:%M %p %z"
 
 
 class TopicNotSupportedError(Exception):
@@ -59,7 +60,7 @@ def retrieve_feed_from_all_topics(url):
         try:
             aggregated_feed.append(retrieve_feed(url, topic=topic))
         except TopicNotSupportedError as error:
-            logger.warn(f"Skipping topic {topic} for {url} because {error}")
+            logger.debug(f"Skipping topic {topic} for {url} because {error}")
 
     return aggregated_feed

@@ -112,7 +113,6 @@ def retrieve_feed(url, topic=None):
     news_feeds["articles"] = try_parsing_published_date(news_feeds["articles"])
 
     if not news_feeds:
-        logger.warn(f"Topic {topic} is not supported")
         raise TopicNotSupportedError(f"Topic {topic} is not supported")
     return news_feeds

@@ -125,7 +125,10 @@ def get_published_timestamp(str_date):
         try:
             published_datetime = datetime.strptime(str_date, rss_datetime_fromat_2)
         except ValueError:
-            published_datetime = datetime.strptime(str_date, rss_datetime_fromat_3)
+            try:
+                published_datetime = datetime.strptime(str_date, rss_datetime_fromat_3)
+            except ValueError:
+                published_datetime = datetime.strptime(str_date, rss_datetime_fromat_4)
 
     return published_datetime.replace(tzinfo=timezone.utc)
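The nested try/except chain now falls through four formats; the new rss_datetime_fromat_4 covers long-form dates such as "Tuesday, October 26, 2021 10:30 AM +0000". A minimal loop-based sketch of the same fallback (standalone, not the solution's code; the sample strings are hypothetical):

from datetime import datetime, timezone

# Formats mirrored from newscatcher_helper.py ("fromat" is the module's own spelling).
RSS_FORMATS = [
    "%a, %d %b %Y %H:%M:%S %z",
    "%a, %d %b %Y %H:%M:%S %Z",
    "%a, %d %b %Y %H:%M:%S",
    "%A, %B %d, %Y %I:%M %p %z",  # added in this commit
]

def parse_published(str_date):
    # Try each known format in order; surface ValueError only if all fail.
    for fmt in RSS_FORMATS:
        try:
            # Like the source, the parsed value is force-tagged as UTC.
            return datetime.strptime(str_date, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue
    raise ValueError(f"unrecognized RSS date: {str_date}")

print(parse_published("Thu, 18 Mar 2021 20:06:58 +0200"))
print(parse_published("Tuesday, October 26, 2021 10:30 AM +0000"))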

@@ -162,46 +165,54 @@ def create_and_publish_record(news_feed, account_name, platform, last_published_
         try:
             published_timestamp = news_feed_timestamp(article)
         except ValueError:
-            logger.warn(f"Cannot parse published timestamp for {article}")
+            logger.warning(f"Cannot parse published timestamp for {article}")
             continue
 
         if not last_published_timestamp or published_timestamp > datetime.fromisoformat(last_published_timestamp):
             # check if at least one element of list is present in the article summary else skip this article
-            if len(query_str_list) > 0 and not any(keyword in article["summary"] for keyword in query_str_list):
-                logger.debug("Did not find {query_str} in {article}")
-                # Moving to next article since it did not have any of the search key words
-                continue
-
-            text = article["summary"]
-            clean_text = re.sub(cleanr, "", text)
-            text_array = slice_text_into_arrays(clean_text)
-
-            # TODO - move the entities and extended entities to a function
-            # populate image urls
-            id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
-            image_urls = filter_link_types(article["links"], "image/jpeg")
-            entities, extended_entities = dict(), dict()
-            entities["media"], extended_entities["media"] = image_urls, image_urls
-
-            # populate text urls
-            text_urls = filter_link_types(article["links"], "text/html")
-            entities["urls"], extended_entities["urls"] = text_urls, text_urls
-            publish_record(
-                {
-                    "account_name": account_name,
-                    "platform": platform,
-                    "search_query": query_str,
-                    "feed": {
-                        "created_at": published_timestamp.isoformat(),
-                        "entities": entities,
-                        "extended_entities": extended_entities,
-                        "lang": language,
-                        "metadata": {"website": url, "country": country, "topic": topic},
-                    },
-                },
-                id_str,
-                text_array,
-            )
+            text = article.get("summary", article.get("title", None))
+            if text:
+                logger.debug(f"Article Detail: {article}")
+                if len(query_str_list) > 0 and not any(keyword in text for keyword in query_str_list):
+                    logger.debug(f"Did not find {query_str} in {article}")
+                    # Moving to next article since it did not have any of the search key words
+                    continue
+
+                clean_text = re.sub(cleanr, "", text)
+                text_array = slice_text_into_arrays(clean_text)
+
+                # populate image urls
+                id_str = f"{str(int(datetime.now().timestamp() * 1000))}#{url}"
+                image_urls = filter_link_types(article["links"], "image/jpeg")
+                entities, extended_entities = dict(), dict()
+                entities["media"], extended_entities["media"] = image_urls, image_urls
+
+                # populate text urls
+                text_urls = filter_link_types(article["links"], "text/html")
+                text_urls = filter_link_types(article["links"], "audio/mpeg") if not text_urls else text_urls
+
+                if text_urls:
+                    entities["urls"], extended_entities["urls"] = text_urls, text_urls
+                    publish_record(
+                        {
+                            "account_name": account_name,
+                            "platform": platform,
+                            "search_query": query_str,
+                            "feed": {
+                                "created_at": published_timestamp.isoformat(),
+                                "entities": entities,
+                                "extended_entities": extended_entities,
+                                "lang": language,
+                                "metadata": {"website": url, "country": country, "topic": topic},
+                            },
+                        },
+                        id_str,
+                        text_array,
+                    )
+                else:
+                    logger.debug(f"Skipping news feed from {url} since could not get url from {json.dumps(article)}")
+            else:
+                logger.debug(f"Could not find article in newsfeed {article}")
 
 
 def publish_record(record_to_publish, id_str, text_array):
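The key behavioral change above: articles without a "summary" now fall back to their "title" instead of raising a KeyError, and articles with neither are skipped with a debug log rather than crashing the ingestion run. A tiny standalone sketch of that selection logic (the article dicts are hypothetical):

articles = [
    {"summary": "Fed raises rates again", "title": "Rates"},
    {"title": "Rates climb for a third month"},  # no summary: title is used
    {},                                           # neither: article is skipped
]

for article in articles:
    text = article.get("summary", article.get("title", None))
    if text:
        print(f"ingest: {text}")
    else:
        print(f"skip: {article}")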
@@ -221,13 +232,16 @@ def news_feed_timestamp(article):
     published_parsed = article.get("published_parsed", None)
     if published_parsed:
         published_timestamp = get_published_parsed_timestamp(published_parsed)
-    else:
+    elif article.get("published", None):
         # sample published time stamp Thu, 18 Mar 2021 20:06:58 +0200
         try:
             published_timestamp = get_published_timestamp(article["published"])
-        except ValueError:
-            logger.error(f"Could not parse time information and hence skipping record {article}")
-            raise ValueError
+        except (ValueError, KeyError) as error:
+            logger.debug(f"Could not parse time information and hence skipping record {article}")
+            raise error
+    else:
+        logger.debug(f'Could not retrieve published timestamp for {article}, hence marking it as "now"')
+        published_timestamp = datetime.now().replace(tzinfo=timezone.utc)
     return published_timestamp
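Taken together, news_feed_timestamp now has three branches instead of two: a pre-parsed struct_time, a raw "published" string, and (new in this commit) a dateless article that is stamped with the current time rather than dropped. A self-contained sketch of the branching (the helper bodies are simplified stand-ins, not the solution's implementations):

from datetime import datetime, timezone

def timestamp_for(article):
    parsed = article.get("published_parsed", None)
    if parsed:
        # feedparser-style 9-tuple; the first six fields are Y, M, D, H, M, S.
        return datetime(*parsed[:6], tzinfo=timezone.utc)
    elif article.get("published", None):
        # Stand-in for get_published_timestamp(); the real code tries four formats.
        return datetime.strptime(article["published"], "%a, %d %b %Y %H:%M:%S %z")
    else:
        # New behavior: no date information at all -> mark the article "now".
        return datetime.now().replace(tzinfo=timezone.utc)

print(timestamp_for({"published": "Thu, 18 Mar 2021 20:06:58 +0200"}))
print(timestamp_for({"published_parsed": (2021, 10, 26, 10, 30, 0, 1, 299, 0)}))
print(timestamp_for({}))  # falls back to the current time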
