Skip to content

Commit 55f4cf3

Browse files
authored
Refactor some Lambda code for AWS News Feeds and use frequent scheduler (#139)
1 parent 183e941 commit 55f4cf3

File tree

2 files changed

+26
-46
lines changed

2 files changed

+26
-46
lines changed

data-collection/deploy/deploy-data-collection.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1151,7 +1151,7 @@ Resources:
11511151
Parameters:
11521152
DestinationBucket: !Ref S3Bucket
11531153
DestinationBucketARN: !GetAtt S3Bucket.Arn
1154-
Schedule: !Ref Schedule
1154+
Schedule: !Ref ScheduleFrequent
11551155
GlueRoleARN: !GetAtt GlueRole.Arn
11561156
ResourcePrefix: !Ref ResourcePrefix
11571157
LambdaAnalyticsARN: !GetAtt LambdaAnalytics.Arn

data-collection/deploy/module-aws-feeds.yaml

Lines changed: 25 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -94,38 +94,28 @@ Resources:
9494
import boto3
9595
9696
def clean_html(html_content):
97-
result = ""
98-
urls = []
99-
ref_index = [1]
100-
101-
class SimpleHTMLParser(HTMLParser):
97+
class MyParser(HTMLParser):
98+
def __init__(self):
99+
super().__init__()
100+
self.text = ''
101+
self.ref = {}
102+
self.index = 0
102103
def handle_starttag(self, tag, attrs):
103-
nonlocal result
104104
if tag == 'a':
105+
self.index += 1
105106
href = next((value for attr, value in attrs if attr == 'href'), None)
106107
if href:
107108
if href.startswith('/'):
108109
href = f"https://aws.amazon.com{href}"
109-
urls.append(href)
110-
self.last_href_index = len(result)
111-
110+
self.ref[self.index] = href
112111
def handle_endtag(self, tag):
113-
nonlocal result
114112
if tag == 'a':
115-
result = result[:self.last_href_index] + f"[{ref_index[0]}] " + result[self.last_href_index:]
116-
ref_index[0] += 1
117-
113+
self.text += f"[{self.index}]"
118114
def handle_data(self, data):
119-
nonlocal result
120-
result += data
121-
122-
parser = SimpleHTMLParser()
115+
self.text += data
116+
parser = MyParser()
123117
parser.feed(html_content)
124-
125-
for i, url in enumerate(urls, start=1):
126-
result += f"\n[{i}] {url}"
127-
128-
return result
118+
return parser.text.strip() + '\n\n' + '\n'.join([f"[{index}]: {url}" for index, url in parser.ref.items()])
129119
130120
def lambda_handler(event, context):
131121
feed_url = os.environ['FEED_URL']
@@ -532,38 +522,28 @@ Resources:
532522
from dateutil.parser import parse, ParserError
533523
534524
def clean_html(html_content):
535-
result = ""
536-
urls = []
537-
ref_index = [1]
538-
539-
class SimpleHTMLParser(HTMLParser):
525+
class MyParser(HTMLParser):
526+
def __init__(self):
527+
super().__init__()
528+
self.text = ''
529+
self.ref = {}
530+
self.index = 0
540531
def handle_starttag(self, tag, attrs):
541-
nonlocal result
542532
if tag == 'a':
533+
self.index += 1
543534
href = next((value for attr, value in attrs if attr == 'href'), None)
544535
if href:
545536
if href.startswith('/'):
546537
href = f"https://aws.amazon.com{href}"
547-
urls.append(href)
548-
self.last_href_index = len(result)
549-
538+
self.ref[self.index] = href
550539
def handle_endtag(self, tag):
551-
nonlocal result
552540
if tag == 'a':
553-
result = result[:self.last_href_index] + f"[{ref_index[0]}] " + result[self.last_href_index:]
554-
ref_index[0] += 1
555-
541+
self.text += f"[{self.index}]"
556542
def handle_data(self, data):
557-
nonlocal result
558-
result += data
559-
560-
parser = SimpleHTMLParser()
543+
self.text += data
544+
parser = MyParser()
561545
parser.feed(html_content)
562-
563-
for i, url in enumerate(urls, start=1):
564-
result += f"\n[{i}] {url}"
565-
566-
return result
546+
return parser.text.strip() + '\n\n' + '\n'.join([f"[{index}]: {url}" for index, url in parser.ref.items()])
567547
568548
def lambda_handler(event, context):
569549
feed_url = os.environ['FEED_URL']
@@ -662,7 +642,7 @@ Resources:
662642
Variables:
663643
BUCKET_NAME: !Ref DestinationBucket
664644
BUCKET_PATH: "aws-feeds/aws-feeds-security-bulletin"
665-
FEED_URL: " https://aws.amazon.com/security/security-bulletins/rss/feed/"
645+
FEED_URL: "https://aws.amazon.com/security/security-bulletins/rss/feed/"
666646
Metadata:
667647
cfn_nag:
668648
rules_to_suppress:

0 commit comments

Comments
 (0)