Commit cfc59d1

addition of the CID RSS Feed in the data collection (#337)

1 parent a0ea7d7 commit cfc59d1

data-collection/deploy/module-aws-feeds.yaml
1 file changed: 80 additions & 66 deletions
@@ -111,6 +111,18 @@ Resources:
           from dateutil.parser import parse
           import boto3

+          FEEDS_MAP = {
+              "aws": {
+                  "path": "aws-feeds/aws-feeds-whats-new",
+                  "feed_url": "https://aws.amazon.com/about-aws/whats-new/recent/feed/"
+              },
+              "aws-cid": {
+                  "path": "aws-feeds/aws-cid-feeds-whats-new",
+                  "feed_url": "https://cid.workshops.aws.dev/feed/cloud-intelligence-dashboards.rss",
+                  "default_services": ["aws-cid"],
+              }
+          }
+
           def clean_html(html_content):
               class MyParser(HTMLParser):
                   def __init__(self):
@@ -136,81 +148,82 @@ Resources:
               return parser.text.strip() + '\n\n' + '\n'.join([f"[{index}]: {url}" for index, url in parser.ref.items()])

           def lambda_handler(event, context):
-              feed_url = os.environ['FEED_URL']
+              feeds_list = os.environ['FEEDS_LIST'].split(',')
               bucket_name = os.environ['BUCKET_NAME']
-              bucket_path = os.environ.get('BUCKET_PATH', '')

               try:
-                  with urllib.request.urlopen(feed_url, timeout=10) as response: # nosec
-                      feed_data = response.read().decode('utf-8')
-
-                  malicious_strings = ['!ENTITY', ':include']
-                  for string in malicious_strings:
-                      if string in feed_data:
-                          return {
-                              'statusCode': 400,
-                              'body': f'Malicious content detected in the XML feed: {string}'
-                          }
-
-                  s3 = boto3.client('s3')
-                  root = ET.fromstring(feed_data) # nosec
-
-                  date_grouped_records = {}
-
-                  for item in root.findall('.//item'):
-                      try:
-                          link = item.find('link').text
-                          title = item.find('title').text
-                          description = item.find('description').text or ''
-                          pubDate = item.find('pubDate').text
-                          category = item.find('category').text or ''
+                  for entry in feeds_list:
+                      feed_url = FEEDS_MAP[entry]['feed_url']
+                      bucket_path = FEEDS_MAP[entry]['path']
+                      with urllib.request.urlopen(feed_url, timeout=10) as response: # nosec
+                          feed_data = response.read().decode('utf-8')
+
+                      malicious_strings = ['!ENTITY', ':include']
+                      for string in malicious_strings:
+                          if string in feed_data:
+                              return {
+                                  'statusCode': 400,
+                                  'body': f'Malicious content detected in the XML feed: {string}'
+                              }

-                          # Parsing and formatting pubDate to ISO 8601 format
-                          pubDate_datetime = parse(pubDate)
-                          formatted_date = pubDate_datetime.strftime('%Y-%m-%dT%H:%M:%SZ')
+                      s3 = boto3.client('s3')
+                      root = ET.fromstring(feed_data) # nosec

-                          year, month, day = formatted_date[:10].split('-')
-                          date_key = f"{year}-{month}-{day}"
-                          description_cleaned = clean_html(description)
+                      date_grouped_records = {}

-                          categories = category.split(',')
-                          services = []
-                          category_values = []
-
-                          for cat in categories:
-                              if cat.startswith('general:products/'):
-                                  services.append(cat.replace('general:products/', ''))
-                              elif cat.startswith('marketing:marchitecture/'):
-                                  category_values.append(cat.replace('marketing:marchitecture/', ''))
-
-                          for service in services:
-                              for category_value in category_values:
-                                  json_record = {
-                                      'link': link,
-                                      'title': title,
-                                      'description': description_cleaned,
-                                      'date': formatted_date,
-                                      'service': service,
-                                      'category': category_value
-                                  }
-                                  if date_key not in date_grouped_records:
-                                      date_grouped_records[date_key] = []
-                                  date_grouped_records[date_key].append(json_record)
-
-                      except Exception as e:
-                          print(f"Error processing item: {ET.tostring(item, encoding='unicode')}. Exception: {str(e)}")
+                      for item in root.findall('.//item'):
+                          try:
+                              link = item.find('link').text
+                              title = item.find('title').text
+                              description = item.find('description').text or ''
+                              pubDate = item.find('pubDate').text
+                              category = item.find('category').text or ''
+                              # Parsing and formatting pubDate to ISO 8601 format
+                              pubDate_datetime = parse(pubDate)
+                              formatted_date = pubDate_datetime.strftime('%Y-%m-%dT%H:%M:%SZ')

-                  for date_key, records in date_grouped_records.items():
-                      year, month, day = date_key.split('-')
-                      json_lines = '\n'.join(json.dumps(record) for record in records)
-                      s3_key = f'{bucket_path}/year={year}/month={month}/day={day}/whats_new.jsonl'
-                      s3.put_object(Body=json_lines, Bucket=bucket_name, Key=s3_key)
+                              year, month, day = formatted_date[:10].split('-')
+                              date_key = f"{year}-{month}-{day}"
+                              description_cleaned = clean_html(description)
+
+                              categories = category.split(',')
+                              services = FEEDS_MAP[entry].get('default_services', [])
+                              category_values = []
+
+                              for cat in categories:
+                                  if cat.startswith('general:products/'):
+                                      services.append(cat.replace('general:products/', ''))
+                                  elif cat.startswith('marketing:marchitecture/'):
+                                      category_values.append(cat.replace('marketing:marchitecture/', ''))
+                                  else:
+                                      category_values = categories
+                              for service in services:
+                                  for category_value in category_values:
+                                      json_record = {
+                                          'link': link,
+                                          'title': title,
+                                          'description': description_cleaned,
+                                          'date': formatted_date,
+                                          'service': service,
+                                          'category': category_value
+                                      }
+                                      if date_key not in date_grouped_records:
+                                          date_grouped_records[date_key] = []
+                                      date_grouped_records[date_key].append(json_record)
+                          except Exception as e:
+                              print(f"Error processing item: {ET.tostring(item, encoding='unicode')}. Exception: {str(e)}")
+
+                      for date_key, records in date_grouped_records.items():
+                          year, month, day = date_key.split('-')
+                          json_lines = '\n'.join(json.dumps(record) for record in records)
+                          s3_key = f'{bucket_path}/year={year}/month={month}/day={day}/whats_new.jsonl'
+                          s3.put_object(Body=json_lines, Bucket=bucket_name, Key=s3_key)

                   return {
                       'statusCode': 200,
                       'body': f'Feed downloaded and grouped by date then uploaded to S3 bucket {bucket_name}'
                   }
-
+
               except urllib.error.URLError as e:
                   return {
                       'statusCode': 500,
@@ -231,15 +244,15 @@ Resources:
                       'statusCode': 500,
                       'body': f'Error processing feed: {str(e)}'
                   }
+
       Handler: 'index.lambda_handler'
       MemorySize: 256
       Timeout: 60
       Role: !GetAtt LambdaRole.Arn
       Environment:
         Variables:
           BUCKET_NAME: !Ref DestinationBucket
-          BUCKET_PATH: "aws-feeds/aws-feeds-whats-new"
-          FEED_URL: "https://aws.amazon.com/about-aws/whats-new/recent/feed/"
+          FEEDS_LIST: "aws,aws-cid"
     Metadata:
       cfn_nag:
         rules_to_suppress:
@@ -702,6 +715,7 @@ Resources:
       Targets:
         S3Targets:
           - Path: !Sub "s3://${DestinationBucket}/aws-feeds/aws-feeds-whats-new/"
+          - Path: !Sub "s3://${DestinationBucket}/aws-feeds/aws-cid-feeds-whats-new/"
       Configuration: "{\"Version\":1.0,\"CrawlerOutput\":{\"Partitions\":{\"AddOrUpdateBehavior\":\"InheritFromTable\"}}}"

   CrawlerBlogPost:
@@ -918,4 +932,4 @@ Resources:
     Type: Custom::LambdaAnalyticsExecutor
     Properties:
       ServiceToken: !Ref LambdaAnalyticsARN
-      Name: !Ref CFDataName
+      Name: !Ref CFDataName
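
For a quick check outside CloudFormation, the reworked handler can be exercised locally. The sketch below is illustrative only: it assumes the inline Lambda code from the ZipFile block above has been saved as index.py, that my-test-bucket is a hypothetical bucket the caller can write to, and that boto3 can find AWS credentials.

    # Local smoke test for the multi-feed handler (a sketch, not part of the commit).
    import os

    os.environ["FEEDS_LIST"] = "aws,aws-cid"      # same value the template now sets
    os.environ["BUCKET_NAME"] = "my-test-bucket"  # hypothetical bucket name

    import index  # the Lambda module saved from the ZipFile block

    result = index.lambda_handler({}, None)
    print(result)  # expect statusCode 200 after both feeds are written to S3

Each entry in FEEDS_LIST resolves to its own path in FEEDS_MAP, so the aws-cid feed lands under aws-feeds/aws-cid-feeds-whats-new/, matching the Glue crawler target added above.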
