@@ -111,6 +111,18 @@ Resources:
111111 from dateutil.parser import parse
112112 import boto3
113113
# Registry of RSS feeds this Lambda can ingest, keyed by the short names
# passed in the FEEDS_LIST environment variable (comma-separated).
#   path             - S3 prefix (under BUCKET_NAME) where the feed's
#                      partitioned JSONL output is written
#   feed_url         - RSS endpoint to download
#   default_services - optional seed values for the 'service' field, used by
#                      feeds whose items carry no 'general:products/...'
#                      categories (e.g. the CID feed)
FEEDS_MAP = {
    "aws": {
        "path": "aws-feeds/aws-feeds-whats-new",
        "feed_url": "https://aws.amazon.com/about-aws/whats-new/recent/feed/"
    },
    "aws-cid": {
        "path": "aws-feeds/aws-cid-feeds-whats-new",
        "feed_url": "https://cid.workshops.aws.dev/feed/cloud-intelligence-dashboards.rss",
        "default_services": ["aws-cid"],
    }
}
125+
114126 def clean_html(html_content):
115127 class MyParser(HTMLParser):
116128 def __init__(self):
@@ -136,81 +148,82 @@ Resources:
136148 return parser.text.strip() + '\n\n' + '\n'.join([f"[{index}]: {url}" for index, url in parser.ref.items()])
137149
def lambda_handler(event, context):
    """Download each configured RSS feed, group its items by publication
    date, and upload one JSONL object per date to S3 under
    ``<path>/year=YYYY/month=MM/day=DD/whats_new.jsonl``.

    Environment:
        FEEDS_LIST  comma-separated keys into FEEDS_MAP
        BUCKET_NAME destination S3 bucket

    Returns:
        dict with 'statusCode' and 'body' (API-Gateway-style response).
    """
    feeds_list = os.environ['FEEDS_LIST'].split(',')
    bucket_name = os.environ['BUCKET_NAME']

    try:
        # One client for all feeds (was re-created inside the feed loop).
        s3 = boto3.client('s3')

        for entry in feeds_list:
            feed_url = FEEDS_MAP[entry]['feed_url']
            bucket_path = FEEDS_MAP[entry]['path']
            with urllib.request.urlopen(feed_url, timeout=10) as response:  # nosec
                feed_data = response.read().decode('utf-8')

            # Reject XML that could carry entity-expansion / XInclude payloads
            # before handing it to the parser.
            for marker in ('!ENTITY', ':include'):
                if marker in feed_data:
                    return {
                        'statusCode': 400,
                        'body': f'Malicious content detected in the XML feed: {marker}'
                    }

            root = ET.fromstring(feed_data)  # nosec

            # date_key ('YYYY-MM-DD') -> list of JSON records for that day.
            date_grouped_records = {}

            for item in root.findall('.//item'):
                try:
                    link = item.find('link').text
                    title = item.find('title').text
                    description = item.find('description').text or ''
                    pubDate = item.find('pubDate').text
                    category = item.find('category').text or ''

                    # Normalise pubDate to ISO 8601 with a 'Z' suffix.
                    formatted_date = parse(pubDate).strftime('%Y-%m-%dT%H:%M:%SZ')
                    date_key = formatted_date[:10]  # YYYY-MM-DD
                    description_cleaned = clean_html(description)

                    categories = category.split(',')
                    # BUG FIX: copy the default list. The original bound
                    # `services` directly to FEEDS_MAP[entry]['default_services']
                    # and then append()ed to it, mutating the shared module
                    # constant — services leaked across items and across warm
                    # Lambda invocations.
                    services = list(FEEDS_MAP[entry].get('default_services', []))
                    category_values = []

                    for cat in categories:
                        if cat.startswith('general:products/'):
                            services.append(cat.replace('general:products/', ''))
                        elif cat.startswith('marketing:marchitecture/'):
                            category_values.append(cat.replace('marketing:marchitecture/', ''))
                    # BUG FIX: the original reset category_values to the raw
                    # category list from an else-branch *inside* the loop,
                    # clobbering marchitecture values already collected
                    # whenever any single unrecognised category appeared.
                    # Fall back to the raw categories only when none matched.
                    if not category_values:
                        category_values = categories

                    for service in services:
                        for category_value in category_values:
                            json_record = {
                                'link': link,
                                'title': title,
                                'description': description_cleaned,
                                'date': formatted_date,
                                'service': service,
                                'category': category_value
                            }
                            date_grouped_records.setdefault(date_key, []).append(json_record)
                except Exception as e:
                    # Best effort: one malformed <item> must not abort the feed.
                    print(f"Error processing item: {ET.tostring(item, encoding='unicode')}. Exception: {str(e)}")

            for date_key, records in date_grouped_records.items():
                year, month, day = date_key.split('-')
                json_lines = '\n'.join(json.dumps(record) for record in records)
                s3_key = f'{bucket_path}/year={year}/month={month}/day={day}/whats_new.jsonl'
                s3.put_object(Body=json_lines, Bucket=bucket_name, Key=s3_key)

        return {
            'statusCode': 200,
            'body': f'Feed downloaded and grouped by date then uploaded to S3 bucket {bucket_name}'
        }

    except urllib.error.URLError as e:
        # NOTE(review): the diff hunk cuts off here (new lines 230-243 are
        # not visible); the URLError message below and any intermediate
        # except clauses must be reconciled against the full file.
        return {
            'statusCode': 500,
            'body': f'Error downloading feed: {str(e)}'
        }
    except Exception as e:
        return {
            'statusCode': 500,
            'body': f'Error processing feed: {str(e)}'
        }
247+
234248 Handler : ' index.lambda_handler'
235249 MemorySize : 256
236250 Timeout : 60
237251 Role : !GetAtt LambdaRole.Arn
238252 Environment :
239253 Variables :
240254 BUCKET_NAME : !Ref DestinationBucket
241- BUCKET_PATH : " aws-feeds/aws-feeds-whats-new"
242- FEED_URL : " https://aws.amazon.com/about-aws/whats-new/recent/feed/"
255+ FEEDS_LIST : " aws,aws-cid"
243256 Metadata :
244257 cfn_nag :
245258 rules_to_suppress :
@@ -702,6 +715,7 @@ Resources:
702715 Targets :
703716 S3Targets :
704717 - Path : !Sub "s3://${DestinationBucket}/aws-feeds/aws-feeds-whats-new/"
718+ - Path : !Sub "s3://${DestinationBucket}/aws-feeds/aws-cid-feeds-whats-new/"
705719 Configuration : " {\" Version\" :1.0,\" CrawlerOutput\" :{\" Partitions\" :{\" AddOrUpdateBehavior\" :\" InheritFromTable\" }}}"
706720
707721 CrawlerBlogPost :
0 commit comments