diff --git a/guidebook/sync_guidebook.py b/guidebook/sync_guidebook.py index 4118dcf..3979b1f 100755 --- a/guidebook/sync_guidebook.py +++ b/guidebook/sync_guidebook.py @@ -30,9 +30,7 @@ # to update that. # -from datetime import datetime -from dateutil import parser -from markdownify import markdownify as md +from bs4 import BeautifulSoup from datadog_api_client import ApiClient, Configuration from datadog_api_client.v2.api.metrics_api import MetricsApi from datadog_api_client.v2.model.metric_intake_type import MetricIntakeType @@ -40,6 +38,9 @@ from datadog_api_client.v2.model.metric_point import MetricPoint from datadog_api_client.v2.model.metric_resource import MetricResource from datadog_api_client.v2.model.metric_series import MetricSeries +from datetime import datetime +from dateutil import parser +from markdownify import markdownify as md import click import json import logging @@ -201,6 +202,16 @@ def _load_event_json(self, raw): if room != "": self.rooms.add(room) clean_session = {k: v.strip() for k, v in session.items()} + if clean_session["LongAbstract"] != "": + html = BeautifulSoup( + clean_session["LongAbstract"], "html.parser" + ) + # nuke all images from the HTML because Guidebook doesn't + # support them and will escape the tags in a way that makes + # us forever update the sessions as different + for img in html.find_all("img"): + img.decompose() + clean_session["LongAbstract"] = str(html) data_by_name[name] = clean_session data_by_nid[session["nid"]] = clean_session return (data_by_name, data_by_nid) @@ -661,6 +672,7 @@ def normalize_html(self, html): that which gives us a lot of information about formatting without being sensitive to exact HTML. """ + markdown = md(html) # Normalize whitespace and quotes markdown = markdown.replace("\u2018", "'").replace("\u2019", "'") @@ -692,14 +704,14 @@ def session_needs_update(self, new_data, original_session): ] for key in all_keys: if "time" in key: - a = self.normalize_time(new_data[key]) - b = self.normalize_time(original_session[key]) + a = self.normalize_time(original_session[key]) + b = self.normalize_time(new_data[key]) elif "html" in key: - a = self.normalize_html(new_data[key]) - b = self.normalize_html(original_session[key]) + a = self.normalize_html(original_session[key]) + b = self.normalize_html(new_data[key]) else: - a = new_data[key] - b = original_session[key] + a = original_session[key] + b = new_data[key] if a != b: self.logger.info( "Session '%s' needs update because '%s' changed: '%s' !="