Skip to content

Commit 0b18c4a

Browse files
authored
Handle HTML non-semantic differences (#41)
Guidebook (GB) and Drupal produce different HTML, so convert both sides to an intermediary format before comparing; otherwise every sync detects a difference and updates the session. Signed-off-by: Phil Dibowitz <phil@ipom.com>
1 parent e281854 commit 0b18c4a

File tree

2 files changed

+32
-7
lines changed

2 files changed

+32
-7
lines changed

guidebook/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ requests
33
pytz
44
xdg-base-dirs
55
dateutils
6+
markdownify

guidebook/sync_guidebook.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/python3
1+
#!/usr/bin/env python3
22

33
#
44
# Copyright 2018-present Southern California Linux Expo
@@ -32,6 +32,7 @@
3232

3333
from datetime import datetime
3434
from dateutil import parser
35+
from markdownify import markdownify as md
3536
import click
3637
import json
3738
import logging
@@ -500,6 +501,30 @@ def add_session(self, session, original_session=None):
500501
self.sessions_by_nid[session["nid"]] = s
501502
self.sessions_by_name[name] = s
502503

504+
def normalize_html(self, html):
    """
    Reduce an HTML fragment to a canonical text form for comparison.

    The HTML supported by Drupal vs Guidebook is different and
    GB normalizes it upon import, so comparing raw HTML can leave us
    in a state where we always detect a difference.

    Stripping HTML is lossy, so instead we convert to Markdown and
    compare that, which keeps a lot of information about formatting
    without being sensitive to the exact HTML.

    Args:
        html: HTML string to normalize. None or an empty string is
            treated as "no content".

    Returns:
        The Markdown rendering of ``html`` with curly quotes replaced
        by their ASCII equivalents and every run of whitespace
        collapsed to a single space. Returns "" for empty input.
    """
    # Nothing to convert for a missing/empty field; returning early also
    # keeps a None value from ever reaching the Markdown converter.
    if not html:
        return ""
    markdown = md(html)
    # Map curly single/double quotes to plain ASCII quotes in one pass.
    quote_table = str.maketrans("\u2018\u2019\u201c\u201d", "''\"\"")
    markdown = markdown.translate(quote_table)
    # Collapse all whitespace runs (including newlines) to single spaces
    # and trim the ends.
    return " ".join(markdown.split())
521+
522+
def normalize_time(self, time_str):
    """
    Parse a timestamp string and return it as a datetime in UTC.

    A "+0000" offset is rewritten to "+00:00" before the string is
    handed to dateutil's ISO parser, and the parsed value is then
    converted to UTC so two timestamps can be compared directly.

    Args:
        time_str: timestamp string (presumably ISO-8601 with a UTC
            offset -- confirm against the callers).

    Returns:
        The parsed datetime converted to the UTC timezone.
    """
    iso_text = time_str.replace("+0000", "+00:00")
    return parser.isoparse(iso_text).astimezone(pytz.utc)
527+
503528
def session_needs_update(self, new_data, original_session):
504529
"""
505530
Compare the new session data to the original session data, and return
@@ -517,12 +542,11 @@ def session_needs_update(self, new_data, original_session):
517542
]
518543
for key in all_keys:
519544
if "time" in key:
520-
a = new_data[key].replace("+0000", "+00:00")
521-
b = original_session[key].replace("+0000", "+00:00")
522-
a = parser.isoparse(a)
523-
b = parser.isoparse(b)
524-
a = a.astimezone(pytz.utc)
525-
b = b.astimezone(pytz.utc)
545+
a = self.normalize_time(new_data[key])
546+
b = self.normalize_time(original_session[key])
547+
elif "html" in key:
548+
a = self.normalize_html(new_data[key])
549+
b = self.normalize_html(original_session[key])
526550
else:
527551
a = new_data[key]
528552
b = original_session[key]

0 commit comments

Comments
 (0)