Skip to content

Commit 03436d3

Browse files
committed
refactor some of highlight_sender.py
refactored make_format_dict(...) and HighlightSender.send(...)
1 parent 82abcbb commit 03436d3

File tree

1 file changed

+150
-71
lines changed

1 file changed

+150
-71
lines changed

h2o/highlight_sender.py

Lines changed: 150 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -68,39 +68,15 @@ def format_header(dat: Dict[str, str], header_format: str) -> str:
6868
return header_format.format(**dat)
6969

7070

71-
def make_format_dict(data, calibre_library: str, book_titles_authors: Dict[int, Dict[str, str]]) -> Dict:
71+
def make_time_format_dict(data: Dict) -> Dict[str, str]:
7272
"""
73+
7374
:param data: json object of a calibre highlight
74-
:param calibre_library: name of the calibre library, to make a url to the highlight
75-
:param book_titles_authors: dictionary mapping book ids to their titles and authors
76-
:return:
75+
:return: dict containing all time-related formatting options
7776
"""
7877

79-
def format_blockquote(text: str) -> str:
80-
return "> " + text.replace("\n", "\n> ")
81-
8278
annot = data["annotation"]
8379

84-
# format is calibre://view-book/<Library_Name>/<book_id>/<book_format>?open_at=<location>
85-
# for example, calibre://view-book/Calibre_Library/39/EPUB?open_at=epubcfi(/8/2/4/84/1:184)
86-
# todo: right now, opening two different links from the same book opens two different viewer windows,
87-
# make it instead go to the right location in the already-open window
88-
url_format = "calibre://view-book/{library}/{book_id}/{book_format}?open_at=epubcfi({location})"
89-
url_args = {
90-
"library": calibre_library.replace(" ", "_"),
91-
"book_id": data["book_id"],
92-
"book_format": data["format"],
93-
# the algorithm for this, "/{2 * (spine_index + 1)}", is taken from:
94-
# read_book.annotations.AnnotationsManager.cfi_for_highlight(uuid, spine_index)
95-
# https://github.com/kovidgoyal/calibre/blob/master/src/pyj/read_book/annotations.pyj#L249
96-
# i didn't import the algorithm from calibre because it was too inconvenient to figure out how
97-
#
98-
# unfortunately, this doesn't work without the spine index thing. the location is missing a number.
99-
# it should be, for example /8/2/4/84/1:184, but instead, data["start_cfi"] is /2/4/84/1:184.
100-
# the first number in the cfi address has to be manually calculated.
101-
"location": "/" + str((annot["spine_index"] + 1) * 2) + annot["start_cfi"],
102-
}
103-
10480
# calibre's time format example: "2022-09-10T20:32:08.820Z"
10581
# the "Z" at the end means UTC time
10682
# "%Y-%m-%dT%H:%M:%S", take [:19] of the timestamp to remove milliseconds
@@ -109,18 +85,9 @@ def format_blockquote(text: str) -> str:
10985
h_local = h_time + h_time.astimezone(datetime.datetime.now().tzinfo).utcoffset()
11086
local = time.localtime()
11187
utc = time.gmtime()
112-
title_authors = book_titles_authors.get(int(data["book_id"]), {}) # dict with {"title": str, "authors": Tuple[str]}
11388
utc_offset = ("" if local.tm_gmtoff < 0 else "+") + str(local.tm_gmtoff // 3600) + ":00"
11489

115-
# based on https://github.com/jplattel/obsidian-clipper
116-
format_options = {
117-
# if you add a key to this dict, also update the format_options local variable in config.py
118-
"title": title_authors.get("title", "Untitled"), # title of book
119-
# todo: add "chapter" option
120-
"authors": title_authors.get("authors", ("Unknown",)), # authors of book
121-
"highlight": annot["highlighted_text"], # highlighted text
122-
"blockquote": format_blockquote(annot["highlighted_text"]), # block-quoted highlight
123-
"notes": annot["notes"] if "notes" in annot else "", # user's notes on this highlight
90+
time_options = {
12491
"date": str(h_time.date()), # utc date highlight was made
12592
"localdate": str(h_local.date()),
12693
# local date highlight was made. "local" based on send time, not highlight time
@@ -142,16 +109,93 @@ def format_blockquote(text: str) -> str:
142109
"localyear": str(h_local.year),
143110
"utcnow": time.strftime("%Y-%m-%d %H:%M:%S", utc),
144111
"localnow": time.strftime("%Y-%m-%d %H:%M:%S", local),
112+
"timestamp": str(h_time.timestamp()), # Unix timestamp of highlight time. uses UTC.
113+
}
114+
115+
return time_options
116+
117+
118+
def make_highlight_format_dict(data: Dict, calibre_library: str) -> Dict[str, str]:
    """
    Build the highlight-specific formatting options for a single calibre highlight.

    :param data: json object of a calibre highlight. "annotation", "book_id" and "format" are read from it.
    :param calibre_library: name of library book is found in. used for making a url to the highlight.
    :return: dict with the keys "highlight", "blockquote", "notes", "url", "location" and "uuid".
    """
    annotation = data["annotation"]

    # format is calibre://view-book/<Library_Name>/<book_id>/<book_format>?open_at=<location>
    # for example, calibre://view-book/Calibre_Library/39/EPUB?open_at=epubcfi(/8/2/4/84/1:184)
    # todo: right now, opening two different links from the same book opens two different viewer windows,
    #  make it instead go to the right location in the already-open window
    url_template = "calibre://view-book/{library}/{book_id}/{book_format}?open_at=epubcfi({location})"

    library = calibre_library.replace(" ", "_")  # spaces in the library name become underscores
    book_id = data["book_id"]
    book_format = data["format"]

    # annotation["start_cfi"] is missing its first number: it is, for example, /2/4/84/1:184 when the
    # full location should be /8/2/4/84/1:184. that first number has to be calculated manually as
    # 2 * (spine_index + 1). the algorithm is taken from
    # read_book.annotations.AnnotationsManager.cfi_for_highlight(uuid, spine_index):
    # https://github.com/kovidgoyal/calibre/blob/master/src/pyj/read_book/annotations.pyj#L249
    # (it isn't imported from calibre because it was too inconvenient to figure out how)
    spine_step = (annotation["spine_index"] + 1) * 2
    location = "/" + str(spine_step) + annotation["start_cfi"]

    text = annotation["highlighted_text"]

    return {
        "highlight": text,  # highlighted text
        "blockquote": "> " + text.replace("\n", "\n> "),  # every line of the highlight prefixed with "> "
        "notes": annotation.get("notes", ""),  # user's notes on this highlight, "" if there are none
        # calibre:// url to open ebook viewer to this highlight
        "url": url_template.format(library=library, book_id=book_id, book_format=book_format, location=location),
        "location": location,  # epub cfi location of this highlight
        "uuid": annotation["uuid"],  # highlight's ID in calibre
    }
161+
162+
163+
def make_book_format_dict(data: Dict, book_titles_authors: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build the book-related formatting options for a single calibre highlight.

    :param data: json object of a calibre highlight. data["book_id"] is converted with int() to look the
        book up, so it must be an int or a string containing one.
    :param book_titles_authors: dictionary mapping book ids to {"title": title, "authors": authors}
    :return: dict containing all book-related formatting options. not every value is a str: "authors" is
        returned as stored in book_titles_authors (default: the tuple ("Unknown",)) and "bookid" is
        data["book_id"] unchanged.
    """
    # dict with {"title": str, "authors": Tuple[str]}. empty if the book id isn't in book_titles_authors,
    # in which case the defaults below are used.
    title_authors = book_titles_authors.get(int(data["book_id"]), {})

    book_options = {
        # if you add a key to this dict, also update the format_options local variable in config.py
        "title": title_authors.get("title", "Untitled"),  # title of book
        # todo: add "chapter" option
        "authors": title_authors.get("authors", ("Unknown",)),  # authors of book
        "bookid": data["book_id"],  # book id exactly as it appears in the highlight data
    }

    return book_options
153181

154182

183+
def make_format_dict(data, calibre_library: str, book_titles_authors: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build the complete set of formatting options for one calibre highlight.

    :param data: json object of a calibre highlight
    :param calibre_library: name of the calibre library, to make a url to the highlight
    :param book_titles_authors: dictionary mapping book ids to {"title": title, "authors": authors}
    :return: one dict combining the time-, highlight- and book-related formatting options. if the same
        key appears in more than one group, the later group wins (time, then highlight, then book).
    """

    # formatting options are based on https://github.com/jplattel/obsidian-clipper
    time_options = make_time_format_dict(data)
    highlight_options = make_highlight_format_dict(data, calibre_library)
    book_options = make_book_format_dict(data, book_titles_authors)

    # dict unpacking gives the same result as `time_options | highlight_options | book_options`
    # (https://peps.python.org/pep-0584/), but also works on python versions older than 3.9
    return {**time_options, **highlight_options, **book_options}
197+
198+
155199
class HighlightSender:
156200

157201
def __init__(self):
@@ -275,30 +319,46 @@ def send(self, condition: Callable[[Any], bool] = lambda x: True):
275319
condition takes a highlight's json object and returns true if that highlight should be sent to obsidian.
276320
"""
277321

278-
highlights = filter(lambda a: a.get("annotation", {}).get("type") == "highlight",
279-
self.annotations_list) # annotations["annotations"])
280-
dats = [] # List[List[obsidian_data, sort_key]]
281-
headers = {} # dict[note_title:str, header:str]
322+
# todo: a lot of the lists used here and in related functions could probably be replaced with tuples
323+
324+
def is_valid_highlight(_dat: Dict):
    """
    decides whether a single calibre annotation should be sent.

    :param _dat: a dict with one calibre annotation's data
    :return: True if this is a valid highlight and should be sent, else False
    """
    _annot = _dat.get("annotation", {})

    # the checks run in order and stop at the first failure, so the user-defined
    # condition is only called for highlights that haven't been removed:
    #   1. annotation must be a highlight, not a bookmark
    #   2. don't try to send highlights that have been removed
    #   3. user-defined condition must be true for this highlight
    return (
        _annot.get("type") == "highlight"
        and not _annot.get("removed")
        and bool(condition(_dat))
    )
289340

290-
dat = make_format_dict(highlight, self.library_name, self.book_titles_authors)
341+
def format_add_highlight(_highlight, _dats, _headers):
    """
    formats one annotation data object as a highlight and records the result in _dats and _headers.

    :param _highlight: a calibre annotation object
    :param _dats: list to be updated in-place. a list [format_data() output, sort_key] will be appended.
    :param _headers: dict to be updated in-place. if this highlight's title isn't in the dict yet,
        a formatted header will be made and stored for that title.
    :return: none
    """
    options = make_format_dict(_highlight, self.library_name, self.book_titles_authors)
    formatted = format_data(options, self.title_format, self.body_format, self.no_notes_format)
    note_title = formatted[0]

    # only make one header per title
    if note_title not in _headers:
        _headers[note_title] = format_header(options, self.header_format)

    sort_key = self.format_sort_key(options)
    _dats.append([formatted, sort_key])
297358

298359
def merge_highlights(data):
299360
"""
300-
returns a dictionary with formatted highlights merged into a single string for each
301-
unique formatted note title found in dats.
361+
merges formatted highlights into a single string for each unique note title found in dats.
302362
303363
This limits the length of merged note contents to 20000 characters. If the length exceeds this, extra
304364
highlights will use a different title, e.g. "The Book", "The Book (1)", etc
@@ -309,14 +369,18 @@ def merge_highlights(data):
309369
:return: list of obsidian_data objects, where each unique title from the input is merged into a
310370
single, sorted item in the output.
311371
"""
312-
# this function has too many nested index lookups, it could use some simplification
313372

314-
books = {} # dict[str, list[list[obsidian_data object, sort_key]]
315-
lengths = {}
316-
# make list of highlights for each note title
317-
for d in data:
318-
format_dat = d[0] # list[title, body]
319-
body_and_sort = [format_dat[1], d[1]] # [note body, sort key]
373+
def add_data_item(_dat, _books, _lengths):
374+
"""
375+
:param _dat: data item: [[title, body], sort_key]
376+
:param _books: dict that will be updated in-place. will have an obsidian_data object and sort key
377+
added to a note title. like _books["title"].append([obsidian_data, sort_key]). automatically handles
378+
cases where "title" is not in _books.
379+
:param _lengths: dict that may be updated in-place, used for tracking cumulative length of highlights
380+
:return: none
381+
"""
382+
format_dat = _dat[0] # list[title, body]
383+
body_and_sort = [format_dat[1], _dat[1]] # [note body, sort key]
320384
base_title = format_dat[0]
321385

322386
# limit each merged highlight to 20000 chars. it could be higher, but we need room for url encoding.
@@ -325,27 +389,34 @@ def merge_highlights(data):
325389
# problem is some detail about how webbrowser.open() is implemented. on my windows 11 laptop, calling
326390
# webbrowser.open("obsidian://" + "a" * 32699) works, but "a" * 32700 will open microsoft edge instead,
327391
# and if the number reaches 32757 it gives an error.
328-
note_title, l = base_title, lengths.get(base_title, False)
329-
if l: # limit size of a note's content to 20 kb.
392+
note_title, l = base_title, _lengths.get(base_title, False)
393+
if l: # start using a different title every 20k characters
330394
splits = l // 20000
331395
if splits > 0:
332396
note_title = base_title + f" ({splits})"
333397

334-
if note_title in books:
335-
books[note_title].append(body_and_sort)
398+
if note_title in _books:
399+
_books[note_title].append(body_and_sort)
336400
else:
337-
books[note_title] = [body_and_sort]
401+
_books[note_title] = [body_and_sort]
338402

339-
if base_title in lengths:
340-
lengths[base_title] += len(body_and_sort[0])
403+
if base_title in _lengths:
404+
_lengths[base_title] += len(body_and_sort[0])
341405
else:
342-
lengths[base_title] = len(body_and_sort[0])
406+
_lengths[base_title] = len(body_and_sort[0])
407+
408+
books = {} # dict[title:str, list[list[obsidian_data object:Dict, sort_key]]
409+
lengths = {} # dict[book title:str, int]
410+
411+
# make list of highlights for each note title
412+
for d in data:
413+
add_data_item(d, books, lengths)
343414

344-
# now, books contains lists of unsorted [note body, sort key] objects
415+
# now, `books` contains lists of unsorted [note body, sort key] objects
345416
ret = []
346417

418+
# sort each book's highlights and then merge them into a single string
347419
for key in books:
348-
# sort each book's highlights and then merge them into a single string
349420
books[key].sort(key=lambda body_sort: body_sort[1])
350421
# header is only included in first of a series of same-book files
351422
# (this happens when there's too much text to send to a single file at once)
@@ -354,6 +425,14 @@ def merge_highlights(data):
354425

355426
return ret
356427

428+
highlights = filter(is_valid_highlight, self.annotations_list) # annotations["annotations"])
429+
dats = [] # List[List[obsidian_data, sort_key]]
430+
headers = {} # dict[note_title:str, header:str]
431+
432+
# make formatted titles, bodies, and headers
433+
for highlight in highlights:
434+
format_add_highlight(highlight, dats, headers)
435+
357436
# todo: sometimes, if obsidian isn't already open, not all highlights get sent
358437
merged = merge_highlights(dats)
359438
for obsidian_dat in merged:

0 commit comments

Comments
 (0)