Skip to content

Commit e7dc052

Browse files
author
Sven Siegmund
committed
HTML de-encapsulation works
1 parent b9c2fdd commit e7dc052

File tree

4 files changed

+79
-11
lines changed

4 files changed

+79
-11
lines changed

src/rtfparse/entities.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,15 @@ def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> Non
178178
self.structure.append(Control_Symbol(self.config, file))
179179
else:
180180
self.structure.append(Plain_Text(self.config, file))
181+
# name the group like its first Control Word
182+
# this way the renderer will be able to ignore entire groups
183+
try:
184+
if isinstance(self.structure[0], Control_Word):
185+
self.name = self.structure[0].control_name
186+
except IndexError:
187+
pass
181188
def __repr__(self) -> str:
182-
cwinfo = ""
183-
if isinstance(self.structure[0], Control_Word):
184-
cwinfo = f" {self.structure[0].control_name}"
185-
return f"<Group{cwinfo}>"
189+
return f"<Group {self.name}>"
186190

187191

188192
if __name__ == "__main__":

src/rtfparse/entry.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ def run(config: config_loader.Config) -> None:
6060
import extract_msg as em
6161
import compressed_rtf as cr
6262
msg = em.openMsg(f"{config.cli_args.msg}")
63+
for attachment in msg.attachments:
64+
with open(config.html / f"{attachment.longFilename}", mode="wb") as att_file:
65+
att_file.write(attachment.data)
6366
decompressed_rtf = cr.decompress(msg.compressedRtf)
6467
with open((config.email_rtf / config.cli_args.msg.name).with_suffix(".rtf"), mode="wb") as email_rtf:
6568
email_rtf.write(decompressed_rtf)
@@ -68,7 +71,7 @@ def run(config: config_loader.Config) -> None:
6871
rp.parse_file(config, rtf_file)
6972
from rtfparse.renderers import encapsulated_html
7073
renderer = encapsulated_html.Encapsulated_HTML()
71-
with open((config.html / config.cli_args.msg.name).with_suffix(".html"), mode="w") as htmlfile:
74+
with open((config.html / config.cli_args.msg.name).with_suffix(".html"), mode="w", encoding="utf-8") as htmlfile:
7275
logger.info(f"Rendering the encapsulated HTML")
7376
renderer.render(rp.parsed, htmlfile)
7477
logger.info(f"Encapsulated HTML rendered")

src/rtfparse/renderers/encapsulated_html.py

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,81 @@
1414

1515

1616
class Encapsulated_HTML(Renderer):
17-
def __init__(self) -> None:
17+
def __init__(self, ) -> None:
1818
super().__init__()
19-
self.render_func = dict((
20-
("par", lambda x: "\n"),
19+
self.ignore_rtf = False
20+
self.render_word_func = dict((
21+
("par", self.newline),
22+
("line", self.newline),
23+
("tab", self.tab),
24+
("fromhtml", self.check_fromhtml),
25+
("htmlrtf", self.ignore_rtf_toggle),
2126
))
27+
self.ignore_groups = (
28+
"fonttbl",
29+
"colortbl",
30+
)
31+
def ignore_rtf_toggle(self, cw: entities.Control_Word) -> str:
    """Update the RTF-suppression flag from an ``htmlrtf`` control word.

    A parameter of 0 clears ``self.ignore_rtf``; an absent parameter
    (empty string) or a parameter of 1 sets it. Any other parameter
    leaves the flag as it was.

    Returns:
        Always the empty string, so the control word itself renders nothing.
    """
    param = cw.parameter
    if param == 0:
        self.ignore_rtf = False
    elif param in ("", 1):
        self.ignore_rtf = True
    return ""
37+
def check_fromhtml(self, cw: entities.Control_Word) -> str:
    """Log whether a ``fromhtml`` control word carries the parameter 1.

    A parameter equal to 1 is reported at info level. Any other value
    produces two warnings saying this may be the wrong renderer.

    Returns:
        Always the empty string, so the control word itself renders nothing.
    """
    generated_from_html = cw.parameter == 1
    if not generated_from_html:
        logger.warning(utils.warn("Encountered a part of RTF which was not generated from HTML"))
        logger.warning(utils.warn("This might not be the right renderer for it."))
        return ""
    logger.info("Confirming that RTF was indeed generated from HTML")
    return ""
44+
def newline(self, cw: entities.Control_Word) -> str:
    """Return a line feed, or an empty string while ``self.ignore_rtf`` is set.

    The control word ``cw`` is accepted for the dispatch-table signature
    but is not inspected.
    """
    return "" if self.ignore_rtf else "\n"
49+
def tab(self, cw: entities.Control_Word) -> str:
    """Return a tab character, or an empty string while ``self.ignore_rtf`` is set.

    The control word ``cw`` is accepted for the dispatch-table signature
    but is not inspected.
    """
    return "" if self.ignore_rtf else "\t"
54+
def render_symbol(self, item: entities.Control_Symbol, file: io.TextIOWrapper) -> None:
    """Write the rendering of a single control symbol to ``file``.

    Nothing is written while ``self.ignore_rtf`` is set. Otherwise the
    symbol's ``text`` decides the output:

    * ``|`` (obsolete formula character used by Word 5.1 for Macintosh),
      ``-`` (optional hyphen) and ``:`` (subentry in an index entry)
      produce no output.
    * ``~`` becomes a non-breaking space, ``_`` a non-breaking hyphen.
    * ``*`` (ignorable destination marker found outside a group start)
      only triggers a warning.
    * Any other text, probably a character converted from a ``\\'hh``
      hex code, is written unchanged.
    """
    if self.ignore_rtf:
        return

    symbol = item.text
    silently_dropped = ("|", "-", ":")
    substitutes = {
        "~": "\u00a0",  # non-breaking space
        "_": "\u2011",  # non-breaking hyphen
    }

    if symbol in silently_dropped:
        return
    if symbol == "*":
        logger.warning(utils.warn("Found an IGNORABLE control symbol which is not a group start!"))
        return
    # Fall back to the symbol's own text when no substitute is defined.
    file.write(substitutes.get(symbol, symbol))
2277
def render(self, parsed: entities.Group, file: io.TextIOWrapper) -> None:
2378
for item in parsed.structure:
2479
if isinstance(item, entities.Group):
25-
self.render(item, file)
80+
if item.name not in self.ignore_groups:
81+
self.render(item, file)
2682
elif isinstance(item, entities.Control_Word):
2783
try:
28-
file.write(self.render_func[item.control_name](item))
84+
file.write(self.render_word_func[item.control_name](item))
2985
except KeyError:
3086
pass
87+
elif isinstance(item, entities.Control_Symbol):
88+
self.render_symbol(item, file)
89+
elif isinstance(item, entities.Plain_Text):
90+
if not self.ignore_rtf:
91+
file.write(item.text)
3192
else:
3293
pass
3394

src/rtfparse/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#!/usr/bin/env python
22

33

4-
version = "0.5.1"
4+
version = "0.6.1"

0 commit comments

Comments
 (0)