Skip to content

Commit b9c2fdd

Browse files
author
Sven Siegmund
committed
Added default encoding to config, encapsulated HTML parser started
1 parent dfe766b commit b9c2fdd

File tree

13 files changed

+174
-134
lines changed

13 files changed

+174
-134
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ If you find a working official Microsoft link to the RTF specification and add i
1212

1313
* [Swissmains Link to RTF Spec 1.9.1](https://manuals.swissmains.com/pages/viewpage.action?pageId=1376332&preview=%2F1376332%2F10620104%2FWord2007RTFSpec9.pdf)
1414
* [Webarchive Link to RTF Spec 1.9.1](https://web.archive.org/web/20190708132914/http://www.kleinlercher.at/tools/Windows_Protocols/Word2007RTFSpec9.pdf)
15+
* [RTF Extensions, MS-OXRTFEX](https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxrtfex/411d0d58-49f7-496c-b8c3-5859b045f6cf)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
argcomplete
2+
extract-msg

src/rtfparse/bytestring_types.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/rtfparse/config_loader.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,22 @@ def __init__(self, cfg_path: pathlib.Path, autoconfig: bool) -> None:
3838
self.path_to_config_file = cfg_path
3939
self.path_to_home = utils.provide_dir(self.path_to_config_file.parent)
4040
self.path_to_pyrtfparse_home = pathlib.Path.home() / utils.home_dir_name
41-
self._subdir_dir = Preconfigured_Path(
42-
internal_name="subdir_dir",
43-
path=self.path_to_pyrtfparse_home / "subdir",
44-
comment="some subdir",
41+
self._email_rtf = Preconfigured_Path(
42+
internal_name="email_rtf",
43+
path=self.path_to_pyrtfparse_home / "email_rtf",
44+
comment="Directory for RTF extracted from MS Outlook emails",
4545
)
46+
self._html = Preconfigured_Path(
47+
internal_name="html",
48+
path=self.path_to_pyrtfparse_home / "html",
49+
comment="Directory for HTML extracted from the email",
50+
)
51+
self.check_paths = (self._email_rtf,
52+
self._html,
53+
)
4654
self._wizard_has_run = False
4755
self.autoconfig = autoconfig
4856
self.read_config_file()
49-
self.check_paths = (self._subdir_dir,
50-
)
5157
self.integrity_check()
5258
def __enter__(self):
5359
return self
@@ -75,7 +81,7 @@ def integrity_check(self) -> None:
7581
for preconf_path in self.check_paths:
7682
path_to_check = preconf_path.path
7783
assert path_to_check.exists()
78-
except AssertionError as e:
84+
except AssertionError as err:
7985
logger.debug(f"Path not found, starting wizard")
8086
self.wizard(errors.WrongConfiguration(f"{self.path_to_config_file.name}: '{str(path_to_check)}', path does not exist!", preconf_path), autoconfig=self.autoconfig)
8187
def create_config_file(self) -> None:
@@ -89,11 +95,15 @@ def create_config_file(self) -> None:
8995
self.config_parser.set("Paths", "# or a '\\' at the end of the final directory of a Windows path")
9096
self.config_parser.set("Paths", "# does not interfere with the path parser.")
9197
self.config_parser.set("Paths", "")
92-
for preconf_path in (
93-
self._subdir_dir,
94-
):
98+
for preconf_path in self.check_paths:
9599
self.config_parser.set("Paths", f"# {preconf_path.comment[0].capitalize()}{preconf_path.comment[1:]}")
96100
self.config_parser.set("Paths", f"{preconf_path.internal_name}", f"{preconf_path.path}")
101+
self.config_parser.add_section("Encoding")
102+
self.config_parser.set("Encoding", "# Set default expected RTF encoding here")
103+
self.config_parser.set("Encoding", "# RTF usually uses ANSI by default, but that's not specific enough.")
104+
self.config_parser.set("Encoding", "# This specifies what exactly we mean by ANSI, e.g. cp1252 (Western Europe)")
105+
self.config_parser.set("Encoding", "")
106+
self.config_parser.set("Encoding", "default", "cp1252")
97107
with open(self.path_to_config_file, mode="w", encoding="utf-8") as configfh:
98108
self.config_parser.write(configfh)
99109
self.read_config_file()
@@ -121,7 +131,9 @@ def parse(self) -> None:
121131
Parses the configuration files into usable attributes
122132
"""
123133
try:
124-
self.subdir_dir = self.getpath("Paths", "subdir_dir")
134+
self.email_rtf = self.getpath("Paths", "email_rtf")
135+
self.html = self.getpath("Paths", "html")
136+
self.default_encoding = self.config_parser.get("Encoding", "default")
125137
except ValueError:
126138
exc_type, exc_value, exc_traceback = sys.exc_info()
127139
lines = traceback.format_exc().splitlines()

src/rtfparse/entities.py

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
import io
55
import logging
66
import re
7-
from itertools import count
7+
# Own modules
88
from rtfparse import re_patterns
99
from rtfparse import utils
1010
from rtfparse import errors
11+
from rtfparse import config_loader
1112
from rtfparse.enums import Bytestring_Type
1213

1314

@@ -19,15 +20,17 @@
1920
CHARACTER = BACKSLASH = DELIMITER = MINUS = GROUP_END = len(b"\\")
2021
SYMBOL = IGNORABLE = BACKSLASH + CHARACTER
2122
GROUP_START = BACKSLASH + IGNORABLE
22-
MAX_CW_LETTERS = 32
23-
INTEGER_MAGNITUDE = 32
23+
MAX_CW_LETTERS = 32 # As specified in RTF Spec
24+
INTEGER_MAGNITUDE = 32 # As specified in RTF Spec
2425
PLAIN_TEXT = CONTROL_WORD = BACKSLASH + MAX_CW_LETTERS + MINUS + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
2526

2627

2728
class Entity:
29+
def __init__(self) -> None:
30+
self.text = ""
2831
@classmethod
2932
def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type:
30-
logger.debug(f"in Entity.probed")
33+
logger.debug(f"Probing file at position {file.tell()}")
3134
original_position = file.tell()
3235
while True:
3336
probed = file.read(len(re_patterns.probe_pattern))
@@ -54,84 +57,101 @@ def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Byt
5457
raise errors.UnexpectedEndOfFileError(f"at position {file.tell()}")
5558
continue
5659
break
57-
logger.debug(f"{result = }")
60+
logger.debug(f"Probe {result = }")
5861
logger.debug(f"Probe leaving file at position {file.tell()}")
5962
return result
6063

6164

6265
class Control_Word(Entity):
63-
def __init__(self, file: io.BufferedReader) -> None:
64-
logger.debug(f"Control_Word.__init__")
66+
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
67+
super().__init__()
68+
self.config = config
69+
logger.debug(f"Reading Control Word at file position {file.tell()}")
6570
self.control_name = "missing"
6671
self.parameter = ""
6772
self.start_position = file.tell()
6873
logger.debug(f"Starting at file position {self.start_position}")
6974
probe = file.read(CONTROL_WORD)
7075
if (match := re_patterns.control_word.match(probe)):
71-
self.control_name = match.group("control_name").decode("ascii")
72-
logger.debug(f"{self.control_name = }")
76+
self.control_name = match.group("control_name").decode(self.config.default_encoding)
77+
logger.debug(f"Preliminary {self.control_name = }")
7378
parameter = match.group("parameter")
74-
if parameter:
75-
self.parameter = int(parameter.decode("ascii"))
79+
if parameter is not None:
80+
self.parameter = int(parameter.decode(self.config.default_encoding))
7681
logger.debug(f"{self.parameter = }")
82+
self.control_name = self.control_name.removesuffix(str(self.parameter))
83+
logger.debug(f"Final {self.control_name = }")
7784
target_position = self.start_position + match.span()[1]
7885
if match.group("other"):
79-
logger.debug(f"Delimiter is {match.group('other').decode('ascii')}, len: {len(match.group('delimiter'))}")
86+
logger.debug(f"Delimiter is {match.group('other').decode(self.config.default_encoding)}, len: {len(match.group('delimiter'))}")
8087
target_position -= len(match.group("delimiter"))
8188
file.seek(target_position)
8289
else:
8390
logger.warning(f"Missing Control Word")
8491
file.seek(self.start_position)
8592
def __repr__(self) -> str:
86-
name = self.control_name
87-
return f"<{self.__class__.__name__}: {name}{self.parameter}>"
93+
return f"<{self.__class__.__name__}: {self.control_name}{self.parameter}>"
8894

8995

9096
class Control_Symbol(Entity):
91-
def __init__(self, file: io.BufferedReader) -> None:
97+
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
98+
super().__init__()
99+
self.config = config
92100
self.start_position = file.tell()
93-
logger.debug(f"Starting at file position {self.start_position}")
94-
self.text = file.read(SYMBOL)[-1].decode("ascii")
101+
logger.debug(f"Reading Symbol at file position {self.start_position}")
102+
self.char = ""
103+
self.text = chr(file.read(SYMBOL)[-1])
104+
if self.text == "'":
105+
self.char = file.read(SYMBOL).decode(self.config.default_encoding)
106+
self.text = bytes((int(self.char, base=16), )).decode(self.config.default_encoding)
107+
logger.debug(f"Encountered escaped ANSI character, read two more bytes: {self.char}, character: {self.text}")
108+
if self.text in "\\{}":
109+
file.seek(file.tell() - SYMBOL)
95110
def __repr__(self) -> str:
96111
return f"<{self.__class__.__name__}: {self.text}>"
97112

98113

99114
class Plain_Text(Entity):
100-
def __init__(self, file: io.BufferedReader) -> None:
101-
self.start_position = file.tell()
102-
logger.debug(f"Starting at file position {self.start_position}")
115+
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
116+
super().__init__()
117+
self.config = config
103118
self.text = ""
119+
logger.debug(f"Constructing Plain_Text")
104120
while True:
121+
self.start_position = file.tell()
105122
read = file.read(PLAIN_TEXT)
106-
logger.debug(f"Read file up to position {file.tell()}")
107-
logger.debug(f"Read: {read}")
123+
logger.debug(f"Read file from {self.start_position} to position {file.tell()}, read: {read}")
108124
# see if we have read all the plain text there is:
109125
if (match := re_patterns.plain_text.match(read)):
110126
logger.debug(f"This matches the plain text pattern")
111-
_text = match.group("text").decode("ascii")
127+
_text = match.group("text").decode(self.config.default_encoding)
112128
logger.debug(f"{_text = }")
113129
self.text = "".join((self.text, _text))
114130
logger.debug(f"{self.text = }")
115131
if len(_text) == PLAIN_TEXT:
116132
continue
117133
else:
118-
file.seek(self.start_position + len(self.text))
119-
logger.debug(f"Returned to position {file.tell()}")
134+
file.seek(self.start_position + len(_text))
120135
break
121136
else:
137+
file.seek(self.start_position)
122138
break
139+
logger.debug(f"Returned to position {file.tell()}")
123140
def __repr__(self) -> str:
124141
return f"<{self.__class__.__name__}: {self.text}>"
125142

126143

127-
class Destination_Group(Entity):
128-
def __init__(self, file: io.BufferedReader) -> None:
129-
logger.debug(f"Destination_Group.__init__")
144+
class Group(Entity):
145+
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
146+
super().__init__()
147+
logger.debug(f"Group.__init__")
148+
self.config = config
130149
self.known = False
131150
self.name = "unknown"
132151
self.ignorable = False
133152
self.structure = list()
134-
logger.debug(f"Creating destination group from {file.name}")
153+
parsed_object = utils.what_is_being_parsed(file)
154+
logger.debug(f"Creating destination group from {parsed_object}")
135155
self.start_position = file.tell()
136156
logger.debug(f"Starting at file position {self.start_position}")
137157
probe = file.read(GROUP_START)
@@ -145,23 +165,24 @@ def __init__(self, file: io.BufferedReader) -> None:
145165
else:
146166
logger.warning(utils.warn(f"Expected a group but found no group start. Creating unknown group"))
147167
file.seek(self.start_position)
148-
self.cw = Control_Word(file)
149-
self.name = self.cw.control_name
150168
while True:
151169
probed = self.probe(re_patterns.probe, file)
152170
if probed is Bytestring_Type.CONTROL_WORD:
153-
self.structure.append(Control_Word(file))
171+
self.structure.append(Control_Word(self.config, file))
154172
elif probed is Bytestring_Type.GROUP_END:
155173
file.read(GROUP_END)
156174
break
157175
elif probed is Bytestring_Type.GROUP_START:
158-
self.structure.append(Destination_Group(file))
176+
self.structure.append(Group(self.config, file))
159177
elif probed is Bytestring_Type.CONTROL_SYMBOL:
160-
self.structure.append(Control_Symbol(file))
178+
self.structure.append(Control_Symbol(self.config, file))
161179
else:
162-
self.structure.append(Plain_Text(file))
180+
self.structure.append(Plain_Text(self.config, file))
163181
def __repr__(self) -> str:
164-
return f"<{self.__class__.__name__}: {self.cw.control_name}{self.cw.parameter}>"
182+
cwinfo = ""
183+
if isinstance(self.structure[0], Control_Word):
184+
cwinfo = f" {self.structure[0].control_name}"
185+
return f"<Group{cwinfo}>"
165186

166187

167188
if __name__ == "__main__":

src/rtfparse/entry.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import logging.config
88
import argparse
99
import argcomplete
10+
import io
1011
from argcomplete.completers import EnvironCompleter as EC
1112
from itertools import filterfalse
1213
# Own modules
@@ -46,14 +47,32 @@ def argument_parser() -> argparse.ArgumentParser:
4647
parser.add_argument("-v", "--version", action="store_true", help="print out rtfparse version and exit").completer = EC
4748
parser.add_argument("--autoconfig", action="store_true", help="Configure rtfparse automatically").completer = EC
4849
parser.add_argument("-f", "--file", action="store", metavar="PATH", type=pathlib.Path, help="path to the rtf file").completer = EC
50+
parser.add_argument("-m", "--msg", action="store", metavar="PATH", type=pathlib.Path, help="Parse RTF from MS Outlook's .msg file").completer = EC
4951
return parser
5052

5153

5254
def run(config: config_loader.Config) -> None:
5355
if config.cli_args.file and config.cli_args.file.exists():
5456
rp = Rtf_Parser()
5557
with open(config.cli_args.file, mode="rb") as rtf_file:
56-
rp.parse_file(rtf_file)
58+
rp.parse_file(config, rtf_file)
59+
elif config.cli_args.msg:
60+
import extract_msg as em
61+
import compressed_rtf as cr
62+
msg = em.openMsg(f"{config.cli_args.msg}")
63+
decompressed_rtf = cr.decompress(msg.compressedRtf)
64+
with open((config.email_rtf / config.cli_args.msg.name).with_suffix(".rtf"), mode="wb") as email_rtf:
65+
email_rtf.write(decompressed_rtf)
66+
with io.BytesIO(decompressed_rtf) as rtf_file:
67+
rp = Rtf_Parser()
68+
rp.parse_file(config, rtf_file)
69+
from rtfparse.renderers import encapsulated_html
70+
renderer = encapsulated_html.Encapsulated_HTML()
71+
with open((config.html / config.cli_args.msg.name).with_suffix(".html"), mode="w") as htmlfile:
72+
logger.info(f"Rendering the encapsulated HTML")
73+
renderer.render(rp.parsed, htmlfile)
74+
logger.info(f"Encapsulated HTML rendered")
75+
5776

5877

5978
def cli_start(version) -> None:

src/rtfparse/logging_conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def create_dict_config(directory: pathlib.Path, all_log: str, info_log: str, err
5353

5454
root_console_handler_conf = {
5555
"class": "logging.StreamHandler",
56-
"level": "DEBUG",
56+
"level": "INFO",
5757
"formatter": "console_formatter",
5858
"stream": "ext://sys.stdout",
5959
}

0 commit comments

Comments
 (0)