Skip to content

Commit 6ab8049

Browse files
author
Sven Siegmund
committed
API ready
1 parent 6df112d commit 6ab8049

File tree

6 files changed

+105
-46
lines changed

6 files changed

+105
-46
lines changed

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,22 @@ Created directory C:\Users\nagidal\rtfparse\html
4444

4545
`rtfparse` also creates the folder `.rtfparse` (beginning with a dot) in your home directory where it saves its default configuration and its log files.
4646

47-
# Usage
47+
# Usage From Command Line
4848

4949
Use the `rtfparse` executable from the command line. For example, if you want to de-encapsulate the HTML from an RTF file, do it like this:
5050

5151
rtfparse -f "path/to/rtf_file.rtf" -d "path/to/de_encapsulated.html"
5252

5353
Or you can de-encapsulate the HTML from an MS Outlook message, thanks to [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf):
5454

55-
rtfparse -m "path/to/email.msg" -d "path/to/de_encapsulated.html"
55+
rtfparse -m "path/to/email.msg" -d
5656

5757
Command reference is in `rtfparse --help`.
5858

59+
# Usage in a Python Module
60+
61+
See `src/rtfparse/minimal.py` for an example.
62+
5963
# RTF Specification Links
6064

6165
If you find a working official Microsoft link to the RTF specification and add it here, you'll be remembered fondly.

src/rtfparse/entities.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from rtfparse import re_patterns
99
from rtfparse import utils
1010
from rtfparse import errors
11-
from rtfparse import config_loader
1211
from rtfparse.enums import Bytestring_Type
1312

1413

@@ -53,8 +52,10 @@ def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Byt
5352
file.seek(original_position)
5453
logger.debug(f"Probe moved to position {file.tell()}")
5554
if not probed:
56-
logger.warning(f"Reached unexpected end of file.")
57-
raise errors.UnexpectedEndOfFileError(f"at position {file.tell()}")
55+
logger.debug(f"Reached unexpected end of file.")
56+
result = Bytestring_Type.GROUP_END
57+
break
58+
# raise errors.UnexpectedEndOfFileError(f"at position {file.tell()}")
5859
continue
5960
break
6061
logger.debug(f"Probe {result = }")
@@ -63,9 +64,9 @@ def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Byt
6364

6465

6566
class Control_Word(Entity):
66-
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
67+
def __init__(self, encoding: str, file: io.BufferedReader) -> None:
6768
super().__init__()
68-
self.config = config
69+
self.encoding = encoding
6970
logger.debug(f"Reading Control Word at file position {file.tell()}")
7071
self.control_name = "missing"
7172
self.parameter = ""
@@ -74,17 +75,17 @@ def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> Non
7475
logger.debug(f"Starting at file position {self.start_position}")
7576
probe = file.read(CONTROL_WORD)
7677
if (match := re_patterns.control_word.match(probe)):
77-
self.control_name = match.group("control_name").decode(self.config.default_encoding)
78+
self.control_name = match.group("control_name").decode(self.encoding)
7879
logger.debug(f"Preliminary {self.control_name = }")
7980
parameter = match.group("parameter")
8081
if parameter is not None:
81-
self.parameter = int(parameter.decode(self.config.default_encoding))
82+
self.parameter = int(parameter.decode(self.encoding))
8283
logger.debug(f"{self.parameter = }")
8384
self.control_name = self.control_name.removesuffix(str(self.parameter))
8485
logger.debug(f"Final {self.control_name = }")
8586
target_position = self.start_position + match.span()[1]
8687
if match.group("other"):
87-
logger.debug(f"Delimiter is {match.group('other').decode(self.config.default_encoding)}, len: {len(match.group('delimiter'))}")
88+
logger.debug(f"Delimiter is {match.group('other').decode(self.encoding)}, len: {len(match.group('delimiter'))}")
8889
target_position -= len(match.group("delimiter"))
8990
file.seek(target_position)
9091
# handle \binN:
@@ -98,16 +99,16 @@ def __repr__(self) -> str:
9899

99100

100101
class Control_Symbol(Entity):
101-
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
102+
def __init__(self, encoding: str, file: io.BufferedReader) -> None:
102103
super().__init__()
103-
self.config = config
104+
self.encoding = encoding
104105
self.start_position = file.tell()
105106
logger.debug(f"Reading Symbol at file position {self.start_position}")
106107
self.char = ""
107108
self.text = chr(file.read(SYMBOL)[-1])
108109
if self.text == "'":
109-
self.char = file.read(SYMBOL).decode(self.config.default_encoding)
110-
self.text = bytes((int(self.char, base=16), )).decode(self.config.default_encoding)
110+
self.char = file.read(SYMBOL).decode(self.encoding)
111+
self.text = bytes((int(self.char, base=16), )).decode(self.encoding)
111112
logger.debug(f"Encountered escaped ANSI character, read two more bytes: {self.char}, character: {self.text}")
112113
if self.text in "\\{}":
113114
file.seek(file.tell() - SYMBOL)
@@ -116,9 +117,9 @@ def __repr__(self) -> str:
116117

117118

118119
class Plain_Text(Entity):
119-
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
120+
def __init__(self, encoding: str, file: io.BufferedReader) -> None:
120121
super().__init__()
121-
self.config = config
122+
self.encoding = encoding
122123
self.text = ""
123124
logger.debug(f"Constructing Plain_Text")
124125
while True:
@@ -128,7 +129,7 @@ def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> Non
128129
# see if we have read all the plain text there is:
129130
if (match := re_patterns.plain_text.match(read)):
130131
logger.debug(f"This matches the plain text pattern")
131-
_text = match.group("text").decode(self.config.default_encoding)
132+
_text = match.group("text").decode(self.encoding)
132133
logger.debug(f"{_text = }")
133134
self.text = "".join((self.text, _text))
134135
logger.debug(f"{self.text = }")
@@ -146,10 +147,10 @@ def __repr__(self) -> str:
146147

147148

148149
class Group(Entity):
149-
def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
150+
def __init__(self, encoding: str, file: io.BufferedReader) -> None:
150151
super().__init__()
151152
logger.debug(f"Group.__init__")
152-
self.config = config
153+
self.encoding = encoding
153154
self.known = False
154155
self.name = "unknown"
155156
self.ignorable = False
@@ -164,24 +165,24 @@ def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> Non
164165
self.known = bool(match.group("group_start"))
165166
self.ignorable = bool(match.group("ignorable"))
166167
if not self.ignorable:
167-
file.seek(-IGNORABLE, io.SEEK_CUR)
168+
file.seek(self.start_position + GROUP_START - IGNORABLE)
168169
logger.debug(f"Returned to position {file.tell()}")
169170
else:
170171
logger.warning(utils.warn(f"Expected a group but found no group start. Creating unknown group"))
171172
file.seek(self.start_position)
172173
while True:
173174
probed = self.probe(re_patterns.probe, file)
174175
if probed is Bytestring_Type.CONTROL_WORD:
175-
self.structure.append(Control_Word(self.config, file))
176+
self.structure.append(Control_Word(self.encoding, file))
176177
elif probed is Bytestring_Type.GROUP_END:
177178
file.read(GROUP_END)
178179
break
179180
elif probed is Bytestring_Type.GROUP_START:
180-
self.structure.append(Group(self.config, file))
181+
self.structure.append(Group(self.encoding, file))
181182
elif probed is Bytestring_Type.CONTROL_SYMBOL:
182-
self.structure.append(Control_Symbol(self.config, file))
183+
self.structure.append(Control_Symbol(self.encoding, file))
183184
else:
184-
self.structure.append(Plain_Text(self.config, file))
185+
self.structure.append(Plain_Text(self.encoding, file))
185186
# name the group like its first Control Word
186187
# this way the renderer will be able to ignore entire groups based on their first control word
187188
try:

src/rtfparse/entry.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from rtfparse import config_loader
2020
from rtfparse import version
2121
from rtfparse.parser import Rtf_Parser
22-
from rtfparse.renderers import encapsulated_html
22+
from rtfparse.renderers import de_encapsulate_html
2323

2424

2525
# Setup logging
@@ -51,24 +51,26 @@ def argument_parser() -> argparse.ArgumentParser:
5151
parser.add_argument("--autoconfig", action="store_true", help="Configure rtfparse automatically").completer = EC
5252
parser.add_argument("-f", "--file", action="store", metavar="PATH", type=pathlib.Path, help="path to the rtf file").completer = EC
5353
parser.add_argument("-m", "--msg", action="store", metavar="PATH", type=pathlib.Path, help="Parse RTF from MS Outlook's .msg file").completer = EC
54-
parser.add_argument("-d", "--de-encapsulate-html", action="store", metavar="PATH", type=pathlib.Path, help="De-encapsulate HTML from RTF").completer = EC
54+
parser.add_argument("-d", "--de-encapsulate-html", action="store_true", help="De-encapsulate HTML from RTF").completer = EC
5555
return parser
5656

5757

5858
def de_encapsulate(rp: Rtf_Parser, target_file: pathlib.Path) -> None:
    """Render the HTML encapsulated in an already parsed RTF structure.

    Args:
        rp: Parser whose `parsed` attribute holds the structure to render.
        target_file: Path of the HTML file to (over)write.
    """
    renderer = de_encapsulate_html.De_encapsulate_HTML()
    # cp1252 cannot represent every character the parser may have decoded.
    # Since the output is HTML, write unencodable characters as numeric
    # character references instead of aborting with UnicodeEncodeError.
    with open(target_file, mode="w", encoding="cp1252", errors="xmlcharrefreplace") as htmlfile:
        logger.info(f"Rendering the encapsulated HTML")
        renderer.render(rp.parsed, htmlfile)
        logger.info(f"Encapsulated HTML rendered")
6464

6565

6666
def run(config: config_loader.Config) -> None:
67-
rp = Rtf_Parser()
6867
if config.cli_args.file and config.cli_args.file.exists():
68+
file_name = config.cli_args.file.name
6969
with open(config.cli_args.file, mode="rb") as rtf_file:
70-
rp.parse_file(config, rtf_file)
70+
rp = Rtf_Parser(rtf_file=rtf_file)
71+
rp.parse_file()
7172
elif config.cli_args.msg:
73+
file_name = config.cli_args.msg.name
7274
msg = em.openMsg(f"{config.cli_args.msg}")
7375
for attachment in msg.attachments:
7476
with open(config.html / f"{attachment.longFilename}", mode="wb") as att_file:
@@ -77,9 +79,10 @@ def run(config: config_loader.Config) -> None:
7779
with open((config.email_rtf / config.cli_args.msg.name).with_suffix(".rtf"), mode="wb") as email_rtf:
7880
email_rtf.write(decompressed_rtf)
7981
with io.BytesIO(decompressed_rtf) as rtf_file:
80-
rp.parse_file(config, rtf_file)
82+
rp = Rtf_Parser(rtf_file=rtf_file)
83+
rp.parse_file()
8184
if config.cli_args.de_encapsulate_html:
82-
de_encapsulate(rp, config.cli_args.de_encapsulate_html)
85+
de_encapsulate(rp, (config.html / file_name).with_suffix(".html"))
8386

8487

8588
def cli_start(version) -> None:

src/rtfparse/minimal.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,17 @@
22

33

44
import pathlib
5-
from rtfparse.parsers import Rtf_Parser
6-
from rtfparse.renderers import encapsulated_html
5+
from rtfparse.parser import Rtf_Parser
6+
from rtfparse.renderers import de_encapsulate_html
77

88

9-
source_file = pathlib.Path(r"D:\trace\email\test_mail_sw_release.rtf")
10-
target_file = pathlib.Path(r"D:\trace\email\extracted_with_rtfparse.html")
9+
# Sample locations used by this example; point them at your own files.
source_path = pathlib.Path(r"D:\trace\email\test_mail_sw_release.rtf")
target_path = pathlib.Path(r"D:\trace\email\extracted_with_rtfparse.html")

# Step 1: parse the RTF structure directly from the path.
parser = Rtf_Parser(rtf_path=source_path)
parsed = parser.parse_file()

# Step 2: render the de-encapsulated HTML into the target file.
renderer = de_encapsulate_html.De_encapsulate_HTML()
with target_path.open(mode="w", encoding="utf-8") as html_file:
    renderer.render(parsed, html_file)

src/rtfparse/parser.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
import io
55
import re
66
import logging
7+
import pathlib
78
# Own modules
89
from rtfparse import re_patterns
910
from rtfparse import entities
1011
from rtfparse import errors
1112
from rtfparse import utils
1213
# Typing
14+
from typing import Optional
1315
from typing import Union
1416
from rtfparse import config_loader
1517

@@ -19,19 +21,65 @@
1921

2022

2123
class Rtf_Parser:
    """Parse the structure of an RTF document.

    The document is given either as a path (`rtf_path`) or as an already
    opened binary file object (`rtf_file`). When both are given, `rtf_path`
    is the one `parse_file` reads from.
    """

    # Number of leading bytes inspected for character-set control words.
    ENCODING_PROBE = 48
    # Encoding assumed when the header names no character set Python can decode.
    DEFAULT_ENCODING = "cp1252"

    def __init__(self,
                 rtf_path: Optional[pathlib.Path] = None,
                 rtf_file: Optional[Union[io.BufferedReader, io.BytesIO]] = None,
                 ) -> None:
        """Remember the source to parse.

        Raises:
            ValueError: If neither `rtf_path` nor `rtf_file` is given.
        """
        self.rtf_path = rtf_path
        self.rtf_file = rtf_file
        if not (self.rtf_path or self.rtf_file):
            raise ValueError("Need `rtf_path` or `rtf_file` argument")
        # Result of the last successful `parse_file` call. Initialised here so
        # that `parse_file` can return it even when parsing failed.
        self.parsed = None

    @staticmethod
    def _select_encoding(control_words, default: str = "cp1252") -> str:
        """Return the Python codec name implied by character-set control words.

        Args:
            control_words: Objects with `control_name` and `parameter`
                attributes (e.g. `entities.Control_Word` instances).
            default: Codec name returned when nothing usable is found.

        Returns:
            `cp<N>` when a control word carries a parameter (assumed to be the
            code page of `\\ansicpg`; the last such word wins), otherwise the
            codec mapped to the first of `ansi`/`mac`/`pc`/`pca`, otherwise
            `default`. Names unknown to Python also fall back to `default`.
        """
        import codecs  # local import: only needed to validate the candidate

        charsets = {
            "ansi": "ansi",
            "mac": "mac_roman",
            "pc": "cp437",
            "pca": "cp850",
        }
        words = tuple(control_words)
        code_page = None
        for item in words:
            # Any parameter is taken to be the code page number of \ansicpg.
            if item.parameter:
                code_page = item.parameter
        if code_page:
            candidate = f"cp{code_page}"
        else:
            candidate = next(
                (charsets[item.control_name] for item in words if item.control_name in charsets),
                default,
            )
        try:
            # NOTE(review): "ansi" appears to be an alias of the Windows-only
            # "mbcs" codec, so it is not available on every platform - confirm.
            codecs.lookup(candidate)
        except LookupError:
            return default
        return candidate

    def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
        """Guess the document encoding from the first bytes of `file`.

        Reads `ENCODING_PROBE` bytes, rewinds `file` to position 0 and parses
        the probed bytes as a group to look for character-set control words.

        Returns:
            The name of a codec to decode the document with.
        """
        probed = file.read(self.ENCODING_PROBE)
        # Rewind immediately: the probe is parsed from its own buffer, and
        # the caller must start at the beginning even if probing fails.
        file.seek(0)
        group = entities.Group("cp1252", io.BytesIO(probed))
        recognized_encodings = (
            "ansi",
            "ansicpg",
            "mac",
            "pc",
            "pca",
        )
        names = tuple(
            item for item in group.structure
            if isinstance(item, entities.Control_Word) and item.control_name in recognized_encodings
        )
        return self._select_encoding(names, self.DEFAULT_ENCODING)

    def parse_file(self) -> "Optional[entities.Group]":
        """Parse the configured source and return its top-level group.

        Exceptions raised while parsing are logged and not re-raised. In that
        case the value `self.parsed` had before the call is returned (`None`
        if nothing was parsed successfully yet). A file opened here from
        `rtf_path` is closed again; a caller-supplied `rtf_file` is left open.
        """
        if self.rtf_path is not None:
            file = open(self.rtf_path, mode="rb")
        elif self.rtf_file is not None:
            file = self.rtf_file
        else:
            file = io.BytesIO(b"")
        parsed_object = utils.what_is_being_parsed(file)
        logger.info(f"Parsing the structure of {parsed_object}")
        try:
            encoding = self.read_encoding(file)
            self.parsed = entities.Group(encoding, file)
        except Exception as err:
            logger.exception(err)
        finally:
            # Only close what this method opened itself.
            if self.rtf_path is not None:
                logger.debug(f"Closing {parsed_object}")
                file.close()
        logger.info(f"Structure of {parsed_object} parsed")
        return self.parsed
3583

3684

3785
if __name__ == "__main__":

src/rtfparse/renderers/encapsulated_html.py renamed to src/rtfparse/renderers/de_encapsulate_html.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
logger = logging.getLogger(__name__)
1414

1515

16-
class Encapsulated_HTML(Renderer):
16+
class De_encapsulate_HTML(Renderer):
1717
def __init__(self, ) -> None:
1818
super().__init__()
1919
self.ignore_rtf = False
@@ -27,6 +27,8 @@ def __init__(self, ) -> None:
2727
self.ignore_groups = (
2828
"fonttbl",
2929
"colortbl",
30+
"generator",
31+
"formatConverter",
3032
)
3133
def ignore_rtf_toggle(self, cw: entities.Control_Word) -> str:
3234
if cw.parameter == "" or cw.parameter == 1:

0 commit comments

Comments
 (0)