Skip to content

Commit f8a646b

Browse files
author
Sven Siegmund
committed
Released on pypi
1 parent 6ab8049 commit f8a646b

File tree

5 files changed

+42
-19
lines changed

5 files changed

+42
-19
lines changed

README.md

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ RTF Parser. So far it can only de-encapsulate HTML content from an RTF, but it p
44

55
# Dependencies
66

7-
See `requirements.txt`.
7+
```
8+
argcomplete
9+
extract-msg
10+
compressed_rtf
11+
```
812

913
# Installation
1014

@@ -16,7 +20,7 @@ Installation creates an executable file `rtfparse` in your python scripts folder
1620

1721
# First Run
1822

19-
When you run `rtfparse` for the first time it will start a configuration wizard which will guide you through the process of creating a default configuration file and specifying the location of its folders. (These folders don't mean much yet, they are more or less placeholders for upcoming program features.)
23+
When you run `rtfparse` for the first time it will start a configuration wizard which will guide you through the process of creating a default configuration file and specifying the location of its folders. (These folders serve as locations for saving extracted RTF or HTML files.)
2024

2125
In the configuration wizard you can press `A` for care-free automatic configuration, which would look something like this:
2226

@@ -48,17 +52,34 @@ Created directory C:\Users\nagidal\rtfparse\html
4852

4953
Use the `rtfparse` executable from the command line. For example, if you want to de-encapsulate the HTML from an RTF file, do it like this:
5054

51-
rtfparse -f "path/to/rtf_file.rtf" -d "path/to/de_encapsulated.html"
55+
rtfparse -f "path/to/rtf_file.rtf" -d
5256

5357
Or you can de-encapsulate the HTML from an MS Outlook message, thanks to [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf):
5458

5559
rtfparse -m "path/to/email.msg" -d
5660

57-
Command reference is in `rtfparse --help`.
61+
The resulting HTML file will be saved to the `html` folder you set in `rtfparse_configuration.ini`. The command reference is available via `rtfparse --help`.
5862

5963
# Usage in python module
6064

61-
See 'minimal.py' for an example.
65+
```
66+
import pathlib
67+
from rtfparse.parser import Rtf_Parser
68+
from rtfparse.renderers import de_encapsulate_html
69+
70+
71+
source_path = pathlib.Path(r"D:\trace\email\test_mail_sw_release.rtf")
72+
target_path = pathlib.Path(r"D:\trace\email\extracted_with_rtfparse.html")
73+
74+
75+
parser = Rtf_Parser(rtf_path=source_path)
76+
parsed = parser.parse_file()
77+
78+
79+
renderer = de_encapsulate_html.De_encapsulate_HTML()
80+
with open(target_path, mode="w", encoding="utf-8") as html_file:
81+
renderer.render(parsed, html_file)
82+
```
6283

6384
# RTF Specification Links
6485

setup.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -42,25 +42,26 @@ def get_property(property: str, path_to_init_file: pathlib.Path) -> str:
4242
version=get_property("version", path_to_init_file.parent / "version.py"),
4343
description="RTF parser",
4444
long_description=long_description,
45+
long_description_content_type="text/markdown",
4546
author=get_property("__author__", path_to_init_file),
4647
author_email=get_property("__author_email__", path_to_init_file),
4748
url="https://github.com/Nagidal/rtfparse",
4849
classifiers=[
49-
"Development Status :: 2 - Pre-Alpha"
50+
# "Development Status :: 2 - Pre-Alpha",
5051
# "Development Status :: 3 - Alpha",
51-
# "Development Status :: 4 - Beta",
52-
# "Development Status :: 5 - Production/Stable"
53-
# "Intended Audience :: End Users/Desktop",
52+
"Development Status :: 4 - Beta",
53+
# "Development Status :: 5 - Production/Stable",
54+
"Intended Audience :: End Users/Desktop",
5455
"Intended Audience :: Developers",
5556
"Intended Audience :: System Administrators",
5657
"Environment :: Console",
57-
"Topic :: Software Development :: Testing",
58+
# "Topic :: Software Development :: Testing",
5859
"Topic :: Utilities",
59-
"License :: Free To Use But Restricted",
60+
"License :: OSI Approved :: MIT License",
6061
"Natural Language :: English",
6162
"Programming Language :: Python :: 3.9",
6263
"Operating System :: OS Independent",
63-
"Operating System :: Microsoft :: Windows"
64+
"Operating System :: Microsoft :: Windows",
6465
"Operating System :: POSIX :: Linux",
6566
"Operating System :: MacOS :: MacOS X",
6667
],

src/rtfparse/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
__author__ = "Sven Siegmund"
1313
__author_email__ = "[email protected]"
14-
__date__ = "2020-12-21"
14+
__date__ = "2020-01-05"
1515
__version__ = version.version
1616

1717

src/rtfparse/parser.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# Own modules
99
from rtfparse import re_patterns
1010
from rtfparse import entities
11-
from rtfparse import errors
1211
from rtfparse import utils
1312
# Typing
1413
from typing import Optional
@@ -40,14 +39,17 @@ def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
4039
"pc",
4140
"pca",
4241
)
42+
# Gather all control words, which could define an encoding:
4343
names = tuple(filter(lambda item: isinstance(item, entities.Control_Word) and item.control_name in recognized_encodings, group.structure))
44-
# Check if the ANSI code page is set:
44+
# Check if the ANSI code page is set as a parameter of any of the control words:
4545
cp = None
4646
for item in names:
4747
# if any item is a Control_Word which has a parameter, we assume that this is the parameter of \ansicpg, and that corresponds to the codepage we are looking for
4848
if item.parameter:
4949
param = item.parameter
50-
if not param:
50+
if param:
51+
encoding = f"cp{param}"
52+
else:
5153
if names[0].control_name == "ansi":
5254
encoding = "ansi"
5355
elif names[0].control_name == "mac":
@@ -56,9 +58,8 @@ def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
5658
encoding = "cp437"
5759
elif names[0].control_name == "pca":
5860
encoding = "cp850"
59-
else:
60-
encoding = f"cp{param}"
6161
file.seek(0)
62+
logger.info(f"recognized encoding {encoding}")
6263
return encoding
6364
def parse_file(self) -> entities.Group:
6465
if self.rtf_path is not None:

src/rtfparse/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#!/usr/bin/env python
22

33

4-
version = "0.7.1"
4+
version = "0.7.4"

0 commit comments

Comments
 (0)