Skip to content

Commit ebf442d

Browse files
Merge pull request #12 from fleetingbytes/11-unknown-encoding-ansi
11 unknown encoding ansi
2 parents 36565fa + 7b3fe8f commit ebf442d

24 files changed

+531
-702
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ __pycache__/
33
*.py[cod]
44
*$py.class
55

6+
# PowerShell garbage
7+
Out-Null
8+
69
# Vim files
710
*~
811
*.swp

CHANGELOG.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Changelog
2+
3+
<!-- towncrier release notes start -->
4+
5+
## 0.8.1 (2023-08-07)
6+
7+
8+
### Bugfixes
9+
10+
- Interpret ANSI encoding as CP1252, improve error handling [#11](https://github.com/fleetingbytes/rtfparse/issues/11)
11+
12+
13+
## 0.8.0 (2023-06-29)
14+
15+
16+
### Bugfixes
17+
18+
- Using `pyproject.toml` for installation with current pip versions [#1](https://github.com/fleetingbytes/rtfparse/issues/1)
19+
20+
21+
### Development Details
22+
23+
- Fixed reference before assignment error [#3](https://github.com/fleetingbytes/rtfparse/issues/3)
24+
- Removed convoluted configurator [#5](https://github.com/fleetingbytes/rtfparse/issues/5)
Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,19 @@
1-
MIT License
2-
3-
Copyright (c) 2020 Nagidal
4-
5-
Permission is hereby granted, free of charge, to any person obtaining a copy
6-
of this software and associated documentation files (the "Software"), to deal
7-
in the Software without restriction, including without limitation the rights
8-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9-
copies of the Software, and to permit persons to whom the Software is
10-
furnished to do so, subject to the following conditions:
11-
12-
The above copyright notice and this permission notice shall be included in all
13-
copies or substantial portions of the Software.
14-
15-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21-
SOFTWARE.
1+
Copyright (c) 2023 Sven Siegmund
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a copy
4+
of this software and associated documentation files (the "Software"), to deal
5+
in the Software without restriction, including without limitation the rights
6+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
copies of the Software, and to permit persons to whom the Software is
8+
furnished to do so, subject to the following conditions:
9+
10+
The above copyright notice and this permission notice shall be included in all
11+
copies or substantial portions of the Software.
12+
13+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19+
SOFTWARE.

README.md

Lines changed: 28 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,7 @@
22

33
RTF Parser. So far it can only de-encapsulate HTML content from an RTF, but it properly parses the RTF structure and allows you to write your own custom RTF renderers. The HTML de-encapsulator provided with `rtfparse` is just one such custom renderer which liberates the HTML content from its RTF encapsulation and saves it in a given html file.
44

5-
# Dependencies
6-
7-
```
8-
argcomplete
9-
extract-msg
10-
compressed_rtf
11-
```
5+
rtfparse can also decompress RTF from MS Outlook `.msg` files and parse it.
126

137
# Installation
148

@@ -18,65 +12,60 @@ Install rtfparse from your local repository with pip:
1812

1913
Installation creates an executable file `rtfparse` in your python scripts folder which should be in your `$PATH`.
2014

21-
# First Run
15+
# Usage From Command Line
2216

23-
When you run `rtfparse` for the first time it will start a configuration wizard which will guide you through the process of creating a default configuration file and specifying the location of its folders. (These folders serve as locations for saving extracted rtf or html files.)
17+
Use the `rtfparse` executable from the command line. Read `rtfparse --help`.
2418

25-
In the configuration wizard you can press `A` for care-free automatic configuration, which would look something like this:
19+
rtfparse writes its logs to the following files in `~/rtfparse/`:
2620

2721
```
28-
$ rtfparse
29-
Config file missing, creating new default config file
22+
rtfparse.debug.log
23+
rtfparse.info.log
24+
rtfparse.errors.log
25+
```
3026

31-
____ ____ __ _ ____ _ ____ _ _ ____ ____ ___ _ ____ __ _
32-
|___ [__] | \| |--- | |__, |__| |--< |--| | | [__] | \|
33-
_ _ _ ___ ____ ____ ___
34-
|/\| | /__ |--| |--< |__>
27+
## Example: De-encapsulate HTML from an uncompressed RTF file
3528

29+
rtfparse --rtf-file "path/to/rtf_file.rtf" --de-encapsulate-html --output-file "path/to/extracted.html"
3630

37-
◊ email_rtf (C:\Users\nagidal\rtfparse\email_rtf) does not exist!
31+
## Example: De-encapsulate HTML from MS Outlook email file
3832

39-
(A) Automatically configure this and all remaining rtfparse settings
40-
(C) Create this path automatically
41-
(M) Manually input correct path to use or to create
42-
(Q) Quit and edit `email_rtf` in rtfparse_configuration.ini
33+
Thanks to [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf), which rtfparse uses internally, you can do this:
4334

44-
Created directory C:\Users\nagidal\rtfparse
45-
Created directory C:\Users\nagidal\rtfparse\email_rtf
46-
Created directory C:\Users\nagidal\rtfparse\html
47-
```
35+
rtfparse --msg-file "path/to/email.msg" --de-encapsulate-html --output-file "path/to/extracted.html"
4836

49-
`rtfparse` also creates the folder `.rtfparse` (beginning with a dot) in your home directory where it saves its default configuration and its log files.
37+
## Example: Only decompress the RTF from MS Outlook email file
5038

51-
# Usage From Command Line
39+
rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf"
5240

53-
Use the `rtfparse` executable from the command line. For example if you want to de-encapsulate the HTML from an RTF file, do it like this:
41+
## Example: De-encapsulate HTML from MS Outlook email file and save (and later embed) the attachments
5442

55-
rtfparse -f "path/to/rtf_file.rtf" -d
43+
When extracting the RTF from the `.msg` file, you can save the attachments (which include images embedded in the email text) in a directory:
5644

57-
Or you can de-encapsulate the HTML from an MS Outlook message, thanks to [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf):
45+
rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" --attachments-dir "path/to/dir"
5846

59-
rtfparse -m "path/to/email.msg" -d
47+
In `rtfparse` version 1.x you will be able to embed these images in the de-encapsulated HTML. This functionality will be provided by the package [embedimg](https://github.com/fleetingbytes/embedimg).
6048

61-
The resulting html file will be saved to the `html` folder you set in the `rtfparse_configuration.ini`. Command reference is in `rtfparse --help`.
49+
rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" --attachments-dir "path/to/dir" --embed-img
6250

63-
# Usage in python module
51+
In the current version the option `--embed-img` does nothing.
52+
53+
# Programmatic usage in a Python module
6454

6555
```
66-
import pathlib
56+
from pathlib import Path
6757
from rtfparse.parser import Rtf_Parser
68-
from rtfparse.renderers import de_encapsulate_html
69-
58+
from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML
7059
71-
source_path = pathlib.Path(r"path/to/your/rtf/document.rtf")
72-
target_path = pathlib.Path(r"path/to/your/html/de_encapsulated.html")
60+
source_path = Path(r"path/to/your/rtf/document.rtf")
61+
target_path = Path(r"path/to/your/html/de_encapsulated.html")
7362
7463
7564
parser = Rtf_Parser(rtf_path=source_path)
7665
parsed = parser.parse_file()
7766
67+
renderer = De_encapsulate_HTML()
7868
79-
renderer = de_encapsulate_html.De_encapsulate_HTML()
8069
with open(target_path, mode="w", encoding="utf-8") as html_file:
8170
renderer.render(parsed, html_file)
8271
```
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{% if sections[""] %}
2+
{% for category, val in definitions.items() if category in sections[""] %}
3+
4+
### {{ definitions[category]['name'] }}
5+
6+
{% for text, values in sections[""][category].items() %}
7+
- {{ text }} {{ values|join(', ') }}
8+
{% endfor %}
9+
10+
{% endfor %}
11+
{% else %}
12+
No significant changes.
13+
14+
15+
{% endif %}

pyproject.toml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
[build-system]
2+
requires = [
3+
"hatchling",
4+
"hatch-semver"
5+
]
6+
build-backend = "hatchling.build"
7+
8+
[project]
9+
name = "rtfparse"
10+
description = "Tool to parse Microsoft Rich Text Format (RTF)"
11+
readme = "README.md"
12+
requires-python = ">=3.10"
13+
authors = [
14+
{ name = "Sven Siegmund", email = "[email protected]" },
15+
]
16+
classifiers = [
17+
#"Development Status :: 3 - Alpha",
18+
#"Development Status :: 4 - Beta",
19+
"Development Status :: 5 - Production/Stable",
20+
"Intended Audience :: Developers",
21+
"Environment :: Console",
22+
"Topic :: Software Development :: Testing",
23+
"Topic :: Utilities",
24+
"Natural Language :: English",
25+
"Programming Language :: Python :: 3.10",
26+
"Programming Language :: Python :: 3.11",
27+
"Operating System :: OS Independent",
28+
"Operating System :: Microsoft :: Windows",
29+
"Operating System :: POSIX :: Linux",
30+
"Operating System :: MacOS :: MacOS X",
31+
]
32+
keywords = [
33+
"rtf",
34+
"parse",
35+
]
36+
dependencies = [
37+
"argcomplete",
38+
"extract-msg",
39+
"compressed_rtf",
40+
"provide_dir",
41+
]
42+
dynamic = ["version"]
43+
44+
[project.license]
45+
file = "LICENSE.txt"
46+
47+
[project.urls]
48+
Documentation = "https://github.com/fleetingbytes/rtfparse#readme"
49+
Issues = "https://github.com/fleetingbytes/rtfparse/issues"
50+
Source = "https://github.com/fleetingbytes/rtfparse"
51+
52+
[project.scripts]
53+
rtfparse = "rtfparse:main"
54+
55+
[tool.hatch.version]
56+
path = "src/rtfparse/__about__.py"
57+
validate-bump = true
58+
scheme = "semver"
59+
60+
[tool.hatch.envs.default]
61+
dependencies = [
62+
"pytest-cov",
63+
]
64+
[tool.hatch.envs.default.scripts]
65+
cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=src/rtfparse --cov=tests {args}"
66+
no-cov = "cov --no-cov {args}"
67+
68+
[tool.hatch.envs.style]
69+
dependencies = [
70+
"black",
71+
"isort",
72+
]
73+
74+
[tool.hatch.envs.style.scripts]
75+
fmt = [
76+
"isort .",
77+
"black .",
78+
]
79+
80+
[tool.hatch.envs.tc]
81+
dependencies = [
82+
"towncrier",
83+
]
84+
85+
[tool.hatch.envs.tc.scripts]
86+
draft = "towncrier build --draft"
87+
build = "towncrier build --yes"
88+
89+
[tool.hatch.envs.docs]
90+
dependencies = [
91+
"pdoc3"
92+
]
93+
94+
[[tool.hatch.envs.test.matrix]]
95+
python = ["311"]
96+
97+
[tool.coverage.run]
98+
branch = true
99+
parallel = true
100+
omit = [
101+
#"src/rtfparse/__about__.py",
102+
]
103+
104+
[tool.coverage.report]
105+
exclude_lines = [
106+
"no cov",
107+
"if __name__ == .__main__.:",
108+
"if TYPE_CHECKING:",
109+
]
110+
111+
[tool.black]
112+
line-length = 102
113+
114+
[tool.isort]
115+
line_length = 102
116+
117+
[tool.towncrier]
118+
name = "rtfparse"
119+
package = "rtfparse"
120+
package_dir = "src"
121+
directory = "changelog.d"
122+
filename = "CHANGELOG.md"
123+
start_string = "<!-- towncrier release notes start -->\n"
124+
underlines = ["", "", ""]
125+
template = "changelog.d/changelog_template.jinja"
126+
#title_format = "## [{version}](https://github.com/fleetingbytes/rtfparse/{version}) - {project_date}"
127+
title_format = "## {version} ({project_date})"
128+
issue_format = "[#{issue}](https://github.com/fleetingbytes/rtfparse/issues/{issue})"
129+
orphan_prefix = "+"
130+
131+
[tool.towncrier.fragment.doc]
132+
name = "Documentation"
133+
134+
[tool.towncrier.fragment.feature]
135+
name = "New Features"
136+
137+
[tool.towncrier.fragment.improved]
138+
name = "Improvements"
139+
140+
[tool.towncrier.fragment.fixed]
141+
name = "Bugfixes"
142+
143+
[tool.towncrier.fragment.unimportant]
144+
name = "Development Details"

0 commit comments

Comments
 (0)