Released on pypi

Sven Siegmund · Sven Siegmund · commit f8a646b5aebd · 2021-01-05T09:23:53.000+01:00
diff --git a/README.md b/README.md
@@ -4,7 +4,11 @@ RTF Parser. So far it can only de-encapsulate HTML content from an RTF, but it p
 
 # Dependencies
 
-See `requirements.txt`.
+```
+argcomplete
+extract-msg
+compressed_rtf
+```
 
 # Installation
 
@@ -16,7 +20,7 @@ Installation creates an executable file `rtfparse` in your python scripts folder
 
 # First Run
 
-When you run `rtfparse` for the first time it will start a configuration wizard which will guide you through the process of creating a default configuration file and specifying the location of its folders. (These folders don't mean much yet, they are more or less placeholders for upcoming program features.)
+When you run `rtfparse` for the first time it will start a configuration wizard which will guide you through the process of creating a default configuration file and specifying the location of its folders. (These folders serve as locations for saving extracted RTF or HTML files.)
 
 In the configuration wizard you can press `A` for care-free automatic configuration, which would look something like this:
 
@@ -48,17 +52,34 @@ Created directory C:\Users\nagidal\rtfparse\html
 
 Use the `rtfparse` executable from the command line. For example if you want to de-encapsulate the HTML from an RTF file, do it like this:
 
-    rtfparse -f "path/to/rtf_file.rtf" -d "path/to/de_encapsulated.html"
+    rtfparse -f "path/to/rtf_file.rtf" -d
 
 Or you can de-encapsulate the HTML from an MS Outlook message, thanks to [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf):
 
     rtfparse -m "path/to/email.msg" -d
 
-Command reference is in `rtfparse --help`.
+The resulting HTML file will be saved to the `html` folder configured in `rtfparse_configuration.ini`. For the full command reference, run `rtfparse --help`.
 
 # Usage in python module
 
-See 'minimal.py' for an example.
+```
+import pathlib
+from rtfparse.parser import Rtf_Parser
+from rtfparse.renderers import de_encapsulate_html
+
+
+source_path = pathlib.Path(r"D:\trace\email\test_mail_sw_release.rtf")
+target_path = pathlib.Path(r"D:\trace\email\extracted_with_rtfparse.html")
+
+
+parser = Rtf_Parser(rtf_path=source_path)
+parsed = parser.parse_file()
+
+
+renderer = de_encapsulate_html.De_encapsulate_HTML()
+with open(target_path, mode="w", encoding="utf-8") as html_file:
+    renderer.render(parsed, html_file)
+```
 
 # RTF Specification Links
 
diff --git a/setup.py b/setup.py
@@ -42,25 +42,26 @@ def get_property(property: str, path_to_init_file: pathlib.Path) -> str:
         version=get_property("version", path_to_init_file.parent / "version.py"),
         description="RTF parser",
         long_description=long_description,
+        long_description_content_type="text/markdown",
         author=get_property("__author__", path_to_init_file),
         author_email=get_property("__author_email__", path_to_init_file),
         url="https://github.com/Nagidal/rtfparse",
         classifiers=[
-            "Development Status :: 2 - Pre-Alpha"
+            # "Development Status :: 2 - Pre-Alpha",
             # "Development Status :: 3 - Alpha",
-            # "Development Status :: 4 - Beta", 
-            # "Development Status :: 5 - Production/Stable"
-            # "Intended Audience :: End Users/Desktop",
+            "Development Status :: 4 - Beta", 
+            # "Development Status :: 5 - Production/Stable",
+            "Intended Audience :: End Users/Desktop",
             "Intended Audience :: Developers",
             "Intended Audience :: System Administrators",
             "Environment :: Console",
-            "Topic :: Software Development :: Testing",
+            # "Topic :: Software Development :: Testing",
             "Topic :: Utilities",
-            "License :: Free To Use But Restricted",
+            "License :: OSI Approved :: MIT License",
             "Natural Language :: English",
             "Programming Language :: Python :: 3.9",
             "Operating System :: OS Independent",
-            "Operating System :: Microsoft :: Windows"
+            "Operating System :: Microsoft :: Windows",
             "Operating System :: POSIX :: Linux",
             "Operating System :: MacOS :: MacOS X",
             ],
diff --git a/src/rtfparse/__init__.py b/src/rtfparse/__init__.py
@@ -11,7 +11,7 @@
 
 __author__ = "Sven Siegmund"
 __author_email__ = "sven.siegmund@gmail.com"
-__date__ = "2020-12-21"
+__date__ = "2020-01-05"
 __version__ = version.version
 
 
diff --git a/src/rtfparse/parser.py b/src/rtfparse/parser.py
@@ -8,7 +8,6 @@
 # Own modules
 from rtfparse import re_patterns
 from rtfparse import entities
-from rtfparse import errors
 from rtfparse import utils
 # Typing
 from typing import Optional
@@ -40,14 +39,17 @@ def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
                     "pc",
                     "pca",
                 )
+        # Gather all control words, which could define an encoding:
         names = tuple(filter(lambda item: isinstance(item, entities.Control_Word) and item.control_name in recognized_encodings, group.structure))
-        # Check if the ANSI code page is set:
+        # Check if the ANSI code page is set as a parameter of any of the control words:
         cp = None
         for item in names:
             # if any item is a Control_Word which has a parameter, we assume that this is the parameter of \ansicpg, and that corresponds to the codepage we are looking for
             if item.parameter:
                 param = item.parameter
-        if not param:
+        if param:
+            encoding = f"cp{param}"
+        else:
             if names[0].control_name == "ansi":
                 encoding = "ansi"
             elif names[0].control_name == "mac":
@@ -56,9 +58,8 @@ def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
                 encoding = "cp437"
             elif names[0].control_name == "pca":
                 encoding = "cp850"
-        else:
-            encoding = f"cp{param}"
         file.seek(0)
+        logger.info(f"recognized encoding {encoding}")
         return encoding
     def parse_file(self) -> entities.Group:
         if self.rtf_path is not None:
diff --git a/src/rtfparse/version.py b/src/rtfparse/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python
 
 
-version = "0.7.1"
+version = "0.7.4"

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`	`1`	`#!/usr/bin/env python`
`2`	`2`
`3`	`3`
`4`		`-version = "0.7.1"`
	`4`	`+version = "0.7.4"`