Skip to content

Commit 65f3194

Browse files
Merge pull request #13 from fleetingbytes/develop
Develop
2 parents ee5ad6b + ebf442d commit 65f3194

File tree

4 files changed: 39 additions (+39) and 9 deletions (-9)

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,14 @@
22

33
<!-- towncrier release notes start -->
44

5+
## 0.8.1 (2023-08-07)
6+
7+
8+
### Bugfixes
9+
10+
- Interpret ANSI encoding as CP1252, improve error handling [#11](https://github.com/fleetingbytes/rtfparse/issues/11)
11+
12+
513
## 0.8.0 (2023-06-29)
614

715

src/rtfparse/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
#!/usr/bin/env python
22

33

4-
__version__ = "0.8.0"
4+
__version__ = "0.8.1"

src/rtfparse/entities.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,11 @@
2020
MAX_CW_LETTERS = 32 # As specified in RTF Spec
2121
INTEGER_MAGNITUDE = 32 # As specified in RTF Spec
2222
PLAIN_TEXT = CONTROL_WORD = (
23-
BACKSLASH + MAX_CW_LETTERS + MINUS + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
23+
BACKSLASH
24+
+ MAX_CW_LETTERS
25+
+ MINUS
26+
+ len(str((1 << INTEGER_MAGNITUDE) // 2))
27+
+ DELIMITER
2428
)
2529

2630

@@ -29,7 +33,9 @@ def __init__(self) -> None:
2933
self.text = ""
3034

3135
@classmethod
32-
def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type:
36+
def probe(
37+
cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader
38+
) -> Bytestring_Type:
3339
logger.debug(f"Probing file at position {file.tell()}")
3440
original_position = file.tell()
3541
while True:
@@ -48,7 +54,9 @@ def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Byt
4854
elif match := re_patterns.plain_text.match(probed):
4955
result = Bytestring_Type.PLAIN_TEXT
5056
else:
51-
logger.debug(f"This does not match anything, it's probably a newline, moving on")
57+
logger.debug(
58+
f"This does not match anything, it's probably a newline, moving on"
59+
)
5260
original_position += 1
5361
file.seek(original_position)
5462
logger.debug(f"Probe moved to position {file.tell()}")
@@ -92,7 +100,9 @@ def __init__(self, encoding: str, file: io.BufferedReader) -> None:
92100
file.seek(target_position)
93101
# handle \binN:
94102
if self.control_name == "bin":
95-
self.bindata = file.read(utils.twos_complement(self.parameter, INTEGER_MAGNITUDE))
103+
self.bindata = file.read(
104+
utils.twos_complement(self.parameter, INTEGER_MAGNITUDE)
105+
)
96106
else:
97107
logger.warning(f"Missing Control Word")
98108
file.seek(self.start_position)
@@ -178,7 +188,9 @@ def __init__(self, encoding: str, file: io.BufferedReader) -> None:
178188
logger.debug(f"Returned to position {file.tell()}")
179189
else:
180190
logger.warning(
181-
utils.warn(f"Expected a group but found no group start. Creating unknown group")
191+
utils.warn(
192+
f"Expected a group but found no group start. Creating unknown group"
193+
)
182194
)
183195
file.seek(self.start_position)
184196
while True:

src/rtfparse/parser.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import logging
66
import pathlib
77
import re
8+
from argparse import Namespace
89

910
# Typing
1011
from typing import Optional, Union
@@ -26,7 +27,9 @@ def __init__(
2627
self.rtf_file = rtf_file
2728
if not (self.rtf_path or self.rtf_file):
2829
raise ValueError("Need `rtf_path` or `rtf_file` argument")
29-
self.ENCODING_PROBE = 48 # look for encoding information in the first 48 bytes of the file
30+
self.ENCODING_PROBE = (
31+
48 # look for encoding information in the first 48 bytes of the file
32+
)
3033

3134
def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
3235
probed = file.read(self.ENCODING_PROBE)
@@ -52,17 +55,22 @@ def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
5255
# if any item is a Control_Word which has a parameter, we assume that this is the parameter of \ansicpg, and that corresponds to the codepage we are looking for
5356
if item.parameter:
5457
param = item.parameter
58+
else:
59+
param = None
5560
if param:
5661
if param == 65001:
5762
logger.warning(
58-
"Found encoding 65001, but often this is actually cp1252, so I'm overriding it"
63+
"Found encoding '65001', but often this is actually 'cp1252', so I'm taking that"
5964
)
6065
encoding = "cp1252"
6166
else:
6267
encoding = f"cp{param}"
6368
else:
6469
if names[0].control_name == "ansi":
65-
encoding = "ansi"
70+
logger.warning(
71+
"Found encoding 'ansi', but often this is actually 'cp1252', so I'm taking that"
72+
)
73+
encoding = "cp1252"
6674
elif names[0].control_name == "mac":
6775
encoding = "mac_roman"
6876
elif names[0].control_name == "pc":
@@ -87,6 +95,8 @@ def parse_file(self) -> entities.Group:
8795
self.parsed = entities.Group(encoding, file)
8896
except Exception as err:
8997
logger.exception(err)
98+
self.parsed == Namespace()
99+
self.parsed.structure = list()
90100
finally:
91101
if self.rtf_path is not None:
92102
logger.debug(f"Closing {parsed_object}")

0 commit comments

Comments
 (0)