Skip to content

Commit 8ff322a

Browse files
author
Sven Siegmund
committed
fix: control word recognition, closes #18
1 parent c298afd commit 8ff322a

File tree

5 files changed

+14
-23
lines changed

5 files changed

+14
-23
lines changed

changelog.d/18.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Recognize control words where the parameter's digit sequence is delimited by any character other than an ASCII digit

src/rtfparse/entities.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,7 @@
2020
MAX_CW_LETTERS = 32 # As specified in RTF Spec
2121
INTEGER_MAGNITUDE = 32 # As specified in RTF Spec
2222
PLAIN_TEXT = CONTROL_WORD = (
23-
BACKSLASH
24-
+ MAX_CW_LETTERS
25-
+ MINUS
26-
+ len(str((1 << INTEGER_MAGNITUDE) // 2))
27-
+ DELIMITER
23+
BACKSLASH + MAX_CW_LETTERS + MINUS + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
2824
)
2925

3026

@@ -33,9 +29,7 @@ def __init__(self) -> None:
3329
self.text = ""
3430

3531
@classmethod
36-
def probe(
37-
cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader
38-
) -> Bytestring_Type:
32+
def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type:
3933
logger.debug(f"Probing file at position {file.tell()}")
4034
original_position = file.tell()
4135
while True:
@@ -54,9 +48,7 @@ def probe(
5448
elif match := re_patterns.plain_text.match(probed):
5549
result = Bytestring_Type.PLAIN_TEXT
5650
else:
57-
logger.debug(
58-
f"This does not match anything, it's probably a newline, moving on"
59-
)
51+
logger.debug(f"This does not match anything, it's probably a newline, moving on")
6052
original_position += 1
6153
file.seek(original_position)
6254
logger.debug(f"Probe moved to position {file.tell()}")
@@ -100,9 +92,7 @@ def __init__(self, encoding: str, file: io.BufferedReader) -> None:
10092
file.seek(target_position)
10193
# handle \binN:
10294
if self.control_name == "bin":
103-
self.bindata = file.read(
104-
utils.twos_complement(self.parameter, INTEGER_MAGNITUDE)
105-
)
95+
self.bindata = file.read(utils.twos_complement(self.parameter, INTEGER_MAGNITUDE))
10696
else:
10797
logger.warning(f"Missing Control Word")
10898
file.seek(self.start_position)
@@ -188,9 +178,7 @@ def __init__(self, encoding: str, file: io.BufferedReader) -> None:
188178
logger.debug(f"Returned to position {file.tell()}")
189179
else:
190180
logger.warning(
191-
utils.warn(
192-
f"Expected a group but found no group start. Creating unknown group"
193-
)
181+
utils.warn(f"Expected a group but found no group start. Creating unknown group")
194182
)
195183
file.seek(self.start_position)
196184
while True:

src/rtfparse/minimal.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"""
77

88
from pathlib import Path
9+
910
from rtfparse.parser import Rtf_Parser
1011
from rtfparse.renderers.html_decapsulator import HTML_Decapsulator
1112

src/rtfparse/parser.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@ def __init__(
2727
self.rtf_file = rtf_file
2828
if not (self.rtf_path or self.rtf_file):
2929
raise ValueError("Need `rtf_path` or `rtf_file` argument")
30-
self.ENCODING_PROBE = (
31-
48 # look for encoding information in the first 48 bytes of the file
32-
)
30+
self.ENCODING_PROBE = 48 # look for encoding information in the first 48 bytes of the file
3331

3432
def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
3533
probed = file.read(self.ENCODING_PROBE)

src/rtfparse/re_patterns.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77

88

99
def group(content: bytes) -> bytes:
10-
return rb"[" + content + rb"]"
10+
if content:
11+
return rb"[" + content + rb"]"
12+
else:
13+
return b""
1114

1215

1316
def named_regex_group(name: str, content: bytes) -> bytes:
@@ -54,15 +57,15 @@ def no_capture(content: bytes) -> bytes:
5457
minus = named_regex_group("minus", rb"-?")
5558
digit = named_regex_group("digit", minus + group(_digits) + rb"{1,10}")
5659
hdigit = named_regex_group("hdigit", group(_hdigits))
57-
# int16 = minus + digit + rb"{1,5}"
5860
parameter_pattern = named_regex_group("parameter", digit)
5961
space = named_regex_group("space", rb" ")
6062
newline = named_regex_group("newline", _newline)
6163
other = named_regex_group("other", group(rb"^" + _letters + _digits))
64+
nothing = named_regex_group("nothing", group(rb""))
6265

6366

6467
ascii_letter_sequence = named_regex_group("control_name", ascii_letters + parameter_pattern + rb"?")
65-
delimiter = named_regex_group("delimiter", rb"|".join((space, newline, other, rb"$")))
68+
delimiter = named_regex_group("delimiter", rb"|".join((space, newline, other, nothing, rb"$")))
6669
symbol = named_regex_group("symbol", other)
6770
control_word_pattern = named_regex_group(
6871
"control_word", rtf_backslash + ascii_letter_sequence + delimiter

0 commit comments

Comments
 (0)