Skip to content

Commit dfe766b

Browse files
author
Sven Siegmund
committed
Structure parsing works, needs more testing
1 parent ba0e249 commit dfe766b

File tree

6 files changed

+126
-35
lines changed

6 files changed

+126
-35
lines changed

src/rtfparse/entities.py

Lines changed: 109 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,10 @@
44
import io
55
import logging
66
import re
7+
from itertools import count
78
from rtfparse import re_patterns
89
from rtfparse import utils
10+
from rtfparse import errors
911
from rtfparse.enums import Bytestring_Type
1012

1113

@@ -14,73 +16,152 @@
1416

1517

1618
# Constants, number of bytes to read when creating entities
17-
CHARACTER = BACKSLASH = len(b"\\")
18-
IGNORABLE = BACKSLASH + len(rb"*")
19-
GROUP_START = len(rb"x") + IGNORABLE # x = "}" cannot have a rogue brace for vim's auto-indent's sake
20-
DELIMITER = len(rb" ")
19+
CHARACTER = BACKSLASH = DELIMITER = MINUS = GROUP_END = len(b"\\")
20+
SYMBOL = IGNORABLE = BACKSLASH + CHARACTER
21+
GROUP_START = BACKSLASH + IGNORABLE
2122
MAX_CW_LETTERS = 32
2223
INTEGER_MAGNITUDE = 32
23-
CONTROL_WORD = BACKSLASH + MAX_CW_LETTERS + len(rb"-") + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
24+
PLAIN_TEXT = CONTROL_WORD = BACKSLASH + MAX_CW_LETTERS + MINUS + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
2425

2526

2627
class Entity:
2728
@classmethod
2829
def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type:
2930
logger.debug(f"in Entity.probed")
3031
original_position = file.tell()
31-
probed = file.read(len(re_patterns.probe_pattern))
32-
logger.debug(f"{probed = }")
33-
file.seek(original_position - len(probed))
34-
if (match := re_patterns.group_start.match(probed)):
35-
result = Bytestring_Type.GROUP_START
36-
elif (match := re_patterns.group_end.match(probed)):
37-
result = Bytestring_Type.GROUP_END
38-
elif (match := re_patterns.control_word.match(probed)):
39-
result = Bytestring_Type.CONTROL_WORD
40-
elif (match := re_patterns.control_symbol.match(probed)):
41-
result = Bytestring_Type.CONTROL_SYMBOL
42-
else:
43-
result = Bytestring_Type.PLAIN_TEXT
32+
while True:
33+
probed = file.read(len(re_patterns.probe_pattern))
34+
logger.debug(f"{probed = }")
35+
file.seek(original_position)
36+
logger.debug(f"Probe returned to position {file.tell()}")
37+
if (match := re_patterns.group_start.match(probed)):
38+
result = Bytestring_Type.GROUP_START
39+
elif (match := re_patterns.group_end.match(probed)):
40+
result = Bytestring_Type.GROUP_END
41+
elif (match := re_patterns.control_word.match(probed)):
42+
result = Bytestring_Type.CONTROL_WORD
43+
elif (match := re_patterns.control_symbol.match(probed)):
44+
result = Bytestring_Type.CONTROL_SYMBOL
45+
elif (match := re_patterns.plain_text.match(probed)):
46+
result = Bytestring_Type.PLAIN_TEXT
47+
else:
48+
logger.debug(f"This does not match anything, it's probably a newline, moving on")
49+
original_position += 1
50+
file.seek(original_position)
51+
logger.debug(f"Probe moved to position {file.tell()}")
52+
if not probed:
53+
logger.warning(f"Reached unexpected end of file.")
54+
raise errors.UnexpectedEndOfFileError(f"at position {file.tell()}")
55+
continue
56+
break
4457
logger.debug(f"{result = }")
58+
logger.debug(f"Probe leaving file at position {file.tell()}")
4559
return result
4660

4761

4862
class Control_Word(Entity):
4963
def __init__(self, file: io.BufferedReader) -> None:
5064
logger.debug(f"Control_Word.__init__")
65+
self.control_name = "missing"
66+
self.parameter = ""
5167
self.start_position = file.tell()
5268
logger.debug(f"Starting at file position {self.start_position}")
5369
probe = file.read(CONTROL_WORD)
5470
if (match := re_patterns.control_word.match(probe)):
55-
self.control_name = match.group("control_name")
71+
self.control_name = match.group("control_name").decode("ascii")
5672
logger.debug(f"{self.control_name = }")
57-
self.parameter = match.group("parameter")
58-
file.seek(self.start_position + match.span()[1])
73+
parameter = match.group("parameter")
74+
if parameter:
75+
self.parameter = int(parameter.decode("ascii"))
76+
logger.debug(f"{self.parameter = }")
77+
target_position = self.start_position + match.span()[1]
78+
if match.group("other"):
79+
logger.debug(f"Delimiter is {match.group('other').decode('ascii')}, len: {len(match.group('delimiter'))}")
80+
target_position -= len(match.group("delimiter"))
81+
file.seek(target_position)
82+
else:
83+
logger.warning(f"Missing Control Word")
84+
file.seek(self.start_position)
85+
def __repr__(self) -> str:
86+
name = self.control_name
87+
return f"<{self.__class__.__name__}: {name}{self.parameter}>"
88+
89+
90+
class Control_Symbol(Entity):
91+
def __init__(self, file: io.BufferedReader) -> None:
92+
self.start_position = file.tell()
93+
logger.debug(f"Starting at file position {self.start_position}")
94+
self.text = file.read(SYMBOL)[-1].decode("ascii")
95+
def __repr__(self) -> str:
96+
return f"<{self.__class__.__name__}: {self.text}>"
97+
98+
99+
class Plain_Text(Entity):
100+
def __init__(self, file: io.BufferedReader) -> None:
101+
self.start_position = file.tell()
102+
logger.debug(f"Starting at file position {self.start_position}")
103+
self.text = ""
104+
while True:
105+
read = file.read(PLAIN_TEXT)
106+
logger.debug(f"Read file up to position {file.tell()}")
107+
logger.debug(f"Read: {read}")
108+
# see if we have read all the plain text there is:
109+
if (match := re_patterns.plain_text.match(read)):
110+
logger.debug(f"This matches the plain text pattern")
111+
_text = match.group("text").decode("ascii")
112+
logger.debug(f"{_text = }")
113+
self.text = "".join((self.text, _text))
114+
logger.debug(f"{self.text = }")
115+
if len(_text) == PLAIN_TEXT:
116+
continue
117+
else:
118+
file.seek(self.start_position + len(self.text))
119+
logger.debug(f"Returned to position {file.tell()}")
120+
break
121+
else:
122+
break
123+
def __repr__(self) -> str:
124+
return f"<{self.__class__.__name__}: {self.text}>"
59125

60126

61127
class Destination_Group(Entity):
62128
def __init__(self, file: io.BufferedReader) -> None:
63129
logger.debug(f"Destination_Group.__init__")
64-
logger.debug(f"Creating destination group from {file.name}")
65130
self.known = False
66131
self.name = "unknown"
67132
self.ignorable = False
133+
self.structure = list()
134+
logger.debug(f"Creating destination group from {file.name}")
68135
self.start_position = file.tell()
69136
logger.debug(f"Starting at file position {self.start_position}")
70137
probe = file.read(GROUP_START)
71-
logger.debug(f"Read file up to position {file.tell()}")
138+
logger.debug(f"Read file up to position {file.tell()}, read {probe = }")
72139
if (match := re_patterns.group_start.match(probe)):
73140
self.known = bool(match.group("group_start"))
74141
self.ignorable = bool(match.group("ignorable"))
75142
if not self.ignorable:
76143
file.seek(-IGNORABLE, io.SEEK_CUR)
77144
logger.debug(f"Returned to position {file.tell()}")
78-
self.cw = Control_Word(file)
79-
self.name = self.cw.control_name
80145
else:
81-
logger.warning(utils.warn(f"Expected group has no start. Creating unknown group"))
82-
probed = self.probe(re_patterns.probe, file)
83-
self.content = list()
146+
logger.warning(utils.warn(f"Expected a group but found no group start. Creating unknown group"))
147+
file.seek(self.start_position)
148+
self.cw = Control_Word(file)
149+
self.name = self.cw.control_name
150+
while True:
151+
probed = self.probe(re_patterns.probe, file)
152+
if probed is Bytestring_Type.CONTROL_WORD:
153+
self.structure.append(Control_Word(file))
154+
elif probed is Bytestring_Type.GROUP_END:
155+
file.read(GROUP_END)
156+
break
157+
elif probed is Bytestring_Type.GROUP_START:
158+
self.structure.append(Destination_Group(file))
159+
elif probed is Bytestring_Type.CONTROL_SYMBOL:
160+
self.structure.append(Control_Symbol(file))
161+
else:
162+
self.structure.append(Plain_Text(file))
163+
def __repr__(self) -> str:
164+
return f"<{self.__class__.__name__}: {self.cw.control_name}{self.cw.parameter}>"
84165

85166

86167
if __name__ == "__main__":

src/rtfparse/errors.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,3 +16,7 @@ class WrongConfiguration(Error):
1616
def __init__(self, message, payload):
1717
self.message = message
1818
self.payload = payload
19+
20+
21+
class UnexpectedEndOfFileError(Error):
22+
pass

src/rtfparse/parser.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from collections import OrderedDict
88
from rtfparse import re_patterns
99
from rtfparse import entities
10+
from rtfparse import errors
1011

1112

1213
# Setup logging
@@ -59,6 +60,8 @@ def parse_file(self, file: io.BufferedReader) -> None:
5960
logger.debug(f"Parsing file {file.name}")
6061
try:
6162
self.parsed = entities.Destination_Group(file)
63+
except errors.UnexpectedEndOfFileError as err:
64+
logger.error(f"{err}")
6265
except Exception as err:
6366
logger.exception(err)
6467
finally:

src/rtfparse/re_patterns.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,12 @@ def no_capture(content: bytes) -> bytes:
3333

3434

3535
_control_characters = rb"\\\{\}"
36+
_newline = b"\\" + rb"r" + b"\\" + rb"n"
3637
control_character = group(_control_characters)
3738
not_control_character = group(rb"^" + _control_characters)
39+
_control_characters_or_newline = _control_characters + _newline
40+
control_character_or_newline = group(_control_characters + _newline)
41+
not_control_character_or_newline = group(rb"^" + _control_characters_or_newline)
3842
rtf_backslash = named_regex_group("backslash", not_preceded_by(rb"\\", rb"\\"))
3943
unnamed_rtf_backslash = not_preceded_by(rb"\\", rb"\\")
4044
_letters = rb"a-zA-Z"
@@ -50,17 +54,17 @@ def no_capture(content: bytes) -> bytes:
5054
hdigit = named_regex_group("hdigit", group(_hdigits))
5155
minus = named_regex_group("minus", rb"-?")
5256
# int16 = minus + digit + rb"{1,5}"
53-
parameter = named_regex_group("parameter", minus + digit + rb"{1,10}")
57+
parameter_pattern = named_regex_group("parameter", minus + digit + rb"{1,10}")
5458
space = named_regex_group("space", rb" ")
5559
other = named_regex_group("other", group(rb"^" + _letters + _digits))
5660

5761

5862
ascii_letter_sequence = named_regex_group("control_name", ascii_letters)
59-
delimiter = named_regex_group("delimiter", rb"|".join((rb" ", parameter, other, rb"$")))
63+
delimiter = named_regex_group("delimiter", rb"|".join((space, parameter_pattern, other, rb"$")))
6064
symbol = named_regex_group("symbol", other)
6165
control_word_pattern = named_regex_group("control_word", rtf_backslash + ascii_letter_sequence + delimiter)
6266
pcdata_delimiter = no_capture(rb"|".join((rtf_brace_open, rtf_brace_close, control_word_pattern)))
63-
plain_text_pattern = named_regex_group("text", not_control_character + rb"+") + no_capture(control_character)
67+
plain_text_pattern = named_regex_group("text", not_control_character_or_newline + rb"+") + no_capture(rb"|".join((control_character_or_newline, rb"$")))
6468
probe_pattern = rb".."
6569

6670

@@ -73,9 +77,9 @@ def regex101(self) -> None:
7377
print(self.pattern_bytes.decode("ascii"))
7478

7579

76-
probe = Bytes_Regex(named_regex_group("probe", probe_pattern))
7780
meaningful_bs = Bytes_Regex(rtf_backslash)
78-
# control_word = Bytes_Regex(rtf_backslash + ascii_letter_sequence + delimiter)
81+
probe = Bytes_Regex(named_regex_group("probe", probe_pattern), flags=re.DOTALL)
82+
parameter = Bytes_Regex(parameter_pattern)
7983
control_word = Bytes_Regex(control_word_pattern)
8084
control_symbol = Bytes_Regex(rtf_backslash + symbol)
8185
group_start = Bytes_Regex(rtf_brace_open)

src/rtfparse/version.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
#!/usr/bin/env python
22

33

4-
version = "0.2.2"
4+
version = "0.3.1"

tests/.gitignore

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)