Skip to content

Commit ba0e249

Browse files
author
Sven Siegmund
committed
Parsing started
1 parent 176e97e commit ba0e249

File tree

8 files changed

+130
-15
lines changed

8 files changed

+130
-15
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def get_property(property: str, path_to_init_file: pathlib.Path) -> str:
7373
python_requires=">=3.9",
7474
install_requires=requirements,
7575
entry_points={
76-
"console_scripts": [f"{project_name} = {project_name}.__main__:main",
76+
"console_scripts": [f"{project_name} = {project_name}.__main__:{project_name}",
7777
],
7878
},
7979
platforms=["any"],

src/rtfparse/__main__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
from rtfparse import entry
77

88

9-
def main():
9+
def rtfparse():
1010
sys.exit(entry.cli_start(version.version))
1111

1212

1313
if __name__ == "__main__":
14-
main()
14+
rtfparse()

src/rtfparse/bytestring_types.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python
2+
3+
4+
"""
5+
Control Word Types
6+
"""
7+
8+
9+
from rtfparse import re_patterns
10+
11+
12+
class Plain_Text:
    """Empty marker class with no attributes or behaviour.

    NOTE(review): presumably tags plain-text runs of an RTF byte stream;
    nothing in this module uses it yet, so confirm against callers.
    """

    pass
14+
15+
16+
class Group_Start:
    """Empty marker class with no attributes or behaviour.

    NOTE(review): presumably tags the start of an RTF group; nothing in this
    module uses it yet, so confirm against callers.
    """

    pass
18+
19+
20+
class Group_End:
    """Empty marker class with no attributes or behaviour.

    NOTE(review): presumably tags the end of an RTF group; nothing in this
    module uses it yet, so confirm against callers.
    """

    pass
22+
23+
24+
class Cwtype:
    """Base class for control word types; holds the shared default delimiter."""

    # Text appended to a control word before it is matched (a single space).
    default_delimiter = " "
26+
27+
28+
class Flag(Cwtype):
    """Control word type that is matched against the shared control-word pattern."""

    # Pattern object that every instance matches its text against.
    native_pattern = re_patterns.control_word

    def __init__(self, pattern: str) -> None:
        """Match ``pattern`` plus the default delimiter and keep the result.

        Args:
            pattern: ASCII text of the control word. ``default_delimiter`` is
                appended and the result is encoded as ASCII before matching.

        Raises:
            UnicodeEncodeError: If ``pattern`` contains non-ASCII characters.
        """
        candidate = (pattern + self.default_delimiter).encode("ascii")
        # NOTE(review): other modules call ``re_patterns.control_word.match(...)``
        # directly; confirm that the ``.pattern`` attribute used here exists
        # on that object and exposes ``match``.
        self.something = self.native_pattern.pattern.match(candidate)
32+
33+
34+
if __name__ == "__main__":
35+
pass

src/rtfparse/entities.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,84 @@
33

44
import io
55
import logging
6+
import re
67
from rtfparse import re_patterns
8+
from rtfparse import utils
9+
from rtfparse.enums import Bytestring_Type
710

811

912
# Setup logging
1013
logger = logging.getLogger(__name__)
1114

1215

13-
class Destination_Group:
16+
# Constants: number of bytes to read when creating entities.
CHARACTER = BACKSLASH = len(b"\\")
IGNORABLE = BACKSLASH + len(rb"*")
# rb"x" stands in for the one-byte group-opening brace. The literal brace is
# kept out of the source so that an unbalanced brace cannot upset vim's
# auto-indent.
GROUP_START = len(rb"x") + IGNORABLE
DELIMITER = len(rb" ")
MAX_CW_LETTERS = 32
INTEGER_MAGNITUDE = 32
# Longest readable control word: backslash, up to MAX_CW_LETTERS letters, an
# optional minus sign, the decimal digits of 2 ** (INTEGER_MAGNITUDE - 1), and
# one trailing delimiter byte.
CONTROL_WORD = BACKSLASH + MAX_CW_LETTERS + len(rb"-") + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
24+
25+
26+
class Entity:
    """Base class for RTF entities; provides look-ahead classification."""

    @classmethod
    def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type:
        """Classify the bytes at the current file position without consuming them.

        Reads ``len(re_patterns.probe_pattern)`` bytes, restores the file
        position, and tests the bytes against the known patterns in a fixed
        order: group start, group end, control word, control symbol.

        Args:
            pattern: Currently unused; kept so existing callers keep working.
            file: Seekable binary stream positioned at the bytes to classify.

        Returns:
            The ``Bytestring_Type`` of the first pattern that matches, or
            ``Bytestring_Type.PLAIN_TEXT`` when none does.
        """
        logger.debug(f"in Entity.probed")
        original_position = file.tell()
        probed = file.read(len(re_patterns.probe_pattern))
        logger.debug(f"{probed = }")
        # Return to exactly where the read started. Subtracting len(probed)
        # from the saved position rewound too far, and raised on a negative
        # offset when probing near the start of the file.
        file.seek(original_position)
        if re_patterns.group_start.match(probed):
            result = Bytestring_Type.GROUP_START
        elif re_patterns.group_end.match(probed):
            result = Bytestring_Type.GROUP_END
        elif re_patterns.control_word.match(probed):
            result = Bytestring_Type.CONTROL_WORD
        elif re_patterns.control_symbol.match(probed):
            result = Bytestring_Type.CONTROL_SYMBOL
        else:
            result = Bytestring_Type.PLAIN_TEXT
        logger.debug(f"{result = }")
        return result
46+
47+
48+
class Control_Word(Entity):
    """An RTF control word read from the current position of a binary stream.

    Attributes:
        start_position: File offset at which reading began.
        control_name: The "control_name" group of the match, or ``None`` when
            no control word was found at ``start_position``.
        parameter: The "parameter" group of the match, or ``None`` when the
            word has no parameter or no control word was found.
    """

    def __init__(self, file: io.BufferedReader) -> None:
        """Read one control word from ``file`` and position it just past the match.

        Args:
            file: Seekable binary stream positioned at the control word.
        """
        logger.debug(f"Control_Word.__init__")
        # Set defaults first so both attributes exist even when nothing matches.
        self.control_name = None
        self.parameter = None
        self.start_position = file.tell()
        logger.debug(f"Starting at file position {self.start_position}")
        # Read the longest possible control word, then keep only what matched.
        probe = file.read(CONTROL_WORD)
        match = re_patterns.control_word.match(probe)
        if match:
            self.control_name = match.group("control_name")
            logger.debug(f"{self.control_name = }")
            self.parameter = match.group("parameter")
            file.seek(self.start_position + match.span()[1])
        else:
            # Nothing was recognised, so do not leave the look-ahead consumed.
            file.seek(self.start_position)
59+
60+
61+
class Destination_Group(Entity):
    """An RTF group read from the current position of a binary stream.

    Attributes:
        known: True when a group start was matched at ``start_position``.
        name: Control name of the group's leading control word, or
            ``"unknown"`` when none was read.
        ignorable: True when the "ignorable" part of the group start matched.
        start_position: File offset at which reading began.
        cw: The leading ``Control_Word``; only set when a group start matched.
        content: List intended to hold the group's content (left empty here).
    """

    def __init__(self, file: io.BufferedReader) -> None:
        """Read the group start and its leading control word from ``file``.

        Args:
            file: Seekable binary stream positioned at the expected group start.
        """
        logger.debug(f"Destination_Group.__init__")
        logger.debug(f"Creating destination group from {file.name}")
        self.known = False
        self.name = "unknown"
        self.ignorable = False
        self.start_position = file.tell()
        logger.debug(f"Starting at file position {self.start_position}")
        probe = file.read(GROUP_START)
        logger.debug(f"Read file up to position {file.tell()}")
        match = re_patterns.group_start.match(probe)
        if match:
            self.known = bool(match.group("group_start"))
            self.ignorable = bool(match.group("ignorable"))
            if not self.ignorable:
                # The look-ahead also covered the optional ignorable marker;
                # give those bytes back when the marker was not present.
                file.seek(-IGNORABLE, io.SEEK_CUR)
                logger.debug(f"Returned to position {file.tell()}")
            self.cw = Control_Word(file)
            # Keep the "unknown" default when no control word could be read,
            # instead of failing on a missing attribute.
            control_name = getattr(self.cw, "control_name", None)
            if control_name is not None:
                self.name = control_name
        else:
            logger.warning(utils.warn(f"Expected group has no start. Creating unknown group"))
        # The classification of the following bytes is not used yet; the call
        # is kept for its debug logging and file repositioning.
        self.probe(re_patterns.probe, file)
        self.content = []
1684

1785

1886
if __name__ == "__main__":

src/rtfparse/enums.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env python
2+
3+
4+
from enum import Enum, unique, auto
5+
6+
7+
@unique
class Bytestring_Type(Enum):
    """Kinds of byte sequence distinguished by this package.

    Values are assigned by ``auto()`` in declaration order, and ``@unique``
    guarantees that no two members share a value.
    """

    GROUP_START = auto()
    GROUP_END = auto()
    CONTROL_WORD = auto()
    CONTROL_SYMBOL = auto()
    PLAIN_TEXT = auto()
14+
15+
16+
if __name__ == "__main__":
17+
pass

src/rtfparse/parser.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,6 @@
1515

1616
class Rtf_Parser:
1717
probe_len = 42
18-
@classmethod
19-
def matchseek(cls, pattern: re.Pattern, file: io.BufferedReader) -> re.Match:
20-
original_position = file.tell()
21-
if (match := pattern.match(file.read(cls.probe_len))):
22-
logger.debug(f"{match = }")
23-
file.seek(original_position + match.span()[1])
24-
return match
2518
@staticmethod
2619
def start_group(match: re.Match) -> None:
2720
logger.debug(f"Starting group")
@@ -67,7 +60,7 @@ def parse_file(self, file: io.BufferedReader) -> None:
6760
try:
6861
self.parsed = entities.Destination_Group(file)
6962
except Exception as err:
70-
logger.error(f"Error: {err}")
63+
logger.exception(err)
7164
finally:
7265
logger.debug(f"Parsing file {file.name} finished")
7366

src/rtfparse/re_patterns.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,18 @@ def no_capture(content: bytes) -> bytes:
5050
hdigit = named_regex_group("hdigit", group(_hdigits))
5151
minus = named_regex_group("minus", rb"-?")
5252
# int16 = minus + digit + rb"{1,5}"
53-
parameter = named_regex_group("param", minus + digit + rb"{1,10}")
53+
parameter = named_regex_group("parameter", minus + digit + rb"{1,10}")
5454
space = named_regex_group("space", rb" ")
5555
other = named_regex_group("other", group(rb"^" + _letters + _digits))
5656

5757

5858
ascii_letter_sequence = named_regex_group("control_name", ascii_letters)
59-
delimiter = named_regex_group("delimiter", rb"|".join((rb" ", parameter, other)))
59+
delimiter = named_regex_group("delimiter", rb"|".join((rb" ", parameter, other, rb"$")))
6060
symbol = named_regex_group("symbol", other)
6161
control_word_pattern = named_regex_group("control_word", rtf_backslash + ascii_letter_sequence + delimiter)
6262
pcdata_delimiter = no_capture(rb"|".join((rtf_brace_open, rtf_brace_close, control_word_pattern)))
6363
plain_text_pattern = named_regex_group("text", not_control_character + rb"+") + no_capture(control_character)
64+
probe_pattern = rb".."
6465

6566

6667
class Bytes_Regex():
@@ -72,6 +73,7 @@ def regex101(self) -> None:
7273
print(self.pattern_bytes.decode("ascii"))
7374

7475

76+
probe = Bytes_Regex(named_regex_group("probe", probe_pattern))
7577
meaningful_bs = Bytes_Regex(rtf_backslash)
7678
# control_word = Bytes_Regex(rtf_backslash + ascii_letter_sequence + delimiter)
7779
control_word = Bytes_Regex(control_word_pattern)

src/rtfparse/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
#!/usr/bin/env python
22

33

4-
version = "0.2.1"
4+
version = "0.2.2"

0 commit comments

Comments
 (0)