44import io
55import logging
66import re
7- from itertools import count
7+ # Own modules
88from rtfparse import re_patterns
99from rtfparse import utils
1010from rtfparse import errors
11+ from rtfparse import config_loader
1112from rtfparse .enums import Bytestring_Type
1213
1314
# Sizes, in bytes, of the fixed-length pieces read from the file. They are
# derived from each other so that the read lengths used below stay in sync.
CHARACTER = BACKSLASH = DELIMITER = MINUS = GROUP_END = len(b"\\")
SYMBOL = IGNORABLE = BACKSLASH + CHARACTER
GROUP_START = BACKSLASH + IGNORABLE
MAX_CW_LETTERS = 32  # As specified in RTF Spec
INTEGER_MAGNITUDE = 32  # As specified in RTF Spec
# Read length for control words and plain-text chunks: a backslash, the
# maximum number of letters, a minus sign, the decimal digits of
# 2 ** (INTEGER_MAGNITUDE - 1), and one delimiter.
PLAIN_TEXT = CONTROL_WORD = (
    BACKSLASH
    + MAX_CW_LETTERS
    + MINUS
    + len(str((1 << INTEGER_MAGNITUDE) // 2))
    + DELIMITER
)
2526
2627
2728class Entity :
29+ def __init__ (self ) -> None :
30+ self .text = ""
2831 @classmethod
2932 def probe (cls , pattern : re_patterns .Bytes_Regex , file : io .BufferedReader ) -> Bytestring_Type :
30- logger .debug (f"in Entity.probed " )
33+ logger .debug (f"Probing file at position { file . tell () } " )
3134 original_position = file .tell ()
3235 while True :
3336 probed = file .read (len (re_patterns .probe_pattern ))
@@ -54,84 +57,101 @@ def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Byt
5457 raise errors .UnexpectedEndOfFileError (f"at position { file .tell ()} " )
5558 continue
5659 break
57- logger .debug (f"{ result = } " )
60+ logger .debug (f"Probe { result = } " )
5861 logger .debug (f"Probe leaving file at position { file .tell ()} " )
5962 return result
6063
6164
class Control_Word(Entity):
    """A control word read from the current position of an RTF file.

    Attributes set by the constructor:
        config: The configuration object passed in.
        control_name: Decoded name of the control word, or ``"missing"`` when
            the bytes at the current position do not match the control-word
            pattern.
        parameter: The numeric parameter as ``int``, or ``""`` when the
            control word carries none.
        start_position: File offset at which reading started.
    """

    def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
        """Read one control word from ``file``.

        Reads ``CONTROL_WORD`` bytes and matches them against
        ``re_patterns.control_word``. On a match the file is repositioned to
        the end of the match, minus the delimiter when the pattern's ``other``
        group matched. Without a match a warning is logged and the file is
        rewound to where reading started.

        Args:
            config: Configuration providing ``default_encoding``.
            file: Binary file positioned at the start of the control word.
        """
        super().__init__()
        self.config = config
        logger.debug(f"Reading Control Word at file position {file.tell()}")
        self.control_name = "missing"
        self.parameter = ""
        self.start_position = file.tell()
        logger.debug(f"Starting at file position {self.start_position}")
        probe = file.read(CONTROL_WORD)
        match = re_patterns.control_word.match(probe)
        if match is None:
            logger.warning("Missing Control Word")
            file.seek(self.start_position)
            return
        encoding = self.config.default_encoding
        self.control_name = match.group("control_name").decode(encoding)
        logger.debug(f"Preliminary {self.control_name = }")
        parameter = match.group("parameter")
        # Truthiness covers both "group did not participate" (None) and an
        # empty match, which int() could not convert.
        if parameter:
            parameter_text = parameter.decode(encoding)
            self.parameter = int(parameter_text)
            logger.debug(f"{self.parameter = }")
            # Strip the parameter exactly as written in the file; stripping
            # str(int(...)) would leave digits behind for a parameter written
            # with leading zeros.
            self.control_name = self.control_name.removesuffix(parameter_text)
            logger.debug(f"Final {self.control_name = }")
        target_position = self.start_position + match.span()[1]
        if match.group("other"):
            logger.debug(f"Delimiter is {match.group('other').decode(encoding)}, len: {len(match.group('delimiter'))}")
            # Leave the delimiter unread so that it is parsed as the next entity.
            target_position -= len(match.group("delimiter"))
        file.seek(target_position)

    def __repr__(self) -> str:
        """Return ``<ClassName: name+parameter>`` for debugging output."""
        return f"<{self.__class__.__name__}: {self.control_name}{self.parameter}>"
8894
8995
class Control_Symbol(Entity):
    """A control symbol (backslash followed by one character) read from the file.

    Attributes set by the constructor:
        config: The configuration object passed in.
        start_position: File offset at which reading started.
        char: The two hex digits following ``\\'``, or ``""`` for any other
            symbol.
        text: The symbol character, or the decoded character for ``\\'hh``.
    """

    def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
        """Read one control symbol from ``file``.

        Args:
            config: Configuration providing ``default_encoding``.
            file: Binary file positioned at the backslash of the symbol.

        Raises:
            errors.UnexpectedEndOfFileError: If the file ends before the
                symbol (or the hex digits of a ``\\'hh`` escape) is complete.
        """
        super().__init__()
        self.config = config
        self.start_position = file.tell()
        logger.debug(f"Reading Symbol at file position {self.start_position}")
        self.char = ""
        symbol = file.read(SYMBOL)
        if len(symbol) < SYMBOL:
            # Report truncation the same way probing does instead of failing
            # with an IndexError on the subscript below.
            raise errors.UnexpectedEndOfFileError(f"at position {file.tell()}")
        self.text = chr(symbol[-1])
        if self.text == "'":
            hex_digits = file.read(SYMBOL)
            if len(hex_digits) < SYMBOL:
                raise errors.UnexpectedEndOfFileError(f"at position {file.tell()}")
            encoding = self.config.default_encoding
            self.char = hex_digits.decode(encoding)
            # The two hex digits encode a single byte in the configured encoding.
            self.text = bytes((int(self.char, base=16),)).decode(encoding)
            logger.debug(f"Encountered escaped ANSI character, read two more bytes: {self.char}, character: {self.text}")
        # NOTE(review): the file is rewound by SYMBOL bytes when the resulting
        # character is a backslash or a brace; the reason is not visible in
        # this block - confirm against the caller before changing it.
        if self.text in "\\{}":
            file.seek(file.tell() - SYMBOL)

    def __repr__(self) -> str:
        """Return ``<ClassName: text>`` for debugging output."""
        return f"<{self.__class__.__name__}: {self.text}>"
97112
98113
class Plain_Text(Entity):
    """A run of plain text read from the current position of the file.

    Attributes set by the constructor:
        config: The configuration object passed in.
        start_position: File offset at which the text run started.
        text: The decoded text that was read.
    """

    def __init__(self, config: config_loader.Config, file: io.BufferedReader) -> None:
        """Read plain text from ``file`` in chunks of ``PLAIN_TEXT`` bytes.

        Each chunk is matched against ``re_patterns.plain_text``. Reading
        continues while a chunk consists entirely of text; afterwards the file
        is positioned directly behind the last text byte.

        Args:
            config: Configuration providing ``default_encoding``.
            file: Binary file positioned at the start of the text.
        """
        super().__init__()
        self.config = config
        self.text = ""
        logger.debug("Constructing Plain_Text")
        # Keep the entity's own start fixed; per-chunk offsets are tracked
        # separately so the attribute does not end up pointing at the last chunk.
        self.start_position = file.tell()
        while True:
            chunk_start = file.tell()
            read = file.read(PLAIN_TEXT)
            logger.debug(f"Read file from {chunk_start} to position {file.tell()}, read: {read}")
            # see if we have read all the plain text there is:
            match = re_patterns.plain_text.match(read)
            if match is None:
                file.seek(chunk_start)
                break
            logger.debug("This matches the plain text pattern")
            raw_text = match.group("text")
            _text = raw_text.decode(self.config.default_encoding)
            logger.debug(f"{_text = }")
            self.text = "".join((self.text, _text))
            logger.debug(f"{self.text = }")
            # Compare and seek in bytes: the decoded length differs from the
            # byte length whenever the encoding is not single-byte.
            if len(raw_text) != PLAIN_TEXT:
                file.seek(chunk_start + len(raw_text))
                break
        logger.debug(f"Returned to position {file.tell()}")

    def __repr__(self) -> str:
        """Return ``<ClassName: text>`` for debugging output."""
        return f"<{self.__class__.__name__}: {self.text}>"
125142
126143
127- class Destination_Group (Entity ):
128- def __init__ (self , file : io .BufferedReader ) -> None :
129- logger .debug (f"Destination_Group.__init__" )
144+ class Group (Entity ):
145+ def __init__ (self , config : config_loader .Config , file : io .BufferedReader ) -> None :
146+ super ().__init__ ()
147+ logger .debug (f"Group.__init__" )
148+ self .config = config
130149 self .known = False
131150 self .name = "unknown"
132151 self .ignorable = False
133152 self .structure = list ()
134- logger .debug (f"Creating destination group from { file .name } " )
153+ parsed_object = utils .what_is_being_parsed (file )
154+ logger .debug (f"Creating destination group from { parsed_object } " )
135155 self .start_position = file .tell ()
136156 logger .debug (f"Starting at file position { self .start_position } " )
137157 probe = file .read (GROUP_START )
@@ -145,23 +165,24 @@ def __init__(self, file: io.BufferedReader) -> None:
145165 else :
146166 logger .warning (utils .warn (f"Expected a group but found no group start. Creating unknown group" ))
147167 file .seek (self .start_position )
148- self .cw = Control_Word (file )
149- self .name = self .cw .control_name
150168 while True :
151169 probed = self .probe (re_patterns .probe , file )
152170 if probed is Bytestring_Type .CONTROL_WORD :
153- self .structure .append (Control_Word (file ))
171+ self .structure .append (Control_Word (self . config , file ))
154172 elif probed is Bytestring_Type .GROUP_END :
155173 file .read (GROUP_END )
156174 break
157175 elif probed is Bytestring_Type .GROUP_START :
158- self .structure .append (Destination_Group ( file ))
176+ self .structure .append (Group ( self . config , file ))
159177 elif probed is Bytestring_Type .CONTROL_SYMBOL :
160- self .structure .append (Control_Symbol (file ))
178+ self .structure .append (Control_Symbol (self . config , file ))
161179 else :
162- self .structure .append (Plain_Text (file ))
180+ self .structure .append (Plain_Text (self . config , file ))
163181 def __repr__ (self ) -> str :
164- return f"<{ self .__class__ .__name__ } : { self .cw .control_name } { self .cw .parameter } >"
182+ cwinfo = ""
183+ if isinstance (self .structure [0 ], Control_Word ):
184+ cwinfo = f" { self .structure [0 ].control_name } "
185+ return f"<Group{ cwinfo } >"
165186
166187
167188if __name__ == "__main__" :
0 commit comments