@@ -47,67 +47,46 @@ def __init__(self, fd, registry=None):
4747 self .fd = fd
4848 self .registry = registry or global_registry
4949
50- def read (self , token = None ):
51- if token is None :
52- token = self .fd .read (1 )
50+ def read (self , in_ivar = False ):
51+ result = None
52+ object_index = None
53+ re_flags = None
54+
55+ token = self .fd .read (1 )
5356
5457 # From https://docs.ruby-lang.org/en/2.1.0/marshal_rdoc.html:
5558 # The stream contains only one copy of each object for all objects except
5659 # true, false, nil, Fixnums and Symbols.
57- object_index = None
5860 if token in (
59- TYPE_IVAR ,
6061 # TYPE_EXTENDED, TYPE_UCLASS, ????
6162 TYPE_CLASS ,
6263 TYPE_MODULE ,
6364 TYPE_FLOAT ,
6465 TYPE_BIGNUM ,
66+ TYPE_STRING ,
6567 TYPE_REGEXP ,
6668 TYPE_ARRAY ,
6769 TYPE_HASH ,
6870 TYPE_STRUCT ,
6971 TYPE_OBJECT ,
7072 TYPE_DATA ,
7173 TYPE_USRMARSHAL ,
74+ TYPE_USERDEF ,
7275 ):
73- self .objects .append (None )
7476 object_index = len (self .objects )
77+ # placeholder for incomplete type
78+ self .objects .append (None )
7579
76- result = None
7780 if token == TYPE_NIL :
7881 pass
7982 elif token == TYPE_TRUE :
8083 result = True
8184 elif token == TYPE_FALSE :
8285 result = False
8386 elif token == TYPE_IVAR :
84- sub_token = self .fd .read (1 )
85- result = self .read (sub_token )
86- flags = None
87- if sub_token == TYPE_REGEXP :
88- options = ord (self .fd .read (1 ))
89- flags = 0
90- if options & 1 :
91- flags |= re .IGNORECASE
92- if options & 4 :
93- flags |= re .MULTILINE
94- attributes = self .read_attributes ()
95- if sub_token in (TYPE_STRING , TYPE_REGEXP ):
96- encoding = self ._get_encoding (attributes )
97- try :
98- result = result .decode (encoding )
99- except UnicodeDecodeError :
100- result = result .decode ("unicode-escape" )
101- # string instance attributes are discarded
102- if attributes and sub_token == TYPE_STRING :
103- result = RubyString (result , attributes )
104- if sub_token == TYPE_REGEXP :
105- result = re .compile (str (result ), flags )
106- elif attributes :
107- result .set_attributes (attributes )
87+ result = self .read (in_ivar = True )
10888 elif token == TYPE_STRING :
109- size = self .read_long ()
110- result = self .fd .read (size )
89+ result = self .read_blob ()
11190 elif token == TYPE_SYMBOL :
11291 result = self .read_symreal ()
11392 elif token == TYPE_FIXNUM :
@@ -125,8 +104,7 @@ def read(self, token=None):
125104 result [key ] = value
126105 result = result
127106 elif token == TYPE_FLOAT :
128- size = self .read_long ()
129- floatn = self .fd .read (size )
107+ floatn = self .read_blob ()
130108 floatn = floatn .split (b"\0 " )
131109 result = float (floatn [0 ].decode ("utf-8" ))
132110 elif token == TYPE_BIGNUM :
@@ -139,8 +117,13 @@ def read(self, token=None):
139117 factor *= 2 ** 16
140118 result *= sign
141119 elif token == TYPE_REGEXP :
142- size = self .read_long ()
143- result = self .fd .read (size )
120+ result = self .read_blob ()
121+ options = ord (self .fd .read (1 ))
122+ re_flags = 0
123+ if options & 1 :
124+ re_flags |= re .IGNORECASE
125+ if options & 4 :
126+ re_flags |= re .MULTILINE
144127 elif token == TYPE_USRMARSHAL :
145128 class_symbol = self .read ()
146129 if not isinstance (class_symbol , Symbol ):
@@ -159,15 +142,23 @@ def read(self, token=None):
159142 result = self .read_symlink ()
160143 elif token == TYPE_LINK :
161144 link_id = self .read_long ()
162- if object_index and link_id >= object_index :
145+ if link_id > len ( self . objects ) :
163146 raise ValueError (
164- "invalid link destination: %d should be lower than %d."
165- % (link_id , object_index )
147+ "invalid link destination: %d should be lower than %d or equal ."
148+ % (link_id , len ( self . objects ) )
166149 )
150+ # According to the documentation, objects are counted from 1.
151+ # But it looks like they did not take the outermost object into account.
167152 result = self .objects [link_id ]
153+ if result is None :
154+ # link to incomplete object
155+ raise ValueError (
156+ "invalid link destination: Object id %d is not yet unmarshaled."
157+ % (link_id )
158+ )
168159 elif token == TYPE_USERDEF :
169160 class_symbol = self .read ()
170- private_data = self .read ( TYPE_STRING )
161+ private_data = self .read_blob ( )
171162 if not isinstance (class_symbol , Symbol ):
172163 raise ValueError ("invalid class name: %r" % class_symbol )
173164 class_name = class_symbol .name
@@ -181,7 +172,7 @@ def read(self, token=None):
181172 # noinspection PyProtectedMember
182173 result ._load (private_data )
183174 elif token == TYPE_MODULE :
184- data = self .read ( TYPE_STRING )
175+ data = self .read_blob ( )
185176 module_name = data .decode ()
186177 result = Module (module_name , None )
187178 elif token == TYPE_OBJECT :
@@ -197,10 +188,10 @@ def read(self, token=None):
197188 attributes = self .read_attributes ()
198189 result = python_class (class_name , attributes )
199190 elif token == TYPE_EXTENDED :
200- class_name = self .read ( TYPE_STRING )
191+ class_name = self .read_blob ( )
201192 result = Extended (class_name , None )
202193 elif token == TYPE_CLASS :
203- data = self .read ( TYPE_STRING )
194+ data = self .read_blob ( )
204195 class_name = data .decode ()
205196 if class_name in self .registry :
206197 result = self .registry [class_name ]
@@ -212,8 +203,27 @@ def read(self, token=None):
212203 )
213204 else :
214205 raise ValueError ("token %s is not recognized" % token )
206+
207+ if in_ivar :
208+ # The object has attributes.
209+ attributes = self .read_attributes ()
210+ if token in (TYPE_STRING , TYPE_REGEXP ):
211+ encoding = self ._get_encoding (attributes )
212+ try :
213+ result = result .decode (encoding )
214+ except UnicodeDecodeError :
215+ result = result .decode ("unicode-escape" )
216+ # string instance attributes are discarded (on regex?)
217+ if attributes and token == TYPE_STRING :
218+ result = RubyString (result , attributes )
219+ elif attributes :
220+ result .set_attributes (attributes )
221+
222+ if token == TYPE_REGEXP :
223+ result = re .compile (str (result ), re_flags )
224+
215225 if object_index is not None :
216- self .objects [object_index - 1 ] = result
226+ self .objects [object_index ] = result
217227 return result
218228
219229 @staticmethod
@@ -254,6 +264,10 @@ def read_long(self):
254264 result = result - factor
255265 return result
256266
def read_blob(self):
    """Read a length-prefixed run of raw bytes from the stream.

    The length is decoded with ``self.read_long()`` and that many bytes
    are then read from ``self.fd``.

    Returns:
        The raw ``bytes`` payload (not decoded).

    Raises:
        ValueError: if the encoded length is negative, or if the stream
            ends before the announced number of bytes could be read.
    """
    size = self.read_long()
    if size < 0:
        # A negative size must never reach ``fd.read``: file-like objects
        # interpret it as "read everything until EOF".
        raise ValueError("invalid blob length: %d" % size)
    data = self.fd.read(size)
    if len(data) != size:
        # Truncated input: fail here instead of handing back a short blob
        # that would be misparsed further down.
        raise ValueError(
            "unexpected end of stream: expected %d bytes, got %d"
            % (size, len(data))
        )
    return data
270+
257271 def read_symbol (self ):
258272 ivar = 0
259273 while True :
@@ -274,8 +288,7 @@ def read_symlink(self):
274288 return self .symbols [symlink_id ]
275289
def read_symreal(self):
    """Read a symbol definition and record it in the symbol table.

    The symbol name is read as a length-prefixed blob and decoded as
    UTF-8. The resulting ``Symbol`` is appended to ``self.symbols`` so that
    later symlinks can refer to it by index, and is also returned.
    """
    name = self.read_blob().decode("utf-8")
    symbol = Symbol(name)
    self.symbols.append(symbol)
    return symbol
0 commit comments