Skip to content

Commit dae9a48

Browse files
committed
Merge remote-tracking branch 'origin/master'
2 parents cfa260a + 77de4b7 commit dae9a48

File tree

3 files changed

+458
-47
lines changed

3 files changed

+458
-47
lines changed

rubymarshal/reader.py

Lines changed: 60 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -47,67 +47,46 @@ def __init__(self, fd, registry=None):
4747
self.fd = fd
4848
self.registry = registry or global_registry
4949

50-
def read(self, token=None):
51-
if token is None:
52-
token = self.fd.read(1)
50+
def read(self, in_ivar=False):
51+
result = None
52+
object_index = None
53+
re_flags = None
54+
55+
token = self.fd.read(1)
5356

5457
# From https://docs.ruby-lang.org/en/2.1.0/marshal_rdoc.html:
5558
# The stream contains only one copy of each object for all objects except
5659
# true, false, nil, Fixnums and Symbols.
57-
object_index = None
5860
if token in (
59-
TYPE_IVAR,
6061
# TYPE_EXTENDED, TYPE_UCLASS, ????
6162
TYPE_CLASS,
6263
TYPE_MODULE,
6364
TYPE_FLOAT,
6465
TYPE_BIGNUM,
66+
TYPE_STRING,
6567
TYPE_REGEXP,
6668
TYPE_ARRAY,
6769
TYPE_HASH,
6870
TYPE_STRUCT,
6971
TYPE_OBJECT,
7072
TYPE_DATA,
7173
TYPE_USRMARSHAL,
74+
TYPE_USERDEF,
7275
):
73-
self.objects.append(None)
7476
object_index = len(self.objects)
77+
# placeholder for incomplete type
78+
self.objects.append(None)
7579

76-
result = None
7780
if token == TYPE_NIL:
7881
pass
7982
elif token == TYPE_TRUE:
8083
result = True
8184
elif token == TYPE_FALSE:
8285
result = False
8386
elif token == TYPE_IVAR:
84-
sub_token = self.fd.read(1)
85-
result = self.read(sub_token)
86-
flags = None
87-
if sub_token == TYPE_REGEXP:
88-
options = ord(self.fd.read(1))
89-
flags = 0
90-
if options & 1:
91-
flags |= re.IGNORECASE
92-
if options & 4:
93-
flags |= re.MULTILINE
94-
attributes = self.read_attributes()
95-
if sub_token in (TYPE_STRING, TYPE_REGEXP):
96-
encoding = self._get_encoding(attributes)
97-
try:
98-
result = result.decode(encoding)
99-
except UnicodeDecodeError:
100-
result = result.decode("unicode-escape")
101-
# string instance attributes are discarded
102-
if attributes and sub_token == TYPE_STRING:
103-
result = RubyString(result, attributes)
104-
if sub_token == TYPE_REGEXP:
105-
result = re.compile(str(result), flags)
106-
elif attributes:
107-
result.set_attributes(attributes)
87+
result = self.read(in_ivar=True)
10888
elif token == TYPE_STRING:
109-
size = self.read_long()
110-
result = self.fd.read(size)
89+
result = self.read_blob()
11190
elif token == TYPE_SYMBOL:
11291
result = self.read_symreal()
11392
elif token == TYPE_FIXNUM:
@@ -125,8 +104,7 @@ def read(self, token=None):
125104
result[key] = value
126105
result = result
127106
elif token == TYPE_FLOAT:
128-
size = self.read_long()
129-
floatn = self.fd.read(size)
107+
floatn = self.read_blob()
130108
floatn = floatn.split(b"\0")
131109
result = float(floatn[0].decode("utf-8"))
132110
elif token == TYPE_BIGNUM:
@@ -139,8 +117,13 @@ def read(self, token=None):
139117
factor *= 2**16
140118
result *= sign
141119
elif token == TYPE_REGEXP:
142-
size = self.read_long()
143-
result = self.fd.read(size)
120+
result = self.read_blob()
121+
options = ord(self.fd.read(1))
122+
re_flags = 0
123+
if options & 1:
124+
re_flags |= re.IGNORECASE
125+
if options & 4:
126+
re_flags |= re.MULTILINE
144127
elif token == TYPE_USRMARSHAL:
145128
class_symbol = self.read()
146129
if not isinstance(class_symbol, Symbol):
@@ -159,15 +142,23 @@ def read(self, token=None):
159142
result = self.read_symlink()
160143
elif token == TYPE_LINK:
161144
link_id = self.read_long()
162-
if object_index and link_id >= object_index:
145+
if link_id > len(self.objects):
163146
raise ValueError(
164-
"invalid link destination: %d should be lower than %d."
165-
% (link_id, object_index)
147+
"invalid link destination: %d should be lower than %d or equal."
148+
% (link_id, len(self.objects))
166149
)
150+
# According to the documentation, objects are counted from 1.
151+
# But it looks like they did not take the outermost object into account.
167152
result = self.objects[link_id]
153+
if result is None:
154+
# link to incomplete object
155+
raise ValueError(
156+
"invalid link destination: Object id %d is not yet unmarshaled."
157+
% (link_id)
158+
)
168159
elif token == TYPE_USERDEF:
169160
class_symbol = self.read()
170-
private_data = self.read(TYPE_STRING)
161+
private_data = self.read_blob()
171162
if not isinstance(class_symbol, Symbol):
172163
raise ValueError("invalid class name: %r" % class_symbol)
173164
class_name = class_symbol.name
@@ -181,7 +172,7 @@ def read(self, token=None):
181172
# noinspection PyProtectedMember
182173
result._load(private_data)
183174
elif token == TYPE_MODULE:
184-
data = self.read(TYPE_STRING)
175+
data = self.read_blob()
185176
module_name = data.decode()
186177
result = Module(module_name, None)
187178
elif token == TYPE_OBJECT:
@@ -197,10 +188,10 @@ def read(self, token=None):
197188
attributes = self.read_attributes()
198189
result = python_class(class_name, attributes)
199190
elif token == TYPE_EXTENDED:
200-
class_name = self.read(TYPE_STRING)
191+
class_name = self.read_blob()
201192
result = Extended(class_name, None)
202193
elif token == TYPE_CLASS:
203-
data = self.read(TYPE_STRING)
194+
data = self.read_blob()
204195
class_name = data.decode()
205196
if class_name in self.registry:
206197
result = self.registry[class_name]
@@ -212,8 +203,27 @@ def read(self, token=None):
212203
)
213204
else:
214205
raise ValueError("token %s is not recognized" % token)
206+
207+
if in_ivar:
208+
# The object has attributes.
209+
attributes = self.read_attributes()
210+
if token in (TYPE_STRING, TYPE_REGEXP):
211+
encoding = self._get_encoding(attributes)
212+
try:
213+
result = result.decode(encoding)
214+
except UnicodeDecodeError:
215+
result = result.decode("unicode-escape")
216+
# string instance attributes are discarded (on regex?)
217+
if attributes and token == TYPE_STRING:
218+
result = RubyString(result, attributes)
219+
elif attributes:
220+
result.set_attributes(attributes)
221+
222+
if token == TYPE_REGEXP:
223+
result = re.compile(str(result), re_flags)
224+
215225
if object_index is not None:
216-
self.objects[object_index - 1] = result
226+
self.objects[object_index] = result
217227
return result
218228

219229
@staticmethod
@@ -254,6 +264,10 @@ def read_long(self):
254264
result = result - factor
255265
return result
256266

267+
def read_blob(self):
    """Read one length-prefixed run of raw bytes from the stream.

    The length is taken from ``self.read_long()`` and that many bytes are
    then requested from ``self.fd``.

    Returns:
        Whatever ``self.fd.read(size)`` returns for the decoded length.

    Raises:
        ValueError: if the decoded length is negative.
    """
    size = self.read_long()
    # With standard Python file objects, read() with a negative size
    # consumes everything up to EOF, so a malformed (negative) length
    # would silently swallow the rest of the stream instead of a bounded
    # blob. Reject it explicitly.
    if size < 0:
        raise ValueError("invalid blob length: %d" % size)
    return self.fd.read(size)
270+
257271
def read_symbol(self):
258272
ivar = 0
259273
while True:
@@ -274,8 +288,7 @@ def read_symlink(self):
274288
return self.symbols[symlink_id]
275289

276290
def read_symreal(self):
    """Read a symbol definition, record it, and return it.

    The symbol's name is read as a length-prefixed blob and decoded as
    UTF-8. The resulting ``Symbol`` is appended to ``self.symbols`` before
    it is returned; ``read_symlink`` looks symbols up in that list by index.
    """
    name = self.read_blob().decode("utf-8")
    symbol = Symbol(name)
    self.symbols.append(symbol)
    return symbol

0 commit comments

Comments
 (0)