(de-)serializer for arbitrary python objects

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
4
down vote

favorite












My program is intended to allow (de-)serialization of arbitrary Python objects. It's basically a replacement for python's pickle module, which is similar but prone to arbitrary code execution. I think I've managed to make my implementation more versatile and more secure than pickle, but I'm not an expert in security.



I've come here to ask the question:



Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?



As far as I can tell, I've avoided executing any untrusted code:



  • Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

  • Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?




The code is split into 3 files.



I apologize for the length of the code, but there isn't much I can remove. That said, I think the "interesting" functions are serialize, deserialize, deserialize_next and decode_type. The class-specific (de-)serializers (like encode_dict and decode_dict) should be secure.



First, the public interface. These are the contents of public_api.py:



from .internals import *


def to_str(obj):
"""
Serializes an object to a string.

:param obj: the object to serialize
:return: the serialized object
"""
return serialize(obj)


def load(data):
"""
Deserializes an object.
The input can be a string, bytes, or file-like object.

:param data: the data to deserialize, or a file containing that data
:return: the deserialized object
"""

if isinstance(data, str):
pass
elif isinstance(data, bytes):
data = data.decode()
else:
return load(data.read())

value, data = deserialize_next(data)
if data:
raise DeserializationException('left-over data remained after deserialization')

return value


internals.py:



import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
"""
serializes an object to a string of the form "CLASS,SIZE,VALUE", where

- CLASS is the fully qualified name of the object's class
- SIZE the length of VALUE in characters
- VALUE is some sort of string representation of obj; the exact format depends on its class
"""

cls = type(obj)
try:
serializer = SERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll serialize its __dict__
try:
obj = vars(obj)
except TypeError:
raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

serializer = SERIALIZERS[dict]

value = serializer(obj)
return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
try:
deserializer = DESERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll instantiate a new
# object of the correct type and update its __dict__.
# None of this should execute any dangerous code.
dic = deserialize(dict, data)
obj = object.__new__(cls)
obj.__dict__.update(dic)
else:
obj = deserializer(data)

assert isinstance(obj, cls), (obj, cls)
return obj


def deserialize_next(data):
"""
Deserializes the first object in *data*, returning the deserialized object
and the remaining data that needs to be deserialized.
"""
# extract the class name
i = data.find(',')
if i == -1:
raise DeserializationException('missing type field')

clsname = data[:i]
cls = decode_type(clsname)

# extract the size in bytes
j = data.find(',', i+1)
if j == -1:
raise DeserializationException('missing size field')

try:
size = int(data[i+1:j])
except ValueError:
raise DeserializationException('invalid value in size field (not a number)')

# extract the value
end = j+1+size
value = data[j+1:end]
value = deserialize(cls, value)

return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
module = inspect.getmodule(cls).__name__
if module == 'builtins':
return cls.__qualname__

return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
names = qualname.split('.')

if len(names) < 2:
try:
return getattr(sys.modules['builtins'], qualname)
except AttributeError:
raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

# look up the module in sys.modules; this avoids having to execute any import code
obj = sys.modules[names[0]]
for name in names[1:]:
obj = getattr(obj, name)

if not isinstance(obj, type):
raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

return obj


def encode_list(lis):
chunks = [serialize(value) for value in lis]
return '[]'.format(', '.join(chunks))


def decode_list(data):
lis =

if not data.startswith('['):
raise DeserializationException('list should start with "[" character')
data = data[1:]

while True:
value, data = deserialize_next(data)
lis.append(value)

if not data.startswith(', '):
if data == ']':
break
raise DeserializationException('missing comma between list values')
data = data[2:]

return lis


def encode_dict(dic):
chunks =

for key, value in dic.items():
line = ': '.format(serialize(key), serialize(value))
chunks.append(line)

return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
dic = dict_type()

if not data.startswith('{'):
raise DeserializationException('dict should start with "" character')
data = data[1:]

while True:
key, data = deserialize_next(data)

if not data.startswith(': '):
raise DeserializationException('missing colon between dict key and value')
data = data[2:]

value, data = deserialize_next(data)
dic[key] = value

if not data.startswith(', '):
if data == '':
break
raise DeserializationException('missing comma between dict values')
data = data[2:]

return dic


SERIALIZERS = int: str,
float: str,
complex: str,
str: repr,
bytes: repr,
type: encode_type,
list: encode_list,
dict: encode_dict,
collections.OrderedDict: encode_dict,


DESERIALIZERS = int: int,
float: float,
complex: complex,
str: ast.literal_eval,
bytes: ast.literal_eval,
type: decode_type,
list: decode_list,
dict: decode_dict,
collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),



And finally exceptions.py:



class SerializationException(Exception):
"""Raised when data serialization fails"""


class DeserializationException(Exception):
"""Raised when data deserialization fails"""






share|improve this question

















  • 2




    The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
    – Gareth Rees
    Feb 9 at 13:22






  • 1




    @GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
    – Aran-Fey
    Feb 9 at 14:24






  • 4




    @GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
    – Peilonrayz
    Feb 9 at 15:41










  • Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
    – Gareth Rees
    Feb 13 at 10:27










  • @GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
    – Aran-Fey
    Feb 13 at 12:26

















up vote
4
down vote

favorite












My program is intended to allow (de-)serialization of arbitrary Python objects. It's basically a replacement for python's pickle module, which is similar but prone to arbitrary code execution. I think I've managed to make my implementation more versatile and more secure than pickle, but I'm not an expert in security.



I've come here to ask the question:



Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?



As far as I can tell, I've avoided executing any untrusted code:



  • Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

  • Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?




The code is split into 3 files.



I apologize for the length of the code, but there isn't much I can remove. That said, I think the "interesting" functions are serialize, deserialize, deserialize_next and decode_type. The class-specific (de-)serializers (like encode_dict and decode_dict) should be secure.



First, the public interface. These are the contents of public_api.py:



from .internals import *


def to_str(obj):
"""
Serializes an object to a string.

:param obj: the object to serialize
:return: the serialized object
"""
return serialize(obj)


def load(data):
"""
Deserializes an object.
The input can be a string, bytes, or file-like object.

:param data: the data to deserialize, or a file containing that data
:return: the deserialized object
"""

if isinstance(data, str):
pass
elif isinstance(data, bytes):
data = data.decode()
else:
return load(data.read())

value, data = deserialize_next(data)
if data:
raise DeserializationException('left-over data remained after deserialization')

return value


internals.py:



import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
"""
serializes an object to a string of the form "CLASS,SIZE,VALUE", where

- CLASS is the fully qualified name of the object's class
- SIZE the length of VALUE in characters
- VALUE is some sort of string representation of obj; the exact format depends on its class
"""

cls = type(obj)
try:
serializer = SERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll serialize its __dict__
try:
obj = vars(obj)
except TypeError:
raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

serializer = SERIALIZERS[dict]

value = serializer(obj)
return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
try:
deserializer = DESERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll instantiate a new
# object of the correct type and update its __dict__.
# None of this should execute any dangerous code.
dic = deserialize(dict, data)
obj = object.__new__(cls)
obj.__dict__.update(dic)
else:
obj = deserializer(data)

assert isinstance(obj, cls), (obj, cls)
return obj


def deserialize_next(data):
"""
Deserializes the first object in *data*, returning the deserialized object
and the remaining data that needs to be deserialized.
"""
# extract the class name
i = data.find(',')
if i == -1:
raise DeserializationException('missing type field')

clsname = data[:i]
cls = decode_type(clsname)

# extract the size in bytes
j = data.find(',', i+1)
if j == -1:
raise DeserializationException('missing size field')

try:
size = int(data[i+1:j])
except ValueError:
raise DeserializationException('invalid value in size field (not a number)')

# extract the value
end = j+1+size
value = data[j+1:end]
value = deserialize(cls, value)

return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
module = inspect.getmodule(cls).__name__
if module == 'builtins':
return cls.__qualname__

return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
names = qualname.split('.')

if len(names) < 2:
try:
return getattr(sys.modules['builtins'], qualname)
except AttributeError:
raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

# look up the module in sys.modules; this avoids having to execute any import code
obj = sys.modules[names[0]]
for name in names[1:]:
obj = getattr(obj, name)

if not isinstance(obj, type):
raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

return obj


def encode_list(lis):
chunks = [serialize(value) for value in lis]
return '[]'.format(', '.join(chunks))


def decode_list(data):
lis =

if not data.startswith('['):
raise DeserializationException('list should start with "[" character')
data = data[1:]

while True:
value, data = deserialize_next(data)
lis.append(value)

if not data.startswith(', '):
if data == ']':
break
raise DeserializationException('missing comma between list values')
data = data[2:]

return lis


def encode_dict(dic):
chunks =

for key, value in dic.items():
line = ': '.format(serialize(key), serialize(value))
chunks.append(line)

return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
dic = dict_type()

if not data.startswith('{'):
raise DeserializationException('dict should start with "" character')
data = data[1:]

while True:
key, data = deserialize_next(data)

if not data.startswith(': '):
raise DeserializationException('missing colon between dict key and value')
data = data[2:]

value, data = deserialize_next(data)
dic[key] = value

if not data.startswith(', '):
if data == '':
break
raise DeserializationException('missing comma between dict values')
data = data[2:]

return dic


SERIALIZERS = int: str,
float: str,
complex: str,
str: repr,
bytes: repr,
type: encode_type,
list: encode_list,
dict: encode_dict,
collections.OrderedDict: encode_dict,


DESERIALIZERS = int: int,
float: float,
complex: complex,
str: ast.literal_eval,
bytes: ast.literal_eval,
type: decode_type,
list: decode_list,
dict: decode_dict,
collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),



And finally exceptions.py:



class SerializationException(Exception):
"""Raised when data serialization fails"""


class DeserializationException(Exception):
"""Raised when data deserialization fails"""






share|improve this question

















  • 2




    The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
    – Gareth Rees
    Feb 9 at 13:22






  • 1




    @GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
    – Aran-Fey
    Feb 9 at 14:24






  • 4




    @GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
    – Peilonrayz
    Feb 9 at 15:41










  • Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
    – Gareth Rees
    Feb 13 at 10:27










  • @GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
    – Aran-Fey
    Feb 13 at 12:26













up vote
4
down vote

favorite









up vote
4
down vote

favorite











My program is intended to allow (de-)serialization of arbitrary Python objects. It's basically a replacement for python's pickle module, which is similar but prone to arbitrary code execution. I think I've managed to make my implementation more versatile and more secure than pickle, but I'm not an expert in security.



I've come here to ask the question:



Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?



As far as I can tell, I've avoided executing any untrusted code:



  • Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

  • Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?




The code is split into 3 files.



I apologize for the length of the code, but there isn't much I can remove. That said, I think the "interesting" functions are serialize, deserialize, deserialize_next and decode_type. The class-specific (de-)serializers (like encode_dict and decode_dict) should be secure.



First, the public interface. These are the contents of public_api.py:



from .internals import *


def to_str(obj):
"""
Serializes an object to a string.

:param obj: the object to serialize
:return: the serialized object
"""
return serialize(obj)


def load(data):
"""
Deserializes an object.
The input can be a string, bytes, or file-like object.

:param data: the data to deserialize, or a file containing that data
:return: the deserialized object
"""

if isinstance(data, str):
pass
elif isinstance(data, bytes):
data = data.decode()
else:
return load(data.read())

value, data = deserialize_next(data)
if data:
raise DeserializationException('left-over data remained after deserialization')

return value


internals.py:



import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
"""
serializes an object to a string of the form "CLASS,SIZE,VALUE", where

- CLASS is the fully qualified name of the object's class
- SIZE the length of VALUE in characters
- VALUE is some sort of string representation of obj; the exact format depends on its class
"""

cls = type(obj)
try:
serializer = SERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll serialize its __dict__
try:
obj = vars(obj)
except TypeError:
raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

serializer = SERIALIZERS[dict]

value = serializer(obj)
return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
try:
deserializer = DESERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll instantiate a new
# object of the correct type and update its __dict__.
# None of this should execute any dangerous code.
dic = deserialize(dict, data)
obj = object.__new__(cls)
obj.__dict__.update(dic)
else:
obj = deserializer(data)

assert isinstance(obj, cls), (obj, cls)
return obj


def deserialize_next(data):
"""
Deserializes the first object in *data*, returning the deserialized object
and the remaining data that needs to be deserialized.
"""
# extract the class name
i = data.find(',')
if i == -1:
raise DeserializationException('missing type field')

clsname = data[:i]
cls = decode_type(clsname)

# extract the size in bytes
j = data.find(',', i+1)
if j == -1:
raise DeserializationException('missing size field')

try:
size = int(data[i+1:j])
except ValueError:
raise DeserializationException('invalid value in size field (not a number)')

# extract the value
end = j+1+size
value = data[j+1:end]
value = deserialize(cls, value)

return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
module = inspect.getmodule(cls).__name__
if module == 'builtins':
return cls.__qualname__

return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
names = qualname.split('.')

if len(names) < 2:
try:
return getattr(sys.modules['builtins'], qualname)
except AttributeError:
raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

# look up the module in sys.modules; this avoids having to execute any import code
obj = sys.modules[names[0]]
for name in names[1:]:
obj = getattr(obj, name)

if not isinstance(obj, type):
raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

return obj


def encode_list(lis):
chunks = [serialize(value) for value in lis]
return '[]'.format(', '.join(chunks))


def decode_list(data):
lis =

if not data.startswith('['):
raise DeserializationException('list should start with "[" character')
data = data[1:]

while True:
value, data = deserialize_next(data)
lis.append(value)

if not data.startswith(', '):
if data == ']':
break
raise DeserializationException('missing comma between list values')
data = data[2:]

return lis


def encode_dict(dic):
chunks =

for key, value in dic.items():
line = ': '.format(serialize(key), serialize(value))
chunks.append(line)

return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
dic = dict_type()

if not data.startswith('{'):
raise DeserializationException('dict should start with "" character')
data = data[1:]

while True:
key, data = deserialize_next(data)

if not data.startswith(': '):
raise DeserializationException('missing colon between dict key and value')
data = data[2:]

value, data = deserialize_next(data)
dic[key] = value

if not data.startswith(', '):
if data == '':
break
raise DeserializationException('missing comma between dict values')
data = data[2:]

return dic


SERIALIZERS = int: str,
float: str,
complex: str,
str: repr,
bytes: repr,
type: encode_type,
list: encode_list,
dict: encode_dict,
collections.OrderedDict: encode_dict,


DESERIALIZERS = int: int,
float: float,
complex: complex,
str: ast.literal_eval,
bytes: ast.literal_eval,
type: decode_type,
list: decode_list,
dict: decode_dict,
collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),



And finally exceptions.py:



class SerializationException(Exception):
"""Raised when data serialization fails"""


class DeserializationException(Exception):
"""Raised when data deserialization fails"""






share|improve this question













My program is intended to allow (de-)serialization of arbitrary Python objects. It's basically a replacement for python's pickle module, which is similar but prone to arbitrary code execution. I think I've managed to make my implementation more versatile and more secure than pickle, but I'm not an expert in security.



I've come here to ask the question:



Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?



As far as I can tell, I've avoided executing any untrusted code:



  • Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

  • Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?




The code is split into 3 files.



I apologize for the length of the code, but there isn't much I can remove. That said, I think the "interesting" functions are serialize, deserialize, deserialize_next and decode_type. The class-specific (de-)serializers (like encode_dict and decode_dict) should be secure.



First, the public interface. These are the contents of public_api.py:



from .internals import *


def to_str(obj):
"""
Serializes an object to a string.

:param obj: the object to serialize
:return: the serialized object
"""
return serialize(obj)


def load(data):
"""
Deserializes an object.
The input can be a string, bytes, or file-like object.

:param data: the data to deserialize, or a file containing that data
:return: the deserialized object
"""

if isinstance(data, str):
pass
elif isinstance(data, bytes):
data = data.decode()
else:
return load(data.read())

value, data = deserialize_next(data)
if data:
raise DeserializationException('left-over data remained after deserialization')

return value


internals.py:



import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
"""
serializes an object to a string of the form "CLASS,SIZE,VALUE", where

- CLASS is the fully qualified name of the object's class
- SIZE the length of VALUE in characters
- VALUE is some sort of string representation of obj; the exact format depends on its class
"""

cls = type(obj)
try:
serializer = SERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll serialize its __dict__
try:
obj = vars(obj)
except TypeError:
raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

serializer = SERIALIZERS[dict]

value = serializer(obj)
return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
try:
deserializer = DESERIALIZERS[cls]
except KeyError:
# if this object isn't of a basic type, we'll instantiate a new
# object of the correct type and update its __dict__.
# None of this should execute any dangerous code.
dic = deserialize(dict, data)
obj = object.__new__(cls)
obj.__dict__.update(dic)
else:
obj = deserializer(data)

assert isinstance(obj, cls), (obj, cls)
return obj


def deserialize_next(data):
"""
Deserializes the first object in *data*, returning the deserialized object
and the remaining data that needs to be deserialized.
"""
# extract the class name
i = data.find(',')
if i == -1:
raise DeserializationException('missing type field')

clsname = data[:i]
cls = decode_type(clsname)

# extract the size in bytes
j = data.find(',', i+1)
if j == -1:
raise DeserializationException('missing size field')

try:
size = int(data[i+1:j])
except ValueError:
raise DeserializationException('invalid value in size field (not a number)')

# extract the value
end = j+1+size
value = data[j+1:end]
value = deserialize(cls, value)

return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
module = inspect.getmodule(cls).__name__
if module == 'builtins':
return cls.__qualname__

return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
names = qualname.split('.')

if len(names) < 2:
try:
return getattr(sys.modules['builtins'], qualname)
except AttributeError:
raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

# look up the module in sys.modules; this avoids having to execute any import code
obj = sys.modules[names[0]]
for name in names[1:]:
obj = getattr(obj, name)

if not isinstance(obj, type):
raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

return obj


def encode_list(lis):
chunks = [serialize(value) for value in lis]
return '[]'.format(', '.join(chunks))


def decode_list(data):
lis =

if not data.startswith('['):
raise DeserializationException('list should start with "[" character')
data = data[1:]

while True:
value, data = deserialize_next(data)
lis.append(value)

if not data.startswith(', '):
if data == ']':
break
raise DeserializationException('missing comma between list values')
data = data[2:]

return lis


def encode_dict(dic):
chunks =

for key, value in dic.items():
line = ': '.format(serialize(key), serialize(value))
chunks.append(line)

return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
dic = dict_type()

if not data.startswith('{'):
raise DeserializationException('dict should start with "" character')
data = data[1:]

while True:
key, data = deserialize_next(data)

if not data.startswith(': '):
raise DeserializationException('missing colon between dict key and value')
data = data[2:]

value, data = deserialize_next(data)
dic[key] = value

if not data.startswith(', '):
if data == '':
break
raise DeserializationException('missing comma between dict values')
data = data[2:]

return dic


SERIALIZERS = int: str,
float: str,
complex: str,
str: repr,
bytes: repr,
type: encode_type,
list: encode_list,
dict: encode_dict,
collections.OrderedDict: encode_dict,


DESERIALIZERS = int: int,
float: float,
complex: complex,
str: ast.literal_eval,
bytes: ast.literal_eval,
type: decode_type,
list: decode_list,
dict: decode_dict,
collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),



And finally exceptions.py:



class SerializationException(Exception):
"""Raised when data serialization fails"""


class DeserializationException(Exception):
"""Raised when data deserialization fails"""








share|improve this question












share|improve this question




share|improve this question








edited Feb 9 at 13:17









Jamal♦

30.1k11114225




30.1k11114225









asked Feb 9 at 11:18









Aran-Fey

1235




1235







  • 2




    The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
    – Gareth Rees
    Feb 9 at 13:22






  • 1




    @GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
    – Aran-Fey
    Feb 9 at 14:24






  • 4




    @GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
    – Peilonrayz
    Feb 9 at 15:41










  • Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
    – Gareth Rees
    Feb 13 at 10:27










  • @GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
    – Aran-Fey
    Feb 13 at 12:26













  • 2




    The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
    – Gareth Rees
    Feb 9 at 13:22






  • 1




    @GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
    – Aran-Fey
    Feb 9 at 14:24






  • 4




    @GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
    – Peilonrayz
    Feb 9 at 15:41










  • Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
    – Gareth Rees
    Feb 13 at 10:27










  • @GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
    – Aran-Fey
    Feb 13 at 12:26








2




2




The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
– Gareth Rees
Feb 9 at 13:22




The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
– Gareth Rees
Feb 9 at 13:22




1




1




@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
– Aran-Fey
Feb 9 at 14:24




@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
– Aran-Fey
Feb 9 at 14:24




4




4




@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
– Peilonrayz
Feb 9 at 15:41




@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
– Peilonrayz
Feb 9 at 15:41












Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
– Gareth Rees
Feb 13 at 10:27




Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
– Gareth Rees
Feb 13 at 10:27












@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
– Aran-Fey
Feb 13 at 12:26





@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
– Aran-Fey
Feb 13 at 12:26
















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f187176%2fde-serializer-for-arbitrary-python-objects%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f187176%2fde-serializer-for-arbitrary-python-objects%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation