(de-)serializer for arbitrary python objects

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;

up vote
4
down vote

favorite

My program is intended to allow (de-)serialization of arbitrary Python objects. It's basically a replacement for python's pickle module, which is similar but prone to arbitrary code execution. I think I've managed to make my implementation more versatile and more secure than pickle, but I'm not an expert in security.

I've come here to ask the question:

Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?

As far as I can tell, I've avoided executing any untrusted code:

Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?

The code is split into 3 files.

I apologize for the length of the code, but there isn't much I can remove. That said, I think the "interesting" functions are serialize, deserialize, deserialize_next and decode_type. The class-specific (de-)serializers (like encode_dict and decode_dict) should be secure.

First, the public interface. These are the contents of public_api.py:

from .internals import *


def to_str(obj):
    """
    Convert an object into its serialized string form.

    This is a thin public wrapper that delegates to ``serialize``.

    :param obj: the object to serialize
    :return: the string returned by ``serialize(obj)``
    """
    serialized = serialize(obj)
    return serialized


def load(data):
    """
    Reconstruct an object from its serialized form.

    *data* may be a ``str``, a ``bytes`` object (decoded via
    ``bytes.decode()`` with its default arguments), or any other object
    exposing a ``read()`` method, whose result is loaded in turn.

    :param data: the data to deserialize, or a file containing that data
    :return: the deserialized object
    :raises DeserializationException: if input is left over after the
        first object has been deserialized
    """
    if isinstance(data, bytes):
        data = data.decode()
    elif not isinstance(data, str):
        # neither str nor bytes: treat it as a file-like object
        return load(data.read())

    value, leftover = deserialize_next(data)
    if leftover:
        raise DeserializationException('left-over data remained after deserialization')

    return value

internals.py:

import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
    """
    Serialize an object to a string of the form "CLASS,SIZE,VALUE", where

    - CLASS is the fully qualified name of the object's class
    - SIZE is the length of VALUE in characters
    - VALUE is a string representation of obj; the exact format depends
      on its class

    Objects whose exact type has no entry in ``SERIALIZERS`` are
    serialized through their ``__dict__`` using the dict serializer.

    :param obj: the object to serialize
    :return: the serialized record
    :raises SerializationException: if the type has no registered
        serializer and the object has no ``__dict__``
    """
    cls = type(obj)
    try:
        serializer = SERIALIZERS[cls]
    except KeyError:
        # if this object isn't of a basic type, we'll serialize its __dict__
        try:
            obj = vars(obj)
        except TypeError:
            raise SerializationException(
                'Cannot serialize object without a __dict__: {}'.format(obj))

        serializer = SERIALIZERS[dict]

    value = serializer(obj)
    # CLASS is always the original type, even when VALUE is the __dict__
    return '{},{},{}'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
    """
    Build an instance of *cls* from the VALUE part of a serialized record.

    Types with an entry in ``DESERIALIZERS`` are decoded by that entry.
    Any other class is instantiated with ``object.__new__`` (so its
    ``__init__`` is not called) and its ``__dict__`` is filled from the
    dict decoded out of *data*.

    :param cls: the class the value must be an instance of
    :param data: the VALUE string to decode
    :return: the deserialized object
    :raises DeserializationException: if the decoded value is not an
        instance of *cls*
    """
    try:
        deserializer = DESERIALIZERS[cls]
    except KeyError:
        # if this object isn't of a basic type, we'll instantiate a new
        # object of the correct type and update its __dict__.
        dic = deserialize(dict, data)
        obj = object.__new__(cls)
        obj.__dict__.update(dic)
    else:
        obj = deserializer(data)

    # This is input validation, not an internal sanity check: e.g. the
    # record "str,1,5" makes ast.literal_eval return an int. An ``assert``
    # would be stripped under ``python -O``, so raise explicitly instead.
    if not isinstance(obj, cls):
        raise DeserializationException(
            'deserialized value {!r} is not an instance of {!r}'.format(obj, cls))
    return obj


def deserialize_next(data):
    """
    Deserializes the first object in *data*, returning the deserialized object
    and the remaining data that needs to be deserialized.

    :param data: a string starting with a "CLASS,SIZE,VALUE" record
    :return: a ``(value, remaining_data)`` tuple
    :raises DeserializationException: if the type or size field is missing,
        or the size is not a number or does not fit the available data
    """
    # extract the class name
    i = data.find(',')
    if i == -1:
        raise DeserializationException('missing type field')

    clsname = data[:i]
    cls = decode_type(clsname)

    # extract the size in characters
    j = data.find(',', i + 1)
    if j == -1:
        raise DeserializationException('missing size field')

    try:
        size = int(data[i + 1:j])
    except ValueError:
        raise DeserializationException('invalid value in size field (not a number)')

    # extract the value
    end = j + 1 + size
    # A negative size would move ``end`` backwards, and an oversized one
    # would silently truncate the value; reject both instead of mis-slicing.
    if size < 0 or end > len(data):
        raise DeserializationException('invalid value in size field (out of range)')

    value = deserialize(cls, data[j + 1:end])

    return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
    """
    Return the name under which *cls* is written to the serialized data.

    Classes whose module is ``builtins`` are encoded as their bare
    ``__qualname__``; all other classes as ``MODULE.QUALNAME``.

    :param cls: the class to encode
    :return: the encoded class name
    """
    module = inspect.getmodule(cls).__name__
    if module == 'builtins':
        return cls.__qualname__

    return '{}.{}'.format(module, cls.__qualname__)


def decode_type(qualname):
    """
    Resolve an encoded class name back to the class object.

    A name without a dot is looked up in ``builtins``. Otherwise the part
    before the first dot is looked up in ``sys.modules`` (nothing is
    imported) and the remaining parts are resolved with ``getattr``.

    NOTE(review): the ``getattr`` chain lets the input name any class
    reachable through attributes of an already-imported module, not only
    classes the application meant to expose. Consider an explicit
    allow-list before using this on untrusted data.

    :param qualname: the encoded class name, e.g. ``"int"`` or
        ``"collections.OrderedDict"``
    :return: the class the name resolves to
    :raises DeserializationException: if the name cannot be resolved or
        resolves to something that is not a class
    """
    names = qualname.split('.')

    if len(names) < 2:
        try:
            obj = getattr(sys.modules['builtins'], qualname)
        except AttributeError:
            raise DeserializationException(
                'Missing module name in __qualname__: {}'.format(qualname))
    else:
        # look up the module in sys.modules; this avoids having to execute
        # any import code
        try:
            obj = sys.modules[names[0]]
            for name in names[1:]:
                obj = getattr(obj, name)
        except (KeyError, AttributeError):
            # report unknown modules/attributes as a deserialization error
            # instead of leaking a bare KeyError/AttributeError
            raise DeserializationException('Unknown type: {}'.format(qualname))

    # applies to builtins as well, so names such as "print" are rejected
    if not isinstance(obj, type):
        raise DeserializationException(
            "Unknown type: {} (name resolved to {})".format(qualname, obj))

    return obj


def encode_list(lis):
    """
    Encode a list as ``[ITEM, ITEM, ...]``, where every ITEM is the
    record returned by ``serialize`` for that element.

    :param lis: the list to encode
    :return: the encoded list
    """
    chunks = [serialize(value) for value in lis]
    return '[{}]'.format(', '.join(chunks))


def decode_list(data):
    """
    Decode a list written as ``[ITEM, ITEM, ...]``.

    :param data: the encoded list, including the surrounding brackets
    :return: the decoded list
    :raises DeserializationException: if the opening bracket or a
        separator between items is missing
    """
    lis = []

    if not data.startswith('['):
        raise DeserializationException('list should start with "[" character')
    data = data[1:]

    # an empty list is encoded as "[]"; there is no item to read
    if data == ']':
        return lis

    while True:
        value, data = deserialize_next(data)
        lis.append(value)

        if not data.startswith(', '):
            if data == ']':
                break
            raise DeserializationException('missing comma between list values')
        data = data[2:]

    return lis


def encode_dict(dic):
    """
    Encode a mapping as ``{KEY: VALUE, KEY: VALUE, ...}``, where every KEY
    and VALUE is the record returned by ``serialize`` for that object.

    :param dic: the mapping to encode
    :return: the encoded mapping
    """
    chunks = [
        '{}: {}'.format(serialize(key), serialize(value))
        for key, value in dic.items()
    ]

    # doubled braces are literal "{" and "}" in a format string
    return '{{{}}}'.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
    """
    Decode a mapping written as ``{KEY: VALUE, KEY: VALUE, ...}``.

    :param data: the encoded mapping, including the surrounding braces
    :param dict_type: the mapping class to instantiate and fill
    :return: a *dict_type* instance holding the decoded items
    :raises DeserializationException: if the opening brace, a colon
        between key and value, or a separator between items is missing
    """
    dic = dict_type()

    if not data.startswith('{'):
        raise DeserializationException('dict should start with "{" character')
    data = data[1:]

    # an empty mapping is encoded as "{}"; there is no item to read
    if data == '}':
        return dic

    while True:
        key, data = deserialize_next(data)

        if not data.startswith(': '):
            raise DeserializationException('missing colon between dict key and value')
        data = data[2:]

        value, data = deserialize_next(data)
        dic[key] = value

        if not data.startswith(', '):
            if data == '}':
                break
            raise DeserializationException('missing comma between dict values')
        data = data[2:]

    return dic


# Maps every directly supported type to the callable that produces the
# VALUE part of its serialized record. Lookup is by exact type, so
# subclasses of these types are not matched.
SERIALIZERS = {
    int: str,
    float: str,
    complex: str,
    str: repr,
    bytes: repr,
    type: encode_type,
    list: encode_list,
    dict: encode_dict,
    collections.OrderedDict: encode_dict,
}

# Maps every directly supported type to the callable that rebuilds an
# instance from the VALUE part of a serialized record.
# str and bytes values are written with repr() and read back with
# ast.literal_eval, which evaluates Python literals only.
DESERIALIZERS = {
    int: int,
    float: float,
    complex: complex,
    str: ast.literal_eval,
    bytes: ast.literal_eval,
    type: decode_type,
    list: decode_list,
    dict: decode_dict,
    collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),
}

And finally exceptions.py:

class SerializationException(Exception):
    """Signals that an object could not be serialized."""


class DeserializationException(Exception):
    """Signals that serialized data could not be deserialized."""

edited Feb 9 at 13:17

Jamal♦

30.1k11114225

asked Feb 9 at 11:18

Aran-Fey

1235

2

The code in the post does not work! Try a = []; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
– Gareth Rees
Feb 9 at 13:22

1

@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = []; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
– Aran-Fey
Feb 9 at 14:24

4

@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
– Peilonrayz
Feb 9 at 15:41

Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
– Gareth Rees
Feb 13 at 10:27

@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
– Aran-Fey
Feb 13 at 12:26

add a comment |

up vote
4
down vote

favorite

I've come here to ask the question:

Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?

As far as I can tell, I've avoided executing any untrusted code:

Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?

The code is split into 3 files.

First, the public interface. These are the contents of public_api.py:

from .internals import *


def to_str(obj):
 """
 Serializes an object to a string.

 :param obj: the object to serialize
 :return: the serialized object
 """
 return serialize(obj)


def load(data):
 """
 Deserializes an object.
 The input can be a string, bytes, or file-like object.

 :param data: the data to deserialize, or a file containing that data
 :return: the deserialized object
 """

 if isinstance(data, str):
 pass
 elif isinstance(data, bytes):
 data = data.decode()
 else:
 return load(data.read())

 value, data = deserialize_next(data)
 if data:
 raise DeserializationException('left-over data remained after deserialization')

 return value

internals.py:

import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
 """
 serializes an object to a string of the form "CLASS,SIZE,VALUE", where

 - CLASS is the fully qualified name of the object's class
 - SIZE the length of VALUE in characters
 - VALUE is some sort of string representation of obj; the exact format depends on its class
 """

 cls = type(obj)
 try:
 serializer = SERIALIZERS[cls]
 except KeyError:
 # if this object isn't of a basic type, we'll serialize its __dict__
 try:
 obj = vars(obj)
 except TypeError:
 raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

 serializer = SERIALIZERS[dict]

 value = serializer(obj)
 return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
 try:
 deserializer = DESERIALIZERS[cls]
 except KeyError:
 # if this object isn't of a basic type, we'll instantiate a new
 # object of the correct type and update its __dict__.
 # None of this should execute any dangerous code.
 dic = deserialize(dict, data)
 obj = object.__new__(cls)
 obj.__dict__.update(dic)
 else:
 obj = deserializer(data)

 assert isinstance(obj, cls), (obj, cls)
 return obj


def deserialize_next(data):
 """
 Deserializes the first object in *data*, returning the deserialized object
 and the remaining data that needs to be deserialized.
 """
 # extract the class name
 i = data.find(',')
 if i == -1:
 raise DeserializationException('missing type field')

 clsname = data[:i]
 cls = decode_type(clsname)

 # extract the size in bytes
 j = data.find(',', i+1)
 if j == -1:
 raise DeserializationException('missing size field')

 try:
 size = int(data[i+1:j])
 except ValueError:
 raise DeserializationException('invalid value in size field (not a number)')

 # extract the value
 end = j+1+size
 value = data[j+1:end]
 value = deserialize(cls, value)

 return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
 module = inspect.getmodule(cls).__name__
 if module == 'builtins':
 return cls.__qualname__

 return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
 names = qualname.split('.')

 if len(names) < 2:
 try:
 return getattr(sys.modules['builtins'], qualname)
 except AttributeError:
 raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

 # look up the module in sys.modules; this avoids having to execute any import code
 obj = sys.modules[names[0]]
 for name in names[1:]:
 obj = getattr(obj, name)

 if not isinstance(obj, type):
 raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

 return obj


def encode_list(lis):
 chunks = [serialize(value) for value in lis]
 return '[]'.format(', '.join(chunks))


def decode_list(data):
 lis = 

 if not data.startswith('['):
 raise DeserializationException('list should start with "[" character')
 data = data[1:]

 while True:
 value, data = deserialize_next(data)
 lis.append(value)

 if not data.startswith(', '):
 if data == ']':
 break
 raise DeserializationException('missing comma between list values')
 data = data[2:]

 return lis


def encode_dict(dic):
 chunks = 

 for key, value in dic.items():
 line = ': '.format(serialize(key), serialize(value))
 chunks.append(line)

 return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
 dic = dict_type()

 if not data.startswith('{'):
 raise DeserializationException('dict should start with "" character')
 data = data[1:]

 while True:
 key, data = deserialize_next(data)

 if not data.startswith(': '):
 raise DeserializationException('missing colon between dict key and value')
 data = data[2:]

 value, data = deserialize_next(data)
 dic[key] = value

 if not data.startswith(', '):
 if data == '':
 break
 raise DeserializationException('missing comma between dict values')
 data = data[2:]

 return dic


SERIALIZERS = int: str,
 float: str,
 complex: str,
 str: repr,
 bytes: repr,
 type: encode_type,
 list: encode_list,
 dict: encode_dict,
 collections.OrderedDict: encode_dict,
 

DESERIALIZERS = int: int,
 float: float,
 complex: complex,
 str: ast.literal_eval,
 bytes: ast.literal_eval,
 type: decode_type,
 list: decode_list,
 dict: decode_dict,
 collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),

And finally exceptions.py:

class SerializationException(Exception):
 """Raised when data serialization fails"""


class DeserializationException(Exception):
 """Raised when data deserialization fails"""

edited Feb 9 at 13:17

Jamalâ™¦

30.1k11114225

asked Feb 9 at 11:18

Aran-Fey

1235

2

The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
â€“Â Gareth Rees
Feb 9 at 13:22

1

@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
â€“Â Aran-Fey
Feb 9 at 14:24

4

@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
â€“Â Peilonrayz
Feb 9 at 15:41

Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
â€“Â Gareth Rees
Feb 13 at 10:27

@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
â€“Â Aran-Fey
Feb 13 at 12:26

add a commentÂ |Â

up vote
4
down vote

favorite

I've come here to ask the question:

Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?

As far as I can tell, I've avoided executing any untrusted code:

Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?

The code is split into 3 files.

First, the public interface. These are the contents of public_api.py:

from .internals import *


def to_str(obj):
 """
 Serializes an object to a string.

 :param obj: the object to serialize
 :return: the serialized object
 """
 return serialize(obj)


def load(data):
 """
 Deserializes an object.
 The input can be a string, bytes, or file-like object.

 :param data: the data to deserialize, or a file containing that data
 :return: the deserialized object
 """

 if isinstance(data, str):
 pass
 elif isinstance(data, bytes):
 data = data.decode()
 else:
 return load(data.read())

 value, data = deserialize_next(data)
 if data:
 raise DeserializationException('left-over data remained after deserialization')

 return value

internals.py:

import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
 """
 serializes an object to a string of the form "CLASS,SIZE,VALUE", where

 - CLASS is the fully qualified name of the object's class
 - SIZE the length of VALUE in characters
 - VALUE is some sort of string representation of obj; the exact format depends on its class
 """

 cls = type(obj)
 try:
 serializer = SERIALIZERS[cls]
 except KeyError:
 # if this object isn't of a basic type, we'll serialize its __dict__
 try:
 obj = vars(obj)
 except TypeError:
 raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

 serializer = SERIALIZERS[dict]

 value = serializer(obj)
 return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
 try:
 deserializer = DESERIALIZERS[cls]
 except KeyError:
 # if this object isn't of a basic type, we'll instantiate a new
 # object of the correct type and update its __dict__.
 # None of this should execute any dangerous code.
 dic = deserialize(dict, data)
 obj = object.__new__(cls)
 obj.__dict__.update(dic)
 else:
 obj = deserializer(data)

 assert isinstance(obj, cls), (obj, cls)
 return obj


def deserialize_next(data):
 """
 Deserializes the first object in *data*, returning the deserialized object
 and the remaining data that needs to be deserialized.
 """
 # extract the class name
 i = data.find(',')
 if i == -1:
 raise DeserializationException('missing type field')

 clsname = data[:i]
 cls = decode_type(clsname)

 # extract the size in bytes
 j = data.find(',', i+1)
 if j == -1:
 raise DeserializationException('missing size field')

 try:
 size = int(data[i+1:j])
 except ValueError:
 raise DeserializationException('invalid value in size field (not a number)')

 # extract the value
 end = j+1+size
 value = data[j+1:end]
 value = deserialize(cls, value)

 return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
 module = inspect.getmodule(cls).__name__
 if module == 'builtins':
 return cls.__qualname__

 return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
 names = qualname.split('.')

 if len(names) < 2:
 try:
 return getattr(sys.modules['builtins'], qualname)
 except AttributeError:
 raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

 # look up the module in sys.modules; this avoids having to execute any import code
 obj = sys.modules[names[0]]
 for name in names[1:]:
 obj = getattr(obj, name)

 if not isinstance(obj, type):
 raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

 return obj


def encode_list(lis):
 chunks = [serialize(value) for value in lis]
 return '[]'.format(', '.join(chunks))


def decode_list(data):
 lis = 

 if not data.startswith('['):
 raise DeserializationException('list should start with "[" character')
 data = data[1:]

 while True:
 value, data = deserialize_next(data)
 lis.append(value)

 if not data.startswith(', '):
 if data == ']':
 break
 raise DeserializationException('missing comma between list values')
 data = data[2:]

 return lis


def encode_dict(dic):
 chunks = 

 for key, value in dic.items():
 line = ': '.format(serialize(key), serialize(value))
 chunks.append(line)

 return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
 dic = dict_type()

 if not data.startswith('{'):
 raise DeserializationException('dict should start with "" character')
 data = data[1:]

 while True:
 key, data = deserialize_next(data)

 if not data.startswith(': '):
 raise DeserializationException('missing colon between dict key and value')
 data = data[2:]

 value, data = deserialize_next(data)
 dic[key] = value

 if not data.startswith(', '):
 if data == '':
 break
 raise DeserializationException('missing comma between dict values')
 data = data[2:]

 return dic


SERIALIZERS = int: str,
 float: str,
 complex: str,
 str: repr,
 bytes: repr,
 type: encode_type,
 list: encode_list,
 dict: encode_dict,
 collections.OrderedDict: encode_dict,
 

DESERIALIZERS = int: int,
 float: float,
 complex: complex,
 str: ast.literal_eval,
 bytes: ast.literal_eval,
 type: decode_type,
 list: decode_list,
 dict: decode_dict,
 collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),

And finally exceptions.py:

class SerializationException(Exception):
 """Raised when data serialization fails"""


class DeserializationException(Exception):
 """Raised when data deserialization fails"""

edited Feb 9 at 13:17

Jamalâ™¦

30.1k11114225

asked Feb 9 at 11:18

Aran-Fey

1235

I've come here to ask the question:

Is my (de-)serialization algorithm secure? Is there any risk of arbitrary code execution, or anything else that could be dangerous?

As far as I can tell, I've avoided executing any untrusted code:

Classes are serialized as a fully qualified name like module.Class. They are deserialized without importing any modules; the module is looked up through sys.modules[module_name].

Fancy objects have their __dict__ serialized. Upon deserialization, a new instance is created by calling obj = object.__new__(cls) and obj.__dict__.update(serialized_dict).

Is there anything I've overlooked?

The code is split into 3 files.

First, the public interface. These are the contents of public_api.py:

from .internals import *


def to_str(obj):
 """
 Serializes an object to a string.

 :param obj: the object to serialize
 :return: the serialized object
 """
 return serialize(obj)


def load(data):
 """
 Deserializes an object.
 The input can be a string, bytes, or file-like object.

 :param data: the data to deserialize, or a file containing that data
 :return: the deserialized object
 """

 if isinstance(data, str):
 pass
 elif isinstance(data, bytes):
 data = data.decode()
 else:
 return load(data.read())

 value, data = deserialize_next(data)
 if data:
 raise DeserializationException('left-over data remained after deserialization')

 return value

internals.py:

import sys
import ast
import inspect
import collections

from .exceptions import *


def serialize(obj):
 """
 serializes an object to a string of the form "CLASS,SIZE,VALUE", where

 - CLASS is the fully qualified name of the object's class
 - SIZE the length of VALUE in characters
 - VALUE is some sort of string representation of obj; the exact format depends on its class
 """

 cls = type(obj)
 try:
 serializer = SERIALIZERS[cls]
 except KeyError:
 # if this object isn't of a basic type, we'll serialize its __dict__
 try:
 obj = vars(obj)
 except TypeError:
 raise SerializationException('Cannot serialize object without a __dict__: '.format(obj))

 serializer = SERIALIZERS[dict]

 value = serializer(obj)
 return ',,'.format(encode_type(cls), len(value), value)


def deserialize(cls, data):
 try:
 deserializer = DESERIALIZERS[cls]
 except KeyError:
 # if this object isn't of a basic type, we'll instantiate a new
 # object of the correct type and update its __dict__.
 # None of this should execute any dangerous code.
 dic = deserialize(dict, data)
 obj = object.__new__(cls)
 obj.__dict__.update(dic)
 else:
 obj = deserializer(data)

 assert isinstance(obj, cls), (obj, cls)
 return obj


def deserialize_next(data):
 """
 Deserializes the first object in *data*, returning the deserialized object
 and the remaining data that needs to be deserialized.
 """
 # extract the class name
 i = data.find(',')
 if i == -1:
 raise DeserializationException('missing type field')

 clsname = data[:i]
 cls = decode_type(clsname)

 # extract the size in bytes
 j = data.find(',', i+1)
 if j == -1:
 raise DeserializationException('missing size field')

 try:
 size = int(data[i+1:j])
 except ValueError:
 raise DeserializationException('invalid value in size field (not a number)')

 # extract the value
 end = j+1+size
 value = data[j+1:end]
 value = deserialize(cls, value)

 return value, data[end:]


# class-specific (de-)serializers below
# =====================================

def encode_type(cls):
 module = inspect.getmodule(cls).__name__
 if module == 'builtins':
 return cls.__qualname__

 return '.'.format(module, cls.__qualname__)


def decode_type(qualname):
 names = qualname.split('.')

 if len(names) < 2:
 try:
 return getattr(sys.modules['builtins'], qualname)
 except AttributeError:
 raise DeserializationException('Missing module name in __qualname__: '.format(qualname))

 # look up the module in sys.modules; this avoids having to execute any import code
 obj = sys.modules[names[0]]
 for name in names[1:]:
 obj = getattr(obj, name)

 if not isinstance(obj, type):
 raise DeserializationException("Unknown type: (name resolved to )".format(qualname, obj))

 return obj


def encode_list(lis):
 chunks = [serialize(value) for value in lis]
 return '[]'.format(', '.join(chunks))


def decode_list(data):
 lis = 

 if not data.startswith('['):
 raise DeserializationException('list should start with "[" character')
 data = data[1:]

 while True:
 value, data = deserialize_next(data)
 lis.append(value)

 if not data.startswith(', '):
 if data == ']':
 break
 raise DeserializationException('missing comma between list values')
 data = data[2:]

 return lis


def encode_dict(dic):
 chunks = 

 for key, value in dic.items():
 line = ': '.format(serialize(key), serialize(value))
 chunks.append(line)

 return ''.format(', '.join(chunks))


def decode_dict(data, dict_type=dict):
 dic = dict_type()

 if not data.startswith('{'):
 raise DeserializationException('dict should start with "" character')
 data = data[1:]

 while True:
 key, data = deserialize_next(data)

 if not data.startswith(': '):
 raise DeserializationException('missing colon between dict key and value')
 data = data[2:]

 value, data = deserialize_next(data)
 dic[key] = value

 if not data.startswith(', '):
 if data == '':
 break
 raise DeserializationException('missing comma between dict values')
 data = data[2:]

 return dic


SERIALIZERS = int: str,
 float: str,
 complex: str,
 str: repr,
 bytes: repr,
 type: encode_type,
 list: encode_list,
 dict: encode_dict,
 collections.OrderedDict: encode_dict,
 

DESERIALIZERS = int: int,
 float: float,
 complex: complex,
 str: ast.literal_eval,
 bytes: ast.literal_eval,
 type: decode_type,
 list: decode_list,
 dict: decode_dict,
 collections.OrderedDict: lambda d: decode_dict(d, collections.OrderedDict),

And finally exceptions.py:

class SerializationException(Exception):
 """Raised when data serialization fails"""


class DeserializationException(Exception):
 """Raised when data deserialization fails"""

edited Feb 9 at 13:17

Jamalâ™¦

30.1k11114225

asked Feb 9 at 11:18

Aran-Fey

1235

edited Feb 9 at 13:17

Jamalâ™¦

30.1k11114225

edited Feb 9 at 13:17

Jamalâ™¦

30.1k11114225

edited Feb 9 at 13:17

Jamalâ™¦

30.1k11114225

asked Feb 9 at 11:18

Aran-Fey

1235

asked Feb 9 at 11:18

Aran-Fey

1235

asked Feb 9 at 11:18

Aran-Fey

1235

2

The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
â€“Â Gareth Rees
Feb 9 at 13:22

1

@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
â€“Â Aran-Fey
Feb 9 at 14:24

4

@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
â€“Â Peilonrayz
Feb 9 at 15:41

Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
â€“Â Gareth Rees
Feb 13 at 10:27

@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
â€“Â Aran-Fey
Feb 13 at 12:26

add a commentÂ |Â

2

The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
â€“Â Gareth Rees
Feb 9 at 13:22

1

@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
â€“Â Aran-Fey
Feb 9 at 14:24

4

@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
â€“Â Peilonrayz
Feb 9 at 15:41

Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
â€“Â Gareth Rees
Feb 13 at 10:27

@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
â€“Â Aran-Fey
Feb 13 at 12:26

The code in the post does not work! Try a = ; a.append(a); serialize(a). I get RecursionError: maximum recursion depth exceeded.
â€“Â Gareth Rees
Feb 9 at 13:22

@GarethRees You're right, it doesn't currently support self-references. Every object is serialized by value, so a self-reference causes endless recursion. It also means that after (de-)serializing something like a = ; b = [a, a], b will contain 2 different list instances instead of two references to the same list. Thanks for pointing that out, I'll fix that in the next version!
â€“Â Aran-Fey
Feb 9 at 14:24

@GarethRees I don't think a special case not working equates to "The code in the post does not work!" The OP's not mentioned it, and hasn't asked us to fix it. So it's reasonably working as the OP intends - so it's not broken @ closevoter.
â€“Â Peilonrayz
Feb 9 at 15:41

Handling of cyclic data structures is one of the fundamental requirements for a general-purpose serializer, so I disagree that it's a special case.
â€“Â Gareth Rees
Feb 13 at 10:27

@GarethRees You have a point, but the important thing is that all the functionality which can potentially have vulnerabilities already exists. I can easily make the code handle references without introducing new vulnerabilities. You can think of it as a proof of concept. The question is not whether it's a functional all-purpose serializer, but whether there's something wrong with the way I've approached this problem. That said, I've spent the last few days implementing references and in the process the script has grown to 1000(!) lines of code. I don't think anyone would want to review that.
â€“Â Aran-Fey
Feb 13 at 12:26

add a commentÂ |Â

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f187176%2fde-serializer-for-arbitrary-python-objects%23new-answer', 'question_page');

);

Post as a guest

Name

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr