Source code for beat.backend.python.dataformat
#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
###################################################################################
# #
# Copyright (c) 2019 Idiap Research Institute, http://www.idiap.ch/ #
# Contact: beat.support@idiap.ch #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are met: #
# #
# 1. Redistributions of source code must retain the above copyright notice, this #
# list of conditions and the following disclaimer. #
# #
# 2. Redistributions in binary form must reproduce the above copyright notice, #
# this list of conditions and the following disclaimer in the documentation #
# and/or other materials provided with the distribution. #
# #
# 3. Neither the name of the copyright holder nor the names of its contributors #
# may be used to endorse or promote products derived from this software without #
# specific prior written permission. #
# #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED #
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE #
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE #
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL #
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR #
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER #
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, #
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
###################################################################################
"""
==========
dataformat
==========
Validation and parsing for dataformats
"""
import copy
import re
import numpy
import simplejson as json
import six
from . import utils
from .baseformat import baseformat
# ----------------------------------------------------------
class Storage(utils.Storage):
    """Resolves paths for dataformats

    Parameters:

      prefix (str): Establishes the prefix of your installation.

      name (str): The name of the dataformat object in the format
        ``<user>/<name>/<version>``.

    Raises:

      RuntimeError: If ``name`` does not contain exactly two ``/`` separators.

    """

    asset_type = "dataformat"
    asset_folder = "dataformats"

    def __init__(self, prefix, name):

        # a valid name has exactly three "/"-separated components
        components = name.split("/")
        if len(components) != 3:
            raise RuntimeError("invalid dataformat name: `%s'" % name)

        self.username, self.name, self.version = components
        self.fullname = name
        self.prefix = prefix

        declaration = utils.hashed_or_simple(
            self.prefix, self.asset_folder, name, suffix=".json"
        )

        # the base class receives the path without its last five characters
        # (presumably the ".json" suffix requested above -- confirm against
        # ``utils.hashed_or_simple``)
        suffix_length = len(".json")
        super(Storage, self).__init__(declaration[:-suffix_length])

    def hash(self):
        """The hash of the dataformat declaration JSON

        Delegates to the base class ``hash()``, passing ``"#description"``
        (presumably so the description does not influence the hash -- see
        ``utils.Storage.hash``).
        """

        return super(Storage, self).hash("#description")
# ----------------------------------------------------------
class DataFormat(object):
    """Data formats define the chunks of data that circulate between blocks.

    Parameters:

      prefix (str): Establishes the prefix of your installation.

      data (str, dict): The fully qualified dataformat name (e.g.
        ``user/format/1``) or a dictionary representing the data format (for
        analyzer results).

      parent (:py:class:`tuple`, Optional): The parent DataFormat for this
        format. If set to ``None``, this means this dataformat is the first one
        on the hierarchy tree. If set to a tuple, the contents are
        ``(format-instance, field-name)``, which indicates the originating
        object that is this object's parent and the name of the field on that
        object that points to this one.

      dataformat_cache (:py:class:`dict`, Optional): A dictionary mapping
        dataformat names to loaded dataformats. This parameter is optional and,
        if passed, may greatly speed-up data format loading times as
        dataformats that are already loaded may be re-used. If you use this
        parameter, you must guarantee that the cache is refreshed as
        appropriate in case the underlying dataformats change.


    Attributes:

      storage (object): A simple object that provides information about file
        paths for this dataformat (``None`` when built from a dictionary)

      errors (list): A list of strings containing errors found while loading
        this dataformat.

      data (dict): The original data for this dataformat, as loaded by our JSON
        decoder.

      resolved (dict): A dictionary similar to :py:attr:`data`, but with
        references fully resolved. It stays ``None`` if loading failed.

      referenced (dict): A dictionary pointing to all loaded dataformats.

      parent (tuple): The ``(format-instance, field-name)`` pair given at
        construction time (or ``None``). It is useful for internal error
        reporting and for naming nested formats.

    """

    def __init__(self, prefix, data, parent=None, dataformat_cache=None):

        self._name = None
        self.storage = None
        self.resolved = None
        self.prefix = prefix
        self.errors = []
        self.data = None
        self.referenced = {}
        self.parent = parent

        # if the user has not provided a cache, still use one for performance
        dataformat_cache = dataformat_cache if dataformat_cache is not None else {}

        try:
            self._load(data, dataformat_cache)
        finally:
            if self._name is not None:  # registers it into the cache, even if failed
                dataformat_cache[self._name] = self

    def _load(self, data, dataformat_cache):
        """Loads the dataformat

        Reads the declaration (from ``data`` itself if it is a dictionary,
        otherwise from the JSON file located through :py:class:`Storage`),
        then fills :py:attr:`resolved` and :py:attr:`referenced`. Problems are
        appended to :py:attr:`errors`.

        Parameters:

          data (str, dict): Dataformat name or inline declaration

          dataformat_cache (dict): Maps names to already loaded dataformats;
            it is updated in place.

        """

        if isinstance(data, dict):
            self._name = "analysis:result"
            self.data = data
        else:
            self._name = data
            self.storage = Storage(self.prefix, data)
            json_path = self.storage.json.path
            if not self.storage.exists():
                self.errors.append(
                    "Dataformat declaration file not found: %s" % json_path
                )
                return

            with open(json_path, "rb") as f:
                try:
                    self.data = json.loads(
                        f.read().decode("utf-8"),
                        object_pairs_hook=utils.error_on_duplicate_key_hook,
                    )
                except (RuntimeError, ValueError) as error:
                    # JSON syntax errors and UTF-8 decoding errors are
                    # ``ValueError`` subclasses: they are reported through
                    # ``errors`` like any other invalid declaration
                    self.errors.append(
                        "Dataformat declaration file invalid: %s" % error
                    )
                    return

        dataformat_cache[self._name] = self  # registers itself into the cache

        self.resolved = copy.deepcopy(self.data)

        # remove reserved fields
        def is_reserved(x):
            """Returns if the field name is a reserved name"""
            return (x.startswith("__") and x.endswith("__")) or x in (
                "#description",
                "#schema_version",
            )

        for key in list(self.resolved):
            if is_reserved(key):
                del self.resolved[key]

        def maybe_load_format(name, obj, dataformat_cache):
            """Tries to load a given dataformat from its relative path

            ``name`` is the field the value ``obj`` belongs to. Strings
            containing a ``/`` are treated as references to other dataformats,
            dictionaries as inline declarations and lists as arrays whose last
            element carries the type. Anything else is returned untouched.
            """

            if isinstance(obj, six.string_types) and obj.find("/") != -1:  # load it

                if obj in dataformat_cache:  # reuse

                    # a cache entry explicitly set to ``None`` is treated as a
                    # marker for a recursive reference
                    if dataformat_cache[obj] is None:  # recursion detected
                        return self

                    self.referenced[obj] = dataformat_cache[obj]

                else:  # load it
                    self.referenced[obj] = DataFormat(
                        self.prefix, obj, (self, name), dataformat_cache
                    )

                return self.referenced[obj]

            elif isinstance(obj, dict):  # inline declaration, load from scratch
                return DataFormat(self.prefix, obj, (self, name), dataformat_cache)

            elif isinstance(obj, list):
                retval = copy.deepcopy(obj)
                # propagate our own ``name`` (not a variable borrowed from the
                # enclosing scope) to the element type
                retval[-1] = maybe_load_format(name, obj[-1], dataformat_cache)
                return retval

            return obj

        # now checks that every referred dataformat is loaded, resolves it
        # NOTE(review): only the two "#" meta attributes are skipped here, so
        # ``__dunder__`` fields removed above are added back by this loop --
        # confirm whether that is intended.
        for field, value in self.data.items():
            if field in ("#description", "#schema_version"):
                continue  # skip the description and schema version meta attributes
            self.resolved[field] = maybe_load_format(field, value, dataformat_cache)

        # at this point, there should be no more external references in
        # ``self.resolved``. We treat the "#extends" property, which requires a
        # special handling, given its nature.
        if "#extends" in self.resolved:

            ext = self.data["#extends"]
            self.referenced[ext] = maybe_load_format(self._name, ext, dataformat_cache)
            basetype = self.resolved["#extends"]

            # the base may be unusable: it failed to load (``resolved`` is
            # ``None``) or "#extends" was not a reference to a dataformat
            base_fields = getattr(basetype, "resolved", None)
            if base_fields is None:
                self.errors.append(
                    "Cannot extend dataformat `%s': it could not be loaded" % (ext,)
                )
                self.resolved = None
                return

            # merge into a *new* dictionary: updating ``basetype.resolved`` in
            # place would leak our fields into the (cached, shared) base format
            merged = dict(base_fields)
            merged.update(self.resolved)
            del merged["#extends"]  # avoids infinite recursion
            self.resolved = merged

    @property
    def name(self):
        """Name of this object, either from the filename or composed from the hierarchy
        it belongs.
        """
        if self.parent and self._name is None:
            return self.parent[0].name + "." + self.parent[1] + "_type"
        else:
            return self._name or "__unnamed_dataformat__"

    @name.setter
    def name(self, value):
        self._name = value
        self.storage = Storage(self.prefix, value)

    @property
    def schema_version(self):
        """Returns the schema version (``1`` if the declaration sets none)"""
        return self.data.get("#schema_version", 1)

    @property
    def extends(self):
        """If this dataformat extends another one, this is it, otherwise ``None``"""
        return self.data.get("#extends")

    @property
    def type(self):
        """Returns a new type that can create instances of this dataformat.

        The new returned type provides a basis to construct new objects which
        represent the dataformat. It provides a simple JSON serializer and a
        for-screen representation.

        Example:

          To create an object respecting the data format from a JSON
          descriptor, use the following technique:

          .. code-block:: python

             ftype = dataformat(...).type
             json = simplejson.loads(...)
             newobj = ftype(**json) # instantiates the new object, checks format

          To dump the object into JSON, use the following technique:

          .. code-block:: python

             simplejson.dumps(newobj.as_dict(), indent=4)

          A string representation of the object uses the technique above to
          pretty-print the object contents to the screen.

        Raises:

          RuntimeError: If this dataformat was not properly loaded
            (:py:attr:`resolved` is ``None``).

        """

        if self.resolved is None:
            raise RuntimeError(
                "Cannot prototype while not properly initialized\n{}".format(
                    self.errors
                )
            )

        classname = re.sub(r"[-/]", "_", self.name)
        if not isinstance(classname, str):
            # ``type()`` below needs a native ``str`` for the class name
            classname = str(classname)

        def init(self, **kwargs):
            baseformat.__init__(self, **kwargs)

        def simple_type(spec):
            """Maps a basic type name to ``str`` or to a numpy dtype"""
            if spec in ("string", "str"):
                return str
            return numpy.dtype(spec)

        attributes = dict(__init__=init, _name=self.name, _format=self.resolved)

        # create the converters for the class we're about to return
        for k, v in self.resolved.items():

            if isinstance(v, list):  # it is an array
                # leading entries are kept as they are, the last one holds
                # the element type
                attributes[k] = copy.deepcopy(v)
                if isinstance(v[-1], DataFormat):
                    attributes[k][-1] = v[-1].type
                else:
                    attributes[k][-1] = simple_type(v[-1])

            elif isinstance(v, DataFormat):  # it is another dataformat
                attributes[k] = v.type

            else:  # it is a simple type
                attributes[k] = simple_type(v)

        return type(classname, (baseformat,), attributes)

    @property
    def valid(self):
        """A boolean that indicates if this dataformat is valid or not"""
        return not bool(self.errors)

    @property
    def description(self):
        """Short description string, loaded from the JSON file if one was set"""
        return self.data.get("#description", None)

    @description.setter
    def description(self, value):
        self.data["#description"] = value

    @property
    def documentation(self):
        """The full-length description for this object

        Returns ``None`` if no documentation is stored.

        Raises:

          RuntimeError: If this dataformat has no name.

        """

        if not self._name:
            raise RuntimeError("dataformat has no name")

        if self.storage.doc.exists():
            return self.storage.doc.load()
        return None

    @documentation.setter
    def documentation(self, value):

        if not self._name:
            raise RuntimeError("dataformat has no name")

        # accepts either the text itself or an object with a ``read()`` method
        if hasattr(value, "read"):
            self.storage.doc.save(value.read())
        else:
            self.storage.doc.save(value)

    def hash(self):
        """Returns the hexadecimal hash for its declaration

        Raises:

          RuntimeError: If this dataformat has no name.

        """

        if not self._name:
            raise RuntimeError("dataformat has no name")

        return self.storage.hash()

    def validate(self, data):
        """Validates a piece of data provided by the user

        In order to validate, the data object must be complete and
        safe-castable to this dataformat. For any other validation operation
        that would require special settings, use instead the :py:attr:`type`
        attribute to generate a valid type and use either ``from_dict``,
        ``unpack`` or ``unpack_from`` depending on your use-case.

        Parameters:

          data (dict, str, bytes, :std:term:`file object`): This parameter
            represents the data to be validated. It may be a dictionary with
            the JSON representation of a data blob or, else, a binary blob
            (represented by either a string or a file descriptor object) from
            which the data will be read. If problems occur, an exception is
            raised.

        Returns:

          ``None``: Raises if an error occurs.

        """

        obj = self.type()
        if isinstance(data, dict):
            obj.from_dict(data, casting="safe", add_defaults=False)
        elif isinstance(data, six.string_types + (six.binary_type,)):
            # on Python 3 an in-memory binary blob is ``bytes``, which
            # ``six.string_types`` alone does not cover
            obj.unpack(data)
        else:
            obj.unpack_from(data)

    def isparent(self, other):
        """Tells if the other object extends self (directly or indirectly).

        Parameters:

          other (DataFormat): another object to check


        Returns:

          bool: ``True``, if ``self`` is a parent of ``other`` (that is,
          ``other`` extends ``self``, possibly through intermediate formats).
          ``False`` otherwise.

        """

        if other.extends:
            if self.name == other.extends:
                return True
            else:
                # climb one level up in the hierarchy of ``other``
                return self.isparent(other.referenced[other.extends])

        return False

    def json_dumps(self, indent=4):
        """Dumps the JSON declaration of this object in a string

        Parameters:

          indent (int): The number of indentation spaces at every indentation
            level


        Returns:

          str: The JSON representation for this object

        """

        return json.dumps(self.data, indent=indent, cls=utils.NumpyJSONEncoder)

    def __str__(self):
        return self.json_dumps()

    def write(self, storage=None):
        """Writes contents to prefix location

        Parameters:

          storage (:py:class:`.Storage`, Optional): If you pass a new storage,
            then this object will be written to that storage point rather than
            its default.

        Raises:

          RuntimeError: If no storage is given and this dataformat has no name.

        """

        if storage is None:
            if not self._name:
                raise RuntimeError("dataformat has no name")
            storage = self.storage  # overwrite

        storage.save(str(self), self.description)

    def export(self, prefix):
        """Recursively exports itself into another prefix

        Other required dataformats are also copied.

        Parameters:

          prefix (str): Establishes the prefix of your installation.


        Returns:

          None

        Raises:

          RuntimeError: If prefix and self.prefix point to the same directory,
            if this dataformat has no name or if it is not valid.

        """

        if not self._name:
            raise RuntimeError("dataformat has no name")

        if not self.valid:
            raise RuntimeError("dataformat is not valid:\n{}".format(self.errors))

        if prefix == self.prefix:
            raise RuntimeError(
                "Cannot export dataformat to the same prefix (%s)" % prefix
            )

        # dependencies first, then this dataformat
        for k in self.referenced.values():
            k.export(prefix)

        self.write(Storage(prefix, self.name))