# Source code for beat.backend.python.dataformat

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :

###################################################################################
#                                                                                 #
# Copyright (c) 2019 Idiap Research Institute, http://www.idiap.ch/               #
# Contact: beat.support@idiap.ch                                                  #
#                                                                                 #
# Redistribution and use in source and binary forms, with or without              #
# modification, are permitted provided that the following conditions are met:     #
#                                                                                 #
# 1. Redistributions of source code must retain the above copyright notice, this  #
# list of conditions and the following disclaimer.                                #
#                                                                                 #
# 2. Redistributions in binary form must reproduce the above copyright notice,    #
# this list of conditions and the following disclaimer in the documentation       #
# and/or other materials provided with the distribution.                          #
#                                                                                 #
# 3. Neither the name of the copyright holder nor the names of its contributors   #
# may be used to endorse or promote products derived from this software without   #
# specific prior written permission.                                              #
#                                                                                 #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED   #
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          #
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE    #
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL      #
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR      #
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER      #
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   #
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE   #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.            #
#                                                                                 #
###################################################################################


"""
==========
dataformat
==========

Validation and parsing for dataformats
"""

import copy
import re

import numpy
import simplejson as json
import six

from . import utils
from .baseformat import baseformat

# ----------------------------------------------------------


class Storage(utils.Storage):
    """Resolves paths for dataformats

    Parameters:

      prefix (str): Establishes the prefix of your installation.

      name (str): The name of the dataformat object in the format
        ``<user>/<name>/<version>``.


    Raises:

      RuntimeError: If ``name`` does not contain exactly two ``/``
        separators.

    """

    asset_type = "dataformat"
    asset_folder = "dataformats"

    def __init__(self, prefix, name):
        if name.count("/") != 2:
            raise RuntimeError("invalid dataformat name: `%s'" % name)

        self.username, self.name, self.version = name.split("/")
        self.fullname = name
        self.prefix = prefix

        json_suffix = ".json"
        path = utils.hashed_or_simple(
            self.prefix, self.asset_folder, name, suffix=json_suffix
        )
        # The parent class is initialised with the path stripped of its last
        # five characters, i.e. the length of the ".json" suffix requested
        # above (presumably the suffix itself -- confirm against
        # ``utils.hashed_or_simple``).
        super(Storage, self).__init__(path[: -len(json_suffix)])

    def hash(self):
        """The hash of the dataformat declaration JSON

        Delegates to the parent class, passing ``"#description"`` as its
        argument (presumably the field to leave out of the hash -- confirm
        against ``utils.Storage.hash``).
        """
        return super(Storage, self).hash("#description")


# ----------------------------------------------------------


class DataFormat(object):
    """Data formats define the chunks of data that circulate between blocks.

    Parameters:

      prefix (str): Establishes the prefix of your installation.

      data (str, dict): The fully qualified dataformat name (e.g.
        ``user/format/1``) or a dictionary representing the data format (for
        analyzer results).

      parent (:py:class:`tuple`, Optional): The parent DataFormat for this
        format. If set to ``None``, this means this dataformat is the first one
        on the hierarchy tree. If set to a tuple, the contents are
        ``(format-instance, field-name)``, which indicates the originating
        object that is this object's parent and the name of the field on that
        object that points to this one.

      dataformat_cache (:py:class:`dict`, Optional): A dictionary mapping
        dataformat names to loaded dataformats. This parameter is optional and,
        if passed, may greatly speed-up data format loading times as
        dataformats that are already loaded may be re-used. If you use this
        parameter, you must guarantee that the cache is refreshed as
        appropriate in case the underlying dataformats change.

    Attributes:

      storage (object): A simple object that provides information about file
        paths for this dataformat. It stays ``None`` for formats built from a
        dictionary.

      errors (list): A list of strings containing errors found while loading
        this dataformat.

      data (dict): The original data for this dataformat, as loaded by our JSON
        decoder. It stays ``None`` if the declaration could not be loaded.

      resolved (dict): A dictionary similar to :py:attr:`data`, but with
        references fully resolved and reserved fields removed. It stays
        ``None`` if the declaration could not be loaded.

      referenced (dict): A dictionary pointing to all loaded dataformats.

      parent (tuple): The ``(format-instance, field-name)`` pair given at
        construction time (or ``None``). It is useful for internal error
        reporting.

    """

    # Meta keys of a declaration which never describe a data field
    _META_FIELDS = ("#description", "#schema_version")

    def __init__(self, prefix, data, parent=None, dataformat_cache=None):
        self._name = None
        self.storage = None
        self.prefix = prefix
        self.errors = []
        self.data = None
        self.resolved = None
        self.referenced = {}
        self.parent = parent

        # if the user has not provided a cache, still use one for performance
        if dataformat_cache is None:
            dataformat_cache = {}

        try:
            self._load(data, dataformat_cache)
        finally:
            if self._name is not None:  # registers it into the cache, even if failed
                dataformat_cache[self._name] = self

    @staticmethod
    def _is_reserved(key):
        """Returns if the field name is a reserved name

        Reserved names are the ``#description`` and ``#schema_version`` meta
        keys and any key both starting and ending with a double underscore.
        """
        if key in DataFormat._META_FIELDS:
            return True
        return key.startswith("__") and key.endswith("__")

    def _load(self, data, dataformat_cache):
        """Loads the dataformat

        Parameters:

          data (str, dict): Either the name of the dataformat, whose JSON
            declaration is then read from the location given by
            :py:class:`Storage`, or the declaration itself as a dictionary.

          dataformat_cache (dict): Maps names to already loaded dataformats.
            This object registers itself in it before resolving its fields.

        Problems with the declaration file (missing file, error raised as
        ``RuntimeError`` by the JSON decoding) are appended to
        :py:attr:`errors` and leave :py:attr:`resolved` set to ``None``.
        """

        if isinstance(data, dict):
            self._name = "analysis:result"
            self.data = data
        else:
            self._name = data
            self.storage = Storage(self.prefix, data)
            json_path = self.storage.json.path
            if not self.storage.exists():
                self.errors.append(
                    "Dataformat declaration file not found: %s" % json_path
                )
                return

            with open(json_path, "rb") as f:
                # NOTE(review): only ``RuntimeError`` is handled here, so a
                # syntactically broken JSON file (a ``ValueError`` from the
                # decoder) still propagates to the caller -- confirm this is
                # the intended contract before widening the clause.
                try:
                    self.data = json.loads(
                        f.read().decode("utf-8"),
                        object_pairs_hook=utils.error_on_duplicate_key_hook,
                    )
                except RuntimeError as error:
                    self.errors.append(
                        "Dataformat declaration file invalid: %s" % error
                    )
                    return

        dataformat_cache[self._name] = self  # registers itself into the cache

        self.resolved = copy.deepcopy(self.data)

        # remove reserved fields
        for key in list(self.resolved):
            if self._is_reserved(key):
                del self.resolved[key]

        # now checks that every referred dataformat is loaded, resolves it.
        # Reserved fields are skipped so that they are not added back to
        # ``self.resolved`` after having been removed above.
        for field, value in self.data.items():
            if self._is_reserved(field):
                continue
            self.resolved[field] = self._resolve_field(
                field, value, dataformat_cache
            )

        # at this point, there should be no more external references in
        # ``self.resolved``. We treat the "#extends" property, which requires a
        # special handling, given its nature.
        if "#extends" in self.resolved:
            self._merge_base()

    def _resolve_field(self, name, obj, dataformat_cache):
        """Tries to load a given dataformat from its relative path

        Parameters:

          name (str): The name of the field being resolved. It is recorded as
            the field name in the ``parent`` tuple of any newly created
            dataformat.

          obj (object): The declared value of the field.

          dataformat_cache (dict): Maps names to already loaded dataformats.

        Returns:

          object: A :py:class:`DataFormat` if ``obj`` is a string containing
            a ``/`` or a dictionary; a copy of ``obj`` with its last element
            resolved if ``obj`` is a list; ``obj`` itself otherwise.
        """

        if isinstance(obj, six.string_types) and "/" in obj:  # load it
            if obj in dataformat_cache:  # reuse
                cached = dataformat_cache[obj]
                if cached is None:  # recursion detected
                    return self
                self.referenced[obj] = cached
            else:  # load it
                self.referenced[obj] = DataFormat(
                    self.prefix, obj, (self, name), dataformat_cache
                )
            return self.referenced[obj]

        if isinstance(obj, dict):  # can't cache it, must load from scratch
            return DataFormat(self.prefix, obj, (self, name), dataformat_cache)

        if isinstance(obj, list):
            # only the last element of a list is treated as a type; the
            # elements before it are copied untouched
            retval = copy.deepcopy(obj)
            retval[-1] = self._resolve_field(name, obj[-1], dataformat_cache)
            return retval

        return obj

    def _merge_base(self):
        """Merges the fields of the ``#extends`` base into ``self.resolved``

        The fields of the base come first and are overridden by the fields
        declared by this dataformat. The merge is done on a copy, so that the
        ``resolved`` dictionary of the base (which may be shared through the
        cache) is left untouched. If the base could not be resolved, an error
        is recorded and only the fields of this dataformat are kept.
        """

        ext = self.data["#extends"]
        basetype = self.resolved["#extends"]

        if not isinstance(basetype, DataFormat) or basetype.resolved is None:
            self.errors.append(
                "Cannot extend dataformat `%s': it could not be loaded" % (ext,)
            )
            del self.resolved["#extends"]
            return

        self.referenced[ext] = basetype

        merged = copy.copy(basetype.resolved)  # never modify the base itself
        merged.update(self.resolved)
        del merged["#extends"]  # avoids infinite recursion
        self.resolved = merged

    @property
    def name(self):
        """Name of this object, either from the filename or composed from the hierarchy
        it belongs.
        """
        if self.parent and self._name is None:
            return self.parent[0].name + "." + self.parent[1] + "_type"
        return self._name or "__unnamed_dataformat__"

    @name.setter
    def name(self, value):
        self._name = value
        self.storage = Storage(self.prefix, value)

    @property
    def schema_version(self):
        """Returns the schema version (``1`` if none is declared)"""
        return (self.data or {}).get("#schema_version", 1)

    @property
    def extends(self):
        """If this dataformat extends another one, this is it, otherwise ``None``"""
        return (self.data or {}).get("#extends")

    @staticmethod
    def _python_type(spec):
        """Converts the resolved declaration of one element into a type

        Nested dataformats yield their own generated type, ``"string"`` and
        ``"str"`` yield :py:class:`str` and anything else is handed to
        :py:class:`numpy.dtype`.
        """
        if isinstance(spec, DataFormat):  # it is another dataformat
            return spec.type
        if spec in ("string", "str"):
            return str
        return numpy.dtype(spec)

    @property
    def type(self):
        """Returns a new type that can create instances of this dataformat.

        The new returned type provides a basis to construct new objects which
        represent the dataformat. It provides a simple JSON serializer and a
        for-screen representation.

        Example:

          To create an object respecting the data format from a JSON
          descriptor, use the following technique:

          .. code-block:: python

            ftype = dataformat(...).type
            json = simplejson.loads(...)
            newobj = ftype(**json) # instantiates the new object, checks format

          To dump the object into JSON, use the following technique:

          .. code-block:: python

             simplejson.dumps(newobj.as_dict(), indent=4)

          A string representation of the object uses the technique above to
          pretty-print the object contents to the screen.

        Raises:

          RuntimeError: If this dataformat was not properly loaded.
        """

        if self.resolved is None:
            raise RuntimeError(
                "Cannot prototype while not properly initialized\n{}".format(
                    self.errors
                )
            )

        classname = re.sub(r"[-/]", "_", self.name)
        if not isinstance(classname, str):
            classname = str(classname)

        def init(self, **kwargs):
            baseformat.__init__(self, **kwargs)

        attributes = dict(__init__=init, _name=self.name, _format=self.resolved)

        # create the converters for the class we're about to return
        for field, spec in self.resolved.items():
            if isinstance(spec, list):  # it is an array
                # keep the leading elements and convert only the element type,
                # without copying a (possibly nested) dataformat just to
                # discard it afterwards
                attributes[field] = list(spec[:-1]) + [self._python_type(spec[-1])]
            else:  # it is another dataformat or a simple type
                attributes[field] = self._python_type(spec)

        return type(classname, (baseformat,), attributes)

    @property
    def valid(self):
        """A boolean that indicates if this dataformat is valid or not"""
        return not self.errors

    @property
    def description(self):
        """Short description string, loaded from the JSON file if one was set"""

        return (self.data or {}).get("#description", None)

    @description.setter
    def description(self, value):
        self.data["#description"] = value

    @property
    def documentation(self):
        """The full-length description for this object"""

        if not self._name:
            raise RuntimeError("dataformat has no name")

        if self.storage.doc.exists():
            return self.storage.doc.load()
        return None

    @documentation.setter
    def documentation(self, value):
        if not self._name:
            raise RuntimeError("dataformat has no name")

        # accepts either the contents or a file-like object to read them from
        contents = value.read() if hasattr(value, "read") else value
        self.storage.doc.save(contents)

    def hash(self):
        """Returns the hexadecimal hash for its declaration

        Raises:

          RuntimeError: If this dataformat has no name.
        """

        if not self._name:
            raise RuntimeError("dataformat has no name")

        return self.storage.hash()

    def validate(self, data):
        """Validates a piece of data provided by the user

        In order to validate, the data object must be complete and
        safe-castable to this dataformat. For any other validation operation
        that would require special settings, use instead the :py:attr:`type`
        property to generate a valid type and use either ``from_dict``,
        ``unpack`` or ``unpack_from`` depending on your use-case.

        Parameters:

          data (dict, str, bytes, :std:term:`file object`): This parameter
            represents the data to be validated.  It may be a dictionary with
            the JSON representation of a data blob or, else, a binary blob
            (represented by either a string or a file descriptor object) from
            which the data will be read. If problems occur, an exception is
            raised.

        Returns:

          ``None``: Raises if an error occurs.
        """

        obj = self.type()
        if isinstance(data, dict):
            obj.from_dict(data, casting="safe", add_defaults=False)
        elif isinstance(data, six.string_types + (six.binary_type,)):
            # NOTE(review): ``bytes`` is included so that in-memory binary
            # blobs also take this branch under Python 3; this assumes
            # ``baseformat.unpack`` is the entry point for in-memory blobs --
            # confirm against ``baseformat``.
            obj.unpack(data)
        else:
            obj.unpack_from(data)

    def isparent(self, other):
        """Tells if the other object extends self (directly or indirectly).

        Parameters:

          other (DataFormat): another object to check


        Returns:

          bool: ``True``, if ``self`` is a parent of ``other``. ``False``
            otherwise.

        """

        if not other.extends:
            return False

        if self.name == other.extends:
            return True

        # climbs one level up in the hierarchy of ``other``
        return self.isparent(other.referenced[other.extends])

    def json_dumps(self, indent=4):
        """Dumps the JSON declaration of this object in a string

        Parameters:

          indent (int): The number of indentation spaces at every indentation
            level


        Returns:

          str: The JSON representation for this object

        """

        return json.dumps(self.data, indent=indent, cls=utils.NumpyJSONEncoder)

    def __str__(self):
        return self.json_dumps()

    def write(self, storage=None):
        """Writes contents to prefix location

        Parameters:

          storage (:py:class:`.Storage`, Optional): If you pass a new storage,
            then this object will be written to that storage point rather than
            its default.


        Raises:

          RuntimeError: If no storage is given and this dataformat has no
            name.

        """

        if storage is None:
            if not self._name:
                raise RuntimeError("dataformat has no name")
            storage = self.storage  # overwrite

        storage.save(str(self), self.description)

    def export(self, prefix):
        """Recursively exports itself into another prefix

        Other required dataformats are also copied.


        Parameters:

          prefix (str): Establishes the prefix of your installation.


        Returns:

          None


        Raises:

          RuntimeError: If this dataformat has no name, is not valid, or if
            ``prefix`` is equal to ``self.prefix``.

        """

        if not self._name:
            raise RuntimeError("dataformat has no name")

        if not self.valid:
            raise RuntimeError("dataformat is not valid:\n{}".format(self.errors))

        if prefix == self.prefix:
            raise RuntimeError(
                "Cannot export dataformat to the same prefix (%s)" % prefix
            )

        # dependencies go first, then this dataformat itself
        for dependency in self.referenced.values():
            dependency.export(prefix)

        self.write(Storage(prefix, self.name))