Source code for beat.core.database

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :

###################################################################################
#                                                                                 #
# Copyright (c) 2019 Idiap Research Institute, http://www.idiap.ch/               #
# Contact: beat.support@idiap.ch                                                  #
#                                                                                 #
# Redistribution and use in source and binary forms, with or without              #
# modification, are permitted provided that the following conditions are met:     #
#                                                                                 #
# 1. Redistributions of source code must retain the above copyright notice, this  #
# list of conditions and the following disclaimer.                                #
#                                                                                 #
# 2. Redistributions in binary form must reproduce the above copyright notice,    #
# this list of conditions and the following disclaimer in the documentation       #
# and/or other materials provided with the distribution.                          #
#                                                                                 #
# 3. Neither the name of the copyright holder nor the names of its contributors   #
# may be used to endorse or promote products derived from this software without   #
# specific prior written permission.                                              #
#                                                                                 #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED   #
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          #
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE    #
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL      #
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR      #
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER      #
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   #
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE   #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.            #
#                                                                                 #
###################################################################################


"""
========
database
========

Validation of databases

Forward importing from :py:mod:`beat.backend.python.database`:
:py:class:`beat.backend.python.database.Storage`
"""
import os

import six

from beat.backend.python.database import Database as BackendDatabase
from beat.backend.python.database import Storage
from beat.backend.python.protocoltemplate import Storage as PTStorage

from . import prototypes
from . import schema
from .dataformat import DataFormat
from .protocoltemplate import ProtocolTemplate


def get_first_procotol_template(prefix):
    """Returns the name of the first valid protocol template of a prefix

    Protocol template folders are visited in alphabetical order. For each
    folder, only its most recent version is considered; the first one whose
    :py:class:`ProtocolTemplate` reports itself as valid is selected.


    Parameters:

      prefix (str): Establishes the prefix of your installation.


    Returns:

      str: The selected protocol template name, formatted as
      ``<folder>/<version>``.


    Raises:

      RuntimeError: If the protocol template folder of the prefix is empty or
        if none of the protocol templates it contains is valid.

    """

    def _version_key(version):
        # Numeric versions are compared as integers so that "10" ranks above
        # "9"; anything else ranks below every numeric version.
        if version.isdigit():
            try:
                return (1, int(version), version)
            except ValueError:  # digit-like characters int() cannot parse
                pass
        return (0, 0, version)

    pt_root_folder = os.path.join(prefix, PTStorage.asset_folder)
    pts_available = sorted(os.listdir(pt_root_folder))

    if not pts_available:
        raise RuntimeError("Invalid prefix content, no protocol template available")

    for procotol_template_folder in pts_available:
        folder_path = os.path.join(pt_root_folder, procotol_template_folder)
        if not os.path.isdir(folder_path):
            continue  # stray file sitting next to the template folders

        # Several files may share one version (the part of the file name
        # before the first dot), hence the set.
        versions = {entry.split(".")[0] for entry in os.listdir(folder_path)}
        if not versions:
            continue  # empty folder: nothing to select here

        version = max(versions, key=_version_key)
        protocol_template_name = "{}/{}".format(procotol_template_folder, version)
        if ProtocolTemplate(prefix, protocol_template_name).valid:
            return protocol_template_name

    raise RuntimeError("No valid protocol template found")


class Database(BackendDatabase):
    """Databases define the start point of the dataflow in an experiment.


    Parameters:

      prefix (str): Establishes the prefix of your installation.

      data (dict, str): The piece of data representing the database. It must
        validate against the schema defined for databases. If a string is
        passed, it is supposed to be a valid path to a database in the
        designated prefix area.

      dataformat_cache (:py:class:`dict`, Optional): A dictionary mapping
        dataformat names to loaded dataformats. This parameter is optional and,
        if passed, may greatly speed-up database loading times as dataformats
        that are already loaded may be re-used. If you use this parameter, you
        must guarantee that the cache is refreshed as appropriate in case the
        underlying dataformats change.


    Attributes:

      name (str): The full, valid name of this database

      description (str): The short description string, loaded from the JSON
        file if one was set.

      documentation (str): The full-length docstring for this object.

      storage (object): A simple object that provides information about file
        paths for this database

      errors (list): A list containing errors found while loading this
        database.

      data (dict): The original data for this database, as loaded by our JSON
        decoder.

    """

    def __init__(self, prefix, data, dataformat_cache=None):
        super(Database, self).__init__(prefix, data, dataformat_cache)

    def _validate_view(self, view_name):
        """Records an error if a view name contains a dot or a path separator

        Parameters:

          view_name (str): The name of the view to check

        """

        if "." in view_name or os.sep in view_name:
            self.errors.append(
                "dataset views are required to sit inside the "
                "database root folder, but `%s' is either in a "
                "subdirectory or points to a python module, what is "
                "unsupported by this version" % (view_name)
            )

    def _load(self, data, dataformat_cache):
        """Loads the database

        Problems found along the way are accumulated in ``self.errors``;
        loading stops at the first stage that reports any.

        Parameters:

          data (dict, str, tuple, list, None): The database declaration. A
            two-element tuple or list is split into ``(declaration, code)``.
            A string is taken as the name of a database stored in the prefix.
            ``None`` loads the default database prototype.

          dataformat_cache (dict): Already loaded dataformats, indexed by
            name. Handed over to the semantic validation.

        """

        self._name = None
        self.storage = None
        self.dataformats = {}  # preloaded dataformats
        code = None

        if isinstance(data, (tuple, list)):  # user has passed individual info
            data, code = data  # break down into two components

        if isinstance(data, six.string_types):  # user has passed a file pointer
            self._name = data
            self.storage = Storage(self.prefix, self._name)
            data = self.storage.json.path
            if not self.storage.json.exists():
                self.errors.append("Database declaration file not found: %s" % data)
                return

        # At this point, `data' can be a dictionary or ``None``
        if data is None:  # loads the default declaration for a database
            self.data, self.errors = prototypes.load("database")
            # The prototype must be sound before it gets customised below,
            # otherwise the real problem would be masked by a failure there.
            assert not self.errors, "\n  * %s" % "\n  * ".join(self.errors)  # nosec
            self.data["protocols"][0]["template"] = get_first_procotol_template(
                self.prefix
            )
        else:
            # this runs basic validation, including JSON loading if required
            self.data, self.errors = schema.validate("database", data)

        if self.errors:
            return  # don't proceed with the rest of validation

        if self.storage is not None:  # loading from the disk, check code
            if not self.storage.code.exists():
                self.errors.append(
                    "Database view code not found: %s" % self.storage.code.path
                )
                return

            code = self.storage.code.load()

        # At this point, `code' can be a string (or a binary blob) or ``None``
        if code is None:  # loads the default code for a database
            self.code = prototypes.binary_load("database.py")
        else:  # just assign it - notice that in this case, no language is set
            self.code = code

        if self.errors:
            return  # don't proceed with the rest of validation

        self._validate_semantics(dataformat_cache)

    def _validate_semantics(self, dataformat_cache):
        """Validates all semantical aspects of the database

        Checks that protocol names are unique, that set names are unique
        within each protocol, that every output dataformat loads without
        errors and that view names pass :py:meth:`_validate_view`. Every
        problem found is appended to ``self.errors``.

        Parameters:

          dataformat_cache (dict): Already loaded dataformats, indexed by
            name. Dataformats loaded here are added to it.

        """

        # ``None`` is the constructor default; fall back to a private cache
        # here in case the value reaches this point unchanged.
        if dataformat_cache is None:
            dataformat_cache = {}

        # all protocol names must be unique
        protocol_names = [k["name"] for k in self.data["protocols"]]
        if len(protocol_names) != len(set(protocol_names)):
            self.errors.append(
                "found different protocols with the same name: %s" % (protocol_names,)
            )

        # all set names within a protocol must be unique
        for protocol in self.data["protocols"]:
            set_names = self.set_names(protocol["name"])
            if len(set_names) != len(set(set_names)):
                self.errors.append(
                    "found different sets with the same name at protocol "
                    "`%s': %s" % (protocol["name"], set_names)
                )

            # all outputs must have valid data types
            for _, set_ in self.sets(protocol["name"]).items():
                for key, value in set_["outputs"].items():
                    if value in self.dataformats:
                        continue  # already checked through another output

                    if value in dataformat_cache:  # re-use
                        dataformat = dataformat_cache[value]
                    else:
                        dataformat = DataFormat(self.prefix, value)
                        dataformat_cache[value] = dataformat

                    self.dataformats[value] = dataformat

                    if dataformat.errors:
                        self.errors.append(
                            "found error validating data format `%s' "
                            "for output `%s' on set `%s' of protocol `%s': %s"
                            % (
                                value,
                                key,
                                set_["name"],
                                protocol["name"],
                                "\n".join(dataformat.errors),
                            )
                        )

                # all view names must be relative to the database root path
                # (with schema version 1, the view is declared on the set)
                if self.schema_version == 1:
                    self._validate_view(set_["view"])

            # with other schema versions, views are declared on the protocol
            if self.schema_version != 1:
                for view in protocol["views"].keys():
                    self._validate_view(view)

    @property
    def is_database_rawdata_access_enabled(self):
        """Returns whether raw data sharing was enabled

        This property is only useful for the Docker executor.

        The value is read from the ``direct_rawdata_access`` entry of the
        database declaration and is ``False`` when that entry is absent.
        """

        return self.data.get("direct_rawdata_access", False)