#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
###################################################################################
# #
# Copyright (c) 2019 Idiap Research Institute, http://www.idiap.ch/ #
# Contact: beat.support@idiap.ch #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are met: #
# #
# 1. Redistributions of source code must retain the above copyright notice, this #
# list of conditions and the following disclaimer. #
# #
# 2. Redistributions in binary form must reproduce the above copyright notice, #
# this list of conditions and the following disclaimer in the documentation #
# and/or other materials provided with the distribution. #
# #
# 3. Neither the name of the copyright holder nor the names of its contributors #
# may be used to endorse or promote products derived from this software without #
# specific prior written permission. #
# #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED #
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE #
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE #
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL #
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR #
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER #
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, #
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
###################################################################################
"""
========
database
========
Validation of databases
Forward importing from :py:mod:`beat.backend.python.database`:
:py:class:`beat.backend.python.database.Storage`
"""
import os
import six
from beat.backend.python.database import Database as BackendDatabase
from beat.backend.python.database import Storage
from beat.backend.python.protocoltemplate import Storage as PTStorage
from . import prototypes
from . import schema
from .dataformat import DataFormat
from .protocoltemplate import ProtocolTemplate
def get_first_procotol_template(prefix):
    """Returns the name of the first valid protocol template found in a prefix

    Protocol template folders are visited in alphabetical order. For each
    folder only its most recent version is tried, and the first one that
    loads as a valid :py:class:`ProtocolTemplate` is returned.


    Parameters:

      prefix (str): Path to the prefix of your installation; the protocol
        templates are looked up in its ``PTStorage.asset_folder`` subfolder.


    Returns:

      str: The selected protocol template name, formatted as
      ``<folder>/<version>``.


    Raises:

      RuntimeError: If the protocol template folder is empty or if none of
        the protocol templates found is valid.

    """

    def _version_rank(stem):
        # Numeric versions are compared as integers so that "10" ranks above
        # "9" (a plain string sort would pick "9" as the latest version).
        # Non-numeric stems rank below every numeric one.
        try:
            return (1, int(stem), stem)
        except ValueError:
            return (0, 0, stem)

    pt_root_folder = os.path.join(prefix, PTStorage.asset_folder)
    pts_available = sorted(os.listdir(pt_root_folder))

    if not pts_available:
        raise RuntimeError("Invalid prefix content, no protocol template available")

    for procotol_template_folder in pts_available:
        folder_path = os.path.join(pt_root_folder, procotol_template_folder)
        if not os.path.isdir(folder_path):
            continue  # stray file next to the template folders

        # Several files may share the same version stem (the part before the
        # first dot), so de-duplicate before ranking.
        stems = {entry.split(".")[0] for entry in os.listdir(folder_path)}
        stems.discard("")  # dot-files carry no version information
        if not stems:
            continue  # empty folder: nothing to try here

        version = max(stems, key=_version_rank)
        protocol_template_name = "{}/{}".format(procotol_template_folder, version)

        if ProtocolTemplate(prefix, protocol_template_name).valid:
            return protocol_template_name

    raise RuntimeError("No valid protocol template found")
class Database(BackendDatabase):
    """Databases define the start point of the dataflow in an experiment.


    Parameters:

      prefix (str): Establishes the prefix of your installation.

      data (dict, str): The piece of data representing the database. It must
        validate against the schema defined for databases. If a string is
        passed, it is supposed to be a valid path to a database in the
        designated prefix area.

      dataformat_cache (:py:class:`dict`, Optional): A dictionary mapping
        dataformat names to loaded dataformats. This parameter is optional and,
        if passed, may greatly speed-up database loading times as dataformats
        that are already loaded may be re-used. If you use this parameter, you
        must guarantee that the cache is refreshed as appropriate in case the
        underlying dataformats change.


    Attributes:

      name (str): The full, valid name of this database

      description (str): The short description string, loaded from the JSON
        file if one was set.

      documentation (str): The full-length docstring for this object.

      storage (object): A simple object that provides information about file
        paths for this database

      errors (list): A list containing errors found while loading this
        database.

      data (dict): The original data for this database, as loaded by our JSON
        decoder.

    """

    def __init__(self, prefix, data, dataformat_cache=None):
        super(Database, self).__init__(prefix, data, dataformat_cache)

    def _validate_view(self, view_name):
        """Records an error if ``view_name`` contains a dot or a path separator

        Such names would designate a python module or a file located in a
        subdirectory, which the error message reports as unsupported.
        """

        if "." in view_name or os.sep in view_name:
            self.errors.append(
                "dataset views are required to sit inside the "
                "database root folder, but `%s' is either in a "
                "subdirectory or points to a python module, what is "
                "unsupported by this version" % (view_name)
            )

    def _load(self, data, dataformat_cache):
        """Loads the database declaration and its view code, then validates it

        ``data`` may be a dictionary, a string naming a database stored in the
        prefix, ``None`` (the default prototype is loaded) or a 2-element
        tuple/list ``(data, code)``. Problems are accumulated in
        ``self.errors`` and stop the loading as soon as they are detected.
        """

        self._name = None
        self.storage = None
        self.dataformats = {}  # preloaded dataformats

        code = None

        if isinstance(data, (tuple, list)):  # user has passed individual info
            data, code = data  # break down into two components

        if isinstance(data, six.string_types):  # user has passed a file pointer
            self._name = data
            self.storage = Storage(self.prefix, self._name)
            data = self.storage.json.path
            if not self.storage.json.exists():
                self.errors.append("Database declaration file not found: %s" % data)
                return

        # At this point, `data' can be a dictionary or ``None``
        if data is None:  # loads the default declaration for a database
            self.data, self.errors = prototypes.load("database")
            # Check the prototype before touching its content, so that a
            # broken prototype is reported as such and not as a lookup failure
            assert not self.errors, "\n * %s" % "\n * ".join(self.errors)  # nosec
            self.data["protocols"][0]["template"] = get_first_procotol_template(
                self.prefix
            )
        else:
            # this runs basic validation, including JSON loading if required
            self.data, self.errors = schema.validate("database", data)

        if self.errors:
            return  # don't proceed with the rest of validation

        if self.storage is not None:  # loading from the disk, check code
            if not self.storage.code.exists():
                self.errors.append(
                    "Database view code not found: %s" % self.storage.code.path
                )
                return

            code = self.storage.code.load()

        # At this point, `code' can be a string (or a binary blob) or ``None``
        if code is None:  # loads the default code for a database
            self.code = prototypes.binary_load("database.py")
        else:  # just assign it - notice that in this case, no language is set
            self.code = code

        if self.errors:
            return  # don't proceed with the rest of validation

        self._validate_semantics(dataformat_cache)

    def _validate_semantics(self, dataformat_cache):
        """Validates all semantical aspects of the database

        Checks that protocol names are unique, that set names are unique
        within each protocol, that every output data format loads without
        errors and that view names pass :py:meth:`_validate_view`. Problems
        are appended to ``self.errors``. Data formats loaded here are stored
        in ``self.dataformats`` and added to ``dataformat_cache``.
        """

        if dataformat_cache is None:
            # the constructor accepts ``None``; membership tests need a dict
            dataformat_cache = {}

        # all protocol names must be unique
        protocol_names = [k["name"] for k in self.data["protocols"]]
        if len(protocol_names) != len(set(protocol_names)):
            self.errors.append(
                "found different protocols with the same name: %s" % (protocol_names,)
            )

        # all set names within a protocol must be unique
        for protocol in self.data["protocols"]:
            set_names = self.set_names(protocol["name"])
            if len(set_names) != len(set(set_names)):
                self.errors.append(
                    "found different sets with the same name at protocol "
                    "`%s': %s" % (protocol["name"], set_names)
                )

            # all outputs must have valid data types
            for _, set_ in self.sets(protocol["name"]).items():

                for key, value in set_["outputs"].items():

                    if value in self.dataformats:
                        continue  # already checked for a previous output

                    if value in dataformat_cache:  # re-use
                        dataformat = dataformat_cache[value]
                    else:
                        dataformat = DataFormat(self.prefix, value)
                        dataformat_cache[value] = dataformat

                    self.dataformats[value] = dataformat

                    if dataformat.errors:
                        self.errors.append(
                            "found error validating data format `%s' "
                            "for output `%s' on set `%s' of protocol `%s': %s"
                            % (
                                value,
                                key,
                                set_["name"],
                                protocol["name"],
                                "\n".join(dataformat.errors),
                            )
                        )

                # all view names must be relative to the database root path
                # (schema version 1 declares the view on each set)
                if self.schema_version == 1:
                    self._validate_view(set_["view"])

            # other schema versions declare the views on the protocol itself
            if self.schema_version != 1:
                for view in protocol["views"]:
                    self._validate_view(view)

    @property
    def is_database_rawdata_access_enabled(self):
        """Returns whether raw data sharing was enabled

        This property is only useful for the Docker executor.
        """

        return self.data.get("direct_rawdata_access", False)