# Source code for beat.core.execution.base

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :

###################################################################################
#                                                                                 #
# Copyright (c) 2019 Idiap Research Institute, http://www.idiap.ch/               #
# Contact: beat.support@idiap.ch                                                  #
#                                                                                 #
# Redistribution and use in source and binary forms, with or without              #
# modification, are permitted provided that the following conditions are met:     #
#                                                                                 #
# 1. Redistributions of source code must retain the above copyright notice, this  #
# list of conditions and the following disclaimer.                                #
#                                                                                 #
# 2. Redistributions in binary form must reproduce the above copyright notice,    #
# this list of conditions and the following disclaimer in the documentation       #
# and/or other materials provided with the distribution.                          #
#                                                                                 #
# 3. Neither the name of the copyright holder nor the names of its contributors   #
# may be used to endorse or promote products derived from this software without   #
# specific prior written permission.                                              #
#                                                                                 #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED   #
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          #
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE    #
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL      #
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR      #
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER      #
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,   #
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE   #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.            #
#                                                                                 #
###################################################################################


"""
====
base
====

Execution utilities
"""
import collections
import collections.abc
import glob
import logging
import os

import simplejson as json

from beat.backend.python.helpers import convert_experiment_configuration_to_container

from .. import algorithm
from .. import database
from .. import schema
from .. import stats

logger = logging.getLogger(__name__)


class BaseExecutor(object):
    """Executors runs the code given an execution block information

    Parameters:

      prefix (str): Establishes the prefix of your installation.

      data (dict, str): The piece of data representing the block to be
        executed. It must validate against the schema defined for execution
        blocks. If a string is passed, it is supposed to be a fully qualified
        absolute path to a JSON file containing the block execution
        information.

      cache (:py:class:`str`, Optional): If your cache is not located under
        ``<prefix>/cache``, then specify a full path here. It will be used
        instead.

      dataformat_cache (:py:class:`dict`, Optional): A dictionary mapping
        dataformat names to loaded dataformats. This parameter is optional
        and, if passed, may greatly speed-up database loading times as
        dataformats that are already loaded may be re-used. If you use this
        parameter, you must guarantee that the cache is refreshed as
        appropriate in case the underlying dataformats change.

      database_cache (:py:class:`dict`, Optional): A dictionary mapping
        database names to loaded databases. This parameter is optional and, if
        passed, may greatly speed-up database loading times as databases that
        are already loaded may be re-used. If you use this parameter, you must
        guarantee that the cache is refreshed as appropriate in case the
        underlying databases change.

      algorithm_cache (:py:class:`dict`, Optional): A dictionary mapping
        algorithm names to loaded algorithms. This parameter is optional and,
        if passed, may greatly speed-up database loading times as algorithms
        that are already loaded may be re-used. If you use this parameter, you
        must guarantee that the cache is refreshed as appropriate in case the
        underlying algorithms change.

      library_cache (:py:class:`dict`, Optional): A dictionary mapping library
        names to loaded libraries. This parameter is optional and, if passed,
        may greatly speed-up library loading times as libraries that are
        already loaded may be re-used. If you use this parameter, you must
        guarantee that the cache is refreshed as appropriate in case the
        underlying libraries change.


    Attributes:

      cache (str): The path to the cache currently being used

      errors (list): A list containing errors found while loading this
        execution block.

      data (dict): The original data for this executor, as loaded by our JSON
        decoder.

      algorithm (beat.core.algorithm.Algorithm): An object representing the
        algorithm to be run.

      databases (dict): A dictionary in which keys are strings with database
        names and values are :py:class:`.database.Database`, representing the
        databases required for running this block. The dictionary may be empty
        in case all inputs are taken from the file cache.

      views (dict): A dictionary in which the keys are tuples pointing to the
        ``(<database-name>, <protocol>, <set>)`` and the value is a setup view
        for that particular combination of details. The dictionary may be
        empty in case all inputs are taken from the file cache.

      input_list (beat.backend.python.inputs.InputList): A list of inputs that
        will be served to the algorithm.

      output_list (beat.backend.python.outputs.OutputList): A list of outputs
        that the algorithm will produce.

      data_sources (list): A list with all data-sources created by our
        execution loader.

      data_sinks (list): A list with all data-sinks created by our execution
        loader. These are useful for clean-up actions in case of problems.
    """

    def __init__(
        self,
        prefix,
        data,
        cache=None,
        dataformat_cache=None,
        database_cache=None,
        algorithm_cache=None,
        library_cache=None,
        custom_root_folders=None,
    ):
        # Initialisations
        self.prefix = prefix
        self.cache = cache or os.path.join(self.prefix, "cache")
        self.algorithm = None
        self.loop_algorithm = None
        self.databases = {}
        self.input_list = None
        self.data_loaders = None
        self.output_list = None
        self.data_sinks = []
        self.errors = []
        self.data = data
        self.debug = False

        # Check that the cache path exists
        if not os.path.exists(self.cache):
            raise IOError("Cache path `%s' does not exist" % self.cache)

        # Check the custom root folders
        if custom_root_folders is not None:
            if not isinstance(custom_root_folders, collections.abc.Mapping):
                raise TypeError("The custom root folders must be in dictionary format")
        else:
            custom_root_folders = {}

        # Temporary caches, if the user has not set them, for performance
        database_cache = database_cache if database_cache is not None else {}
        dataformat_cache = dataformat_cache if dataformat_cache is not None else {}
        algorithm_cache = algorithm_cache if algorithm_cache is not None else {}
        library_cache = library_cache if library_cache is not None else {}

        # Basic validation of the data declaration, including JSON loading if required
        if not isinstance(data, dict):
            if not os.path.exists(data):
                self.errors.append("File not found: %s" % data)
                return

        self.data, self.errors = schema.validate("execution", data)
        if self.errors:
            return

        # Load the algorithm (using the algorithm cache if possible)
        if self.data["algorithm"] in algorithm_cache:
            self.algorithm = algorithm_cache[self.data["algorithm"]]
        else:
            self.algorithm = algorithm.Algorithm(
                self.prefix, self.data["algorithm"], dataformat_cache, library_cache
            )
            algorithm_cache[self.algorithm.name] = self.algorithm

        if not self.algorithm.valid:
            self.errors += self.algorithm.errors
            return

        if "loop" in self.data:
            loop = self.data["loop"]
            if loop["algorithm"] in algorithm_cache:
                self.loop_algorithm = algorithm_cache[loop["algorithm"]]
            else:
                self.loop_algorithm = algorithm.Algorithm(
                    self.prefix, loop["algorithm"], dataformat_cache, library_cache
                )
                algorithm_cache[self.loop_algorithm.name] = self.loop_algorithm

            if len(loop["inputs"]) != len(self.loop_algorithm.input_map):
                self.errors.append(
                    "The number of inputs of the loop algorithm doesn't correspond"
                )

            # BUGFIX: the names must be checked against the *loop* algorithm's
            # declaration (the original code iterated the main block inputs and
            # compared them with the main algorithm, contradicting the message)
            for name in loop["inputs"].keys():
                if name not in self.loop_algorithm.input_map.keys():
                    self.errors.append(
                        "The input '%s' doesn't exist in the loop algorithm" % name
                    )

            if len(loop["outputs"]) != len(self.loop_algorithm.output_map):
                self.errors.append(
                    "The number of outputs of the loop algorithm doesn't correspond"
                )

            # BUGFIX: same as above, for the loop outputs
            for name in loop["outputs"].keys():
                if name not in self.loop_algorithm.output_map.keys():
                    self.errors.append(
                        "The output '%s' doesn't exist in the loop algorithm" % name
                    )

        # Check that the mapping is coherent
        if len(self.data["inputs"]) != len(self.algorithm.input_map):
            self.errors.append(
                "The number of inputs of the algorithm doesn't correspond"
            )

        if "outputs" in self.data and (
            len(self.data["outputs"]) != len(self.algorithm.output_map)
        ):
            self.errors.append(
                "The number of outputs of the algorithm doesn't correspond"
            )

        for name in self.data["inputs"].keys():
            if name not in self.algorithm.input_map.keys():
                self.errors.append(
                    "The input '%s' doesn't exist in the algorithm" % name
                )

        if "outputs" in self.data:
            for name in self.data["outputs"].keys():
                if name not in self.algorithm.output_map.keys():
                    self.errors.append(
                        "The output '%s' doesn't exist in the algorithm" % name
                    )

        if "loop" in self.data:
            # a looping block must declare both synchronisation channels
            for name in ["request", "answer"]:
                if name not in self.algorithm.loop_map.keys():
                    self.errors.append(
                        "The loop '%s' doesn't exist in the algorithm" % name
                    )

        if self.errors:
            return

        # Load the databases (if any is required)
        self._update_db_cache(
            self.data["inputs"], custom_root_folders, database_cache, dataformat_cache
        )

        if "loop" in self.data:
            self._update_db_cache(
                self.data["loop"]["inputs"],
                custom_root_folders,
                database_cache,
                dataformat_cache,
            )

    def __enter__(self):
        """Prepares inputs and outputs for the processing task

        Raises:

          IOError: in case something cannot be properly setup
        """

        logger.info("Start the execution of '%s'", self.algorithm.name)

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes all sinks and disconnects inputs and outputs"""

        for sink in self.data_sinks:
            # we save the output only if no valid error has been thrown
            # n.b.: a system exit will raise SystemExit which is not an Exception
            # BUGFIX: ``exc_type`` is the exception *class* (or None), so the
            # original ``isinstance(exc_type, Exception)`` was always False and
            # sinks were closed unconditionally; use issubclass instead
            if exc_type is None or not issubclass(exc_type, Exception):
                sink.close()

        self.input_list = None
        self.data_loaders = []
        self.output_list = None
        self.data_sinks = []

    def _update_db_cache(
        self, inputs, custom_root_folders, database_cache, dataformat_cache
    ):
        """Update the database cache based on the input list given"""

        for name, details in inputs.items():
            if "database" in details:
                if details["database"] not in self.databases:
                    if details["database"] in database_cache:
                        db = database_cache[details["database"]]
                    else:
                        db = database.Database(
                            self.prefix, details["database"], dataformat_cache
                        )

                        # renamed from ``name`` to avoid shadowing the loop
                        # variable; the key matches the custom_root_folders
                        # convention ``database/<name>``
                        root_key = "database/%s" % db.name
                        if root_key in custom_root_folders:
                            db.data["root_folder"] = custom_root_folders[root_key]

                        database_cache[db.name] = db

                    self.databases[db.name] = db

                    if not db.valid:
                        self.errors += db.errors

    def _prepare_inputs(self):
        """Prepares all input required by the execution."""

        raise NotImplementedError()

    def _prepare_outputs(self):
        """Prepares all output required by the execution."""

        raise NotImplementedError()
[docs] def process( self, virtual_memory_in_megabytes=0, max_cpu_percent=0, timeout_in_minutes=0 ): """Executes the user algorithm code Parameters: virtual_memory_in_megabytes (:py:class:`int`, Optional): The amount of virtual memory (in Megabytes) available for the job. If set to zero, no limit will be applied. max_cpu_percent (:py:class:`int`, Optional): The maximum amount of CPU usage allowed in a system. This number must be an integer number between 0 and ``100*number_of_cores`` in your system. For instance, if your system has 2 cores, this number can go between 0 and 200. If it is <= 0, then we don't track CPU usage. timeout_in_minutes (int): The number of minutes to wait for the user process to execute. After this amount of time, the user process is killed with ``signal.SIGKILL``. If set to zero, no timeout will be applied. Returns: dict: A dictionary which is JSON formattable containing the summary of this block execution. """ raise NotImplementedError()
@property def valid(self): """A boolean that indicates if this executor is valid or not""" return not bool(self.errors) @property def analysis(self): """A boolean that indicates if the current block is an analysis block""" return "result" in self.data @property def outputs_exist(self): """Returns ``True`` if outputs this block is supposed to produce exists.""" if self.analysis: path = os.path.join(self.cache, self.data["result"]["path"]) + "*" if not glob.glob(path): return False else: for name, details in self.data["outputs"].items(): path = os.path.join(self.cache, details["path"]) + "*" if not glob.glob(path): return False # if you get to this point all outputs already exist return True @property def io_statistics(self): """Summarize current I/O statistics looking at data sources and sinks, inputs and outputs Returns: dict: A dictionary summarizing current I/O statistics """ return stats.io_statistics(self.data, self.input_list, self.output_list) def __str__(self): return json.dumps(self.data, indent=4)
[docs] def write(self, path): """Writes contents to precise filesystem location""" with open(path, "wt") as f: f.write(str(self))
[docs] def dump_runner_configuration(self, directory): """Exports contents useful for a backend runner to run the algorithm""" data = convert_experiment_configuration_to_container(self.data) with open(os.path.join(directory, "configuration.json"), "wb") as f: json_data = json.dumps(data, indent=2) f.write(json_data.encode("utf-8")) tmp_prefix = os.path.join(directory, "prefix") if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) self.algorithm.export(tmp_prefix) if self.loop_algorithm: self.loop_algorithm.export(tmp_prefix)
[docs] def dump_databases_provider_configuration(self, directory): """Exports contents useful for a backend runner to run the algorithm""" with open(os.path.join(directory, "configuration.json"), "wb") as f: json_data = json.dumps(self.data, indent=2) f.write(json_data.encode("utf-8")) tmp_prefix = os.path.join(directory, "prefix") if not os.path.exists(tmp_prefix): os.makedirs(tmp_prefix) for db in self.databases.values(): db.export(tmp_prefix)