Source code for bob.learn.tensorflow.style_transfer.neural_transfer

#!/usr/bin/env python
# vim: set fileencoding=utf-8 :
# @author: Tiago de Freitas Pereira <tiago.pereira@idiap.ch>


import tensorflow as tf
import numpy
import os
from bob.learn.tensorflow.loss import linear_gram_style_loss, content_loss, denoising_loss
import bob.io.image
import bob.ip.color

import logging
logger = logging.getLogger(__name__)


def compute_features(input_image, architecture, checkpoint_dir, target_end_points, preprocess_fn=None):
    """
    For a given set of end points, convolve the input image up to those points.

    Parameters
    ----------

    input_image: :any:`numpy.array`
        Input image in the format NxWxHxC

    architecture:
        Pointer to the architecture function

    checkpoint_dir: str
        DCNN checkpoint directory

    target_end_points: list
        List with the names of the end point tensors to be fetched

    preprocess_fn:
        Pointer to a preprocess function

    """

    input_pl = tf.placeholder('float32', shape=(1, input_image.shape[1],
                                                input_image.shape[2],
                                                input_image.shape[3]))

    if preprocess_fn is None:
        _, end_points = architecture(input_pl,
                                     mode=tf.estimator.ModeKeys.PREDICT,
                                     trainable_variables=None)
    else:
        _, end_points = architecture(tf.stack([preprocess_fn(i) for i in tf.unstack(input_pl)]),
                                     mode=tf.estimator.ModeKeys.PREDICT,
                                     trainable_variables=None)

    with tf.Session() as sess:
        # Restoring the checkpoint for the given architecture
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()

        if os.path.isdir(checkpoint_dir):
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
        else:
            saver.restore(sess, checkpoint_dir)

        # Fetching each requested end point for the given input image
        features = []
        for ep in target_end_points:
            feature = sess.run(end_points[ep], feed_dict={input_pl: input_image})
            features.append(feature)

    # Killing the graph
    tf.reset_default_graph()
    return features
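
# A minimal usage sketch for ``compute_features``. The ``toy_architecture``
# below is hypothetical and only illustrates the expected (logits, end_points)
# signature; any network following the ``bob.learn.tensorflow`` architecture
# convention, restored from a matching checkpoint, would be used in practice.
#
#   def toy_architecture(inputs, mode=tf.estimator.ModeKeys.PREDICT,
#                        trainable_variables=None):
#       net = tf.layers.conv2d(inputs, 8, (3, 3), name="conv1")
#       return net, {"conv1": net}
#
#   image = numpy.random.rand(1, 64, 64, 3).astype("float32")  # NxWxHxC
#   features = compute_features(image, toy_architecture,
#                               "/path/to/checkpoint", ["conv1"])
#   # ``features`` holds one numpy.array per requested end point
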
def compute_gram(features):
    """
    Given a list of features (as numpy.arrays), compute the Gram matrix of
    each one, pinning the channel dimension, as in:

    Gatys, Leon A., Alexander S. Ecker, and Matthias Bethge. "A neural
    algorithm of artistic style." arXiv preprint arXiv:1508.06576 (2015).

    Parameters
    ----------

    features: :any:`list`
        List of :any:`numpy.array` with convolved features in the format NxWxHxC

    """

    grams = []
    for f in features:
        # Flatten the batch and spatial dimensions, keeping the channels
        f = numpy.reshape(f, (-1, f.shape[3]))
        grams.append(numpy.matmul(f.T, f) / f.size)

    return grams
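
# Worked illustration of ``compute_gram`` (a sketch over a random feature
# map): the batch and spatial dimensions are flattened, so the Gram matrix
# has shape CxC and measures channel-to-channel correlations, normalized by
# the number of entries in the feature map.
#
#   f = numpy.random.rand(1, 4, 4, 2).astype("float32")
#   g = compute_gram([f])[0]
#   assert g.shape == (2, 2)        # one entry per channel pair
#   assert numpy.allclose(g, g.T)   # Gram matrices are symmetric
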
def do_style_transfer(content_image, style_images,
                      architecture, checkpoint_dir, scopes,
                      content_end_points, style_end_points,
                      preprocess_fn=None, un_preprocess_fn=None, pure_noise=False,
                      iterations=1000, learning_rate=0.1,
                      content_weight=5., style_weight=500., denoise_weight=500.):
    """
    Trains neural style transfer using the approach presented in:

    Gatys, Leon A., Alexander S. Ecker, and Matthias Bethge. "A neural
    algorithm of artistic style." arXiv preprint arXiv:1508.06576 (2015).

    Parameters
    ----------

    content_image: :any:`numpy.array`
        Content image in the Bob format (C x W x H)

    style_images: :any:`list`
        List of numpy.array (Bob format (C x W x H)) that encodes the style

    architecture:
        Pointer to a function with the base architecture

    checkpoint_dir:
        CNN checkpoint path

    scopes:
        Dictionary containing the scope mapping used to restore the checkpoint

    content_end_points:
        List of end points (from the architecture) used to encode the content

    style_end_points:
        List of end points (from the architecture) used to encode the style

    preprocess_fn:
        Pointer to a function that preprocesses the INPUT signal

    un_preprocess_fn:
        Pointer to a function that un-preprocesses the OUTPUT signal

    pure_noise: bool
        If set, returns the raw noisy generated image.
        If not set, the output will be RGB = stylizedYUV.Y, originalYUV.U, originalYUV.V

    iterations: int
        Number of iterations used to generate the image

    learning_rate: float
        Adam learning rate

    content_weight: float
        Weight of the content loss

    style_weight: float
        Weight of the style loss

    denoise_weight: float
        Weight of the denoising loss

    """

    def wise_shape(shape):
        if len(shape) == 2:
            return (1, shape[0], shape[1], 1)
        else:
            return (1, shape[0], shape[1], shape[2])

    def normalize4save(img):
        return (255 * ((img - numpy.min(img)) / (numpy.max(img) - numpy.min(img)))).astype("uint8")

    # Reshaping to NxWxHxC and converting to the tensorflow format
    # content
    original_image = content_image
    content_image = bob.io.image.to_matplotlib(content_image).astype("float32")
    content_image = numpy.reshape(content_image, wise_shape(content_image.shape))

    # and style
    for i in range(len(style_images)):
        image = bob.io.image.to_matplotlib(style_images[i])
        image = numpy.reshape(image, wise_shape(image.shape))
        style_images[i] = image

    # Base content features
    logger.info("Computing content features")
    content_features = compute_features(content_image, architecture, checkpoint_dir,
                                        content_end_points, preprocess_fn)

    # Base style features
    logger.info("Computing style features")
    style_grams = []
    for image in style_images:
        style_features = compute_features(image, architecture, checkpoint_dir,
                                          style_end_points, preprocess_fn)
        style_grams.append(compute_gram(style_features))

    # Organizing the trainer
    logger.info("Training.....")
    with tf.Graph().as_default():
        tf.set_random_seed(0)

        # Random noise
        noise = tf.Variable(tf.random_normal(shape=content_image.shape),
                            trainable=True) * 0.256
        _, end_points = architecture(noise,
                                     mode=tf.estimator.ModeKeys.PREDICT,
                                     trainable_variables=[])

        # Computing the content loss
        content_noises = []
        for c in content_end_points:
            content_noises.append(end_points[c])
        c_loss = content_loss(content_noises, content_features)

        # Computing the style loss
        s_loss = 0
        for grams_per_image in style_grams:
            # Reset per style image, so each loss term pairs the noise grams
            # with the grams of the matching style image
            style_gram_noises = []
            for c in style_end_points:
                layer = end_points[c]
                _, height, width, number = map(lambda i: i.value, layer.get_shape())
                size = height * width * number
                features = tf.reshape(layer, (-1, number))
                style_gram_noises.append(tf.matmul(tf.transpose(features), features) / size)
            s_loss += linear_gram_style_loss(style_gram_noises, grams_per_image)

        # Variation denoise
        d_loss = denoising_loss(noise)

        # Total loss
        total_loss = content_weight * c_loss + style_weight * s_loss + denoise_weight * d_loss

        solver = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)

        tf.contrib.framework.init_from_checkpoint(tf.train.latest_checkpoint(checkpoint_dir)
                                                  if os.path.isdir(checkpoint_dir)
                                                  else checkpoint_dir,
                                                  scopes)
        # Training
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(iterations):
                _, loss = sess.run([solver, total_loss])
                logger.info("Iteration {0}, loss {1}".format(i, loss))

            # Fetching the generated image
            raw_style_image = sess.run(noise)[0, :, :, :]
            # Un-preprocessing the signal
            if un_preprocess_fn is not None:
                raw_style_image = un_preprocess_fn(raw_style_image)

            raw_style_image = bob.io.image.to_bob(raw_style_image)
            normalized_style_image = normalize4save(raw_style_image)

            if pure_noise:
                if normalized_style_image.shape[0] == 1:
                    return normalized_style_image[0, :, :]
                else:
                    return normalized_style_image
            else:
                # Combining the stylized Y channel with the U and V channels
                # of the original content image
                if normalized_style_image.shape[0] == 1:
                    normalized_style_image_yuv = bob.ip.color.rgb_to_yuv(
                        bob.ip.color.gray_to_rgb(normalized_style_image[0, :, :]))
                    # Content image converted to float64 for the YUV conversion
                    scaled_content_image = original_image.astype("float64")
                    content_image_yuv = bob.ip.color.rgb_to_yuv(
                        bob.ip.color.gray_to_rgb(scaled_content_image))
                else:
                    normalized_style_image_yuv = bob.ip.color.rgb_to_yuv(
                        bob.ip.color.gray_to_rgb(bob.ip.color.rgb_to_gray(normalized_style_image)))
                    content_image_yuv = bob.ip.color.rgb_to_yuv(original_image)

                output_image = numpy.zeros(shape=content_image_yuv.shape, dtype="uint8")
                output_image[0, :, :] = normalized_style_image_yuv[0, :, :]
                output_image[1, :, :] = content_image_yuv[1, :, :]
                output_image[2, :, :] = content_image_yuv[2, :, :]

                output_image = bob.ip.color.yuv_to_rgb(output_image)
                return output_image
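
# End-to-end usage sketch, assuming hypothetical names: ``toy_architecture``
# is the placeholder sketched above, and the checkpoint path and scope
# mapping are illustrative. A real call would use a trained network with its
# matching checkpoint and end points.
#
#   content = (numpy.random.rand(3, 64, 64) * 255).astype("uint8")  # Bob CxWxH
#   styles = [(numpy.random.rand(3, 64, 64) * 255).astype("uint8")]
#   output = do_style_transfer(content, styles,
#                              toy_architecture, "/path/to/checkpoint",
#                              scopes={"conv1/": "conv1/"},
#                              content_end_points=["conv1"],
#                              style_end_points=["conv1"],
#                              iterations=100)
#   # ``output`` is an RGB image (Bob format) mixing the stylized Y channel
#   # with the U and V channels of the original content image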