Source code for bob.learn.pytorch.trainers.MCCNNTrainer

#!/usr/bin/env python
# encoding: utf-8

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from bob.learn.pytorch.utils import comp_bce_loss_weights
from torch.utils.tensorboard import SummaryWriter

import bob.core

logger = bob.core.log.setup("bob.learn.pytorch")

import time
import os

import copy


class MCCNNTrainer(object):
    """Class to train the MCCNN

    Attributes
    ----------
    network: :py:class:`torch.nn.Module`
        The network to train
    batch_size: int
        The size of your minibatch
    use_gpu: bool
        If you would like to use the gpu
    verbosity_level: int
        The level of verbosity output to stdout
    """

    def __init__(
        self,
        network,
        batch_size=64,
        use_gpu=False,
        adapted_layers="conv1-block1-group1-ffc",
        adapt_reference_channel=False,
        verbosity_level=2,
        tf_logdir="tf_logs",
        do_crossvalidation=False,
    ):
        """Init function.

        The layers to be adapted in the network are selected and the
        gradients are set to `True` for the layers which need to be adapted.

        Parameters
        ----------
        network: :py:class:`torch.nn.Module`
            The network to train
        batch_size: int
            The size of your minibatch
        use_gpu: bool
            If you would like to use the gpu
        adapted_layers: str
            The blocks in the CNN to adapt; only the ones listed are adapted
            in the training. The layers are separated by '-' in the string,
            for example 'conv1-block1-group1-ffc'. The fully connected layers
            in the output part are always adapted.
        adapt_reference_channel: bool
            If this value is `True` then 'ch_0' (which is the reference
            channel - usually, grayscale image) is also adapted. Otherwise
            the reference channel is not adapted, so that it can be used for
            face recognition as well, default: `False`.
        verbosity_level: int
            The level of verbosity output to stdout
        tf_logdir: str
            Directory for the tensorboard event files.
        do_crossvalidation: bool
            If set to `True`, performs validation in each epoch and stores
            the best model based on validation loss.
        """
        self.network = network
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        self.criterion = nn.BCELoss()
        self.do_crossvalidation = do_crossvalidation

        # with cross-validation each epoch runs a 'train' and a 'val' phase
        self.phases = ["train", "val"] if self.do_crossvalidation else ["train"]

        if self.use_gpu:
            self.network.cuda()

        bob.core.log.set_verbosity_level(logger, verbosity_level)

        self.tf_logger = SummaryWriter(log_dir=tf_logdir)

        # valid layer names are the LightCNN blocks plus the final 'ffc'
        layers_present = self.network.lcnn_layers.copy()
        layers_present.append("ffc")

        # select the layers in the network to adapt
        adapted_layers_list = adapted_layers.split("-")

        assert "ffc" in adapted_layers_list
        # to ensure layer names are valid
        assert set(adapted_layers_list) <= set(layers_present)

        # channel 0 is the reference channel; skip it unless explicitly
        # requested, so it stays usable for face recognition
        start_index = 0 if adapt_reference_channel else 1

        # final fully connected layers are adapted in all cases
        layers_to_adapt = ["linear1fc", "linear2fc"]

        # listing the per-channel copies of the layers which would be adapted
        for i in range(start_index, self.network.num_channels):
            for layer in adapted_layers_list:
                if layer != "ffc":
                    layers_to_adapt.append("layer_dict.ch_{}_".format(i) + layer)

        layers_to_adapt = list(np.unique(layers_to_adapt))

        # freeze everything, then re-enable gradients only for the
        # layers which need to be adapted
        for name, param in self.network.named_parameters():
            param.requires_grad = any(lta in name for lta in layers_to_adapt)

    def load_model(self, model_filename):
        """Loads an existing model

        Parameters
        ----------
        model_filename: str
            The filename of the model to load

        Returns
        -------
        start_epoch: int
            The epoch to start with
        start_iteration: int
            The iteration to start with
        losses: list(float)
            The list of losses from previous training
        """
        cp = torch.load(model_filename)
        self.network.load_state_dict(cp["state_dict"])
        start_epoch = cp["epoch"]
        start_iter = cp["iteration"]
        losses = cp["loss"]
        return start_epoch, start_iter, losses

    def save_model(self, output_dir, epoch=0, iteration=0, losses=None):
        """Save the trained network

        Parameters
        ----------
        output_dir: str
            The directory to write the models to
        epoch: int
            the current epoch
        iteration: int
            the current (last) iteration
        losses: list(float)
            The list of losses since the beginning of training
        """
        saved_filename = "model_{}_{}.pth".format(epoch, iteration)
        saved_path = os.path.join(output_dir, saved_filename)
        logger.info("Saving model to {}".format(saved_path))
        cp = {
            "epoch": epoch,
            "iteration": iteration,
            "loss": losses,
            # state_dict is moved to CPU so the checkpoint loads on any device
            "state_dict": self.network.cpu().state_dict(),
        }
        torch.save(cp, saved_path)
        # move the model back to GPU if needed
        if self.use_gpu:
            self.network.cuda()

    def train(
        self, dataloader, n_epochs=25, learning_rate=1e-4, output_dir="out", model=None
    ):
        """Performs the training.

        Parameters
        ----------
        dataloader: dict of :py:class:`torch.utils.data.DataLoader`
            The dataloaders for your data, keyed by phase ('train'/'val')
        n_epochs: int
            The number of epochs you would like to train for
        learning_rate: float
            The learning rate for Adam optimizer.
        output_dir: str
            The directory where you would like to save models
        model: str
            The path to a pretrained model file to start training from;
            this is the PAD model; not the LightCNN model
        """
        # if model exists, load it
        if model is not None:
            start_epoch, start_iter, losses = self.load_model(model)
            logger.info(
                "Starting training at epoch {}, iteration {} - last loss value is {}".format(
                    start_epoch, start_iter, losses[-1]
                )
            )
        else:
            start_epoch = 0
            start_iter = 0
            losses = []
            logger.info("Starting training from scratch")

        logger.info("Number of channels: {}".format(self.network.num_channels))

        for name, param in self.network.named_parameters():
            if param.requires_grad:
                logger.info("Layer to be adapted from grad check : {}".format(name))

        # setup optimizer: only the parameters left trainable in __init__
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, self.network.parameters()),
            lr=learning_rate,
        )

        self.network.train(True)

        best_model_wts = copy.deepcopy(self.network.state_dict())
        best_loss = float("inf")

        # let's go
        for epoch in range(start_epoch, n_epochs):

            # in the epoch
            train_loss_history = []
            val_loss_history = []

            for phase in self.phases:
                if phase == "train":
                    self.network.train()  # Set model to training mode
                else:
                    self.network.eval()  # Set model to evaluate mode

                for i, data in enumerate(dataloader[phase], 0):
                    # skip batches already seen before a resume point
                    if i < start_iter:
                        continue

                    start = time.time()

                    img, labels = data
                    labels = labels.float().unsqueeze(1)

                    # per-sample weights, should help with data imbalance
                    weights = comp_bce_loss_weights(labels)

                    if self.use_gpu:
                        img = img.cuda()
                        labels = labels.cuda()
                        weights = weights.cuda()

                    self.criterion.weight = weights

                    optimizer.zero_grad()

                    # gradients are only tracked in the training phase
                    with torch.set_grad_enabled(phase == "train"):
                        output = self.network(img)
                        loss = self.criterion(output, labels)

                        if phase == "train":
                            loss.backward()
                            optimizer.step()
                            train_loss_history.append(loss.item())
                        else:
                            val_loss_history.append(loss.item())

                    end = time.time()
                    logger.info(
                        "[{}/{}][{}/{}] => Loss = {} (time spent: {}), Phase {}".format(
                            epoch,
                            n_epochs,
                            i,
                            len(dataloader[phase]),
                            loss.item(),
                            (end - start),
                            phase,
                        )
                    )
                    losses.append(loss.item())

            # BUGFIX: the resume offset only applies to the resumed epoch;
            # without this reset every later epoch would skip its first
            # `start_iter` batches as well
            start_iter = 0

            epoch_train_loss = np.mean(train_loss_history)
            logger.info("Train Loss : {} epoch : {}".format(epoch_train_loss, epoch))

            if self.do_crossvalidation:
                epoch_val_loss = np.mean(val_loss_history)
                logger.info("Val Loss : {} epoch : {}".format(epoch_val_loss, epoch))

                if epoch_val_loss < best_loss:
                    logger.debug(
                        "New val loss : {} is better than old: {}, copying over the new weights".format(
                            epoch_val_loss, best_loss
                        )
                    )
                    best_loss = epoch_val_loss
                    best_model_wts = copy.deepcopy(self.network.state_dict())

            ######################################## <Logging> ###################################
            if self.do_crossvalidation:
                info = {"train_loss": epoch_train_loss, "val_loss": epoch_val_loss}
            else:
                info = {"train_loss": epoch_train_loss}

            # scalar logs
            for tag, value in info.items():
                self.tf_logger.add_scalar(
                    tag=tag, scalar_value=value, global_step=epoch + 1
                )

            # Log values and gradients of the parameters (histogram summary)
            for tag, value in self.network.named_parameters():
                tag = tag.replace(".", "/")
                try:
                    self.tf_logger.add_histogram(
                        tag=tag, values=value.data.cpu().numpy(), global_step=epoch + 1
                    )
                    # frozen parameters never receive a gradient; skip
                    # them instead of relying on an exception
                    if value.grad is not None:
                        self.tf_logger.add_histogram(
                            tag=tag + "/grad",
                            values=value.grad.data.cpu().numpy(),
                            global_step=epoch + 1,
                        )
                except Exception:
                    # histogram logging is best effort; never abort training
                    pass
            ######################################## </Logging> ###################################

            # do stuff - like saving models
            logger.info("EPOCH {} DONE".format(epoch + 1))
            self.save_model(output_dir, epoch=(epoch + 1), iteration=0, losses=losses)

        # BUGFIX: only restore the cross-validated best weights when they
        # were actually tracked; without cross-validation `best_model_wts`
        # still holds the *initial* weights and restoring them would
        # silently discard all training
        if self.do_crossvalidation:
            ## load the best weights
            self.network.load_state_dict(best_model_wts)

        # best epoch is named as 100
        self.save_model(output_dir, epoch=100, iteration=0, losses=losses)