#!/usr/bin/env python
# encoding: utf-8
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from bob.learn.pytorch.utils import comp_bce_loss_weights
from torch.utils.tensorboard import SummaryWriter
import bob.core
logger = bob.core.log.setup("bob.learn.pytorch")
import time
import os
import copy
class MCCNNTrainer(object):
    """Class to train the MCCNN.

    Only a selected subset of the network parameters is adapted; every other
    parameter is frozen (``requires_grad = False``) at construction time.

    Attributes
    ----------
    network: :py:class:`torch.nn.Module`
        The network to train
    batch_size: int
        The size of your minibatch (stored only; the visible code of this
        class does not read it, batching is done by the dataloaders)
    use_gpu: bool
        If you would like to use the gpu
    criterion: :py:class:`torch.nn.BCELoss`
        The loss; its ``weight`` is overwritten for every batch in `train`
    do_crossvalidation: bool
        Whether a validation phase is run in each epoch
    phases: list(str)
        ``["train", "val"]`` when cross-validating, ``["train"]`` otherwise
    tf_logger: :py:class:`torch.utils.tensorboard.SummaryWriter`
        Writer used for the tensorboard summaries

    """

    def __init__(
        self,
        network,
        batch_size=64,
        use_gpu=False,
        adapted_layers="conv1-block1-group1-ffc",
        adapt_reference_channel=False,
        verbosity_level=2,
        tf_logdir="tf_logs",
        do_crossvalidation=False,
    ):
        """Init function. The layers to be adapted in the network are selected
        and the gradients are set to `True` for the layers which need to be
        adapted (and to `False` for all the others).

        Parameters
        ----------
        network: :py:class:`torch.nn.Module`
            The network to train. It must expose ``lcnn_layers`` (list of
            valid layer names) and ``num_channels``.
        batch_size: int
            The size of your minibatch
        use_gpu: bool
            If you would like to use the gpu
        adapted_layers: str
            The blocks in the CNN to adapt; only the ones listed are adapted
            in the training. The layers are separated by '-' in the string,
            for example 'conv1-block1-group1-ffc'. The fully connected layers
            in the output part are always adapted.
        adapt_reference_channel: bool
            If this value is `True` then 'ch_0' (which is the reference
            channel - usually, grayscale image) is also adapted. Otherwise the
            reference channel is not adapted, so that it can be used for face
            recognition as well, default: `False`.
        verbosity_level: int
            The level of verbosity output to stdout
        tf_logdir: str
            Directory handed to the tensorboard ``SummaryWriter``
        do_crossvalidation: bool
            If set to `True`, performs validation in each epoch and stores the
            best model based on validation loss.

        Raises
        ------
        AssertionError
            If 'ffc' is missing from ``adapted_layers`` or if it names a layer
            the network does not have.

        """
        self.network = network
        self.batch_size = batch_size
        self.use_gpu = use_gpu
        self.criterion = nn.BCELoss()
        self.do_crossvalidation = do_crossvalidation
        self.phases = ["train", "val"] if do_crossvalidation else ["train"]

        if self.use_gpu:
            self.network.cuda()

        bob.core.log.set_verbosity_level(logger, verbosity_level)
        self.tf_logger = SummaryWriter(log_dir=tf_logdir)

        layers_present = list(self.network.lcnn_layers) + ["ffc"]

        # select the layers in the network to adapt
        adapted_layers_list = adapted_layers.split("-")
        # asserts are kept (rather than raising another exception type) so
        # that existing callers observe the same AssertionError as before
        assert (
            "ffc" in adapted_layers_list
        ), "'ffc' must be part of adapted_layers, got '{}'".format(adapted_layers)
        assert set(adapted_layers_list) <= set(
            layers_present
        ), "unknown layer(s) in adapted_layers '{}', valid names are {}".format(
            adapted_layers, layers_present
        )

        # channel 0 is the reference channel, skipped unless asked otherwise
        first_channel = 0 if adapt_reference_channel else 1

        # final fully connected layers are adapted in all cases
        layers_to_adapt = {"linear1fc", "linear2fc"}
        for channel in range(first_channel, self.network.num_channels):
            for layer in adapted_layers_list:
                if layer != "ffc":
                    layers_to_adapt.add("layer_dict.ch_{}_".format(channel) + layer)
        layers_to_adapt = sorted(layers_to_adapt)

        # a parameter is trainable when its name contains one of the selected
        # layer names, everything else is frozen
        for name, param in self.network.named_parameters():
            param.requires_grad = any(lta in name for lta in layers_to_adapt)

    def load_model(self, model_filename):
        """Loads an existing model

        Parameters
        ----------
        model_filename: str
            The filename of the model to load

        Returns
        -------
        start_epoch: int
            The epoch to start with
        start_iteration: int
            The iteration to start with
        losses: list(float)
            The list of losses from previous training (an empty list when the
            checkpoint was saved without losses)

        """
        # NOTE: torch.load unpickles the file, only load checkpoints you trust.
        # The tensors are mapped to the CPU; load_state_dict then copies them
        # into the parameters wherever the network currently lives.
        cp = torch.load(model_filename, map_location="cpu")
        self.network.load_state_dict(cp["state_dict"])

        losses = cp["loss"]
        if losses is None:
            # save_model() defaults to losses=None, callers expect a list
            losses = []
        return cp["epoch"], cp["iteration"], losses

    def save_model(self, output_dir, epoch=0, iteration=0, losses=None):
        """Save the trained network

        The checkpoint is written to ``model_<epoch>_<iteration>.pth`` inside
        ``output_dir``, which is created when it does not exist yet.

        Parameters
        ----------
        output_dir: str
            The directory to write the models to
        epoch: int
            the current epoch
        iteration: int
            the current (last) iteration
        losses: list(float)
            The list of losses since the beginning of training

        """
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        saved_filename = "model_{}_{}.pth".format(epoch, iteration)
        saved_path = os.path.join(output_dir, saved_filename)
        logger.info("Saving model to {}".format(saved_path))

        try:
            # the state dict is saved from the CPU copy of the network
            cp = {
                "epoch": epoch,
                "iteration": iteration,
                "loss": losses,
                "state_dict": self.network.cpu().state_dict(),
            }
            torch.save(cp, saved_path)
        finally:
            # move the model back to GPU if needed, even if saving failed
            if self.use_gpu:
                self.network.cuda()

    def train(
        self, dataloader, n_epochs=25, learning_rate=1e-4, output_dir="out", model=None
    ):
        """Performs the training.

        A checkpoint is saved after every epoch. At the end the weights of the
        epoch with the lowest validation loss are restored (when
        cross-validation is enabled and at least one epoch improved the
        validation loss), otherwise the weights of the last epoch are kept.
        That final network is saved with the epoch number 100.

        Parameters
        ----------
        dataloader: dict
            Maps the phase name to its
            :py:class:`torch.utils.data.DataLoader`: a 'train' entry is always
            needed, a 'val' entry as well when cross-validation is enabled.
            Each batch is an ``(images, labels)`` pair.
        n_epochs: int
            The number of epochs you would like to train for
        learning_rate: float
            The learning rate for Adam optimizer.
        output_dir: str
            The directory where you would like to save models
        model: str
            The path to a pretrained model file to start training from; this
            is the PAD model; not the LightCNN model

        """
        # if model exists, load it
        if model is not None:
            start_epoch, start_iter, losses = self.load_model(model)
            last_loss = losses[-1] if losses else None
            logger.info(
                "Starting training at epoch {}, iteration {} - last loss value is {}".format(
                    start_epoch, start_iter, last_loss
                )
            )
        else:
            start_epoch = 0
            start_iter = 0
            losses = []
            logger.info("Starting training from scratch")

        logger.info("Number of channels: {}".format(self.network.num_channels))
        for name, param in self.network.named_parameters():
            if param.requires_grad:
                logger.info("Layer to be adapted from grad check : {}".format(name))

        # setup optimizer, only on the parameters selected for adaptation
        trainable_params = [p for p in self.network.parameters() if p.requires_grad]
        optimizer = optim.Adam(trainable_params, lr=learning_rate)

        self.network.train(True)

        # stays None until a validation epoch improves on best_loss, so that a
        # run without cross-validation never falls back to untrained weights
        best_model_wts = None
        best_loss = float("inf")

        # let's go
        for epoch in range(start_epoch, n_epochs):
            train_loss_history = []
            val_loss_history = []

            for phase in self.phases:
                is_training = phase == "train"
                # training mode for 'train', evaluation mode otherwise
                self.network.train(is_training)

                # iterations are skipped only in the training phase of the
                # epoch that is being resumed
                if is_training and epoch == start_epoch:
                    first_iter = start_iter
                else:
                    first_iter = 0

                for i, data in enumerate(dataloader[phase], 0):
                    if i < first_iter:
                        continue

                    start = time.time()

                    img, labels = data
                    labels = labels.float().unsqueeze(1)
                    # weights for samples, should help with data imbalance
                    weights = comp_bce_loss_weights(labels)

                    if self.use_gpu:
                        img = img.cuda()
                        labels = labels.cuda()
                        weights = weights.cuda()

                    self.criterion.weight = weights

                    if is_training:
                        # gradients are cleared only while training, so the
                        # histograms logged at the end of the epoch still see
                        # the gradients of the last training step
                        optimizer.zero_grad()

                    with torch.set_grad_enabled(is_training):
                        output = self.network(img)
                        loss = self.criterion(output, labels)
                        if is_training:
                            loss.backward()
                            optimizer.step()

                    loss_value = loss.item()
                    if is_training:
                        train_loss_history.append(loss_value)
                    else:
                        val_loss_history.append(loss_value)

                    end = time.time()
                    logger.info(
                        "[{}/{}][{}/{}] => Loss = {} (time spent: {}), Phase {}".format(
                            epoch,
                            n_epochs,
                            i,
                            len(dataloader[phase]),
                            loss_value,
                            (end - start),
                            phase,
                        )
                    )
                    losses.append(loss_value)

            # an empty history (no batch processed) is reported as NaN
            if train_loss_history:
                epoch_train_loss = np.mean(train_loss_history)
            else:
                epoch_train_loss = float("nan")
            logger.info("Train Loss : {} epoch : {}".format(epoch_train_loss, epoch))
            info = {"train_loss": epoch_train_loss}

            if self.do_crossvalidation:
                if val_loss_history:
                    epoch_val_loss = np.mean(val_loss_history)
                else:
                    epoch_val_loss = float("nan")
                logger.info("Val Loss : {} epoch : {}".format(epoch_val_loss, epoch))
                info["val_loss"] = epoch_val_loss

                if epoch_val_loss < best_loss:
                    logger.debug(
                        "New val loss : {} is better than old: {}, copying over the new weights".format(
                            epoch_val_loss, best_loss
                        )
                    )
                    best_loss = epoch_val_loss
                    best_model_wts = copy.deepcopy(self.network.state_dict())

            ######################## <Logging> ########################
            # scalar logs
            for tag, value in info.items():
                self.tf_logger.add_scalar(
                    tag=tag, scalar_value=value, global_step=epoch + 1
                )

            # Log values and gradients of the parameters (histogram summary).
            # This is best effort: a failure is reported but never stops the
            # training.
            for tag, value in self.network.named_parameters():
                tag = tag.replace(".", "/")
                try:
                    self.tf_logger.add_histogram(
                        tag=tag,
                        values=value.detach().cpu().numpy(),
                        global_step=epoch + 1,
                    )
                    # frozen parameters have no gradient
                    if value.grad is not None:
                        self.tf_logger.add_histogram(
                            tag=tag + "/grad",
                            values=value.grad.detach().cpu().numpy(),
                            global_step=epoch + 1,
                        )
                except Exception as exc:
                    logger.warning(
                        "Could not log histogram for {}: {}".format(tag, exc)
                    )
            ######################## </Logging> #######################

            # do stuff - like saving models
            logger.info("EPOCH {} DONE".format(epoch + 1))
            self.save_model(output_dir, epoch=(epoch + 1), iteration=0, losses=losses)

        # load the best weights, if a validation epoch selected any
        if best_model_wts is not None:
            self.network.load_state_dict(best_model_wts)

        # best epoch is named as 100 (this overwrites the regular checkpoint
        # of epoch 100 when n_epochs >= 100)
        self.save_model(output_dir, epoch=100, iteration=0, losses=losses)