Source code for bob.bio.face.annotator.faceX_106landmarks

import logging
import os
import sys

from itertools import product as product
from math import ceil

import numpy as np
import torch

from torchvision import transforms

from bob.extension.download import get_file
from bob.io.image import bob_to_opencvbgr

from . import Base
from .mtcnn import MTCNN

logger = logging.getLogger(__name__)


# Adapted from https://github.com/biubug6/Pytorch_Retinaface
class PriorBox(object):
    """Compute the suitable parameters of anchors for later decode operation

    Attributes:
        cfg(dict): testing config.
        image_size(tuple): the input image size.
    """

    def __init__(self, cfg, image_size=None):
        """
        Initialize the PriorBox settings related to the generation of anchors.
        """
        super(PriorBox, self).__init__()
        self.min_sizes = cfg["min_sizes"]
        self.steps = cfg["steps"]
        self.image_size = image_size
        self.feature_maps = [
            [ceil(self.image_size[0] / step), ceil(self.image_size[1] / step)]
            for step in self.steps
        ]
        self.name = "s"

    def forward(self):
        anchors = []
        for k, f in enumerate(self.feature_maps):
            min_sizes = self.min_sizes[k]
            for i, j in product(range(f[0]), range(f[1])):
                for min_size in min_sizes:
                    s_kx = min_size / self.image_size[1]
                    s_ky = min_size / self.image_size[0]
                    dense_cx = [
                        x * self.steps[k] / self.image_size[1]
                        for x in [j + 0.5]
                    ]
                    dense_cy = [
                        y * self.steps[k] / self.image_size[0]
                        for y in [i + 0.5]
                    ]
                    for cy, cx in product(dense_cy, dense_cx):
                        anchors += [cx, cy, s_kx, s_ky]
        # back to torch land
        output = torch.Tensor(anchors).view(-1, 4)
        return output
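

# Hedged usage sketch (not part of the original module): generating the anchor
# grid with PriorBox using the 120x120 RetinaFace configuration defined in
# FaceXDetector below. The function name is illustrative only.
def _example_priorbox_anchors():
    cfg = {
        "min_sizes": [[16, 32], [64, 128], [256, 512]],
        "steps": [8, 16, 32],
    }
    # Feature maps of 15x15, 8x8 and 4x4 cells with 2 anchor sizes each
    # give 450 + 128 + 32 = 610 prior boxes in (cx, cy, w, h) form.
    priors = PriorBox(cfg, image_size=(120, 120)).forward()
    print(priors.shape)  # torch.Size([610, 4])
    return priors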


def download_faceX_model():
    urls = [
        "https://www.idiap.ch/software/bob/data/bob/bob.bio.face/master/pytorch/faceX_models.tar.gz",
        "http://www.idiap.ch/software/bob/data/bob/bob.bio.face/master/pytorch/faceX_models.tar.gz",
    ]

    filename = get_file(
        "faceX_models.tar.gz",
        urls,
        cache_subdir="data/pytorch/",
        file_hash="eb7ec871f434d2f44e5408627d656297",
        extract=True,
    )

    return filename


def add_faceX_path(filename):

    path = os.path.join(os.path.dirname(filename), "faceX_models")

    logger.warning(f"Adding the following path to PYTHON_PATH: {path}")
    sys.path.insert(0, path)
    return path
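

# Hedged sketch (not in the original file): the two helpers above are typically
# chained to fetch the FaceX-Zoo models and make them importable. The function
# name below is illustrative only.
def _example_fetch_faceX_models():
    archive = download_faceX_model()  # cached faceX_models.tar.gz path
    models_root = add_faceX_path(archive)  # <cache>/faceX_models, now on sys.path
    return models_root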


class FaceXDetector(Base):
    """
    Face detector taken from https://github.com/JDAI-CV/FaceX-Zoo

    This one we are using the 106 larnmark detector that was taken from
    https://github.com/Hsintao/pfld_106_face_landmarks/blob/master/models/mobilev3_pfld.py

    .. warning:
      Here we are assuming that the faces is already detected and cropped

    """

    def __init__(self, device=None, one_face_only=True, **kwargs):
        self.device = torch.device("cpu") if device is None else device

        filename = download_faceX_model()
        faceX_path = add_faceX_path(filename)

        model_filename = os.path.join(
            faceX_path,
            "models",
            "face_detection",
            "face_detection_1.0",
            "face_detection_retina.pkl",
        )

        # Loading face detector
        self.model = torch.load(model_filename, map_location=self.device)
        self.one_face_only = one_face_only

        self.transforms = transforms.Compose([transforms.ToTensor()])

        # Face detection threshold
        # from: https://github.com/JDAI-CV/FaceX-Zoo/blob/db0b087e4f4d28152e172d6c8d3767a8870733b4/face_sdk/models/face_detection/face_detection_1.0/model_meta.json
        self.cfg = {
            "model_type": "retina face detect nets",
            "model_info": "some model info",
            "model_file": "face_detection_retina.pkl",
            "release_date": "20201019",
            "input_height": 120,
            "input_width": 120,
            "min_sizes": [[16, 32], [64, 128], [256, 512]],
            "steps": [8, 16, 32],
            "variance": [0.1, 0.2],
            "in_channel": 256,
            "out_channel": 256,
            "confidence_threshold": 0.7,
        }

        super(FaceXDetector, self).__init__(**kwargs)

    # Adapted from https://github.com/chainer/chainercv
    def decode(self, loc, priors, variances):
        """Decode locations from predictions using priors to undo the
        encoding we did for offset regression at train time.

        Parameters
        ----------
        loc (tensor): location predictions for loc layers, Shape: [num_priors, 4]
        priors (tensor): Prior boxes in center-offset form. Shape: [num_priors, 4].
        variances: (list[float]) Variances of prior boxes

        Returns
        -------
        decoded bounding box predictions
        """
        boxes = torch.cat((priors[:, :2], priors[:, 2:]), 1)
        boxes[:, :2] = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
        boxes[:, 2:] = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
        boxes[:, :2] -= boxes[:, 2:] / 2
        boxes[:, 2:] += boxes[:, :2]
        return boxes

    def _preprocess(self, image):
        """Preprocess the image, such as standardization and other operations.

        Returns:
            A numpy array, the shape is channel * h * w.
            A tensor, the shape is 4.
        """
        if not isinstance(image, np.ndarray):
            logger.error("The input should be the ndarray read by cv2!")
        img = np.float32(image)
        scale = torch.Tensor(
            [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]
        )
        img -= (104, 117, 123)
        img = img.transpose(2, 0, 1)
        return img, scale

    def _postprocess(self, loc, conf, scale, input_height, input_width):
        """Postprocess the prediction result.

        Decode the detection result, apply the confidence threshold and run
        NMS to keep the appropriate detection boxes.

        Returns:
            A numpy array, the shape is N * (x, y, w, h, confidence),
            N is the number of detection boxes.
        """
        priorbox = PriorBox(self.cfg, image_size=(input_height, input_width))
        priors = priorbox.forward()
        priors = priors.to(self.device)
        prior_data = priors.data
        boxes = self.decode(
            loc.data.squeeze(0), prior_data, self.cfg["variance"]
        )
        boxes = boxes * scale
        boxes = boxes.cpu().numpy()
        scores = conf.squeeze(0).data.cpu().numpy()[:, 1]

        # ignore low scores
        inds = np.where(scores > self.cfg["confidence_threshold"])[0]
        boxes = boxes[inds]
        scores = scores[inds]

        # keep top-K before NMS
        order = scores.argsort()[::-1]
        boxes = boxes[order]
        scores = scores[order]

        # do NMS
        nms_threshold = 0.2
        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
            np.float32, copy=False
        )
        keep = self.py_cpu_nms(dets, nms_threshold)
        dets = dets[keep, :]
        return dets

    # Adapted from https://github.com/biubug6/Pytorch_Retinaface
    def py_cpu_nms(self, dets, thresh):
        """Pure-Python NMS (non-maximum suppression).

        Returns:
            The kept indices after NMS.
        """
        x1 = dets[:, 0]
        y1 = dets[:, 1]
        x2 = dets[:, 2]
        y2 = dets[:, 3]
        scores = dets[:, 4]

        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            xx1 = np.maximum(x1[i], x1[order[1:]])
            yy1 = np.maximum(y1[i], y1[order[1:]])
            xx2 = np.minimum(x2[i], x2[order[1:]])
            yy2 = np.minimum(y2[i], y2[order[1:]])

            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (areas[i] + areas[order[1:]] - inter)

            inds = np.where(ovr <= thresh)[0]
            order = order[inds + 1]

        return keep

    def annotate(self, image, **kwargs):
        """Get the inference of the image and process the inference result.

        Returns:
            A numpy array, the shape is N * (x, y, w, h, confidence),
            N is the number of detection boxes.
        """
        # First thing, we need to convert the Bob CxHxW image
        # to the OpenCV HxWxC, BGR format
        image = bob_to_opencvbgr(image)

        input_height, input_width, _ = image.shape

        try:
            image, scale = self._preprocess(image)
        except Exception as e:
            raise e

        self.model = self.model.to(self.device)

        image = torch.from_numpy(image).unsqueeze(0)
        with torch.no_grad():
            image = image.to(self.device)
            scale = scale.to(self.device)
            loc, conf, landms = self.model(image)

        dets = self._postprocess(loc, conf, scale, input_height, input_width)

        if len(dets) == 0:
            logger.error("Face not detected. Returning None")
            return None

        dets = dets[0] if self.one_face_only else dets

        return dets
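

# Hedged usage sketch for FaceXDetector (not part of the original module):
# detecting the best-scoring face in a Bob-format (CxHxW, RGB) image. The
# bob.io.base.load call and the file path are illustrative assumptions.
def _example_facex_detector(path="face.png"):
    from bob.io.base import load

    image = load(path)  # Bob CxHxW RGB image
    detector = FaceXDetector(one_face_only=True)
    detections = detector.annotate(image)  # box coordinates + confidence, or None
    return detections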


class FaceX106Landmarks(Base):
    """
    Landmark detector taken from https://github.com/JDAI-CV/FaceX-Zoo

    Here we are using the 106-landmark detector that was taken from
    https://github.com/Hsintao/pfld_106_face_landmarks/blob/master/models/mobilev3_pfld.py

    .. warning::
      Here we are assuming that the face is already detected and cropped

    Parameters
    ----------

    use_mtcnn_detector: bool
        If set, uses the MTCNN face detector as a base for the landmark extractor.
        If not, it uses the standard face detector of FaceX-Zoo.

    """

    def __init__(self, device=None, use_mtcnn_detector=True, **kwargs):
        self.device = torch.device("cpu") if device is None else device

        filename = download_faceX_model()
        faceX_path = add_faceX_path(filename)
        self.use_mtcnn_detector = use_mtcnn_detector

        model_filename = os.path.join(
            faceX_path,
            "models",
            "face_alignment",
            "face_alignment_1.0",
            "face_landmark_pfld.pkl",
        )

        self.model = torch.load(model_filename, map_location=self.device)

        # Loading the face detector
        self.face_detector = MTCNN() if use_mtcnn_detector else FaceXDetector()

        self.transforms = transforms.Compose([transforms.ToTensor()])

        # Face alignment configuration
        # from: https://github.com/JDAI-CV/FaceX-Zoo/blob/db0b087e4f4d28152e172d6c8d3767a8870733b4/face_sdk/models/face_alignment/face_alignment_1.0/model_meta.json
        self.cfg = {
            "model_path": "models",
            "model_category": "face_alignment",
            "model_name": "face_alignment_1.0",
            "model_type": "pfld face landmark nets",
            "model_info": "some model info",
            "model_file_path": "models/face_alignment/face_alignment_1.0/face_landmark_pfld.pkl",
            "release_date": "20201023",
            "input_height": 112,
            "input_width": 112,
            "img_size": 112,
        }

        self.img_size = self.cfg["img_size"]

        super(FaceX106Landmarks, self).__init__(**kwargs)

        # self.detector = MTCNN(min_size=min_size, factor=factor, thresholds=thresholds)

    def annotate(self, image, **kwargs):
        """Annotates an image, returning the 106 facial landmarks.

        Parameters
        ----------

        image : numpy.array
            An RGB image in Bob format.

        **kwargs
            Ignored.

        Returns
        -------
        numpy.ndarray
            The 106 facial landmarks with shape (106, 2), or ``None`` if no
            face is detected.
        """

        # Detect the face
        if self.use_mtcnn_detector:
            annotations = self.face_detector.annotate(image)
            if annotations is None:
                return None

            dets = [
                annotations["topleft"][1],
                annotations["topleft"][0],
                annotations["bottomright"][1],
                annotations["bottomright"][0],
            ]
        else:
            dets = self.face_detector.annotate(image.copy())
            if dets is None:
                return None

        # First thing, we need to convert the Bob CxHxW image
        # to the OpenCV HxWxC, BGR format
        image = bob_to_opencvbgr(image)

        try:
            image_pre = self._preprocess(image, dets)
        except Exception as e:
            raise e

        self.model = self.model.to(self.device)

        image_pre = image_pre.unsqueeze(0)
        with torch.no_grad():
            image_pre = image_pre.to(self.device)
            _, landmarks_normal = self.model(image_pre)

        landmarks = self._postprocess(landmarks_normal)

        return np.array(landmarks)

    # Adapted from https://github.com/Hsintao/pfld_106_face_landmarks/blob/master/data/prepare.py
    def _preprocess(self, image, det):
        """Preprocess the input image, cropping it using the face detection
        information.

        Use the face detection result (det) to get the face position in the
        input image. After determining the center of the face position and the
        box size of the face, crop the image and resize it to the preset size.

        Returns:
            A torch tensor, the image after preprocessing, shape: (3, 112, 112).
        """
        import cv2

        if not isinstance(image, np.ndarray):
            logger.error("The input should be the ndarray read by cv2!")

        img = image.copy()
        self.image_org = image.copy()
        img = np.float32(img)

        xy = np.array([det[0], det[1]])
        zz = np.array([det[2], det[3]])
        wh = zz - xy + 1
        center = (xy + wh / 2).astype(np.int32)
        boxsize = int(np.max(wh) * 1.2)
        xy = center - boxsize // 2
        self.xy = xy
        self.boxsize = boxsize
        x1, y1 = xy
        x2, y2 = xy + boxsize
        height, width, _ = img.shape
        dx = max(0, -x1)
        dy = max(0, -y1)
        x1 = max(0, x1)
        y1 = max(0, y1)
        edx = max(0, x2 - width)
        edy = max(0, y2 - height)
        x2 = min(width, x2)
        y2 = min(height, y2)
        imageT = image[y1:y2, x1:x2]
        if dx > 0 or dy > 0 or edx > 0 or edy > 0:
            imageT = cv2.copyMakeBorder(
                imageT, dy, edy, dx, edx, cv2.BORDER_CONSTANT, 0
            )

        imageT = cv2.resize(imageT, (self.img_size, self.img_size))
        t = transforms.Compose([transforms.ToTensor()])
        img_after = t(imageT)
        return img_after

    def _postprocess(self, landmarks_normal):
        """Map the predicted landmarks back to the coordinates of the original
        image.

        Returns:
            A numpy array, the landmarks based on the shape of the original
            image, shape: (106, 2).
        """
        landmarks_normal = landmarks_normal.cpu().numpy()
        landmarks_normal = landmarks_normal.reshape(
            landmarks_normal.shape[0], -1, 2
        )
        landmarks = landmarks_normal[0] * [self.boxsize, self.boxsize] + self.xy
        return landmarks
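

# Hedged usage sketch for FaceX106Landmarks (not part of the original module):
# extracting the 106 facial landmarks from a Bob-format RGB image. The
# bob.io.base.load call and the file path are illustrative assumptions.
def _example_facex_landmarks(path="face.png"):
    from bob.io.base import load

    image = load(path)  # Bob CxHxW RGB image
    annotator = FaceX106Landmarks(use_mtcnn_detector=True)
    landmarks = annotator.annotate(image)  # numpy array of shape (106, 2) or None
    return landmarks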