Source code for bob.bio.face.annotator.faceX_106landmarks

import logging
import os
import sys

from itertools import product as product
from math import ceil

import numpy as np
import torch

from torchvision import transforms

from bob.extension.download import get_file
from bob.io.image import bob_to_opencvbgr

from . import Base
from .mtcnn import MTCNN

logger = logging.getLogger(__name__)


# Adapted from https://github.com/biubug6/Pytorch_Retinaface
class PriorBox(object):
    """Compute the suitable parameters of anchors for later decode operation

    Attributes:
        cfg(dict): testing config.
        image_size(tuple): the input image size.
    """

    def __init__(self, cfg, image_size=None):
        """
        Initialize the PriorBox settings related to the generation of anchors.
        """
        super(PriorBox, self).__init__()
        self.min_sizes = cfg["min_sizes"]
        self.steps = cfg["steps"]
        self.image_size = image_size
        self.feature_maps = [
            [ceil(self.image_size[0] / step), ceil(self.image_size[1] / step)]
            for step in self.steps
        ]
        self.name = "s"

    def forward(self):
        anchors = []
        for k, f in enumerate(self.feature_maps):
            min_sizes = self.min_sizes[k]
            for i, j in product(range(f[0]), range(f[1])):
                for min_size in min_sizes:
                    s_kx = min_size / self.image_size[1]
                    s_ky = min_size / self.image_size[0]
                    dense_cx = [
                        x * self.steps[k] / self.image_size[1]
                        for x in [j + 0.5]
                    ]
                    dense_cy = [
                        y * self.steps[k] / self.image_size[0]
                        for y in [i + 0.5]
                    ]
                    for cy, cx in product(dense_cy, dense_cx):
                        anchors += [cx, cy, s_kx, s_ky]
        # back to torch land
        output = torch.Tensor(anchors).view(-1, 4)
        return output
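

# Hedged usage sketch (not part of the original module): generating the anchor
# grid with PriorBox using the 120x120 RetinaFace configuration defined in
# FaceXDetector below. The function name is illustrative only.
def _example_priorbox_anchors():
    cfg = {
        "min_sizes": [[16, 32], [64, 128], [256, 512]],
        "steps": [8, 16, 32],
    }
    # Feature maps of 15x15, 8x8 and 4x4 cells with 2 anchor sizes each
    # give 450 + 128 + 32 = 610 prior boxes in (cx, cy, w, h) form.
    priors = PriorBox(cfg, image_size=(120, 120)).forward()
    print(priors.shape)  # torch.Size([610, 4])
    return priors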


def download_faceX_model():
    urls = [
        "https://www.idiap.ch/software/bob/data/bob/bob.bio.face/master/pytorch/faceX_models.tar.gz",
        "http://www.idiap.ch/software/bob/data/bob/bob.bio.face/master/pytorch/faceX_models.tar.gz",
    ]

    filename = get_file(
        "faceX_models.tar.gz",
        urls,
        cache_subdir="data/pytorch/",
        file_hash="eb7ec871f434d2f44e5408627d656297",
        extract=True,
    )

    return filename


def add_faceX_path(filename):

    path = os.path.join(os.path.dirname(filename), "faceX_models")

    logger.warning(f"Adding the following path to PYTHON_PATH: {path}")
    sys.path.insert(0, path)
    return path
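

# Hedged sketch (not in the original file): the two helpers above are typically
# chained to fetch the FaceX-Zoo models and make them importable. The function
# name below is illustrative only.
def _example_fetch_faceX_models():
    archive = download_faceX_model()  # cached faceX_models.tar.gz path
    models_root = add_faceX_path(archive)  # <cache>/faceX_models, now on sys.path
    return models_root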


class FaceXDetector(Base):
    """
    Face detector taken from https://github.com/JDAI-CV/FaceX-Zoo

    This one we are using the 106 larnmark detector that was taken from
    https://github.com/Hsintao/pfld_106_face_landmarks/blob/master/models/mobilev3_pfld.py

    .. warning:
      Here we are assuming that the faces is already detected and cropped

    """

    def __init__(self, device=None, one_face_only=True, **kwargs):
        self.device = torch.device("cpu") if device is None else device

        filename = download_faceX_model()
        faceX_path = add_faceX_path(filename)

        model_filename = os.path.join(
            faceX_path,
            "models",
            "face_detection",
            "face_detection_1.0",
            "face_detection_retina.pkl",
        )

        # Loading face detector
        self.model = torch.load(model_filename, map_location=self.device)
        self.one_face_only = one_face_only

        self.transforms = transforms.Compose([transforms.ToTensor()])

        # Face detection threshold
        # from: https://github.com/JDAI-CV/FaceX-Zoo/blob/db0b087e4f4d28152e172d6c8d3767a8870733b4/face_sdk/models/face_detection/face_detection_1.0/model_meta.json
        self.cfg = {
            "model_type": "retina face detect nets",
            "model_info": "some model info",
            "model_file": "face_detection_retina.pkl",
            "release_date": "20201019",
            "input_height": 120,
            "input_width": 120,
            "min_sizes": [[16, 32], [64, 128], [256, 512]],
            "steps": [8, 16, 32],
            "variance": [0.1, 0.2],
            "in_channel": 256,
            "out_channel": 256,
            "confidence_threshold": 0.7,
        }

        super(FaceXDetector, self).__init__(**kwargs)

    # Adapted from https://github.com/chainer/chainercv
    def decode(self, loc, priors, variances):
        """Decode locations from predictions using priors to undo the
        encoding we did for offset regression at train time.

        Parameters
        ----------
        loc (tensor): location predictions for loc layers, Shape: [num_priors, 4]
        priors (tensor): Prior boxes in center-offset form. Shape: [num_priors, 4].
        variances: (list[float]) Variances of prior boxes

        Returns
        -------
        decoded bounding box predictions
        """
        boxes = torch.cat((priors[:, :2], priors[:, 2:]), 1)
        boxes[:, :2] = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
        boxes[:, 2:] = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
        boxes[:, :2] -= boxes[:, 2:] / 2
        boxes[:, 2:] += boxes[:, :2]
        return boxes

    def _preprocess(self, image):
        """Preprocess the image, such as standardization and other operations.

        Returns:
            A numpy array, the shape is channel * h * w.
            A tensor, the shape is 4.
        """
        if not isinstance(image, np.ndarray):
            logger.error("The input should be the ndarray read by cv2!")
        img = np.float32(image)
        scale = torch.Tensor(
            [img.shape[1], img.shape[0], img.shape[1], img.shape[0]]
        )
        img -= (104, 117, 123)
        img = img.transpose(2, 0, 1)
        return img, scale

    def _postprocess(self, loc, conf, scale, input_height, input_width):
        """Postprocess the prediction result.

        Decode the detection result, apply the confidence threshold and run
        NMS to keep the appropriate detection boxes.

        Returns:
            A numpy array, the shape is N * (x, y, w, h, confidence),
            N is the number of detection boxes.
        """
        priorbox = PriorBox(self.cfg, image_size=(input_height, input_width))
        priors = priorbox.forward()
        priors = priors.to(self.device)
        prior_data = priors.data
        boxes = self.decode(
            loc.data.squeeze(0), prior_data, self.cfg["variance"]
        )
        boxes = boxes * scale
        boxes = boxes.cpu().numpy()
        scores = conf.squeeze(0).data.cpu().numpy()[:, 1]

        # ignore low scores
        inds = np.where(scores > self.cfg["confidence_threshold"])[0]
        boxes = boxes[inds]
        scores = scores[inds]

        # keep top-K before NMS
        order = scores.argsort()[::-1]
        boxes = boxes[order]
        scores = scores[order]

        # do NMS
        nms_threshold = 0.2
        dets = np.hstack((boxes, scores[:, np.newaxis])).astype(
            np.float32, copy=False
        )
        keep = self.py_cpu_nms(dets, nms_threshold)
        dets = dets[keep, :]
        return dets

    # Adapted from https://github.com/biubug6/Pytorch_Retinaface
    def py_cpu_nms(self, dets, thresh):
        """Pure-Python NMS (non-maximum suppression).

        Returns:
            The kept indices after NMS.
        """
        x1 = dets[:, 0]
        y1 = dets[:, 1]
        x2 = dets[:, 2]
        y2 = dets[:, 3]
        scores = dets[:, 4]

        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
        order = scores.argsort()[::-1]

        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            xx1 = np.maximum(x1[i], x1[order[1:]])
            yy1 = np.maximum(y1[i], y1[order[1:]])
            xx2 = np.minimum(x2[i], x2[order[1:]])
            yy2 = np.minimum(y2[i], y2[order[1:]])

            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (areas[i] + areas[order[1:]] - inter)

            inds = np.where(ovr <= thresh)[0]
            order = order[inds + 1]

        return keep

    def annotate(self, image, **kwargs):
        """Get the inference of the image and process the inference result.

        Returns:
            A numpy array, the shape is N * (x, y, w, h, confidence),
            N is the number of detection boxes.
        """
        # First thing, we need to convert the Bob CxHxW image
        # to the OpenCV HxWxC, BGR format
        image = bob_to_opencvbgr(image)

        input_height, input_width, _ = image.shape

        try:
            image, scale = self._preprocess(image)
        except Exception as e:
            raise e

        self.model = self.model.to(self.device)

        image = torch.from_numpy(image).unsqueeze(0)
        with torch.no_grad():
            image = image.to(self.device)
            scale = scale.to(self.device)
            loc, conf, landms = self.model(image)

        dets = self._postprocess(loc, conf, scale, input_height, input_width)

        if len(dets) == 0:
            logger.error("Face not detected. Returning None")
            return None

        dets = dets[0] if self.one_face_only else dets

        return dets
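

# Hedged usage sketch for FaceXDetector (not part of the original module):
# detecting the best-scoring face in a Bob-format (CxHxW, RGB) image. The
# bob.io.base.load call and the file path are illustrative assumptions.
def _example_facex_detector(path="face.png"):
    from bob.io.base import load

    image = load(path)  # Bob CxHxW RGB image
    detector = FaceXDetector(one_face_only=True)
    detections = detector.annotate(image)  # box coordinates + confidence, or None
    return detections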


class FaceX106Landmarks(Base):
    """
    Landmark detector taken from https://github.com/JDAI-CV/FaceX-Zoo

    Here we are using the 106-landmark detector that was taken from
    https://github.com/Hsintao/pfld_106_face_landmarks/blob/master/models/mobilev3_pfld.py

    .. warning::
      Here we are assuming that the face is already detected and cropped

    Parameters
    ----------

    use_mtcnn_detector: bool
        If set, uses the MTCNN face detector as a base for the landmark extractor.
        If not, it uses the standard face detector of FaceX-Zoo.

    """

    def __init__(self, device=None, use_mtcnn_detector=True, **kwargs):
        self.device = torch.device("cpu") if device is None else device

        filename = download_faceX_model()
        faceX_path = add_faceX_path(filename)
        self.use_mtcnn_detector = use_mtcnn_detector

        model_filename = os.path.join(
            faceX_path,
            "models",
            "face_alignment",
            "face_alignment_1.0",
            "face_landmark_pfld.pkl",
        )

        self.model = torch.load(model_filename, map_location=self.device)

        # Loading the face detector
        self.face_detector = MTCNN() if use_mtcnn_detector else FaceXDetector()

        self.transforms = transforms.Compose([transforms.ToTensor()])

        # Face alignment configuration
        # from: https://github.com/JDAI-CV/FaceX-Zoo/blob/db0b087e4f4d28152e172d6c8d3767a8870733b4/face_sdk/models/face_alignment/face_alignment_1.0/model_meta.json
        self.cfg = {
            "model_path": "models",
            "model_category": "face_alignment",
            "model_name": "face_alignment_1.0",
            "model_type": "pfld face landmark nets",
            "model_info": "some model info",
            "model_file_path": "models/face_alignment/face_alignment_1.0/face_landmark_pfld.pkl",
            "release_date": "20201023",
            "input_height": 112,
            "input_width": 112,
            "img_size": 112,
        }

        self.img_size = self.cfg["img_size"]

        super(FaceX106Landmarks, self).__init__(**kwargs)

        # self.detector = MTCNN(min_size=min_size, factor=factor, thresholds=thresholds)

    def annotate(self, image, **kwargs):
        """Annotates an image, returning the 106 facial landmarks.

        Parameters
        ----------

        image : numpy.array
            An RGB image in Bob format.

        **kwargs
            Ignored.

        Returns
        -------
        numpy.ndarray
            The 106 facial landmarks with shape (106, 2), or ``None`` if no
            face is detected.
        """

        # Detect the face
        if self.use_mtcnn_detector:
            annotations = self.face_detector.annotate(image)
            if annotations is None:
                return None

            dets = [
                annotations["topleft"][1],
                annotations["topleft"][0],
                annotations["bottomright"][1],
                annotations["bottomright"][0],
            ]
        else:
            dets = self.face_detector.annotate(image.copy())
            if dets is None:
                return None

        # First thing, we need to convert the Bob CxHxW image
        # to the OpenCV HxWxC, BGR format
        image = bob_to_opencvbgr(image)

        try:
            image_pre = self._preprocess(image, dets)
        except Exception as e:
            raise e

        self.model = self.model.to(self.device)

        image_pre = image_pre.unsqueeze(0)
        with torch.no_grad():
            image_pre = image_pre.to(self.device)
            _, landmarks_normal = self.model(image_pre)

        landmarks = self._postprocess(landmarks_normal)

        return np.array(landmarks)

    # Adapted from https://github.com/Hsintao/pfld_106_face_landmarks/blob/master/data/prepare.py
    def _preprocess(self, image, det):
        """Preprocess the input image, cropping it using the face detection
        information.

        Use the face detection result (det) to get the face position in the
        input image. After determining the center of the face position and the
        box size of the face, crop the image and resize it to the preset size.

        Returns:
            A torch tensor, the image after preprocessing, shape: (3, 112, 112).
        """
        import cv2

        if not isinstance(image, np.ndarray):
            logger.error("The input should be the ndarray read by cv2!")

        img = image.copy()
        self.image_org = image.copy()
        img = np.float32(img)

        xy = np.array([det[0], det[1]])
        zz = np.array([det[2], det[3]])
        wh = zz - xy + 1
        center = (xy + wh / 2).astype(np.int32)
        boxsize = int(np.max(wh) * 1.2)
        xy = center - boxsize // 2
        self.xy = xy
        self.boxsize = boxsize
        x1, y1 = xy
        x2, y2 = xy + boxsize
        height, width, _ = img.shape
        dx = max(0, -x1)
        dy = max(0, -y1)
        x1 = max(0, x1)
        y1 = max(0, y1)
        edx = max(0, x2 - width)
        edy = max(0, y2 - height)
        x2 = min(width, x2)
        y2 = min(height, y2)
        imageT = image[y1:y2, x1:x2]
        if dx > 0 or dy > 0 or edx > 0 or edy > 0:
            imageT = cv2.copyMakeBorder(
                imageT, dy, edy, dx, edx, cv2.BORDER_CONSTANT, 0
            )

        imageT = cv2.resize(imageT, (self.img_size, self.img_size))
        t = transforms.Compose([transforms.ToTensor()])
        img_after = t(imageT)
        return img_after

    def _postprocess(self, landmarks_normal):
        """Map the predicted landmarks back to the coordinates of the original
        image.

        Returns:
            A numpy array, the landmarks based on the shape of the original
            image, shape: (106, 2).
        """
        landmarks_normal = landmarks_normal.cpu().numpy()
        landmarks_normal = landmarks_normal.reshape(
            landmarks_normal.shape[0], -1, 2
        )
        landmarks = landmarks_normal[0] * [self.boxsize, self.boxsize] + self.xy
        return landmarks
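

# Hedged usage sketch for FaceX106Landmarks (not part of the original module):
# extracting the 106 facial landmarks from a Bob-format RGB image. The
# bob.io.base.load call and the file path are illustrative assumptions.
def _example_facex_landmarks(path="face.png"):
    from bob.io.base import load

    image = load(path)  # Bob CxHxW RGB image
    annotator = FaceX106Landmarks(use_mtcnn_detector=True)
    landmarks = annotator.annotate(image)  # numpy array of shape (106, 2) or None
    return landmarks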