# Source code for mmhuman3d.utils.demo_utils

import colorsys
import os
from pathlib import Path

import mmcv
import numpy as np

from mmhuman3d.core.filter import build_filter
from mmhuman3d.utils.path_utils import check_input_path

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def xyxy2xywh(bbox_xyxy):
    """Transform the bbox format from x1y1x2y2 to xywh.

    Width and height are computed as ``right - left`` and ``bottom - top``.
    Any trailing columns (e.g. a score) are passed through untouched.

    Args:
        bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or
            (n, 5). (left, top, right, bottom, [score])

    Returns:
        np.ndarray: A new array of bounding boxes (with scores),
            shaped (n, 4) or (n, 5). (left, top, width, height, [score])

    Raises:
        TypeError: If ``bbox_xyxy`` is not a numpy.ndarray.
    """
    if not isinstance(bbox_xyxy, np.ndarray):
        raise TypeError(
            f'Input type is {type(bbox_xyxy)}, which should be numpy.ndarray.')
    # Work on a copy so the caller's array is left unmodified.
    bbox_xywh = bbox_xyxy.copy()
    bbox_xywh[..., 2] = bbox_xywh[..., 2] - bbox_xywh[..., 0]
    bbox_xywh[..., 3] = bbox_xywh[..., 3] - bbox_xywh[..., 1]

    return bbox_xywh


def xywh2xyxy(bbox_xywh):
    """Transform the bbox format from xywh to x1y1x2y2.

    The far corner is computed with an inclusive-pixel convention:
    ``right = left + width - 1`` and ``bottom = top + height - 1``.
    Any trailing columns (e.g. a score) are passed through untouched.

    Args:
        bbox_xywh (np.ndarray): Bounding boxes (with scores), shaped
            (n, 4) or (n, 5). (left, top, width, height, [score])

    Returns:
        np.ndarray: A new array of bounding boxes (with scores),
            shaped (n, 4) or (n, 5). (left, top, right, bottom, [score])

    Raises:
        TypeError: If ``bbox_xywh`` is not a numpy.ndarray.
    """
    if not isinstance(bbox_xywh, np.ndarray):
        raise TypeError(
            f'Input type is {type(bbox_xywh)}, which should be numpy.ndarray.')
    # Work on a copy so the caller's array is left unmodified.
    bbox_xyxy = bbox_xywh.copy()
    bbox_xyxy[..., 2] = bbox_xyxy[..., 2] + bbox_xyxy[..., 0] - 1
    bbox_xyxy[..., 3] = bbox_xyxy[..., 3] + bbox_xyxy[..., 1] - 1

    return bbox_xyxy


def box2cs(bbox_xywh, aspect_ratio=1.0, bbox_scale_factor=1.25):
    """Convert xywh bounding boxes to center and scale.

    One side of each box is first replaced so that ``width / height`` equals
    ``aspect_ratio`` (the center is computed from the unmodified box), then
    both sides are multiplied by ``bbox_scale_factor``.

    Args:
        bbox_xywh (numpy.ndarray): Bounding boxes whose last axis holds
            (left, top, width, height, ...).
        aspect_ratio (float, optional): Target width / height ratio.
            Defaults to 1.0.
        bbox_scale_factor (float, optional): Factor applied to the adjusted
            width and height. Defaults to 1.25.

    Returns:
        tuple:
            numpy.ndarray: center (x, y) of each bbox, shaped (..., 2).
            numpy.ndarray: scale (w, h) of each bbox, shaped (..., 2).

    Raises:
        TypeError: If ``bbox_xywh`` is not a numpy.ndarray.
    """
    if not isinstance(bbox_xywh, np.ndarray):
        raise TypeError(
            f'Input type is {type(bbox_xywh)}, which should be numpy.ndarray.')

    if bbox_xywh.dtype.kind == 'f':
        bbox_xywh = bbox_xywh.copy()
    else:
        # Non-float boxes are promoted first: writing the aspect-adjusted
        # side back into an integer array would silently truncate it.
        bbox_xywh = bbox_xywh.astype(np.float64)

    center = np.stack([
        bbox_xywh[..., 0] + bbox_xywh[..., 2] * 0.5,
        bbox_xywh[..., 1] + bbox_xywh[..., 3] * 0.5
    ], -1)

    # Boxes that are wider than the target ratio get a new height; all
    # other boxes get a new width.
    too_wide = bbox_xywh[..., 2] > aspect_ratio * bbox_xywh[..., 3]
    not_too_wide = ~too_wide
    bbox_xywh[too_wide, 3] = bbox_xywh[too_wide, 2] / aspect_ratio
    bbox_xywh[not_too_wide, 2] = bbox_xywh[not_too_wide, 3] * aspect_ratio

    scale = np.stack([bbox_xywh[..., 2], bbox_xywh[..., 3]], -1)
    scale = scale * bbox_scale_factor

    return center, scale


def convert_crop_cam_to_orig_img(cam: np.ndarray,
                                 bbox: np.ndarray,
                                 img_width: int,
                                 img_height: int,
                                 aspect_ratio: float = 1.0,
                                 bbox_scale_factor: float = 1.25,
                                 bbox_format: Literal['xyxy', 'xywh',
                                                      'cs'] = 'xyxy'):
    """Convert a weak-perspective camera from cropped-image coordinates to
    original-image coordinates.

    This function is modified from [VIBE](https://github.com/
    mkocabas/VIBE/blob/master/lib/utils/demo_utils.py#L242-L259). Original
    license please see docs/additional_licenses.md.

    Args:
        cam (np.ndarray): Weak perspective camera in cropped img
            coordinates, shaped (frame, 3) or (frame, num_person, 3).
            The last axis is read as (s, tx, ty).
        bbox (np.ndarray): bbox coordinates, laid out per ``bbox_format``.
        img_width (int): original image width
        img_height (int): original image height
        aspect_ratio (float, optional): Passed to ``box2cs`` when the bbox
            is given as 'xyxy' or 'xywh'. Defaults to 1.0.
        bbox_scale_factor (float, optional): Passed to ``box2cs`` when the
            bbox is given as 'xyxy' or 'xywh'. Defaults to 1.25.
        bbox_format (Literal['xyxy', 'xywh', 'cs']): Defaults to 'xyxy'.
            'xyxy' means the left-up point and right-bottom point of the
            bbox.
            'xywh' means the left-up point and the width and height of the
            bbox.
            'cs' means the center of the bbox (x,y) and the scale of the
            bbox w & h.

    Returns:
        np.ndarray: orig_cam (sx, sy, tx, ty), shaped (frame, 4) or
            (frame, num_person, 4).

    Raises:
        TypeError: If ``bbox`` is not a numpy.ndarray.
        ValueError: If ``bbox_format`` is not one of the supported formats.
    """
    if not isinstance(bbox, np.ndarray):
        raise TypeError(
            f'Input type is {type(bbox)}, which should be numpy.ndarray.')

    if bbox_format == 'cs':
        # Only read below, so no defensive copy is needed.
        bbox_cs = bbox
    elif bbox_format in ('xyxy', 'xywh'):
        # Both helpers return new arrays; the caller's bbox is not modified.
        bbox_xywh = xyxy2xywh(bbox) if bbox_format == 'xyxy' else bbox
        center, scale = box2cs(bbox_xywh, aspect_ratio, bbox_scale_factor)
        bbox_cs = np.concatenate([center, scale], axis=-1)
    else:
        raise ValueError('Only supports the format of `xyxy`, `cs` and `xywh`')

    # Only the first scale component is used as the box size; the small
    # epsilons guard the divisions below against zero.
    cx, cy, h = bbox_cs[..., 0], bbox_cs[..., 1], bbox_cs[..., 2] + 1e-6
    hw, hh = img_width / 2., img_height / 2.
    sx = cam[..., 0] * (1. / (img_width / h))
    sy = cam[..., 0] * (1. / (img_height / h))
    tx = ((cx - hw) / hw / (sx + 1e-6)) + cam[..., 1]
    ty = ((cy - hh) / hh / (sy + 1e-6)) + cam[..., 2]

    orig_cam = np.stack([sx, sy, tx, ty], axis=-1)
    return orig_cam


def convert_bbox_to_intrinsic(bboxes: np.ndarray,
                              img_width: int = 224,
                              img_height: int = 224,
                              bbox_scale_factor: float = 1.25,
                              bbox_format: Literal['xyxy', 'xywh'] = 'xyxy'):
    """Convert bbox to intrinsic parameters.

    Each box is expanded to a square of side
    ``W = max(width, height) * bbox_scale_factor`` around its center, and a
    3x3 matrix with ``W / img_width``, ``W / img_height`` on the diagonal
    and the square's top-left corner in the last column is returned.

    Args:
        bboxes (np.ndarray): (frame, num_person, 4|5), (frame, 4|5) or
            (4|5,). A trailing score column, if present, is ignored.
        img_width (int): image width of training data.
        img_height (int): image height of training data.
        bbox_scale_factor (float): scale factor for expanding the bbox.
        bbox_format (Literal['xyxy', 'xywh'] ): 'xyxy' means the left-up point
            and right-bottom point of the bbox.
            'xywh' means the left-up point and the width and height of the
            bbox.

    Returns:
        np.ndarray: (frame, num_person, 3, 3), (frame, 3, 3) or (3, 3),
            matching the leading dimensions of ``bboxes``.

    Raises:
        TypeError: If ``bboxes`` is not a numpy.ndarray.
        ValueError: If ``bboxes`` does not have 1, 2 or 3 dimensions.
    """
    if not isinstance(bboxes, np.ndarray):
        raise TypeError(
            f'Input type is {type(bboxes)}, which should be numpy.ndarray.')
    assert bbox_format in ['xyxy', 'xywh']
    # Validate the shape up front so the error is raised before any work.
    if bboxes.ndim not in (1, 2, 3):
        raise ValueError(f'Wrong input bboxes shape {bboxes.shape}')

    if bbox_format == 'xyxy':
        bboxes = xyxy2xywh(bboxes)

    center_x = bboxes[..., 0] + bboxes[..., 2] / 2.0
    center_y = bboxes[..., 1] + bboxes[..., 3] / 2.0

    # Restrict to the width/height columns so that an optional score
    # column can never be picked up as the box size.
    W = np.max(bboxes[..., 2:4], axis=-1) * bbox_scale_factor

    Ks = np.zeros(bboxes.shape[:-1] + (3, 3))
    Ks[..., 0, 0] = W / img_width
    Ks[..., 1, 1] = W / img_height
    Ks[..., 0, 2] = center_x - W / 2.0
    Ks[..., 1, 2] = center_y - W / 2.0
    Ks[..., 2, 2] = 1
    return Ks


def get_default_hmr_intrinsic(num_frame=1,
                              focal_length=1000,
                              det_width=224,
                              det_height=224) -> np.ndarray:
    """Get default hmr intrinsic, defined by how you trained.

    The same matrix is repeated for every frame: ``focal_length`` on the
    diagonal and the principal point at the center of the detection
    window, ``(det_width / 2, det_height / 2)``.

    Args:
        num_frame (int, optional): num of frames. Defaults to 1.
        focal_length (int, optional): defined same as your training.
            Defaults to 1000.
        det_width (int, optional): the size you used to detect.
            Defaults to 224.
        det_height (int, optional): the size you used to detect.
            Defaults to 224.

    Returns:
        np.ndarray: shape of (num_frame, 3, 3)
    """
    K = np.zeros((num_frame, 3, 3))
    K[:, 0, 0] = focal_length
    K[:, 1, 1] = focal_length
    K[:, 0, 2] = det_width / 2
    K[:, 1, 2] = det_height / 2
    K[:, 2, 2] = 1
    return K


def convert_kp2d_to_bbox(
        kp2d: np.ndarray,
        bbox_format: Literal['xyxy', 'xywh'] = 'xyxy') -> np.ndarray:
    """Convert kp2d to the tight bbox enclosing each person's keypoints.

    Args:
        kp2d (np.ndarray):  shape should be (num_frame, num_points, 2/3)
            or (num_frame, num_person, num_points, 2/3). A single
            (num_points, 2/3) array is also accepted and treated as one
            frame with one person. Only the first two channels (x, y) are
            used; a third channel is ignored.
        bbox_format (Literal['xyxy', 'xywh'], optional): Defaults to 'xyxy'.

    Returns:
        np.ndarray: shape will be (num_frame, num_person, 4)

    Raises:
        ValueError: If ``kp2d`` cannot be brought to 4 dimensions.
    """
    assert bbox_format in ['xyxy', 'xywh']
    if kp2d.ndim == 2:
        kp2d = kp2d[None, None]
    elif kp2d.ndim == 3:
        kp2d = kp2d[:, None]
    if kp2d.ndim != 4:
        raise ValueError(f'Wrong input kp2d shape {kp2d.shape}')

    # xs / ys are (num_frame, num_person, num_points); reduce over the
    # points axis so every person gets its own box.
    xs, ys = kp2d[..., 0], kp2d[..., 1]
    bbox = np.stack([
        xs.min(axis=-1),
        ys.min(axis=-1),
        xs.max(axis=-1),
        ys.max(axis=-1)
    ], axis=-1)
    if bbox_format == 'xywh':
        bbox = xyxy2xywh(bbox)
    return bbox


def conver_verts_to_cam_coord(verts,
                              pred_cams,
                              bboxes_xy,
                              focal_length=5000.,
                              bbox_scale_factor=1.25,
                              bbox_format='xyxy'):
    """Convert vertices from the world coordinate to camera coordinate.

    Args:
        verts ([np.ndarray]): The vertices in the world coordinate.
            The shape is (frame,num_person,6890,3) or (frame,6890,3).
        pred_cams ([np.ndarray]): Camera parameters estimated by HMR or SPIN.
            The shape is (frame,num_person,3) or (frame,3). The last axis
            is read as (s, tx, ty).
        bboxes_xy ([np.ndarray]): (frame, num_person, 4|5) or (frame, 4|5)
        focal_length ([float],optional): Defined same as your training.
        bbox_scale_factor (float): scale factor for expanding the bbox.
        bbox_format (Literal['xyxy', 'xywh'] ): 'xyxy' means the left-up point
            and right-bottom point of the bbox.
            'xywh' means the left-up point and the width and height of the
            bbox.

    Returns:
        np.ndarray: The vertices in the camera coordinate.
            The shape is (frame,num_person,6890,3) or (frame,6890,3).
            If ``verts`` is neither 3- nor 4-dimensional, only the
            translation is applied.
        np.ndarray: The default HMR intrinsic for a 224x224 detection
            window, as returned by ``get_default_hmr_intrinsic`` (called
            here with its default ``num_frame``).
    """
    K0 = get_default_hmr_intrinsic(
        focal_length=focal_length, det_height=224, det_width=224)
    K1 = convert_bbox_to_intrinsic(
        bboxes_xy,
        bbox_scale_factor=bbox_scale_factor,
        bbox_format=bbox_format)
    # K1K0(RX+T)-> K0(K0_inv K1K0)
    Ks = np.linalg.inv(K0) @ K1 @ K0
    # convert vertices from world to camera: depth is recovered from the
    # weak-perspective scale; 1e-9 guards against a zero scale.
    cam_trans = np.concatenate([
        pred_cams[..., [1]], pred_cams[..., [2]], 2 * focal_length /
        (224 * pred_cams[..., [0]] + 1e-9)
    ], -1)
    verts = verts + cam_trans[..., None, :]
    if verts.ndim == 4:
        verts = np.einsum('fnij,fnkj->fnki', Ks, verts)
    elif verts.ndim == 3:
        verts = np.einsum('fij,fkj->fki', Ks, verts)
    return verts, K0


def smooth_process(x, smooth_type='savgol'):
    """Smooth the array with the specified smoothing type.

    The filter is built once via ``build_filter`` and, for 4-dimensional
    input, applied to each person's track separately.

    Args:
        x (np.ndarray): Shape should be (frame,num_person,K,C)
            or (frame,K,C).
        smooth_type (str, optional): Smooth type.
            choose in ['oneeuro', 'gaus1d', 'savgol'].
            Defaults to 'savgol'.

    Raises:
        AssertionError: If ``x`` is neither 3- nor 4-dimensional.

    Returns:
        np.ndarray: Smoothed data. The shape should be
            (frame,num_person,K,C) or (frame,K,C).
    """
    # Work on a copy so the caller's array is left unmodified.
    x = x.copy()

    assert x.ndim == 3 or x.ndim == 4

    smooth_func = build_filter(dict(type=smooth_type))

    if x.ndim == 4:
        for i in range(x.shape[1]):
            x[:, i] = smooth_func(x[:, i])
    elif x.ndim == 3:
        x = smooth_func(x)

    return x


def process_mmtracking_results(mmtracking_results, max_track_id):
    """Process mmtracking results.

    Args:
        mmtracking_results (dict): mmtracking results. The tracks are read
            from the first element of ``'track_bboxes'`` (or of the legacy
            ``'track_results'`` key). Each track is indexed as
            ``[track_id, *bbox]``.
        max_track_id (int): The largest track id seen so far.

    Returns:
        list: a list of tracked bounding boxes, one dict with keys
            ``'track_id'`` and ``'bbox'`` per track, sorted by track id.
        int: ``max_track_id`` updated with the ids found in this result.
        int: the number of tracked instances.

    Raises:
        KeyError: If neither ``'track_bboxes'`` nor ``'track_results'`` is
            present in ``mmtracking_results``.
    """
    # 'track_results' is changed to 'track_bboxes'
    # in https://github.com/open-mmlab/mmtracking/pull/300
    if 'track_bboxes' in mmtracking_results:
        tracking_results = mmtracking_results['track_bboxes'][0]
    elif 'track_results' in mmtracking_results:
        tracking_results = mmtracking_results['track_results'][0]
    else:
        # Fail with a clear message instead of an unbound local below.
        raise KeyError("mmtracking_results should contain 'track_bboxes' or "
                       "'track_results'.")

    person_results = []
    for track in tracking_results:
        track_id = int(track[0])
        max_track_id = max(max_track_id, track_id)
        person_results.append({'track_id': track_id, 'bbox': track[1:]})
    person_results.sort(key=lambda person: person['track_id'])
    instance_num = len(tracking_results)
    return person_results, max_track_id, instance_num


def process_mmdet_results(mmdet_results, cat_id=1):
    """Process mmdet results, and return a list of bboxes.

    Args:
        mmdet_results (list|tuple): mmdet results. When a tuple is given,
            only its first element is used as the detection results.
        cat_id (int): 1-based category id, i.e. the boxes are read from
            index ``cat_id - 1`` (default: 1 for human)

    Returns:
        person_results (list): a list of detected bounding boxes, one
            ``{'bbox': bbox}`` dict per detection.
    """
    if isinstance(mmdet_results, tuple):
        det_results = mmdet_results[0]
    else:
        det_results = mmdet_results

    return [{'bbox': bbox} for bbox in det_results[cat_id - 1]]


def prepare_frames(input_path=None):
    """Prepare frames from input_path.

    Args:
        input_path (str | os.PathLike, optional): Path to a ``.mp4`` video,
            a ``.png``/``.jpg`` image, or a folder containing such images
            (suffixes are matched case-insensitively). Images in a folder
            are loaded in sorted path order. Defaults to None, which is
            not a valid path.

    Raises:
        TypeError: If ``input_path`` is not a path-like object.
        ValueError: check the input path.
        AssertionError: If no image is found or the video cannot be opened.

    Returns:
        List[np.ndarray]: prepared frames
    """
    # Normalise path-like objects so the string suffix checks below work
    # for pathlib.Path inputs as well.
    input_path = os.fspath(input_path)
    image_suffixes = ('.png', '.jpg')

    path = Path(input_path)
    if path.is_file():
        lowered = input_path.lower()
        if lowered.endswith('.mp4'):
            input_type = 'video'
        elif lowered.endswith(image_suffixes):
            input_type = 'image'
        else:
            raise ValueError('The input file should be an image or a video.'
                             f' Got invalid file: {input_path}')
    elif path.is_dir():
        input_type = 'folder'
    else:
        raise ValueError('Input path should be an file or folder.'
                         f' Got invalid input path: {input_path}')

    # prepare input
    if input_type == 'video':
        check_input_path(
            input_path=input_path, path_type='file', allowed_suffix=['.mp4'])
        video = mmcv.VideoReader(input_path)
        assert video.opened, f'Failed to load video file {input_path}'
        return list(video)

    if input_type == 'image':
        file_list = [input_path]
    else:
        file_list = sorted(
            os.path.join(input_path, fn) for fn in os.listdir(input_path)
            if fn.lower().endswith(image_suffixes))
    img_list = [mmcv.imread(img_path) for img_path in file_list]
    assert len(img_list), f'Failed to load image from {input_path}'

    return img_list


def extract_feature_sequence(extracted_results,
                             frame_idx,
                             causal,
                             seq_len,
                             step=1):
    """Build the strided, edge-padded window of per-frame results around a
    target frame.

    Frames are taken every ``step`` frames around ``frame_idx``. Where the
    window would run past either end of the video, the first / last
    element of ``extracted_results`` is repeated as padding.

    Args:
        extracted_results (List[List[Dict]]): Multi-frame feature extraction
            results stored in a nested list. Each element of the outer list
            is the feature extraction results of a single frame. This
            function only slices the outer list and does not inspect the
            per-person entries.
        frame_idx (int): The index of the frame in the original video.
        causal (bool): If True, the target frame is the first frame in
            a sequence. Otherwise, the target frame is in the middle of a
            sequence.
        seq_len (int): The number of frames in the input sequence.
        step (int): Step size to extract frames from the video.

    Returns:
        List[List[Dict]]: The padded multi-frame feature extraction results
            around the target frame.
    """
    total = len(extracted_results)

    # How many sampled frames are wanted before / after the target frame.
    if causal:
        want_before, want_after = 0, seq_len - 1
    else:
        want_before = want_after = (seq_len - 1) // 2

    # Whatever the video cannot supply on either side is filled by padding.
    n_pad_front = max(0, want_before - frame_idx // step)
    n_pad_back = max(0, want_after - (total - 1 - frame_idx) // step)

    # Slice bounds, clamped to the video while staying on the stride grid
    # that passes through ``frame_idx``.
    first = max(frame_idx % step, frame_idx - want_before * step)
    stop = min(total - (total - 1 - frame_idx) % step,
               frame_idx + want_after * step + 1)

    front_padding = [extracted_results[0]] * n_pad_front
    window = extracted_results[first:stop:step]
    back_padding = [extracted_results[-1]] * n_pad_back
    return front_padding + window + back_padding


def get_different_colors(number_of_colors,
                         flag=0,
                         alpha: float = 1.0,
                         mode: str = 'bgr',
                         int_dtype: bool = True):
    """Get a numpy array of distinct colors of shape (N, len(mode)).

    Hues are spread evenly over the color wheel; lightness and saturation
    are jittered with a random generator seeded by ``flag``, so the result
    is deterministic for a given ``flag``. The global numpy random state
    is restored before returning.

    Args:
        number_of_colors (int): Number of colors N to generate.
        flag (int, optional): Seed for the lightness / saturation jitter.
            Defaults to 0.
        alpha (float, optional): Value of the alpha channel, used only when
            ``mode`` contains 'a'. Defaults to 1.0.
        mode (str, optional): Channel order of the output, any sequence of
            the characters 'r', 'g', 'b', 'a' (case-insensitive).
            Defaults to 'bgr'.
        int_dtype (bool, optional): If True colors are scaled to 0-255 and
            cast to uint8, otherwise kept in 0-1 as float32.
            Defaults to True.

    Returns:
        np.ndarray: Colors of shape (N, len(mode)). The alpha column is
            floating point, so requesting 'a' promotes the whole array to
            a floating dtype.
    """
    mode = mode.lower()
    assert set(mode).issubset({'r', 'g', 'b', 'a'})
    number_of_colors = int(number_of_colors)

    saved_state = np.random.get_state()
    np.random.seed(flag)
    try:
        colors = []
        # Index with an integer range rather than a float-step arange so
        # that exactly ``number_of_colors`` hues are produced.
        for idx in range(number_of_colors):
            hue = idx * (360. / number_of_colors) / 360.
            lightness = (50 + np.random.rand() * 10) / 100.
            saturation = (90 + np.random.rand() * 10) / 100.
            colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
    finally:
        # recover the random state, even if color generation failed
        np.random.set_state(saved_state)

    # reshape keeps the (N, 3) layout for N == 0 as well
    colors_np = np.asarray(colors, dtype=np.float64).reshape(-1, 3)
    if int_dtype:
        channels = (255 * colors_np).astype(np.uint8)
    else:
        channels = colors_np.astype(np.float32)

    # NOTE: hls_to_rgb yields (r, g, b) but the columns are labelled
    # b, g, r here. This is kept as-is so existing outputs are unchanged.
    color_dict = {
        'b': channels[:, 0:1],
        'g': channels[:, 1:2],
        'r': channels[:, 2:3],
    }
    if 'a' in mode:
        # A single column, so that e.g. 'bgra' yields exactly 4 channels.
        color_dict['a'] = np.ones((channels.shape[0], 1)) * alpha

    colors_final = np.concatenate([color_dict[channel] for channel in mode],
                                  -1)
    return colors_final