Shortcuts

Source code for mmhuman3d.utils.demo_utils

import colorsys
import os
from pathlib import Path

import mmcv
import numpy as np

from mmhuman3d.core.filter import build_filter
from mmhuman3d.utils.path_utils import check_input_path

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


def xyxy2xywh(bbox_xyxy):
    """Convert boxes from corner format (x1, y1, x2, y2) to (x, y, w, h).

    Args:
        bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped
            (n, 4) or (n, 5). (left, top, right, bottom, [score])

    Returns:
        np.ndarray: A converted copy of the input, same shape.
            (left, top, width, height, [score])

    Raises:
        TypeError: If ``bbox_xyxy`` is not a ``numpy.ndarray``.
    """
    if isinstance(bbox_xyxy, np.ndarray):
        # Work on a copy so the caller's array is never modified.
        converted = bbox_xyxy.copy()
        # width = right - left, height = bottom - top (no +1 offset here).
        converted[..., [2, 3]] -= converted[..., [0, 1]]
        return converted
    raise TypeError(
        f'Input type is {type(bbox_xyxy)}, which should be numpy.ndarray.')
def xywh2xyxy(bbox_xywh):
    """Convert boxes from (x, y, w, h) to corner format (x1, y1, x2, y2).

    Args:
        bbox_xywh (np.ndarray): Bounding boxes (with scores), shaped
            (n, 4) or (n, 5). (left, top, width, height, [score])

    Returns:
        np.ndarray: A converted copy of the input, same shape.
            (left, top, right, bottom, [score])

    Raises:
        TypeError: If ``bbox_xywh`` is not a ``numpy.ndarray``.
    """
    if isinstance(bbox_xywh, np.ndarray):
        # Work on a copy so the caller's array is never modified.
        converted = bbox_xywh.copy()
        # right = width + left - 1, bottom = height + top - 1.
        far_corner = converted[..., [2, 3]] + converted[..., [0, 1]] - 1
        converted[..., [2, 3]] = far_corner
        return converted
    raise TypeError(
        f'Input type is {type(bbox_xywh)}, which should be numpy.ndarray.')
def box2cs(bbox_xywh, aspect_ratio=1.0, bbox_scale_factor=1.25):
    """Turn (x, y, w, h) boxes into a center point and a padded scale.

    The box is first grown along one side so that ``w / h`` equals
    ``aspect_ratio``; the resulting width and height are then multiplied
    by ``bbox_scale_factor``.

    Args:
        bbox_xywh (numpy.ndarray): Boxes whose last axis starts with
            (left, top, width, height).
        aspect_ratio (float, optional): Target width / height ratio.
            Defaults to 1.0.
        bbox_scale_factor (float, optional): Multiplier applied to the
            aspect-corrected size. Defaults to 1.25.

    Returns:
        numpy.ndarray: Box centers, last axis is (cx, cy).
        numpy.ndarray: Box scales, last axis is (w, h).

    Raises:
        TypeError: If ``bbox_xywh`` is not a ``numpy.ndarray``.
    """
    if not isinstance(bbox_xywh, np.ndarray):
        raise TypeError(
            f'Input type is {type(bbox_xywh)}, which should be numpy.ndarray.')

    pixel_std = 1
    boxes = bbox_xywh.copy()

    # The center is taken from the box as given, before it is reshaped.
    center = boxes[..., [0, 1]] + boxes[..., [2, 3]] * 0.5

    # Boxes wider than the target ratio get taller; all others get wider.
    too_wide = boxes[..., 2] > aspect_ratio * boxes[..., 3]
    not_too_wide = ~too_wide
    boxes[too_wide, 3] = boxes[too_wide, 2] / aspect_ratio
    boxes[not_too_wide, 2] = boxes[not_too_wide, 3] * aspect_ratio

    scale = boxes[..., [2, 3]] * 1.0 / pixel_std * bbox_scale_factor
    return center, scale
def convert_crop_cam_to_orig_img(cam: np.ndarray,
                                 bbox: np.ndarray,
                                 img_width: int,
                                 img_height: int,
                                 aspect_ratio: float = 1.0,
                                 bbox_scale_factor: float = 1.25,
                                 bbox_format: Literal['xyxy', 'xywh',
                                                      'cs'] = 'xyxy'):
    """Map a crop-space weak-perspective camera to full-image space.

    This function is modified from [VIBE](https://github.com/
    mkocabas/VIBE/blob/master/lib/utils/demo_utils.py#L242-L259). Original
    license please see docs/additional_licenses.md.

    Args:
        cam (np.ndarray): Weak-perspective camera in cropped image
            coordinates, shape (frame, 3) or (frame, num_person, 3). The
            last axis is read as (scale, trans_x, trans_y).
        bbox (np.ndarray): Bounding boxes in the layout named by
            ``bbox_format``.
        img_width (int): Original image width.
        img_height (int): Original image height.
        aspect_ratio (float, optional): Forwarded to ``box2cs`` for the
            'xyxy' and 'xywh' formats. Defaults to 1.0.
        bbox_scale_factor (float, optional): Forwarded to ``box2cs`` for
            the 'xyxy' and 'xywh' formats. Defaults to 1.25.
        bbox_format (Literal['xyxy', 'xywh', 'cs']): Defaults to 'xyxy'.
            'xyxy' means the top-left and bottom-right points of the bbox.
            'xywh' means the top-left point plus width and height.
            'cs' means the bbox center (x, y) followed by its scale (w, h);
            it is used as given, without rescaling.

    Returns:
        np.ndarray: Camera as (sx, sy, tx, ty) on the last axis, shape
            (frame, 4) or (frame, num_person, 4).

    Raises:
        TypeError: If ``bbox`` is not a ``numpy.ndarray``.
        ValueError: If ``bbox_format`` is not one of the supported names.
    """
    if not isinstance(bbox, np.ndarray):
        raise TypeError(
            f'Input type is {type(bbox)}, which should be numpy.ndarray.')
    bbox = bbox.copy()

    if bbox_format == 'cs':
        bbox_cs = bbox
    elif bbox_format in ('xyxy', 'xywh'):
        # Both corner-based layouts go through xywh -> (center, scale).
        as_xywh = xyxy2xywh(bbox) if bbox_format == 'xyxy' else bbox
        center, scale = box2cs(as_xywh, aspect_ratio, bbox_scale_factor)
        bbox_cs = np.concatenate([center, scale], axis=-1)
    else:
        raise ValueError('Only supports the format of `xyxy`, `cs` and `xywh`')

    center_x = bbox_cs[..., 0]
    center_y = bbox_cs[..., 1]
    # Only the width component of the scale is used; the epsilon keeps it
    # from being exactly zero.
    box_size = bbox_cs[..., 2] + 1e-6
    half_width, half_height = img_width / 2., img_height / 2.

    crop_scale = cam[..., 0]
    sx = crop_scale * (1. / (img_width / box_size))
    sy = crop_scale * (1. / (img_height / box_size))
    # Box-center offset from the image center, normalised to [-1, 1] and
    # divided by the new scale, is added to the crop-space translation.
    tx = ((center_x - half_width) / half_width / (sx + 1e-6)) + cam[..., 1]
    ty = ((center_y - half_height) / half_height / (sy + 1e-6)) + cam[..., 2]
    return np.stack([sx, sy, tx, ty], axis=-1)
def convert_bbox_to_intrinsic(bboxes: np.ndarray,
                              img_width: int = 224,
                              img_height: int = 224,
                              bbox_scale_factor: float = 1.25,
                              bbox_format: Literal['xyxy', 'xywh'] = 'xyxy'):
    """Convert bbox to intrinsic parameters.

    Each box is expanded to a square of side
    ``max(w, h) * bbox_scale_factor`` around its center, and a 3x3 matrix
    is built from that square and the given image size.

    Args:
        bboxes (np.ndarray): (frame, num_person, 4|5), (frame, 4|5) or
            (4|5,). An optional trailing score column is ignored.
        img_width (int): image width of training data.
        img_height (int): image height of training data.
        bbox_scale_factor (float): scale factor for expanding the bbox.
        bbox_format (Literal['xyxy', 'xywh']): 'xyxy' means the top-left
            and bottom-right points of the bbox. 'xywh' means the top-left
            point plus the width and height of the bbox.

    Returns:
        np.ndarray: (frame, num_person, 3, 3), (frame, 3, 3) or (3, 3),
            matching the leading dimensions of ``bboxes``.

    Raises:
        TypeError: If ``bboxes`` is not a ``numpy.ndarray``.
        AssertionError: If ``bbox_format`` is not 'xyxy' or 'xywh'.
        ValueError: If ``bboxes`` is not 1-, 2- or 3-dimensional.
    """
    if not isinstance(bboxes, np.ndarray):
        raise TypeError(
            f'Input type is {type(bboxes)}, which should be numpy.ndarray.')
    assert bbox_format in ['xyxy', 'xywh']
    if bboxes.ndim not in (1, 2, 3):
        # The original message lacked the f-prefix and printed the
        # placeholder text literally.
        raise ValueError(f'Wrong input bboxes shape {bboxes.shape}')

    if bbox_format == 'xyxy':
        bboxes = xyxy2xywh(bboxes)

    center_x = bboxes[..., 0] + bboxes[..., 2] / 2.0
    center_y = bboxes[..., 1] + bboxes[..., 3] / 2.0
    # Restrict to the width/height columns so a trailing score column can
    # never be picked up as the box size.
    W = np.max(bboxes[..., 2:4], axis=-1) * bbox_scale_factor

    # One 3x3 matrix per box; leading dims follow the input.
    Ks = np.zeros(np.shape(W) + (3, 3))
    Ks[..., 0, 0] = W / img_width
    Ks[..., 1, 1] = W / img_height
    Ks[..., 0, 2] = center_x - W / 2.0
    Ks[..., 1, 2] = center_y - W / 2.0
    Ks[..., 2, 2] = 1
    return Ks
def get_default_hmr_intrinsic(num_frame=1,
                              focal_length=1000,
                              det_width=224,
                              det_height=224) -> np.ndarray:
    """Build the default HMR pinhole intrinsic matrix for every frame.

    Args:
        num_frame (int, optional): Number of matrices to produce.
            Defaults to 1.
        focal_length (int, optional): Focal length placed on both diagonal
            entries; should match the value used in training.
            Defaults to 1000.
        det_width (int, optional): Detection input width; half of it is
            used as the principal point x. Defaults to 224.
        det_height (int, optional): Detection input height; half of it is
            used as the principal point y. Defaults to 224.

    Returns:
        np.ndarray: Intrinsics of shape (num_frame, 3, 3).
    """
    principal_x = det_width / 2
    principal_y = det_height / 2

    intrinsics = np.zeros((num_frame, 3, 3))
    intrinsics[..., 0, 0] = focal_length
    intrinsics[..., 1, 1] = focal_length
    intrinsics[..., 0, 2] = principal_x
    intrinsics[..., 1, 2] = principal_y
    intrinsics[..., 2, 2] = 1
    return intrinsics
def convert_kp2d_to_bbox(
        kp2d: np.ndarray,
        bbox_format: Literal['xyxy', 'xywh'] = 'xyxy') -> np.ndarray:
    """Convert kp2d to bbox.

    The box of each person is the tightest axis-aligned rectangle around
    all of that person's keypoints. Only the first two channels (x, y) are
    used; a third channel, if present, is ignored.

    Args:
        kp2d (np.ndarray): shape should be (num_points, 2/3),
            (num_frame, num_points, 2/3) or
            (num_frame, num_person, num_points, 2/3).
        bbox_format (Literal['xyxy', 'xywh'], optional): Layout of the
            returned boxes. Defaults to 'xyxy'.

    Returns:
        np.ndarray: shape will be (num_frame, num_person, 4). Missing
            frame/person axes in the input are returned with size 1.

    Raises:
        AssertionError: If ``bbox_format`` is not 'xyxy' or 'xywh'.
        ValueError: If ``kp2d`` has an unsupported number of dimensions or
            fewer than two channels on its last axis.
    """
    assert bbox_format in ['xyxy', 'xywh']
    # Normalise to (num_frame, num_person, num_points, C).
    if kp2d.ndim == 2:
        kp2d = kp2d[None, None]
    elif kp2d.ndim == 3:
        kp2d = kp2d[:, None]
    if kp2d.ndim != 4 or kp2d.shape[-1] < 2:
        raise ValueError(
            f'Unsupported kp2d shape {kp2d.shape}; the last axis must hold '
            'at least (x, y).')
    num_frame, num_person, _, _ = kp2d.shape

    # After selecting a channel the array is (frame, person, points), so
    # the keypoint axis is the last one. The original code reduced over
    # the person axis, used max for every corner and read channels 2 and
    # 3, which do not exist for 2/3-channel keypoints.
    xs, ys = kp2d[..., 0], kp2d[..., 1]
    x1 = np.min(xs, axis=-1)
    y1 = np.min(ys, axis=-1)
    x2 = np.max(xs, axis=-1)
    y2 = np.max(ys, axis=-1)

    # Stack (not concatenate) so the four corner values form a new axis.
    bbox = np.stack([x1, y1, x2, y2], axis=-1)
    assert bbox.shape == (num_frame, num_person, 4)
    if bbox_format == 'xywh':
        bbox = xyxy2xywh(bbox)
    return bbox
def conver_verts_to_cam_coord(verts,
                              pred_cams,
                              bboxes_xy,
                              focal_length=5000.,
                              bbox_scale_factor=1.25,
                              bbox_format='xyxy'):
    """Convert vertices from the world coordinate to camera coordinate.

    A translation derived from the predicted camera is added to the
    vertices, after which a per-box 3x3 transform built from the bounding
    box is applied to them.

    Args:
        verts (np.ndarray): The vertices in the world coordinate.
            The shape is (frame, num_person, 6890, 3) or (frame, 6890, 3).
        pred_cams (np.ndarray): Camera parameters estimated by HMR or SPIN.
            The last axis is read as (scale, trans_x, trans_y); the
            leading axes should match those of ``verts``.
        bboxes_xy (np.ndarray): (frame, num_person, 4|5) or (frame, 4|5).
        focal_length (float, optional): Defined same as your training.
            Defaults to 5000.
        bbox_scale_factor (float): scale factor for expanding the bbox.
        bbox_format (Literal['xyxy', 'xywh']): 'xyxy' means the top-left
            and bottom-right points of the bbox. 'xywh' means the top-left
            point plus the width and height of the bbox.

    Returns:
        np.ndarray: The vertices in the camera coordinate, same shape as
            ``verts``.
        np.ndarray: The default HMR intrinsic matrix built for a 224x224
            detection size. It is created for a single frame, so its
            shape is (1, 3, 3).
    """
    det_size = 224
    K0 = get_default_hmr_intrinsic(
        focal_length=focal_length, det_height=det_size, det_width=det_size)
    K1 = convert_bbox_to_intrinsic(
        bboxes_xy,
        bbox_scale_factor=bbox_scale_factor,
        bbox_format=bbox_format)
    # K1K0(RX+T)-> K0(K0_inv K1K0)
    Ks = np.linalg.inv(K0) @ K1 @ K0

    # Camera translation: (trans_x, trans_y) come straight from pred_cams,
    # depth is derived from the predicted scale and the focal length.
    depth = 2 * focal_length / (det_size * pred_cams[..., [0]] + 1e-9)
    cam_trans = np.concatenate([pred_cams[..., [1, 2]], depth], -1)
    verts = verts + cam_trans[..., None, :]

    # Apply the per-box transform to every vertex; any other rank is
    # returned with only the translation applied.
    subscripts = {
        4: 'fnij,fnkj->fnki',
        3: 'fij,fkj->fki',
    }.get(verts.ndim)
    if subscripts is not None:
        verts = np.einsum(subscripts, Ks, verts)
    return verts, K0
def smooth_process(x, smooth_type='savgol'):
    """Smooth an array with the filter selected by ``smooth_type``.

    For 4-D input the filter is applied separately to each slice along
    axis 1 (one slice per person); 3-D input is filtered in one call.

    Args:
        x (np.ndarray): Shape should be (frame, num_person, K, C) or
            (frame, K, C). The input itself is not modified.
        smooth_type (str, optional): Filter type handed to
            ``build_filter``; choose in ['oneeuro', 'gaus1d', 'savgol'].
            Defaults to 'savgol'.

    Raises:
        AssertionError: If ``x`` is neither 3- nor 4-dimensional.

    Returns:
        np.ndarray: Smoothed data, shaped (frame, num_person, K, C) or
            (frame, K, C).
    """
    smoothed = x.copy()
    assert smoothed.ndim in (3, 4)

    smooth_func = build_filter(dict(type=smooth_type))
    if smoothed.ndim == 3:
        return smooth_func(smoothed)

    # 4-D: filter every person's track on its own.
    num_person = smoothed.shape[1]
    for person_idx in range(num_person):
        smoothed[:, person_idx] = smooth_func(smoothed[:, person_idx])
    return smoothed
def process_mmtracking_results(mmtracking_results, max_track_id):
    """Process mmtracking results.

    Args:
        mmtracking_results (dict): mmtracking output holding the tracked
            boxes under 'track_bboxes' (or the legacy 'track_results'
            key). The first element of that entry is iterated; each track
            is read as (track_id, *bbox).
        max_track_id (int): Largest track id seen so far.

    Returns:
        list: One dict per track with keys 'track_id' and 'bbox', sorted
            by ascending track id.
        int: ``max_track_id`` updated with the ids found in this result.
        int: Number of tracked instances in this result.

    Raises:
        KeyError: If neither 'track_bboxes' nor 'track_results' is present.
    """
    # 'track_results' is changed to 'track_bboxes'
    # in https://github.com/open-mmlab/mmtracking/pull/300
    for key in ('track_bboxes', 'track_results'):
        if key in mmtracking_results:
            tracking_results = mmtracking_results[key][0]
            break
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError that hid the real cause.
        raise KeyError(
            "mmtracking_results must contain 'track_bboxes' or "
            "'track_results'.")

    person_results = []
    for track in tracking_results:
        track_id = int(track[0])
        max_track_id = max(max_track_id, track_id)
        person_results.append({'track_id': track_id, 'bbox': track[1:]})

    person_results.sort(key=lambda person: person['track_id'])
    instance_num = len(tracking_results)
    return person_results, max_track_id, instance_num
def process_mmdet_results(mmdet_results, cat_id=1):
    """Pick one category out of mmdet results and wrap each box in a dict.

    Args:
        mmdet_results (list|tuple): mmdet results. When a tuple is given,
            only its first element is used.
        cat_id (int): 1-based category id (default: 1 for human).

    Returns:
        person_results (list): One ``{'bbox': bbox}`` dict per detected
            bounding box of the requested category.
    """
    det_results = (
        mmdet_results[0] if isinstance(mmdet_results, tuple) else mmdet_results)
    # Category ids are 1-based, the result list is 0-based.
    return [{'bbox': bbox} for bbox in det_results[cat_id - 1]]
def prepare_frames(input_path=None):
    """Load frames from a single image, a folder of images or a video.

    Args:
        input_path (str, optional): Path to a '.png'/'.jpg' image, to a
            folder containing such images, or to an '.mp4' video.
            Extensions are matched case-insensitively. Defaults to None.

    Raises:
        ValueError: If the path is neither a file nor a folder, or if it
            is a file with an unsupported extension.
        AssertionError: If no image is found in the folder or the video
            cannot be opened.

    Returns:
        List[np.ndarray]: prepared frames
    """
    image_suffixes = ('.png', '.jpg')

    if Path(input_path).is_file():
        lowered = input_path.lower()
        if lowered.endswith('.mp4'):
            check_input_path(
                input_path=input_path,
                path_type='file',
                allowed_suffix=['.mp4'])
            video = mmcv.VideoReader(input_path)
            assert video.opened, f'Failed to load video file {input_path}'
            return list(video)
        if lowered.endswith(image_suffixes):
            frames = [mmcv.imread(input_path)]
            assert len(frames), f'Failed to load image from {input_path}'
            return frames
        raise ValueError('The input file should be an image or a video.'
                         f' Got invalid file: {input_path}')

    if Path(input_path).is_dir():
        # Sort the image files by path so the frame order is stable.
        image_paths = sorted(
            os.path.join(input_path, name)
            for name in os.listdir(input_path)
            if name.lower().endswith(image_suffixes))
        frames = [mmcv.imread(image_path) for image_path in image_paths]
        assert len(frames), f'Failed to load image from {input_path}'
        return frames

    raise ValueError('Input path should be an file or folder.'
                     f' Got invalid input path: {input_path}')
def extract_feature_sequence(extracted_results,
                             frame_idx,
                             causal,
                             seq_len,
                             step=1):
    """Build the window of results around a target frame, padded at edges.

    Frames are sampled every ``step`` frames around ``frame_idx``. Where
    the window would run past the start or the end of the video, it is
    padded by repeating the first or the last result respectively.

    Args:
        extracted_results (list): Per-frame feature extraction results,
            one element per frame of the video.
        frame_idx (int): The index of the target frame in the original
            video.
        causal (bool): If True, the target frame is the first frame in a
            sequence. Otherwise, the target frame is in the middle of a
            sequence.
        seq_len (int): The number of frames in the input sequence.
        step (int): Step size to extract frames from the video.

    Returns:
        list: The selected (and padded) per-frame results.
    """
    if causal:
        frames_left, frames_right = 0, seq_len - 1
    else:
        frames_left = frames_right = (seq_len - 1) // 2

    num_frames = len(extracted_results)
    frames_after = num_frames - 1 - frame_idx

    # How many sampled positions fall outside the video on each side.
    pad_left = max(0, frames_left - frame_idx // step)
    pad_right = max(0, frames_right - frames_after // step)

    # Slice bounds, kept on the same sampling grid as the target frame.
    start = max(frame_idx % step, frame_idx - frames_left * step)
    end = min(num_frames - frames_after % step,
              frame_idx + frames_right * step + 1)

    leading = [extracted_results[0]] * pad_left
    window = extracted_results[start:end:step]
    trailing = [extracted_results[-1]] * pad_right
    return leading + window + trailing
def get_different_colors(number_of_colors,
                         flag=0,
                         alpha: float = 1.0,
                         mode: str = 'bgr',
                         int_dtype: bool = True):
    """Get a numpy array of distinct colors of shape (N, len(mode)).

    Hues are spread evenly around the color wheel; lightness and
    saturation are jittered with a generator seeded by ``flag``, so the
    same arguments always give the same colors. The global NumPy random
    state is restored before returning.

    Args:
        number_of_colors (int): Number of colors N to generate. Zero
            yields an empty array.
        flag (int, optional): Seed for the lightness/saturation jitter.
            Defaults to 0.
        alpha (float, optional): Value written to the 'a' channel when
            ``mode`` requests one. Defaults to 1.0.
        mode (str, optional): Output channel order, any sequence of the
            letters 'r', 'g', 'b', 'a' (case-insensitive).
            Defaults to 'bgr'.
        int_dtype (bool, optional): If True the color channels are uint8
            in [0, 255], otherwise float32 in [0, 1]. Defaults to True.

    Returns:
        np.ndarray: Colors with one column per character in ``mode``.

    Raises:
        AssertionError: If ``mode`` contains other characters.
        ValueError: If ``number_of_colors`` is negative.
    """
    mode = mode.lower()
    assert set(mode).issubset({'r', 'g', 'b', 'a'})
    num_colors = int(number_of_colors)
    if num_colors < 0:
        raise ValueError(
            f'number_of_colors must be non-negative, got {number_of_colors}.')

    saved_state = np.random.get_state()
    try:
        np.random.seed(flag)
        colors = []
        # Iterate over an integer range rather than a float-step
        # np.arange, whose length can come out as N + 1 due to rounding.
        # Hues are still computed as (index * step) / 360.
        hue_step = 360. / num_colors if num_colors else 0.
        for idx in range(num_colors):
            hue = (idx * hue_step) / 360.
            lightness = (50 + np.random.rand() * 10) / 100.
            saturation = (90 + np.random.rand() * 10) / 100.
            colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
    finally:
        # Recover the random state even if something above failed.
        np.random.set_state(saved_state)

    # reshape keeps the (N, 3) layout for N == 0 as well.
    colors_np = np.asarray(colors, dtype=np.float64).reshape(-1, 3)
    if int_dtype:
        channels = (255 * colors_np).astype(np.uint8)
    else:
        channels = colors_np.astype(np.float32)

    # NOTE: colorsys yields (r, g, b), yet column 0 is labelled 'b' and
    # column 2 'r'. The labelling is kept as-is so existing callers keep
    # getting the same colors.
    color_dict = {
        'b': channels[:, 0:1],
        'g': channels[:, 1:2],
        'r': channels[:, 2:3],
    }
    if 'a' in mode:
        # One column per channel; the previous (N, 3) alpha block made a
        # 'bgra' result six columns wide.
        color_dict['a'] = np.ones((channels.shape[0], 1)) * alpha

    return np.concatenate([color_dict[channel] for channel in mode], -1)