| import transformers |
| from torch.utils.data import Dataset |
| from torchvision import transforms |
| import torch |
| import logging |
| import random |
| from typing import Dict |
| import os |
| import numpy as np |
| from pycocotools.coco import COCO |
| import cv2 |
|
|
from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion


# Special tokens and prompt markers used when building pose prompts.
# (Defined once; the original file declared the first seven twice.)
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
PREFIX_IMAGE = "Image: "
PREFIX_NO_IMAGE = "Image: N/A"
BEGIN_DESCRIPTION = "<des>"
END_DESCRIPTION = "</des>"
IGNORE_INDEX = -100  # label value ignored by the loss computation
DEFAULT_EOS_TOKEN = "</s>"
BEGIN_OPTIONS = "<opt>"
END_OPTIONS = "</opt>"
BEGIN_LOC = "<loc>"
END_LOC = "</loc>"
BEGIN_QUESTION = "<qes>"
END_QUESTION = "</qes>"
|
|
class PoseCOCODataset(Dataset):
    """COCO keypoint dataset for supervised fine-tuning.

    Each sample is one person instance: its image is loaded, warped to a
    fixed square crop centered on the person box, and the 17 COCO keypoints
    are mapped into crop coordinates.
    """

    def __init__(self, data_path: str,
                 multimodal_cfg: dict,
                 is_train=True,
                 is_RL=False
                 ):
        """Index all usable person instances from a COCO annotation file.

        Args:
            data_path: path to a COCO-format keypoint annotation JSON.
            multimodal_cfg: dict providing at least 'image_size' (side of the
                square crop) and 'image_folder' (root directory of images).
            is_train: stored flag; augmentation is still gated by
                self.data_aug, which is hard-coded to False below.
            is_RL: stored flag; not otherwise used by this class.
        """
        super(PoseCOCODataset, self).__init__()
        logging.warning("Loading data...")
        self.size = multimodal_cfg['image_size']
        self.aspect_ratio = 1.0  # target w/h ratio enforced on person boxes
        self.pixel_std = 200     # HRNet-style scale normalization constant
        self.num_joints = 17     # COCO keypoint count

        coco = COCO(data_path)
        list_data_dict = []
        instance_id = 0
        for index in coco.getImgIds():
            im_ann = coco.loadImgs(index)[0]
            width = im_ann['width']
            height = im_ann['height']
            ann_ids = coco.getAnnIds(imgIds=index, iscrowd=False)
            objs = coco.loadAnns(ann_ids)

            # Clip each bbox to the image bounds and drop degenerate boxes.
            valid_objs = []
            for obj in objs:
                x, y, w, h = obj['bbox']
                x1 = np.max((0, x))
                y1 = np.max((0, y))
                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
                if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                    obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
                    valid_objs.append(obj)

            for obj in valid_objs:
                # Keep only person instances (category 1) that carry at
                # least one labelled keypoint.
                if obj['category_id'] != 1:
                    continue
                if max(obj['keypoints']) == 0:
                    continue

                joints_3d = np.zeros((self.num_joints, 3), dtype=np.float32)
                joints_3d_vis = np.zeros((self.num_joints, 3), dtype=np.float32)
                for ipt in range(self.num_joints):
                    joints_3d[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                    joints_3d[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
                    # COCO visibility flag: 0 unlabelled, 1 occluded,
                    # 2 visible. Collapse to a 0/1 mask for x and y.
                    t_vis = min(obj['keypoints'][ipt * 3 + 2], 1)
                    joints_3d_vis[ipt, 0] = t_vis
                    joints_3d_vis[ipt, 1] = t_vis

                center, scale = self._box2cs(obj['clean_bbox'][:4])
                list_data_dict.append({
                    'file_name': im_ann['file_name'],
                    'image_id': index,
                    'center': center,
                    'scale': scale,
                    'joints_3d': joints_3d,
                    'joints_3d_vis': joints_3d_vis,
                    'instance_id': instance_id,
                    'human_bbox': obj['clean_bbox']
                })
                instance_id += 1

        logging.warning("The number of training samples is {}".format(len(list_data_dict)))
        logging.warning("Formatting inputs...Skip in lazy mode")
        self.list_data_dict = list_data_dict
        self.multimodal_cfg = multimodal_cfg

        self.data_aug = False
        self.is_train = is_train
        self.is_RL = is_RL

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i):
        return self._parse_data_item_val(i)

    def _parse_data_item_val(self, i):
        """Assemble the result dict for instance index `i`."""
        sources = self.list_data_dict[i]
        image, joints, joints_vis, c, s, file_name, image_size = \
            self._get_pose_item(sources)
        return {
            'image': image,
            'image_id': sources['image_id'],
            'c': c,
            's': s,
            'joints': joints,
            'joints_vis': joints_vis,
            'file_name': file_name,
            'human_bbox': sources['human_bbox'],
            'image_size': image_size,
        }

    def _get_pose_item(self, sources):
        """Load, optionally augment, and crop one person instance.

        Returns:
            (image, joints, joints_vis, center, scale, file_name, [h, w])
            where `image` is the warped square RGB crop, joints are in crop
            coordinates, and [h, w] is the original image size.
        """
        file_name = sources['file_name']
        image_folder = self.multimodal_cfg['image_folder']
        image_file = os.path.join(image_folder, file_name)
        image = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )
        if image is None:
            # cv2.imread silently returns None on failure; fail loudly here
            # instead of crashing inside cvtColor with a cryptic error.
            raise FileNotFoundError("Cannot read image: {}".format(image_file))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img_h, img_w = image.shape[:2]

        # Copy so the in-place flip / affine transforms below never mutate
        # the cached annotation arrays: the previous version transformed
        # sources['joints_3d'] (and 'center') in place, corrupting labels on
        # repeated __getitem__ calls.
        joints = sources['joints_3d'].copy()
        joints_vis = sources['joints_3d_vis'].copy()
        c = sources['center'].copy()
        s = sources['scale'].copy()
        r = 0

        if self.data_aug:
            sf = 0.3  # scale jitter factor
            rf = 40   # max rotation in degrees
            s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
            r = random.uniform(-rf, rf) if random.random() <= 0.5 else 0

            # Left/right keypoint index pairs swapped on horizontal flip.
            flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
                          [11, 12], [13, 14], [15, 16]]
            if random.random() <= 0.5:
                image = image[:, ::-1, :]
                joints, joints_vis = fliplr_joints(
                    joints, joints_vis, image.shape[1], flip_pairs)
                c[0] = image.shape[1] - c[0] - 1

        trans = get_affine_transform(c, s, r, (int(self.size), int(self.size)))
        image = cv2.warpAffine(
            image,
            trans,
            (int(self.size), int(self.size)),
            flags=cv2.INTER_LINEAR)

        # Map each labelled joint into crop coordinates.
        for j in range(self.num_joints):
            if joints_vis[j, 0] > 0.0:
                joints[j, 0:2] = affine_transform(joints[j, 0:2], trans)

        return image, joints, joints_vis, c, s, file_name, [img_h, img_w]

    def _box2cs(self, box):
        """Convert an [x, y, w, h] box to a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Compute the box center and its scale in units of `pixel_std`
        pixels, after padding the box to the configured aspect ratio."""
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        # Grow the shorter side so the box matches self.aspect_ratio.
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)
        return center, scale
|
|
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """Horizontally flip joint coordinates and swap left/right pairs.

    Operates in place on `joints` / `joints_vis`; returns the flipped
    joints with invisible entries zeroed out, plus the visibility array.
    """
    # Mirror x coordinates about the vertical image axis.
    joints[:, 0] = width - joints[:, 0] - 1

    # Exchange each left/right joint pair (e.g. left eye <-> right eye).
    for left, right in matched_parts:
        joints[left, :], joints[right, :] = \
            joints[right, :], joints[left, :].copy()
        joints_vis[left, :], joints_vis[right, :] = \
            joints_vis[right, :], joints_vis[left, :].copy()

    # Zero out coordinates of joints that are not visible/labelled.
    return joints * joints_vis, joints_vis
|
|
def transform_preds(coords, center, scale, output_size):
    """Map predicted patch coordinates back into original-image space."""
    # Invert the crop transform, then apply it point by point.
    inv_trans = get_affine_transform(center, scale, 0, output_size, inv=1)
    mapped = np.zeros(coords.shape)
    for idx in range(coords.shape[0]):
        mapped[idx, 0:2] = affine_transform(coords[idx, 0:2], inv_trans)
    return mapped
|
|
def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    """Compute the 2x3 affine matrix mapping a scaled, rotated crop around
    `center` onto an `output_size` patch.

    Args:
        center: (x, y) crop center in source-image pixels.
        scale: box scale in units of 200 px (scalar, list, or 2-vector).
        rot: rotation in degrees.
        output_size: (width, height) of the destination patch.
        shift: relative shift of the crop, in units of the scaled box.
        inv: if truthy, return the inverse mapping (patch -> image).

    Returns:
        A 2x3 float matrix suitable for cv2.warpAffine.
    """
    if not isinstance(scale, (np.ndarray, list)):
        # Promote a scalar scale to an isotropic 2-vector.
        # (Removed a leftover debug print(scale) here.)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0  # normalized scale back to pixels
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    # Three corresponding point pairs fully determine the affine transform:
    # the center, a rotated "up" point, and a perpendicular third point.
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans
|
|
|
|
def affine_transform(pt, t):
    """Apply a 2x3 affine matrix `t` to a 2D point `pt`, returning (x, y)."""
    homogeneous = np.array([pt[0], pt[1], 1.0])
    return (t @ homogeneous)[:2]
|
|
|
|
def get_3rd_point(a, b):
    """Return the third point forming a right angle at `b` with segment b->a."""
    delta = a - b
    # Rotate the b->a vector by 90 degrees and offset it from b.
    return b + np.array([-delta[1], delta[0]], dtype=np.float32)
|
|
|
|
def get_dir(src_point, rot_rad):
    """Rotate the 2D point `src_point` counter-clockwise by `rot_rad` radians."""
    sn = np.sin(rot_rad)
    cs = np.cos(rot_rad)
    # Standard 2D rotation: [cos -sin; sin cos] * [x; y].
    return [src_point[0] * cs - src_point[1] * sn,
            src_point[0] * sn + src_point[1] * cs]
|
|