| import transformers |
| from torch.utils.data import Dataset |
| from torchvision import transforms |
| import torch |
| import logging |
| import random |
| from typing import Dict |
| import os |
| import numpy as np |
| from pycocotools.coco import COCO |
| import cv2 |
|
|
from .constants import COCO_KEYPOINT_NAME, KeypointLocationDescription, KeypointLocationQuestion


# Special tokens and prompt markers used when building pose prompts.
# (Defined once; the original file declared the first seven twice.)
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
PREFIX_IMAGE = "Image: "
PREFIX_NO_IMAGE = "Image: N/A"
BEGIN_DESCRIPTION = "<des>"
END_DESCRIPTION = "</des>"
IGNORE_INDEX = -100  # label value ignored by the loss computation
DEFAULT_EOS_TOKEN = "</s>"
BEGIN_OPTIONS = "<opt>"
END_OPTIONS = "</opt>"
BEGIN_LOC = "<loc>"
END_LOC = "</loc>"
BEGIN_QUESTION = "<qes>"
END_QUESTION = "</qes>"
|
|
class PoseCOCODataset(Dataset):
    """COCO keypoint dataset for supervised fine-tuning.

    Each sample is one person instance: its image is loaded, warped to a
    fixed square crop centered on the person box, and the 17 COCO keypoints
    are mapped into crop coordinates.
    """

    def __init__(self, data_path: str,
                 multimodal_cfg: dict,
                 is_train=True,
                 is_RL=False
                 ):
        """Index all usable person instances from a COCO annotation file.

        Args:
            data_path: path to a COCO-format keypoint annotation JSON.
            multimodal_cfg: dict providing at least 'image_size' (side of the
                square crop) and 'image_folder' (root directory of images).
            is_train: stored flag; augmentation is still gated by
                self.data_aug, which is hard-coded to False below.
            is_RL: stored flag; not otherwise used by this class.
        """
        super(PoseCOCODataset, self).__init__()
        logging.warning("Loading data...")
        self.size = multimodal_cfg['image_size']
        self.aspect_ratio = 1.0  # target w/h ratio enforced on person boxes
        self.pixel_std = 200     # HRNet-style scale normalization constant
        self.num_joints = 17     # COCO keypoint count

        coco = COCO(data_path)
        list_data_dict = []
        instance_id = 0
        for index in coco.getImgIds():
            im_ann = coco.loadImgs(index)[0]
            width = im_ann['width']
            height = im_ann['height']
            ann_ids = coco.getAnnIds(imgIds=index, iscrowd=False)
            objs = coco.loadAnns(ann_ids)

            # Clip each bbox to the image bounds and drop degenerate boxes.
            valid_objs = []
            for obj in objs:
                x, y, w, h = obj['bbox']
                x1 = np.max((0, x))
                y1 = np.max((0, y))
                x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
                y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
                if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
                    obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
                    valid_objs.append(obj)

            for obj in valid_objs:
                # Keep only person instances (category 1) that carry at
                # least one labelled keypoint.
                if obj['category_id'] != 1:
                    continue
                if max(obj['keypoints']) == 0:
                    continue

                joints_3d = np.zeros((self.num_joints, 3), dtype=np.float32)
                joints_3d_vis = np.zeros((self.num_joints, 3), dtype=np.float32)
                for ipt in range(self.num_joints):
                    joints_3d[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
                    joints_3d[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
                    # COCO visibility flag: 0 unlabelled, 1 occluded,
                    # 2 visible. Collapse to a 0/1 mask for x and y.
                    t_vis = min(obj['keypoints'][ipt * 3 + 2], 1)
                    joints_3d_vis[ipt, 0] = t_vis
                    joints_3d_vis[ipt, 1] = t_vis

                center, scale = self._box2cs(obj['clean_bbox'][:4])
                list_data_dict.append({
                    'file_name': im_ann['file_name'],
                    'image_id': index,
                    'center': center,
                    'scale': scale,
                    'joints_3d': joints_3d,
                    'joints_3d_vis': joints_3d_vis,
                    'instance_id': instance_id,
                    'human_bbox': obj['clean_bbox']
                })
                instance_id += 1

        logging.warning("The number of training samples is {}".format(len(list_data_dict)))
        logging.warning("Formatting inputs...Skip in lazy mode")
        self.list_data_dict = list_data_dict
        self.multimodal_cfg = multimodal_cfg

        self.data_aug = False
        self.is_train = is_train
        self.is_RL = is_RL

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i):
        return self._parse_data_item_val(i)

    def _parse_data_item_val(self, i):
        """Assemble the result dict for instance index `i`."""
        sources = self.list_data_dict[i]
        image, joints, joints_vis, c, s, file_name, image_size = \
            self._get_pose_item(sources)
        return {
            'image': image,
            'image_id': sources['image_id'],
            'c': c,
            's': s,
            'joints': joints,
            'joints_vis': joints_vis,
            'file_name': file_name,
            'human_bbox': sources['human_bbox'],
            'image_size': image_size,
        }

    def _get_pose_item(self, sources):
        """Load, optionally augment, and crop one person instance.

        Returns:
            (image, joints, joints_vis, center, scale, file_name, [h, w])
            where `image` is the warped square RGB crop, joints are in crop
            coordinates, and [h, w] is the original image size.
        """
        file_name = sources['file_name']
        image_folder = self.multimodal_cfg['image_folder']
        image_file = os.path.join(image_folder, file_name)
        image = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
        )
        if image is None:
            # cv2.imread silently returns None on failure; fail loudly here
            # instead of crashing inside cvtColor with a cryptic error.
            raise FileNotFoundError("Cannot read image: {}".format(image_file))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img_h, img_w = image.shape[:2]

        # Copy so the in-place flip / affine transforms below never mutate
        # the cached annotation arrays: the previous version transformed
        # sources['joints_3d'] (and 'center') in place, corrupting labels on
        # repeated __getitem__ calls.
        joints = sources['joints_3d'].copy()
        joints_vis = sources['joints_3d_vis'].copy()
        c = sources['center'].copy()
        s = sources['scale'].copy()
        r = 0

        if self.data_aug:
            sf = 0.3  # scale jitter factor
            rf = 40   # max rotation in degrees
            s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
            r = random.uniform(-rf, rf) if random.random() <= 0.5 else 0

            # Left/right keypoint index pairs swapped on horizontal flip.
            flip_pairs = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
                          [11, 12], [13, 14], [15, 16]]
            if random.random() <= 0.5:
                image = image[:, ::-1, :]
                joints, joints_vis = fliplr_joints(
                    joints, joints_vis, image.shape[1], flip_pairs)
                c[0] = image.shape[1] - c[0] - 1

        trans = get_affine_transform(c, s, r, (int(self.size), int(self.size)))
        image = cv2.warpAffine(
            image,
            trans,
            (int(self.size), int(self.size)),
            flags=cv2.INTER_LINEAR)

        # Map each labelled joint into crop coordinates.
        for j in range(self.num_joints):
            if joints_vis[j, 0] > 0.0:
                joints[j, 0:2] = affine_transform(joints[j, 0:2], trans)

        return image, joints, joints_vis, c, s, file_name, [img_h, img_w]

    def _box2cs(self, box):
        """Convert an [x, y, w, h] box to a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Compute the box center and its scale in units of `pixel_std`
        pixels, after padding the box to the configured aspect ratio."""
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5

        # Grow the shorter side so the box matches self.aspect_ratio.
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array(
            [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
            dtype=np.float32)
        return center, scale
|
|
def fliplr_joints(joints, joints_vis, width, matched_parts):
    """Horizontally flip joint coordinates and swap left/right pairs.

    Operates in place on `joints` / `joints_vis`; returns the flipped
    joints with invisible entries zeroed out, plus the visibility array.
    """
    # Mirror x coordinates about the vertical image axis.
    joints[:, 0] = width - joints[:, 0] - 1

    # Exchange each left/right joint pair (e.g. left eye <-> right eye).
    for left, right in matched_parts:
        joints[left, :], joints[right, :] = \
            joints[right, :], joints[left, :].copy()
        joints_vis[left, :], joints_vis[right, :] = \
            joints_vis[right, :], joints_vis[left, :].copy()

    # Zero out coordinates of joints that are not visible/labelled.
    return joints * joints_vis, joints_vis
|
|
def transform_preds(coords, center, scale, output_size):
    """Map predicted patch coordinates back into original-image space."""
    # Invert the crop transform, then apply it point by point.
    inv_trans = get_affine_transform(center, scale, 0, output_size, inv=1)
    mapped = np.zeros(coords.shape)
    for idx in range(coords.shape[0]):
        mapped[idx, 0:2] = affine_transform(coords[idx, 0:2], inv_trans)
    return mapped
|
|
def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    """Compute the 2x3 affine matrix mapping a scaled, rotated crop around
    `center` onto an `output_size` patch.

    Args:
        center: (x, y) crop center in source-image pixels.
        scale: box scale in units of 200 px (scalar, list, or 2-vector).
        rot: rotation in degrees.
        output_size: (width, height) of the destination patch.
        shift: relative shift of the crop, in units of the scaled box.
        inv: if truthy, return the inverse mapping (patch -> image).

    Returns:
        A 2x3 float matrix suitable for cv2.warpAffine.
    """
    if not isinstance(scale, (np.ndarray, list)):
        # Promote a scalar scale to an isotropic 2-vector.
        # (Removed a leftover debug print(scale) here.)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0  # normalized scale back to pixels
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    # Three corresponding point pairs fully determine the affine transform:
    # the center, a rotated "up" point, and a perpendicular third point.
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans
|
|
|
|
def affine_transform(pt, t):
    """Apply a 2x3 affine matrix `t` to a 2D point `pt`, returning (x, y)."""
    homogeneous = np.array([pt[0], pt[1], 1.0])
    return (t @ homogeneous)[:2]
|
|
|
|
def get_3rd_point(a, b):
    """Return the third point forming a right angle at `b` with segment b->a."""
    delta = a - b
    # Rotate the b->a vector by 90 degrees and offset it from b.
    return b + np.array([-delta[1], delta[0]], dtype=np.float32)
|
|
|
|
def get_dir(src_point, rot_rad):
    """Rotate the 2D point `src_point` counter-clockwise by `rot_rad` radians."""
    sn = np.sin(rot_rad)
    cs = np.cos(rot_rad)
    # Standard 2D rotation: [cos -sin; sin cos] * [x; y].
    return [src_point[0] * cs - src_point[1] * sn,
            src_point[0] * sn + src_point[1] * cs]
|
|