import dataclasses
import os
import re
from collections import defaultdict
from enum import auto, Enum
from typing import List, Tuple

from .constants import PART_ORDER, COCO_KEYPOINT_NAME
|
|
def read_hoi_file_2_dict(hoi_config):
    """Load an HOI list file into a ``{id: [object, action]}`` dictionary.

    Blank lines and lines beginning with ``#`` are ignored. Every other line
    is split on whitespace and must yield exactly three fields: an integer
    id, an object name and an action name.

    Args:
        hoi_config: Path of the UTF-8 text file to read.

    Returns:
        dict mapping the integer id to the two-element list
        ``[object, action]``.

    Raises:
        ValueError: If a line does not have exactly three fields or its
            first field is not an integer.
    """
    mapping = {}
    with open(hoi_config, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            # Only non-empty, non-comment rows carry data.
            if entry and not entry.startswith("#"):
                index, obj_name, verb = entry.split()
                mapping[int(index)] = [obj_name, verb]
    return mapping
|
|
def read_part_state_file_2_dict(part_state_config):
    """Group the ``key: value`` lines of a part-state file by key.

    Blank lines and lines beginning with ``#`` are ignored. Every other line
    is split at its first ``:``; both halves are stripped and the value is
    appended to the list stored under the key, preserving file order.

    Args:
        part_state_config: Path of the UTF-8 text file to read.

    Returns:
        ``defaultdict(list)`` mapping each key to its values in file order.

    Raises:
        ValueError: If a data line contains no ``:``.
    """
    grouped = defaultdict(list)
    with open(part_state_config, "r", encoding="utf-8") as handle:
        for entry in (raw.strip() for raw in handle):
            if entry == "" or entry.startswith("#"):
                continue
            # Split once so that any further ':' stays inside the value.
            part, state = (piece.strip() for piece in entry.split(":", 1))
            grouped[part].append(state)
    return grouped
|
|
@dataclasses.dataclass
class Conversation:
    """Prompt builder asking for body-part evidence that supports an HOI action.

    An instance carries the system prompt plus two lookup tables read from
    files under ``data_path``: ``hoi_reference`` (HOI id -> [object, action])
    and ``part_state_reference`` (part key -> list of state strings).
    """

    # NOTE(review): no annotated fields are declared, so the dataclass
    # decorator leaves this hand-written __init__ in place.
    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both lookup tables.

        Args:
            system: System prompt to use; the empty string selects the
                built-in default prompt.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system != '':
            self.system = system
        else:
            self.system = f"""
You are an AI assistant. You will be given an image that contains a main human subject.
Task:
Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

Hints:
You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

Required Constraints:
- Start with ONE sentence that summarizes the main action in natural language.
- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
- Write your description in clear, concise sentences grounded in visible evidence.

Optional Constraints :
- Write naturally. Avoid repeating the same sentence pattern.
- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
"""

        hoi_list_path = os.path.join(data_path, 'Configs/hico_hoi_list.txt')
        self.hoi_reference = read_hoi_file_2_dict(hoi_list_path)
        part_state_path = os.path.join(data_path, 'Configs/Part_State_76.txt')
        self.part_state_reference = read_part_state_file_2_dict(part_state_path)

    def _humanpart2word(self, action_labels):
        """Turn annotated part/state indices into ``[part name, state]`` pairs.

        Args:
            action_labels: Iterable of dicts with ``'human_part'`` (index
                into ``PART_ORDER``) and ``'partstate'`` (index into the
                state list of the matching part key).

        Returns:
            A list with one ``[part_name, part_state]`` entry for every
            part-state key that occurs as a substring of the part name.
        """
        pairs = []
        for label in action_labels:
            part_index = label['human_part']
            state_index = label['partstate']
            name = PART_ORDER[part_index]
            # Keys are matched by substring against the PART_ORDER name.
            for key, states in self.part_state_reference.items():
                if key in name:
                    pairs.append([name, states[state_index]])
        return pairs

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Dict whose ``'hoi_obj'`` entry holds ``'hoi_id'`` and
                ``'action_labels'``.

        Returns:
            The prompt text, naming the action/object pair and listing the
            part/state cues as hints.
        """
        hoi_obj = meta['hoi_obj']
        obj_in_word, act_in_word = self._actionid2word(hoi_obj['hoi_id'])
        action_labels_in_words = self._humanpart2word(hoi_obj['action_labels'])

        return f"""
Given the image, describe the visual evidence (especially body parts) that supports the action.
Hints: The action to support is [{act_in_word} with {obj_in_word}]. Possible visual evidence cues include: {action_labels_in_words}.
Use these cues as guidance. Only mention cues you can actually see in the image.
"""
| |
@dataclasses.dataclass
class Conversation_For_Clean_Descrption:
    """Prompt builder for verifying and condensing a candidate description.

    An instance carries the system prompt plus two lookup tables read from
    files under ``data_path``: ``hoi_reference`` (HOI id -> [object, action])
    and ``part_state_reference`` (part key -> list of state strings).
    """

    # Whole-word substitutions applied to descriptions and part names.
    _PART_NAME_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # Compiled once for the class rather than on every call.
    # NOTE(review): ``\b`` treats ``_`` as a word character, so a token such
    # as "left_hand" is left unchanged -- confirm that this is intended.
    _PART_NAME_PATTERN = re.compile(
        r"\b(" + "|".join(map(re.escape, _PART_NAME_REPLACEMENTS)) + r")\b",
        re.IGNORECASE,
    )

    # NOTE(review): no annotated fields are declared, so the dataclass
    # decorator leaves this hand-written __init__ in place.
    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both lookup tables.

        Args:
            system: System prompt to use; the empty string selects the
                built-in default prompt.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = """
You are a strict verifier and editor for pose-grounded action descriptions.

You will be given:
- Ground-truth action label(s) (GT).
- A candidate description (may be verbose or include irrelevant evidence).
- A closed list of allowed keypoint/body-part names.
- A replacement mapping (e.g., hand→wrist, foot→ankle).

Rules:
1) First, check whether the candidate’s stated action matches the GT action(s).
2) Then rewrite the description into exactly 2–3 sentences:
- The first sentence must state the GT action (not the candidate action if it differs).
- Keep only evidence that supports the GT action; delete unrelated evidence.
- If a joint is mentioned both sides ALWAYS write as "left_wrist and right_wrist", "left_hip and right_hip", "left_ankle and right_ankle", etc.
- When mentioning body parts/keypoints, you MUST use only names from the allowed list (exact match).
- Apply the replacement mapping strictly; never output disallowed synonyms like “hand/foot” if they map to allowed names.
- Do not add new evidence; only keep/condense evidence already present in the candidate.
- A MUST-KEEP hint: required (joint, part_action) items that must appear in the final description (joint names may need replacement).

Output format (plain text only): The refined 2–3 sentence description.
No other text.
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Return ``text`` with hand/foot words swapped for wrist/ankle words.

        Matching is case-insensitive and whole-word. When the matched word
        starts with an upper-case letter the replacement is capitalized;
        otherwise it is lower-case.
        """
        replacements = self._PART_NAME_REPLACEMENTS

        def _sub(match):
            word = match.group(0)
            swapped = replacements[word.lower()]
            # Only the case of the first letter is carried over.
            return swapped.capitalize() if word[0].isupper() else swapped

        return self._PART_NAME_PATTERN.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Turn annotated part/state indices into ``[part name, state]`` pairs.

        Args:
            action_labels: Iterable of dicts with ``'human_part'`` (index
                into ``PART_ORDER``) and ``'partstate'`` (index into the
                state list of the matching part key).

        Returns:
            A list of ``[part_name, part_state]`` entries, where the part
            name has been passed through ``_replace_part_names``. A part
            whose name contains no part-state key contributes nothing.

        Raises:
            IndexError: If ``'partstate'`` is out of range for the matched
                state list.
        """
        pairs = []
        for label in action_labels:
            part_index = label['human_part']
            state_index = label['partstate']
            part_name = PART_ORDER[part_index]
            # Keys are matched against the untouched PART_ORDER name; the
            # rewritten name is used for output only, so a replacement can
            # never influence which keys match.
            # presumably each name contains exactly one key -- verify
            # against the constants module.
            shown_name = self._replace_part_names(part_name)
            for key, states in self.part_state_reference.items():
                if key in part_name:
                    pairs.append([shown_name, states[state_index]])
        return pairs

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Dict providing ``'hoi_id'``, ``'action_labels'`` and the
                candidate text under ``'description'``.

        Returns:
            The prompt text containing the action pair, the allowed keypoint
            names, the candidate description (after hand/foot replacement)
            and the must-keep part/state pairs.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        action_labels_in_words = self._humanpart2word(meta['action_labels'])
        description = self._replace_part_names(meta['description'])

        # The action pair is rendered as a tuple repr, e.g. ('ride', 'bicycle').
        return f"""
GT action(s): {(act_in_word, obj_in_word)}
Allowed keypoint names:
{COCO_KEYPOINT_NAME}
Replacement mapping:
"hand" to "wrist", "foot" to "ankle"
Candidate description:
{description}
Must-KEEP Hint:
{action_labels_in_words}
Please follow the system rules and output in the required plain-text format.
"""
|
|
@dataclasses.dataclass
class Conversation_For_Clean_Evidence:
    """Prompt builder for rewriting a description into evidence-only text.

    An instance carries the system prompt plus two lookup tables read from
    files under ``data_path``: ``hoi_reference`` (HOI id -> [object, action])
    and ``part_state_reference`` (part key -> list of state strings).
    """

    # Whole-word substitutions applied to descriptions and part names.
    _PART_NAME_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # Compiled once for the class rather than on every call.
    # NOTE(review): ``\b`` treats ``_`` as a word character, so a token such
    # as "left_hand" is left unchanged -- confirm that this is intended.
    _PART_NAME_PATTERN = re.compile(
        r"\b(" + "|".join(map(re.escape, _PART_NAME_REPLACEMENTS)) + r")\b",
        re.IGNORECASE,
    )

    # NOTE(review): no annotated fields are declared, so the dataclass
    # decorator leaves this hand-written __init__ in place.
    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both lookup tables.

        Args:
            system: System prompt to use; the empty string selects the
                built-in default prompt.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = """
You rewrite descriptions into NATURAL LANGUAGE evidence-only text.

Output rules:
- Write 2–3 complete sentences in natural English.
- Do NOT mention the action or the subject (no "person", "he", "she", "they", "main", etc.).
- Only describe evidence involving body parts/keypoints and part-level motions/contacts.
- Every sentence must include at least one keypoint name from the allowed list (exact match).
- Only use keypoint names from the allowed list; no other body-part words.
- Never use generic joints (e.g., "wrist", "hip", "ankle") alone; If both sides are mentioned, use "left_wrist and right_wrist", "left_hip and right_hip", etc.
- Apply the replacement mapping first (hand→wrist, foot→ankle, etc.), then enforce left/right by writing both sides.
- Keep only evidence supported by the candidate; do not add new details.

Style variety requirement:
- Write like a reasoning use normal grammar, not lists, not "keypoint: ...".
- Do not use the same starter phrase or the same connector in both sentences.
- Example reasoning patterns (Can invent your own, but use different pattern):
A) “With <keypoints/evidence>, <interpretation>.” (no “suggesting/indicating”)
B) “<Interpretation>; evidence includes <keypoints/evidence>.” (semicolon style)
C) “This is supported by <keypoints/evidence>, which <effect/constraint>.” (“supported by” style)
D) “Notably, <keypoints/evidence>; this points to <interpretation>.” (“notably/points to” style)
E) “<Keypoints/evidence> form(s) <configuration>, consistent with <interpretation>.” (“configuration” style)
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Return ``text`` with hand/foot words swapped for wrist/ankle words.

        Matching is case-insensitive and whole-word. When the matched word
        starts with an upper-case letter the replacement is capitalized;
        otherwise it is lower-case.
        """
        replacements = self._PART_NAME_REPLACEMENTS

        def _sub(match):
            word = match.group(0)
            swapped = replacements[word.lower()]
            # Only the case of the first letter is carried over.
            return swapped.capitalize() if word[0].isupper() else swapped

        return self._PART_NAME_PATTERN.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Turn annotated part/state indices into ``[part name, state]`` pairs.

        Args:
            action_labels: Iterable of dicts with ``'human_part'`` (index
                into ``PART_ORDER``) and ``'partstate'`` (index into the
                state list of the matching part key).

        Returns:
            A list of ``[part_name, part_state]`` entries, where the part
            name has been passed through ``_replace_part_names``. A part
            whose name contains no part-state key contributes nothing.

        Raises:
            IndexError: If ``'partstate'`` is out of range for the matched
                state list.
        """
        pairs = []
        for label in action_labels:
            part_index = label['human_part']
            state_index = label['partstate']
            part_name = PART_ORDER[part_index]
            # Keys are matched against the untouched PART_ORDER name; the
            # rewritten name is used for output only, so a replacement can
            # never influence which keys match.
            # presumably each name contains exactly one key -- verify
            # against the constants module.
            shown_name = self._replace_part_names(part_name)
            for key, states in self.part_state_reference.items():
                if key in part_name:
                    pairs.append([shown_name, states[state_index]])
        return pairs

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Args:
            meta: Dict providing ``'hoi_id'``, ``'action_labels'`` and the
                candidate text under ``'short_description'``.

        Returns:
            The prompt text containing the action pair, the allowed keypoint
            names, the candidate description (after hand/foot replacement)
            and the must-keep part/state pairs.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])
        action_labels_in_words = self._humanpart2word(meta['action_labels'])
        description = self._replace_part_names(meta['short_description'])

        # The action pair is rendered as a tuple repr, e.g. ('ride', 'bicycle').
        return f"""
GT action(s): {(act_in_word, obj_in_word)}
Allowed keypoint names:
{COCO_KEYPOINT_NAME}
Replacement mapping:
"hand" to "wrist", "foot" to "ankle"
Candidate description:
{description}
Must-KEEP Hint:
{action_labels_in_words}
Please follow the system rules and output in the required plain-text format.
"""
|
|
@dataclasses.dataclass
class Conversation_For_Action_Pharse:
    """Prompt builder asking for a one-sentence caption of a hinted action.

    An instance carries the system prompt plus two lookup tables read from
    files under ``data_path``: ``hoi_reference`` (HOI id -> [object, action])
    and ``part_state_reference`` (part key -> list of state strings).
    """

    # Whole-word substitutions applied to free text and part names.
    _PART_NAME_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # Compiled once for the class rather than on every call.
    # NOTE(review): ``\b`` treats ``_`` as a word character, so a token such
    # as "left_hand" is left unchanged -- confirm that this is intended.
    _PART_NAME_PATTERN = re.compile(
        r"\b(" + "|".join(map(re.escape, _PART_NAME_REPLACEMENTS)) + r")\b",
        re.IGNORECASE,
    )

    # NOTE(review): no annotated fields are declared, so the dataclass
    # decorator leaves this hand-written __init__ in place.
    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both lookup tables.

        Args:
            system: System prompt to use; the empty string selects the
                built-in default prompt.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = """
You are a visual captioning assistant.
Given an image and an action hint in the form [VERB, OBJECT], output exactly one short English sentence describing that action in the image.

Rules:
• Use only the provided VERB and OBJECT (you may adjust grammar: holds/holding; a/the; plural if needed).
• Output one sentence only.
• No extra details (no location, colors, emotions, reasons, scene context).
• No punctuation beyond the final period.
• If the subject is a person, use “The person” (not “man/woman/boy/girl”).
• If the action is not visible, still output a best-effort sentence using the hint.
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Return ``text`` with hand/foot words swapped for wrist/ankle words.

        Matching is case-insensitive and whole-word. When the matched word
        starts with an upper-case letter the replacement is capitalized;
        otherwise it is lower-case.
        """
        replacements = self._PART_NAME_REPLACEMENTS

        def _sub(match):
            word = match.group(0)
            swapped = replacements[word.lower()]
            # Only the case of the first letter is carried over.
            return swapped.capitalize() if word[0].isupper() else swapped

        return self._PART_NAME_PATTERN.sub(_sub, text)

    def _humanpart2word(self, action_labels):
        """Turn annotated part/state indices into ``[part name, state]`` pairs.

        Args:
            action_labels: Iterable of dicts with ``'human_part'`` (index
                into ``PART_ORDER``) and ``'partstate'`` (index into the
                state list of the matching part key).

        Returns:
            A list of ``[part_name, part_state]`` entries, where the part
            name has been passed through ``_replace_part_names``. A part
            whose name contains no part-state key contributes nothing.

        Raises:
            IndexError: If ``'partstate'`` is out of range for the matched
                state list.
        """
        pairs = []
        for label in action_labels:
            part_index = label['human_part']
            state_index = label['partstate']
            part_name = PART_ORDER[part_index]
            # Keys are matched against the untouched PART_ORDER name; the
            # rewritten name is used for output only, so a replacement can
            # never influence which keys match.
            # presumably each name contains exactly one key -- verify
            # against the constants module.
            shown_name = self._replace_part_names(part_name)
            for key, states in self.part_state_reference.items():
                if key in part_name:
                    pairs.append([shown_name, states[state_index]])
        return pairs

    def _actionid2word(self, hoi_id):
        """Return the ``(object, action)`` words stored for ``hoi_id``."""
        obj, act = self.hoi_reference[hoi_id]
        return obj, act

    def get_prompt(self, meta):
        """Build the user prompt for one sample.

        Only the action/object pair appears in the prompt, so only
        ``meta['hoi_id']`` is read; part/state labels and descriptions are
        not needed here.

        Args:
            meta: Dict providing ``'hoi_id'``.

        Returns:
            The prompt text carrying the action pair as a hint.
        """
        obj_in_word, act_in_word = self._actionid2word(meta['hoi_id'])

        # The action pair is rendered as a tuple repr, e.g. ('ride', 'bicycle').
        return f"""
Hints: {(act_in_word, obj_in_word)}
Write exactly one short sentence that follows the rules.
"""
| |
@dataclasses.dataclass
class Conversation_For_COCO_Long_Description:
    """Prompt builder asking for body-part evidence without per-sample hints.

    An instance carries the system prompt plus two lookup tables read from
    files under ``data_path``: ``hoi_reference`` (HOI id -> [object, action])
    and ``part_state_reference`` (part key -> list of state strings). Only
    the part-state table is used when building the prompt.
    """

    # Whole-word substitutions applied by ``_replace_part_names``.
    _PART_NAME_REPLACEMENTS = {
        "hand": "wrist",
        "hands": "wrists",
        "foot": "ankle",
        "feet": "ankles",
    }
    # Compiled once for the class rather than on every call.
    # NOTE(review): ``\b`` treats ``_`` as a word character, so a token such
    # as "left_hand" is left unchanged -- confirm that this is intended.
    _PART_NAME_PATTERN = re.compile(
        r"\b(" + "|".join(map(re.escape, _PART_NAME_REPLACEMENTS)) + r")\b",
        re.IGNORECASE,
    )

    # NOTE(review): no annotated fields are declared, so the dataclass
    # decorator leaves this hand-written __init__ in place.
    def __init__(self, system='', data_path=''):
        """Set the system prompt and load both lookup tables.

        Args:
            system: System prompt to use; the empty string selects the
                built-in default prompt.
            data_path: Directory that contains the ``Configs`` folder.
        """
        super().__init__()
        if system == '':
            self.system = f"""
You are an AI assistant. You will be given an image that contains a main human subject.
Task:
Describe the visual evidence in the image that supports the subject’s action, with an emphasis on human body parts and their interactions with objects.

Hints:
You may be given hints about (1) the action and (2) related objects and possible supporting body parts. You can use these hints, but you may also add other relevant evidence you observe.

Required Constraints:
- Start with ONE sentence that summarizes the main action in natural language.
- When you mention any keypoint or body part, you MUST use names ONLY from: {COCO_KEYPOINT_NAME}.
- Do NOT invent body-part names outside these sets (no synonyms, no paraphrases).
- If you are unsure which name applies, either omit the body-part mention or choose the closest valid name from the lists.
- Write your description in clear, concise sentences grounded in visible evidence.

Optional Constraints :
- Write naturally. Avoid repeating the same sentence pattern.
- Keep each evidence item to one line. No redundant "both left/right do the same" unless necessary.
"""
        else:
            self.system = system

        self.hoi_reference = read_hoi_file_2_dict(os.path.join(data_path, 'Configs/hico_hoi_list.txt'))
        self.part_state_reference = read_part_state_file_2_dict(os.path.join(data_path, 'Configs/Part_State_76.txt'))

    def _replace_part_names(self, text):
        """Return ``text`` with hand/foot words swapped for wrist/ankle words.

        Matching is case-insensitive and whole-word. When the matched word
        starts with an upper-case letter the replacement is capitalized;
        otherwise it is lower-case.
        """
        replacements = self._PART_NAME_REPLACEMENTS

        def _sub(match):
            word = match.group(0)
            swapped = replacements[word.lower()]
            # Only the case of the first letter is carried over.
            return swapped.capitalize() if word[0].isupper() else swapped

        return self._PART_NAME_PATTERN.sub(_sub, text)

    def get_prompt(self, meta):
        """Build the user prompt, listing every known part state as a hint.

        Args:
            meta: Accepted for signature parity with the other prompt
                builders in this module; it is not read.

        Returns:
            The prompt text with the part-state table embedded.
        """
        # Render the table as a plain dict so the prompt does not carry the
        # "defaultdict(<class 'list'>, ...)" wrapper of the loader's result.
        part_states = dict(self.part_state_reference)

        return f"""
Hint: you may consider use the actions in the below dictionary {part_states}
Given the image, describe the visual evidence (especially body parts) that supports the action.
"""
|
|
|
|
# No script behaviour is defined: executing this module directly is a no-op.
if __name__ == "__main__":
    pass