Large Reward Models (LRMs)

Large Reward Models: Generalizable Online Robot Reward Generation with Vision-Language Models

Project Page | Paper

Authors: Yanru Wu, Weiduo Yuan, Ang Qi, Vitor Guizilini, Jiageng Mao†, Yue Wang†

Affiliations: USC Physical Superintelligence Lab, Toyota Research Institute

Overview

This repository contains three specialized Large Reward Models (LRMs) fine-tuned from Qwen3-VL-8B-Instruct for generating reward signals in robot reinforcement learning. Each model serves a distinct role in the reward pipeline:

| Model | Path | Description |
|---|---|---|
| Temporal Contrastive | `contrastive/` | Compares two observations to determine which is closer to task completion |
| Absolute Progress | `progress/` | Estimates the completion progress (0.0–1.0) from a single observation |
| Task Completion | `completion/` | Binary classifier for whether a task has been completed (yes/no) |

Usage

Requirements

pip install transformers torch pillow accelerate

Note: accelerate is required for loading with device_map="auto".

Temporal Contrastive Model

Given an initial observation and two later observations, predicts which is closer to task completion.

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image

model_path = "USC-PSI-Lab/LRM-models"
subfolder = "contrastive"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_path, subfolder=subfolder,
    torch_dtype=torch.bfloat16, device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    model_path, subfolder=subfolder,
)

# Load images
initial_img = Image.open("initial.jpg").convert("RGB")
image_a = Image.open("image_a.jpg").convert("RGB")
image_b = Image.open("image_b.jpg").convert("RGB")

messages = [{"role": "user", "content": [
    {"type": "text", "text": "Task: Compare the completion progress.\n\nThe task is: Pick up the cup.\n\nYou are given:\n- Initial observation: "},
    {"type": "image", "image": initial_img},
    {"type": "text", "text": "\n- Later observation (Image A): "},
    {"type": "image", "image": image_a},
    {"type": "text", "text": "\n- Later observation (Image B): "},
    {"type": "image", "image": image_b},
    {"type": "text", "text": '\n\nQuestion: Which of Image A or Image B is closer to completing the task?\nSelect one value from the following list:\n["ImageA", "ImageB"]\n\nPlease provide a step-by-step visual analysis first, and then output your answer in the following JSON format:\n{ "more_complete_image": "selected_value" }'},
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[initial_img, image_a, image_b], padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

response = processor.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(response)
# Output: { "more_complete_image": "ImageA" }
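The model emits a step-by-step visual analysis before the JSON answer, so the choice must be extracted from the surrounding text. A minimal sketch of that extraction (the helper name and regex are illustrative, not part of this repository):

```python
import json
import re

def parse_contrastive(response: str) -> str:
    """Extract the JSON answer from the response and return the chosen image label."""
    match = re.search(r'\{[^{}]*"more_complete_image"[^{}]*\}', response)
    if match is None:
        raise ValueError("No JSON answer found in model response")
    return json.loads(match.group(0))["more_complete_image"]

# Example: analysis text followed by the JSON answer
choice = parse_contrastive('The gripper in Image A is nearer the cup...\n{ "more_complete_image": "ImageA" }')
print(choice)  # ImageA
```

The returned label ("ImageA" or "ImageB") can then be converted into a preference signal for the two candidate observations.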

Absolute Progress Model

Estimates completion progress as a value between 0.0 and 1.0.

subfolder = "progress"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_path, subfolder=subfolder,
    torch_dtype=torch.bfloat16, device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    model_path, subfolder=subfolder,
)

observation = Image.open("observation.jpg").convert("RGB")

messages = [{"role": "user", "content": [
    {"type": "text", "text": "Task: Estimate the completion progress.\n\nThe task is: Pick up the cup.\n\nYou are given:\n- Current observation: "},
    {"type": "image", "image": observation},
    {"type": "text", "text": '\n\nEstimate the task completion progress from 0.0 (not started) to 1.0 (fully completed).\nOutput your answer in the following JSON format:\n{ "completion_progress": value }'},
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[observation], padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

response = processor.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(response)
# Output: { "completion_progress": 0.7 }
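To use the estimate as a dense reward, the JSON value can be parsed and clamped to the documented [0.0, 1.0] range. A minimal sketch (the helper name and regex are illustrative, not part of this repository):

```python
import json
import re

def parse_progress(response: str) -> float:
    """Extract completion_progress and clamp it to [0.0, 1.0]."""
    match = re.search(r'\{[^{}]*"completion_progress"[^{}]*\}', response)
    if match is None:
        raise ValueError("No JSON answer found in model response")
    value = float(json.loads(match.group(0))["completion_progress"])
    return min(max(value, 0.0), 1.0)

print(parse_progress('{ "completion_progress": 0.7 }'))  # 0.7
```

Clamping guards against occasional out-of-range generations before the value is fed into the reward pipeline.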

Task Completion Model

Binary prediction of whether a task has been completed.

subfolder = "completion"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_path, subfolder=subfolder,
    torch_dtype=torch.bfloat16, device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    model_path, subfolder=subfolder,
)

observation = Image.open("observation.jpg").convert("RGB")

messages = [{"role": "user", "content": [
    {"type": "text", "text": "Task: Determine task completion.\n\nThe task is: Pick up the cup.\n\nYou are given:\n- Current observation: "},
    {"type": "image", "image": observation},
    {"type": "text", "text": '\n\nHas the task been completed?\nOutput your answer in the following JSON format:\n{ "task_completed": "yes" or "no" }'},
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[observation], padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)

response = processor.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(response)
# Output: { "task_completed": "no" }
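The yes/no answer maps naturally to a sparse binary reward. A minimal sketch of that mapping (the helper name and regex are illustrative, not part of this repository):

```python
import json
import re

def parse_completion(response: str) -> float:
    """Map the yes/no answer to a binary reward: 1.0 on success, 0.0 otherwise."""
    match = re.search(r'\{[^{}]*"task_completed"[^{}]*\}', response)
    if match is None:
        raise ValueError("No JSON answer found in model response")
    answer = json.loads(match.group(0))["task_completed"].strip().lower()
    return 1.0 if answer == "yes" else 0.0

print(parse_completion('{ "task_completed": "no" }'))  # 0.0
```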

License

This project is licensed under the Apache 2.0 License.
