# Belle-VLM: Vietnamese Vision Language Model
## Model Description
Belle-VLM is a Vision Language Model trained for Vietnamese multimodal reasoning tasks.
## Architecture
- LLM Backbone: Qwen3-0.6B
- Vision Encoder: FastViTHD (MobileCLIP)
- Projector: 2-layer MLP (3072 -> 1024), sketched below
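The projector that bridges FastViTHD and Qwen3-0.6B is a plain 2-layer MLP. The sketch below is only a minimal approximation of that module (the real definition lives in the ml-fastvlm-v2 repo): the GELU activation is an assumption, while the 3072 -> 1024 dimensions come from the list above.

```python
import torch.nn as nn

# Minimal sketch of the 2-layer MLP projector described above.
# The GELU activation is assumed; the actual module is defined in ml-fastvlm-v2.
mm_projector = nn.Sequential(
    nn.Linear(3072, 1024),  # 3072-dim vision features -> 1024-dim LLM hidden size
    nn.GELU(),
    nn.Linear(1024, 1024),
)
```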
## Training
- Dataset: 5CD-AI/Viet-multimodal-open-r1-8k-verified (see the loading sketch after this list)
- Method: LoRA fine-tuning
- Steps: 2
- Learning Rate: 2e-05
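To take a look at the training data, the dataset can be pulled with the `datasets` library (installed in the Usage section below). The split and column names are not documented in this card, so the sketch prints the dataset structure rather than assuming a schema.

```python
from datasets import load_dataset

# Download the training dataset and print its splits/columns;
# check the dataset card before relying on specific field names.
ds = load_dataset("5CD-AI/Viet-multimodal-open-r1-8k-verified")
print(ds)
```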
## Usage
```python
# Install dependencies and clone the FastVLM code (Kaggle / Colab notebook cells)
!pip install -q "transformers>=4.51.0" torch torchvision timm pillow requests datasets einops
!pip install -q open-clip-torch
!git clone https://github.com/Hert4/ml-fastvlm-v2.git
%cd ml-fastvlm-v2
```
```python
import sys
import os
import torch
import requests
from PIL import Image
from io import BytesIO

# Add path to ml-fastvlm-v2
FASTVLM_PATH = "/kaggle/working/ml-fastvlm-v2"
if os.path.exists(FASTVLM_PATH):
    sys.path.insert(0, FASTVLM_PATH)
else:
    # Local path fallback
    sys.path.insert(0, ".")

MODEL_PATH = "beyoru/Belle-VLM"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
print(f"Device: {DEVICE}")
print(f"Dtype: {DTYPE}")

from transformers import AutoTokenizer
from llava.model.language_model.llava_qwen import LlavaQwen3ForCausalLM

print(f"\nLoading model from: {MODEL_PATH}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    use_fast=False,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = LlavaQwen3ForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

# Set up the vision tower
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower = vision_tower.to(device=model.device, dtype=DTYPE)
image_processor = vision_tower.image_processor
print("Model loaded!")


def debug_model():
    """Check whether the model components are properly loaded."""
    print("\n" + "=" * 60)
    print("DEBUG: Checking model components")
    print("=" * 60)

    # 1. Check mm_projector
    if hasattr(model.model, 'mm_projector'):
        mm_proj = model.model.mm_projector
        print(f"[OK] mm_projector exists: {type(mm_proj)}")
        # Check weights
        if isinstance(mm_proj, torch.nn.Sequential):
            first_layer = mm_proj[0]
            w = first_layer.weight.float()  # Convert to float for stats
            print(f"  First layer shape: {w.shape}")
            print(f"  Weight mean: {w.mean().item():.6f}")
            print(f"  Weight std: {w.std().item():.6f}")
            print(f"  Weight range: [{w.min().item():.4f}, {w.max().item():.4f}]")
            # Check whether the weights look trained
            if w.std().item() > 0.3:
                print("  [WARNING] Std too high - might be random init!")
            else:
                print("  [OK] Weights look trained")
    else:
        print("[ERROR] mm_projector NOT FOUND!")
        return False

    # 2. Check vision tower
    print(f"\n[OK] Vision tower: {type(vision_tower).__name__}")
    print(f"  Is loaded: {vision_tower.is_loaded}")
    print(f"  Hidden size: {getattr(vision_tower, 'hidden_size', 'unknown')}")

    # 3. Check config
    print(f"\n[INFO] Config:")
    print(f"  mm_vision_tower: {getattr(model.config, 'mm_vision_tower', 'NOT SET')}")
    print(f"  mm_hidden_size: {getattr(model.config, 'mm_hidden_size', 'NOT SET')}")
    print(f"  mm_projector_type: {getattr(model.config, 'mm_projector_type', 'NOT SET')}")
    print(f"  LLM hidden_size: {model.config.hidden_size}")
    return True


def debug_image_encoding(image):
    """Test the image encoding pipeline."""
    from llava.mm_utils import process_images

    print("\n" + "=" * 60)
    print("DEBUG: Testing image encoding pipeline")
    print("=" * 60)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]
    image_tensor = image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device)
    print(f"[1] Image tensor:")
    print(f"  Shape: {image_tensor.shape}")
    print(f"  Dtype: {image_tensor.dtype}")
    print(f"  Range: [{image_tensor.min().item():.3f}, {image_tensor.max().item():.3f}]")

    with torch.no_grad():
        # Get vision features
        vision_features = vision_tower(image_tensor)
        print(f"\n[2] Vision features (from vision tower):")
        print(f"  Shape: {vision_features.shape}")
        print(f"  Mean: {vision_features.float().mean().item():.6f}")
        print(f"  Std: {vision_features.float().std().item():.6f}")
        # Check for NaN/Inf
        if torch.isnan(vision_features).any():
            print("  [ERROR] Contains NaN!")
        if torch.isinf(vision_features).any():
            print("  [ERROR] Contains Inf!")

        # Project through mm_projector
        projected = model.model.mm_projector(vision_features)
        print(f"\n[3] Projected features (after mm_projector):")
        print(f"  Shape: {projected.shape}")
        print(f"  Mean: {projected.float().mean().item():.6f}")
        print(f"  Std: {projected.float().std().item():.6f}")
        # Check for NaN/Inf
        if torch.isnan(projected).any():
            print("  [ERROR] Contains NaN!")
        if torch.isinf(projected).any():
            print("  [ERROR] Contains Inf!")

        # Compare with the scale of the text embeddings
        sample_ids = torch.tensor([[1, 2, 3]], device=model.device)
        text_embeds = model.model.embed_tokens(sample_ids)
        print(f"\n[4] Text embeddings (for comparison):")
        print(f"  Mean: {text_embeds.float().mean().item():.6f}")
        print(f"  Std: {text_embeds.float().std().item():.6f}")

        # Check scale match
        proj_std = projected.float().std().item()
        text_std = text_embeds.float().std().item()
        ratio = proj_std / text_std if text_std > 0 else float('inf')
        print(f"\n[5] Scale ratio (projected/text): {ratio:.2f}")
        if ratio > 10 or ratio < 0.1:
            print("  [WARNING] Scale mismatch! May cause issues.")
        else:
            print("  [OK] Scales are similar")

    return projected


from llava.conversation import conv_templates
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN


def chat(image, question, temperature=0.7, max_new_tokens=512, debug=False):
    """
    Chat with the model about an image.

    Args:
        image: PIL Image
        question: str
        temperature: float (0.0 = deterministic, higher = more random)
        max_new_tokens: int
        debug: bool - print debug info

    Returns:
        str: Model response
    """
    if debug:
        debug_image_encoding(image)

    # Process image
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Build conversation
    conv = conv_templates["qwen_3"].copy()
    conv.append_message(conv.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{question}")
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    if debug:
        print(f"\n[DEBUG] Prompt:\n{prompt[:500]}...")

    # Tokenize
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    if debug:
        print(f"\n[DEBUG] Input IDs shape: {input_ids.shape}")
        # Check for IMAGE_TOKEN_INDEX
        num_image_tokens = (input_ids == IMAGE_TOKEN_INDEX).sum().item()
        print(f"[DEBUG] Number of image tokens: {num_image_tokens}")

    # Generate
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0).to(dtype=DTYPE, device=model.device),
            image_sizes=[image.size],
            do_sample=temperature > 0,
            temperature=temperature if temperature > 0 else None,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Clean up the response (remove the prompt echo if present)
    if "<|im_start|>" in response:
        # Find the assistant's response
        parts = response.split("<|im_start|>assistant")
        if len(parts) > 1:
            response = parts[-1].strip()
    return response


if __name__ == "__main__":
    # Run debug checks
    debug_model()

    # Load a test image
    print("\n" + "=" * 60)
    print("Loading test image...")
    print("=" * 60)
    url = "<your url here>"
    image = Image.open(BytesIO(requests.get(url).content)).convert("RGB")
    print(f"Image size: {image.size}")

    # Debug the image encoding
    debug_image_encoding(image)

    # Test chat
    print("\n" + "=" * 60)
    print("Testing chat...")
    print("=" * 60)
    questions = [
        "What is in this image?",
        "Mô tả hình ảnh này",        # "Describe this image"
        "Con vật trong ảnh là gì?",  # "What animal is in the picture?"
    ]
    for q in questions:
        print(f"\nQ: {q}")
        response = chat(image, q, temperature=0.3, max_new_tokens=256)
        print(f"A: {response[:500]}...")
        print("-" * 40)
```
## Training Details
| Parameter | Value |
|---|---|
| Base Model | Qwen/Qwen3-0.6B |
| Vision Tower | apple/MobileCLIP2-S4 |
| LoRA Rank | 4 |
| LoRA Alpha | 8 |
| Batch Size | 1 x 1 |
| Max Steps | 2 |
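For reference, the hyperparameters above roughly correspond to a `peft` LoRA configuration like the sketch below. The target modules are an assumption (this card does not list which layers were adapted), so treat it as illustrative rather than the exact training setup.

```python
from peft import LoraConfig

# Illustrative LoRA configuration matching the table above.
# target_modules is an assumption; the card does not say which layers were adapted.
lora_config = LoraConfig(
    r=4,             # LoRA rank
    lora_alpha=8,    # LoRA alpha
    lora_dropout=0.0,  # not reported in the card; assumed 0.0
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # assumed
    task_type="CAUSAL_LM",
)
```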
## Conclusion

The limited capacity of the MobileCLIP vision encoder and the very short training run (2 steps) may result in low output quality.
## Contribute
## License
Apache 2.0