Video-LLaVA-lnguage-bind

Paused

App Files Files Community

adarsh8962 committed on Aug 30

Commit

2d6c974

verified ·

1 Parent(s): f838b85

Update llava/serve/gradio_utils.py

Browse files

Files changed (1) hide show

llava/serve/gradio_utils.py +52 -23

llava/serve/gradio_utils.py CHANGED Viewed

@@ -7,23 +7,24 @@ from llava.model.builder import load_pretrained_model
 from llava.utils import disable_torch_init
 import re
 import torch
-# ---------- Stable generation defaults (stop bracket loops) ----------
 GEN_KW = dict(
-    do_sample=False,             # deterministic
     temperature=0.0,
     top_p=1.0,
-    repetition_penalty=1.15,     # break single-token loops like [[[[[
-    no_repeat_ngram_size=3,      # avoid short repeats
-    use_cache=False,             # lower VRAM on L4; fine on L40S too
 )
 def _big_gpu():
     try:
         return (torch.cuda.is_available()
-                and torch.cuda.get_device_properties(0).total_memory / 1024**3 >= 40)
     except Exception:
         return False
@@ -41,21 +42,22 @@ def build_framewise_prompt(T: int) -> str:
     )
 def keep_frame_lines(text: str, T: int) -> str:
-    """Keep only 'Frame i: ...' lines; ensure frames 1..T exist."""
     lines = []
     for ln in text.splitlines():
-        m = re.match(r"^Frame\s+(\d+)\s*:\s*(.+)$", ln.strip())
         if not m:
             continue
         i = int(m.group(1))
-        body = " ".join(m.group(2).split()[:10])  # ≤10 words
         if 1 <= i <= T:
-            lines.append((i, f"Frame {i}: {body}"))
     have = {i for i,_ in lines}
     for i in range(1, T+1):
         if i not in have:
-            lines.append((i, f"Frame {i}: (no description)"))
-    return "\n".join(t for _, t in sorted(lines))
 title_markdown = ("""
@@ -168,26 +170,53 @@ class Chat:
         # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         # print(input_ids, images_tensor[0][0].shape)
         with torch.inference_mode():
             output_ids = model.generate(
                 input_ids,
                 images=images_tensor,
-                do_sample=True,
-                temperature=temperature,
                 max_new_tokens=max_new_tokens,
-                # streamer=streamer,
-                use_cache=True,
-                stopping_criteria=[stopping_criteria])
         input_token_len = input_ids.shape[1]
         n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
         if n_diff_input_output > 0:
             print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
         outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
         outputs = outputs.strip()
-        if outputs.endswith(stop_str):
-            outputs = outputs[:-len(stop_str)]
-        outputs = outputs.strip()
-        print('response', outputs)
         return outputs, state

 from llava.utils import disable_torch_init
+# ==== memory-safe, de-hallucinating generation helpers ====
 import re
 import torch
+# deterministic + anti-repeat defaults
 # Keyword arguments splatted into ``model.generate(...)`` via ``**GEN_KW``.
 GEN_KW = {
     # Greedy decoding: no sampling, so repeated runs give the same text.
     "do_sample": False,
     # Sampling knobs kept at neutral values; presumably unused while
     # ``do_sample`` is False -- verify against the installed transformers.
     "temperature": 0.0,
     "top_p": 1.0,
     # Penalise already-generated tokens to break ``[[[`` style spam.
     "repetition_penalty": 1.15,
     # Forbid repeating any 3-gram, which avoids short loops.
     "no_repeat_ngram_size": 3,
     # Disable the KV cache to reduce VRAM spikes on an L4.
     "use_cache": False,
 }
 def _big_gpu() -> bool:
     """Return True when CUDA device 0 reports at least 40 GB of total memory.

     The threshold is expressed in decimal gigabytes (40 * 10**9 bytes), the
     unit GPU vendors use for marketing sizes. Comparing against 40 GiB
     (``1024**3``) misclassifies cards sold as "40 GB" -- a 40 GB A100 reports
     roughly 39.6 GiB -- even though such cards are meant to count as big.

     Returns:
         True for a large-memory GPU; False when CUDA is unavailable, the
         device reports less than 40 GB, or the query fails for any reason.
     """
     try:
         if not torch.cuda.is_available():
             return False
         total_bytes = torch.cuda.get_device_properties(0).total_memory
     except Exception:
         # Deliberate best effort: a failed probe means "treat as small GPU".
         return False
     return total_bytes >= 40 * 10**9
     )
 def keep_frame_lines(text: str, T: int) -> str:
     """Keep only ``Frame i: ...`` lines and make sure frames 1..T all appear.

     Every line of ``text`` is stripped and matched against
     ``Frame <number>: <description>``. Lines that do not match, or whose
     frame number falls outside ``1..T``, are dropped. Each kept description
     is cut to its first 10 whitespace-separated words. Frames in ``1..T``
     that never appeared get a ``(no description)`` placeholder.

     Args:
         text: Raw text to filter, scanned line by line.
         T: Number of frames the result must cover.

     Returns:
         The kept and placeholder lines joined with newlines, ordered by
         frame number (lines sharing a frame number are ordered by text).
         An empty string when ``T`` is less than 1.
     """
     lines = []
     for ln in text.splitlines():
         m = re.match(r"^Frame\s+(\d+)\s*:\s*(.+)$", ln.strip())
         if not m:
             continue
         i = int(m.group(1))
         body = " ".join(m.group(2).split()[:10])  # at most 10 words
         if 1 <= i <= T:
             lines.append((i, f"Frame {i}: {body}"))
     have = {i for i, _ in lines}
     # Never leave gaps: every missing frame gets an explicit placeholder.
     lines.extend(
         (i, f"Frame {i}: (no description)")
         for i in range(1, T + 1)
         if i not in have
     )
     return "\n".join(t for _, t in sorted(lines))
+# ==== end helpers ====
 title_markdown = ("""
         # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
         # print(input_ids, images_tensor[0][0].shape)
         with torch.inference_mode():
+            # infer how many frames actually went in (works for list-of-frames or tensors)
+            def _infer_T(imgs):
+                try:
+                    if isinstance(imgs, (list, tuple)) and len(imgs) > 0:
+                        first = imgs[0]
+                        if isinstance(first, (list, tuple)):
+                            return len(first)
+                        if hasattr(first, "shape"):
+                            shp = list(first.shape)
+                            if len(shp) >= 4:   # [T, C, H, W] or [1, T, C, H, W]
+                                return int(shp[0])
+                except Exception:
+                    pass
+                return 8  # safe default
+            _T = _infer_T(images_tensor)
+            # VRAM-aware cap: more frames → allow a few more tokens, but stay safe on L4
+            max_new_tokens = min(16 * max(1, _T), MAX_NEW_TOKENS_BIG if _big_gpu() else MAX_NEW_TOKENS_SMALL)
             output_ids = model.generate(
                 input_ids,
                 images=images_tensor,
                 max_new_tokens=max_new_tokens,
+                **GEN_KW,                              # <- deterministic + lower VRAM
+                stopping_criteria=[stopping_criteria],
+            )
         input_token_len = input_ids.shape[1]
         n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
         if n_diff_input_output > 0:
             print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
         outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
         outputs = outputs.strip()
+        # If user asked about frames, force a clean "Frame i: ..." list
+        try:
+            _T = _infer_T(images_tensor)
+        except Exception:
+            _T = 8
+        if "frame" in prompt.lower():
+            cleaned = keep_frame_lines(outputs, _T)
+            if cleaned.strip():
+                outputs = cleaned
+        print("response", outputs)
         return outputs, state