Fix layer_outputs handling: transformers 5.x returns tensor, not tuple
modeling_nvembed.py  CHANGED  (+20 -9)
@@ -181,13 +181,20 @@ class BidirectionalMistralModel(MistralModel):
                 **layer_kwargs,
             )
 
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
+            # Compatibility fix for transformers 5.x:
+            # In transformers 5.x, MistralDecoderLayer.forward returns a single tensor
+            # In transformers 4.x, it returns a tuple (hidden_states, present_key_value, ...)
+            if isinstance(layer_outputs, torch.Tensor):
+                # transformers 5.x: direct tensor output
+                hidden_states = layer_outputs
+                # Note: use_cache and output_attentions not supported in this code path
+            else:
+                # transformers 4.x: tuple output
+                hidden_states = layer_outputs[0]
+                if use_cache:
+                    next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+                if output_attentions:
+                    all_self_attns += (layer_outputs[1],)
 
         hidden_states = self.norm(hidden_states)
 
@@ -196,8 +203,12 @@ class BidirectionalMistralModel(MistralModel):
             all_hidden_states += (hidden_states,)
 
         next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if use_cache and next_decoder_cache is not None:
+            # Compatibility: to_legacy_cache may not exist in all versions
+            if use_legacy_cache and hasattr(next_decoder_cache, 'to_legacy_cache'):
+                next_cache = next_decoder_cache.to_legacy_cache()
+            else:
+                next_cache = next_decoder_cache
 
         if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
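For readers applying the same pattern elsewhere, the isinstance branch from the first hunk can be written as a small standalone helper. This is only a sketch of the idea, not code from this repository: the function name unpack_layer_outputs and its signature are invented for illustration, and the 4.x tuple layout (hidden_states, optional attention weights, optional present key/value) is assumed to match what the patch handles.

import torch

def unpack_layer_outputs(layer_outputs, use_cache=False, output_attentions=False):
    # Normalize a decoder layer's return value across transformers versions.
    # 5.x-style layers return the hidden-states tensor directly; 4.x-style
    # layers return a tuple (hidden_states, [attn_weights], [present_key_value]).
    if isinstance(layer_outputs, torch.Tensor):
        # Tensor output: no attention weights or cache entry to unpack here.
        return layer_outputs, None, None
    hidden_states = layer_outputs[0]
    attn_weights = layer_outputs[1] if output_attentions else None
    present_key_value = layer_outputs[2 if output_attentions else 1] if use_cache else None
    return hidden_states, attn_weights, present_key_value

Inside the decoder loop, this would collapse the patched branch to a single call, e.g. hidden_states, attn, present = unpack_layer_outputs(layer_outputs, use_cache, output_attentions).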