| from transformers import PretrainedConfig | |
class InternVideo2Config(PretrainedConfig):
    """Configuration class for an `InternVideo2Model`.

    Stores every architectural hyperparameter of the InternVideo2 video
    encoder; instantiating the model from this config reproduces the
    corresponding architecture.

    Args:
        img_size (int, optional): Spatial resolution of the input. Defaults to 224.
        patch_size (int, optional): Side length of each spatial patch. Defaults to 14.
        tubelet_size (int, optional): Temporal extent of a patch (tubelet). Defaults to 1.
        num_frames (int, optional): Frames per video clip. Defaults to 8.
        d_model (int, optional): Transformer embedding width. Defaults to 1408.
        num_heads (int, optional): Attention heads per layer. Defaults to 16.
        depth (int, optional): Number of transformer encoder layers. Defaults to 40.
        mlp_ratio (float, optional): MLP hidden width as a multiple of ``d_model``.
            Defaults to 48/11.
        qkv_bias (bool, optional): Add learnable bias to the Q/K/V projections.
            Defaults to False.
        init_values (float, optional): Initial layer-scale values. Defaults to 1e-5.
        use_checkpoint (bool, optional): Enable gradient checkpointing. Defaults to False.
        checkpoint_num (int, optional): How many layers are checkpointed. Defaults to 0.
        use_flash_attn (bool, optional): Use FlashAttention kernels. Defaults to False.
        use_fused_mlp (bool, optional): Use a fused MLP implementation. Defaults to False.
        use_fused_rmsnorm (bool, optional): Use a fused RMSNorm implementation.
            Defaults to False.
        qk_normalization (bool, optional): Normalize queries and keys before attention.
            Defaults to True.
        clip_embed_dim (int, optional): CLIP projection embedding size. Defaults to 1408.
        attn_pool_num_heads (int, optional): Heads in the attention-pooling layer.
            Defaults to 16.
        clip_teacher_embed_dim (int, optional): CLIP teacher embedding size.
            Defaults to 512.
        clip_teacher_final_dim (int, optional): CLIP teacher final projection size.
            Defaults to 512.
        clip_student_return_interval (int, optional): Every how many student layers
            an intermediate output is returned. Defaults to 4.
        clip_return_layer (int, optional): Number of layers returned for alignment.
            Defaults to 3.
        clip_norm_type (str, optional): CLIP feature normalization, ``'l2'`` or
            ``'none'``. Defaults to ``'l2'``.
        sep_image_video_pos_embed (bool, optional): Keep separate positional
            embeddings for images and videos. Defaults to False.
        **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
    """

    # Identifier consumed by the transformers Auto* registry machinery.
    model_type = "internvideo2"

    def __init__(
        self,
        img_size=224,
        patch_size=14,
        tubelet_size=1,
        num_frames=8,
        d_model=1408,
        num_heads=16,
        depth=40,
        mlp_ratio=48 / 11,
        qkv_bias=False,
        init_values=1e-5,
        use_checkpoint=False,
        checkpoint_num=0,
        use_flash_attn=False,
        use_fused_mlp=False,
        use_fused_rmsnorm=False,
        qk_normalization=True,
        clip_embed_dim=1408,
        attn_pool_num_heads=16,
        clip_teacher_embed_dim=512,
        clip_teacher_final_dim=512,
        clip_student_return_interval=4,
        clip_return_layer=3,
        clip_norm_type="l2",
        sep_image_video_pos_embed=False,
        **kwargs,
    ):
        """Build the config; see the class docstring for argument semantics."""
        # Let the base class consume the generic config kwargs first.
        super().__init__(**kwargs)

        # Mirror every architectural argument onto the instance so the
        # PretrainedConfig serialization (to_dict / to_json_string) and
        # the model constructor can read them back by name.
        for attr_name, attr_value in (
            ("img_size", img_size),
            ("patch_size", patch_size),
            ("tubelet_size", tubelet_size),
            ("num_frames", num_frames),
            ("d_model", d_model),
            ("num_heads", num_heads),
            ("depth", depth),
            ("mlp_ratio", mlp_ratio),
            ("qkv_bias", qkv_bias),
            ("init_values", init_values),
            ("use_checkpoint", use_checkpoint),
            ("checkpoint_num", checkpoint_num),
            ("use_flash_attn", use_flash_attn),
            ("use_fused_mlp", use_fused_mlp),
            ("use_fused_rmsnorm", use_fused_rmsnorm),
            ("qk_normalization", qk_normalization),
            ("clip_embed_dim", clip_embed_dim),
            ("attn_pool_num_heads", attn_pool_num_heads),
            ("clip_teacher_embed_dim", clip_teacher_embed_dim),
            ("clip_teacher_final_dim", clip_teacher_final_dim),
            ("clip_student_return_interval", clip_student_return_interval),
            ("clip_return_layer", clip_return_layer),
            ("clip_norm_type", clip_norm_type),
            ("sep_image_video_pos_embed", sep_image_video_pos_embed),
        ):
            setattr(self, attr_name, attr_value)