yzhangcs committed
Commit d64a529 · verified · 1 Parent(s): 6e163f3

Fix FLA import errors

Files changed (1)
  1. modeling_kimi.py +142 -69
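This commit drops `from fla.layers.utils import get_unpad_data, index_first_axis, pad_input`, which apparently no longer resolves against recent `fla-core` releases, and re-implements those helpers locally in `modeling_kimi.py` on top of `fla.ops.utils.index` (`prepare_cu_seqlens_from_mask`, `prepare_lens_from_mask`) and `fla.utils.tensor_cache`; it also consolidates the multi-line transformers imports and switches the type hints to PEP 604 unions (`X | None`). The sketch below is illustrative only and not part of the commit: it uses plain PyTorch on a toy mask to show what the unpad/pad helpers compute, whereas the real `get_unpad_data` builds `cu_seqlens` with `prepare_cu_seqlens_from_mask` rather than the manual cumsum used here.

import torch


def get_unpad_data(attention_mask: torch.Tensor):
    # attention_mask: [batch, seq_len] with 1 = real token, 0 = padding.
    lens = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    # cu_seqlens: cumulative sequence lengths with a leading 0, the varlen
    # format expected by FlashAttention-style kernels.
    cu_seqlens = torch.nn.functional.pad(lens.cumsum(0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, int(lens.max())


mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])
hidden = torch.randn(2, 4, 8)  # [batch, seq_len, hidden_dim]

indices, cu_seqlens, max_len = get_unpad_data(mask)
print(cu_seqlens.tolist(), max_len)  # [0, 3, 5] 3

# "Unpad": keep only the real tokens, mirroring
# index_first_axis(rearrange(x, "b s ... -> (b s) ..."), indices) in the diff.
flat = hidden.reshape(-1, hidden.shape[-1])[indices]  # [5, 8]

# "Pad" back: scatter the flat tokens into a zeroed [batch * seq_len, dim]
# buffer, mirroring pad_input / index_put_first_axis in the diff.
restored = torch.zeros(mask.numel(), hidden.shape[-1], dtype=hidden.dtype)
restored[indices] = flat
restored = restored.reshape(hidden.shape)
assert torch.equal(restored[mask.bool()], hidden[mask.bool()])

In the diff, `KimiDeltaAttention.forward` derives `indices`/`cu_seqlens` from the 2-D padding mask and threads `cu_seqlens` through the `ShortConvolution` calls, so a padded batch is processed as one packed variable-length sequence.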
modeling_kimi.py CHANGED
@@ -1,11 +1,11 @@
1
  import math
2
  from collections.abc import Callable
3
- from typing import Any, List, Optional, Tuple, Union
4
 
5
  import torch
6
  import torch.nn.functional as F
7
  import transformers
8
- from einops import rearrange
9
  from packaging import version
10
  from torch import nn
11
  from transformers.activations import ACT2FN
@@ -13,21 +13,19 @@ from transformers.cache_utils import Cache
13
  from transformers.generation import GenerationMixin
14
  from transformers.masking_utils import create_causal_mask
15
  from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
16
- from transformers.modeling_outputs import (BaseModelOutputWithPast,
17
- CausalLMOutputWithPast)
18
- from transformers.modeling_utils import (ALL_ATTENTION_FUNCTIONS,
19
- PreTrainedModel)
20
  from transformers.processing_utils import Unpack
21
  from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
22
- from transformers.utils import (TransformersKwargs, auto_docstring,
23
- can_return_tuple, logging)
24
  from transformers.utils.generic import OutputRecorder, check_model_inputs
25
 
26
  try:
27
- from fla.layers.utils import get_unpad_data, index_first_axis, pad_input
28
  from fla.modules import FusedRMSNormGated, ShortConvolution
29
  from fla.ops.kda import chunk_kda, fused_recurrent_kda
30
  from fla.ops.kda.gate import fused_kda_gate
31
  except ImportError:
32
  raise ImportError("Please run `pip install -U fla-core`")
33
 
@@ -39,6 +37,84 @@ assert version.parse(transformers.__version__) >= version.parse("4.56.0"), \
39
  logger = logging.get_logger(__name__)
40
 
41
 
42
  class KimiDynamicCache:
43
  """
44
  Dynamic cache for Kimi model.
@@ -81,7 +157,7 @@ class KimiDynamicCache:
81
  key_states: torch.Tensor,
82
  value_states: torch.Tensor,
83
  layer_idx: int,
84
- cache_kwargs: Optional[dict[str, Any]] = None,
85
  ) -> tuple[torch.Tensor, torch.Tensor]:
86
  if self.key_cache[layer_idx] is None:
87
  self.key_cache[layer_idx] = key_states
@@ -112,12 +188,12 @@ class KimiDynamicCache:
112
  self.conv_states[layer_idx] = (
113
  q_conv.index_select(0, beam_idx),
114
  k_conv.index_select(0, beam_idx),
115
- v_conv.index_select(0, beam_idx)
116
  )
117
  self.recurrent_states[layer_idx] = self.recurrent_states[layer_idx].index_select(
118
  0, beam_idx)
119
 
120
- def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
121
  """Returns the sequence length of the cached states. A layer index can be optionally passed."""
122
  # take any layer that contains cache and not empty tensor
123
  layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
@@ -224,7 +300,7 @@ def eager_attention_forward(
224
  query: torch.Tensor,
225
  key: torch.Tensor,
226
  value: torch.Tensor,
227
- attention_mask: Optional[torch.Tensor],
228
  scaling: float,
229
  dropout: float = 0.0,
230
  **kwargs: Unpack[TransformersKwargs],
@@ -304,10 +380,10 @@ class KimiMLAAttention(nn.Module):
304
  def forward(
305
  self,
306
  hidden_states: torch.Tensor,
307
- attention_mask: Optional[torch.Tensor] = None,
308
- past_key_values: Optional[Cache] = None,
309
  **kwargs,
310
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
311
  batch_size, seq_length = hidden_states.shape[:-1]
312
  query_shape = (batch_size, seq_length, -1, self.q_head_dim)
313
  key_shape = (batch_size, seq_length, -1,
@@ -400,12 +476,12 @@ class KimiDeltaAttention(nn.Module):
400
  self.k_conv1d = ShortConvolution(
401
  hidden_size=projection_k_size,
402
  kernel_size=self.conv_size,
403
- activation='silu'
404
  )
405
  self.v_conv1d = ShortConvolution(
406
  hidden_size=projection_size,
407
  kernel_size=self.conv_size,
408
- activation='silu'
409
  )
410
 
411
  self.A_log = torch.nn.Parameter(torch.log(torch.empty(
@@ -429,18 +505,18 @@ class KimiDeltaAttention(nn.Module):
429
  def forward(
430
  self,
431
  hidden_states: torch.Tensor,
432
- attention_mask: Optional[torch.Tensor] = None,
433
- cache_params: Optional[KimiDynamicCache] = None,
434
- **kwargs: Unpack[dict]
435
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
436
  if attention_mask is not None:
437
  if attention_mask.dim() != 2:
438
- attention_mask = kwargs.get("padding_mask", None)
439
 
440
  if attention_mask is not None and attention_mask.dim() != 2:
441
  raise ValueError(
442
  "attention_mask must be a 0-1 matrix of shape [batch_size, seq_len] "
443
- "(0 = padding). 3D masks are not supported here."
444
  )
445
  use_cache = cache_params is not None
446
  batch_size, q_len, _ = hidden_states.shape
@@ -448,7 +524,7 @@ class KimiDeltaAttention(nn.Module):
448
  if self.training:
449
  assert mode == 'chunk', "Only chunk mode is supported in training."
450
 
451
- cu_seqlens = kwargs.get('cu_seqlens', None)
452
  indices = None
453
  if attention_mask is not None:
454
  indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
@@ -466,19 +542,19 @@ class KimiDeltaAttention(nn.Module):
466
  x=self.q_proj(hidden_states),
467
  cache=conv_state_q,
468
  output_final_state=use_cache,
469
- cu_seqlens=cu_seqlens
470
  )
471
  k, conv_state_k = self.k_conv1d(
472
  x=self.k_proj(hidden_states),
473
  cache=conv_state_k,
474
  output_final_state=use_cache,
475
- cu_seqlens=cu_seqlens
476
  )
477
  v, conv_state_v = self.v_conv1d(
478
  x=self.v_proj(hidden_states),
479
  cache=conv_state_v,
480
  output_final_state=use_cache,
481
- cu_seqlens=cu_seqlens
482
  )
483
  g = self.f_b_proj(self.f_a_proj(hidden_states))
484
  g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias)
@@ -553,11 +629,11 @@ class KimiMoEGate(nn.Module):
553
  self.moe_renormalize = config.moe_renormalize
554
  self.gating_dim = config.hidden_size
555
  self.weight = nn.Parameter(
556
- torch.empty((self.num_experts, self.gating_dim))
557
  )
558
 
559
  self.e_score_correction_bias = nn.Parameter(
560
- torch.empty((self.num_experts))
561
  )
562
  self.reset_parameters()
563
 
@@ -572,7 +648,7 @@ class KimiMoEGate(nn.Module):
572
  hidden_states = hidden_states.view(-1, h)
573
  logits = F.linear(
574
  hidden_states.type(torch.float32), self.weight.type(
575
- torch.float32), None
576
  )
577
  if self.moe_router_activation_func == "sigmoid":
578
  scores = logits.sigmoid()
@@ -580,7 +656,7 @@ class KimiMoEGate(nn.Module):
580
  scores = logits.softmax(dim=1)
581
  else:
582
  raise NotImplementedError(
583
- f"unsupported scoring function for MoE gating: {self.moe_router_activation_func}"
584
  )
585
 
586
  # select top-k experts
@@ -592,7 +668,7 @@ class KimiMoEGate(nn.Module):
592
  bsz * seq_len, self.num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
593
  ) # [n, num_expert_group]
594
  group_idx = torch.topk(
595
- group_scores, k=self.topk_group, dim=-1, sorted=False
596
  )[
597
  1
598
  ] # [n, top_k_group]
@@ -601,14 +677,14 @@ class KimiMoEGate(nn.Module):
601
  score_mask = (
602
  group_mask.unsqueeze(-1)
603
  .expand(
604
- bsz * seq_len, self.num_expert_group, self.num_experts // self.num_expert_group
605
  )
606
  .reshape(bsz * seq_len, -1)
607
  ) # [n, e]
608
  tmp_scores = scores_for_choice.masked_fill(
609
  ~score_mask.bool(), 0.0) # [n, e]
610
  _, topk_idx = torch.topk(
611
- tmp_scores, k=self.top_k, dim=-1, sorted=False
612
  )
613
  topk_weight = scores.gather(1, topk_idx)
614
 
@@ -642,16 +718,16 @@ class KimiSparseMoeBlock(nn.Module):
642
  self.experts = nn.ModuleList(
643
  [
644
  KimiBlockSparseMLP(
645
- config, intermediate_size=config.moe_intermediate_size
646
  )
647
  for _ in range(config.num_experts)
648
- ]
649
  )
650
  self.gate = KimiMoEGate(config)
651
  if config.num_shared_experts is not None:
652
  intermediate_size = config.moe_intermediate_size * config.num_shared_experts
653
  self.shared_experts = KimiMLP(
654
- config=config, intermediate_size=intermediate_size
655
  )
656
 
657
  def forward(self, hidden_states):
@@ -659,13 +735,10 @@ class KimiSparseMoeBlock(nn.Module):
659
  orig_shape = hidden_states.shape
660
  topk_idx, topk_weight = self.gate(hidden_states)
661
  hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
662
- flat_topk_idx = topk_idx.view(-1)
663
  if not self.training:
664
- y = self.moe_infer(hidden_states, topk_idx,
665
- topk_weight).view(*orig_shape)
666
  else:
667
- raise NotImplementedError(
668
- "Training mode is not supported in KimiSparseMoeBlock")
669
  if self.config.num_shared_experts is not None:
670
  y = y + self.shared_experts(identity)
671
  return y
@@ -738,13 +811,13 @@ class KimiDecoderLayer(nn.Module):
738
  def forward(
739
  self,
740
  hidden_states: torch.Tensor,
741
- attention_mask: Optional[torch.Tensor] = None,
742
- position_ids: Optional[torch.LongTensor] = None,
743
- past_key_values: Optional[Tuple[torch.Tensor]] = None,
744
- output_attentions: Optional[bool] = False,
745
- use_cache: Optional[bool] = False,
746
  **kwargs: Unpack[FlashAttentionKwargs],
747
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
748
  """
749
  Args:
750
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -867,14 +940,14 @@ class KimiLinearModel(KimiPreTrainedModel):
867
  def forward(
868
  self,
869
  input_ids: torch.LongTensor = None,
870
- attention_mask: Optional[torch.Tensor] = None,
871
- position_ids: Optional[torch.LongTensor] = None,
872
- past_key_values: Optional[Cache] = None,
873
- inputs_embeds: Optional[torch.FloatTensor] = None,
874
- cache_position: Optional[torch.LongTensor] = None,
875
- use_cache: Optional[bool] = None,
876
  **kwargs: Unpack[TransformersKwargs],
877
- ) -> Union[Tuple, BaseModelOutputWithPast]:
878
 
879
  use_cache = use_cache if use_cache is not None else self.config.use_cache
880
 
@@ -893,7 +966,7 @@ class KimiLinearModel(KimiPreTrainedModel):
893
  past_seen_tokens = past_key_values.get_seq_length(
894
  ) if past_key_values is not None else 0
895
  cache_position: torch.Tensor = torch.arange(
896
- past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
897
  )
898
 
899
  if position_ids is None:
@@ -951,19 +1024,19 @@ class KimiLinearForCausalLM(KimiPreTrainedModel, GenerationMixin):
951
  def forward(
952
  self,
953
  input_ids: torch.LongTensor = None,
954
- attention_mask: Optional[torch.Tensor] = None,
955
- position_ids: Optional[torch.LongTensor] = None,
956
- past_key_values: Optional[List[torch.FloatTensor]] = None,
957
- inputs_embeds: Optional[torch.FloatTensor] = None,
958
- labels: Optional[torch.LongTensor] = None,
959
- use_cache: Optional[bool] = None,
960
- output_attentions: Optional[bool] = None,
961
- output_hidden_states: Optional[bool] = None,
962
- generation_mode: Optional[bool] = None,
963
- return_dict: Optional[bool] = None,
964
- cache_position: Optional[torch.LongTensor] = None,
965
  **kwargs: Unpack[TransformersKwargs],
966
- ) -> Union[Tuple, CausalLMOutputWithPast]:
967
  r"""
968
  Args:
969
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
 
1
  import math
2
  from collections.abc import Callable
3
+ from typing import Any
4
 
5
  import torch
6
  import torch.nn.functional as F
7
  import transformers
8
+ from einops import rearrange, repeat
9
  from packaging import version
10
  from torch import nn
11
  from transformers.activations import ACT2FN
 
13
  from transformers.generation import GenerationMixin
14
  from transformers.masking_utils import create_causal_mask
15
  from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
16
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
17
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
18
  from transformers.processing_utils import Unpack
19
  from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
20
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
21
  from transformers.utils.generic import OutputRecorder, check_model_inputs
22
 
23
  try:
24
  from fla.modules import FusedRMSNormGated, ShortConvolution
25
  from fla.ops.kda import chunk_kda, fused_recurrent_kda
26
  from fla.ops.kda.gate import fused_kda_gate
27
+ from fla.ops.utils.index import prepare_cu_seqlens_from_mask, prepare_lens_from_mask
28
+ from fla.utils import tensor_cache
29
  except ImportError:
30
  raise ImportError("Please run `pip install -U fla-core`")
31
 
 
37
  logger = logging.get_logger(__name__)
38
 
39
 
40
+ def index_first_axis(x, indices):
41
+ other_shape = x.shape[1:]
42
+ second_dim = other_shape.numel()
43
+ return torch.gather(
44
+ rearrange(x, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim),
45
+ ).reshape(-1, *other_shape)
46
+
47
+
48
+ def index_put_first_axis(x, indices, first_axis_dim):
49
+ y = torch.zeros(first_axis_dim, *x.shape[1:], device=x.device, dtype=x.dtype)
50
+ # TODO [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
51
+ y[indices] = x
52
+ # y.scatter_(0, repeat(indices, 'z -> z d', d=x.shape[1]), x)
53
+ return y
54
+
55
+
56
+ @tensor_cache
57
+ def get_unpad_data(
58
+ attention_mask: torch.Tensor,
59
+ ) -> tuple[torch.Tensor, torch.Tensor, int]:
60
+ lens = prepare_lens_from_mask(attention_mask)
61
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
62
+ max_seqlen_in_batch = lens.max().item()
63
+ cu_seqlens = prepare_cu_seqlens_from_mask(attention_mask)
64
+ return indices, cu_seqlens, max_seqlen_in_batch
65
+
66
+
67
+ def unpad_input(
68
+ q: torch.Tensor,
69
+ states: tuple[torch.Tensor],
70
+ attention_mask: torch.Tensor,
71
+ q_len: int,
72
+ keepdim: bool = False,
73
+ ):
74
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = get_unpad_data(attention_mask)
75
+ batch_size, seq_len, *_ = states[0].shape
76
+
77
+ state = tuple(
78
+ index_first_axis(rearrange(s, "b s ... -> (b s) ..."), indices_k)
79
+ for s in states
80
+ )
81
+
82
+ if q_len == seq_len:
83
+ q = index_first_axis(rearrange(q, "b s ... -> (b s) ..."), indices_k)
84
+ cu_seqlens_q = cu_seqlens_k
85
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
86
+ indices_q = indices_k
87
+ elif q_len == 1:
88
+ max_seqlen_in_batch_q = 1
89
+ cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
90
+ indices_q = cu_seqlens_q[:-1]
91
+ q = q.squeeze(1)
92
+ else:
93
+ raise NotImplementedError("We only support either q_len == k_len (prefilling) or q_len == 1 (decoding)")
94
+
95
+ if keepdim:
96
+ q = q.unsqueeze(0)
97
+ state = tuple(s.unsqueeze(0) for s in state)
98
+
99
+ return (
100
+ q,
101
+ state,
102
+ indices_q,
103
+ (cu_seqlens_q, cu_seqlens_k),
104
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
105
+ )
106
+
107
+
108
+ def pad_input(
109
+ hidden_states: torch.Tensor,
110
+ indices: torch.LongTensor,
111
+ batch_size: int,
112
+ seq_len: int,
113
+ ) -> torch.Tensor:
114
+ output = index_put_first_axis(hidden_states, indices, batch_size * seq_len)
115
+ return rearrange(output, "(b s) ... -> b s ...", b=batch_size)
116
+
117
+
118
  class KimiDynamicCache:
119
  """
120
  Dynamic cache for Kimi model.
 
157
  key_states: torch.Tensor,
158
  value_states: torch.Tensor,
159
  layer_idx: int,
160
+ cache_kwargs: dict[str, Any] | None = None,
161
  ) -> tuple[torch.Tensor, torch.Tensor]:
162
  if self.key_cache[layer_idx] is None:
163
  self.key_cache[layer_idx] = key_states
 
188
  self.conv_states[layer_idx] = (
189
  q_conv.index_select(0, beam_idx),
190
  k_conv.index_select(0, beam_idx),
191
+ v_conv.index_select(0, beam_idx),
192
  )
193
  self.recurrent_states[layer_idx] = self.recurrent_states[layer_idx].index_select(
194
  0, beam_idx)
195
 
196
+ def get_seq_length(self, layer_idx: int | None = 0) -> int:
197
  """Returns the sequence length of the cached states. A layer index can be optionally passed."""
198
  # take any layer that contains cache and not empty tensor
199
  layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
 
300
  query: torch.Tensor,
301
  key: torch.Tensor,
302
  value: torch.Tensor,
303
+ attention_mask: torch.Tensor | None,
304
  scaling: float,
305
  dropout: float = 0.0,
306
  **kwargs: Unpack[TransformersKwargs],
 
380
  def forward(
381
  self,
382
  hidden_states: torch.Tensor,
383
+ attention_mask: torch.Tensor | None = None,
384
+ past_key_values: Cache | None = None,
385
  **kwargs,
386
+ ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
387
  batch_size, seq_length = hidden_states.shape[:-1]
388
  query_shape = (batch_size, seq_length, -1, self.q_head_dim)
389
  key_shape = (batch_size, seq_length, -1,
 
476
  self.k_conv1d = ShortConvolution(
477
  hidden_size=projection_k_size,
478
  kernel_size=self.conv_size,
479
+ activation='silu',
480
  )
481
  self.v_conv1d = ShortConvolution(
482
  hidden_size=projection_size,
483
  kernel_size=self.conv_size,
484
+ activation='silu',
485
  )
486
 
487
  self.A_log = torch.nn.Parameter(torch.log(torch.empty(
 
505
  def forward(
506
  self,
507
  hidden_states: torch.Tensor,
508
+ attention_mask: torch.Tensor | None = None,
509
+ cache_params: KimiDynamicCache | None = None,
510
+ **kwargs: Unpack[dict],
511
+ ) -> tuple[torch.Tensor, torch.Tensor | None, Cache | None]:
512
  if attention_mask is not None:
513
  if attention_mask.dim() != 2:
514
+ attention_mask = kwargs.get("padding_mask")
515
 
516
  if attention_mask is not None and attention_mask.dim() != 2:
517
  raise ValueError(
518
  "attention_mask must be a 0-1 matrix of shape [batch_size, seq_len] "
519
+ "(0 = padding). 3D masks are not supported here.",
520
  )
521
  use_cache = cache_params is not None
522
  batch_size, q_len, _ = hidden_states.shape
 
524
  if self.training:
525
  assert mode == 'chunk', "Only chunk mode is supported in training."
526
 
527
+ cu_seqlens = kwargs.get('cu_seqlens')
528
  indices = None
529
  if attention_mask is not None:
530
  indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
 
542
  x=self.q_proj(hidden_states),
543
  cache=conv_state_q,
544
  output_final_state=use_cache,
545
+ cu_seqlens=cu_seqlens,
546
  )
547
  k, conv_state_k = self.k_conv1d(
548
  x=self.k_proj(hidden_states),
549
  cache=conv_state_k,
550
  output_final_state=use_cache,
551
+ cu_seqlens=cu_seqlens,
552
  )
553
  v, conv_state_v = self.v_conv1d(
554
  x=self.v_proj(hidden_states),
555
  cache=conv_state_v,
556
  output_final_state=use_cache,
557
+ cu_seqlens=cu_seqlens,
558
  )
559
  g = self.f_b_proj(self.f_a_proj(hidden_states))
560
  g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias)
 
629
  self.moe_renormalize = config.moe_renormalize
630
  self.gating_dim = config.hidden_size
631
  self.weight = nn.Parameter(
632
+ torch.empty((self.num_experts, self.gating_dim)),
633
  )
634
 
635
  self.e_score_correction_bias = nn.Parameter(
636
+ torch.empty(self.num_experts),
637
  )
638
  self.reset_parameters()
639
 
 
648
  hidden_states = hidden_states.view(-1, h)
649
  logits = F.linear(
650
  hidden_states.type(torch.float32), self.weight.type(
651
+ torch.float32), None,
652
  )
653
  if self.moe_router_activation_func == "sigmoid":
654
  scores = logits.sigmoid()
 
656
  scores = logits.softmax(dim=1)
657
  else:
658
  raise NotImplementedError(
659
+ f"unsupported scoring function for MoE gating: {self.moe_router_activation_func}",
660
  )
661
 
662
  # select top-k experts
 
668
  bsz * seq_len, self.num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
669
  ) # [n, num_expert_group]
670
  group_idx = torch.topk(
671
+ group_scores, k=self.topk_group, dim=-1, sorted=False,
672
  )[
673
  1
674
  ] # [n, top_k_group]
 
677
  score_mask = (
678
  group_mask.unsqueeze(-1)
679
  .expand(
680
+ bsz * seq_len, self.num_expert_group, self.num_experts // self.num_expert_group,
681
  )
682
  .reshape(bsz * seq_len, -1)
683
  ) # [n, e]
684
  tmp_scores = scores_for_choice.masked_fill(
685
  ~score_mask.bool(), 0.0) # [n, e]
686
  _, topk_idx = torch.topk(
687
+ tmp_scores, k=self.top_k, dim=-1, sorted=False,
688
  )
689
  topk_weight = scores.gather(1, topk_idx)
690
 
 
718
  self.experts = nn.ModuleList(
719
  [
720
  KimiBlockSparseMLP(
721
+ config, intermediate_size=config.moe_intermediate_size,
722
  )
723
  for _ in range(config.num_experts)
724
+ ],
725
  )
726
  self.gate = KimiMoEGate(config)
727
  if config.num_shared_experts is not None:
728
  intermediate_size = config.moe_intermediate_size * config.num_shared_experts
729
  self.shared_experts = KimiMLP(
730
+ config=config, intermediate_size=intermediate_size,
731
  )
732
 
733
  def forward(self, hidden_states):
 
735
  orig_shape = hidden_states.shape
736
  topk_idx, topk_weight = self.gate(hidden_states)
737
  hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
 
738
  if not self.training:
739
+ y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape)
 
740
  else:
741
+ raise NotImplementedError("Training mode is not supported in KimiSparseMoeBlock")
 
742
  if self.config.num_shared_experts is not None:
743
  y = y + self.shared_experts(identity)
744
  return y
 
811
  def forward(
812
  self,
813
  hidden_states: torch.Tensor,
814
+ attention_mask: torch.Tensor | None = None,
815
+ position_ids: torch.LongTensor | None = None,
816
+ past_key_values: tuple[torch.Tensor] | None = None,
817
+ output_attentions: bool | None = False,
818
+ use_cache: bool | None = False,
819
  **kwargs: Unpack[FlashAttentionKwargs],
820
+ ) -> tuple[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor] | None]:
821
  """
822
  Args:
823
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
 
940
  def forward(
941
  self,
942
  input_ids: torch.LongTensor = None,
943
+ attention_mask: torch.Tensor | None = None,
944
+ position_ids: torch.LongTensor | None = None,
945
+ past_key_values: Cache | None = None,
946
+ inputs_embeds: torch.FloatTensor | None = None,
947
+ cache_position: torch.LongTensor | None = None,
948
+ use_cache: bool | None = None,
949
  **kwargs: Unpack[TransformersKwargs],
950
+ ) -> tuple | BaseModelOutputWithPast:
951
 
952
  use_cache = use_cache if use_cache is not None else self.config.use_cache
953
 
 
966
  past_seen_tokens = past_key_values.get_seq_length(
967
  ) if past_key_values is not None else 0
968
  cache_position: torch.Tensor = torch.arange(
969
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device,
970
  )
971
 
972
  if position_ids is None:
 
1024
  def forward(
1025
  self,
1026
  input_ids: torch.LongTensor = None,
1027
+ attention_mask: torch.Tensor | None = None,
1028
+ position_ids: torch.LongTensor | None = None,
1029
+ past_key_values: list[torch.FloatTensor] | None = None,
1030
+ inputs_embeds: torch.FloatTensor | None = None,
1031
+ labels: torch.LongTensor | None = None,
1032
+ use_cache: bool | None = None,
1033
+ output_attentions: bool | None = None,
1034
+ output_hidden_states: bool | None = None,
1035
+ generation_mode: bool | None = None,
1036
+ return_dict: bool | None = None,
1037
+ cache_position: torch.LongTensor | None = None,
1038
  **kwargs: Unpack[TransformersKwargs],
1039
+ ) -> tuple | CausalLMOutputWithPast:
1040
  r"""
1041
  Args:
1042
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):