block_stride + fixes + readme

modeling_lsg_bart.py  CHANGED  (+24 -229)
@@ -54,15 +54,15 @@ class LSGBartConfig(BartConfig):
         self.sparsity_factor = sparsity_factor
         self.sparsity_type = sparsity_type
 
-        if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride"]:
+        if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride'], setting sparsity_type=None, computation will skip sparse attention")
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
-        if self.sparsity_type == "stride":
+        if self.sparsity_type in ["stride", "block_stride"]:
             if self.sparsity_factor > self.encoder_attention_heads:
                 logger.warning(
-                    "[WARNING CONFIG]: sparsity_factor > encoder_attention_heads is not recommended for stride sparsity"
+                    "[WARNING CONFIG]: sparsity_factor > encoder_attention_heads is not recommended for stride/block_stride sparsity"
                 )
 
         if self.num_global_tokens < 1:
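A minimal usage sketch of what the widened validation enables (illustrative only: it assumes `modeling_lsg_bart.py` is importable as a module and leaves every other configuration field at its default):

```python
from modeling_lsg_bart import LSGBartConfig

# "block_stride" now passes the check above instead of being reset to None.
config = LSGBartConfig(
    sparsity_type="block_stride",
    sparsity_factor=2,   # kept <= encoder_attention_heads, so no warning is emitted
)
print(config.sparsity_type)  # block_stride
```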
@@ -412,6 +412,7 @@ class LSGBartEncoderAttention(BaseSelfAttention):
             "pooling": self.get_sparse_tokens_with_pooling,
             "lsh": self.get_sparse_tokens_with_lsh,
             "stride": self.get_sparse_tokens_with_stride,
+            "block_stride": self.get_sparse_tokens_with_block_stride,
             }
 
         self.sparsity_type = config.sparsity_type
@@ -480,29 +481,32 @@ class LSGBartEncoderAttention(BaseSelfAttention):
         sparse_idx = sparse_idx.reshape(1, 1, -1, 1) + (torch.arange(h, device=keys.device) % self.sparsity_factor).reshape(1, h, 1, 1)
         sparse_idx = sparse_idx.expand(n, h, -1, 1)
 
-
-
-
-
-
-
+        keys = keys.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
+        values = values.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
+        mask = mask.expand(-1, h, -1, -1).transpose(-1, -2).gather(dim=-2, index=sparse_idx).transpose(-1, -2)
+
+        return keys, values, mask
+
+    def get_sparse_tokens_with_block_stride(self, keys, values, mask):
+
+        if self.sparsity_factor == 1:
+            return keys, values, mask.expand(-1, keys.size()[1], -1, -1)
+
+        n, h, t, d = keys.size()
 
-
         t, b = self.block_size, t // self.block_size
         sparse_idx = torch.arange(t // self.sparsity_factor, device=keys.device)
         sparse_idx = sparse_idx.reshape(1, 1, 1, -1, 1) + torch.arange(h, device=keys.device).reshape(1, h, 1, 1, 1) * (t // self.sparsity_factor)
         sparse_idx = (sparse_idx % t)
-        #sparse_idx[..., -t//2:, :] = (sparse_idx[..., -t//2:, :] + t//2) % t
         sparse_idx = sparse_idx + torch.arange(b, device=keys.device).reshape(1, 1, -1, 1, 1) * t
         sparse_idx = sparse_idx.reshape(1, h, -1, 1).expand(n, h, -1, 1)
-        """
 
         keys = keys.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
         values = values.gather(dim=-2, index=sparse_idx.expand(-1, -1, -1, d))
         mask = mask.expand(-1, h, -1, -1).transpose(-1, -2).gather(dim=-2, index=sparse_idx).transpose(-1, -2)
 
         return keys, values, mask
-
+
     def get_sparse_tokens_with_lsh(self, keys, values, mask):
 
         if self.sparsity_factor == 1:
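To make the new `block_stride` pattern concrete, here is a small self-contained sketch (toy sizes, not code from the commit) that reproduces the index computation above: inside each block of `block_size` tokens, head `i` keeps a contiguous run of `block_size // sparsity_factor` tokens, and the run is rotated across heads so that together the heads cover the whole block.

```python
import torch

n, h, t = 1, 2, 8                    # batch, heads, sequence length (toy values)
block_size, sparsity_factor = 4, 2
b = t // block_size                  # number of blocks

idx = torch.arange(block_size // sparsity_factor)
idx = idx.reshape(1, 1, 1, -1, 1) + torch.arange(h).reshape(1, h, 1, 1, 1) * (block_size // sparsity_factor)
idx = idx % block_size                                             # rotate the run inside its block
idx = idx + torch.arange(b).reshape(1, 1, -1, 1, 1) * block_size   # shift to each block's offset
idx = idx.reshape(1, h, -1, 1).expand(n, h, -1, 1)                 # gather index over the token axis

print(idx[0, :, :, 0])
# tensor([[0, 1, 4, 5],    head 0 keeps the first half of every block
#         [2, 3, 6, 7]])   head 1 keeps the second half
```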
@@ -1163,25 +1167,13 @@ class LSGBartEncoder(LSGBartPretrainedModel):
         pad = t % self.block_size
 
         # Check if t is multiple of block_size and pad
-        if t > b and pad > 0:
+        if self.adaptive and t > b and pad > 0:
             pad_length = self.block_size - pad
             if input_ids is not None:
                 input_ids = torch.nn.functional.pad(input_ids, (0, pad_length), value=self.pad_idx)
             else:
                 inputs_embeds = torch.nn.functional.pad(inputs_embeds.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
             attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)
-
-        # else adaptive sequence length
-        elif self.adaptive:
-            # Get last non zero mask index
-            s = int(attention_mask.cumsum(dim=-1).argmax(dim=-1).max()) + 1
-            if s < t and self.block_size is not None:
-                s = max(2, s // self.block_size + 1) * self.block_size if s > b else s
-            if input_ids is not None:
-                input_ids = input_ids[:, :s]
-            else:
-                inputs_embeds = inputs_embeds[:, :s]
-            attention_mask = attention_mask[:, :s]
 
         n, t_ = attention_mask.size()
 
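A quick worked example of the padding branch that survives this hunk, with made-up sizes: for `block_size = 128` and `t = 1000` input tokens, `pad = 1000 % 128 = 104`, so `pad_length = 24` and the sequence is right-padded to 1024 before block-local attention is applied.

```python
import torch

block_size, t = 128, 1000            # illustrative sizes only
pad = t % block_size                 # 104
pad_length = block_size - pad        # 24

attention_mask = torch.ones(1, t)
attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)
print(attention_mask.shape)          # torch.Size([1, 1024])
```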
@@ -1207,9 +1199,7 @@ class LSGBartEncoder(LSGBartPretrainedModel):
         offset = 0
 
         # Adapt sequence to initial shape
-        if diff > 0:
-            context = torch.nn.functional.pad(context.transpose(-1, -2), pad=(0, diff), value=0).transpose(-1, -2)
-        elif diff < 0:
+        if diff < 0:
             context = context[:, :t + offset]
 
         if return_dict:
@@ -1321,7 +1311,7 @@ class LSGBartEncoder(LSGBartPretrainedModel):
         )
 
 
-class LSGBartDecoder(LSGBartPretrainedModel):
+class LSGBartDecoder(BartDecoder, LSGBartPretrainedModel):
     """
     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`LSGBartDecoderLayer`
     Args:
@@ -1330,8 +1320,9 @@ class LSGBartDecoder(LSGBartPretrainedModel):
     """
 
     def __init__(self, config, embed_tokens=None):
-
-
+
+        LSGBartPretrainedModel.__init__(self, config)
+
         self.dropout = config.dropout
         self.layerdrop = config.decoder_layerdrop
         self.padding_idx = config.pad_token_id
@@ -1356,202 +1347,6 @@ class LSGBartDecoder(LSGBartPretrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
-        # create causal mask
-        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-        combined_attention_mask = None
-        if input_shape[-1] > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
-            ).to(self.device)
-
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
-            combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
-            )
-
-        return combined_attention_mask
-
-    def resize_inputs(self, inputs_embeds, attention_mask):
-        pad = 0
-
-        max_len = int(attention_mask.sum(dim=-1).max())
-        pad = attention_mask.size()[-1] - max_len
-        inputs_embeds = inputs_embeds[:, :max_len]
-        attention_mask = attention_mask[..., :max_len]
-        return pad, inputs_embeds, attention_mask
-
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        head_mask=None,
-        cross_attn_head_mask=None,
-        past_key_values=None,
-        inputs_embeds=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-    ):
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-        # past_key_values_length
-        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
-
-        # Resize to reduce computation
-        pad = 0
-        if self.adaptive:
-            if attention_mask is not None:
-                pad, inputs_embeds, attention_mask = self.resize_inputs(inputs_embeds, attention_mask)
-                input_shape = inputs_embeds.size()[:-1]
-            if encoder_attention_mask is not None:
-                _, encoder_hidden_states, encoder_attention_mask = self.resize_inputs(encoder_hidden_states, encoder_attention_mask)
-
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
-        )
-
-        # expand encoder attention mask
-        if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
-
-        # embed positions
-        positions = self.embed_positions(input_shape, past_key_values_length)
-
-        hidden_states = inputs_embeds + positions
-        hidden_states = self.layernorm_embedding(hidden_states)
-
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
-        next_decoder_cache = () if use_cache else None
-
-        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
-        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
-            if attn_mask is not None:
-                if attn_mask.size()[0] != (len(self.layers)):
-                    raise ValueError(
-                        "The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
-                    )
-
-        for idx, decoder_layer in enumerate(self.layers):
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            dropout_probability = random.uniform(0, 1)
-            if self.training and (dropout_probability < self.layerdrop):
-                continue
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-            if self.gradient_checkpointing and self.training:
-
-                if use_cache:
-                    logger.warning(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, output_attentions, use_cache)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),
-                    hidden_states,
-                    attention_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    head_mask[idx] if head_mask is not None else None,
-                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
-                    None,
-                )
-            else:
-
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                    cross_attn_layer_head_mask=(
-                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
-                    ),
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-                if encoder_hidden_states is not None:
-                    all_cross_attentions += (layer_outputs[2],)
-
-        # Resize to original shape
-        hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), pad=(0, pad), value=0).transpose(-1, -2)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-            cross_attentions=all_cross_attentions,
-        )
-
 
 class LSGBartModel(LSGBartPretrainedModel):
 
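Net effect of the decoder hunks: `LSGBartDecoder` now subclasses `BartDecoder` and drops its local copy of the decoder plumbing. A small sketch of the expected consequence (assumes `modeling_lsg_bart.py` is importable; the checks below are expectations inferred from the diff, not something the commit itself asserts):

```python
from transformers.models.bart.modeling_bart import BartDecoder
from modeling_lsg_bart import LSGBartDecoder

# With class LSGBartDecoder(BartDecoder, LSGBartPretrainedModel), anything the
# removed block used to duplicate should now resolve to BartDecoder via the MRO.
print([cls.__name__ for cls in LSGBartDecoder.__mro__[:3]])
# expected: ['LSGBartDecoder', 'BartDecoder', 'LSGBartPretrainedModel']

print(LSGBartDecoder.forward is BartDecoder.forward)
# expected: True (the ~200-line forward() removed above is now inherited)
```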