fromthesky committed on
Commit 532b674 · 1 Parent(s): 8c520ee

initial commit

.gitignore ADDED
@@ -0,0 +1,172 @@
+ # From default github .gitignore for python based repos
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # PyPI configuration file
+ .pypirc
config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "A_dff": 170,
+   "architectures": [
+     "PldrllmForCausalLM"
+   ],
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_pldrllm.PldrllmConfig",
+     "AutoModelForCausalLM": "modeling_pldrllm.PldrllmForCausalLM"
+   },
+   "bos_token_id": 2,
+   "cache_first_G": false,
+   "custom_G_type": null,
+   "dtype": "float32",
+   "eos_token_id": 3,
+   "final_bias": true,
+   "glu_bias": true,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 896,
+   "initializer_range": 0.02,
+   "intermediate_size": 2389,
+   "layer_norm_eps": 1e-06,
+   "max_position_embeddings": 1024,
+   "model_type": "pldrllm",
+   "num_attention_heads": 14,
+   "num_denseA": 2,
+   "num_hidden_layers": 5,
+   "num_reslayerA": 8,
+   "output_pldr_attentions": false,
+   "pad_token_id": 0,
+   "reference_rope": true,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "transformers_version": "4.56.1",
+   "use_cache": true,
+   "vocab_size": 32000
+ }
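The `auto_map` entries above mean the checkpoint ships its own configuration and modeling code. Below is a minimal loading sketch; the repository id is the one referenced as an example in the configuration docstring and is illustrative, and `trust_remote_code=True` is assumed to be acceptable.

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "fromthesky/PLDR-LLM-v51-110M-3"  # illustrative; substitute the actual repository id

# trust_remote_code=True lets auto_map resolve configuration_pldrllm.PldrllmConfig
# and modeling_pldrllm.PldrllmForCausalLM from the files added in this commit.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```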
configuration_pldrllm.py ADDED
@@ -0,0 +1,248 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 Fromthesky Research Labs, LLC. All rights reserved.
3
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code uses the Llama model implementation by Eleuther AI
6
+ # and Huggingface teams in this library as a starting point and implements
7
+ # the PLDR-LLM (Large Language Model from Power Law Decoder Representations)
8
+ # architecture based on its implementation by the Fromthesky Research Labs team.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+ """PLDR-LLM model configuration"""
23
+
24
+ import numpy as np
25
+ from transformers.configuration_utils import PretrainedConfig
26
+ from transformers.modeling_rope_utils import rope_config_validation
27
+
28
+
29
+ class PldrllmConfig(PretrainedConfig):
30
+ r"""
31
+ This is the configuration class to store the configuration of a [`PldrllmModel`]. It is used to instantiate a PLDR-LLM
32
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
33
+ defaults will yield a similar configuration to that of the PLDR-LLM-v51-110M-3.
34
+ e.g. [fromthesky/PLDR-LLM-v51-110M-3](https://huggingface.co/fromthesky/PLDR-LLM-v51-110M-3)
35
+ Check out these papers for the details of PLDR-LLM architecture:
36
+ [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
37
+
38
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
+ documentation from [`PretrainedConfig`] for more information.
40
+
41
+ Args:
42
+ vocab_size (`int`, *optional*, defaults to 32000):
43
+ Vocabulary size of the PLDR-LLM model. Defines the number of different tokens that can be represented by the
44
+ `inputs_ids` passed when calling [`PldrllmModel`]
45
+ hidden_size (`int`, *optional*, defaults to 896):
46
+ Dimension of the hidden representations. If set to None, hidden_size is calculated from
47
+ num_attention_heads and head_dim.
48
+ intermediate_size (`int`, *optional*, defaults to 2389):
49
+ Dimension of the Pointwise Feed Forward Network representations. If set to None, intermediate_size is calculated from
50
+ num_attention_heads and head_dim.
51
+ num_hidden_layers (`int`, *optional*, defaults to 5):
52
+ Number of hidden layers in the Transformer decoder.
53
+ num_attention_heads (`int`, *optional*, defaults to 14):
54
+ Number of attention heads for each attention layer in the Transformer decoder.
55
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
56
+ The non-linear activation function (function or string) in the decoder.
57
+ max_position_embeddings (`int`, *optional*, defaults to 1024):
58
+ The maximum sequence length (context length) for the PLDR-LLM. PLDR-LLM-v51-110M-3 supports up to 1024.
59
+ initializer_range (`float`, *optional*, defaults to 0.02):
60
+ Intended as the standard deviation of the truncated_normal_initializer for initializing all weight matrices.
61
+ This parameter is not used for initialization of the PLDR-LLM module weights in favor of xavier_uniform_ initialization.
62
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
63
+ The epsilon used by the layer normalization layers.
64
+ use_cache (`bool`, *optional*, defaults to `True`):
65
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
66
+ relevant if `config.is_decoder=True`.
67
+ pad_token_id (`int`, *optional*):
68
+ Padding token id.
69
+ bos_token_id (`int`, *optional*, defaults to 2):
70
+ Beginning of stream token id.
71
+ eos_token_id (`int`, *optional*, defaults to 3):
72
+ End of stream token id.
73
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
74
+ Whether to tie weight embeddings.
75
+ rope_theta (`float`, *optional*, defaults to 10000.0):
76
+ The base period of the RoPE embeddings.
77
+ rope_scaling (`Dict`, *optional*):
78
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
79
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
80
+ accordingly.
81
+ Expected contents:
82
+ `rope_type` (`str`):
83
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
84
+ 'llama3'], with 'default' being the original RoPE implementation.
85
+ `factor` (`float`, *optional*):
86
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
87
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
88
+ original maximum pre-trained length.
89
+ `original_max_position_embeddings` (`int`, *optional*):
90
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
91
+ pretraining.
92
+ `attention_factor` (`float`, *optional*):
93
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
94
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
95
+ `factor` field to infer the suggested value.
96
+ `beta_fast` (`float`, *optional*):
97
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
98
+ ramp function. If unspecified, it defaults to 32.
99
+ `beta_slow` (`float`, *optional*):
100
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
101
+ ramp function. If unspecified, it defaults to 1.
102
+ `short_factor` (`list[float]`, *optional*):
103
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
104
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
105
+ size divided by the number of attention heads divided by 2
106
+ `long_factor` (`list[float]`, *optional*):
107
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
108
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
109
+ size divided by the number of attention heads divided by 2
110
+ `low_freq_factor` (`float`, *optional*):
111
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
112
+ `high_freq_factor` (`float`, *optional*):
113
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
114
+ attention_bias (`bool`, *optional*, defaults to `True`):
115
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
116
+ glu_bias (`bool`, *optional*, defaults to `True`):
117
+ Whether to use a bias in Gated Linear Units used in Pointwise Feedforward Network and Residual Layers for
118
+ the metric learner.
119
+ final_bias (`bool`, *optional*, defaults to `True`):
120
+ Whether to use a bias in the LM head layer of the PldrllmForCausalLM implementation.
121
+ attention_dropout (`float`, *optional*, defaults to 0.0):
122
+ The dropout ratio for the attention probabilities.
123
+ head_dim (`int`, *optional*, defaults to 64):
124
+ The attention head dimension.
125
+ reference_rope (`bool`, *optional*, defaults to `True`):
126
+ Whether to use the rotary positional embedding implementation used in the reference paper implementing the
127
+ PLDR-LLM in pytorch. Check out [this paper](https://huggingface.co/papers/2502.13502).
128
+ num_reslayerA (`int`, *optional*, defaults to 8):
129
+ Number of residual layers in the metric learner section of the power law graph attention layer.
130
+ num_denseA (`int`, *optional*, defaults to 2):
131
+ Number of gated linear units in each residual layer in the metric learner section of the power law graph attention layer.
132
+ A_dff (`int`, *optional*, defaults to 170):
133
+ The dimension of hidden layer in the gated linear unit for the residual metric learner. Input and output dimensions
134
+ are set at head_dim.
135
+ custom_G_type (`str`, *optional*, defaults to None):
136
+ PLDR-LLM supports predefined energy-curvature tensor (G) values that can bypass the metric learner section during training and
137
+ inference. This assigns the decoder.past_G_values attribute to a predefined value. This is useful for experimentation and assigning
138
+ an already learned energy-curvature tensor. The StaticCache is supported only for predefined past_G_values.
139
+ None: G values are learned during training and inferred by the residual metric learner at least once (depending on use_cache status).
140
+ past_G_values has shape (num_layers, 3, batch_size, num_heads, head_dim, head_dim).
141
+ 'identity': decoder.past_G_values are assigned to identity matrix and metric learner layer is not part of the model. This setting is equivalent to
142
+ an LLM with Scaled Dot Product Attention (SDPA). The decoder.past_G_values are saved with the model.
143
+ 'random': decoder.past_G_values are assigned to randomly initialized matrix from a normal distribution. This setting is equivalent to
144
+ an LLM with Scaled Dot Product Attention (SDPA). The decoder.past_G_values are saved with the model.
145
+ 'external': decoder.past_G_values are expected to be assigned after initializing/loading the PLDR-LLM weights. decoder.past_G_values[:, 2,...].
146
+ are initialized to identity matrix by default. The expected shape of input is (num_layers, 3, 1, num_heads, head_dim, head_dim) and
147
+ [:, 2,...] must have the predefined energy-curvature tensor values. Other entries are set to zero tensor by default.
148
+ cache_first_G (`bool`, *optional*, defaults to `False`):
149
+ Whether or not the model should return the G values from first sample in a batch or G values from all samples for past_G_values initialization.
150
+ When `cache_first_G=True`, the batch_size of past_G_values is 1. This argument should be set to `True` for contrastive text generation
151
+ with learned G values.
152
+
153
+ output_pldr_attentions (`bool`, *optional*, defaults to `False`):
154
+ Whether to return the deductive outputs and learnable parameters of power law graph attention module as tuple containing:
155
+ the output of the residual metric learner (metric tensor, A), output (A_LM) after application of iSwiGLU on metric tensor, learned
156
+ exponents of potential tensor, learned weights for energy-curvature tensor, learned bias for
157
+ energy-curvature tensor, energy-curvature tensor (G_LM), and attention weights.
158
+
159
+ ```python
160
+ >>> from transformers import PldrllmModel, PldrllmConfig
161
+
162
+ >>> # Initializing a PLDR-LLM PLDR-LLM-v51-110M-3 style configuration
163
+ >>> configuration = PldrllmConfig()
164
+
165
+ >>> # Initializing a model from the PLDR-LLM-v51-110M-3 style configuration
166
+ >>> model = PldrllmModel(configuration)
167
+
168
+ >>> # Accessing the model configuration
169
+ >>> configuration = model.config
170
+ ```"""
171
+
172
+ model_type = "pldrllm"
173
+ keys_to_ignore_at_inference = ["past_key_values"]
174
+
175
+ def __init__(
176
+ self,
177
+ vocab_size=32000,
178
+ hidden_size=896,
179
+ intermediate_size=2389,
180
+ num_hidden_layers=5,
181
+ num_attention_heads=14,
182
+ hidden_act="silu",
183
+ max_position_embeddings=1024,
184
+ initializer_range=0.02,
185
+ layer_norm_eps=1e-6, #hard coded
186
+ use_cache=True,
187
+ output_pldr_attentions=False,
188
+ pad_token_id=0,
189
+ bos_token_id=2,
190
+ eos_token_id=3,
191
+ tie_word_embeddings=False, #hard coded
192
+ rope_theta=10000.0, #hard coded
193
+ rope_scaling=None, #hard coded
194
+ attention_bias=True, #hard coded
195
+ glu_bias=True, #hard coded
196
+ final_bias=True, #hard coded
197
+ reference_rope=True,
198
+ attention_dropout=0.0, #hard coded
199
+ head_dim=64,
200
+ num_reslayerA=8,
201
+ num_denseA=2,
202
+ A_dff=170,
203
+ custom_G_type=None,
204
+ cache_first_G=False,
205
+ **kwargs,
206
+ ):
207
+ super().__init__(
208
+ pad_token_id=pad_token_id,
209
+ bos_token_id=bos_token_id,
210
+ eos_token_id=eos_token_id,
211
+ tie_word_embeddings=tie_word_embeddings,
212
+ **kwargs,
213
+ )
214
+ self.vocab_size = vocab_size
215
+ self.max_position_embeddings = max_position_embeddings
216
+ self.hidden_size = hidden_size if hidden_size is not None else int(num_attention_heads*head_dim)
217
+ self.intermediate_size = intermediate_size if intermediate_size is not None else int(np.floor(num_attention_heads*head_dim*4*2/3))
218
+ self.num_hidden_layers = num_hidden_layers
219
+ self.num_attention_heads = num_attention_heads
220
+ self.num_reslayerA=num_reslayerA
221
+ self.num_denseA=num_denseA
222
+ self.A_dff=A_dff
223
+ self.glu_bias=glu_bias
224
+ self.attention_bias = attention_bias
225
+ self.final_bias=final_bias
226
+ self.initializer_range=initializer_range
227
+
228
+ self.hidden_act = hidden_act
229
+ self.layer_norm_eps = layer_norm_eps
230
+ self.use_cache = use_cache
231
+ self.output_pldr_attentions=output_pldr_attentions
232
+ self.rope_theta = rope_theta
233
+ self.rope_scaling = rope_scaling
234
+ self.reference_rope=reference_rope
235
+ self.custom_G_type=custom_G_type
236
+ self.cache_first_G=cache_first_G
237
+ self.attention_dropout = attention_dropout
238
+ self.head_dim = head_dim
239
+ # Validate the correctness of rotary position embeddings parameters
240
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
241
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
242
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
243
+ rope_config_validation(self)
244
+
245
+
246
+
247
+
248
+ __all__ = ["PldrllmConfig"]
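A short sketch of the derived-size behavior documented above: when `hidden_size` or `intermediate_size` is `None`, `__init__` computes them from `num_attention_heads` and `head_dim`, and `custom_G_type` can bypass the metric learner. It assumes `configuration_pldrllm.py` is importable from the working directory.

```python
from configuration_pldrllm import PldrllmConfig

# When set to None, the sizes are derived in __init__ (see above).
cfg = PldrllmConfig(hidden_size=None, intermediate_size=None,
                    num_attention_heads=14, head_dim=64)
print(cfg.hidden_size)        # 14 * 64 = 896
print(cfg.intermediate_size)  # floor(14 * 64 * 4 * 2 / 3) = 2389

# Per the docstring, 'identity' drops the metric learner and behaves like an
# LLM with scaled dot product attention.
cfg_identity = PldrllmConfig(custom_G_type="identity")
```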
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "eos_token_id": 3,
+   "pad_token_id": 0,
+   "transformers_version": "4.56.1"
+ }
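These generation defaults mirror the token ids in config.json. A hedged sketch for inspecting them (repository id illustrative, as in the earlier example):

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("fromthesky/PLDR-LLM-v51-110M-3")
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # 2 3 0
```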
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f6bf5c3e06a235445222a31287cf8ccc8a73f762247c045346ceca8d9e82446
+ size 438844096
modeling_pldrllm.py ADDED
@@ -0,0 +1,1622 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 Fromthesky Research Labs, LLC. All rights reserved.
3
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code uses the Llama model implementation by Eleuther AI
6
+ # and Huggingface teams in this library as a starting point and implements
7
+ # the PLDR-LLM (Large Language Model from Power Law Decoder Representations)
8
+ # architecture based on its implementation by the Fromthesky Research Labs team.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+ from typing import Callable, Optional, Union
23
+
24
+ import torch
25
+ from torch import nn
26
+ import torch.nn.functional as F
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
30
+ from transformers.generation import GenerationMixin
31
+ from transformers.masking_utils import create_causal_mask
32
+ from transformers.modeling_layers import GradientCheckpointingLayer
33
+
34
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
35
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
36
+ from transformers.processing_utils import Unpack
37
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
38
+ from .configuration_pldrllm import PldrllmConfig
39
+
40
+ from dataclasses import dataclass
41
+ from transformers.utils import ModelOutput
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+ ################## PLDRLLM POWER LAW GRAPH ATTENTION IMPLEMENTATION ########################################
46
+
47
+ '''
48
+ Power law attention implementation for PLDR-LLM with KV-cache and G-cache.
49
+ '''
50
+
51
+ class PlgaLayer(nn.Module):
52
+ '''
53
+ Power law graph attention layer implementation.
54
+ '''
55
+ def __init__(self, config:PldrllmConfig,
56
+ F_hidden:int,
57
+ F_heads:int,
58
+ layer_idx:int,
59
+ device=None,
60
+ **kwargs)->None:
61
+ '''
62
+ Args:
63
+ F_hidden: hidden layer shape used in layer weight creation. For multi-head plga this is head_dim.
64
+ F_heads: Number of attention heads.
65
+ layer_idx: index for the decoder layer.
66
+ device: device(cpu or gpu) to load tensors.
67
+ '''
68
+
69
+ super().__init__(**kwargs)
70
+ self.F_hidden=F_hidden
71
+ self.F_heads=F_heads
72
+ self.layer_idx=layer_idx
73
+ self.device=device
74
+ self.config=config
75
+ self.is_causal = True
76
+ self.custom_G_type=config.custom_G_type
77
+ self.attention_dropout=config.attention_dropout
78
+
79
+ # default type is set as config.torch_dtype
80
+ self.wdtype=None
81
+
82
+ if self.custom_G_type is None:
83
+ self.build_weights()
84
+ else:
85
+ self.Wlst = None
86
+ self.blst = None
87
+ self.pwlst = None
88
+ self.alst = None
89
+ self.balst = None
90
+
91
+
92
+
93
+ def cg_align_one(self, Hin:torch.Tensor,
94
+ Hk:torch.Tensor,
95
+ Hv:torch.Tensor,
96
+ A:torch.Tensor,
97
+ a_vec:Optional[torch.Tensor],
98
+ ba:Optional[torch.Tensor],
99
+ W:Optional[torch.Tensor],
100
+ b:Optional[torch.Tensor],
101
+ pw:Optional[torch.Tensor],
102
+ past_G_values: Optional[torch.Tensor],
103
+ past_G_values_status: Optional[torch.BoolTensor]=None,
104
+ mask:Optional[torch.Tensor]=None,
105
+ use_cache: Optional[bool]=None,
106
+ **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
107
+ '''
108
+ Alignment model for calculating attention weights
109
+ Args:
110
+ Hin: query
111
+ Hk: key
112
+ A: metric tensor instance
113
+ a_vec: learned coupling coefficients.
114
+ ba: bias for coupling coefficients
115
+ W: weights applied on metric tensor before AdjActivation
116
+ b: bias applied on metric tensor before AdjActivation
117
+ pw: learned power exponents applied on metric tensor
118
+ mask: padding or lookahead mask
119
+ Returns:
120
+ Hout: Attention output.
121
+ A tuple of:
122
+ A: metric tensor as output of residual metric learner layer, A
123
+ AW: metric tensor after AdjActivation is applied, A_LM
124
+ pw: learned power exponents
125
+ a_vec: learned coupling coefficients for energy-curvature tensor
126
+ ba: bias for energy-curvature tensor
127
+ avAp: Energy curvature tensor, G_LM
128
+ E: attention weights
129
+ '''
130
+
131
+ if self.custom_G_type is None and not (use_cache and past_G_values_status[self.layer_idx]):
132
+
133
+ AdjActivation=iSwiGLU
134
+ epsilonAdj=1e-9
135
+
136
+ # make metric tensor positive definite
137
+ AW=AdjActivation(torch.matmul(W,A)+b)+epsilonAdj
138
+
139
+ # find energy curvature tensor and attention weights
140
+ Ap=torch.pow(AW, pw)
141
+ avAp=torch.matmul(a_vec, Ap)+ba # [batch_size, num_head, depth, depth]
142
+
143
+ if use_cache:
144
+ # update only once if cache is enabled.
145
+ G_batch_size=past_G_values.size()[2]
146
+ past_G_values[self.layer_idx]=torch.stack([A[:G_batch_size,:,:,:],
147
+ AW[:G_batch_size,:,:,:],
148
+ avAp[:G_batch_size,:,:,:]], dim=0) # [3, batch_size, num_head, depth, depth]
149
+ past_G_values_status[self.layer_idx]=True
150
+ else:
151
+ AW=past_G_values[self.layer_idx, 1]
152
+ avAp=past_G_values[self.layer_idx, 2]
153
+
154
+ WHiWHj = torch.matmul(Hin, avAp) # [batch_size, num_head, seq_lenq, depth]
155
+
156
+ # scale attention with square root of depth
157
+ dk=torch.tensor(self.F_hidden).to(Hin.dtype)
158
+ scaling=1/torch.sqrt(dk)
159
+
160
+ attention_interface: Callable = eager_attention_forward
161
+ if self.config._attn_implementation != "eager":
162
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
163
+
164
+ query, key, value = WHiWHj.to(dtype=Hk.dtype), Hk, Hv
165
+
166
+ Hout, E = attention_interface(
167
+ self,
168
+ query=query,
169
+ key=key,
170
+ value=value,
171
+ attention_mask=mask,
172
+ dropout=0.0 if not self.training else self.attention_dropout,
173
+ scaling=scaling,
174
+ **kwargs
175
+ )
176
+
177
+ return Hout, (A, AW, pw, a_vec, ba, avAp, E)
178
+
179
+ def cg_align_head(self, Hin:torch.Tensor,
180
+ Hk:torch.Tensor,
181
+ Hv:torch.Tensor,
182
+ A:torch.Tensor,
183
+ mask:Optional[torch.Tensor]=None,
184
+ past_G_values: Optional[torch.Tensor]=None,
185
+ past_G_values_status: Optional[torch.BoolTensor]=None,
186
+ use_cache: Optional[bool]=None,
187
+ **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
188
+ '''
189
+ Method for linear propagation of attention weights over values.
190
+ '''
191
+
192
+ Hout, att_weights=self.cg_align_one(Hin=Hin, Hk=Hk, Hv=Hv, A=A,
193
+ a_vec=self.alst,
194
+ ba=self.balst,
195
+ W=self.Wlst,
196
+ b=self.blst,
197
+ pw=self.pwlst,
198
+ mask=mask,
199
+ past_G_values=past_G_values,
200
+ past_G_values_status=past_G_values_status,
201
+ use_cache=use_cache,
202
+ **kwargs)
203
+
204
+ return Hout, att_weights
205
+
206
+
207
+
208
+ def build_weights(self)->None:
209
+ '''
210
+ Used to initialize learnable parameters for the layer:
211
+ W: weights to apply on metric tensor.
212
+ b: bias to apply on metric tensor.
213
+ a: coupling coefficients for energy-curvature (G) tensor.
214
+ ba: bias for energy-curvature tensor.
215
+ pw: power exponent weights for potential tensor.
216
+ '''
217
+
218
+ weight_shape=[self.F_heads, self.F_hidden, self.F_hidden] # [num_heads, depth, depth]
219
+
220
+ add_weight_Wpart= torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
221
+ add_weight_bpart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
222
+ add_weight_pwpart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
223
+ add_weight_apart = torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
224
+ add_weight_bapart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
225
+
226
+ self.Wlst = nn.Parameter(add_weight_Wpart, requires_grad=True)
227
+ self.blst = nn.Parameter(add_weight_bpart, requires_grad=True)
228
+ self.pwlst = nn.Parameter(add_weight_pwpart, requires_grad=True)
229
+ self.alst = nn.Parameter(add_weight_apart, requires_grad=True)
230
+ self.balst = nn.Parameter(add_weight_bapart, requires_grad=True)
231
+
232
+
233
+ def forward(self, inputs:tuple[torch.Tensor,...],
234
+ past_G_values: Optional[torch.Tensor]=None,
235
+ past_G_values_status: Optional[torch.BoolTensor]=None,
236
+ use_cache:Optional[bool]=False,
237
+ **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
238
+ '''
239
+ execute the forward propagation
240
+ inputs[0] = query = Hin
241
+ inputs[1] = key = Hk
242
+ inputs[2] = value = Hv
243
+ inputs[3] = metric tensor = A
244
+ inputs[4] = mask
245
+ '''
246
+
247
+ Hin, Hk, Hv, A, mask=inputs
248
+ H_next, att_weights = self.cg_align_head(Hin=Hin, Hk=Hk, Hv=Hv, A=A, mask=mask,
249
+ past_G_values=past_G_values,
250
+ past_G_values_status=past_G_values_status,
251
+ use_cache=use_cache, **kwargs)
252
+ return H_next, att_weights
253
+
254
+ def eager_attention_forward(
255
+ module: nn.Module,
256
+ query: torch.Tensor,
257
+ key: torch.Tensor,
258
+ value: torch.Tensor,
259
+ attention_mask: Optional[torch.Tensor],
260
+ scaling: float,
261
+ dropout: float = 0.0,
262
+ **kwargs:Unpack[TransformersKwargs],
263
+ )->tuple[torch.Tensor, torch.Tensor]:
264
+
265
+ keyt=torch.permute(key, [0, 1, 3, 2]) # [batch_size, num_head, depth, seq_lenk]
266
+ attn_weights = torch.matmul(query, keyt) * scaling # [batch_size, num_head, seq_lenq, seq_lenk]
267
+ if attention_mask is not None:
268
+ causal_mask = attention_mask[:, :, :, : key.shape[-2]]
269
+ attn_weights = attn_weights + causal_mask
270
+
271
+ attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
272
+ attn_weights = F.dropout(attn_weights, p=dropout, training=module.training)
273
+ attn_output = torch.matmul(attn_weights, value)
274
+ attn_output = torch.permute(attn_output, [0, 2, 1, 3])
275
+ attn_output = attn_output.contiguous()
276
+
277
+ return attn_output, attn_weights
278
+
279
+ def iSwiGLU(x):
280
+ '''SwiGLU activation function with weights W,V equal to identity matrix and no bias.'''
281
+ gate=F.silu(x)
282
+ out=torch.mul(x, gate)
283
+ return out
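+ # Note: iSwiGLU(x) = x * silu(x) = x^2 * sigmoid(x), which is elementwise non-negative;
+ # the epsilonAdj offset added in cg_align_one then keeps the activated metric tensor entries strictly positive.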
284
+
285
+ ################################### END OF PLDRLLM POWER LAW GRAPH ATTENTION IMPLEMENTATION ############################################
286
+
287
+ #################################### PLDR-LLM MODEL IMPLEMENTATION ################################################################
288
+
289
+ '''
290
+ Model Implementation for Large Language Model from Power Law Decoder Representations with KV-cache and G-cache.
291
+ '''
292
+
293
+ class PldrllmAttention(nn.Module):
294
+ '''
295
+ Power Law Multihead Attention Implementation for PLDR-LLM.
296
+ '''
297
+ def __init__(self,config: PldrllmConfig,
298
+ layer_idx:int,
299
+ device=None,
300
+ **kwargs)->None:
301
+
302
+
303
+ super().__init__(**kwargs)
304
+ self.num_heads = config.num_attention_heads
305
+ self.d_model = config.hidden_size
306
+ self.A_dff = config.A_dff
307
+ self.num_denseA = config.num_denseA
308
+ self.num_reslayerA = config.num_reslayerA
309
+ self.activation=ACT2FN[config.hidden_act]
310
+ self.max_seq_len=config.max_position_embeddings
311
+ self.layer_idx=layer_idx
312
+ self.device=device
313
+ self.attention_bias=config.attention_bias
314
+ self.custom_G_type=config.custom_G_type
315
+ self.layer_norm_eps=config.layer_norm_eps
316
+ self.glu_bias=config.glu_bias
317
+ self.reference_rope=config.reference_rope
318
+ self.wdtype=None
319
+
320
+ assert self.d_model % self.num_heads == 0
321
+ self.depth = config.head_dim
322
+
323
+ self.wq = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)
324
+ self.wk = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)
325
+ self.wv = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)
326
+
327
+ self.plgatt_layer= PlgaLayer(config=config,
328
+ F_hidden=self.depth,
329
+ F_heads= self.num_heads,
330
+ layer_idx=self.layer_idx,
331
+ device=self.device)
332
+
333
+ self.dense = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)
334
+
335
+ if self.custom_G_type is None:
336
+ # residual layers for metric tensor learning
337
+ self.reslayerAs=nn.ModuleList([ResLayerA(depth=self.depth,
338
+ A_dff=self.A_dff,
339
+ num_denseA=self.num_denseA,
340
+ layer_norm_eps=self.layer_norm_eps,
341
+ glu_bias=self.glu_bias,
342
+ activation=self.activation,
343
+ device=self.device,
344
+ dtype=self.wdtype) for _ in range(self.num_reslayerA)])
345
+
346
+ self.layernorm1 = nn.LayerNorm(self.depth, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype)
347
+
348
+ if self.reference_rope:
349
+ # keep initialization and forward in same module for reference rope implementation
350
+ self.rotary_embedding=RotaryPositionalEmbeddings(dim=self.depth,
351
+ max_seq_len=self.max_seq_len,
352
+ base=config.rope_theta
353
+ ).to(device=self.device, dtype=self.wdtype)
354
+
355
+
356
+
357
+ def split_heads(self, x, batch_size):
358
+ '''
359
+ Split the last dimension into (num_heads, depth).
360
+ '''
361
+ x = x.view(batch_size, -1, self.num_heads, self.depth)
362
+ return x # [batch_size, seq_len, num_heads, depth]
363
+
364
+ def forward(self, inputs:tuple[torch.Tensor, ...],
365
+ position_embeddings:torch.Tensor,
366
+ position_ids: Optional[torch.LongTensor]=None,
367
+ cache_position:Optional[torch.LongTensor]=None,
368
+ past_G_values: Optional[torch.Tensor]=None,
369
+ past_G_values_status: Optional[torch.BoolTensor]=None,
370
+ past_key_values: Optional[Cache]=None,
371
+ use_cache:Optional[bool]=None,
372
+ **kwargs: Unpack[TransformersKwargs]
373
+ )->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
374
+
375
+ q, k, v, mask = inputs
376
+ batch_size = q.size()[0]
377
+
378
+ q = self.wq(q) # [batch_size, seq_len, d_model]
379
+ k = self.wk(k)
380
+ v = self.wv(v)
381
+
382
+
383
+ q = self.split_heads(q, batch_size) # [batch_size, seq_len, num_heads, depth]
384
+ k = self.split_heads(k, batch_size)
385
+ v = self.split_heads(v, batch_size)
386
+
387
+
388
+ if position_embeddings is not None:
389
+ cos, sin = position_embeddings
390
+ q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, unsqueeze_dim=2)
391
+ else:
392
+ q=self.rotary_embedding(q, input_pos=position_ids)
393
+ k=self.rotary_embedding(k, input_pos=position_ids)
394
+
395
+ q = torch.permute(q, [0, 2, 1, 3]) # [batch_size, num_heads, seq_len, depth]
396
+ k = torch.permute(k, [0, 2, 1, 3])
397
+ v = torch.permute(v, [0, 2, 1, 3])
398
+
399
+ if self.custom_G_type is None and not (use_cache and past_G_values_status[self.layer_idx]):
400
+ # Calculate density matrix using linear self attention
401
+ qt = torch.permute(q, [0, 1, 3, 2])
402
+ A = torch.matmul(qt, q) # [batch_size, num_head, depth, depth]
403
+ A=self.layernorm1(A)
404
+
405
+ #Deep residual network for learning metric tensor
406
+ for i in range(self.num_reslayerA):
407
+ A=self.reslayerAs[i]([A])
408
+ else:
409
+ A=past_G_values[self.layer_idx,0] # [1, num_head, depth, depth]
410
+
411
+ if use_cache:
412
+ #cache position for static cache
413
+ cache_kwargs = {"cache_position": cache_position}
414
+ k, v = past_key_values.update(key_states=k, value_states=v, layer_idx=self.layer_idx, cache_kwargs=cache_kwargs)
415
+
416
+ #Apply multi-head power law attention
417
+ Hnext, att_weights = self.plgatt_layer((q, k, v, A, mask),
418
+ past_G_values,
419
+ past_G_values_status,
420
+ use_cache, **kwargs)
421
+
422
+ Hnext= Hnext.reshape(batch_size, -1, self.d_model) # [batch_size, seq_len, d_model]
423
+
424
+ output = self.dense(Hnext)
425
+
426
+ return output, att_weights
427
+
428
+
429
+ class PLDR_DecoderLayer(GradientCheckpointingLayer):
430
+ '''
431
+ Single decoder layer implementation for PLDR-LLM with single masked multihead attention.
432
+ '''
433
+ def __init__(self, config: PldrllmConfig,
434
+ layer_idx:int,
435
+ device=None,
436
+ **kwargs)->None:
437
+
438
+ super().__init__(**kwargs)
439
+
440
+ self.d_model=config.hidden_size
441
+ self.num_heads=config.num_attention_heads
442
+ self.dff=config.intermediate_size
443
+ self.A_dff=config.A_dff
444
+ self.num_denseA = config.num_denseA
445
+ self.num_reslayerA = config.num_reslayerA
446
+ self.activation=ACT2FN[config.hidden_act]
447
+ self.max_seq_len=config.max_position_embeddings
448
+ self.layer_idx=layer_idx
449
+ self.device=device
450
+ self.layer_norm_eps=config.layer_norm_eps
451
+ self.glu_bias=config.glu_bias
452
+ self.wdtype=None
453
+
454
+ self.mha1 = PldrllmAttention(config=config, layer_idx=layer_idx, device=self.device)
455
+
456
+ self.ffn = self.dec_point_wise_feed_forward_network()
457
+
458
+ self.layernorm1 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype)
459
+ self.layernorm2 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype)
460
+
461
+ def forward(self,
462
+ hidden_states:torch.Tensor,
463
+ look_ahead_mask:torch.Tensor,
464
+ position_embeddings:torch.Tensor,
465
+ position_ids:Optional[torch.LongTensor]=None,
466
+ cache_position:Optional[torch.LongTensor]=None,
467
+ use_cache:Optional[bool]=None,
468
+ past_key_values:Optional[Cache]=None,
469
+ past_G_values:Optional[torch.Tensor]=None,
470
+ past_G_values_status:Optional[list[bool]]=None,
471
+ **kwargs:Unpack[TransformersKwargs]
472
+ )->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
473
+
474
+ attn1, att_weights = self.mha1(inputs=[hidden_states, hidden_states, hidden_states, look_ahead_mask],
475
+ position_embeddings=position_embeddings,
476
+ position_ids=position_ids,
477
+ cache_position=cache_position,
478
+ past_key_values=past_key_values,
479
+ past_G_values=past_G_values,
480
+ past_G_values_status=past_G_values_status,
481
+ use_cache=use_cache,
482
+ **kwargs
483
+ )
484
+ out1 = self.layernorm1(attn1 + hidden_states)
485
+
486
+ ffn_output = self.ffn(out1)
487
+ out2 = self.layernorm2(ffn_output + out1) # [batch_size, target_seq_len, d_model]
488
+
489
+ return out2, att_weights
490
+
491
+
492
+ # GLUVariant implementation for feedforward network, scale dff accordingly (i.e., 2/3 of original).
493
+ def dec_point_wise_feed_forward_network(self):
494
+ return GLUVariant(self.d_model, self.dff, self.d_model,
495
+ glu_bias=self.glu_bias,
496
+ activation=self.activation,
497
+ device=self.device,
498
+ dtype=self.wdtype)
499
+
500
+
501
+ class ResLayerA(nn.Module):
502
+ '''
503
+ Residual Layer implementation for metric learner of PLDR-LLM
504
+ '''
505
+ def __init__(self, depth:int,
506
+ A_dff:int,
507
+ num_denseA:int,
508
+ layer_norm_eps:float,
509
+ glu_bias:bool,
510
+ activation:Callable=F.silu,
511
+ device=None,
512
+ dtype=None,
513
+ **kwargs)->None:
514
+ super().__init__(**kwargs)
515
+ self.depth=depth
516
+ self.A_dff = A_dff
517
+ self.num_denseA = num_denseA
518
+ self.activation=activation
519
+ self.device=device
520
+ self.layer_norm_eps=layer_norm_eps
521
+ self.glu_bias=glu_bias
522
+
523
+ self.denseAs = nn.ModuleList([GLUVariant(self.depth, self.A_dff, self.depth,
524
+ glu_bias=self.glu_bias,
525
+ activation=self.activation,
526
+ device=self.device,
527
+ dtype=dtype) for _ in range(self.num_denseA)])
528
+
529
+ self.layernormA = nn.LayerNorm(self.depth, eps=self.layer_norm_eps, device=self.device, dtype=dtype)
530
+ self.identity=nn.Identity()
531
+
532
+ def ResUnit(self, A:torch.Tensor)->torch.Tensor:
533
+ Ain = self.identity(A)
534
+ for i in range(self.num_denseA):
535
+ A = self.denseAs[i](A)
536
+ A = self.layernormA(A + Ain)
537
+ return A
538
+
539
+ def forward(self, inputs:list[torch.Tensor], **kwargs)->torch.Tensor:
540
+ A=inputs[0]
541
+ return self.ResUnit(A)
542
+
543
+
544
+ class GLUVariant(nn.Module):
545
+ '''
546
+ Implementation of GLU variants with default activation for SwiGLU configuration
547
+ For the hidden layer dff, to match size with non-SwiGLU FFN version scaling with 2/3 may be useful.
548
+ '''
549
+ def __init__(self, d_model:int,
550
+ dff:int,
551
+ depth:int,
552
+ glu_bias:bool,
553
+ activation:Callable=F.silu,
554
+ device=None,
555
+ dtype=None,
556
+ **kwargs)->None:
557
+ super().__init__(**kwargs)
558
+ self.dff=dff
559
+ self.depth=depth
560
+ self.d_model=d_model
561
+ self.activation=activation
562
+ self.device=device
563
+ self.glu_bias=glu_bias
564
+
565
+ self.gluw1=nn.Linear(self.d_model, self.dff, bias=self.glu_bias, device=self.device, dtype=dtype)
566
+ self.gluw2=nn.Linear(self.d_model, self.dff, bias=self.glu_bias, device=self.device, dtype=dtype)
567
+ self.gluw3=nn.Linear(self.dff, self.depth, bias=self.glu_bias, device=self.device, dtype=dtype)
568
+
569
+ def forward(self, input:torch.Tensor, **kwargs)->torch.Tensor:
570
+ x1=self.gluw1(input)
571
+ x1=self.activation(x1)
572
+ x2=self.gluw2(input)
573
+ return self.gluw3(torch.mul(x1, x2))
574
+
575
+
576
+ ###################################### END OF PLDRLLM MODEL IMPLEMENTATION #####################################################
577
+
578
+
579
+ # RotaryPositionalEmbeddings is from https://github.com/pytorch/torchtune/blob/main/torchtune/modules/position_embeddings.py
580
+ # This implementation was used in the original pytorch based implementation of PLDR-LLM.
581
+ class RotaryPositionalEmbeddings(nn.Module):
582
+ """
583
+ This class implements Rotary Positional Embeddings (RoPE)
584
+ proposed in https://arxiv.org/abs/2104.09864.
585
+
586
+ Reference implementation (used for correctness verfication)
587
+ can be found here:
588
+ https://github.com/meta-llama/llama/blob/main/llama/model.py#L80
589
+
590
+ In this implementation we cache the embeddings for each position up to
591
+ ``max_seq_len`` by computing this during init.
592
+
593
+ Args:
594
+ dim (int): Embedding dimension. This is usually set to the dim of each
595
+ head in the attention module computed as ``embed_dim // num_heads``
596
+ max_seq_len (int): Maximum expected sequence length for the
597
+ model, if exceeded the cached freqs will be recomputed
598
+ base (int): The base for the geometric progression used to compute
599
+ the rotation angles
600
+ """
601
+
602
+ def __init__(
603
+ self,
604
+ dim: int,
605
+ max_seq_len: int = 4096,
606
+ base: int = 10_000,
607
+ ) -> None:
608
+ super().__init__()
609
+ self.dim = dim
610
+ self.base = base
611
+ self.max_seq_len = max_seq_len
612
+ self.rope_init()
613
+
614
+ def rope_init(self):
615
+ theta = 1.0 / (
616
+ self.base
617
+ ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim)
618
+ )
619
+ self.register_buffer("theta", theta, persistent=False)
620
+ self.build_rope_cache(self.max_seq_len)
621
+
622
+ def build_rope_cache(self, max_seq_len: int = 4096) -> None:
623
+ # Create position indexes `[0, 1, ..., max_seq_len - 1]`
624
+ seq_idx = torch.arange(
625
+ max_seq_len, dtype=self.theta.dtype, device=self.theta.device
626
+ )
627
+
628
+ # Outer product of theta and position index; output tensor has
629
+ # a shape of [max_seq_len, dim // 2]
630
+ idx_theta = torch.einsum("i, j -> ij", seq_idx, self.theta).float()
631
+
632
+ # cache includes both the cos and sin components and so the output shape is
633
+ # [max_seq_len, dim // 2, 2]
634
+ cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
635
+ self.register_buffer("cache", cache, persistent=False)
636
+
637
+ def forward(
638
+ self, x: torch.Tensor, *, input_pos: Optional[torch.Tensor] = None
639
+ ) -> torch.Tensor:
640
+ """
641
+ Args:
642
+ x (torch.Tensor): input tensor with shape
643
+ ``[b, s, n_h, h_d]``
644
+ input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
645
+ of each token. During training, this is used to indicate the positions
646
+ of each token relative to its sample when packed, shape [b, s].
647
+ During inference, this indicates the position of the current token.
648
+ If none, assume the index of the token is its position id. Default is None.
649
+
650
+ Returns:
651
+ torch.Tensor: output tensor with shape ``[b, s, n_h, h_d]``
652
+
653
+ Notation used for tensor shapes:
654
+ - b: batch size
655
+ - s: sequence length
656
+ - n_h: num heads
657
+ - h_d: head dim
658
+ """
659
+ # input tensor has shape [b, s, n_h, h_d]
660
+ seq_len = x.size(1)
661
+
662
+ # extract the values based on whether input_pos is set or not
663
+ rope_cache = (
664
+ self.cache[:seq_len] if input_pos is None else self.cache[input_pos]
665
+ )
666
+
667
+ # reshape input; the last dimension is used for computing the output.
668
+ # Cast to float to match the reference implementation
669
+ # tensor has shape [b, s, n_h, h_d // 2, 2]
670
+ xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
671
+
672
+ # reshape the cache for broadcasting
673
+ # tensor has shape [b, s, 1, h_d // 2, 2] if packed samples,
674
+ # otherwise has shape [1, s, 1, h_d // 2, 2]
675
+ rope_cache = rope_cache.view(-1, xshaped.size(1), 1, xshaped.size(3), 2)
676
+
677
+ # tensor has shape [b, s, n_h, h_d // 2, 2]
678
+ x_out = torch.stack(
679
+ [
680
+ xshaped[..., 0] * rope_cache[..., 0]
681
+ - xshaped[..., 1] * rope_cache[..., 1],
682
+ xshaped[..., 1] * rope_cache[..., 0]
683
+ + xshaped[..., 0] * rope_cache[..., 1],
684
+ ],
685
+ -1,
686
+ )
687
+
688
+ # tensor has shape [b, s, n_h, h_d]
689
+ x_out = x_out.flatten(3)
690
+ return x_out.type_as(x)
691
+
692
+
693
+
694
+ class PldrllmRotaryEmbedding(nn.Module):
695
+ def __init__(self, config: PldrllmConfig, device=None):
696
+ super().__init__()
697
+ # BC: "rope_type" was originally "type"
698
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
699
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
700
+ else:
701
+ self.rope_type = "default"
702
+ self.max_seq_len_cached = config.max_position_embeddings
703
+ self.original_max_seq_len = config.max_position_embeddings
704
+
705
+ self.config = config
706
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
707
+
708
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
709
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
710
+ self.original_inv_freq = self.inv_freq
711
+
712
+ @torch.no_grad()
713
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
714
+ def forward(self, x, position_ids):
715
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
716
+ position_ids_expanded = position_ids[:, None, :].float()
717
+
718
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
719
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
720
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
721
+ emb = torch.cat((freqs, freqs), dim=-1)
722
+ cos = emb.cos() * self.attention_scaling
723
+ sin = emb.sin() * self.attention_scaling
724
+
725
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
726
+
727
+
728
+ def rotate_half(x):
729
+ """Rotates half the hidden dims of the input."""
730
+ x1 = x[..., : x.shape[-1] // 2]
731
+ x2 = x[..., x.shape[-1] // 2 :]
732
+ return torch.cat((-x2, x1), dim=-1)
733
+
734
+
735
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
736
+ """Applies Rotary Position Embedding to the query and key tensors.
737
+
738
+ Args:
739
+ q (`torch.Tensor`): The query tensor.
740
+ k (`torch.Tensor`): The key tensor.
741
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
742
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
743
+ position_ids (`torch.Tensor`, *optional*):
744
+ Deprecated and unused.
745
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
746
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
747
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
748
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
749
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
750
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
751
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
752
+ Returns:
753
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
754
+ """
755
+ cos = cos.unsqueeze(unsqueeze_dim)
756
+ sin = sin.unsqueeze(unsqueeze_dim)
757
+ q_embed = (q * cos) + (rotate_half(q) * sin)
758
+ k_embed = (k * cos) + (rotate_half(k) * sin)
759
+ return q_embed, k_embed
760
+
761
+ ############# END OF ROTARY EMBEDDING IMPLEMENTATION #################################################
762
+
763
+ @dataclass
764
+ class BasePLDRModelOutputWithPast(ModelOutput):
765
+ """
766
+ Base class for [`PldrllmModel`] outputs that may also contain a past key/values (to speed up sequential decoding).
767
+
768
+ Args:
769
+ last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
770
+ Sequence of hidden-states at the output of the last layer of the model.
771
+
772
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
773
+ hidden_size)` is output.
774
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
775
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
776
+
777
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
778
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
779
+ input) to speed up sequential decoding.
780
+ hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
781
+ Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
782
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
783
+
784
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
785
+ attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
786
+ Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
787
+ sequence_length)`.
788
+
789
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
790
+ heads.
791
+ pldr_attentions (`tuple(tuple(torch.Tensor)))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
792
+ Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.
793
+
794
+ The tuple for each layer contains:
795
+ output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
796
+ output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
797
+ learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
798
+ learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
799
+ learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
800
+ energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
801
+ attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
802
+ """
803
+ last_hidden_state: Optional[torch.Tensor] = None
804
+ past_key_values: Optional[Cache] = None
805
+ hidden_states: Optional[tuple[torch.Tensor, ...]] = None
806
+ attentions: Optional[tuple[torch.Tensor, ...]] = None
807
+ pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None
808
+
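+ # Usage sketch (illustrative; assumes `model` is a configured PldrllmModel and `input_ids` a LongTensor):
+ #   outputs = model(input_ids, output_pldr_attentions=True)
+ #   # each per-layer entry follows the order documented above:
+ #   A, A_LM, pot_exponents, G_weights, G_bias, G_LM, attn_weights = outputs.pldr_attentions[0]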
809
+ @dataclass
810
+ class CausalPLDRLLMOutputWithPast(ModelOutput):
811
+ """
812
+ Base class for [`PldrllmForCausalLM`] causal language model (or autoregressive) outputs.
813
+
814
+ Args:
815
+ loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
816
+ Language modeling loss (for next-token prediction).
817
+ logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
818
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
819
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
820
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
821
+
822
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
823
+ `past_key_values` input) to speed up sequential decoding.
824
+ hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
825
+ Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
826
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
827
+
828
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
829
+ attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
830
+ Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
831
+ sequence_length)`.
832
+
833
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
834
+ heads.
835
+ pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
836
+ Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.
837
+
838
+ The tuple for each layer contains:
839
+ output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
840
+ output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
841
+ learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
842
+ learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
843
+ learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
844
+ energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
845
+ attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
846
+ """
847
+ loss: Optional[torch.Tensor] = None
848
+ logits: Optional[torch.Tensor] = None
849
+ past_key_values: Optional[Cache] = None
850
+ hidden_states: Optional[tuple[torch.Tensor, ...]] = None
851
+ attentions: Optional[tuple[torch.Tensor, ...]] = None
852
+ pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None
853
+
854
+ @dataclass
855
+ class TokenClassifierPLDRLLMOutput(ModelOutput):
856
+ """
857
+ Base class for outputs of [`PldrllmForTokenClassification`] token classification model.
858
+
859
+ Args:
860
+ loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) :
861
+ Classification loss.
862
+ logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
863
+ Classification scores (before SoftMax).
864
+ hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
865
+ Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
866
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
867
+
868
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
869
+ attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
870
+ Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
871
+ sequence_length)`.
872
+
873
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
874
+ heads.
875
+ pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
876
+ Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.
877
+
878
+ The tuple for each layer contains:
879
+ output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
880
+ output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
881
+ learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
882
+ learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
883
+ learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
884
+ energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
885
+ attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
886
+ """
887
+ loss: Optional[torch.Tensor] = None
888
+ logits: Optional[torch.Tensor] = None
889
+ hidden_states: Optional[tuple[torch.Tensor, ...]] = None
890
+ attentions: Optional[tuple[torch.Tensor, ...]] = None
891
+ pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None
892
+
893
+ @dataclass
894
+ class QuestionAnsweringPLDRModelOutput(ModelOutput):
895
+ """
896
+ Base class for outputs of [`PldrllmForQuestionAnswering`] question answering model.
897
+
898
+ Args:
899
+ loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
900
+ Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
901
+ start_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
902
+ Span-start scores (before SoftMax).
903
+ end_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
904
+ Span-end scores (before SoftMax).
905
+ hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
906
+ Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
907
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
908
+
909
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
910
+ attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
911
+ Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
912
+ sequence_length)`.
913
+
914
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
915
+ heads.
916
+ pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
917
+ Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.
918
+
919
+ The tuple for each layer contains:
920
+ output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
921
+ output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
922
+ learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
923
+ learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
924
+ learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
925
+ energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
926
+ attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
927
+ """
928
+
929
+ loss: Optional[torch.Tensor] = None
930
+ start_logits: Optional[torch.Tensor] = None
931
+ end_logits: Optional[torch.Tensor] = None
932
+ hidden_states: Optional[tuple[torch.Tensor, ...]] = None
933
+ attentions: Optional[tuple[torch.Tensor, ...]] = None
934
+ pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None
935
+
936
+ @dataclass
937
+ class SequenceClassifierPLDRLLMOutputWithPast(ModelOutput):
938
+ """
939
+ Base class for outputs of [`PldrllmForSequenceClassification`] sentence classification model.
940
+
941
+ Args:
942
+ loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
943
+ Classification (or regression if config.num_labels==1) loss.
944
+ logits (`torch.Tensor` of shape `(batch_size, config.num_labels)`):
945
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
946
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
947
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
948
+
949
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
950
+ `past_key_values` input) to speed up sequential decoding.
951
+ hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
952
+ Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
953
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
954
+
955
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
956
+ attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
957
+ Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
958
+ sequence_length)`.
959
+
960
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
961
+ heads.
962
+ pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
963
+ Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.
964
+
965
+ The tuple for each layer contains:
966
+ output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
967
+ output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
968
+ learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
969
+ learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
970
+ learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
971
+ energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
972
+ attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
973
+ """
974
+
975
+ loss: Optional[torch.Tensor] = None
976
+ logits: Optional[torch.Tensor] = None
977
+ past_key_values: Optional[Cache] = None
978
+ hidden_states: Optional[tuple[torch.Tensor, ...]] = None
979
+ attentions: Optional[tuple[torch.Tensor, ...]] = None
980
+ pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None
981
+
982
+
983
+ @auto_docstring
984
+ class PldrllmPreTrainedModel(PreTrainedModel):
985
+ config_class = PldrllmConfig
986
+ base_model_prefix = "decoder"
987
+ supports_gradient_checkpointing = True
988
+ _no_split_modules = ["PLDR_DecoderLayer"]
989
+ _skip_keys_device_placement = ["past_key_values"]
990
+ _supports_flash_attn = True
991
+ _supports_sdpa = True
992
+ _supports_flex_attn = False
993
+ _supports_attention_backend = True
994
+ _can_compile_fullgraph=False
995
+
996
+ def __init__(self, config: PldrllmConfig)->None:
997
+ super().__init__(config)
998
+ self.custom_G_type=config.custom_G_type
999
+ if self.custom_G_type is not None:
1000
+ self._can_compile_fullgraph=True
1001
+
1002
+ def _init_weights(self, module):
1003
+ if isinstance(module, nn.Linear):
1004
+ nn.init.xavier_uniform_(module.weight.data)
1005
+ if module.bias is not None:
1006
+ module.bias.data.zero_()
1007
+ elif isinstance(module, nn.Embedding):
1008
+ module.weight.data.normal_(mean=0.0, std=1.0)
1009
+ if module.padding_idx is not None:
1010
+ module.weight.data[module.padding_idx].zero_()
1011
+ elif isinstance(module, nn.LayerNorm):
1012
+ module.weight.data.fill_(1.0)
1013
+ if module.bias is not None:
1014
+ module.bias.data.zero_()
1015
+ elif isinstance(module, PlgaLayer):
1016
+ if module.Wlst is not None:
1017
+ nn.init.xavier_uniform_(module.Wlst.data)
1018
+ if module.pwlst is not None:
1019
+ nn.init.xavier_uniform_(module.pwlst.data)
1020
+ if module.alst is not None:
1021
+ nn.init.xavier_uniform_(module.alst.data)
1022
+ if module.blst is not None:
1023
+ module.blst.data.zero_()
1024
+ if module.balst is not None:
1025
+ module.balst.data.zero_()
1026
+
1027
+ MODEL_COMMON_CUSTOM_ARGS=r"""
1028
+ output_pldr_attentions (`bool`, *optional*, defaults to `False`):
1029
+ Whether to return the deductive outputs and learnable parameters of power law graph attention module as tuple containing:
1030
+ the output of the residual metric learner (metric tensor, A), output (A_LM) after application of iSwiGLU on metric tensor, learned
1031
+ exponents of potential tensor, learned weights for energy-curvature tensor, learned bias for
1032
+ energy-curvature tensor, energy-curvature tensor (G_LM), and attention weights.
1033
+ cache_first_G (`bool`, *optional*, defaults to `False`):
1034
+ Whether the model should keep the G values from only the first sample in a batch, rather than the G values from all samples, when initializing past_G_values.
1035
+ When `cache_first_G=True`, the batch_size of past_G_values is 1. This argument should be set to `True` for contrastive text generation
1036
+ with learned G values.
1037
+ """
1038
+
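+ # Call-time sketch for the custom arguments documented above (illustrative; `model` and
+ # `input_ids` are assumed to exist):
+ #   outputs = model(input_ids, use_cache=True, output_pldr_attentions=True, cache_first_G=True)
+ #   # with cache_first_G=True the cached past_G_values keep batch_size 1, as needed for
+ #   # contrastive text generation with learned G values.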
1039
+
1040
+ @auto_docstring(custom_intro="""
1041
+ Large Language Model From Power Law Decoder Representations (PLDR-LLM) with decoder hidden state as output.
1042
+ PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers.
1043
+ For details of model architecture, check out these papers:
1044
+ [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
1045
+ """
1046
+ )
1047
+ class PldrllmModel(PldrllmPreTrainedModel):
1048
+ def __init__(self, config: PldrllmConfig)->None:
1049
+ super().__init__(config)
1050
+
1051
+ # Initialize weights and apply final processing
1052
+ self.num_layers = config.num_hidden_layers
1053
+ self.d_model=config.hidden_size
1054
+ self.num_heads=config.num_attention_heads
1055
+ self.target_vocab_size =config.vocab_size
1056
+ self.max_seq_len=config.max_position_embeddings
1057
+ self.reference_rope=config.reference_rope
1058
+ self.pldr_device=None
1059
+ self.gradient_checkpointing = False
1060
+ self.layer_norm_eps=config.layer_norm_eps
1061
+ self.wdtype=None
1062
+
1063
+ if self.d_model % self.num_heads != 0:
+     raise ValueError(f"hidden_size ({self.d_model}) must be divisible by num_attention_heads ({self.num_heads}).")
1064
+ self.depth = config.head_dim
1065
+
1066
+ self.custom_G_type=config.custom_G_type
1067
+
1068
+ if self.custom_G_type is not None:
1069
+ # predefined past_G_values are initialized for both training and inference
1070
+ past_G_values, past_G_values_status=self.G_values_init(device=self.pldr_device, dtype=self.wdtype)
1071
+ self.register_buffer("past_G_values_status", past_G_values_status, persistent=True)
1072
+ self.register_buffer("past_G_values", past_G_values, persistent=True)
1073
+
1074
+ logger.warning("\nIMPORTANT: decoder.past_G_values are set to predefined values and deep PLGA layers will be skipped. "
1075
+ "Set config.custom_G_type=None to enable deep PLGA layers.")
1076
+ if self.custom_G_type=="external":
1077
+ logger.warning("\nIMPORTANT: config.custom_G_type is selected as 'external' and an external value of decoder.past_G_values[:,2,...] is expected. "
1078
+ "decoder.past_G_values[:,2,...] are initialized to identity tensor by default. This is equivalent to an LLM with SDPA. To provide external values "
1079
+ "to the decoder.past_G_values, either load these values along with the pretrained model or set decoder.past_G_values to a torch.float tensor of "
1080
+ "size (num_layers, 3, 1, num_heads, head_dim, head_dim) after model is initialized.\n")
1081
+ else:
1082
+ # learned past_G_values is initialized at inference.
1083
+ self.register_buffer("past_G_values_status", None, persistent=False)
1084
+ self.register_buffer("past_G_values", None, persistent=False)
1085
+ self.is_past_G_values_initialized=False
1086
+
1087
+
1088
+ self.embedding = nn.Embedding(self.target_vocab_size, self.d_model, device=self.pldr_device, dtype=self.wdtype)
1089
+
1090
+ self.dec_layers = nn.ModuleList([PLDR_DecoderLayer(config,
1091
+ layer_idx=i,
1092
+ device=self.pldr_device) for i in range(self.num_layers)])
1093
+
1094
+ self.layernorm1 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.pldr_device, dtype=self.wdtype)
1095
+
1096
+ if not self.reference_rope:
1097
+ self.rotary_embedding=PldrllmRotaryEmbedding(config=config)
1098
+
1099
+ self.post_init()
1100
+
1101
+ def G_values_init(self, batch_size=1, device=None, dtype=None):
1102
+ G_values_dim=(self.num_layers, 1, self.num_heads, self.depth, self.depth) # [num_layers, 1, num_heads, depth, depth]
1103
+ zeros_tensor=torch.zeros(G_values_dim, device=device, dtype=dtype)
1104
+ identity_tensor=torch.eye(self.depth).repeat(self.num_layers, 1, self.num_heads, 1, 1).to(device=device, dtype=dtype)
1105
+ random_tensor=torch.randn(G_values_dim, device=device, dtype=dtype)
1106
+ CUSTOM_G_VALUES={
1107
+ 'identity':torch.stack([zeros_tensor, zeros_tensor, identity_tensor], dim=1), # [num_layers, 3, 1, num_heads, depth, depth]
1108
+ 'random': torch.stack([zeros_tensor, zeros_tensor, random_tensor], dim=1),
1109
+ 'external': torch.stack([zeros_tensor, zeros_tensor, identity_tensor], dim=1)
1110
+ }
1111
+
1112
+ if self.custom_G_type is None:
1113
+ # 3 tensors for A, AW and avAp per layer
1114
+ past_G_values = torch.zeros((self.num_layers, 3, batch_size, self.num_heads, self.depth, self.depth), device=device, dtype=dtype)
1115
+ past_G_values_status=torch.tensor([False]*self.num_layers, dtype=torch.bool, device=device)
1116
+ elif self.custom_G_type in ['identity', 'random', 'external']:
1117
+ past_G_values=CUSTOM_G_VALUES[self.custom_G_type]
1118
+ past_G_values_status=torch.tensor([True]*self.num_layers, dtype=torch.bool, device=device)
1119
+ else:
1120
+ raise ValueError("Invalid custom_G_type value. Available values are "
1121
+ "None, 'identity', 'random', and 'external'.")
1122
+
1123
+ self.is_past_G_values_initialized=True
1124
+ return past_G_values, past_G_values_status
1125
+
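+ # Sketch (illustrative) of supplying external G values when config.custom_G_type == "external",
+ # following the warning in __init__ above; only the [:, 2, ...] slice (G_LM) is consumed:
+ #   custom_G = model.decoder.past_G_values.clone()   # (num_layers, 3, 1, num_heads, head_dim, head_dim)
+ #   custom_G[:, 2, ...] = my_external_G_values       # hypothetical externally learned tensor
+ #   model.decoder.past_G_values = custom_G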
1126
+ @can_return_tuple
1127
+ @auto_docstring(
1128
+ custom_args=MODEL_COMMON_CUSTOM_ARGS
1129
+ )
1130
+ def forward(self,
1131
+ input_ids: Optional[torch.LongTensor] = None,
1132
+ attention_mask: Optional[torch.Tensor] = None,
1133
+ position_ids: Optional[torch.LongTensor] = None,
1134
+ past_key_values: Optional[Cache]=None,
1135
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1136
+ use_cache: Optional[bool] = None,
1137
+ output_attentions: Optional[bool] = None,
1138
+ output_pldr_attentions: Optional[bool] = None,
1139
+ output_hidden_states: Optional[bool] = None,
1140
+ cache_position: Optional[torch.LongTensor] = None,
1141
+ cache_first_G: Optional[bool] = None,
1142
+ **kwargs: Unpack[TransformersKwargs]
1143
+ ):
1144
+
1145
+ use_cache=use_cache if use_cache is not None else self.config.use_cache
1146
+ cache_first_G=cache_first_G if cache_first_G is not None else self.config.cache_first_G
1147
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1148
+ output_pldr_attentions=output_pldr_attentions if output_pldr_attentions is not None else self.config.output_pldr_attentions
1149
+ output_hidden_states=output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1150
+
1151
+ if (self.gradient_checkpointing or self.training) and use_cache:
1152
+ logger.warning_once(
1153
+ "During training, setting `use_cache=False`. Additionally, `use_cache=True` is incompatible with gradient checkpointing."
1154
+ )
1155
+ use_cache = False
1156
+
1157
+ if (input_ids is None) ^ (inputs_embeds is not None):
1158
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1159
+
1160
+ inputs_embeds = self.embedding(input_ids) if inputs_embeds is None else inputs_embeds # [batch_size, target_seq_len, d_model]
1161
+
1162
+ dec_att_weights=() if output_pldr_attentions else None
1163
+ dec_attentions=() if output_attentions else None
1164
+
1165
+ dec_outputs=(inputs_embeds,) if output_hidden_states else None
1166
+
1167
+ if not isinstance(past_key_values, (type(None), Cache)):
1168
+ raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
1169
+
1170
+ if use_cache and past_key_values is None:
1171
+ past_key_values = DynamicCache()
1172
+
1173
+ # reset past_G_values_status when the G values are learned rather than custom/predefined.
1174
+ if use_cache and self.custom_G_type is None and not isinstance(past_key_values, StaticCache) and past_key_values.get_seq_length()==0:
1175
+ self.past_G_values_status=torch.tensor([False]*self.num_layers, dtype=torch.bool, device=inputs_embeds.device)
1176
+ self.is_past_G_values_initialized=False
1177
+
1178
+ if use_cache and isinstance(past_key_values, StaticCache) and ((self.custom_G_type is None) or
1179
+ "flash_attention" in self.config._attn_implementation):
1180
+ raise ValueError("Static Cache is only supported with predefined past_G_values. "
1181
+ "Flash attention is not supported. "
1182
+ "Supported models are with config.custom_G_type set to 'random', 'identity' or 'external'.")
1183
+
1184
+ if not self.is_past_G_values_initialized and self.custom_G_type is None:
1185
+ if use_cache:
1186
+ batch_size=1 if cache_first_G else inputs_embeds.size()[0]
1187
+ self.past_G_values, self.past_G_values_status=self.G_values_init(batch_size=batch_size,
1188
+ device=inputs_embeds.device,
1189
+ dtype=inputs_embeds.dtype)
1190
+ else:
1191
+ self.past_G_values_status=torch.tensor([False]*self.num_layers, dtype=torch.bool, device=inputs_embeds.device)
1192
+ self.past_G_values=None
1193
+ self.is_past_G_values_initialized=True
1194
+
1195
+ if cache_position is None:
1196
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1197
+ cache_position = torch.arange(
1198
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
1199
+ )
1200
+
1201
+ if position_ids is None:
1202
+ position_ids = cache_position.unsqueeze(0)
1203
+
1204
+ causal_mask = create_causal_mask(
1205
+ config=self.config,
1206
+ input_embeds=inputs_embeds,
1207
+ attention_mask=attention_mask,
1208
+ cache_position=cache_position,
1209
+ past_key_values=past_key_values,
1210
+ position_ids=position_ids
1211
+ )
1212
+
1213
+ hidden_states=inputs_embeds
1214
+ # create position embeddings to be shared across the decoder layers
1215
+ if not self.reference_rope:
1216
+ position_embeddings = self.rotary_embedding(hidden_states, position_ids)
1217
+ else:
1218
+ # defer reference rope initialization to the PldrllmAttention module.
1219
+ position_embeddings=None
1220
+
1221
+ hidden_states *= torch.sqrt(torch.tensor(self.d_model).to(dtype=hidden_states.dtype))
1222
+
1223
+ hidden_states=self.layernorm1(hidden_states)
1224
+
1225
+ for i in range(self.num_layers):
1226
+ hidden_states, dec_att_w= self.dec_layers[i](hidden_states,
1227
+ causal_mask,
1228
+ position_embeddings=position_embeddings,
1229
+ position_ids=position_ids,
1230
+ cache_position=cache_position,
1231
+ use_cache=use_cache,
1232
+ past_key_values=past_key_values,
1233
+ past_G_values=self.past_G_values,
1234
+ past_G_values_status=self.past_G_values_status,
1235
+ **kwargs
1236
+ )
1237
+
1238
+ if output_pldr_attentions:
1239
+ dec_att_weights += (dec_att_w,)
1240
+
1241
+ if output_attentions:
1242
+ dec_attentions += (dec_att_w[-1],)
1243
+
1244
+ if output_hidden_states:
1245
+ dec_outputs += (hidden_states,)
1246
+
1247
+ last_hidden_state=hidden_states
1248
+
1249
+ return BasePLDRModelOutputWithPast(
1250
+ last_hidden_state = last_hidden_state,
1251
+ past_key_values=past_key_values if use_cache else None,
1252
+ hidden_states=dec_outputs,
1253
+ attentions=dec_attentions,
1254
+ pldr_attentions=dec_att_weights
1255
+ )
1256
+
1257
+ def get_input_embeddings(self):
1258
+ return self.embedding
1259
+
1260
+ def set_input_embeddings(self, value):
1261
+ self.embedding = value
1262
+
1263
+ @auto_docstring(custom_intro="""
1264
+ Large Language Model From Power Law Decoder Representations (PLDR-LLM) with LM Head as final layer.
1265
+ PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers.
1266
+ For details of model architecture, check out these papers:
1267
+ [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
1268
+ """
1269
+ )
1270
+ class PldrllmForCausalLM(PldrllmPreTrainedModel, GenerationMixin):
1271
+ def __init__(self, config: PldrllmConfig)->None:
1272
+ super().__init__(config)
1273
+
1274
+ self.d_model=config.hidden_size
1275
+ self.input_vocab_size =config.vocab_size
1276
+ self.final_bias=config.final_bias
1277
+ self.pldr_device=None
1278
+ self.decoder=PldrllmModel(config=config)
1279
+ self.wdtype=None
1280
+
1281
+ self.final_layer = nn.Linear(self.d_model, self.input_vocab_size, bias=self.final_bias, device=self.pldr_device, dtype=self.wdtype)
1282
+
1283
+ self.post_init()
1284
+
1285
+ def get_input_embeddings(self):
1286
+ return self.decoder.embedding
1287
+
1288
+
1289
+ def set_input_embeddings(self, value):
1290
+ self.decoder.embedding = value
1291
+
1292
+ def get_output_embeddings(self):
1293
+ return self.final_layer
1294
+
1295
+ def set_output_embeddings(self, new_embeddings):
1296
+ self.final_layer = new_embeddings
1297
+
1298
+ def set_decoder(self, decoder):
1299
+ self.decoder = decoder
1300
+
1301
+ def get_decoder(self):
1302
+ return self.decoder
1303
+
1304
+ @can_return_tuple
1305
+ @auto_docstring(
1306
+ custom_args=MODEL_COMMON_CUSTOM_ARGS
1307
+ )
1308
+ def forward(self,
1309
+ input_ids: Optional[torch.LongTensor] = None,
1310
+ attention_mask: Optional[torch.Tensor] = None,
1311
+ position_ids: Optional[torch.LongTensor] = None,
1312
+ past_key_values: Optional[Cache]=None,
1313
+ use_cache: Optional[bool] = None,
1314
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1315
+ labels: Optional[torch.LongTensor] = None,
1316
+ output_attentions: Optional[bool] = None,
1317
+ output_pldr_attentions: Optional[bool] = None,
1318
+ output_hidden_states: Optional[bool] = None,
1319
+ cache_position: Optional[torch.LongTensor] = None,
1320
+ cache_first_G: Optional[bool] = None,
1321
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1322
+ **kwargs: Unpack[TransformersKwargs],
1323
+ )-> CausalPLDRLLMOutputWithPast:
1324
+
1325
+ outputs: BasePLDRModelOutputWithPast=self.decoder(input_ids=input_ids,
1326
+ attention_mask=attention_mask,
1327
+ position_ids=position_ids,
1328
+ past_key_values=past_key_values,
1329
+ use_cache=use_cache,
1330
+ inputs_embeds=inputs_embeds,
1331
+ output_attentions=output_attentions,
1332
+ output_pldr_attentions=output_pldr_attentions,
1333
+ output_hidden_states=output_hidden_states,
1334
+ cache_position=cache_position,
1335
+ cache_first_G=cache_first_G,
1336
+ **kwargs
1337
+ )
1338
+
1339
+
1340
+ hidden_states = outputs.last_hidden_state
1341
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1342
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1343
+ logits = self.final_layer(hidden_states[:, slice_indices, :])
1344
+
1345
+ loss = None
1346
+ if labels is not None:
1347
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
1348
+
1349
+ return CausalPLDRLLMOutputWithPast(
1350
+ loss=loss,
1351
+ logits=logits,
1352
+ past_key_values=outputs.past_key_values,
1353
+ hidden_states=outputs.hidden_states,
1354
+ attentions=outputs.attentions,
1355
+ pldr_attentions=outputs.pldr_attentions
1356
+ )
1357
+
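+ # Usage sketch for text generation (illustrative; the checkpoint path is a placeholder):
+ #   from transformers import AutoTokenizer
+ #   tokenizer = AutoTokenizer.from_pretrained("path/to/pldr-llm-checkpoint")
+ #   model = PldrllmForCausalLM.from_pretrained("path/to/pldr-llm-checkpoint")
+ #   inputs = tokenizer("Write a haiku about graphs.", return_tensors="pt")
+ #   generated = model.generate(**inputs, max_new_tokens=32)
+ #   print(tokenizer.decode(generated[0], skip_special_tokens=True))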
1358
+ @auto_docstring
1359
+ class PldrllmForTokenClassification(PldrllmPreTrainedModel):
1360
+ def __init__(self, config:PldrllmConfig)->None:
1361
+ super().__init__(config)
1362
+ self.num_labels = config.num_labels
1363
+ self.decoder = PldrllmModel(config)
1364
+ self.wdtype=None
1365
+ if getattr(config, "classifier_dropout", None) is not None:
1366
+ classifier_dropout = config.classifier_dropout
1367
+ elif getattr(config, "hidden_dropout", None) is not None:
1368
+ classifier_dropout = config.hidden_dropout
1369
+ else:
1370
+ classifier_dropout = 0.1
1371
+ self.dropout = nn.Dropout(classifier_dropout)
1372
+ self.score = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=self.wdtype)
1373
+
1374
+ # Initialize weights and apply final processing
1375
+ self.post_init()
1376
+
1377
+ def get_input_embeddings(self):
1378
+ return self.decoder.embedding
1379
+
1380
+ def set_input_embeddings(self, value):
1381
+ self.decoder.embedding = value
1382
+
1383
+ @can_return_tuple
1384
+ @auto_docstring(
1385
+ custom_args=MODEL_COMMON_CUSTOM_ARGS
1386
+ )
1387
+ def forward(
1388
+ self,
1389
+ input_ids: Optional[torch.LongTensor] = None,
1390
+ attention_mask: Optional[torch.Tensor] = None,
1391
+ position_ids: Optional[torch.LongTensor] = None,
1392
+ past_key_values: Optional[Cache] = None,
1393
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1394
+ labels: Optional[torch.LongTensor] = None,
1395
+ use_cache: Optional[bool] = None,
1396
+ output_attentions: Optional[bool] = None,
1397
+ output_pldr_attentions: Optional[bool] = None,
1398
+ output_hidden_states: Optional[bool] = None,
1399
+ cache_first_G: Optional[bool] = None,
1400
+ ) -> TokenClassifierPLDRLLMOutput:
1401
+ r"""
1402
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. A classification loss (Cross-Entropy) is computed over the labeled tokens.
1406
+ """
1407
+
1408
+ outputs: BasePLDRModelOutputWithPast = self.decoder(
1409
+ input_ids,
1410
+ attention_mask=attention_mask,
1411
+ position_ids=position_ids,
1412
+ past_key_values=past_key_values,
1413
+ inputs_embeds=inputs_embeds,
1414
+ use_cache=use_cache,
1415
+ output_attentions=output_attentions,
1416
+ output_hidden_states=output_hidden_states,
1417
+ output_pldr_attentions=output_pldr_attentions,
1418
+ cache_first_G=cache_first_G
1419
+ )
1420
+ sequence_output = outputs.last_hidden_state
1421
+ sequence_output = self.dropout(sequence_output)
1422
+ logits = self.score(sequence_output)
1423
+
1424
+ loss = None
1425
+ if labels is not None:
1426
+ loss = self.loss_function(logits, labels, self.config)
1427
+
1428
+ return TokenClassifierPLDRLLMOutput(
1429
+ loss=loss,
1430
+ logits=logits,
1431
+ hidden_states=outputs.hidden_states,
1432
+ attentions=outputs.attentions,
1433
+ pldr_attentions=outputs.pldr_attentions
1434
+ )
1435
+
1436
+
1437
+ @auto_docstring
1438
+ class PldrllmForQuestionAnswering(PldrllmPreTrainedModel):
1439
+
1440
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama->Pldrllm
1441
+ def __init__(self, config:PldrllmConfig):
1442
+ super().__init__(config)
1443
+ self.decoder = PldrllmModel(config)
1444
+ self.wdtype=None
1445
+ self.qa_outputs = nn.Linear(config.hidden_size, 2, bias=True, dtype=self.wdtype)
1446
+
1447
+ # Initialize weights and apply final processing
1448
+ self.post_init()
1449
+
1450
+ def get_input_embeddings(self):
1451
+ return self.decoder.embedding
1452
+
1453
+ def set_input_embeddings(self, value):
1454
+ self.decoder.embedding = value
1455
+
1456
+ @can_return_tuple
1457
+ @auto_docstring(
1458
+ custom_args=MODEL_COMMON_CUSTOM_ARGS
1459
+ )
1460
+ def forward(
1461
+ self,
1462
+ input_ids: Optional[torch.LongTensor] = None,
1463
+ attention_mask: Optional[torch.Tensor] = None,
1464
+ position_ids: Optional[torch.LongTensor] = None,
1465
+ past_key_values: Optional[Cache] = None,
1466
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1467
+ start_positions: Optional[torch.LongTensor] = None,
1468
+ end_positions: Optional[torch.LongTensor] = None,
1469
+ output_attentions: Optional[bool] = None,
1470
+ output_pldr_attentions: Optional[bool] = None,
1471
+ output_hidden_states: Optional[bool] = None,
1472
+ cache_first_G: Optional[bool] = None,
1473
+ **kwargs,
1474
+ ) -> QuestionAnsweringPLDRModelOutput:
1475
+ outputs: BasePLDRModelOutputWithPast = self.decoder(
1476
+ input_ids,
1477
+ attention_mask=attention_mask,
1478
+ position_ids=position_ids,
1479
+ past_key_values=past_key_values,
1480
+ inputs_embeds=inputs_embeds,
1481
+ output_attentions=output_attentions,
1482
+ output_hidden_states=output_hidden_states,
1483
+ output_pldr_attentions=output_pldr_attentions,
1484
+ cache_first_G=cache_first_G
1485
+ )
1486
+
1487
+ sequence_output = outputs.last_hidden_state
1488
+
1489
+ logits = self.qa_outputs(sequence_output)
1490
+ start_logits, end_logits = logits.split(1, dim=-1)
1491
+ start_logits = start_logits.squeeze(-1).contiguous()
1492
+ end_logits = end_logits.squeeze(-1).contiguous()
1493
+
1494
+ loss = None
1495
+ if start_positions is not None and end_positions is not None:
1496
+ loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
1497
+
1498
+ return QuestionAnsweringPLDRModelOutput(
1499
+ loss=loss,
1500
+ start_logits=start_logits,
1501
+ end_logits=end_logits,
1502
+ hidden_states=outputs.hidden_states,
1503
+ attentions=outputs.attentions,
1504
+ pldr_attentions=outputs.pldr_attentions
1505
+ )
1506
+
1507
+ @auto_docstring(
1508
+ custom_intro="""
1509
+ The PLDR-LLM with a sequence classification head on top (linear layer).
1510
+
1511
+ [`PldrllmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1512
+ (e.g. GPT-2) do.
1513
+
1514
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1515
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1516
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1517
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1518
+ each row of the batch).
1519
+ """
1520
+ )
1521
+ class PldrllmForSequenceClassification(PldrllmPreTrainedModel):
1522
+ def __init__(self, config:PldrllmConfig)->None:
1523
+ super().__init__(config)
1524
+ self.num_labels = config.num_labels
1525
+ self.decoder = PldrllmModel(config)
1526
+ self.wdtype=None
1527
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False, dtype=self.wdtype)
1528
+
1529
+ # Initialize weights and apply final processing
1530
+ self.post_init()
1531
+
1532
+ def get_input_embeddings(self):
1533
+ return self.decoder.embedding
1534
+
1535
+ def set_input_embeddings(self, value):
1536
+ self.decoder.embedding = value
1537
+
1538
+ @can_return_tuple
1539
+ @auto_docstring(
1540
+ custom_args=MODEL_COMMON_CUSTOM_ARGS
1541
+ )
1542
+ def forward(
1543
+ self,
1544
+ input_ids: Optional[torch.LongTensor] = None,
1545
+ attention_mask: Optional[torch.Tensor] = None,
1546
+ position_ids: Optional[torch.LongTensor] = None,
1547
+ past_key_values: Optional[Cache] = None,
1548
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1549
+ labels: Optional[torch.LongTensor] = None,
1550
+ use_cache: Optional[bool] = None,
1551
+ output_attentions: Optional[bool] = None,
1552
+ output_pldr_attentions: Optional[bool] = None,
1553
+ output_hidden_states: Optional[bool] = None,
1554
+ cache_first_G: Optional[bool] = None
1555
+ ) -> SequenceClassifierPLDRLLMOutputWithPast:
1556
+ r"""
1557
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1558
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1559
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1560
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1561
+ """
1562
+
1563
+ outputs: BasePLDRModelOutputWithPast = self.decoder(
1564
+ input_ids,
1565
+ attention_mask=attention_mask,
1566
+ position_ids=position_ids,
1567
+ past_key_values=past_key_values,
1568
+ inputs_embeds=inputs_embeds,
1569
+ use_cache=use_cache,
1570
+ output_attentions=output_attentions,
1571
+ output_pldr_attentions=output_pldr_attentions,
1572
+ output_hidden_states=output_hidden_states,
1573
+ cache_first_G=cache_first_G
1574
+ )
1575
+ hidden_states = outputs.last_hidden_state
1576
+ logits = self.score(hidden_states)
1577
+
1578
+ if input_ids is not None:
1579
+ batch_size = input_ids.shape[0]
1580
+ else:
1581
+ batch_size = inputs_embeds.shape[0]
1582
+
1583
+ if self.config.pad_token_id is None and batch_size != 1:
1584
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1585
+ if self.config.pad_token_id is None:
1586
+ last_non_pad_token = -1
1587
+ elif input_ids is not None:
1588
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
1589
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
1590
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
1591
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
1592
+ else:
1593
+ last_non_pad_token = -1
1594
+ logger.warning_once(
1595
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
1596
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
1597
+ )
1598
+
1599
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
1600
+
1601
+ loss = None
1602
+ if labels is not None:
1603
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1604
+
1605
+ return SequenceClassifierPLDRLLMOutputWithPast(
1606
+ loss=loss,
1607
+ logits=pooled_logits,
1608
+ past_key_values=outputs.past_key_values,
1609
+ hidden_states=outputs.hidden_states,
1610
+ attentions=outputs.attentions,
1611
+ pldr_attentions=outputs.pldr_attentions
1612
+ )
1613
+
1614
+
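+ # Usage sketch (illustrative; assumes a checkpoint fine-tuned for classification and a matching tokenizer):
+ #   model = PldrllmForSequenceClassification.from_pretrained("path/to/checkpoint", num_labels=2)
+ #   inputs = tokenizer("example text", return_tensors="pt")
+ #   predicted_class = model(**inputs).logits.argmax(dim=-1)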
1615
+ __all__ = [
1616
+ "PldrllmForCausalLM",
1617
+ "PldrllmModel",
1618
+ "PldrllmPreTrainedModel",
1619
+ "PldrllmForTokenClassification",
1620
+ "PldrllmForQuestionAnswering",
1621
+ "PldrllmForSequenceClassification"
1622
+ ]
paper_saved_model_files/PLDRv51-SOC-110M-3-model-checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5cff49591b1b86d2d44d01da6671ca3a22f86cb64d6bd048cea7ee45c8ff2a
3
+ size 439127012
paper_saved_model_files/PLDRv51_SOC_110M_3_hyperparameters.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn.functional as F
2
+
3
+ hpdict={'num_layers': 5,
4
+ 'd_model': 896,
5
+ 'num_heads': 14,
6
+ 'dff': 2389,
7
+ 'A_dff': 170,
8
+ 'num_reslayerA': 8,
9
+ 'num_denseA': 2,
10
+ 'input_vocab_size': 32000,
11
+ 'max_seq_len': 1024,
12
+ 'epochs': 1,
13
+ 'save_model_path': './PLDRv51-SOC-110M-3-model-checkpoint',
14
+ 'warmup_steps': 2000,
15
+ 'lr_total_steps': 250000,
16
+ 'learning_rate': 0.0009,
17
+ 'lr_alpha': 0.1,
18
+ 'adamw_decay': 0.1,
19
+ 'activation': F.silu,
20
+ 'disable_amp': False,
21
+ 'auto_size_minimum': None,
22
+ 'disable_fsdp_mixed_precision': False,
23
+ 'fsdp_cpu_offload': False,
24
+ 'fsdp_sharding_strategy': 'HYBRID_SHARD',
25
+ 'backward_prefetch': 'PRE',
26
+ 'save_type': 'torch'}
paper_saved_model_files/refinedweb-tokenizer-pldrllm-soc-paper.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4805068e389397471165a1edfa2390d831a0512b287894551b426dce32455bc6
3
+ size 616194
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers==4.56.1
2
+ torch==2.6.0
3
+ sentencepiece==0.1.99
4
+ python==3.11
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[START]",
3
+ "eos_token": "[END]",
4
+ "pad_token": "[PAD]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51f4369714712232bfc746188f347e550790d1d75e5e35bfa4399b784d2a666f
3
+ size 796800
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "[PAD]",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "[UNK]",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "[START]",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "3": {
31
+ "content": "[END]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "[START]",
40
+ "bos_token_id": 2,
41
+ "clean_up_tokenization_spaces": false,
42
+ "eos_token": "[END]",
43
+ "eos_token_id": 3,
44
+ "extra_special_tokens": {},
45
+ "legacy": false,
46
+ "model_max_length": 1000000000000000019884624838656,
47
+ "pad_token": "[PAD]",
48
+ "pad_token_id": 0,
49
+ "padding_side": "left",
50
+ "tokenizer_class": "LlamaTokenizerFast",
51
+ "unk_token": "[UNK]",
52
+ "unk_token_id": 1,
53
+ "use_default_system_prompt": false
54
+ }