Upload NucEL model with comprehensive documentation

Browse files

Files changed (8) hide show

README.md +91 -0
config.json +59 -0
model.safetensors +3 -0
model_index.json +17 -0
pytorch_model.bin +3 -0
special_tokens_map.json +9 -0
tokenizer_config.json +71 -0
vocab.json +29 -0

README.md ADDED Viewed

	@@ -0,0 +1,91 @@

+---
+language:
+- en
+license: apache-2.0
+library_name: transformers
+tags:
+- genomics
+- nucleotide
+- dna
+- sequence-modeling
+- biology
+- bioinformatics
+datasets:
+- genome
+pipeline_tag: feature-extraction
+---
+# NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations
+NucEL is a specialized language model designed for nucleotide sequence analysis and genomic applications. This model provides powerful embeddings for DNA sequences and can be fine-tuned for various downstream genomic tasks.
+## Model Details
+- **Model Type**: Transformer-based sequence model
+- **Domain**: Genomics and Nucleotide Sequences
+- **Architecture**: ModernBERT-based transformer encoder (22 layers, hidden size 512, 16 attention heads) optimized for nucleotide sequences
+- **Tokenizer**: Custom NucEL tokenizer with k-mer size k=1, i.e. nucleotide-level tokenization in which each base (A, C, G, T) is a single token
+## Features
+- Nucleotide-level tokenization and embedding
+- Pre-trained on large genomic datasets
+- Optimized for biological sequence understanding
+- Compatible with HuggingFace transformers library
+## Usage
+### Basic Usage
+```python
+from transformers import AutoModel, AutoTokenizer
+# Load model and tokenizer
+model = AutoModel.from_pretrained("FreakingPotato/NucEL", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("FreakingPotato/NucEL", trust_remote_code=True)
+# Example DNA sequence
+sequence = "ATCGATCGATCGATCG"
+# Tokenize and encode
+inputs = tokenizer(sequence, return_tensors="pt")
+outputs = model(**inputs)
+# Get sequence embeddings
+embeddings = outputs.last_hidden_state
+print(f"Sequence embeddings shape: {embeddings.shape}")
+```
+## Installation
+```bash
+pip install transformers torch
+# Install any additional dependencies for your specific use case
+```
+## Requirements
+- transformers >= 4.48.0 (first release that includes the ModernBERT architecture this model uses)
+- torch >= 2.0
+- Python >= 3.9
+## Citation
+If you use NucEL in your research, please cite:
+```bibtex
+@misc{nucel2025,
+  title={NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations},
+  author={Ke Ding and Brian Parker and Jiayu Wen},
+  year={2025},
+  howpublished={\url{https://huggingface.co/FreakingPotato/NucEL}}
+}
+```
+## License
+This model is released under the Apache 2.0 License.
+## Contact
+For questions and support, please open an issue in the repository or contact [[email protected]].

config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "architectures": [
+    "ModernBertModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 50281,
+  "classifier_activation": "gelu",
+  "classifier_bias": false,
+  "classifier_dropout": 0.0,
+  "classifier_pooling": "cls",
+  "cls_token_id": 2,
+  "decoder_bias": true,
+  "deterministic_flash_attn": false,
+  "embedding_dropout": 0.0,
+  "eos_token_id": 50282,
+  "global_attn_every_n_layers": 3,
+  "global_rope_theta": 10000,
+  "hidden_activation": "gelu",
+  "hidden_size": 512,
+  "initializer_cutoff_factor": 2.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-12,
+  "local_attention": 128,
+  "local_rope_theta": 1000,
+  "mask_token_id": 3,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "mlp_dropout": 0.0,
+  "model_type": "nucel",
+  "norm_bias": false,
+  "norm_eps": 1e-12,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 22,
+  "pad_token_id": 1,
+  "reference_compile": true,
+  "repad_logits_with_grad": false,
+  "sep_token_id": 50282,
+  "sparse_pred_ignore_index": -100,
+  "sparse_prediction": false,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "unknown_token_id": 0,
+  "vocab_size": 27,
+  "task_specific_params": {
+    "feature-extraction": {
+      "max_sequence_length": 512,
+      "embedding_dimension": 512
+    }
+  },
+  "custom_tokenizer": true,
+  "tokenizer_class": "NucEL_Tokenizer",
+  "auto_map": {
+    "AutoModel": "modeling_nucel.NucELModel",
+    "AutoTokenizer": "model.tokenizer.NucEL_Tokenizer"
+  }
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4313456e971ed018bef72c2628eda8229a9fde5877f35b329372b757a13a38de
+size 369259168

model_index.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "model_name": "NucEL",
+  "model_type": "feature-extraction",
+  "domains": [
+    "genomics",
+    "bioinformatics"
+  ],
+  "languages": [
+    "nucleotide"
+  ],
+  "tasks": [
+    "feature-extraction",
+    "sequence-similarity",
+    "gene-function-prediction",
+    "variant-effect-prediction"
+  ]
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc5f5df02d1961293f62f8a0753440bdbe9e360e7680409f6ebcf34b2589b2bf
+size 369289506

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "bos_token": "[BOS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "[BOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "[EOS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[BOS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 2048,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "tokenizer_class": "NucEL_Tokenizer",
+  "unk_token": "[UNK]"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "[PAD]": 0,
+  "[UNK]": 1,
+  "[CLS]": 2,
+  "[SEP]": 3,
+  "[MASK]": 4,
+  "[BOS]": 5,
+  "[EOS]": 6,
+  "A": 11,
+  "C": 12,
+  "G": 13,
+  "T": 14,
+  "[RESERVED_0]": 15,
+  "[RESERVED_1]": 16,
+  "[RESERVED_2]": 17,
+  "[RESERVED_3]": 18,
+  "[RESERVED_4]": 19,
+  "[RESERVED_5]": 20,
+  "[RESERVED_6]": 21,
+  "[RESERVED_7]": 22,
+  "[RESERVED_8]": 23,
+  "[RESERVED_9]": 24,
+  "[RESERVED_10]": 25,
+  "[RESERVED_11]": 26,
+  "[RESERVED_12]": 27,
+  "[RESERVED_13]": 28,
+  "[RESERVED_14]": 29,
+  "[RESERVED_15]": 30
+}