FreakingPotato committed on
Commit
b7905a0
·
verified ·
1 Parent(s): 6e87381

Upload NucEL model with comprehensive documentation

Browse files
README.md ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ library_name: transformers
6
+ tags:
7
+ - genomics
8
+ - nucleotide
9
+ - dna
10
+ - sequence-modeling
11
+ - biology
12
+ - bioinformatics
13
+ datasets:
14
+ - genome
15
+ pipeline_tag: feature-extraction
16
+ ---
17
+
18
+ # NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations
19
+
20
+ NucEL is a specialized language model designed for nucleotide sequence analysis and genomic applications. This model provides powerful embeddings for DNA sequences and can be fine-tuned for various downstream genomic tasks.
21
+
22
+ ## Model Details
23
+
24
+ - **Model Type**: Transformer-based sequence model
25
+ - **Domain**: Genomics and Nucleotide Sequences
26
+ - **Architecture**: ModernBERT-based transformer encoder (22 layers, hidden size 512, 16 attention heads) optimized for nucleotide sequences
27
+ - **Tokenizer**: Custom NucEL tokenizer with k=1 for nucleotide-level tokenization
28
+
29
+ ## Features
30
+
31
+ - Nucleotide-level tokenization and embedding
32
+ - Pre-trained on large genomic datasets
33
+ - Optimized for biological sequence understanding
34
+ - Compatible with HuggingFace transformers library
35
+
36
+ ## Usage
37
+
38
+ ### Basic Usage
39
+
40
+ ```python
41
+ from transformers import AutoModel, AutoTokenizer
42
+
43
+ # Load model and tokenizer
44
+ model = AutoModel.from_pretrained("FreakingPotato/NucEL", trust_remote_code=True)
45
+ tokenizer = AutoTokenizer.from_pretrained("FreakingPotato/NucEL", trust_remote_code=True)
46
+
47
+ # Example DNA sequence
48
+ sequence = "ATCGATCGATCGATCG"
49
+
50
+ # Tokenize and encode
51
+ inputs = tokenizer(sequence, return_tensors="pt")
52
+ outputs = model(**inputs)
53
+
54
+ # Get sequence embeddings
55
+ embeddings = outputs.last_hidden_state
56
+ print(f"Sequence embeddings shape: {embeddings.shape}")
57
+ ```
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ pip install transformers torch
63
+ # Install any additional dependencies for your specific use case
64
+ ```
65
+
66
+ ## Requirements
67
+
68
+ - transformers >= 4.48.0 (first release with ModernBERT support, which this model's architecture requires)
69
+ - torch >= 1.9.0
70
+ - Python >= 3.7
71
+
72
+ ## Citation
73
+
74
+ If you use NucEL in your research, please cite:
75
+
76
+ ```bibtex
77
+ @misc{nucel2025,
78
+ title={NucEL: Single-Nucleotide ELECTRA-Style Genomic Pre-training for Efficient and Interpretable Representations},
79
+ author={Ke Ding and Brian Parker and Jiayu Wen},
80
+ year={2025},
81
+ howpublished={\url{https://huggingface.co/FreakingPotato/NucEL}}
82
+ }
83
+ ```
84
+
85
+ ## License
86
+
87
+ This model is released under the Apache 2.0 License.
88
+
89
+ ## Contact
90
+
91
+ For questions and support, please open an issue in the repository or contact [[email protected]].
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ModernBertModel"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 50281,
8
+ "classifier_activation": "gelu",
9
+ "classifier_bias": false,
10
+ "classifier_dropout": 0.0,
11
+ "classifier_pooling": "cls",
12
+ "cls_token_id": 2,
13
+ "decoder_bias": true,
14
+ "deterministic_flash_attn": false,
15
+ "embedding_dropout": 0.0,
16
+ "eos_token_id": 50282,
17
+ "global_attn_every_n_layers": 3,
18
+ "global_rope_theta": 10000,
19
+ "hidden_activation": "gelu",
20
+ "hidden_size": 512,
21
+ "initializer_cutoff_factor": 2.0,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 2048,
24
+ "layer_norm_eps": 1e-12,
25
+ "local_attention": 128,
26
+ "local_rope_theta": 1000,
27
+ "mask_token_id": 3,
28
+ "max_position_embeddings": 8192,
29
+ "mlp_bias": false,
30
+ "mlp_dropout": 0.0,
31
+ "model_type": "nucel",
32
+ "norm_bias": false,
33
+ "norm_eps": 1e-12,
34
+ "num_attention_heads": 16,
35
+ "num_hidden_layers": 22,
36
+ "pad_token_id": 1,
37
+ "reference_compile": true,
38
+ "repad_logits_with_grad": false,
39
+ "sep_token_id": 50282,
40
+ "sparse_pred_ignore_index": -100,
41
+ "sparse_prediction": false,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float32",
44
+ "transformers_version": "4.50.3",
45
+ "unknown_token_id": 0,
46
+ "vocab_size": 27,
47
+ "task_specific_params": {
48
+ "feature-extraction": {
49
+ "max_sequence_length": 512,
50
+ "embedding_dimension": 512
51
+ }
52
+ },
53
+ "custom_tokenizer": true,
54
+ "tokenizer_class": "NucEL_Tokenizer",
55
+ "auto_map": {
56
+ "AutoModel": "modeling_nucel.NucELModel",
57
+ "AutoTokenizer": "model.tokenizer.NucEL_Tokenizer"
58
+ }
59
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4313456e971ed018bef72c2628eda8229a9fde5877f35b329372b757a13a38de
3
+ size 369259168
model_index.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "NucEL",
3
+ "model_type": "feature-extraction",
4
+ "domains": [
5
+ "genomics",
6
+ "bioinformatics"
7
+ ],
8
+ "languages": [
9
+ "nucleotide"
10
+ ],
11
+ "tasks": [
12
+ "feature-extraction",
13
+ "sequence-similarity",
14
+ "gene-function-prediction",
15
+ "variant-effect-prediction"
16
+ ]
17
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc5f5df02d1961293f62f8a0753440bdbe9e360e7680409f6ebcf34b2589b2bf
3
+ size 369289506
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[BOS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[EOS]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BOS]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[EOS]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "bos_token": "[BOS]",
61
+ "clean_up_tokenization_spaces": false,
62
+ "cls_token": "[CLS]",
63
+ "eos_token": "[EOS]",
64
+ "extra_special_tokens": {},
65
+ "mask_token": "[MASK]",
66
+ "model_max_length": 2048,
67
+ "pad_token": "[PAD]",
68
+ "sep_token": "[SEP]",
69
+ "tokenizer_class": "NucEL_Tokenizer",
70
+ "unk_token": "[UNK]"
71
+ }
vocab.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[UNK]": 1,
4
+ "[CLS]": 2,
5
+ "[SEP]": 3,
6
+ "[MASK]": 4,
7
+ "[BOS]": 5,
8
+ "[EOS]": 6,
9
+ "A": 11,
10
+ "C": 12,
11
+ "G": 13,
12
+ "T": 14,
13
+ "[RESERVED_0]": 15,
14
+ "[RESERVED_1]": 16,
15
+ "[RESERVED_2]": 17,
16
+ "[RESERVED_3]": 18,
17
+ "[RESERVED_4]": 19,
18
+ "[RESERVED_5]": 20,
19
+ "[RESERVED_6]": 21,
20
+ "[RESERVED_7]": 22,
21
+ "[RESERVED_8]": 23,
22
+ "[RESERVED_9]": 24,
23
+ "[RESERVED_10]": 25,
24
+ "[RESERVED_11]": 26,
25
+ "[RESERVED_12]": 27,
26
+ "[RESERVED_13]": 28,
27
+ "[RESERVED_14]": 29,
28
+ "[RESERVED_15]": 30
29
+ }