Spaces:
Running
Running
Upload 5 files
Browse files- eos_scanner.py +189 -0
- eos_scanner_readme.md +102 -0
- gen_id_patcher.py +98 -0
- model_tools.md +6 -0
- weight_counter.py +42 -0
eos_scanner.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import argparse
|
| 6 |
+
from colorama import init, Fore, Style
|
| 7 |
+
|
| 8 |
+
# Initialize colorama
|
| 9 |
+
init()
|
| 10 |
+
|
| 11 |
+
class Logger:
    """Tee everything written to stdout into a log file as well.

    Intended to be installed via ``sys.stdout = Logger()`` so every
    ``print`` lands both on the terminal and in the audit log.
    """

    def __init__(self, filename="eos_audit.log"):
        # Capture the real stdout before we are swapped in.
        self.terminal = sys.stdout
        self.log = open(filename, "w", encoding="utf-8")

    def write(self, message):
        # Mirror the message to both destinations.
        for stream in (self.terminal, self.log):
            stream.write(message)

    def flush(self):
        for stream in (self.terminal, self.log):
            stream.flush()
|
| 23 |
+
|
| 24 |
+
def load_json(path):
    """Best-effort JSON loader.

    Returns the parsed object, or None if the file is missing, unreadable,
    or not valid JSON -- the scanner treats all of these as "no metadata".
    """
    # EAFP: attempt the read directly instead of a racy exists() pre-check,
    # and catch only the failures we expect (I/O errors and bad JSON;
    # json.JSONDecodeError subclasses ValueError).
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        return None
|
| 32 |
+
|
| 33 |
+
def get_model_metadata(model_path):
    """Collect EOS-related metadata for a single model directory.

    Reads generation_config.json, tokenizer_config.json and tokenizer.json
    (when present) and returns a dict holding the generation-config EOS id,
    the tokenizer's EOS string, the id that string actually maps to in the
    vocabulary, the vocab size, and whether the generation-config id agrees
    with the vocabulary id. Absent values are reported as "MISSING".
    """
    info = {
        "path": model_path,
        "name": os.path.basename(model_path).replace("!models--", ""),
        "gen_eos_id": "MISSING",    # From generation_config.json
        "tok_eos_str": "MISSING",   # From tokenizer_config.json
        "vocab_eos_id": "MISSING",  # The actual ID of the string in tokenizer.json
        "vocab_size": "MISSING",
        "internal_consistency": True,
    }

    # 1. Generation config: the id the model is told to stop on.
    gen_conf = load_json(os.path.join(model_path, "generation_config.json"))
    if gen_conf:
        eos_id = gen_conf.get("eos_token_id", "MISSING")
        # Some models declare a list of EOS ids; compare against the first.
        if isinstance(eos_id, list):
            eos_id = eos_id[0]
        info["gen_eos_id"] = eos_id

    # 2. Tokenizer config: the EOS token spelled out as a string.
    tok_conf = load_json(os.path.join(model_path, "tokenizer_config.json"))
    if tok_conf:
        eos_str = tok_conf.get("eos_token", "MISSING")
        # Newer configs wrap the token in an AddedToken-style dict.
        if isinstance(eos_str, dict):
            eos_str = eos_str.get("content", "MISSING")
        info["tok_eos_str"] = eos_str

    # 3. tokenizer.json: the authoritative string -> id map. We prefer
    # tokenizer.json (HuggingFace) over tokenizer.model (SentencePiece).
    tok_file = load_json(os.path.join(model_path, "tokenizer.json"))
    if tok_file and info["tok_eos_str"] != "MISSING":
        vocab = tok_file.get("model", {}).get("vocab", {})
        info["vocab_size"] = len(vocab)
        # Look up the id the EOS string actually resolves to.
        if info["tok_eos_str"] in vocab:
            info["vocab_eos_id"] = vocab[info["tok_eos_str"]]

    # Internal consistency: the generation-config id must point at the same
    # token the tokenizer maps the EOS string to. Compared as strings since
    # either side may be the "MISSING" sentinel.
    if str(info["gen_eos_id"]) != str(info["vocab_eos_id"]):
        info["internal_consistency"] = False

    return info
|
| 76 |
+
|
| 77 |
+
def main():
    """Scan every model in a mergekit YAML config for EOS-token mismatches.

    Compares each donor model's generation-config EOS id, tokenizer EOS
    string and actual vocabulary id against the base model, prints a
    color-coded report (also teed to eos_audit.log), and closes with a
    recommendation for the ``tokenizer:`` section of the YAML.
    """
    parser = argparse.ArgumentParser(description="Scan models for EOS/Tokenizer mismatches.")
    parser.add_argument("config", help="Path to the mergekit yaml config file")
    args = parser.parse_args()

    # Tee everything we print into the audit log as well.
    sys.stdout = Logger()

    print(f"{Fore.CYAN}--- EOS & TOKENIZER SCANNER (DEEP SCAN) ---{Style.RESET_ALL}")
    print(f"Scanning config: {args.config}\n")

    with open(args.config, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    base_model_path = config.get('base_model')

    # Extract model paths; entries may be dicts ({model: path, ...}) or bare strings.
    models = []
    if config.get('models'):
        for m in config['models']:
            if isinstance(m, dict) and 'model' in m:
                models.append(m['model'])
            elif isinstance(m, str):
                models.append(m)

    if not base_model_path:
        print(f"{Fore.RED}CRITICAL: No base_model defined in YAML.{Style.RESET_ALL}")
        return

    # 1. Analyze Base Model
    print("Analyzing Base Model...")
    base_data = get_model_metadata(base_model_path)

    print(f"{Fore.GREEN}BASE MODEL: {base_data['name']}{Style.RESET_ALL}")
    print(f"  Gen Config EOS ID: {base_data['gen_eos_id']}")
    print(f"  Tokenizer EOS Str: {base_data['tok_eos_str']}")
    print(f"  Actual Vocab ID: {base_data['vocab_eos_id']}")

    if not base_data['internal_consistency']:
        print(f"  {Fore.RED}INTERNAL ERROR: Base model generation_config ID does not match tokenizer ID!{Style.RESET_ALL}")
    else:
        print(f"  Internal Consistency: {Fore.GREEN}PASS{Style.RESET_ALL}")
    print("-" * 80)

    # 2. Analyze Donors
    print(f"{'Status':<10} | {'Gen ID':<8} | {'Vocab ID':<8} | {'EOS Str':<10} | {'Model Name'}")
    print("-" * 100)

    mismatches = 0

    for model_path in models:
        d = get_model_metadata(model_path)

        # A donor matches when all three EOS views agree with the base model.
        is_match = (
            str(d['gen_eos_id']) == str(base_data['gen_eos_id'])
            and str(d['vocab_eos_id']) == str(base_data['vocab_eos_id'])
            and d['tok_eos_str'] == base_data['tok_eos_str']
        )

        status_color = Fore.GREEN
        status_text = "MATCH"

        if not is_match:
            status_color = Fore.RED
            status_text = "FAIL"

        # Internal inconsistency (gen id != vocab id) overrides the status.
        if not d['internal_consistency']:
            status_color = Fore.MAGENTA
            status_text = "BROKEN"

        # Count each problematic model exactly once, even when it is both a
        # base-model mismatch AND internally inconsistent (the original
        # incremented twice in that case).
        if not is_match or not d['internal_consistency']:
            mismatches += 1

        # Column coloring: highlight only the offending fields in red,
        # returning to the row's status color afterwards.
        gen_id_str = str(d['gen_eos_id'])
        if gen_id_str != str(base_data['gen_eos_id']):
            gen_id_str = f"{Fore.RED}{gen_id_str}{status_color}"

        vocab_id_str = str(d['vocab_eos_id'])
        if vocab_id_str != str(base_data['vocab_eos_id']):
            vocab_id_str = f"{Fore.RED}{vocab_id_str}{status_color}"

        str_str = str(d['tok_eos_str'])
        if str_str != base_data['tok_eos_str']:
            str_str = f"{Fore.RED}{str_str}{status_color}"

        print(f"{status_color}{status_text:<10} | {gen_id_str:<8} | {vocab_id_str:<8} | {str_str:<10} | {d['name']}{Style.RESET_ALL}")

    print("-" * 100)

    # 3. Final Recommendation
    print(f"\n{Fore.CYAN}--- FINAL VERDICT ---{Style.RESET_ALL}")

    if mismatches == 0:
        print(f"{Fore.GREEN}ALL CLEAR.{Style.RESET_ALL}")
        print("1. Change YAML to: tokenizer: source: base")
        print("2. Remove: chat_template: auto")
        print("3. Ensure your base model path in YAML is correct.")
    else:
        print(f"{Fore.RED}MISMATCHES DETECTED.{Style.RESET_ALL}")
        print("1. You MUST use: tokenizer: source: union")
        print("2. However, 'union' may cause the early termination bug if IDs shift.")
        print("3. Recommendation: Remove the models marked FAIL/BROKEN from the merge.")


if __name__ == "__main__":
    main()
|
eos_scanner_readme.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Here is the `eos_scanner.py` tool.
|
| 2 |
+
|
| 3 |
+
This script is designed specifically to detect the "Silent Killers" of merges: **Token ID mismatches** and **Chat Template inconsistencies**.
|
| 4 |
+
|
| 5 |
+
### How to use it
|
| 6 |
+
1. Save the code below as `eos_scanner.py`.
|
| 7 |
+
2. Run it against your config:
|
| 8 |
+
```bash
|
| 9 |
+
python eos_scanner.py config36.yaml
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
### Regarding your YAML Question
|
| 13 |
+
|
| 14 |
+
You asked:
|
| 15 |
+
> Let me know if I should keep these lines, or null them out (and set tokenizer to base).
|
| 16 |
+
> ```yaml
|
| 17 |
+
> tokenizer:
|
| 18 |
+
> source: union
|
| 19 |
+
> chat_template: auto
|
| 20 |
+
> ```
|
| 21 |
+
|
| 22 |
+
**The Answer:**
|
| 23 |
+
Run the scanner above.
|
| 24 |
+
|
| 25 |
+
1. **If the scanner shows all GREEN (MATCH):**
|
| 26 |
+
* **Change to `source: base`**.
|
| 27 |
+
* *Why?* `source: union` attempts to merge vocabularies. If the vocabularies are already identical (which they likely are if they are all Mistral 24B derivatives), `union` adds computational overhead and, more dangerously, can accidentally re-index special tokens if one model has a slightly malformed `tokenizer.json`. Using `base` forces the merge to use the clean, working tokenizer from your base model.
|
| 28 |
+
|
| 29 |
+
2. **If the scanner shows RED (FAIL):**
|
| 30 |
+
* **Keep `source: union`** (or remove the red models).
|
| 31 |
+
* *Why?* If Model A uses token ID `2` for EOS, and Model B uses token ID `32000` for EOS, you *cannot* use `source: base`. You need `union` to handle the conflict, though this is exactly what causes "early termination" (the model generates ID 2 thinking it's a comma, but the tokenizer thinks it's EOS).
|
| 32 |
+
|
| 33 |
+
**Regarding `chat_template: auto`:**
|
| 34 |
+
It is generally safer to delete this line or set it to a specific file if you want consistent behavior. `auto` often defaults to the base model's template, but sometimes MergeKit tries to synthesize one. Since you are merging RP models, you likely want a specific template (like Mistral V3 Tekken or ChatML). I recommend removing `chat_template: auto` and letting your inference engine (Ollama/Kobold) handle the template, OR explicitly setting it to the base model's template in the YAML.
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
### Critical Findings from Source Code Analysis
|
| 39 |
+
|
| 40 |
+
1. **The "Union" Risk (`mergekit/tokenizer/build.py`):**
|
| 41 |
+
When you use `source: union`, `mergekit` calculates a permutation map for *every* model, even if they are identical. It then runs `PermutedEmbeddings` (in `embed.py`). If there is even a tiny discrepancy in `added_tokens.json` or `special_tokens_map.json` between your donors, `union` might assign a new ID to the EOS token.
|
| 42 |
+
* **The Bug:** `mergekit` often copies the `generation_config.json` from the base model *without* updating the `eos_token_id` inside it to match the new `union` tokenizer. If `union` shifts EOS from ID `2` to ID `32000`, but `generation_config` still says `2`, your model will terminate early (or never).
|
| 43 |
+
|
| 44 |
+
2. **The "Auto" Template Risk (`mergekit/merge.py`):**
|
| 45 |
+
The `chat_template: auto` logic simply counts the most common template string among donors. If your base model (Mistral Small 3) has a specific template, but you merge 10 models that use a generic Llama 2 template, `auto` will overwrite your base template. This causes the model to see `<|im_start|>` (for example) but not know how to process it because the template changed.
|
| 46 |
+
|
| 47 |
+
### Updated `eos_scanner.py`
|
| 48 |
+
|
| 49 |
+
I have updated the script to perform an **Internal Consistency Check**. It now verifies if the `eos_token_id` defined in the config actually matches the ID of the `eos_token` string in the vocabulary.
|
| 50 |
+
|
| 51 |
+
### Final YAML Advice
|
| 52 |
+
|
| 53 |
+
Based on the code review, here is the safest configuration for your YAML.
|
| 54 |
+
|
| 55 |
+
**If the scanner returns all MATCH:**
|
| 56 |
+
|
| 57 |
+
```yaml
|
| 58 |
+
tokenizer:
|
| 59 |
+
source: base
|
| 60 |
+
# chat_template: auto <-- DELETE THIS LINE COMPLETELY
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**Why?**
|
| 64 |
+
1. `source: base` forces `mergekit` to skip the complex permutation logic in `build.py`. It simply copies the tokenizer files from your base model. This guarantees that `eos_token_id` `2` remains `2`.
|
| 65 |
+
2. Deleting `chat_template` prevents `mergekit` from synthesizing a template based on a popularity contest of the donors. It will default to copying the base model's template, which is exactly what you want for a consistent chat experience.
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
Yes, creating a `Gen_ID_Patcher.py` is a **highly effective** strategy for the models marked `BROKEN | MISSING | 2`.
|
| 70 |
+
|
| 71 |
+
### Why this works
|
| 72 |
+
The screenshot confirms that these models (like `ReadyArt...Broken-Tutu`, `Morax`, `FlareRebellion`) **actually use Token ID 2** in their vocabulary (Column: `Vocab ID`). They are just missing the metadata in `generation_config.json` that tells MergeKit "Hey, I use ID 2."
|
| 73 |
+
|
| 74 |
+
By patching this file, you convert these models from **BROKEN** to **MATCH**.
|
| 75 |
+
|
| 76 |
+
**The Strategic Benefit:**
|
| 77 |
+
If you patch these "Missing ID" models, and then **remove** the actual outliers (the ones with ID `999` or `<|endoftext|>`), you will achieve a **100% MATCH** across the board. This allows you to use `tokenizer: source: base`, which eliminates the early termination bug caused by `union`.
|
| 78 |
+
|
| 79 |
+
### The Script: `Gen_ID_Patcher.py`
|
| 80 |
+
|
| 81 |
+
This script will look at your YAML, find the models, and inject `eos_token_id: 2` (or whatever your base model uses) into their `generation_config.json`.
|
| 82 |
+
|
| 83 |
+
### Instructions
|
| 84 |
+
|
| 85 |
+
1. Run the patcher:
|
| 86 |
+
```bash
|
| 87 |
+
python Gen_ID_Patcher.py config36.yaml
|
| 88 |
+
```
|
| 89 |
+
2. **Crucial Step:** Look at your `eos_scanner` output again. You must **remove** the models that are *genuinely* incompatible.
|
| 90 |
+
* **Remove:** `LatitudeGames--Hearthfire` (ID 999)
|
| 91 |
+
* **Remove:** `aixonlab--Eurydice` (ID 999)
|
| 92 |
+
* **Remove:** `Gryphe--Codex` (ID 999)
|
| 93 |
+
* **Remove:** `PocketDoc--Dans-PersonalityEngine` (ID 2, but string is `<|endoftext|>`. This is a conflict).
|
| 94 |
+
3. Run `eos_scanner.py` again.
|
| 95 |
+
4. If everything is green (MATCH), change your YAML to:
|
| 96 |
+
```yaml
|
| 97 |
+
tokenizer:
|
| 98 |
+
source: base
|
| 99 |
+
```
|
| 100 |
+
(And delete `chat_template: auto`).
|
| 101 |
+
|
| 102 |
+
This path gives you the highest probability of a stable model that stops generating correctly.
|
gen_id_patcher.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import shutil
|
| 5 |
+
import argparse
|
| 6 |
+
from colorama import init, Fore, Style
|
| 7 |
+
|
| 8 |
+
init()
|
| 9 |
+
|
| 10 |
+
def load_json(path):
    """Load JSON from *path*, returning {} on any failure.

    A missing, unreadable, or malformed file is treated as an empty
    generation config, which the patcher then fills in from scratch.
    """
    # EAFP: attempt the read directly instead of a racy exists() pre-check,
    # catching only expected failures (json.JSONDecodeError is a ValueError).
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        return {}
|
| 18 |
+
|
| 19 |
+
def save_json(path, data):
    """Serialize *data* to *path* as 2-space-indented JSON."""
    serialized = json.dumps(data, indent=2)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(serialized)
|
| 22 |
+
|
| 23 |
+
def main():
    """Inject the base model's eos_token_id into donor generation configs.

    Models whose generation_config.json lacks eos_token_id entirely get the
    base model's id written in (backing up any existing file to *.bak).
    Models that declare a *different* id are left untouched -- that is a
    genuine mismatch, not missing metadata.
    """
    parser = argparse.ArgumentParser(description="Patch missing EOS IDs in generation_config.json")
    parser.add_argument("config", help="Path to the mergekit yaml config file")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- GENERATION CONFIG PATCHER ---{Style.RESET_ALL}")

    # 1. Load Config
    with open(args.config, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    base_model_path = config.get('base_model')
    if not base_model_path:
        print("No base_model found.")
        return

    # 2. Get Target EOS ID from Base Model
    print(f"Reading Base Model: {os.path.basename(base_model_path)}")
    base_gen_path = os.path.join(base_model_path, "generation_config.json")
    base_gen = load_json(base_gen_path)

    target_eos_id = base_gen.get("eos_token_id")

    if target_eos_id is None:
        print(f"{Fore.RED}CRITICAL: Base model lacks eos_token_id. Cannot patch.{Style.RESET_ALL}")
        return

    print(f"Target EOS ID is: {Fore.GREEN}{target_eos_id}{Style.RESET_ALL}")
    print("-" * 60)

    # 3. Iterate and Patch
    # Accept both dict entries ({model: path, ...}) and bare path strings,
    # matching how eos_scanner.py reads the models list. (The previous
    # version silently dropped string entries and could KeyError on dicts
    # without a 'model' key.)
    models = []
    for m in config.get('models') or []:
        if isinstance(m, dict) and 'model' in m:
            models.append(m['model'])
        elif isinstance(m, str):
            models.append(m)

    patched_count = 0

    for model_path in models:
        model_name = os.path.basename(model_path).replace("!models--", "")
        gen_path = os.path.join(model_path, "generation_config.json")

        # Load or create empty dict
        data = load_json(gen_path)

        current_id = data.get("eos_token_id")

        # Logic: Only patch if MISSING.
        # If it exists but is different (e.g. 999), we DO NOT touch it (that's a real mismatch).
        if current_id is None:
            print(f"Patching {model_name}...")

            # Backup first
            if os.path.exists(gen_path):
                shutil.copy(gen_path, gen_path + ".bak")

            # Apply Patch
            data["eos_token_id"] = target_eos_id

            # Ensure other basics exist if file was empty
            if "bos_token_id" not in data:
                data["bos_token_id"] = 1  # Standard Mistral assumption

            save_json(gen_path, data)
            print(f"  {Fore.GREEN}-> Fixed: Added eos_token_id: {target_eos_id}{Style.RESET_ALL}")
            patched_count += 1

        elif str(current_id) != str(target_eos_id):
            print(f"Skipping {model_name}: Has ID {current_id} (Mismatch, not missing)")
        # else: already matches the base model; nothing to do.

    print("-" * 60)
    print(f"Operation Complete. Patched {patched_count} models.")
    print("Run eos_scanner.py again to verify results.")


if __name__ == "__main__":
    main()
|
model_tools.md
CHANGED
|
@@ -32,6 +32,12 @@ Tools to enhance LLM quantizations and merging
|
|
| 32 |
# [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
|
| 33 |
- Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# [fp32_to_bf16.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fp32_to_bf16.py)
|
| 36 |
- Converts FP32 to BF16 safetensors
|
| 37 |
|
|
|
|
| 32 |
# [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
|
| 33 |
- Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
|
| 34 |
|
| 35 |
+
# [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
|
| 36 |
+
- This tool scans the tokenizer jsons to detect any mismatches with EOS tokens, which cause early-termination bugs. You can then use [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch `generation_config.json` files that are missing an EOS token ID. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
|
| 37 |
+
|
| 38 |
+
# [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
|
| 39 |
+
- This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
|
| 40 |
+
|
| 41 |
# [fp32_to_bf16.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fp32_to_bf16.py)
|
| 42 |
- Converts FP32 to BF16 safetensors
|
| 43 |
|
weight_counter.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
def main():
    """Sum the per-model merge weights declared in a mergekit YAML config.

    Usage: python weight_counter.py <config.yaml>

    Counts every model entry with a scalar weight and prints the total;
    list/gradient weights are reported and skipped. Useful for sanity
    checking large della/ties merges.
    """
    if len(sys.argv) < 2:
        print("Usage: python weight_counter.py <config.yaml>")
        sys.exit(1)

    config_path = sys.argv[1]

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading file: {e}")
        sys.exit(1)

    total_weight = 0.0
    count = 0

    print(f"Scanning: {config_path}...")

    # yaml.safe_load returns None for an empty file -- guard before the
    # 'in' test (the previous version raised TypeError on empty configs).
    if config and 'models' in config:
        for m in config['models'] or []:
            # Entries may be bare path strings (no parameters) or dicts,
            # and a dict's 'parameters' key may be present but null.
            params = m.get('parameters', {}) if isinstance(m, dict) else {}
            weight = params.get('weight', 0) if isinstance(params, dict) else 0

            # Only sum if it's a number (ignores list/gradient weights)
            if isinstance(weight, (int, float)):
                total_weight += weight
                count += 1
            else:
                name = m.get('model') if isinstance(m, dict) else m
                print(f"  [!] Skipped non-scalar weight for: {name}")

    print("-" * 30)
    print(f"Models Counted: {count}")
    print(f"Total Weight Sum: {total_weight}")
    print("-" * 30)


if __name__ == "__main__":
    main()
|