Naphula committed on
Commit
7080631
·
verified ·
1 Parent(s): 5d55dd6

Upload 5 files

Browse files
Files changed (5) hide show
  1. eos_scanner.py +189 -0
  2. eos_scanner_readme.md +102 -0
  3. gen_id_patcher.py +98 -0
  4. model_tools.md +6 -0
  5. weight_counter.py +42 -0
eos_scanner.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import json
3
+ import os
4
+ import sys
5
+ import argparse
6
+ from colorama import init, Fore, Style
7
+
8
+ # Initialize colorama
9
+ init()
10
+
11
class Logger:
    """Tee stdout writes to both the terminal and a log file.

    Intended to be installed as ``sys.stdout`` so that every ``print``
    is mirrored into *filename* for later auditing.
    """

    def __init__(self, filename="eos_audit.log"):
        self.terminal = sys.stdout
        self.log = open(filename, "w", encoding="utf-8")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Flush both streams and release the log file handle.

        The original class leaked the handle for the life of the
        process; an explicit close makes the logger safe to swap out.
        """
        self.flush()
        self.log.close()

    def isatty(self):
        # Libraries such as colorama probe stdout for TTY-ness;
        # delegate so color handling keeps working after redirection.
        return getattr(self.terminal, "isatty", lambda: False)()
23
+
24
def load_json(path):
    """Best-effort JSON reader.

    Returns the parsed object, or ``None`` on any failure: missing
    file, unreadable file, or malformed JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception:
        # Missing-file and parse errors are equivalent to the caller:
        # the metadata simply is not available.
        return None
32
+
33
def get_model_metadata(model_path):
    """Collect EOS-related metadata for one model directory.

    Reads generation_config.json (the ID the model stops on),
    tokenizer_config.json (the EOS token string), and tokenizer.json
    (the actual vocabulary map), then cross-checks them.

    Returns a dict with keys: ``path``, ``name``, ``gen_eos_id``,
    ``tok_eos_str``, ``vocab_eos_id``, ``vocab_size``,
    ``internal_consistency``. Any field that cannot be resolved keeps
    the sentinel string ``"MISSING"``.
    """
    data = {
        "path": model_path,
        "name": os.path.basename(model_path).replace("!models--", ""),
        "gen_eos_id": "MISSING",    # From generation_config.json
        "tok_eos_str": "MISSING",   # From tokenizer_config.json
        "vocab_eos_id": "MISSING",  # The actual ID of the string in tokenizer.json
        "vocab_size": "MISSING",
        "internal_consistency": True,
    }

    # 1. Generation Config (What the model uses to stop)
    gen_conf = load_json(os.path.join(model_path, "generation_config.json"))
    if gen_conf:
        data["gen_eos_id"] = gen_conf.get("eos_token_id", "MISSING")
        # Handle list of EOS ids
        if isinstance(data["gen_eos_id"], list):
            data["gen_eos_id"] = data["gen_eos_id"][0]  # Take first for comparison

    # 2. Tokenizer Config (What the string is)
    tok_conf = load_json(os.path.join(model_path, "tokenizer_config.json"))
    if tok_conf:
        data["tok_eos_str"] = tok_conf.get("eos_token", "MISSING")
        # eos_token may be a dict such as {"content": "</s>", ...}
        if isinstance(data["tok_eos_str"], dict):
            data["tok_eos_str"] = data["tok_eos_str"].get("content", "MISSING")

    # 3. Tokenizer JSON (The actual map)
    # We prefer tokenizer.json (HuggingFace) over tokenizer.model
    # (SentencePiece) for inspection.
    tok_file = load_json(os.path.join(model_path, "tokenizer.json"))
    if tok_file and data["tok_eos_str"] != "MISSING":
        model_vocab = tok_file.get("model", {}).get("vocab", {})
        data["vocab_size"] = len(model_vocab)

        # Find ID of the EOS string
        if data["tok_eos_str"] in model_vocab:
            data["vocab_eos_id"] = model_vocab[data["tok_eos_str"]]
        else:
            # BUGFIX: special tokens such as EOS frequently live in the
            # top-level "added_tokens" list rather than model.vocab, so
            # the original lookup reported MISSING for them.
            for added in tok_file.get("added_tokens", []):
                if isinstance(added, dict) and added.get("content") == data["tok_eos_str"]:
                    data["vocab_eos_id"] = added.get("id", "MISSING")
                    break

        # Check Internal Consistency: does the ID in generation_config
        # match the ID of the EOS string in tokenizer.json?
        # (string-compare so int vs str IDs still align)
        if str(data["gen_eos_id"]) != str(data["vocab_eos_id"]):
            data["internal_consistency"] = False

    return data
76
+
77
def main():
    """CLI entry point: scan every model referenced by a mergekit YAML
    config and report EOS-token mismatches relative to the base model.

    All console output is also mirrored into eos_audit.log via Logger.
    Returns early if the YAML defines no base_model.
    """
    parser = argparse.ArgumentParser(description="Scan models for EOS/Tokenizer mismatches.")
    parser.add_argument("config", help="Path to the mergekit yaml config file")
    args = parser.parse_args()

    # Tee all subsequent prints into eos_audit.log.
    sys.stdout = Logger()

    print(f"{Fore.CYAN}--- EOS & TOKENIZER SCANNER (DEEP SCAN) ---{Style.RESET_ALL}")
    print(f"Scanning config: {args.config}\n")

    with open(args.config, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    base_model_path = config.get('base_model')

    # Extract model paths from list of dicts (plain-string entries are
    # accepted as well).
    models = []
    if config.get('models'):
        for m in config['models']:
            if isinstance(m, dict) and 'model' in m:
                models.append(m['model'])
            elif isinstance(m, str):
                models.append(m)

    if not base_model_path:
        print(f"{Fore.RED}CRITICAL: No base_model defined in YAML.{Style.RESET_ALL}")
        return

    # 1. Analyze Base Model — this is the reference every donor is
    # compared against below.
    print("Analyzing Base Model...")
    base_data = get_model_metadata(base_model_path)

    print(f"{Fore.GREEN}BASE MODEL: {base_data['name']}{Style.RESET_ALL}")
    print(f"  Gen Config EOS ID: {base_data['gen_eos_id']}")
    print(f"  Tokenizer EOS Str: {base_data['tok_eos_str']}")
    print(f"  Actual Vocab ID: {base_data['vocab_eos_id']}")

    if not base_data['internal_consistency']:
        print(f"  {Fore.RED}INTERNAL ERROR: Base model generation_config ID does not match tokenizer ID!{Style.RESET_ALL}")
    else:
        print(f"  Internal Consistency: {Fore.GREEN}PASS{Style.RESET_ALL}")
    print("-" * 80)

    # 2. Analyze Donors — one table row per model.
    print(f"{'Status':<10} | {'Gen ID':<8} | {'Vocab ID':<8} | {'EOS Str':<10} | {'Model Name'}")
    print("-" * 100)

    mismatches = 0

    for model_path in models:
        d = get_model_metadata(model_path)

        is_match = True
        reasons = []  # NOTE(review): reasons are collected but never displayed

        # Check against Base Model (string-compare so int/str IDs align)
        if str(d['gen_eos_id']) != str(base_data['gen_eos_id']):
            is_match = False
            reasons.append("GenID")

        if str(d['vocab_eos_id']) != str(base_data['vocab_eos_id']):
            is_match = False
            reasons.append("VocabID")

        if d['tok_eos_str'] != base_data['tok_eos_str']:
            is_match = False
            reasons.append("Str")

        # Formatting
        status_color = Fore.GREEN
        status_text = "MATCH"

        if not is_match:
            status_color = Fore.RED
            status_text = f"FAIL"
            mismatches += 1

        # Internal consistency check override: a BROKEN model (its own
        # config disagrees with its own tokenizer) trumps FAIL. Note a
        # model that is both mismatched and broken is counted twice.
        if not d['internal_consistency']:
            status_color = Fore.MAGENTA
            status_text = "BROKEN"
            mismatches += 1

        # Column coloring: paint each mismatching cell red, then emit
        # the row's status color so the rest of the row is restored.
        gen_id_str = str(d['gen_eos_id'])
        if gen_id_str != str(base_data['gen_eos_id']): gen_id_str = f"{Fore.RED}{gen_id_str}{status_color}"

        vocab_id_str = str(d['vocab_eos_id'])
        if vocab_id_str != str(base_data['vocab_eos_id']): vocab_id_str = f"{Fore.RED}{vocab_id_str}{status_color}"

        str_str = str(d['tok_eos_str'])
        if str_str != base_data['tok_eos_str']: str_str = f"{Fore.RED}{str_str}{status_color}"

        print(f"{status_color}{status_text:<10} | {gen_id_str:<8} | {vocab_id_str:<8} | {str_str:<10} | {d['name']}{Style.RESET_ALL}")

    print("-" * 100)

    # 3. Final Recommendation
    print(f"\n{Fore.CYAN}--- FINAL VERDICT ---{Style.RESET_ALL}")

    if mismatches == 0:
        print(f"{Fore.GREEN}ALL CLEAR.{Style.RESET_ALL}")
        print("1. Change YAML to: tokenizer: source: base")
        print("2. Remove: chat_template: auto")
        print("3. Ensure your base model path in YAML is correct.")
    else:
        print(f"{Fore.RED}MISMATCHES DETECTED.{Style.RESET_ALL}")
        print("1. You MUST use: tokenizer: source: union")
        print("2. However, 'union' may cause the early termination bug if IDs shift.")
        print("3. Recommendation: Remove the models marked FAIL/BROKEN from the merge.")

if __name__ == "__main__":
    main()
eos_scanner_readme.md ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Here is the `eos_scanner.py` tool.
2
+
3
+ This script is designed specifically to detect the "Silent Killers" of merges: **Token ID mismatches** and **Chat Template inconsistencies**.
4
+
5
+ ### How to use it
6
+ 1. Save the code below as `eos_scanner.py`.
7
+ 2. Run it against your config:
8
+ ```bash
9
+ python eos_scanner.py config36.yaml
10
+ ```
11
+
12
+ ### Regarding your YAML Question
13
+
14
+ You asked:
15
+ > Let me know if I should keep these lines, or null them out (and set tokenizer to base).
16
+ > ```yaml
17
+ > tokenizer:
18
+ > source: union
19
+ > chat_template: auto
20
+ > ```
21
+
22
+ **The Answer:**
23
+ Run the scanner above.
24
+
25
+ 1. **If the scanner shows all GREEN (MATCH):**
26
+ * **Change to `source: base`**.
27
+ * *Why?* `source: union` attempts to merge vocabularies. If the vocabularies are already identical (which they likely are if they are all Mistral 24B derivatives), `union` adds computational overhead and, more dangerously, can accidentally re-index special tokens if one model has a slightly malformed `tokenizer.json`. Using `base` forces the merge to use the clean, working tokenizer from your base model.
28
+
29
+ 2. **If the scanner shows RED (FAIL):**
30
+ * **Keep `source: union`** (or remove the red models).
31
+ * *Why?* If Model A uses token ID `2` for EOS, and Model B uses token ID `32000` for EOS, you *cannot* use `source: base`. You need `union` to handle the conflict, though this is exactly what causes "early termination" (the model generates ID 2 thinking it's a comma, but the tokenizer thinks it's EOS).
32
+
33
+ **Regarding `chat_template: auto`:**
34
+ It is generally safer to delete this line or set it to a specific file if you want consistent behavior. `auto` often defaults to the base model's template, but sometimes MergeKit tries to synthesize one. Since you are merging RP models, you likely want a specific template (like Mistral V3 Tekken or ChatML). I recommend removing `chat_template: auto` and letting your inference engine (Ollama/Kobold) handle the template, OR explicitly setting it to the base model's template in the YAML.
35
+
36
+ ---
37
+
38
+ ### Critical Findings from Source Code Analysis
39
+
40
+ 1. **The "Union" Risk (`mergekit/tokenizer/build.py`):**
41
+ When you use `source: union`, `mergekit` calculates a permutation map for *every* model, even if they are identical. It then runs `PermutedEmbeddings` (in `embed.py`). If there is even a tiny discrepancy in `added_tokens.json` or `special_tokens_map.json` between your donors, `union` might assign a new ID to the EOS token.
42
+ * **The Bug:** `mergekit` often copies the `generation_config.json` from the base model *without* updating the `eos_token_id` inside it to match the new `union` tokenizer. If `union` shifts EOS from ID `2` to ID `32000`, but `generation_config` still says `2`, your model will terminate early (or never).
43
+
44
+ 2. **The "Auto" Template Risk (`mergekit/merge.py`):**
45
+ The `chat_template: auto` logic simply counts the most common template string among donors. If your base model (Mistral Small 3) has a specific template, but you merge 10 models that use a generic Llama 2 template, `auto` will overwrite your base template. This causes the model to see `<|im_start|>` (for example) but not know how to process it because the template changed.
46
+
47
+ ### Updated `eos_scanner.py`
48
+
49
+ I have updated the script to perform an **Internal Consistency Check**. It now verifies if the `eos_token_id` defined in the config actually matches the ID of the `eos_token` string in the vocabulary.
50
+
51
+ ### Final YAML Advice
52
+
53
+ Based on the code review, here is the safest configuration for your YAML.
54
+
55
+ **If the scanner returns all MATCH:**
56
+
57
+ ```yaml
58
+ tokenizer:
59
+ source: base
60
+ # chat_template: auto <-- DELETE THIS LINE COMPLETELY
61
+ ```
62
+
63
+ **Why?**
64
+ 1. `source: base` forces `mergekit` to skip the complex permutation logic in `build.py`. It simply copies the tokenizer files from your base model. This guarantees that `eos_token_id` `2` remains `2`.
65
+ 2. Deleting `chat_template` prevents `mergekit` from synthesizing a template based on a popularity contest of the donors. It will default to copying the base model's template, which is exactly what you want for a consistent chat experience.
66
+
67
+ ---
68
+
69
+ Yes, creating a `Gen_ID_Patcher.py` is a **highly effective** strategy for the models marked `BROKEN | MISSING | 2`.
70
+
71
+ ### Why this works
72
+ The screenshot confirms that these models (like `ReadyArt...Broken-Tutu`, `Morax`, `FlareRebellion`) **actually use Token ID 2** in their vocabulary (Column: `Vocab ID`). They are just missing the metadata in `generation_config.json` that tells MergeKit "Hey, I use ID 2."
73
+
74
+ By patching this file, you convert these models from **BROKEN** to **MATCH**.
75
+
76
+ **The Strategic Benefit:**
77
+ If you patch these "Missing ID" models, and then **remove** the actual outliers (the ones with ID `999` or `<|endoftext|>`), you will achieve a **100% MATCH** across the board. This allows you to use `tokenizer: source: base`, which eliminates the early termination bug caused by `union`.
78
+
79
+ ### The Script: `Gen_ID_Patcher.py`
80
+
81
+ This script will look at your YAML, find the models, and inject `eos_token_id: 2` (or whatever your base model uses) into their `generation_config.json`.
82
+
83
+ ### Instructions
84
+
85
+ 1. Run the patcher:
86
+ ```bash
87
+ python Gen_ID_Patcher.py config36.yaml
88
+ ```
89
+ 2. **Crucial Step:** Look at your `eos_scanner` output again. You must **remove** the models that are *genuinely* incompatible.
90
+ * **Remove:** `LatitudeGames--Hearthfire` (ID 999)
91
+ * **Remove:** `aixonlab--Eurydice` (ID 999)
92
+ * **Remove:** `Gryphe--Codex` (ID 999)
93
+ * **Remove:** `PocketDoc--Dans-PersonalityEngine` (ID 2, but string is `<|endoftext|>`. This is a conflict).
94
+ 3. Run `eos_scanner.py` again.
95
+ 4. If everything is green (MATCH), change your YAML to:
96
+ ```yaml
97
+ tokenizer:
98
+ source: base
99
+ ```
100
+ (And delete `chat_template: auto`).
101
+
102
+ This path gives you the highest probability of a stable model that stops generating correctly.
gen_id_patcher.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import json
3
+ import os
4
+ import shutil
5
+ import argparse
6
+ from colorama import init, Fore, Style
7
+
8
+ init()
9
+
10
def load_json(path):
    """Load JSON from *path*.

    Falls back to an empty dict when the file is absent or cannot be
    parsed, so callers can always treat the result as a mapping.
    """
    result = {}
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                result = json.load(handle)
        except Exception:
            result = {}
    return result
18
+
19
def save_json(path, data):
    """Serialize *data* to *path* as 2-space-indented JSON."""
    serialized = json.dumps(data, indent=2)
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
22
+
23
def main():
    """Patch a missing ``eos_token_id`` into each donor model's
    generation_config.json, using the base model's ID as the target.

    Models whose existing ID *differs* from the base are deliberately
    left untouched -- that is a genuine mismatch, not missing metadata.
    A ``.bak`` backup is written before any existing file is modified.
    """
    parser = argparse.ArgumentParser(description="Patch missing EOS IDs in generation_config.json")
    parser.add_argument("config", help="Path to the mergekit yaml config file")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- GENERATION CONFIG PATCHER ---{Style.RESET_ALL}")

    # 1. Load Config
    with open(args.config, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    base_model_path = config.get('base_model')
    if not base_model_path:
        print("No base_model found.")
        return

    # 2. Get Target EOS ID from Base Model
    print(f"Reading Base Model: {os.path.basename(base_model_path)}")
    base_gen_path = os.path.join(base_model_path, "generation_config.json")
    base_gen = load_json(base_gen_path)

    target_eos_id = base_gen.get("eos_token_id")

    if target_eos_id is None:
        print(f"{Fore.RED}CRITICAL: Base model lacks eos_token_id. Cannot patch.{Style.RESET_ALL}")
        return

    print(f"Target EOS ID is: {Fore.GREEN}{target_eos_id}{Style.RESET_ALL}")
    print("-" * 60)

    # 3. Iterate and Patch
    # BUGFIX: match eos_scanner.py's extraction -- accept both
    # {'model': path} dict entries and plain string entries, skip dicts
    # lacking a 'model' key instead of raising KeyError, and tolerate
    # an explicit `models: null` in the YAML.
    models = []
    for m in config.get('models') or []:
        if isinstance(m, dict) and 'model' in m:
            models.append(m['model'])
        elif isinstance(m, str):
            models.append(m)

    patched_count = 0

    for model_path in models:
        model_name = os.path.basename(model_path).replace("!models--", "")
        gen_path = os.path.join(model_path, "generation_config.json")

        # Load or create empty dict
        data = load_json(gen_path)

        current_id = data.get("eos_token_id")

        # Logic: Only patch if MISSING.
        # If it exists but is different (e.g. 999), we DO NOT touch it
        # (that's a real mismatch).
        if current_id is None:
            print(f"Patching {model_name}...")

            # Backup first so the change is reversible
            if os.path.exists(gen_path):
                shutil.copy(gen_path, gen_path + ".bak")

            # Apply Patch
            data["eos_token_id"] = target_eos_id

            # Ensure other basics exist if file was empty
            if "bos_token_id" not in data:
                data["bos_token_id"] = 1  # Standard Mistral assumption

            save_json(gen_path, data)
            print(f"  {Fore.GREEN}-> Fixed: Added eos_token_id: {target_eos_id}{Style.RESET_ALL}")
            patched_count += 1

        elif str(current_id) != str(target_eos_id):
            print(f"Skipping {model_name}: Has ID {current_id} (Mismatch, not missing)")
        else:
            # Already matches, do nothing
            pass

    print("-" * 60)
    print(f"Operation Complete. Patched {patched_count} models.")
    print("Run eos_scanner.py again to verify results.")

if __name__ == "__main__":
    main()
model_tools.md CHANGED
@@ -32,6 +32,12 @@ Tools to enhance LLM quantizations and merging
32
  # [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
33
  - Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
34
 
 
 
 
 
 
 
35
  # [fp32_to_bf16.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fp32_to_bf16.py)
36
  - Converts FP32 to BF16 safetensors
37
 
 
32
  # [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
33
  - Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
34
 
35
+ # [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
36
+ - This tool scans the tokenizer JSONs to detect any mismatches with EOS tokens, which cause early-termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch the missing EOS token ID into each model's `generation_config.json`. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
37
+
38
+ # [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
39
+ - This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
40
+
41
  # [fp32_to_bf16.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fp32_to_bf16.py)
42
  - Converts FP32 to BF16 safetensors
43
 
weight_counter.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import sys
3
+
4
+ def main():
5
+ if len(sys.argv) < 2:
6
+ print("Usage: python weight_counter.py <config.yaml>")
7
+ sys.exit(1)
8
+
9
+ config_path = sys.argv[1]
10
+
11
+ try:
12
+ with open(config_path, 'r', encoding='utf-8') as f:
13
+ config = yaml.safe_load(f)
14
+ except Exception as e:
15
+ print(f"Error reading file: {e}")
16
+ sys.exit(1)
17
+
18
+ total_weight = 0.0
19
+ count = 0
20
+
21
+ print(f"Scanning: {config_path}...")
22
+
23
+ if 'models' in config:
24
+ for m in config['models']:
25
+ # Safely get parameters -> weight, default to 0 if missing
26
+ params = m.get('parameters', {})
27
+ weight = params.get('weight', 0)
28
+
29
+ # Only sum if it's a number (ignores list/gradient weights)
30
+ if isinstance(weight, (int, float)):
31
+ total_weight += weight
32
+ count += 1
33
+ else:
34
+ print(f" [!] Skipped non-scalar weight for: {m.get('model')}")
35
+
36
+ print("-" * 30)
37
+ print(f"Models Counted: {count}")
38
+ print(f"Total Weight Sum: {total_weight}")
39
+ print("-" * 30)
40
+
41
+ if __name__ == "__main__":
42
+ main()