Naphula committed on
Commit
7080631
·
verified ·
1 Parent(s): 5d55dd6

Upload 5 files

Browse files
Files changed (5) hide show
  1. eos_scanner.py +189 -0
  2. eos_scanner_readme.md +102 -0
  3. gen_id_patcher.py +98 -0
  4. model_tools.md +6 -0
  5. weight_counter.py +42 -0
eos_scanner.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import json
3
+ import os
4
+ import sys
5
+ import argparse
6
+ from colorama import init, Fore, Style
7
+
8
+ # Initialize colorama
9
+ init()
10
+
11
class Logger:
    """Tee stdout writes to both the terminal and a log file.

    Intended to be installed as ``sys.stdout`` so that every ``print``
    is mirrored into *filename* for later auditing.
    """

    def __init__(self, filename="eos_audit.log"):
        self.terminal = sys.stdout
        self.log = open(filename, "w", encoding="utf-8")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Flush both streams and release the log file handle.

        The original class leaked the handle for the life of the
        process; an explicit close makes the logger safe to swap out.
        """
        self.flush()
        self.log.close()

    def isatty(self):
        # Libraries such as colorama probe stdout for TTY-ness;
        # delegate so color handling keeps working after redirection.
        return getattr(self.terminal, "isatty", lambda: False)()
23
+
24
def load_json(path):
    """Best-effort JSON reader.

    Returns the parsed object, or ``None`` on any failure: missing
    file, unreadable file, or malformed JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception:
        # Missing-file and parse errors are equivalent to the caller:
        # the metadata simply is not available.
        return None
32
+
33
def get_model_metadata(model_path):
    """Collect EOS-related metadata for one model directory.

    Reads generation_config.json (the ID the model stops on),
    tokenizer_config.json (the EOS token string), and tokenizer.json
    (the actual vocabulary map), then cross-checks them.

    Returns a dict with keys: ``path``, ``name``, ``gen_eos_id``,
    ``tok_eos_str``, ``vocab_eos_id``, ``vocab_size``,
    ``internal_consistency``. Any field that cannot be resolved keeps
    the sentinel string ``"MISSING"``.
    """
    data = {
        "path": model_path,
        "name": os.path.basename(model_path).replace("!models--", ""),
        "gen_eos_id": "MISSING",    # From generation_config.json
        "tok_eos_str": "MISSING",   # From tokenizer_config.json
        "vocab_eos_id": "MISSING",  # The actual ID of the string in tokenizer.json
        "vocab_size": "MISSING",
        "internal_consistency": True,
    }

    # 1. Generation Config (What the model uses to stop)
    gen_conf = load_json(os.path.join(model_path, "generation_config.json"))
    if gen_conf:
        data["gen_eos_id"] = gen_conf.get("eos_token_id", "MISSING")
        # Handle list of EOS ids
        if isinstance(data["gen_eos_id"], list):
            data["gen_eos_id"] = data["gen_eos_id"][0]  # Take first for comparison

    # 2. Tokenizer Config (What the string is)
    tok_conf = load_json(os.path.join(model_path, "tokenizer_config.json"))
    if tok_conf:
        data["tok_eos_str"] = tok_conf.get("eos_token", "MISSING")
        # eos_token may be a dict such as {"content": "</s>", ...}
        if isinstance(data["tok_eos_str"], dict):
            data["tok_eos_str"] = data["tok_eos_str"].get("content", "MISSING")

    # 3. Tokenizer JSON (The actual map)
    # We prefer tokenizer.json (HuggingFace) over tokenizer.model
    # (SentencePiece) for inspection.
    tok_file = load_json(os.path.join(model_path, "tokenizer.json"))
    if tok_file and data["tok_eos_str"] != "MISSING":
        model_vocab = tok_file.get("model", {}).get("vocab", {})
        data["vocab_size"] = len(model_vocab)

        # Find ID of the EOS string
        if data["tok_eos_str"] in model_vocab:
            data["vocab_eos_id"] = model_vocab[data["tok_eos_str"]]
        else:
            # BUGFIX: special tokens such as EOS frequently live in the
            # top-level "added_tokens" list rather than model.vocab, so
            # the original lookup reported MISSING for them.
            for added in tok_file.get("added_tokens", []):
                if isinstance(added, dict) and added.get("content") == data["tok_eos_str"]:
                    data["vocab_eos_id"] = added.get("id", "MISSING")
                    break

        # Check Internal Consistency: does the ID in generation_config
        # match the ID of the EOS string in tokenizer.json?
        # (string-compare so int vs str IDs still align)
        if str(data["gen_eos_id"]) != str(data["vocab_eos_id"]):
            data["internal_consistency"] = False

    return data
76
+
77
def main():
    """CLI entry point: scan every model referenced by a mergekit YAML
    config and report EOS-token mismatches relative to the base model.

    All console output is also mirrored into eos_audit.log via Logger.
    Returns early if the YAML defines no base_model.
    """
    parser = argparse.ArgumentParser(description="Scan models for EOS/Tokenizer mismatches.")
    parser.add_argument("config", help="Path to the mergekit yaml config file")
    args = parser.parse_args()

    # Tee all subsequent prints into eos_audit.log.
    sys.stdout = Logger()

    print(f"{Fore.CYAN}--- EOS & TOKENIZER SCANNER (DEEP SCAN) ---{Style.RESET_ALL}")
    print(f"Scanning config: {args.config}\n")

    with open(args.config, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    base_model_path = config.get('base_model')

    # Extract model paths from list of dicts (plain-string entries are
    # accepted as well).
    models = []
    if config.get('models'):
        for m in config['models']:
            if isinstance(m, dict) and 'model' in m:
                models.append(m['model'])
            elif isinstance(m, str):
                models.append(m)

    if not base_model_path:
        print(f"{Fore.RED}CRITICAL: No base_model defined in YAML.{Style.RESET_ALL}")
        return

    # 1. Analyze Base Model — this is the reference every donor is
    # compared against below.
    print("Analyzing Base Model...")
    base_data = get_model_metadata(base_model_path)

    print(f"{Fore.GREEN}BASE MODEL: {base_data['name']}{Style.RESET_ALL}")
    print(f"  Gen Config EOS ID: {base_data['gen_eos_id']}")
    print(f"  Tokenizer EOS Str: {base_data['tok_eos_str']}")
    print(f"  Actual Vocab ID: {base_data['vocab_eos_id']}")

    if not base_data['internal_consistency']:
        print(f"  {Fore.RED}INTERNAL ERROR: Base model generation_config ID does not match tokenizer ID!{Style.RESET_ALL}")
    else:
        print(f"  Internal Consistency: {Fore.GREEN}PASS{Style.RESET_ALL}")
    print("-" * 80)

    # 2. Analyze Donors — one table row per model.
    print(f"{'Status':<10} | {'Gen ID':<8} | {'Vocab ID':<8} | {'EOS Str':<10} | {'Model Name'}")
    print("-" * 100)

    mismatches = 0

    for model_path in models:
        d = get_model_metadata(model_path)

        is_match = True
        reasons = []  # NOTE(review): reasons are collected but never displayed

        # Check against Base Model (string-compare so int/str IDs align)
        if str(d['gen_eos_id']) != str(base_data['gen_eos_id']):
            is_match = False
            reasons.append("GenID")

        if str(d['vocab_eos_id']) != str(base_data['vocab_eos_id']):
            is_match = False
            reasons.append("VocabID")

        if d['tok_eos_str'] != base_data['tok_eos_str']:
            is_match = False
            reasons.append("Str")

        # Formatting
        status_color = Fore.GREEN
        status_text = "MATCH"

        if not is_match:
            status_color = Fore.RED
            status_text = f"FAIL"
            mismatches += 1

        # Internal consistency check override: a BROKEN model (its own
        # config disagrees with its own tokenizer) trumps FAIL. Note a
        # model that is both mismatched and broken is counted twice.
        if not d['internal_consistency']:
            status_color = Fore.MAGENTA
            status_text = "BROKEN"
            mismatches += 1

        # Column coloring: paint each mismatching cell red, then emit
        # the row's status color so the rest of the row is restored.
        gen_id_str = str(d['gen_eos_id'])
        if gen_id_str != str(base_data['gen_eos_id']): gen_id_str = f"{Fore.RED}{gen_id_str}{status_color}"

        vocab_id_str = str(d['vocab_eos_id'])
        if vocab_id_str != str(base_data['vocab_eos_id']): vocab_id_str = f"{Fore.RED}{vocab_id_str}{status_color}"

        str_str = str(d['tok_eos_str'])
        if str_str != base_data['tok_eos_str']: str_str = f"{Fore.RED}{str_str}{status_color}"

        print(f"{status_color}{status_text:<10} | {gen_id_str:<8} | {vocab_id_str:<8} | {str_str:<10} | {d['name']}{Style.RESET_ALL}")

    print("-" * 100)

    # 3. Final Recommendation
    print(f"\n{Fore.CYAN}--- FINAL VERDICT ---{Style.RESET_ALL}")

    if mismatches == 0:
        print(f"{Fore.GREEN}ALL CLEAR.{Style.RESET_ALL}")
        print("1. Change YAML to: tokenizer: source: base")
        print("2. Remove: chat_template: auto")
        print("3. Ensure your base model path in YAML is correct.")
    else:
        print(f"{Fore.RED}MISMATCHES DETECTED.{Style.RESET_ALL}")
        print("1. You MUST use: tokenizer: source: union")
        print("2. However, 'union' may cause the early termination bug if IDs shift.")
        print("3. Recommendation: Remove the models marked FAIL/BROKEN from the merge.")

if __name__ == "__main__":
    main()
eos_scanner_readme.md ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Here is the `eos_scanner.py` tool.
2
+
3
+ This script is designed specifically to detect the "Silent Killers" of merges: **Token ID mismatches** and **Chat Template inconsistencies**.
4
+
5
+ ### How to use it
6
+ 1. Save the code below as `eos_scanner.py`.
7
+ 2. Run it against your config:
8
+ ```bash
9
+ python eos_scanner.py config36.yaml
10
+ ```
11
+
12
+ ### Regarding your YAML Question
13
+
14
+ You asked:
15
+ > Let me know if I should keep these lines, or null them out (and set tokenizer to base).
16
+ > ```yaml
17
+ > tokenizer:
18
+ > source: union
19
+ > chat_template: auto
20
+ > ```
21
+
22
+ **The Answer:**
23
+ Run the scanner above.
24
+
25
+ 1. **If the scanner shows all GREEN (MATCH):**
26
+ * **Change to `source: base`**.
27
+ * *Why?* `source: union` attempts to merge vocabularies. If the vocabularies are already identical (which they likely are if they are all Mistral 24B derivatives), `union` adds computational overhead and, more dangerously, can accidentally re-index special tokens if one model has a slightly malformed `tokenizer.json`. Using `base` forces the merge to use the clean, working tokenizer from your base model.
28
+
29
+ 2. **If the scanner shows RED (FAIL):**
30
+ * **Keep `source: union`** (or remove the red models).
31
+ * *Why?* If Model A uses token ID `2` for EOS, and Model B uses token ID `32000` for EOS, you *cannot* use `source: base`. You need `union` to handle the conflict, though this is exactly what causes "early termination" (the model generates ID 2 thinking it's a comma, but the tokenizer thinks it's EOS).
32
+
33
+ **Regarding `chat_template: auto`:**
34
+ It is generally safer to delete this line or set it to a specific file if you want consistent behavior. `auto` often defaults to the base model's template, but sometimes MergeKit tries to synthesize one. Since you are merging RP models, you likely want a specific template (like Mistral V3 Tekken or ChatML). I recommend removing `chat_template: auto` and letting your inference engine (Ollama/Kobold) handle the template, OR explicitly setting it to the base model's template in the YAML.
35
+
36
+ ---
37
+
38
+ ### Critical Findings from Source Code Analysis
39
+
40
+ 1. **The "Union" Risk (`mergekit/tokenizer/build.py`):**
41
+ When you use `source: union`, `mergekit` calculates a permutation map for *every* model, even if they are identical. It then runs `PermutedEmbeddings` (in `embed.py`). If there is even a tiny discrepancy in `added_tokens.json` or `special_tokens_map.json` between your donors, `union` might assign a new ID to the EOS token.
42
+ * **The Bug:** `mergekit` often copies the `generation_config.json` from the base model *without* updating the `eos_token_id` inside it to match the new `union` tokenizer. If `union` shifts EOS from ID `2` to ID `32000`, but `generation_config` still says `2`, your model will terminate early (or never).
43
+
44
+ 2. **The "Auto" Template Risk (`mergekit/merge.py`):**
45
+ The `chat_template: auto` logic simply counts the most common template string among donors. If your base model (Mistral Small 3) has a specific template, but you merge 10 models that use a generic Llama 2 template, `auto` will overwrite your base template. This causes the model to see `<|im_start|>` (for example) but not know how to process it because the template changed.
46
+
47
+ ### Updated `eos_scanner.py`
48
+
49
+ I have updated the script to perform an **Internal Consistency Check**. It now verifies if the `eos_token_id` defined in the config actually matches the ID of the `eos_token` string in the vocabulary.
50
+
51
+ ### Final YAML Advice
52
+
53
+ Based on the code review, here is the safest configuration for your YAML.
54
+
55
+ **If the scanner returns all MATCH:**
56
+
57
+ ```yaml
58
+ tokenizer:
59
+ source: base
60
+ # chat_template: auto <-- DELETE THIS LINE COMPLETELY
61
+ ```
62
+
63
+ **Why?**
64
+ 1. `source: base` forces `mergekit` to skip the complex permutation logic in `build.py`. It simply copies the tokenizer files from your base model. This guarantees that `eos_token_id` `2` remains `2`.
65
+ 2. Deleting `chat_template` prevents `mergekit` from synthesizing a template based on a popularity contest of the donors. It will default to copying the base model's template, which is exactly what you want for a consistent chat experience.
66
+
67
+ ---
68
+
69
+ Yes, creating a `Gen_ID_Patcher.py` is a **highly effective** strategy for the models marked `BROKEN | MISSING | 2`.
70
+
71
+ ### Why this works
72
+ The screenshot confirms that these models (like `ReadyArt...Broken-Tutu`, `Morax`, `FlareRebellion`) **actually use Token ID 2** in their vocabulary (Column: `Vocab ID`). They are just missing the metadata in `generation_config.json` that tells MergeKit "Hey, I use ID 2."
73
+
74
+ By patching this file, you convert these models from **BROKEN** to **MATCH**.
75
+
76
+ **The Strategic Benefit:**
77
+ If you patch these "Missing ID" models, and then **remove** the actual outliers (the ones with ID `999` or `<|endoftext|>`), you will achieve a **100% MATCH** across the board. This allows you to use `tokenizer: source: base`, which eliminates the early termination bug caused by `union`.
78
+
79
+ ### The Script: `Gen_ID_Patcher.py`
80
+
81
+ This script will look at your YAML, find the models, and inject `eos_token_id: 2` (or whatever your base model uses) into their `generation_config.json`.
82
+
83
+ ### Instructions
84
+
85
+ 1. Run the patcher:
86
+ ```bash
87
+ python Gen_ID_Patcher.py config36.yaml
88
+ ```
89
+ 2. **Crucial Step:** Look at your `eos_scanner` output again. You must **remove** the models that are *genuinely* incompatible.
90
+ * **Remove:** `LatitudeGames--Hearthfire` (ID 999)
91
+ * **Remove:** `aixonlab--Eurydice` (ID 999)
92
+ * **Remove:** `Gryphe--Codex` (ID 999)
93
+ * **Remove:** `PocketDoc--Dans-PersonalityEngine` (ID 2, but string is `<|endoftext|>`. This is a conflict).
94
+ 3. Run `eos_scanner.py` again.
95
+ 4. If everything is green (MATCH), change your YAML to:
96
+ ```yaml
97
+ tokenizer:
98
+ source: base
99
+ ```
100
+ (And delete `chat_template: auto`).
101
+
102
+ This path gives you the highest probability of a stable model that stops generating correctly.
gen_id_patcher.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import json
3
+ import os
4
+ import shutil
5
+ import argparse
6
+ from colorama import init, Fore, Style
7
+
8
+ init()
9
+
10
def load_json(path):
    """Load JSON from *path*.

    Falls back to an empty dict when the file is absent or cannot be
    parsed, so callers can always treat the result as a mapping.
    """
    result = {}
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                result = json.load(handle)
        except Exception:
            result = {}
    return result
18
+
19
def save_json(path, data):
    """Serialize *data* to *path* as 2-space-indented JSON."""
    serialized = json.dumps(data, indent=2)
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
22
+
23
def main():
    """Patch a missing ``eos_token_id`` into each donor model's
    generation_config.json, using the base model's ID as the target.

    Models whose existing ID *differs* from the base are deliberately
    left untouched -- that is a genuine mismatch, not missing metadata.
    A ``.bak`` backup is written before any existing file is modified.
    """
    parser = argparse.ArgumentParser(description="Patch missing EOS IDs in generation_config.json")
    parser.add_argument("config", help="Path to the mergekit yaml config file")
    args = parser.parse_args()

    print(f"{Fore.CYAN}--- GENERATION CONFIG PATCHER ---{Style.RESET_ALL}")

    # 1. Load Config
    with open(args.config, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    base_model_path = config.get('base_model')
    if not base_model_path:
        print("No base_model found.")
        return

    # 2. Get Target EOS ID from Base Model
    print(f"Reading Base Model: {os.path.basename(base_model_path)}")
    base_gen_path = os.path.join(base_model_path, "generation_config.json")
    base_gen = load_json(base_gen_path)

    target_eos_id = base_gen.get("eos_token_id")

    if target_eos_id is None:
        print(f"{Fore.RED}CRITICAL: Base model lacks eos_token_id. Cannot patch.{Style.RESET_ALL}")
        return

    print(f"Target EOS ID is: {Fore.GREEN}{target_eos_id}{Style.RESET_ALL}")
    print("-" * 60)

    # 3. Iterate and Patch
    # BUGFIX: match eos_scanner.py's extraction -- accept both
    # {'model': path} dict entries and plain string entries, skip dicts
    # lacking a 'model' key instead of raising KeyError, and tolerate
    # an explicit `models: null` in the YAML.
    models = []
    for m in config.get('models') or []:
        if isinstance(m, dict) and 'model' in m:
            models.append(m['model'])
        elif isinstance(m, str):
            models.append(m)

    patched_count = 0

    for model_path in models:
        model_name = os.path.basename(model_path).replace("!models--", "")
        gen_path = os.path.join(model_path, "generation_config.json")

        # Load or create empty dict
        data = load_json(gen_path)

        current_id = data.get("eos_token_id")

        # Logic: Only patch if MISSING.
        # If it exists but is different (e.g. 999), we DO NOT touch it
        # (that's a real mismatch).
        if current_id is None:
            print(f"Patching {model_name}...")

            # Backup first so the change is reversible
            if os.path.exists(gen_path):
                shutil.copy(gen_path, gen_path + ".bak")

            # Apply Patch
            data["eos_token_id"] = target_eos_id

            # Ensure other basics exist if file was empty
            if "bos_token_id" not in data:
                data["bos_token_id"] = 1  # Standard Mistral assumption

            save_json(gen_path, data)
            print(f"  {Fore.GREEN}-> Fixed: Added eos_token_id: {target_eos_id}{Style.RESET_ALL}")
            patched_count += 1

        elif str(current_id) != str(target_eos_id):
            print(f"Skipping {model_name}: Has ID {current_id} (Mismatch, not missing)")
        else:
            # Already matches, do nothing
            pass

    print("-" * 60)
    print(f"Operation Complete. Patched {patched_count} models.")
    print("Run eos_scanner.py again to verify results.")

if __name__ == "__main__":
    main()
model_tools.md CHANGED
@@ -32,6 +32,12 @@ Tools to enhance LLM quantizations and merging
32
  # [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
33
  - Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
34
 
 
 
 
 
 
 
35
  # [fp32_to_bf16.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fp32_to_bf16.py)
36
  - Converts FP32 to BF16 safetensors
37
 
 
32
  # [metadata_audit.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/metadata_audit.py)
33
  - Checks multiple models within subdirectories for vocab or rope mismatch (useful for large merges). Calibrated for Mistral Nemo 12B by default.
34
 
35
+ # [eos_scanner.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner.py)
36
+ - This tool scans the tokenizer JSONs to detect any mismatches with EOS tokens, which cause early-termination bugs. You can then use the [gen_id_patcher.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/gen_id_patcher.py) to patch the missing EOS token ID into each model's `generation_config.json`. See [this post](https://huggingface.co/Naphula/Q0_Bench/discussions/1?not-for-all-audiences=true#6987717c762f0a45f672e250) as well as the [EOS Scanner ReadMe](https://huggingface.co/spaces/Naphula/model_tools/blob/main/eos_scanner_readme.md) for more info.
37
+
38
+ # [weight_counter.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/weight_counter.py)
39
+ - This counts the number of models in a yaml and adds up the total weight values. Useful for large della/ties merges.
40
+
41
  # [fp32_to_bf16.py](https://huggingface.co/spaces/Naphula/model_tools/blob/main/fp32_to_bf16.py)
42
  - Converts FP32 to BF16 safetensors
43
 
weight_counter.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import sys
3
+
4
+ def main():
5
+ if len(sys.argv) < 2:
6
+ print("Usage: python weight_counter.py <config.yaml>")
7
+ sys.exit(1)
8
+
9
+ config_path = sys.argv[1]
10
+
11
+ try:
12
+ with open(config_path, 'r', encoding='utf-8') as f:
13
+ config = yaml.safe_load(f)
14
+ except Exception as e:
15
+ print(f"Error reading file: {e}")
16
+ sys.exit(1)
17
+
18
+ total_weight = 0.0
19
+ count = 0
20
+
21
+ print(f"Scanning: {config_path}...")
22
+
23
+ if 'models' in config:
24
+ for m in config['models']:
25
+ # Safely get parameters -> weight, default to 0 if missing
26
+ params = m.get('parameters', {})
27
+ weight = params.get('weight', 0)
28
+
29
+ # Only sum if it's a number (ignores list/gradient weights)
30
+ if isinstance(weight, (int, float)):
31
+ total_weight += weight
32
+ count += 1
33
+ else:
34
+ print(f" [!] Skipped non-scalar weight for: {m.get('model')}")
35
+
36
+ print("-" * 30)
37
+ print(f"Models Counted: {count}")
38
+ print(f"Total Weight Sum: {total_weight}")
39
+ print("-" * 30)
40
+
41
+ if __name__ == "__main__":
42
+ main()