The best model is not being saved

I am using a custom metric, and in my training arguments I have

greater_is_better=True,
load_best_model_at_end=True,

But as far as I can tell, the best model is not being saved. Here is a link to my Colab notebook:

Colab

And here are all the details just in case:

My platform and system data:

platform: Linux
release: 6.1.123+
version: #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
machine: x86_64
torch: 2.8.0+cu126
transformers: 4.55.4
python: 3.12.11 (main, Jun 4 2025, 08:56:18) [GCC 11.4.0]
GPU/TPU: Tesla T4
CUDA compiler:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0

Here is my code:

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import transformers
import sys
import torch
import pandas as pd
import numpy as np
import joblib
import os
import platform

model_name = 'microsoft/deberta-v3-xsmall'
model_name_path = 'deberta-v3-xsmall'
DIR = '../MAP_models/' + model_name_path + '/tuned/'
# os.makedirs creates intermediate directories, so one call builds the whole tree
os.makedirs(DIR + 'model', exist_ok=True)


NUM_LABELS = 65
text = [f"example {i}" for i in range(300)]
label = [i % NUM_LABELS for i in range(300)]
train = pd.DataFrame({'text': text, 'label': label})

train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
COLS = ['text','label']
train_ds = Dataset.from_pandas(train_df[COLS])
val_ds = Dataset.from_pandas(val_df[COLS])


tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 256
   
# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_LEN)
    
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
    
# Set format for PyTorch
columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
    trust_remote_code=True,
)

def compute_map3(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    
    top3 = np.argsort(-probs, axis=1)[:, :3]  # Top 3 predictions
    match = (top3 == labels[:, None])  # match[i, k] is True if the rank-k prediction equals label i

    # Compute MAP@3 manually
    map3 = 0
    for i in range(len(labels)):
        if match[i, 0]:
            map3 += 1.0
        elif match[i, 1]:
            map3 += 1.0 / 2
        elif match[i, 2]:
            map3 += 1.0 / 3
    return {"map@3": map3 / len(labels)}

args = TrainingArguments(
        per_device_train_batch_size = 2, 
        per_device_eval_batch_size= 2,
        gradient_accumulation_steps = 1,
        warmup_steps = 10,
        num_train_epochs = 1,
        learning_rate = 5e-5,
        fp16 = True,
        bf16 = False,
        logging_steps = 1,
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        eval_strategy="steps",
        lr_scheduler_type = "cosine_with_restarts",
        seed = 3407,
        output_dir = DIR+"output",
        logging_dir=DIR+"logs",
        greater_is_better=True,
        load_best_model_at_end=True,
        save_steps=10,
        eval_steps=10,
        save_total_limit=3,
        report_to = "none", 
    )

trainer = Trainer(
    model = model,
    processing_class = tokenizer,
    eval_dataset = val_ds,
    train_dataset = train_ds,
    args = args,
    compute_metrics = compute_map3,
)

trainer_stats = trainer.train()


It produces the following output:

Step   Training Loss   Validation Loss   Map@3
10     4.235900        4.182212          0.025000
20     4.245500        4.176703          0.038889
30     4.166400        4.171503          0.030556
40     4.163400        4.174795          0.025000
50     4.187000        4.174973          0.025000
60     4.240600        4.176061          0.038889
70     4.123800        4.177481          0.036111
80     4.130100        4.177088          0.033333
90     4.140700        4.177318          0.022222
100    4.180000        4.178491          0.022222
110    4.112100        4.178146          0.025000
120    4.229100        4.178137          0.025000

But when I run

trainer.evaluate(val_ds)

{'eval_loss': 4.1822123527526855,
 'eval_map@3': 0.025,
 'eval_runtime': 0.9703,
 'eval_samples_per_second': 61.836,
 'eval_steps_per_second': 30.918,
 'epoch': 1.0}

It seems like evaluation is being done on the checkpoint from the very first 10 steps (the eval_loss and map@3 match the step-10 row of the table exactly), rather than on the best model.

What am I doing wrong?


Probably because metric_for_best_model is missing? When it is not set, the Trainer falls back to comparing checkpoints on eval_loss, and since you passed greater_is_better=True it keeps the checkpoint with the highest loss, which here is the one from step 10.
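
For reference, here is a minimal sketch of the corrected arguments, assuming "map@3" (the key returned by compute_map3) is the metric you want to track; the Trainer prepends eval_ to the name automatically, and everything else is copied from the post above:

args = TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    warmup_steps=10,
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=True,
    bf16=False,
    logging_steps=1,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    eval_strategy="steps",
    lr_scheduler_type="cosine_with_restarts",
    seed=3407,
    output_dir=DIR + "output",
    logging_dir=DIR + "logs",
    metric_for_best_model="map@3",  # key returned by compute_map3
    greater_is_better=True,         # now applies to MAP@3, where higher really is better
    load_best_model_at_end=True,
    save_steps=10,
    eval_steps=10,
    save_total_limit=3,
    report_to="none",
)

After training, trainer.state records which checkpoint was kept and restored:

print(trainer.state.best_metric)            # best eval_map@3 seen during training
print(trainer.state.best_model_checkpoint)  # path to the winning checkpoint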

Thank you so much! What a blunder!

