I am using a custom metric, and in my training arguments I have
greater_is_better=True,
load_best_model_at_end=True,
But as far as I can tell, the best model is not being saved. Here is a link to my Colab notebook:
And here are all the details just in case:
My platform and system data:
platform: Linux
release: 6.1.123+
version: #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
machine: x86_64
torch: 2.8.0+cu126
transformers: 4.55.4
python: 3.12.11 (main, Jun 4 2025, 08:56:18) [GCC 11.4.0]
GPU/TPU: Tesla T4
CUDA compiler:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Here is my code:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
import transformers
import sys
import torch
import os
import platform
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
model_name = 'microsoft/deberta-v3-xsmall'
model_name_path = 'deberta-v3-xsmall'
DIR = '../MAP_models/'+model_name_path+'/tuned/'
os.makedirs('../MAP_models', exist_ok = True)
os.makedirs('../MAP_models/'+model_name_path, exist_ok = True)
os.makedirs('../MAP_models/'+model_name_path+'/tuned', exist_ok=True)
os.makedirs('../MAP_models/'+model_name_path+'/tuned/model', exist_ok=True)
NUM_LABELS = 65
text = [f"example {i}" for i in range(300)]
label = [i % NUM_LABELS for i in range(300)]
train = pd.DataFrame({'text': text, 'label': label})
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)
# Convert to Hugging Face Dataset
COLS = ['text','label']
train_ds = Dataset.from_pandas(train_df[COLS])
val_ds = Dataset.from_pandas(val_df[COLS])
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 256
# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_LEN)
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
# Set format for PyTorch
columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
    trust_remote_code=True,
)
def compute_map3(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
    top3 = np.argsort(-probs, axis=1)[:, :3]  # top-3 predictions per sample
    match = (top3 == labels[:, None])
    # Compute MAP@3 manually: 1 if the label is ranked first, 1/2 if second, 1/3 if third
    map3 = 0.0
    for i in range(len(labels)):
        if match[i, 0]:
            map3 += 1.0
        elif match[i, 1]:
            map3 += 1.0 / 2
        elif match[i, 2]:
            map3 += 1.0 / 3
    return {"map@3": map3 / len(labels)}
args = TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    warmup_steps=10,
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=True,
    bf16=False,
    logging_steps=1,
    optim="adamw_torch_fused",
    weight_decay=0.01,
    eval_strategy="steps",
    lr_scheduler_type="cosine_with_restarts",
    seed=3407,
    output_dir=DIR + "output",
    logging_dir=DIR + "logs",
    greater_is_better=True,
    load_best_model_at_end=True,
    save_steps=10,
    eval_steps=10,
    save_total_limit=3,
    report_to="none",
)
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    eval_dataset=val_ds,
    train_dataset=train_ds,
    args=args,
    compute_metrics=compute_map3,
)
trainer_stats = trainer.train()
It produces the following output:
Step Training Loss Validation Loss Map@3
10 4.235900 4.182212 0.025000
20 4.245500 4.176703 0.038889
30 4.166400 4.171503 0.030556
40 4.163400 4.174795 0.025000
50 4.187000 4.174973 0.025000
60 4.240600 4.176061 0.038889
70 4.123800 4.177481 0.036111
80 4.130100 4.177088 0.033333
90 4.140700 4.177318 0.022222
100 4.180000 4.178491 0.022222
110 4.112100 4.178146 0.025000
120 4.229100 4.178137 0.025000
But when I run
trainer.evaluate(val_ds)
I get:
{'eval_loss': 4.1822123527526855,
 'eval_map@3': 0.025,
 'eval_runtime': 0.9703,
 'eval_samples_per_second': 61.836,
 'eval_steps_per_second': 30.918,
 'epoch': 1.0}
It looks as though the evaluation is run with the model from the very first 10 steps rather than with the best model.
What am I doing wrong?
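In case it helps with diagnosis, I can also print what the Trainer itself records as the best checkpoint after training (my understanding is that these fields are only populated when load_best_model_at_end=True):
print(trainer.state.best_metric)
print(trainer.state.best_model_checkpoint)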