For step-by-step eval values, I think the standard approach is to log them during training, as in the snippet below. While it’s possible to extract them afterward, the code becomes significantly more complicated…
from trl import DPOConfig, DPOTrainer

# policy, your_train, eval_ds, and tok are assumed to be defined elsewhere
args = DPOConfig(
    output_dir="dpo-eval-demo",
    do_train=True,                # training must run to log stepwise eval
    do_eval=True,
    eval_strategy="steps",        # spelled evaluation_strategy on older transformers releases
    eval_steps=100,               # run evaluation every 100 optimizer steps
    logging_strategy="steps",
    logging_steps=10,             # log training metrics every 10 steps
    report_to="tensorboard",      # or "wandb"
    logging_dir="tb_logs",
    generate_during_eval=False,
    reference_free=True,          # ignore the reference model (implicit uniform reference)
)
trainer = DPOTrainer(
    model=policy,
    args=args,
    train_dataset=your_train,
    eval_dataset=eval_ds,
    processing_class=tok,
)
trainer.train()
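If you also want the numbers back in Python rather than only in TensorBoard, the trainer keeps every logged entry in trainer.state.log_history (a plain list of dicts). Here's a minimal sketch of pulling out the step-wise eval losses after training finishes; the exact metric keys (e.g. eval_loss, the eval_rewards/* keys DPOTrainer adds) depend on your TRL version, so treat the key names as assumptions:

# after trainer.train() has finished
history = trainer.state.log_history  # list of dicts, one per logging/eval event

# keep only the eval entries and pair each step with its eval loss
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in history
    if "eval_loss" in entry
]
for step, loss in eval_points:
    print(f"step {step}: eval_loss = {loss:.4f}")

Alternatively, just point TensorBoard at the logging dir (tensorboard --logdir tb_logs) and read the per-step curves there.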