Castillo_Henry_903002104 / records /071225_BosAlign /c1fd8a38-bb9f-45c4-8af0-d37f70c993f3.txt
henrycastillo's picture
add everything but lm eval harness
c3b20da verified
import os
import sys
with open(sys.argv[0]) as f:
code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import copy
import glob
from dataclasses import dataclass
from functools import lru_cache, partial # Added partial for hook registration
from pathlib import Path
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
#torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
# -----------------------------------------------------------------------------
# Custom operators: FP8 matmul by @YouJiacheng
@torch.library.custom_op("nanogpt::mm", mutates_args=())
def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]:
    """Forward FP8 matmul, registered as the custom op ``nanogpt::mm``.

    ``x`` is divided by ``x_s`` and ``w`` by ``w_s`` before both are cast to
    float8_e4m3fn; the same scalars are then handed to ``torch._scaled_mm`` as
    ``scale_a`` / ``scale_b``.

    Args:
        x: contiguous left operand.
        w: contiguous weight; the product formed is ``x_f8 @ w_f8.T``.
        x_s: scalar ``x`` is divided by before the FP8 cast.
        w_s: scalar ``w`` is divided by before the FP8 cast.
        grad_s: not used by the forward computation; it is part of the op
            signature so the autograd ``setup_context`` can read it from the inputs.

    Returns:
        ``(out, x_f8, w_f8)``: the bfloat16 product and the two FP8 operands
        (the latter are saved for the backward pass by ``setup_context``).
    """
    @torch.compile
    def impl(x: Tensor, w: Tensor):
        assert x.is_contiguous() and w.is_contiguous()
        # quantize both operands to FP8 (e4m3) after dividing by their scales
        x_f8 = x.div(x_s).to(torch.float8_e4m3fn)
        w_f8 = w.div(w_s).to(torch.float8_e4m3fn)
        out = torch._scaled_mm(
            x_f8,
            w_f8.T,  # transposed view of the contiguous weight
            out_dtype=torch.bfloat16,
            scale_a=x.new_tensor(x_s, dtype=torch.float32),
            scale_b=x.new_tensor(w_s, dtype=torch.float32),
            use_fast_accum=True,
        )
        return out, x_f8, w_f8

    return impl(x, w)
@mm_op.register_fake
def _(x: Tensor, w: Tensor, *_):
    """Fake (shape/dtype-only) implementation of ``nanogpt::mm``.

    Checks the same preconditions as the real op and returns tensors with the
    layout of ``(out, x_f8, w_f8)`` without running an FP8 matmul.
    """
    assert x.ndim == 2 and w.ndim == 2
    assert x.size(1) == w.size(1)
    assert w.device == x.device
    assert x.is_contiguous() and w.is_contiguous()
    fp8 = torch.float8_e4m3fn
    out = x @ w.T
    return out, x.to(fp8), w.to(fp8)
@torch.library.custom_op("nanogpt::mm_backward", mutates_args=())
def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]:
    """Backward FP8 matmul, registered as the custom op ``nanogpt::mm_backward``.

    Args:
        g: contiguous gradient w.r.t. the forward output.
        x_f8: FP8 input operand saved by the forward op.
        w_f8: FP8 weight operand saved by the forward op.
        x_s: scale that was used for ``x`` in the forward op.
        w_s: scale that was used for ``w`` in the forward op.
        grad_s: scalar the gradient is divided by before the float8_e5m2 cast.

    Returns:
        ``(grad_x, grad_w)``: bfloat16 gradient for the input and float32
        gradient for the weight.
    """
    @torch.compile
    def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor):
        assert grad.is_contiguous()
        # NOTE: despite the `_inv_s` names these tensors hold the scales themselves
        x_inv_s = grad.new_tensor(x_s, dtype=torch.float32)
        w_inv_s = grad.new_tensor(w_s, dtype=torch.float32)
        grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32)
        # the gradient is quantized to e5m2 (the operands were e4m3 in the forward)
        grad_f8 = grad.div(grad_s).to(torch.float8_e5m2)
        grad_x = torch._scaled_mm(
            grad_f8,
            w_f8.T.contiguous().T,  # same shape as w_f8, re-laid-out in memory
            out_dtype=torch.bfloat16,
            scale_a=grad_inv_s,
            scale_b=w_inv_s,
            use_fast_accum=False,
        )
        # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768)
        grad_w = torch._scaled_mm(
            x_f8.T.contiguous(),
            grad_f8.T.contiguous().T,
            out_dtype=torch.float32,
            scale_a=x_inv_s,
            scale_b=grad_inv_s,
            use_fast_accum=False,
        ).T  # computed as x^T @ grad, then transposed back to the weight's shape
        return grad_x, grad_w

    return impl(g, x_f8, w_f8)
@mm_backward_op.register_fake
def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_):
    """Fake implementation of ``nanogpt::mm_backward``: shapes and dtypes only."""
    grad_x_like = x_f8.to(torch.bfloat16)
    # same shape as the weight, with the transposed memory layout the real op produces
    grad_w_like = w_f8.T.contiguous().T.to(torch.float32)
    return grad_x_like, grad_w_like
def backward(ctx, grad_out: Tensor, *_):
    """Autograd backward for ``nanogpt::mm``.

    Only the gradient of the first forward output is used; gradients for the
    two FP8 by-products are ignored. Returns one entry per forward input:
    ``(grad_x, grad_w, None, None, None)`` -- the three scales get no gradient.
    """
    saved_x_f8, saved_w_f8 = ctx.saved_tensors
    scale_x, scale_w, scale_grad = ctx.scales
    grads = torch.ops.nanogpt.mm_backward(
        grad_out, saved_x_f8, saved_w_f8, scale_x, scale_w, scale_grad
    )
    grad_x, grad_w = grads
    return grad_x, grad_w, None, None, None
def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    """Stash what ``backward`` needs: the FP8 operands and the three scales.

    ``inputs`` are the forward op's arguments (the last three are the scales);
    ``output`` is its ``(out, x_f8, w_f8)`` result.
    """
    *_leading, scale_x, scale_w, scale_grad = inputs
    _product, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = (scale_x, scale_w, scale_grad)
    # do not materialize zero tensors for outputs that received no gradient
    ctx.set_materialize_grads(False)
# Attach the autograd formula to nanogpt::mm: `setup_context` stores the FP8 operands and
# scales on ctx, and `backward` forwards them to nanogpt::mm_backward.
mm_op.register_autograd(backward, setup_context=setup_context)
# -----------------------------------------------------------------------------
# Muon optimizer
@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Args:
        G: tensor with at least 2 dimensions; the last two are treated as the matrix and any
            leading dimensions as a batch. No dtype cast happens here, so the iteration runs in
            whatever dtype the caller passes.
        steps: number of Newton-Schulz iterations to run.

    Returns:
        A tensor with the same shape as ``G``.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G
    # work on the wide orientation so that X @ X.mT is the smaller of the two Gram matrices
    if G.size(-2) > G.size(-1):
        X = X.mT
    # Ensure spectral norm is at most 1
    # (dividing by the Frobenius norm, which upper-bounds the spectral norm; 1e-7 avoids division by zero)
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # undo the transpose applied above
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/
    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.
    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).

    Distributed layout: parameters are grouped by shape, and within a group they are processed in
    chunks of ``world_size``. In each chunk, rank ``r`` owns parameter ``base_i + r``: it receives
    that parameter's averaged gradient via reduce-scatter, updates it, and the updated parameters
    are then all-gathered to every rank.
    """
    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum)
        params = list(params)
        sizes = {p.shape for p in params}
        # one param group per unique parameter shape, so every tensor inside a group can be
        # exchanged with the same-shaped padding tensors in the collectives of step()
        param_groups = []
        for size in sizes:
            group_params = [p for p in params if p.shape == size]
            param_groups.append(dict(params=group_params))
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        """Run one optimizer step; requires an initialized process group and populated ``.grad``s."""
        # Efficient systems-wise implementation of step developed by @YouJiacheng,
        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
        # @ryanyang0, and @vagrawal.
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        reduce_scatter_futures: list[torch.Future] = []
        all_reduce_futures: list[torch.Future] = []

        # Phase 1: launch every gradient reduce-scatter asynchronously.
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            # Dedicated output buffer for chunks in which this rank owns no parameter.
            # The reduce-scatter output must NOT alias a gradient of an earlier chunk: the padded
            # (all-zero) reduction of the final partial chunk would otherwise be written over a
            # gradient that this rank has not consumed yet.
            scratch = torch.empty_like(params[-1])
            grad_pad = [param.grad for param in params] + [torch.zeros_like(params[-1])] * world_size
            for base_i in range(0, len(params), world_size):
                owns_param = base_i + rank < len(params)
                out = params[base_i + rank].grad if owns_param else scratch
                # This gives strange dynamo warnings
                reduce_scatter_futures.append(dist.reduce_scatter(out, grad_pad[base_i:base_i + world_size], op=dist.ReduceOp.AVG, async_op=True).get_future())

        # Phase 2: as each reduce-scatter completes, update the owned parameter and broadcast it.
        idx = 0
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            params_pad = params + [torch.empty_like(params[-1])] * world_size
            momentum = group["momentum"]
            for base_i in range(0, len(params), world_size):
                reduce_scatter_futures[idx].wait()
                if base_i + rank < len(params):
                    p = params[base_i + rank]
                    grad = p.grad
                    # tall matrices get a larger step: sqrt(rows / cols) when rows > cols
                    eff_lr = group["lr"] * max(1, p.size(-2) / p.size(-1)) ** 0.5 * getattr(p, "lr_mul", 1.0)
                    eff_weight_decay = group["lr"] * group["weight_decay"] * getattr(p, "wd_mul", 1.0)
                    state = self.state[p]
                    if len(state) == 0:
                        state["momentum_buffer"] = torch.zeros_like(grad)
                    momentum_buffer = state["momentum_buffer"]
                    p.mul_(1 - eff_weight_decay)
                    momentum_buffer.lerp_(grad, 1 - momentum)
                    # in-place blend of the gradient with the updated momentum buffer
                    grad = grad.lerp_(momentum_buffer, momentum)
                    v = zeropower_via_newtonschulz5(grad.bfloat16(), 5)
                    p.add_(other=v, alpha=-eff_lr)
                idx += 1
                # every rank contributes the parameter it owns (or padding) and receives the rest
                all_reduce_futures.append(dist.all_gather(params_pad[base_i:base_i + world_size], params_pad[base_i + rank], async_op=True).get_future())
        torch.futures.collect_all(all_reduce_futures).wait()
class DistAdam(torch.optim.Optimizer):
    """Adam whose state and update are sharded across ranks along dim 0 of each parameter.

    For every parameter, each rank receives the averaged gradient for its own row slice via
    reduce-scatter, updates only that slice (keeping ``exp_avg`` / ``exp_avg_sq`` for the slice
    only), and the updated slices are all-gathered back into the full parameter.
    Assumes ``p.shape[0]`` is divisible by the world size -- TODO confirm for any new parameter.
    Per-parameter ``lr_mul`` / ``wd_mul`` attributes, when present, scale the group's lr / decay.
    """
    def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        params = list(params)
        sizes = {p.shape for p in params}
        # create one param group per unique parameter shape
        param_groups = []
        for size in sizes:
            group_params = [p for p in params if p.shape == size]
            param_groups.append(dict(params=group_params))
        super().__init__(param_groups, defaults)

    # DistributedAdam implementation by @vagrawal
    @torch.compile
    @torch.no_grad()
    def step(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        reduce_scatter_futures: list[torch.Future] = []
        all_reduce_futures: list[torch.Future] = []
        grad_slices = []
        # Phase 1: launch one async reduce-scatter per parameter; grad_slices[i] receives this
        # rank's averaged gradient rows for the i-th parameter (in iteration order).
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            # NOTE(review): this allocation is overwritten on the first loop iteration and never read
            grad = torch.empty_like(params[-1])
            for base_i in range(len(params)):
                grad = params[base_i].grad
                rank_size = grad.shape[0] // world_size
                grad_slice = torch.empty_like(grad[:rank_size])
                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
                grad_slices.append(grad_slice)
        # Phase 2: walk the parameters in the same order, updating this rank's slice of each.
        idx = 0
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            params = group['params']
            for base in range(len(params)):
                reduce_scatter_futures[idx].wait()
                p = params[base]
                rank_size = p.shape[0] // world_size
                # view into p: in-place ops on p_slice modify the parameter itself
                p_slice = p[rank * rank_size:(rank + 1) * rank_size]
                lr = group['lr'] * getattr(p, "lr_mul", 1.0)
                state = self.state[p]
                g_slice = grad_slices[idx]
                # State init
                if not state:
                    state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device)
                    state['exp_avg'] = torch.zeros_like(p_slice)
                    state['exp_avg_sq'] = torch.zeros_like(p_slice)
                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1
                t = state['step']
                # weight decay
                if wd != 0:
                    eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0)
                    p_slice.mul_(1 - eff_weight_decay)
                # update running averages
                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
                # bias corrections
                bias1 = 1 - beta1 ** t
                bias2 = 1 - beta2 ** t
                # compute step
                denom = exp_avg_sq.sqrt().add_(eps)
                step_size = lr * (torch.sqrt(bias2) / bias1)
                update = exp_avg.div(denom).mul_(step_size)
                p_slice.add_(other=update, alpha=-1.0)
                idx += 1
                # gather every rank's updated slice back into the full parameter
                all_reduce_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future())
        torch.futures.collect_all(all_reduce_futures).wait()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the model
def norm(x: Tensor):
return F.rms_norm(x, (x.size(-1),))
class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input dtype on the fly.

    With ``use_fp8=True`` the training-mode forward goes through the ``nanogpt::mm``
    custom op using the scales ``x_s`` / ``w_s`` / ``grad_s``; otherwise (and always in
    eval mode) it is a plain ``F.linear`` with the weight cast to ``x``'s dtype.
    """

    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8
        self.x_s, self.w_s, self.grad_s = x_s, w_s, grad_s

    def reset_parameters(self) -> None:
        """Uniform init whose standard deviation is ``0.5 / sqrt(in_features)``."""
        std = 0.5 * (self.in_features ** -0.5) # 0.5 is a bit better than the default 1/sqrt(3)
        limit = (3 ** 0.5) * std
        with torch.no_grad():
            self.weight.uniform_(-limit, limit)

    def forward(self, x: Tensor):
        take_fp8_path = self.use_fp8 and self.training
        if not take_fp8_path:
            return F.linear(x, self.weight.type_as(x))
        # the custom op works on 2-D operands: collapse all leading dims, restore them afterwards
        flat = x.flatten(0, -2)
        out: Tensor = torch.ops.nanogpt.mm(flat, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
        return out.reshape(*x.shape[:-1], -1)
class Rotary(nn.Module):
def __init__(self, dim: int, max_seq_len: int):
super().__init__()
# half-truncate RoPE by @YouJiacheng (w/ base freq tuning)
angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32)
angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)])
t = torch.arange(max_seq_len, dtype=torch.float32)
theta = torch.einsum("i,j -> ij", t, angular_freq)
self.cos = nn.Buffer(theta.cos(), persistent=False)
self.sin = nn.Buffer(theta.sin(), persistent=False)
def forward(self, x_BTHD: Tensor):
assert self.cos.size(0) >= x_BTHD.size(-3)
cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :]
x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1)
y1 = x1 * cos + x2 * sin
y2 = x1 * (-sin) + x2 * cos
return torch.cat((y1, y2), 3).type_as(x_BTHD)
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention using FlexAttention, with QK-norm, rotary embeddings and an
    optional token value embedding mixed into V."""

    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        fused_qkv = torch.empty(3, hdim, dim).uniform_(-bound, bound)
        self.qkv_w = nn.Parameter(fused_qkv)
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        self.attn_scale = 0.12

    def forward(self, x: Tensor, ve: Tensor | None, lambdas: Tensor, block_mask: BlockMask):
        """Attend over ``x`` (batch size must be 1).

        ``lambdas[0]`` scales V; when ``ve`` is given, ``lambdas[1] * ve`` is added to it.
        """
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        fused_w = self.qkv_w.flatten(end_dim=1).type_as(x)
        qkv = F.linear(x, fused_w).view(B, T, 3 * self.num_heads, self.head_dim)
        q, k, v = qkv.chunk(3, dim=-2)
        # QK norm @Grad62304977, followed by rotary position embedding
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        v = lambdas[0] * v
        if ve is not None:
            # token value embeddings by @KoszarskyB & @Grad62304977; mid layers pass ve=None (@YouJiacheng)
            v = v + lambdas[1] * ve.view_as(v)
        attn_out = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
            scale=self.attn_scale,
        )
        # re-assemble all head outputs side by side
        merged = attn_out.transpose(1, 2).contiguous().view(B, T, self.num_heads * self.head_dim)
        return self.c_proj(merged)
class MLP(nn.Module):
    """Feed-forward block: expand 4x, squared-ReLU activation, project back (zero-initialized)."""

    def __init__(self, dim: int):
        super().__init__()
        hidden = 4 * dim
        self.c_fc = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977

    def forward(self, x: Tensor):
        hidden = self.c_fc(x)
        # https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        activated = F.relu(hidden).square()
        return self.c_proj(activated)
class Block(nn.Module):
    """One transformer layer: residual mix with the input embedding, attention (optional), MLP."""

    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        if layer_idx == 7:
            self.attn = None
        else:
            self.attn = CausalSelfAttention(dim, num_heads, max_seq_len)
        self.mlp = MLP(dim)

    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, lambdas: Tensor, sa_lambdas: Tensor, block_mask: BlockMask):
        """``lambdas`` blends the running stream ``x`` with the initial embedding ``x0``;
        ``sa_lambdas`` is forwarded to the attention layer."""
        x = lambdas[0] * x + lambdas[1] * x0
        if self.attn is not None:
            attn_out = self.attn(norm(x), ve, sa_lambdas, block_mask)
            x = x + attn_out
        mlp_out = self.mlp(norm(x))
        return x + mlp_out
# -----------------------------------------------------------------------------
# The main model
def next_multiple_of_n(v: float | int, *, n: int):
return next(x for x in range(n, int(v) + 1 + n, n) if x >= v)
class GPT(nn.Module):
    """The language model: token embedding, three value embeddings, a stack of ``Block``s with
    U-net style skip connections, and an FP8 ``lm_head``. ``forward`` returns the loss directly."""

    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        vocab_size = next_multiple_of_n(vocab_size, n=128)
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=True, x_s=(model_dim**0.5)/448, w_s=24/448, grad_s=1/448)
        self.lm_head.weight.detach().zero_() # @Grad62304977
        # Add learnable skip connection weights for decoder layers
        assert num_layers % 2 == 0
        # pad so that the total length (5 * num_layers + pad) is a multiple of the world size
        pad = (-num_layers * 5) % dist.get_world_size()
        # flat layout: [num_layers skip weights | num_layers x 2 block lambdas | num_layers x 2 SA lambdas | pad]
        self.scalars = nn.Parameter(torch.cat([
            torch.ones(num_layers), # skip_weights
            *[torch.tensor([1.0, 0.0]) for _ in range(num_layers)], # block lambdas
            *[torch.tensor([0.5, 0.5]) for _ in range(num_layers)], # SA lambdas
            torch.ones(pad),
        ]))
        # set learning rates
        # (per-parameter multipliers that the optimizers read via getattr(p, "lr_mul", 1.0))
        for param in self.embed.parameters():
            param.lr_mul = 75.
        for param in self.value_embeds.parameters():
            param.lr_mul = 75.
        self.lm_head.weight.lr_mul = 27.5
        self.scalars.lr_mul = 5.0

    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Build the (long, short) FlexAttention block masks for ``input_seq``.

        Attention is causal and restricted to tokens of the same document, where a new
        document starts at every token equal to 50256. The long mask keeps at most
        ``sliding_window_num_blocks`` key/value blocks per query block; the short mask uses
        half of that.
        """
        BLOCK_SIZE = 128
        # running count of token 50256 = document index of every position
        docs = (input_seq == 50256).cumsum(0)

        def document_causal(b, h, q_idx, kv_idx):
            # token-level rule applied inside partially-masked blocks
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask

        def dense_to_ordered(dense_blockmask: Tensor):
            # per query block: number of selected kv blocks, and kv block indices ordered so that
            # selected blocks come first (highest index first, because of the flip)
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()

        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        # "any": some (q, kv) pair in the block pair may attend; "all": every pair attends
        causal_blockmask_any = block_idx[:, None] >= block_idx
        causal_blockmask_all = block_idx[:, None] > block_idx
        # first / last document index inside each block
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous()
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous()
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        # partial blocks need the token-level mask_mod; full blocks do not
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)

        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            # clamp the block counts so that at most window_size_blocks kv blocks are visited
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )

        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapted from suggestion by @Grad62304977, following Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)

    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Return the cross-entropy loss of predicting ``target_seq`` from ``input_seq``.

        ``input_seq`` must be 1-D. The loss is summed over tokens in training mode and
        averaged in eval mode.
        """
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        # (first three and last three blocks receive a value embedding, the middle blocks receive None)
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        # hard-coded per-layer pattern with 12 entries; the assert below ties it to the layer count
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan
        skip_connections = []
        # only the first len(blocks) // 2 skip weights are used
        skip_weights = self.scalars[:(len(self.blocks) // 2)]
        lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2)
        sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2)
        n = len(self.blocks) // 2
        for i in range(len(self.blocks)):
            # second half: add the saved first-half outputs back in, last saved first
            if i >= n:
                x = x + skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, lambdas[i], sa_lambdas[i], block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x).float()
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits / (7.5 * x.size(-1)**0.5))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq, reduction="sum" if self.training else "mean")
        return loss
# -----------------------------------------------------------------------------
# Distributed data loader
def _load_data_shard(file: Path):
    """Read one token shard into a pinned uint16 tensor.

    The file starts with a header of 256 int32 values: magic number 20240520, version 1 and
    the token count; the uint16 tokens follow. Raises AssertionError when the header is not
    as expected or fewer bytes than announced could be read.
    """
    header_ints = 256
    header = torch.from_file(str(file), False, header_ints, dtype=torch.int32) # header is 256 int32
    magic, version, claimed_tokens = header[0], header[1], header[2]
    assert magic == 20240520, "magic number mismatch in the data .bin file"
    assert version == 1, "unsupported version"
    num_tokens = int(claimed_tokens) # number of tokens (claimed)
    # pinned destination buffer: avoid pin_memory copy by @YouJiacheng
    tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
    with file.open("rb", buffering=0) as f:
        f.seek(header_ints * 4)
        # read straight into the tensor's memory: avoid bytes->array copy by @YouJiacheng
        nbytes = f.readinto(tokens.numpy())
    assert nbytes == 2 * num_tokens, "number of tokens read does not match header"
    return tokens
# find world_size starting indicies, such that each begins with token 50256 and local_batches don't overlap
def find_batch_starts(tokens: Tensor, pos: int, local_batch_size: int, max_batch_span: int):
boundary_mask = tokens[pos : pos + max_batch_span] == 50256
boundary_positions = torch.nonzero(boundary_mask, as_tuple=False).squeeze(-1) + pos
start = boundary_positions[0].item()
starts = []
for i in range(len(boundary_positions) - 1):
end = boundary_positions[i + 1].item()
if end - start >= local_batch_size:
starts.append(start) # append start once end pos is confirmed
if len(starts) == dist.get_world_size():
return starts, end - pos
start = end
assert False # increase max_batch_span if necessary
def distributed_data_generator(filename_pattern: str, batch_size: int, align_to_bos: bool):
    """Yield ``(inputs, targets)`` CUDA tensors for this rank, one global batch per iteration.

    Shards matching ``filename_pattern`` are consumed in sorted order, once each. Every rank
    takes ``batch_size // world_size`` tokens per batch; ``targets`` is ``inputs`` shifted by
    one token. With ``align_to_bos`` each rank's slice starts at a document boundary found by
    ``find_batch_starts``; otherwise the ranks read consecutive slices starting at the current
    position.

    Raises:
        RuntimeError: when a new shard is needed but none are left (or none matched the
            pattern). A bare ``next()`` would leak StopIteration out of the generator body,
            which Python turns into an uninformative RuntimeError; it is raised explicitly
            here with a message instead.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use itertools.cycle(files) instead if you want to do multi-epoch training

    def _next_shard() -> Tensor:
        # Load the next shard, or fail with an explicit message when the list is exhausted.
        try:
            shard_path = next(file_iter)
        except StopIteration:
            raise RuntimeError(
                f"no more data shards available for pattern {filename_pattern!r} ({len(files)} file(s) matched)"
            ) from None
        return _load_data_shard(shard_path)

    tokens, pos = _next_shard(), 0
    max_batch_span = 2 * batch_size if align_to_bos else batch_size # provide buffer to handle samples up to length local_batch_size
    while True:
        # move on to the next shard when the current one cannot hold another full search window
        if pos + max_batch_span + 1 >= len(tokens):
            tokens, pos = _next_shard(), 0
        if align_to_bos:
            batch_starts, batch_span = find_batch_starts(tokens, pos, local_batch_size, max_batch_span)
            start_idx = batch_starts[rank]
        else:
            batch_span = batch_size
            start_idx = pos + rank * local_batch_size
        # one extra token so that targets can be the inputs shifted by one
        buf = tokens[start_idx:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_span
        yield inputs, targets
# -----------------------------------------------------------------------------
# int main
@dataclass
class Hyperparameters:
    """Run configuration.

    Every attribute is annotated so that it is a real dataclass field: it appears in the
    generated ``__init__`` / ``__repr__`` and can be overridden per instance, e.g.
    ``Hyperparameters(num_iterations=10)``. The defaults are unchanged.
    """
    # data
    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    train_seq_len: int = 48*1024 # FlexAttention sequence length
    val_seq_len: int = 4*64*1024 # FlexAttention sequence length for validation
    # optimization
    num_iterations: int = 1750 # number of iterations to run
    cooldown_frac: float = 0.45 # fraction of training spent cooling down the learning rate
    # evaluation and logging
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    save_checkpoint: bool = False
args = Hyperparameters()
# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert world_size == 8 # this code is designed for 8xH100
assert torch.cuda.is_available()
# bind this process to the GPU selected by LOCAL_RANK before creating the NCCL process group
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# (logfile stays None on every rank except rank 0; run_id is only defined on rank 0)
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)
def print0(s, console=False):
    """Append ``s`` to the run's log file on the master process; other ranks do nothing.

    With ``console=True`` the message is also printed to stdout.
    """
    if not master_process:
        return
    with open(logfile, "a") as f:
        if console:
            print(s)
        print(s, file=f)
# begin by printing this file (the Python code)
# (`code` was read from sys.argv[0] at the very top of the script)
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")
def nvidia_smi():
    """Run ``nvidia-smi`` and return what it wrote to stdout as text."""
    import subprocess # avoid top level import
    completed = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    return completed.stdout
print0(nvidia_smi())
print0("="*100)
model: nn.Module = GPT(vocab_size=50257, num_layers=12, num_heads=6, model_dim=768, max_seq_len=max(args.train_seq_len, args.val_seq_len)).cuda()
# embedding tables are stored in bfloat16
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
# make every rank start from rank 0's initial weights
for param in model.parameters():
    dist.broadcast(param.detach(), 0)
# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
# init the optimizer(s)
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
# DistAdam handles scalars, the lm_head and the embeddings; Muon handles the matrices inside the blocks
optimizer1 = DistAdam(scalar_params + head_params + embed_params, lr=0.008, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, weight_decay=0.0)
optimizers = [optimizer1, optimizer2]
# remember each group's base lr under "initial_lr"
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]
# learning rate schedule: stable then decay
def get_lr(step: int):
    """Learning-rate multiplier for ``step``: 1.0 until the final ``cooldown_frac`` of training,
    then a linear decay towards 0.1."""
    progress = step / args.num_iterations # progress in training
    assert 0 <= progress < 1
    if progress >= 1 - args.cooldown_frac:
        w = (1 - progress) / args.cooldown_frac
        return w * 1.0 + (1 - w) * 0.1
    return 1.0
# attention window size schedule: linearly increase
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    """Return ``window_size // 128`` as an int32 CUDA scalar tensor.

    Only the most recent result is cached, so repeated calls with an unchanged window size
    reuse the same tensor.
    """
    num_blocks = window_size // 128
    host_scalar = torch.tensor(num_blocks, dtype=torch.int32, pin_memory=True)
    return host_scalar.cuda(non_blocking=True)
def get_window_size_blocks(step: int):
    """Return the attention window size (in blocks) to use at training step ``step``.

    The result comes from ``get_window_size_blocks_helper``, called with a
    window size derived from the fraction of training completed.

    Raises:
        AssertionError: if ``step / args.num_iterations`` is outside ``[0, 1]``.
    """
    progress = step / args.num_iterations  # fraction of training completed
    assert 0 <= progress <= 1
    # Linearly increase the block-wise sliding window size over training 128 -> 1792
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    return get_window_size_blocks_helper(next_multiple_of_n(1728 * progress, n=128))
# compile the model with dynamic-shape compilation disabled
model: nn.Module = torch.compile(model, dynamic=False)

########################################
#            Warmup kernels            #
########################################

# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 10
# save the initial state (deep copies, so the warmup steps below cannot modify it)
initial_state = {
    "model": copy.deepcopy(model.state_dict()),
    "optimizers": [copy.deepcopy(optimizer.state_dict()) for optimizer in optimizers],
}
train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, align_to_bos=True)
for _ in range(warmup_steps):
    inputs, targets = next(train_loader)
    # forward + backward with the window size of step 1
    model(inputs, targets, get_window_size_blocks(1)).backward()
    for optimizer in optimizers:
        optimizer.step()
    model.zero_grad(set_to_none=True)
# roll the model and every optimizer back to the saved initial state
model.load_state_dict(initial_state["model"])
for optimizer, saved_state in zip(optimizers, initial_state["optimizers"]):
    optimizer.load_state_dict(saved_state)
del train_loader, initial_state
########################################
#        Training and validation       #
########################################

# create the training loader again (the one used for warmup was deleted)
train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, align_to_bos=True)
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
# one extra iteration: the final pass only validates (and optionally checkpoints)
for step in range(train_steps + 1):
    last_step = step == train_steps
    run_validation = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)

    # --------------- VALIDATION SECTION -----------------
    if run_validation:
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        model.eval()
        val_batch_size = world_size * args.val_seq_len
        assert args.val_tokens % val_batch_size == 0
        val_steps = args.val_tokens // val_batch_size
        val_loader = distributed_data_generator(args.val_files, val_batch_size, align_to_bos=False)
        val_window_blocks = get_window_size_blocks(step)
        val_loss = 0
        with torch.no_grad():
            for _ in range(val_steps):
                inputs, targets = next(val_loader)
                val_loss += model(inputs, targets, val_window_blocks)
        val_loss /= val_steps
        del val_loader
        # average the per-rank validation loss across all ranks
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True)
        model.train()
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()

    if last_step:
        if master_process and args.save_checkpoint:
            log = {
                "step": step,
                "code": code,
                "model": model.state_dict(),
                "optimizers": [optimizer.state_dict() for optimizer in optimizers],
            }
            os.makedirs(f"logs/{run_id}", exist_ok=True)
            torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt")
        # the last step only has the validation loop, so break to avoid training
        break

    # --------------- TRAINING SECTION -----------------
    inputs, targets = next(train_loader)
    model(inputs, targets, get_window_size_blocks(step)).backward()
    # set optimization hyperparameters
    lr_scale = get_lr(step)
    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            param_group["lr"] = param_group["initial_lr"] * lr_scale
    frac = min(step / 300, 1)  # momentum warmup for muon
    muon_momentum = (1 - frac) * 0.85 + frac * 0.95
    for param_group in optimizer2.param_groups:
        param_group["momentum"] = muon_momentum
    # step the optimizers
    for optimizer in optimizers:
        optimizer.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # logging (the clock keeps running, so this time is an approximation)
    approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True)
# report the peak CUDA memory statistics (in MiB), then tear down the process group
memory_report = (
    f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
    f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB"
)
print0(memory_report, console=True)
dist.destroy_process_group()
====================================================================================================
Running Python 3.12.7 (main, Jul 14 2025, 05:23:05) [GCC 13.2.0]
Running PyTorch 2.9.0.dev20250713+cu126 compiled for CUDA 12.6
Mon Jul 14 07:04:41 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.148.08 Driver Version: 570.148.08 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H100 80GB HBM3 On | 00000000:61:00.0 Off | 0 |
| N/A 30C P0 137W / 700W | 5858MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA H100 80GB HBM3 On | 00000000:62:00.0 Off | 0 |
| N/A 32C P0 124W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA H100 80GB HBM3 On | 00000000:63:00.0 Off | 0 |
| N/A 31C P0 119W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA H100 80GB HBM3 On | 00000000:64:00.0 Off | 0 |
| N/A 29C P0 125W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 4 NVIDIA H100 80GB HBM3 On | 00000000:6A:00.0 Off | 0 |
| N/A 28C P0 116W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 5 NVIDIA H100 80GB HBM3 On | 00000000:6B:00.0 Off | 0 |
| N/A 30C P0 121W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 6 NVIDIA H100 80GB HBM3 On | 00000000:6C:00.0 Off | 0 |
| N/A 29C P0 124W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 7 NVIDIA H100 80GB HBM3 On | 00000000:6D:00.0 Off | 0 |
| N/A 26C P0 117W / 700W | 1519MiB / 81559MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 51503 C /usr/local/bin/python 1510MiB |
| 0 N/A N/A 51504 C /usr/local/bin/python 614MiB |
| 0 N/A N/A 51505 C /usr/local/bin/python 614MiB |
| 0 N/A N/A 51506 C /usr/local/bin/python 614MiB |
| 0 N/A N/A 51507 C /usr/local/bin/python 614MiB |
| 0 N/A N/A 51508 C /usr/local/bin/python 614MiB |
| 0 N/A N/A 51509 C /usr/local/bin/python 614MiB |
| 0 N/A N/A 51510 C /usr/local/bin/python 614MiB |
| 1 N/A N/A 51504 C /usr/local/bin/python 1510MiB |
| 2 N/A N/A 51505 C /usr/local/bin/python 1510MiB |
| 3 N/A N/A 51506 C /usr/local/bin/python 1510MiB |
| 4 N/A N/A 51507 C /usr/local/bin/python 1510MiB |
| 5 N/A N/A 51508 C /usr/local/bin/python 1510MiB |
| 6 N/A N/A 51509 C /usr/local/bin/python 1510MiB |
| 7 N/A N/A 51510 C /usr/local/bin/python 1510MiB |
+-----------------------------------------------------------------------------------------+
====================================================================================================
step:0/1750 val_loss:10.8258 train_time:0ms step_avg:0.03ms
step:1/1750 train_time:146ms step_avg:146.43ms
step:2/1750 train_time:182ms step_avg:90.99ms
step:3/1750 train_time:249ms step_avg:82.95ms
step:4/1750 train_time:340ms step_avg:85.09ms
step:5/1750 train_time:433ms step_avg:86.53ms
step:6/1750 train_time:526ms step_avg:87.68ms
step:7/1750 train_time:618ms step_avg:88.28ms
step:8/1750 train_time:713ms step_avg:89.12ms
step:9/1750 train_time:804ms step_avg:89.39ms
step:10/1750 train_time:896ms step_avg:89.62ms
step:11/1750 train_time:988ms step_avg:89.84ms
step:12/1750 train_time:1081ms step_avg:90.10ms
step:13/1750 train_time:1175ms step_avg:90.39ms
step:14/1750 train_time:1268ms step_avg:90.56ms
step:15/1750 train_time:1362ms step_avg:90.83ms
step:16/1750 train_time:1455ms step_avg:90.93ms
step:17/1750 train_time:1548ms step_avg:91.06ms
step:18/1750 train_time:1641ms step_avg:91.15ms
step:19/1750 train_time:1734ms step_avg:91.25ms
step:20/1750 train_time:1827ms step_avg:91.35ms
step:21/1750 train_time:1920ms step_avg:91.43ms
step:22/1750 train_time:2013ms step_avg:91.49ms
step:23/1750 train_time:2107ms step_avg:91.60ms
step:24/1750 train_time:2200ms step_avg:91.65ms
step:25/1750 train_time:2293ms step_avg:91.70ms
step:26/1750 train_time:2386ms step_avg:91.77ms
step:27/1750 train_time:2479ms step_avg:91.81ms
step:28/1750 train_time:2572ms step_avg:91.84ms
step:29/1750 train_time:2665ms step_avg:91.88ms
step:30/1750 train_time:2757ms step_avg:91.90ms
step:31/1750 train_time:2850ms step_avg:91.95ms
step:32/1750 train_time:2944ms step_avg:91.99ms
step:33/1750 train_time:3036ms step_avg:92.00ms
step:34/1750 train_time:3129ms step_avg:92.04ms
step:35/1750 train_time:3223ms step_avg:92.08ms
step:36/1750 train_time:3316ms step_avg:92.10ms
step:37/1750 train_time:3409ms step_avg:92.14ms
step:38/1750 train_time:3502ms step_avg:92.17ms
step:39/1750 train_time:3595ms step_avg:92.18ms
step:40/1750 train_time:3689ms step_avg:92.24ms
step:41/1750 train_time:3782ms step_avg:92.25ms
step:42/1750 train_time:3875ms step_avg:92.26ms
step:43/1750 train_time:3967ms step_avg:92.27ms
step:44/1750 train_time:4061ms step_avg:92.29ms
step:45/1750 train_time:4154ms step_avg:92.30ms
step:46/1750 train_time:4246ms step_avg:92.31ms
step:47/1750 train_time:4339ms step_avg:92.32ms
step:48/1750 train_time:4432ms step_avg:92.33ms
step:49/1750 train_time:4525ms step_avg:92.35ms
step:50/1750 train_time:4619ms step_avg:92.39ms
step:51/1750 train_time:4712ms step_avg:92.40ms
step:52/1750 train_time:4805ms step_avg:92.40ms
step:53/1750 train_time:4899ms step_avg:92.43ms
step:54/1750 train_time:4992ms step_avg:92.44ms
step:55/1750 train_time:5085ms step_avg:92.45ms
step:56/1750 train_time:5177ms step_avg:92.45ms
step:57/1750 train_time:5270ms step_avg:92.45ms
step:58/1750 train_time:5364ms step_avg:92.48ms
step:59/1750 train_time:5456ms step_avg:92.48ms
step:60/1750 train_time:5549ms step_avg:92.49ms
step:61/1750 train_time:5643ms step_avg:92.51ms
step:62/1750 train_time:5736ms step_avg:92.52ms
step:63/1750 train_time:5829ms step_avg:92.53ms
step:64/1750 train_time:5922ms step_avg:92.54ms
step:65/1750 train_time:6015ms step_avg:92.53ms
step:66/1750 train_time:6107ms step_avg:92.53ms
step:67/1750 train_time:6200ms step_avg:92.54ms
step:68/1750 train_time:6293ms step_avg:92.55ms
step:69/1750 train_time:6387ms step_avg:92.57ms
step:70/1750 train_time:6480ms step_avg:92.57ms
step:71/1750 train_time:6573ms step_avg:92.58ms
step:72/1750 train_time:6668ms step_avg:92.61ms
step:73/1750 train_time:6761ms step_avg:92.61ms
step:74/1750 train_time:6854ms step_avg:92.62ms
step:75/1750 train_time:6947ms step_avg:92.63ms
step:76/1750 train_time:7041ms step_avg:92.64ms
step:77/1750 train_time:7134ms step_avg:92.65ms
step:78/1750 train_time:7227ms step_avg:92.66ms
step:79/1750 train_time:7321ms step_avg:92.67ms
step:80/1750 train_time:7413ms step_avg:92.66ms
step:81/1750 train_time:7506ms step_avg:92.67ms
step:82/1750 train_time:7599ms step_avg:92.67ms
step:83/1750 train_time:7693ms step_avg:92.69ms
step:84/1750 train_time:7787ms step_avg:92.71ms
step:85/1750 train_time:7880ms step_avg:92.70ms
step:86/1750 train_time:7973ms step_avg:92.70ms
step:87/1750 train_time:8066ms step_avg:92.71ms
step:88/1750 train_time:8159ms step_avg:92.72ms
step:89/1750 train_time:8252ms step_avg:92.72ms
step:90/1750 train_time:8345ms step_avg:92.73ms
step:91/1750 train_time:8438ms step_avg:92.73ms
step:92/1750 train_time:8532ms step_avg:92.74ms
step:93/1750 train_time:8626ms step_avg:92.75ms
step:94/1750 train_time:8718ms step_avg:92.75ms
step:95/1750 train_time:8812ms step_avg:92.75ms
step:96/1750 train_time:8904ms step_avg:92.75ms
step:97/1750 train_time:8997ms step_avg:92.75ms
step:98/1750 train_time:9090ms step_avg:92.76ms
step:99/1750 train_time:9182ms step_avg:92.75ms
step:100/1750 train_time:9275ms step_avg:92.75ms
step:101/1750 train_time:9368ms step_avg:92.76ms
step:102/1750 train_time:9462ms step_avg:92.76ms
step:103/1750 train_time:9554ms step_avg:92.76ms
step:104/1750 train_time:9648ms step_avg:92.77ms
step:105/1750 train_time:9742ms step_avg:92.78ms
step:106/1750 train_time:9835ms step_avg:92.78ms
step:107/1750 train_time:9929ms step_avg:92.79ms
step:108/1750 train_time:10022ms step_avg:92.79ms
step:109/1750 train_time:10115ms step_avg:92.80ms
step:110/1750 train_time:10208ms step_avg:92.80ms
step:111/1750 train_time:10301ms step_avg:92.81ms
step:112/1750 train_time:10394ms step_avg:92.81ms
step:113/1750 train_time:10489ms step_avg:92.82ms
step:114/1750 train_time:10582ms step_avg:92.82ms
step:115/1750 train_time:10674ms step_avg:92.81ms
step:116/1750 train_time:10767ms step_avg:92.82ms
step:117/1750 train_time:10861ms step_avg:92.83ms
step:118/1750 train_time:10954ms step_avg:92.83ms
step:119/1750 train_time:11047ms step_avg:92.83ms
step:120/1750 train_time:11141ms step_avg:92.84ms
step:121/1750 train_time:11233ms step_avg:92.83ms
step:122/1750 train_time:11325ms step_avg:92.83ms
step:123/1750 train_time:11418ms step_avg:92.83ms
step:124/1750 train_time:11511ms step_avg:92.83ms
step:125/1750 train_time:11604ms step_avg:92.83ms
step:125/1750 val_loss:4.6373 train_time:11689ms step_avg:93.51ms
step:126/1750 train_time:11725ms step_avg:93.06ms
step:127/1750 train_time:11800ms step_avg:92.91ms
step:128/1750 train_time:11896ms step_avg:92.94ms
step:129/1750 train_time:11991ms step_avg:92.95ms
step:130/1750 train_time:12084ms step_avg:92.95ms
step:131/1750 train_time:12176ms step_avg:92.94ms
step:132/1750 train_time:12269ms step_avg:92.95ms
step:133/1750 train_time:12362ms step_avg:92.95ms
step:134/1750 train_time:12455ms step_avg:92.95ms
step:135/1750 train_time:12547ms step_avg:92.94ms
step:136/1750 train_time:12640ms step_avg:92.94ms
step:137/1750 train_time:12734ms step_avg:92.95ms
step:138/1750 train_time:12828ms step_avg:92.96ms
step:139/1750 train_time:12924ms step_avg:92.98ms
step:140/1750 train_time:13018ms step_avg:92.98ms
step:141/1750 train_time:13110ms step_avg:92.98ms
step:142/1750 train_time:13205ms step_avg:92.99ms
step:143/1750 train_time:13299ms step_avg:93.00ms
step:144/1750 train_time:13392ms step_avg:93.00ms
step:145/1750 train_time:13485ms step_avg:93.00ms
step:146/1750 train_time:13578ms step_avg:93.00ms
step:147/1750 train_time:13671ms step_avg:93.00ms
step:148/1750 train_time:13765ms step_avg:93.01ms
step:149/1750 train_time:13859ms step_avg:93.02ms
step:150/1750 train_time:13953ms step_avg:93.02ms
step:151/1750 train_time:14047ms step_avg:93.02ms
step:152/1750 train_time:14140ms step_avg:93.03ms
step:153/1750 train_time:14233ms step_avg:93.03ms
step:154/1750 train_time:14327ms step_avg:93.03ms
step:155/1750 train_time:14420ms step_avg:93.03ms
step:156/1750 train_time:14514ms step_avg:93.04ms
step:157/1750 train_time:14608ms step_avg:93.04ms
step:158/1750 train_time:14702ms step_avg:93.05ms
step:159/1750 train_time:14795ms step_avg:93.05ms
step:160/1750 train_time:14888ms step_avg:93.05ms
step:161/1750 train_time:14982ms step_avg:93.06ms
step:162/1750 train_time:15076ms step_avg:93.06ms
step:163/1750 train_time:15169ms step_avg:93.06ms
step:164/1750 train_time:15264ms step_avg:93.07ms
step:165/1750 train_time:15356ms step_avg:93.07ms
step:166/1750 train_time:15449ms step_avg:93.07ms
step:167/1750 train_time:15543ms step_avg:93.07ms
step:168/1750 train_time:15636ms step_avg:93.07ms
step:169/1750 train_time:15730ms step_avg:93.07ms
step:170/1750 train_time:15823ms step_avg:93.07ms
step:171/1750 train_time:15917ms step_avg:93.08ms
step:172/1750 train_time:16010ms step_avg:93.08ms
step:173/1750 train_time:16105ms step_avg:93.09ms
step:174/1750 train_time:16199ms step_avg:93.10ms
step:175/1750 train_time:16292ms step_avg:93.09ms
step:176/1750 train_time:16385ms step_avg:93.10ms
step:177/1750 train_time:16478ms step_avg:93.10ms
step:178/1750 train_time:16572ms step_avg:93.10ms
step:179/1750 train_time:16665ms step_avg:93.10ms
step:180/1750 train_time:16759ms step_avg:93.11ms
step:181/1750 train_time:16852ms step_avg:93.11ms
step:182/1750 train_time:16946ms step_avg:93.11ms
step:183/1750 train_time:17040ms step_avg:93.11ms
step:184/1750 train_time:17133ms step_avg:93.11ms
step:185/1750 train_time:17226ms step_avg:93.11ms
step:186/1750 train_time:17319ms step_avg:93.11ms
step:187/1750 train_time:17413ms step_avg:93.12ms
step:188/1750 train_time:17506ms step_avg:93.12ms
step:189/1750 train_time:17599ms step_avg:93.12ms
step:190/1750 train_time:17692ms step_avg:93.11ms
step:191/1750 train_time:17786ms step_avg:93.12ms
step:192/1750 train_time:17879ms step_avg:93.12ms
step:193/1750 train_time:17973ms step_avg:93.12ms
step:194/1750 train_time:18066ms step_avg:93.12ms
step:195/1750 train_time:18159ms step_avg:93.13ms
step:196/1750 train_time:18252ms step_avg:93.12ms
step:197/1750 train_time:18347ms step_avg:93.13ms
step:198/1750 train_time:18442ms step_avg:93.14ms
step:199/1750 train_time:18535ms step_avg:93.14ms
step:200/1750 train_time:18628ms step_avg:93.14ms
step:201/1750 train_time:18721ms step_avg:93.14ms
step:202/1750 train_time:18814ms step_avg:93.14ms
step:203/1750 train_time:18908ms step_avg:93.14ms
step:204/1750 train_time:19002ms step_avg:93.15ms
step:205/1750 train_time:19097ms step_avg:93.16ms
step:206/1750 train_time:19191ms step_avg:93.16ms
step:207/1750 train_time:19285ms step_avg:93.16ms
step:208/1750 train_time:19379ms step_avg:93.17ms
step:209/1750 train_time:19472ms step_avg:93.17ms
step:210/1750 train_time:19566ms step_avg:93.17ms
step:211/1750 train_time:19660ms step_avg:93.18ms
step:212/1750 train_time:19754ms step_avg:93.18ms
step:213/1750 train_time:19848ms step_avg:93.18ms
step:214/1750 train_time:19941ms step_avg:93.18ms
step:215/1750 train_time:20033ms step_avg:93.18ms
step:216/1750 train_time:20127ms step_avg:93.18ms
step:217/1750 train_time:20221ms step_avg:93.18ms
step:218/1750 train_time:20314ms step_avg:93.18ms
step:219/1750 train_time:20407ms step_avg:93.18ms
step:220/1750 train_time:20501ms step_avg:93.19ms
step:221/1750 train_time:20595ms step_avg:93.19ms
step:222/1750 train_time:20688ms step_avg:93.19ms
step:223/1750 train_time:20783ms step_avg:93.20ms
step:224/1750 train_time:20876ms step_avg:93.20ms
step:225/1750 train_time:20970ms step_avg:93.20ms
step:226/1750 train_time:21064ms step_avg:93.20ms
step:227/1750 train_time:21158ms step_avg:93.21ms
step:228/1750 train_time:21252ms step_avg:93.21ms
step:229/1750 train_time:21345ms step_avg:93.21ms
step:230/1750 train_time:21439ms step_avg:93.21ms
step:231/1750 train_time:21532ms step_avg:93.21ms
step:232/1750 train_time:21626ms step_avg:93.21ms
step:233/1750 train_time:21720ms step_avg:93.22ms
step:234/1750 train_time:21814ms step_avg:93.22ms
step:235/1750 train_time:21908ms step_avg:93.22ms
step:236/1750 train_time:22003ms step_avg:93.23ms
step:237/1750 train_time:22097ms step_avg:93.24ms
step:238/1750 train_time:22190ms step_avg:93.24ms
step:239/1750 train_time:22284ms step_avg:93.24ms
step:240/1750 train_time:22377ms step_avg:93.24ms
step:241/1750 train_time:22470ms step_avg:93.24ms
step:242/1750 train_time:22564ms step_avg:93.24ms
step:243/1750 train_time:22658ms step_avg:93.24ms
step:244/1750 train_time:22751ms step_avg:93.24ms
step:245/1750 train_time:22844ms step_avg:93.24ms
step:246/1750 train_time:22940ms step_avg:93.25ms
step:247/1750 train_time:23032ms step_avg:93.25ms
step:248/1750 train_time:23127ms step_avg:93.25ms
step:249/1750 train_time:23221ms step_avg:93.26ms
step:250/1750 train_time:23314ms step_avg:93.26ms
step:250/1750 val_loss:4.0907 train_time:23401ms step_avg:93.60ms
step:251/1750 train_time:23438ms step_avg:93.38ms
step:252/1750 train_time:23514ms step_avg:93.31ms
step:253/1750 train_time:23611ms step_avg:93.32ms
step:254/1750 train_time:23704ms step_avg:93.32ms
step:255/1750 train_time:23797ms step_avg:93.32ms
step:256/1750 train_time:23891ms step_avg:93.32ms
step:257/1750 train_time:23984ms step_avg:93.32ms
step:258/1750 train_time:24075ms step_avg:93.32ms
step:259/1750 train_time:24169ms step_avg:93.32ms
step:260/1750 train_time:24263ms step_avg:93.32ms
step:261/1750 train_time:24356ms step_avg:93.32ms
step:262/1750 train_time:24452ms step_avg:93.33ms
step:263/1750 train_time:24547ms step_avg:93.34ms
step:264/1750 train_time:24644ms step_avg:93.35ms
step:265/1750 train_time:24738ms step_avg:93.35ms
step:266/1750 train_time:24832ms step_avg:93.35ms
step:267/1750 train_time:24927ms step_avg:93.36ms
step:268/1750 train_time:25021ms step_avg:93.36ms
step:269/1750 train_time:25115ms step_avg:93.36ms
step:270/1750 train_time:25208ms step_avg:93.36ms
step:271/1750 train_time:25303ms step_avg:93.37ms
step:272/1750 train_time:25396ms step_avg:93.37ms
step:273/1750 train_time:25491ms step_avg:93.37ms
step:274/1750 train_time:25586ms step_avg:93.38ms
step:275/1750 train_time:25680ms step_avg:93.38ms
step:276/1750 train_time:25774ms step_avg:93.38ms
step:277/1750 train_time:25870ms step_avg:93.39ms
step:278/1750 train_time:25966ms step_avg:93.40ms
step:279/1750 train_time:26059ms step_avg:93.40ms
step:280/1750 train_time:26153ms step_avg:93.40ms
step:281/1750 train_time:26246ms step_avg:93.40ms
step:282/1750 train_time:26339ms step_avg:93.40ms
step:283/1750 train_time:26433ms step_avg:93.40ms
step:284/1750 train_time:26527ms step_avg:93.41ms
step:285/1750 train_time:26623ms step_avg:93.41ms
step:286/1750 train_time:26716ms step_avg:93.41ms
step:287/1750 train_time:26810ms step_avg:93.42ms
step:288/1750 train_time:26904ms step_avg:93.42ms
step:289/1750 train_time:26999ms step_avg:93.42ms
step:290/1750 train_time:27093ms step_avg:93.43ms
step:291/1750 train_time:27187ms step_avg:93.42ms
step:292/1750 train_time:27280ms step_avg:93.43ms
step:293/1750 train_time:27374ms step_avg:93.43ms
step:294/1750 train_time:27468ms step_avg:93.43ms
step:295/1750 train_time:27562ms step_avg:93.43ms
step:296/1750 train_time:27656ms step_avg:93.43ms
step:297/1750 train_time:27751ms step_avg:93.44ms
step:298/1750 train_time:27845ms step_avg:93.44ms
step:299/1750 train_time:27939ms step_avg:93.44ms
step:300/1750 train_time:28033ms step_avg:93.44ms
step:301/1750 train_time:28128ms step_avg:93.45ms
step:302/1750 train_time:28222ms step_avg:93.45ms
step:303/1750 train_time:28315ms step_avg:93.45ms
step:304/1750 train_time:28409ms step_avg:93.45ms
step:305/1750 train_time:28503ms step_avg:93.45ms
step:306/1750 train_time:28596ms step_avg:93.45ms
step:307/1750 train_time:28690ms step_avg:93.45ms
step:308/1750 train_time:28784ms step_avg:93.46ms
step:309/1750 train_time:28878ms step_avg:93.46ms
step:310/1750 train_time:28973ms step_avg:93.46ms
step:311/1750 train_time:29067ms step_avg:93.46ms
step:312/1750 train_time:29163ms step_avg:93.47ms
step:313/1750 train_time:29257ms step_avg:93.47ms
step:314/1750 train_time:29351ms step_avg:93.47ms
step:315/1750 train_time:29445ms step_avg:93.48ms
step:316/1750 train_time:29539ms step_avg:93.48ms
step:317/1750 train_time:29634ms step_avg:93.48ms
step:318/1750 train_time:29728ms step_avg:93.48ms
step:319/1750 train_time:29822ms step_avg:93.49ms
step:320/1750 train_time:29915ms step_avg:93.49ms
step:321/1750 train_time:30010ms step_avg:93.49ms
step:322/1750 train_time:30104ms step_avg:93.49ms
step:323/1750 train_time:30197ms step_avg:93.49ms
step:324/1750 train_time:30291ms step_avg:93.49ms
step:325/1750 train_time:30385ms step_avg:93.49ms
step:326/1750 train_time:30478ms step_avg:93.49ms
step:327/1750 train_time:30572ms step_avg:93.49ms
step:328/1750 train_time:30666ms step_avg:93.49ms
step:329/1750 train_time:30761ms step_avg:93.50ms
step:330/1750 train_time:30855ms step_avg:93.50ms
step:331/1750 train_time:30950ms step_avg:93.50ms
step:332/1750 train_time:31045ms step_avg:93.51ms
step:333/1750 train_time:31139ms step_avg:93.51ms
step:334/1750 train_time:31234ms step_avg:93.51ms
step:335/1750 train_time:31327ms step_avg:93.51ms
step:336/1750 train_time:31421ms step_avg:93.51ms
step:337/1750 train_time:31515ms step_avg:93.52ms
step:338/1750 train_time:31609ms step_avg:93.52ms
step:339/1750 train_time:31702ms step_avg:93.52ms
step:340/1750 train_time:31796ms step_avg:93.52ms
step:341/1750 train_time:31890ms step_avg:93.52ms
step:342/1750 train_time:31985ms step_avg:93.52ms
step:343/1750 train_time:32079ms step_avg:93.52ms
step:344/1750 train_time:32173ms step_avg:93.53ms
step:345/1750 train_time:32267ms step_avg:93.53ms
step:346/1750 train_time:32362ms step_avg:93.53ms
step:347/1750 train_time:32456ms step_avg:93.53ms
step:348/1750 train_time:32550ms step_avg:93.53ms
step:349/1750 train_time:32645ms step_avg:93.54ms
step:350/1750 train_time:32738ms step_avg:93.54ms
step:351/1750 train_time:32831ms step_avg:93.54ms
step:352/1750 train_time:32925ms step_avg:93.54ms
step:353/1750 train_time:33019ms step_avg:93.54ms
step:354/1750 train_time:33114ms step_avg:93.54ms
step:355/1750 train_time:33207ms step_avg:93.54ms
step:356/1750 train_time:33301ms step_avg:93.54ms
step:357/1750 train_time:33396ms step_avg:93.55ms
step:358/1750 train_time:33490ms step_avg:93.55ms
step:359/1750 train_time:33584ms step_avg:93.55ms
step:360/1750 train_time:33678ms step_avg:93.55ms
step:361/1750 train_time:33774ms step_avg:93.56ms
step:362/1750 train_time:33867ms step_avg:93.55ms
step:363/1750 train_time:33961ms step_avg:93.56ms
step:364/1750 train_time:34055ms step_avg:93.56ms
step:365/1750 train_time:34149ms step_avg:93.56ms
step:366/1750 train_time:34244ms step_avg:93.56ms
step:367/1750 train_time:34338ms step_avg:93.56ms
step:368/1750 train_time:34432ms step_avg:93.57ms
step:369/1750 train_time:34526ms step_avg:93.57ms
step:370/1750 train_time:34623ms step_avg:93.57ms
step:371/1750 train_time:34714ms step_avg:93.57ms
step:372/1750 train_time:34808ms step_avg:93.57ms
step:373/1750 train_time:34903ms step_avg:93.57ms
step:374/1750 train_time:34997ms step_avg:93.58ms
step:375/1750 train_time:35090ms step_avg:93.57ms
step:375/1750 val_loss:3.8869 train_time:35178ms step_avg:93.81ms
step:376/1750 train_time:35214ms step_avg:93.65ms
step:377/1750 train_time:35291ms step_avg:93.61ms
step:378/1750 train_time:35385ms step_avg:93.61ms
step:379/1750 train_time:35479ms step_avg:93.61ms
step:380/1750 train_time:35575ms step_avg:93.62ms
step:381/1750 train_time:35668ms step_avg:93.62ms
step:382/1750 train_time:35761ms step_avg:93.62ms
step:383/1750 train_time:35856ms step_avg:93.62ms
step:384/1750 train_time:35948ms step_avg:93.61ms
step:385/1750 train_time:36043ms step_avg:93.62ms
step:386/1750 train_time:36135ms step_avg:93.61ms
step:387/1750 train_time:36228ms step_avg:93.61ms
step:388/1750 train_time:36324ms step_avg:93.62ms
step:389/1750 train_time:36419ms step_avg:93.62ms
step:390/1750 train_time:36514ms step_avg:93.62ms
step:391/1750 train_time:36608ms step_avg:93.63ms
step:392/1750 train_time:36704ms step_avg:93.63ms
step:393/1750 train_time:36800ms step_avg:93.64ms
step:394/1750 train_time:36896ms step_avg:93.65ms
step:395/1750 train_time:36991ms step_avg:93.65ms
step:396/1750 train_time:37086ms step_avg:93.65ms
step:397/1750 train_time:37181ms step_avg:93.66ms
step:398/1750 train_time:37280ms step_avg:93.67ms
step:399/1750 train_time:37375ms step_avg:93.67ms
step:400/1750 train_time:37472ms step_avg:93.68ms
step:401/1750 train_time:37568ms step_avg:93.69ms
step:402/1750 train_time:37663ms step_avg:93.69ms
step:403/1750 train_time:37759ms step_avg:93.70ms
step:404/1750 train_time:37855ms step_avg:93.70ms
step:405/1750 train_time:37951ms step_avg:93.71ms
step:406/1750 train_time:38046ms step_avg:93.71ms
step:407/1750 train_time:38142ms step_avg:93.72ms
step:408/1750 train_time:38238ms step_avg:93.72ms
step:409/1750 train_time:38333ms step_avg:93.72ms
step:410/1750 train_time:38429ms step_avg:93.73ms
step:411/1750 train_time:38526ms step_avg:93.74ms
step:412/1750 train_time:38622ms step_avg:93.74ms
step:413/1750 train_time:38719ms step_avg:93.75ms
step:414/1750 train_time:38814ms step_avg:93.75ms
step:415/1750 train_time:38910ms step_avg:93.76ms
step:416/1750 train_time:39005ms step_avg:93.76ms
step:417/1750 train_time:39101ms step_avg:93.77ms
step:418/1750 train_time:39197ms step_avg:93.77ms
step:419/1750 train_time:39294ms step_avg:93.78ms
step:420/1750 train_time:39390ms step_avg:93.79ms
step:421/1750 train_time:39486ms step_avg:93.79ms
step:422/1750 train_time:39582ms step_avg:93.80ms
step:423/1750 train_time:39678ms step_avg:93.80ms
step:424/1750 train_time:39774ms step_avg:93.81ms
step:425/1750 train_time:39870ms step_avg:93.81ms
step:426/1750 train_time:39966ms step_avg:93.82ms
step:427/1750 train_time:40061ms step_avg:93.82ms
step:428/1750 train_time:40157ms step_avg:93.83ms
step:429/1750 train_time:40253ms step_avg:93.83ms
step:430/1750 train_time:40348ms step_avg:93.83ms
step:431/1750 train_time:40444ms step_avg:93.84ms
step:432/1750 train_time:40541ms step_avg:93.84ms
step:433/1750 train_time:40638ms step_avg:93.85ms
step:434/1750 train_time:40733ms step_avg:93.86ms
step:435/1750 train_time:40829ms step_avg:93.86ms
step:436/1750 train_time:40925ms step_avg:93.87ms
step:437/1750 train_time:41023ms step_avg:93.87ms
step:438/1750 train_time:41119ms step_avg:93.88ms
step:439/1750 train_time:41215ms step_avg:93.88ms
step:440/1750 train_time:41311ms step_avg:93.89ms
step:441/1750 train_time:41406ms step_avg:93.89ms
step:442/1750 train_time:41503ms step_avg:93.90ms
step:443/1750 train_time:41599ms step_avg:93.90ms
step:444/1750 train_time:41695ms step_avg:93.91ms
step:445/1750 train_time:41791ms step_avg:93.91ms
step:446/1750 train_time:41887ms step_avg:93.92ms
step:447/1750 train_time:41983ms step_avg:93.92ms
step:448/1750 train_time:42079ms step_avg:93.93ms
step:449/1750 train_time:42175ms step_avg:93.93ms
step:450/1750 train_time:42270ms step_avg:93.93ms
step:451/1750 train_time:42366ms step_avg:93.94ms
step:452/1750 train_time:42462ms step_avg:93.94ms
step:453/1750 train_time:42558ms step_avg:93.95ms
step:454/1750 train_time:42654ms step_avg:93.95ms
step:455/1750 train_time:42749ms step_avg:93.95ms
step:456/1750 train_time:42845ms step_avg:93.96ms
step:457/1750 train_time:42941ms step_avg:93.96ms
step:458/1750 train_time:43037ms step_avg:93.97ms
step:459/1750 train_time:43134ms step_avg:93.97ms
step:460/1750 train_time:43230ms step_avg:93.98ms
step:461/1750 train_time:43327ms step_avg:93.98ms
step:462/1750 train_time:43423ms step_avg:93.99ms
step:463/1750 train_time:43519ms step_avg:93.99ms
step:464/1750 train_time:43615ms step_avg:94.00ms
step:465/1750 train_time:43709ms step_avg:94.00ms
step:466/1750 train_time:43805ms step_avg:94.00ms
step:467/1750 train_time:43902ms step_avg:94.01ms
step:468/1750 train_time:43998ms step_avg:94.01ms
step:469/1750 train_time:44095ms step_avg:94.02ms
step:470/1750 train_time:44192ms step_avg:94.03ms
step:471/1750 train_time:44288ms step_avg:94.03ms
step:472/1750 train_time:44384ms step_avg:94.03ms
step:473/1750 train_time:44483ms step_avg:94.04ms
step:474/1750 train_time:44576ms step_avg:94.04ms
step:475/1750 train_time:44671ms step_avg:94.05ms
step:476/1750 train_time:44767ms step_avg:94.05ms
step:477/1750 train_time:44863ms step_avg:94.05ms
step:478/1750 train_time:44959ms step_avg:94.06ms
step:479/1750 train_time:45056ms step_avg:94.06ms
step:480/1750 train_time:45152ms step_avg:94.07ms
step:481/1750 train_time:45249ms step_avg:94.07ms
step:482/1750 train_time:45344ms step_avg:94.07ms
step:483/1750 train_time:45440ms step_avg:94.08ms
step:484/1750 train_time:45536ms step_avg:94.08ms
step:485/1750 train_time:45632ms step_avg:94.09ms
step:486/1750 train_time:45728ms step_avg:94.09ms
step:487/1750 train_time:45823ms step_avg:94.09ms
step:488/1750 train_time:45920ms step_avg:94.10ms
step:489/1750 train_time:46016ms step_avg:94.10ms
step:490/1750 train_time:46111ms step_avg:94.11ms
step:491/1750 train_time:46208ms step_avg:94.11ms
step:492/1750 train_time:46304ms step_avg:94.11ms
step:493/1750 train_time:46401ms step_avg:94.12ms
step:494/1750 train_time:46497ms step_avg:94.12ms
step:495/1750 train_time:46594ms step_avg:94.13ms
step:496/1750 train_time:46691ms step_avg:94.14ms
step:497/1750 train_time:46785ms step_avg:94.14ms
step:498/1750 train_time:46882ms step_avg:94.14ms
step:499/1750 train_time:46978ms step_avg:94.15ms
step:500/1750 train_time:47075ms step_avg:94.15ms
step:500/1750 val_loss:3.7387 train_time:47164ms step_avg:94.33ms
step:501/1750 train_time:47200ms step_avg:94.21ms
step:502/1750 train_time:47277ms step_avg:94.18ms
step:503/1750 train_time:47375ms step_avg:94.19ms
step:504/1750 train_time:47471ms step_avg:94.19ms
step:505/1750 train_time:47566ms step_avg:94.19ms
step:506/1750 train_time:47661ms step_avg:94.19ms
step:507/1750 train_time:47757ms step_avg:94.20ms
step:508/1750 train_time:47851ms step_avg:94.20ms
step:509/1750 train_time:47947ms step_avg:94.20ms
step:510/1750 train_time:48041ms step_avg:94.20ms
step:511/1750 train_time:48137ms step_avg:94.20ms
step:512/1750 train_time:48236ms step_avg:94.21ms
step:513/1750 train_time:48330ms step_avg:94.21ms
step:514/1750 train_time:48425ms step_avg:94.21ms
step:515/1750 train_time:48522ms step_avg:94.22ms
step:516/1750 train_time:48619ms step_avg:94.22ms
step:517/1750 train_time:48715ms step_avg:94.23ms
step:518/1750 train_time:48810ms step_avg:94.23ms
step:519/1750 train_time:48904ms step_avg:94.23ms
step:520/1750 train_time:49000ms step_avg:94.23ms
step:521/1750 train_time:49097ms step_avg:94.24ms
step:522/1750 train_time:49193ms step_avg:94.24ms
step:523/1750 train_time:49289ms step_avg:94.24ms
step:524/1750 train_time:49385ms step_avg:94.25ms
step:525/1750 train_time:49481ms step_avg:94.25ms
step:526/1750 train_time:49578ms step_avg:94.26ms
step:527/1750 train_time:49674ms step_avg:94.26ms
step:528/1750 train_time:49769ms step_avg:94.26ms
step:529/1750 train_time:49864ms step_avg:94.26ms
step:530/1750 train_time:49960ms step_avg:94.26ms
step:531/1750 train_time:50055ms step_avg:94.27ms
step:532/1750 train_time:50151ms step_avg:94.27ms
step:533/1750 train_time:50248ms step_avg:94.27ms
step:534/1750 train_time:50344ms step_avg:94.28ms
step:535/1750 train_time:50440ms step_avg:94.28ms
step:536/1750 train_time:50537ms step_avg:94.29ms
step:537/1750 train_time:50633ms step_avg:94.29ms
step:538/1750 train_time:50729ms step_avg:94.29ms
step:539/1750 train_time:50825ms step_avg:94.29ms
step:540/1750 train_time:50921ms step_avg:94.30ms
step:541/1750 train_time:51016ms step_avg:94.30ms
step:542/1750 train_time:51113ms step_avg:94.30ms
step:543/1750 train_time:51209ms step_avg:94.31ms
step:544/1750 train_time:51304ms step_avg:94.31ms
step:545/1750 train_time:51400ms step_avg:94.31ms
step:546/1750 train_time:51497ms step_avg:94.32ms
step:547/1750 train_time:51592ms step_avg:94.32ms
step:548/1750 train_time:51688ms step_avg:94.32ms
step:549/1750 train_time:51784ms step_avg:94.32ms
step:550/1750 train_time:51880ms step_avg:94.33ms
step:551/1750 train_time:51975ms step_avg:94.33ms
step:552/1750 train_time:52071ms step_avg:94.33ms
step:553/1750 train_time:52166ms step_avg:94.33ms
step:554/1750 train_time:52263ms step_avg:94.34ms
step:555/1750 train_time:52359ms step_avg:94.34ms
step:556/1750 train_time:52456ms step_avg:94.34ms
step:557/1750 train_time:52552ms step_avg:94.35ms
step:558/1750 train_time:52648ms step_avg:94.35ms
step:559/1750 train_time:52744ms step_avg:94.35ms
step:560/1750 train_time:52840ms step_avg:94.36ms
step:561/1750 train_time:52935ms step_avg:94.36ms
step:562/1750 train_time:53032ms step_avg:94.36ms
step:563/1750 train_time:53127ms step_avg:94.36ms
step:564/1750 train_time:53222ms step_avg:94.36ms
step:565/1750 train_time:53318ms step_avg:94.37ms
step:566/1750 train_time:53415ms step_avg:94.37ms
step:567/1750 train_time:53510ms step_avg:94.37ms
step:568/1750 train_time:53607ms step_avg:94.38ms
step:569/1750 train_time:53702ms step_avg:94.38ms
step:570/1750 train_time:53799ms step_avg:94.38ms
step:571/1750 train_time:53895ms step_avg:94.39ms
step:572/1750 train_time:53991ms step_avg:94.39ms
step:573/1750 train_time:54087ms step_avg:94.39ms
step:574/1750 train_time:54183ms step_avg:94.40ms
step:575/1750 train_time:54279ms step_avg:94.40ms
step:576/1750 train_time:54376ms step_avg:94.40ms
step:577/1750 train_time:54472ms step_avg:94.41ms
step:578/1750 train_time:54567ms step_avg:94.41ms
step:579/1750 train_time:54663ms step_avg:94.41ms
step:580/1750 train_time:54760ms step_avg:94.41ms
step:581/1750 train_time:54856ms step_avg:94.42ms
step:582/1750 train_time:54952ms step_avg:94.42ms
step:583/1750 train_time:55048ms step_avg:94.42ms
step:584/1750 train_time:55144ms step_avg:94.43ms
step:585/1750 train_time:55240ms step_avg:94.43ms
step:586/1750 train_time:55338ms step_avg:94.43ms
step:587/1750 train_time:55432ms step_avg:94.43ms
step:588/1750 train_time:55527ms step_avg:94.43ms
step:589/1750 train_time:55623ms step_avg:94.44ms
step:590/1750 train_time:55718ms step_avg:94.44ms
step:591/1750 train_time:55815ms step_avg:94.44ms
step:592/1750 train_time:55910ms step_avg:94.44ms
step:593/1750 train_time:56007ms step_avg:94.45ms
step:594/1750 train_time:56103ms step_avg:94.45ms
step:595/1750 train_time:56201ms step_avg:94.46ms
step:596/1750 train_time:56296ms step_avg:94.46ms
step:597/1750 train_time:56392ms step_avg:94.46ms
step:598/1750 train_time:56488ms step_avg:94.46ms
step:599/1750 train_time:56583ms step_avg:94.46ms
step:600/1750 train_time:56679ms step_avg:94.47ms
step:601/1750 train_time:56775ms step_avg:94.47ms
step:602/1750 train_time:56870ms step_avg:94.47ms
step:603/1750 train_time:56966ms step_avg:94.47ms
step:604/1750 train_time:57062ms step_avg:94.47ms
step:605/1750 train_time:57158ms step_avg:94.48ms
step:606/1750 train_time:57254ms step_avg:94.48ms
step:607/1750 train_time:57350ms step_avg:94.48ms
step:608/1750 train_time:57446ms step_avg:94.48ms
step:609/1750 train_time:57542ms step_avg:94.49ms
step:610/1750 train_time:57638ms step_avg:94.49ms
step:611/1750 train_time:57733ms step_avg:94.49ms
step:612/1750 train_time:57829ms step_avg:94.49ms
step:613/1750 train_time:57925ms step_avg:94.49ms
step:614/1750 train_time:58021ms step_avg:94.50ms
step:615/1750 train_time:58117ms step_avg:94.50ms
step:616/1750 train_time:58214ms step_avg:94.50ms
step:617/1750 train_time:58309ms step_avg:94.50ms
step:618/1750 train_time:58404ms step_avg:94.51ms
step:619/1750 train_time:58501ms step_avg:94.51ms
step:620/1750 train_time:58597ms step_avg:94.51ms
step:621/1750 train_time:58693ms step_avg:94.51ms
step:622/1750 train_time:58788ms step_avg:94.52ms
step:623/1750 train_time:58884ms step_avg:94.52ms
step:624/1750 train_time:58980ms step_avg:94.52ms
step:625/1750 train_time:59076ms step_avg:94.52ms
step:625/1750 val_loss:3.6554 train_time:59165ms step_avg:94.66ms
step:626/1750 train_time:59202ms step_avg:94.57ms
step:627/1750 train_time:59278ms step_avg:94.54ms
step:628/1750 train_time:59376ms step_avg:94.55ms
step:629/1750 train_time:59473ms step_avg:94.55ms
step:630/1750 train_time:59569ms step_avg:94.55ms
step:631/1750 train_time:59665ms step_avg:94.56ms
step:632/1750 train_time:59760ms step_avg:94.56ms
step:633/1750 train_time:59855ms step_avg:94.56ms
step:634/1750 train_time:59951ms step_avg:94.56ms
step:635/1750 train_time:60047ms step_avg:94.56ms
step:636/1750 train_time:60142ms step_avg:94.56ms
step:637/1750 train_time:60239ms step_avg:94.57ms
step:638/1750 train_time:60336ms step_avg:94.57ms
step:639/1750 train_time:60433ms step_avg:94.57ms
step:640/1750 train_time:60529ms step_avg:94.58ms
step:641/1750 train_time:60626ms step_avg:94.58ms
step:642/1750 train_time:60721ms step_avg:94.58ms
step:643/1750 train_time:60817ms step_avg:94.58ms
step:644/1750 train_time:60913ms step_avg:94.59ms
step:645/1750 train_time:61008ms step_avg:94.59ms
step:646/1750 train_time:61105ms step_avg:94.59ms
step:647/1750 train_time:61201ms step_avg:94.59ms
step:648/1750 train_time:61297ms step_avg:94.59ms
step:649/1750 train_time:61393ms step_avg:94.60ms
step:650/1750 train_time:61490ms step_avg:94.60ms
step:651/1750 train_time:61588ms step_avg:94.61ms
step:652/1750 train_time:61686ms step_avg:94.61ms
step:653/1750 train_time:61783ms step_avg:94.61ms
step:654/1750 train_time:61880ms step_avg:94.62ms
step:655/1750 train_time:61977ms step_avg:94.62ms
step:656/1750 train_time:62074ms step_avg:94.63ms
step:657/1750 train_time:62173ms step_avg:94.63ms
step:658/1750 train_time:62271ms step_avg:94.64ms
step:659/1750 train_time:62370ms step_avg:94.64ms
step:660/1750 train_time:62469ms step_avg:94.65ms
step:661/1750 train_time:62567ms step_avg:94.65ms
step:662/1750 train_time:62664ms step_avg:94.66ms
step:663/1750 train_time:62761ms step_avg:94.66ms
step:664/1750 train_time:62858ms step_avg:94.67ms
step:665/1750 train_time:62955ms step_avg:94.67ms
step:666/1750 train_time:63052ms step_avg:94.67ms
step:667/1750 train_time:63150ms step_avg:94.68ms
step:668/1750 train_time:63247ms step_avg:94.68ms
step:669/1750 train_time:63345ms step_avg:94.69ms
step:670/1750 train_time:63441ms step_avg:94.69ms
step:671/1750 train_time:63538ms step_avg:94.69ms
step:672/1750 train_time:63635ms step_avg:94.70ms
step:673/1750 train_time:63734ms step_avg:94.70ms
step:674/1750 train_time:63832ms step_avg:94.71ms
step:675/1750 train_time:63930ms step_avg:94.71ms
step:676/1750 train_time:64027ms step_avg:94.71ms
step:677/1750 train_time:64124ms step_avg:94.72ms
step:678/1750 train_time:64222ms step_avg:94.72ms
step:679/1750 train_time:64319ms step_avg:94.73ms
step:680/1750 train_time:64416ms step_avg:94.73ms
step:681/1750 train_time:64513ms step_avg:94.73ms
step:682/1750 train_time:64612ms step_avg:94.74ms
step:683/1750 train_time:64710ms step_avg:94.74ms
step:684/1750 train_time:64808ms step_avg:94.75ms
step:685/1750 train_time:64906ms step_avg:94.75ms
step:686/1750 train_time:65003ms step_avg:94.76ms
step:687/1750 train_time:65100ms step_avg:94.76ms
step:688/1750 train_time:65196ms step_avg:94.76ms
step:689/1750 train_time:65295ms step_avg:94.77ms
step:690/1750 train_time:65393ms step_avg:94.77ms
step:691/1750 train_time:65490ms step_avg:94.78ms
step:692/1750 train_time:65588ms step_avg:94.78ms
step:693/1750 train_time:65685ms step_avg:94.78ms
step:694/1750 train_time:65782ms step_avg:94.79ms
step:695/1750 train_time:65880ms step_avg:94.79ms
step:696/1750 train_time:65977ms step_avg:94.79ms
step:697/1750 train_time:66075ms step_avg:94.80ms
step:698/1750 train_time:66172ms step_avg:94.80ms
step:699/1750 train_time:66271ms step_avg:94.81ms
step:700/1750 train_time:66369ms step_avg:94.81ms
step:701/1750 train_time:66467ms step_avg:94.82ms
step:702/1750 train_time:66564ms step_avg:94.82ms
step:703/1750 train_time:66660ms step_avg:94.82ms
step:704/1750 train_time:66757ms step_avg:94.83ms
step:705/1750 train_time:66855ms step_avg:94.83ms
step:706/1750 train_time:66953ms step_avg:94.83ms
step:707/1750 train_time:67051ms step_avg:94.84ms
step:708/1750 train_time:67149ms step_avg:94.84ms
step:709/1750 train_time:67246ms step_avg:94.85ms
step:710/1750 train_time:67343ms step_avg:94.85ms
step:711/1750 train_time:67440ms step_avg:94.85ms
step:712/1750 train_time:67537ms step_avg:94.86ms
step:713/1750 train_time:67634ms step_avg:94.86ms
step:714/1750 train_time:67732ms step_avg:94.86ms
step:715/1750 train_time:67829ms step_avg:94.87ms
step:716/1750 train_time:67928ms step_avg:94.87ms
step:717/1750 train_time:68026ms step_avg:94.88ms
step:718/1750 train_time:68123ms step_avg:94.88ms
step:719/1750 train_time:68220ms step_avg:94.88ms
step:720/1750 train_time:68317ms step_avg:94.88ms
step:721/1750 train_time:68415ms step_avg:94.89ms
step:722/1750 train_time:68513ms step_avg:94.89ms
step:723/1750 train_time:68610ms step_avg:94.90ms
step:724/1750 train_time:68708ms step_avg:94.90ms
step:725/1750 train_time:68805ms step_avg:94.90ms
step:726/1750 train_time:68903ms step_avg:94.91ms
step:727/1750 train_time:69000ms step_avg:94.91ms
step:728/1750 train_time:69098ms step_avg:94.91ms
step:729/1750 train_time:69195ms step_avg:94.92ms
step:730/1750 train_time:69293ms step_avg:94.92ms
step:731/1750 train_time:69391ms step_avg:94.93ms
step:732/1750 train_time:69488ms step_avg:94.93ms
step:733/1750 train_time:69585ms step_avg:94.93ms
step:734/1750 train_time:69682ms step_avg:94.93ms
step:735/1750 train_time:69779ms step_avg:94.94ms
step:736/1750 train_time:69877ms step_avg:94.94ms
step:737/1750 train_time:69975ms step_avg:94.95ms
step:738/1750 train_time:70073ms step_avg:94.95ms
step:739/1750 train_time:70171ms step_avg:94.95ms
step:740/1750 train_time:70269ms step_avg:94.96ms
step:741/1750 train_time:70367ms step_avg:94.96ms
step:742/1750 train_time:70464ms step_avg:94.97ms
step:743/1750 train_time:70561ms step_avg:94.97ms
step:744/1750 train_time:70659ms step_avg:94.97ms
step:745/1750 train_time:70756ms step_avg:94.97ms
step:746/1750 train_time:70853ms step_avg:94.98ms
step:747/1750 train_time:70950ms step_avg:94.98ms
step:748/1750 train_time:71048ms step_avg:94.98ms
step:749/1750 train_time:71146ms step_avg:94.99ms
step:750/1750 train_time:71242ms step_avg:94.99ms
step:750/1750 val_loss:3.5917 train_time:71332ms step_avg:95.11ms
step:751/1750 train_time:71369ms step_avg:95.03ms
step:752/1750 train_time:71446ms step_avg:95.01ms
step:753/1750 train_time:71546ms step_avg:95.01ms
step:754/1750 train_time:71643ms step_avg:95.02ms
step:755/1750 train_time:71740ms step_avg:95.02ms
step:756/1750 train_time:71837ms step_avg:95.02ms
step:757/1750 train_time:71934ms step_avg:95.03ms
step:758/1750 train_time:72032ms step_avg:95.03ms
step:759/1750 train_time:72129ms step_avg:95.03ms
step:760/1750 train_time:72225ms step_avg:95.03ms
step:761/1750 train_time:72322ms step_avg:95.04ms
step:762/1750 train_time:72420ms step_avg:95.04ms
step:763/1750 train_time:72519ms step_avg:95.04ms
step:764/1750 train_time:72617ms step_avg:95.05ms
step:765/1750 train_time:72715ms step_avg:95.05ms
step:766/1750 train_time:72813ms step_avg:95.06ms
step:767/1750 train_time:72911ms step_avg:95.06ms
step:768/1750 train_time:73008ms step_avg:95.06ms
step:769/1750 train_time:73105ms step_avg:95.06ms
step:770/1750 train_time:73201ms step_avg:95.07ms
step:771/1750 train_time:73299ms step_avg:95.07ms
step:772/1750 train_time:73397ms step_avg:95.07ms
step:773/1750 train_time:73495ms step_avg:95.08ms
step:774/1750 train_time:73594ms step_avg:95.08ms
step:775/1750 train_time:73692ms step_avg:95.09ms
step:776/1750 train_time:73790ms step_avg:95.09ms
step:777/1750 train_time:73888ms step_avg:95.09ms
step:778/1750 train_time:73985ms step_avg:95.10ms
step:779/1750 train_time:74082ms step_avg:95.10ms
step:780/1750 train_time:74179ms step_avg:95.10ms
step:781/1750 train_time:74277ms step_avg:95.10ms
step:782/1750 train_time:74374ms step_avg:95.11ms
step:783/1750 train_time:74472ms step_avg:95.11ms
step:784/1750 train_time:74570ms step_avg:95.11ms
step:785/1750 train_time:74668ms step_avg:95.12ms
step:786/1750 train_time:74765ms step_avg:95.12ms
step:787/1750 train_time:74863ms step_avg:95.12ms
step:788/1750 train_time:74961ms step_avg:95.13ms
step:789/1750 train_time:75059ms step_avg:95.13ms
step:790/1750 train_time:75157ms step_avg:95.14ms
step:791/1750 train_time:75255ms step_avg:95.14ms
step:792/1750 train_time:75354ms step_avg:95.14ms
step:793/1750 train_time:75452ms step_avg:95.15ms
step:794/1750 train_time:75550ms step_avg:95.15ms
step:795/1750 train_time:75648ms step_avg:95.16ms
step:796/1750 train_time:75746ms step_avg:95.16ms
step:797/1750 train_time:75844ms step_avg:95.16ms
step:798/1750 train_time:75941ms step_avg:95.16ms
step:799/1750 train_time:76038ms step_avg:95.17ms
step:800/1750 train_time:76136ms step_avg:95.17ms
step:801/1750 train_time:76234ms step_avg:95.17ms
step:802/1750 train_time:76332ms step_avg:95.18ms
step:803/1750 train_time:76430ms step_avg:95.18ms
step:804/1750 train_time:76528ms step_avg:95.18ms
step:805/1750 train_time:76626ms step_avg:95.19ms
step:806/1750 train_time:76724ms step_avg:95.19ms
step:807/1750 train_time:76821ms step_avg:95.19ms
step:808/1750 train_time:76919ms step_avg:95.20ms
step:809/1750 train_time:77017ms step_avg:95.20ms
step:810/1750 train_time:77114ms step_avg:95.20ms
step:811/1750 train_time:77212ms step_avg:95.21ms
step:812/1750 train_time:77309ms step_avg:95.21ms
step:813/1750 train_time:77406ms step_avg:95.21ms
step:814/1750 train_time:77504ms step_avg:95.21ms
step:815/1750 train_time:77602ms step_avg:95.22ms
step:816/1750 train_time:77700ms step_avg:95.22ms
step:817/1750 train_time:77798ms step_avg:95.22ms
step:818/1750 train_time:77896ms step_avg:95.23ms
step:819/1750 train_time:77994ms step_avg:95.23ms
step:820/1750 train_time:78092ms step_avg:95.23ms
step:821/1750 train_time:78190ms step_avg:95.24ms
step:822/1750 train_time:78288ms step_avg:95.24ms
step:823/1750 train_time:78385ms step_avg:95.24ms
step:824/1750 train_time:78483ms step_avg:95.25ms
step:825/1750 train_time:78580ms step_avg:95.25ms
step:826/1750 train_time:78678ms step_avg:95.25ms
step:827/1750 train_time:78777ms step_avg:95.26ms
step:828/1750 train_time:78876ms step_avg:95.26ms
step:829/1750 train_time:78975ms step_avg:95.26ms
step:830/1750 train_time:79073ms step_avg:95.27ms
step:831/1750 train_time:79171ms step_avg:95.27ms
step:832/1750 train_time:79269ms step_avg:95.28ms
step:833/1750 train_time:79367ms step_avg:95.28ms
step:834/1750 train_time:79465ms step_avg:95.28ms
step:835/1750 train_time:79562ms step_avg:95.28ms
step:836/1750 train_time:79660ms step_avg:95.29ms
step:837/1750 train_time:79757ms step_avg:95.29ms
step:838/1750 train_time:79856ms step_avg:95.29ms
step:839/1750 train_time:79954ms step_avg:95.30ms
step:840/1750 train_time:80052ms step_avg:95.30ms
step:841/1750 train_time:80149ms step_avg:95.30ms
step:842/1750 train_time:80247ms step_avg:95.30ms
step:843/1750 train_time:80344ms step_avg:95.31ms
step:844/1750 train_time:80442ms step_avg:95.31ms
step:845/1750 train_time:80540ms step_avg:95.31ms
step:846/1750 train_time:80638ms step_avg:95.32ms
step:847/1750 train_time:80737ms step_avg:95.32ms
step:848/1750 train_time:80834ms step_avg:95.32ms
step:849/1750 train_time:80932ms step_avg:95.33ms
step:850/1750 train_time:81029ms step_avg:95.33ms
step:851/1750 train_time:81126ms step_avg:95.33ms
step:852/1750 train_time:81224ms step_avg:95.33ms
step:853/1750 train_time:81322ms step_avg:95.34ms
step:854/1750 train_time:81420ms step_avg:95.34ms
step:855/1750 train_time:81517ms step_avg:95.34ms
step:856/1750 train_time:81616ms step_avg:95.35ms
step:857/1750 train_time:81714ms step_avg:95.35ms
step:858/1750 train_time:81812ms step_avg:95.35ms
step:859/1750 train_time:81910ms step_avg:95.36ms
step:860/1750 train_time:82008ms step_avg:95.36ms
step:861/1750 train_time:82106ms step_avg:95.36ms
step:862/1750 train_time:82204ms step_avg:95.36ms
step:863/1750 train_time:82302ms step_avg:95.37ms
step:864/1750 train_time:82399ms step_avg:95.37ms
step:865/1750 train_time:82497ms step_avg:95.37ms
step:866/1750 train_time:82595ms step_avg:95.38ms
step:867/1750 train_time:82693ms step_avg:95.38ms
step:868/1750 train_time:82790ms step_avg:95.38ms
step:869/1750 train_time:82887ms step_avg:95.38ms
step:870/1750 train_time:82985ms step_avg:95.38ms
step:871/1750 train_time:83082ms step_avg:95.39ms
step:872/1750 train_time:83181ms step_avg:95.39ms
step:873/1750 train_time:83278ms step_avg:95.39ms
step:874/1750 train_time:83375ms step_avg:95.40ms
step:875/1750 train_time:83474ms step_avg:95.40ms
step:875/1750 val_loss:3.5474 train_time:83565ms step_avg:95.50ms
step:876/1750 train_time:83601ms step_avg:95.44ms
step:877/1750 train_time:83685ms step_avg:95.42ms
step:878/1750 train_time:83784ms step_avg:95.43ms
step:879/1750 train_time:83882ms step_avg:95.43ms
step:880/1750 train_time:83979ms step_avg:95.43ms
step:881/1750 train_time:84076ms step_avg:95.43ms
step:882/1750 train_time:84173ms step_avg:95.43ms
step:883/1750 train_time:84271ms step_avg:95.44ms
step:884/1750 train_time:84368ms step_avg:95.44ms
step:885/1750 train_time:84465ms step_avg:95.44ms
step:886/1750 train_time:84564ms step_avg:95.44ms
step:887/1750 train_time:84663ms step_avg:95.45ms
step:888/1750 train_time:84761ms step_avg:95.45ms
step:889/1750 train_time:84860ms step_avg:95.46ms
step:890/1750 train_time:84958ms step_avg:95.46ms
step:891/1750 train_time:85056ms step_avg:95.46ms
step:892/1750 train_time:85154ms step_avg:95.46ms
step:893/1750 train_time:85251ms step_avg:95.47ms
step:894/1750 train_time:85348ms step_avg:95.47ms
step:895/1750 train_time:85445ms step_avg:95.47ms
step:896/1750 train_time:85542ms step_avg:95.47ms
step:897/1750 train_time:85640ms step_avg:95.47ms
step:898/1750 train_time:85739ms step_avg:95.48ms
step:899/1750 train_time:85838ms step_avg:95.48ms
step:900/1750 train_time:85936ms step_avg:95.48ms
step:901/1750 train_time:86035ms step_avg:95.49ms
step:902/1750 train_time:86133ms step_avg:95.49ms
step:903/1750 train_time:86230ms step_avg:95.49ms
step:904/1750 train_time:86328ms step_avg:95.50ms
step:905/1750 train_time:86425ms step_avg:95.50ms
step:906/1750 train_time:86522ms step_avg:95.50ms
step:907/1750 train_time:86620ms step_avg:95.50ms
step:908/1750 train_time:86718ms step_avg:95.50ms
step:909/1750 train_time:86817ms step_avg:95.51ms
step:910/1750 train_time:86918ms step_avg:95.51ms
step:911/1750 train_time:87017ms step_avg:95.52ms
step:912/1750 train_time:87116ms step_avg:95.52ms
step:913/1750 train_time:87215ms step_avg:95.53ms
step:914/1750 train_time:87315ms step_avg:95.53ms
step:915/1750 train_time:87414ms step_avg:95.53ms
step:916/1750 train_time:87515ms step_avg:95.54ms
step:917/1750 train_time:87615ms step_avg:95.55ms
step:918/1750 train_time:87716ms step_avg:95.55ms
step:919/1750 train_time:87816ms step_avg:95.56ms
step:920/1750 train_time:87916ms step_avg:95.56ms
step:921/1750 train_time:88015ms step_avg:95.56ms
step:922/1750 train_time:88115ms step_avg:95.57ms
step:923/1750 train_time:88215ms step_avg:95.57ms
step:924/1750 train_time:88315ms step_avg:95.58ms
step:925/1750 train_time:88415ms step_avg:95.58ms
step:926/1750 train_time:88513ms step_avg:95.59ms
step:927/1750 train_time:88613ms step_avg:95.59ms
step:928/1750 train_time:88712ms step_avg:95.59ms
step:929/1750 train_time:88813ms step_avg:95.60ms
step:930/1750 train_time:88912ms step_avg:95.60ms
step:931/1750 train_time:89012ms step_avg:95.61ms
step:932/1750 train_time:89111ms step_avg:95.61ms
step:933/1750 train_time:89210ms step_avg:95.62ms
step:934/1750 train_time:89309ms step_avg:95.62ms
step:935/1750 train_time:89409ms step_avg:95.62ms
step:936/1750 train_time:89508ms step_avg:95.63ms
step:937/1750 train_time:89606ms step_avg:95.63ms
step:938/1750 train_time:89705ms step_avg:95.63ms
step:939/1750 train_time:89803ms step_avg:95.64ms
step:940/1750 train_time:89902ms step_avg:95.64ms
step:941/1750 train_time:90002ms step_avg:95.64ms
step:942/1750 train_time:90100ms step_avg:95.65ms
step:943/1750 train_time:90200ms step_avg:95.65ms
step:944/1750 train_time:90300ms step_avg:95.66ms
step:945/1750 train_time:90401ms step_avg:95.66ms
step:946/1750 train_time:90500ms step_avg:95.67ms
step:947/1750 train_time:90600ms step_avg:95.67ms
step:948/1750 train_time:90700ms step_avg:95.68ms
step:949/1750 train_time:90800ms step_avg:95.68ms
step:950/1750 train_time:90900ms step_avg:95.68ms
step:951/1750 train_time:91000ms step_avg:95.69ms
step:952/1750 train_time:91099ms step_avg:95.69ms
step:953/1750 train_time:91198ms step_avg:95.70ms
step:954/1750 train_time:91297ms step_avg:95.70ms
step:955/1750 train_time:91397ms step_avg:95.70ms
step:956/1750 train_time:91496ms step_avg:95.71ms
step:957/1750 train_time:91596ms step_avg:95.71ms
step:958/1750 train_time:91695ms step_avg:95.71ms
step:959/1750 train_time:91795ms step_avg:95.72ms
step:960/1750 train_time:91895ms step_avg:95.72ms
step:961/1750 train_time:91994ms step_avg:95.73ms
step:962/1750 train_time:92094ms step_avg:95.73ms
step:963/1750 train_time:92193ms step_avg:95.74ms
step:964/1750 train_time:92292ms step_avg:95.74ms
step:965/1750 train_time:92391ms step_avg:95.74ms
step:966/1750 train_time:92490ms step_avg:95.75ms
step:967/1750 train_time:92589ms step_avg:95.75ms
step:968/1750 train_time:92688ms step_avg:95.75ms
step:969/1750 train_time:92787ms step_avg:95.76ms
step:970/1750 train_time:92885ms step_avg:95.76ms
step:971/1750 train_time:92983ms step_avg:95.76ms
step:972/1750 train_time:93082ms step_avg:95.76ms
step:973/1750 train_time:93182ms step_avg:95.77ms
step:974/1750 train_time:93280ms step_avg:95.77ms
step:975/1750 train_time:93380ms step_avg:95.77ms
step:976/1750 train_time:93478ms step_avg:95.78ms
step:977/1750 train_time:93578ms step_avg:95.78ms
step:978/1750 train_time:93678ms step_avg:95.79ms
step:979/1750 train_time:93778ms step_avg:95.79ms
step:980/1750 train_time:93879ms step_avg:95.79ms
step:981/1750 train_time:93978ms step_avg:95.80ms
step:982/1750 train_time:94077ms step_avg:95.80ms
step:983/1750 train_time:94177ms step_avg:95.81ms
step:984/1750 train_time:94276ms step_avg:95.81ms
step:985/1750 train_time:94375ms step_avg:95.81ms
step:986/1750 train_time:94475ms step_avg:95.82ms
step:987/1750 train_time:94575ms step_avg:95.82ms
step:988/1750 train_time:94675ms step_avg:95.83ms
step:989/1750 train_time:94775ms step_avg:95.83ms
step:990/1750 train_time:94876ms step_avg:95.83ms
step:991/1750 train_time:94977ms step_avg:95.84ms
step:992/1750 train_time:95076ms step_avg:95.84ms
step:993/1750 train_time:95175ms step_avg:95.85ms
step:994/1750 train_time:95274ms step_avg:95.85ms
step:995/1750 train_time:95374ms step_avg:95.85ms
step:996/1750 train_time:95476ms step_avg:95.86ms
step:997/1750 train_time:95574ms step_avg:95.86ms
step:998/1750 train_time:95674ms step_avg:95.87ms
step:999/1750 train_time:95773ms step_avg:95.87ms
step:1000/1750 train_time:95873ms step_avg:95.87ms
step:1000/1750 val_loss:3.5035 train_time:95967ms step_avg:95.97ms
step:1001/1750 train_time:96003ms step_avg:95.91ms
step:1002/1750 train_time:96087ms step_avg:95.89ms
step:1003/1750 train_time:96187ms step_avg:95.90ms
step:1004/1750 train_time:96286ms step_avg:95.90ms
step:1005/1750 train_time:96385ms step_avg:95.91ms
step:1006/1750 train_time:96483ms step_avg:95.91ms
step:1007/1750 train_time:96582ms step_avg:95.91ms
step:1008/1750 train_time:96680ms step_avg:95.91ms
step:1009/1750 train_time:96778ms step_avg:95.91ms
step:1010/1750 train_time:96877ms step_avg:95.92ms
step:1011/1750 train_time:96975ms step_avg:95.92ms
step:1012/1750 train_time:97077ms step_avg:95.93ms
step:1013/1750 train_time:97177ms step_avg:95.93ms
step:1014/1750 train_time:97276ms step_avg:95.93ms
step:1015/1750 train_time:97375ms step_avg:95.94ms
step:1016/1750 train_time:97474ms step_avg:95.94ms
step:1017/1750 train_time:97573ms step_avg:95.94ms
step:1018/1750 train_time:97672ms step_avg:95.94ms
step:1019/1750 train_time:97771ms step_avg:95.95ms
step:1020/1750 train_time:97872ms step_avg:95.95ms
step:1021/1750 train_time:97972ms step_avg:95.96ms
step:1022/1750 train_time:98072ms step_avg:95.96ms
step:1023/1750 train_time:98172ms step_avg:95.97ms
step:1024/1750 train_time:98272ms step_avg:95.97ms
step:1025/1750 train_time:98372ms step_avg:95.97ms
step:1026/1750 train_time:98472ms step_avg:95.98ms
step:1027/1750 train_time:98572ms step_avg:95.98ms
step:1028/1750 train_time:98672ms step_avg:95.98ms
step:1029/1750 train_time:98772ms step_avg:95.99ms
step:1030/1750 train_time:98871ms step_avg:95.99ms
step:1031/1750 train_time:98971ms step_avg:95.99ms
step:1032/1750 train_time:99071ms step_avg:96.00ms
step:1033/1750 train_time:99170ms step_avg:96.00ms
step:1034/1750 train_time:99271ms step_avg:96.01ms
step:1035/1750 train_time:99370ms step_avg:96.01ms
step:1036/1750 train_time:99470ms step_avg:96.01ms
step:1037/1750 train_time:99571ms step_avg:96.02ms
step:1038/1750 train_time:99671ms step_avg:96.02ms
step:1039/1750 train_time:99771ms step_avg:96.03ms
step:1040/1750 train_time:99869ms step_avg:96.03ms
step:1041/1750 train_time:99970ms step_avg:96.03ms
step:1042/1750 train_time:100070ms step_avg:96.04ms
step:1043/1750 train_time:100170ms step_avg:96.04ms
step:1044/1750 train_time:100270ms step_avg:96.04ms
step:1045/1750 train_time:100370ms step_avg:96.05ms
step:1046/1750 train_time:100471ms step_avg:96.05ms
step:1047/1750 train_time:100570ms step_avg:96.06ms
step:1048/1750 train_time:100670ms step_avg:96.06ms
step:1049/1750 train_time:100771ms step_avg:96.06ms
step:1050/1750 train_time:100870ms step_avg:96.07ms
step:1051/1750 train_time:100970ms step_avg:96.07ms
step:1052/1750 train_time:101070ms step_avg:96.07ms
step:1053/1750 train_time:101171ms step_avg:96.08ms
step:1054/1750 train_time:101270ms step_avg:96.08ms
step:1055/1750 train_time:101371ms step_avg:96.09ms
step:1056/1750 train_time:101472ms step_avg:96.09ms
step:1057/1750 train_time:101572ms step_avg:96.09ms
step:1058/1750 train_time:101671ms step_avg:96.10ms
step:1059/1750 train_time:101772ms step_avg:96.10ms
step:1060/1750 train_time:101871ms step_avg:96.10ms
step:1061/1750 train_time:101971ms step_avg:96.11ms
step:1062/1750 train_time:102071ms step_avg:96.11ms
step:1063/1750 train_time:102172ms step_avg:96.12ms
step:1064/1750 train_time:102272ms step_avg:96.12ms
step:1065/1750 train_time:102371ms step_avg:96.12ms
step:1066/1750 train_time:102472ms step_avg:96.13ms
step:1067/1750 train_time:102572ms step_avg:96.13ms
step:1068/1750 train_time:102673ms step_avg:96.14ms
step:1069/1750 train_time:102774ms step_avg:96.14ms
step:1070/1750 train_time:102873ms step_avg:96.14ms
step:1071/1750 train_time:102973ms step_avg:96.15ms
step:1072/1750 train_time:103075ms step_avg:96.15ms
step:1073/1750 train_time:103174ms step_avg:96.16ms
step:1074/1750 train_time:103274ms step_avg:96.16ms
step:1075/1750 train_time:103374ms step_avg:96.16ms
step:1076/1750 train_time:103473ms step_avg:96.16ms
step:1077/1750 train_time:103573ms step_avg:96.17ms
step:1078/1750 train_time:103674ms step_avg:96.17ms
step:1079/1750 train_time:103774ms step_avg:96.18ms
step:1080/1750 train_time:103874ms step_avg:96.18ms
step:1081/1750 train_time:103973ms step_avg:96.18ms
step:1082/1750 train_time:104073ms step_avg:96.19ms
step:1083/1750 train_time:104172ms step_avg:96.19ms
step:1084/1750 train_time:104273ms step_avg:96.19ms
step:1085/1750 train_time:104373ms step_avg:96.20ms
step:1086/1750 train_time:104473ms step_avg:96.20ms
step:1087/1750 train_time:104572ms step_avg:96.20ms
step:1088/1750 train_time:104672ms step_avg:96.21ms
step:1089/1750 train_time:104771ms step_avg:96.21ms
step:1090/1750 train_time:104871ms step_avg:96.21ms
step:1091/1750 train_time:104971ms step_avg:96.22ms
step:1092/1750 train_time:105071ms step_avg:96.22ms
step:1093/1750 train_time:105172ms step_avg:96.22ms
step:1094/1750 train_time:105272ms step_avg:96.23ms
step:1095/1750 train_time:105372ms step_avg:96.23ms
step:1096/1750 train_time:105473ms step_avg:96.23ms
step:1097/1750 train_time:105572ms step_avg:96.24ms
step:1098/1750 train_time:105672ms step_avg:96.24ms
step:1099/1750 train_time:105772ms step_avg:96.24ms
step:1100/1750 train_time:105872ms step_avg:96.25ms
step:1101/1750 train_time:105972ms step_avg:96.25ms
step:1102/1750 train_time:106071ms step_avg:96.25ms
step:1103/1750 train_time:106172ms step_avg:96.26ms
step:1104/1750 train_time:106271ms step_avg:96.26ms
step:1105/1750 train_time:106371ms step_avg:96.26ms
step:1106/1750 train_time:106471ms step_avg:96.27ms
step:1107/1750 train_time:106571ms step_avg:96.27ms
step:1108/1750 train_time:106671ms step_avg:96.27ms
step:1109/1750 train_time:106771ms step_avg:96.28ms
step:1110/1750 train_time:106870ms step_avg:96.28ms
step:1111/1750 train_time:106970ms step_avg:96.28ms
step:1112/1750 train_time:107070ms step_avg:96.29ms
step:1113/1750 train_time:107171ms step_avg:96.29ms
step:1114/1750 train_time:107271ms step_avg:96.29ms
step:1115/1750 train_time:107371ms step_avg:96.30ms
step:1116/1750 train_time:107471ms step_avg:96.30ms
step:1117/1750 train_time:107571ms step_avg:96.30ms
step:1118/1750 train_time:107672ms step_avg:96.31ms
step:1119/1750 train_time:107771ms step_avg:96.31ms
step:1120/1750 train_time:107872ms step_avg:96.31ms
step:1121/1750 train_time:107973ms step_avg:96.32ms
step:1122/1750 train_time:108073ms step_avg:96.32ms
step:1123/1750 train_time:108173ms step_avg:96.32ms
step:1124/1750 train_time:108273ms step_avg:96.33ms
step:1125/1750 train_time:108373ms step_avg:96.33ms
step:1125/1750 val_loss:3.4517 train_time:108465ms step_avg:96.41ms
step:1126/1750 train_time:108502ms step_avg:96.36ms
step:1127/1750 train_time:108583ms step_avg:96.35ms
step:1128/1750 train_time:108685ms step_avg:96.35ms
step:1129/1750 train_time:108784ms step_avg:96.35ms
step:1130/1750 train_time:108883ms step_avg:96.36ms
step:1131/1750 train_time:108981ms step_avg:96.36ms
step:1132/1750 train_time:109080ms step_avg:96.36ms
step:1133/1750 train_time:109179ms step_avg:96.36ms
step:1134/1750 train_time:109278ms step_avg:96.36ms
step:1135/1750 train_time:109376ms step_avg:96.37ms
step:1136/1750 train_time:109475ms step_avg:96.37ms
step:1137/1750 train_time:109575ms step_avg:96.37ms
step:1138/1750 train_time:109674ms step_avg:96.37ms
step:1139/1750 train_time:109773ms step_avg:96.38ms
step:1140/1750 train_time:109872ms step_avg:96.38ms
step:1141/1750 train_time:109971ms step_avg:96.38ms
step:1142/1750 train_time:110070ms step_avg:96.38ms
step:1143/1750 train_time:110170ms step_avg:96.39ms
step:1144/1750 train_time:110270ms step_avg:96.39ms
step:1145/1750 train_time:110370ms step_avg:96.39ms
step:1146/1750 train_time:110469ms step_avg:96.40ms
step:1147/1750 train_time:110570ms step_avg:96.40ms
step:1148/1750 train_time:110671ms step_avg:96.40ms
step:1149/1750 train_time:110771ms step_avg:96.41ms
step:1150/1750 train_time:110870ms step_avg:96.41ms
step:1151/1750 train_time:110969ms step_avg:96.41ms
step:1152/1750 train_time:111068ms step_avg:96.41ms
step:1153/1750 train_time:111168ms step_avg:96.42ms
step:1154/1750 train_time:111267ms step_avg:96.42ms
step:1155/1750 train_time:111367ms step_avg:96.42ms
step:1156/1750 train_time:111468ms step_avg:96.43ms
step:1157/1750 train_time:111567ms step_avg:96.43ms
step:1158/1750 train_time:111668ms step_avg:96.43ms
step:1159/1750 train_time:111768ms step_avg:96.44ms
step:1160/1750 train_time:111869ms step_avg:96.44ms
step:1161/1750 train_time:111968ms step_avg:96.44ms
step:1162/1750 train_time:112068ms step_avg:96.44ms
step:1163/1750 train_time:112167ms step_avg:96.45ms
step:1164/1750 train_time:112267ms step_avg:96.45ms
step:1165/1750 train_time:112367ms step_avg:96.45ms
step:1166/1750 train_time:112467ms step_avg:96.46ms
step:1167/1750 train_time:112567ms step_avg:96.46ms
step:1168/1750 train_time:112667ms step_avg:96.46ms
step:1169/1750 train_time:112769ms step_avg:96.47ms
step:1170/1750 train_time:112869ms step_avg:96.47ms
step:1171/1750 train_time:112970ms step_avg:96.47ms
step:1172/1750 train_time:113071ms step_avg:96.48ms
step:1173/1750 train_time:113171ms step_avg:96.48ms
step:1174/1750 train_time:113272ms step_avg:96.48ms
step:1175/1750 train_time:113373ms step_avg:96.49ms
step:1176/1750 train_time:113473ms step_avg:96.49ms
step:1177/1750 train_time:113573ms step_avg:96.49ms
step:1178/1750 train_time:113673ms step_avg:96.50ms
step:1179/1750 train_time:113774ms step_avg:96.50ms
step:1180/1750 train_time:113873ms step_avg:96.50ms
step:1181/1750 train_time:113975ms step_avg:96.51ms
step:1182/1750 train_time:114075ms step_avg:96.51ms
step:1183/1750 train_time:114174ms step_avg:96.51ms
step:1184/1750 train_time:114277ms step_avg:96.52ms
step:1185/1750 train_time:114377ms step_avg:96.52ms
step:1186/1750 train_time:114477ms step_avg:96.52ms
step:1187/1750 train_time:114577ms step_avg:96.53ms
step:1188/1750 train_time:114677ms step_avg:96.53ms
step:1189/1750 train_time:114776ms step_avg:96.53ms
step:1190/1750 train_time:114875ms step_avg:96.53ms
step:1191/1750 train_time:114975ms step_avg:96.54ms
step:1192/1750 train_time:115075ms step_avg:96.54ms
step:1193/1750 train_time:115175ms step_avg:96.54ms
step:1194/1750 train_time:115275ms step_avg:96.55ms
step:1195/1750 train_time:115375ms step_avg:96.55ms
step:1196/1750 train_time:115476ms step_avg:96.55ms
step:1197/1750 train_time:115576ms step_avg:96.55ms
step:1198/1750 train_time:115675ms step_avg:96.56ms
step:1199/1750 train_time:115775ms step_avg:96.56ms
step:1200/1750 train_time:115874ms step_avg:96.56ms
step:1201/1750 train_time:115974ms step_avg:96.56ms
step:1202/1750 train_time:116074ms step_avg:96.57ms
step:1203/1750 train_time:116174ms step_avg:96.57ms
step:1204/1750 train_time:116274ms step_avg:96.57ms
step:1205/1750 train_time:116374ms step_avg:96.58ms
step:1206/1750 train_time:116475ms step_avg:96.58ms
step:1207/1750 train_time:116574ms step_avg:96.58ms
step:1208/1750 train_time:116674ms step_avg:96.58ms
step:1209/1750 train_time:116774ms step_avg:96.59ms
step:1210/1750 train_time:116873ms step_avg:96.59ms
step:1211/1750 train_time:116973ms step_avg:96.59ms
step:1212/1750 train_time:117073ms step_avg:96.60ms
step:1213/1750 train_time:117173ms step_avg:96.60ms
step:1214/1750 train_time:117273ms step_avg:96.60ms
step:1215/1750 train_time:117373ms step_avg:96.60ms
step:1216/1750 train_time:117473ms step_avg:96.61ms
step:1217/1750 train_time:117574ms step_avg:96.61ms
step:1218/1750 train_time:117674ms step_avg:96.61ms
step:1219/1750 train_time:117774ms step_avg:96.62ms
step:1220/1750 train_time:117875ms step_avg:96.62ms
step:1221/1750 train_time:117975ms step_avg:96.62ms
step:1222/1750 train_time:118076ms step_avg:96.62ms
step:1223/1750 train_time:118175ms step_avg:96.63ms
step:1224/1750 train_time:118275ms step_avg:96.63ms
step:1225/1750 train_time:118376ms step_avg:96.63ms
step:1226/1750 train_time:118476ms step_avg:96.64ms
step:1227/1750 train_time:118575ms step_avg:96.64ms
step:1228/1750 train_time:118675ms step_avg:96.64ms
step:1229/1750 train_time:118775ms step_avg:96.64ms
step:1230/1750 train_time:118874ms step_avg:96.65ms
step:1231/1750 train_time:118974ms step_avg:96.65ms
step:1232/1750 train_time:119075ms step_avg:96.65ms
step:1233/1750 train_time:119175ms step_avg:96.65ms
step:1234/1750 train_time:119275ms step_avg:96.66ms
step:1235/1750 train_time:119375ms step_avg:96.66ms
step:1236/1750 train_time:119475ms step_avg:96.66ms
step:1237/1750 train_time:119575ms step_avg:96.67ms
step:1238/1750 train_time:119675ms step_avg:96.67ms
step:1239/1750 train_time:119775ms step_avg:96.67ms
step:1240/1750 train_time:119876ms step_avg:96.67ms
step:1241/1750 train_time:119977ms step_avg:96.68ms
step:1242/1750 train_time:120078ms step_avg:96.68ms
step:1243/1750 train_time:120178ms step_avg:96.68ms
step:1244/1750 train_time:120277ms step_avg:96.69ms
step:1245/1750 train_time:120377ms step_avg:96.69ms
step:1246/1750 train_time:120477ms step_avg:96.69ms
step:1247/1750 train_time:120576ms step_avg:96.69ms
step:1248/1750 train_time:120676ms step_avg:96.70ms
step:1249/1750 train_time:120775ms step_avg:96.70ms
step:1250/1750 train_time:120875ms step_avg:96.70ms
step:1250/1750 val_loss:3.4067 train_time:120968ms step_avg:96.77ms
step:1251/1750 train_time:121005ms step_avg:96.73ms
step:1252/1750 train_time:121090ms step_avg:96.72ms
step:1253/1750 train_time:121191ms step_avg:96.72ms
step:1254/1750 train_time:121292ms step_avg:96.72ms
step:1255/1750 train_time:121392ms step_avg:96.73ms
step:1256/1750 train_time:121493ms step_avg:96.73ms
step:1257/1750 train_time:121593ms step_avg:96.73ms
step:1258/1750 train_time:121693ms step_avg:96.74ms
step:1259/1750 train_time:121793ms step_avg:96.74ms
step:1260/1750 train_time:121892ms step_avg:96.74ms
step:1261/1750 train_time:121996ms step_avg:96.75ms
step:1262/1750 train_time:122098ms step_avg:96.75ms
step:1263/1750 train_time:122200ms step_avg:96.75ms
step:1264/1750 train_time:122299ms step_avg:96.76ms
step:1265/1750 train_time:122399ms step_avg:96.76ms
step:1266/1750 train_time:122497ms step_avg:96.76ms
step:1267/1750 train_time:122597ms step_avg:96.76ms
step:1268/1750 train_time:122695ms step_avg:96.76ms
step:1269/1750 train_time:122795ms step_avg:96.77ms
step:1270/1750 train_time:122895ms step_avg:96.77ms
step:1271/1750 train_time:122996ms step_avg:96.77ms
step:1272/1750 train_time:123097ms step_avg:96.77ms
step:1273/1750 train_time:123198ms step_avg:96.78ms
step:1274/1750 train_time:123299ms step_avg:96.78ms
step:1275/1750 train_time:123399ms step_avg:96.78ms
step:1276/1750 train_time:123499ms step_avg:96.79ms
step:1277/1750 train_time:123599ms step_avg:96.79ms
step:1278/1750 train_time:123698ms step_avg:96.79ms
step:1279/1750 train_time:123798ms step_avg:96.79ms
step:1280/1750 train_time:123897ms step_avg:96.79ms
step:1281/1750 train_time:123997ms step_avg:96.80ms
step:1282/1750 train_time:124097ms step_avg:96.80ms
step:1283/1750 train_time:124197ms step_avg:96.80ms
step:1284/1750 train_time:124298ms step_avg:96.81ms
step:1285/1750 train_time:124398ms step_avg:96.81ms
step:1286/1750 train_time:124499ms step_avg:96.81ms
step:1287/1750 train_time:124598ms step_avg:96.81ms
step:1288/1750 train_time:124698ms step_avg:96.82ms
step:1289/1750 train_time:124798ms step_avg:96.82ms
step:1290/1750 train_time:124898ms step_avg:96.82ms
step:1291/1750 train_time:124998ms step_avg:96.82ms
step:1292/1750 train_time:125098ms step_avg:96.83ms
step:1293/1750 train_time:125198ms step_avg:96.83ms
step:1294/1750 train_time:125298ms step_avg:96.83ms
step:1295/1750 train_time:125399ms step_avg:96.83ms
step:1296/1750 train_time:125499ms step_avg:96.84ms
step:1297/1750 train_time:125598ms step_avg:96.84ms
step:1298/1750 train_time:125697ms step_avg:96.84ms
step:1299/1750 train_time:125798ms step_avg:96.84ms
step:1300/1750 train_time:125898ms step_avg:96.84ms
step:1301/1750 train_time:125997ms step_avg:96.85ms
step:1302/1750 train_time:126098ms step_avg:96.85ms
step:1303/1750 train_time:126198ms step_avg:96.85ms
step:1304/1750 train_time:126298ms step_avg:96.85ms
step:1305/1750 train_time:126399ms step_avg:96.86ms
step:1306/1750 train_time:126498ms step_avg:96.86ms
step:1307/1750 train_time:126598ms step_avg:96.86ms
step:1308/1750 train_time:126697ms step_avg:96.86ms
step:1309/1750 train_time:126798ms step_avg:96.87ms
step:1310/1750 train_time:126899ms step_avg:96.87ms
step:1311/1750 train_time:126999ms step_avg:96.87ms
step:1312/1750 train_time:127099ms step_avg:96.87ms
step:1313/1750 train_time:127200ms step_avg:96.88ms
step:1314/1750 train_time:127301ms step_avg:96.88ms
step:1315/1750 train_time:127401ms step_avg:96.88ms
step:1316/1750 train_time:127500ms step_avg:96.88ms
step:1317/1750 train_time:127600ms step_avg:96.89ms
step:1318/1750 train_time:127700ms step_avg:96.89ms
step:1319/1750 train_time:127799ms step_avg:96.89ms
step:1320/1750 train_time:127900ms step_avg:96.89ms
step:1321/1750 train_time:128001ms step_avg:96.90ms
step:1322/1750 train_time:128101ms step_avg:96.90ms
step:1323/1750 train_time:128201ms step_avg:96.90ms
step:1324/1750 train_time:128301ms step_avg:96.90ms
step:1325/1750 train_time:128403ms step_avg:96.91ms
step:1326/1750 train_time:128502ms step_avg:96.91ms
step:1327/1750 train_time:128602ms step_avg:96.91ms
step:1328/1750 train_time:128702ms step_avg:96.91ms
step:1329/1750 train_time:128803ms step_avg:96.92ms
step:1330/1750 train_time:128903ms step_avg:96.92ms
step:1331/1750 train_time:129004ms step_avg:96.92ms
step:1332/1750 train_time:129105ms step_avg:96.93ms
step:1333/1750 train_time:129207ms step_avg:96.93ms
step:1334/1750 train_time:129308ms step_avg:96.93ms
step:1335/1750 train_time:129409ms step_avg:96.94ms
step:1336/1750 train_time:129511ms step_avg:96.94ms
step:1337/1750 train_time:129612ms step_avg:96.94ms
step:1338/1750 train_time:129713ms step_avg:96.95ms
step:1339/1750 train_time:129814ms step_avg:96.95ms
step:1340/1750 train_time:129914ms step_avg:96.95ms
step:1341/1750 train_time:130015ms step_avg:96.95ms
step:1342/1750 train_time:130116ms step_avg:96.96ms
step:1343/1750 train_time:130217ms step_avg:96.96ms
step:1344/1750 train_time:130318ms step_avg:96.96ms
step:1345/1750 train_time:130418ms step_avg:96.97ms
step:1346/1750 train_time:130519ms step_avg:96.97ms
step:1347/1750 train_time:130619ms step_avg:96.97ms
step:1348/1750 train_time:130718ms step_avg:96.97ms
step:1349/1750 train_time:130818ms step_avg:96.97ms
step:1350/1750 train_time:130918ms step_avg:96.98ms
step:1351/1750 train_time:131018ms step_avg:96.98ms
step:1352/1750 train_time:131118ms step_avg:96.98ms
step:1353/1750 train_time:131218ms step_avg:96.98ms
step:1354/1750 train_time:131319ms step_avg:96.99ms
step:1355/1750 train_time:131419ms step_avg:96.99ms
step:1356/1750 train_time:131519ms step_avg:96.99ms
step:1357/1750 train_time:131619ms step_avg:96.99ms
step:1358/1750 train_time:131719ms step_avg:96.99ms
step:1359/1750 train_time:131818ms step_avg:97.00ms
step:1360/1750 train_time:131919ms step_avg:97.00ms
step:1361/1750 train_time:132019ms step_avg:97.00ms
step:1362/1750 train_time:132118ms step_avg:97.00ms
step:1363/1750 train_time:132218ms step_avg:97.01ms
step:1364/1750 train_time:132318ms step_avg:97.01ms
step:1365/1750 train_time:132419ms step_avg:97.01ms
step:1366/1750 train_time:132519ms step_avg:97.01ms
step:1367/1750 train_time:132619ms step_avg:97.01ms
step:1368/1750 train_time:132719ms step_avg:97.02ms
step:1369/1750 train_time:132819ms step_avg:97.02ms
step:1370/1750 train_time:132919ms step_avg:97.02ms
step:1371/1750 train_time:133019ms step_avg:97.02ms
step:1372/1750 train_time:133119ms step_avg:97.03ms
step:1373/1750 train_time:133219ms step_avg:97.03ms
step:1374/1750 train_time:133318ms step_avg:97.03ms
step:1375/1750 train_time:133418ms step_avg:97.03ms
step:1375/1750 val_loss:3.3669 train_time:133511ms step_avg:97.10ms
step:1376/1750 train_time:133547ms step_avg:97.05ms
step:1377/1750 train_time:133631ms step_avg:97.04ms
step:1378/1750 train_time:133733ms step_avg:97.05ms
step:1379/1750 train_time:133833ms step_avg:97.05ms
step:1380/1750 train_time:133934ms step_avg:97.05ms
step:1381/1750 train_time:134034ms step_avg:97.06ms
step:1382/1750 train_time:134135ms step_avg:97.06ms
step:1383/1750 train_time:134234ms step_avg:97.06ms
step:1384/1750 train_time:134335ms step_avg:97.06ms
step:1385/1750 train_time:134436ms step_avg:97.07ms
step:1386/1750 train_time:134539ms step_avg:97.07ms
step:1387/1750 train_time:134639ms step_avg:97.07ms
step:1388/1750 train_time:134739ms step_avg:97.07ms
step:1389/1750 train_time:134839ms step_avg:97.08ms
step:1390/1750 train_time:134938ms step_avg:97.08ms
step:1391/1750 train_time:135038ms step_avg:97.08ms
step:1392/1750 train_time:135138ms step_avg:97.08ms
step:1393/1750 train_time:135238ms step_avg:97.08ms
step:1394/1750 train_time:135337ms step_avg:97.09ms
step:1395/1750 train_time:135438ms step_avg:97.09ms
step:1396/1750 train_time:135538ms step_avg:97.09ms
step:1397/1750 train_time:135639ms step_avg:97.09ms
step:1398/1750 train_time:135739ms step_avg:97.10ms
step:1399/1750 train_time:135839ms step_avg:97.10ms
step:1400/1750 train_time:135940ms step_avg:97.10ms
step:1401/1750 train_time:136040ms step_avg:97.10ms
step:1402/1750 train_time:136140ms step_avg:97.10ms
step:1403/1750 train_time:136239ms step_avg:97.11ms
step:1404/1750 train_time:136340ms step_avg:97.11ms
step:1405/1750 train_time:136440ms step_avg:97.11ms
step:1406/1750 train_time:136540ms step_avg:97.11ms
step:1407/1750 train_time:136641ms step_avg:97.11ms
step:1408/1750 train_time:136741ms step_avg:97.12ms
step:1409/1750 train_time:136842ms step_avg:97.12ms
step:1410/1750 train_time:136942ms step_avg:97.12ms
step:1411/1750 train_time:137043ms step_avg:97.12ms
step:1412/1750 train_time:137143ms step_avg:97.13ms
step:1413/1750 train_time:137245ms step_avg:97.13ms
step:1414/1750 train_time:137347ms step_avg:97.13ms
step:1415/1750 train_time:137450ms step_avg:97.14ms
step:1416/1750 train_time:137551ms step_avg:97.14ms
step:1417/1750 train_time:137651ms step_avg:97.14ms
step:1418/1750 train_time:137752ms step_avg:97.15ms
step:1419/1750 train_time:137853ms step_avg:97.15ms
step:1420/1750 train_time:137955ms step_avg:97.15ms
step:1421/1750 train_time:138056ms step_avg:97.15ms
step:1422/1750 train_time:138156ms step_avg:97.16ms
step:1423/1750 train_time:138256ms step_avg:97.16ms
step:1424/1750 train_time:138356ms step_avg:97.16ms
step:1425/1750 train_time:138457ms step_avg:97.16ms
step:1426/1750 train_time:138557ms step_avg:97.16ms
step:1427/1750 train_time:138656ms step_avg:97.17ms
step:1428/1750 train_time:138759ms step_avg:97.17ms
step:1429/1750 train_time:138859ms step_avg:97.17ms
step:1430/1750 train_time:138961ms step_avg:97.18ms
step:1431/1750 train_time:139062ms step_avg:97.18ms
step:1432/1750 train_time:139163ms step_avg:97.18ms
step:1433/1750 train_time:139265ms step_avg:97.18ms
step:1434/1750 train_time:139366ms step_avg:97.19ms
step:1435/1750 train_time:139470ms step_avg:97.19ms
step:1436/1750 train_time:139573ms step_avg:97.20ms
step:1437/1750 train_time:139675ms step_avg:97.20ms
step:1438/1750 train_time:139777ms step_avg:97.20ms
step:1439/1750 train_time:139878ms step_avg:97.20ms
step:1440/1750 train_time:139982ms step_avg:97.21ms
step:1441/1750 train_time:140085ms step_avg:97.21ms
step:1442/1750 train_time:140185ms step_avg:97.22ms
step:1443/1750 train_time:140287ms step_avg:97.22ms
step:1444/1750 train_time:140389ms step_avg:97.22ms
step:1445/1750 train_time:140491ms step_avg:97.23ms
step:1446/1750 train_time:140594ms step_avg:97.23ms
step:1447/1750 train_time:140697ms step_avg:97.23ms
step:1448/1750 train_time:140799ms step_avg:97.24ms
step:1449/1750 train_time:140899ms step_avg:97.24ms
step:1450/1750 train_time:141000ms step_avg:97.24ms
step:1451/1750 train_time:141101ms step_avg:97.24ms
step:1452/1750 train_time:141203ms step_avg:97.25ms
step:1453/1750 train_time:141306ms step_avg:97.25ms
step:1454/1750 train_time:141409ms step_avg:97.26ms
step:1455/1750 train_time:141511ms step_avg:97.26ms
step:1456/1750 train_time:141612ms step_avg:97.26ms
step:1457/1750 train_time:141715ms step_avg:97.26ms
step:1458/1750 train_time:141816ms step_avg:97.27ms
step:1459/1750 train_time:141917ms step_avg:97.27ms
step:1460/1750 train_time:142019ms step_avg:97.27ms
step:1461/1750 train_time:142120ms step_avg:97.28ms
step:1462/1750 train_time:142221ms step_avg:97.28ms
step:1463/1750 train_time:142322ms step_avg:97.28ms
step:1464/1750 train_time:142423ms step_avg:97.28ms
step:1465/1750 train_time:142524ms step_avg:97.29ms
step:1466/1750 train_time:142625ms step_avg:97.29ms
step:1467/1750 train_time:142727ms step_avg:97.29ms
step:1468/1750 train_time:142830ms step_avg:97.30ms
step:1469/1750 train_time:142934ms step_avg:97.30ms
step:1470/1750 train_time:143036ms step_avg:97.30ms
step:1471/1750 train_time:143138ms step_avg:97.31ms
step:1472/1750 train_time:143240ms step_avg:97.31ms
step:1473/1750 train_time:143342ms step_avg:97.31ms
step:1474/1750 train_time:143444ms step_avg:97.32ms
step:1475/1750 train_time:143545ms step_avg:97.32ms
step:1476/1750 train_time:143647ms step_avg:97.32ms
step:1477/1750 train_time:143749ms step_avg:97.32ms
step:1478/1750 train_time:143854ms step_avg:97.33ms
step:1479/1750 train_time:143954ms step_avg:97.33ms
step:1480/1750 train_time:144057ms step_avg:97.34ms
step:1481/1750 train_time:144158ms step_avg:97.34ms
step:1482/1750 train_time:144262ms step_avg:97.34ms
step:1483/1750 train_time:144363ms step_avg:97.35ms
step:1484/1750 train_time:144464ms step_avg:97.35ms
step:1485/1750 train_time:144567ms step_avg:97.35ms
step:1486/1750 train_time:144669ms step_avg:97.35ms
step:1487/1750 train_time:144771ms step_avg:97.36ms
step:1488/1750 train_time:144873ms step_avg:97.36ms
step:1489/1750 train_time:144974ms step_avg:97.36ms
step:1490/1750 train_time:145076ms step_avg:97.37ms
step:1491/1750 train_time:145177ms step_avg:97.37ms
step:1492/1750 train_time:145278ms step_avg:97.37ms
step:1493/1750 train_time:145380ms step_avg:97.37ms
step:1494/1750 train_time:145482ms step_avg:97.38ms
step:1495/1750 train_time:145583ms step_avg:97.38ms
step:1496/1750 train_time:145684ms step_avg:97.38ms
step:1497/1750 train_time:145785ms step_avg:97.38ms
step:1498/1750 train_time:145886ms step_avg:97.39ms
step:1499/1750 train_time:145988ms step_avg:97.39ms
step:1500/1750 train_time:146093ms step_avg:97.40ms
step:1500/1750 val_loss:3.3311 train_time:146186ms step_avg:97.46ms
step:1501/1750 train_time:146223ms step_avg:97.42ms
step:1502/1750 train_time:146306ms step_avg:97.41ms
step:1503/1750 train_time:146409ms step_avg:97.41ms
step:1504/1750 train_time:146511ms step_avg:97.41ms
step:1505/1750 train_time:146612ms step_avg:97.42ms
step:1506/1750 train_time:146713ms step_avg:97.42ms
step:1507/1750 train_time:146814ms step_avg:97.42ms
step:1508/1750 train_time:146914ms step_avg:97.42ms
step:1509/1750 train_time:147014ms step_avg:97.43ms
step:1510/1750 train_time:147114ms step_avg:97.43ms
step:1511/1750 train_time:147216ms step_avg:97.43ms
step:1512/1750 train_time:147318ms step_avg:97.43ms
step:1513/1750 train_time:147421ms step_avg:97.44ms
step:1514/1750 train_time:147527ms step_avg:97.44ms
step:1515/1750 train_time:147632ms step_avg:97.45ms
step:1516/1750 train_time:147732ms step_avg:97.45ms
step:1517/1750 train_time:147833ms step_avg:97.45ms
step:1518/1750 train_time:147934ms step_avg:97.45ms
step:1519/1750 train_time:148037ms step_avg:97.46ms
step:1520/1750 train_time:148138ms step_avg:97.46ms
step:1521/1750 train_time:148239ms step_avg:97.46ms
step:1522/1750 train_time:148340ms step_avg:97.46ms
step:1523/1750 train_time:148442ms step_avg:97.47ms
step:1524/1750 train_time:148546ms step_avg:97.47ms
step:1525/1750 train_time:148649ms step_avg:97.47ms
step:1526/1750 train_time:148751ms step_avg:97.48ms
step:1527/1750 train_time:148852ms step_avg:97.48ms
step:1528/1750 train_time:148956ms step_avg:97.48ms
step:1529/1750 train_time:149057ms step_avg:97.49ms
step:1530/1750 train_time:149158ms step_avg:97.49ms
step:1531/1750 train_time:149260ms step_avg:97.49ms
step:1532/1750 train_time:149361ms step_avg:97.49ms
step:1533/1750 train_time:149462ms step_avg:97.50ms
step:1534/1750 train_time:149565ms step_avg:97.50ms
step:1535/1750 train_time:149669ms step_avg:97.50ms
step:1536/1750 train_time:149770ms step_avg:97.51ms
step:1537/1750 train_time:149872ms step_avg:97.51ms
step:1538/1750 train_time:149974ms step_avg:97.51ms
step:1539/1750 train_time:150075ms step_avg:97.51ms
step:1540/1750 train_time:150177ms step_avg:97.52ms
step:1541/1750 train_time:150280ms step_avg:97.52ms
step:1542/1750 train_time:150384ms step_avg:97.53ms
step:1543/1750 train_time:150486ms step_avg:97.53ms
step:1544/1750 train_time:150589ms step_avg:97.53ms
step:1545/1750 train_time:150691ms step_avg:97.53ms
step:1546/1750 train_time:150794ms step_avg:97.54ms
step:1547/1750 train_time:150894ms step_avg:97.54ms
step:1548/1750 train_time:150996ms step_avg:97.54ms
step:1549/1750 train_time:151098ms step_avg:97.55ms
step:1550/1750 train_time:151199ms step_avg:97.55ms
step:1551/1750 train_time:151302ms step_avg:97.55ms
step:1552/1750 train_time:151403ms step_avg:97.55ms
step:1553/1750 train_time:151505ms step_avg:97.56ms
step:1554/1750 train_time:151608ms step_avg:97.56ms
step:1555/1750 train_time:151713ms step_avg:97.56ms
step:1556/1750 train_time:151812ms step_avg:97.57ms
step:1557/1750 train_time:151914ms step_avg:97.57ms
step:1558/1750 train_time:152017ms step_avg:97.57ms
step:1559/1750 train_time:152117ms step_avg:97.57ms
step:1560/1750 train_time:152218ms step_avg:97.58ms
step:1561/1750 train_time:152320ms step_avg:97.58ms
step:1562/1750 train_time:152422ms step_avg:97.58ms
step:1563/1750 train_time:152527ms step_avg:97.59ms
step:1564/1750 train_time:152630ms step_avg:97.59ms
step:1565/1750 train_time:152731ms step_avg:97.59ms
step:1566/1750 train_time:152832ms step_avg:97.59ms
step:1567/1750 train_time:152933ms step_avg:97.60ms
step:1568/1750 train_time:153033ms step_avg:97.60ms
step:1569/1750 train_time:153134ms step_avg:97.60ms
step:1570/1750 train_time:153235ms step_avg:97.60ms
step:1571/1750 train_time:153336ms step_avg:97.60ms
step:1572/1750 train_time:153437ms step_avg:97.61ms
step:1573/1750 train_time:153539ms step_avg:97.61ms
step:1574/1750 train_time:153642ms step_avg:97.61ms
step:1575/1750 train_time:153746ms step_avg:97.62ms
step:1576/1750 train_time:153848ms step_avg:97.62ms
step:1577/1750 train_time:153951ms step_avg:97.62ms
step:1578/1750 train_time:154052ms step_avg:97.62ms
step:1579/1750 train_time:154153ms step_avg:97.63ms
step:1580/1750 train_time:154253ms step_avg:97.63ms
step:1581/1750 train_time:154355ms step_avg:97.63ms
step:1582/1750 train_time:154456ms step_avg:97.63ms
step:1583/1750 train_time:154559ms step_avg:97.64ms
step:1584/1750 train_time:154664ms step_avg:97.64ms
step:1585/1750 train_time:154765ms step_avg:97.64ms
step:1586/1750 train_time:154868ms step_avg:97.65ms
step:1587/1750 train_time:154970ms step_avg:97.65ms
step:1588/1750 train_time:155071ms step_avg:97.65ms
step:1589/1750 train_time:155172ms step_avg:97.65ms
step:1590/1750 train_time:155273ms step_avg:97.66ms
step:1591/1750 train_time:155374ms step_avg:97.66ms
step:1592/1750 train_time:155475ms step_avg:97.66ms
step:1593/1750 train_time:155575ms step_avg:97.66ms
step:1594/1750 train_time:155680ms step_avg:97.67ms
step:1595/1750 train_time:155782ms step_avg:97.67ms
step:1596/1750 train_time:155885ms step_avg:97.67ms
step:1597/1750 train_time:155987ms step_avg:97.68ms
step:1598/1750 train_time:156090ms step_avg:97.68ms
step:1599/1750 train_time:156191ms step_avg:97.68ms
step:1600/1750 train_time:156293ms step_avg:97.68ms
step:1601/1750 train_time:156393ms step_avg:97.68ms
step:1602/1750 train_time:156493ms step_avg:97.69ms
step:1603/1750 train_time:156595ms step_avg:97.69ms
step:1604/1750 train_time:156696ms step_avg:97.69ms
step:1605/1750 train_time:156798ms step_avg:97.69ms
step:1606/1750 train_time:156900ms step_avg:97.70ms
step:1607/1750 train_time:157003ms step_avg:97.70ms
step:1608/1750 train_time:157105ms step_avg:97.70ms
step:1609/1750 train_time:157207ms step_avg:97.70ms
step:1610/1750 train_time:157310ms step_avg:97.71ms
step:1611/1750 train_time:157412ms step_avg:97.71ms
step:1612/1750 train_time:157513ms step_avg:97.71ms
step:1613/1750 train_time:157614ms step_avg:97.71ms
step:1614/1750 train_time:157714ms step_avg:97.72ms
step:1615/1750 train_time:157815ms step_avg:97.72ms
step:1616/1750 train_time:157916ms step_avg:97.72ms
step:1617/1750 train_time:158017ms step_avg:97.72ms
step:1618/1750 train_time:158121ms step_avg:97.73ms
step:1619/1750 train_time:158224ms step_avg:97.73ms
step:1620/1750 train_time:158327ms step_avg:97.73ms
step:1621/1750 train_time:158429ms step_avg:97.74ms
step:1622/1750 train_time:158530ms step_avg:97.74ms
step:1623/1750 train_time:158632ms step_avg:97.74ms
step:1624/1750 train_time:158735ms step_avg:97.74ms
step:1625/1750 train_time:158838ms step_avg:97.75ms
step:1625/1750 val_loss:3.3005 train_time:158931ms step_avg:97.80ms
step:1626/1750 train_time:158968ms step_avg:97.77ms
step:1627/1750 train_time:159051ms step_avg:97.76ms
step:1628/1750 train_time:159153ms step_avg:97.76ms
step:1629/1750 train_time:159255ms step_avg:97.76ms
step:1630/1750 train_time:159357ms step_avg:97.77ms
step:1631/1750 train_time:159458ms step_avg:97.77ms
step:1632/1750 train_time:159558ms step_avg:97.77ms
step:1633/1750 train_time:159660ms step_avg:97.77ms
step:1634/1750 train_time:159761ms step_avg:97.77ms
step:1635/1750 train_time:159861ms step_avg:97.77ms
step:1636/1750 train_time:159964ms step_avg:97.78ms
step:1637/1750 train_time:160066ms step_avg:97.78ms
step:1638/1750 train_time:160167ms step_avg:97.78ms
step:1639/1750 train_time:160270ms step_avg:97.79ms
step:1640/1750 train_time:160372ms step_avg:97.79ms
step:1641/1750 train_time:160473ms step_avg:97.79ms
step:1642/1750 train_time:160575ms step_avg:97.79ms
step:1643/1750 train_time:160677ms step_avg:97.80ms
step:1644/1750 train_time:160778ms step_avg:97.80ms
step:1645/1750 train_time:160881ms step_avg:97.80ms
step:1646/1750 train_time:160981ms step_avg:97.80ms
step:1647/1750 train_time:161083ms step_avg:97.80ms
step:1648/1750 train_time:161186ms step_avg:97.81ms
step:1649/1750 train_time:161286ms step_avg:97.81ms
step:1650/1750 train_time:161389ms step_avg:97.81ms
step:1651/1750 train_time:161490ms step_avg:97.81ms
step:1652/1750 train_time:161592ms step_avg:97.82ms
step:1653/1750 train_time:161695ms step_avg:97.82ms
step:1654/1750 train_time:161797ms step_avg:97.82ms
step:1655/1750 train_time:161898ms step_avg:97.82ms
step:1656/1750 train_time:162000ms step_avg:97.83ms
step:1657/1750 train_time:162101ms step_avg:97.83ms
step:1658/1750 train_time:162202ms step_avg:97.83ms
step:1659/1750 train_time:162307ms step_avg:97.83ms
step:1660/1750 train_time:162409ms step_avg:97.84ms
step:1661/1750 train_time:162513ms step_avg:97.84ms
step:1662/1750 train_time:162617ms step_avg:97.84ms
step:1663/1750 train_time:162719ms step_avg:97.85ms
step:1664/1750 train_time:162820ms step_avg:97.85ms
step:1665/1750 train_time:162924ms step_avg:97.85ms
step:1666/1750 train_time:163026ms step_avg:97.85ms
step:1667/1750 train_time:163127ms step_avg:97.86ms
step:1668/1750 train_time:163229ms step_avg:97.86ms
step:1669/1750 train_time:163331ms step_avg:97.86ms
step:1670/1750 train_time:163434ms step_avg:97.86ms
step:1671/1750 train_time:163535ms step_avg:97.87ms
step:1672/1750 train_time:163638ms step_avg:97.87ms
step:1673/1750 train_time:163739ms step_avg:97.87ms
step:1674/1750 train_time:163840ms step_avg:97.87ms
step:1675/1750 train_time:163941ms step_avg:97.88ms
step:1676/1750 train_time:164042ms step_avg:97.88ms
step:1677/1750 train_time:164143ms step_avg:97.88ms
step:1678/1750 train_time:164245ms step_avg:97.88ms
step:1679/1750 train_time:164348ms step_avg:97.88ms
step:1680/1750 train_time:164450ms step_avg:97.89ms
step:1681/1750 train_time:164552ms step_avg:97.89ms
step:1682/1750 train_time:164656ms step_avg:97.89ms
step:1683/1750 train_time:164756ms step_avg:97.89ms
step:1684/1750 train_time:164858ms step_avg:97.90ms
step:1685/1750 train_time:164960ms step_avg:97.90ms
step:1686/1750 train_time:165062ms step_avg:97.90ms
step:1687/1750 train_time:165163ms step_avg:97.90ms
step:1688/1750 train_time:165265ms step_avg:97.91ms
step:1689/1750 train_time:165368ms step_avg:97.91ms
step:1690/1750 train_time:165470ms step_avg:97.91ms
step:1691/1750 train_time:165573ms step_avg:97.91ms
step:1692/1750 train_time:165675ms step_avg:97.92ms
step:1693/1750 train_time:165777ms step_avg:97.92ms
step:1694/1750 train_time:165882ms step_avg:97.92ms
step:1695/1750 train_time:165985ms step_avg:97.93ms
step:1696/1750 train_time:166088ms step_avg:97.93ms
step:1697/1750 train_time:166193ms step_avg:97.93ms
step:1698/1750 train_time:166295ms step_avg:97.94ms
step:1699/1750 train_time:166397ms step_avg:97.94ms
step:1700/1750 train_time:166499ms step_avg:97.94ms
step:1701/1750 train_time:166600ms step_avg:97.94ms
step:1702/1750 train_time:166704ms step_avg:97.95ms
step:1703/1750 train_time:166806ms step_avg:97.95ms
step:1704/1750 train_time:166908ms step_avg:97.95ms
step:1705/1750 train_time:167010ms step_avg:97.95ms
step:1706/1750 train_time:167112ms step_avg:97.96ms
step:1707/1750 train_time:167216ms step_avg:97.96ms
step:1708/1750 train_time:167320ms step_avg:97.96ms
step:1709/1750 train_time:167422ms step_avg:97.96ms
step:1710/1750 train_time:167525ms step_avg:97.97ms
step:1711/1750 train_time:167630ms step_avg:97.97ms
step:1712/1750 train_time:167733ms step_avg:97.97ms
step:1713/1750 train_time:167838ms step_avg:97.98ms
step:1714/1750 train_time:167939ms step_avg:97.98ms
step:1715/1750 train_time:168044ms step_avg:97.98ms
step:1716/1750 train_time:168146ms step_avg:97.99ms
step:1717/1750 train_time:168250ms step_avg:97.99ms
step:1718/1750 train_time:168355ms step_avg:97.99ms
step:1719/1750 train_time:168458ms step_avg:98.00ms
step:1720/1750 train_time:168560ms step_avg:98.00ms
step:1721/1750 train_time:168662ms step_avg:98.00ms
step:1722/1750 train_time:168764ms step_avg:98.00ms
step:1723/1750 train_time:168866ms step_avg:98.01ms
step:1724/1750 train_time:168970ms step_avg:98.01ms
step:1725/1750 train_time:169075ms step_avg:98.01ms
step:1726/1750 train_time:169178ms step_avg:98.02ms
step:1727/1750 train_time:169280ms step_avg:98.02ms
step:1728/1750 train_time:169384ms step_avg:98.02ms
step:1729/1750 train_time:169485ms step_avg:98.02ms
step:1730/1750 train_time:169587ms step_avg:98.03ms
step:1731/1750 train_time:169692ms step_avg:98.03ms
step:1732/1750 train_time:169795ms step_avg:98.03ms
step:1733/1750 train_time:169897ms step_avg:98.04ms
step:1734/1750 train_time:170000ms step_avg:98.04ms
step:1735/1750 train_time:170102ms step_avg:98.04ms
step:1736/1750 train_time:170205ms step_avg:98.04ms
step:1737/1750 train_time:170308ms step_avg:98.05ms
step:1738/1750 train_time:170411ms step_avg:98.05ms
step:1739/1750 train_time:170513ms step_avg:98.05ms
step:1740/1750 train_time:170616ms step_avg:98.05ms
step:1741/1750 train_time:170723ms step_avg:98.06ms
step:1742/1750 train_time:170826ms step_avg:98.06ms
step:1743/1750 train_time:170929ms step_avg:98.07ms
step:1744/1750 train_time:171032ms step_avg:98.07ms
step:1745/1750 train_time:171134ms step_avg:98.07ms
step:1746/1750 train_time:171237ms step_avg:98.07ms
step:1747/1750 train_time:171339ms step_avg:98.08ms
step:1748/1750 train_time:171442ms step_avg:98.08ms
step:1749/1750 train_time:171544ms step_avg:98.08ms
step:1750/1750 train_time:171648ms step_avg:98.08ms
step:1750/1750 val_loss:3.2771 train_time:171743ms step_avg:98.14ms
peak memory allocated: 33278 MiB reserved: 48994 MiB