import os
import sys
# Capture this script's own source text immediately so it can be logged
# verbatim later (print0(code)) and stored inside checkpoints.
with open(sys.argv[0]) as f:
    code = f.read() # read the code of this file ASAP, for logging
import uuid
import time
import copy
import glob
from dataclasses import dataclass
from functools import lru_cache, partial # Added partial for hook registration
from pathlib import Path
# Must be set before `import torch` so the CUDA caching allocator can grow
# segments instead of fragmenting.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
torch.empty(1, device="cuda", requires_grad=True).backward() # prevents a bug on some systems
from torch import Tensor, nn
import torch.nn.functional as F
import torch.distributed as dist
# use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import BlockMask, flex_attention
#torch._inductor.config.coordinate_descent_tuning = True # we have banned this flag for new records because it causes compilation to take 30min
| @torch.library.custom_op("nanogpt::mm", mutates_args=()) | |
| def mm_op(x: Tensor, w: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor, Tensor]: | |
| @torch.compile | |
| def impl(x: Tensor, w: Tensor): | |
| assert x.is_contiguous() and w.is_contiguous() | |
| x_f8 = x.div(x_s).to(torch.float8_e4m3fn) | |
| w_f8 = w.div(w_s).to(torch.float8_e4m3fn) | |
| out = torch._scaled_mm( | |
| x_f8, | |
| w_f8.T, | |
| out_dtype=torch.bfloat16, | |
| scale_a=x.new_tensor(x_s, dtype=torch.float32), | |
| scale_b=x.new_tensor(w_s, dtype=torch.float32), | |
| use_fast_accum=True, | |
| ) | |
| return out, x_f8, w_f8 | |
| return impl(x, w) | |
| @mm_op.register_fake | |
| def _(x: Tensor, w: Tensor, *_): | |
| assert x.ndim == w.ndim == 2 | |
| assert x.shape[1] == w.shape[1] | |
| assert x.device == w.device | |
| assert x.is_contiguous() and w.is_contiguous() | |
| return x @ w.T, x.to(torch.float8_e4m3fn), w.to(torch.float8_e4m3fn) | |
| @torch.library.custom_op("nanogpt::mm_backward", mutates_args=()) | |
| def mm_backward_op(g: Tensor, x_f8: Tensor, w_f8: Tensor, x_s: float, w_s: float, grad_s: float) -> tuple[Tensor, Tensor]: | |
| @torch.compile | |
| def impl(grad: Tensor, x_f8: Tensor, w_f8: Tensor): | |
| assert grad.is_contiguous() | |
| x_inv_s = grad.new_tensor(x_s, dtype=torch.float32) | |
| w_inv_s = grad.new_tensor(w_s, dtype=torch.float32) | |
| grad_inv_s = grad.new_tensor(grad_s, dtype=torch.float32) | |
| grad_f8 = grad.div(grad_s).to(torch.float8_e5m2) | |
| grad_x = torch._scaled_mm( | |
| grad_f8, | |
| w_f8.T.contiguous().T, | |
| out_dtype=torch.bfloat16, | |
| scale_a=grad_inv_s, | |
| scale_b=w_inv_s, | |
| use_fast_accum=False, | |
| ) | |
| # faster than grad_f8_t @ x_f8, for (d_out, d_in) == (50304, 768) | |
| grad_w = torch._scaled_mm( | |
| x_f8.T.contiguous(), | |
| grad_f8.T.contiguous().T, | |
| out_dtype=torch.float32, | |
| scale_a=x_inv_s, | |
| scale_b=grad_inv_s, | |
| use_fast_accum=False, | |
| ).T | |
| return grad_x, grad_w | |
| return impl(g, x_f8, w_f8) | |
| @mm_backward_op.register_fake | |
| def _(g: Tensor, x_f8: Tensor, w_f8: Tensor, *_): | |
| return x_f8.to(torch.bfloat16), w_f8.T.contiguous().T.to(torch.float32) | |
def backward(ctx, grad_out: Tensor, *_):
    """Autograd backward for nanogpt::mm; only x and w receive gradients."""
    x_f8, w_f8 = ctx.saved_tensors
    x_s, w_s, grad_s = ctx.scales
    grad_x, grad_w = torch.ops.nanogpt.mm_backward(
        grad_out, x_f8, w_f8, x_s, w_s, grad_s
    )
    # The three scale arguments are non-differentiable -> None.
    return grad_x, grad_w, None, None, None

def setup_context(ctx: torch.autograd.function.FunctionCtx, inputs, output):
    # Stash the f8 casts produced by the forward op, plus the static scales,
    # so backward() can reuse them without recomputation.
    *_, x_s, w_s, grad_s = inputs
    _, x_f8, w_f8 = output
    ctx.save_for_backward(x_f8, w_f8)
    ctx.scales = x_s, w_s, grad_s
    ctx.set_materialize_grads(False)

mm_op.register_autograd(backward, setup_context=setup_context)
| # ----------------------------------------------------------------------------- | |
| # Muon optimizer | |
@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.
    """
    assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
    a, b, c = (3.4445, -4.7750, 2.0315) # quintic iteration coefficients
    X = G
    # Work in the wide orientation (rows <= cols) so X @ X.mT is the smaller Gram matrix.
    if G.size(-2) > G.size(-1):
        X = X.mT
    # Ensure spectral norm is at most 1
    X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
    # Perform the NS iterations
    for _ in range(steps):
        A = X @ X.mT
        B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    # Undo the transpose if we flipped orientation above.
    if G.size(-2) > G.size(-1):
        X = X.mT
    return X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/
    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.
    Warning: This optimizer should not be used for the embedding layer, the final fully connected layer,
    or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    """
    def __init__(self, params, lr=0.02, weight_decay=0.01, momentum=0.95):
        defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum)
        params = list(params)
        sizes = {p.shape for p in params}
        # create one buffer per unique parameter-size
        param_groups = []
        for size in sizes:
            group_params = [p for p in params if p.shape == size]
            param_groups.append(dict(params=group_params))
        super().__init__(param_groups, defaults)

    @torch.no_grad()
    def step(self):
        # Efficient systems-wise implementation of step developed by @YouJiacheng,
        # @KonstantinWilleke, @alexrgilbert, @adricarda, @tuttyfrutyee, @vdlad,
        # @ryanyang0, and @vagrawal.
        #
        # Each rank updates every world_size-th parameter of each group, then the
        # updated parameters are re-broadcast to all ranks via async all_gather.
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        reduce_scatter_futures: list[torch.Future] = []
        all_reduce_futures: list[torch.Future] = []
        # Phase 1: launch async reduce_scatters that average the grads across
        # ranks, delivering grad of params[base_i + rank] to this rank.
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            grad = torch.empty_like(params[-1]) # receive buffer for out-of-range ranks
            grad_pad = [param.grad for param in params] + [torch.zeros_like(params[-1])] * world_size
            for base_i in range(0, len(params), world_size):
                if base_i + rank < len(params):
                    grad = params[base_i + rank].grad
                # This gives strange dynamo warnings
                reduce_scatter_futures.append(dist.reduce_scatter(grad, grad_pad[base_i:base_i + world_size], op=dist.ReduceOp.AVG, async_op=True).get_future())
        # Phase 2: as each reduce finishes, apply momentum + Newton-Schulz
        # orthogonalization to this rank's parameter, then all_gather results.
        idx = 0
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            params_pad = params + [torch.empty_like(params[-1])] * world_size
            momentum = group["momentum"]
            for base_i in range(0, len(params), world_size):
                reduce_scatter_futures[idx].wait()
                if base_i + rank < len(params):
                    p = params[base_i + rank]
                    grad = p.grad
                    # scale lr by sqrt(max(1, rows/cols)) so tall matrices get larger steps
                    eff_lr = group["lr"] * max(1, p.size(-2) / p.size(-1)) ** 0.5 * getattr(p, "lr_mul", 1.0)
                    eff_weight_decay = group["lr"] * group["weight_decay"] * getattr(p, "wd_mul", 1.0)
                    state = self.state[p]
                    if len(state) == 0:
                        state["momentum_buffer"] = torch.zeros_like(grad)
                    momentum_buffer = state["momentum_buffer"]
                    p.mul_(1 - eff_weight_decay) # decoupled weight decay
                    momentum_buffer.lerp_(grad, 1 - momentum)
                    # Nesterov-style blend of grad and momentum buffer.
                    grad = grad.lerp_(momentum_buffer, momentum)
                    v = zeropower_via_newtonschulz5(grad.bfloat16(), 5)
                    p.add_(other=v, alpha=-eff_lr)
                idx += 1
                all_reduce_futures.append(dist.all_gather(params_pad[base_i:base_i + world_size], params_pad[base_i + rank], async_op=True).get_future())
        torch.futures.collect_all(all_reduce_futures).wait()
class DistAdam(torch.optim.Optimizer):
    """Sharded AdamW: each rank updates a 1/world_size row-slice of every
    parameter, then async all_gathers reassemble the full parameters.

    Gradients are averaged across ranks with async reduce_scatter_tensor so a
    rank only receives the slice it will update. Weight decay is decoupled
    (AdamW-style) and applied to the parameter slice before the Adam update.
    """
    def __init__(self, params, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        params = list(params)
        sizes = {p.shape for p in params}
        # create one buffer per unique parameter-size
        param_groups = []
        for size in sizes:
            group_params = [p for p in params if p.shape == size]
            param_groups.append(dict(params=group_params))
        super().__init__(param_groups, defaults)

    # DistributedAdam implementation by @vagrawal
    @torch.compile
    @torch.no_grad()
    def step(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        reduce_scatter_futures: list[torch.Future] = []
        all_reduce_futures: list[torch.Future] = []
        grad_slices = []
        # Phase 1: launch one async reduce-scatter per parameter gradient.
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            grad = torch.empty_like(params[-1]) # NOTE(review): apparently unused; reassigned immediately below
            for base_i in range(len(params)):
                grad = params[base_i].grad
                rank_size = grad.shape[0] // world_size # assumes dim 0 is divisible by world_size
                grad_slice = torch.empty_like(grad[:rank_size])
                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
                grad_slices.append(grad_slice)
        # Phase 2: as each reduce completes, run Adam on this rank's slice and
        # kick off an async all_gather to rebuild the full parameter.
        idx = 0
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            params = group['params']
            for base in range(len(params)):
                reduce_scatter_futures[idx].wait()
                p = params[base]
                rank_size = p.shape[0] // world_size
                p_slice = p[rank * rank_size:(rank + 1) * rank_size]
                lr = group['lr'] * getattr(p, "lr_mul", 1.0) # per-parameter lr multiplier
                state = self.state[p]
                g_slice = grad_slices[idx]
                # State init
                if not state:
                    state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device)
                    state['exp_avg'] = torch.zeros_like(p_slice)
                    state['exp_avg_sq'] = torch.zeros_like(p_slice)
                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1
                t = state['step']
                # weight decay (decoupled, scaled by effective lr)
                if wd != 0:
                    eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0)
                    p_slice.mul_(1 - eff_weight_decay)
                # update running averages
                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
                # bias corrections
                bias1 = 1 - beta1 ** t
                bias2 = 1 - beta2 ** t
                # compute step
                denom = exp_avg_sq.sqrt().add_(eps)
                step_size = lr * (torch.sqrt(bias2) / bias1)
                update = exp_avg.div(denom).mul_(step_size)
                p_slice.add_(other=update, alpha=-1.0)
                idx += 1
                all_reduce_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future())
        torch.futures.collect_all(all_reduce_futures).wait()
| # ----------------------------------------------------------------------------- | |
| # PyTorch nn.Module definitions for the model | |
| def norm(x: Tensor): | |
| return F.rms_norm(x, (x.size(-1),)) | |
class CastedLinear(nn.Linear):
    """Bias-free linear layer that can run its matmul in FP8.

    With use_fp8=True and the module in training mode, the forward pass goes
    through the custom nanogpt::mm op using the given static scales; otherwise
    it falls back to a plain F.linear with the weight cast to the input dtype.
    """
    def __init__(self, in_features: int, out_features: int, use_fp8=False, x_s=1.0, w_s=1.0, grad_s=1.0):
        super().__init__(in_features, out_features, bias=False)
        self.use_fp8 = use_fp8
        self.x_s = x_s
        self.w_s = w_s
        self.grad_s = grad_s

    def reset_parameters(self) -> None:
        # Uniform init with std 0.5/sqrt(fan_in); 0.5 is a bit better than the
        # default 1/sqrt(3).
        scale = 0.5 * (self.in_features ** -0.5)
        limit = (3 ** 0.5) * scale
        with torch.no_grad():
            self.weight.uniform_(-limit, limit)

    def forward(self, x: Tensor):
        if not (self.use_fp8 and self.training):
            return F.linear(x, self.weight.type_as(x))
        # FP8 path: flatten leading dims, run the custom op, restore the shape.
        flat = x.flatten(0, -2)
        out: Tensor = torch.ops.nanogpt.mm(flat, self.weight, x_s=self.x_s, w_s=self.w_s, grad_s=self.grad_s)[0]
        return out.reshape(*x.shape[:-1], -1)
| class Rotary(nn.Module): | |
| def __init__(self, dim: int, max_seq_len: int): | |
| super().__init__() | |
| # half-truncate RoPE by @YouJiacheng (w/ base freq tuning) | |
| angular_freq = (1 / 1024) ** torch.linspace(0, 1, steps=dim//4, dtype=torch.float32) | |
| angular_freq = torch.cat([angular_freq, angular_freq.new_zeros(dim//4)]) | |
| t = torch.arange(max_seq_len, dtype=torch.float32) | |
| theta = torch.einsum("i,j -> ij", t, angular_freq) | |
| self.cos = nn.Buffer(theta.cos(), persistent=False) | |
| self.sin = nn.Buffer(theta.sin(), persistent=False) | |
| def forward(self, x_BTHD: Tensor): | |
| assert self.cos.size(0) >= x_BTHD.size(-3) | |
| cos, sin = self.cos[None, :x_BTHD.size(-3), None, :], self.sin[None, :x_BTHD.size(-3), None, :] | |
| x1, x2 = x_BTHD.to(dtype=torch.float32).chunk(2, dim=-1) | |
| y1 = x1 * cos + x2 * sin | |
| y2 = x1 * (-sin) + x2 * cos | |
| return torch.cat((y1, y2), 3).type_as(x_BTHD) | |
class CausalSelfAttention(nn.Module):
    """Causal multi-head attention via FlexAttention, with merged QKV weights,
    QK-norm, rotary embeddings, and an optional value-embedding residual."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, head_dim=128):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        hdim = num_heads * head_dim
        std = 0.5 * (dim ** -0.5)
        bound = (3 ** 0.5) * std # improved init scale by @YouJiacheng
        # merged QKV weights: suggested by many, implemented by @fernbear.bsky.social, and further improved by @YouJiacheng
        # https://x.com/hi_tysam/status/1879699187107033311
        self.qkv_w = nn.Parameter(torch.empty(3, hdim, dim).uniform_(-bound, bound))
        self.rotary = Rotary(head_dim, max_seq_len)
        self.c_proj = CastedLinear(hdim, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977
        # scale the attention logits by given constant, instead of the default head_dim**-0.5, by @leloykun
        # inspired by learnable scalars used by @brendanh0gan https://x.com/hi_tysam/status/1879693583898591283
        self.attn_scale = 0.12

    def forward(self, x: Tensor, ve: Tensor | None, lambdas: Tensor, block_mask: BlockMask):
        B, T = x.size(0), x.size(1) # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # One matmul produces Q, K, V stacked along the head axis; chunk splits them.
        q, k, v = F.linear(x, self.qkv_w.flatten(end_dim=1).type_as(x)).view(B, T, 3 * self.num_heads, self.head_dim).chunk(3, dim=-2)
        q, k = norm(q), norm(k) # QK norm @Grad62304977
        q, k = self.rotary(q), self.rotary(k)
        if ve is not None:
            v = lambdas[0] * v + lambdas[1] * ve.view_as(v) # @KoszarskyB & @Grad62304977
        else: # skip mid-layers token value embeddings by @YouJiacheng
            v = lambdas[0] * v
        # flex_attention expects (B, H, T, D); transpose in and out.
        y = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask, scale=self.attn_scale).transpose(1, 2)
        y = y.contiguous().view(B, T, self.num_heads * self.head_dim) # re-assemble all head outputs side by side
        y = self.c_proj(y)
        return y
class MLP(nn.Module):
    """Feed-forward block: dim -> 4*dim -> dim with squared-ReLU activation."""
    def __init__(self, dim: int):
        super().__init__()
        hidden = 4 * dim
        self.c_fc = CastedLinear(dim, hidden)
        self.c_proj = CastedLinear(hidden, dim)
        self.c_proj.weight.detach().zero_() # zero init suggested by @Grad62304977

    def forward(self, x: Tensor):
        # ReLU^2 (https://arxiv.org/abs/2109.08668v2): ~1-2% better than GELU;
        # suggested by @SKYLINEZ007 and @Grad62304977
        h = self.c_fc(x)
        h = F.relu(h).square()
        return self.c_proj(h)
class Block(nn.Module):
    """Transformer block: optional attention sublayer + MLP, with learned
    residual mixing against the embedding stream x0."""
    def __init__(self, dim: int, num_heads: int, max_seq_len: int, layer_idx: int):
        super().__init__()
        # skip attention of blocks.7 (the 8th layer) by @YouJiacheng
        self.attn = None if layer_idx == 7 else CausalSelfAttention(dim, num_heads, max_seq_len)
        self.mlp = MLP(dim)

    def forward(self, x: Tensor, ve: Tensor | None, x0: Tensor, lambdas: Tensor, sa_lambdas: Tensor, block_mask: BlockMask):
        # Blend the running stream with the (normed) token embedding stream.
        x = lambdas[0] * x + lambdas[1] * x0
        if self.attn is not None:
            x = x + self.attn(norm(x), ve, sa_lambdas, block_mask)
        return x + self.mlp(norm(x))
| # ----------------------------------------------------------------------------- | |
| # The main model | |
| def next_multiple_of_n(v: float | int, *, n: int): | |
| return next(x for x in range(n, int(v) + 1 + n, n) if x >= v) | |
class GPT(nn.Module):
    """GPT with a U-net skip-connection pattern, token value-embedding
    residuals, and long/short sliding-window FlexAttention block masks.

    forward() takes flat 1D token sequences (FlexAttention batch size 1) and
    returns the cross-entropy loss directly (sum-reduced in training mode,
    mean-reduced in eval mode).
    """
    def __init__(self, vocab_size: int, num_layers: int, num_heads: int, model_dim: int, max_seq_len: int):
        super().__init__()
        # there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency.
        # suggested to me by @Grad62304977. this originates from Karpathy's experiments.
        vocab_size = next_multiple_of_n(vocab_size, n=128)
        self.embed = nn.Embedding(vocab_size, model_dim)
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual implementation following https://arxiv.org/abs/2410.17897
        # value embedding code simplification inspired by @ragulpr https://github.com/KellerJordan/modded-nanogpt/pull/78
        self.value_embeds = nn.ModuleList([nn.Embedding(vocab_size, model_dim) for _ in range(3)])
        self.blocks = nn.ModuleList([Block(model_dim, num_heads, max_seq_len, i) for i in range(num_layers)])
        self.lm_head = CastedLinear(model_dim, vocab_size, use_fp8=True, x_s=(model_dim**0.5)/448, w_s=24/448, grad_s=1/448)
        self.lm_head.weight.detach().zero_() # @Grad62304977
        # Add learnable skip connection weights for decoder layers
        assert num_layers % 2 == 0
        # pad so the scalars vector's length is divisible by world_size
        # (DistAdam shards parameters along dim 0)
        pad = (-num_layers * 5) % dist.get_world_size()
        self.scalars = nn.Parameter(torch.cat([
            torch.ones(num_layers), # skip_weights (forward uses only the first num_layers//2)
            *[torch.tensor([1.0, 0.0]) for _ in range(num_layers)], # block lambdas
            *[torch.tensor([0.5, 0.5]) for _ in range(num_layers)], # SA lambdas
            torch.ones(pad),
        ]))
        # set learning rates: per-parameter multipliers read by the optimizers
        # via getattr(p, "lr_mul", 1.0)
        for param in self.embed.parameters():
            param.lr_mul = 75.
        for param in self.value_embeds.parameters():
            param.lr_mul = 75.
        self.lm_head.weight.lr_mul = 27.5
        self.scalars.lr_mul = 5.0

    def create_blockmasks(self, input_seq: Tensor, sliding_window_num_blocks: Tensor):
        """Build (long, short) document-causal sliding-window BlockMasks.

        Documents are delimited by token 50256; attention never crosses a
        document boundary and is limited to a block-wise sliding window.
        """
        BLOCK_SIZE = 128
        docs = (input_seq == 50256).cumsum(0) # document id per token
        def document_causal(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            return causal_mask & document_mask
        def dense_to_ordered(dense_blockmask: Tensor):
            # Convert a dense (Q_blocks, KV_blocks) boolean mask into the
            # (counts, ordered indices) pair BlockMask.from_kv_blocks expects.
            num_blocks = dense_blockmask.sum(dim=-1, dtype=torch.int32)
            indices = dense_blockmask.argsort(dim=-1, descending=False, stable=True).flip(-1).to(torch.int32)
            return num_blocks[None, None].contiguous(), indices[None, None].contiguous()
        # manual block mask creation by @YouJiacheng
        assert len(input_seq) % BLOCK_SIZE == 0
        NUM_BLOCKS = len(input_seq) // BLOCK_SIZE
        block_idx = torch.arange(NUM_BLOCKS, dtype=torch.int32, device="cuda")
        causal_blockmask_any = block_idx[:, None] >= block_idx # block may contain some visible positions
        causal_blockmask_all = block_idx[:, None] > block_idx # block is entirely visible
        docs_low = docs.view(-1, BLOCK_SIZE)[:, 0].contiguous() # first document id per block
        docs_high = docs.view(-1, BLOCK_SIZE)[:, -1].contiguous() # last document id per block
        document_blockmask_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
        document_blockmask_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
        blockmask_any = causal_blockmask_any & document_blockmask_any
        blockmask_all = causal_blockmask_all & document_blockmask_all
        # "partial" blocks still need the mask_mod evaluated per-position;
        # "full" blocks are entirely attendable and skip it.
        partial_kv_num_blocks, partial_kv_indices = dense_to_ordered(blockmask_any & ~blockmask_all)
        full_kv_num_blocks, full_kv_indices = dense_to_ordered(blockmask_all)
        def build_bm(window_size_blocks: Tensor) -> BlockMask:
            return BlockMask.from_kv_blocks(
                torch.clamp_max(partial_kv_num_blocks, torch.clamp_min(window_size_blocks - full_kv_num_blocks, 1)),
                partial_kv_indices,
                torch.clamp_max(full_kv_num_blocks, window_size_blocks - 1),
                full_kv_indices,
                BLOCK_SIZE=BLOCK_SIZE,
                mask_mod=document_causal,
            )
        # Long-short SWA block masks by @leloykun & @YouJiacheng, adapated from suggestion by @Grad62304977, following Gemma 2 paper
        return build_bm(sliding_window_num_blocks), build_bm(sliding_window_num_blocks // 2)

    def forward(self, input_seq: Tensor, target_seq: Tensor, sliding_window_num_blocks: Tensor):
        assert input_seq.ndim == 1
        ve = [value_embed(input_seq) for value_embed in self.value_embeds]
        # 012 ... 012 structure on token value embeddings by @YouJiacheng, improved on @leloykun's U-net structure
        ve = [ve[0], ve[1], ve[2]] + [None] * (len(self.blocks) - 6) + [ve[0], ve[1], ve[2]]
        assert len(ve) == len(self.blocks)
        long_bm, short_bm = self.create_blockmasks(input_seq, sliding_window_num_blocks)
        # per-layer mask schedule (hard-coded for 12 layers; checked below)
        block_masks = [long_bm, short_bm, short_bm, short_bm, long_bm, short_bm, short_bm, long_bm, short_bm, short_bm, short_bm, long_bm]
        assert len(block_masks) == len(self.blocks)
        x = x0 = norm(self.embed(input_seq)[None]) # use of norm here by @Grad62304977
        # U-net design by @brendanh0gan
        skip_connections = []
        skip_weights = self.scalars[:(len(self.blocks) // 2)]
        lambdas = self.scalars[1 * len(self.blocks): 3 * len(self.blocks)].view(-1, 2)
        sa_lambdas = self.scalars[3 * len(self.blocks): 5 * len(self.blocks)].view(-1, 2)
        n = len(self.blocks) // 2
        for i in range(len(self.blocks)):
            # second half of the layers consumes the first half's activations
            if i >= n:
                x = x + skip_weights[i - n] * skip_connections.pop()
            x = self.blocks[i](x, ve[i], x0, lambdas[i], sa_lambdas[i], block_masks[i])
            if i < n:
                skip_connections.append(x)
        x = norm(x)
        logits = self.lm_head(x).float()
        # @Grad62304977 added tanh softcapping following Gemma 2 paper, @KoszarskyB reduced it from 30 to 15, @YouJiacheng shifted it by +15 (2*sigmoid(2*x)=tanh(x)+1)
        logits = 30 * torch.sigmoid(logits / (7.5 * x.size(-1)**0.5))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_seq, reduction="sum" if self.training else "mean")
        return loss
| # ----------------------------------------------------------------------------- | |
| # Distributed data loader | |
def _load_data_shard(file: Path):
    """Read one .bin token shard into a pinned uint16 CPU tensor."""
    # Header layout: 256 int32 values; [0]=magic, [1]=version, [2]=token count.
    header = torch.from_file(str(file), False, 256, dtype=torch.int32)
    assert header[0] == 20240520, "magic number mismatch in the data .bin file"
    assert header[1] == 1, "unsupported version"
    token_count = int(header[2]) # number of tokens (claimed)
    with file.open("rb", buffering=0) as fh:
        # Allocate pinned memory up front and read straight into it: avoids
        # both a bytes->array copy and a later pin_memory copy (@YouJiacheng).
        tokens = torch.empty(token_count, dtype=torch.uint16, pin_memory=True)
        fh.seek(256 * 4) # skip past the header
        bytes_read = fh.readinto(tokens.numpy())
        assert bytes_read == 2 * token_count, "number of tokens read does not match header"
    return tokens
# find world_size starting indices, such that each begins with token 50256 and local_batches don't overlap
def find_batch_starts(tokens: Tensor, pos: int, local_batch_size: int, max_batch_span: int):
    """Pick dist.get_world_size() batch start indices in tokens[pos : pos + max_batch_span].

    Every start lands on a 50256 (<|endoftext|>) boundary, and consecutive
    starts are at least local_batch_size tokens apart so per-rank batches
    don't overlap. Returns (starts, span) where span is the offset (relative
    to pos) just past the last rank's batch.

    Raises:
        RuntimeError: if the window doesn't contain enough boundaries;
            increase max_batch_span in that case.
    """
    boundary_mask = tokens[pos : pos + max_batch_span] == 50256
    boundary_positions = torch.nonzero(boundary_mask, as_tuple=False).squeeze(-1) + pos
    start = boundary_positions[0].item()
    starts = []
    for i in range(len(boundary_positions) - 1):
        end = boundary_positions[i + 1].item()
        if end - start >= local_batch_size:
            starts.append(start) # append start once end pos is confirmed
            if len(starts) == dist.get_world_size():
                return starts, end - pos
            start = end
    # Was `assert False`: a bare assert vanishes under `python -O`, which would
    # silently fall through and return None. Fail loudly instead.
    raise RuntimeError("find_batch_starts: not enough document boundaries in window; increase max_batch_span")
def distributed_data_generator(filename_pattern: str, batch_size: int, align_to_bos: bool):
    """Yield (inputs, targets) int32/int64 CUDA tensors for this rank, forever.

    batch_size is the global token count per step; each rank gets
    batch_size // world_size tokens. With align_to_bos=True, each rank's slice
    starts on a 50256 document boundary (via find_batch_starts); otherwise
    ranks take contiguous fixed-size slices.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    files = [Path(file) for file in sorted(glob.glob(filename_pattern))]
    assert batch_size % world_size == 0
    local_batch_size = batch_size // world_size
    file_iter = iter(files) # use itertools.cycle(files) instead if you want to do multi-epoch training
    tokens, pos = _load_data_shard(next(file_iter)), 0
    max_batch_span = 2 * batch_size if align_to_bos else batch_size # provide buffer to handle samples up to length local_batch_size
    while True:
        # advance to the next shard when the remaining window can't fit a batch
        if pos + max_batch_span + 1 >= len(tokens):
            tokens, pos = _load_data_shard(next(file_iter)), 0
        if align_to_bos:
            batch_starts, batch_span = find_batch_starts(tokens, pos, local_batch_size, max_batch_span)
            start_idx = batch_starts[rank]
        else:
            batch_span = batch_size
            start_idx = pos + rank * local_batch_size
        # inputs/targets are the same window shifted by one token
        buf = tokens[start_idx:][:local_batch_size + 1]
        inputs = buf[:-1].to(device="cuda", dtype=torch.int32, non_blocking=True) # no sync on host side;
        targets = buf[1:].to(device="cuda", dtype=torch.int64, non_blocking=True) # H2D in another stream isn't helpful.
        pos += batch_span
        yield inputs, targets
| # ----------------------------------------------------------------------------- | |
| # int main | |
@dataclass
class Hyperparameters:
    """Static run configuration; edit values here (no CLI parsing)."""
    # data
    train_files: str = "data/fineweb10B/fineweb_train_*.bin" # input .bin to train on
    val_files: str = "data/fineweb10B/fineweb_val_*.bin" # input .bin to eval validation loss on
    val_tokens: int = 10485760 # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    train_seq_len: int = 48*1024 # FlexAttention sequence length
    val_seq_len: int = 4*64*1024 # FlexAttention sequence length for validation
    # optimization
    num_iterations: int = 1750 # number of iterations to run
    cooldown_frac: float = 0.45 # fraction of training spent cooling down the learning rate
    # evaluation and logging
    val_loss_every: int = 125 # every how many steps to evaluate val loss? 0 for only at the end
    save_checkpoint: bool = False
args = Hyperparameters()
# torchrun sets these env variables
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
assert world_size == 8 # this code is designed for 8xH100
assert torch.cuda.is_available()
device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
torch.cuda.set_device(device)
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
master_process = (rank == 0) # this process will do logging, checkpointing etc.

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    os.makedirs("logs", exist_ok=True)
    logfile = f"logs/{run_id}.txt"
    print(logfile)

def print0(s, console=False):
    # Append to the master process's logfile; echo to stdout only when console=True.
    # No-op on non-master ranks.
    if master_process:
        with open(logfile, "a") as f:
            if console:
                print(s)
            print(s, file=f)

# begin by printing this file (the Python code)
print0(code)
print0("="*100)
# log information about the hardware/software environment this is running on
print0(f"Running Python {sys.version}")
print0(f"Running PyTorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}")

def nvidia_smi():
    # Capture `nvidia-smi` output for the log (GPU model, driver, memory).
    import subprocess # avoid top level import
    return subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True).stdout

print0(nvidia_smi())
print0("="*100)
model: nn.Module = GPT(vocab_size=50257, num_layers=12, num_heads=6, model_dim=768, max_seq_len=max(args.train_seq_len, args.val_seq_len)).cuda()
# keep embedding tables in bf16 to save memory/bandwidth
for m in model.modules():
    if isinstance(m, nn.Embedding):
        m.bfloat16()
# sync initial weights: all ranks adopt rank 0's initialization
for param in model.parameters():
    dist.broadcast(param.detach(), 0)
# collect the parameters to optimize
hidden_matrix_params = [p for n, p in model.blocks.named_parameters() if p.ndim >= 2 and "embed" not in n]
embed_params = [p for n, p in model.named_parameters() if "embed" in n]
scalar_params = [p for p in model.parameters() if p.ndim < 2]
head_params = [model.lm_head.weight]
# init the optimizer(s): Muon for hidden 2D matrices, DistAdam for the rest
# small adam epsilon by @YouJiacheng. this is an alternate method of fixing the world_size dependence
# discovered by @fernbear.bsky.social https://x.com/hi_tysam/status/1879692937589875094
optimizer1 = DistAdam(scalar_params + head_params + embed_params, lr=0.008, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0)
optimizer2 = Muon(hidden_matrix_params, lr=0.05, momentum=0.95, weight_decay=0.0)
optimizers = [optimizer1, optimizer2]
# remember base lrs so get_lr() can scale them multiplicatively each step
for opt in optimizers:
    for group in opt.param_groups:
        group["initial_lr"] = group["lr"]
# learning rate schedule: stable then decay
def get_lr(step: int):
    """Multiplier on initial_lr: flat at 1.0, then linear cooldown toward 0.1."""
    x = step / args.num_iterations # progress in training
    assert 0 <= x < 1
    if x >= 1 - args.cooldown_frac:
        w = (1 - x) / args.cooldown_frac
        return w * 1.0 + (1 - w) * 0.1
    return 1.0
# attention window size schedule: linearly increase
@lru_cache(1)
def get_window_size_blocks_helper(window_size: int):
    # Cache the int32 CUDA scalar for the current window size; lru_cache(1)
    # suffices since the schedule changes the value only occasionally and
    # monotonically across steps.
    return torch.tensor(window_size // 128, dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)

def get_window_size_blocks(step: int):
    x = step / args.num_iterations # progress in training
    assert 0 <= x <= 1
    # Linearly increase the block-wise sliding window size over training 128 -> 1792
    # increase by @fernbear.bsky.social; block-wise by @YouJiacheng
    window_size = next_multiple_of_n(1728 * x, n=128)
    return get_window_size_blocks_helper(window_size)
model: nn.Module = torch.compile(model, dynamic=False)

########################################
#            Warmup kernels            #
########################################
# Warmup the training kernels, then re-initialize the state so we aren't cheating
warmup_steps = 10
initial_state = dict(model=copy.deepcopy(model.state_dict()),
                     optimizers=[copy.deepcopy(opt.state_dict()) for opt in optimizers]) # save the initial state
train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, align_to_bos=True)
for _ in range(warmup_steps):
    inputs, targets = next(train_loader)
    model(inputs, targets, get_window_size_blocks(1)).backward()
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True)
# restore the pre-warmup weights and optimizer state
model.load_state_dict(initial_state["model"])
for opt, opt_state in zip(optimizers, initial_state["optimizers"]):
    opt.load_state_dict(opt_state)
del train_loader, initial_state

########################################
#        Training and validation       #
########################################
train_loader = distributed_data_generator(args.train_files, world_size * args.train_seq_len, align_to_bos=True)
training_time_ms = 0
# start the clock (after warmup, so compilation time isn't counted)
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
train_steps = args.num_iterations
| for step in range(train_steps + 1): | |
| last_step = (step == train_steps) | |
| # --------------- VALIDATION SECTION ----------------- | |
| if last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0): | |
| # stop the clock | |
| torch.cuda.synchronize() | |
| training_time_ms += 1000 * (time.perf_counter() - t0) | |
| model.eval() | |
| val_batch_size = world_size * args.val_seq_len | |
| assert args.val_tokens % val_batch_size == 0 | |
| val_steps = args.val_tokens // val_batch_size | |
| val_loader = distributed_data_generator(args.val_files, val_batch_size, align_to_bos=False) | |
| val_loss = 0 | |
| with torch.no_grad(): | |
| for _ in range(val_steps): | |
| inputs, targets = next(val_loader) | |
| val_loss += model(inputs, targets, get_window_size_blocks(step)) | |
| val_loss /= val_steps | |
| del val_loader | |
| dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) | |
| print0(f"step:{step}/{train_steps} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/max(step, 1):.2f}ms", console=True) | |
| model.train() | |
| # start the clock again | |
| torch.cuda.synchronize() | |
| t0 = time.perf_counter() | |
| if last_step: | |
| if master_process and args.save_checkpoint: | |
| log = dict(step=step, code=code, model=model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) | |
| os.makedirs(f"logs/{run_id}", exist_ok=True) | |
| torch.save(log, f"logs/{run_id}/state_step{step:06d}.pt") | |
| # the last step only has the validation loop, so break to avoid training | |
| break | |
| # --------------- TRAINING SECTION ----------------- | |
| inputs, targets = next(train_loader) | |
| model(inputs, targets, get_window_size_blocks(step)).backward() | |
| # set optimization hyperparameters | |
| for opt in optimizers: | |
| for group in opt.param_groups: | |
| group["lr"] = group["initial_lr"] * get_lr(step) | |
| for group in optimizer2.param_groups: | |
| frac = min(step / 300, 1) # momentum warmup for muon | |
| group["momentum"] = (1 - frac) * 0.85 + frac * 0.95 | |
| # step the optimizers | |
| for opt in optimizers: | |
| opt.step() | |
| # null the gradients | |
| model.zero_grad(set_to_none=True) | |
| # logging | |
| approx_training_time_ms = training_time_ms + 1000 * (time.perf_counter() - t0) | |
| print0(f"step:{step+1}/{train_steps} train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms/(step + 1):.2f}ms", console=True) | |
| print0(f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " | |
| f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB", console=True) | |
| dist.destroy_process_group() | |
| ==================================================================================================== | |
| Running Python 3.12.7 (main, Jul 14 2025, 05:23:05) [GCC 13.2.0] | |
| Running PyTorch 2.9.0.dev20250713+cu126 compiled for CUDA 12.6 | |
| Mon Jul 14 07:04:41 2025 | |
| +-----------------------------------------------------------------------------------------+ | |
| | NVIDIA-SMI 570.148.08 Driver Version: 570.148.08 CUDA Version: 12.8 | | |
| |-----------------------------------------+------------------------+----------------------+ | |
| | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | | MIG M. | | |
| |=========================================+========================+======================| | |
| | 0 NVIDIA H100 80GB HBM3 On | 00000000:61:00.0 Off | 0 | | |
| | N/A 30C P0 137W / 700W | 5858MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 1 NVIDIA H100 80GB HBM3 On | 00000000:62:00.0 Off | 0 | | |
| | N/A 32C P0 124W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 2 NVIDIA H100 80GB HBM3 On | 00000000:63:00.0 Off | 0 | | |
| | N/A 31C P0 119W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 3 NVIDIA H100 80GB HBM3 On | 00000000:64:00.0 Off | 0 | | |
| | N/A 29C P0 125W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 4 NVIDIA H100 80GB HBM3 On | 00000000:6A:00.0 Off | 0 | | |
| | N/A 28C P0 116W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 5 NVIDIA H100 80GB HBM3 On | 00000000:6B:00.0 Off | 0 | | |
| | N/A 30C P0 121W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 6 NVIDIA H100 80GB HBM3 On | 00000000:6C:00.0 Off | 0 | | |
| | N/A 29C P0 124W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 7 NVIDIA H100 80GB HBM3 On | 00000000:6D:00.0 Off | 0 | | |
| | N/A 26C P0 117W / 700W | 1519MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| +-----------------------------------------------------------------------------------------+ | |
| | Processes: | | |
| | GPU GI CI PID Type Process name GPU Memory | | |
| | ID ID Usage | | |
| |=========================================================================================| | |
| | 0 N/A N/A 51503 C /usr/local/bin/python 1510MiB | | |
| | 0 N/A N/A 51504 C /usr/local/bin/python 614MiB | | |
| | 0 N/A N/A 51505 C /usr/local/bin/python 614MiB | | |
| | 0 N/A N/A 51506 C /usr/local/bin/python 614MiB | | |
| | 0 N/A N/A 51507 C /usr/local/bin/python 614MiB | | |
| | 0 N/A N/A 51508 C /usr/local/bin/python 614MiB | | |
| | 0 N/A N/A 51509 C /usr/local/bin/python 614MiB | | |
| | 0 N/A N/A 51510 C /usr/local/bin/python 614MiB | | |
| | 1 N/A N/A 51504 C /usr/local/bin/python 1510MiB | | |
| | 2 N/A N/A 51505 C /usr/local/bin/python 1510MiB | | |
| | 3 N/A N/A 51506 C /usr/local/bin/python 1510MiB | | |
| | 4 N/A N/A 51507 C /usr/local/bin/python 1510MiB | | |
| | 5 N/A N/A 51508 C /usr/local/bin/python 1510MiB | | |
| | 6 N/A N/A 51509 C /usr/local/bin/python 1510MiB | | |
| | 7 N/A N/A 51510 C /usr/local/bin/python 1510MiB | | |
| +-----------------------------------------------------------------------------------------+ | |
| ==================================================================================================== | |
| step:0/1750 val_loss:10.8258 train_time:0ms step_avg:0.03ms | |
| step:1/1750 train_time:146ms step_avg:146.43ms | |
| step:2/1750 train_time:182ms step_avg:90.99ms | |
| step:3/1750 train_time:249ms step_avg:82.95ms | |
| step:4/1750 train_time:340ms step_avg:85.09ms | |
| step:5/1750 train_time:433ms step_avg:86.53ms | |
| step:6/1750 train_time:526ms step_avg:87.68ms | |
| step:7/1750 train_time:618ms step_avg:88.28ms | |
| step:8/1750 train_time:713ms step_avg:89.12ms | |
| step:9/1750 train_time:804ms step_avg:89.39ms | |
| step:10/1750 train_time:896ms step_avg:89.62ms | |
| step:11/1750 train_time:988ms step_avg:89.84ms | |
| step:12/1750 train_time:1081ms step_avg:90.10ms | |
| step:13/1750 train_time:1175ms step_avg:90.39ms | |
| step:14/1750 train_time:1268ms step_avg:90.56ms | |
| step:15/1750 train_time:1362ms step_avg:90.83ms | |
| step:16/1750 train_time:1455ms step_avg:90.93ms | |
| step:17/1750 train_time:1548ms step_avg:91.06ms | |
| step:18/1750 train_time:1641ms step_avg:91.15ms | |
| step:19/1750 train_time:1734ms step_avg:91.25ms | |
| step:20/1750 train_time:1827ms step_avg:91.35ms | |
| step:21/1750 train_time:1920ms step_avg:91.43ms | |
| step:22/1750 train_time:2013ms step_avg:91.49ms | |
| step:23/1750 train_time:2107ms step_avg:91.60ms | |
| step:24/1750 train_time:2200ms step_avg:91.65ms | |
| step:25/1750 train_time:2293ms step_avg:91.70ms | |
| step:26/1750 train_time:2386ms step_avg:91.77ms | |
| step:27/1750 train_time:2479ms step_avg:91.81ms | |
| step:28/1750 train_time:2572ms step_avg:91.84ms | |
| step:29/1750 train_time:2665ms step_avg:91.88ms | |
| step:30/1750 train_time:2757ms step_avg:91.90ms | |
| step:31/1750 train_time:2850ms step_avg:91.95ms | |
| step:32/1750 train_time:2944ms step_avg:91.99ms | |
| step:33/1750 train_time:3036ms step_avg:92.00ms | |
| step:34/1750 train_time:3129ms step_avg:92.04ms | |
| step:35/1750 train_time:3223ms step_avg:92.08ms | |
| step:36/1750 train_time:3316ms step_avg:92.10ms | |
| step:37/1750 train_time:3409ms step_avg:92.14ms | |
| step:38/1750 train_time:3502ms step_avg:92.17ms | |
| step:39/1750 train_time:3595ms step_avg:92.18ms | |
| step:40/1750 train_time:3689ms step_avg:92.24ms | |
| step:41/1750 train_time:3782ms step_avg:92.25ms | |
| step:42/1750 train_time:3875ms step_avg:92.26ms | |
| step:43/1750 train_time:3967ms step_avg:92.27ms | |
| step:44/1750 train_time:4061ms step_avg:92.29ms | |
| step:45/1750 train_time:4154ms step_avg:92.30ms | |
| step:46/1750 train_time:4246ms step_avg:92.31ms | |
| step:47/1750 train_time:4339ms step_avg:92.32ms | |
| step:48/1750 train_time:4432ms step_avg:92.33ms | |
| step:49/1750 train_time:4525ms step_avg:92.35ms | |
| step:50/1750 train_time:4619ms step_avg:92.39ms | |
| step:51/1750 train_time:4712ms step_avg:92.40ms | |
| step:52/1750 train_time:4805ms step_avg:92.40ms | |
| step:53/1750 train_time:4899ms step_avg:92.43ms | |
| step:54/1750 train_time:4992ms step_avg:92.44ms | |
| step:55/1750 train_time:5085ms step_avg:92.45ms | |
| step:56/1750 train_time:5177ms step_avg:92.45ms | |
| step:57/1750 train_time:5270ms step_avg:92.45ms | |
| step:58/1750 train_time:5364ms step_avg:92.48ms | |
| step:59/1750 train_time:5456ms step_avg:92.48ms | |
| step:60/1750 train_time:5549ms step_avg:92.49ms | |
| step:61/1750 train_time:5643ms step_avg:92.51ms | |
| step:62/1750 train_time:5736ms step_avg:92.52ms | |
| step:63/1750 train_time:5829ms step_avg:92.53ms | |
| step:64/1750 train_time:5922ms step_avg:92.54ms | |
| step:65/1750 train_time:6015ms step_avg:92.53ms | |
| step:66/1750 train_time:6107ms step_avg:92.53ms | |
| step:67/1750 train_time:6200ms step_avg:92.54ms | |
| step:68/1750 train_time:6293ms step_avg:92.55ms | |
| step:69/1750 train_time:6387ms step_avg:92.57ms | |
| step:70/1750 train_time:6480ms step_avg:92.57ms | |
| step:71/1750 train_time:6573ms step_avg:92.58ms | |
| step:72/1750 train_time:6668ms step_avg:92.61ms | |
| step:73/1750 train_time:6761ms step_avg:92.61ms | |
| step:74/1750 train_time:6854ms step_avg:92.62ms | |
| step:75/1750 train_time:6947ms step_avg:92.63ms | |
| step:76/1750 train_time:7041ms step_avg:92.64ms | |
| step:77/1750 train_time:7134ms step_avg:92.65ms | |
| step:78/1750 train_time:7227ms step_avg:92.66ms | |
| step:79/1750 train_time:7321ms step_avg:92.67ms | |
| step:80/1750 train_time:7413ms step_avg:92.66ms | |
| step:81/1750 train_time:7506ms step_avg:92.67ms | |
| step:82/1750 train_time:7599ms step_avg:92.67ms | |
| step:83/1750 train_time:7693ms step_avg:92.69ms | |
| step:84/1750 train_time:7787ms step_avg:92.71ms | |
| step:85/1750 train_time:7880ms step_avg:92.70ms | |
| step:86/1750 train_time:7973ms step_avg:92.70ms | |
| step:87/1750 train_time:8066ms step_avg:92.71ms | |
| step:88/1750 train_time:8159ms step_avg:92.72ms | |
| step:89/1750 train_time:8252ms step_avg:92.72ms | |
| step:90/1750 train_time:8345ms step_avg:92.73ms | |
| step:91/1750 train_time:8438ms step_avg:92.73ms | |
| step:92/1750 train_time:8532ms step_avg:92.74ms | |
| step:93/1750 train_time:8626ms step_avg:92.75ms | |
| step:94/1750 train_time:8718ms step_avg:92.75ms | |
| step:95/1750 train_time:8812ms step_avg:92.75ms | |
| step:96/1750 train_time:8904ms step_avg:92.75ms | |
| step:97/1750 train_time:8997ms step_avg:92.75ms | |
| step:98/1750 train_time:9090ms step_avg:92.76ms | |
| step:99/1750 train_time:9182ms step_avg:92.75ms | |
| step:100/1750 train_time:9275ms step_avg:92.75ms | |
| step:101/1750 train_time:9368ms step_avg:92.76ms | |
| step:102/1750 train_time:9462ms step_avg:92.76ms | |
| step:103/1750 train_time:9554ms step_avg:92.76ms | |
| step:104/1750 train_time:9648ms step_avg:92.77ms | |
| step:105/1750 train_time:9742ms step_avg:92.78ms | |
| step:106/1750 train_time:9835ms step_avg:92.78ms | |
| step:107/1750 train_time:9929ms step_avg:92.79ms | |
| step:108/1750 train_time:10022ms step_avg:92.79ms | |
| step:109/1750 train_time:10115ms step_avg:92.80ms | |
| step:110/1750 train_time:10208ms step_avg:92.80ms | |
| step:111/1750 train_time:10301ms step_avg:92.81ms | |
| step:112/1750 train_time:10394ms step_avg:92.81ms | |
| step:113/1750 train_time:10489ms step_avg:92.82ms | |
| step:114/1750 train_time:10582ms step_avg:92.82ms | |
| step:115/1750 train_time:10674ms step_avg:92.81ms | |
| step:116/1750 train_time:10767ms step_avg:92.82ms | |
| step:117/1750 train_time:10861ms step_avg:92.83ms | |
| step:118/1750 train_time:10954ms step_avg:92.83ms | |
| step:119/1750 train_time:11047ms step_avg:92.83ms | |
| step:120/1750 train_time:11141ms step_avg:92.84ms | |
| step:121/1750 train_time:11233ms step_avg:92.83ms | |
| step:122/1750 train_time:11325ms step_avg:92.83ms | |
| step:123/1750 train_time:11418ms step_avg:92.83ms | |
| step:124/1750 train_time:11511ms step_avg:92.83ms | |
| step:125/1750 train_time:11604ms step_avg:92.83ms | |
| step:125/1750 val_loss:4.6373 train_time:11689ms step_avg:93.51ms | |
| step:126/1750 train_time:11725ms step_avg:93.06ms | |
| step:127/1750 train_time:11800ms step_avg:92.91ms | |
| step:128/1750 train_time:11896ms step_avg:92.94ms | |
| step:129/1750 train_time:11991ms step_avg:92.95ms | |
| step:130/1750 train_time:12084ms step_avg:92.95ms | |
| step:131/1750 train_time:12176ms step_avg:92.94ms | |
| step:132/1750 train_time:12269ms step_avg:92.95ms | |
| step:133/1750 train_time:12362ms step_avg:92.95ms | |
| step:134/1750 train_time:12455ms step_avg:92.95ms | |
| step:135/1750 train_time:12547ms step_avg:92.94ms | |
| step:136/1750 train_time:12640ms step_avg:92.94ms | |
| step:137/1750 train_time:12734ms step_avg:92.95ms | |
| step:138/1750 train_time:12828ms step_avg:92.96ms | |
| step:139/1750 train_time:12924ms step_avg:92.98ms | |
| step:140/1750 train_time:13018ms step_avg:92.98ms | |
| step:141/1750 train_time:13110ms step_avg:92.98ms | |
| step:142/1750 train_time:13205ms step_avg:92.99ms | |
| step:143/1750 train_time:13299ms step_avg:93.00ms | |
| step:144/1750 train_time:13392ms step_avg:93.00ms | |
| step:145/1750 train_time:13485ms step_avg:93.00ms | |
| step:146/1750 train_time:13578ms step_avg:93.00ms | |
| step:147/1750 train_time:13671ms step_avg:93.00ms | |
| step:148/1750 train_time:13765ms step_avg:93.01ms | |
| step:149/1750 train_time:13859ms step_avg:93.02ms | |
| step:150/1750 train_time:13953ms step_avg:93.02ms | |
| step:151/1750 train_time:14047ms step_avg:93.02ms | |
| step:152/1750 train_time:14140ms step_avg:93.03ms | |
| step:153/1750 train_time:14233ms step_avg:93.03ms | |
| step:154/1750 train_time:14327ms step_avg:93.03ms | |
| step:155/1750 train_time:14420ms step_avg:93.03ms | |
| step:156/1750 train_time:14514ms step_avg:93.04ms | |
| step:157/1750 train_time:14608ms step_avg:93.04ms | |
| step:158/1750 train_time:14702ms step_avg:93.05ms | |
| step:159/1750 train_time:14795ms step_avg:93.05ms | |
| step:160/1750 train_time:14888ms step_avg:93.05ms | |
| step:161/1750 train_time:14982ms step_avg:93.06ms | |
| step:162/1750 train_time:15076ms step_avg:93.06ms | |
| step:163/1750 train_time:15169ms step_avg:93.06ms | |
| step:164/1750 train_time:15264ms step_avg:93.07ms | |
| step:165/1750 train_time:15356ms step_avg:93.07ms | |
| step:166/1750 train_time:15449ms step_avg:93.07ms | |
| step:167/1750 train_time:15543ms step_avg:93.07ms | |
| step:168/1750 train_time:15636ms step_avg:93.07ms | |
| step:169/1750 train_time:15730ms step_avg:93.07ms | |
| step:170/1750 train_time:15823ms step_avg:93.07ms | |
| step:171/1750 train_time:15917ms step_avg:93.08ms | |
| step:172/1750 train_time:16010ms step_avg:93.08ms | |
| step:173/1750 train_time:16105ms step_avg:93.09ms | |
| step:174/1750 train_time:16199ms step_avg:93.10ms | |
| step:175/1750 train_time:16292ms step_avg:93.09ms | |
| step:176/1750 train_time:16385ms step_avg:93.10ms | |
| step:177/1750 train_time:16478ms step_avg:93.10ms | |
| step:178/1750 train_time:16572ms step_avg:93.10ms | |
| step:179/1750 train_time:16665ms step_avg:93.10ms | |
| step:180/1750 train_time:16759ms step_avg:93.11ms | |
| step:181/1750 train_time:16852ms step_avg:93.11ms | |
| step:182/1750 train_time:16946ms step_avg:93.11ms | |
| step:183/1750 train_time:17040ms step_avg:93.11ms | |
| step:184/1750 train_time:17133ms step_avg:93.11ms | |
| step:185/1750 train_time:17226ms step_avg:93.11ms | |
| step:186/1750 train_time:17319ms step_avg:93.11ms | |
| step:187/1750 train_time:17413ms step_avg:93.12ms | |
| step:188/1750 train_time:17506ms step_avg:93.12ms | |
| step:189/1750 train_time:17599ms step_avg:93.12ms | |
| step:190/1750 train_time:17692ms step_avg:93.11ms | |
| step:191/1750 train_time:17786ms step_avg:93.12ms | |
| step:192/1750 train_time:17879ms step_avg:93.12ms | |
| step:193/1750 train_time:17973ms step_avg:93.12ms | |
| step:194/1750 train_time:18066ms step_avg:93.12ms | |
| step:195/1750 train_time:18159ms step_avg:93.13ms | |
| step:196/1750 train_time:18252ms step_avg:93.12ms | |
| step:197/1750 train_time:18347ms step_avg:93.13ms | |
| step:198/1750 train_time:18442ms step_avg:93.14ms | |
| step:199/1750 train_time:18535ms step_avg:93.14ms | |
| step:200/1750 train_time:18628ms step_avg:93.14ms | |
| step:201/1750 train_time:18721ms step_avg:93.14ms | |
| step:202/1750 train_time:18814ms step_avg:93.14ms | |
| step:203/1750 train_time:18908ms step_avg:93.14ms | |
| step:204/1750 train_time:19002ms step_avg:93.15ms | |
| step:205/1750 train_time:19097ms step_avg:93.16ms | |
| step:206/1750 train_time:19191ms step_avg:93.16ms | |
| step:207/1750 train_time:19285ms step_avg:93.16ms | |
| step:208/1750 train_time:19379ms step_avg:93.17ms | |
| step:209/1750 train_time:19472ms step_avg:93.17ms | |
| step:210/1750 train_time:19566ms step_avg:93.17ms | |
| step:211/1750 train_time:19660ms step_avg:93.18ms | |
| step:212/1750 train_time:19754ms step_avg:93.18ms | |
| step:213/1750 train_time:19848ms step_avg:93.18ms | |
| step:214/1750 train_time:19941ms step_avg:93.18ms | |
| step:215/1750 train_time:20033ms step_avg:93.18ms | |
| step:216/1750 train_time:20127ms step_avg:93.18ms | |
| step:217/1750 train_time:20221ms step_avg:93.18ms | |
| step:218/1750 train_time:20314ms step_avg:93.18ms | |
| step:219/1750 train_time:20407ms step_avg:93.18ms | |
| step:220/1750 train_time:20501ms step_avg:93.19ms | |
| step:221/1750 train_time:20595ms step_avg:93.19ms | |
| step:222/1750 train_time:20688ms step_avg:93.19ms | |
| step:223/1750 train_time:20783ms step_avg:93.20ms | |
| step:224/1750 train_time:20876ms step_avg:93.20ms | |
| step:225/1750 train_time:20970ms step_avg:93.20ms | |
| step:226/1750 train_time:21064ms step_avg:93.20ms | |
| step:227/1750 train_time:21158ms step_avg:93.21ms | |
| step:228/1750 train_time:21252ms step_avg:93.21ms | |
| step:229/1750 train_time:21345ms step_avg:93.21ms | |
| step:230/1750 train_time:21439ms step_avg:93.21ms | |
| step:231/1750 train_time:21532ms step_avg:93.21ms | |
| step:232/1750 train_time:21626ms step_avg:93.21ms | |
| step:233/1750 train_time:21720ms step_avg:93.22ms | |
| step:234/1750 train_time:21814ms step_avg:93.22ms | |
| step:235/1750 train_time:21908ms step_avg:93.22ms | |
| step:236/1750 train_time:22003ms step_avg:93.23ms | |
| step:237/1750 train_time:22097ms step_avg:93.24ms | |
| step:238/1750 train_time:22190ms step_avg:93.24ms | |
| step:239/1750 train_time:22284ms step_avg:93.24ms | |
| step:240/1750 train_time:22377ms step_avg:93.24ms | |
| step:241/1750 train_time:22470ms step_avg:93.24ms | |
| step:242/1750 train_time:22564ms step_avg:93.24ms | |
| step:243/1750 train_time:22658ms step_avg:93.24ms | |
| step:244/1750 train_time:22751ms step_avg:93.24ms | |
| step:245/1750 train_time:22844ms step_avg:93.24ms | |
| step:246/1750 train_time:22940ms step_avg:93.25ms | |
| step:247/1750 train_time:23032ms step_avg:93.25ms | |
| step:248/1750 train_time:23127ms step_avg:93.25ms | |
| step:249/1750 train_time:23221ms step_avg:93.26ms | |
| step:250/1750 train_time:23314ms step_avg:93.26ms | |
| step:250/1750 val_loss:4.0907 train_time:23401ms step_avg:93.60ms | |
| step:251/1750 train_time:23438ms step_avg:93.38ms | |
| step:252/1750 train_time:23514ms step_avg:93.31ms | |
| step:253/1750 train_time:23611ms step_avg:93.32ms | |
| step:254/1750 train_time:23704ms step_avg:93.32ms | |
| step:255/1750 train_time:23797ms step_avg:93.32ms | |
| step:256/1750 train_time:23891ms step_avg:93.32ms | |
| step:257/1750 train_time:23984ms step_avg:93.32ms | |
| step:258/1750 train_time:24075ms step_avg:93.32ms | |
| step:259/1750 train_time:24169ms step_avg:93.32ms | |
| step:260/1750 train_time:24263ms step_avg:93.32ms | |
| step:261/1750 train_time:24356ms step_avg:93.32ms | |
| step:262/1750 train_time:24452ms step_avg:93.33ms | |
| step:263/1750 train_time:24547ms step_avg:93.34ms | |
| step:264/1750 train_time:24644ms step_avg:93.35ms | |
| step:265/1750 train_time:24738ms step_avg:93.35ms | |
| step:266/1750 train_time:24832ms step_avg:93.35ms | |
| step:267/1750 train_time:24927ms step_avg:93.36ms | |
| step:268/1750 train_time:25021ms step_avg:93.36ms | |
| step:269/1750 train_time:25115ms step_avg:93.36ms | |
| step:270/1750 train_time:25208ms step_avg:93.36ms | |
| step:271/1750 train_time:25303ms step_avg:93.37ms | |
| step:272/1750 train_time:25396ms step_avg:93.37ms | |
| step:273/1750 train_time:25491ms step_avg:93.37ms | |
| step:274/1750 train_time:25586ms step_avg:93.38ms | |
| step:275/1750 train_time:25680ms step_avg:93.38ms | |
| step:276/1750 train_time:25774ms step_avg:93.38ms | |
| step:277/1750 train_time:25870ms step_avg:93.39ms | |
| step:278/1750 train_time:25966ms step_avg:93.40ms | |
| step:279/1750 train_time:26059ms step_avg:93.40ms | |
| step:280/1750 train_time:26153ms step_avg:93.40ms | |
| step:281/1750 train_time:26246ms step_avg:93.40ms | |
| step:282/1750 train_time:26339ms step_avg:93.40ms | |
| step:283/1750 train_time:26433ms step_avg:93.40ms | |
| step:284/1750 train_time:26527ms step_avg:93.41ms | |
| step:285/1750 train_time:26623ms step_avg:93.41ms | |
| step:286/1750 train_time:26716ms step_avg:93.41ms | |
| step:287/1750 train_time:26810ms step_avg:93.42ms | |
| step:288/1750 train_time:26904ms step_avg:93.42ms | |
| step:289/1750 train_time:26999ms step_avg:93.42ms | |
| step:290/1750 train_time:27093ms step_avg:93.43ms | |
| step:291/1750 train_time:27187ms step_avg:93.42ms | |
| step:292/1750 train_time:27280ms step_avg:93.43ms | |
| step:293/1750 train_time:27374ms step_avg:93.43ms | |
| step:294/1750 train_time:27468ms step_avg:93.43ms | |
| step:295/1750 train_time:27562ms step_avg:93.43ms | |
| step:296/1750 train_time:27656ms step_avg:93.43ms | |
| step:297/1750 train_time:27751ms step_avg:93.44ms | |
| step:298/1750 train_time:27845ms step_avg:93.44ms | |
| step:299/1750 train_time:27939ms step_avg:93.44ms | |
| step:300/1750 train_time:28033ms step_avg:93.44ms | |
| step:301/1750 train_time:28128ms step_avg:93.45ms | |
| step:302/1750 train_time:28222ms step_avg:93.45ms | |
| step:303/1750 train_time:28315ms step_avg:93.45ms | |
| step:304/1750 train_time:28409ms step_avg:93.45ms | |
| step:305/1750 train_time:28503ms step_avg:93.45ms | |
| step:306/1750 train_time:28596ms step_avg:93.45ms | |
| step:307/1750 train_time:28690ms step_avg:93.45ms | |
| step:308/1750 train_time:28784ms step_avg:93.46ms | |
| step:309/1750 train_time:28878ms step_avg:93.46ms | |
| step:310/1750 train_time:28973ms step_avg:93.46ms | |
| step:311/1750 train_time:29067ms step_avg:93.46ms | |
| step:312/1750 train_time:29163ms step_avg:93.47ms | |
| step:313/1750 train_time:29257ms step_avg:93.47ms | |
| step:314/1750 train_time:29351ms step_avg:93.47ms | |
| step:315/1750 train_time:29445ms step_avg:93.48ms | |
| step:316/1750 train_time:29539ms step_avg:93.48ms | |
| step:317/1750 train_time:29634ms step_avg:93.48ms | |
| step:318/1750 train_time:29728ms step_avg:93.48ms | |
| step:319/1750 train_time:29822ms step_avg:93.49ms | |
| step:320/1750 train_time:29915ms step_avg:93.49ms | |
| step:321/1750 train_time:30010ms step_avg:93.49ms | |
| step:322/1750 train_time:30104ms step_avg:93.49ms | |
| step:323/1750 train_time:30197ms step_avg:93.49ms | |
| step:324/1750 train_time:30291ms step_avg:93.49ms | |
| step:325/1750 train_time:30385ms step_avg:93.49ms | |
| step:326/1750 train_time:30478ms step_avg:93.49ms | |
| step:327/1750 train_time:30572ms step_avg:93.49ms | |
| step:328/1750 train_time:30666ms step_avg:93.49ms | |
| step:329/1750 train_time:30761ms step_avg:93.50ms | |
| step:330/1750 train_time:30855ms step_avg:93.50ms | |
| step:331/1750 train_time:30950ms step_avg:93.50ms | |
| step:332/1750 train_time:31045ms step_avg:93.51ms | |
| step:333/1750 train_time:31139ms step_avg:93.51ms | |
| step:334/1750 train_time:31234ms step_avg:93.51ms | |
| step:335/1750 train_time:31327ms step_avg:93.51ms | |
| step:336/1750 train_time:31421ms step_avg:93.51ms | |
| step:337/1750 train_time:31515ms step_avg:93.52ms | |
| step:338/1750 train_time:31609ms step_avg:93.52ms | |
| step:339/1750 train_time:31702ms step_avg:93.52ms | |
| step:340/1750 train_time:31796ms step_avg:93.52ms | |
| step:341/1750 train_time:31890ms step_avg:93.52ms | |
| step:342/1750 train_time:31985ms step_avg:93.52ms | |
| step:343/1750 train_time:32079ms step_avg:93.52ms | |
| step:344/1750 train_time:32173ms step_avg:93.53ms | |
| step:345/1750 train_time:32267ms step_avg:93.53ms | |
| step:346/1750 train_time:32362ms step_avg:93.53ms | |
| step:347/1750 train_time:32456ms step_avg:93.53ms | |
| step:348/1750 train_time:32550ms step_avg:93.53ms | |
| step:349/1750 train_time:32645ms step_avg:93.54ms | |
| step:350/1750 train_time:32738ms step_avg:93.54ms | |
| step:351/1750 train_time:32831ms step_avg:93.54ms | |
| step:352/1750 train_time:32925ms step_avg:93.54ms | |
| step:353/1750 train_time:33019ms step_avg:93.54ms | |
| step:354/1750 train_time:33114ms step_avg:93.54ms | |
| step:355/1750 train_time:33207ms step_avg:93.54ms | |
| step:356/1750 train_time:33301ms step_avg:93.54ms | |
| step:357/1750 train_time:33396ms step_avg:93.55ms | |
| step:358/1750 train_time:33490ms step_avg:93.55ms | |
| step:359/1750 train_time:33584ms step_avg:93.55ms | |
| step:360/1750 train_time:33678ms step_avg:93.55ms | |
| step:361/1750 train_time:33774ms step_avg:93.56ms | |
| step:362/1750 train_time:33867ms step_avg:93.55ms | |
| step:363/1750 train_time:33961ms step_avg:93.56ms | |
| step:364/1750 train_time:34055ms step_avg:93.56ms | |
| step:365/1750 train_time:34149ms step_avg:93.56ms | |
| step:366/1750 train_time:34244ms step_avg:93.56ms | |
| step:367/1750 train_time:34338ms step_avg:93.56ms | |
| step:368/1750 train_time:34432ms step_avg:93.57ms | |
| step:369/1750 train_time:34526ms step_avg:93.57ms | |
| step:370/1750 train_time:34623ms step_avg:93.57ms | |
| step:371/1750 train_time:34714ms step_avg:93.57ms | |
| step:372/1750 train_time:34808ms step_avg:93.57ms | |
| step:373/1750 train_time:34903ms step_avg:93.57ms | |
| step:374/1750 train_time:34997ms step_avg:93.58ms | |
| step:375/1750 train_time:35090ms step_avg:93.57ms | |
| step:375/1750 val_loss:3.8869 train_time:35178ms step_avg:93.81ms | |
| step:376/1750 train_time:35214ms step_avg:93.65ms | |
| step:377/1750 train_time:35291ms step_avg:93.61ms | |
| step:378/1750 train_time:35385ms step_avg:93.61ms | |
| step:379/1750 train_time:35479ms step_avg:93.61ms | |
| step:380/1750 train_time:35575ms step_avg:93.62ms | |
| step:381/1750 train_time:35668ms step_avg:93.62ms | |
| step:382/1750 train_time:35761ms step_avg:93.62ms | |
| step:383/1750 train_time:35856ms step_avg:93.62ms | |
| step:384/1750 train_time:35948ms step_avg:93.61ms | |
| step:385/1750 train_time:36043ms step_avg:93.62ms | |
| step:386/1750 train_time:36135ms step_avg:93.61ms | |
| step:387/1750 train_time:36228ms step_avg:93.61ms | |
| step:388/1750 train_time:36324ms step_avg:93.62ms | |
| step:389/1750 train_time:36419ms step_avg:93.62ms | |
| step:390/1750 train_time:36514ms step_avg:93.62ms | |
| step:391/1750 train_time:36608ms step_avg:93.63ms | |
| step:392/1750 train_time:36704ms step_avg:93.63ms | |
| step:393/1750 train_time:36800ms step_avg:93.64ms | |
| step:394/1750 train_time:36896ms step_avg:93.65ms | |
| step:395/1750 train_time:36991ms step_avg:93.65ms | |
| step:396/1750 train_time:37086ms step_avg:93.65ms | |
| step:397/1750 train_time:37181ms step_avg:93.66ms | |
| step:398/1750 train_time:37280ms step_avg:93.67ms | |
| step:399/1750 train_time:37375ms step_avg:93.67ms | |
| step:400/1750 train_time:37472ms step_avg:93.68ms | |
| step:401/1750 train_time:37568ms step_avg:93.69ms | |
| step:402/1750 train_time:37663ms step_avg:93.69ms | |
| step:403/1750 train_time:37759ms step_avg:93.70ms | |
| step:404/1750 train_time:37855ms step_avg:93.70ms | |
| step:405/1750 train_time:37951ms step_avg:93.71ms | |
| step:406/1750 train_time:38046ms step_avg:93.71ms | |
| step:407/1750 train_time:38142ms step_avg:93.72ms | |
| step:408/1750 train_time:38238ms step_avg:93.72ms | |
| step:409/1750 train_time:38333ms step_avg:93.72ms | |
| step:410/1750 train_time:38429ms step_avg:93.73ms | |
| step:411/1750 train_time:38526ms step_avg:93.74ms | |
| step:412/1750 train_time:38622ms step_avg:93.74ms | |
| step:413/1750 train_time:38719ms step_avg:93.75ms | |
| step:414/1750 train_time:38814ms step_avg:93.75ms | |
| step:415/1750 train_time:38910ms step_avg:93.76ms | |
| step:416/1750 train_time:39005ms step_avg:93.76ms | |
| step:417/1750 train_time:39101ms step_avg:93.77ms | |
| step:418/1750 train_time:39197ms step_avg:93.77ms | |
| step:419/1750 train_time:39294ms step_avg:93.78ms | |
| step:420/1750 train_time:39390ms step_avg:93.79ms | |
| step:421/1750 train_time:39486ms step_avg:93.79ms | |
| step:422/1750 train_time:39582ms step_avg:93.80ms | |
| step:423/1750 train_time:39678ms step_avg:93.80ms | |
| step:424/1750 train_time:39774ms step_avg:93.81ms | |
| step:425/1750 train_time:39870ms step_avg:93.81ms | |
| step:426/1750 train_time:39966ms step_avg:93.82ms | |
| step:427/1750 train_time:40061ms step_avg:93.82ms | |
| step:428/1750 train_time:40157ms step_avg:93.83ms | |
| step:429/1750 train_time:40253ms step_avg:93.83ms | |
| step:430/1750 train_time:40348ms step_avg:93.83ms | |
| step:431/1750 train_time:40444ms step_avg:93.84ms | |
| step:432/1750 train_time:40541ms step_avg:93.84ms | |
| step:433/1750 train_time:40638ms step_avg:93.85ms | |
| step:434/1750 train_time:40733ms step_avg:93.86ms | |
| step:435/1750 train_time:40829ms step_avg:93.86ms | |
| step:436/1750 train_time:40925ms step_avg:93.87ms | |
| step:437/1750 train_time:41023ms step_avg:93.87ms | |
| step:438/1750 train_time:41119ms step_avg:93.88ms | |
| step:439/1750 train_time:41215ms step_avg:93.88ms | |
| step:440/1750 train_time:41311ms step_avg:93.89ms | |
| step:441/1750 train_time:41406ms step_avg:93.89ms | |
| step:442/1750 train_time:41503ms step_avg:93.90ms | |
| step:443/1750 train_time:41599ms step_avg:93.90ms | |
| step:444/1750 train_time:41695ms step_avg:93.91ms | |
| step:445/1750 train_time:41791ms step_avg:93.91ms | |
| step:446/1750 train_time:41887ms step_avg:93.92ms | |
| step:447/1750 train_time:41983ms step_avg:93.92ms | |
| step:448/1750 train_time:42079ms step_avg:93.93ms | |
| step:449/1750 train_time:42175ms step_avg:93.93ms | |
| step:450/1750 train_time:42270ms step_avg:93.93ms | |
| step:451/1750 train_time:42366ms step_avg:93.94ms | |
| step:452/1750 train_time:42462ms step_avg:93.94ms | |
| step:453/1750 train_time:42558ms step_avg:93.95ms | |
| step:454/1750 train_time:42654ms step_avg:93.95ms | |
| step:455/1750 train_time:42749ms step_avg:93.95ms | |
| step:456/1750 train_time:42845ms step_avg:93.96ms | |
| step:457/1750 train_time:42941ms step_avg:93.96ms | |
| step:458/1750 train_time:43037ms step_avg:93.97ms | |
| step:459/1750 train_time:43134ms step_avg:93.97ms | |
| step:460/1750 train_time:43230ms step_avg:93.98ms | |
| step:461/1750 train_time:43327ms step_avg:93.98ms | |
| step:462/1750 train_time:43423ms step_avg:93.99ms | |
| step:463/1750 train_time:43519ms step_avg:93.99ms | |
| step:464/1750 train_time:43615ms step_avg:94.00ms | |
| step:465/1750 train_time:43709ms step_avg:94.00ms | |
| step:466/1750 train_time:43805ms step_avg:94.00ms | |
| step:467/1750 train_time:43902ms step_avg:94.01ms | |
| step:468/1750 train_time:43998ms step_avg:94.01ms | |
| step:469/1750 train_time:44095ms step_avg:94.02ms | |
| step:470/1750 train_time:44192ms step_avg:94.03ms | |
| step:471/1750 train_time:44288ms step_avg:94.03ms | |
| step:472/1750 train_time:44384ms step_avg:94.03ms | |
| step:473/1750 train_time:44483ms step_avg:94.04ms | |
| step:474/1750 train_time:44576ms step_avg:94.04ms | |
| step:475/1750 train_time:44671ms step_avg:94.05ms | |
| step:476/1750 train_time:44767ms step_avg:94.05ms | |
| step:477/1750 train_time:44863ms step_avg:94.05ms | |
| step:478/1750 train_time:44959ms step_avg:94.06ms | |
| step:479/1750 train_time:45056ms step_avg:94.06ms | |
| step:480/1750 train_time:45152ms step_avg:94.07ms | |
| step:481/1750 train_time:45249ms step_avg:94.07ms | |
| step:482/1750 train_time:45344ms step_avg:94.07ms | |
| step:483/1750 train_time:45440ms step_avg:94.08ms | |
| step:484/1750 train_time:45536ms step_avg:94.08ms | |
| step:485/1750 train_time:45632ms step_avg:94.09ms | |
| step:486/1750 train_time:45728ms step_avg:94.09ms | |
| step:487/1750 train_time:45823ms step_avg:94.09ms | |
| step:488/1750 train_time:45920ms step_avg:94.10ms | |
| step:489/1750 train_time:46016ms step_avg:94.10ms | |
| step:490/1750 train_time:46111ms step_avg:94.11ms | |
| step:491/1750 train_time:46208ms step_avg:94.11ms | |
| step:492/1750 train_time:46304ms step_avg:94.11ms | |
| step:493/1750 train_time:46401ms step_avg:94.12ms | |
| step:494/1750 train_time:46497ms step_avg:94.12ms | |
| step:495/1750 train_time:46594ms step_avg:94.13ms | |
| step:496/1750 train_time:46691ms step_avg:94.14ms | |
| step:497/1750 train_time:46785ms step_avg:94.14ms | |
| step:498/1750 train_time:46882ms step_avg:94.14ms | |
| step:499/1750 train_time:46978ms step_avg:94.15ms | |
| step:500/1750 train_time:47075ms step_avg:94.15ms | |
| step:500/1750 val_loss:3.7387 train_time:47164ms step_avg:94.33ms | |
| step:501/1750 train_time:47200ms step_avg:94.21ms | |
| step:502/1750 train_time:47277ms step_avg:94.18ms | |
| step:503/1750 train_time:47375ms step_avg:94.19ms | |
| step:504/1750 train_time:47471ms step_avg:94.19ms | |
| step:505/1750 train_time:47566ms step_avg:94.19ms | |
| step:506/1750 train_time:47661ms step_avg:94.19ms | |
| step:507/1750 train_time:47757ms step_avg:94.20ms | |
| step:508/1750 train_time:47851ms step_avg:94.20ms | |
| step:509/1750 train_time:47947ms step_avg:94.20ms | |
| step:510/1750 train_time:48041ms step_avg:94.20ms | |
| step:511/1750 train_time:48137ms step_avg:94.20ms | |
| step:512/1750 train_time:48236ms step_avg:94.21ms | |
| step:513/1750 train_time:48330ms step_avg:94.21ms | |
| step:514/1750 train_time:48425ms step_avg:94.21ms | |
| step:515/1750 train_time:48522ms step_avg:94.22ms | |
| step:516/1750 train_time:48619ms step_avg:94.22ms | |
| step:517/1750 train_time:48715ms step_avg:94.23ms | |
| step:518/1750 train_time:48810ms step_avg:94.23ms | |
| step:519/1750 train_time:48904ms step_avg:94.23ms | |
| step:520/1750 train_time:49000ms step_avg:94.23ms | |
| step:521/1750 train_time:49097ms step_avg:94.24ms | |
| step:522/1750 train_time:49193ms step_avg:94.24ms | |
| step:523/1750 train_time:49289ms step_avg:94.24ms | |
| step:524/1750 train_time:49385ms step_avg:94.25ms | |
| step:525/1750 train_time:49481ms step_avg:94.25ms | |
| step:526/1750 train_time:49578ms step_avg:94.26ms | |
| step:527/1750 train_time:49674ms step_avg:94.26ms | |
| step:528/1750 train_time:49769ms step_avg:94.26ms | |
| step:529/1750 train_time:49864ms step_avg:94.26ms | |
| step:530/1750 train_time:49960ms step_avg:94.26ms | |
| step:531/1750 train_time:50055ms step_avg:94.27ms | |
| step:532/1750 train_time:50151ms step_avg:94.27ms | |
| step:533/1750 train_time:50248ms step_avg:94.27ms | |
| step:534/1750 train_time:50344ms step_avg:94.28ms | |
| step:535/1750 train_time:50440ms step_avg:94.28ms | |
| step:536/1750 train_time:50537ms step_avg:94.29ms | |
| step:537/1750 train_time:50633ms step_avg:94.29ms | |
| step:538/1750 train_time:50729ms step_avg:94.29ms | |
| step:539/1750 train_time:50825ms step_avg:94.29ms | |
| step:540/1750 train_time:50921ms step_avg:94.30ms | |
| step:541/1750 train_time:51016ms step_avg:94.30ms | |
| step:542/1750 train_time:51113ms step_avg:94.30ms | |
| step:543/1750 train_time:51209ms step_avg:94.31ms | |
| step:544/1750 train_time:51304ms step_avg:94.31ms | |
| step:545/1750 train_time:51400ms step_avg:94.31ms | |
| step:546/1750 train_time:51497ms step_avg:94.32ms | |
| step:547/1750 train_time:51592ms step_avg:94.32ms | |
| step:548/1750 train_time:51688ms step_avg:94.32ms | |
| step:549/1750 train_time:51784ms step_avg:94.32ms | |
| step:550/1750 train_time:51880ms step_avg:94.33ms | |
| step:551/1750 train_time:51975ms step_avg:94.33ms | |
| step:552/1750 train_time:52071ms step_avg:94.33ms | |
| step:553/1750 train_time:52166ms step_avg:94.33ms | |
| step:554/1750 train_time:52263ms step_avg:94.34ms | |
| step:555/1750 train_time:52359ms step_avg:94.34ms | |
| step:556/1750 train_time:52456ms step_avg:94.34ms | |
| step:557/1750 train_time:52552ms step_avg:94.35ms | |
| step:558/1750 train_time:52648ms step_avg:94.35ms | |
| step:559/1750 train_time:52744ms step_avg:94.35ms | |
| step:560/1750 train_time:52840ms step_avg:94.36ms | |
| step:561/1750 train_time:52935ms step_avg:94.36ms | |
| step:562/1750 train_time:53032ms step_avg:94.36ms | |
| step:563/1750 train_time:53127ms step_avg:94.36ms | |
| step:564/1750 train_time:53222ms step_avg:94.36ms | |
| step:565/1750 train_time:53318ms step_avg:94.37ms | |
| step:566/1750 train_time:53415ms step_avg:94.37ms | |
| step:567/1750 train_time:53510ms step_avg:94.37ms | |
| step:568/1750 train_time:53607ms step_avg:94.38ms | |
| step:569/1750 train_time:53702ms step_avg:94.38ms | |
| step:570/1750 train_time:53799ms step_avg:94.38ms | |
| step:571/1750 train_time:53895ms step_avg:94.39ms | |
| step:572/1750 train_time:53991ms step_avg:94.39ms | |
| step:573/1750 train_time:54087ms step_avg:94.39ms | |
| step:574/1750 train_time:54183ms step_avg:94.40ms | |
| step:575/1750 train_time:54279ms step_avg:94.40ms | |
| step:576/1750 train_time:54376ms step_avg:94.40ms | |
| step:577/1750 train_time:54472ms step_avg:94.41ms | |
| step:578/1750 train_time:54567ms step_avg:94.41ms | |
| step:579/1750 train_time:54663ms step_avg:94.41ms | |
| step:580/1750 train_time:54760ms step_avg:94.41ms | |
| step:581/1750 train_time:54856ms step_avg:94.42ms | |
| step:582/1750 train_time:54952ms step_avg:94.42ms | |
| step:583/1750 train_time:55048ms step_avg:94.42ms | |
| step:584/1750 train_time:55144ms step_avg:94.43ms | |
| step:585/1750 train_time:55240ms step_avg:94.43ms | |
| step:586/1750 train_time:55338ms step_avg:94.43ms | |
| step:587/1750 train_time:55432ms step_avg:94.43ms | |
| step:588/1750 train_time:55527ms step_avg:94.43ms | |
| step:589/1750 train_time:55623ms step_avg:94.44ms | |
| step:590/1750 train_time:55718ms step_avg:94.44ms | |
| step:591/1750 train_time:55815ms step_avg:94.44ms | |
| step:592/1750 train_time:55910ms step_avg:94.44ms | |
| step:593/1750 train_time:56007ms step_avg:94.45ms | |
| step:594/1750 train_time:56103ms step_avg:94.45ms | |
| step:595/1750 train_time:56201ms step_avg:94.46ms | |
| step:596/1750 train_time:56296ms step_avg:94.46ms | |
| step:597/1750 train_time:56392ms step_avg:94.46ms | |
| step:598/1750 train_time:56488ms step_avg:94.46ms | |
| step:599/1750 train_time:56583ms step_avg:94.46ms | |
| step:600/1750 train_time:56679ms step_avg:94.47ms | |
| step:601/1750 train_time:56775ms step_avg:94.47ms | |
| step:602/1750 train_time:56870ms step_avg:94.47ms | |
| step:603/1750 train_time:56966ms step_avg:94.47ms | |
| step:604/1750 train_time:57062ms step_avg:94.47ms | |
| step:605/1750 train_time:57158ms step_avg:94.48ms | |
| step:606/1750 train_time:57254ms step_avg:94.48ms | |
| step:607/1750 train_time:57350ms step_avg:94.48ms | |
| step:608/1750 train_time:57446ms step_avg:94.48ms | |
| step:609/1750 train_time:57542ms step_avg:94.49ms | |
| step:610/1750 train_time:57638ms step_avg:94.49ms | |
| step:611/1750 train_time:57733ms step_avg:94.49ms | |
| step:612/1750 train_time:57829ms step_avg:94.49ms | |
| step:613/1750 train_time:57925ms step_avg:94.49ms | |
| step:614/1750 train_time:58021ms step_avg:94.50ms | |
| step:615/1750 train_time:58117ms step_avg:94.50ms | |
| step:616/1750 train_time:58214ms step_avg:94.50ms | |
| step:617/1750 train_time:58309ms step_avg:94.50ms | |
| step:618/1750 train_time:58404ms step_avg:94.51ms | |
| step:619/1750 train_time:58501ms step_avg:94.51ms | |
| step:620/1750 train_time:58597ms step_avg:94.51ms | |
| step:621/1750 train_time:58693ms step_avg:94.51ms | |
| step:622/1750 train_time:58788ms step_avg:94.52ms | |
| step:623/1750 train_time:58884ms step_avg:94.52ms | |
| step:624/1750 train_time:58980ms step_avg:94.52ms | |
| step:625/1750 train_time:59076ms step_avg:94.52ms | |
| step:625/1750 val_loss:3.6554 train_time:59165ms step_avg:94.66ms | |
| step:626/1750 train_time:59202ms step_avg:94.57ms | |
| step:627/1750 train_time:59278ms step_avg:94.54ms | |
| step:628/1750 train_time:59376ms step_avg:94.55ms | |
| step:629/1750 train_time:59473ms step_avg:94.55ms | |
| step:630/1750 train_time:59569ms step_avg:94.55ms | |
| step:631/1750 train_time:59665ms step_avg:94.56ms | |
| step:632/1750 train_time:59760ms step_avg:94.56ms | |
| step:633/1750 train_time:59855ms step_avg:94.56ms | |
| step:634/1750 train_time:59951ms step_avg:94.56ms | |
| step:635/1750 train_time:60047ms step_avg:94.56ms | |
| step:636/1750 train_time:60142ms step_avg:94.56ms | |
| step:637/1750 train_time:60239ms step_avg:94.57ms | |
| step:638/1750 train_time:60336ms step_avg:94.57ms | |
| step:639/1750 train_time:60433ms step_avg:94.57ms | |
| step:640/1750 train_time:60529ms step_avg:94.58ms | |
| step:641/1750 train_time:60626ms step_avg:94.58ms | |
| step:642/1750 train_time:60721ms step_avg:94.58ms | |
| step:643/1750 train_time:60817ms step_avg:94.58ms | |
| step:644/1750 train_time:60913ms step_avg:94.59ms | |
| step:645/1750 train_time:61008ms step_avg:94.59ms | |
| step:646/1750 train_time:61105ms step_avg:94.59ms | |
| step:647/1750 train_time:61201ms step_avg:94.59ms | |
| step:648/1750 train_time:61297ms step_avg:94.59ms | |
| step:649/1750 train_time:61393ms step_avg:94.60ms | |
| step:650/1750 train_time:61490ms step_avg:94.60ms | |
| step:651/1750 train_time:61588ms step_avg:94.61ms | |
| step:652/1750 train_time:61686ms step_avg:94.61ms | |
| step:653/1750 train_time:61783ms step_avg:94.61ms | |
| step:654/1750 train_time:61880ms step_avg:94.62ms | |
| step:655/1750 train_time:61977ms step_avg:94.62ms | |
| step:656/1750 train_time:62074ms step_avg:94.63ms | |
| step:657/1750 train_time:62173ms step_avg:94.63ms | |
| step:658/1750 train_time:62271ms step_avg:94.64ms | |
| step:659/1750 train_time:62370ms step_avg:94.64ms | |
| step:660/1750 train_time:62469ms step_avg:94.65ms | |
| step:661/1750 train_time:62567ms step_avg:94.65ms | |
| step:662/1750 train_time:62664ms step_avg:94.66ms | |
| step:663/1750 train_time:62761ms step_avg:94.66ms | |
| step:664/1750 train_time:62858ms step_avg:94.67ms | |
| step:665/1750 train_time:62955ms step_avg:94.67ms | |
| step:666/1750 train_time:63052ms step_avg:94.67ms | |
| step:667/1750 train_time:63150ms step_avg:94.68ms | |
| step:668/1750 train_time:63247ms step_avg:94.68ms | |
| step:669/1750 train_time:63345ms step_avg:94.69ms | |
| step:670/1750 train_time:63441ms step_avg:94.69ms | |
| step:671/1750 train_time:63538ms step_avg:94.69ms | |
| step:672/1750 train_time:63635ms step_avg:94.70ms | |
| step:673/1750 train_time:63734ms step_avg:94.70ms | |
| step:674/1750 train_time:63832ms step_avg:94.71ms | |
| step:675/1750 train_time:63930ms step_avg:94.71ms | |
| step:676/1750 train_time:64027ms step_avg:94.71ms | |
| step:677/1750 train_time:64124ms step_avg:94.72ms | |
| step:678/1750 train_time:64222ms step_avg:94.72ms | |
| step:679/1750 train_time:64319ms step_avg:94.73ms | |
| step:680/1750 train_time:64416ms step_avg:94.73ms | |
| step:681/1750 train_time:64513ms step_avg:94.73ms | |
| step:682/1750 train_time:64612ms step_avg:94.74ms | |
| step:683/1750 train_time:64710ms step_avg:94.74ms | |
| step:684/1750 train_time:64808ms step_avg:94.75ms | |
| step:685/1750 train_time:64906ms step_avg:94.75ms | |
| step:686/1750 train_time:65003ms step_avg:94.76ms | |
| step:687/1750 train_time:65100ms step_avg:94.76ms | |
| step:688/1750 train_time:65196ms step_avg:94.76ms | |
| step:689/1750 train_time:65295ms step_avg:94.77ms | |
| step:690/1750 train_time:65393ms step_avg:94.77ms | |
| step:691/1750 train_time:65490ms step_avg:94.78ms | |
| step:692/1750 train_time:65588ms step_avg:94.78ms | |
| step:693/1750 train_time:65685ms step_avg:94.78ms | |
| step:694/1750 train_time:65782ms step_avg:94.79ms | |
| step:695/1750 train_time:65880ms step_avg:94.79ms | |
| step:696/1750 train_time:65977ms step_avg:94.79ms | |
| step:697/1750 train_time:66075ms step_avg:94.80ms | |
| step:698/1750 train_time:66172ms step_avg:94.80ms | |
| step:699/1750 train_time:66271ms step_avg:94.81ms | |
| step:700/1750 train_time:66369ms step_avg:94.81ms | |
| step:701/1750 train_time:66467ms step_avg:94.82ms | |
| step:702/1750 train_time:66564ms step_avg:94.82ms | |
| step:703/1750 train_time:66660ms step_avg:94.82ms | |
| step:704/1750 train_time:66757ms step_avg:94.83ms | |
| step:705/1750 train_time:66855ms step_avg:94.83ms | |
| step:706/1750 train_time:66953ms step_avg:94.83ms | |
| step:707/1750 train_time:67051ms step_avg:94.84ms | |
| step:708/1750 train_time:67149ms step_avg:94.84ms | |
| step:709/1750 train_time:67246ms step_avg:94.85ms | |
| step:710/1750 train_time:67343ms step_avg:94.85ms | |
| step:711/1750 train_time:67440ms step_avg:94.85ms | |
| step:712/1750 train_time:67537ms step_avg:94.86ms | |
| step:713/1750 train_time:67634ms step_avg:94.86ms | |
| step:714/1750 train_time:67732ms step_avg:94.86ms | |
| step:715/1750 train_time:67829ms step_avg:94.87ms | |
| step:716/1750 train_time:67928ms step_avg:94.87ms | |
| step:717/1750 train_time:68026ms step_avg:94.88ms | |
| step:718/1750 train_time:68123ms step_avg:94.88ms | |
| step:719/1750 train_time:68220ms step_avg:94.88ms | |
| step:720/1750 train_time:68317ms step_avg:94.88ms | |
| step:721/1750 train_time:68415ms step_avg:94.89ms | |
| step:722/1750 train_time:68513ms step_avg:94.89ms | |
| step:723/1750 train_time:68610ms step_avg:94.90ms | |
| step:724/1750 train_time:68708ms step_avg:94.90ms | |
| step:725/1750 train_time:68805ms step_avg:94.90ms | |
| step:726/1750 train_time:68903ms step_avg:94.91ms | |
| step:727/1750 train_time:69000ms step_avg:94.91ms | |
| step:728/1750 train_time:69098ms step_avg:94.91ms | |
| step:729/1750 train_time:69195ms step_avg:94.92ms | |
| step:730/1750 train_time:69293ms step_avg:94.92ms | |
| step:731/1750 train_time:69391ms step_avg:94.93ms | |
| step:732/1750 train_time:69488ms step_avg:94.93ms | |
| step:733/1750 train_time:69585ms step_avg:94.93ms | |
| step:734/1750 train_time:69682ms step_avg:94.93ms | |
| step:735/1750 train_time:69779ms step_avg:94.94ms | |
| step:736/1750 train_time:69877ms step_avg:94.94ms | |
| step:737/1750 train_time:69975ms step_avg:94.95ms | |
| step:738/1750 train_time:70073ms step_avg:94.95ms | |
| step:739/1750 train_time:70171ms step_avg:94.95ms | |
| step:740/1750 train_time:70269ms step_avg:94.96ms | |
| step:741/1750 train_time:70367ms step_avg:94.96ms | |
| step:742/1750 train_time:70464ms step_avg:94.97ms | |
| step:743/1750 train_time:70561ms step_avg:94.97ms | |
| step:744/1750 train_time:70659ms step_avg:94.97ms | |
| step:745/1750 train_time:70756ms step_avg:94.97ms | |
| step:746/1750 train_time:70853ms step_avg:94.98ms | |
| step:747/1750 train_time:70950ms step_avg:94.98ms | |
| step:748/1750 train_time:71048ms step_avg:94.98ms | |
| step:749/1750 train_time:71146ms step_avg:94.99ms | |
| step:750/1750 train_time:71242ms step_avg:94.99ms | |
| step:750/1750 val_loss:3.5917 train_time:71332ms step_avg:95.11ms | |
| step:751/1750 train_time:71369ms step_avg:95.03ms | |
| step:752/1750 train_time:71446ms step_avg:95.01ms | |
| step:753/1750 train_time:71546ms step_avg:95.01ms | |
| step:754/1750 train_time:71643ms step_avg:95.02ms | |
| step:755/1750 train_time:71740ms step_avg:95.02ms | |
| step:756/1750 train_time:71837ms step_avg:95.02ms | |
| step:757/1750 train_time:71934ms step_avg:95.03ms | |
| step:758/1750 train_time:72032ms step_avg:95.03ms | |
| step:759/1750 train_time:72129ms step_avg:95.03ms | |
| step:760/1750 train_time:72225ms step_avg:95.03ms | |
| step:761/1750 train_time:72322ms step_avg:95.04ms | |
| step:762/1750 train_time:72420ms step_avg:95.04ms | |
| step:763/1750 train_time:72519ms step_avg:95.04ms | |
| step:764/1750 train_time:72617ms step_avg:95.05ms | |
| step:765/1750 train_time:72715ms step_avg:95.05ms | |
| step:766/1750 train_time:72813ms step_avg:95.06ms | |
| step:767/1750 train_time:72911ms step_avg:95.06ms | |
| step:768/1750 train_time:73008ms step_avg:95.06ms | |
| step:769/1750 train_time:73105ms step_avg:95.06ms | |
| step:770/1750 train_time:73201ms step_avg:95.07ms | |
| step:771/1750 train_time:73299ms step_avg:95.07ms | |
| step:772/1750 train_time:73397ms step_avg:95.07ms | |
| step:773/1750 train_time:73495ms step_avg:95.08ms | |
| step:774/1750 train_time:73594ms step_avg:95.08ms | |
| step:775/1750 train_time:73692ms step_avg:95.09ms | |
| step:776/1750 train_time:73790ms step_avg:95.09ms | |
| step:777/1750 train_time:73888ms step_avg:95.09ms | |
| step:778/1750 train_time:73985ms step_avg:95.10ms | |
| step:779/1750 train_time:74082ms step_avg:95.10ms | |
| step:780/1750 train_time:74179ms step_avg:95.10ms | |
| step:781/1750 train_time:74277ms step_avg:95.10ms | |
| step:782/1750 train_time:74374ms step_avg:95.11ms | |
| step:783/1750 train_time:74472ms step_avg:95.11ms | |
| step:784/1750 train_time:74570ms step_avg:95.11ms | |
| step:785/1750 train_time:74668ms step_avg:95.12ms | |
| step:786/1750 train_time:74765ms step_avg:95.12ms | |
| step:787/1750 train_time:74863ms step_avg:95.12ms | |
| step:788/1750 train_time:74961ms step_avg:95.13ms | |
| step:789/1750 train_time:75059ms step_avg:95.13ms | |
| step:790/1750 train_time:75157ms step_avg:95.14ms | |
| step:791/1750 train_time:75255ms step_avg:95.14ms | |
| step:792/1750 train_time:75354ms step_avg:95.14ms | |
| step:793/1750 train_time:75452ms step_avg:95.15ms | |
| step:794/1750 train_time:75550ms step_avg:95.15ms | |
| step:795/1750 train_time:75648ms step_avg:95.16ms | |
| step:796/1750 train_time:75746ms step_avg:95.16ms | |
| step:797/1750 train_time:75844ms step_avg:95.16ms | |
| step:798/1750 train_time:75941ms step_avg:95.16ms | |
| step:799/1750 train_time:76038ms step_avg:95.17ms | |
| step:800/1750 train_time:76136ms step_avg:95.17ms | |
| step:801/1750 train_time:76234ms step_avg:95.17ms | |
| step:802/1750 train_time:76332ms step_avg:95.18ms | |
| step:803/1750 train_time:76430ms step_avg:95.18ms | |
| step:804/1750 train_time:76528ms step_avg:95.18ms | |
| step:805/1750 train_time:76626ms step_avg:95.19ms | |
| step:806/1750 train_time:76724ms step_avg:95.19ms | |
| step:807/1750 train_time:76821ms step_avg:95.19ms | |
| step:808/1750 train_time:76919ms step_avg:95.20ms | |
| step:809/1750 train_time:77017ms step_avg:95.20ms | |
| step:810/1750 train_time:77114ms step_avg:95.20ms | |
| step:811/1750 train_time:77212ms step_avg:95.21ms | |
| step:812/1750 train_time:77309ms step_avg:95.21ms | |
| step:813/1750 train_time:77406ms step_avg:95.21ms | |
| step:814/1750 train_time:77504ms step_avg:95.21ms | |
| step:815/1750 train_time:77602ms step_avg:95.22ms | |
| step:816/1750 train_time:77700ms step_avg:95.22ms | |
| step:817/1750 train_time:77798ms step_avg:95.22ms | |
| step:818/1750 train_time:77896ms step_avg:95.23ms | |
| step:819/1750 train_time:77994ms step_avg:95.23ms | |
| step:820/1750 train_time:78092ms step_avg:95.23ms | |
| step:821/1750 train_time:78190ms step_avg:95.24ms | |
| step:822/1750 train_time:78288ms step_avg:95.24ms | |
| step:823/1750 train_time:78385ms step_avg:95.24ms | |
| step:824/1750 train_time:78483ms step_avg:95.25ms | |
| step:825/1750 train_time:78580ms step_avg:95.25ms | |
| step:826/1750 train_time:78678ms step_avg:95.25ms | |
| step:827/1750 train_time:78777ms step_avg:95.26ms | |
| step:828/1750 train_time:78876ms step_avg:95.26ms | |
| step:829/1750 train_time:78975ms step_avg:95.26ms | |
| step:830/1750 train_time:79073ms step_avg:95.27ms | |
| step:831/1750 train_time:79171ms step_avg:95.27ms | |
| step:832/1750 train_time:79269ms step_avg:95.28ms | |
| step:833/1750 train_time:79367ms step_avg:95.28ms | |
| step:834/1750 train_time:79465ms step_avg:95.28ms | |
| step:835/1750 train_time:79562ms step_avg:95.28ms | |
| step:836/1750 train_time:79660ms step_avg:95.29ms | |
| step:837/1750 train_time:79757ms step_avg:95.29ms | |
| step:838/1750 train_time:79856ms step_avg:95.29ms | |
| step:839/1750 train_time:79954ms step_avg:95.30ms | |
| step:840/1750 train_time:80052ms step_avg:95.30ms | |
| step:841/1750 train_time:80149ms step_avg:95.30ms | |
| step:842/1750 train_time:80247ms step_avg:95.30ms | |
| step:843/1750 train_time:80344ms step_avg:95.31ms | |
| step:844/1750 train_time:80442ms step_avg:95.31ms | |
| step:845/1750 train_time:80540ms step_avg:95.31ms | |
| step:846/1750 train_time:80638ms step_avg:95.32ms | |
| step:847/1750 train_time:80737ms step_avg:95.32ms | |
| step:848/1750 train_time:80834ms step_avg:95.32ms | |
| step:849/1750 train_time:80932ms step_avg:95.33ms | |
| step:850/1750 train_time:81029ms step_avg:95.33ms | |
| step:851/1750 train_time:81126ms step_avg:95.33ms | |
| step:852/1750 train_time:81224ms step_avg:95.33ms | |
| step:853/1750 train_time:81322ms step_avg:95.34ms | |
| step:854/1750 train_time:81420ms step_avg:95.34ms | |
| step:855/1750 train_time:81517ms step_avg:95.34ms | |
| step:856/1750 train_time:81616ms step_avg:95.35ms | |
| step:857/1750 train_time:81714ms step_avg:95.35ms | |
| step:858/1750 train_time:81812ms step_avg:95.35ms | |
| step:859/1750 train_time:81910ms step_avg:95.36ms | |
| step:860/1750 train_time:82008ms step_avg:95.36ms | |
| step:861/1750 train_time:82106ms step_avg:95.36ms | |
| step:862/1750 train_time:82204ms step_avg:95.36ms | |
| step:863/1750 train_time:82302ms step_avg:95.37ms | |
| step:864/1750 train_time:82399ms step_avg:95.37ms | |
| step:865/1750 train_time:82497ms step_avg:95.37ms | |
| step:866/1750 train_time:82595ms step_avg:95.38ms | |
| step:867/1750 train_time:82693ms step_avg:95.38ms | |
| step:868/1750 train_time:82790ms step_avg:95.38ms | |
| step:869/1750 train_time:82887ms step_avg:95.38ms | |
| step:870/1750 train_time:82985ms step_avg:95.38ms | |
| step:871/1750 train_time:83082ms step_avg:95.39ms | |
| step:872/1750 train_time:83181ms step_avg:95.39ms | |
| step:873/1750 train_time:83278ms step_avg:95.39ms | |
| step:874/1750 train_time:83375ms step_avg:95.40ms | |
| step:875/1750 train_time:83474ms step_avg:95.40ms | |
| step:875/1750 val_loss:3.5474 train_time:83565ms step_avg:95.50ms | |
| step:876/1750 train_time:83601ms step_avg:95.44ms | |
| step:877/1750 train_time:83685ms step_avg:95.42ms | |
| step:878/1750 train_time:83784ms step_avg:95.43ms | |
| step:879/1750 train_time:83882ms step_avg:95.43ms | |
| step:880/1750 train_time:83979ms step_avg:95.43ms | |
| step:881/1750 train_time:84076ms step_avg:95.43ms | |
| step:882/1750 train_time:84173ms step_avg:95.43ms | |
| step:883/1750 train_time:84271ms step_avg:95.44ms | |
| step:884/1750 train_time:84368ms step_avg:95.44ms | |
| step:885/1750 train_time:84465ms step_avg:95.44ms | |
| step:886/1750 train_time:84564ms step_avg:95.44ms | |
| step:887/1750 train_time:84663ms step_avg:95.45ms | |
| step:888/1750 train_time:84761ms step_avg:95.45ms | |
| step:889/1750 train_time:84860ms step_avg:95.46ms | |
| step:890/1750 train_time:84958ms step_avg:95.46ms | |
| step:891/1750 train_time:85056ms step_avg:95.46ms | |
| step:892/1750 train_time:85154ms step_avg:95.46ms | |
| step:893/1750 train_time:85251ms step_avg:95.47ms | |
| step:894/1750 train_time:85348ms step_avg:95.47ms | |
| step:895/1750 train_time:85445ms step_avg:95.47ms | |
| step:896/1750 train_time:85542ms step_avg:95.47ms | |
| step:897/1750 train_time:85640ms step_avg:95.47ms | |
| step:898/1750 train_time:85739ms step_avg:95.48ms | |
| step:899/1750 train_time:85838ms step_avg:95.48ms | |
| step:900/1750 train_time:85936ms step_avg:95.48ms | |
| step:901/1750 train_time:86035ms step_avg:95.49ms | |
| step:902/1750 train_time:86133ms step_avg:95.49ms | |
| step:903/1750 train_time:86230ms step_avg:95.49ms | |
| step:904/1750 train_time:86328ms step_avg:95.50ms | |
| step:905/1750 train_time:86425ms step_avg:95.50ms | |
| step:906/1750 train_time:86522ms step_avg:95.50ms | |
| step:907/1750 train_time:86620ms step_avg:95.50ms | |
| step:908/1750 train_time:86718ms step_avg:95.50ms | |
| step:909/1750 train_time:86817ms step_avg:95.51ms | |
| step:910/1750 train_time:86918ms step_avg:95.51ms | |
| step:911/1750 train_time:87017ms step_avg:95.52ms | |
| step:912/1750 train_time:87116ms step_avg:95.52ms | |
| step:913/1750 train_time:87215ms step_avg:95.53ms | |
| step:914/1750 train_time:87315ms step_avg:95.53ms | |
| step:915/1750 train_time:87414ms step_avg:95.53ms | |
| step:916/1750 train_time:87515ms step_avg:95.54ms | |
| step:917/1750 train_time:87615ms step_avg:95.55ms | |
| step:918/1750 train_time:87716ms step_avg:95.55ms | |
| step:919/1750 train_time:87816ms step_avg:95.56ms | |
| step:920/1750 train_time:87916ms step_avg:95.56ms | |
| step:921/1750 train_time:88015ms step_avg:95.56ms | |
| step:922/1750 train_time:88115ms step_avg:95.57ms | |
| step:923/1750 train_time:88215ms step_avg:95.57ms | |
| step:924/1750 train_time:88315ms step_avg:95.58ms | |
| step:925/1750 train_time:88415ms step_avg:95.58ms | |
| step:926/1750 train_time:88513ms step_avg:95.59ms | |
| step:927/1750 train_time:88613ms step_avg:95.59ms | |
| step:928/1750 train_time:88712ms step_avg:95.59ms | |
| step:929/1750 train_time:88813ms step_avg:95.60ms | |
| step:930/1750 train_time:88912ms step_avg:95.60ms | |
| step:931/1750 train_time:89012ms step_avg:95.61ms | |
| step:932/1750 train_time:89111ms step_avg:95.61ms | |
| step:933/1750 train_time:89210ms step_avg:95.62ms | |
| step:934/1750 train_time:89309ms step_avg:95.62ms | |
| step:935/1750 train_time:89409ms step_avg:95.62ms | |
| step:936/1750 train_time:89508ms step_avg:95.63ms | |
| step:937/1750 train_time:89606ms step_avg:95.63ms | |
| step:938/1750 train_time:89705ms step_avg:95.63ms | |
| step:939/1750 train_time:89803ms step_avg:95.64ms | |
| step:940/1750 train_time:89902ms step_avg:95.64ms | |
| step:941/1750 train_time:90002ms step_avg:95.64ms | |
| step:942/1750 train_time:90100ms step_avg:95.65ms | |
| step:943/1750 train_time:90200ms step_avg:95.65ms | |
| step:944/1750 train_time:90300ms step_avg:95.66ms | |
| step:945/1750 train_time:90401ms step_avg:95.66ms | |
| step:946/1750 train_time:90500ms step_avg:95.67ms | |
| step:947/1750 train_time:90600ms step_avg:95.67ms | |
| step:948/1750 train_time:90700ms step_avg:95.68ms | |
| step:949/1750 train_time:90800ms step_avg:95.68ms | |
| step:950/1750 train_time:90900ms step_avg:95.68ms | |
| step:951/1750 train_time:91000ms step_avg:95.69ms | |
| step:952/1750 train_time:91099ms step_avg:95.69ms | |
| step:953/1750 train_time:91198ms step_avg:95.70ms | |
| step:954/1750 train_time:91297ms step_avg:95.70ms | |
| step:955/1750 train_time:91397ms step_avg:95.70ms | |
| step:956/1750 train_time:91496ms step_avg:95.71ms | |
| step:957/1750 train_time:91596ms step_avg:95.71ms | |
| step:958/1750 train_time:91695ms step_avg:95.71ms | |
| step:959/1750 train_time:91795ms step_avg:95.72ms | |
| step:960/1750 train_time:91895ms step_avg:95.72ms | |
| step:961/1750 train_time:91994ms step_avg:95.73ms | |
| step:962/1750 train_time:92094ms step_avg:95.73ms | |
| step:963/1750 train_time:92193ms step_avg:95.74ms | |
| step:964/1750 train_time:92292ms step_avg:95.74ms | |
| step:965/1750 train_time:92391ms step_avg:95.74ms | |
| step:966/1750 train_time:92490ms step_avg:95.75ms | |
| step:967/1750 train_time:92589ms step_avg:95.75ms | |
| step:968/1750 train_time:92688ms step_avg:95.75ms | |
| step:969/1750 train_time:92787ms step_avg:95.76ms | |
| step:970/1750 train_time:92885ms step_avg:95.76ms | |
| step:971/1750 train_time:92983ms step_avg:95.76ms | |
| step:972/1750 train_time:93082ms step_avg:95.76ms | |
| step:973/1750 train_time:93182ms step_avg:95.77ms | |
| step:974/1750 train_time:93280ms step_avg:95.77ms | |
| step:975/1750 train_time:93380ms step_avg:95.77ms | |
| step:976/1750 train_time:93478ms step_avg:95.78ms | |
| step:977/1750 train_time:93578ms step_avg:95.78ms | |
| step:978/1750 train_time:93678ms step_avg:95.79ms | |
| step:979/1750 train_time:93778ms step_avg:95.79ms | |
| step:980/1750 train_time:93879ms step_avg:95.79ms | |
| step:981/1750 train_time:93978ms step_avg:95.80ms | |
| step:982/1750 train_time:94077ms step_avg:95.80ms | |
| step:983/1750 train_time:94177ms step_avg:95.81ms | |
| step:984/1750 train_time:94276ms step_avg:95.81ms | |
| step:985/1750 train_time:94375ms step_avg:95.81ms | |
| step:986/1750 train_time:94475ms step_avg:95.82ms | |
| step:987/1750 train_time:94575ms step_avg:95.82ms | |
| step:988/1750 train_time:94675ms step_avg:95.83ms | |
| step:989/1750 train_time:94775ms step_avg:95.83ms | |
| step:990/1750 train_time:94876ms step_avg:95.83ms | |
| step:991/1750 train_time:94977ms step_avg:95.84ms | |
| step:992/1750 train_time:95076ms step_avg:95.84ms | |
| step:993/1750 train_time:95175ms step_avg:95.85ms | |
| step:994/1750 train_time:95274ms step_avg:95.85ms | |
| step:995/1750 train_time:95374ms step_avg:95.85ms | |
| step:996/1750 train_time:95476ms step_avg:95.86ms | |
| step:997/1750 train_time:95574ms step_avg:95.86ms | |
| step:998/1750 train_time:95674ms step_avg:95.87ms | |
| step:999/1750 train_time:95773ms step_avg:95.87ms | |
| step:1000/1750 train_time:95873ms step_avg:95.87ms | |
| step:1000/1750 val_loss:3.5035 train_time:95967ms step_avg:95.97ms | |
| step:1001/1750 train_time:96003ms step_avg:95.91ms | |
| step:1002/1750 train_time:96087ms step_avg:95.89ms | |
| step:1003/1750 train_time:96187ms step_avg:95.90ms | |
| step:1004/1750 train_time:96286ms step_avg:95.90ms | |
| step:1005/1750 train_time:96385ms step_avg:95.91ms | |
| step:1006/1750 train_time:96483ms step_avg:95.91ms | |
| step:1007/1750 train_time:96582ms step_avg:95.91ms | |
| step:1008/1750 train_time:96680ms step_avg:95.91ms | |
| step:1009/1750 train_time:96778ms step_avg:95.91ms | |
| step:1010/1750 train_time:96877ms step_avg:95.92ms | |
| step:1011/1750 train_time:96975ms step_avg:95.92ms | |
| step:1012/1750 train_time:97077ms step_avg:95.93ms | |
| step:1013/1750 train_time:97177ms step_avg:95.93ms | |
| step:1014/1750 train_time:97276ms step_avg:95.93ms | |
| step:1015/1750 train_time:97375ms step_avg:95.94ms | |
| step:1016/1750 train_time:97474ms step_avg:95.94ms | |
| step:1017/1750 train_time:97573ms step_avg:95.94ms | |
| step:1018/1750 train_time:97672ms step_avg:95.94ms | |
| step:1019/1750 train_time:97771ms step_avg:95.95ms | |
| step:1020/1750 train_time:97872ms step_avg:95.95ms | |
| step:1021/1750 train_time:97972ms step_avg:95.96ms | |
| step:1022/1750 train_time:98072ms step_avg:95.96ms | |
| step:1023/1750 train_time:98172ms step_avg:95.97ms | |
| step:1024/1750 train_time:98272ms step_avg:95.97ms | |
| step:1025/1750 train_time:98372ms step_avg:95.97ms | |
| step:1026/1750 train_time:98472ms step_avg:95.98ms | |
| step:1027/1750 train_time:98572ms step_avg:95.98ms | |
| step:1028/1750 train_time:98672ms step_avg:95.98ms | |
| step:1029/1750 train_time:98772ms step_avg:95.99ms | |
| step:1030/1750 train_time:98871ms step_avg:95.99ms | |
| step:1031/1750 train_time:98971ms step_avg:95.99ms | |
| step:1032/1750 train_time:99071ms step_avg:96.00ms | |
| step:1033/1750 train_time:99170ms step_avg:96.00ms | |
| step:1034/1750 train_time:99271ms step_avg:96.01ms | |
| step:1035/1750 train_time:99370ms step_avg:96.01ms | |
| step:1036/1750 train_time:99470ms step_avg:96.01ms | |
| step:1037/1750 train_time:99571ms step_avg:96.02ms | |
| step:1038/1750 train_time:99671ms step_avg:96.02ms | |
| step:1039/1750 train_time:99771ms step_avg:96.03ms | |
| step:1040/1750 train_time:99869ms step_avg:96.03ms | |
| step:1041/1750 train_time:99970ms step_avg:96.03ms | |
| step:1042/1750 train_time:100070ms step_avg:96.04ms | |
| step:1043/1750 train_time:100170ms step_avg:96.04ms | |
| step:1044/1750 train_time:100270ms step_avg:96.04ms | |
| step:1045/1750 train_time:100370ms step_avg:96.05ms | |
| step:1046/1750 train_time:100471ms step_avg:96.05ms | |
| step:1047/1750 train_time:100570ms step_avg:96.06ms | |
| step:1048/1750 train_time:100670ms step_avg:96.06ms | |
| step:1049/1750 train_time:100771ms step_avg:96.06ms | |
| step:1050/1750 train_time:100870ms step_avg:96.07ms | |
| step:1051/1750 train_time:100970ms step_avg:96.07ms | |
| step:1052/1750 train_time:101070ms step_avg:96.07ms | |
| step:1053/1750 train_time:101171ms step_avg:96.08ms | |
| step:1054/1750 train_time:101270ms step_avg:96.08ms | |
| step:1055/1750 train_time:101371ms step_avg:96.09ms | |
| step:1056/1750 train_time:101472ms step_avg:96.09ms | |
| step:1057/1750 train_time:101572ms step_avg:96.09ms | |
| step:1058/1750 train_time:101671ms step_avg:96.10ms | |
| step:1059/1750 train_time:101772ms step_avg:96.10ms | |
| step:1060/1750 train_time:101871ms step_avg:96.10ms | |
| step:1061/1750 train_time:101971ms step_avg:96.11ms | |
| step:1062/1750 train_time:102071ms step_avg:96.11ms | |
| step:1063/1750 train_time:102172ms step_avg:96.12ms | |
| step:1064/1750 train_time:102272ms step_avg:96.12ms | |
| step:1065/1750 train_time:102371ms step_avg:96.12ms | |
| step:1066/1750 train_time:102472ms step_avg:96.13ms | |
| step:1067/1750 train_time:102572ms step_avg:96.13ms | |
| step:1068/1750 train_time:102673ms step_avg:96.14ms | |
| step:1069/1750 train_time:102774ms step_avg:96.14ms | |
| step:1070/1750 train_time:102873ms step_avg:96.14ms | |
| step:1071/1750 train_time:102973ms step_avg:96.15ms | |
| step:1072/1750 train_time:103075ms step_avg:96.15ms | |
| step:1073/1750 train_time:103174ms step_avg:96.16ms | |
| step:1074/1750 train_time:103274ms step_avg:96.16ms | |
| step:1075/1750 train_time:103374ms step_avg:96.16ms | |
| step:1076/1750 train_time:103473ms step_avg:96.16ms | |
| step:1077/1750 train_time:103573ms step_avg:96.17ms | |
| step:1078/1750 train_time:103674ms step_avg:96.17ms | |
| step:1079/1750 train_time:103774ms step_avg:96.18ms | |
| step:1080/1750 train_time:103874ms step_avg:96.18ms | |
| step:1081/1750 train_time:103973ms step_avg:96.18ms | |
| step:1082/1750 train_time:104073ms step_avg:96.19ms | |
| step:1083/1750 train_time:104172ms step_avg:96.19ms | |
| step:1084/1750 train_time:104273ms step_avg:96.19ms | |
| step:1085/1750 train_time:104373ms step_avg:96.20ms | |
| step:1086/1750 train_time:104473ms step_avg:96.20ms | |
| step:1087/1750 train_time:104572ms step_avg:96.20ms | |
| step:1088/1750 train_time:104672ms step_avg:96.21ms | |
| step:1089/1750 train_time:104771ms step_avg:96.21ms | |
| step:1090/1750 train_time:104871ms step_avg:96.21ms | |
| step:1091/1750 train_time:104971ms step_avg:96.22ms | |
| step:1092/1750 train_time:105071ms step_avg:96.22ms | |
| step:1093/1750 train_time:105172ms step_avg:96.22ms | |
| step:1094/1750 train_time:105272ms step_avg:96.23ms | |
| step:1095/1750 train_time:105372ms step_avg:96.23ms | |
| step:1096/1750 train_time:105473ms step_avg:96.23ms | |
| step:1097/1750 train_time:105572ms step_avg:96.24ms | |
| step:1098/1750 train_time:105672ms step_avg:96.24ms | |
| step:1099/1750 train_time:105772ms step_avg:96.24ms | |
| step:1100/1750 train_time:105872ms step_avg:96.25ms | |
| step:1101/1750 train_time:105972ms step_avg:96.25ms | |
| step:1102/1750 train_time:106071ms step_avg:96.25ms | |
| step:1103/1750 train_time:106172ms step_avg:96.26ms | |
| step:1104/1750 train_time:106271ms step_avg:96.26ms | |
| step:1105/1750 train_time:106371ms step_avg:96.26ms | |
| step:1106/1750 train_time:106471ms step_avg:96.27ms | |
| step:1107/1750 train_time:106571ms step_avg:96.27ms | |
| step:1108/1750 train_time:106671ms step_avg:96.27ms | |
| step:1109/1750 train_time:106771ms step_avg:96.28ms | |
| step:1110/1750 train_time:106870ms step_avg:96.28ms | |
| step:1111/1750 train_time:106970ms step_avg:96.28ms | |
| step:1112/1750 train_time:107070ms step_avg:96.29ms | |
| step:1113/1750 train_time:107171ms step_avg:96.29ms | |
| step:1114/1750 train_time:107271ms step_avg:96.29ms | |
| step:1115/1750 train_time:107371ms step_avg:96.30ms | |
| step:1116/1750 train_time:107471ms step_avg:96.30ms | |
| step:1117/1750 train_time:107571ms step_avg:96.30ms | |
| step:1118/1750 train_time:107672ms step_avg:96.31ms | |
| step:1119/1750 train_time:107771ms step_avg:96.31ms | |
| step:1120/1750 train_time:107872ms step_avg:96.31ms | |
| step:1121/1750 train_time:107973ms step_avg:96.32ms | |
| step:1122/1750 train_time:108073ms step_avg:96.32ms | |
| step:1123/1750 train_time:108173ms step_avg:96.32ms | |
| step:1124/1750 train_time:108273ms step_avg:96.33ms | |
| step:1125/1750 train_time:108373ms step_avg:96.33ms | |
| step:1125/1750 val_loss:3.4517 train_time:108465ms step_avg:96.41ms | |
| step:1126/1750 train_time:108502ms step_avg:96.36ms | |
| step:1127/1750 train_time:108583ms step_avg:96.35ms | |
| step:1128/1750 train_time:108685ms step_avg:96.35ms | |
| step:1129/1750 train_time:108784ms step_avg:96.35ms | |
| step:1130/1750 train_time:108883ms step_avg:96.36ms | |
| step:1131/1750 train_time:108981ms step_avg:96.36ms | |
| step:1132/1750 train_time:109080ms step_avg:96.36ms | |
| step:1133/1750 train_time:109179ms step_avg:96.36ms | |
| step:1134/1750 train_time:109278ms step_avg:96.36ms | |
| step:1135/1750 train_time:109376ms step_avg:96.37ms | |
| step:1136/1750 train_time:109475ms step_avg:96.37ms | |
| step:1137/1750 train_time:109575ms step_avg:96.37ms | |
| step:1138/1750 train_time:109674ms step_avg:96.37ms | |
| step:1139/1750 train_time:109773ms step_avg:96.38ms | |
| step:1140/1750 train_time:109872ms step_avg:96.38ms | |
| step:1141/1750 train_time:109971ms step_avg:96.38ms | |
| step:1142/1750 train_time:110070ms step_avg:96.38ms | |
| step:1143/1750 train_time:110170ms step_avg:96.39ms | |
| step:1144/1750 train_time:110270ms step_avg:96.39ms | |
| step:1145/1750 train_time:110370ms step_avg:96.39ms | |
| step:1146/1750 train_time:110469ms step_avg:96.40ms | |
| step:1147/1750 train_time:110570ms step_avg:96.40ms | |
| step:1148/1750 train_time:110671ms step_avg:96.40ms | |
| step:1149/1750 train_time:110771ms step_avg:96.41ms | |
| step:1150/1750 train_time:110870ms step_avg:96.41ms | |
| step:1151/1750 train_time:110969ms step_avg:96.41ms | |
| step:1152/1750 train_time:111068ms step_avg:96.41ms | |
| step:1153/1750 train_time:111168ms step_avg:96.42ms | |
| step:1154/1750 train_time:111267ms step_avg:96.42ms | |
| step:1155/1750 train_time:111367ms step_avg:96.42ms | |
| step:1156/1750 train_time:111468ms step_avg:96.43ms | |
| step:1157/1750 train_time:111567ms step_avg:96.43ms | |
| step:1158/1750 train_time:111668ms step_avg:96.43ms | |
| step:1159/1750 train_time:111768ms step_avg:96.44ms | |
| step:1160/1750 train_time:111869ms step_avg:96.44ms | |
| step:1161/1750 train_time:111968ms step_avg:96.44ms | |
| step:1162/1750 train_time:112068ms step_avg:96.44ms | |
| step:1163/1750 train_time:112167ms step_avg:96.45ms | |
| step:1164/1750 train_time:112267ms step_avg:96.45ms | |
| step:1165/1750 train_time:112367ms step_avg:96.45ms | |
| step:1166/1750 train_time:112467ms step_avg:96.46ms | |
| step:1167/1750 train_time:112567ms step_avg:96.46ms | |
| step:1168/1750 train_time:112667ms step_avg:96.46ms | |
| step:1169/1750 train_time:112769ms step_avg:96.47ms | |
| step:1170/1750 train_time:112869ms step_avg:96.47ms | |
| step:1171/1750 train_time:112970ms step_avg:96.47ms | |
| step:1172/1750 train_time:113071ms step_avg:96.48ms | |
| step:1173/1750 train_time:113171ms step_avg:96.48ms | |
| step:1174/1750 train_time:113272ms step_avg:96.48ms | |
| step:1175/1750 train_time:113373ms step_avg:96.49ms | |
| step:1176/1750 train_time:113473ms step_avg:96.49ms | |
| step:1177/1750 train_time:113573ms step_avg:96.49ms | |
| step:1178/1750 train_time:113673ms step_avg:96.50ms | |
| step:1179/1750 train_time:113774ms step_avg:96.50ms | |
| step:1180/1750 train_time:113873ms step_avg:96.50ms | |
| step:1181/1750 train_time:113975ms step_avg:96.51ms | |
| step:1182/1750 train_time:114075ms step_avg:96.51ms | |
| step:1183/1750 train_time:114174ms step_avg:96.51ms | |
| step:1184/1750 train_time:114277ms step_avg:96.52ms | |
| step:1185/1750 train_time:114377ms step_avg:96.52ms | |
| step:1186/1750 train_time:114477ms step_avg:96.52ms | |
| step:1187/1750 train_time:114577ms step_avg:96.53ms | |
| step:1188/1750 train_time:114677ms step_avg:96.53ms | |
| step:1189/1750 train_time:114776ms step_avg:96.53ms | |
| step:1190/1750 train_time:114875ms step_avg:96.53ms | |
| step:1191/1750 train_time:114975ms step_avg:96.54ms | |
| step:1192/1750 train_time:115075ms step_avg:96.54ms | |
| step:1193/1750 train_time:115175ms step_avg:96.54ms | |
| step:1194/1750 train_time:115275ms step_avg:96.55ms | |
| step:1195/1750 train_time:115375ms step_avg:96.55ms | |
| step:1196/1750 train_time:115476ms step_avg:96.55ms | |
| step:1197/1750 train_time:115576ms step_avg:96.55ms | |
| step:1198/1750 train_time:115675ms step_avg:96.56ms | |
| step:1199/1750 train_time:115775ms step_avg:96.56ms | |
| step:1200/1750 train_time:115874ms step_avg:96.56ms | |
| step:1201/1750 train_time:115974ms step_avg:96.56ms | |
| step:1202/1750 train_time:116074ms step_avg:96.57ms | |
| step:1203/1750 train_time:116174ms step_avg:96.57ms | |
| step:1204/1750 train_time:116274ms step_avg:96.57ms | |
| step:1205/1750 train_time:116374ms step_avg:96.58ms | |
| step:1206/1750 train_time:116475ms step_avg:96.58ms | |
| step:1207/1750 train_time:116574ms step_avg:96.58ms | |
| step:1208/1750 train_time:116674ms step_avg:96.58ms | |
| step:1209/1750 train_time:116774ms step_avg:96.59ms | |
| step:1210/1750 train_time:116873ms step_avg:96.59ms | |
| step:1211/1750 train_time:116973ms step_avg:96.59ms | |
| step:1212/1750 train_time:117073ms step_avg:96.60ms | |
| step:1213/1750 train_time:117173ms step_avg:96.60ms | |
| step:1214/1750 train_time:117273ms step_avg:96.60ms | |
| step:1215/1750 train_time:117373ms step_avg:96.60ms | |
| step:1216/1750 train_time:117473ms step_avg:96.61ms | |
| step:1217/1750 train_time:117574ms step_avg:96.61ms | |
| step:1218/1750 train_time:117674ms step_avg:96.61ms | |
| step:1219/1750 train_time:117774ms step_avg:96.62ms | |
| step:1220/1750 train_time:117875ms step_avg:96.62ms | |
| step:1221/1750 train_time:117975ms step_avg:96.62ms | |
| step:1222/1750 train_time:118076ms step_avg:96.62ms | |
| step:1223/1750 train_time:118175ms step_avg:96.63ms | |
| step:1224/1750 train_time:118275ms step_avg:96.63ms | |
| step:1225/1750 train_time:118376ms step_avg:96.63ms | |
| step:1226/1750 train_time:118476ms step_avg:96.64ms | |
| step:1227/1750 train_time:118575ms step_avg:96.64ms | |
| step:1228/1750 train_time:118675ms step_avg:96.64ms | |
| step:1229/1750 train_time:118775ms step_avg:96.64ms | |
| step:1230/1750 train_time:118874ms step_avg:96.65ms | |
| step:1231/1750 train_time:118974ms step_avg:96.65ms | |
| step:1232/1750 train_time:119075ms step_avg:96.65ms | |
| step:1233/1750 train_time:119175ms step_avg:96.65ms | |
| step:1234/1750 train_time:119275ms step_avg:96.66ms | |
| step:1235/1750 train_time:119375ms step_avg:96.66ms | |
| step:1236/1750 train_time:119475ms step_avg:96.66ms | |
| step:1237/1750 train_time:119575ms step_avg:96.67ms | |
| step:1238/1750 train_time:119675ms step_avg:96.67ms | |
| step:1239/1750 train_time:119775ms step_avg:96.67ms | |
| step:1240/1750 train_time:119876ms step_avg:96.67ms | |
| step:1241/1750 train_time:119977ms step_avg:96.68ms | |
| step:1242/1750 train_time:120078ms step_avg:96.68ms | |
| step:1243/1750 train_time:120178ms step_avg:96.68ms | |
| step:1244/1750 train_time:120277ms step_avg:96.69ms | |
| step:1245/1750 train_time:120377ms step_avg:96.69ms | |
| step:1246/1750 train_time:120477ms step_avg:96.69ms | |
| step:1247/1750 train_time:120576ms step_avg:96.69ms | |
| step:1248/1750 train_time:120676ms step_avg:96.70ms | |
| step:1249/1750 train_time:120775ms step_avg:96.70ms | |
| step:1250/1750 train_time:120875ms step_avg:96.70ms | |
| step:1250/1750 val_loss:3.4067 train_time:120968ms step_avg:96.77ms | |
| step:1251/1750 train_time:121005ms step_avg:96.73ms | |
| step:1252/1750 train_time:121090ms step_avg:96.72ms | |
| step:1253/1750 train_time:121191ms step_avg:96.72ms | |
| step:1254/1750 train_time:121292ms step_avg:96.72ms | |
| step:1255/1750 train_time:121392ms step_avg:96.73ms | |
| step:1256/1750 train_time:121493ms step_avg:96.73ms | |
| step:1257/1750 train_time:121593ms step_avg:96.73ms | |
| step:1258/1750 train_time:121693ms step_avg:96.74ms | |
| step:1259/1750 train_time:121793ms step_avg:96.74ms | |
| step:1260/1750 train_time:121892ms step_avg:96.74ms | |
| step:1261/1750 train_time:121996ms step_avg:96.75ms | |
| step:1262/1750 train_time:122098ms step_avg:96.75ms | |
| step:1263/1750 train_time:122200ms step_avg:96.75ms | |
| step:1264/1750 train_time:122299ms step_avg:96.76ms | |
| step:1265/1750 train_time:122399ms step_avg:96.76ms | |
| step:1266/1750 train_time:122497ms step_avg:96.76ms | |
| step:1267/1750 train_time:122597ms step_avg:96.76ms | |
| step:1268/1750 train_time:122695ms step_avg:96.76ms | |
| step:1269/1750 train_time:122795ms step_avg:96.77ms | |
| step:1270/1750 train_time:122895ms step_avg:96.77ms | |
| step:1271/1750 train_time:122996ms step_avg:96.77ms | |
| step:1272/1750 train_time:123097ms step_avg:96.77ms | |
| step:1273/1750 train_time:123198ms step_avg:96.78ms | |
| step:1274/1750 train_time:123299ms step_avg:96.78ms | |
| step:1275/1750 train_time:123399ms step_avg:96.78ms | |
| step:1276/1750 train_time:123499ms step_avg:96.79ms | |
| step:1277/1750 train_time:123599ms step_avg:96.79ms | |
| step:1278/1750 train_time:123698ms step_avg:96.79ms | |
| step:1279/1750 train_time:123798ms step_avg:96.79ms | |
| step:1280/1750 train_time:123897ms step_avg:96.79ms | |
| step:1281/1750 train_time:123997ms step_avg:96.80ms | |
| step:1282/1750 train_time:124097ms step_avg:96.80ms | |
| step:1283/1750 train_time:124197ms step_avg:96.80ms | |
| step:1284/1750 train_time:124298ms step_avg:96.81ms | |
| step:1285/1750 train_time:124398ms step_avg:96.81ms | |
| step:1286/1750 train_time:124499ms step_avg:96.81ms | |
| step:1287/1750 train_time:124598ms step_avg:96.81ms | |
| step:1288/1750 train_time:124698ms step_avg:96.82ms | |
| step:1289/1750 train_time:124798ms step_avg:96.82ms | |
| step:1290/1750 train_time:124898ms step_avg:96.82ms | |
| step:1291/1750 train_time:124998ms step_avg:96.82ms | |
| step:1292/1750 train_time:125098ms step_avg:96.83ms | |
| step:1293/1750 train_time:125198ms step_avg:96.83ms | |
| step:1294/1750 train_time:125298ms step_avg:96.83ms | |
| step:1295/1750 train_time:125399ms step_avg:96.83ms | |
| step:1296/1750 train_time:125499ms step_avg:96.84ms | |
| step:1297/1750 train_time:125598ms step_avg:96.84ms | |
| step:1298/1750 train_time:125697ms step_avg:96.84ms | |
| step:1299/1750 train_time:125798ms step_avg:96.84ms | |
| step:1300/1750 train_time:125898ms step_avg:96.84ms | |
| step:1301/1750 train_time:125997ms step_avg:96.85ms | |
| step:1302/1750 train_time:126098ms step_avg:96.85ms | |
| step:1303/1750 train_time:126198ms step_avg:96.85ms | |
| step:1304/1750 train_time:126298ms step_avg:96.85ms | |
| step:1305/1750 train_time:126399ms step_avg:96.86ms | |
| step:1306/1750 train_time:126498ms step_avg:96.86ms | |
| step:1307/1750 train_time:126598ms step_avg:96.86ms | |
| step:1308/1750 train_time:126697ms step_avg:96.86ms | |
| step:1309/1750 train_time:126798ms step_avg:96.87ms | |
| step:1310/1750 train_time:126899ms step_avg:96.87ms | |
| step:1311/1750 train_time:126999ms step_avg:96.87ms | |
| step:1312/1750 train_time:127099ms step_avg:96.87ms | |
| step:1313/1750 train_time:127200ms step_avg:96.88ms | |
| step:1314/1750 train_time:127301ms step_avg:96.88ms | |
| step:1315/1750 train_time:127401ms step_avg:96.88ms | |
| step:1316/1750 train_time:127500ms step_avg:96.88ms | |
| step:1317/1750 train_time:127600ms step_avg:96.89ms | |
| step:1318/1750 train_time:127700ms step_avg:96.89ms | |
| step:1319/1750 train_time:127799ms step_avg:96.89ms | |
| step:1320/1750 train_time:127900ms step_avg:96.89ms | |
| step:1321/1750 train_time:128001ms step_avg:96.90ms | |
| step:1322/1750 train_time:128101ms step_avg:96.90ms | |
| step:1323/1750 train_time:128201ms step_avg:96.90ms | |
| step:1324/1750 train_time:128301ms step_avg:96.90ms | |
| step:1325/1750 train_time:128403ms step_avg:96.91ms | |
| step:1326/1750 train_time:128502ms step_avg:96.91ms | |
| step:1327/1750 train_time:128602ms step_avg:96.91ms | |
| step:1328/1750 train_time:128702ms step_avg:96.91ms | |
| step:1329/1750 train_time:128803ms step_avg:96.92ms | |
| step:1330/1750 train_time:128903ms step_avg:96.92ms | |
| step:1331/1750 train_time:129004ms step_avg:96.92ms | |
| step:1332/1750 train_time:129105ms step_avg:96.93ms | |
| step:1333/1750 train_time:129207ms step_avg:96.93ms | |
| step:1334/1750 train_time:129308ms step_avg:96.93ms | |
| step:1335/1750 train_time:129409ms step_avg:96.94ms | |
| step:1336/1750 train_time:129511ms step_avg:96.94ms | |
| step:1337/1750 train_time:129612ms step_avg:96.94ms | |
| step:1338/1750 train_time:129713ms step_avg:96.95ms | |
| step:1339/1750 train_time:129814ms step_avg:96.95ms | |
| step:1340/1750 train_time:129914ms step_avg:96.95ms | |
| step:1341/1750 train_time:130015ms step_avg:96.95ms | |
| step:1342/1750 train_time:130116ms step_avg:96.96ms | |
| step:1343/1750 train_time:130217ms step_avg:96.96ms | |
| step:1344/1750 train_time:130318ms step_avg:96.96ms | |
| step:1345/1750 train_time:130418ms step_avg:96.97ms | |
| step:1346/1750 train_time:130519ms step_avg:96.97ms | |
| step:1347/1750 train_time:130619ms step_avg:96.97ms | |
| step:1348/1750 train_time:130718ms step_avg:96.97ms | |
| step:1349/1750 train_time:130818ms step_avg:96.97ms | |
| step:1350/1750 train_time:130918ms step_avg:96.98ms | |
| step:1351/1750 train_time:131018ms step_avg:96.98ms | |
| step:1352/1750 train_time:131118ms step_avg:96.98ms | |
| step:1353/1750 train_time:131218ms step_avg:96.98ms | |
| step:1354/1750 train_time:131319ms step_avg:96.99ms | |
| step:1355/1750 train_time:131419ms step_avg:96.99ms | |
| step:1356/1750 train_time:131519ms step_avg:96.99ms | |
| step:1357/1750 train_time:131619ms step_avg:96.99ms | |
| step:1358/1750 train_time:131719ms step_avg:96.99ms | |
| step:1359/1750 train_time:131818ms step_avg:97.00ms | |
| step:1360/1750 train_time:131919ms step_avg:97.00ms | |
| step:1361/1750 train_time:132019ms step_avg:97.00ms | |
| step:1362/1750 train_time:132118ms step_avg:97.00ms | |
| step:1363/1750 train_time:132218ms step_avg:97.01ms | |
| step:1364/1750 train_time:132318ms step_avg:97.01ms | |
| step:1365/1750 train_time:132419ms step_avg:97.01ms | |
| step:1366/1750 train_time:132519ms step_avg:97.01ms | |
| step:1367/1750 train_time:132619ms step_avg:97.01ms | |
| step:1368/1750 train_time:132719ms step_avg:97.02ms | |
| step:1369/1750 train_time:132819ms step_avg:97.02ms | |
| step:1370/1750 train_time:132919ms step_avg:97.02ms | |
| step:1371/1750 train_time:133019ms step_avg:97.02ms | |
| step:1372/1750 train_time:133119ms step_avg:97.03ms | |
| step:1373/1750 train_time:133219ms step_avg:97.03ms | |
| step:1374/1750 train_time:133318ms step_avg:97.03ms | |
| step:1375/1750 train_time:133418ms step_avg:97.03ms | |
| step:1375/1750 val_loss:3.3669 train_time:133511ms step_avg:97.10ms | |
| step:1376/1750 train_time:133547ms step_avg:97.05ms | |
| step:1377/1750 train_time:133631ms step_avg:97.04ms | |
| step:1378/1750 train_time:133733ms step_avg:97.05ms | |
| step:1379/1750 train_time:133833ms step_avg:97.05ms | |
| step:1380/1750 train_time:133934ms step_avg:97.05ms | |
| step:1381/1750 train_time:134034ms step_avg:97.06ms | |
| step:1382/1750 train_time:134135ms step_avg:97.06ms | |
| step:1383/1750 train_time:134234ms step_avg:97.06ms | |
| step:1384/1750 train_time:134335ms step_avg:97.06ms | |
| step:1385/1750 train_time:134436ms step_avg:97.07ms | |
| step:1386/1750 train_time:134539ms step_avg:97.07ms | |
| step:1387/1750 train_time:134639ms step_avg:97.07ms | |
| step:1388/1750 train_time:134739ms step_avg:97.07ms | |
| step:1389/1750 train_time:134839ms step_avg:97.08ms | |
| step:1390/1750 train_time:134938ms step_avg:97.08ms | |
| step:1391/1750 train_time:135038ms step_avg:97.08ms | |
| step:1392/1750 train_time:135138ms step_avg:97.08ms | |
| step:1393/1750 train_time:135238ms step_avg:97.08ms | |
| step:1394/1750 train_time:135337ms step_avg:97.09ms | |
| step:1395/1750 train_time:135438ms step_avg:97.09ms | |
| step:1396/1750 train_time:135538ms step_avg:97.09ms | |
| step:1397/1750 train_time:135639ms step_avg:97.09ms | |
| step:1398/1750 train_time:135739ms step_avg:97.10ms | |
| step:1399/1750 train_time:135839ms step_avg:97.10ms | |
| step:1400/1750 train_time:135940ms step_avg:97.10ms | |
| step:1401/1750 train_time:136040ms step_avg:97.10ms | |
| step:1402/1750 train_time:136140ms step_avg:97.10ms | |
| step:1403/1750 train_time:136239ms step_avg:97.11ms | |
| step:1404/1750 train_time:136340ms step_avg:97.11ms | |
| step:1405/1750 train_time:136440ms step_avg:97.11ms | |
| step:1406/1750 train_time:136540ms step_avg:97.11ms | |
| step:1407/1750 train_time:136641ms step_avg:97.11ms | |
| step:1408/1750 train_time:136741ms step_avg:97.12ms | |
| step:1409/1750 train_time:136842ms step_avg:97.12ms | |
| step:1410/1750 train_time:136942ms step_avg:97.12ms | |
| step:1411/1750 train_time:137043ms step_avg:97.12ms | |
| step:1412/1750 train_time:137143ms step_avg:97.13ms | |
| step:1413/1750 train_time:137245ms step_avg:97.13ms | |
| step:1414/1750 train_time:137347ms step_avg:97.13ms | |
| step:1415/1750 train_time:137450ms step_avg:97.14ms | |
| step:1416/1750 train_time:137551ms step_avg:97.14ms | |
| step:1417/1750 train_time:137651ms step_avg:97.14ms | |
| step:1418/1750 train_time:137752ms step_avg:97.15ms | |
| step:1419/1750 train_time:137853ms step_avg:97.15ms | |
| step:1420/1750 train_time:137955ms step_avg:97.15ms | |
| step:1421/1750 train_time:138056ms step_avg:97.15ms | |
| step:1422/1750 train_time:138156ms step_avg:97.16ms | |
| step:1423/1750 train_time:138256ms step_avg:97.16ms | |
| step:1424/1750 train_time:138356ms step_avg:97.16ms | |
| step:1425/1750 train_time:138457ms step_avg:97.16ms | |
| step:1426/1750 train_time:138557ms step_avg:97.16ms | |
| step:1427/1750 train_time:138656ms step_avg:97.17ms | |
| step:1428/1750 train_time:138759ms step_avg:97.17ms | |
| step:1429/1750 train_time:138859ms step_avg:97.17ms | |
| step:1430/1750 train_time:138961ms step_avg:97.18ms | |
| step:1431/1750 train_time:139062ms step_avg:97.18ms | |
| step:1432/1750 train_time:139163ms step_avg:97.18ms | |
| step:1433/1750 train_time:139265ms step_avg:97.18ms | |
| step:1434/1750 train_time:139366ms step_avg:97.19ms | |
| step:1435/1750 train_time:139470ms step_avg:97.19ms | |
| step:1436/1750 train_time:139573ms step_avg:97.20ms | |
| step:1437/1750 train_time:139675ms step_avg:97.20ms | |
| step:1438/1750 train_time:139777ms step_avg:97.20ms | |
| step:1439/1750 train_time:139878ms step_avg:97.20ms | |
| step:1440/1750 train_time:139982ms step_avg:97.21ms | |
| step:1441/1750 train_time:140085ms step_avg:97.21ms | |
| step:1442/1750 train_time:140185ms step_avg:97.22ms | |
| step:1443/1750 train_time:140287ms step_avg:97.22ms | |
| step:1444/1750 train_time:140389ms step_avg:97.22ms | |
| step:1445/1750 train_time:140491ms step_avg:97.23ms | |
| step:1446/1750 train_time:140594ms step_avg:97.23ms | |
| step:1447/1750 train_time:140697ms step_avg:97.23ms | |
| step:1448/1750 train_time:140799ms step_avg:97.24ms | |
| step:1449/1750 train_time:140899ms step_avg:97.24ms | |
| step:1450/1750 train_time:141000ms step_avg:97.24ms | |
| step:1451/1750 train_time:141101ms step_avg:97.24ms | |
| step:1452/1750 train_time:141203ms step_avg:97.25ms | |
| step:1453/1750 train_time:141306ms step_avg:97.25ms | |
| step:1454/1750 train_time:141409ms step_avg:97.26ms | |
| step:1455/1750 train_time:141511ms step_avg:97.26ms | |
| step:1456/1750 train_time:141612ms step_avg:97.26ms | |
| step:1457/1750 train_time:141715ms step_avg:97.26ms | |
| step:1458/1750 train_time:141816ms step_avg:97.27ms | |
| step:1459/1750 train_time:141917ms step_avg:97.27ms | |
| step:1460/1750 train_time:142019ms step_avg:97.27ms | |
| step:1461/1750 train_time:142120ms step_avg:97.28ms | |
| step:1462/1750 train_time:142221ms step_avg:97.28ms | |
| step:1463/1750 train_time:142322ms step_avg:97.28ms | |
| step:1464/1750 train_time:142423ms step_avg:97.28ms | |
| step:1465/1750 train_time:142524ms step_avg:97.29ms | |
| step:1466/1750 train_time:142625ms step_avg:97.29ms | |
| step:1467/1750 train_time:142727ms step_avg:97.29ms | |
| step:1468/1750 train_time:142830ms step_avg:97.30ms | |
| step:1469/1750 train_time:142934ms step_avg:97.30ms | |
| step:1470/1750 train_time:143036ms step_avg:97.30ms | |
| step:1471/1750 train_time:143138ms step_avg:97.31ms | |
| step:1472/1750 train_time:143240ms step_avg:97.31ms | |
| step:1473/1750 train_time:143342ms step_avg:97.31ms | |
| step:1474/1750 train_time:143444ms step_avg:97.32ms | |
| step:1475/1750 train_time:143545ms step_avg:97.32ms | |
| step:1476/1750 train_time:143647ms step_avg:97.32ms | |
| step:1477/1750 train_time:143749ms step_avg:97.32ms | |
| step:1478/1750 train_time:143854ms step_avg:97.33ms | |
| step:1479/1750 train_time:143954ms step_avg:97.33ms | |
| step:1480/1750 train_time:144057ms step_avg:97.34ms | |
| step:1481/1750 train_time:144158ms step_avg:97.34ms | |
| step:1482/1750 train_time:144262ms step_avg:97.34ms | |
| step:1483/1750 train_time:144363ms step_avg:97.35ms | |
| step:1484/1750 train_time:144464ms step_avg:97.35ms | |
| step:1485/1750 train_time:144567ms step_avg:97.35ms | |
| step:1486/1750 train_time:144669ms step_avg:97.35ms | |
| step:1487/1750 train_time:144771ms step_avg:97.36ms | |
| step:1488/1750 train_time:144873ms step_avg:97.36ms | |
| step:1489/1750 train_time:144974ms step_avg:97.36ms | |
| step:1490/1750 train_time:145076ms step_avg:97.37ms | |
| step:1491/1750 train_time:145177ms step_avg:97.37ms | |
| step:1492/1750 train_time:145278ms step_avg:97.37ms | |
| step:1493/1750 train_time:145380ms step_avg:97.37ms | |
| step:1494/1750 train_time:145482ms step_avg:97.38ms | |
| step:1495/1750 train_time:145583ms step_avg:97.38ms | |
| step:1496/1750 train_time:145684ms step_avg:97.38ms | |
| step:1497/1750 train_time:145785ms step_avg:97.38ms | |
| step:1498/1750 train_time:145886ms step_avg:97.39ms | |
| step:1499/1750 train_time:145988ms step_avg:97.39ms | |
| step:1500/1750 train_time:146093ms step_avg:97.40ms | |
| step:1500/1750 val_loss:3.3311 train_time:146186ms step_avg:97.46ms | |
| step:1501/1750 train_time:146223ms step_avg:97.42ms | |
| step:1502/1750 train_time:146306ms step_avg:97.41ms | |
| step:1503/1750 train_time:146409ms step_avg:97.41ms | |
| step:1504/1750 train_time:146511ms step_avg:97.41ms | |
| step:1505/1750 train_time:146612ms step_avg:97.42ms | |
| step:1506/1750 train_time:146713ms step_avg:97.42ms | |
| step:1507/1750 train_time:146814ms step_avg:97.42ms | |
| step:1508/1750 train_time:146914ms step_avg:97.42ms | |
| step:1509/1750 train_time:147014ms step_avg:97.43ms | |
| step:1510/1750 train_time:147114ms step_avg:97.43ms | |
| step:1511/1750 train_time:147216ms step_avg:97.43ms | |
| step:1512/1750 train_time:147318ms step_avg:97.43ms | |
| step:1513/1750 train_time:147421ms step_avg:97.44ms | |
| step:1514/1750 train_time:147527ms step_avg:97.44ms | |
| step:1515/1750 train_time:147632ms step_avg:97.45ms | |
| step:1516/1750 train_time:147732ms step_avg:97.45ms | |
| step:1517/1750 train_time:147833ms step_avg:97.45ms | |
| step:1518/1750 train_time:147934ms step_avg:97.45ms | |
| step:1519/1750 train_time:148037ms step_avg:97.46ms | |
| step:1520/1750 train_time:148138ms step_avg:97.46ms | |
| step:1521/1750 train_time:148239ms step_avg:97.46ms | |
| step:1522/1750 train_time:148340ms step_avg:97.46ms | |
| step:1523/1750 train_time:148442ms step_avg:97.47ms | |
| step:1524/1750 train_time:148546ms step_avg:97.47ms | |
| step:1525/1750 train_time:148649ms step_avg:97.47ms | |
| step:1526/1750 train_time:148751ms step_avg:97.48ms | |
| step:1527/1750 train_time:148852ms step_avg:97.48ms | |
| step:1528/1750 train_time:148956ms step_avg:97.48ms | |
| step:1529/1750 train_time:149057ms step_avg:97.49ms | |
| step:1530/1750 train_time:149158ms step_avg:97.49ms | |
| step:1531/1750 train_time:149260ms step_avg:97.49ms | |
| step:1532/1750 train_time:149361ms step_avg:97.49ms | |
| step:1533/1750 train_time:149462ms step_avg:97.50ms | |
| step:1534/1750 train_time:149565ms step_avg:97.50ms | |
| step:1535/1750 train_time:149669ms step_avg:97.50ms | |
| step:1536/1750 train_time:149770ms step_avg:97.51ms | |
| step:1537/1750 train_time:149872ms step_avg:97.51ms | |
| step:1538/1750 train_time:149974ms step_avg:97.51ms | |
| step:1539/1750 train_time:150075ms step_avg:97.51ms | |
| step:1540/1750 train_time:150177ms step_avg:97.52ms | |
| step:1541/1750 train_time:150280ms step_avg:97.52ms | |
| step:1542/1750 train_time:150384ms step_avg:97.53ms | |
| step:1543/1750 train_time:150486ms step_avg:97.53ms | |
| step:1544/1750 train_time:150589ms step_avg:97.53ms | |
| step:1545/1750 train_time:150691ms step_avg:97.53ms | |
| step:1546/1750 train_time:150794ms step_avg:97.54ms | |
| step:1547/1750 train_time:150894ms step_avg:97.54ms | |
| step:1548/1750 train_time:150996ms step_avg:97.54ms | |
| step:1549/1750 train_time:151098ms step_avg:97.55ms | |
| step:1550/1750 train_time:151199ms step_avg:97.55ms | |
| step:1551/1750 train_time:151302ms step_avg:97.55ms | |
| step:1552/1750 train_time:151403ms step_avg:97.55ms | |
| step:1553/1750 train_time:151505ms step_avg:97.56ms | |
| step:1554/1750 train_time:151608ms step_avg:97.56ms | |
| step:1555/1750 train_time:151713ms step_avg:97.56ms | |
| step:1556/1750 train_time:151812ms step_avg:97.57ms | |
| step:1557/1750 train_time:151914ms step_avg:97.57ms | |
| step:1558/1750 train_time:152017ms step_avg:97.57ms | |
| step:1559/1750 train_time:152117ms step_avg:97.57ms | |
| step:1560/1750 train_time:152218ms step_avg:97.58ms | |
| step:1561/1750 train_time:152320ms step_avg:97.58ms | |
| step:1562/1750 train_time:152422ms step_avg:97.58ms | |
| step:1563/1750 train_time:152527ms step_avg:97.59ms | |
| step:1564/1750 train_time:152630ms step_avg:97.59ms | |
| step:1565/1750 train_time:152731ms step_avg:97.59ms | |
| step:1566/1750 train_time:152832ms step_avg:97.59ms | |
| step:1567/1750 train_time:152933ms step_avg:97.60ms | |
| step:1568/1750 train_time:153033ms step_avg:97.60ms | |
| step:1569/1750 train_time:153134ms step_avg:97.60ms | |
| step:1570/1750 train_time:153235ms step_avg:97.60ms | |
| step:1571/1750 train_time:153336ms step_avg:97.60ms | |
| step:1572/1750 train_time:153437ms step_avg:97.61ms | |
| step:1573/1750 train_time:153539ms step_avg:97.61ms | |
| step:1574/1750 train_time:153642ms step_avg:97.61ms | |
| step:1575/1750 train_time:153746ms step_avg:97.62ms | |
| step:1576/1750 train_time:153848ms step_avg:97.62ms | |
| step:1577/1750 train_time:153951ms step_avg:97.62ms | |
| step:1578/1750 train_time:154052ms step_avg:97.62ms | |
| step:1579/1750 train_time:154153ms step_avg:97.63ms | |
| step:1580/1750 train_time:154253ms step_avg:97.63ms | |
| step:1581/1750 train_time:154355ms step_avg:97.63ms | |
| step:1582/1750 train_time:154456ms step_avg:97.63ms | |
| step:1583/1750 train_time:154559ms step_avg:97.64ms | |
| step:1584/1750 train_time:154664ms step_avg:97.64ms | |
| step:1585/1750 train_time:154765ms step_avg:97.64ms | |
| step:1586/1750 train_time:154868ms step_avg:97.65ms | |
| step:1587/1750 train_time:154970ms step_avg:97.65ms | |
| step:1588/1750 train_time:155071ms step_avg:97.65ms | |
| step:1589/1750 train_time:155172ms step_avg:97.65ms | |
| step:1590/1750 train_time:155273ms step_avg:97.66ms | |
| step:1591/1750 train_time:155374ms step_avg:97.66ms | |
| step:1592/1750 train_time:155475ms step_avg:97.66ms | |
| step:1593/1750 train_time:155575ms step_avg:97.66ms | |
| step:1594/1750 train_time:155680ms step_avg:97.67ms | |
| step:1595/1750 train_time:155782ms step_avg:97.67ms | |
| step:1596/1750 train_time:155885ms step_avg:97.67ms | |
| step:1597/1750 train_time:155987ms step_avg:97.68ms | |
| step:1598/1750 train_time:156090ms step_avg:97.68ms | |
| step:1599/1750 train_time:156191ms step_avg:97.68ms | |
| step:1600/1750 train_time:156293ms step_avg:97.68ms | |
| step:1601/1750 train_time:156393ms step_avg:97.68ms | |
| step:1602/1750 train_time:156493ms step_avg:97.69ms | |
| step:1603/1750 train_time:156595ms step_avg:97.69ms | |
| step:1604/1750 train_time:156696ms step_avg:97.69ms | |
| step:1605/1750 train_time:156798ms step_avg:97.69ms | |
| step:1606/1750 train_time:156900ms step_avg:97.70ms | |
| step:1607/1750 train_time:157003ms step_avg:97.70ms | |
| step:1608/1750 train_time:157105ms step_avg:97.70ms | |
| step:1609/1750 train_time:157207ms step_avg:97.70ms | |
| step:1610/1750 train_time:157310ms step_avg:97.71ms | |
| step:1611/1750 train_time:157412ms step_avg:97.71ms | |
| step:1612/1750 train_time:157513ms step_avg:97.71ms | |
| step:1613/1750 train_time:157614ms step_avg:97.71ms | |
| step:1614/1750 train_time:157714ms step_avg:97.72ms | |
| step:1615/1750 train_time:157815ms step_avg:97.72ms | |
| step:1616/1750 train_time:157916ms step_avg:97.72ms | |
| step:1617/1750 train_time:158017ms step_avg:97.72ms | |
| step:1618/1750 train_time:158121ms step_avg:97.73ms | |
| step:1619/1750 train_time:158224ms step_avg:97.73ms | |
| step:1620/1750 train_time:158327ms step_avg:97.73ms | |
| step:1621/1750 train_time:158429ms step_avg:97.74ms | |
| step:1622/1750 train_time:158530ms step_avg:97.74ms | |
| step:1623/1750 train_time:158632ms step_avg:97.74ms | |
| step:1624/1750 train_time:158735ms step_avg:97.74ms | |
| step:1625/1750 train_time:158838ms step_avg:97.75ms | |
| step:1625/1750 val_loss:3.3005 train_time:158931ms step_avg:97.80ms | |
| step:1626/1750 train_time:158968ms step_avg:97.77ms | |
| step:1627/1750 train_time:159051ms step_avg:97.76ms | |
| step:1628/1750 train_time:159153ms step_avg:97.76ms | |
| step:1629/1750 train_time:159255ms step_avg:97.76ms | |
| step:1630/1750 train_time:159357ms step_avg:97.77ms | |
| step:1631/1750 train_time:159458ms step_avg:97.77ms | |
| step:1632/1750 train_time:159558ms step_avg:97.77ms | |
| step:1633/1750 train_time:159660ms step_avg:97.77ms | |
| step:1634/1750 train_time:159761ms step_avg:97.77ms | |
| step:1635/1750 train_time:159861ms step_avg:97.77ms | |
| step:1636/1750 train_time:159964ms step_avg:97.78ms | |
| step:1637/1750 train_time:160066ms step_avg:97.78ms | |
| step:1638/1750 train_time:160167ms step_avg:97.78ms | |
| step:1639/1750 train_time:160270ms step_avg:97.79ms | |
| step:1640/1750 train_time:160372ms step_avg:97.79ms | |
| step:1641/1750 train_time:160473ms step_avg:97.79ms | |
| step:1642/1750 train_time:160575ms step_avg:97.79ms | |
| step:1643/1750 train_time:160677ms step_avg:97.80ms | |
| step:1644/1750 train_time:160778ms step_avg:97.80ms | |
| step:1645/1750 train_time:160881ms step_avg:97.80ms | |
| step:1646/1750 train_time:160981ms step_avg:97.80ms | |
| step:1647/1750 train_time:161083ms step_avg:97.80ms | |
| step:1648/1750 train_time:161186ms step_avg:97.81ms | |
| step:1649/1750 train_time:161286ms step_avg:97.81ms | |
| step:1650/1750 train_time:161389ms step_avg:97.81ms | |
| step:1651/1750 train_time:161490ms step_avg:97.81ms | |
| step:1652/1750 train_time:161592ms step_avg:97.82ms | |
| step:1653/1750 train_time:161695ms step_avg:97.82ms | |
| step:1654/1750 train_time:161797ms step_avg:97.82ms | |
| step:1655/1750 train_time:161898ms step_avg:97.82ms | |
| step:1656/1750 train_time:162000ms step_avg:97.83ms | |
| step:1657/1750 train_time:162101ms step_avg:97.83ms | |
| step:1658/1750 train_time:162202ms step_avg:97.83ms | |
| step:1659/1750 train_time:162307ms step_avg:97.83ms | |
| step:1660/1750 train_time:162409ms step_avg:97.84ms | |
| step:1661/1750 train_time:162513ms step_avg:97.84ms | |
| step:1662/1750 train_time:162617ms step_avg:97.84ms | |
| step:1663/1750 train_time:162719ms step_avg:97.85ms | |
| step:1664/1750 train_time:162820ms step_avg:97.85ms | |
| step:1665/1750 train_time:162924ms step_avg:97.85ms | |
| step:1666/1750 train_time:163026ms step_avg:97.85ms | |
| step:1667/1750 train_time:163127ms step_avg:97.86ms | |
| step:1668/1750 train_time:163229ms step_avg:97.86ms | |
| step:1669/1750 train_time:163331ms step_avg:97.86ms | |
| step:1670/1750 train_time:163434ms step_avg:97.86ms | |
| step:1671/1750 train_time:163535ms step_avg:97.87ms | |
| step:1672/1750 train_time:163638ms step_avg:97.87ms | |
| step:1673/1750 train_time:163739ms step_avg:97.87ms | |
| step:1674/1750 train_time:163840ms step_avg:97.87ms | |
| step:1675/1750 train_time:163941ms step_avg:97.88ms | |
| step:1676/1750 train_time:164042ms step_avg:97.88ms | |
| step:1677/1750 train_time:164143ms step_avg:97.88ms | |
| step:1678/1750 train_time:164245ms step_avg:97.88ms | |
| step:1679/1750 train_time:164348ms step_avg:97.88ms | |
| step:1680/1750 train_time:164450ms step_avg:97.89ms | |
| step:1681/1750 train_time:164552ms step_avg:97.89ms | |
| step:1682/1750 train_time:164656ms step_avg:97.89ms | |
| step:1683/1750 train_time:164756ms step_avg:97.89ms | |
| step:1684/1750 train_time:164858ms step_avg:97.90ms | |
| step:1685/1750 train_time:164960ms step_avg:97.90ms | |
| step:1686/1750 train_time:165062ms step_avg:97.90ms | |
| step:1687/1750 train_time:165163ms step_avg:97.90ms | |
| step:1688/1750 train_time:165265ms step_avg:97.91ms | |
| step:1689/1750 train_time:165368ms step_avg:97.91ms | |
| step:1690/1750 train_time:165470ms step_avg:97.91ms | |
| step:1691/1750 train_time:165573ms step_avg:97.91ms | |
| step:1692/1750 train_time:165675ms step_avg:97.92ms | |
| step:1693/1750 train_time:165777ms step_avg:97.92ms | |
| step:1694/1750 train_time:165882ms step_avg:97.92ms | |
| step:1695/1750 train_time:165985ms step_avg:97.93ms | |
| step:1696/1750 train_time:166088ms step_avg:97.93ms | |
| step:1697/1750 train_time:166193ms step_avg:97.93ms | |
| step:1698/1750 train_time:166295ms step_avg:97.94ms | |
| step:1699/1750 train_time:166397ms step_avg:97.94ms | |
| step:1700/1750 train_time:166499ms step_avg:97.94ms | |
| step:1701/1750 train_time:166600ms step_avg:97.94ms | |
| step:1702/1750 train_time:166704ms step_avg:97.95ms | |
| step:1703/1750 train_time:166806ms step_avg:97.95ms | |
| step:1704/1750 train_time:166908ms step_avg:97.95ms | |
| step:1705/1750 train_time:167010ms step_avg:97.95ms | |
| step:1706/1750 train_time:167112ms step_avg:97.96ms | |
| step:1707/1750 train_time:167216ms step_avg:97.96ms | |
| step:1708/1750 train_time:167320ms step_avg:97.96ms | |
| step:1709/1750 train_time:167422ms step_avg:97.96ms | |
| step:1710/1750 train_time:167525ms step_avg:97.97ms | |
| step:1711/1750 train_time:167630ms step_avg:97.97ms | |
| step:1712/1750 train_time:167733ms step_avg:97.97ms | |
| step:1713/1750 train_time:167838ms step_avg:97.98ms | |
| step:1714/1750 train_time:167939ms step_avg:97.98ms | |
| step:1715/1750 train_time:168044ms step_avg:97.98ms | |
| step:1716/1750 train_time:168146ms step_avg:97.99ms | |
| step:1717/1750 train_time:168250ms step_avg:97.99ms | |
| step:1718/1750 train_time:168355ms step_avg:97.99ms | |
| step:1719/1750 train_time:168458ms step_avg:98.00ms | |
| step:1720/1750 train_time:168560ms step_avg:98.00ms | |
| step:1721/1750 train_time:168662ms step_avg:98.00ms | |
| step:1722/1750 train_time:168764ms step_avg:98.00ms | |
| step:1723/1750 train_time:168866ms step_avg:98.01ms | |
| step:1724/1750 train_time:168970ms step_avg:98.01ms | |
| step:1725/1750 train_time:169075ms step_avg:98.01ms | |
| step:1726/1750 train_time:169178ms step_avg:98.02ms | |
| step:1727/1750 train_time:169280ms step_avg:98.02ms | |
| step:1728/1750 train_time:169384ms step_avg:98.02ms | |
| step:1729/1750 train_time:169485ms step_avg:98.02ms | |
| step:1730/1750 train_time:169587ms step_avg:98.03ms | |
| step:1731/1750 train_time:169692ms step_avg:98.03ms | |
| step:1732/1750 train_time:169795ms step_avg:98.03ms | |
| step:1733/1750 train_time:169897ms step_avg:98.04ms | |
| step:1734/1750 train_time:170000ms step_avg:98.04ms | |
| step:1735/1750 train_time:170102ms step_avg:98.04ms | |
| step:1736/1750 train_time:170205ms step_avg:98.04ms | |
| step:1737/1750 train_time:170308ms step_avg:98.05ms | |
| step:1738/1750 train_time:170411ms step_avg:98.05ms | |
| step:1739/1750 train_time:170513ms step_avg:98.05ms | |
| step:1740/1750 train_time:170616ms step_avg:98.05ms | |
| step:1741/1750 train_time:170723ms step_avg:98.06ms | |
| step:1742/1750 train_time:170826ms step_avg:98.06ms | |
| step:1743/1750 train_time:170929ms step_avg:98.07ms | |
| step:1744/1750 train_time:171032ms step_avg:98.07ms | |
| step:1745/1750 train_time:171134ms step_avg:98.07ms | |
| step:1746/1750 train_time:171237ms step_avg:98.07ms | |
| step:1747/1750 train_time:171339ms step_avg:98.08ms | |
| step:1748/1750 train_time:171442ms step_avg:98.08ms | |
| step:1749/1750 train_time:171544ms step_avg:98.08ms | |
| step:1750/1750 train_time:171648ms step_avg:98.08ms | |
| step:1750/1750 val_loss:3.2771 train_time:171743ms step_avg:98.14ms | |
| peak memory allocated: 33278 MiB reserved: 48994 MiB | |