| ==================================================================================================== | |
| import os | |
| import sys | |
| with open(sys.argv[0]) as f: | |
| code = f.read() # read the code of this file ASAP, for logging | |
| import uuid | |
| import glob | |
| import time | |
| from dataclasses import dataclass | |
| import numpy as np | |
| import torch | |
| from torch import nn | |
| import torch.nn.functional as F | |
| import torch.distributed as dist | |
| import torch._inductor.config as config | |
| from torch.nn.parallel import DistributedDataParallel as DDP | |
| # Use of FlexAttention contributed by @KoszarskyB | |
| from torch.nn.attention.flex_attention import flex_attention, create_block_mask | |
| flex_attention = torch.compile(flex_attention, dynamic=False) | |
| create_block_mask = torch.compile(create_block_mask, dynamic=False) | |
| # ----------------------------------------------------------------------------- | |
| # Muon optimizer | |
def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly through its singular value decomposition.

    With the reduced SVD G = U S V^T this returns U V^T, i.e. G with every singular value
    replaced by one.

    Arguments:
        G: The matrix to orthogonalize.
        steps: Ignored. Accepted only so that every backend can be called the same way.
    """
    # torch.linalg.svd supersedes the deprecated Tensor.svd(). It hands back V^H directly, so no
    # transpose is needed; full_matrices=False keeps the reduced factors Tensor.svd() produced.
    U, _, Vh = torch.linalg.svd(G, full_matrices=False)
    return U @ Vh
@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
    """
    Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
    quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
    of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
    zero even beyond the point where the iteration no longer converges all the way to one everywhere
    on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
    where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
    performance at all relative to UV^T, where USV^T = G is the SVD.

    Arguments:
        G: The 2D matrix to orthogonalize. It is never modified.
        steps: Number of Newton-Schulz iterations to run.
        eps: Added to the norm before dividing, so an all-zero G does not divide by zero.

    Returns a bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    X = G.bfloat16()
    # Deliberately out of place: G.bfloat16() returns G itself when G is already bfloat16, so an
    # in-place `X /= ...` would silently rescale the caller's tensor.
    X = X / (X.norm() + eps) # ensure top singular value <= 1
    # Iterate on the wide orientation so that X @ X.T is the smaller of the two Gram matrices.
    is_tall = G.size(0) > G.size(1)
    if is_tall:
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + B @ X
    if is_tall:
        X = X.T
    return X
# Maps the `backend` name accepted by Muon to the function that orthogonalizes an update matrix.
zeropower_backends = dict(svd=zeropower_via_svd, newtonschulz5=zeropower_via_newtonschulz5)

class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, backend=backend, backend_steps=backend_steps)
        super().__init__(params, defaults)

    def step(self):
        """
        Apply one Muon update to every parameter.

        The parameters of each group are dealt out round-robin across ranks. Each rank
        orthogonalizes the momentum-adjusted gradients of its own share, the shares are combined
        with an all-reduce, and every rank then applies the complete set of updates. Reads the
        WORLD_SIZE and RANK environment variables and calls dist.all_reduce, so a process group
        must already be initialized.
        """
        # The rank layout is the same for every parameter, so parse the environment once instead
        # of once per parameter.
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            zeropower_backend = zeropower_backends[group['backend']]
            # generate weight updates in distributed fashion
            total_params = sum(p.numel() for p in group['params'])
            # Slots belonging to other ranks stay zero, so the SUM all-reduce below simply
            # stitches the per-rank pieces together.
            updates_flat = torch.zeros(total_params, device='cuda', dtype=torch.bfloat16)
            curr_idx = 0
            for i, p in enumerate(group['params']):
                # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs
                if i % world_size == rank:
                    g = p.grad
                    assert g is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(g)
                    buf = state['momentum_buffer']
                    buf.mul_(momentum).add_(g)
                    if group['nesterov']:
                        g = g.add(buf, alpha=momentum)
                    else:
                        # Plain momentum: the update is the buffer itself. Previously the raw
                        # gradient was used here, which made `momentum` a no-op. Clone because a
                        # backend may normalize its input in place and the buffer has to survive
                        # until the next step.
                        g = buf.clone()
                    g = zeropower_backend(g, steps=group['backend_steps'])
                    g *= max(1, g.size(0)/g.size(1))**0.5
                    updates_flat[curr_idx:curr_idx+p.numel()] = g.flatten()
                # Advance for every parameter, owned or not, so offsets agree across ranks.
                curr_idx += p.numel()
            # sync updates across devices. we are not memory-constrained so can do this simple deserialization
            dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)
            # deserialize and apply updates
            curr_idx = 0
            for p in group['params']:
                g = updates_flat[curr_idx:curr_idx+p.numel()].view_as(p.data).type_as(p.data)
                p.data.add_(g, alpha=-lr)
                curr_idx += p.numel()
| # ----------------------------------------------------------------------------- | |
| # PyTorch nn.Module definitions for the GPT-2 model | |
class Rotary(torch.nn.Module):
    """
    Produces the cos/sin tables used for rotary position embeddings.

    The tables are built on the first call and rebuilt only when the sequence length (dimension
    1 of the input) differs from the one they were last built for.

    Arguments:
        dim: Size of the per-head feature dimension; dim/2 frequencies are generated.
        base: Base of the geometric frequency progression.
    """
    def __init__(self, dim, base=10000):
        super().__init__()
        self.dim = dim
        self.base = base
        # Everything below is filled in lazily by forward().
        self.inv_freq = None
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        """Return (cos, sin), each of shape (1, x.shape[1], 1, dim/2) in bfloat16."""
        n_positions = x.shape[1]
        cache_is_stale = n_positions != self.seq_len_cached
        if cache_is_stale:
            exponents = torch.arange(0, self.dim, 2, device=x.device).float() / self.dim
            self.inv_freq = 1.0 / (self.base ** exponents)
            self.seq_len_cached = n_positions
            positions = torch.arange(n_positions, device=x.device).type_as(self.inv_freq)
            # angles[t, j] = t * inv_freq[j]
            angles = torch.outer(positions, self.inv_freq)
            self.cos_cached = angles.cos().bfloat16()
            self.sin_cached = angles.sin().bfloat16()
        cos, sin = self.cos_cached, self.sin_cached
        # Insert singleton batch and head axes so the tables broadcast over both.
        return cos[None, :, None, :], sin[None, :, None, :]
def apply_rotary_emb(x, cos, sin):
    """
    Rotate the two halves of the last dimension of x by the angles encoded in cos/sin.

    x must be 4-dimensional; cos and sin must broadcast against one half of its last dimension.
    The result has the dtype of x.
    """
    assert x.ndim == 4 # multihead attention
    half = x.shape[3] // 2
    lower, upper = x[..., :half], x[..., half:]
    rotated_lower = lower * cos + upper * sin
    rotated_upper = upper * cos - lower * sin
    return torch.cat((rotated_lower, rotated_upper), dim=3).type_as(x)
class CastedLinear(nn.Linear):
    """
    nn.Linear whose parameters are cast to the dtype of the input on every call.

    This lets the layer keep its parameters in one dtype while the matmul runs in whatever dtype
    the activations arrive in.
    """
    def forward(self, x):
        # nn.Linear creates a bias unless told otherwise. Honour it instead of silently dropping
        # it; layers built with bias=False take the None path and behave exactly as before.
        bias = None if self.bias is None else self.bias.to(x.dtype)
        return F.linear(x, self.weight.to(x.dtype), bias)
class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention with QK-norm, rotary position embeddings and a learned blend
    between this layer's values and the values passed in as `v1`. Masking is delegated entirely
    to the `block_mask` handed to flex_attention.
    """
    def __init__(self, config):
        super().__init__()
        n_head, n_embd = config.n_head, config.n_embd
        self.n_head = n_head
        self.n_embd = n_embd
        self.head_dim = n_embd // n_head
        assert self.n_embd % self.n_head == 0
        # query, key and value projections
        self.c_q = CastedLinear(n_embd, n_embd, bias=False)
        self.c_k = CastedLinear(n_embd, n_embd, bias=False)
        self.c_v = CastedLinear(n_embd, n_embd, bias=False)
        # output projection
        self.c_proj = CastedLinear(n_embd, n_embd, bias=False)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977
        self.rotary = Rotary(self.head_dim)
        # mixing weight between this layer's values and v1
        self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977

    def forward(self, x, v1, block_mask):
        """
        Arguments:
            x: Input of shape (B, T, n_embd).
            v1: Values to blend in, or None, in which case this layer's own values are used and
                returned so that later layers can reuse them.
            block_mask: Mask forwarded to flex_attention.

        Returns (y, v1), where y has the shape of x.
        """
        B, T, _ = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        per_head = (B, T, self.n_head, self.head_dim)
        q = self.c_q(x).view(per_head)
        k = self.c_k(x).view(per_head)
        v = self.c_v(x).view(per_head)
        if v1 is None:
            v1 = v # This happens if we are in the first block. v needs to be accessed by subsequent blocks
        v = (1 - self.lamb) * v + self.lamb * v1.view_as(v) # @Grad62304977
        cos, sin = self.rotary(q)
        # QK norm suggested by @Grad62304977, followed by the rotary embedding
        q = apply_rotary_emb(F.rms_norm(q, (q.size(-1),)), cos, sin)
        k = apply_rotary_emb(F.rms_norm(k, (k.size(-1),)), cos, sin)
        # flex_attention is given tensors with the head axis ahead of the sequence axis
        attended = flex_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), block_mask=block_mask)
        merged = attended.transpose(1, 2).contiguous().view_as(x) # re-assemble all head outputs side by side
        return self.c_proj(merged), v1
class MLP(nn.Module):
    """Two-layer feed-forward block: expand to 4 * n_embd, apply ReLU squared, project back."""
    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = CastedLinear(width, hidden, bias=False)
        self.c_proj = CastedLinear(hidden, width, bias=False)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        hidden = self.c_fc(x)
        # ReLU^2: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        activated = F.relu(hidden).square()
        return self.c_proj(activated)
class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running activation with x0, then attention and
    MLP sub-layers, each applied to an RMS-normalized input and added back residually.
    """
    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)
        # lambdas[0] scales the incoming activation, lambdas[1] scales x0; the initial values
        # (1, 0) pass the activation through unchanged.
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, v1, x0, block_mask):
        """Return (x, v1): the updated activation and the values reported by the attention layer."""
        weight_x, weight_x0 = self.lambdas[0], self.lambdas[1]
        x = weight_x * x + weight_x0 * x0
        attn_out, v1 = self.attn(F.rms_norm(x, (x.size(-1),)), v1, block_mask)
        x = x + attn_out
        mlp_out = self.mlp(F.rms_norm(x, (x.size(-1),)))
        x = x + mlp_out
        return x, v1
| # ----------------------------------------------------------------------------- | |
| # The main GPT-2 model | |
# Shape of the GPT model: vocabulary size, number of blocks, attention heads and embedding width.
@dataclass
class GPTConfig:
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 6 # head dim 128 suggested by @Grad62304977
    n_embd: int = 768
class GPT(nn.Module):
    """
    GPT language model laid out as a U-net: the first half of the blocks ("encoder") store their
    outputs, and the second half ("decoder") each add one stored output back in, most recent
    first, scaled by a learned weight.

    Raises:
        ValueError: if config.n_layer is odd, since every decoder layer needs a matching encoder
            output.
    """
    def __init__(self, config):
        super().__init__()
        # forward() pops exactly one stored encoder output per decoder layer. With an odd layer
        # count there is one more decoder layer than encoder layers, and the last pop would fail
        # with a bare IndexError deep inside forward(); reject it up front instead.
        if config.n_layer % 2 != 0:
            raise ValueError(f"n_layer must be even for the U-net skip connections, got {config.n_layer}")
        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        ))
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size, bias=False)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx, target, attn_blocksize):
        """
        Compute the cross-entropy loss for one flat token sequence.

        Arguments:
            idx: 1D tensor of input token ids; a batch axis of size one is added internally.
            target: Token ids to predict; flattened before the loss is taken.
            attn_blocksize: A query may only attend to keys fewer than this many positions behind it.

        Returns the scalar loss.
        """
        # Running count of token 50256 (presumably the GPT-2 end-of-text id - TODO confirm): two
        # positions belong to the same document exactly when their counts agree.
        docs = (idx == 50256).cumsum(0)

        def document_causal_mask(b, h, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            document_mask = docs[q_idx] == docs[kv_idx]
            window_mask = q_idx - kv_idx < attn_blocksize
            return causal_mask & document_mask & window_mask

        S = len(idx)
        block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True)
        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = F.rms_norm(x, (x.size(-1),)) # @Grad62304977
        x0 = x
        v1 = None
        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x, v1 = self.transformer.h[i](x, v1, x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            x, v1 = self.transformer.h[self.num_encoder_layers + i](x, v1, x0, block_mask)
        x = F.rms_norm(x, (x.size(-1),))
        logits = self.lm_head(x)
        # soft-cap the logits to the open interval (-30, 30)
        logits = 30 * torch.tanh(logits / 30) # @Grad62304977
        logits = logits.float()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss
| # ----------------------------------------------------------------------------- | |
| # Our own simple Distributed Data Loader | |
| def _peek_data_shard(filename): | |
| # only reads the header, returns header data | |
| with open(filename, "rb") as f: | |
| # first read the header, which is 256 int32 integers (4 bytes each) | |
| header = np.frombuffer(f.read(256*4), dtype=np.int32) | |
| if header[0] != 20240520: | |
| print("ERROR: magic number mismatch in the data .bin file!") | |
| print("---> HINT: Are you passing in a correct file with --input_bin?") | |
| print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README") | |
| print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try") | |
| exit(1) | |
| assert header[1] == 1, "unsupported version" | |
| ntok = header[2] # number of tokens (claimed) | |
| return ntok # for now just return the number of tokens | |
| def _load_data_shard(filename): | |
| with open(filename, "rb") as f: | |
| # first read the header, which is 256 int32 integers (4 bytes each) | |
| header = np.frombuffer(f.read(256*4), dtype=np.int32) | |
| assert header[0] == 20240520, "magic number mismatch in the data .bin file" | |
| assert header[1] == 1, "unsupported version" | |
| ntok = header[2] # number of tokens (claimed) | |
| # the rest of it are tokens, stored as uint16 | |
| tokens = np.frombuffer(f.read(), dtype=np.uint16) | |
| assert len(tokens) == ntok, "number of tokens read does not match header?" | |
| return tokens | |
class DistributedDataLoader:
    """
    Hands out (input, target) token batches from a set of shard files, one shard in memory at a
    time, giving each process its own slice of every global batch.

    Arguments:
        filename_pattern: Glob pattern matching the shard files; matches are used in sorted order.
        B, T: Per-process batch size and sequence length; each batch carries B*T tokens.
        process_rank: Index of this process, used to offset its reads within a shard.
        num_processes: Number of processes reading in lockstep.
    """
    def __init__(self, filename_pattern, B, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.B = B
        self.T = T
        # glob files that match the pattern
        self.files = sorted(glob.glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"
        # Validate every shard header and add up the token counts. Each shard has to hold at
        # least one full global batch plus the one extra token needed for the shifted targets.
        min_tokens_per_shard = num_processes * B * T + 1
        running_total = 0
        for shard_path in self.files:
            shard_ntok = _peek_data_shard(shard_path)
            assert shard_ntok >= min_tokens_per_shard
            running_total += int(shard_ntok)
        self.ntok_total = running_total
        self.reset()

    def reset(self):
        """Rewind to the beginning of the first shard."""
        # advance() increments before loading, so start one before the first shard
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        """Load the next shard (wrapping around) and move to this process's first slice in it."""
        next_shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = next_shard
        self.current_position = self.process_rank * self.B * self.T
        self.tokens = _load_data_shard(self.files[next_shard])

    def next_batch(self):
        """Return (x, y) CUDA tensors of B*T tokens each, with y being x shifted by one token."""
        tokens_per_rank = self.B * self.T
        global_stride = tokens_per_rank * self.num_processes
        start = self.current_position
        # one token more than the batch so that inputs and targets can both be cut from it
        window = self.tokens[start:start + tokens_per_rank + 1]
        buf = torch.tensor(window.astype(np.int32), dtype=torch.long)
        inputs, targets = buf[:-1], buf[1:]
        # advance current position and load next shard if necessary
        self.current_position = start + global_stride
        if self.current_position + global_stride >= len(self.tokens):
            self.advance()
        return inputs.cuda(), targets.cuda()
| # ----------------------------------------------------------------------------- | |
| # int main | |
# All run settings in one place; the defaults below are what the script trains with.
@dataclass
class Hyperparameters:
    # data hyperparams
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin' # glob of training shards
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin' # glob of validation shards
    # optimization hyperparams
    batch_size: int = 8 # global batch size, in sequences, summed over all devices
    device_batch_size: int = 1 # batch size, in sequences, on each device
    sequence_length: int = 64 * 1024 # tokens per sequence
    num_iterations: int = 1750 # number of training iterations
    warmup_iters: int = 0 # iterations of linear warmup
    cooldown_iters: int = 640 # iterations of linear cooldown at the end (triangular or trapezoidal schedule)
    weight_decay: float = 0
    # evaluation and logging hyperparams
    val_loss_every: int = 125 # evaluate val loss every this many steps; 0 for only at the end
    val_tokens: int = 10_485_760 # validation tokens per evaluation; keep fixed for consistent comparisons
    save_every: int = 0 # save a checkpoint every this many steps; 0 for only at the end

args = Hyperparameters()
# Set up DDP (distributed data parallel). RANK, LOCAL_RANK and WORLD_SIZE are read from the
# environment; torchrun sets them.
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank, ddp_local_rank, ddp_world_size = (
    int(os.environ[name]) for name in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE')
)
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.

# Begin logging: only the master process gets a run id, a checkpoint directory and a log file.
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file, opening it with a copy of this script framed by separator lines
    with open(logfile, "w") as f:
        f.write('='*100 + '\n' + code + '='*100 + '\n')
def print0(s, logonly=False):
    """
    Append `s` plus a newline to the log file and, unless `logonly` is set, also print it.
    Does nothing on any process other than the master process.
    """
    if not master_process:
        return
    with open(logfile, "a") as log:
        if not logonly:
            print(s)
        log.write(s + '\n')
# Record the software/hardware environment: library versions go to console and log, the full
# `nvidia-smi` output goes to the log file only.
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)

# convenience variables
B, T = args.device_batch_size, args.sequence_length
# Number of steps in the val loop: the validation budget must split evenly over the tokens
# consumed per step by all processes together.
tokens_per_val_step = B * T * ddp_world_size
assert args.val_tokens % tokens_per_val_step == 0
val_steps = args.val_tokens // tokens_per_val_step
# Gradient accumulation steps needed to reach the desired global batch size.
sequences_per_micro_step = B * ddp_world_size
assert args.batch_size % sequences_per_micro_step == 0
train_accumulation_steps = args.batch_size // sequences_per_micro_step

# load tokens
train_loader = DistributedDataLoader(args.input_bin, B, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, B, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# Fetch the first training batch up front; the training loop consumes it on its first step.
x, y = train_loader.next_batch()

# there are only 50257 unique GPT-2 tokens; we extend to nearest multiple of 128 for efficiency. suggested to me by @Grad62304977.
# this originates from Karpathy's experiments.
num_vocab = 50304
model = GPT(GPTConfig(vocab_size=num_vocab, n_layer=12, n_head=6, n_embd=768)).cuda().bfloat16()
# The model as a whole is bfloat16, but every CastedLinear goes back to float32 parameters.
casted_linears = [m for m in model.modules() if isinstance(m, CastedLinear)]
for linear in casted_linears:
    linear.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# CUDNN attention is ~4ms faster than Flash, but doesn't get selected by default in PyTorch 2.5.1
from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp
for toggle_backend, enabled in (
    (enable_cudnn_sdp, True),
    (enable_flash_sdp, False),
    (enable_mem_efficient_sdp, False),
    (enable_math_sdp, False),
):
    toggle_backend(enabled)
# init the optimizer(s): Adam for the embedding, the output head and all scalar/vector
# parameters; Muon for the 2D matrices inside the transformer blocks.
adam_kwargs = dict(betas=(0.8, 0.95), fused=True)
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight], lr=0.6, **adam_kwargs)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, **adam_kwargs)
params = list(raw_model.transformer.h.parameters())
# Split the block parameters by rank, preserving their order.
matrix_params, scalar_params = [], []
for p in params:
    if p.ndim == 2:
        matrix_params.append(p)
    elif p.ndim < 2:
        scalar_params.append(p)
scalar_params.append(raw_model.skip_weights)
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, **adam_kwargs) # note that this learning rate is neither sensitive nor tuned
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it`.

    The schedule ramps up linearly over `args.warmup_iters` iterations, holds at 1.0, then decays
    linearly over the last `args.cooldown_iters` iterations, reaching 0 at `args.num_iterations`.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while
    elif it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # No cooldown configured: stay constant. Without this guard the scheduler's final call, with
    # it == num_iterations, would fall through to the branch below and divide by zero.
    elif args.cooldown_iters <= 0:
        return 1.0
    # 3) linear cooldown
    else:
        decay_ratio = (args.num_iterations - it) / args.cooldown_iters
        return decay_ratio
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]

# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.time()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # Set the attention blocksize for the current step, in chunks of 64: it grows linearly from
    # 64 to 1792 over the run and is rounded down to a multiple of 64.
    progress = step/args.num_iterations
    raw_blocksize = progress * (1792 - 64) + 64
    attn_blocksize = torch.tensor(64*(raw_blocksize//64), dtype=torch.int, device='cuda')
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.time()
    if step <= 11: # <= 11 to avoid bug in val
        timed_steps = float('nan')
    else:
        timed_steps = (step - 10) + 1

    # once in a while evaluate the validation dataset
    validate_now = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)
    if validate_now:
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.time() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize)
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.time()

    checkpoint_now = last_step or (args.save_every > 0 and step % args.save_every == 0)
    if master_process and checkpoint_now:
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.time() - t0)
        # save the state of the training process
        optimizer_states = [opt.state_dict() for opt in optimizers]
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=optimizer_states)
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.time()

    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break

    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for micro_step in range(train_accumulation_steps):
        # forward pass
        loss = model(x, y, attn_blocksize=attn_blocksize)
        train_loss = loss.detach()
        # advance the dataset for the next batch
        x, y = train_loader.next_batch()
        # backward pass
        more_micro_steps_follow = micro_step + 1 < train_accumulation_steps
        if more_micro_steps_follow:
            with model.no_sync(): # there's no need to sync gradients every accumulation step
                loss.backward()
        else:
            loss.backward() # just sync on the last step
    # turn the accumulated gradient sum into a mean over the accumulation steps
    for p in model.parameters():
        p.grad /= train_accumulation_steps
    # momentum warmup for Muon: ramps linearly from 0.85 to 0.95 over the first 300 steps
    frac = min(step/300, 1)
    optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------

    # everything that follows now is just diagnostics, prints, logging, etc.
    #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower
    approx_time = training_time_ms + 1000 * (time.time() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")

if master_process:
    peak_mib = torch.cuda.max_memory_allocated() // 1024 // 1024
    print(f"peak memory consumption: {peak_mib} MiB")

# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
| ==================================================================================================== | |
| Running pytorch 2.6.0.dev20241124+cu124 compiled for CUDA 12.4 | |
| nvidia-smi: | |
| Sun Nov 24 23:36:17 2024 | |
| +-----------------------------------------------------------------------------------------+ | |
| | NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 | | |
| |-----------------------------------------+------------------------+----------------------+ | |
| | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | | MIG M. | | |
| |=========================================+========================+======================| | |
| | 0 NVIDIA H100 80GB HBM3 Off | 00000000:18:00.0 Off | 0 | | |
| | N/A 28C P0 68W / 700W | 4MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 1 NVIDIA H100 80GB HBM3 Off | 00000000:2A:00.0 Off | 0 | | |
| | N/A 31C P0 114W / 700W | 34MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 2 NVIDIA H100 80GB HBM3 Off | 00000000:3A:00.0 Off | 0 | | |
| | N/A 32C P0 111W / 700W | 530MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 3 NVIDIA H100 80GB HBM3 Off | 00000000:5D:00.0 Off | 0 | | |
| | N/A 29C P0 112W / 700W | 530MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 4 NVIDIA H100 80GB HBM3 Off | 00000000:84:00.0 Off | 0 | | |
| | N/A 29C P0 111W / 700W | 530MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 5 NVIDIA H100 80GB HBM3 Off | 00000000:8B:00.0 Off | 0 | | |
| | N/A 32C P0 113W / 700W | 530MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 6 NVIDIA H100 80GB HBM3 Off | 00000000:91:00.0 Off | 0 | | |
| | N/A 30C P0 110W / 700W | 530MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| | 7 NVIDIA H100 80GB HBM3 Off | 00000000:E4:00.0 Off | 0 | | |
| | N/A 29C P0 114W / 700W | 530MiB / 81559MiB | 0% Default | | |
| | | | Disabled | | |
| +-----------------------------------------+------------------------+----------------------+ | |
| +-----------------------------------------------------------------------------------------+ | |
| | Processes: | | |
| | GPU GI CI PID Type Process name GPU Memory | | |
| | ID ID Usage | | |
| |=========================================================================================| | |
| | 1 N/A N/A 1796 C /usr/bin/python3 0MiB | | |
| | 2 N/A N/A 1797 C /usr/bin/python3 0MiB | | |
| | 3 N/A N/A 1798 C /usr/bin/python3 0MiB | | |
| | 4 N/A N/A 1799 C /usr/bin/python3 0MiB | | |
| | 5 N/A N/A 1800 C /usr/bin/python3 0MiB | | |
| | 6 N/A N/A 1801 C /usr/bin/python3 0MiB | | |
| | 7 N/A N/A 1802 C /usr/bin/python3 0MiB | | |
| +-----------------------------------------------------------------------------------------+ | |
| ==================================================================================================== | |
| Training DataLoader: total number of tokens: 1800000000 across 18 files | |
| Validation DataLoader: total number of tokens: 100000000 across 1 files | |
| ==================================================================================================== | |
| step:0/1750 val_loss:10.8258 train_time:0ms step_avg:nanms | |
| step:1/1750 train_loss:10.8258 train_time:163438ms step_avg:nanms | |
| step:2/1750 train_loss:10.0893 train_time:163548ms step_avg:nanms | |
| step:3/1750 train_loss:8.3951 train_time:163693ms step_avg:nanms | |
| step:4/1750 train_loss:7.5780 train_time:163842ms step_avg:nanms | |
| step:5/1750 train_loss:7.4681 train_time:163991ms step_avg:nanms | |
| step:6/1750 train_loss:6.9705 train_time:164138ms step_avg:nanms | |
| step:7/1750 train_loss:7.2105 train_time:164285ms step_avg:nanms | |
| step:8/1750 train_loss:6.7367 train_time:164434ms step_avg:nanms | |
| step:9/1750 train_loss:6.6213 train_time:164581ms step_avg:nanms | |
| step:10/1750 train_loss:6.5087 train_time:164728ms step_avg:nanms | |
| step:11/1750 train_loss:6.4831 train_time:110ms step_avg:nanms | |
| step:12/1750 train_loss:6.3491 train_time:257ms step_avg:nanms | |
| step:13/1750 train_loss:6.2939 train_time:407ms step_avg:135.58ms | |
| step:14/1750 train_loss:6.2331 train_time:553ms step_avg:138.33ms | |
| step:15/1750 train_loss:6.2026 train_time:701ms step_avg:140.24ms | |
| step:16/1750 train_loss:6.1480 train_time:849ms step_avg:141.52ms | |
| step:17/1750 train_loss:6.2293 train_time:996ms step_avg:142.24ms | |
| step:18/1750 train_loss:6.0134 train_time:1145ms step_avg:143.09ms | |
| step:19/1750 train_loss:6.0690 train_time:1292ms step_avg:143.54ms | |
| step:20/1750 train_loss:5.7139 train_time:1440ms step_avg:143.97ms | |
| step:21/1750 train_loss:6.0282 train_time:1588ms step_avg:144.36ms | |
| step:22/1750 train_loss:6.2938 train_time:1736ms step_avg:144.65ms | |
| step:23/1750 train_loss:5.9283 train_time:1884ms step_avg:144.90ms | |
| step:24/1750 train_loss:6.0908 train_time:2032ms step_avg:145.13ms | |
| step:25/1750 train_loss:5.7889 train_time:2180ms step_avg:145.32ms | |
| step:26/1750 train_loss:5.7179 train_time:2330ms step_avg:145.60ms | |
| step:27/1750 train_loss:5.9059 train_time:2476ms step_avg:145.67ms | |
| step:28/1750 train_loss:5.5065 train_time:2626ms step_avg:145.89ms | |
| step:29/1750 train_loss:5.7777 train_time:2774ms step_avg:145.98ms | |
| step:30/1750 train_loss:5.5997 train_time:2922ms step_avg:146.11ms | |
| step:31/1750 train_loss:5.5644 train_time:3070ms step_avg:146.17ms | |
| step:32/1750 train_loss:5.4084 train_time:3216ms step_avg:146.18ms | |
| step:33/1750 train_loss:5.6910 train_time:3365ms step_avg:146.30ms | |
| step:34/1750 train_loss:5.5944 train_time:3513ms step_avg:146.36ms | |
| step:35/1750 train_loss:5.7210 train_time:3660ms step_avg:146.39ms | |
| step:36/1750 train_loss:5.6432 train_time:3809ms step_avg:146.49ms | |
| step:37/1750 train_loss:5.5441 train_time:3956ms step_avg:146.51ms | |
| step:38/1750 train_loss:5.4215 train_time:4105ms step_avg:146.59ms | |
| step:39/1750 train_loss:5.4423 train_time:4252ms step_avg:146.62ms | |
| step:40/1750 train_loss:5.3416 train_time:4399ms step_avg:146.62ms | |
| step:41/1750 train_loss:5.3366 train_time:4547ms step_avg:146.68ms | |
| step:42/1750 train_loss:5.2762 train_time:4693ms step_avg:146.67ms | |
| step:43/1750 train_loss:5.3978 train_time:4841ms step_avg:146.71ms | |
| step:44/1750 train_loss:5.3553 train_time:4989ms step_avg:146.74ms | |
| step:45/1750 train_loss:5.4858 train_time:5137ms step_avg:146.77ms | |
| step:46/1750 train_loss:5.2656 train_time:5284ms step_avg:146.79ms | |
| step:47/1750 train_loss:5.1761 train_time:5432ms step_avg:146.82ms | |
| step:48/1750 train_loss:5.3076 train_time:5580ms step_avg:146.83ms | |
| step:49/1750 train_loss:5.2348 train_time:5729ms step_avg:146.88ms | |
| step:50/1750 train_loss:5.3447 train_time:5876ms step_avg:146.90ms | |
| step:51/1750 train_loss:5.2615 train_time:6025ms step_avg:146.94ms | |
| step:52/1750 train_loss:5.1171 train_time:6172ms step_avg:146.95ms | |
| step:53/1750 train_loss:5.2673 train_time:6319ms step_avg:146.95ms | |
| step:54/1750 train_loss:5.1178 train_time:6468ms step_avg:146.99ms | |
| step:55/1750 train_loss:5.4847 train_time:6616ms step_avg:147.02ms | |
| step:56/1750 train_loss:5.1145 train_time:6764ms step_avg:147.04ms | |
| step:57/1750 train_loss:4.9826 train_time:6911ms step_avg:147.05ms | |
| step:58/1750 train_loss:5.1019 train_time:7057ms step_avg:147.03ms | |
| step:59/1750 train_loss:5.1164 train_time:7207ms step_avg:147.08ms | |
| step:60/1750 train_loss:5.2303 train_time:7354ms step_avg:147.08ms | |
| step:61/1750 train_loss:4.9726 train_time:7501ms step_avg:147.08ms | |
| step:62/1750 train_loss:5.0816 train_time:7649ms step_avg:147.10ms | |
| step:63/1750 train_loss:5.0547 train_time:7795ms step_avg:147.08ms | |
| step:64/1750 train_loss:4.9510 train_time:7944ms step_avg:147.11ms | |
| step:65/1750 train_loss:4.9034 train_time:8091ms step_avg:147.11ms | |
| step:66/1750 train_loss:5.0774 train_time:8239ms step_avg:147.13ms | |
| step:67/1750 train_loss:4.9320 train_time:8387ms step_avg:147.14ms | |
| step:68/1750 train_loss:5.1849 train_time:8534ms step_avg:147.14ms | |
| step:69/1750 train_loss:4.8183 train_time:8681ms step_avg:147.13ms | |
| step:70/1750 train_loss:4.9104 train_time:8829ms step_avg:147.16ms | |
| step:71/1750 train_loss:5.0624 train_time:8976ms step_avg:147.15ms | |
| step:72/1750 train_loss:4.9887 train_time:9124ms step_avg:147.17ms | |
| step:73/1750 train_loss:4.8633 train_time:9271ms step_avg:147.16ms | |
| step:74/1750 train_loss:4.9941 train_time:9418ms step_avg:147.16ms | |
| step:75/1750 train_loss:4.9696 train_time:9566ms step_avg:147.17ms | |
| step:76/1750 train_loss:4.9045 train_time:9714ms step_avg:147.17ms | |
| step:77/1750 train_loss:5.0199 train_time:9860ms step_avg:147.17ms | |
| step:78/1750 train_loss:5.2016 train_time:10009ms step_avg:147.19ms | |
| step:79/1750 train_loss:4.9216 train_time:10156ms step_avg:147.19ms | |
| step:80/1750 train_loss:4.9497 train_time:10304ms step_avg:147.20ms | |
| step:81/1750 train_loss:4.7405 train_time:10452ms step_avg:147.22ms | |
| step:82/1750 train_loss:4.8915 train_time:10599ms step_avg:147.22ms | |
| step:83/1750 train_loss:4.8516 train_time:10748ms step_avg:147.23ms | |
| step:84/1750 train_loss:4.8451 train_time:10895ms step_avg:147.23ms | |
| step:85/1750 train_loss:4.6940 train_time:11044ms step_avg:147.25ms | |
| step:86/1750 train_loss:4.9021 train_time:11191ms step_avg:147.25ms | |
| step:87/1750 train_loss:4.8275 train_time:11338ms step_avg:147.25ms | |
| step:88/1750 train_loss:4.8348 train_time:11486ms step_avg:147.25ms | |
| step:89/1750 train_loss:4.7845 train_time:11633ms step_avg:147.26ms | |
| step:90/1750 train_loss:4.7102 train_time:11781ms step_avg:147.26ms | |
| step:91/1750 train_loss:4.7036 train_time:11928ms step_avg:147.27ms | |
| step:92/1750 train_loss:4.8486 train_time:12076ms step_avg:147.27ms | |
| step:93/1750 train_loss:4.6607 train_time:12224ms step_avg:147.28ms | |
| step:94/1750 train_loss:4.7021 train_time:12371ms step_avg:147.27ms | |
| step:95/1750 train_loss:4.7476 train_time:12517ms step_avg:147.26ms | |
| step:96/1750 train_loss:4.6388 train_time:12666ms step_avg:147.28ms | |
| step:97/1750 train_loss:4.6843 train_time:12813ms step_avg:147.28ms | |
| step:98/1750 train_loss:4.6344 train_time:12960ms step_avg:147.27ms | |
| step:99/1750 train_loss:4.7371 train_time:13109ms step_avg:147.29ms | |
| step:100/1750 train_loss:4.7256 train_time:13256ms step_avg:147.29ms | |
| step:101/1750 train_loss:4.5779 train_time:13404ms step_avg:147.30ms | |
| step:102/1750 train_loss:4.7521 train_time:13551ms step_avg:147.30ms | |
| step:103/1750 train_loss:4.6428 train_time:13698ms step_avg:147.29ms | |
| step:104/1750 train_loss:4.5703 train_time:13847ms step_avg:147.31ms | |
| step:105/1750 train_loss:4.6017 train_time:13993ms step_avg:147.30ms | |
| step:106/1750 train_loss:4.6746 train_time:14142ms step_avg:147.31ms | |
| step:107/1750 train_loss:4.5616 train_time:14289ms step_avg:147.31ms | |
| step:108/1750 train_loss:4.3978 train_time:14437ms step_avg:147.32ms | |
| step:109/1750 train_loss:4.5401 train_time:14585ms step_avg:147.32ms | |
| step:110/1750 train_loss:4.5238 train_time:14732ms step_avg:147.32ms | |
| step:111/1750 train_loss:4.4611 train_time:14879ms step_avg:147.32ms | |
| step:112/1750 train_loss:4.6176 train_time:15027ms step_avg:147.32ms | |
| step:113/1750 train_loss:4.5175 train_time:15175ms step_avg:147.33ms | |
| step:114/1750 train_loss:4.3843 train_time:15323ms step_avg:147.34ms | |
| step:115/1750 train_loss:4.5378 train_time:15470ms step_avg:147.34ms | |
| step:116/1750 train_loss:4.5121 train_time:15617ms step_avg:147.33ms | |
| step:117/1750 train_loss:4.4269 train_time:15765ms step_avg:147.34ms | |
| step:118/1750 train_loss:4.6516 train_time:15915ms step_avg:147.36ms | |
| step:119/1750 train_loss:4.5109 train_time:16061ms step_avg:147.35ms | |
| step:120/1750 train_loss:4.4046 train_time:16210ms step_avg:147.37ms | |
| step:121/1750 train_loss:4.3465 train_time:16357ms step_avg:147.36ms | |
| step:122/1750 train_loss:4.4887 train_time:16505ms step_avg:147.37ms | |
| step:123/1750 train_loss:4.3337 train_time:16652ms step_avg:147.37ms | |
| step:124/1750 train_loss:4.6312 train_time:16800ms step_avg:147.36ms | |
| step:125/1750 train_loss:4.5231 train_time:16948ms step_avg:147.37ms | |
| step:125/1750 val_loss:4.4538 train_time:16985ms step_avg:147.70ms | |
| step:126/1750 train_loss:4.4651 train_time:17096ms step_avg:147.38ms | |
| step:127/1750 train_loss:4.4789 train_time:17246ms step_avg:147.41ms | |
| step:128/1750 train_loss:4.4198 train_time:17393ms step_avg:147.40ms | |
| step:129/1750 train_loss:4.7347 train_time:17541ms step_avg:147.40ms | |
| step:130/1750 train_loss:4.4163 train_time:17689ms step_avg:147.41ms | |
| step:131/1750 train_loss:4.4430 train_time:17838ms step_avg:147.42ms | |
| step:132/1750 train_loss:4.3889 train_time:17990ms step_avg:147.46ms | |
| step:133/1750 train_loss:4.4932 train_time:18139ms step_avg:147.47ms | |
| step:134/1750 train_loss:4.2932 train_time:18290ms step_avg:147.50ms | |
| step:135/1750 train_loss:4.4814 train_time:18441ms step_avg:147.53ms | |
| step:136/1750 train_loss:4.2469 train_time:18591ms step_avg:147.55ms | |
| step:137/1750 train_loss:4.4101 train_time:18741ms step_avg:147.57ms | |
| step:138/1750 train_loss:4.3273 train_time:18892ms step_avg:147.59ms | |
| step:139/1750 train_loss:4.4063 train_time:19043ms step_avg:147.62ms | |
| step:140/1750 train_loss:4.5022 train_time:19194ms step_avg:147.64ms | |
| step:141/1750 train_loss:4.3406 train_time:19344ms step_avg:147.67ms | |
| step:142/1750 train_loss:4.3411 train_time:19494ms step_avg:147.69ms | |
| step:143/1750 train_loss:4.2793 train_time:19646ms step_avg:147.71ms | |
| step:144/1750 train_loss:4.3757 train_time:19795ms step_avg:147.72ms | |
| step:145/1750 train_loss:4.3371 train_time:19946ms step_avg:147.75ms | |
| step:146/1750 train_loss:4.2026 train_time:20097ms step_avg:147.77ms | |
| step:147/1750 train_loss:4.3480 train_time:20250ms step_avg:147.81ms | |
| step:148/1750 train_loss:4.3887 train_time:20400ms step_avg:147.83ms | |
| step:149/1750 train_loss:4.3296 train_time:20552ms step_avg:147.86ms | |
| step:150/1750 train_loss:4.4696 train_time:20703ms step_avg:147.88ms | |
| step:151/1750 train_loss:4.2961 train_time:20854ms step_avg:147.90ms | |
| step:152/1750 train_loss:4.3049 train_time:21004ms step_avg:147.92ms | |
| step:153/1750 train_loss:4.3990 train_time:21154ms step_avg:147.93ms | |
| step:154/1750 train_loss:4.3798 train_time:21305ms step_avg:147.95ms | |
| step:155/1750 train_loss:4.3071 train_time:21455ms step_avg:147.97ms | |
| step:156/1750 train_loss:4.3730 train_time:21606ms step_avg:147.99ms | |
| step:157/1750 train_loss:4.4318 train_time:21757ms step_avg:148.00ms | |
| step:158/1750 train_loss:4.2650 train_time:21907ms step_avg:148.02ms | |
| step:159/1750 train_loss:4.3304 train_time:22058ms step_avg:148.04ms | |
| step:160/1750 train_loss:4.1536 train_time:22208ms step_avg:148.05ms | |
| step:161/1750 train_loss:4.3783 train_time:22358ms step_avg:148.07ms | |
| step:162/1750 train_loss:4.3926 train_time:22509ms step_avg:148.08ms | |
| step:163/1750 train_loss:4.3581 train_time:22660ms step_avg:148.10ms | |
| step:164/1750 train_loss:4.2046 train_time:22811ms step_avg:148.12ms | |
| step:165/1750 train_loss:4.3091 train_time:22961ms step_avg:148.13ms | |
| step:166/1750 train_loss:4.3748 train_time:23111ms step_avg:148.15ms | |
| step:167/1750 train_loss:4.2263 train_time:23260ms step_avg:148.16ms | |
| step:168/1750 train_loss:4.3050 train_time:23411ms step_avg:148.17ms | |
| step:169/1750 train_loss:4.1774 train_time:23561ms step_avg:148.18ms | |
| step:170/1750 train_loss:4.0538 train_time:23712ms step_avg:148.20ms | |
| step:171/1750 train_loss:4.2378 train_time:23862ms step_avg:148.21ms | |
| step:172/1750 train_loss:4.2340 train_time:24012ms step_avg:148.22ms | |
| step:173/1750 train_loss:4.2895 train_time:24164ms step_avg:148.24ms | |
| step:174/1750 train_loss:4.4565 train_time:24313ms step_avg:148.25ms | |
| step:175/1750 train_loss:4.2879 train_time:24464ms step_avg:148.27ms | |
| step:176/1750 train_loss:4.1282 train_time:24614ms step_avg:148.28ms | |
| step:177/1750 train_loss:4.0948 train_time:24765ms step_avg:148.29ms | |
| step:178/1750 train_loss:4.2127 train_time:24914ms step_avg:148.30ms | |
| step:179/1750 train_loss:4.1663 train_time:25066ms step_avg:148.32ms | |
| step:180/1750 train_loss:4.1423 train_time:25215ms step_avg:148.32ms | |
| step:181/1750 train_loss:4.3258 train_time:25367ms step_avg:148.34ms | |
| step:182/1750 train_loss:4.1825 train_time:25516ms step_avg:148.35ms | |
| step:183/1750 train_loss:4.1683 train_time:25667ms step_avg:148.37ms | |
| step:184/1750 train_loss:4.1513 train_time:25817ms step_avg:148.37ms | |
| step:185/1750 train_loss:4.2424 train_time:25968ms step_avg:148.39ms | |
| step:186/1750 train_loss:4.2009 train_time:26118ms step_avg:148.40ms | |
| step:187/1750 train_loss:4.2737 train_time:26269ms step_avg:148.41ms | |
| step:188/1750 train_loss:4.1966 train_time:26536ms step_avg:149.08ms | |
| step:189/1750 train_loss:4.1472 train_time:26837ms step_avg:149.93ms | |
| step:190/1750 train_loss:4.2418 train_time:26990ms step_avg:149.94ms | |
| step:191/1750 train_loss:4.1192 train_time:27141ms step_avg:149.95ms | |
| step:192/1750 train_loss:4.0633 train_time:27291ms step_avg:149.95ms | |
| step:193/1750 train_loss:4.2836 train_time:27442ms step_avg:149.96ms | |
| step:194/1750 train_loss:4.2035 train_time:27592ms step_avg:149.96ms | |
| step:195/1750 train_loss:4.3966 train_time:27743ms step_avg:149.96ms | |
| step:196/1750 train_loss:4.2172 train_time:27893ms step_avg:149.96ms | |
| step:197/1750 train_loss:4.0753 train_time:28044ms step_avg:149.97ms | |
| step:198/1750 train_loss:4.2062 train_time:28194ms step_avg:149.97ms | |
| step:199/1750 train_loss:4.0584 train_time:28344ms step_avg:149.97ms | |
| step:200/1750 train_loss:4.1504 train_time:28493ms step_avg:149.96ms | |
| step:201/1750 train_loss:4.0286 train_time:28642ms step_avg:149.96ms | |
| step:202/1750 train_loss:4.2739 train_time:28792ms step_avg:149.96ms | |
| step:203/1750 train_loss:4.0900 train_time:28941ms step_avg:149.95ms | |
| step:204/1750 train_loss:4.2088 train_time:29090ms step_avg:149.95ms | |
| step:205/1750 train_loss:4.2703 train_time:29240ms step_avg:149.95ms | |
| step:206/1750 train_loss:3.9673 train_time:29389ms step_avg:149.95ms | |
| step:207/1750 train_loss:4.1045 train_time:29538ms step_avg:149.94ms | |
| step:208/1750 train_loss:4.1207 train_time:29689ms step_avg:149.94ms | |
| step:209/1750 train_loss:4.2634 train_time:29838ms step_avg:149.94ms | |
| step:210/1750 train_loss:4.2119 train_time:29988ms step_avg:149.94ms | |
| step:211/1750 train_loss:4.0755 train_time:30164ms step_avg:150.07ms | |
| step:212/1750 train_loss:4.1534 train_time:30326ms step_avg:150.13ms | |
| step:213/1750 train_loss:4.0654 train_time:30474ms step_avg:150.12ms | |
| step:214/1750 train_loss:4.1369 train_time:30624ms step_avg:150.12ms | |
| step:215/1750 train_loss:3.9738 train_time:30773ms step_avg:150.11ms | |
| step:216/1750 train_loss:4.0305 train_time:30923ms step_avg:150.11ms | |
| step:217/1750 train_loss:4.0277 train_time:31072ms step_avg:150.11ms | |
| step:218/1750 train_loss:4.1009 train_time:31222ms step_avg:150.11ms | |
| step:219/1750 train_loss:4.0920 train_time:31371ms step_avg:150.10ms | |
| step:220/1750 train_loss:4.1055 train_time:31521ms step_avg:150.10ms | |
| step:221/1750 train_loss:4.1255 train_time:31671ms step_avg:150.10ms | |
| step:222/1750 train_loss:4.0226 train_time:31820ms step_avg:150.09ms | |
| step:223/1750 train_loss:4.0041 train_time:31970ms step_avg:150.09ms | |
| step:224/1750 train_loss:4.3246 train_time:32118ms step_avg:150.09ms | |
| step:225/1750 train_loss:3.9332 train_time:32268ms step_avg:150.09ms | |
| step:226/1750 train_loss:4.0081 train_time:32417ms step_avg:150.08ms | |
| step:227/1750 train_loss:4.0051 train_time:32567ms step_avg:150.08ms | |
| step:228/1750 train_loss:4.1663 train_time:32715ms step_avg:150.07ms | |
| step:229/1750 train_loss:3.9598 train_time:32865ms step_avg:150.07ms | |
| step:230/1750 train_loss:4.0801 train_time:33013ms step_avg:150.06ms | |
| step:231/1750 train_loss:3.9158 train_time:33163ms step_avg:150.06ms | |
| step:232/1750 train_loss:3.9955 train_time:33312ms step_avg:150.05ms | |
| step:233/1750 train_loss:4.1198 train_time:33461ms step_avg:150.05ms | |
| step:234/1750 train_loss:4.0620 train_time:33611ms step_avg:150.05ms | |
| step:235/1750 train_loss:3.9269 train_time:33761ms step_avg:150.05ms | |
| step:236/1750 train_loss:4.1247 train_time:33910ms step_avg:150.04ms | |
| step:237/1750 train_loss:4.1033 train_time:34059ms step_avg:150.04ms | |
| step:238/1750 train_loss:3.9671 train_time:34208ms step_avg:150.04ms | |
| step:239/1750 train_loss:4.1156 train_time:34358ms step_avg:150.03ms | |
| step:240/1750 train_loss:4.1388 train_time:34508ms step_avg:150.03ms | |
| step:241/1750 train_loss:3.9918 train_time:34658ms step_avg:150.03ms | |
| step:242/1750 train_loss:4.1711 train_time:34807ms step_avg:150.03ms | |
| step:243/1750 train_loss:4.0377 train_time:34956ms step_avg:150.03ms | |
| step:244/1750 train_loss:4.0966 train_time:35106ms step_avg:150.02ms | |
| step:245/1750 train_loss:4.1625 train_time:35255ms step_avg:150.02ms | |
| step:246/1750 train_loss:4.0826 train_time:35404ms step_avg:150.02ms | |
| step:247/1750 train_loss:4.0306 train_time:35553ms step_avg:150.01ms | |
| step:248/1750 train_loss:4.1463 train_time:35703ms step_avg:150.01ms | |
| step:249/1750 train_loss:3.9394 train_time:35853ms step_avg:150.01ms | |
| step:250/1750 train_loss:3.9975 train_time:36002ms step_avg:150.01ms | |
| step:250/1750 val_loss:4.0346 train_time:36040ms step_avg:150.17ms | |
| step:251/1750 train_loss:4.1051 train_time:36152ms step_avg:150.01ms | |
| step:252/1750 train_loss:4.1939 train_time:36303ms step_avg:150.01ms | |
| step:253/1750 train_loss:3.9638 train_time:36455ms step_avg:150.02ms | |
| step:254/1750 train_loss:3.9133 train_time:36603ms step_avg:150.01ms | |
| step:255/1750 train_loss:4.0981 train_time:36753ms step_avg:150.01ms | |
| step:256/1750 train_loss:4.0202 train_time:36901ms step_avg:150.01ms | |
| step:257/1750 train_loss:4.0193 train_time:37053ms step_avg:150.01ms | |
| step:258/1750 train_loss:4.0179 train_time:37202ms step_avg:150.01ms | |
| step:259/1750 train_loss:4.0585 train_time:37352ms step_avg:150.01ms | |
| step:260/1750 train_loss:4.0880 train_time:37502ms step_avg:150.01ms | |
| step:261/1750 train_loss:4.0460 train_time:37655ms step_avg:150.02ms | |
| step:262/1750 train_loss:4.0195 train_time:37809ms step_avg:150.03ms | |
| step:263/1750 train_loss:3.9180 train_time:37962ms step_avg:150.05ms | |
| step:264/1750 train_loss:4.0139 train_time:38115ms step_avg:150.06ms | |
| step:265/1750 train_loss:3.8921 train_time:38268ms step_avg:150.07ms | |
| step:266/1750 train_loss:3.9539 train_time:38419ms step_avg:150.07ms | |
| step:267/1750 train_loss:3.9522 train_time:38573ms step_avg:150.09ms | |
| step:268/1750 train_loss:3.9814 train_time:38725ms step_avg:150.10ms | |
| step:269/1750 train_loss:3.8802 train_time:38878ms step_avg:150.11ms | |
| step:270/1750 train_loss:4.1190 train_time:39031ms step_avg:150.12ms | |
| step:271/1750 train_loss:4.0007 train_time:39183ms step_avg:150.13ms | |
| step:272/1750 train_loss:3.9590 train_time:39336ms step_avg:150.14ms | |
| step:273/1750 train_loss:3.9899 train_time:39488ms step_avg:150.14ms | |
| step:274/1750 train_loss:4.0589 train_time:39641ms step_avg:150.16ms | |
| step:275/1750 train_loss:4.0832 train_time:39794ms step_avg:150.17ms | |
| step:276/1750 train_loss:4.2500 train_time:39947ms step_avg:150.18ms | |
| step:277/1750 train_loss:4.0591 train_time:40100ms step_avg:150.19ms | |
| step:278/1750 train_loss:4.1131 train_time:40253ms step_avg:150.20ms | |
| step:279/1750 train_loss:4.0144 train_time:40406ms step_avg:150.21ms | |
| step:280/1750 train_loss:4.2016 train_time:40560ms step_avg:150.22ms | |
| step:281/1750 train_loss:3.9922 train_time:40712ms step_avg:150.23ms | |
| step:282/1750 train_loss:3.9644 train_time:40865ms step_avg:150.24ms | |
| step:283/1750 train_loss:3.9320 train_time:41017ms step_avg:150.25ms | |
| step:284/1750 train_loss:4.0718 train_time:41170ms step_avg:150.26ms | |
| step:285/1750 train_loss:4.0895 train_time:41321ms step_avg:150.26ms | |
| step:286/1750 train_loss:4.1085 train_time:41475ms step_avg:150.27ms | |
| step:287/1750 train_loss:3.9361 train_time:41628ms step_avg:150.28ms | |
| step:288/1750 train_loss:4.0355 train_time:41781ms step_avg:150.29ms | |
| step:289/1750 train_loss:3.9032 train_time:41935ms step_avg:150.30ms | |
| step:290/1750 train_loss:3.8801 train_time:42087ms step_avg:150.31ms | |
| step:291/1750 train_loss:3.9385 train_time:42241ms step_avg:150.32ms | |
| step:292/1750 train_loss:3.8894 train_time:42393ms step_avg:150.33ms | |
| step:293/1750 train_loss:3.9270 train_time:42545ms step_avg:150.33ms | |
| step:294/1750 train_loss:3.9656 train_time:42698ms step_avg:150.34ms | |
| step:295/1750 train_loss:3.8600 train_time:42850ms step_avg:150.35ms | |
| step:296/1750 train_loss:3.8868 train_time:43004ms step_avg:150.36ms | |
| step:297/1750 train_loss:3.8973 train_time:43158ms step_avg:150.38ms | |
| step:298/1750 train_loss:3.9980 train_time:43311ms step_avg:150.39ms | |
| step:299/1750 train_loss:3.8486 train_time:43464ms step_avg:150.39ms | |
| step:300/1750 train_loss:3.9956 train_time:43617ms step_avg:150.40ms | |
| step:301/1750 train_loss:3.9932 train_time:43769ms step_avg:150.41ms | |
| step:302/1750 train_loss:3.9606 train_time:43921ms step_avg:150.41ms | |
| step:303/1750 train_loss:4.0099 train_time:44074ms step_avg:150.42ms | |
| step:304/1750 train_loss:3.9926 train_time:44225ms step_avg:150.43ms | |
| step:305/1750 train_loss:4.4777 train_time:44378ms step_avg:150.43ms | |
| step:306/1750 train_loss:3.9605 train_time:44530ms step_avg:150.44ms | |
| step:307/1750 train_loss:3.8574 train_time:44683ms step_avg:150.45ms | |
| step:308/1750 train_loss:4.0120 train_time:44836ms step_avg:150.46ms | |
| step:309/1750 train_loss:3.8951 train_time:44989ms step_avg:150.46ms | |
| step:310/1750 train_loss:4.1108 train_time:45141ms step_avg:150.47ms | |
| step:311/1750 train_loss:3.9564 train_time:45293ms step_avg:150.48ms | |
| step:312/1750 train_loss:3.8918 train_time:45445ms step_avg:150.48ms | |
| step:313/1750 train_loss:3.9643 train_time:45599ms step_avg:150.49ms | |
| step:314/1750 train_loss:4.0936 train_time:45749ms step_avg:150.49ms | |
| step:315/1750 train_loss:3.9681 train_time:45903ms step_avg:150.50ms | |
| step:316/1750 train_loss:3.8199 train_time:46057ms step_avg:150.51ms | |
| step:317/1750 train_loss:3.9009 train_time:46210ms step_avg:150.52ms | |
| step:318/1750 train_loss:3.9515 train_time:46363ms step_avg:150.53ms | |
| step:319/1750 train_loss:3.9144 train_time:46516ms step_avg:150.54ms | |
| step:320/1750 train_loss:4.0385 train_time:46668ms step_avg:150.54ms | |
| step:321/1750 train_loss:3.9792 train_time:46820ms step_avg:150.55ms | |
| step:322/1750 train_loss:3.9566 train_time:46974ms step_avg:150.56ms | |
| step:323/1750 train_loss:4.0314 train_time:47125ms step_avg:150.56ms | |
| step:324/1750 train_loss:3.9674 train_time:47279ms step_avg:150.57ms | |
| step:325/1750 train_loss:4.0413 train_time:47432ms step_avg:150.58ms | |
| step:326/1750 train_loss:3.9121 train_time:47583ms step_avg:150.58ms | |
| step:327/1750 train_loss:4.4148 train_time:47736ms step_avg:150.59ms | |
| step:328/1750 train_loss:4.0935 train_time:47887ms step_avg:150.59ms | |
| step:329/1750 train_loss:3.8215 train_time:48040ms step_avg:150.59ms | |
| step:330/1750 train_loss:3.7692 train_time:48192ms step_avg:150.60ms | |
| step:331/1750 train_loss:4.0042 train_time:48344ms step_avg:150.60ms | |
| step:332/1750 train_loss:3.9314 train_time:48496ms step_avg:150.61ms | |
| step:333/1750 train_loss:3.9088 train_time:48647ms step_avg:150.61ms | |
| step:334/1750 train_loss:3.8635 train_time:48799ms step_avg:150.62ms | |
| step:335/1750 train_loss:4.0311 train_time:48952ms step_avg:150.62ms | |
| step:336/1750 train_loss:3.9771 train_time:49104ms step_avg:150.63ms | |
| step:337/1750 train_loss:4.4377 train_time:49257ms step_avg:150.63ms | |
| step:338/1750 train_loss:3.9588 train_time:49409ms step_avg:150.64ms | |
| step:339/1750 train_loss:3.8811 train_time:49561ms step_avg:150.64ms | |
| step:340/1750 train_loss:3.9563 train_time:49713ms step_avg:150.65ms | |
| step:341/1750 train_loss:3.8855 train_time:49865ms step_avg:150.65ms | |
| step:342/1750 train_loss:3.8353 train_time:50016ms step_avg:150.65ms | |
| step:343/1750 train_loss:3.8606 train_time:50170ms step_avg:150.66ms | |
| step:344/1750 train_loss:4.0127 train_time:50321ms step_avg:150.66ms | |
| step:345/1750 train_loss:3.8382 train_time:50475ms step_avg:150.67ms | |
| step:346/1750 train_loss:3.7932 train_time:50626ms step_avg:150.67ms | |
| step:347/1750 train_loss:3.8214 train_time:50778ms step_avg:150.68ms | |
| step:348/1750 train_loss:3.8751 train_time:50930ms step_avg:150.68ms | |
| step:349/1750 train_loss:3.8520 train_time:51082ms step_avg:150.69ms | |
| step:350/1750 train_loss:3.5925 train_time:51235ms step_avg:150.69ms | |
| step:351/1750 train_loss:3.8523 train_time:51386ms step_avg:150.69ms | |
| step:352/1750 train_loss:4.2137 train_time:51538ms step_avg:150.70ms | |
| step:353/1750 train_loss:3.6772 train_time:51689ms step_avg:150.70ms | |
| step:354/1750 train_loss:3.9490 train_time:51841ms step_avg:150.70ms | |
| step:355/1750 train_loss:3.8082 train_time:51993ms step_avg:150.70ms | |
| step:356/1750 train_loss:3.9056 train_time:52144ms step_avg:150.71ms | |
| step:357/1750 train_loss:3.7955 train_time:52298ms step_avg:150.71ms | |
| step:358/1750 train_loss:3.8811 train_time:52450ms step_avg:150.72ms | |
| step:359/1750 train_loss:3.8028 train_time:52602ms step_avg:150.72ms | |
| step:360/1750 train_loss:3.4468 train_time:52756ms step_avg:150.73ms | |
| step:361/1750 train_loss:4.0496 train_time:52908ms step_avg:150.74ms | |
| step:362/1750 train_loss:3.9505 train_time:53061ms step_avg:150.74ms | |
| step:363/1750 train_loss:3.8707 train_time:53213ms step_avg:150.74ms | |
| step:364/1750 train_loss:3.7708 train_time:53365ms step_avg:150.75ms | |
| step:365/1750 train_loss:3.9394 train_time:53517ms step_avg:150.75ms | |
| step:366/1750 train_loss:3.8921 train_time:53670ms step_avg:150.76ms | |
| step:367/1750 train_loss:3.8755 train_time:53823ms step_avg:150.76ms | |
| step:368/1750 train_loss:3.8739 train_time:53978ms step_avg:150.78ms | |
| step:369/1750 train_loss:3.7647 train_time:54129ms step_avg:150.78ms | |
| step:370/1750 train_loss:3.9134 train_time:54281ms step_avg:150.78ms | |
| step:371/1750 train_loss:3.7627 train_time:54434ms step_avg:150.79ms | |
| step:372/1750 train_loss:3.7167 train_time:54585ms step_avg:150.79ms | |
| step:373/1750 train_loss:3.9429 train_time:54738ms step_avg:150.79ms | |
| step:374/1750 train_loss:3.8591 train_time:54891ms step_avg:150.80ms | |
| step:375/1750 train_loss:3.8263 train_time:55042ms step_avg:150.80ms | |
| step:375/1750 val_loss:3.8523 train_time:55081ms step_avg:150.91ms | |
| step:376/1750 train_loss:3.8993 train_time:55198ms step_avg:150.81ms | |
| step:377/1750 train_loss:3.8171 train_time:55475ms step_avg:151.16ms | |
| step:378/1750 train_loss:3.8669 train_time:55634ms step_avg:151.18ms | |
| step:379/1750 train_loss:3.8990 train_time:56025ms step_avg:151.83ms | |
| step:380/1750 train_loss:3.9869 train_time:56186ms step_avg:151.85ms | |
| step:381/1750 train_loss:3.8678 train_time:56338ms step_avg:151.85ms | |
| step:382/1750 train_loss:3.8391 train_time:56491ms step_avg:151.86ms | |
| step:383/1750 train_loss:3.8239 train_time:56645ms step_avg:151.86ms | |
| step:384/1750 train_loss:3.8913 train_time:56794ms step_avg:151.86ms | |
| step:385/1750 train_loss:3.8163 train_time:56947ms step_avg:151.86ms | |
| step:386/1750 train_loss:3.9195 train_time:57098ms step_avg:151.86ms | |
| step:387/1750 train_loss:4.0953 train_time:57250ms step_avg:151.86ms | |
| step:388/1750 train_loss:3.8207 train_time:57403ms step_avg:151.86ms | |
| step:389/1750 train_loss:3.8236 train_time:57554ms step_avg:151.86ms | |
| step:390/1750 train_loss:3.9129 train_time:57708ms step_avg:151.86ms | |
| step:391/1750 train_loss:3.8348 train_time:57863ms step_avg:151.87ms | |
| step:392/1750 train_loss:3.9395 train_time:58017ms step_avg:151.88ms | |
| step:393/1750 train_loss:3.7779 train_time:58172ms step_avg:151.89ms | |
| step:394/1750 train_loss:3.9098 train_time:58326ms step_avg:151.89ms | |
| step:395/1750 train_loss:3.6465 train_time:58482ms step_avg:151.90ms | |
| step:396/1750 train_loss:3.8540 train_time:58637ms step_avg:151.91ms | |
| step:397/1750 train_loss:3.8892 train_time:58793ms step_avg:151.92ms | |
| step:398/1750 train_loss:3.9016 train_time:58949ms step_avg:151.93ms | |
| step:399/1750 train_loss:3.7932 train_time:59104ms step_avg:151.94ms | |
| step:400/1750 train_loss:3.8489 train_time:59259ms step_avg:151.95ms | |
| step:401/1750 train_loss:3.9336 train_time:59412ms step_avg:151.95ms | |
| step:402/1750 train_loss:3.8610 train_time:59568ms step_avg:151.96ms | |
| step:403/1750 train_loss:3.9766 train_time:59723ms step_avg:151.97ms | |
| step:404/1750 train_loss:3.7013 train_time:59877ms step_avg:151.97ms | |
| step:405/1750 train_loss:3.8089 train_time:60030ms step_avg:151.98ms | |
| step:406/1750 train_loss:4.1219 train_time:60186ms step_avg:151.98ms | |
| step:407/1750 train_loss:3.8011 train_time:60341ms step_avg:151.99ms | |
| step:408/1750 train_loss:3.8405 train_time:60494ms step_avg:151.99ms | |
| step:409/1750 train_loss:3.8818 train_time:60648ms step_avg:152.00ms | |
| step:410/1750 train_loss:3.7772 train_time:60804ms step_avg:152.01ms | |
| step:411/1750 train_loss:3.7873 train_time:60959ms step_avg:152.02ms | |
| step:412/1750 train_loss:4.2206 train_time:61114ms step_avg:152.02ms | |
| step:413/1750 train_loss:3.7278 train_time:61271ms step_avg:152.04ms | |
| step:414/1750 train_loss:4.0354 train_time:61425ms step_avg:152.04ms | |
| step:415/1750 train_loss:3.7762 train_time:61579ms step_avg:152.05ms | |
| step:416/1750 train_loss:3.7900 train_time:61733ms step_avg:152.05ms | |
| step:417/1750 train_loss:3.9764 train_time:61890ms step_avg:152.06ms | |
| step:418/1750 train_loss:3.7154 train_time:62045ms step_avg:152.07ms | |
| step:419/1750 train_loss:3.8252 train_time:62199ms step_avg:152.08ms | |
| step:420/1750 train_loss:3.7309 train_time:62354ms step_avg:152.08ms | |
| step:421/1750 train_loss:3.6652 train_time:62508ms step_avg:152.09ms | |
| step:422/1750 train_loss:3.7975 train_time:62661ms step_avg:152.09ms | |
| step:423/1750 train_loss:3.8923 train_time:62816ms step_avg:152.10ms | |
| step:424/1750 train_loss:3.6348 train_time:62971ms step_avg:152.10ms | |
| step:425/1750 train_loss:3.8251 train_time:63125ms step_avg:152.11ms | |
| step:426/1750 train_loss:3.6933 train_time:63279ms step_avg:152.11ms | |
| step:427/1750 train_loss:3.9183 train_time:63431ms step_avg:152.11ms | |
| step:428/1750 train_loss:3.8395 train_time:63587ms step_avg:152.12ms | |
| step:429/1750 train_loss:3.7744 train_time:63741ms step_avg:152.13ms | |
| step:430/1750 train_loss:3.7357 train_time:63895ms step_avg:152.13ms | |
| step:431/1750 train_loss:3.6510 train_time:64050ms step_avg:152.14ms | |
| step:432/1750 train_loss:3.7912 train_time:64207ms step_avg:152.15ms | |
| step:433/1750 train_loss:3.8488 train_time:64361ms step_avg:152.15ms | |
| step:434/1750 train_loss:3.8015 train_time:64516ms step_avg:152.16ms | |
| step:435/1750 train_loss:3.8323 train_time:64671ms step_avg:152.17ms | |
| step:436/1750 train_loss:3.8632 train_time:64825ms step_avg:152.17ms | |
| step:437/1750 train_loss:3.7369 train_time:64981ms step_avg:152.18ms | |
| step:438/1750 train_loss:3.7306 train_time:65135ms step_avg:152.18ms | |
| step:439/1750 train_loss:3.7377 train_time:65290ms step_avg:152.19ms | |
| step:440/1750 train_loss:3.9245 train_time:65445ms step_avg:152.20ms | |
| step:441/1750 train_loss:3.7877 train_time:65598ms step_avg:152.20ms | |
| step:442/1750 train_loss:3.7576 train_time:65753ms step_avg:152.21ms | |
| step:443/1750 train_loss:3.6468 train_time:65907ms step_avg:152.21ms | |
| step:444/1750 train_loss:3.9478 train_time:66061ms step_avg:152.21ms | |
| step:445/1750 train_loss:3.8724 train_time:66215ms step_avg:152.22ms | |
| step:446/1750 train_loss:3.8599 train_time:66371ms step_avg:152.23ms | |
| step:447/1750 train_loss:3.7740 train_time:66525ms step_avg:152.23ms | |
| step:448/1750 train_loss:3.8719 train_time:66680ms step_avg:152.24ms | |
| step:449/1750 train_loss:3.7173 train_time:66835ms step_avg:152.24ms | |
| step:450/1750 train_loss:3.7356 train_time:66990ms step_avg:152.25ms | |
| step:451/1750 train_loss:3.6045 train_time:67146ms step_avg:152.26ms | |
| step:452/1750 train_loss:3.7321 train_time:67299ms step_avg:152.26ms | |
| step:453/1750 train_loss:3.7021 train_time:67455ms step_avg:152.27ms | |
| step:454/1750 train_loss:3.6590 train_time:67609ms step_avg:152.27ms | |
| step:455/1750 train_loss:3.8649 train_time:67764ms step_avg:152.28ms | |
| step:456/1750 train_loss:3.7525 train_time:67917ms step_avg:152.28ms | |
| step:457/1750 train_loss:3.8116 train_time:68072ms step_avg:152.29ms | |
| step:458/1750 train_loss:3.8586 train_time:68225ms step_avg:152.29ms | |
| step:459/1750 train_loss:3.6597 train_time:68381ms step_avg:152.30ms | |
| step:460/1750 train_loss:3.8175 train_time:68533ms step_avg:152.30ms | |
| step:461/1750 train_loss:3.7150 train_time:68688ms step_avg:152.30ms | |
| step:462/1750 train_loss:3.7546 train_time:68843ms step_avg:152.31ms | |
| step:463/1750 train_loss:3.7964 train_time:68996ms step_avg:152.31ms | |
| step:464/1750 train_loss:3.7412 train_time:69150ms step_avg:152.31ms | |
| step:465/1750 train_loss:3.7406 train_time:69304ms step_avg:152.32ms | |
| step:466/1750 train_loss:3.8260 train_time:69457ms step_avg:152.32ms | |
| step:467/1750 train_loss:3.8458 train_time:69612ms step_avg:152.32ms | |
| step:468/1750 train_loss:3.8168 train_time:69767ms step_avg:152.33ms | |
| step:469/1750 train_loss:3.7051 train_time:69920ms step_avg:152.33ms | |
| step:470/1750 train_loss:3.7988 train_time:70073ms step_avg:152.33ms | |
| step:471/1750 train_loss:3.8371 train_time:70227ms step_avg:152.34ms | |
| step:472/1750 train_loss:3.8000 train_time:70382ms step_avg:152.34ms | |
| step:473/1750 train_loss:3.7453 train_time:70535ms step_avg:152.34ms | |
| step:474/1750 train_loss:3.6120 train_time:70689ms step_avg:152.35ms | |
| step:475/1750 train_loss:4.0186 train_time:70842ms step_avg:152.35ms | |
| step:476/1750 train_loss:3.7835 train_time:70997ms step_avg:152.35ms | |
| step:477/1750 train_loss:3.6162 train_time:71151ms step_avg:152.36ms | |
| step:478/1750 train_loss:3.8465 train_time:71305ms step_avg:152.36ms | |
| step:479/1750 train_loss:3.7955 train_time:71459ms step_avg:152.36ms | |
| step:480/1750 train_loss:3.9419 train_time:71612ms step_avg:152.37ms | |
| step:481/1750 train_loss:3.7509 train_time:71767ms step_avg:152.37ms | |
| step:482/1750 train_loss:3.5516 train_time:71921ms step_avg:152.38ms | |
| step:483/1750 train_loss:3.8383 train_time:72074ms step_avg:152.38ms | |
| step:484/1750 train_loss:3.6870 train_time:72228ms step_avg:152.38ms | |
| step:485/1750 train_loss:3.6846 train_time:72383ms step_avg:152.38ms | |
| step:486/1750 train_loss:3.6084 train_time:72536ms step_avg:152.39ms | |
| step:487/1750 train_loss:3.7039 train_time:72690ms step_avg:152.39ms | |
| step:488/1750 train_loss:3.9031 train_time:72844ms step_avg:152.39ms | |
| step:489/1750 train_loss:3.7402 train_time:72998ms step_avg:152.40ms | |
| step:490/1750 train_loss:3.6145 train_time:73151ms step_avg:152.40ms | |
| step:491/1750 train_loss:3.6376 train_time:73305ms step_avg:152.40ms | |
| step:492/1750 train_loss:3.7557 train_time:73458ms step_avg:152.40ms | |
| step:493/1750 train_loss:3.5993 train_time:73613ms step_avg:152.41ms | |
| step:494/1750 train_loss:3.7254 train_time:73767ms step_avg:152.41ms | |
| step:495/1750 train_loss:3.6753 train_time:73923ms step_avg:152.42ms | |
| step:496/1750 train_loss:3.5474 train_time:74077ms step_avg:152.42ms | |
| step:497/1750 train_loss:3.7545 train_time:74228ms step_avg:152.42ms | |
| step:498/1750 train_loss:3.8061 train_time:74384ms step_avg:152.43ms | |
| step:499/1750 train_loss:3.8496 train_time:74538ms step_avg:152.43ms | |
| step:500/1750 train_loss:3.7535 train_time:74692ms step_avg:152.43ms | |
| step:500/1750 val_loss:3.7289 train_time:74732ms step_avg:152.51ms | |
| step:501/1750 train_loss:3.8270 train_time:74847ms step_avg:152.44ms | |
| step:502/1750 train_loss:3.7712 train_time:75003ms step_avg:152.45ms | |
| step:503/1750 train_loss:3.7976 train_time:75157ms step_avg:152.45ms | |
| step:504/1750 train_loss:3.7512 train_time:75311ms step_avg:152.45ms | |
| step:505/1750 train_loss:3.8252 train_time:75466ms step_avg:152.46ms | |
| step:506/1750 train_loss:3.6712 train_time:75621ms step_avg:152.46ms | |
| step:507/1750 train_loss:3.7904 train_time:75773ms step_avg:152.46ms | |
| step:508/1750 train_loss:3.8569 train_time:75928ms step_avg:152.47ms | |
| step:509/1750 train_loss:3.7963 train_time:76082ms step_avg:152.47ms | |
| step:510/1750 train_loss:3.6038 train_time:76236ms step_avg:152.47ms | |
| step:511/1750 train_loss:3.8020 train_time:76389ms step_avg:152.47ms | |
| step:512/1750 train_loss:3.7535 train_time:76544ms step_avg:152.48ms | |
| step:513/1750 train_loss:3.6873 train_time:76698ms step_avg:152.48ms | |
| step:514/1750 train_loss:3.8212 train_time:76852ms step_avg:152.48ms | |
| step:515/1750 train_loss:3.7607 train_time:77007ms step_avg:152.49ms | |
| step:516/1750 train_loss:4.0987 train_time:77161ms step_avg:152.49ms | |
| step:517/1750 train_loss:3.7115 train_time:77315ms step_avg:152.49ms | |
| step:518/1750 train_loss:3.8005 train_time:77468ms step_avg:152.50ms | |
| step:519/1750 train_loss:3.6882 train_time:77623ms step_avg:152.50ms | |
| step:520/1750 train_loss:3.7065 train_time:77778ms step_avg:152.51ms | |
| step:521/1750 train_loss:3.6783 train_time:77934ms step_avg:152.51ms | |
| step:522/1750 train_loss:3.6796 train_time:78091ms step_avg:152.52ms | |
| step:523/1750 train_loss:4.3054 train_time:78247ms step_avg:152.53ms | |
| step:524/1750 train_loss:3.7597 train_time:78403ms step_avg:152.53ms | |
| step:525/1750 train_loss:3.7069 train_time:78558ms step_avg:152.54ms | |
| step:526/1750 train_loss:3.7161 train_time:78715ms step_avg:152.55ms | |
| step:527/1750 train_loss:3.6870 train_time:78871ms step_avg:152.56ms | |
| step:528/1750 train_loss:3.6526 train_time:79027ms step_avg:152.56ms | |
| step:529/1750 train_loss:3.8681 train_time:79185ms step_avg:152.57ms | |
| step:530/1750 train_loss:3.6631 train_time:79340ms step_avg:152.58ms | |
| step:531/1750 train_loss:3.9400 train_time:79498ms step_avg:152.59ms | |
| step:532/1750 train_loss:3.7520 train_time:79654ms step_avg:152.59ms | |
| step:533/1750 train_loss:3.6760 train_time:79811ms step_avg:152.60ms | |
| step:534/1750 train_loss:3.6941 train_time:79966ms step_avg:152.61ms | |
| step:535/1750 train_loss:3.6273 train_time:80125ms step_avg:152.62ms | |
| step:536/1750 train_loss:3.7768 train_time:80284ms step_avg:152.63ms | |
| step:537/1750 train_loss:3.7485 train_time:80439ms step_avg:152.64ms | |
| step:538/1750 train_loss:3.6494 train_time:80596ms step_avg:152.64ms | |
| step:539/1750 train_loss:4.1266 train_time:80754ms step_avg:152.65ms | |
| step:540/1750 train_loss:3.7004 train_time:80909ms step_avg:152.66ms | |
| step:541/1750 train_loss:3.8105 train_time:81065ms step_avg:152.66ms | |
| step:542/1750 train_loss:3.6226 train_time:81222ms step_avg:152.67ms | |
| step:543/1750 train_loss:3.6066 train_time:81379ms step_avg:152.68ms | |
| step:544/1750 train_loss:3.6649 train_time:81535ms step_avg:152.69ms | |
| step:545/1750 train_loss:3.6138 train_time:81692ms step_avg:152.70ms | |
| step:546/1750 train_loss:3.6555 train_time:81850ms step_avg:152.70ms | |
| step:547/1750 train_loss:3.6695 train_time:82006ms step_avg:152.71ms | |
| step:548/1750 train_loss:3.6398 train_time:82165ms step_avg:152.72ms | |
| step:549/1750 train_loss:3.7457 train_time:82319ms step_avg:152.73ms | |
| step:550/1750 train_loss:3.6414 train_time:82478ms step_avg:152.74ms | |
| step:551/1750 train_loss:3.6533 train_time:82633ms step_avg:152.74ms | |
| step:552/1750 train_loss:3.9523 train_time:82790ms step_avg:152.75ms | |
| step:553/1750 train_loss:3.7808 train_time:82947ms step_avg:152.76ms | |
| step:554/1750 train_loss:3.7406 train_time:83105ms step_avg:152.77ms | |
| step:555/1750 train_loss:3.6614 train_time:83262ms step_avg:152.78ms | |
| step:556/1750 train_loss:3.7229 train_time:83418ms step_avg:152.78ms | |
| step:557/1750 train_loss:3.3403 train_time:83576ms step_avg:152.79ms | |
| step:558/1750 train_loss:3.6335 train_time:83731ms step_avg:152.79ms | |
| step:559/1750 train_loss:3.6698 train_time:83887ms step_avg:152.80ms | |
| step:560/1750 train_loss:3.7086 train_time:84044ms step_avg:152.81ms | |
| step:561/1750 train_loss:3.6337 train_time:84201ms step_avg:152.81ms | |
| step:562/1750 train_loss:3.5860 train_time:84357ms step_avg:152.82ms | |
| step:563/1750 train_loss:3.7765 train_time:84512ms step_avg:152.83ms | |
| step:564/1750 train_loss:3.5949 train_time:84669ms step_avg:152.83ms | |
| step:565/1750 train_loss:3.7065 train_time:84826ms step_avg:152.84ms | |
| step:566/1750 train_loss:3.6450 train_time:85101ms step_avg:153.06ms | |
| step:567/1750 train_loss:3.6257 train_time:85266ms step_avg:153.08ms | |
| step:568/1750 train_loss:3.7115 train_time:85422ms step_avg:153.09ms | |
| step:569/1750 train_loss:3.6721 train_time:85728ms step_avg:153.36ms | |
| step:570/1750 train_loss:3.7168 train_time:85889ms step_avg:153.37ms | |
| step:571/1750 train_loss:3.7877 train_time:86045ms step_avg:153.38ms | |
| step:572/1750 train_loss:3.7534 train_time:86202ms step_avg:153.38ms | |
| step:573/1750 train_loss:3.7564 train_time:86357ms step_avg:153.39ms | |
| step:574/1750 train_loss:3.8054 train_time:86515ms step_avg:153.40ms | |
| step:575/1750 train_loss:3.7490 train_time:86671ms step_avg:153.40ms | |
| step:576/1750 train_loss:3.7845 train_time:86827ms step_avg:153.40ms | |
| step:577/1750 train_loss:3.6983 train_time:86984ms step_avg:153.41ms | |
| step:578/1750 train_loss:3.6928 train_time:87139ms step_avg:153.41ms | |
| step:579/1750 train_loss:3.6924 train_time:87295ms step_avg:153.42ms | |
| step:580/1750 train_loss:3.6172 train_time:87452ms step_avg:153.42ms | |
| step:581/1750 train_loss:3.6602 train_time:87610ms step_avg:153.43ms | |
| step:582/1750 train_loss:3.8788 train_time:87767ms step_avg:153.44ms | |
| step:583/1750 train_loss:3.6584 train_time:87924ms step_avg:153.45ms | |
| step:584/1750 train_loss:3.6196 train_time:88080ms step_avg:153.45ms | |
| step:585/1750 train_loss:3.8097 train_time:88235ms step_avg:153.45ms | |
| step:586/1750 train_loss:3.5406 train_time:88392ms step_avg:153.46ms | |
| step:587/1750 train_loss:3.6896 train_time:88547ms step_avg:153.46ms | |
| step:588/1750 train_loss:3.6708 train_time:88703ms step_avg:153.47ms | |
| step:589/1750 train_loss:4.0220 train_time:88860ms step_avg:153.47ms | |
| step:590/1750 train_loss:3.8072 train_time:89016ms step_avg:153.48ms | |
| step:591/1750 train_loss:3.5299 train_time:89170ms step_avg:153.48ms | |
| step:592/1750 train_loss:3.5586 train_time:89327ms step_avg:153.48ms | |
| step:593/1750 train_loss:3.5267 train_time:89483ms step_avg:153.49ms | |
| step:594/1750 train_loss:3.5797 train_time:89638ms step_avg:153.49ms | |
| step:595/1750 train_loss:3.9412 train_time:89794ms step_avg:153.49ms | |
| step:596/1750 train_loss:3.6681 train_time:89949ms step_avg:153.50ms | |
| step:597/1750 train_loss:3.6065 train_time:90105ms step_avg:153.50ms | |
| step:598/1750 train_loss:3.6879 train_time:90260ms step_avg:153.50ms | |
| step:599/1750 train_loss:3.4959 train_time:90417ms step_avg:153.51ms | |
| step:600/1750 train_loss:3.6216 train_time:90572ms step_avg:153.51ms | |
| step:601/1750 train_loss:3.6667 train_time:90727ms step_avg:153.51ms | |
| step:602/1750 train_loss:3.6926 train_time:90884ms step_avg:153.52ms | |
| step:603/1750 train_loss:3.8072 train_time:91039ms step_avg:153.52ms | |
| step:604/1750 train_loss:3.6288 train_time:91194ms step_avg:153.53ms | |
| step:605/1750 train_loss:3.6343 train_time:91351ms step_avg:153.53ms | |
| step:606/1750 train_loss:3.5977 train_time:91509ms step_avg:153.54ms | |
| step:607/1750 train_loss:3.8572 train_time:91667ms step_avg:153.55ms | |
| step:608/1750 train_loss:3.6700 train_time:91824ms step_avg:153.55ms | |
| step:609/1750 train_loss:3.6364 train_time:91978ms step_avg:153.55ms | |
| step:610/1750 train_loss:3.7321 train_time:92132ms step_avg:153.55ms | |
| step:611/1750 train_loss:3.6274 train_time:92289ms step_avg:153.56ms | |
| step:612/1750 train_loss:3.5922 train_time:92445ms step_avg:153.56ms | |
| step:613/1750 train_loss:3.7867 train_time:92603ms step_avg:153.57ms | |
| step:614/1750 train_loss:3.7267 train_time:92759ms step_avg:153.57ms | |
| step:615/1750 train_loss:3.7188 train_time:92914ms step_avg:153.58ms | |
| step:616/1750 train_loss:3.6518 train_time:93068ms step_avg:153.58ms | |
| step:617/1750 train_loss:3.5837 train_time:93225ms step_avg:153.58ms | |
| step:618/1750 train_loss:3.7115 train_time:93380ms step_avg:153.59ms | |
| step:619/1750 train_loss:3.5864 train_time:93536ms step_avg:153.59ms | |
| step:620/1750 train_loss:3.6136 train_time:93692ms step_avg:153.59ms | |
| step:621/1750 train_loss:3.9424 train_time:93849ms step_avg:153.60ms | |
| step:622/1750 train_loss:3.5909 train_time:94005ms step_avg:153.60ms | |
| step:623/1750 train_loss:3.6303 train_time:94162ms step_avg:153.61ms | |
| step:624/1750 train_loss:3.7175 train_time:94317ms step_avg:153.61ms | |
| step:625/1750 train_loss:3.7302 train_time:94471ms step_avg:153.61ms | |
| step:625/1750 val_loss:3.6466 train_time:94513ms step_avg:153.68ms | |
| step:626/1750 train_loss:3.7637 train_time:94628ms step_avg:153.62ms | |
| step:627/1750 train_loss:3.7461 train_time:94785ms step_avg:153.62ms | |
| step:628/1750 train_loss:3.7857 train_time:94941ms step_avg:153.63ms | |
| step:629/1750 train_loss:3.6160 train_time:95096ms step_avg:153.63ms | |
| step:630/1750 train_loss:3.7444 train_time:95251ms step_avg:153.63ms | |
| step:631/1750 train_loss:3.7660 train_time:95406ms step_avg:153.63ms | |
| step:632/1750 train_loss:3.6764 train_time:95562ms step_avg:153.64ms | |
| step:633/1750 train_loss:3.6254 train_time:95717ms step_avg:153.64ms | |
| step:634/1750 train_loss:3.7239 train_time:95874ms step_avg:153.64ms | |
| step:635/1750 train_loss:3.9797 train_time:96027ms step_avg:153.64ms | |
| step:636/1750 train_loss:3.5650 train_time:96184ms step_avg:153.65ms | |
| step:637/1750 train_loss:3.3830 train_time:96340ms step_avg:153.65ms | |
| step:638/1750 train_loss:3.6154 train_time:96496ms step_avg:153.66ms | |
| step:639/1750 train_loss:3.6605 train_time:96652ms step_avg:153.66ms | |
| step:640/1750 train_loss:3.5899 train_time:96807ms step_avg:153.66ms | |
| step:641/1750 train_loss:3.6134 train_time:96962ms step_avg:153.66ms | |
| step:642/1750 train_loss:3.6585 train_time:97118ms step_avg:153.67ms | |
| step:643/1750 train_loss:3.6327 train_time:97275ms step_avg:153.67ms | |
| step:644/1750 train_loss:3.5752 train_time:97429ms step_avg:153.67ms | |
| step:645/1750 train_loss:3.8002 train_time:97586ms step_avg:153.68ms | |
| step:646/1750 train_loss:3.7008 train_time:97742ms step_avg:153.68ms | |
| step:647/1750 train_loss:3.6884 train_time:97897ms step_avg:153.68ms | |
| step:648/1750 train_loss:3.7344 train_time:98054ms step_avg:153.69ms | |
| step:649/1750 train_loss:3.7878 train_time:98208ms step_avg:153.69ms | |
| step:650/1750 train_loss:3.6411 train_time:98367ms step_avg:153.70ms | |
| step:651/1750 train_loss:3.7882 train_time:98525ms step_avg:153.71ms | |
| step:652/1750 train_loss:3.6095 train_time:98684ms step_avg:153.71ms | |
| step:653/1750 train_loss:3.6853 train_time:98841ms step_avg:153.72ms | |
| step:654/1750 train_loss:3.4523 train_time:99000ms step_avg:153.73ms | |
| step:655/1750 train_loss:3.5993 train_time:99157ms step_avg:153.73ms | |
| step:656/1750 train_loss:3.6003 train_time:99315ms step_avg:153.74ms | |
| step:657/1750 train_loss:3.5297 train_time:99476ms step_avg:153.75ms | |
| step:658/1750 train_loss:3.7150 train_time:99633ms step_avg:153.75ms | |
| step:659/1750 train_loss:3.6119 train_time:99791ms step_avg:153.76ms | |
| step:660/1750 train_loss:3.7030 train_time:99951ms step_avg:153.77ms | |
| step:661/1750 train_loss:3.7716 train_time:100110ms step_avg:153.78ms | |
| step:662/1750 train_loss:3.6851 train_time:100267ms step_avg:153.78ms | |
| step:663/1750 train_loss:3.5714 train_time:100422ms step_avg:153.79ms | |
| step:664/1750 train_loss:3.6390 train_time:100582ms step_avg:153.79ms | |
| step:665/1750 train_loss:3.5161 train_time:100741ms step_avg:153.80ms | |
| step:666/1750 train_loss:3.8127 train_time:100898ms step_avg:153.81ms | |
| step:667/1750 train_loss:3.6315 train_time:101059ms step_avg:153.82ms | |
| step:668/1750 train_loss:3.6693 train_time:101217ms step_avg:153.83ms | |
| step:669/1750 train_loss:3.5044 train_time:101376ms step_avg:153.83ms | |
| step:670/1750 train_loss:3.6259 train_time:101534ms step_avg:153.84ms | |
| step:671/1750 train_loss:3.5862 train_time:101692ms step_avg:153.85ms | |
| step:672/1750 train_loss:3.5889 train_time:101851ms step_avg:153.85ms | |
| step:673/1750 train_loss:3.8768 train_time:102009ms step_avg:153.86ms | |
| step:674/1750 train_loss:3.6521 train_time:102168ms step_avg:153.87ms | |
| step:675/1750 train_loss:3.7346 train_time:102326ms step_avg:153.87ms | |
| step:676/1750 train_loss:3.5095 train_time:102484ms step_avg:153.88ms | |
| step:677/1750 train_loss:3.6221 train_time:102642ms step_avg:153.89ms | |
| step:678/1750 train_loss:3.5731 train_time:102799ms step_avg:153.89ms | |
| step:679/1750 train_loss:3.7004 train_time:102959ms step_avg:153.90ms | |
| step:680/1750 train_loss:3.6078 train_time:103119ms step_avg:153.91ms | |
| step:681/1750 train_loss:3.6372 train_time:103277ms step_avg:153.91ms | |
| step:682/1750 train_loss:3.6829 train_time:103437ms step_avg:153.92ms | |
| step:683/1750 train_loss:3.7570 train_time:103595ms step_avg:153.93ms | |
| step:684/1750 train_loss:3.6694 train_time:103756ms step_avg:153.94ms | |
| step:685/1750 train_loss:3.7167 train_time:103916ms step_avg:153.95ms | |
| step:686/1750 train_loss:3.6604 train_time:104076ms step_avg:153.96ms | |
| step:687/1750 train_loss:3.6951 train_time:104234ms step_avg:153.96ms | |
| step:688/1750 train_loss:3.2277 train_time:104393ms step_avg:153.97ms | |
| step:689/1750 train_loss:3.4264 train_time:104551ms step_avg:153.98ms | |
| step:690/1750 train_loss:3.5708 train_time:104713ms step_avg:153.99ms | |
| step:691/1750 train_loss:3.4427 train_time:104870ms step_avg:153.99ms | |
| step:692/1750 train_loss:3.6557 train_time:105027ms step_avg:154.00ms | |
| step:693/1750 train_loss:3.6798 train_time:105186ms step_avg:154.01ms | |
| step:694/1750 train_loss:3.5771 train_time:105344ms step_avg:154.01ms | |
| step:695/1750 train_loss:3.5615 train_time:105500ms step_avg:154.02ms | |
| step:696/1750 train_loss:3.8793 train_time:105660ms step_avg:154.02ms | |
| step:697/1750 train_loss:3.6123 train_time:105819ms step_avg:154.03ms | |
| step:698/1750 train_loss:3.6754 train_time:105978ms step_avg:154.04ms | |
| step:699/1750 train_loss:3.7894 train_time:106138ms step_avg:154.05ms | |
| step:700/1750 train_loss:3.5907 train_time:106297ms step_avg:154.05ms | |
| step:701/1750 train_loss:3.5672 train_time:106455ms step_avg:154.06ms | |
| step:702/1750 train_loss:3.5394 train_time:106614ms step_avg:154.07ms | |
| step:703/1750 train_loss:3.5167 train_time:106772ms step_avg:154.07ms | |
| step:704/1750 train_loss:3.5961 train_time:106931ms step_avg:154.08ms | |
| step:705/1750 train_loss:3.5824 train_time:107092ms step_avg:154.09ms | |
| step:706/1750 train_loss:3.6046 train_time:107253ms step_avg:154.10ms | |
| step:707/1750 train_loss:3.6772 train_time:107413ms step_avg:154.11ms | |
| step:708/1750 train_loss:3.6243 train_time:107573ms step_avg:154.12ms | |
| step:709/1750 train_loss:3.6029 train_time:107730ms step_avg:154.12ms | |
| step:710/1750 train_loss:3.5655 train_time:107887ms step_avg:154.12ms | |
| step:711/1750 train_loss:3.6129 train_time:108046ms step_avg:154.13ms | |
| step:712/1750 train_loss:3.6707 train_time:108206ms step_avg:154.14ms | |
| step:713/1750 train_loss:3.6840 train_time:108365ms step_avg:154.15ms | |
| step:714/1750 train_loss:3.5875 train_time:108522ms step_avg:154.15ms | |
| step:715/1750 train_loss:3.5989 train_time:108679ms step_avg:154.15ms | |
| step:716/1750 train_loss:3.6106 train_time:108837ms step_avg:154.16ms | |
| step:717/1750 train_loss:3.7318 train_time:108996ms step_avg:154.17ms | |
| step:718/1750 train_loss:3.6218 train_time:109151ms step_avg:154.17ms | |
| step:719/1750 train_loss:3.7029 train_time:109307ms step_avg:154.17ms | |
| step:720/1750 train_loss:3.8676 train_time:109468ms step_avg:154.18ms | |
| step:721/1750 train_loss:3.4887 train_time:109624ms step_avg:154.18ms | |
| step:722/1750 train_loss:3.7523 train_time:109782ms step_avg:154.19ms | |
| step:723/1750 train_loss:3.7911 train_time:109939ms step_avg:154.19ms | |
| step:724/1750 train_loss:3.5870 train_time:110097ms step_avg:154.20ms | |
| step:725/1750 train_loss:3.6751 train_time:110257ms step_avg:154.21ms | |
| step:726/1750 train_loss:3.5582 train_time:110418ms step_avg:154.21ms | |
| step:727/1750 train_loss:3.5969 train_time:110577ms step_avg:154.22ms | |
| step:728/1750 train_loss:3.7601 train_time:110735ms step_avg:154.23ms | |
| step:729/1750 train_loss:3.6957 train_time:110891ms step_avg:154.23ms | |
| step:730/1750 train_loss:3.6935 train_time:111051ms step_avg:154.24ms | |
| step:731/1750 train_loss:3.5839 train_time:111209ms step_avg:154.24ms | |
| step:732/1750 train_loss:3.6226 train_time:111366ms step_avg:154.25ms | |
| step:733/1750 train_loss:3.8621 train_time:111523ms step_avg:154.25ms | |
| step:734/1750 train_loss:3.5830 train_time:111681ms step_avg:154.26ms | |
| step:735/1750 train_loss:3.6297 train_time:111837ms step_avg:154.26ms | |
| step:736/1750 train_loss:3.7605 train_time:111994ms step_avg:154.26ms | |
| step:737/1750 train_loss:3.6997 train_time:112151ms step_avg:154.27ms | |
| step:738/1750 train_loss:3.6213 train_time:112310ms step_avg:154.27ms | |
| step:739/1750 train_loss:3.5207 train_time:112467ms step_avg:154.28ms | |
| step:740/1750 train_loss:4.1379 train_time:112626ms step_avg:154.28ms | |
| step:741/1750 train_loss:3.5201 train_time:112782ms step_avg:154.28ms | |
| step:742/1750 train_loss:3.5850 train_time:112940ms step_avg:154.29ms | |
| step:743/1750 train_loss:3.6070 train_time:113098ms step_avg:154.29ms | |
| step:744/1750 train_loss:3.6706 train_time:113258ms step_avg:154.30ms | |
| step:745/1750 train_loss:3.6177 train_time:113419ms step_avg:154.31ms | |
| step:746/1750 train_loss:3.6168 train_time:113576ms step_avg:154.32ms | |
| step:747/1750 train_loss:3.6676 train_time:113734ms step_avg:154.32ms | |
| step:748/1750 train_loss:3.5967 train_time:113894ms step_avg:154.33ms | |
| step:749/1750 train_loss:3.5864 train_time:114053ms step_avg:154.33ms | |
| step:750/1750 train_loss:3.6299 train_time:114210ms step_avg:154.34ms | |
| step:750/1750 val_loss:3.5915 train_time:114253ms step_avg:154.40ms | |
| step:751/1750 train_loss:3.5917 train_time:114371ms step_avg:154.35ms | |
| step:752/1750 train_loss:3.6347 train_time:114530ms step_avg:154.35ms | |
| step:753/1750 train_loss:3.6347 train_time:114687ms step_avg:154.36ms | |
| step:754/1750 train_loss:3.6147 train_time:114844ms step_avg:154.36ms | |
| step:755/1750 train_loss:3.7025 train_time:115123ms step_avg:154.53ms | |
| step:756/1750 train_loss:3.4911 train_time:115288ms step_avg:154.54ms | |
| step:757/1750 train_loss:3.7501 train_time:115448ms step_avg:154.55ms | |
| step:758/1750 train_loss:3.6832 train_time:115606ms step_avg:154.55ms | |
| step:759/1750 train_loss:3.6197 train_time:115908ms step_avg:154.75ms | |
| step:760/1750 train_loss:3.7267 train_time:116068ms step_avg:154.76ms | |
| step:761/1750 train_loss:3.4235 train_time:116227ms step_avg:154.76ms | |
| step:762/1750 train_loss:3.5742 train_time:116384ms step_avg:154.77ms | |
| step:763/1750 train_loss:3.6890 train_time:116543ms step_avg:154.77ms | |
| step:764/1750 train_loss:3.3393 train_time:116699ms step_avg:154.77ms | |
| step:765/1750 train_loss:3.7597 train_time:116856ms step_avg:154.78ms | |
| step:766/1750 train_loss:3.6067 train_time:117014ms step_avg:154.78ms | |
| step:767/1750 train_loss:3.5926 train_time:117173ms step_avg:154.79ms | |
| step:768/1750 train_loss:3.5994 train_time:117335ms step_avg:154.80ms | |
| step:769/1750 train_loss:3.6158 train_time:117493ms step_avg:154.80ms | |
| step:770/1750 train_loss:3.6675 train_time:117651ms step_avg:154.80ms | |
| step:771/1750 train_loss:3.8994 train_time:117810ms step_avg:154.81ms | |
| step:772/1750 train_loss:3.4758 train_time:117966ms step_avg:154.81ms | |
| step:773/1750 train_loss:3.6622 train_time:118124ms step_avg:154.81ms | |
| step:774/1750 train_loss:3.6651 train_time:118281ms step_avg:154.82ms | |
| step:775/1750 train_loss:3.6323 train_time:118437ms step_avg:154.82ms | |
| step:776/1750 train_loss:3.4186 train_time:118595ms step_avg:154.82ms | |
| step:777/1750 train_loss:3.4229 train_time:118753ms step_avg:154.83ms | |
| step:778/1750 train_loss:3.5154 train_time:118909ms step_avg:154.83ms | |
| step:779/1750 train_loss:3.6003 train_time:119068ms step_avg:154.83ms | |
| step:780/1750 train_loss:3.6168 train_time:119229ms step_avg:154.84ms | |
| step:781/1750 train_loss:3.6921 train_time:119388ms step_avg:154.85ms | |
| step:782/1750 train_loss:3.6149 train_time:119548ms step_avg:154.85ms | |
| step:783/1750 train_loss:3.5983 train_time:119706ms step_avg:154.86ms | |
| step:784/1750 train_loss:3.6247 train_time:119866ms step_avg:154.87ms | |
| step:785/1750 train_loss:3.5834 train_time:120024ms step_avg:154.87ms | |
| step:786/1750 train_loss:3.4643 train_time:120186ms step_avg:154.88ms | |
| step:787/1750 train_loss:3.8141 train_time:120347ms step_avg:154.89ms | |
| step:788/1750 train_loss:3.5263 train_time:120507ms step_avg:154.89ms | |
| step:789/1750 train_loss:3.5798 train_time:120665ms step_avg:154.90ms | |
| step:790/1750 train_loss:3.6543 train_time:120827ms step_avg:154.91ms | |
| step:791/1750 train_loss:3.8011 train_time:120988ms step_avg:154.91ms | |
| step:792/1750 train_loss:3.7820 train_time:121148ms step_avg:154.92ms | |
| step:793/1750 train_loss:3.4979 train_time:121306ms step_avg:154.92ms | |
| step:794/1750 train_loss:3.6179 train_time:121469ms step_avg:154.93ms | |
| step:795/1750 train_loss:3.6962 train_time:121630ms step_avg:154.94ms | |
| step:796/1750 train_loss:3.7605 train_time:121789ms step_avg:154.95ms | |
| step:797/1750 train_loss:3.5465 train_time:121948ms step_avg:154.95ms | |
| step:798/1750 train_loss:3.6710 train_time:122108ms step_avg:154.96ms | |
| step:799/1750 train_loss:3.5672 train_time:122268ms step_avg:154.97ms | |
| step:800/1750 train_loss:3.5522 train_time:122427ms step_avg:154.97ms | |
| step:801/1750 train_loss:3.6596 train_time:122587ms step_avg:154.98ms | |
| step:802/1750 train_loss:3.5183 train_time:122750ms step_avg:154.99ms | |
| step:803/1750 train_loss:3.5349 train_time:122908ms step_avg:154.99ms | |
| step:804/1750 train_loss:3.6481 train_time:123067ms step_avg:155.00ms | |
| step:805/1750 train_loss:3.5500 train_time:123227ms step_avg:155.00ms | |
| step:806/1750 train_loss:3.5874 train_time:123385ms step_avg:155.01ms | |
| step:807/1750 train_loss:3.6709 train_time:123545ms step_avg:155.01ms | |
| step:808/1750 train_loss:3.5765 train_time:123705ms step_avg:155.02ms | |
| step:809/1750 train_loss:3.5173 train_time:123866ms step_avg:155.03ms | |
| step:810/1750 train_loss:3.5855 train_time:124025ms step_avg:155.03ms | |
| step:811/1750 train_loss:3.6165 train_time:124184ms step_avg:155.04ms | |
| step:812/1750 train_loss:3.6240 train_time:124345ms step_avg:155.04ms | |
| step:813/1750 train_loss:3.6523 train_time:124502ms step_avg:155.05ms | |
| step:814/1750 train_loss:3.5952 train_time:124662ms step_avg:155.05ms | |
| step:815/1750 train_loss:3.5888 train_time:124823ms step_avg:155.06ms | |
| step:816/1750 train_loss:3.7097 train_time:124984ms step_avg:155.07ms | |
| step:817/1750 train_loss:3.7926 train_time:125145ms step_avg:155.07ms | |
| step:818/1750 train_loss:3.5474 train_time:125303ms step_avg:155.08ms | |
| step:819/1750 train_loss:3.7480 train_time:125465ms step_avg:155.09ms | |
| step:820/1750 train_loss:3.5283 train_time:125626ms step_avg:155.09ms | |
| step:821/1750 train_loss:3.5867 train_time:125784ms step_avg:155.10ms | |
| step:822/1750 train_loss:3.7224 train_time:125943ms step_avg:155.10ms | |
| step:823/1750 train_loss:3.6027 train_time:126103ms step_avg:155.11ms | |
| step:824/1750 train_loss:3.5374 train_time:126261ms step_avg:155.11ms | |
| step:825/1750 train_loss:3.6421 train_time:126424ms step_avg:155.12ms | |
| step:826/1750 train_loss:3.5053 train_time:126587ms step_avg:155.13ms | |
| step:827/1750 train_loss:3.7588 train_time:126747ms step_avg:155.14ms | |
| step:828/1750 train_loss:3.6437 train_time:126905ms step_avg:155.14ms | |
| step:829/1750 train_loss:3.6550 train_time:127067ms step_avg:155.15ms | |
| step:830/1750 train_loss:3.5569 train_time:127227ms step_avg:155.15ms | |
| step:831/1750 train_loss:3.6231 train_time:127388ms step_avg:155.16ms | |
| step:832/1750 train_loss:3.5387 train_time:127549ms step_avg:155.17ms | |
| step:833/1750 train_loss:3.6760 train_time:127709ms step_avg:155.18ms | |
| step:834/1750 train_loss:3.5105 train_time:127869ms step_avg:155.18ms | |
| step:835/1750 train_loss:3.4868 train_time:128029ms step_avg:155.19ms | |
| step:836/1750 train_loss:3.7424 train_time:128188ms step_avg:155.19ms | |
| step:837/1750 train_loss:3.4310 train_time:128347ms step_avg:155.20ms | |
| step:838/1750 train_loss:3.6104 train_time:128507ms step_avg:155.20ms | |
| step:839/1750 train_loss:3.4404 train_time:128666ms step_avg:155.21ms | |
| step:840/1750 train_loss:3.4911 train_time:128824ms step_avg:155.21ms | |
| step:841/1750 train_loss:3.5949 train_time:128982ms step_avg:155.21ms | |
| step:842/1750 train_loss:3.6069 train_time:129142ms step_avg:155.22ms | |
| step:843/1750 train_loss:3.5887 train_time:129301ms step_avg:155.22ms | |
| step:844/1750 train_loss:3.4517 train_time:129459ms step_avg:155.23ms | |
| step:845/1750 train_loss:3.6857 train_time:129619ms step_avg:155.23ms | |
| step:846/1750 train_loss:3.5444 train_time:129782ms step_avg:155.24ms | |
| step:847/1750 train_loss:3.5228 train_time:129943ms step_avg:155.25ms | |
| step:848/1750 train_loss:3.6652 train_time:130102ms step_avg:155.25ms | |
| step:849/1750 train_loss:3.5232 train_time:130261ms step_avg:155.26ms | |
| step:850/1750 train_loss:3.4641 train_time:130421ms step_avg:155.26ms | |
| step:851/1750 train_loss:3.7592 train_time:130582ms step_avg:155.27ms | |
| step:852/1750 train_loss:3.4705 train_time:130742ms step_avg:155.28ms | |
| step:853/1750 train_loss:3.5892 train_time:130898ms step_avg:155.28ms | |
| step:854/1750 train_loss:3.6781 train_time:131056ms step_avg:155.28ms | |
| step:855/1750 train_loss:3.5403 train_time:131215ms step_avg:155.28ms | |
| step:856/1750 train_loss:3.5670 train_time:131373ms step_avg:155.29ms | |
| step:857/1750 train_loss:3.6290 train_time:131534ms step_avg:155.29ms | |
| step:858/1750 train_loss:3.5004 train_time:131695ms step_avg:155.30ms | |
| step:859/1750 train_loss:3.5871 train_time:131854ms step_avg:155.30ms | |
| step:860/1750 train_loss:3.6159 train_time:132011ms step_avg:155.31ms | |
| step:861/1750 train_loss:3.6563 train_time:132172ms step_avg:155.31ms | |
| step:862/1750 train_loss:3.6260 train_time:132333ms step_avg:155.32ms | |
| step:863/1750 train_loss:3.5995 train_time:132494ms step_avg:155.33ms | |
| step:864/1750 train_loss:3.4095 train_time:132652ms step_avg:155.33ms | |
| step:865/1750 train_loss:3.6260 train_time:132809ms step_avg:155.33ms | |
| step:866/1750 train_loss:3.8904 train_time:132971ms step_avg:155.34ms | |
| step:867/1750 train_loss:3.4772 train_time:133129ms step_avg:155.34ms | |
| step:868/1750 train_loss:3.6630 train_time:133286ms step_avg:155.34ms | |
| step:869/1750 train_loss:3.6404 train_time:133445ms step_avg:155.35ms | |
| step:870/1750 train_loss:3.4726 train_time:133605ms step_avg:155.35ms | |
| step:871/1750 train_loss:3.4413 train_time:133765ms step_avg:155.36ms | |
| step:872/1750 train_loss:3.6777 train_time:133927ms step_avg:155.37ms | |
| step:873/1750 train_loss:3.4857 train_time:134086ms step_avg:155.37ms | |
| step:874/1750 train_loss:3.2426 train_time:134249ms step_avg:155.38ms | |
| step:875/1750 train_loss:3.6616 train_time:134408ms step_avg:155.38ms | |
| step:875/1750 val_loss:3.5470 train_time:134450ms step_avg:155.43ms | |
| step:876/1750 train_loss:3.4696 train_time:134566ms step_avg:155.39ms | |
| step:877/1750 train_loss:3.6464 train_time:134727ms step_avg:155.39ms | |
| step:878/1750 train_loss:3.4995 train_time:134888ms step_avg:155.40ms | |
| step:879/1750 train_loss:3.6733 train_time:135047ms step_avg:155.41ms | |
| step:880/1750 train_loss:3.3279 train_time:135207ms step_avg:155.41ms | |
| step:881/1750 train_loss:3.5079 train_time:135364ms step_avg:155.41ms | |
| step:882/1750 train_loss:3.7246 train_time:135521ms step_avg:155.41ms | |
| step:883/1750 train_loss:3.8620 train_time:135681ms step_avg:155.42ms | |
| step:884/1750 train_loss:3.5918 train_time:135840ms step_avg:155.42ms | |
| step:885/1750 train_loss:3.5098 train_time:135999ms step_avg:155.43ms | |
| step:886/1750 train_loss:3.5978 train_time:136157ms step_avg:155.43ms | |
| step:887/1750 train_loss:4.0988 train_time:136318ms step_avg:155.44ms | |
| step:888/1750 train_loss:3.8580 train_time:136481ms step_avg:155.45ms | |
| step:889/1750 train_loss:3.5547 train_time:136638ms step_avg:155.45ms | |
| step:890/1750 train_loss:3.5624 train_time:136797ms step_avg:155.45ms | |
| step:891/1750 train_loss:3.3875 train_time:136956ms step_avg:155.45ms | |
| step:892/1750 train_loss:3.7419 train_time:137115ms step_avg:155.46ms | |
| step:893/1750 train_loss:3.4482 train_time:137271ms step_avg:155.46ms | |
| step:894/1750 train_loss:3.6603 train_time:137432ms step_avg:155.47ms | |
| step:895/1750 train_loss:3.7041 train_time:137592ms step_avg:155.47ms | |
| step:896/1750 train_loss:3.5264 train_time:137752ms step_avg:155.48ms | |
| step:897/1750 train_loss:3.5653 train_time:137912ms step_avg:155.48ms | |
| step:898/1750 train_loss:3.6205 train_time:138071ms step_avg:155.48ms | |
| step:899/1750 train_loss:3.5081 train_time:138228ms step_avg:155.49ms | |
| step:900/1750 train_loss:3.4426 train_time:138387ms step_avg:155.49ms | |
| step:901/1750 train_loss:3.6456 train_time:138545ms step_avg:155.49ms | |
| step:902/1750 train_loss:3.6575 train_time:138703ms step_avg:155.50ms | |
| step:903/1750 train_loss:3.5686 train_time:138865ms step_avg:155.50ms | |
| step:904/1750 train_loss:3.5264 train_time:139024ms step_avg:155.51ms | |
| step:905/1750 train_loss:3.5320 train_time:139184ms step_avg:155.51ms | |
| step:906/1750 train_loss:3.7374 train_time:139345ms step_avg:155.52ms | |
| step:907/1750 train_loss:3.5397 train_time:139507ms step_avg:155.53ms | |
| step:908/1750 train_loss:3.5978 train_time:139663ms step_avg:155.53ms | |
| step:909/1750 train_loss:3.4789 train_time:139825ms step_avg:155.53ms | |
| step:910/1750 train_loss:3.5514 train_time:139991ms step_avg:155.55ms | |
| step:911/1750 train_loss:3.6675 train_time:140151ms step_avg:155.55ms | |
| step:912/1750 train_loss:3.6281 train_time:140313ms step_avg:155.56ms | |
| step:913/1750 train_loss:3.4861 train_time:140476ms step_avg:155.57ms | |
| step:914/1750 train_loss:3.7723 train_time:140638ms step_avg:155.57ms | |
| step:915/1750 train_loss:3.5558 train_time:140800ms step_avg:155.58ms | |
| step:916/1750 train_loss:3.6453 train_time:140960ms step_avg:155.58ms | |
| step:917/1750 train_loss:3.6261 train_time:141121ms step_avg:155.59ms | |
| step:918/1750 train_loss:4.8559 train_time:141285ms step_avg:155.60ms | |
| step:919/1750 train_loss:3.5167 train_time:141448ms step_avg:155.61ms | |
| step:920/1750 train_loss:3.6126 train_time:141608ms step_avg:155.61ms | |
| step:921/1750 train_loss:3.5735 train_time:141769ms step_avg:155.62ms | |
| step:922/1750 train_loss:3.6101 train_time:141933ms step_avg:155.63ms | |
| step:923/1750 train_loss:3.6352 train_time:142094ms step_avg:155.63ms | |
| step:924/1750 train_loss:3.7108 train_time:142255ms step_avg:155.64ms | |
| step:925/1750 train_loss:3.6753 train_time:142416ms step_avg:155.65ms | |
| step:926/1750 train_loss:3.5840 train_time:142575ms step_avg:155.65ms | |
| step:927/1750 train_loss:3.5786 train_time:142735ms step_avg:155.65ms | |
| step:928/1750 train_loss:3.8092 train_time:142897ms step_avg:155.66ms | |
| step:929/1750 train_loss:3.6370 train_time:143056ms step_avg:155.66ms | |
| step:930/1750 train_loss:3.4250 train_time:143218ms step_avg:155.67ms | |
| step:931/1750 train_loss:3.5202 train_time:143376ms step_avg:155.67ms | |
| step:932/1750 train_loss:3.6795 train_time:143538ms step_avg:155.68ms | |
| step:933/1750 train_loss:3.4017 train_time:143699ms step_avg:155.69ms | |
| step:934/1750 train_loss:3.6122 train_time:143860ms step_avg:155.69ms | |
| step:935/1750 train_loss:3.4638 train_time:144023ms step_avg:155.70ms | |
| step:936/1750 train_loss:3.5417 train_time:144186ms step_avg:155.71ms | |
| step:937/1750 train_loss:3.6490 train_time:144350ms step_avg:155.72ms | |
| step:938/1750 train_loss:3.5669 train_time:144510ms step_avg:155.72ms | |
| step:939/1750 train_loss:3.6966 train_time:144675ms step_avg:155.73ms | |
| step:940/1750 train_loss:3.5082 train_time:144835ms step_avg:155.74ms | |
| step:941/1750 train_loss:3.5744 train_time:144995ms step_avg:155.74ms | |
| step:942/1750 train_loss:3.3885 train_time:145155ms step_avg:155.75ms | |
| step:943/1750 train_loss:3.7305 train_time:145319ms step_avg:155.75ms | |
| step:944/1750 train_loss:3.4333 train_time:145629ms step_avg:155.92ms | |
| step:945/1750 train_loss:3.4543 train_time:145797ms step_avg:155.93ms | |
| step:946/1750 train_loss:5.1047 train_time:145959ms step_avg:155.94ms | |
| step:947/1750 train_loss:3.6252 train_time:146119ms step_avg:155.94ms | |
| step:948/1750 train_loss:3.5068 train_time:146280ms step_avg:155.95ms | |
| step:949/1750 train_loss:3.4063 train_time:146583ms step_avg:156.11ms | |
| step:950/1750 train_loss:3.4626 train_time:146745ms step_avg:156.11ms | |
| step:951/1750 train_loss:3.4405 train_time:146911ms step_avg:156.12ms | |
| step:952/1750 train_loss:3.5061 train_time:147073ms step_avg:156.13ms | |
| step:953/1750 train_loss:3.5980 train_time:147235ms step_avg:156.13ms | |
| step:954/1750 train_loss:3.4748 train_time:147398ms step_avg:156.14ms | |
| step:955/1750 train_loss:3.5014 train_time:147557ms step_avg:156.14ms | |
| step:956/1750 train_loss:3.4694 train_time:147719ms step_avg:156.15ms | |
| step:957/1750 train_loss:3.5294 train_time:147881ms step_avg:156.16ms | |
| step:958/1750 train_loss:3.5309 train_time:148045ms step_avg:156.17ms | |
| step:959/1750 train_loss:3.5357 train_time:148206ms step_avg:156.17ms | |
| step:960/1750 train_loss:3.4323 train_time:148367ms step_avg:156.18ms | |
| step:961/1750 train_loss:3.6851 train_time:148527ms step_avg:156.18ms | |
| step:962/1750 train_loss:3.6345 train_time:148687ms step_avg:156.18ms | |
| step:963/1750 train_loss:3.7305 train_time:148851ms step_avg:156.19ms | |
| step:964/1750 train_loss:3.4584 train_time:149014ms step_avg:156.20ms | |
| step:965/1750 train_loss:3.5113 train_time:149172ms step_avg:156.20ms | |
| step:966/1750 train_loss:3.7455 train_time:149334ms step_avg:156.21ms | |
| step:967/1750 train_loss:3.5555 train_time:149497ms step_avg:156.21ms | |
| step:968/1750 train_loss:3.5518 train_time:149657ms step_avg:156.22ms | |
| step:969/1750 train_loss:3.6151 train_time:149819ms step_avg:156.22ms | |
| step:970/1750 train_loss:3.4039 train_time:149978ms step_avg:156.23ms | |
| step:971/1750 train_loss:3.5682 train_time:150138ms step_avg:156.23ms | |
| step:972/1750 train_loss:3.5113 train_time:150298ms step_avg:156.23ms | |
| step:973/1750 train_loss:3.5714 train_time:150457ms step_avg:156.24ms | |
| step:974/1750 train_loss:3.6220 train_time:150620ms step_avg:156.24ms | |
| step:975/1750 train_loss:3.5020 train_time:150780ms step_avg:156.25ms | |
| step:976/1750 train_loss:3.7025 train_time:150938ms step_avg:156.25ms | |
| step:977/1750 train_loss:3.6042 train_time:151098ms step_avg:156.25ms | |
| step:978/1750 train_loss:3.4022 train_time:151258ms step_avg:156.26ms | |
| step:979/1750 train_loss:3.6612 train_time:151418ms step_avg:156.26ms | |
| step:980/1750 train_loss:3.4475 train_time:151577ms step_avg:156.27ms | |
| step:981/1750 train_loss:3.6102 train_time:151739ms step_avg:156.27ms | |
| step:982/1750 train_loss:3.5820 train_time:151898ms step_avg:156.27ms | |
| step:983/1750 train_loss:3.5547 train_time:152058ms step_avg:156.28ms | |
| step:984/1750 train_loss:3.5234 train_time:152217ms step_avg:156.28ms | |
| step:985/1750 train_loss:3.6226 train_time:152377ms step_avg:156.28ms | |
| step:986/1750 train_loss:3.4538 train_time:152537ms step_avg:156.29ms | |
| step:987/1750 train_loss:3.5244 train_time:152696ms step_avg:156.29ms | |
| step:988/1750 train_loss:3.5436 train_time:152857ms step_avg:156.30ms | |
| step:989/1750 train_loss:3.4543 train_time:153017ms step_avg:156.30ms | |
| step:990/1750 train_loss:3.6920 train_time:153179ms step_avg:156.31ms | |
| step:991/1750 train_loss:3.5067 train_time:153338ms step_avg:156.31ms | |
| step:992/1750 train_loss:3.4724 train_time:153503ms step_avg:156.32ms | |
| step:993/1750 train_loss:3.5418 train_time:153669ms step_avg:156.33ms | |
| step:994/1750 train_loss:3.6288 train_time:153827ms step_avg:156.33ms | |
| step:995/1750 train_loss:3.5691 train_time:153987ms step_avg:156.33ms | |
| step:996/1750 train_loss:3.4962 train_time:154145ms step_avg:156.33ms | |
| step:997/1750 train_loss:3.8111 train_time:154306ms step_avg:156.34ms | |
| step:998/1750 train_loss:3.4795 train_time:154465ms step_avg:156.34ms | |
| step:999/1750 train_loss:3.6296 train_time:154627ms step_avg:156.35ms | |
| step:1000/1750 train_loss:3.4861 train_time:154791ms step_avg:156.35ms | |
| step:1000/1750 val_loss:3.5074 train_time:154832ms step_avg:156.40ms | |
| step:1001/1750 train_loss:3.5342 train_time:154951ms step_avg:156.36ms | |
| step:1002/1750 train_loss:3.4192 train_time:155110ms step_avg:156.36ms | |
| step:1003/1750 train_loss:3.5959 train_time:155272ms step_avg:156.37ms | |
| step:1004/1750 train_loss:3.6414 train_time:155433ms step_avg:156.37ms | |
| step:1005/1750 train_loss:3.4317 train_time:155594ms step_avg:156.38ms | |
| step:1006/1750 train_loss:3.5052 train_time:155754ms step_avg:156.38ms | |
| step:1007/1750 train_loss:3.4773 train_time:155914ms step_avg:156.38ms | |
| step:1008/1750 train_loss:3.6029 train_time:156076ms step_avg:156.39ms | |
| step:1009/1750 train_loss:3.7050 train_time:156242ms step_avg:156.40ms | |
| step:1010/1750 train_loss:3.5982 train_time:156401ms step_avg:156.40ms | |
| step:1011/1750 train_loss:3.5757 train_time:156561ms step_avg:156.40ms | |
| step:1012/1750 train_loss:3.4335 train_time:156720ms step_avg:156.41ms | |
| step:1013/1750 train_loss:3.5755 train_time:156882ms step_avg:156.41ms | |
| step:1014/1750 train_loss:3.6596 train_time:157044ms step_avg:156.42ms | |
| step:1015/1750 train_loss:3.3671 train_time:157206ms step_avg:156.42ms | |
| step:1016/1750 train_loss:3.4520 train_time:157368ms step_avg:156.43ms | |
| step:1017/1750 train_loss:3.4568 train_time:157531ms step_avg:156.44ms | |
| step:1018/1750 train_loss:3.4352 train_time:157691ms step_avg:156.44ms | |
| step:1019/1750 train_loss:3.5630 train_time:157851ms step_avg:156.44ms | |
| step:1020/1750 train_loss:3.4390 train_time:158012ms step_avg:156.45ms | |
| step:1021/1750 train_loss:3.3958 train_time:158170ms step_avg:156.45ms | |
| step:1022/1750 train_loss:3.5199 train_time:158331ms step_avg:156.45ms | |
| step:1023/1750 train_loss:3.5491 train_time:158491ms step_avg:156.46ms | |
| step:1024/1750 train_loss:3.5189 train_time:158651ms step_avg:156.46ms | |
| step:1025/1750 train_loss:3.5291 train_time:158812ms step_avg:156.46ms | |
| step:1026/1750 train_loss:3.6736 train_time:158970ms step_avg:156.47ms | |
| step:1027/1750 train_loss:3.3633 train_time:159129ms step_avg:156.47ms | |
| step:1028/1750 train_loss:3.4394 train_time:159292ms step_avg:156.48ms | |
| step:1029/1750 train_loss:3.3644 train_time:159455ms step_avg:156.48ms | |
| step:1030/1750 train_loss:3.5788 train_time:159614ms step_avg:156.48ms | |
| step:1031/1750 train_loss:3.5614 train_time:159772ms step_avg:156.49ms | |
| step:1032/1750 train_loss:3.7468 train_time:159933ms step_avg:156.49ms | |
| step:1033/1750 train_loss:3.5399 train_time:160092ms step_avg:156.49ms | |
| step:1034/1750 train_loss:3.4724 train_time:160253ms step_avg:156.50ms | |
| step:1035/1750 train_loss:3.4875 train_time:160414ms step_avg:156.50ms | |
| step:1036/1750 train_loss:3.5318 train_time:160575ms step_avg:156.51ms | |
| step:1037/1750 train_loss:3.8459 train_time:160736ms step_avg:156.51ms | |
| step:1038/1750 train_loss:3.6717 train_time:160897ms step_avg:156.51ms | |
| step:1039/1750 train_loss:3.5560 train_time:161060ms step_avg:156.52ms | |
| step:1040/1750 train_loss:3.4606 train_time:161222ms step_avg:156.53ms | |
| step:1041/1750 train_loss:3.5309 train_time:161386ms step_avg:156.53ms | |
| step:1042/1750 train_loss:3.5741 train_time:161545ms step_avg:156.54ms | |
| step:1043/1750 train_loss:3.4910 train_time:161707ms step_avg:156.54ms | |
| step:1044/1750 train_loss:3.5056 train_time:161868ms step_avg:156.55ms | |
| step:1045/1750 train_loss:3.5651 train_time:162031ms step_avg:156.55ms | |
| step:1046/1750 train_loss:3.4747 train_time:162192ms step_avg:156.56ms | |
| step:1047/1750 train_loss:3.6874 train_time:162353ms step_avg:156.56ms | |
| step:1048/1750 train_loss:3.5517 train_time:162514ms step_avg:156.56ms | |
| step:1049/1750 train_loss:3.4576 train_time:162675ms step_avg:156.57ms | |
| step:1050/1750 train_loss:3.4401 train_time:162840ms step_avg:156.58ms | |
| step:1051/1750 train_loss:3.5468 train_time:163004ms step_avg:156.58ms | |
| step:1052/1750 train_loss:3.4104 train_time:163168ms step_avg:156.59ms | |
| step:1053/1750 train_loss:3.7406 train_time:163329ms step_avg:156.60ms | |
| step:1054/1750 train_loss:3.5938 train_time:163491ms step_avg:156.60ms | |
| step:1055/1750 train_loss:3.4348 train_time:163650ms step_avg:156.60ms | |
| step:1056/1750 train_loss:3.5518 train_time:163810ms step_avg:156.61ms | |
| step:1057/1750 train_loss:3.6309 train_time:163972ms step_avg:156.61ms | |
| step:1058/1750 train_loss:3.3621 train_time:164135ms step_avg:156.62ms | |
| step:1059/1750 train_loss:3.4211 train_time:164301ms step_avg:156.63ms | |
| step:1060/1750 train_loss:3.4938 train_time:164461ms step_avg:156.63ms | |
| step:1061/1750 train_loss:3.4734 train_time:164620ms step_avg:156.63ms | |
| step:1062/1750 train_loss:3.4342 train_time:164782ms step_avg:156.64ms | |
| step:1063/1750 train_loss:3.5209 train_time:164944ms step_avg:156.64ms | |
| step:1064/1750 train_loss:3.4399 train_time:165103ms step_avg:156.64ms | |
| step:1065/1750 train_loss:3.4180 train_time:165268ms step_avg:156.65ms | |
| step:1066/1750 train_loss:3.4592 train_time:165429ms step_avg:156.66ms | |
| step:1067/1750 train_loss:3.3268 train_time:165593ms step_avg:156.66ms | |
| step:1068/1750 train_loss:3.4872 train_time:165753ms step_avg:156.67ms | |
| step:1069/1750 train_loss:3.3559 train_time:165919ms step_avg:156.67ms | |
| step:1070/1750 train_loss:3.6218 train_time:166080ms step_avg:156.68ms | |
| step:1071/1750 train_loss:3.5644 train_time:166249ms step_avg:156.69ms | |
| step:1072/1750 train_loss:3.4997 train_time:166409ms step_avg:156.69ms | |
| step:1073/1750 train_loss:3.5792 train_time:166568ms step_avg:156.70ms | |
| step:1074/1750 train_loss:3.4962 train_time:166731ms step_avg:156.70ms | |
| step:1075/1750 train_loss:3.4519 train_time:166894ms step_avg:156.71ms | |
| step:1076/1750 train_loss:3.8487 train_time:167058ms step_avg:156.71ms | |
| step:1077/1750 train_loss:3.4994 train_time:167218ms step_avg:156.72ms | |
| step:1078/1750 train_loss:3.1409 train_time:167388ms step_avg:156.73ms | |
| step:1079/1750 train_loss:3.5863 train_time:167550ms step_avg:156.74ms | |
| step:1080/1750 train_loss:3.4887 train_time:167714ms step_avg:156.74ms | |
| step:1081/1750 train_loss:3.5705 train_time:167875ms step_avg:156.75ms | |
| step:1082/1750 train_loss:3.6526 train_time:168038ms step_avg:156.75ms | |
| step:1083/1750 train_loss:3.5611 train_time:168198ms step_avg:156.76ms | |
| step:1084/1750 train_loss:3.5319 train_time:168363ms step_avg:156.76ms | |
| step:1085/1750 train_loss:3.4850 train_time:168524ms step_avg:156.77ms | |
| step:1086/1750 train_loss:3.6940 train_time:168686ms step_avg:156.77ms | |
| step:1087/1750 train_loss:3.5751 train_time:168847ms step_avg:156.78ms | |
| step:1088/1750 train_loss:3.4254 train_time:169011ms step_avg:156.78ms | |
| step:1089/1750 train_loss:3.4361 train_time:169178ms step_avg:156.79ms | |
| step:1090/1750 train_loss:3.5447 train_time:169346ms step_avg:156.80ms | |
| step:1091/1750 train_loss:3.3494 train_time:169507ms step_avg:156.81ms | |
| step:1092/1750 train_loss:3.5570 train_time:169669ms step_avg:156.81ms | |
| step:1093/1750 train_loss:3.6718 train_time:169832ms step_avg:156.82ms | |
| step:1094/1750 train_loss:3.5049 train_time:169992ms step_avg:156.82ms | |
| step:1095/1750 train_loss:3.4799 train_time:170153ms step_avg:156.82ms | |
| step:1096/1750 train_loss:3.4907 train_time:170316ms step_avg:156.83ms | |
| step:1097/1750 train_loss:3.5513 train_time:170479ms step_avg:156.83ms | |
| step:1098/1750 train_loss:3.6258 train_time:170646ms step_avg:156.84ms | |
| step:1099/1750 train_loss:3.5871 train_time:170809ms step_avg:156.85ms | |
| step:1100/1750 train_loss:3.5004 train_time:170972ms step_avg:156.85ms | |
| step:1101/1750 train_loss:3.3461 train_time:171134ms step_avg:156.86ms | |
| step:1102/1750 train_loss:3.3793 train_time:171303ms step_avg:156.87ms | |
| step:1103/1750 train_loss:3.5117 train_time:171469ms step_avg:156.88ms | |
| step:1104/1750 train_loss:3.3800 train_time:171628ms step_avg:156.88ms | |
| step:1105/1750 train_loss:4.1255 train_time:171790ms step_avg:156.89ms | |
| step:1106/1750 train_loss:3.2956 train_time:171950ms step_avg:156.89ms | |
| step:1107/1750 train_loss:3.6312 train_time:172110ms step_avg:156.89ms | |
| step:1108/1750 train_loss:3.4063 train_time:172269ms step_avg:156.89ms | |
| step:1109/1750 train_loss:3.5640 train_time:172429ms step_avg:156.90ms | |
| step:1110/1750 train_loss:3.5003 train_time:172589ms step_avg:156.90ms | |
| step:1111/1750 train_loss:3.5485 train_time:172749ms step_avg:156.90ms | |
| step:1112/1750 train_loss:3.6241 train_time:172911ms step_avg:156.91ms | |
| step:1113/1750 train_loss:3.4980 train_time:173078ms step_avg:156.92ms | |
| step:1114/1750 train_loss:3.4272 train_time:173242ms step_avg:156.92ms | |
| step:1115/1750 train_loss:3.3127 train_time:173405ms step_avg:156.93ms | |
| step:1116/1750 train_loss:3.4918 train_time:173566ms step_avg:156.93ms | |
| step:1117/1750 train_loss:3.6621 train_time:173729ms step_avg:156.94ms | |
| step:1118/1750 train_loss:3.6846 train_time:173893ms step_avg:156.94ms | |
| step:1119/1750 train_loss:3.5458 train_time:174054ms step_avg:156.95ms | |
| step:1120/1750 train_loss:3.5576 train_time:174216ms step_avg:156.95ms | |
| step:1121/1750 train_loss:3.4510 train_time:174379ms step_avg:156.96ms | |
| step:1122/1750 train_loss:3.5224 train_time:174539ms step_avg:156.96ms | |
| step:1123/1750 train_loss:3.6494 train_time:174701ms step_avg:156.96ms | |
| step:1124/1750 train_loss:3.4189 train_time:174863ms step_avg:156.97ms | |
| step:1125/1750 train_loss:3.2800 train_time:175025ms step_avg:156.97ms | |
| step:1125/1750 val_loss:3.4790 train_time:175067ms step_avg:157.01ms | |
| step:1126/1750 train_loss:3.5469 train_time:175186ms step_avg:156.98ms | |
| step:1127/1750 train_loss:3.7454 train_time:175349ms step_avg:156.98ms | |
| step:1128/1750 train_loss:3.2959 train_time:175512ms step_avg:156.99ms | |
| step:1129/1750 train_loss:3.6287 train_time:175676ms step_avg:156.99ms | |
| step:1130/1750 train_loss:3.4416 train_time:175839ms step_avg:157.00ms | |
| step:1131/1750 train_loss:3.4661 train_time:176006ms step_avg:157.01ms | |
| step:1132/1750 train_loss:3.4343 train_time:176165ms step_avg:157.01ms | |
| step:1133/1750 train_loss:3.5660 train_time:176471ms step_avg:157.14ms | |
| step:1134/1750 train_loss:3.5209 train_time:176640ms step_avg:157.15ms | |
| step:1135/1750 train_loss:3.5878 train_time:176806ms step_avg:157.16ms | |
| step:1136/1750 train_loss:3.6237 train_time:176967ms step_avg:157.16ms | |
| step:1137/1750 train_loss:3.5263 train_time:177129ms step_avg:157.17ms | |
| step:1138/1750 train_loss:3.4184 train_time:177291ms step_avg:157.17ms | |
| step:1139/1750 train_loss:3.7208 train_time:177589ms step_avg:157.30ms | |
| step:1140/1750 train_loss:3.5337 train_time:177753ms step_avg:157.30ms | |
| step:1141/1750 train_loss:3.6613 train_time:177917ms step_avg:157.31ms | |
| step:1142/1750 train_loss:3.5242 train_time:178079ms step_avg:157.31ms | |
| step:1143/1750 train_loss:3.4314 train_time:178241ms step_avg:157.32ms | |
| step:1144/1750 train_loss:3.5142 train_time:178404ms step_avg:157.32ms | |
| step:1145/1750 train_loss:3.6612 train_time:178564ms step_avg:157.33ms | |
| step:1146/1750 train_loss:3.6203 train_time:178727ms step_avg:157.33ms | |
| step:1147/1750 train_loss:3.5633 train_time:178888ms step_avg:157.33ms | |
| step:1148/1750 train_loss:3.5724 train_time:179048ms step_avg:157.34ms | |
| step:1149/1750 train_loss:3.4034 train_time:179212ms step_avg:157.34ms | |
| step:1150/1750 train_loss:3.4374 train_time:179373ms step_avg:157.34ms | |
| step:1151/1750 train_loss:3.3917 train_time:179537ms step_avg:157.35ms | |
| step:1152/1750 train_loss:3.4756 train_time:179702ms step_avg:157.36ms | |
| step:1153/1750 train_loss:3.4987 train_time:179866ms step_avg:157.36ms | |
| step:1154/1750 train_loss:3.5938 train_time:180026ms step_avg:157.37ms | |
| step:1155/1750 train_loss:3.3960 train_time:180190ms step_avg:157.37ms | |
| step:1156/1750 train_loss:3.6041 train_time:180355ms step_avg:157.38ms | |
| step:1157/1750 train_loss:3.5651 train_time:180518ms step_avg:157.38ms | |
| step:1158/1750 train_loss:3.3303 train_time:180679ms step_avg:157.39ms | |
| step:1159/1750 train_loss:3.4124 train_time:180841ms step_avg:157.39ms | |
| step:1160/1750 train_loss:3.4029 train_time:181001ms step_avg:157.39ms | |
| step:1161/1750 train_loss:3.1641 train_time:181165ms step_avg:157.40ms | |
| step:1162/1750 train_loss:3.4875 train_time:181327ms step_avg:157.40ms | |
| step:1163/1750 train_loss:3.4552 train_time:181489ms step_avg:157.41ms | |
| step:1164/1750 train_loss:3.3553 train_time:181648ms step_avg:157.41ms | |
| step:1165/1750 train_loss:3.3203 train_time:181808ms step_avg:157.41ms | |
| step:1166/1750 train_loss:3.4469 train_time:181970ms step_avg:157.41ms | |
| step:1167/1750 train_loss:3.4692 train_time:182131ms step_avg:157.42ms | |
| step:1168/1750 train_loss:3.7955 train_time:182294ms step_avg:157.42ms | |
| step:1169/1750 train_loss:3.4507 train_time:182459ms step_avg:157.43ms | |
| step:1170/1750 train_loss:3.4576 train_time:182623ms step_avg:157.43ms | |
| step:1171/1750 train_loss:3.3805 train_time:182785ms step_avg:157.44ms | |
| step:1172/1750 train_loss:3.4938 train_time:182945ms step_avg:157.44ms | |
| step:1173/1750 train_loss:3.6089 train_time:183114ms step_avg:157.45ms | |
| step:1174/1750 train_loss:3.4582 train_time:183286ms step_avg:157.46ms | |
| step:1175/1750 train_loss:3.4468 train_time:183451ms step_avg:157.47ms | |
| step:1176/1750 train_loss:3.4964 train_time:183617ms step_avg:157.48ms | |
| step:1177/1750 train_loss:3.5201 train_time:183787ms step_avg:157.49ms | |
| step:1178/1750 train_loss:3.5664 train_time:183948ms step_avg:157.49ms | |
| step:1179/1750 train_loss:3.4719 train_time:184109ms step_avg:157.49ms | |
| step:1180/1750 train_loss:3.4263 train_time:184280ms step_avg:157.50ms | |
| step:1181/1750 train_loss:3.4082 train_time:184442ms step_avg:157.51ms | |
| step:1182/1750 train_loss:3.4663 train_time:184607ms step_avg:157.51ms | |
| step:1183/1750 train_loss:3.4008 train_time:184771ms step_avg:157.52ms | |
| step:1184/1750 train_loss:3.5764 train_time:184935ms step_avg:157.53ms | |
| step:1185/1750 train_loss:3.6159 train_time:185102ms step_avg:157.53ms | |
| step:1186/1750 train_loss:3.4268 train_time:185268ms step_avg:157.54ms | |
| step:1187/1750 train_loss:3.4816 train_time:185440ms step_avg:157.55ms | |
| step:1188/1750 train_loss:3.5088 train_time:185602ms step_avg:157.56ms | |
| step:1189/1750 train_loss:3.3458 train_time:185766ms step_avg:157.56ms | |
| step:1190/1750 train_loss:3.5158 train_time:185929ms step_avg:157.57ms | |
| step:1191/1750 train_loss:3.6477 train_time:186093ms step_avg:157.57ms | |
| step:1192/1750 train_loss:3.4587 train_time:186253ms step_avg:157.57ms | |
| step:1193/1750 train_loss:3.3461 train_time:186418ms step_avg:157.58ms | |
| step:1194/1750 train_loss:3.6287 train_time:186584ms step_avg:157.59ms | |
| step:1195/1750 train_loss:3.4422 train_time:186751ms step_avg:157.60ms | |
| step:1196/1750 train_loss:3.4538 train_time:186922ms step_avg:157.61ms | |
| step:1197/1750 train_loss:3.3585 train_time:187089ms step_avg:157.61ms | |
| step:1198/1750 train_loss:3.3698 train_time:187260ms step_avg:157.63ms | |
| step:1199/1750 train_loss:3.4085 train_time:187425ms step_avg:157.63ms | |
| step:1200/1750 train_loss:3.5091 train_time:187587ms step_avg:157.64ms | |
| step:1201/1750 train_loss:3.5522 train_time:187751ms step_avg:157.64ms | |
| step:1202/1750 train_loss:3.7203 train_time:187923ms step_avg:157.65ms | |
| step:1203/1750 train_loss:3.4786 train_time:188087ms step_avg:157.66ms | |
| step:1204/1750 train_loss:3.3816 train_time:188254ms step_avg:157.67ms | |
| step:1205/1750 train_loss:3.4981 train_time:188414ms step_avg:157.67ms | |
| step:1206/1750 train_loss:3.5452 train_time:188578ms step_avg:157.67ms | |
| step:1207/1750 train_loss:3.5889 train_time:188742ms step_avg:157.68ms | |
| step:1208/1750 train_loss:3.4681 train_time:188905ms step_avg:157.68ms | |
| step:1209/1750 train_loss:3.3028 train_time:189070ms step_avg:157.69ms | |
| step:1210/1750 train_loss:3.3776 train_time:189234ms step_avg:157.69ms | |
| step:1211/1750 train_loss:3.4695 train_time:189399ms step_avg:157.70ms | |
| step:1212/1750 train_loss:3.4655 train_time:189563ms step_avg:157.71ms | |
| step:1213/1750 train_loss:3.4834 train_time:189728ms step_avg:157.71ms | |
| step:1214/1750 train_loss:3.3367 train_time:189894ms step_avg:157.72ms | |
| step:1215/1750 train_loss:3.4602 train_time:190058ms step_avg:157.72ms | |
| step:1216/1750 train_loss:3.3944 train_time:190220ms step_avg:157.73ms | |
| step:1217/1750 train_loss:3.3943 train_time:190386ms step_avg:157.74ms | |
| step:1218/1750 train_loss:3.4881 train_time:190550ms step_avg:157.74ms | |
| step:1219/1750 train_loss:3.3341 train_time:190718ms step_avg:157.75ms | |
| step:1220/1750 train_loss:3.5395 train_time:190881ms step_avg:157.75ms | |
| step:1221/1750 train_loss:3.5796 train_time:191043ms step_avg:157.76ms | |
| step:1222/1750 train_loss:3.5146 train_time:191205ms step_avg:157.76ms | |
| step:1223/1750 train_loss:3.3652 train_time:191369ms step_avg:157.76ms | |
| step:1224/1750 train_loss:3.3236 train_time:191537ms step_avg:157.77ms | |
| step:1225/1750 train_loss:3.4391 train_time:191699ms step_avg:157.78ms | |
| step:1226/1750 train_loss:3.3980 train_time:191865ms step_avg:157.78ms | |
| step:1227/1750 train_loss:3.3383 train_time:192029ms step_avg:157.79ms | |
| step:1228/1750 train_loss:3.5180 train_time:192190ms step_avg:157.79ms | |
| step:1229/1750 train_loss:3.4364 train_time:192355ms step_avg:157.80ms | |
| step:1230/1750 train_loss:3.4664 train_time:192525ms step_avg:157.81ms | |
| step:1231/1750 train_loss:3.6495 train_time:192690ms step_avg:157.81ms | |
| step:1232/1750 train_loss:3.5620 train_time:192855ms step_avg:157.82ms | |
| step:1233/1750 train_loss:3.4943 train_time:193019ms step_avg:157.82ms | |
| step:1234/1750 train_loss:3.6550 train_time:193183ms step_avg:157.83ms | |
| step:1235/1750 train_loss:3.3922 train_time:193348ms step_avg:157.84ms | |
| step:1236/1750 train_loss:3.3581 train_time:193510ms step_avg:157.84ms | |
| step:1237/1750 train_loss:3.3386 train_time:193674ms step_avg:157.84ms | |
| step:1238/1750 train_loss:3.3583 train_time:193845ms step_avg:157.85ms | |
| step:1239/1750 train_loss:3.3990 train_time:194009ms step_avg:157.86ms | |
| step:1240/1750 train_loss:3.4501 train_time:194169ms step_avg:157.86ms | |
| step:1241/1750 train_loss:3.5003 train_time:194333ms step_avg:157.87ms | |
| step:1242/1750 train_loss:3.3708 train_time:194495ms step_avg:157.87ms | |
| step:1243/1750 train_loss:3.4738 train_time:194660ms step_avg:157.88ms | |
| step:1244/1750 train_loss:3.4781 train_time:194822ms step_avg:157.88ms | |
| step:1245/1750 train_loss:3.4884 train_time:194985ms step_avg:157.88ms | |
| step:1246/1750 train_loss:3.3077 train_time:195147ms step_avg:157.89ms | |
| step:1247/1750 train_loss:3.4485 train_time:195308ms step_avg:157.89ms | |
| step:1248/1750 train_loss:3.5027 train_time:195470ms step_avg:157.89ms | |
| step:1249/1750 train_loss:3.4878 train_time:195631ms step_avg:157.89ms | |
| step:1250/1750 train_loss:3.3709 train_time:195792ms step_avg:157.90ms | |
| step:1250/1750 val_loss:3.4248 train_time:195837ms step_avg:157.93ms | |
| step:1251/1750 train_loss:3.5656 train_time:195960ms step_avg:157.91ms | |
| step:1252/1750 train_loss:3.4416 train_time:196119ms step_avg:157.91ms | |
| step:1253/1750 train_loss:3.3744 train_time:196282ms step_avg:157.91ms | |
| step:1254/1750 train_loss:3.4815 train_time:196446ms step_avg:157.92ms | |
| step:1255/1750 train_loss:3.5927 train_time:196616ms step_avg:157.92ms | |
| step:1256/1750 train_loss:3.3724 train_time:196781ms step_avg:157.93ms | |
| step:1257/1750 train_loss:3.4360 train_time:196943ms step_avg:157.93ms | |
| step:1258/1750 train_loss:3.4189 train_time:197109ms step_avg:157.94ms | |
| step:1259/1750 train_loss:3.4060 train_time:197271ms step_avg:157.94ms | |
| step:1260/1750 train_loss:3.2706 train_time:197432ms step_avg:157.95ms | |
| step:1261/1750 train_loss:3.3680 train_time:197596ms step_avg:157.95ms | |
| step:1262/1750 train_loss:3.3961 train_time:197761ms step_avg:157.96ms | |
| step:1263/1750 train_loss:3.2999 train_time:197925ms step_avg:157.96ms | |
| step:1264/1750 train_loss:3.5129 train_time:198086ms step_avg:157.96ms | |
| step:1265/1750 train_loss:3.4986 train_time:198248ms step_avg:157.97ms | |
| step:1266/1750 train_loss:3.5110 train_time:198413ms step_avg:157.97ms | |
| step:1267/1750 train_loss:3.4351 train_time:198576ms step_avg:157.98ms | |
| step:1268/1750 train_loss:3.4772 train_time:198738ms step_avg:157.98ms | |
| step:1269/1750 train_loss:3.3243 train_time:198904ms step_avg:157.99ms | |
| step:1270/1750 train_loss:3.1672 train_time:199064ms step_avg:157.99ms | |
| step:1271/1750 train_loss:3.4692 train_time:199227ms step_avg:157.99ms | |
| step:1272/1750 train_loss:3.4236 train_time:199386ms step_avg:157.99ms | |
| step:1273/1750 train_loss:3.4535 train_time:199547ms step_avg:157.99ms | |
| step:1274/1750 train_loss:3.4176 train_time:199711ms step_avg:158.00ms | |
| step:1275/1750 train_loss:3.5092 train_time:199876ms step_avg:158.00ms | |
| step:1276/1750 train_loss:3.5397 train_time:200037ms step_avg:158.01ms | |
| step:1277/1750 train_loss:3.4785 train_time:200201ms step_avg:158.01ms | |
| step:1278/1750 train_loss:3.4711 train_time:200362ms step_avg:158.01ms | |
| step:1279/1750 train_loss:3.3252 train_time:200526ms step_avg:158.02ms | |
| step:1280/1750 train_loss:3.4376 train_time:200694ms step_avg:158.03ms | |
| step:1281/1750 train_loss:3.4940 train_time:200856ms step_avg:158.03ms | |
| step:1282/1750 train_loss:3.5373 train_time:201016ms step_avg:158.03ms | |
| step:1283/1750 train_loss:3.4058 train_time:201180ms step_avg:158.04ms | |
| step:1284/1750 train_loss:3.4416 train_time:201341ms step_avg:158.04ms | |
| step:1285/1750 train_loss:3.4265 train_time:201503ms step_avg:158.04ms | |
| step:1286/1750 train_loss:3.4014 train_time:201665ms step_avg:158.04ms | |
| step:1287/1750 train_loss:3.5572 train_time:201828ms step_avg:158.05ms | |
| step:1288/1750 train_loss:3.3680 train_time:201993ms step_avg:158.05ms | |
| step:1289/1750 train_loss:3.4538 train_time:202164ms step_avg:158.06ms | |
| step:1290/1750 train_loss:3.5294 train_time:202331ms step_avg:158.07ms | |
| step:1291/1750 train_loss:3.4486 train_time:202495ms step_avg:158.08ms | |
| step:1292/1750 train_loss:3.5430 train_time:202661ms step_avg:158.08ms | |
| step:1293/1750 train_loss:3.5834 train_time:202824ms step_avg:158.09ms | |
| step:1294/1750 train_loss:3.5301 train_time:202988ms step_avg:158.09ms | |
| step:1295/1750 train_loss:3.3550 train_time:203149ms step_avg:158.09ms | |
| step:1296/1750 train_loss:3.4459 train_time:203314ms step_avg:158.10ms | |
| step:1297/1750 train_loss:3.3410 train_time:203477ms step_avg:158.10ms | |
| step:1298/1750 train_loss:3.3466 train_time:203641ms step_avg:158.11ms | |
| step:1299/1750 train_loss:3.4604 train_time:203805ms step_avg:158.11ms | |
| step:1300/1750 train_loss:3.4729 train_time:203966ms step_avg:158.11ms | |
| step:1301/1750 train_loss:3.4723 train_time:204130ms step_avg:158.12ms | |
| step:1302/1750 train_loss:3.6418 train_time:204298ms step_avg:158.13ms | |
| step:1303/1750 train_loss:3.3672 train_time:204468ms step_avg:158.13ms | |
| step:1304/1750 train_loss:3.5830 train_time:204633ms step_avg:158.14ms | |
| step:1305/1750 train_loss:3.3345 train_time:204794ms step_avg:158.14ms | |
| step:1306/1750 train_loss:3.5130 train_time:204962ms step_avg:158.15ms | |
| step:1307/1750 train_loss:3.5260 train_time:205123ms step_avg:158.15ms | |
| step:1308/1750 train_loss:3.3636 train_time:205289ms step_avg:158.16ms | |
| step:1309/1750 train_loss:3.3750 train_time:205457ms step_avg:158.17ms | |
| step:1310/1750 train_loss:3.3607 train_time:205620ms step_avg:158.17ms | |
| step:1311/1750 train_loss:3.3635 train_time:205781ms step_avg:158.17ms | |
| step:1312/1750 train_loss:3.4510 train_time:205946ms step_avg:158.18ms | |
| step:1313/1750 train_loss:3.4107 train_time:206110ms step_avg:158.18ms | |
| step:1314/1750 train_loss:3.1069 train_time:206278ms step_avg:158.19ms | |
| step:1315/1750 train_loss:3.3435 train_time:206440ms step_avg:158.19ms | |
| step:1316/1750 train_loss:3.4630 train_time:206601ms step_avg:158.19ms | |
| step:1317/1750 train_loss:3.4940 train_time:206765ms step_avg:158.20ms | |
| step:1318/1750 train_loss:3.3670 train_time:206937ms step_avg:158.21ms | |
| step:1319/1750 train_loss:3.4936 train_time:207101ms step_avg:158.21ms | |
| step:1320/1750 train_loss:3.5224 train_time:207268ms step_avg:158.22ms | |
| step:1321/1750 train_loss:3.4309 train_time:207434ms step_avg:158.23ms | |
| step:1322/1750 train_loss:3.3869 train_time:207744ms step_avg:158.34ms | |
| step:1323/1750 train_loss:3.3944 train_time:207916ms step_avg:158.35ms | |
| step:1324/1750 train_loss:3.5049 train_time:208082ms step_avg:158.36ms | |
| step:1325/1750 train_loss:3.5576 train_time:208251ms step_avg:158.37ms | |
| step:1326/1750 train_loss:3.2889 train_time:208420ms step_avg:158.37ms | |
| step:1327/1750 train_loss:3.2329 train_time:208583ms step_avg:158.38ms | |
| step:1328/1750 train_loss:3.5581 train_time:208748ms step_avg:158.38ms | |
| step:1329/1750 train_loss:3.3626 train_time:209073ms step_avg:158.51ms | |
| step:1330/1750 train_loss:3.4910 train_time:209240ms step_avg:158.51ms | |
| step:1331/1750 train_loss:3.4004 train_time:209400ms step_avg:158.52ms | |
| step:1332/1750 train_loss:3.8068 train_time:209565ms step_avg:158.52ms | |
| step:1333/1750 train_loss:3.5423 train_time:209731ms step_avg:158.53ms | |
| step:1334/1750 train_loss:3.4383 train_time:209895ms step_avg:158.53ms | |
| step:1335/1750 train_loss:3.3696 train_time:210060ms step_avg:158.54ms | |
| step:1336/1750 train_loss:3.3570 train_time:210230ms step_avg:158.54ms | |
| step:1337/1750 train_loss:3.6199 train_time:210398ms step_avg:158.55ms | |
| step:1338/1750 train_loss:3.5862 train_time:210562ms step_avg:158.56ms | |
| step:1339/1750 train_loss:3.4034 train_time:210727ms step_avg:158.56ms | |
| step:1340/1750 train_loss:3.3506 train_time:210890ms step_avg:158.56ms | |
| step:1341/1750 train_loss:3.6612 train_time:211052ms step_avg:158.57ms | |
| step:1342/1750 train_loss:3.4245 train_time:211218ms step_avg:158.57ms | |
| step:1343/1750 train_loss:3.4319 train_time:211380ms step_avg:158.57ms | |
| step:1344/1750 train_loss:3.4818 train_time:211544ms step_avg:158.58ms | |
| step:1345/1750 train_loss:3.4559 train_time:211711ms step_avg:158.59ms | |
| step:1346/1750 train_loss:3.3660 train_time:211876ms step_avg:158.59ms | |
| step:1347/1750 train_loss:3.3324 train_time:212039ms step_avg:158.59ms | |
| step:1348/1750 train_loss:3.4090 train_time:212201ms step_avg:158.60ms | |
| step:1349/1750 train_loss:3.3434 train_time:212362ms step_avg:158.60ms | |
| step:1350/1750 train_loss:3.4553 train_time:212527ms step_avg:158.60ms | |
| step:1351/1750 train_loss:3.3114 train_time:212690ms step_avg:158.61ms | |
| step:1352/1750 train_loss:3.3663 train_time:212854ms step_avg:158.61ms | |
| step:1353/1750 train_loss:3.4783 train_time:213021ms step_avg:158.62ms | |
| step:1354/1750 train_loss:3.3210 train_time:213184ms step_avg:158.62ms | |
| step:1355/1750 train_loss:3.2575 train_time:213346ms step_avg:158.62ms | |
| step:1356/1750 train_loss:3.5800 train_time:213512ms step_avg:158.63ms | |
| step:1357/1750 train_loss:3.4921 train_time:213678ms step_avg:158.63ms | |
| step:1358/1750 train_loss:3.2480 train_time:213841ms step_avg:158.64ms | |
| step:1359/1750 train_loss:3.5124 train_time:214007ms step_avg:158.64ms | |
| step:1360/1750 train_loss:3.4201 train_time:214174ms step_avg:158.65ms | |
| step:1361/1750 train_loss:3.2044 train_time:214343ms step_avg:158.66ms | |
| step:1362/1750 train_loss:3.4613 train_time:214507ms step_avg:158.66ms | |
| step:1363/1750 train_loss:3.3386 train_time:214679ms step_avg:158.67ms | |
| step:1364/1750 train_loss:3.3762 train_time:214840ms step_avg:158.67ms | |
| step:1365/1750 train_loss:3.3839 train_time:215001ms step_avg:158.67ms | |
| step:1366/1750 train_loss:3.4907 train_time:215165ms step_avg:158.68ms | |
| step:1367/1750 train_loss:3.4588 train_time:215329ms step_avg:158.68ms | |
| step:1368/1750 train_loss:3.4189 train_time:215495ms step_avg:158.69ms | |
| step:1369/1750 train_loss:3.3331 train_time:215666ms step_avg:158.69ms | |
| step:1370/1750 train_loss:3.6737 train_time:215830ms step_avg:158.70ms | |
| step:1371/1750 train_loss:3.3826 train_time:215993ms step_avg:158.70ms | |
| step:1372/1750 train_loss:3.4317 train_time:216160ms step_avg:158.71ms | |
| step:1373/1750 train_loss:3.4322 train_time:216322ms step_avg:158.71ms | |
| step:1374/1750 train_loss:3.2229 train_time:216486ms step_avg:158.71ms | |
| step:1375/1750 train_loss:3.6060 train_time:216650ms step_avg:158.72ms | |
| step:1375/1750 val_loss:3.3778 train_time:216692ms step_avg:158.75ms | |
| step:1376/1750 train_loss:3.4067 train_time:216813ms step_avg:158.72ms | |
| step:1377/1750 train_loss:3.5438 train_time:216978ms step_avg:158.73ms | |
| step:1378/1750 train_loss:3.5522 train_time:217139ms step_avg:158.73ms | |
| step:1379/1750 train_loss:3.1861 train_time:217305ms step_avg:158.73ms | |
| step:1380/1750 train_loss:3.3738 train_time:217467ms step_avg:158.74ms | |
| step:1381/1750 train_loss:3.7740 train_time:217636ms step_avg:158.74ms | |
| step:1382/1750 train_loss:3.2836 train_time:217799ms step_avg:158.75ms | |
| step:1383/1750 train_loss:3.4573 train_time:217962ms step_avg:158.75ms | |
| step:1384/1750 train_loss:3.5444 train_time:218129ms step_avg:158.75ms | |
| step:1385/1750 train_loss:3.4713 train_time:218288ms step_avg:158.76ms | |
| step:1386/1750 train_loss:3.4152 train_time:218452ms step_avg:158.76ms | |
| step:1387/1750 train_loss:3.2610 train_time:218617ms step_avg:158.76ms | |
| step:1388/1750 train_loss:3.4127 train_time:218778ms step_avg:158.76ms | |
| step:1389/1750 train_loss:3.3840 train_time:218942ms step_avg:158.77ms | |
| step:1390/1750 train_loss:3.6359 train_time:219102ms step_avg:158.77ms | |
| step:1391/1750 train_loss:3.3552 train_time:219266ms step_avg:158.77ms | |
| step:1392/1750 train_loss:3.3525 train_time:219431ms step_avg:158.78ms | |
| step:1393/1750 train_loss:3.3119 train_time:219596ms step_avg:158.78ms | |
| step:1394/1750 train_loss:3.5749 train_time:219758ms step_avg:158.78ms | |
| step:1395/1750 train_loss:3.4642 train_time:219918ms step_avg:158.79ms | |
| step:1396/1750 train_loss:3.4775 train_time:220079ms step_avg:158.79ms | |
| step:1397/1750 train_loss:3.3704 train_time:220239ms step_avg:158.79ms | |
| step:1398/1750 train_loss:3.3204 train_time:220400ms step_avg:158.79ms | |
| step:1399/1750 train_loss:3.3907 train_time:220562ms step_avg:158.79ms | |
| step:1400/1750 train_loss:3.3813 train_time:220728ms step_avg:158.80ms | |
| step:1401/1750 train_loss:3.4115 train_time:220891ms step_avg:158.80ms | |
| step:1402/1750 train_loss:3.3561 train_time:221055ms step_avg:158.80ms | |
| step:1403/1750 train_loss:3.5641 train_time:221221ms step_avg:158.81ms | |
| step:1404/1750 train_loss:3.3464 train_time:221382ms step_avg:158.81ms | |
| step:1405/1750 train_loss:3.3780 train_time:221547ms step_avg:158.82ms | |
| step:1406/1750 train_loss:3.3745 train_time:221714ms step_avg:158.82ms | |
| step:1407/1750 train_loss:3.2386 train_time:221875ms step_avg:158.82ms | |
| step:1408/1750 train_loss:3.3710 train_time:222037ms step_avg:158.82ms | |
| step:1409/1750 train_loss:3.3638 train_time:222205ms step_avg:158.83ms | |
| step:1410/1750 train_loss:3.3474 train_time:222367ms step_avg:158.83ms | |
| step:1411/1750 train_loss:3.4288 train_time:222529ms step_avg:158.84ms | |
| step:1412/1750 train_loss:3.3921 train_time:222694ms step_avg:158.84ms | |
| step:1413/1750 train_loss:3.4220 train_time:222856ms step_avg:158.84ms | |
| step:1414/1750 train_loss:3.3967 train_time:223019ms step_avg:158.85ms | |
| step:1415/1750 train_loss:3.4750 train_time:223185ms step_avg:158.85ms | |
| step:1416/1750 train_loss:3.2902 train_time:223355ms step_avg:158.86ms | |
| step:1417/1750 train_loss:3.3440 train_time:223519ms step_avg:158.86ms | |
| step:1418/1750 train_loss:3.4522 train_time:223681ms step_avg:158.86ms | |
| step:1419/1750 train_loss:3.4100 train_time:223847ms step_avg:158.87ms | |
| step:1420/1750 train_loss:3.4244 train_time:224014ms step_avg:158.88ms | |
| step:1421/1750 train_loss:3.4342 train_time:224178ms step_avg:158.88ms | |
| step:1422/1750 train_loss:3.4016 train_time:224341ms step_avg:158.88ms | |
| step:1423/1750 train_loss:3.3844 train_time:224502ms step_avg:158.88ms | |
| step:1424/1750 train_loss:3.3930 train_time:224667ms step_avg:158.89ms | |
| step:1425/1750 train_loss:3.2474 train_time:224837ms step_avg:158.90ms | |
| step:1426/1750 train_loss:3.3879 train_time:224998ms step_avg:158.90ms | |
| step:1427/1750 train_loss:3.3411 train_time:225164ms step_avg:158.90ms | |
| step:1428/1750 train_loss:3.4388 train_time:225328ms step_avg:158.91ms | |
| step:1429/1750 train_loss:3.4194 train_time:225491ms step_avg:158.91ms | |
| step:1430/1750 train_loss:3.3217 train_time:225657ms step_avg:158.91ms | |
| step:1431/1750 train_loss:3.3766 train_time:225823ms step_avg:158.92ms | |
| step:1432/1750 train_loss:3.4004 train_time:225989ms step_avg:158.92ms | |
| step:1433/1750 train_loss:3.2021 train_time:226159ms step_avg:158.93ms | |
| step:1434/1750 train_loss:3.3468 train_time:226326ms step_avg:158.94ms | |
| step:1435/1750 train_loss:3.1849 train_time:226490ms step_avg:158.94ms | |
| step:1436/1750 train_loss:3.2852 train_time:226656ms step_avg:158.95ms | |
| step:1437/1750 train_loss:3.4737 train_time:226820ms step_avg:158.95ms | |
| step:1438/1750 train_loss:3.4400 train_time:226981ms step_avg:158.95ms | |
| step:1439/1750 train_loss:3.3768 train_time:227146ms step_avg:158.95ms | |
| step:1440/1750 train_loss:3.2467 train_time:227310ms step_avg:158.96ms | |
| step:1441/1750 train_loss:3.4018 train_time:227475ms step_avg:158.96ms | |
| step:1442/1750 train_loss:3.4504 train_time:227642ms step_avg:158.97ms | |
| step:1443/1750 train_loss:3.5369 train_time:227817ms step_avg:158.98ms | |
| step:1444/1750 train_loss:3.5058 train_time:227979ms step_avg:158.98ms | |
| step:1445/1750 train_loss:3.3938 train_time:228144ms step_avg:158.99ms | |
| step:1446/1750 train_loss:3.2661 train_time:228311ms step_avg:158.99ms | |
| step:1447/1750 train_loss:3.3511 train_time:228478ms step_avg:159.00ms | |
| step:1448/1750 train_loss:3.3547 train_time:228642ms step_avg:159.00ms | |
| step:1449/1750 train_loss:3.4606 train_time:228807ms step_avg:159.00ms | |
| step:1450/1750 train_loss:3.4566 train_time:228971ms step_avg:159.01ms | |
| step:1451/1750 train_loss:3.2732 train_time:229135ms step_avg:159.01ms | |
| step:1452/1750 train_loss:3.3914 train_time:229300ms step_avg:159.02ms | |
| step:1453/1750 train_loss:3.3155 train_time:229461ms step_avg:159.02ms | |
| step:1454/1750 train_loss:3.3444 train_time:229626ms step_avg:159.02ms | |
| step:1455/1750 train_loss:3.3906 train_time:229798ms step_avg:159.03ms | |
| step:1456/1750 train_loss:3.3360 train_time:229961ms step_avg:159.03ms | |
| step:1457/1750 train_loss:3.2153 train_time:230126ms step_avg:159.04ms | |
| step:1458/1750 train_loss:3.4790 train_time:230291ms step_avg:159.04ms | |
| step:1459/1750 train_loss:3.3293 train_time:230457ms step_avg:159.05ms | |
| step:1460/1750 train_loss:3.3758 train_time:230623ms step_avg:159.05ms | |
| step:1461/1750 train_loss:3.4929 train_time:230791ms step_avg:159.06ms | |
| step:1462/1750 train_loss:3.3177 train_time:230955ms step_avg:159.06ms | |
| step:1463/1750 train_loss:3.5241 train_time:231123ms step_avg:159.07ms | |
| step:1464/1750 train_loss:3.4206 train_time:231289ms step_avg:159.07ms | |
| step:1465/1750 train_loss:3.4143 train_time:231454ms step_avg:159.07ms | |
| step:1466/1750 train_loss:3.3441 train_time:231617ms step_avg:159.08ms | |
| step:1467/1750 train_loss:3.4529 train_time:231782ms step_avg:159.08ms | |
| step:1468/1750 train_loss:3.3433 train_time:231945ms step_avg:159.08ms | |
| step:1469/1750 train_loss:3.3240 train_time:232112ms step_avg:159.09ms | |
| step:1470/1750 train_loss:3.3890 train_time:232281ms step_avg:159.10ms | |
| step:1471/1750 train_loss:3.3119 train_time:232453ms step_avg:159.11ms | |
| step:1472/1750 train_loss:3.3029 train_time:232622ms step_avg:159.11ms | |
| step:1473/1750 train_loss:3.4933 train_time:232784ms step_avg:159.11ms | |
| step:1474/1750 train_loss:3.3730 train_time:232952ms step_avg:159.12ms | |
| step:1475/1750 train_loss:3.2046 train_time:233121ms step_avg:159.13ms | |
| step:1476/1750 train_loss:3.3182 train_time:233284ms step_avg:159.13ms | |
| step:1477/1750 train_loss:3.2949 train_time:233458ms step_avg:159.14ms | |
| step:1478/1750 train_loss:3.3648 train_time:233627ms step_avg:159.15ms | |
| step:1479/1750 train_loss:3.4554 train_time:233794ms step_avg:159.15ms | |
| step:1480/1750 train_loss:3.3336 train_time:233958ms step_avg:159.15ms | |
| step:1481/1750 train_loss:3.5087 train_time:234124ms step_avg:159.16ms | |
| step:1482/1750 train_loss:3.4249 train_time:234299ms step_avg:159.17ms | |
| step:1483/1750 train_loss:3.3294 train_time:234472ms step_avg:159.18ms | |
| step:1484/1750 train_loss:3.3177 train_time:234641ms step_avg:159.19ms | |
| step:1485/1750 train_loss:3.3333 train_time:234806ms step_avg:159.19ms | |
| step:1486/1750 train_loss:3.2746 train_time:234974ms step_avg:159.20ms | |
| step:1487/1750 train_loss:3.3954 train_time:235140ms step_avg:159.20ms | |
| step:1488/1750 train_loss:3.2903 train_time:235308ms step_avg:159.21ms | |
| step:1489/1750 train_loss:3.3747 train_time:235472ms step_avg:159.21ms | |
| step:1490/1750 train_loss:3.3098 train_time:235636ms step_avg:159.21ms | |
| step:1491/1750 train_loss:3.2172 train_time:235801ms step_avg:159.22ms | |
| step:1492/1750 train_loss:3.3139 train_time:235964ms step_avg:159.22ms | |
| step:1493/1750 train_loss:3.4889 train_time:236126ms step_avg:159.22ms | |
| step:1494/1750 train_loss:3.3545 train_time:236291ms step_avg:159.23ms | |
| step:1495/1750 train_loss:3.0837 train_time:236458ms step_avg:159.23ms | |
| step:1496/1750 train_loss:3.4134 train_time:236622ms step_avg:159.23ms | |
| step:1497/1750 train_loss:3.3658 train_time:236790ms step_avg:159.24ms | |
| step:1498/1750 train_loss:3.3898 train_time:236958ms step_avg:159.25ms | |
| step:1499/1750 train_loss:3.3649 train_time:237128ms step_avg:159.25ms | |
| step:1500/1750 train_loss:3.3484 train_time:237302ms step_avg:159.26ms | |
| step:1500/1750 val_loss:3.3332 train_time:237347ms step_avg:159.29ms | |
| step:1501/1750 train_loss:3.1398 train_time:237475ms step_avg:159.27ms | |
| step:1502/1750 train_loss:3.4177 train_time:237651ms step_avg:159.28ms | |
| step:1503/1750 train_loss:3.2963 train_time:237813ms step_avg:159.29ms | |
| step:1504/1750 train_loss:3.3040 train_time:237979ms step_avg:159.29ms | |
| step:1505/1750 train_loss:3.2679 train_time:238144ms step_avg:159.29ms | |
| step:1506/1750 train_loss:3.3306 train_time:238310ms step_avg:159.30ms | |
| step:1507/1750 train_loss:3.2331 train_time:238485ms step_avg:159.31ms | |
| step:1508/1750 train_loss:3.5391 train_time:238651ms step_avg:159.31ms | |
| step:1509/1750 train_loss:3.3260 train_time:238813ms step_avg:159.31ms | |
| step:1510/1750 train_loss:3.3241 train_time:238977ms step_avg:159.32ms | |
| step:1511/1750 train_loss:3.4654 train_time:239286ms step_avg:159.42ms | |
| step:1512/1750 train_loss:3.4693 train_time:239455ms step_avg:159.42ms | |
| step:1513/1750 train_loss:3.3191 train_time:239623ms step_avg:159.43ms | |
| step:1514/1750 train_loss:3.1409 train_time:239789ms step_avg:159.43ms | |
| step:1515/1750 train_loss:3.2883 train_time:239953ms step_avg:159.44ms | |
| step:1516/1750 train_loss:3.3000 train_time:240122ms step_avg:159.44ms | |
| step:1517/1750 train_loss:3.3519 train_time:240286ms step_avg:159.45ms | |
| step:1518/1750 train_loss:3.2594 train_time:240452ms step_avg:159.45ms | |
| step:1519/1750 train_loss:3.5570 train_time:240777ms step_avg:159.56ms | |
| step:1520/1750 train_loss:3.1797 train_time:240957ms step_avg:159.57ms | |
| step:1521/1750 train_loss:3.2600 train_time:241117ms step_avg:159.57ms | |
| step:1522/1750 train_loss:3.4028 train_time:241286ms step_avg:159.58ms | |
| step:1523/1750 train_loss:3.2715 train_time:241447ms step_avg:159.58ms | |
| step:1524/1750 train_loss:3.3962 train_time:241611ms step_avg:159.58ms | |
| step:1525/1750 train_loss:3.3844 train_time:241782ms step_avg:159.59ms | |
| step:1526/1750 train_loss:3.3234 train_time:241952ms step_avg:159.60ms | |
| step:1527/1750 train_loss:3.3327 train_time:242116ms step_avg:159.60ms | |
| step:1528/1750 train_loss:3.4568 train_time:242283ms step_avg:159.61ms | |
| step:1529/1750 train_loss:3.4555 train_time:242445ms step_avg:159.61ms | |
| step:1530/1750 train_loss:3.2835 train_time:242608ms step_avg:159.61ms | |
| step:1531/1750 train_loss:3.2416 train_time:242773ms step_avg:159.61ms | |
| step:1532/1750 train_loss:3.3899 train_time:242939ms step_avg:159.62ms | |
| step:1533/1750 train_loss:3.3249 train_time:243109ms step_avg:159.63ms | |
| step:1534/1750 train_loss:3.3275 train_time:243278ms step_avg:159.63ms | |
| step:1535/1750 train_loss:3.3267 train_time:243446ms step_avg:159.64ms | |
| step:1536/1750 train_loss:3.2745 train_time:243611ms step_avg:159.64ms | |
| step:1537/1750 train_loss:3.3195 train_time:243774ms step_avg:159.64ms | |
| step:1538/1750 train_loss:3.4704 train_time:243945ms step_avg:159.65ms | |
| step:1539/1750 train_loss:3.4441 train_time:244113ms step_avg:159.66ms | |
| step:1540/1750 train_loss:3.3207 train_time:244277ms step_avg:159.66ms | |
| step:1541/1750 train_loss:3.2778 train_time:244441ms step_avg:159.66ms | |
| step:1542/1750 train_loss:3.2948 train_time:244608ms step_avg:159.67ms | |
| step:1543/1750 train_loss:3.1978 train_time:244774ms step_avg:159.67ms | |
| step:1544/1750 train_loss:3.3424 train_time:244935ms step_avg:159.67ms | |
| step:1545/1750 train_loss:3.3123 train_time:245101ms step_avg:159.68ms | |
| step:1546/1750 train_loss:3.3054 train_time:245272ms step_avg:159.68ms | |
| step:1547/1750 train_loss:3.2631 train_time:245440ms step_avg:159.69ms | |
| step:1548/1750 train_loss:3.3091 train_time:245610ms step_avg:159.69ms | |
| step:1549/1750 train_loss:3.3822 train_time:245775ms step_avg:159.70ms | |
| step:1550/1750 train_loss:3.3357 train_time:245938ms step_avg:159.70ms | |
| step:1551/1750 train_loss:3.2480 train_time:246104ms step_avg:159.70ms | |
| step:1552/1750 train_loss:3.2657 train_time:246270ms step_avg:159.71ms | |
| step:1553/1750 train_loss:3.2728 train_time:246433ms step_avg:159.71ms | |
| step:1554/1750 train_loss:3.4001 train_time:246596ms step_avg:159.71ms | |
| step:1555/1750 train_loss:3.3830 train_time:246760ms step_avg:159.72ms | |
| step:1556/1750 train_loss:3.3228 train_time:246924ms step_avg:159.72ms | |
| step:1557/1750 train_loss:3.3621 train_time:247086ms step_avg:159.72ms | |
| step:1558/1750 train_loss:3.3021 train_time:247252ms step_avg:159.72ms | |
| step:1559/1750 train_loss:3.1738 train_time:247425ms step_avg:159.73ms | |
| step:1560/1750 train_loss:3.4765 train_time:247588ms step_avg:159.73ms | |
| step:1561/1750 train_loss:3.2744 train_time:247753ms step_avg:159.74ms | |
| step:1562/1750 train_loss:3.2558 train_time:247917ms step_avg:159.74ms | |
| step:1563/1750 train_loss:3.3603 train_time:248083ms step_avg:159.74ms | |
| step:1564/1750 train_loss:3.2004 train_time:248254ms step_avg:159.75ms | |
| step:1565/1750 train_loss:3.2164 train_time:248422ms step_avg:159.76ms | |
| step:1566/1750 train_loss:3.4090 train_time:248589ms step_avg:159.76ms | |
| step:1567/1750 train_loss:3.2806 train_time:248753ms step_avg:159.76ms | |
| step:1568/1750 train_loss:3.2816 train_time:248921ms step_avg:159.77ms | |
| step:1569/1750 train_loss:3.3660 train_time:249097ms step_avg:159.78ms | |
| step:1570/1750 train_loss:3.3274 train_time:249269ms step_avg:159.79ms | |
| step:1571/1750 train_loss:3.2036 train_time:249436ms step_avg:159.79ms | |
| step:1572/1750 train_loss:3.2463 train_time:249601ms step_avg:159.80ms | |
| step:1573/1750 train_loss:3.3564 train_time:249770ms step_avg:159.80ms | |
| step:1574/1750 train_loss:3.2156 train_time:249933ms step_avg:159.80ms | |
| step:1575/1750 train_loss:3.3711 train_time:250096ms step_avg:159.81ms | |
| step:1576/1750 train_loss:3.2757 train_time:250263ms step_avg:159.81ms | |
| step:1577/1750 train_loss:3.3275 train_time:250432ms step_avg:159.82ms | |
| step:1578/1750 train_loss:3.3125 train_time:250597ms step_avg:159.82ms | |
| step:1579/1750 train_loss:3.2875 train_time:250767ms step_avg:159.83ms | |
| step:1580/1750 train_loss:3.2532 train_time:250933ms step_avg:159.83ms | |
| step:1581/1750 train_loss:3.4470 train_time:251103ms step_avg:159.84ms | |
| step:1582/1750 train_loss:3.2694 train_time:251277ms step_avg:159.85ms | |
| step:1583/1750 train_loss:3.4221 train_time:251449ms step_avg:159.85ms | |
| step:1584/1750 train_loss:3.2419 train_time:251613ms step_avg:159.86ms | |
| step:1585/1750 train_loss:3.4118 train_time:251785ms step_avg:159.86ms | |
| step:1586/1750 train_loss:3.1965 train_time:251952ms step_avg:159.87ms | |
| step:1587/1750 train_loss:3.3970 train_time:252115ms step_avg:159.87ms | |
| step:1588/1750 train_loss:3.2731 train_time:252284ms step_avg:159.88ms | |
| step:1589/1750 train_loss:3.4362 train_time:252449ms step_avg:159.88ms | |
| step:1590/1750 train_loss:3.2837 train_time:252614ms step_avg:159.88ms | |
| step:1591/1750 train_loss:3.2942 train_time:252780ms step_avg:159.89ms | |
| step:1592/1750 train_loss:3.3606 train_time:252948ms step_avg:159.89ms | |
| step:1593/1750 train_loss:3.3318 train_time:253118ms step_avg:159.90ms | |
| step:1594/1750 train_loss:3.3068 train_time:253284ms step_avg:159.90ms | |
| step:1595/1750 train_loss:3.4490 train_time:253452ms step_avg:159.91ms | |
| step:1596/1750 train_loss:3.1568 train_time:253626ms step_avg:159.92ms | |
| step:1597/1750 train_loss:3.3311 train_time:253795ms step_avg:159.92ms | |
| step:1598/1750 train_loss:3.3809 train_time:253964ms step_avg:159.93ms | |
| step:1599/1750 train_loss:3.4452 train_time:254136ms step_avg:159.93ms | |
| step:1600/1750 train_loss:3.2712 train_time:254304ms step_avg:159.94ms | |
| step:1601/1750 train_loss:3.5777 train_time:254468ms step_avg:159.94ms | |
| step:1602/1750 train_loss:3.4544 train_time:254636ms step_avg:159.95ms | |
| step:1603/1750 train_loss:3.2215 train_time:254810ms step_avg:159.96ms | |
| step:1604/1750 train_loss:3.2655 train_time:254977ms step_avg:159.96ms | |
| step:1605/1750 train_loss:3.1594 train_time:255151ms step_avg:159.97ms | |
| step:1606/1750 train_loss:3.4701 train_time:255326ms step_avg:159.98ms | |
| step:1607/1750 train_loss:3.3012 train_time:255491ms step_avg:159.98ms | |
| step:1608/1750 train_loss:3.3043 train_time:255660ms step_avg:159.99ms | |
| step:1609/1750 train_loss:3.2418 train_time:255832ms step_avg:159.99ms | |
| step:1610/1750 train_loss:3.7491 train_time:256010ms step_avg:160.01ms | |
| step:1611/1750 train_loss:3.5051 train_time:256180ms step_avg:160.01ms | |
| step:1612/1750 train_loss:3.3896 train_time:256353ms step_avg:160.02ms | |
| step:1613/1750 train_loss:3.2619 train_time:256529ms step_avg:160.03ms | |
| step:1614/1750 train_loss:3.3008 train_time:256695ms step_avg:160.03ms | |
| step:1615/1750 train_loss:3.3113 train_time:256864ms step_avg:160.04ms | |
| step:1616/1750 train_loss:3.2836 train_time:257046ms step_avg:160.05ms | |
| step:1617/1750 train_loss:3.3567 train_time:257222ms step_avg:160.06ms | |
| step:1618/1750 train_loss:3.2817 train_time:257387ms step_avg:160.07ms | |
| step:1619/1750 train_loss:3.1837 train_time:257555ms step_avg:160.07ms | |
| step:1620/1750 train_loss:3.4559 train_time:257718ms step_avg:160.07ms | |
| step:1621/1750 train_loss:3.3799 train_time:257889ms step_avg:160.08ms | |
| step:1622/1750 train_loss:3.1642 train_time:258055ms step_avg:160.08ms | |
| step:1623/1750 train_loss:3.2585 train_time:258224ms step_avg:160.09ms | |
| step:1624/1750 train_loss:3.2153 train_time:258388ms step_avg:160.09ms | |
| step:1625/1750 train_loss:3.3201 train_time:258553ms step_avg:160.09ms | |
| step:1625/1750 val_loss:3.2972 train_time:258595ms step_avg:160.12ms | |
| step:1626/1750 train_loss:3.2380 train_time:258718ms step_avg:160.10ms | |
| step:1627/1750 train_loss:3.2373 train_time:258881ms step_avg:160.10ms | |
| step:1628/1750 train_loss:3.3636 train_time:259044ms step_avg:160.10ms | |
| step:1629/1750 train_loss:3.2467 train_time:259210ms step_avg:160.10ms | |
| step:1630/1750 train_loss:3.3205 train_time:259381ms step_avg:160.11ms | |
| step:1631/1750 train_loss:3.1741 train_time:259559ms step_avg:160.12ms | |
| step:1632/1750 train_loss:3.1499 train_time:259722ms step_avg:160.12ms | |
| step:1633/1750 train_loss:3.2982 train_time:259891ms step_avg:160.13ms | |
| step:1634/1750 train_loss:3.3084 train_time:260057ms step_avg:160.13ms | |
| step:1635/1750 train_loss:3.2478 train_time:260229ms step_avg:160.14ms | |
| step:1636/1750 train_loss:3.3312 train_time:260395ms step_avg:160.14ms | |
| step:1637/1750 train_loss:3.3783 train_time:260562ms step_avg:160.15ms | |
| step:1638/1750 train_loss:3.4029 train_time:260733ms step_avg:160.16ms | |
| step:1639/1750 train_loss:3.5728 train_time:260906ms step_avg:160.16ms | |
| step:1640/1750 train_loss:3.3512 train_time:261073ms step_avg:160.17ms | |
| step:1641/1750 train_loss:3.2981 train_time:261242ms step_avg:160.17ms | |
| step:1642/1750 train_loss:3.4080 train_time:261407ms step_avg:160.18ms | |
| step:1643/1750 train_loss:3.2730 train_time:261581ms step_avg:160.18ms | |
| step:1644/1750 train_loss:3.3133 train_time:261746ms step_avg:160.19ms | |
| step:1645/1750 train_loss:3.3169 train_time:261908ms step_avg:160.19ms | |
| step:1646/1750 train_loss:3.0716 train_time:262075ms step_avg:160.19ms | |
| step:1647/1750 train_loss:3.3248 train_time:262241ms step_avg:160.20ms | |
| step:1648/1750 train_loss:3.2174 train_time:262406ms step_avg:160.20ms | |
| step:1649/1750 train_loss:3.2849 train_time:262569ms step_avg:160.20ms | |
| step:1650/1750 train_loss:3.2696 train_time:262736ms step_avg:160.20ms | |
| step:1651/1750 train_loss:3.3439 train_time:262901ms step_avg:160.21ms | |
| step:1652/1750 train_loss:3.2592 train_time:263068ms step_avg:160.21ms | |
| step:1653/1750 train_loss:3.3926 train_time:263239ms step_avg:160.22ms | |
| step:1654/1750 train_loss:3.3838 train_time:263402ms step_avg:160.22ms | |
| step:1655/1750 train_loss:3.1765 train_time:263576ms step_avg:160.23ms | |
| step:1656/1750 train_loss:3.3322 train_time:263749ms step_avg:160.24ms | |
| step:1657/1750 train_loss:3.2487 train_time:263917ms step_avg:160.24ms | |
| step:1658/1750 train_loss:3.2219 train_time:264080ms step_avg:160.24ms | |
| step:1659/1750 train_loss:3.3026 train_time:264246ms step_avg:160.25ms | |
| step:1660/1750 train_loss:3.3446 train_time:264414ms step_avg:160.25ms | |
| step:1661/1750 train_loss:3.2519 train_time:264582ms step_avg:160.26ms | |
| step:1662/1750 train_loss:3.3548 train_time:264746ms step_avg:160.26ms | |
| step:1663/1750 train_loss:3.3455 train_time:264915ms step_avg:160.26ms | |
| step:1664/1750 train_loss:3.4062 train_time:265092ms step_avg:160.27ms | |
| step:1665/1750 train_loss:3.3327 train_time:265260ms step_avg:160.28ms | |
| step:1666/1750 train_loss:3.5029 train_time:265423ms step_avg:160.28ms | |
| step:1667/1750 train_loss:3.2070 train_time:265591ms step_avg:160.28ms | |
| step:1668/1750 train_loss:3.2899 train_time:265762ms step_avg:160.29ms | |
| step:1669/1750 train_loss:3.2095 train_time:265928ms step_avg:160.29ms | |
| step:1670/1750 train_loss:3.2206 train_time:266096ms step_avg:160.30ms | |
| step:1671/1750 train_loss:3.3719 train_time:266263ms step_avg:160.30ms | |
| step:1672/1750 train_loss:3.5754 train_time:266429ms step_avg:160.31ms | |
| step:1673/1750 train_loss:3.2775 train_time:266598ms step_avg:160.31ms | |
| step:1674/1750 train_loss:3.2560 train_time:266764ms step_avg:160.31ms | |
| step:1675/1750 train_loss:3.1280 train_time:266935ms step_avg:160.32ms | |
| step:1676/1750 train_loss:3.3504 train_time:267106ms step_avg:160.33ms | |
| step:1677/1750 train_loss:3.2776 train_time:267275ms step_avg:160.33ms | |
| step:1678/1750 train_loss:3.2963 train_time:267444ms step_avg:160.34ms | |
| step:1679/1750 train_loss:3.3040 train_time:267610ms step_avg:160.34ms | |
| step:1680/1750 train_loss:3.0883 train_time:267786ms step_avg:160.35ms | |
| step:1681/1750 train_loss:3.2975 train_time:267957ms step_avg:160.36ms | |
| step:1682/1750 train_loss:3.2912 train_time:268125ms step_avg:160.36ms | |
| step:1683/1750 train_loss:3.3080 train_time:268291ms step_avg:160.37ms | |
| step:1684/1750 train_loss:3.3407 train_time:268458ms step_avg:160.37ms | |
| step:1685/1750 train_loss:3.2403 train_time:268622ms step_avg:160.37ms | |
| step:1686/1750 train_loss:3.3664 train_time:268791ms step_avg:160.38ms | |
| step:1687/1750 train_loss:3.2447 train_time:268960ms step_avg:160.38ms | |
| step:1688/1750 train_loss:3.3112 train_time:269133ms step_avg:160.39ms | |
| step:1689/1750 train_loss:3.2238 train_time:269303ms step_avg:160.40ms | |
| step:1690/1750 train_loss:3.0736 train_time:269476ms step_avg:160.40ms | |
| step:1691/1750 train_loss:3.3044 train_time:269641ms step_avg:160.41ms | |
| step:1692/1750 train_loss:3.2935 train_time:269804ms step_avg:160.41ms | |
| step:1693/1750 train_loss:3.2135 train_time:269970ms step_avg:160.41ms | |
| step:1694/1750 train_loss:3.6105 train_time:270145ms step_avg:160.42ms | |
| step:1695/1750 train_loss:3.3323 train_time:270316ms step_avg:160.42ms | |
| step:1696/1750 train_loss:3.3341 train_time:270483ms step_avg:160.43ms | |
| step:1697/1750 train_loss:3.2548 train_time:270647ms step_avg:160.43ms | |
| step:1698/1750 train_loss:3.1213 train_time:270815ms step_avg:160.44ms | |
| step:1699/1750 train_loss:3.2280 train_time:270983ms step_avg:160.44ms | |
| step:1700/1750 train_loss:3.2445 train_time:271292ms step_avg:160.53ms | |
| step:1701/1750 train_loss:3.3206 train_time:271463ms step_avg:160.53ms | |
| step:1702/1750 train_loss:3.2425 train_time:271626ms step_avg:160.54ms | |
| step:1703/1750 train_loss:3.4146 train_time:271789ms step_avg:160.54ms | |
| step:1704/1750 train_loss:3.2124 train_time:271958ms step_avg:160.54ms | |
| step:1705/1750 train_loss:3.4343 train_time:272122ms step_avg:160.54ms | |
| step:1706/1750 train_loss:3.2484 train_time:272286ms step_avg:160.55ms | |
| step:1707/1750 train_loss:3.0504 train_time:272458ms step_avg:160.55ms | |
| step:1708/1750 train_loss:3.3864 train_time:272622ms step_avg:160.55ms | |
| step:1709/1750 train_loss:3.2999 train_time:272925ms step_avg:160.64ms | |
| step:1710/1750 train_loss:3.2819 train_time:273104ms step_avg:160.65ms | |
| step:1711/1750 train_loss:3.2887 train_time:273272ms step_avg:160.65ms | |
| step:1712/1750 train_loss:3.3188 train_time:273441ms step_avg:160.66ms | |
| step:1713/1750 train_loss:3.3351 train_time:273607ms step_avg:160.66ms | |
| step:1714/1750 train_loss:3.2264 train_time:273778ms step_avg:160.67ms | |
| step:1715/1750 train_loss:3.2879 train_time:273956ms step_avg:160.68ms | |
| step:1716/1750 train_loss:3.0977 train_time:274121ms step_avg:160.68ms | |
| step:1717/1750 train_loss:3.2485 train_time:274285ms step_avg:160.68ms | |
| step:1718/1750 train_loss:3.2604 train_time:274453ms step_avg:160.69ms | |
| step:1719/1750 train_loss:3.2142 train_time:274623ms step_avg:160.69ms | |
| step:1720/1750 train_loss:3.3761 train_time:274799ms step_avg:160.70ms | |
| step:1721/1750 train_loss:3.1612 train_time:274979ms step_avg:160.71ms | |
| step:1722/1750 train_loss:3.3182 train_time:275145ms step_avg:160.72ms | |
| step:1723/1750 train_loss:3.4042 train_time:275322ms step_avg:160.72ms | |
| step:1724/1750 train_loss:3.2554 train_time:275489ms step_avg:160.73ms | |
| step:1725/1750 train_loss:3.4929 train_time:275664ms step_avg:160.74ms | |
| step:1726/1750 train_loss:3.2636 train_time:275838ms step_avg:160.74ms | |
| step:1727/1750 train_loss:3.3299 train_time:276002ms step_avg:160.75ms | |
| step:1728/1750 train_loss:3.2943 train_time:276169ms step_avg:160.75ms | |
| step:1729/1750 train_loss:3.2781 train_time:276341ms step_avg:160.76ms | |
| step:1730/1750 train_loss:3.6539 train_time:276510ms step_avg:160.76ms | |
| step:1731/1750 train_loss:3.2915 train_time:276675ms step_avg:160.76ms | |
| step:1732/1750 train_loss:3.4298 train_time:276842ms step_avg:160.77ms | |
| step:1733/1750 train_loss:3.2017 train_time:277005ms step_avg:160.77ms | |
| step:1734/1750 train_loss:3.2447 train_time:277173ms step_avg:160.77ms | |
| step:1735/1750 train_loss:3.2697 train_time:277342ms step_avg:160.78ms | |
| step:1736/1750 train_loss:3.2539 train_time:277513ms step_avg:160.78ms | |
| step:1737/1750 train_loss:3.3872 train_time:277685ms step_avg:160.79ms | |
| step:1738/1750 train_loss:3.2183 train_time:277862ms step_avg:160.80ms | |
| step:1739/1750 train_loss:3.2855 train_time:278034ms step_avg:160.81ms | |
| step:1740/1750 train_loss:3.3695 train_time:278203ms step_avg:160.81ms | |
| step:1741/1750 train_loss:3.1644 train_time:278371ms step_avg:160.82ms | |
| step:1742/1750 train_loss:3.0608 train_time:278540ms step_avg:160.82ms | |
| step:1743/1750 train_loss:2.9530 train_time:278716ms step_avg:160.83ms | |
| step:1744/1750 train_loss:3.2946 train_time:278881ms step_avg:160.83ms | |
| step:1745/1750 train_loss:3.3134 train_time:279042ms step_avg:160.83ms | |
| step:1746/1750 train_loss:3.2656 train_time:279206ms step_avg:160.83ms | |
| step:1747/1750 train_loss:3.2973 train_time:279379ms step_avg:160.84ms | |
| step:1748/1750 train_loss:3.4993 train_time:279559ms step_avg:160.85ms | |
| step:1749/1750 train_loss:3.2260 train_time:279726ms step_avg:160.85ms | |
| step:1750/1750 train_loss:3.2781 train_time:279897ms step_avg:160.86ms | |
| step:1750/1750 val_loss:3.2759 train_time:279948ms step_avg:160.89ms | |